From a5724fc3834643a975bd0db71f001ca65f4a8382 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit
Date: Mon, 16 Apr 2018 21:37:13 +0200
Subject: PCI: Add two more values for PCIe Max_Read_Request_Size

This patch adds missing values for the max read request size.
E.g. network driver r8169 uses a value of 4K.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pci_regs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba797a8f3..83ade9b5cf95 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -506,6 +506,8 @@
 #define  PCI_EXP_DEVCTL_READRQ_256B  0x1000 /* 256 Bytes */
 #define  PCI_EXP_DEVCTL_READRQ_512B  0x2000 /* 512 Bytes */
 #define  PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */
+#define  PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */
+#define  PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */
 #define  PCI_EXP_DEVCTL_BCR_FLR 0x8000  /* Bridge Configuration Retry / FLR */
 #define PCI_EXP_DEVSTA		10	/* Device Status */
 #define  PCI_EXP_DEVSTA_CED	0x0001	/* Correctable Error Detected */
-- 
cgit v1.2.3-55-g7522


From 72f6d71e491e6ce269b564865b21fab0a4402dd3 Mon Sep 17 00:00:00 2001
From: Hangbin Liu
Date: Tue, 17 Apr 2018 14:11:28 +0800
Subject: vxlan: add ttl inherit support

Like tos inherit, ttl inherit should also means inherit the inner protocol's
ttl values, which actually not implemented in vxlan yet.

But we could not treat ttl == 0 as "use the inner TTL", because that would be
used also when the "ttl" option is not specified and that would be a behavior
change, and breaking real use cases.

So add a different attribute IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is
specified with ip cmd.

Reported-by: Jianlin Shi <jishi@redhat.com>
Suggested-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 17 ++++++++++++++---
 include/net/ip_tunnels.h     | 11 +++++++++++
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  1 +
 4 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index fab7a4db249e..aee0e60471f1 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2085,9 +2085,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		local_ip = vxlan->cfg.saddr;
 		dst_cache = &rdst->dst_cache;
 		md->gbp = skb->mark;
-		ttl = vxlan->cfg.ttl;
-		if (!ttl && vxlan_addr_multicast(dst))
-			ttl = 1;
+		if (flags & VXLAN_F_TTL_INHERIT) {
+			ttl = ip_tunnel_get_ttl(old_iph, skb);
+		} else {
+			ttl = vxlan->cfg.ttl;
+			if (!ttl && vxlan_addr_multicast(dst))
+				ttl = 1;
+		}
 
 		tos = vxlan->cfg.tos;
 		if (tos == 1)
@@ -2709,6 +2713,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -3254,6 +3259,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_VXLAN_TTL])
 		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
+	if (data[IFLA_VXLAN_TTL_INHERIT]) {
+		if (changelink)
+			return -EOPNOTSUPP;
+		conf->flags |= VXLAN_F_TTL_INHERIT;
+	}
+
 	if (data[IFLA_VXLAN_LABEL])
 		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
 			     IPV6_FLOWLABEL_MASK;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 540a4b4417bf..751646adc769 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 		return 0;
 }
 
+static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
+				       const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->ttl;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ((const struct ipv6hdr *)iph)->hop_limit;
+	else
+		return 0;
+}
+
 /* Propogate ECN bits out */
 static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 				     const struct sk_buff *skb)
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index ad73d8b3fcc2..b99a02ae3934 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -262,6 +262,7 @@ struct vxlan_dev {
 #define VXLAN_F_COLLECT_METADATA	0x2000
 #define VXLAN_F_GPE			0x4000
 #define VXLAN_F_IPV6_LINKLOCAL		0x8000
+#define VXLAN_F_TTL_INHERIT		0x10000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 68699f654118..b85266420bfb 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -516,6 +516,7 @@ enum {
 	IFLA_VXLAN_COLLECT_METADATA,
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
+	IFLA_VXLAN_TTL_INHERIT,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3-55-g7522


From b32cc5b9a346319c171e3ad905e0cddda032b5eb Mon Sep 17 00:00:00 2001
From: Nikita V. Shirokov
Date: Tue, 17 Apr 2018 21:42:13 -0700
Subject: bpf: adding bpf_xdp_adjust_tail helper

Adding new bpf helper which would allow us to manipulate
xdp's data_end pointer, and allow us to reduce packet's size
indended use case: to generate ICMP messages from XDP context,
where such message would contain truncated original packet.

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 net/core/filter.c        | 29 ++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..9a2d1a04eb24 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -755,6 +755,13 @@ union bpf_attr {
  *     @addr: pointer to struct sockaddr to bind socket to
  *     @addr_len: length of sockaddr structure
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_xdp_adjust_tail(xdp_md, delta)
+ *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
+ *     size is supported.
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: A negative integer to be added to xdp_md.data_end
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -821,7 +828,8 @@ union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(xdp_adjust_tail),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index a374b8560bc4..29318598fd60 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2725,6 +2725,30 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+	void *data_end = xdp->data_end + offset;
+
+	/* only shrinking is allowed for now. */
+	if (unlikely(offset >= 0))
+		return -EINVAL;
+
+	if (unlikely(data_end < xdp->data + ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data_end = data_end;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+	.func		= bpf_xdp_adjust_tail,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
 {
 	void *meta = xdp->data_meta + offset;
@@ -3074,7 +3098,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
 	    func == bpf_xdp_adjust_meta ||
-	    func == bpf_msg_pull_data)
+	    func == bpf_msg_pull_data ||
+	    func == bpf_xdp_adjust_tail)
 		return true;
 
 	return false;
@@ -3888,6 +3913,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
 		return &bpf_xdp_redirect_map_proto;
+	case BPF_FUNC_xdp_adjust_tail:
+		return &bpf_xdp_adjust_tail_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-55-g7522


From feb5f2ec646483fb66f9ad7218b1aad2a93a2a5c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Tue, 17 Apr 2018 23:18:49 -0700
Subject: tcp: export packets delivery info

Export data delivered and delivered with CE marks to
1) SNMP TCPDelivered and TCPDeliveredCE
2) getsockopt(TCP_INFO)
3) Timestamping API SOF_TIMESTAMPING_OPT_STATS

Note that for SCM_TSTAMP_ACK, the delivery info in
SOF_TIMESTAMPING_OPT_STATS is reported before the info
was fully updated on the ACK.

These stats help application monitor TCP delivery and ECN status
on per host, per connection, even per message level.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 2 ++
 include/uapi/linux/tcp.h  | 5 +++++
 net/ipv4/proc.c           | 2 ++
 net/ipv4/tcp.c            | 7 ++++++-
 net/ipv4/tcp_input.c      | 6 +++++-
 5 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 33a70ece462f..d02e859301ff 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -276,6 +276,8 @@ enum
 	LINUX_MIB_TCPKEEPALIVE,			/* TCPKeepAlive */
 	LINUX_MIB_TCPMTUPFAIL,			/* TCPMTUPFail */
 	LINUX_MIB_TCPMTUPSUCCESS,		/* TCPMTUPSuccess */
+	LINUX_MIB_TCPDELIVERED,			/* TCPDelivered */
+	LINUX_MIB_TCPDELIVEREDCE,		/* TCPDeliveredCE */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 560374c978f9..379b08700a54 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -224,6 +224,9 @@ struct tcp_info {
 	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */
 	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
 	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+
+	__u32	tcpi_delivered;
+	__u32	tcpi_delivered_ce;
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -244,6 +247,8 @@ enum {
 	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */
 	TCP_NLA_CA_STATE,	/* ca_state of socket */
 	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
+	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
+	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a058de677e94..261b71d0ccc5 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -296,6 +296,8 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
 	SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
 	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5a5ce6da4792..4022073b0aee 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3167,6 +3167,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	rate64 = tcp_compute_delivery_rate(tp);
 	if (rate64)
 		info->tcpi_delivery_rate = rate64;
+	info->tcpi_delivered = tp->delivered;
+	info->tcpi_delivered_ce = tp->delivered_ce;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3180,7 +3182,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	u32 rate;
 
 	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
-			  5 * nla_total_size(sizeof(u32)) +
+			  7 * nla_total_size(sizeof(u32)) +
 			  3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
@@ -3211,9 +3213,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
 	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
+	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
 
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+
 	return stats;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b3bff9c20606..0396fb919b5d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3499,12 +3499,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 /* Returns the number of packets newly acked or sacked by the current ACK */
 static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
 {
+	const struct net *net = sock_net(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 delivered;
 
 	delivered = tp->delivered - prior_delivered;
-	if (flag & FLAG_ECE)
+	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+	if (flag & FLAG_ECE) {
 		tp->delivered_ce += delivered;
+		NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+	}
 	return delivered;
 }
 
-- 
cgit v1.2.3-55-g7522


From 69b693f0aefa0ed521e8bd02260523b5ae446ad7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:55:57 -0700
Subject: bpf: btf: Introduce BPF Type Format (BTF)

This patch introduces BPF type Format (BTF).

BTF (BPF Type Format) is the meta data format which describes
the data types of BPF program/map.  Hence, it basically focus
on the C programming language which the modern BPF is primary
using.  The first use case is to provide a generic pretty print
capability for a BPF map.

BTF has its root from CTF (Compact C-Type format).  To simplify
the handling of BTF data, BTF removes the differences between
small and big type/struct-member.  Hence, BTF consistently uses u32
instead of supporting both "one u16" and "two u32 (+padding)" in
describing type and struct-member.

It also raises the number of types (and functions) limit
from 0x7fff to 0x7fffffff.

Due to the above changes,  the format is not compatible to CTF.
Hence, BTF starts with a new BTF_MAGIC and version number.

This patch does the first verification pass to the BTF.  The first
pass checks:
1. meta-data size (e.g. It does not go beyond the total btf's size)
2. name_offset is valid
3. Each BTF_KIND (e.g. int, enum, struct....) does its
   own check of its meta-data.

Some other checks, like checking a struct's member is referring
to a valid type, can only be done in the second pass.  The second
verification pass will be implemented in the next patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h | 130 +++++++
 kernel/bpf/Makefile      |   1 +
 kernel/bpf/btf.c         | 915 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1046 insertions(+)
 create mode 100644 include/uapi/linux/btf.h
 create mode 100644 kernel/bpf/btf.c

(limited to 'include/uapi')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
new file mode 100644
index 000000000000..74a30b1090df
--- /dev/null
+++ b/include/uapi/linux/btf.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2018 Facebook */
+#ifndef _UAPI__LINUX_BTF_H__
+#define _UAPI__LINUX_BTF_H__
+
+#include <linux/types.h>
+
+#define BTF_MAGIC	0xeB9F
+#define BTF_MAGIC_SWAP	0x9FeB
+#define BTF_VERSION	1
+#define BTF_FLAGS_COMPR	0x01
+
+struct btf_header {
+	__u16	magic;
+	__u8	version;
+	__u8	flags;
+
+	__u32	parent_label;
+	__u32	parent_name;
+
+	/* All offsets are in bytes relative to the end of this header */
+	__u32	label_off;	/* offset of label section	*/
+	__u32	object_off;	/* offset of data object section*/
+	__u32	func_off;	/* offset of function section	*/
+	__u32	type_off;	/* offset of type section	*/
+	__u32	str_off;	/* offset of string section	*/
+	__u32	str_len;	/* length of string section	*/
+};
+
+/* Max # of type identifier */
+#define BTF_MAX_TYPE	0x7fffffff
+/* Max offset into the string section */
+#define BTF_MAX_NAME_OFFSET	0x7fffffff
+/* Max # of struct/union/enum members or func args */
+#define BTF_MAX_VLEN	0xffff
+
+/* The type id is referring to a parent BTF */
+#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
+#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
+
+/* String is in the ELF string section */
+#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
+#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
+
+struct btf_type {
+	__u32 name;
+	/* "info" bits arrangement
+	 * bits  0-15: vlen (e.g. # of struct's members)
+	 * bits 16-23: unused
+	 * bits 24-28: kind (e.g. int, ptr, array...etc)
+	 * bits 29-30: unused
+	 * bits    31: root
+	 */
+	__u32 info;
+	/* "size" is used by INT, ENUM, STRUCT and UNION.
+	 * "size" tells the size of the type it is describing.
+	 *
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+	 * "type" is a type_id referring to another type.
+	 */
+	union {
+		__u32 size;
+		__u32 type;
+	};
+};
+
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
+#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+
+#define BTF_KIND_UNKN		0	/* Unknown	*/
+#define BTF_KIND_INT		1	/* Integer	*/
+#define BTF_KIND_PTR		2	/* Pointer	*/
+#define BTF_KIND_ARRAY		3	/* Array	*/
+#define BTF_KIND_STRUCT		4	/* Struct	*/
+#define BTF_KIND_UNION		5	/* Union	*/
+#define BTF_KIND_ENUM		6	/* Enumeration	*/
+#define BTF_KIND_FWD		7	/* Forward	*/
+#define BTF_KIND_TYPEDEF	8	/* Typedef	*/
+#define BTF_KIND_VOLATILE	9	/* Volatile	*/
+#define BTF_KIND_CONST		10	/* Const	*/
+#define BTF_KIND_RESTRICT	11	/* Restrict	*/
+#define BTF_KIND_MAX		11
+#define NR_BTF_KINDS		12
+
+/* For some specific BTF_KIND, "struct btf_type" is immediately
+ * followed by extra data.
+ */
+
+/* BTF_KIND_INT is followed by a u32 and the following
+ * is the 32 bits arrangement:
+ */
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
+
+/* Attributes stored in the BTF_INT_ENCODING */
+#define BTF_INT_SIGNED	0x1
+#define BTF_INT_CHAR	0x2
+#define BTF_INT_BOOL	0x4
+#define BTF_INT_VARARGS	0x8
+
+/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
+ * The exact number of btf_enum is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum {
+	__u32	name;
+	__s32	val;
+};
+
+/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
+struct btf_array {
+	__u32	type;
+	__u32	index_type;
+	__u32	nelems;
+};
+
+/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
+ * by multiple "struct btf_member".  The exact number
+ * of btf_member is stored in the vlen (of the info in
+ * "struct btf_type").
+ */
+struct btf_member {
+	__u32	name;
+	__u32	type;
+	__u32	offset;	/* offset in bits */
+};
+
+#endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..35c485fa9ea3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,6 +4,7 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..26e9ed7cea5f
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,915 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/types.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map.  Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data.  F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type.  Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to desribe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id.  The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section.  The first one has type_id 1.  The second
+ * one has type_id 2...etc.  Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *".  A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *".  This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ *   - Each line started with "[?]" is a btf_type object
+ *   - [?] is the type_id of the btf_type object.
+ *   - CONST/PTR is the BTF_KIND_XXX
+ *   - "(anon)" is the name of the type.  It just
+ *     happens that CONST and PTR has no name.
+ *   - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data.  We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done.  e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ */
+
+#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+/* 64k. We can raise it later. The hard limit is S32_MAX. */
+#define BTF_MAX_NR_TYPES 65535
+
+#define for_each_member(i, struct_type, member)			\
+	for (i = 0, member = btf_type_member(struct_type);	\
+	     i < btf_type_vlen(struct_type);			\
+	     i++, member++)
+
+struct btf {
+	union {
+		struct btf_header *hdr;
+		void *data;
+	};
+	struct btf_type **types;
+	const char *strings;
+	void *nohdr_data;
+	u32 nr_types;
+	u32 types_size;
+	u32 data_size;
+};
+
+struct btf_verifier_env {
+	struct btf *btf;
+	struct bpf_verifier_log log;
+	u32 log_type_id;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+	[BTF_KIND_UNKN]		= "UNKNOWN",
+	[BTF_KIND_INT]		= "INT",
+	[BTF_KIND_PTR]		= "PTR",
+	[BTF_KIND_ARRAY]	= "ARRAY",
+	[BTF_KIND_STRUCT]	= "STRUCT",
+	[BTF_KIND_UNION]	= "UNION",
+	[BTF_KIND_ENUM]		= "ENUM",
+	[BTF_KIND_FWD]		= "FWD",
+	[BTF_KIND_TYPEDEF]	= "TYPEDEF",
+	[BTF_KIND_VOLATILE]	= "VOLATILE",
+	[BTF_KIND_CONST]	= "CONST",
+	[BTF_KIND_RESTRICT]	= "RESTRICT",
+};
+
+struct btf_kind_operations {
+	s32 (*check_meta)(struct btf_verifier_env *env,
+			  const struct btf_type *t,
+			  u32 meta_left);
+	void (*log_details)(struct btf_verifier_env *env,
+			    const struct btf_type *t);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+	if (encoding == 0)
+		return "(none)";
+	else if (encoding == BTF_INT_SIGNED)
+		return "SIGNED";
+	else if (encoding == BTF_INT_CHAR)
+		return "CHAR";
+	else if (encoding == BTF_INT_BOOL)
+		return "BOOL";
+	else if (encoding == BTF_INT_VARARGS)
+		return "VARARGS";
+	else
+		return "UNKN";
+}
+
+static u16 btf_type_vlen(const struct btf_type *t)
+{
+	return BTF_INFO_VLEN(t->info);
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+	return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+	return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_member *btf_type_member(const struct btf_type *t)
+{
+	return (const struct btf_member *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+	return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+	return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+	return !BTF_STR_TBL_ELF_ID(offset) &&
+		BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+}
+
+static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+	if (!BTF_STR_OFFSET(offset))
+		return "(anon)";
+	else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+		return &btf->strings[BTF_STR_OFFSET(offset)];
+	else
+		return "(invalid-name-offset)";
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+					      const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+					    const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+						   const struct btf_type *t,
+						   bool log_details,
+						   const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	u8 kind = BTF_INFO_KIND(t->info);
+	struct btf *btf = env->btf;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	__btf_verifier_log(log, "[%u] %s %s%s",
+			   env->log_type_id,
+			   btf_kind_str[kind],
+			   btf_name_by_offset(btf, t->name),
+			   log_details ? " " : "");
+
+	if (log_details)
+		btf_type_ops(t)->log_details(env, t);
+
+	if (fmt && *fmt) {
+		__btf_verifier_log(log, " ");
+		va_start(args, fmt);
+		bpf_verifier_vlog(log, fmt, args);
+		va_end(args);
+	}
+
+	__btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+	__btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+	__btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+				    const struct btf_type *struct_type,
+				    const struct btf_member *member,
+				    const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	struct btf *btf = env->btf;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+			   btf_name_by_offset(btf, member->name),
+			   member->type, member->offset);
+
+	if (fmt && *fmt) {
+		__btf_verifier_log(log, " ");
+		va_start(args, fmt);
+		bpf_verifier_vlog(log, fmt, args);
+		va_end(args);
+	}
+
+	__btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+{
+	struct bpf_verifier_log *log = &env->log;
+	const struct btf *btf = env->btf;
+	const struct btf_header *hdr;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	hdr = btf->hdr;
+	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+	__btf_verifier_log(log, "version: %u\n", hdr->version);
+	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+	__btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
+	__btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
+	__btf_verifier_log(log, "label_off: %u\n", hdr->label_off);
+	__btf_verifier_log(log, "object_off: %u\n", hdr->object_off);
+	__btf_verifier_log(log, "func_off: %u\n", hdr->func_off);
+	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+	__btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+	struct btf *btf = env->btf;
+
+	/* < 2 because +1 for btf_void which is always in btf->types[0].
+	 * btf_void is not accounted in btf->nr_types because btf_void
+	 * does not come from the BTF file.
+	 */
+	if (btf->types_size - btf->nr_types < 2) {
+		/* Expand 'types' array */
+
+		struct btf_type **new_types;
+		u32 expand_by, new_size;
+
+		if (btf->types_size == BTF_MAX_NR_TYPES) {
+			btf_verifier_log(env, "Exceeded max num of types");
+			return -E2BIG;
+		}
+
+		expand_by = max_t(u32, btf->types_size >> 2, 16);
+		new_size = min_t(u32, BTF_MAX_NR_TYPES,
+				 btf->types_size + expand_by);
+
+		new_types = kvzalloc(new_size * sizeof(*new_types),
+				     GFP_KERNEL | __GFP_NOWARN);
+		if (!new_types)
+			return -ENOMEM;
+
+		if (btf->nr_types == 0)
+			new_types[0] = &btf_void;
+		else
+			memcpy(new_types, btf->types,
+			       sizeof(*btf->types) * (btf->nr_types + 1));
+
+		kvfree(btf->types);
+		btf->types = new_types;
+		btf->types_size = new_size;
+	}
+
+	btf->types[++(btf->nr_types)] = t;
+
+	return 0;
+}
+
+static void btf_free(struct btf *btf)
+{
+	kvfree(btf->types);
+	kvfree(btf->data);
+	kfree(btf);
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+	kfree(env);
+}
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 meta_left)
+{
+	u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+	u16 encoding;
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	int_data = btf_type_int(t);
+	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+	if (nr_bits > BITS_PER_U64) {
+		btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+				      BITS_PER_U64);
+		return -EINVAL;
+	}
+
+	if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+		btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+		return -EINVAL;
+	}
+
+	encoding = BTF_INT_ENCODING(int_data);
+	if (encoding &&
+	    encoding != BTF_INT_SIGNED &&
+	    encoding != BTF_INT_CHAR &&
+	    encoding != BTF_INT_BOOL &&
+	    encoding != BTF_INT_VARARGS) {
+		btf_verifier_log_type(env, t, "Unsupported encoding");
+		return -ENOTSUPP;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+			const struct btf_type *t)
+{
+	int int_data = btf_type_int(t);
+
+	btf_verifier_log(env,
+			 "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+			 t->size, BTF_INT_OFFSET(int_data),
+			 BTF_INT_BITS(int_data),
+			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static const struct btf_kind_operations int_ops = {
+	.check_meta = btf_int_check_meta,
+	.log_details = btf_int_log,
+};
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+				   const struct btf_type *t,
+				   u32 meta_left)
+{
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (BTF_TYPE_PARENT(t->type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+			     const struct btf_type *t)
+{
+	btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static struct btf_kind_operations modifier_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.log_details = btf_ref_type_log,
+};
+
+static struct btf_kind_operations ptr_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.log_details = btf_ref_type_log,
+};
+
+static struct btf_kind_operations fwd_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.log_details = btf_ref_type_log,
+};
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+				const struct btf_type *t,
+				u32 meta_left)
+{
+	const struct btf_array *array = btf_type_array(t);
+	u32 meta_needed = sizeof(*array);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	/* We are a little forgiving on array->index_type since
+	 * the kernel is not using it.
+	 */
+	/* Array elem cannot be in type void,
+	 * so !array->type is not allowed.
+	 */
+	if (!array->type || BTF_TYPE_PARENT(array->type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+			  const struct btf_type *t)
+{
+	const struct btf_array *array = btf_type_array(t);
+
+	btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+			 array->type, array->index_type, array->nelems);
+}
+
+static struct btf_kind_operations array_ops = {
+	.check_meta = btf_array_check_meta,
+	.log_details = btf_array_log,
+};
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+				 const struct btf_type *t,
+				 u32 meta_left)
+{
+	bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+	const struct btf_member *member;
+	struct btf *btf = env->btf;
+	u32 struct_size = t->size;
+	u32 meta_needed;
+	u16 i;
+
+	meta_needed = btf_type_vlen(t) * sizeof(*member);
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for_each_member(i, t, member) {
+		if (!btf_name_offset_valid(btf, member->name)) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid member name_offset:%u",
+						member->name);
+			return -EINVAL;
+		}
+
+		/* A member cannot be in type void */
+		if (!member->type || BTF_TYPE_PARENT(member->type)) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid type_id");
+			return -EINVAL;
+		}
+
+		if (is_union && member->offset) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid member bits_offset");
+			return -EINVAL;
+		}
+
+		if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+			btf_verifier_log_member(env, t, member,
+						"Memmber bits_offset exceeds its struct size");
+			return -EINVAL;
+		}
+
+		btf_verifier_log_member(env, t, member, NULL);
+	}
+
+	return meta_needed;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+			   const struct btf_type *t)
+{
+	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static struct btf_kind_operations struct_ops = {
+	.check_meta = btf_struct_check_meta,
+	.log_details = btf_struct_log,
+};
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+			       const struct btf_type *t,
+			       u32 meta_left)
+{
+	const struct btf_enum *enums = btf_type_enum(t);
+	struct btf *btf = env->btf;
+	u16 i, nr_enums;
+	u32 meta_needed;
+
+	nr_enums = btf_type_vlen(t);
+	meta_needed = nr_enums * sizeof(*enums);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (t->size != sizeof(int)) {
+		btf_verifier_log_type(env, t, "Expected size:%zu",
+				      sizeof(int));
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for (i = 0; i < nr_enums; i++) {
+		if (!btf_name_offset_valid(btf, enums[i].name)) {
+			btf_verifier_log(env, "\tInvalid name_offset:%u",
+					 enums[i].name);
+			return -EINVAL;
+		}
+
+		btf_verifier_log(env, "\t%s val=%d\n",
+				 btf_name_by_offset(btf, enums[i].name),
+				 enums[i].val);
+	}
+
+	return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+			 const struct btf_type *t)
+{
+	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static struct btf_kind_operations enum_ops = {
+	.check_meta = btf_enum_check_meta,
+	.log_details = btf_enum_log,
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+	[BTF_KIND_INT] = &int_ops,
+	[BTF_KIND_PTR] = &ptr_ops,
+	[BTF_KIND_ARRAY] = &array_ops,
+	[BTF_KIND_STRUCT] = &struct_ops,
+	[BTF_KIND_UNION] = &struct_ops,
+	[BTF_KIND_ENUM] = &enum_ops,
+	[BTF_KIND_FWD] = &fwd_ops,
+	[BTF_KIND_TYPEDEF] = &modifier_ops,
+	[BTF_KIND_VOLATILE] = &modifier_ops,
+	[BTF_KIND_CONST] = &modifier_ops,
+	[BTF_KIND_RESTRICT] = &modifier_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+			  const struct btf_type *t,
+			  u32 meta_left)
+{
+	u32 saved_meta_left = meta_left;
+	s32 var_meta_size;
+
+	if (meta_left < sizeof(*t)) {
+		btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+				 env->log_type_id, meta_left, sizeof(*t));
+		return -EINVAL;
+	}
+	meta_left -= sizeof(*t);
+
+	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+		btf_verifier_log(env, "[%u] Invalid kind:%u",
+				 env->log_type_id, BTF_INFO_KIND(t->info));
+		return -EINVAL;
+	}
+
+	if (!btf_name_offset_valid(env->btf, t->name)) {
+		btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+				 env->log_type_id, t->name);
+		return -EINVAL;
+	}
+
+	var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+	if (var_meta_size < 0)
+		return var_meta_size;
+
+	meta_left -= var_meta_size;
+
+	return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	struct btf_header *hdr;
+	void *cur, *end;
+
+	hdr = btf->hdr;
+	cur = btf->nohdr_data + hdr->type_off;
+	end = btf->nohdr_data + hdr->str_off;
+
+	env->log_type_id = 1;
+	while (cur < end) {
+		struct btf_type *t = cur;
+		s32 meta_size;
+
+		meta_size = btf_check_meta(env, t, end - cur);
+		if (meta_size < 0)
+			return meta_size;
+
+		btf_add_type(env, t);
+		cur += meta_size;
+		env->log_type_id++;
+	}
+
+	return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+	return btf_check_all_metas(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+	const struct btf_header *hdr;
+	struct btf *btf = env->btf;
+	const char *start, *end;
+
+	hdr = btf->hdr;
+	start = btf->nohdr_data + hdr->str_off;
+	end = start + hdr->str_len;
+
+	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+	    start[0] || end[-1]) {
+		btf_verifier_log(env, "Invalid string section");
+		return -EINVAL;
+	}
+
+	btf->strings = start;
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env)
+{
+	const struct btf_header *hdr;
+	struct btf *btf = env->btf;
+	u32 meta_left;
+
+	if (btf->data_size < sizeof(*hdr)) {
+		btf_verifier_log(env, "btf_header not found");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_hdr(env);
+
+	hdr = btf->hdr;
+	if (hdr->magic != BTF_MAGIC) {
+		btf_verifier_log(env, "Invalid magic");
+		return -EINVAL;
+	}
+
+	if (hdr->version != BTF_VERSION) {
+		btf_verifier_log(env, "Unsupported version");
+		return -ENOTSUPP;
+	}
+
+	if (hdr->flags) {
+		btf_verifier_log(env, "Unsupported flags");
+		return -ENOTSUPP;
+	}
+
+	meta_left = btf->data_size - sizeof(*hdr);
+	if (!meta_left) {
+		btf_verifier_log(env, "No data");
+		return -EINVAL;
+	}
+
+	if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off ||
+	    /* Type section must align to 4 bytes */
+	    hdr->type_off & (sizeof(u32) - 1)) {
+		btf_verifier_log(env, "Invalid type_off");
+		return -EINVAL;
+	}
+
+	if (meta_left < hdr->str_off ||
+	    meta_left - hdr->str_off < hdr->str_len) {
+		btf_verifier_log(env, "Invalid str_off or str_len");
+		return -EINVAL;
+	}
+
+	btf->nohdr_data = btf->hdr + 1;
+
+	return 0;
+}
+
+static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+			     u32 log_level, char __user *log_ubuf, u32 log_size)
+{
+	struct btf_verifier_env *env = NULL;
+	struct bpf_verifier_log *log;
+	struct btf *btf = NULL;
+	u8 *data;
+	int err;
+
+	if (btf_data_size > BTF_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+	if (!env)
+		return ERR_PTR(-ENOMEM);
+
+	log = &env->log;
+	if (log_level || log_ubuf || log_size) {
+		/* user requested verbose verifier output
+		 * and supplied buffer to store the verification trace
+		 */
+		log->level = log_level;
+		log->ubuf = log_ubuf;
+		log->len_total = log_size;
+
+		/* log attributes have to be sane */
+		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+		    !log->level || !log->ubuf) {
+			err = -EINVAL;
+			goto errout;
+		}
+	}
+
+	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+	if (!btf) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!data) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	btf->data = data;
+	btf->data_size = btf_data_size;
+
+	if (copy_from_user(data, btf_data, btf_data_size)) {
+		err = -EFAULT;
+		goto errout;
+	}
+
+	env->btf = btf;
+
+	err = btf_parse_hdr(env);
+	if (err)
+		goto errout;
+
+	err = btf_parse_str_sec(env);
+	if (err)
+		goto errout;
+
+	err = btf_parse_type_sec(env);
+	if (err)
+		goto errout;
+
+	if (!err && log->level && bpf_verifier_log_full(log)) {
+		err = -ENOSPC;
+		goto errout;
+	}
+
+	if (!err) {
+		btf_verifier_env_free(env);
+		return btf;
+	}
+
+errout:
+	btf_verifier_env_free(env);
+	if (btf)
+		btf_free(btf);
+	return ERR_PTR(err);
+}
-- 
cgit v1.2.3-55-g7522


From f56a653c1fd13a197076dec4461c656fd2adec73 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:01 -0700
Subject: bpf: btf: Add BPF_BTF_LOAD command

This patch adds a BPF_BTF_LOAD command which
1) loads and verifies the BTF (implemented in earlier patches)
2) returns a BTF fd to userspace.  In the next patch, the
   BTF fd can be specified during BPF_MAP_CREATE.

It currently limits to CAP_SYS_ADMIN.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h      |  4 +++
 include/uapi/linux/bpf.h |  9 +++++++
 kernel/bpf/btf.c         | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 17 ++++++++++++
 4 files changed, 97 insertions(+)

(limited to 'include/uapi')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index d8bdab0280ba..a7c7072535ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -8,7 +8,11 @@
 
 struct btf;
 struct btf_type;
+union bpf_attr;
 
+void btf_put(struct btf *btf);
+int btf_new_fd(const union bpf_attr *attr);
+struct btf *btf_get_by_fd(int fd);
 /* Figure out the size of a type_id.  If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
  *
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9a2d1a04eb24..795bcd577750 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
 	BPF_OBJ_GET_INFO_BY_FD,
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
 };
 
 enum bpf_map_type {
@@ -363,6 +364,14 @@ union bpf_attr {
 		__u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 10ee41589da2..2322340694cf 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7,6 +7,8 @@
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/bpf_verifier.h>
@@ -190,6 +192,7 @@ struct btf {
 	u32 nr_types;
 	u32 types_size;
 	u32 data_size;
+	refcount_t refcnt;
 };
 
 enum verifier_phase {
@@ -604,6 +607,17 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
+static void btf_get(struct btf *btf)
+{
+	refcount_inc(&btf->refcnt);
+}
+
+void btf_put(struct btf *btf)
+{
+	if (btf && refcount_dec_and_test(&btf->refcnt))
+		btf_free(btf);
+}
+
 static int env_resolve_init(struct btf_verifier_env *env)
 {
 	struct btf *btf = env->btf;
@@ -1963,6 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	if (!err) {
 		btf_verifier_env_free(env);
+		btf_get(btf);
 		return btf;
 	}
 
@@ -1980,3 +1995,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 
 	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
 }
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+	btf_put(filp->private_data);
+	return 0;
+}
+
+static const struct file_operations btf_fops = {
+	.release	= btf_release,
+};
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+	struct btf *btf;
+	int fd;
+
+	btf = btf_parse(u64_to_user_ptr(attr->btf),
+			attr->btf_size, attr->btf_log_level,
+			u64_to_user_ptr(attr->btf_log_buf),
+			attr->btf_log_size);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = anon_inode_getfd("btf", &btf_fops, btf,
+			      O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+	struct btf *btf;
+	struct fd f;
+
+	f = fdget(fd);
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &btf_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	btf = f.file->private_data;
+	btf_get(btf);
+	fdput(f);
+
+	return btf;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..cd8ebadc66eb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
@@ -2023,6 +2024,19 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2103,6 +2117,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3-55-g7522


From a26ca7c982cb576749cbdd01e8ecde4bf010d60a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Wed, 18 Apr 2018 15:56:03 -0700
Subject: bpf: btf: Add pretty print support to the basic arraymap

This patch adds pretty print support to the basic arraymap.
Support for other bpf maps can be added later.

This patch adds new attrs to the BPF_MAP_CREATE command to allow
specifying the btf_fd, btf_key_id and btf_value_id.  The
BPF_MAP_CREATE can then associate the btf to the map if
the creating map supports BTF.

A BTF supported map needs to implement two new map ops,
map_seq_show_elem() and map_check_btf().  This patch has
implemented these new map ops for the basic arraymap.

It also adds file_operations, bpffs_map_fops, to the pinned
map such that the pinned map can be opened and read.
After that, the user has an intuitive way to do
"cat bpffs/pathto/a-pinned-map" instead of getting
an error.

bpffs_map_fops should not be extended further to support
other operations.  Other operations (e.g. write/key-lookup...)
should be realized by the userspace tools (e.g. bpftool) through
the BPF_OBJ_GET_INFO_BY_FD, map's lookup/update interface...etc.
Follow up patches will allow the userspace to obtain
the BTF from a map-fd.

Here is a sample output when reading a pinned arraymap
with the following map's value:

struct map_value {
	int count_a;
	int count_b;
};

cat /sys/fs/bpf/pinned_array_map:

0: {1,2}
1: {3,4}
2: {5,6}
...

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  20 +++++-
 include/uapi/linux/bpf.h |   3 +
 kernel/bpf/arraymap.c    |  50 +++++++++++++++
 kernel/bpf/inode.c       | 156 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c     |  32 +++++++++-
 5 files changed, 254 insertions(+), 7 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd0ee92..ee5275e7d4df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,6 +22,8 @@ struct perf_event;
 struct bpf_prog;
 struct bpf_map;
 struct sock;
+struct seq_file;
+struct btf;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
@@ -43,10 +45,14 @@ struct bpf_map_ops {
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
+	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
+				  struct seq_file *m);
+	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
+			     u32 key_type_id, u32 value_type_id);
 };
 
 struct bpf_map {
-	/* 1st cacheline with read-mostly members of which some
+	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
 	 */
 	const struct bpf_map_ops *ops ____cacheline_aligned;
@@ -62,10 +68,13 @@ struct bpf_map {
 	u32 pages;
 	u32 id;
 	int numa_node;
+	u32 btf_key_id;
+	u32 btf_value_id;
+	struct btf *btf;
 	bool unpriv_array;
-	/* 7 bytes hole */
+	/* 55 bytes hole */
 
-	/* 2nd cacheline with misc members to avoid false sharing
+	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
 	struct user_struct *user ____cacheline_aligned;
@@ -100,6 +109,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
 	return container_of(map, struct bpf_offloaded_map, map);
 }
 
+static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
+{
+	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
+}
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 795bcd577750..c8383a289f7b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -280,6 +280,9 @@ union bpf_attr {
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_id;	/* BTF type_id of the key */
+		__u32	btf_value_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..02a189339381 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,11 +11,13 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
 #include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
 
 #include "map_in_map.h"
 
@@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map)
 	bpf_map_area_free(array);
 }
 
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+				    struct seq_file *m)
+{
+	void *value;
+
+	rcu_read_lock();
+
+	value = array_map_lookup_elem(map, key);
+	if (!value) {
+		rcu_read_unlock();
+		return;
+	}
+
+	seq_printf(m, "%u: ", *(u32 *)key);
+	btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+	seq_puts(m, "\n");
+
+	rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			       u32 btf_key_id, u32 btf_value_id)
+{
+	const struct btf_type *key_type, *value_type;
+	u32 key_size, value_size;
+	u32 int_data;
+
+	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EINVAL;
+
+	int_data = *(u32 *)(key_type + 1);
+	/* bpf array can only take a u32 key.  This check makes
+	 * sure that the btf matches the attr used during map_create.
+	 */
+	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
+	    BTF_INT_OFFSET(int_data))
+		return -EINVAL;
+
+	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+	if (!value_type || value_size > map->value_size)
+		return -EINVAL;
+
+	return 0;
+}
+
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
+	.map_seq_show_elem = array_map_seq_show_elem,
+	.map_check_btf = array_map_check_btf,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bf6da59ae0d0..a41343009ccc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
+struct map_iter {
+	void *key;
+	bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+	return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+	return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+	if (iter) {
+		kfree(iter->key);
+		kfree(iter);
+	}
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+	struct map_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+	if (!iter)
+		goto error;
+
+	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!iter->key)
+		goto error;
+
+	return iter;
+
+error:
+	map_iter_free(iter);
+	return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (map_iter(m)->done)
+		return NULL;
+
+	if (unlikely(v == SEQ_START_TOKEN))
+		goto done;
+
+	if (map->ops->map_get_next_key(map, key, key)) {
+		map_iter(m)->done = true;
+		return NULL;
+	}
+
+done:
+	++(*pos);
+	return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (map_iter(m)->done)
+		return NULL;
+
+	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (unlikely(v == SEQ_START_TOKEN)) {
+		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+		seq_puts(m, "# WARNING!! The output format will change\n");
+	} else {
+		map->ops->map_seq_show_elem(map, key, m);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+	.start	= map_seq_start,
+	.next	= map_seq_next,
+	.show	= map_seq_show,
+	.stop	= map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+	struct bpf_map *map = inode->i_private;
+	struct map_iter *iter;
+	struct seq_file *m;
+	int err;
+
+	iter = map_iter_alloc(map);
+	if (!iter)
+		return -ENOMEM;
+
+	err = seq_open(file, &bpffs_map_seq_ops);
+	if (err) {
+		map_iter_free(iter);
+		return err;
+	}
+
+	m = file->private_data;
+	m->private = iter;
+
+	return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	map_iter_free(map_iter(m));
+
+	return seq_release(inode, file);
+}
+
+/* bpffs_map_fops should only implement the basic
+ * read operation for a BPF map.  The purpose is to
+ * provide a simple user intuitive way to do
+ * "cat bpffs/pathto/a-pinned-map".
+ *
+ * Other operations (e.g. write, lookup...) should be realized by
+ * the userspace tools (e.g. bpftool) through the
+ * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
+ * interface.
+ */
+static const struct file_operations bpffs_map_fops = {
+	.open		= bpffs_map_open,
+	.read		= seq_read,
+	.release	= bpffs_map_release,
+};
+
 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
-			 const struct inode_operations *iops)
+			 const struct inode_operations *iops,
+			 const struct file_operations *fops)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
@@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 		return PTR_ERR(inode);
 
 	inode->i_op = iops;
+	inode->i_fop = fops;
 	inode->i_private = raw;
 
 	bpf_dentry_finalize(dentry, inode, dir);
@@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 
 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL);
 }
 
 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
+	struct bpf_map *map = arg;
+
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+			     map->btf ? &bpffs_map_fops : NULL);
 }
 
 static struct dentry *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0a4924a0a8da..fe23dc5a3ec4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,6 +27,7 @@
 #include <linux/cred.h>
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
+#include <linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -251,6 +252,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_uncharge_memlock(map);
 	security_bpf_map_free(map);
+	btf_put(map->btf);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -416,7 +418,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -450,6 +452,33 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
+	if (bpf_map_support_seq_show(map) &&
+	    (attr->btf_key_id || attr->btf_value_id)) {
+		struct btf *btf;
+
+		if (!attr->btf_key_id || !attr->btf_value_id) {
+			err = -EINVAL;
+			goto free_map_nouncharge;
+		}
+
+		btf = btf_get_by_fd(attr->btf_fd);
+		if (IS_ERR(btf)) {
+			err = PTR_ERR(btf);
+			goto free_map_nouncharge;
+		}
+
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
+					      attr->btf_value_id);
+		if (err) {
+			btf_put(btf);
+			goto free_map_nouncharge;
+		}
+
+		map->btf = btf;
+		map->btf_key_id = attr->btf_key_id;
+		map->btf_value_id = attr->btf_value_id;
+	}
+
 	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
@@ -482,6 +511,7 @@ free_map:
 free_map_sec:
 	security_bpf_map_free(map);
 free_map_nouncharge:
+	btf_put(map->btf);
 	map->ops->map_free(map);
 	return err;
 }
-- 
cgit v1.2.3-55-g7522


From a4dfa72d0acd1c99a160e25c099849ae37ad13fd Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
Date: Thu, 19 Apr 2018 11:06:18 +0200
Subject: tipc: set default MTU for UDP media

Currently, all bearers are configured with MTU value same as the
underlying L2 device. However, in case of bearers with media type
UDP, higher throughput is possible with a fixed and higher emulated
MTU value than adapting to the underlying L2 MTU.

In this commit, we introduce a parameter mtu in struct tipc_media
and a default value is set for UDP. A default value of 14k
was determined by experimentation and found to have a higher throughput
than 16k. MTU for UDP bearers are assigned the above set value of
media MTU.

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_config.h | 5 +++++
 net/tipc/udp_media.c             | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h
index 3f29e3c8ed06..4b2c93b1934c 100644
--- a/include/uapi/linux/tipc_config.h
+++ b/include/uapi/linux/tipc_config.h
@@ -185,6 +185,11 @@
 #define TIPC_DEF_LINK_WIN 50
 #define TIPC_MAX_LINK_WIN 8191
 
+/*
+ * Default MTU for UDP media
+ */
+
+#define TIPC_DEF_LINK_UDP_MTU 14000
 
 struct tipc_node_info {
 	__be32 addr;			/* network address of node */
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e7d91f5d5cae..9783101bc4a9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 			err = -EINVAL;
 			goto err;
 		}
-		b->mtu = dev->mtu - sizeof(struct iphdr)
-			- sizeof(struct udphdr);
+		b->mtu = b->media->mtu;
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (local.proto == htons(ETH_P_IPV6)) {
 		udp_conf.family = AF_INET6;
@@ -803,6 +802,7 @@ struct tipc_media udp_media_info = {
 	.priority	= TIPC_DEF_LINK_PRI,
 	.tolerance	= TIPC_DEF_LINK_TOL,
 	.window		= TIPC_DEF_LINK_WIN,
+	.mtu		= TIPC_DEF_LINK_UDP_MTU,
 	.type_id	= TIPC_MEDIA_TYPE_UDP,
 	.hwaddr_len	= 0,
 	.name		= "udp"
-- 
cgit v1.2.3-55-g7522


From 901271e0403af638c224987c2a4e55cebade7e91 Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
Date: Thu, 19 Apr 2018 11:06:19 +0200
Subject: tipc: implement configuration of UDP media MTU

In previous commit, we changed the default emulated MTU for UDP bearers
to 14k.

This commit adds the functionality to set/change the default value
by configuring new MTU for UDP media. UDP bearer(s) have to be disabled
and enabled back for the new MTU to take effect.

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h |  1 +
 net/tipc/bearer.c                 | 13 +++++++++++++
 net/tipc/bearer.h                 |  3 +++
 net/tipc/udp_media.h              | 14 ++++++++++++++
 4 files changed, 31 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index 0affb682e5e3..85c11982c89b 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -266,6 +266,7 @@ enum {
 	TIPC_NLA_PROP_PRIO,		/* u32 */
 	TIPC_NLA_PROP_TOL,		/* u32 */
 	TIPC_NLA_PROP_WIN,		/* u32 */
+	TIPC_NLA_PROP_MTU,		/* u32 */
 
 	__TIPC_NLA_PROP_MAX,
 	TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index f7d47c89d658..a22caf9e5a18 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -1029,6 +1029,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
 		goto prop_msg_full;
 	if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
 		goto prop_msg_full;
+	if (media->type_id == TIPC_MEDIA_TYPE_UDP)
+		if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu))
+			goto prop_msg_full;
 
 	nla_nest_end(msg->skb, prop);
 	nla_nest_end(msg->skb, attrs);
@@ -1158,6 +1161,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
 			m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
 		if (props[TIPC_NLA_PROP_WIN])
 			m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+		if (props[TIPC_NLA_PROP_MTU]) {
+			if (m->type_id != TIPC_MEDIA_TYPE_UDP)
+				return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+			if (tipc_udp_mtu_bad(nla_get_u32
+					     (props[TIPC_NLA_PROP_MTU])))
+				return -EINVAL;
+			m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+#endif
+		}
 	}
 
 	return 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 6efcee63a381..394290cbbb1d 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -94,6 +94,8 @@ struct tipc_bearer;
  * @priority: default link (and bearer) priority
  * @tolerance: default time (in ms) before declaring link failure
  * @window: default window (in packets) before declaring link congestion
+ * @mtu: max packet size bearer can support for media type not dependent on
+ * underlying device MTU
  * @type_id: TIPC media identifier
  * @hwaddr_len: TIPC media address len
  * @name: media name
@@ -118,6 +120,7 @@ struct tipc_media {
 	u32 priority;
 	u32 tolerance;
 	u32 window;
+	u32 mtu;
 	u32 type_id;
 	u32 hwaddr_len;
 	char name[TIPC_MAX_MEDIA_NAME];
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
index 281bbae87726..e7455cc73e16 100644
--- a/net/tipc/udp_media.h
+++ b/net/tipc/udp_media.h
@@ -38,9 +38,23 @@
 #ifndef _TIPC_UDP_MEDIA_H
 #define _TIPC_UDP_MEDIA_H
 
+#include <linux/ip.h>
+#include <linux/udp.h>
+
 int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
 int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
 int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
 
+/* check if configured MTU is too low for tipc headers */
+static inline bool tipc_udp_mtu_bad(u32 mtu)
+{
+	if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) +
+	    sizeof(struct udphdr)))
+		return false;
+
+	pr_warn("MTU too low for tipc bearer\n");
+	return true;
+}
+
 #endif
 #endif
-- 
cgit v1.2.3-55-g7522


From fbcf93ebcaef7d09881ee308b52cd84f5e43c622 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Sat, 21 Apr 2018 09:48:23 -0700
Subject: bpf: btf: Clean up btf.h in uapi

This patch cleans up btf.h in uapi:
1) Rename "name" to "name_off" to better reflect it is an offset to the
   string section instead of a char array.
2) Remove unused value BTF_FLAGS_COMPR and BTF_MAGIC_SWAP

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h       |  8 +++-----
 kernel/bpf/btf.c               | 20 ++++++++++----------
 tools/include/uapi/linux/btf.h |  8 +++-----
 tools/lib/bpf/btf.c            |  2 +-
 4 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 74a30b1090df..bcb56ee47014 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -6,9 +6,7 @@
 #include <linux/types.h>
 
 #define BTF_MAGIC	0xeB9F
-#define BTF_MAGIC_SWAP	0x9FeB
 #define BTF_VERSION	1
-#define BTF_FLAGS_COMPR	0x01
 
 struct btf_header {
 	__u16	magic;
@@ -43,7 +41,7 @@ struct btf_header {
 #define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
 
 struct btf_type {
-	__u32 name;
+	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
@@ -105,7 +103,7 @@ struct btf_type {
  * info in "struct btf_type").
  */
 struct btf_enum {
-	__u32	name;
+	__u32	name_off;
 	__s32	val;
 };
 
@@ -122,7 +120,7 @@ struct btf_array {
  * "struct btf_type").
  */
 struct btf_member {
-	__u32	name;
+	__u32	name_off;
 	__u32	type;
 	__u32	offset;	/* offset in bits */
 };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index eb56ac760547..22e1046a1a86 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -473,7 +473,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
 	__btf_verifier_log(log, "[%u] %s %s%s",
 			   env->log_type_id,
 			   btf_kind_str[kind],
-			   btf_name_by_offset(btf, t->name),
+			   btf_name_by_offset(btf, t->name_off),
 			   log_details ? " " : "");
 
 	if (log_details)
@@ -517,7 +517,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 		btf_verifier_log_type(env, struct_type, NULL);
 
 	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
-			   btf_name_by_offset(btf, member->name),
+			   btf_name_by_offset(btf, member->name_off),
 			   member->type, member->offset);
 
 	if (fmt && *fmt) {
@@ -1419,10 +1419,10 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	btf_verifier_log_type(env, t, NULL);
 
 	for_each_member(i, t, member) {
-		if (!btf_name_offset_valid(btf, member->name)) {
+		if (!btf_name_offset_valid(btf, member->name_off)) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid member name_offset:%u",
-						member->name);
+						member->name_off);
 			return -EINVAL;
 		}
 
@@ -1605,14 +1605,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 	btf_verifier_log_type(env, t, NULL);
 
 	for (i = 0; i < nr_enums; i++) {
-		if (!btf_name_offset_valid(btf, enums[i].name)) {
+		if (!btf_name_offset_valid(btf, enums[i].name_off)) {
 			btf_verifier_log(env, "\tInvalid name_offset:%u",
-					 enums[i].name);
+					 enums[i].name_off);
 			return -EINVAL;
 		}
 
 		btf_verifier_log(env, "\t%s val=%d\n",
-				 btf_name_by_offset(btf, enums[i].name),
+				 btf_name_by_offset(btf, enums[i].name_off),
 				 enums[i].val);
 	}
 
@@ -1636,7 +1636,7 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
 	for (i = 0; i < nr_enums; i++) {
 		if (v == enums[i].val) {
 			seq_printf(m, "%s",
-				   btf_name_by_offset(btf, enums[i].name));
+				   btf_name_by_offset(btf, enums[i].name_off));
 			return;
 		}
 	}
@@ -1687,9 +1687,9 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (!btf_name_offset_valid(env->btf, t->name)) {
+	if (!btf_name_offset_valid(env->btf, t->name_off)) {
 		btf_verifier_log(env, "[%u] Invalid name_offset:%u",
-				 env->log_type_id, t->name);
+				 env->log_type_id, t->name_off);
 		return -EINVAL;
 	}
 
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 74a30b1090df..bcb56ee47014 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -6,9 +6,7 @@
 #include <linux/types.h>
 
 #define BTF_MAGIC	0xeB9F
-#define BTF_MAGIC_SWAP	0x9FeB
 #define BTF_VERSION	1
-#define BTF_FLAGS_COMPR	0x01
 
 struct btf_header {
 	__u16	magic;
@@ -43,7 +41,7 @@ struct btf_header {
 #define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
 
 struct btf_type {
-	__u32 name;
+	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
@@ -105,7 +103,7 @@ struct btf_type {
  * info in "struct btf_type").
  */
 struct btf_enum {
-	__u32	name;
+	__u32	name_off;
 	__s32	val;
 };
 
@@ -122,7 +120,7 @@ struct btf_array {
  * "struct btf_type").
  */
 struct btf_member {
-	__u32	name;
+	__u32	name_off;
 	__u32	type;
 	__u32	offset;	/* offset in bits */
 };
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 58b6255abc7a..2bac710e3194 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -281,7 +281,7 @@ int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
 
 	for (i = 1; i <= btf->nr_types; i++) {
 		const struct btf_type *t = btf->types[i];
-		const char *name = btf_name_by_offset(btf, t->name);
+		const char *name = btf_name_by_offset(btf, t->name_off);
 
 		if (name && !strcmp(type_name, name))
 			return i;
-- 
cgit v1.2.3-55-g7522


From 2eb0f624b709e78ec8e2f4c3412947703db99301 Mon Sep 17 00:00:00 2001
From: Thierry Du Tre
Date: Wed, 4 Apr 2018 15:38:22 +0200
Subject: netfilter: add NAT support for shifted portmap ranges

This is a patch proposal to support shifted ranges in portmaps.  (i.e. tcp/udp
incoming port 5000-5100 on WAN redirected to LAN 192.168.1.5:2000-2100)

Currently DNAT only works for single port or identical port ranges.  (i.e.
ports 5000-5100 on WAN interface redirected to a LAN host while original
destination port is not altered) When different port ranges are configured,
either 'random' mode should be used, or else all incoming connections are
mapped onto the first port in the redirect range. (in described example
WAN:5000-5100 will all be mapped to 192.168.1.5:2000)

This patch introduces a new mode indicated by flag NF_NAT_RANGE_PROTO_OFFSET
which uses a base port value to calculate an offset with the destination port
present in the incoming stream. That offset is then applied as index within the
redirect port range (index modulo rangewidth to handle range overflow).

In described example the base port would be 5000. An incoming stream with
destination port 5004 would result in an offset value 4 which means that the
NAT'ed stream will be using destination port 2004.

Other possibilities include deterministic mapping of larger or multiple ranges
to a smaller range : WAN:5000-5999 -> LAN:5000-5099 (maps WAN port 5*xx to port
51xx)

This patch does not change any current behavior. It just adds new NAT proto
range functionality which must be selected via the specific flag when intended
to use.

A patch for iptables (libipt_DNAT.c + libip6t_DNAT.c) will also be proposed
which makes this functionality immediately available.

Signed-off-by: Thierry Du Tre <thierry@dtsystems.be>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_nat_masquerade.h |  2 +-
 include/net/netfilter/ipv6/nf_nat_masquerade.h |  2 +-
 include/net/netfilter/nf_nat.h                 |  2 +-
 include/net/netfilter/nf_nat_l3proto.h         |  4 +-
 include/net/netfilter/nf_nat_l4proto.h         |  8 +--
 include/net/netfilter/nf_nat_redirect.h        |  2 +-
 include/uapi/linux/netfilter/nf_nat.h          | 12 ++++-
 net/ipv4/netfilter/ipt_MASQUERADE.c            |  2 +-
 net/ipv4/netfilter/nf_nat_h323.c               |  4 +-
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       |  4 +-
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c    |  4 +-
 net/ipv4/netfilter/nf_nat_pptp.c               |  2 +-
 net/ipv4/netfilter/nf_nat_proto_gre.c          |  2 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c         |  2 +-
 net/ipv4/netfilter/nft_masq_ipv4.c             |  2 +-
 net/ipv6/netfilter/ip6t_MASQUERADE.c           |  2 +-
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c       |  4 +-
 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c    |  4 +-
 net/ipv6/netfilter/nf_nat_proto_icmpv6.c       |  2 +-
 net/ipv6/netfilter/nft_masq_ipv6.c             |  2 +-
 net/ipv6/netfilter/nft_redir_ipv6.c            |  2 +-
 net/netfilter/nf_nat_core.c                    | 27 +++++-----
 net/netfilter/nf_nat_helper.c                  |  2 +-
 net/netfilter/nf_nat_proto_common.c            |  9 ++--
 net/netfilter/nf_nat_proto_dccp.c              |  2 +-
 net/netfilter/nf_nat_proto_sctp.c              |  2 +-
 net/netfilter/nf_nat_proto_tcp.c               |  2 +-
 net/netfilter/nf_nat_proto_udp.c               |  4 +-
 net/netfilter/nf_nat_proto_unknown.c           |  2 +-
 net/netfilter/nf_nat_redirect.c                |  6 +--
 net/netfilter/nf_nat_sip.c                     |  2 +-
 net/netfilter/nft_nat.c                        |  2 +-
 net/netfilter/xt_NETMAP.c                      |  8 +--
 net/netfilter/xt_REDIRECT.c                    |  2 +-
 net/netfilter/xt_nat.c                         | 72 +++++++++++++++++++++++---
 net/openvswitch/conntrack.c                    |  4 +-
 36 files changed, 145 insertions(+), 71 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h
index ebd869473603..cd24be4c4a99 100644
--- a/include/net/netfilter/ipv4/nf_nat_masquerade.h
+++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h
@@ -6,7 +6,7 @@
 
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
-		       const struct nf_nat_range *range,
+		       const struct nf_nat_range2 *range,
 		       const struct net_device *out);
 
 void nf_nat_masquerade_ipv4_register_notifier(void);
diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h
index 1ed4f2631ed6..0c3b5ebf0bb8 100644
--- a/include/net/netfilter/ipv6/nf_nat_masquerade.h
+++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h
@@ -3,7 +3,7 @@
 #define _NF_NAT_MASQUERADE_IPV6_H_
 
 unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		       const struct net_device *out);
 void nf_nat_masquerade_ipv6_register_notifier(void);
 void nf_nat_masquerade_ipv6_unregister_notifier(void);
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 207a467e7ca6..da3d601cadee 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -39,7 +39,7 @@ struct nf_conn_nat {
 
 /* Set up the info structure to map into this range. */
 unsigned int nf_nat_setup_info(struct nf_conn *ct,
-			       const struct nf_nat_range *range,
+			       const struct nf_nat_range2 *range,
 			       enum nf_nat_manip_type maniptype);
 
 extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct,
diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h
index ce7c2b4e64bb..ac47098a61dc 100644
--- a/include/net/netfilter/nf_nat_l3proto.h
+++ b/include/net/netfilter/nf_nat_l3proto.h
@@ -7,7 +7,7 @@ struct nf_nat_l3proto {
 	u8	l3proto;
 
 	bool	(*in_range)(const struct nf_conntrack_tuple *t,
-			    const struct nf_nat_range *range);
+			    const struct nf_nat_range2 *range);
 
 	u32 	(*secure_port)(const struct nf_conntrack_tuple *t, __be16);
 
@@ -33,7 +33,7 @@ struct nf_nat_l3proto {
 				  struct flowi *fl);
 
 	int	(*nlattr_to_range)(struct nlattr *tb[],
-				   struct nf_nat_range *range);
+				   struct nf_nat_range2 *range);
 };
 
 int nf_nat_l3proto_register(const struct nf_nat_l3proto *);
diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 67835ff8a2d9..b4d6b29bca62 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -34,12 +34,12 @@ struct nf_nat_l4proto {
 	 */
 	void (*unique_tuple)(const struct nf_nat_l3proto *l3proto,
 			     struct nf_conntrack_tuple *tuple,
-			     const struct nf_nat_range *range,
+			     const struct nf_nat_range2 *range,
 			     enum nf_nat_manip_type maniptype,
 			     const struct nf_conn *ct);
 
 	int (*nlattr_to_range)(struct nlattr *tb[],
-			       struct nf_nat_range *range);
+			       struct nf_nat_range2 *range);
 };
 
 /* Protocol registration. */
@@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 
 void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				 struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_range *range,
+				 const struct nf_nat_range2 *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct, u16 *rover);
 
 int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
-				   struct nf_nat_range *range);
+				   struct nf_nat_range2 *range);
 
 #endif /*_NF_NAT_L4PROTO_H*/
diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h
index 5ddabb08c472..c129aacc8ae8 100644
--- a/include/net/netfilter/nf_nat_redirect.h
+++ b/include/net/netfilter/nf_nat_redirect.h
@@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 		     const struct nf_nat_ipv4_multi_range_compat *mr,
 		     unsigned int hooknum);
 unsigned int
-nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum);
 
 #endif /* _NF_NAT_REDIRECT_H_ */
diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h
index a33000da7229..4a95c0db14d4 100644
--- a/include/uapi/linux/netfilter/nf_nat.h
+++ b/include/uapi/linux/netfilter/nf_nat.h
@@ -10,6 +10,7 @@
 #define NF_NAT_RANGE_PROTO_RANDOM		(1 << 2)
 #define NF_NAT_RANGE_PERSISTENT			(1 << 3)
 #define NF_NAT_RANGE_PROTO_RANDOM_FULLY		(1 << 4)
+#define NF_NAT_RANGE_PROTO_OFFSET		(1 << 5)
 
 #define NF_NAT_RANGE_PROTO_RANDOM_ALL		\
 	(NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
@@ -17,7 +18,7 @@
 #define NF_NAT_RANGE_MASK					\
 	(NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED |	\
 	 NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT |	\
-	 NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+	 NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET)
 
 struct nf_nat_ipv4_range {
 	unsigned int			flags;
@@ -40,4 +41,13 @@ struct nf_nat_range {
 	union nf_conntrack_man_proto	max_proto;
 };
 
+struct nf_nat_range2 {
+	unsigned int			flags;
+	union nf_inet_addr		min_addr;
+	union nf_inet_addr		max_addr;
+	union nf_conntrack_man_proto	min_proto;
+	union nf_conntrack_man_proto	max_proto;
+	union nf_conntrack_man_proto	base_proto;
+};
+
 #endif /* _NETFILTER_NF_NAT_H */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index a03e4e7ef5f9..ce1512b02cb2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
 static unsigned int
 masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	const struct nf_nat_ipv4_multi_range_compat *mr;
 
 	mr = par->targinfo;
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index ac8342dcb55e..4e6b53ab6c33 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 static void ip_nat_q931_expect(struct nf_conn *new,
 			       struct nf_conntrack_expect *this)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	if (this->tuple.src.u3.ip != 0) {	/* Only accept calls from GK */
 		nf_nat_follow_master(new, this);
@@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 static void ip_nat_callforwarding_expect(struct nf_conn *new,
 					 struct nf_conntrack_expect *this)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	/* This must be a fresh one. */
 	BUG_ON(new->status & IPS_NAT_DONE_MASK);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f7ff6a364d7b..4346336cee4c 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
 #endif /* CONFIG_XFRM */
 
 static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
-				 const struct nf_nat_range *range)
+				 const struct nf_nat_range2 *range)
 {
 	return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
 	       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
@@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
-				       struct nf_nat_range *range)
+				       struct nf_nat_range2 *range)
 {
 	if (tb[CTA_NAT_V4_MINIP]) {
 		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index 0c366aad89cb..f538c5001547 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -24,13 +24,13 @@
 
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
-		       const struct nf_nat_range *range,
+		       const struct nf_nat_range2 *range,
 		       const struct net_device *out)
 {
 	struct nf_conn *ct;
 	struct nf_conn_nat *nat;
 	enum ip_conntrack_info ctinfo;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 	const struct rtable *rt;
 	__be32 newsrc, nh;
 
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 8a69363b4884..5d259a12e25f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 	struct nf_conntrack_tuple t = {};
 	const struct nf_ct_pptp_master *ct_pptp_info;
 	const struct nf_nat_pptp *nat_pptp_info;
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	struct nf_conn_nat *nat;
 
 	nat = nf_ct_nat_ext_add(ct);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index edf05002d674..00fda6331ce5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 static void
 gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		 struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 7b98baa13ede..6d7cf1d79baf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
 static void
 icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		  struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f18677277119..f1193e1e928a 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 			       const struct nft_pktinfo *pkt)
 {
 	struct nft_masq *priv = nft_expr_priv(expr);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 92c0047e7e33..491f808e356a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range2 *range = par->targinfo;
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 6b7f075f811f..56d75eb5448f 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
 #endif
 
 static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
-				 const struct nf_nat_range *range)
+				 const struct nf_nat_range2 *range)
 {
 	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
 	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
@@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
-				       struct nf_nat_range *range)
+				       struct nf_nat_range2 *range)
 {
 	if (tb[CTA_NAT_V6_MINIP]) {
 		nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 98f61fcb9108..9dfc2b90c362 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -26,14 +26,14 @@
 static atomic_t v6_worker_count;
 
 unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		       const struct net_device *out)
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_nat *nat;
 	struct in6_addr src;
 	struct nf_conn *ct;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index 57593b00c5b4..d9bf42ba44fa 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
 static void
 icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		    struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_range *range,
+		    const struct nf_nat_range2 *range,
 		    enum nf_nat_manip_type maniptype,
 		    const struct nf_conn *ct)
 {
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 4146536e9c15..dd0122f3cffe 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 			       const struct nft_pktinfo *pkt)
 {
 	struct nft_masq *priv = nft_expr_priv(expr);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index a27e424f690d..74269865acc8 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -22,7 +22,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
 				const struct nft_pktinfo *pkt)
 {
 	struct nft_redir *priv = nft_expr_priv(expr);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	if (priv->sreg_proto_min) {
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 617693ff9f4c..37b3c9913b08 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple);
 static int in_range(const struct nf_nat_l3proto *l3proto,
 		    const struct nf_nat_l4proto *l4proto,
 		    const struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_range *range)
+		    const struct nf_nat_range2 *range)
 {
 	/* If we are supposed to map IPs, then we must be in the
 	 * range specified, otherwise let this drag us onto a new src IP.
@@ -194,7 +194,7 @@ find_appropriate_src(struct net *net,
 		     const struct nf_nat_l4proto *l4proto,
 		     const struct nf_conntrack_tuple *tuple,
 		     struct nf_conntrack_tuple *result,
-		     const struct nf_nat_range *range)
+		     const struct nf_nat_range2 *range)
 {
 	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
@@ -224,7 +224,7 @@ find_appropriate_src(struct net *net,
 static void
 find_best_ips_proto(const struct nf_conntrack_zone *zone,
 		    struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_range *range,
+		    const struct nf_nat_range2 *range,
 		    const struct nf_conn *ct,
 		    enum nf_nat_manip_type maniptype)
 {
@@ -298,7 +298,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
 static void
 get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_conntrack_tuple *orig_tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 struct nf_conn *ct,
 		 enum nf_nat_manip_type maniptype)
 {
@@ -349,9 +349,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	/* Only bother mapping if it's not already in range and unique */
 	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
 		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
-			if (l4proto->in_range(tuple, maniptype,
-					      &range->min_proto,
-					      &range->max_proto) &&
+			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
+			    l4proto->in_range(tuple, maniptype,
+			          &range->min_proto,
+			          &range->max_proto) &&
 			    (range->min_proto.all == range->max_proto.all ||
 			     !nf_nat_used_tuple(tuple, ct)))
 				goto out;
@@ -360,7 +361,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		}
 	}
 
-	/* Last change: get protocol to try to obtain unique tuple. */
+	/* Last chance: get protocol to try to obtain unique tuple. */
 	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
 out:
 	rcu_read_unlock();
@@ -381,7 +382,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
 
 unsigned int
 nf_nat_setup_info(struct nf_conn *ct,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype)
 {
 	struct net *net = nf_ct_net(ct);
@@ -459,7 +460,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
 		(manip == NF_NAT_MANIP_SRC ?
 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
-	struct nf_nat_range range = {
+	struct nf_nat_range2 range = {
 		.flags		= NF_NAT_RANGE_MAP_IPS,
 		.min_addr	= ip,
 		.max_addr	= ip,
@@ -702,7 +703,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
 
 static int nfnetlink_parse_nat_proto(struct nlattr *attr,
 				     const struct nf_conn *ct,
-				     struct nf_nat_range *range)
+				     struct nf_nat_range2 *range)
 {
 	struct nlattr *tb[CTA_PROTONAT_MAX+1];
 	const struct nf_nat_l4proto *l4proto;
@@ -730,7 +731,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
 
 static int
 nfnetlink_parse_nat(const struct nlattr *nat,
-		    const struct nf_conn *ct, struct nf_nat_range *range,
+		    const struct nf_conn *ct, struct nf_nat_range2 *range,
 		    const struct nf_nat_l3proto *l3proto)
 {
 	struct nlattr *tb[CTA_NAT_MAX+1];
@@ -758,7 +759,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
 			  enum nf_nat_manip_type manip,
 			  const struct nlattr *attr)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	const struct nf_nat_l3proto *l3proto;
 	int err;
 
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 607a373379b4..99606baedda4 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
 void nf_nat_follow_master(struct nf_conn *ct,
 			  struct nf_conntrack_expect *exp)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	/* This must be a fresh one. */
 	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 7d7466dbf663..5d849d835561 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
 
 void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				 struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_range *range,
+				 const struct nf_nat_range2 *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct,
 				 u16 *rover)
@@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 						  : tuple->src.u.all);
 	} else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
 		off = prandom_u32();
+	} else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) {
+		off = (ntohs(*portptr) - ntohs(range->base_proto.all));
 	} else {
 		off = *rover;
 	}
@@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		*portptr = htons(min + off % range_size);
 		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
 			continue;
-		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+		if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
+					NF_NAT_RANGE_PROTO_OFFSET)))
 			*rover = off;
 		return;
 	}
@@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
-				   struct nf_nat_range *range)
+				   struct nf_nat_range2 *range)
 {
 	if (tb[CTA_PROTONAT_PORT_MIN]) {
 		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 269fcd5dc34c..67ea0d83aa5a 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover;
 static void
 dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		  struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index c57ee3240b1d..1c5d9b65fbba 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover;
 static void
 sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		  struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 4f8820fc5148..f15fcd475f98 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -23,7 +23,7 @@ static u16 tcp_port_rover;
 static void
 tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		 struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index edd4a77dc09a..5790f70a83b2 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -22,7 +22,7 @@ static u16 udp_port_rover;
 static void
 udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		 struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
@@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
 static void
 udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		     struct nf_conntrack_tuple *tuple,
-		     const struct nf_nat_range *range,
+		     const struct nf_nat_range2 *range,
 		     enum nf_nat_manip_type maniptype,
 		     const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
index 6e494d584412..c5db3e251232 100644
--- a/net/netfilter/nf_nat_proto_unknown.c
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
 
 static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				 struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_range *range,
+				 const struct nf_nat_range2 *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 25b06b959118..7c4bb0a773ca 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -36,7 +36,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	__be32 newdst;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 
 	WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
 		hooknum != NF_INET_LOCAL_OUT);
@@ -82,10 +82,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
 static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
 
 unsigned int
-nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum)
 {
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 	struct in6_addr newdst;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 791fac4fd745..1f3086074981 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
 static void nf_nat_sip_expected(struct nf_conn *ct,
 				struct nf_conntrack_expect *exp)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	/* This must be a fresh one. */
 	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 1f36954c2ba9..c15807d10b91 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr,
 	const struct nft_nat *priv = nft_expr_priv(expr);
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	if (priv->sreg_addr_min) {
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 58aa9dd3c5b7..1d437875e15a 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -21,8 +21,8 @@
 static unsigned int
 netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
-	struct nf_nat_range newrange;
+	const struct nf_nat_range2 *range = par->targinfo;
+	struct nf_nat_range2 newrange;
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	union nf_inet_addr new_addr, netmask;
@@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range2 *range = par->targinfo;
 
 	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
 		return -EINVAL;
@@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	enum ip_conntrack_info ctinfo;
 	__be32 new_ip, netmask;
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 
 	WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING &&
 		xt_hooknum(par) != NF_INET_POST_ROUTING &&
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 98a4c6d4f1cb..5ce9461e979c 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range2 *range = par->targinfo;
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bdb689cdc829..8af9707f8789 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par)
 	nf_ct_netns_put(par->net, par->family);
 }
 
-static void xt_nat_convert_range(struct nf_nat_range *dst,
+static void xt_nat_convert_range(struct nf_nat_range2 *dst,
 				 const struct nf_nat_ipv4_range *src)
 {
 	memset(&dst->min_addr, 0, sizeof(dst->min_addr));
 	memset(&dst->max_addr, 0, sizeof(dst->max_addr));
+	memset(&dst->base_proto, 0, sizeof(dst->base_proto));
 
 	dst->flags	 = src->flags;
 	dst->min_addr.ip = src->min_ip;
@@ -54,7 +55,7 @@ static unsigned int
 xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -71,7 +72,7 @@ static unsigned int
 xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 static unsigned int
 xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range *range_v1 = par->targinfo;
+	struct nf_nat_range2 range;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
 		  ctinfo == IP_CT_RELATED_REPLY)));
 
-	return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+	memcpy(&range, range_v1, sizeof(*range_v1));
+	memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 }
 
 static unsigned int
 xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range *range_v1 = par->targinfo;
+	struct nf_nat_range2 range;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	WARN_ON(!(ct != NULL &&
+		 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
+
+	memcpy(&range, range_v1, sizeof(*range_v1));
+	memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+static unsigned int
+xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_range2 *range = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	WARN_ON(!(ct != NULL &&
+		 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+		  ctinfo == IP_CT_RELATED_REPLY)));
+
+	return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_range2 *range = par->targinfo;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
 				  (1 << NF_INET_LOCAL_OUT),
 		.me		= THIS_MODULE,
 	},
+	{
+		.name		= "SNAT",
+		.revision	= 2,
+		.checkentry	= xt_nat_checkentry,
+		.destroy	= xt_nat_destroy,
+		.target		= xt_snat_target_v2,
+		.targetsize	= sizeof(struct nf_nat_range2),
+		.table		= "nat",
+		.hooks		= (1 << NF_INET_POST_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "DNAT",
+		.revision	= 2,
+		.target		= xt_dnat_target_v2,
+		.targetsize	= sizeof(struct nf_nat_range2),
+		.table		= "nat",
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_OUT),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init xt_nat_init(void)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c5904f629091..02fc343feb66 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -72,7 +72,7 @@ struct ovs_conntrack_info {
 	struct md_mark mark;
 	struct md_labels labels;
 #ifdef CONFIG_NF_NAT_NEEDED
-	struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+	struct nf_nat_range2 range;  /* Only present for SRC NAT and DST NAT. */
 #endif
 };
 
@@ -710,7 +710,7 @@ static bool skb_nfct_cached(struct net *net,
  */
 static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
 			      enum ip_conntrack_info ctinfo,
-			      const struct nf_nat_range *range,
+			      const struct nf_nat_range2 *range,
 			      enum nf_nat_manip_type maniptype)
 {
 	int hooknum, nh_off, err = NF_ACCEPT;
-- 
cgit v1.2.3-55-g7522


From a1d768f1a00db556e2aae9f92bdb38671e601da5 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Fri, 13 Apr 2018 23:09:58 +0900
Subject: netfilter: ebtables: add ebt_get_target and ebt_get_target_c

ebt_get_target similar to {ip/ip6/arp}t_get_target.
and ebt_get_target_c similar to {ip/ip6/arp}t_get_target_c.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_bridge/ebtables.h |  6 ++++++
 net/bridge/netfilter/ebtables.c                | 22 +++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h
index 0c7dc8315013..3b86c14ea49d 100644
--- a/include/uapi/linux/netfilter_bridge/ebtables.h
+++ b/include/uapi/linux/netfilter_bridge/ebtables.h
@@ -191,6 +191,12 @@ struct ebt_entry {
 	unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace))));
 };
 
+static __inline__ struct ebt_entry_target *
+ebt_get_target(struct ebt_entry *e)
+{
+	return (void *)e + e->target_offset;
+}
+
 /* {g,s}etsockopt numbers */
 #define EBT_BASE_CTL            128
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 7c07221369c0..9be240129448 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
 	return (void *)entry + entry->next_offset;
 }
 
+static inline const struct ebt_entry_target *
+ebt_get_target_c(const struct ebt_entry *e)
+{
+	return ebt_get_target((struct ebt_entry *)e);
+}
+
 /* Do some firewalling */
 unsigned int ebt_do_table(struct sk_buff *skb,
 			  const struct nf_hook_state *state,
@@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
 		 */
 		EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
 
-		t = (struct ebt_entry_target *)
-		   (((char *)point) + point->target_offset);
+		t = ebt_get_target_c(point);
 		/* standard target */
 		if (!t->u.target->target)
 			verdict = ((struct ebt_standard_target *)t)->verdict;
@@ -637,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
 		return 1;
 	EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
 	EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
-	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	t = ebt_get_target(e);
 
 	par.net      = net;
 	par.target   = t->u.target;
@@ -716,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
 	ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
 	if (ret != 0)
 		goto cleanup_watchers;
-	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	t = ebt_get_target(e);
 	gap = e->next_offset - e->target_offset;
 
 	target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
@@ -789,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
 			if (pos == nentries)
 				continue;
 		}
-		t = (struct ebt_entry_target *)
-		   (((char *)e) + e->target_offset);
+		t = ebt_get_target_c(e);
 		if (strcmp(t->u.name, EBT_STANDARD_TARGET))
 			goto letscontinue;
 		if (e->target_offset + sizeof(struct ebt_standard_target) >
@@ -1396,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
 		return -EFAULT;
 
 	hlp = ubase + (((char *)e + e->target_offset) - base);
-	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	t = ebt_get_target_c(e);
 
 	ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
 	if (ret != 0)
@@ -1737,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
 		return ret;
 	target_offset = e->target_offset - (origsize - *size);
 
-	t = (struct ebt_entry_target *) ((char *) e + e->target_offset);
+	t = ebt_get_target(e);
 
 	ret = compat_target_to_user(t, dstptr, size);
 	if (ret)
@@ -1785,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e,
 	EBT_MATCH_ITERATE(e, compat_calc_match, &off);
 	EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
 
-	t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+	t = ebt_get_target_c(e);
 
 	off += xt_compat_target_offset(t->u.target);
 	off += ebt_compat_entry_padsize();
-- 
cgit v1.2.3-55-g7522


From 12bed760a78da6e12ac8252fec64d019a9eac523 Mon Sep 17 00:00:00 2001
From: Eyal Birger
Date: Tue, 24 Apr 2018 17:50:29 +0300
Subject: bpf: add helper for getting xfrm states

This commit introduces a helper which allows fetching xfrm state
parameters by eBPF programs attached to TC.

Prototype:
bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)

skb: pointer to skb
index: the index in the skb xfrm_state secpath array
xfrm_state: pointer to 'struct bpf_xfrm_state'
size: size of 'struct bpf_xfrm_state'
flags: reserved for future extensions

The helper returns 0 on success. Non zero if no xfrm state at the index
is found - or non exists at all.

struct bpf_xfrm_state currently includes the SPI, peer IPv4/IPv6
address and the reqid; it can be further extended by adding elements to
its end - indicating the populated fields by the 'size' argument -
keeping backwards compatibility.

Typical usage:

struct bpf_xfrm_state x = {};
bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
...

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++-
 net/core/filter.c        | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8383a289f7b..e6679393b687 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -774,6 +774,15 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: A negative integer to be added to xdp_md.data_end
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
+ *     retrieve XFRM state
+ *     @skb: pointer to skb
+ *     @index: index of the xfrm state in the secpath
+ *     @key: pointer to 'struct bpf_xfrm_state'
+ *     @size: size of 'struct bpf_xfrm_state'
+ *     @flags: room for future extensions
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -841,7 +850,8 @@ union bpf_attr {
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
 	FN(bind),			\
-	FN(xdp_adjust_tail),
+	FN(xdp_adjust_tail),		\
+	FN(skb_get_xfrm_state),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -947,6 +957,19 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;	/* Stored in network byte order */
+	__u16 family;
+	union {
+		__u32 remote_ipv4;	/* Stored in network byte order */
+		__u32 remote_ipv6[4];	/* Stored in network byte order */
+	};
+};
+
 /* Generic BPF return codes which all BPF program types may support.
  * The values are binary compatible with their TC_ACT_* counter-part to
  * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
diff --git a/net/core/filter.c b/net/core/filter.c
index e25bc4a3aa1a..8e45c6c7ab08 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,6 +57,7 @@
 #include <net/sock_reuseport.h>
 #include <net/busy_poll.h>
 #include <net/tcp.h>
+#include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 
 /**
@@ -3743,6 +3744,49 @@ static const struct bpf_func_proto bpf_bind_proto = {
 	.arg3_type	= ARG_CONST_SIZE,
 };
 
+#ifdef CONFIG_XFRM
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+	const struct sec_path *sp = skb_sec_path(skb);
+	const struct xfrm_state *x;
+
+	if (!sp || unlikely(index >= sp->len || flags))
+		goto err_clear;
+
+	x = sp->xvec[index];
+
+	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+		goto err_clear;
+
+	to->reqid = x->props.reqid;
+	to->spi = x->id.spi;
+	to->family = x->props.family;
+	if (to->family == AF_INET6) {
+		memcpy(to->remote_ipv6, x->props.saddr.a6,
+		       sizeof(to->remote_ipv6));
+	} else {
+		to->remote_ipv4 = x->props.saddr.a4;
+	}
+
+	return 0;
+err_clear:
+	memset(to, 0, size);
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+	.func		= bpf_skb_get_xfrm_state,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+#endif
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3884,6 +3928,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+#ifdef CONFIG_XFRM
+	case BPF_FUNC_skb_get_xfrm_state:
+		return &bpf_skb_get_xfrm_state_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-55-g7522


From bec1f6f697362c5bc635dacd7ac8499d0a10a4e7 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Thu, 26 Apr 2018 13:42:17 -0400
Subject: udp: generate gso with UDP_SEGMENT

Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.

To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.

A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.

Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.

The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.

Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.

    tcp tso
     3197 MB/s 54232 msg/s 54232 calls/s
         6,457,754,262      cycles

    tcp gso
     1765 MB/s 29939 msg/s 29939 calls/s
        11,203,021,806      cycles

    tcp without tso/gso *
      739 MB/s 12548 msg/s 12548 calls/s
        11,205,483,630      cycles

    udp
      876 MB/s 14873 msg/s 624666 calls/s
        11,205,777,429      cycles

    udp gso
     2139 MB/s 36282 msg/s 36282 calls/s
        11,204,374,561      cycles

   [*] after reverting commit 0a6b2a1dc2a2
       ("tcp: switch to GSO being always on")

Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:

  perf stat -a -C 12 -e cycles \
    ./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4

Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h      |  3 +++
 include/net/inet_sock.h  |  1 +
 include/net/ip.h         |  1 +
 include/net/ipv6.h       |  1 +
 include/uapi/linux/udp.h |  1 +
 net/ipv4/ip_output.c     |  9 ++++++---
 net/ipv4/udp.c           | 33 ++++++++++++++++++++++++++++++---
 net/ipv6/ip6_output.c    |  6 ++++--
 net/ipv6/udp.c           | 23 ++++++++++++++++++++---
 9 files changed, 67 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index eaea63bc79bb..ca840345571b 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,6 +55,7 @@ struct udp_sock {
 	 * when the socket is uncorked.
 	 */
 	__u16		 len;		/* total length of pending frames */
+	__u16		 gso_size;
 	/*
 	 * Fields specific to UDP-Lite.
 	 */
@@ -87,6 +88,8 @@ struct udp_sock {
 	int		forward_deficit;
 };
 
+#define UDP_MAX_SEGMENTS	(1 << 6UL)
+
 static inline struct udp_sock *udp_sk(const struct sock *sk)
 {
 	return (struct udp_sock *)sk;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..83d5b3c2ac42 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -147,6 +147,7 @@ struct inet_cork {
 	__u8			ttl;
 	__s16			tos;
 	char			priority;
+	__u16			gso_size;
 };
 
 struct inet_cork_full {
diff --git a/include/net/ip.h b/include/net/ip.h
index 7ec543a64bbc..bada1f1f871e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -76,6 +76,7 @@ struct ipcm_cookie {
 	__u8			ttl;
 	__s16			tos;
 	char			priority;
+	__u16			gso_size;
 };
 
 #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0dd722cab037..0a872a7c33c8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -298,6 +298,7 @@ struct ipcm6_cookie {
 	__s16 tclass;
 	__s8  dontfrag;
 	struct ipv6_txoptions *opt;
+	__u16 gso_size;
 };
 
 static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index efb7b5991c2f..09d00f8c442b 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -32,6 +32,7 @@ struct udphdr {
 #define UDP_ENCAP	100	/* Set the socket to accept encapsulated packets */
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
+#define UDP_SEGMENT	103	/* Set GSO segmentation size */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2883ff1e909c..da4abbee10f7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,7 +882,8 @@ static int __ip_append_data(struct sock *sk,
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+
 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
 		tskey = sk->sk_tskey++;
@@ -906,7 +907,7 @@ static int __ip_append_data(struct sock *sk,
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
 	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
-	    !(flags & MSG_MORE) &&
+	    (!(flags & MSG_MORE) || cork->gso_size) &&
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
@@ -1135,6 +1136,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	*rtp = NULL;
 	cork->fragsize = ip_sk_use_pmtu(sk) ?
 			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+	cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
@@ -1214,7 +1217,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 		return -EOPNOTSUPP;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6b9d8017b319..bda022c5480b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(udp_set_csum);
 
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+			struct inet_cork *cork)
 {
 	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +778,21 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
 	uh->len = htons(len);
 	uh->check = 0;
 
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+	}
+
 	if (is_udplite)  				 /*     UDP-Lite      */
 		csum = udplite_csum(skb);
 
@@ -828,7 +844,7 @@ int udp_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
-	err = udp_send_skb(skb, fl4);
+	err = udp_send_skb(skb, fl4, &inet->cork.base);
 
 out:
 	up->len = 0;
@@ -922,6 +938,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc.sockc.tsflags = sk->sk_tsflags;
 	ipc.addr = inet->inet_saddr;
 	ipc.oif = sk->sk_bound_dev_if;
+	ipc.gso_size = up->gso_size;
 
 	if (msg->msg_controllen) {
 		err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
@@ -1037,7 +1054,7 @@ back_from_confirm:
 				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
-			err = udp_send_skb(skb, fl4);
+			err = udp_send_skb(skb, fl4, &cork);
 		goto out;
 	}
 
@@ -2367,6 +2384,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->no_check6_rx = valbool;
 		break;
 
+	case UDP_SEGMENT:
+		if (val < 0 || val > USHRT_MAX)
+			return -EINVAL;
+		up->gso_size = val;
+		break;
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
@@ -2457,6 +2480,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		val = up->no_check6_rx;
 		break;
 
+	case UDP_SEGMENT:
+		val = up->gso_size;
+		break;
+
 	/* The following two cannot be changed on UDP sockets, the return is
 	 * always 0 (which corresponds to the full checksum coverage of UDP). */
 	case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7fa1db447405..a1c4a78132d2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1240,6 +1240,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	if (mtu < IPV6_MIN_MTU)
 		return -EINVAL;
 	cork->base.fragsize = mtu;
+	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
 	cork->base.length = 0;
@@ -1281,7 +1283,7 @@ static int __ip6_append_data(struct sock *sk,
 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
 	}
 
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
 	orig_mtu = mtu;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1329,7 +1331,7 @@ emsgsize:
 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
 	    headersize == sizeof(struct ipv6hdr) &&
 	    length <= mtu - headersize &&
-	    !(flags & MSG_MORE) &&
+	    (!(flags & MSG_MORE) || cork->gso_size) &&
 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
 		csummode = CHECKSUM_PARTIAL;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 824797f8d1ab..86b7dd58d4b4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
  *	Sending
  */
 
-static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+			   struct inet_cork *cork)
 {
 	struct sock *sk = skb->sk;
 	struct udphdr *uh;
@@ -1042,6 +1043,21 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
 	uh->len = htons(len);
 	uh->check = 0;
 
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+	}
+
 	if (is_udplite)
 		csum = udplite_csum(skb);
 	else if (udp_sk(sk)->no_check6_tx) {   /* UDP csum disabled */
@@ -1093,7 +1109,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
-	err = udp_v6_send_skb(skb, &fl6);
+	err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
 
 out:
 	up->len = 0;
@@ -1127,6 +1143,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc6.hlimit = -1;
 	ipc6.tclass = -1;
 	ipc6.dontfrag = -1;
+	ipc6.gso_size = up->gso_size;
 	sockc.tsflags = sk->sk_tsflags;
 
 	/* destination address check */
@@ -1333,7 +1350,7 @@ back_from_confirm:
 				   msg->msg_flags, &cork, &sockc);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
-			err = udp_v6_send_skb(skb, &fl6);
+			err = udp_v6_send_skb(skb, &fl6, &cork.base);
 		goto out;
 	}
 
-- 
cgit v1.2.3-55-g7522


From b85fab0e67b162014cd328cb4e2a8e8ae382cb8a Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Wed, 25 Apr 2018 19:41:06 +0200
Subject: bpf: Add gpl_compatible flag to struct bpf_prog_info

Adding gpl_compatible flag to struct bpf_prog_info
so it can be dumped via bpf_prog_get_info_by_fd and
displayed via bpftool progs dump.

Alexei noticed 4-byte hole in struct bpf_prog_info,
so we put the u32 flags field in there, and we can
keep adding bit fields in there without breaking
user space.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 1 +
 kernel/bpf/syscall.c     | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e6679393b687..da8801860c7d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1060,6 +1060,7 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fe23dc5a3ec4..7bb4ff1c770a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1914,6 +1914,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.load_time = prog->aux->load_time;
 	info.created_by_uid = from_kuid_munged(current_user_ns(),
 					       prog->aux->user->uid);
+	info.gpl_compatible = prog->gpl_compatible;
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
 	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
-- 
cgit v1.2.3-55-g7522


From 56a092c895054a6b423781d788339775bd2bda10 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:52 +0100
Subject: bpf: add script and prepare bpf.h for new helpers documentation

Remove previous "overview" of eBPF helpers from user bpf.h header.
Replace it by a comment explaining how to process the new documentation
(to come in following patches) with a Python script to produce RST, then
man page documentation.

Also add the aforementioned Python script under scripts/. It is used to
process include/uapi/linux/bpf.h and to extract helper descriptions, to
turn it into a RST document that can further be processed with rst2man
to produce a man page. The script takes one "--filename <path/to/file>"
option. If the script is launched from scripts/ in the kernel root
directory, it should be able to find the location of the header to
parse, and "--filename <path/to/file>" is then optional. If it cannot
find the file, then the option becomes mandatory. RST-formatted
documentation is printed to standard output.

Typical workflow for producing the final man page would be:

    $ ./scripts/bpf_helpers_doc.py \
            --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

Note that the tool kernel-doc cannot be used to document eBPF helpers,
whose signatures are not available directly in the header files
(pre-processor directives are used to produce them at the beginning of
the compilation process).

v4:
- Also remove overviews for newly added bpf_xdp_adjust_tail() and
  bpf_skb_get_xfrm_state().
- Remove vague statement about what helpers are restricted to GPL
  programs in "LICENSE" section for man page footer.
- Replace license boilerplate with SPDX tag for Python script.

v3:
- Change license for man page.
- Remove "for safety reasons" from man page header text.
- Change "packets metadata" to "packets" in man page header text.
- Move and fix comment on helpers introducing no overhead.
- Remove "NOTES" section from man page footer.
- Add "LICENSE" section to man page footer.
- Edit description of file include/uapi/linux/bpf.h in man page footer.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h   | 422 ++-------------------------------------------
 scripts/bpf_helpers_doc.py | 421 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 437 insertions(+), 406 deletions(-)
 create mode 100755 scripts/bpf_helpers_doc.py

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da8801860c7d..dd43fc98c982 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -377,412 +377,22 @@ union bpf_attr {
 	};
 } __attribute__((aligned(8)));
 
-/* BPF helper function descriptions:
- *
- * void *bpf_map_lookup_elem(&map, &key)
- *     Return: Map value or NULL
- *
- * int bpf_map_update_elem(&map, &key, &value, flags)
- *     Return: 0 on success or negative error
- *
- * int bpf_map_delete_elem(&map, &key)
- *     Return: 0 on success or negative error
- *
- * int bpf_probe_read(void *dst, int size, void *src)
- *     Return: 0 on success or negative error
- *
- * u64 bpf_ktime_get_ns(void)
- *     Return: current ktime
- *
- * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
- *     Return: length of buffer written or negative error
- *
- * u32 bpf_prandom_u32(void)
- *     Return: random value
- *
- * u32 bpf_raw_smp_processor_id(void)
- *     Return: SMP processor ID
- *
- * int bpf_skb_store_bytes(skb, offset, from, len, flags)
- *     store bytes into packet
- *     @skb: pointer to skb
- *     @offset: offset within packet from skb->mac_header
- *     @from: pointer where to copy bytes from
- *     @len: number of bytes to store into packet
- *     @flags: bit 0 - if true, recompute skb->csum
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l3_csum_replace(skb, offset, from, to, flags)
- *     recompute IP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where IP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l4_csum_replace(skb, offset, from, to, flags)
- *     recompute TCP/UDP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where TCP/UDP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             bit 4 - is pseudo header
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_tail_call(ctx, prog_array_map, index)
- *     jump into another BPF program
- *     @ctx: context pointer passed to next program
- *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: 32-bit index inside array that selects specific program to run
- *     Return: 0 on success or negative error
- *
- * int bpf_clone_redirect(skb, ifindex, flags)
- *     redirect to another netdev
- *     @skb: pointer to skb
- *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * u64 bpf_get_current_pid_tgid(void)
- *     Return: current->tgid << 32 | current->pid
- *
- * u64 bpf_get_current_uid_gid(void)
- *     Return: current_gid << 32 | current_uid
- *
- * int bpf_get_current_comm(char *buf, int size_of_buf)
- *     stores current->comm into buf
- *     Return: 0 on success or negative error
- *
- * u32 bpf_get_cgroup_classid(skb)
- *     retrieve a proc's classid
- *     @skb: pointer to skb
- *     Return: classid if != 0
- *
- * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_vlan_pop(skb)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_get_tunnel_key(skb, key, size, flags)
- * int bpf_skb_set_tunnel_key(skb, key, size, flags)
- *     retrieve or populate tunnel metadata
- *     @skb: pointer to skb
- *     @key: pointer to 'struct bpf_tunnel_key'
- *     @size: size of 'struct bpf_tunnel_key'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
- *
- * u64 bpf_perf_event_read(map, flags)
- *     read perf event counter value
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     Return: value of perf event counter read or error code
- *
- * int bpf_redirect(ifindex, flags)
- *     redirect to another netdev
- *     @ifindex: ifindex of the net device
- *     @flags:
- *	  cls_bpf:
- *          bit 0 - if set, redirect to ingress instead of egress
- *          other bits - reserved
- *	  xdp_bpf:
- *	    all bits - reserved
- *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
- *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
- * int bpf_redirect_map(map, key, flags)
- *     redirect to endpoint in map
- *     @map: pointer to dev map
- *     @key: index in map to lookup
- *     @flags: --
- *     Return: XDP_REDIRECT on success or XDP_ABORT on error
- *
- * u32 bpf_get_route_realm(skb)
- *     retrieve a dst's tclassid
- *     @skb: pointer to skb
- *     Return: realm if != 0
- *
- * int bpf_perf_event_output(ctx, map, flags, data, size)
- *     output perf raw sample
- *     @ctx: struct pt_regs*
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @data: data on stack to be output as raw data
- *     @size: size of data
- *     Return: 0 on success or negative error
- *
- * int bpf_get_stackid(ctx, map, flags)
- *     walk user or kernel stack and return id
- *     @ctx: struct pt_regs*
- *     @map: pointer to stack_trace map
- *     @flags: bits 0-7 - numer of stack frames to skip
- *             bit 8 - collect user stack instead of kernel
- *             bit 9 - compare stacks by hash only
- *             bit 10 - if two different stacks hash into the same stackid
- *                      discard old
- *             other bits - reserved
- *     Return: >= 0 stackid on success or negative error
- *
- * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
- *     calculate csum diff
- *     @from: raw from buffer
- *     @from_size: length of from buffer
- *     @to: raw to buffer
- *     @to_size: length of to buffer
- *     @seed: optional seed
- *     Return: csum result or negative error code
- *
- * int bpf_skb_get_tunnel_opt(skb, opt, size)
- *     retrieve tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: option size
- *
- * int bpf_skb_set_tunnel_opt(skb, opt, size)
- *     populate tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_proto(skb, proto, flags)
- *     Change protocol of the skb. Currently supported is v4 -> v6,
- *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
- *     program is expected to fill the new headers via skb_store_bytes
- *     and lX_csum_replace.
- *     @skb: pointer to skb
- *     @proto: new skb->protocol type
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_type(skb, type)
- *     Change packet type of skb.
- *     @skb: pointer to skb
- *     @type: new skb->pkt_type type
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_under_cgroup(skb, map, index)
- *     Check cgroup2 membership of skb
- *     @skb: pointer to skb
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 skb failed the cgroup2 descendant test
- *       == 1 skb succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * u32 bpf_get_hash_recalc(skb)
- *     Retrieve and possibly recalculate skb->hash.
- *     @skb: pointer to skb
- *     Return: hash
- *
- * u64 bpf_get_current_task(void)
- *     Returns current task_struct
- *     Return: current
- *
- * int bpf_probe_write_user(void *dst, void *src, int len)
- *     safely attempt to write to a location
- *     @dst: destination address in userspace
- *     @src: source address on stack
- *     @len: number of bytes to copy
- *     Return: 0 on success or negative error
- *
- * int bpf_current_task_under_cgroup(map, index)
- *     Check cgroup2 membership of current task
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 current failed the cgroup2 descendant test
- *       == 1 current succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * int bpf_skb_change_tail(skb, len, flags)
- *     The helper will resize the skb to the given new size, to be used f.e.
- *     with control messages.
- *     @skb: pointer to skb
- *     @len: new skb length
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_pull_data(skb, len)
- *     The helper will pull in non-linear data in case the skb is non-linear
- *     and not all of len are part of the linear section. Only needed for
- *     read/write with direct packet access.
- *     @skb: pointer to skb
- *     @len: len to make read/writeable
- *     Return: 0 on success or negative error
- *
- * s64 bpf_csum_update(skb, csum)
- *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
- *     @skb: pointer to skb
- *     @csum: csum to add
- *     Return: csum on success or negative error
- *
- * void bpf_set_hash_invalid(skb)
- *     Invalidate current skb->hash.
- *     @skb: pointer to skb
- *
- * int bpf_get_numa_node_id()
- *     Return: Id of current NUMA node.
- *
- * int bpf_skb_change_head()
- *     Grows headroom of skb and adjusts MAC header offset accordingly.
- *     Will extends/reallocae as required automatically.
- *     May change skb data pointer and will thus invalidate any check
- *     performed for direct packet access.
- *     @skb: pointer to skb
- *     @len: length of header to be pushed in front
- *     @flags: Flags (unused for now)
- *     Return: 0 on success or negative error
- *
- * int bpf_xdp_adjust_head(xdp_md, delta)
- *     Adjust the xdp_md.data by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data
- *     Return: 0 on success or negative on error
- *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
- *     Copy a NUL terminated string from unsafe address. In case the string
- *     length is smaller than size, the target is not padded with further NUL
- *     bytes. In case the string length is larger than size, just count-1
- *     bytes are copied and the last byte is set to NUL.
- *     @dst: destination address
- *     @size: maximum number of bytes to copy, including the trailing NUL
- *     @unsafe_ptr: unsafe address
- *     Return:
- *       > 0 length of the string including the trailing NUL on success
- *       < 0 error
- *
- * u64 bpf_get_socket_cookie(skb)
- *     Get the cookie for the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: 8 Bytes non-decreasing number on success or 0 if the socket
- *     field is missing inside sk_buff
- *
- * u32 bpf_get_socket_uid(skb)
- *     Get the owner uid of the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or overflowuid if failed.
- *
- * u32 bpf_set_hash(skb, hash)
- *     Set full skb->hash.
- *     @skb: pointer to skb
- *     @hash: hash to set
- *
- * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls setsockopt. Not all opts are available, only those with
- *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls getsockopt. Not all opts are available.
- *     Supported levels: IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
- *     Set callback flags for sock_ops
- *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
- *     @flags: flags value
- *     Return: 0 for no error
- *             -EINVAL if there is no full tcp socket
- *             bits in flags that are not supported by current kernel
- *
- * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
- *     Grow or shrink room in sk_buff.
- *     @skb: pointer to skb
- *     @len_diff: (signed) amount of room to grow/shrink
- *     @mode: operation mode (enum bpf_adj_room_mode)
- *     @flags: reserved for future use
- *     Return: 0 on success or negative error code
- *
- * int bpf_sk_redirect_map(map, key, flags)
- *     Redirect skb to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_sock_map_update(skops, map, key, flags)
- *	@skops: pointer to bpf_sock_ops
- *	@map: pointer to sockmap to update
- *	@key: key to insert/update sock in map
- *	@flags: same flags as map update elem
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- *     Adjust the xdp_md.data_meta by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data_meta
- *     Return: 0 on success or negative on error
- *
- * int bpf_perf_event_read_value(map, flags, buf, buf_size)
- *     read perf event counter value and perf event enabled/running time
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return: 0 on success or negative error code
- *
- * int bpf_perf_prog_read_value(ctx, buf, buf_size)
- *     read perf prog attached perf event counter and enabled/running time
- *     @ctx: pointer to ctx
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
- *
- * int bpf_msg_redirect_map(map, key, flags)
- *     Redirect msg to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_bind(ctx, addr, addr_len)
- *     Bind socket to address. Only binding to IP is supported, no port can be
- *     set in addr.
- *     @ctx: pointer to context of type bpf_sock_addr
- *     @addr: pointer to struct sockaddr to bind socket to
- *     @addr_len: length of sockaddr structure
- *     Return: 0 on success or negative error code
- *
- * int bpf_xdp_adjust_tail(xdp_md, delta)
- *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
- *     size is supported.
- *     @xdp_md: pointer to xdp_md
- *     @delta: A negative integer to be added to xdp_md.data_end
- *     Return: 0 on success or negative on error
- *
- * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
- *     retrieve XFRM state
- *     @skb: pointer to skb
- *     @index: index of the xfrm state in the secpath
- *     @key: pointer to 'struct bpf_xfrm_state'
- *     @size: size of 'struct bpf_xfrm_state'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
new file mode 100755
index 000000000000..30ba0fee36e4
--- /dev/null
+++ b/scripts/bpf_helpers_doc.py
@@ -0,0 +1,421 @@
+#!/usr/bin/python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (C) 2018 Netronome Systems, Inc.
+
+# In case user attempts to run with Python 2.
+from __future__ import print_function
+
+import argparse
+import re
+import sys, os
+
+class NoHelperFound(BaseException):
+    pass
+
+class ParsingError(BaseException):
+    def __init__(self, line='<line not provided>', reader=None):
+        if reader:
+            BaseException.__init__(self,
+                                   'Error at file offset %d, parsing line: %s' %
+                                   (reader.tell(), line))
+        else:
+            BaseException.__init__(self, 'Error parsing line: %s' % line)
+
+class Helper(object):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    """
+    def __init__(self, proto='', desc='', ret=''):
+        self.proto = proto
+        self.desc = desc
+        self.ret = ret
+
+    def proto_break_down(self):
+        """
+        Break down helper function protocol into smaller chunks: return type,
+        name, distincts arguments.
+        """
+        arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
+        res = {}
+        proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+
+        capture = proto_re.match(self.proto)
+        res['ret_type'] = capture.group(1)
+        res['ret_star'] = capture.group(2)
+        res['name']     = capture.group(3)
+        res['args'] = []
+
+        args    = capture.group(4).split(', ')
+        for a in args:
+            capture = arg_re.match(a)
+            res['args'].append({
+                'type' : capture.group(1),
+                'star' : capture.group(6),
+                'name' : capture.group(7)
+            })
+
+        return res
+
+class HeaderParser(object):
+    """
+    An object used to parse a file in order to extract the documentation of a
+    list of eBPF helper functions. All the helpers that can be retrieved are
+    stored as Helper object, in the self.helpers() array.
+    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
+               kernel tree
+    """
+    def __init__(self, filename):
+        self.reader = open(filename, 'r')
+        self.line = ''
+        self.helpers = []
+
+    def parse_helper(self):
+        proto    = self.parse_proto()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        return Helper(proto=proto, desc=desc, ret=ret)
+
+    def parse_proto(self):
+        # Argument can be of shape:
+        #   - "void"
+        #   - "type  name"
+        #   - "type *name"
+        #   - Same as above, with "const" and/or "struct" in front of type
+        #   - "..." (undefined number of arguments, for bpf_trace_printk())
+        # There is at least one term ("void"), and at most five arguments.
+        p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoHelperFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_desc(self):
+        p = re.compile('^ \* \tDescription$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty description and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Description can be several lines, some of them possibly empty, and it
+        # stops when another subsection title is met.
+        desc = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                desc += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    desc += capture.group(1) + '\n'
+                else:
+                    break
+        return desc
+
+    def parse_ret(self):
+        p = re.compile('^ \* \tReturn$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty retval and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Return value description can be several lines, some of them possibly
+        # empty, and it stops when another subsection title is met.
+        ret = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                ret += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    ret += capture.group(1) + '\n'
+                else:
+                    break
+        return ret
+
+    def run(self):
+        # Advance to start of helper function descriptions.
+        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
+        if offset == -1:
+            raise Exception('Could not find start of eBPF helper descriptions list')
+        self.reader.seek(offset)
+        self.reader.readline()
+        self.reader.readline()
+        self.line = self.reader.readline()
+
+        while True:
+            try:
+                helper = self.parse_helper()
+                self.helpers.append(helper)
+            except NoHelperFound:
+                break
+
+        self.reader.close()
+        print('Parsed description of %d helper function(s)' % len(self.helpers),
+              file=sys.stderr)
+
+###############################################################################
+
+class Printer(object):
+    """
+    A generic class for printers. Printers should be created with an array of
+    Helper objects, and implement a way to print them in the desired fashion.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def __init__(self, helpers):
+        self.helpers = helpers
+
+    def print_header(self):
+        pass
+
+    def print_footer(self):
+        pass
+
+    def print_one(self, helper):
+        pass
+
+    def print_all(self):
+        self.print_header()
+        for helper in self.helpers:
+            self.print_one(helper)
+        self.print_footer()
+
+class PrinterRST(Printer):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def print_header(self):
+        header = '''\
+.. Copyright (C) All BPF authors and contributors from 2014 to present.
+.. See git log include/uapi/linux/bpf.h in kernel tree for details.
+.. 
+.. %%%LICENSE_START(VERBATIM)
+.. Permission is granted to make and distribute verbatim copies of this
+.. manual provided the copyright notice and this permission notice are
+.. preserved on all copies.
+.. 
+.. Permission is granted to copy and distribute modified versions of this
+.. manual under the conditions for verbatim copying, provided that the
+.. entire resulting derived work is distributed under the terms of a
+.. permission notice identical to this one.
+.. 
+.. Since the Linux kernel and libraries are constantly changing, this
+.. manual page may be incorrect or out-of-date.  The author(s) assume no
+.. responsibility for errors or omissions, or for damages resulting from
+.. the use of the information contained herein.  The author(s) may not
+.. have taken the same level of care in the production of this manual,
+.. which is licensed free of charge, as they might when working
+.. professionally.
+.. 
+.. Formatted or processed versions of this manual, if unaccompanied by
+.. the source, must acknowledge the copyright and authors of this work.
+.. %%%LICENSE_END
+.. 
+.. Please do not edit this file. It was generated from the documentation
+.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
+.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
+.. repository (header and footer).
+
+===========
+BPF-HELPERS
+===========
+-------------------------------------------------------------------------------
+list of eBPF helper functions
+-------------------------------------------------------------------------------
+
+:Manual section: 7
+
+DESCRIPTION
+===========
+
+The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
+written in a pseudo-assembly language, then attached to one of the several
+kernel hooks and run in reaction of specific events. This framework differs
+from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
+the ability to call special functions (or "helpers") from within a program.
+These functions are restricted to a white-list of helpers defined in the
+kernel.
+
+These helpers are used by eBPF programs to interact with the system, or with
+the context in which they work. For instance, they can be used to print
+debugging messages, to get the time since the system was booted, to interact
+with eBPF maps, or to manipulate network packets. Since there are several eBPF
+program types, and that they do not run in the same context, each program type
+can only call a subset of those helpers.
+
+Due to eBPF conventions, a helper can not have more than five arguments.
+
+Internally, eBPF programs call directly into the compiled helper functions
+without requiring any foreign-function interface. As a result, calling helpers
+introduces no overhead, thus offering excellent performance.
+
+This document is an attempt to list and document the helpers available to eBPF
+developers. They are sorted by chronological order (the oldest helpers in the
+kernel at the top).
+
+HELPERS
+=======
+'''
+        print(header)
+
+    def print_footer(self):
+        footer = '''
+EXAMPLES
+========
+
+Example usage for most of the eBPF helpers listed in this manual page are
+available within the Linux kernel sources, at the following locations:
+
+* *samples/bpf/*
+* *tools/testing/selftests/bpf/*
+
+LICENSE
+=======
+
+eBPF programs can have an associated license, passed along with the bytecode
+instructions to the kernel when the programs are loaded. The format for that
+string is identical to the one in use for kernel modules (Dual licenses, such
+as "Dual BSD/GPL", may be used). Some helper functions are only accessible to
+programs that are compatible with the GNU Privacy License (GPL).
+
+In order to use such helpers, the eBPF program must be loaded with the correct
+license string passed (via **attr**) to the **bpf**\ () system call, and this
+generally translates into the C source code of the program containing a line
+similar to the following:
+
+::
+
+	char ____license[] __attribute__((section("license"), used)) = "GPL";
+
+IMPLEMENTATION
+==============
+
+This manual page is an effort to document the existing eBPF helper functions.
+But as of this writing, the BPF sub-system is under heavy development. New eBPF
+program or map types are added, along with new helper functions. Some helpers
+are occasionally made available for additional program types. So in spite of
+the efforts of the community, this page might not be up-to-date. If you want to
+check by yourself what helper functions exist in your kernel, or what types of
+programs they can support, here are some files among the kernel tree that you
+may be interested in:
+
+* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list
+  of all helper functions, as well as many other BPF definitions including most
+  of the flags, structs or constants used by the helpers.
+* *net/core/filter.c* contains the definition of most network-related helper
+  functions, and the list of program types from which they can be used.
+* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
+  helpers.
+* *kernel/bpf/verifier.c* contains the functions used to check that valid types
+  of eBPF maps are used with a given helper function.
+* *kernel/bpf/* directory contains other files in which additional helpers are
+  defined (for cgroups, sockmaps, etc.).
+
+Compatibility between helper functions and program types can generally be found
+in the files where helper functions are defined. Look for the **struct
+bpf_func_proto** objects and for functions returning them: these functions
+contain a list of helpers that a given program type can call. Note that the
+**default:** label of the **switch ... case** used to filter helpers can call
+other functions, themselves allowing access to additional helpers. The
+requirement for GPL license is also in those **struct bpf_func_proto**.
+
+Compatibility between helper functions and map types can be found in the
+**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
+
+Helper functions that invalidate the checks on **data** and **data_end**
+pointers for network processing are listed in function
+**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
+
+SEE ALSO
+========
+
+**bpf**\ (2),
+**cgroups**\ (7),
+**ip**\ (8),
+**perf_event_open**\ (2),
+**sendmsg**\ (2),
+**socket**\ (7),
+**tc-bpf**\ (8)'''
+        print(footer)
+
+    def print_proto(self, helper):
+        """
+        Format function protocol with bold and italics markers. This makes RST
+        file less readable, but gives nice results in the manual page.
+        """
+        proto = helper.proto_break_down()
+
+        print('**%s %s%s(' % (proto['ret_type'],
+                              proto['ret_star'].replace('*', '\\*'),
+                              proto['name']),
+              end='')
+
+        comma = ''
+        for a in proto['args']:
+            one_arg = '{}{}'.format(comma, a['type'])
+            if a['name']:
+                if a['star']:
+                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
+                else:
+                    one_arg += '** '
+                one_arg += '*{}*\\ **'.format(a['name'])
+            comma = ', '
+            print(one_arg, end='')
+
+        print(')**')
+
+    def print_one(self, helper):
+        self.print_proto(helper)
+
+        if (helper.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (helper.ret):
+            print('\tReturn')
+            for line in helper.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+
+###############################################################################
+
+# If script is launched from scripts/ from kernel tree and can access
+# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
+# otherwise the --filename argument will be required from the command line.
+script = os.path.abspath(sys.argv[0])
+linuxRoot = os.path.dirname(os.path.dirname(script))
+bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
+
+argParser = argparse.ArgumentParser(description="""
+Parse eBPF header file and generate documentation for eBPF helper functions.
+The RST-formatted output produced can be turned into a manual page with the
+rst2man utility.
+""")
+if (os.path.isfile(bpfh)):
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
+                           default=bpfh)
+else:
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+args = argParser.parse_args()
+
+# Parse file.
+headerParser = HeaderParser(args.filename)
+headerParser.run()
+
+# Print formatted output to standard output.
+printer = PrinterRST(headerParser.helpers)
+printer.print_all()
-- 
cgit v1.2.3-55-g7522


From ad4a522349e5c4fa6af63df376256749dc489fb8 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:53 +0100
Subject: bpf: add documentation for eBPF helpers (01-11)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_map_lookup_elem()
- bpf_map_update_elem()
- bpf_map_delete_elem()
- bpf_probe_read()
- bpf_ktime_get_ns()
- bpf_trace_printk()
- bpf_skb_store_bytes()
- bpf_l3_csum_replace()
- bpf_l4_csum_replace()
- bpf_tail_call()
- bpf_clone_redirect()

v4:
- bpf_map_lookup_elem(): Add "const" qualifier for key.
- bpf_map_update_elem(): Add "const" qualifier for key and value.
- bpf_map_lookup_elem(): Add "const" qualifier for key.
- bpf_skb_store_bytes(): Clarify comment about invalidated verifier
  checks.
- bpf_l3_csum_replace(): Mention L3 instead of just IP, and add a note
  about bpf_csum_diff().
- bpf_l4_csum_replace(): Mention L4 instead of just TCP/UDP, and add a
  note about bpf_csum_diff().
- bpf_tail_call(): Bring minor edits to description.
- bpf_clone_redirect(): Add a note about the relation with
  bpf_redirect(). Also clarify comment about invalidated verifier
  checks.

v3:
- bpf_map_lookup_elem(): Fix description of restrictions for flags
  related to the existence of the entry.
- bpf_trace_printk(): State that trace_pipe can be configured. Fix
  return value in case an unknown format specifier is met. Add a note on
  kernel log notice when the helper is used. Edit example.
- bpf_tail_call(): Improve comment on stack inheritance.
- bpf_clone_redirect(): Improve description of BPF_F_INGRESS flag.

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd43fc98c982..96925e949a24 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -393,6 +393,236 @@ union bpf_attr {
  * intentional, removing them would break paragraphs for rst2man.
  *
  * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 		**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY**  (all
+ * 		elements always exist), the helper would return an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as an eBPF helpers, the total number of arguments is
+ * 		limited to five).
+ *
+ * 		Each time the helper is called, it appends a line to the trace.
+ * 		The format of the trace is customizable, and the exact output
+ * 		one will get depends on the options set in
+ * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*README* file under the same directory). However, it usually
+ * 		defaults to something like:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``<formatted msg>`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will return **-EINVAL** (but print nothing) if it
+ * 		encounters an unknown specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For this reason, a notice
+ * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		states that the helper should not be used "for production use"
+ * 		the first time this helper is used (or more precisely, when
+ * 		**trace_printk**\ () buffers are allocated). For passing values
+ * 		to user space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 		associated to *skb*. Computation is incremental, so the helper
+ * 		must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored in *size*.
+ * 		Alternatively, it is possible to store the difference between
+ * 		the previous and the new values of the header field in *to*, by
+ * 		setting *from* and *size* to 0. For both methods, *offset*
+ * 		indicates the location of the IP checksum within the packet.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 		packet associated to *skb*. Computation is incremental, so the
+ * 		helper must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored on the lowest
+ * 		four bits of *flags*. Alternatively, it is possible to store
+ * 		the difference between the previous and the new values of the
+ * 		header field in *to*, by setting *from* and the four lowest
+ * 		bits of *flags* to 0. For both methods, *offset* indicates the
+ * 		location of the IP checksum within the packet. In addition to
+ * 		the size of the field, *flags* can be added (bitwise OR) actual
+ * 		flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 		untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 		for updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 		the checksum is to be computed against a pseudo-header.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The same stack
+ * 		frame is used (but values on stack and in registers for the
+ * 		caller are not accessible to the callee). This mechanism allows
+ * 		for program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never returns to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its subsequent instructions. A call can fail if the
+ * 		destination program for the jump does not exist (i.e. *index*
+ * 		is superior to the number of entries in *prog_array_map*), or
+ * 		if the maximum number of tail calls has been reached for this
+ * 		chain of programs. This limit is defined in the kernel by the
+ * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 		which is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. Both ingress and egress
+ * 		interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 		value in *flags* is used to make the distinction (ingress path
+ * 		is selected if the flag is present, egress path otherwise).
+ * 		This is the only flag supported for now.
+ *
+ * 		In comparison with **bpf_redirect**\ () helper,
+ * 		**bpf_clone_redirect**\ () has the associated cost of
+ * 		duplicating the packet buffer, but this can be executed out of
+ * 		the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 		efficient, but it is handled through an action code where the
+ * 		redirection happens only after the eBPF program has returned.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-55-g7522


From c456dec4d2656b70de7371e3a6ae60b7ec7ab74f Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:54 +0100
Subject: bpf: add documentation for eBPF helpers (12-22)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_get_current_pid_tgid()
- bpf_get_current_uid_gid()
- bpf_get_current_comm()
- bpf_skb_vlan_push()
- bpf_skb_vlan_pop()
- bpf_skb_get_tunnel_key()
- bpf_skb_set_tunnel_key()
- bpf_redirect()
- bpf_perf_event_output()
- bpf_get_stackid()
- bpf_get_current_task()

v4:
- bpf_redirect(): Fix typo: "XDP_ABORT" changed to "XDP_ABORTED". Add
  note on bpf_redirect_map() providing better performance. Replace "Save
  for" with "Except for".
- bpf_skb_vlan_push(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_vlan_pop(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_get_tunnel_key(): Add notes on tunnel_id, "collect metadata"
  mode, and example tunneling protocols with which it can be used.
- bpf_skb_set_tunnel_key(): Add a reference to the description of
  bpf_skb_get_tunnel_key().
- bpf_perf_event_output(): Specify that, and for what purpose, the
  helper can be used with programs attached to TC and XDP.

v3:
- bpf_skb_get_tunnel_key(): Change and improve description and example.
- bpf_redirect(): Improve description of BPF_F_INGRESS flag.
- bpf_perf_event_output(): Fix first sentence of description. Delete
  wrong statement on context being evaluated as a struct pt_reg. Remove
  the long yet incomplete example.
- bpf_get_stackid(): Add a note about PERF_MAX_STACK_DEPTH being
  configurable.

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 254 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 96925e949a24..465ea273729a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -623,6 +623,260 @@ union bpf_attr {
  * 		direct packet access.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of **size**, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		The **struct bpf_tunnel_key** is an object that generalizes the
+ * 		principal parameters used by various tunneling protocols into a
+ * 		single struct. This way, it can be used to easily make a
+ * 		decision based on the contents of the encapsulation header,
+ * 		"summarized" in this struct. In particular, it holds the IP
+ * 		address of the remote end (IPv4 or IPv6, depending on the case)
+ * 		in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 		this struct exposes the *key*\ **->tunnel_id**, which is
+ * 		generally mapped to a VNI (Virtual Network Identifier), making
+ * 		it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 		() helper.
+ *
+ * 		Let's imagine that the following code is part of a program
+ * 		attached to the TC ingress interface, on one end of a GRE
+ * 		tunnel, and is supposed to filter out all messages coming from
+ * 		remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 		::
+ *
+ * 			int ret;
+ * 			struct bpf_tunnel_key key = {};
+ * 			
+ * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			if (ret < 0)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			if (key.remote_ipv4 != 0x0a000001)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			return TC_ACT_OK;		// accept packet
+ *
+ * 		This interface can also be used with all encapsulation devices
+ * 		that can operate in "collect metadata" mode: instead of having
+ * 		one network device per specific configuration, the "collect
+ * 		metadata" mode only requires a single device where the
+ * 		configuration can be extracted from this helper.
+ *
+ * 		This can be used together with various tunnels such as VXLan,
+ * 		Geneve, GRE or IP in IP (IPIP).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb.* The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			     populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		Except for XDP, both ingress and egress interfaces can be used
+ * 		for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 		to make the distinction (ingress path is selected if the flag
+ * 		is present, egress path otherwise). Currently, XDP only
+ * 		supports redirection to the egress interface, and accepts no
+ * 		flag at all.
+ *
+ * 		The same effect can be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which requires specific maps to be
+ * 		used but offers better performance.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORTED** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ *
+ * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* needs also be passed to the
+ * 		helper.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*).
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suitable for streaming data from eBPF
+ * 		programs.
+ *
+ * 		Note that this helper is not restricted to tracing use cases
+ * 		and can be used with programs attached to TC or XDP as well,
+ * 		where it allows for passing data to user space listeners. Data
+ * 		can be:
+ *
+ * 		* Only custom structs,
+ * 		* Only the packet payload, or
+ * 		* A combination of both.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ *
+ * u64 bpf_get_current_task(void)
+ * 	Return
+ * 		A pointer to the current task struct.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-55-g7522


From 1fdd08bedc56d2a927c5f230e2d63f17ca1068f1 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:55 +0100
Subject: bpf: add documentation for eBPF helpers (23-32)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_prandom_u32()
- bpf_get_smp_processor_id()
- bpf_get_cgroup_classid()
- bpf_get_route_realm()
- bpf_skb_load_bytes()
- bpf_csum_diff()
- bpf_skb_get_tunnel_opt()
- bpf_skb_set_tunnel_opt()
- bpf_skb_change_proto()
- bpf_skb_change_type()

v4:
- bpf_get_prandom_u32(): Warn that the prng is not cryptographically
  secure.
- bpf_get_smp_processor_id(): Fix a typo (case).
- bpf_get_cgroup_classid(): Clarify description. Add notes on the helper
  being limited to cgroup v1, and to egress path.
- bpf_get_route_realm(): Add comparison with bpf_get_cgroup_classid().
  Add a note about usage with TC and advantage of clsact. Fix a typo in
  return value ("sdb" instead of "skb").
- bpf_skb_load_bytes(): Make explicit loading large data loads it to the
  eBPF stack.
- bpf_csum_diff(): Add a note on seed that can be cascaded. Link to
  bpf_l3|l4_csum_replace().
- bpf_skb_get_tunnel_opt(): Add a note about usage with "collect
  metadata" mode, and example of this with Geneve.
- bpf_skb_set_tunnel_opt(): Add a link to bpf_skb_get_tunnel_opt()
  description.
- bpf_skb_change_proto(): Mention that the main use case is NAT64.
  Clarify comment about invalidated verifier checks.

v3:
- bpf_get_prandom_u32(): Fix helper name :(. Add description, including
  a note on the internal random state.
- bpf_get_smp_processor_id(): Add description, including a note on the
  processor id remaining stable during program run.
- bpf_get_cgroup_classid(): State that CONFIG_CGROUP_NET_CLASSID is
  required to use the helper. Add a reference to related documentation.
  State that placing a task in net_cls controller disables cgroup-bpf.
- bpf_get_route_realm(): State that CONFIG_CGROUP_NET_CLASSID is
  required to use this helper.
- bpf_skb_load_bytes(): Fix comment on current use cases for the helper.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 197 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 465ea273729a..7352abf72f50 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -495,6 +495,27 @@ union bpf_attr {
  * 		The number of bytes written to the buffer, or a negative error
  * 		in case of failure.
  *
+ * u32 bpf_get_prandom_u32(void)
+ * 	Description
+ * 		Get a pseudo-random number.
+ *
+ * 		From a security point of view, this helper uses its own
+ * 		pseudo-random internal state, and cannot be used to infer the
+ * 		seed of other random functions in the kernel. However, it is
+ * 		essential to note that the generator used by the helper is not
+ * 		cryptographically secure.
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Description
+ * 		Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 		all programs run with preemption disabled, which means that the
+ * 		SMP processor id is stable during all the execution of the
+ * 		program.
+ * 	Return
+ * 		The SMP id of the processor running the program.
+ *
  * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
  * 	Description
  * 		Store *len* bytes from address *from* into the packet
@@ -647,6 +668,32 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the net_cls
+ * 		cgroup to which *skb* belongs.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress.
+ *
+ * 		The net_cls cgroup provides an interface to tag network packets
+ * 		based on a user-provided identifier for all traffic coming from
+ * 		the tasks belonging to the related cgroup. See also the related
+ * 		kernel documentation, available from the Linux sources in file
+ * 		*Documentation/cgroup-v1/net_cls.txt*.
+ *
+ * 		The Linux kernel has two versions for cgroups: there are
+ * 		cgroups v1 and cgroups v2. Both are available to users, who can
+ * 		use a mixture of them, but note that the net_cls cgroup is for
+ * 		cgroup v1 only. This makes it incompatible with BPF programs
+ * 		run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 		only hold data for one version of cgroups at a time).
+ *
+ * 		This helper is only available is the kernel was compiled with
+ * 		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 		"**y**" or to "**m**".
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ *
  * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
  * 	Description
  * 		Push a *vlan_tci* (VLAN tag control information) of protocol
@@ -786,6 +833,30 @@ union bpf_attr {
  * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
  * 		error.
  *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm or the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*. The
+ * 		indentifier retrieved is a user-provided tag, similar to the
+ * 		one used with the net_cls cgroup (see description for
+ * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 		held by a route (a destination entry), not by a task.
+ *
+ * 		Retrieving this identifier works with the clsact TC egress hook
+ * 		(see also **tc-bpf(8)**), or alternatively on conventional
+ * 		classful egress qdiscs, but not on TC ingress path. In case of
+ * 		clsact TC egress hook, this has the advantage that, internally,
+ * 		the destination entry has not been dropped yet in the transmit
+ * 		path. Therefore, the destination entry does not need to be
+ * 		artificially held via **netif_keep_dst**\ () for a classful
+ * 		qdisc until the *skb* is freed.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ * 	Return
+ * 		The realm of the route for the packet associated to *skb*, or 0
+ * 		if none was found.
+ *
  * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  * 	Description
  * 		Write raw *data* blob into a special BPF perf event held by
@@ -831,6 +902,23 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, usage of this helper has mostly been replaced
+ * 		by "direct packet access", enabling packet data to be
+ * 		manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 		pointing respectively to the first byte of packet data and to
+ * 		the byte after the last byte of packet data. However, it
+ * 		remains useful if one wishes to read large quantities of data
+ * 		at once from a packet into the eBPF stack.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
  * 	Description
  * 		Walk a user or a kernel stack and return its id. To achieve
@@ -874,6 +962,115 @@ union bpf_attr {
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
  *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value
+ * 		(this can be cascaded, the seed may come from a previous call
+ * 		to the helper).
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ *
+ * 		This helper can be used in combination with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 		which one can feed in the difference computed with
+ * 		**bpf_csum_diff**\ ().
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ *
+ * 		This helper can be used with encapsulation devices that can
+ * 		operate in "collect metadata" mode (please refer to the related
+ * 		note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 		more details). A particular example where this can be used is
+ * 		in combination with the Geneve encapsulation protocol, where it
+ * 		allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper)
+ * 		and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 		the eBPF program. This allows for full customization of these
+ * 		headers.
+ * 	Return
+ * 		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		(). The main case for this helper is to perform NAT64
+ * 		operations out of an eBPF program.
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * u64 bpf_get_current_task(void)
  * 	Return
  * 		A pointer to the current task struct.
-- 
cgit v1.2.3-55-g7522


From fa15601ab31e5a90bb2a7dbdb43083470a8e787b Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:56 +0100
Subject: bpf: add documentation for eBPF helpers (33-41)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_hash_recalc()
- bpf_skb_change_tail()
- bpf_skb_pull_data()
- bpf_csum_update()
- bpf_set_hash_invalid()
- bpf_get_numa_node_id()
- bpf_set_hash()
- bpf_skb_adjust_room()
- bpf_xdp_adjust_meta()

v4:
- bpf_skb_change_tail(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_pull_data(): Clarify the motivation for using this helper or
  bpf_skb_load_bytes(), on non-linear buffers. Fix RST formatting for
  *skb*. Clarify comment about invalidated verifier checks.
- bpf_csum_update(): Fix description of checksum (entire packet, not IP
  checksum). Fix a typo: "header" instead of "helper".
- bpf_set_hash_invalid(): Mention bpf_get_hash_recalc().
- bpf_get_numa_node_id(): State that the helper is not restricted to
  programs attached to sockets.
- bpf_skb_adjust_room(): Clarify comment about invalidated verifier
  checks.
- bpf_xdp_adjust_meta(): Clarify comment about invalidated verifier
  checks.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 164 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7352abf72f50..b9c28f63f511 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1071,9 +1071,173 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		prototype with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
+ *
  * u64 bpf_get_current_task(void)
  * 	Return
  * 		A pointer to the current task struct.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 		susceptible to fail if offsets are invalid, or if the requested
+ * 		data is in non-linear parts of the *skb*. On failure the
+ * 		program can just bail out, or in the case of a non-linear
+ * 		buffer, use a helper to make the data available. The
+ * 		**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 		the data. Another one consists in using **bpf_skb_pull_data**
+ * 		to pull in once the non-linear parts, then retesting and
+ * 		eventually access the data.
+ *
+ * 		At the same time, this also makes sure the *skb* is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the *skb* from
+ * 		the very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver has supplied a checksum for the entire packet into that
+ * 		field. Return an error otherwise. This helper is intended to be
+ * 		used in combination with **bpf_csum_diff**\ (), in particular
+ * 		when the checksum needs to be updated after data has been
+ * 		written into the packet through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 		but the helper is also available to other eBPF program types,
+ * 		similarly to **bpf_get_smp_processor_id**\ ().
+ * 	Return
+ * 		The id of current NUMA node.
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-55-g7522


From c6b5fb8690faccf770d2f344094db1a8e482148f Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:57 +0100
Subject: bpf: add documentation for eBPF helpers (42-50)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helper from Kaixu:
- bpf_perf_event_read()

Helpers from Martin:
- bpf_skb_under_cgroup()
- bpf_xdp_adjust_head()

Helpers from Sargun:
- bpf_probe_write_user()
- bpf_current_task_under_cgroup()

Helper from Thomas:
- bpf_skb_change_head()

Helper from Gianluca:
- bpf_probe_read_str()

Helpers from Chenbo:
- bpf_get_socket_cookie()
- bpf_get_socket_uid()

v4:
- bpf_perf_event_read(): State that bpf_perf_event_read_value() should
  be preferred over this helper.
- bpf_skb_change_head(): Clarify comment about invalidated verifier
  checks.
- bpf_xdp_adjust_head(): Clarify comment about invalidated verifier
  checks.
- bpf_probe_write_user(): Add that dst must be a valid user space
  address.
- bpf_get_socket_cookie(): Improve description by making clearer that
  the cockie belongs to the socket, and state that it remains stable for
  the life of the socket.

v3:
- bpf_perf_event_read(): Fix time of selection for perf event type in
  description. Remove occurences of "cores" to avoid confusion with
  "CPU".

Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Gianluca Borello <g.borello@gmail.com>
Cc: Chenbo Feng <fengc@google.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
[for bpf_skb_under_cgroup(), bpf_xdp_adjust_head()]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b9c28f63f511..0d4b55e85e07 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -810,6 +810,35 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 		the perf event counter is selected when *map* is updated with
+ * 		perf event file descriptors. The *map* is an array whose size
+ * 		is the number of available CPUs, and each cell contains a value
+ * 		relative to one CPU. The value to retrieve is indicated by
+ * 		*flags*, that contains the index of the CPU to look up, masked
+ * 		with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ *
+ * 		Also, be aware that the newer helper
+ * 		**bpf_perf_event_read_value**\ () is recommended over
+ * 		**bpf_perf_event_read*\ () in general. The latter has some ABI
+ * 		quirks where error and counter value are used as a return code
+ * 		(which is wrong to do since ranges may overlap). This issue is
+ * 		fixed with bpf_perf_event_read_value(), which at the same time
+ * 		provides more features over the **bpf_perf_event_read**\ ()
+ * 		interface. Please refer to the description of
+ * 		**bpf_perf_event_read_value**\ () for details.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ *
  * int bpf_redirect(u32 ifindex, u64 flags)
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*.
@@ -1071,6 +1100,17 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ *
  * u32 bpf_get_hash_recalc(struct sk_buff *skb)
  * 	Description
  * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
@@ -1091,6 +1131,37 @@ union bpf_attr {
  * 	Return
  * 		A pointer to the current task struct.
  *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context, and *dst* must be a valid user space address.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run is the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* task belongs to the cgroup2.
+ * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ *
  * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
  * 	Description
  * 		Resize (trim or grow) the packet associated to *skb* to the
@@ -1182,6 +1253,107 @@ union bpf_attr {
  * 	Return
  * 		The id of current NUMA node.
  *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 		retrieve the cookie (generated by the kernel) of this socket.
+ * 		If no cookie has been set yet, generate a new cookie. Once
+ * 		generated, the socket cookie remains stable for the life of the
+ * 		socket. This helper can be useful for monitoring per socket
+ * 		networking traffic statistics as it provides a unique socket
+ * 		identifier per namespace.
+ * 	Return
+ * 		A 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ *
  * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
  * 	Description
  * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
-- 
cgit v1.2.3-55-g7522


From 7aa79a869dd6d3aa345ea8ffd54085c980f5b1c3 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:58 +0100
Subject: bpf: add documentation for eBPF helpers (51-57)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helpers from Lawrence:
- bpf_setsockopt()
- bpf_getsockopt()
- bpf_sock_ops_cb_flags_set()

Helpers from Yonghong:
- bpf_perf_event_read_value()
- bpf_perf_prog_read_value()

Helper from Josef:
- bpf_override_return()

Helper from Andrey:
- bpf_bind()

v4:
- bpf_perf_event_read_value(): State that this helper should be
  preferred over bpf_perf_event_read().

v3:
- bpf_perf_event_read_value(): Fix time of selection for perf event type
  in description. Remove occurences of "cores" to avoid confusion with
  "CPU".
- bpf_bind(): Remove last paragraph of description, which was off topic.

Cc: Lawrence Brakmo <brakmo@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Yonghong Song <yhs@fb.com>
[for bpf_perf_event_read_value(), bpf_perf_prog_read_value()]
Acked-by: Andrey Ignatov <rdna@fb.com>
[for bpf_bind()]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 180 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d4b55e85e07..4aaee0169421 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1361,6 +1361,28 @@ union bpf_attr {
  * 	Return
  * 		0
  *
+ * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
@@ -1410,6 +1432,164 @@ union bpf_attr {
  * 		direct packet access.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 		counter is selected when *map* is updated with perf event file
+ * 		descriptors. The *map* is an array whose size is the number of
+ * 		available CPUs, and each cell contains a value relative to one
+ * 		CPU. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the CPU to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied. In general, **bpf_perf_event_read_value**\ () is
+ * 		recommended over **bpf_perf_event_read**\ (), which has some
+ * 		ABI issues and provides fewer functionalities.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For en eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*opval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock_ops* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ *
+ * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-55-g7522


From ab12704099c2001915843a8b7885567ee6f1800e Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:16:59 +0100
Subject: bpf: add documentation for eBPF helpers (58-64)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by John:

- bpf_redirect_map()
- bpf_sk_redirect_map()
- bpf_sock_map_update()
- bpf_msg_redirect_map()
- bpf_msg_apply_bytes()
- bpf_msg_cork_bytes()
- bpf_msg_pull_data()

v4:
- bpf_redirect_map(): Fix typos: "XDP_ABORT" changed to "XDP_ABORTED",
  "his" to "this". Also add a paragraph on performance improvement over
  bpf_redirect() helper.

v3:
- bpf_sk_redirect_map(): Improve description of BPF_F_INGRESS flag.
- bpf_msg_redirect_map(): Improve description of BPF_F_INGRESS flag.
- bpf_redirect_map(): Fix note on CPU redirection, not fully implemented
  for generic XDP but supported on native XDP.
- bpf_msg_pull_data(): Clarify comment about invalidated verifier
  checks.

Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 147 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4aaee0169421..27940a36501a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1404,6 +1404,56 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, this *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is only implemented for native XDP (with driver
+ * 		support) as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		When used to redirect packets to net devices, this helper
+ * 		provides a high performance increase over **bpf_redirect**\ ().
+ * 		This is due to various implementation details of the underlying
+ * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
+ * 		() tries to send packet as a "bulk" to the device.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
  * 	Description
  * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
@@ -1574,6 +1624,103 @@ union bpf_attr {
  * 		be set is returned (which comes down to 0 if all bits were set
  * 		as required).
  *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program to be called again until *bytes* have
+ * 		been accumulated.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
  * 	Description
  * 		Bind the socket associated to *ctx* to the address pointed by
-- 
cgit v1.2.3-55-g7522


From 2d020dd771762d96d158a9deed41bc6b7480a8fe Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Wed, 25 Apr 2018 18:17:00 +0100
Subject: bpf: add documentation for eBPF helpers (65-66)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helper from Nikita:
- bpf_xdp_adjust_tail()

Helper from Eyal:
- bpf_skb_get_xfrm_state()

v4:
- New patch (helpers did not exist yet for previous versions).

Cc: Nikita V. Shirokov <tehnerd@tehnerd.com>
Cc: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 27940a36501a..da77a9388947 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1737,6 +1737,36 @@ union bpf_attr {
  * 		must be set to zero.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 		only possible to shrink the packet as of this writing,
+ * 		therefore *delta* must be a negative integer.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
+ * 	Description
+ * 		Retrieve the XFRM state (IP transform framework, see also
+ * 		**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 		The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 		pointed by *xfrm_state* and of length *size*.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_XFRM** configuration option.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-55-g7522


From 3e5cf362c34b14c8d01d19d4b821fb35e1779862 Mon Sep 17 00:00:00 2001
From: Jon Maloy
Date: Wed, 25 Apr 2018 19:29:36 +0200
Subject: tipc: introduce ioctl for fetching node identity

After the introduction of a 128-bit node identity it may be difficult
for a user to correlate between this identity and the generated node
hash address.

We now try to make this easier by introducing a new ioctl() call for
fetching a node identity by using the hash value as key. This will
be particularly useful when we extend some of the commands in the
'tipc' tool, but we also expect regular user applications to need
this feature.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h | 12 ++++++++----
 net/tipc/node.c           | 21 +++++++++++++++++++++
 net/tipc/node.h           |  1 +
 net/tipc/socket.c         | 13 +++++++++++--
 4 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index bf6d28677cfe..6b2fd4d9655f 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -209,16 +209,16 @@ struct tipc_group_req {
  * The string formatting for each name element is:
  * media: media
  * interface: media:interface name
- * link: Z.C.N:interface-Z.C.N:interface
- *
+ * link: node:interface-node:interface
  */
-
+#define TIPC_NODEID_LEN         16
 #define TIPC_MAX_MEDIA_NAME	16
 #define TIPC_MAX_IF_NAME	16
 #define TIPC_MAX_BEARER_NAME	32
 #define TIPC_MAX_LINK_NAME	68
 
-#define SIOCGETLINKNAME		SIOCPROTOPRIVATE
+#define SIOCGETLINKNAME        SIOCPROTOPRIVATE
+#define SIOCGETNODEID          (SIOCPROTOPRIVATE + 1)
 
 struct tipc_sioc_ln_req {
 	__u32 peer;
@@ -226,6 +226,10 @@ struct tipc_sioc_ln_req {
 	char linkname[TIPC_MAX_LINK_NAME];
 };
 
+struct tipc_sioc_nodeid_req {
+	__u32 peer;
+	char node_id[TIPC_NODEID_LEN];
+};
 
 /* The macros and functions below are deprecated:
  */
diff --git a/net/tipc/node.c b/net/tipc/node.c
index e9c52e1416c5..81e6dd0cd1ca 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -195,6 +195,27 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
 	return mtu;
 }
 
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id)
+{
+	u8 *own_id = tipc_own_id(net);
+	struct tipc_node *n;
+
+	if (!own_id)
+		return true;
+
+	if (addr == tipc_own_addr(net)) {
+		memcpy(id, own_id, TIPC_NODEID_LEN);
+		return true;
+	}
+	n = tipc_node_find(net, addr);
+	if (!n)
+		return false;
+
+	memcpy(id, &n->peer_id, TIPC_NODEID_LEN);
+	tipc_node_put(n);
+	return true;
+}
+
 u16 tipc_node_get_capabilities(struct net *net, u32 addr)
 {
 	struct tipc_node *n;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index bb271a37c93f..846c8f240872 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,6 +60,7 @@ enum {
 #define INVALID_BEARER_ID -1
 
 void tipc_node_stop(struct net *net);
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
 u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
 void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
 			  struct tipc_bearer *bearer,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 252a52ae0893..c4992002fd68 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2973,7 +2973,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
 
 static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
-	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sock->sk);
+	struct tipc_sioc_nodeid_req nr = {0};
 	struct tipc_sioc_ln_req lnr;
 	void __user *argp = (void __user *)arg;
 
@@ -2981,7 +2982,7 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCGETLINKNAME:
 		if (copy_from_user(&lnr, argp, sizeof(lnr)))
 			return -EFAULT;
-		if (!tipc_node_get_linkname(sock_net(sk),
+		if (!tipc_node_get_linkname(net,
 					    lnr.bearer_id & 0xffff, lnr.peer,
 					    lnr.linkname, TIPC_MAX_LINK_NAME)) {
 			if (copy_to_user(argp, &lnr, sizeof(lnr)))
@@ -2989,6 +2990,14 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			return 0;
 		}
 		return -EADDRNOTAVAIL;
+	case SIOCGETNODEID:
+		if (copy_from_user(&nr, argp, sizeof(nr)))
+			return -EFAULT;
+		if (!tipc_node_get_id(net, nr.peer, nr.node_id))
+			return -EADDRNOTAVAIL;
+		if (copy_to_user(argp, &nr, sizeof(nr)))
+			return -EFAULT;
+		return 0;
 	default:
 		return -ENOIOCTLCMD;
 	}
-- 
cgit v1.2.3-55-g7522


From c195651e565ae7f41a68acb7d4aa7390ad215de1 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Sat, 28 Apr 2018 22:28:08 -0700
Subject: bpf: add bpf_get_stack helper

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++++++--
 kernel/bpf/core.c        |  5 ++++
 kernel/bpf/stackmap.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    | 19 ++++++++++++++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++-
 7 files changed, 183 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc61ed99..c553f6f9c6b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b2308174..64899c04c1a6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				has_callchain_buf:1; /* callchain buffer allocated? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a9388947..1afb606a18b9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1767,6 +1767,40 @@ union bpf_attr {
  * 		**CONFIG_XFRM** configuration option.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *ctx*, which is a pointer
+ *		to the context on which the tracing program is executed.
+ *		To store the stacktrace, the bpf program provides *buf* with
+ *		a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *	::
+ *
+ *		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		a non-negative value equal to or less than size on success, or
+ * 		a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1835,7 +1869,8 @@ union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1869,11 +1904,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..9349a5db3cf2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
+#include <linux/perf_event.h>
 
 #include <asm/unaligned.h>
 
@@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+	if (aux->prog->has_callchain_buf)
+		put_callchain_buffers();
+#endif
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1679f0..3ba102b41512 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	int err = -EINVAL;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+	if (kernel && user_build_id)
+		goto clear;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		goto clear;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		goto err_fault;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr < skip)
+		goto err_fault;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	if (size > copy_len)
+		memset(buf + copy_len, 0, size - copy_len);
+	return copy_len;
+
+err_fault:
+	err = -EFAULT;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596aebd3..253f6bdb9117 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
 #include <linux/stringify.h>
 #include <linux/bsearch.h>
 #include <linux/sort.h>
+#include <linux/perf_event.h>
 
 #include "disasm.h"
 
@@ -2450,6 +2451,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+		const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+		err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+		err = -ENOTSUPP;
+		err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+		if (err) {
+			verbose(env, err_str, func_id_name(func_id), func_id);
+			return err;
+		}
+
+		env->prog->has_callchain_buf = true;
+	}
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 56ba0f2a01db..46d866e9937c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
-- 
cgit v1.2.3-55-g7522


From a3ef8e9a4d7cc26d7528d50d10c5720b523b07c9 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Sat, 28 Apr 2018 16:06:19 -0700
Subject: bpf: Fix helpers ctx struct types in uapi doc

Helpers may operate on two types of ctx structures: user visible ones
(e.g. `struct bpf_sock_ops`) when used in user programs, and kernel ones
(e.g. `struct bpf_sock_ops_kern`) in kernel implementation.

UAPI documentation must refer to only user visible structures.

The patch replaces references to `_kern` structures in BPF helpers
description by corresponding user visible structures.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1afb606a18b9..23b334bba1a6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1361,7 +1361,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
  * 	Description
  * 		Emulate a call to **setsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1435,7 +1435,7 @@ union bpf_attr {
  * 	Return
  * 		**SK_PASS** on success, or **SK_DROP** on error.
  *
- * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
  * 	Description
  * 		Add an entry to, or update a *map* referencing sockets. The
  * 		*skops* is used as a new value for the entry associated to
@@ -1533,7 +1533,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
  * 	Description
  * 		For en eBPF program attached to a perf event, retrieve the
  * 		value of the event counter associated to *ctx* and store it in
@@ -1544,7 +1544,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
  * 	Description
  * 		Emulate a call to **getsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1588,7 +1588,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
  * 	Description
  * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
  * 		for the full TCP socket associated to *bpf_sock_ops* to
@@ -1721,7 +1721,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
  * 	Description
  * 		Bind the socket associated to *ctx* to the address pointed by
  * 		*addr*, of length *addr_len*. This allows for making outgoing
-- 
cgit v1.2.3-55-g7522


From 05255b823a6173525587f29c4e8f1ca33fd7677d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 27 Apr 2018 08:58:08 -0700
Subject: tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive

When adding tcp mmap() implementation, I forgot that socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.

Since we can not lock the socket in tcp mmap() handler we have to
split the operation in two phases.

1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
  This operation does not involve any TCP locking.

2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
 the transfert of pages from skbs to one VMA.
  This operation only uses down_read(&current->mm->mmap_sem) after
  holding TCP lock, thus solving the lockdep issue.

This new implementation was suggested by Andy Lutomirski with great details.

Benefits are :

- Better scalability, in case multiple threads reuse VMAS
   (without mmap()/munmap() calls) since mmap_sem wont be write locked.

- Better error recovery.
   The previous mmap() model had to provide the expected size of the
   mapping. If for some reason one part could not be mapped (partial MSS),
   the whole operation had to be aborted.
   With the tcp_zerocopy_receive struct, kernel can report how
   many bytes were successfuly mapped, and how many bytes should
   be read to skip the problematic sequence.

- No more memory allocation to hold an array of page pointers.
  16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/

- skbs are freed while mmap_sem has been released

Following patch makes the change in tcp_mmap tool to demonstrate
one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...)

Note that memcg might require additional changes.

Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |   8 ++
 net/ipv4/af_inet.c       |   2 +
 net/ipv4/tcp.c           | 194 +++++++++++++++++++++++++----------------------
 net/ipv6/af_inet6.c      |   2 +
 4 files changed, 116 insertions(+), 90 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 379b08700a54..e9e8373b34b9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -122,6 +122,7 @@ enum {
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE	35
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+
+struct tcp_zerocopy_receive {
+	__u64 address;		/* in: address of mapping */
+	__u32 length;		/* in/out: number of bytes to map/mapped */
+	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
+};
 #endif /* _UAPI_LINUX_TCP_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3ebf599cebae..b403499fdabe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dfd090ea54ad..4028ddd14dd5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,118 +1726,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
 int tcp_mmap(struct file *file, struct socket *sock,
 	     struct vm_area_struct *vma)
 {
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned int nr_pages = size >> PAGE_SHIFT;
-	struct page **pages_array = NULL;
-	u32 seq, len, offset, nr = 0;
-	struct sock *sk = sock->sk;
-	const skb_frag_t *frags;
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+				struct tcp_zerocopy_receive *zc)
+{
+	unsigned long address = (unsigned long)zc->address;
+	const skb_frag_t *frags = NULL;
+	u32 length = 0, seq, offset;
+	struct vm_area_struct *vma;
+	struct sk_buff *skb = NULL;
 	struct tcp_sock *tp;
-	struct sk_buff *skb;
 	int ret;
 
-	if (vma->vm_pgoff || !nr_pages)
+	if (address & (PAGE_SIZE - 1) || address != zc->address)
 		return -EINVAL;
 
-	if (vma->vm_flags & VM_WRITE)
-		return -EPERM;
-	/* TODO: Maybe the following is not needed if pages are COW */
-	vma->vm_flags &= ~VM_MAYWRITE;
-
-	lock_sock(sk);
-
-	ret = -ENOTCONN;
 	if (sk->sk_state == TCP_LISTEN)
-		goto out;
+		return -ENOTCONN;
 
 	sock_rps_record_flow(sk);
 
-	if (tcp_inq(sk) < size) {
-		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+	down_read(&current->mm->mmap_sem);
+
+	ret = -EINVAL;
+	vma = find_vma(current->mm, address);
+	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
 		goto out;
-	}
+	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
 	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
-	/* Abort if urgent data is in the area */
-	if (unlikely(tp->urg_data)) {
-		u32 urg_offset = tp->urg_seq - seq;
+	zc->length = min_t(u32, zc->length, tcp_inq(sk));
+	zc->length &= ~(PAGE_SIZE - 1);
 
-		ret = -EINVAL;
-		if (urg_offset < size)
-			goto out;
-	}
-	ret = -ENOMEM;
-	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
-				     GFP_KERNEL);
-	if (!pages_array)
-		goto out;
-	skb = tcp_recv_skb(sk, seq, &offset);
-	ret = -EINVAL;
-skb_start:
-	/* We do not support anything not in page frags */
-	offset -= skb_headlen(skb);
-	if ((int)offset < 0)
-		goto out;
-	if (skb_has_frag_list(skb))
-		goto out;
-	len = skb->data_len - offset;
-	frags = skb_shinfo(skb)->frags;
-	while (offset) {
-		if (frags->size > offset)
-			goto out;
-		offset -= frags->size;
-		frags++;
-	}
-	while (nr < nr_pages) {
-		if (len) {
-			if (len < PAGE_SIZE)
-				goto out;
-			if (frags->size != PAGE_SIZE || frags->page_offset)
-				goto out;
-			pages_array[nr++] = skb_frag_page(frags);
-			frags++;
-			len -= PAGE_SIZE;
-			seq += PAGE_SIZE;
-			continue;
+	zap_page_range(vma, address, zc->length);
+
+	zc->recv_skip_hint = 0;
+	ret = 0;
+	while (length + PAGE_SIZE <= zc->length) {
+		if (zc->recv_skip_hint < PAGE_SIZE) {
+			if (skb) {
+				skb = skb->next;
+				offset = seq - TCP_SKB_CB(skb)->seq;
+			} else {
+				skb = tcp_recv_skb(sk, seq, &offset);
+			}
+
+			zc->recv_skip_hint = skb->len - offset;
+			offset -= skb_headlen(skb);
+			if ((int)offset < 0 || skb_has_frag_list(skb))
+				break;
+			frags = skb_shinfo(skb)->frags;
+			while (offset) {
+				if (frags->size > offset)
+					goto out;
+				offset -= frags->size;
+				frags++;
+			}
 		}
-		skb = skb->next;
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		goto skb_start;
-	}
-	/* OK, we have a full set of pages ready to be inserted into vma */
-	for (nr = 0; nr < nr_pages; nr++) {
-		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
-				     pages_array[nr]);
+		if (frags->size != PAGE_SIZE || frags->page_offset)
+			break;
+		ret = vm_insert_page(vma, address + length,
+				     skb_frag_page(frags));
 		if (ret)
-			goto out;
+			break;
+		length += PAGE_SIZE;
+		seq += PAGE_SIZE;
+		zc->recv_skip_hint -= PAGE_SIZE;
+		frags++;
 	}
-	/* operation is complete, we can 'consume' all skbs */
-	tp->copied_seq = seq;
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	tcp_recv_skb(sk, seq, &offset);
-	tcp_cleanup_rbuf(sk, size);
-
-	ret = 0;
 out:
-	release_sock(sk);
-	kvfree(pages_array);
+	up_read(&current->mm->mmap_sem);
+	if (length) {
+		tp->copied_seq = seq;
+		tcp_rcv_space_adjust(sk);
+
+		/* Clean up data we have read: This will do ACK frames. */
+		tcp_recv_skb(sk, seq, &offset);
+		tcp_cleanup_rbuf(sk, length);
+		ret = 0;
+		if (length == zc->length)
+			zc->recv_skip_hint = 0;
+	} else {
+		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+			ret = -EIO;
+	}
+	zc->length = length;
 	return ret;
 }
-EXPORT_SYMBOL(tcp_mmap);
+#endif
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
@@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		}
 		return 0;
 	}
+#ifdef CONFIG_MMU
+	case TCP_ZEROCOPY_RECEIVE: {
+		struct tcp_zerocopy_receive zc;
+		int err;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+		if (len != sizeof(zc))
+			return -EINVAL;
+		if (copy_from_user(&zc, optval, len))
+			return -EFAULT;
+		lock_sock(sk);
+		err = tcp_zerocopy_receive(sk, &zc);
+		release_sock(sk);
+		if (!err && copy_to_user(optval, &zc, len))
+			err = -EFAULT;
+		return err;
+	}
+#endif
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..d0af96e0d109 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
-- 
cgit v1.2.3-55-g7522


From 3bd5a09b529c03bac354c9d48e688ed2aca934fd Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Mon, 30 Apr 2018 11:39:03 +0100
Subject: bpf: fix formatting for bpf_perf_event_read() helper doc

Some edits brought to the last iteration of BPF helper functions
documentation introduced an error with RST formatting. As a result, most
of one paragraph is rendered in bold text when only the name of a helper
should be. Fix it, and fix formatting of another function name in the
same paragraph.

Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23b334bba1a6..530ff6588d8f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -828,12 +828,12 @@ union bpf_attr {
  *
  * 		Also, be aware that the newer helper
  * 		**bpf_perf_event_read_value**\ () is recommended over
- * 		**bpf_perf_event_read*\ () in general. The latter has some ABI
+ * 		**bpf_perf_event_read**\ () in general. The latter has some ABI
  * 		quirks where error and counter value are used as a return code
  * 		(which is wrong to do since ranges may overlap). This issue is
- * 		fixed with bpf_perf_event_read_value(), which at the same time
- * 		provides more features over the **bpf_perf_event_read**\ ()
- * 		interface. Please refer to the description of
+ * 		fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 		time provides more features over the **bpf_perf_event_read**\
+ * 		() interface. Please refer to the description of
  * 		**bpf_perf_event_read_value**\ () for details.
  * 	Return
  * 		The value of the perf event counter read from the map, or a
-- 
cgit v1.2.3-55-g7522


From 79552fbc0f9dc20dc022be7cc48eb3761623fa56 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Mon, 30 Apr 2018 11:39:04 +0100
Subject: bpf: fix formatting for bpf_get_stack() helper doc

Fix formatting (indent) for bpf_get_stack() helper documentation, so
that the doc is rendered correctly with the Python script.

Fixes: c195651e565a ("bpf: add bpf_get_stack helper")
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 54 ++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 530ff6588d8f..8daef7326bb7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1770,33 +1770,33 @@ union bpf_attr {
  *
  * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
  * 	Description
- *		Return a user or a kernel stack in bpf program provided buffer.
- *		To achieve this, the helper needs *ctx*, which is a pointer
- *		to the context on which the tracing program is executed.
- *		To store the stacktrace, the bpf program provides *buf* with
- *		a nonnegative *size*.
- *
- *		The last argument, *flags*, holds the number of stack frames to
- *		skip (from 0 to 255), masked with
- *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
- *		the following flags:
- *
- *		**BPF_F_USER_STACK**
- *			Collect a user space stack instead of a kernel stack.
- *		**BPF_F_USER_BUILD_ID**
- *			Collect buildid+offset instead of ips for user stack,
- *			only valid if **BPF_F_USER_STACK** is also specified.
- *
- *		**bpf_get_stack**\ () can collect up to
- *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
- *		to sufficient large buffer size. Note that
- *		this limit can be controlled with the **sysctl** program, and
- *		that it should be manually increased in order to profile long
- *		user stacks (such as stacks for Java programs). To do so, use:
- *
- *	::
- *
- *		# sysctl kernel.perf_event_max_stack=<new value>
+ * 		Return a user or a kernel stack in bpf program provided buffer.
+ * 		To achieve this, the helper needs *ctx*, which is a pointer
+ * 		to the context on which the tracing program is executed.
+ * 		To store the stacktrace, the bpf program provides *buf* with
+ * 		a nonnegative *size*.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_USER_BUILD_ID**
+ * 			Collect buildid+offset instead of ips for user stack,
+ * 			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 		**bpf_get_stack**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * 		to sufficient large buffer size. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
  *
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
-- 
cgit v1.2.3-55-g7522


From b086ff87251b4a4c147bc3af20369514e9d0d9ad Mon Sep 17 00:00:00 2001
From: Stefan Strogin
Date: Tue, 1 May 2018 01:04:29 +0300
Subject: connector: add parent pid and tgid to coredump and exit events

The intention is to get notified of process failures as soon
as possible, before a possible core dumping (which could be very long)
(e.g. in some process-manager). Coredump and exit process events
are perfect for such use cases (see 2b5faa4c553f "connector: Added
coredumping event to the process connector").

The problem is that for now the process-manager cannot know the parent
of a dying process using connectors. This could be useful if the
process-manager should monitor for failures only children of certain
parents, so we could filter the coredump and exit events by parent
process and/or thread ID.

Add parent pid and tgid to coredump and exit process connectors event
data.

Signed-off-by: Stefan Strogin <sstrogin@cisco.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/connector/cn_proc.c  | 4 ++++
 include/uapi/linux/cn_proc.h | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index a782ce87715c..ed5e42461094 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -262,6 +262,8 @@ void proc_coredump_connector(struct task_struct *task)
 	ev->what = PROC_EVENT_COREDUMP;
 	ev->event_data.coredump.process_pid = task->pid;
 	ev->event_data.coredump.process_tgid = task->tgid;
+	ev->event_data.coredump.parent_pid = task->real_parent->pid;
+	ev->event_data.coredump.parent_tgid = task->real_parent->tgid;
 
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
@@ -288,6 +290,8 @@ void proc_exit_connector(struct task_struct *task)
 	ev->event_data.exit.process_tgid = task->tgid;
 	ev->event_data.exit.exit_code = task->exit_code;
 	ev->event_data.exit.exit_signal = task->exit_signal;
+	ev->event_data.exit.parent_pid = task->real_parent->pid;
+	ev->event_data.exit.parent_tgid = task->real_parent->tgid;
 
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h
index 68ff25414700..db210625cee8 100644
--- a/include/uapi/linux/cn_proc.h
+++ b/include/uapi/linux/cn_proc.h
@@ -116,12 +116,16 @@ struct proc_event {
 		struct coredump_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
+			__kernel_pid_t parent_pid;
+			__kernel_pid_t parent_tgid;
 		} coredump;
 
 		struct exit_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
 			__u32 exit_code, exit_signal;
+			__kernel_pid_t parent_pid;
+			__kernel_pid_t parent_tgid;
 		} exit;
 
 	} event_data;
-- 
cgit v1.2.3-55-g7522


From b75eba76d3d72e2374fac999926dafef2997edd2 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh
Date: Tue, 1 May 2018 15:39:15 -0400
Subject: tcp: send in-queue bytes in cmsg upon read

Applications with many concurrent connections, high variance
in receive queue length and tight memory bounds cannot
allocate worst-case buffer size to drain sockets. Knowing
the size of receive queue length, applications can optimize
how they allocate buffers to read from the socket.

The number of bytes pending on the socket is directly
available through ioctl(FIONREAD/SIOCINQ) and can be
approximated using getsockopt(MEMINFO) (rmem_alloc includes
skb overheads in addition to application data). But, both of
these options add an extra syscall per recvmsg. Moreover,
ioctl(FIONREAD/SIOCINQ) takes the socket lock.

Add the TCP_INQ socket option to TCP. When this socket
option is set, recvmsg() relays the number of bytes available
on the socket for reading to the application via the
TCP_CM_INQ control message.

Calculate the number of bytes after releasing the socket lock
to include the processed backlog, if any. To avoid an extra
branch in the hot path of recvmsg() for this new control
message, move all cmsg processing inside an existing branch for
processing receive timestamps. Since the socket lock is not held
when calculating the size of receive queue, TCP_INQ is a hint.
For example, it can overestimate the queue size by one byte,
if FIN is received.

With this method, applications can start reading from the socket
using a small buffer, and then use larger buffers based on the
remaining data when needed.

V3 change-log:
	As suggested by David Miller, added loads with barrier
	to check whether we have multiple threads calling recvmsg
	in parallel. When that happens we lock the socket to
	calculate inq.
V4 change-log:
	Removed inline from a static function.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  2 +-
 include/uapi/linux/tcp.h |  3 +++
 net/ipv4/tcp.c           | 43 +++++++++++++++++++++++++++++++++++++++----
 3 files changed, 43 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 20585d5c4e1c..807776928cb8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -228,7 +228,7 @@ struct tcp_sock {
 		unused:2;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		unused1	    : 1,
+		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
 		repair      : 1,
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index e9e8373b34b9..29eb659aa77a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -123,6 +123,9 @@ enum {
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
 #define TCP_ZEROCOPY_RECEIVE	35
+#define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */
+
+#define TCP_CM_INQ		TCP_INQ
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4028ddd14dd5..868ed74a76a8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1889,6 +1889,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 	}
 }
 
+static int tcp_inq_hint(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied_seq = READ_ONCE(tp->copied_seq);
+	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+	int inq;
+
+	inq = rcv_nxt - copied_seq;
+	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
+		lock_sock(sk);
+		inq = tp->rcv_nxt - tp->copied_seq;
+		release_sock(sk);
+	}
+	return inq;
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -1905,13 +1921,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	u32 peek_seq;
 	u32 *seq;
 	unsigned long used;
-	int err;
+	int err, inq;
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 	struct scm_timestamping tss;
 	bool has_tss = false;
+	bool has_cmsg;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
@@ -1926,6 +1943,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	if (sk->sk_state == TCP_LISTEN)
 		goto out;
 
+	has_cmsg = tp->recvmsg_inq;
 	timeo = sock_rcvtimeo(sk, nonblock);
 
 	/* Urgent data needs to be handled specially. */
@@ -2112,6 +2130,7 @@ skip_copy:
 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
 			tcp_update_recv_tstamps(skb, &tss);
 			has_tss = true;
+			has_cmsg = true;
 		}
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
@@ -2131,13 +2150,20 @@ skip_copy:
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
 
-	if (has_tss)
-		tcp_recv_timestamp(msg, sk, &tss);
-
 	/* Clean up data we have read: This will do ACK frames. */
 	tcp_cleanup_rbuf(sk, copied);
 
 	release_sock(sk);
+
+	if (has_cmsg) {
+		if (has_tss)
+			tcp_recv_timestamp(msg, sk, &tss);
+		if (tp->recvmsg_inq) {
+			inq = tcp_inq_hint(sk);
+			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+		}
+	}
+
 	return copied;
 
 out:
@@ -3006,6 +3032,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
 		break;
+	case TCP_INQ:
+		if (val > 1 || val < 0)
+			err = -EINVAL;
+		else
+			tp->recvmsg_inq = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3431,6 +3463,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
+	case TCP_INQ:
+		val = tp->recvmsg_inq;
+		break;
 	case TCP_SAVE_SYN:
 		val = tp->save_syn;
 		break;
-- 
cgit v1.2.3-55-g7522


From c0c77d8fb787cfe0c3fca689c2a30d1dad4eaba7 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Wed, 2 May 2018 13:01:23 +0200
Subject: xsk: add user memory registration support sockopt

In this commit the base structure of the AF_XDP address family is set
up. Further, we introduce the abilty register a window of user memory
to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory
window is viewed by an AF_XDP socket as a set of equally large
frames. After a user memory registration all frames are "owned" by the
user application, and not the kernel.

v2: More robust checks on umem creation and unaccount on error.
    Call set_page_dirty_lock on cleanup.
    Simplified xdp_umem_reg.

Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      |  31 ++++++
 include/uapi/linux/if_xdp.h |  34 ++++++
 net/Makefile                |   1 +
 net/xdp/Makefile            |   2 +
 net/xdp/xdp_umem.c          | 245 ++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  45 ++++++++
 net/xdp/xdp_umem_props.h    |  23 +++++
 net/xdp/xsk.c               | 215 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 596 insertions(+)
 create mode 100644 include/net/xdp_sock.h
 create mode 100644 include/uapi/linux/if_xdp.h
 create mode 100644 net/xdp/Makefile
 create mode 100644 net/xdp/xdp_umem.c
 create mode 100644 net/xdp/xdp_umem.h
 create mode 100644 net/xdp/xdp_umem_props.h
 create mode 100644 net/xdp/xsk.c

(limited to 'include/uapi')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
new file mode 100644
index 000000000000..94785f5db13e
--- /dev/null
+++ b/include/net/xdp_sock.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * AF_XDP internal functions
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XDP_SOCK_H
+#define _LINUX_XDP_SOCK_H
+
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+struct xdp_umem;
+
+struct xdp_sock {
+	/* struct sock must be the first member of struct xdp_sock */
+	struct sock sk;
+	struct xdp_umem *umem;
+	/* Protects multiple processes in the control path */
+	struct mutex mutex;
+};
+
+#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
new file mode 100644
index 000000000000..41252135a0fe
--- /dev/null
+++ b/include/uapi/linux/if_xdp.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+ *
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include <linux/types.h>
+
+/* XDP socket options */
+#define XDP_UMEM_REG			3
+
+struct xdp_umem_reg {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 frame_size; /* Frame size */
+	__u32 frame_headroom; /* Frame head room */
+};
+
+#endif /* _LINUX_IF_XDP_H */
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..77aaddedbd29 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -85,3 +85,4 @@ obj-y				+= l3mdev/
 endif
 obj-$(CONFIG_QRTR)		+= qrtr/
 obj-$(CONFIG_NET_NCSI)		+= ncsi/
+obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..a5d736640a0f
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..ec8b3552be44
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+
+#include "xdp_umem.h"
+
+#define XDP_UMEM_MIN_FRAME_SIZE 2048
+
+int xdp_umem_create(struct xdp_umem **umem)
+{
+	*umem = kzalloc(sizeof(**umem), GFP_KERNEL);
+
+	if (!(*umem))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+	unsigned int i;
+
+	if (umem->pgs) {
+		for (i = 0; i < umem->npgs; i++) {
+			struct page *page = umem->pgs[i];
+
+			set_page_dirty_lock(page);
+			put_page(page);
+		}
+
+		kfree(umem->pgs);
+		umem->pgs = NULL;
+	}
+}
+
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+	if (umem->user) {
+		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+		free_uid(umem->user);
+	}
+}
+
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	if (umem->pgs) {
+		xdp_umem_unpin_pages(umem);
+
+		task = get_pid_task(umem->pid, PIDTYPE_PID);
+		put_pid(umem->pid);
+		if (!task)
+			goto out;
+		mm = get_task_mm(task);
+		put_task_struct(task);
+		if (!mm)
+			goto out;
+
+		mmput(mm);
+		umem->pgs = NULL;
+	}
+
+	xdp_umem_unaccount_pages(umem);
+out:
+	kfree(umem);
+}
+
+static void xdp_umem_release_deferred(struct work_struct *work)
+{
+	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
+
+	xdp_umem_release(umem);
+}
+
+void xdp_get_umem(struct xdp_umem *umem)
+{
+	atomic_inc(&umem->users);
+}
+
+void xdp_put_umem(struct xdp_umem *umem)
+{
+	if (!umem)
+		return;
+
+	if (atomic_dec_and_test(&umem->users)) {
+		INIT_WORK(&umem->work, xdp_umem_release_deferred);
+		schedule_work(&umem->work);
+	}
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	down_write(&current->mm->mmap_sem);
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags, &umem->pgs[0], NULL);
+	up_write(&current->mm->mmap_sem);
+
+	if (npgs != umem->npgs) {
+		if (npgs >= 0) {
+			umem->npgs = npgs;
+			err = -ENOMEM;
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+	return 0;
+
+out_pin:
+	xdp_umem_unpin_pages(umem);
+out_pgs:
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+	return err;
+}
+
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+	unsigned long lock_limit, new_npgs, old_npgs;
+
+	if (capable(CAP_IPC_LOCK))
+		return 0;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	umem->user = get_uid(current_user());
+
+	do {
+		old_npgs = atomic_long_read(&umem->user->locked_vm);
+		new_npgs = old_npgs + umem->npgs;
+		if (new_npgs > lock_limit) {
+			free_uid(umem->user);
+			umem->user = NULL;
+			return -ENOBUFS;
+		}
+	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
+				     new_npgs) != old_npgs);
+	return 0;
+}
+
+int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
+	u64 addr = mr->addr, size = mr->len;
+	unsigned int nframes, nfpp;
+	int size_chk, err;
+
+	if (!umem)
+		return -EINVAL;
+
+	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or*
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(frame_size))
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return -EINVAL;
+	}
+
+	if ((addr + size) < addr)
+		return -EINVAL;
+
+	nframes = size / frame_size;
+	if (nframes == 0 || nframes > UINT_MAX)
+		return -EINVAL;
+
+	nfpp = PAGE_SIZE / frame_size;
+	if (nframes < nfpp || nframes % nfpp)
+		return -EINVAL;
+
+	frame_headroom = ALIGN(frame_headroom, 64);
+
+	size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
+	if (size_chk < 0)
+		return -EINVAL;
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = (size_t)size;
+	umem->address = (unsigned long)addr;
+	umem->props.frame_size = frame_size;
+	umem->props.nframes = nframes;
+	umem->frame_headroom = frame_headroom;
+	umem->npgs = size / PAGE_SIZE;
+	umem->pgs = NULL;
+	umem->user = NULL;
+
+	umem->frame_size_log2 = ilog2(frame_size);
+	umem->nfpp_mask = nfpp - 1;
+	umem->nfpplog2 = ilog2(nfpp);
+	atomic_set(&umem->users, 1);
+
+	err = xdp_umem_account_pages(umem);
+	if (err)
+		goto out;
+
+	err = xdp_umem_pin_pages(umem);
+	if (err)
+		goto out_account;
+	return 0;
+
+out_account:
+	xdp_umem_unaccount_pages(umem);
+out:
+	put_pid(umem->pid);
+	return err;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..4597ae81a221
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <linux/mm.h>
+#include <linux/if_xdp.h>
+#include <linux/workqueue.h>
+
+#include "xdp_umem_props.h"
+
+struct xdp_umem {
+	struct page **pgs;
+	struct xdp_umem_props props;
+	u32 npgs;
+	u32 frame_headroom;
+	u32 nfpp_mask;
+	u32 nfpplog2;
+	u32 frame_size_log2;
+	struct user_struct *user;
+	struct pid *pid;
+	unsigned long address;
+	size_t size;
+	atomic_t users;
+	struct work_struct work;
+};
+
+int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
+void xdp_get_umem(struct xdp_umem *umem);
+void xdp_put_umem(struct xdp_umem *umem);
+int xdp_umem_create(struct xdp_umem **umem);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..77fb5daf29f3
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef XDP_UMEM_PROPS_H_
+#define XDP_UMEM_PROPS_H_
+
+struct xdp_umem_props {
+	u32 frame_size;
+	u32 nframes;
+};
+
+#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..84e0e867febb
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allows a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <net/xdp_sock.h>
+
+#include "xdp_umem.h"
+
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return (struct xdp_sock *)sk;
+}
+
+static int xsk_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net;
+
+	if (!sk)
+		return 0;
+
+	net = sock_net(sk);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, sk->sk_prot, -1);
+	local_bh_enable();
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	sk_refcnt_debug_release(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case XDP_UMEM_REG:
+	{
+		struct xdp_umem_reg mr;
+		struct xdp_umem *umem;
+
+		if (xs->umem)
+			return -EBUSY;
+
+		if (copy_from_user(&mr, optval, sizeof(mr)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		err = xdp_umem_create(&umem);
+
+		err = xdp_umem_reg(umem, &mr);
+		if (err) {
+			kfree(umem);
+			mutex_unlock(&xs->mutex);
+			return err;
+		}
+
+		/* Make sure umem is ready before it can be seen by others */
+		smp_wmb();
+
+		xs->umem = umem;
+		mutex_unlock(&xs->mutex);
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	return -ENOPROTOOPT;
+}
+
+static struct proto xsk_proto = {
+	.name =		"XDP",
+	.owner =	THIS_MODULE,
+	.obj_size =	sizeof(struct xdp_sock),
+};
+
+static const struct proto_ops xsk_proto_ops = {
+	.family =	PF_XDP,
+	.owner =	THIS_MODULE,
+	.release =	xsk_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		sock_no_poll,
+	.ioctl =	sock_no_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	xsk_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	sock_no_sendmsg,
+	.recvmsg =	sock_no_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+static void xsk_destruct(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	xdp_put_umem(xs->umem);
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+	struct xdp_sock *xs;
+
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+	if (!sk)
+		return -ENOBUFS;
+
+	sock->ops = &xsk_proto_ops;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_XDP;
+
+	sk->sk_destruct = xsk_destruct;
+	sk_refcnt_debug_inc(sk);
+
+	xs = xdp_sk(sk);
+	mutex_init(&xs->mutex);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, &xsk_proto, 1);
+	local_bh_enable();
+
+	return 0;
+}
+
+static const struct net_proto_family xsk_family_ops = {
+	.family = PF_XDP,
+	.create = xsk_create,
+	.owner	= THIS_MODULE,
+};
+
+static int __init xsk_init(void)
+{
+	int err;
+
+	err = proto_register(&xsk_proto, 0 /* no slab */);
+	if (err)
+		goto out;
+
+	err = sock_register(&xsk_family_ops);
+	if (err)
+		goto out_proto;
+
+	return 0;
+
+out_proto:
+	proto_unregister(&xsk_proto);
+out:
+	return err;
+}
+
+fs_initcall(xsk_init);
-- 
cgit v1.2.3-55-g7522


From 423f38329d267969130fb6f2c685f73d72687558 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Wed, 2 May 2018 13:01:24 +0200
Subject: xsk: add umem fill queue support and mmap

Here, we add another setsockopt for registered user memory (umem)
called XDP_UMEM_FILL_QUEUE. Using this socket option, the process can
ask the kernel to allocate a queue (ring buffer) and also mmap it
(XDP_UMEM_PGOFF_FILL_QUEUE) into the process.

The queue is used to explicitly pass ownership of umem frames from the
user process to the kernel. These frames will in a later patch be
filled in with Rx packet data by the kernel.

v2: Fixed potential crash in xsk_mmap.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_xdp.h | 15 +++++++++++
 net/xdp/Makefile            |  2 +-
 net/xdp/xdp_umem.c          |  5 ++++
 net/xdp/xdp_umem.h          |  2 ++
 net/xdp/xsk.c               | 65 ++++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.c         | 58 ++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 38 ++++++++++++++++++++++++++
 7 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 net/xdp/xsk_queue.c
 create mode 100644 net/xdp/xsk_queue.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 41252135a0fe..975661e1baca 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -23,6 +23,7 @@
 
 /* XDP socket options */
 #define XDP_UMEM_REG			3
+#define XDP_UMEM_FILL_RING		4
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -31,4 +32,18 @@ struct xdp_umem_reg {
 	__u32 frame_headroom; /* Frame head room */
 };
 
+/* Pgoff for mmaping the rings */
+#define XDP_UMEM_PGOFF_FILL_RING	0x100000000
+
+struct xdp_ring {
+	__u32 producer __attribute__((aligned(64)));
+	__u32 consumer __attribute__((aligned(64)));
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	__u32 desc[0] __attribute__((aligned(64)));
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index a5d736640a0f..074fb2b2d51c 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index ec8b3552be44..e1f627d0cc1c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -65,6 +65,11 @@ static void xdp_umem_release(struct xdp_umem *umem)
 	struct task_struct *task;
 	struct mm_struct *mm;
 
+	if (umem->fq) {
+		xskq_destroy(umem->fq);
+		umem->fq = NULL;
+	}
+
 	if (umem->pgs) {
 		xdp_umem_unpin_pages(umem);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 4597ae81a221..25634b8a5c6f 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -19,9 +19,11 @@
 #include <linux/if_xdp.h>
 #include <linux/workqueue.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem_props.h"
 
 struct xdp_umem {
+	struct xsk_queue *fq;
 	struct page **pgs;
 	struct xdp_umem_props props;
 	u32 npgs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 84e0e867febb..da67a3c5c1c9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <net/xdp_sock.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem.h"
 
 static struct xdp_sock *xdp_sk(struct sock *sk)
@@ -39,6 +40,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
+{
+	struct xsk_queue *q;
+
+	if (entries == 0 || *queue || !is_power_of_2(entries))
+		return -EINVAL;
+
+	q = xskq_create(entries);
+	if (!q)
+		return -ENOMEM;
+
+	*queue = q;
+	return 0;
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -101,6 +117,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return 0;
 	}
+	case XDP_UMEM_FILL_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (!xs->umem)
+			return -EINVAL;
+
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = &xs->umem->fq;
+		err = xsk_init_queue(entries, q);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
 	default:
 		break;
 	}
@@ -108,6 +141,36 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int xsk_mmap(struct file *file, struct socket *sock,
+		    struct vm_area_struct *vma)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct xsk_queue *q = NULL;
+	unsigned long pfn;
+	struct page *qpg;
+
+	if (!xs->umem)
+		return -EINVAL;
+
+	if (offset == XDP_UMEM_PGOFF_FILL_RING)
+		q = xs->umem->fq;
+	else
+		return -EINVAL;
+
+	if (!q)
+		return -EINVAL;
+
+	qpg = virt_to_head_page(q->ring);
+	if (size > (PAGE_SIZE << compound_order(qpg)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn,
+			       size, vma->vm_page_prot);
+}
+
 static struct proto xsk_proto = {
 	.name =		"XDP",
 	.owner =	THIS_MODULE,
@@ -131,7 +194,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.getsockopt =	sock_no_getsockopt,
 	.sendmsg =	sock_no_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		xsk_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..23da4f29d3fb
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+}
+
+struct xsk_queue *xskq_create(u32 nentries)
+{
+	struct xsk_queue *q;
+	gfp_t gfp_flags;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+		    __GFP_COMP  | __GFP_NORETRY;
+	size = xskq_umem_get_ring_size(q);
+
+	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
+						      get_order(size));
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (!q)
+		return;
+
+	page_frag_free(q->ring);
+	kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..7eb556bf73be
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+
+#include "xdp_umem_props.h"
+
+struct xsk_queue {
+	struct xdp_umem_props umem_props;
+	u32 ring_mask;
+	u32 nentries;
+	u32 prod_head;
+	u32 prod_tail;
+	u32 cons_head;
+	u32 cons_tail;
+	struct xdp_ring *ring;
+	u64 invalid_descs;
+};
+
+struct xsk_queue *xskq_create(u32 nentries);
+void xskq_destroy(struct xsk_queue *q);
+
+#endif /* _LINUX_XSK_QUEUE_H */
-- 
cgit v1.2.3-55-g7522


From b9b6b68e8abd101be6eb5330e4999218c696d1e8 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Wed, 2 May 2018 13:01:25 +0200
Subject: xsk: add Rx queue setup and mmap support

Another setsockopt (XDP_RX_QUEUE) is added to let the process allocate
a queue, where the kernel can pass completed Rx frames from the kernel
to user process.

The mmapping of the queue is done using the XDP_PGOFF_RX_QUEUE offset.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      |  4 ++++
 include/uapi/linux/if_xdp.h | 16 ++++++++++++++++
 net/xdp/xsk.c               | 41 ++++++++++++++++++++++++++++++++---------
 net/xdp/xsk_queue.c         | 11 +++++++++--
 net/xdp/xsk_queue.h         |  2 +-
 5 files changed, 62 insertions(+), 12 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 94785f5db13e..db9a321de087 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -18,11 +18,15 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
+struct net_device;
+struct xsk_queue;
 struct xdp_umem;
 
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
 	struct sock sk;
+	struct xsk_queue *rx;
+	struct net_device *dev;
 	struct xdp_umem *umem;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 975661e1baca..65324558829d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 
 /* XDP socket options */
+#define XDP_RX_RING			1
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
 
@@ -33,13 +34,28 @@ struct xdp_umem_reg {
 };
 
 /* Pgoff for mmaping the rings */
+#define XDP_PGOFF_RX_RING			  0
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
 
+struct xdp_desc {
+	__u32 idx;
+	__u32 len;
+	__u16 offset;
+	__u8 flags;
+	__u8 padding[5];
+};
+
 struct xdp_ring {
 	__u32 producer __attribute__((aligned(64)));
 	__u32 consumer __attribute__((aligned(64)));
 };
 
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+	struct xdp_ring ptrs;
+	struct xdp_desc desc[0] __attribute__((aligned(64)));
+};
+
 /* Used for the fill and completion queues for buffers */
 struct xdp_umem_ring {
 	struct xdp_ring ptrs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index da67a3c5c1c9..92bd9b7e548f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -31,6 +31,7 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <net/xdp_sock.h>
+#include <net/xdp.h>
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
@@ -40,14 +41,15 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
-static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
+			  bool umem_queue)
 {
 	struct xsk_queue *q;
 
 	if (entries == 0 || *queue || !is_power_of_2(entries))
 		return -EINVAL;
 
-	q = xskq_create(entries);
+	q = xskq_create(entries, umem_queue);
 	if (!q)
 		return -ENOMEM;
 
@@ -89,6 +91,22 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		return -ENOPROTOOPT;
 
 	switch (optname) {
+	case XDP_RX_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (optlen < sizeof(entries))
+			return -EINVAL;
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = &xs->rx;
+		err = xsk_init_queue(entries, q, false);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
 	case XDP_UMEM_REG:
 	{
 		struct xdp_umem_reg mr;
@@ -130,7 +148,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 		mutex_lock(&xs->mutex);
 		q = &xs->umem->fq;
-		err = xsk_init_queue(entries, q);
+		err = xsk_init_queue(entries, q, true);
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
@@ -151,13 +169,17 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long pfn;
 	struct page *qpg;
 
-	if (!xs->umem)
-		return -EINVAL;
+	if (offset == XDP_PGOFF_RX_RING) {
+		q = xs->rx;
+	} else {
+		if (!xs->umem)
+			return -EINVAL;
 
-	if (offset == XDP_UMEM_PGOFF_FILL_RING)
-		q = xs->umem->fq;
-	else
-		return -EINVAL;
+		if (offset == XDP_UMEM_PGOFF_FILL_RING)
+			q = xs->umem->fq;
+		else
+			return -EINVAL;
+	}
 
 	if (!q)
 		return -EINVAL;
@@ -205,6 +227,7 @@ static void xsk_destruct(struct sock *sk)
 	if (!sock_flag(sk, SOCK_DEAD))
 		return;
 
+	xskq_destroy(xs->rx);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 23da4f29d3fb..894f9f89afc7 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -21,7 +21,13 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
 }
 
-struct xsk_queue *xskq_create(u32 nentries)
+static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
+{
+	return (sizeof(struct xdp_ring) +
+		q->nentries * sizeof(struct xdp_desc));
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
 {
 	struct xsk_queue *q;
 	gfp_t gfp_flags;
@@ -36,7 +42,8 @@ struct xsk_queue *xskq_create(u32 nentries)
 
 	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
 		    __GFP_COMP  | __GFP_NORETRY;
-	size = xskq_umem_get_ring_size(q);
+	size = umem_queue ? xskq_umem_get_ring_size(q) :
+	       xskq_rxtx_get_ring_size(q);
 
 	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
 						      get_order(size));
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 7eb556bf73be..5439fa381763 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -32,7 +32,7 @@ struct xsk_queue {
 	u64 invalid_descs;
 };
 
-struct xsk_queue *xskq_create(u32 nentries);
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q);
 
 #endif /* _LINUX_XSK_QUEUE_H */
-- 
cgit v1.2.3-55-g7522


From 965a990984432cd01a9eb3514c64d86f56704295 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Wed, 2 May 2018 13:01:26 +0200
Subject: xsk: add support for bind for Rx

Here, the bind syscall is added. Binding an AF_XDP socket, means
associating the socket to an umem, a netdev and a queue index. This
can be done in two ways.

The first way, creating a "socket from scratch". Create the umem using
the XDP_UMEM_REG setsockopt and an associated fill queue with
XDP_UMEM_FILL_QUEUE. Create the Rx queue using the XDP_RX_QUEUE
setsockopt. Call bind passing ifindex and queue index ("channel" in
ethtool speak).

The second way to bind a socket, is simply skipping the
umem/netdev/queue index, and passing another already setup AF_XDP
socket. The new socket will then have the same umem/netdev/queue index
as the parent so it will share the same umem. You must also set the
flags field in the socket address to XDP_SHARED_UMEM.

v2: Use PTR_ERR instead of passing error variable explicitly.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      |   1 +
 include/uapi/linux/if_xdp.h |  11 ++++
 net/xdp/xdp_umem.c          |   5 ++
 net/xdp/xdp_umem.h          |   1 +
 net/xdp/xsk.c               | 124 +++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.c         |   8 +++
 net/xdp/xsk_queue.h         |   1 +
 7 files changed, 150 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index db9a321de087..85d02512f59b 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_sock {
 	struct xsk_queue *rx;
 	struct net_device *dev;
 	struct xdp_umem *umem;
+	u16 queue_id;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 };
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 65324558829d..e5091881f776 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -21,6 +21,17 @@
 
 #include <linux/types.h>
 
+/* Options for the sxdp_flags field */
+#define XDP_SHARED_UMEM 1
+
+struct sockaddr_xdp {
+	__u16 sxdp_family;
+	__u32 sxdp_ifindex;
+	__u32 sxdp_queue_id;
+	__u32 sxdp_shared_umem_fd;
+	__u16 sxdp_flags;
+};
+
 /* XDP socket options */
 #define XDP_RX_RING			1
 #define XDP_UMEM_REG			3
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index e1f627d0cc1c..9bac1ad570fa 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -248,3 +248,8 @@ out:
 	put_pid(umem->pid);
 	return err;
 }
+
+bool xdp_umem_validate_queues(struct xdp_umem *umem)
+{
+	return umem->fq;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 25634b8a5c6f..b13133e9c501 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -39,6 +39,7 @@ struct xdp_umem {
 	struct work_struct work;
 };
 
+bool xdp_umem_validate_queues(struct xdp_umem *umem);
 int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 92bd9b7e548f..bf2c97b87992 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -57,9 +57,18 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 	return 0;
 }
 
+static void __xsk_release(struct xdp_sock *xs)
+{
+	/* Wait for driver to stop using the xdp socket. */
+	synchronize_net();
+
+	dev_put(xs->dev);
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
 	struct net *net;
 
 	if (!sk)
@@ -71,6 +80,11 @@ static int xsk_release(struct socket *sock)
 	sock_prot_inuse_add(net, sk->sk_prot, -1);
 	local_bh_enable();
 
+	if (xs->dev) {
+		__xsk_release(xs);
+		xs->dev = NULL;
+	}
+
 	sock_orphan(sk);
 	sock->sk = NULL;
 
@@ -80,6 +94,114 @@ static int xsk_release(struct socket *sock)
 	return 0;
 }
 
+static struct socket *xsk_lookup_xsk_from_fd(int fd)
+{
+	struct socket *sock;
+	int err;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return ERR_PTR(-ENOTSOCK);
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return ERR_PTR(-ENOPROTOOPT);
+	}
+
+	return sock;
+}
+
+static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
+	struct sock *sk = sock->sk;
+	struct net_device *dev, *dev_curr;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xdp_umem *old_umem = NULL;
+	int err = 0;
+
+	if (addr_len < sizeof(struct sockaddr_xdp))
+		return -EINVAL;
+	if (sxdp->sxdp_family != AF_XDP)
+		return -EINVAL;
+
+	mutex_lock(&xs->mutex);
+	dev_curr = xs->dev;
+	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
+	if (!dev) {
+		err = -ENODEV;
+		goto out_release;
+	}
+
+	if (!xs->rx) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
+		struct xdp_sock *umem_xs;
+		struct socket *sock;
+
+		if (xs->umem) {
+			/* We have already our own. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
+		if (IS_ERR(sock)) {
+			err = PTR_ERR(sock);
+			goto out_unlock;
+		}
+
+		umem_xs = xdp_sk(sock->sk);
+		if (!umem_xs->umem) {
+			/* No umem to inherit. */
+			err = -EBADF;
+			sockfd_put(sock);
+			goto out_unlock;
+		} else if (umem_xs->dev != dev ||
+			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
+			err = -EINVAL;
+			sockfd_put(sock);
+			goto out_unlock;
+		}
+
+		xdp_get_umem(umem_xs->umem);
+		old_umem = xs->umem;
+		xs->umem = umem_xs->umem;
+		sockfd_put(sock);
+	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Rebind? */
+	if (dev_curr && (dev_curr != dev ||
+			 xs->queue_id != sxdp->sxdp_queue_id)) {
+		__xsk_release(xs);
+		if (old_umem)
+			xdp_put_umem(old_umem);
+	}
+
+	xs->dev = dev;
+	xs->queue_id = sxdp->sxdp_queue_id;
+
+	xskq_set_umem(xs->rx, &xs->umem->props);
+
+out_unlock:
+	if (err)
+		dev_put(dev);
+out_release:
+	mutex_unlock(&xs->mutex);
+	return err;
+}
+
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -203,7 +325,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.family =	PF_XDP,
 	.owner =	THIS_MODULE,
 	.release =	xsk_release,
-	.bind =		sock_no_bind,
+	.bind =		xsk_bind,
 	.connect =	sock_no_connect,
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 894f9f89afc7..d012e5e23591 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -16,6 +16,14 @@
 
 #include "xsk_queue.h"
 
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
+{
+	if (!q)
+		return;
+
+	q->umem_props = *umem_props;
+}
+
 static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 {
 	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 5439fa381763..9ddd2ee07a84 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -32,6 +32,7 @@ struct xsk_queue {
 	u64 invalid_descs;
 };
 
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q);
 
-- 
cgit v1.2.3-55-g7522


From fbfc504a24f53f7ebe128ab55cb5dba634f4ece8 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Wed, 2 May 2018 13:01:28 +0200
Subject: bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP

The xskmap is yet another BPF map, very much inspired by
dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application
adds AF_XDP sockets into the map, and by using the bpf_redirect_map
helper, an XDP program can redirect XDP frames to an AF_XDP socket.

Note that a socket that is bound to certain ifindex/queue index will
*only* accept XDP frames from that netdev/queue index. If an XDP
program tries to redirect from a netdev/queue index other than what
the socket is bound to, the frame will not be received on the socket.

A socket can reside in multiple maps.

v3: Fixed race and simplified code.
v2: Removed one indirection in map lookup.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h       |  25 +++++
 include/linux/bpf_types.h |   3 +
 include/net/xdp_sock.h    |   7 ++
 include/uapi/linux/bpf.h  |   1 +
 kernel/bpf/Makefile       |   3 +
 kernel/bpf/verifier.c     |   8 +-
 kernel/bpf/xskmap.c       | 239 ++++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk.c             |   5 +
 8 files changed, 289 insertions(+), 2 deletions(-)
 create mode 100644 kernel/bpf/xskmap.c

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c553f6f9c6b0..68ecdb4eea09 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -676,6 +676,31 @@ static inline int sock_map_prog(struct bpf_map *map,
 }
 #endif
 
+#if defined(CONFIG_XDP_SOCKETS)
+struct xdp_sock;
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs);
+void __xsk_map_flush(struct bpf_map *map);
+#else
+struct xdp_sock;
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+						     u32 key)
+{
+	return NULL;
+}
+
+static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+				     struct xdp_sock *xs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void __xsk_map_flush(struct bpf_map *map)
+{
+}
+#endif
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b28fcf6f6ae..d7df1b323082 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
+#if defined(CONFIG_XDP_SOCKETS)
+BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
+#endif
 #endif
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index a0342dff6a4d..ce3a2ab16b8f 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_sock {
 	struct xsk_queue *rx;
 	struct net_device *dev;
 	struct xdp_umem *umem;
+	struct list_head flush_node;
 	u16 queue_id;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
@@ -39,6 +40,7 @@ struct xdp_buff;
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 static inline void xsk_flush(struct xdp_sock *xs)
 {
 }
+
+static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+	return false;
+}
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8daef7326bb7..a3a495052511 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 35c485fa9ea3..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 712d8655e916..0d91f18b2eb5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2070,8 +2070,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
-	/* Restrict bpf side of cpumap, open when use-cases appear */
+	/* Restrict bpf side of cpumap and xskmap, open when use-cases
+	 * appear.
+	 */
 	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2118,7 +2121,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
-		    map->map_type != BPF_MAP_TYPE_CPUMAP)
+		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
+		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..869dbb11b612
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+struct xsk_map {
+	struct bpf_map map;
+	struct xdp_sock **xsk_map;
+	struct list_head __percpu *flush_list;
+};
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+	int cpu, err = -EINVAL;
+	struct xsk_map *m;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 ||
+	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	m = kzalloc(sizeof(*m), GFP_USER);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&m->map, attr);
+
+	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
+	cost += sizeof(struct list_head) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_m;
+
+	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	err = bpf_map_precharge_memlock(m->map.pages);
+	if (err)
+		goto free_m;
+
+	m->flush_list = alloc_percpu(struct list_head);
+	if (!m->flush_list)
+		goto free_m;
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
+
+	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+					sizeof(struct xdp_sock *),
+					m->map.numa_node);
+	if (!m->xsk_map)
+		goto free_percpu;
+	return &m->map;
+
+free_percpu:
+	free_percpu(m->flush_list);
+free_m:
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	int i;
+
+	synchronize_net();
+
+	for (i = 0; i < map->max_entries; i++) {
+		struct xdp_sock *xs;
+
+		xs = m->xsk_map[i];
+		if (!xs)
+			continue;
+
+		sock_put((struct sock *)xs);
+	}
+
+	free_percpu(m->flush_list);
+	bpf_map_area_free(m->xsk_map);
+	kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= m->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == m->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *xs;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	xs = READ_ONCE(m->xsk_map[key]);
+	return xs;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	if (!xs->flush_node.prev)
+		list_add(&xs->flush_node, flush_list);
+
+	return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	struct xdp_sock *xs, *tmp;
+
+	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+		xsk_flush(xs);
+		__list_del(xs->flush_node.prev, xs->flush_node.next);
+		xs->flush_node.prev = NULL;
+	}
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 i = *(u32 *)key, fd = *(u32 *)value;
+	struct xdp_sock *xs, *old_xs;
+	struct socket *sock;
+	int err;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(i >= m->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return err;
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	xs = (struct xdp_sock *)sock->sk;
+
+	if (!xsk_is_setup_for_bpf_map(xs)) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	sock_hold(sock->sk);
+
+	old_xs = xchg(&m->xsk_map[i], xs);
+	if (old_xs) {
+		/* Make sure we've flushed everything. */
+		synchronize_net();
+		sock_put((struct sock *)old_xs);
+	}
+
+	sockfd_put(sock);
+	return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *old_xs;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	old_xs = xchg(&m->xsk_map[k], NULL);
+	if (old_xs) {
+		/* Make sure we've flushed everything. */
+		synchronize_net();
+		sock_put((struct sock *)old_xs);
+	}
+
+	return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+	.map_alloc = xsk_map_alloc,
+	.map_free = xsk_map_free,
+	.map_get_next_key = xsk_map_get_next_key,
+	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_update_elem = xsk_map_update_elem,
+	.map_delete_elem = xsk_map_delete_elem,
+};
+
+
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4e1e6c581e1d..b931a0db5588 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+	return !!xs->rx;
+}
+
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 *id, len = xdp->data_end - xdp->data;
-- 
cgit v1.2.3-55-g7522


From fe2308328cd2f26ebc986f543796e7d13ae00bc4 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Wed, 2 May 2018 13:01:31 +0200
Subject: xsk: add umem completion queue support and mmap

Here, we add another setsockopt for registered user memory (umem)
called XDP_UMEM_COMPLETION_QUEUE. Using this socket option, the
process can ask the kernel to allocate a queue (ring buffer) and also
mmap it (XDP_UMEM_PGOFF_COMPLETION_QUEUE) into the process.

The queue is used to explicitly pass ownership of umem frames from the
kernel to user process. This will be used by the TX path to tell user
space that a certain frame has been transmitted and user space can use
it for something else, if it wishes.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_xdp.h | 2 ++
 net/xdp/xdp_umem.c          | 7 ++++++-
 net/xdp/xdp_umem.h          | 1 +
 net/xdp/xsk.c               | 7 ++++++-
 4 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e5091881f776..71581a139f26 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -36,6 +36,7 @@ struct sockaddr_xdp {
 #define XDP_RX_RING			1
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
+#define XDP_UMEM_COMPLETION_RING	5
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -47,6 +48,7 @@ struct xdp_umem_reg {
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
+#define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
 
 struct xdp_desc {
 	__u32 idx;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 9bac1ad570fa..881dfdefe235 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -70,6 +70,11 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		umem->fq = NULL;
 	}
 
+	if (umem->cq) {
+		xskq_destroy(umem->cq);
+		umem->cq = NULL;
+	}
+
 	if (umem->pgs) {
 		xdp_umem_unpin_pages(umem);
 
@@ -251,5 +256,5 @@ out:
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem)
 {
-	return umem->fq;
+	return (umem->fq && umem->cq);
 }
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index c7378a11721f..7e0b2fab8522 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -24,6 +24,7 @@
 
 struct xdp_umem {
 	struct xsk_queue *fq;
+	struct xsk_queue *cq;
 	struct page **pgs;
 	struct xdp_umem_props props;
 	u32 npgs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b931a0db5588..f4a2c5bc6da9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -255,6 +255,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	} else {
 		/* This xsk has its own umem. */
 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
+		xskq_set_umem(xs->umem->cq, &xs->umem->props);
 	}
 
 	/* Rebind? */
@@ -334,6 +335,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		return 0;
 	}
 	case XDP_UMEM_FILL_RING:
+	case XDP_UMEM_COMPLETION_RING:
 	{
 		struct xsk_queue **q;
 		int entries;
@@ -345,7 +347,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
-		q = &xs->umem->fq;
+		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
+			&xs->umem->cq;
 		err = xsk_init_queue(entries, q, true);
 		mutex_unlock(&xs->mutex);
 		return err;
@@ -375,6 +378,8 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 
 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
 			q = xs->umem->fq;
+		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
+			q = xs->umem->cq;
 		else
 			return -EINVAL;
 	}
-- 
cgit v1.2.3-55-g7522


From f61459030ec7fffdaa3c462cc0f728eef11b4d05 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Wed, 2 May 2018 13:01:32 +0200
Subject: xsk: add Tx queue setup and mmap support

Another setsockopt (XDP_TX_QUEUE) is added to let the process allocate
a queue, where the user process can pass frames to be transmitted by
the kernel.

The mmapping of the queue is done using the XDP_PGOFF_TX_QUEUE offset.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      | 1 +
 include/uapi/linux/if_xdp.h | 2 ++
 net/xdp/xsk.c               | 8 ++++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ce3a2ab16b8f..185f4928fbda 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@ struct xdp_sock {
 	struct xdp_umem *umem;
 	struct list_head flush_node;
 	u16 queue_id;
+	struct xsk_queue *tx ____cacheline_aligned_in_smp;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 	u64 rx_dropped;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 71581a139f26..e2ea878d025c 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -34,6 +34,7 @@ struct sockaddr_xdp {
 
 /* XDP socket options */
 #define XDP_RX_RING			1
+#define XDP_TX_RING			2
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
 #define XDP_UMEM_COMPLETION_RING	5
@@ -47,6 +48,7 @@ struct xdp_umem_reg {
 
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
+#define XDP_PGOFF_TX_RING		 0x80000000
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
 #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f4a2c5bc6da9..2d7b0c90d996 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -206,7 +206,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_release;
 	}
 
-	if (!xs->rx) {
+	if (!xs->rx && !xs->tx) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
@@ -291,6 +291,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 	switch (optname) {
 	case XDP_RX_RING:
+	case XDP_TX_RING:
 	{
 		struct xsk_queue **q;
 		int entries;
@@ -301,7 +302,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
-		q = &xs->rx;
+		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
 		err = xsk_init_queue(entries, q, false);
 		mutex_unlock(&xs->mutex);
 		return err;
@@ -372,6 +373,8 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 
 	if (offset == XDP_PGOFF_RX_RING) {
 		q = xs->rx;
+	} else if (offset == XDP_PGOFF_TX_RING) {
+		q = xs->tx;
 	} else {
 		if (!xs->umem)
 			return -EINVAL;
@@ -431,6 +434,7 @@ static void xsk_destruct(struct sock *sk)
 		return;
 
 	xskq_destroy(xs->rx);
+	xskq_destroy(xs->tx);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
-- 
cgit v1.2.3-55-g7522


From af75d9e02d08dc55ce6a1e42e485465c630d7349 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Wed, 2 May 2018 13:01:35 +0200
Subject: xsk: statistics support

In this commit, a new getsockopt is added: XDP_STATISTICS. This is
used to obtain stats from the sockets.

v2: getsockopt now returns size of stats structure.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_xdp.h |  7 +++++++
 net/xdp/xsk.c               | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.h         |  5 +++++
 3 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e2ea878d025c..77b88c4efe98 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -38,6 +38,7 @@ struct sockaddr_xdp {
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
 #define XDP_UMEM_COMPLETION_RING	5
+#define XDP_STATISTICS			6
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -46,6 +47,12 @@ struct xdp_umem_reg {
 	__u32 frame_headroom; /* Frame head room */
 };
 
+struct xdp_statistics {
+	__u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+	__u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+	__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+};
+
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_PGOFF_TX_RING		 0x80000000
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b33c535c7996..009c5af5bba5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -468,6 +468,49 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int xsk_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int len;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case XDP_STATISTICS:
+	{
+		struct xdp_statistics stats;
+
+		if (len < sizeof(stats))
+			return -EINVAL;
+
+		mutex_lock(&xs->mutex);
+		stats.rx_dropped = xs->rx_dropped;
+		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
+		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
+		mutex_unlock(&xs->mutex);
+
+		if (copy_to_user(optval, &stats, sizeof(stats)))
+			return -EFAULT;
+		if (put_user(sizeof(stats), optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static int xsk_mmap(struct file *file, struct socket *sock,
 		    struct vm_area_struct *vma)
 {
@@ -524,7 +567,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
 	.setsockopt =	xsk_setsockopt,
-	.getsockopt =	sock_no_getsockopt,
+	.getsockopt =	xsk_getsockopt,
 	.sendmsg =	xsk_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
 	.mmap =		xsk_mmap,
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 3497e8808608..7aa9a535db0e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -36,6 +36,11 @@ struct xsk_queue {
 
 /* Common functions operating for both RXTX and umem queues */
 
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+	return q ? q->invalid_descs : 0;
+}
+
 static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
 {
 	u32 entries = q->prod_tail - q->cons_tail;
-- 
cgit v1.2.3-55-g7522


From 4e1ec56cdc59746943b2acfab3c171b930187bbe Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 4 May 2018 01:08:15 +0200
Subject: bpf: add skb_load_bytes_relative helper

This adds a small BPF helper similar to bpf_skb_load_bytes() that
is able to load relative to mac/net header offset from the skb's
linear data. Compared to bpf_skb_load_bytes(), it takes a fifth
argument namely start_header, which is either BPF_HDR_START_MAC
or BPF_HDR_START_NET. This allows for a more flexible alternative
compared to LD_ABS/LD_IND with negative offset. It's enabled for
tc BPF programs as well as sock filter program types where it's
mainly useful in reuseport programs to ease access to lower header
data.

Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2017-March/000698.html
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++-
 net/core/filter.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a3a495052511..93d5a4eeec2a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1802,6 +1802,30 @@ union bpf_attr {
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
  * 		a negative error in case of failure.
+ *
+ * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * 	Description
+ * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 		it provides an easy way to load *len* bytes from *offset*
+ * 		from the packet associated to *skb*, into the buffer pointed
+ * 		by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 		a fifth argument *start_header* exists in order to select a
+ * 		base offset to start from. *start_header* can be one of:
+ *
+ * 		**BPF_HDR_START_MAC**
+ * 			Base offset to load data from is *skb*'s mac header.
+ * 		**BPF_HDR_START_NET**
+ * 			Base offset to load data from is *skb*'s network header.
+ *
+ * 		In general, "direct packet access" is the preferred method to
+ * 		access packet data, however, this helper is in particular useful
+ * 		in socket filters where *skb*\ **->data** does not always point
+ * 		to the start of the mac header and where "direct packet access"
+ * 		is not available.
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1871,7 +1895,8 @@ union bpf_attr {
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
-	FN(get_stack),
+	FN(get_stack),			\
+	FN(skb_load_bytes_relative),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1932,6 +1957,12 @@ enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
 };
 
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+	BPF_HDR_START_MAC,
+	BPF_HDR_START_NET,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index a49729842b3d..6877426c23a6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1684,6 +1684,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+	   u32, offset, void *, to, u32, len, u32, start_header)
+{
+	u8 *ptr;
+
+	if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
+		goto err_clear;
+
+	switch (start_header) {
+	case BPF_HDR_START_MAC:
+		ptr = skb_mac_header(skb) + offset;
+		break;
+	case BPF_HDR_START_NET:
+		ptr = skb_network_header(skb) + offset;
+		break;
+	default:
+		goto err_clear;
+	}
+
+	if (likely(ptr >= skb_mac_header(skb) &&
+		   ptr + len <= skb_tail_pointer(skb))) {
+		memcpy(to, ptr, len);
+		return 0;
+	}
+
+err_clear:
+	memset(to, 0, len);
+	return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
+	.func		= bpf_skb_load_bytes_relative,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
 {
 	/* Idea is the following: should the needed direct read/write
@@ -4061,6 +4102,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_load_bytes_relative:
+		return &bpf_skb_load_bytes_relative_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
@@ -4078,6 +4121,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_load_bytes_relative:
+		return &bpf_skb_load_bytes_relative_proto;
 	case BPF_FUNC_skb_pull_data:
 		return &bpf_skb_pull_data_proto;
 	case BPF_FUNC_csum_diff:
-- 
cgit v1.2.3-55-g7522


From d734a2888922efa521f9eb8dfe6790ced8f62bb7 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana
Date: Sun, 22 Apr 2018 11:03:23 +0200
Subject: netfilter: nft_numgen: add map lookups for numgen statements

This patch includes a new attribute in the numgen structure to allow
the lookup of an element based on the number generator as a key.

For this purpose, different ops have been included to extend the
current numgen inc functions.

Currently, only supported for numgen incremental operations, but
it will be supported for random in a follow-up patch.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 ++
 net/netfilter/nft_numgen.c               | 85 ++++++++++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 6a3d653d5b27..5a5551a580f7 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1450,6 +1450,8 @@ enum nft_trace_types {
  * @NFTA_NG_MODULUS: maximum counter value (NLA_U32)
  * @NFTA_NG_TYPE: operation type (NLA_U32)
  * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32)
+ * @NFTA_NG_SET_NAME: name of the map to lookup (NLA_STRING)
+ * @NFTA_NG_SET_ID: id of the map (NLA_U32)
  */
 enum nft_ng_attributes {
 	NFTA_NG_UNSPEC,
@@ -1457,6 +1459,8 @@ enum nft_ng_attributes {
 	NFTA_NG_MODULUS,
 	NFTA_NG_TYPE,
 	NFTA_NG_OFFSET,
+	NFTA_NG_SET_NAME,
+	NFTA_NG_SET_ID,
 	__NFTA_NG_MAX
 };
 #define NFTA_NG_MAX	(__NFTA_NG_MAX - 1)
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 5a3a52c71545..8a64db8f2e69 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,13 +24,11 @@ struct nft_ng_inc {
 	u32			modulus;
 	atomic_t		counter;
 	u32			offset;
+	struct nft_set		*map;
 };
 
-static void nft_ng_inc_eval(const struct nft_expr *expr,
-			    struct nft_regs *regs,
-			    const struct nft_pktinfo *pkt)
+static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
 {
-	struct nft_ng_inc *priv = nft_expr_priv(expr);
 	u32 nval, oval;
 
 	do {
@@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
-	regs->data[priv->dreg] = nval + priv->offset;
+	return nval + priv->offset;
+}
+
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+	regs->data[priv->dreg] = nft_ng_inc_gen(priv);
+}
+
+static void nft_ng_inc_map_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = nft_ng_inc_gen(priv);
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
 }
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
@@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
 	[NFTA_NG_OFFSET]	= { .type = NLA_U32 },
+	[NFTA_NG_SET_NAME]	= { .type = NLA_STRING,
+				    .len = NFT_SET_MAXNAMELEN - 1 },
+	[NFTA_NG_SET_ID]	= { .type = NLA_U32 },
 };
 
 static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -71,6 +101,25 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_ng_inc_init(ctx, expr, tb);
+
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_NG_SET_NAME],
+					  tb[NFTA_NG_SET_ID], genmask);
+
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
 		       u32 modulus, enum nft_ng_types type, u32 offset)
 {
@@ -97,6 +146,22 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 			   priv->offset);
 }
 
+static int nft_ng_inc_map_dump(struct sk_buff *skb,
+			       const struct nft_expr *expr)
+{
+	const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+	if (nft_ng_dump(skb, priv->dreg, priv->modulus,
+			NFT_NG_INCREMENTAL, priv->offset) ||
+	    nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
 	u32			modulus;
@@ -156,6 +221,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
 	.dump		= nft_ng_inc_dump,
 };
 
+static const struct nft_expr_ops nft_ng_inc_map_ops = {
+	.type		= &nft_ng_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
+	.eval		= nft_ng_inc_map_eval,
+	.init		= nft_ng_inc_map_init,
+	.dump		= nft_ng_inc_map_dump,
+};
+
 static const struct nft_expr_ops nft_ng_random_ops = {
 	.type		= &nft_ng_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -178,6 +251,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 
 	switch (type) {
 	case NFT_NG_INCREMENTAL:
+		if (tb[NFTA_NG_SET_NAME])
+			return &nft_ng_inc_map_ops;
 		return &nft_ng_inc_ops;
 	case NFT_NG_RANDOM:
 		return &nft_ng_random_ops;
-- 
cgit v1.2.3-55-g7522


From c1c7e44b4f62e9c07fea8aaa58ed46cfae48ba3d Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam
Date: Wed, 25 Apr 2018 05:30:24 -0500
Subject: netfilter: ip6t_srh: extend SRH matching for previous, next and last
 SID

IPv6 Segment Routing Header (SRH) contains a list of SIDs to be crossed
by SR encapsulated packet. Each SID is encoded as an IPv6 prefix.

When a Firewall receives an SR encapsulated packet, it should be able
to identify which node previously processed the packet (previous SID),
which node is going to process the packet next (next SID), and which
node is the last to process the packet (last SID) which represent the
final destination of the packet in case of inline SR mode.

An example use-case of using these features could be SID list that
includes two firewalls. When the second firewall receives a packet,
it can check whether the packet has been processed by the first firewall
or not. Based on that check, it decides to apply all rules, apply just
subset of the rules, or totally skip all rules and forward the packet to
the next SID.

This patch extends SRH match to support matching previous SID, next SID,
and last SID.

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_srh.h |  43 ++++++-
 net/ipv6/netfilter/ip6t_srh.c                | 173 +++++++++++++++++++++++++--
 2 files changed, 205 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
index f3cc0ef514a7..54ed83360dac 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
@@ -17,7 +17,10 @@
 #define IP6T_SRH_LAST_GT        0x0100
 #define IP6T_SRH_LAST_LT        0x0200
 #define IP6T_SRH_TAG            0x0400
-#define IP6T_SRH_MASK           0x07FF
+#define IP6T_SRH_PSID           0x0800
+#define IP6T_SRH_NSID           0x1000
+#define IP6T_SRH_LSID           0x2000
+#define IP6T_SRH_MASK           0x3FFF
 
 /* Values for "mt_invflags" field in struct ip6t_srh */
 #define IP6T_SRH_INV_NEXTHDR    0x0001
@@ -31,7 +34,10 @@
 #define IP6T_SRH_INV_LAST_GT    0x0100
 #define IP6T_SRH_INV_LAST_LT    0x0200
 #define IP6T_SRH_INV_TAG        0x0400
-#define IP6T_SRH_INV_MASK       0x07FF
+#define IP6T_SRH_INV_PSID       0x0800
+#define IP6T_SRH_INV_NSID       0x1000
+#define IP6T_SRH_INV_LSID       0x2000
+#define IP6T_SRH_INV_MASK       0x3FFF
 
 /**
  *      struct ip6t_srh - SRH match options
@@ -54,4 +60,37 @@ struct ip6t_srh {
 	__u16                   mt_invflags;
 };
 
+/**
+ *      struct ip6t_srh1 - SRH match options (revision 1)
+ *      @ next_hdr: Next header field of SRH
+ *      @ hdr_len: Extension header length field of SRH
+ *      @ segs_left: Segments left field of SRH
+ *      @ last_entry: Last entry field of SRH
+ *      @ tag: Tag field of SRH
+ *      @ psid_addr: Address of previous SID in SRH SID list
+ *      @ nsid_addr: Address of NEXT SID in SRH SID list
+ *      @ lsid_addr: Address of LAST SID in SRH SID list
+ *      @ psid_msk: Mask of previous SID in SRH SID list
+ *      @ nsid_msk: Mask of next SID in SRH SID list
+ *      @ lsid_msk: MAsk of last SID in SRH SID list
+ *      @ mt_flags: match options
+ *      @ mt_invflags: Invert the sense of match options
+ */
+
+struct ip6t_srh1 {
+	__u8                    next_hdr;
+	__u8                    hdr_len;
+	__u8                    segs_left;
+	__u8                    last_entry;
+	__u16                   tag;
+	struct in6_addr         psid_addr;
+	struct in6_addr         nsid_addr;
+	struct in6_addr         lsid_addr;
+	struct in6_addr         psid_msk;
+	struct in6_addr         nsid_msk;
+	struct in6_addr         lsid_msk;
+	__u16                   mt_flags;
+	__u16                   mt_invflags;
+};
+
 #endif /*_IP6T_SRH_H*/
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
index 33719d5560c8..1059894a6f4c 100644
--- a/net/ipv6/netfilter/ip6t_srh.c
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
+static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
+	const struct ip6t_srh1 *srhinfo = par->matchinfo;
+	struct in6_addr *psid, *nsid, *lsid;
+	struct in6_addr _psid, _nsid, _lsid;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6_sr_hdr _srh;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return false;
+	srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+	if (!srh)
+		return false;
+
+	hdrlen = ipv6_optlen(srh);
+	if (skb->len - srhoff < hdrlen)
+		return false;
+
+	if (srh->type != IPV6_SRCRT_TYPE_4)
+		return false;
+
+	if (srh->segments_left > srh->first_segment)
+		return false;
+
+	/* Next Header matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+				!(srh->nexthdr == srhinfo->next_hdr)))
+			return false;
+
+	/* Header Extension Length matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+				!(srh->hdrlen == srhinfo->hdr_len)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+				!(srh->hdrlen > srhinfo->hdr_len)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+				!(srh->hdrlen < srhinfo->hdr_len)))
+			return false;
+
+	/* Segments Left matching */
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+				!(srh->segments_left == srhinfo->segs_left)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+				!(srh->segments_left > srhinfo->segs_left)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+				!(srh->segments_left < srhinfo->segs_left)))
+			return false;
+
+	/**
+	 * Last Entry matching
+	 * Last_Entry field was introduced in revision 6 of the SRH draft.
+	 * It was called First_Segment in the previous revision
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+				!(srh->first_segment == srhinfo->last_entry)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+				!(srh->first_segment > srhinfo->last_entry)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+				!(srh->first_segment < srhinfo->last_entry)))
+			return false;
+
+	/**
+	 * Tag matchig
+	 * Tag field was introduced in revision 6 of the SRH draft
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_TAG)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+				!(srh->tag == srhinfo->tag)))
+			return false;
+
+	/* Previous SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_PSID) {
+		if (srh->segments_left == srh->first_segment)
+			return false;
+		psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+			  ((srh->segments_left + 1) * sizeof(struct in6_addr));
+		psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
+				ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
+						     &srhinfo->psid_addr)))
+			return false;
+	}
+
+	/* Next SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NSID) {
+		if (srh->segments_left == 0)
+			return false;
+		nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+			  ((srh->segments_left - 1) * sizeof(struct in6_addr));
+		nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
+				ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
+						     &srhinfo->nsid_addr)))
+			return false;
+	}
+
+	/* Last SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LSID) {
+		lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
+		lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
+				ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
+						     &srhinfo->lsid_addr)))
+			return false;
+	}
+	return true;
+}
+
 static int srh_mt6_check(const struct xt_mtchk_param *par)
 {
 	const struct ip6t_srh *srhinfo = par->matchinfo;
@@ -136,23 +260,54 @@ static int srh_mt6_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static struct xt_match srh_mt6_reg __read_mostly = {
-	.name		= "srh",
-	.family		= NFPROTO_IPV6,
-	.match		= srh_mt6,
-	.matchsize	= sizeof(struct ip6t_srh),
-	.checkentry	= srh_mt6_check,
-	.me		= THIS_MODULE,
+static int srh1_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_srh1 *srhinfo = par->matchinfo;
+
+	if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
+		pr_info_ratelimited("unknown srh match flags  %X\n",
+				    srhinfo->mt_flags);
+		return -EINVAL;
+	}
+
+	if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
+		pr_info_ratelimited("unknown srh invflags %X\n",
+				    srhinfo->mt_invflags);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match srh_mt6_reg[] __read_mostly = {
+	{
+		.name		= "srh",
+		.revision	= 0,
+		.family		= NFPROTO_IPV6,
+		.match		= srh_mt6,
+		.matchsize	= sizeof(struct ip6t_srh),
+		.checkentry	= srh_mt6_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "srh",
+		.revision       = 1,
+		.family         = NFPROTO_IPV6,
+		.match          = srh1_mt6,
+		.matchsize      = sizeof(struct ip6t_srh1),
+		.checkentry     = srh1_mt6_check,
+		.me             = THIS_MODULE,
+	}
 };
 
 static int __init srh_mt6_init(void)
 {
-	return xt_register_match(&srh_mt6_reg);
+	return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
 }
 
 static void __exit srh_mt6_exit(void)
 {
-	xt_unregister_match(&srh_mt6_reg);
+	xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
 }
 
 module_init(srh_mt6_init);
-- 
cgit v1.2.3-55-g7522


From 3f9c56a581b96d8117922c4fd8221687fd649f9b Mon Sep 17 00:00:00 2001
From: Phil Sutter
Date: Fri, 27 Apr 2018 12:47:01 +0200
Subject: netfilter: nf_tables: Provide NFT_{RT,CT}_MAX for userspace

These macros allow conveniently declaring arrays which use NFT_{RT,CT}_*
values as indexes.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5a5551a580f7..ce031cf72288 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -831,7 +831,9 @@ enum nft_rt_keys {
 	NFT_RT_NEXTHOP4,
 	NFT_RT_NEXTHOP6,
 	NFT_RT_TCPMSS,
+	__NFT_RT_MAX
 };
+#define NFT_RT_MAX		(__NFT_RT_MAX - 1)
 
 /**
  * enum nft_hash_types - nf_tables hash expression types
@@ -949,7 +951,9 @@ enum nft_ct_keys {
 	NFT_CT_DST_IP,
 	NFT_CT_SRC_IP6,
 	NFT_CT_DST_IP6,
+	__NFT_CT_MAX
 };
+#define NFT_CT_MAX		(__NFT_CT_MAX - 1)
 
 /**
  * enum nft_ct_attributes - nf_tables ct expression netlink attributes
-- 
cgit v1.2.3-55-g7522


From bfb15f2a95cbbc548b59abf8007d0fdb35fdfee5 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera
Date: Thu, 3 May 2018 14:05:40 +0200
Subject: netfilter: extract Passive OS fingerprint infrastructure from xt_osf

Add nf_osf_ttl() and nf_osf_match() into nf_osf.c to prepare for
nf_tables support.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h      |  27 +++++
 include/uapi/linux/netfilter/nf_osf.h |  90 ++++++++++++++
 include/uapi/linux/netfilter/xt_osf.h | 106 +++--------------
 net/netfilter/Kconfig                 |   4 +
 net/netfilter/Makefile                |   1 +
 net/netfilter/nf_osf.c                | 218 ++++++++++++++++++++++++++++++++++
 net/netfilter/xt_osf.c                | 202 +------------------------------
 7 files changed, 359 insertions(+), 289 deletions(-)
 create mode 100644 include/linux/netfilter/nf_osf.h
 create mode 100644 include/uapi/linux/netfilter/nf_osf.h
 create mode 100644 net/netfilter/nf_osf.c

(limited to 'include/uapi')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
new file mode 100644
index 000000000000..a2b39602e87d
--- /dev/null
+++ b/include/linux/netfilter/nf_osf.h
@@ -0,0 +1,27 @@
+#include <uapi/linux/netfilter/nf_osf.h>
+
+/* Initial window size option state machine: multiple of mss, mtu or
+ * plain numeric value. Can also be made as plain numeric value which
+ * is not a multiple of specified value.
+ */
+enum nf_osf_window_size_options {
+	OSF_WSS_PLAIN   = 0,
+	OSF_WSS_MSS,
+	OSF_WSS_MTU,
+	OSF_WSS_MODULO,
+	OSF_WSS_MAX,
+};
+
+enum osf_fmatch_states {
+	/* Packet does not match the fingerprint */
+	FMATCH_WRONG = 0,
+	/* Packet matches the fingerprint */
+	FMATCH_OK,
+	/* Options do not match the fingerprint, but header does */
+	FMATCH_OPT_WRONG,
+};
+
+bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+		  int hooknum, struct net_device *in, struct net_device *out,
+		  const struct nf_osf_info *info, struct net *net,
+		  const struct list_head *nf_osf_fingers);
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
new file mode 100644
index 000000000000..45376eae31ef
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -0,0 +1,90 @@
+#ifndef _NF_OSF_H
+#define _NF_OSF_H
+
+#define MAXGENRELEN	32
+
+#define NF_OSF_GENRE	(1 << 0)
+#define NF_OSF_TTL	(1 << 1)
+#define NF_OSF_LOG	(1 << 2)
+#define NF_OSF_INVERT	(1 << 3)
+
+#define NF_OSF_LOGLEVEL_ALL		0	/* log all matched fingerprints */
+#define NF_OSF_LOGLEVEL_FIRST		1	/* log only the first matced fingerprint */
+#define NF_OSF_LOGLEVEL_ALL_KNOWN	2	/* do not log unknown packets */
+
+#define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
+
+/* Do not compare ip and fingerprint TTL at all */
+#define NF_OSF_TTL_NOCHECK		2
+
+/* Wildcard MSS (kind of).
+ * It is used to implement a state machine for the different wildcard values
+ * of the MSS and window sizes.
+ */
+struct nf_osf_wc {
+	__u32	wc;
+	__u32	val;
+};
+
+/* This struct represents IANA options
+ * http://www.iana.org/assignments/tcp-parameters
+ */
+struct nf_osf_opt {
+	__u16			kind, length;
+	struct nf_osf_wc	wc;
+};
+
+struct nf_osf_info {
+	char	genre[MAXGENRELEN];
+	__u32	len;
+	__u32	flags;
+	__u32	loglevel;
+	__u32	ttl;
+};
+
+struct nf_osf_user_finger {
+	struct nf_osf_wc	wss;
+
+	__u8	ttl, df;
+	__u16	ss, mss;
+	__u16	opt_num;
+
+	char	genre[MAXGENRELEN];
+	char	version[MAXGENRELEN];
+	char	subtype[MAXGENRELEN];
+
+	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
+	struct nf_osf_opt	opt[MAX_IPOPTLEN];
+};
+
+struct nf_osf_finger {
+	struct rcu_head			rcu_head;
+	struct list_head		finger_entry;
+	struct nf_osf_user_finger	finger;
+};
+
+struct nf_osf_nlmsg {
+	struct nf_osf_user_finger	f;
+	struct iphdr			ip;
+	struct tcphdr			tcp;
+};
+
+/* Defines for IANA option kinds */
+enum iana_options {
+	OSFOPT_EOL = 0,		/* End of options */
+	OSFOPT_NOP,		/* NOP */
+	OSFOPT_MSS,		/* Maximum segment size */
+	OSFOPT_WSO,		/* Window scale option */
+	OSFOPT_SACKP,		/* SACK permitted */
+	OSFOPT_SACK,		/* SACK */
+	OSFOPT_ECHO,
+	OSFOPT_ECHOREPLY,
+	OSFOPT_TS,		/* Timestamp option */
+	OSFOPT_POCP,		/* Partial Order Connection Permitted */
+	OSFOPT_POSP,		/* Partial Order Service Profile */
+
+	/* Others are not used in the current OSF */
+	OSFOPT_EMPTY = 255,
+};
+
+#endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index dad197e2ab99..72956eceeb09 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -23,101 +23,29 @@
 #include <linux/types.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
+#include <linux/netfilter/nf_osf.h>
 
-#define MAXGENRELEN		32
+#define XT_OSF_GENRE		NF_OSF_GENRE
+#define XT_OSF_INVERT		NF_OSF_INVERT
 
-#define XT_OSF_GENRE		(1<<0)
-#define	XT_OSF_TTL		(1<<1)
-#define XT_OSF_LOG		(1<<2)
-#define XT_OSF_INVERT		(1<<3)
+#define XT_OSF_TTL		NF_OSF_TTL
+#define XT_OSF_LOG		NF_OSF_LOG
 
-#define XT_OSF_LOGLEVEL_ALL	0	/* log all matched fingerprints */
-#define XT_OSF_LOGLEVEL_FIRST	1	/* log only the first matced fingerprint */
-#define XT_OSF_LOGLEVEL_ALL_KNOWN	2 /* do not log unknown packets */
+#define XT_OSF_LOGLEVEL_ALL		NF_OSF_LOGLEVEL_ALL
+#define XT_OSF_LOGLEVEL_FIRST		NF_OSF_LOGLEVEL_FIRST
+#define XT_OSF_LOGLEVEL_ALL_KNOWN	NF_OSF_LOGLEVEL_ALL_KNOWN
 
-#define XT_OSF_TTL_TRUE		0	/* True ip and fingerprint TTL comparison */
-#define XT_OSF_TTL_LESS		1	/* Check if ip TTL is less than fingerprint one */
-#define XT_OSF_TTL_NOCHECK	2	/* Do not compare ip and fingerprint TTL at all */
+#define XT_OSF_TTL_TRUE		NF_OSF_TTL_TRUE
+#define XT_OSF_TTL_NOCHECK	NF_OSF_TTL_NOCHECK
 
-struct xt_osf_info {
-	char			genre[MAXGENRELEN];
-	__u32			len;
-	__u32			flags;
-	__u32			loglevel;
-	__u32			ttl;
-};
-
-/*
- * Wildcard MSS (kind of).
- * It is used to implement a state machine for the different wildcard values
- * of the MSS and window sizes.
- */
-struct xt_osf_wc {
-	__u32			wc;
-	__u32			val;
-};
-
-/*
- * This struct represents IANA options
- * http://www.iana.org/assignments/tcp-parameters
- */
-struct xt_osf_opt {
-	__u16			kind, length;
-	struct xt_osf_wc	wc;
-};
-
-struct xt_osf_user_finger {
-	struct xt_osf_wc	wss;
-
-	__u8			ttl, df;
-	__u16			ss, mss;
-	__u16			opt_num;
-
-	char			genre[MAXGENRELEN];
-	char			version[MAXGENRELEN];
-	char			subtype[MAXGENRELEN];
+#define XT_OSF_TTL_LESS	1	/* Check if ip TTL is less than fingerprint one */
 
-	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
-	struct xt_osf_opt	opt[MAX_IPOPTLEN];
-};
-
-struct xt_osf_nlmsg {
-	struct xt_osf_user_finger	f;
-	struct iphdr		ip;
-	struct tcphdr		tcp;
-};
-
-/* Defines for IANA option kinds */
-
-enum iana_options {
-	OSFOPT_EOL = 0,		/* End of options */
-	OSFOPT_NOP, 		/* NOP */
-	OSFOPT_MSS, 		/* Maximum segment size */
-	OSFOPT_WSO, 		/* Window scale option */
-	OSFOPT_SACKP,		/* SACK permitted */
-	OSFOPT_SACK,		/* SACK */
-	OSFOPT_ECHO,
-	OSFOPT_ECHOREPLY,
-	OSFOPT_TS,		/* Timestamp option */
-	OSFOPT_POCP,		/* Partial Order Connection Permitted */
-	OSFOPT_POSP,		/* Partial Order Service Profile */
-
-	/* Others are not used in the current OSF */
-	OSFOPT_EMPTY = 255,
-};
-
-/*
- * Initial window size option state machine: multiple of mss, mtu or
- * plain numeric value. Can also be made as plain numeric value which
- * is not a multiple of specified value.
- */
-enum xt_osf_window_size_options {
-	OSF_WSS_PLAIN	= 0,
-	OSF_WSS_MSS,
-	OSF_WSS_MTU,
-	OSF_WSS_MODULO,
-	OSF_WSS_MAX,
-};
+#define xt_osf_wc		nf_osf_wc
+#define xt_osf_opt		nf_osf_opt
+#define xt_osf_info		nf_osf_info
+#define xt_osf_user_finger	nf_osf_user_finger
+#define xt_osf_finger		nf_osf_finger
+#define xt_osf_nlmsg		nf_osf_nlmsg
 
 /*
  * Add/remove fingerprint from the kernel.
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f66586fb41cd..e25cdb3d51cb 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -444,6 +444,9 @@ config NETFILTER_SYNPROXY
 
 endif # NF_CONNTRACK
 
+config NF_OSF
+	tristate 'Passive OS fingerprint infrastructure'
+
 config NF_TABLES
 	select NETFILTER_NETLINK
 	tristate "Netfilter nf_tables support"
@@ -1358,6 +1361,7 @@ config NETFILTER_XT_MATCH_NFACCT
 config NETFILTER_XT_MATCH_OSF
 	tristate '"osf" Passive OS fingerprint match'
 	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+	select NF_OSF
 	help
 	  This option selects the Passive OS Fingerprinting match module
 	  that allows to passively match the remote operating system by
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b37ce0bc9ab7..1aa710b5d384 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
 obj-$(CONFIG_NFT_FIB)		+= nft_fib.o
 obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
+obj-$(CONFIG_NF_OSF)		+= nf_osf.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
new file mode 100644
index 000000000000..5ba5c7bef2f9
--- /dev/null
+++ b/net/netfilter/nf_osf.c
@@ -0,0 +1,218 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/nf_osf.h>
+
+static inline int nf_osf_ttl(const struct sk_buff *skb,
+			     const struct nf_osf_info *info,
+			     unsigned char f_ttl)
+{
+	const struct iphdr *ip = ip_hdr(skb);
+
+	if (info->flags & NF_OSF_TTL) {
+		if (info->ttl == NF_OSF_TTL_TRUE)
+			return ip->ttl == f_ttl;
+		if (info->ttl == NF_OSF_TTL_NOCHECK)
+			return 1;
+		else if (ip->ttl <= f_ttl)
+			return 1;
+		else {
+			struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+			int ret = 0;
+
+			for_ifa(in_dev) {
+				if (inet_ifa_match(ip->saddr, ifa)) {
+					ret = (ip->ttl == f_ttl);
+					break;
+				}
+			}
+			endfor_ifa(in_dev);
+
+			return ret;
+		}
+	}
+
+	return ip->ttl == f_ttl;
+}
+
+bool
+nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+	     int hooknum, struct net_device *in, struct net_device *out,
+	     const struct nf_osf_info *info, struct net *net,
+	     const struct list_head *nf_osf_fingers)
+{
+	const unsigned char *optp = NULL, *_optp = NULL;
+	unsigned int optsize = 0, check_WSS = 0;
+	int fmatch = FMATCH_WRONG, fcount = 0;
+	const struct iphdr *ip = ip_hdr(skb);
+	const struct nf_osf_user_finger *f;
+	unsigned char opts[MAX_IPOPTLEN];
+	const struct nf_osf_finger *kf;
+	u16 window, totlen, mss = 0;
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+	bool df;
+
+	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+	if (!tcp)
+		return false;
+
+	if (!tcp->syn)
+		return false;
+
+	totlen = ntohs(ip->tot_len);
+	df = ntohs(ip->frag_off) & IP_DF;
+	window = ntohs(tcp->window);
+
+	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+		_optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+				sizeof(struct tcphdr), optsize, opts);
+	}
+
+	list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
+		int foptsize, optnum;
+
+		f = &kf->finger;
+
+		if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
+			continue;
+
+		optp = _optp;
+		fmatch = FMATCH_WRONG;
+
+		if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
+			continue;
+
+		/*
+		 * Should not happen if userspace parser was written correctly.
+		 */
+		if (f->wss.wc >= OSF_WSS_MAX)
+			continue;
+
+		/* Check options */
+
+		foptsize = 0;
+		for (optnum = 0; optnum < f->opt_num; ++optnum)
+			foptsize += f->opt[optnum].length;
+
+		if (foptsize > MAX_IPOPTLEN ||
+		    optsize > MAX_IPOPTLEN ||
+		    optsize != foptsize)
+			continue;
+
+		check_WSS = f->wss.wc;
+
+		for (optnum = 0; optnum < f->opt_num; ++optnum) {
+			if (f->opt[optnum].kind == (*optp)) {
+				__u32 len = f->opt[optnum].length;
+				const __u8 *optend = optp + len;
+
+				fmatch = FMATCH_OK;
+
+				switch (*optp) {
+				case OSFOPT_MSS:
+					mss = optp[3];
+					mss <<= 8;
+					mss |= optp[2];
+
+					mss = ntohs((__force __be16)mss);
+					break;
+				case OSFOPT_TS:
+					break;
+				}
+
+				optp = optend;
+			} else
+				fmatch = FMATCH_OPT_WRONG;
+
+			if (fmatch != FMATCH_OK)
+				break;
+		}
+
+		if (fmatch != FMATCH_OPT_WRONG) {
+			fmatch = FMATCH_WRONG;
+
+			switch (check_WSS) {
+			case OSF_WSS_PLAIN:
+				if (f->wss.val == 0 || window == f->wss.val)
+					fmatch = FMATCH_OK;
+				break;
+			case OSF_WSS_MSS:
+				/*
+				 * Some smart modems decrease mangle MSS to
+				 * SMART_MSS_2, so we check standard, decreased
+				 * and the one provided in the fingerprint MSS
+				 * values.
+				 */
+#define SMART_MSS_1	1460
+#define SMART_MSS_2	1448
+				if (window == f->wss.val * mss ||
+				    window == f->wss.val * SMART_MSS_1 ||
+				    window == f->wss.val * SMART_MSS_2)
+					fmatch = FMATCH_OK;
+				break;
+			case OSF_WSS_MTU:
+				if (window == f->wss.val * (mss + 40) ||
+				    window == f->wss.val * (SMART_MSS_1 + 40) ||
+				    window == f->wss.val * (SMART_MSS_2 + 40))
+					fmatch = FMATCH_OK;
+				break;
+			case OSF_WSS_MODULO:
+				if ((window % f->wss.val) == 0)
+					fmatch = FMATCH_OK;
+				break;
+			}
+		}
+
+		if (fmatch != FMATCH_OK)
+			continue;
+
+		fcount++;
+
+		if (info->flags & NF_OSF_LOG)
+			nf_log_packet(net, family, hooknum, skb,
+				      in, out, NULL,
+				      "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+				      f->genre, f->version, f->subtype,
+				      &ip->saddr, ntohs(tcp->source),
+				      &ip->daddr, ntohs(tcp->dest),
+				      f->ttl - ip->ttl);
+
+		if ((info->flags & NF_OSF_LOG) &&
+		    info->loglevel == NF_OSF_LOGLEVEL_FIRST)
+			break;
+	}
+
+	if (!fcount && (info->flags & NF_OSF_LOG))
+		nf_log_packet(net, family, hooknum, skb, in, out, NULL,
+			      "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+			      &ip->saddr, ntohs(tcp->source),
+			      &ip->daddr, ntohs(tcp->dest));
+
+	if (fcount)
+		fmatch = FMATCH_OK;
+
+	return fmatch == FMATCH_OK;
+}
+EXPORT_SYMBOL_GPL(nf_osf_match);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a34f314a8c23..9cfef73b4107 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,21 +37,6 @@
 #include <net/netfilter/nf_log.h>
 #include <linux/netfilter/xt_osf.h>
 
-struct xt_osf_finger {
-	struct rcu_head			rcu_head;
-	struct list_head		finger_entry;
-	struct xt_osf_user_finger	finger;
-};
-
-enum osf_fmatch_states {
-	/* Packet does not match the fingerprint */
-	FMATCH_WRONG = 0,
-	/* Packet matches the fingerprint */
-	FMATCH_OK,
-	/* Options do not match the fingerprint, but header does */
-	FMATCH_OPT_WRONG,
-};
-
 /*
  * Indexed by dont-fragment bit.
  * It is the only constant value in the fingerprint.
@@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
 	.cb			= xt_osf_nfnetlink_callbacks,
 };
 
-static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
-			    unsigned char f_ttl)
-{
-	const struct iphdr *ip = ip_hdr(skb);
-
-	if (info->flags & XT_OSF_TTL) {
-		if (info->ttl == XT_OSF_TTL_TRUE)
-			return ip->ttl == f_ttl;
-		if (info->ttl == XT_OSF_TTL_NOCHECK)
-			return 1;
-		else if (ip->ttl <= f_ttl)
-			return 1;
-		else {
-			struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
-			int ret = 0;
-
-			for_ifa(in_dev) {
-				if (inet_ifa_match(ip->saddr, ifa)) {
-					ret = (ip->ttl == f_ttl);
-					break;
-				}
-			}
-			endfor_ifa(in_dev);
-
-			return ret;
-		}
-	}
-
-	return ip->ttl == f_ttl;
-}
-
 static bool
 xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 {
 	const struct xt_osf_info *info = p->matchinfo;
-	const struct iphdr *ip = ip_hdr(skb);
-	const struct tcphdr *tcp;
-	struct tcphdr _tcph;
-	int fmatch = FMATCH_WRONG, fcount = 0;
-	unsigned int optsize = 0, check_WSS = 0;
-	u16 window, totlen, mss = 0;
-	bool df;
-	const unsigned char *optp = NULL, *_optp = NULL;
-	unsigned char opts[MAX_IPOPTLEN];
-	const struct xt_osf_finger *kf;
-	const struct xt_osf_user_finger *f;
 	struct net *net = xt_net(p);
 
 	if (!info)
 		return false;
 
-	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
-	if (!tcp)
-		return false;
-
-	if (!tcp->syn)
-		return false;
-
-	totlen = ntohs(ip->tot_len);
-	df = ntohs(ip->frag_off) & IP_DF;
-	window = ntohs(tcp->window);
-
-	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
-		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
-		_optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
-				sizeof(struct tcphdr), optsize, opts);
-	}
-
-	list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
-		int foptsize, optnum;
-
-		f = &kf->finger;
-
-		if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
-			continue;
-
-		optp = _optp;
-		fmatch = FMATCH_WRONG;
-
-		if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl))
-			continue;
-
-		/*
-		 * Should not happen if userspace parser was written correctly.
-		 */
-		if (f->wss.wc >= OSF_WSS_MAX)
-			continue;
-
-		/* Check options */
-
-		foptsize = 0;
-		for (optnum = 0; optnum < f->opt_num; ++optnum)
-			foptsize += f->opt[optnum].length;
-
-		if (foptsize > MAX_IPOPTLEN ||
-		    optsize > MAX_IPOPTLEN ||
-		    optsize != foptsize)
-			continue;
-
-		check_WSS = f->wss.wc;
-
-		for (optnum = 0; optnum < f->opt_num; ++optnum) {
-			if (f->opt[optnum].kind == (*optp)) {
-				__u32 len = f->opt[optnum].length;
-				const __u8 *optend = optp + len;
-
-				fmatch = FMATCH_OK;
-
-				switch (*optp) {
-				case OSFOPT_MSS:
-					mss = optp[3];
-					mss <<= 8;
-					mss |= optp[2];
-
-					mss = ntohs((__force __be16)mss);
-					break;
-				case OSFOPT_TS:
-					break;
-				}
-
-				optp = optend;
-			} else
-				fmatch = FMATCH_OPT_WRONG;
-
-			if (fmatch != FMATCH_OK)
-				break;
-		}
-
-		if (fmatch != FMATCH_OPT_WRONG) {
-			fmatch = FMATCH_WRONG;
-
-			switch (check_WSS) {
-			case OSF_WSS_PLAIN:
-				if (f->wss.val == 0 || window == f->wss.val)
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MSS:
-				/*
-				 * Some smart modems decrease mangle MSS to
-				 * SMART_MSS_2, so we check standard, decreased
-				 * and the one provided in the fingerprint MSS
-				 * values.
-				 */
-#define SMART_MSS_1	1460
-#define SMART_MSS_2	1448
-				if (window == f->wss.val * mss ||
-				    window == f->wss.val * SMART_MSS_1 ||
-				    window == f->wss.val * SMART_MSS_2)
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MTU:
-				if (window == f->wss.val * (mss + 40) ||
-				    window == f->wss.val * (SMART_MSS_1 + 40) ||
-				    window == f->wss.val * (SMART_MSS_2 + 40))
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MODULO:
-				if ((window % f->wss.val) == 0)
-					fmatch = FMATCH_OK;
-				break;
-			}
-		}
-
-		if (fmatch != FMATCH_OK)
-			continue;
-
-		fcount++;
-
-		if (info->flags & XT_OSF_LOG)
-			nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
-				      xt_in(p), xt_out(p), NULL,
-				      "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
-				      f->genre, f->version, f->subtype,
-				      &ip->saddr, ntohs(tcp->source),
-				      &ip->daddr, ntohs(tcp->dest),
-				      f->ttl - ip->ttl);
-
-		if ((info->flags & XT_OSF_LOG) &&
-		    info->loglevel == XT_OSF_LOGLEVEL_FIRST)
-			break;
-	}
-
-	if (!fcount && (info->flags & XT_OSF_LOG))
-		nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
-			      xt_out(p), NULL,
-			"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
-				&ip->saddr, ntohs(tcp->source),
-				&ip->daddr, ntohs(tcp->dest));
-
-	if (fcount)
-		fmatch = FMATCH_OK;
-
-	return fmatch == FMATCH_OK;
+	return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
+			    xt_out(p), info, net, xt_osf_fingers);
 }
 
 static struct xt_match xt_osf_match = {
-- 
cgit v1.2.3-55-g7522


From 538c5672be6d67b7b10c15701588e20617374973 Mon Sep 17 00:00:00 2001
From: Florent Fourcot
Date: Sun, 6 May 2018 16:30:14 +0200
Subject: netfilter: ctnetlink: export nf_conntrack_max

IPCTNL_MSG_CT_GET_STATS netlink command allow to monitor current number
of conntrack entries. However, if one wants to compare it with the
maximum (and detect exhaustion), the only solution is currently to read
sysctl value.

This patch add nf_conntrack_max value in netlink message, and simplify
monitoring for application built on netlink API.

Signed-off-by: Florent Fourcot <florent.fourcot@wifirst.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 +
 net/netfilter/nf_conntrack_core.c                  | 1 +
 net/netfilter/nf_conntrack_netlink.c               | 3 +++
 3 files changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 77987111cab0..1d41810d17e2 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -262,6 +262,7 @@ enum ctattr_stats_cpu {
 enum ctattr_stats_global {
 	CTA_STATS_GLOBAL_UNSPEC,
 	CTA_STATS_GLOBAL_ENTRIES,
+	CTA_STATS_GLOBAL_MAX_ENTRIES,
 	__CTA_STATS_GLOBAL_MAX,
 };
 #define CTA_STATS_GLOBAL_MAX (__CTA_STATS_GLOBAL_MAX - 1)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 41ff04ee2554..605441727008 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -186,6 +186,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
 unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_t nf_conntrack_generation __read_mostly;
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4c1d0c5bc268..d807b8770be3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return skb->len;
 
-- 
cgit v1.2.3-55-g7522


From 50f32718e125c3be5b0528bfa3868e88d677d8ce Mon Sep 17 00:00:00 2001
From: Haim Dreyfuss
Date: Fri, 20 Apr 2018 13:49:26 +0300
Subject: nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command

This will serve userspace entity to maintain its regulatory limitation.
More specifcally APs can use this data to calculate the WMM IE when
building: beacons, probe responses, assoc responses etc...

Signed-off-by: Haim Dreyfuss <haim.dreyfuss@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++
 net/wireless/nl80211.c       | 57 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 15daf5e2638d..04c9b97aa5fc 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -11,6 +11,7 @@
  * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
  * Copyright 2008 Colin McCabe <colin@cozybit.com>
  * Copyright 2015-2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -3141,6 +3142,29 @@ enum nl80211_band_attr {
 
 #define NL80211_BAND_ATTR_HT_CAPA NL80211_BAND_ATTR_HT_CAPA
 
+/**
+ * enum nl80211_wmm_rule - regulatory wmm rule
+ *
+ * @__NL80211_WMMR_INVALID: attribute number 0 is reserved
+ * @NL80211_WMMR_CW_MIN: Minimum contention window slot.
+ * @NL80211_WMMR_CW_MAX: Maximum contention window slot.
+ * @NL80211_WMMR_AIFSN: Arbitration Inter Frame Space.
+ * @NL80211_WMMR_TXOP: Maximum allowed tx operation time.
+ * @nl80211_WMMR_MAX: highest possible wmm rule.
+ * @__NL80211_WMMR_LAST: Internal use.
+ */
+enum nl80211_wmm_rule {
+	__NL80211_WMMR_INVALID,
+	NL80211_WMMR_CW_MIN,
+	NL80211_WMMR_CW_MAX,
+	NL80211_WMMR_AIFSN,
+	NL80211_WMMR_TXOP,
+
+	/* keep last */
+	__NL80211_WMMR_LAST,
+	NL80211_WMMR_MAX = __NL80211_WMMR_LAST - 1
+};
+
 /**
  * enum nl80211_frequency_attr - frequency attributes
  * @__NL80211_FREQUENCY_ATTR_INVALID: attribute number 0 is reserved
@@ -3190,6 +3214,9 @@ enum nl80211_band_attr {
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_NO_10MHZ: 10 MHz operation is not allowed
  *	on this channel in current regulatory domain.
+ * @NL80211_FREQUENCY_ATTR_WMM: this channel has wmm limitations.
+ *	This is a nested attribute that contains the wmm limitation per AC.
+ *	(see &enum nl80211_wmm_rule)
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -3218,6 +3245,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_IR_CONCURRENT,
 	NL80211_FREQUENCY_ATTR_NO_20MHZ,
 	NL80211_FREQUENCY_ATTR_NO_10MHZ,
+	NL80211_FREQUENCY_ATTR_WMM,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ff28f8feeb09..016d0a1de576 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4,6 +4,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  */
 
 #include <linux/if.h>
@@ -645,7 +646,43 @@ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
 	return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
 }
 
-static int nl80211_msg_put_channel(struct sk_buff *msg,
+static int nl80211_msg_put_wmm_rules(struct sk_buff *msg,
+				     const struct ieee80211_reg_rule *rule)
+{
+	int j;
+	struct nlattr *nl_wmm_rules =
+		nla_nest_start(msg, NL80211_FREQUENCY_ATTR_WMM);
+
+	if (!nl_wmm_rules)
+		goto nla_put_failure;
+
+	for (j = 0; j < IEEE80211_NUM_ACS; j++) {
+		struct nlattr *nl_wmm_rule = nla_nest_start(msg, j);
+
+		if (!nl_wmm_rule)
+			goto nla_put_failure;
+
+		if (nla_put_u16(msg, NL80211_WMMR_CW_MIN,
+				rule->wmm_rule->client[j].cw_min) ||
+		    nla_put_u16(msg, NL80211_WMMR_CW_MAX,
+				rule->wmm_rule->client[j].cw_max) ||
+		    nla_put_u8(msg, NL80211_WMMR_AIFSN,
+			       rule->wmm_rule->client[j].aifsn) ||
+		    nla_put_u8(msg, NL80211_WMMR_TXOP,
+			       rule->wmm_rule->client[j].cot))
+			goto nla_put_failure;
+
+		nla_nest_end(msg, nl_wmm_rule);
+	}
+	nla_nest_end(msg, nl_wmm_rules);
+
+	return 0;
+
+nla_put_failure:
+	return -ENOBUFS;
+}
+
+static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 				   struct ieee80211_channel *chan,
 				   bool large)
 {
@@ -721,6 +758,16 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
 			DBM_TO_MBM(chan->max_power)))
 		goto nla_put_failure;
 
+	if (large) {
+		const struct ieee80211_reg_rule *rule =
+			freq_reg_info(wiphy, chan->center_freq);
+
+		if (!IS_ERR(rule) && rule->wmm_rule) {
+			if (nl80211_msg_put_wmm_rules(msg, rule))
+				goto nla_put_failure;
+		}
+	}
+
 	return 0;
 
  nla_put_failure:
@@ -1631,7 +1678,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 					chan = &sband->channels[i];
 
 					if (nl80211_msg_put_channel(
-							msg, chan,
+							msg, &rdev->wiphy, chan,
 							state->split))
 						goto nla_put_failure;
 
@@ -14320,7 +14367,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 	nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE);
 	if (!nl_freq)
 		goto nla_put_failure;
-	if (nl80211_msg_put_channel(msg, channel_before, false))
+
+	if (nl80211_msg_put_channel(msg, wiphy, channel_before, false))
 		goto nla_put_failure;
 	nla_nest_end(msg, nl_freq);
 
@@ -14328,7 +14376,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 	nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER);
 	if (!nl_freq)
 		goto nla_put_failure;
-	if (nl80211_msg_put_channel(msg, channel_after, false))
+
+	if (nl80211_msg_put_channel(msg, wiphy, channel_after, false))
 		goto nla_put_failure;
 	nla_nest_end(msg, nl_freq);
 
-- 
cgit v1.2.3-55-g7522


From 81d5439da84419ee35bea54309a9f2c3871b6605 Mon Sep 17 00:00:00 2001
From: Balaji Pothunoori
Date: Mon, 16 Apr 2018 20:18:40 +0530
Subject: cfg80211: average ack rssi support for data frames

Average ack rssi will be given to userspace via NL80211 interface
if firmware is capable. Userspace tool ‘iw’ can process this
information and give the output as one of the fields in
‘iw dev wlanX station dump’.

Example output :

localhost ~ #iw dev wlan-5000mhz station dump Station
34:f3:9a:aa:3b:29 (on wlan-5000mhz)
        inactive time:  5370 ms
        rx bytes:       85321
        rx packets:     576
        tx bytes:       14225
        tx packets:     71
        tx retries:     0
        tx failed:      2
        beacon loss:    0
        rx drop misc:   0
        signal:         -54 dBm
        signal avg:     -53 dBm
        tx bitrate:     866.7 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 2
        rx bitrate:     866.7 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 2
        avg ack signal: -56 dBm
        authorized:     yes
        authenticated:  yes
        associated:     yes
        preamble:       short
        WMM/WME:        yes
        MFP:            no
        TDLS peer:      no
        DTIM period:    2
        beacon interval:100
       short preamble: yes
       short slot time:yes
       connected time: 203 seconds

Main use case is to measure the signal strength of a connected station
to AP. Data packet transmit rates and bandwidth used by station can vary
a lot even if the station is at fixed location, especially if the rates
used are multi stream(2stream, 3stream) rates with different bandwidth(20/40/80 Mhz).
These multi stream rates are sensitive and station can use different transmit power
for each of the rate and bandwidth combinations. RSSI measured from these RX packets
on AP will be not stable and can vary a lot with in a short time.
Whereas 802.11 ack frames from station are sent relatively at a constant
rate (6/12/24 Mbps) with constant bandwidth(20 Mhz).
So average rssi of the ack packets is good and more accurate.

Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 3 +++
 include/uapi/linux/nl80211.h | 7 +++++++
 net/wireless/nl80211.c       | 3 +++
 3 files changed, 13 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 250dac390806..5e888ec62811 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1152,6 +1152,8 @@ struct cfg80211_tid_stats {
  * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
  *	(IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
  * @ack_signal: signal strength (in dBm) of the last ACK frame.
+ * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
+ *	been sent.
  */
 struct station_info {
 	u64 filled;
@@ -1197,6 +1199,7 @@ struct station_info {
 	u8 rx_beacon_signal_avg;
 	struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1];
 	s8 ack_signal;
+	s8 avg_ack_signal;
 };
 
 #if IS_ENABLED(CONFIG_CFG80211)
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 04c9b97aa5fc..8e55b63ed3ff 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2981,6 +2981,8 @@ enum nl80211_sta_bss_param {
  *	received from the station (u64, usec)
  * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment
  * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm)
+ * @NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG: avg signal strength of (data)
+ *	ACK frame (s8, dBm)
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3020,6 +3022,7 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_RX_DURATION,
 	NL80211_STA_INFO_PAD,
 	NL80211_STA_INFO_ACK_SIGNAL,
+	NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
@@ -5066,6 +5069,9 @@ enum nl80211_feature_flags {
  *	"radar detected" event.
  * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and
  *	receiving control port frames over nl80211 instead of the netdevice.
+ * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack
+ *	rssi if firmware support, this flag is to intimate about ack rssi
+ *	support to nl80211.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -5098,6 +5104,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN,
 	NL80211_EXT_FEATURE_DFS_OFFLOAD,
 	NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211,
+	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 016d0a1de576..6b942a68d1c8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4541,6 +4541,9 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO_U64(BEACON_RX, rx_beacon);
 	PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
 	PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
+	if (wiphy_ext_feature_isset(&rdev->wiphy,
+				    NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT))
+		PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8);
 
 #undef PUT_SINFO
 #undef PUT_SINFO_U64
-- 
cgit v1.2.3-55-g7522


From 52539ca89f365d3db530535fbffa88a3cca4d2ec Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen
Date: Tue, 8 May 2018 13:03:50 +0200
Subject: cfg80211: Expose TXQ stats and parameters to userspace

This adds support for exporting the mac80211 TXQ stats via nl80211 by
way of a nested TXQ stats attribute, as well as for configuring the
quantum and limits that were previously only changeable through debugfs.

This commit adds just the nl80211 API, a subsequent commit adds support to
mac80211 itself.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  50 +++++++++++
 include/uapi/linux/nl80211.h |  58 +++++++++++++
 net/mac80211/ethtool.c       |  32 ++++---
 net/wireless/nl80211.c       | 202 +++++++++++++++++++++++++++++++++++++------
 net/wireless/rdev-ops.h      |  12 +++
 net/wireless/trace.h         |  14 +++
 net/wireless/wext-compat.c   |  23 +++--
 7 files changed, 343 insertions(+), 48 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5e888ec62811..8db6071b6063 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1079,6 +1079,37 @@ struct sta_bss_parameters {
 	u16 beacon_interval;
 };
 
+/**
+ * struct cfg80211_txq_stats - TXQ statistics for this TID
+ * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to
+ *	indicate the relevant values in this struct are filled
+ * @backlog_bytes: total number of bytes currently backlogged
+ * @backlog_packets: total number of packets currently backlogged
+ * @flows: number of new flows seen
+ * @drops: total number of packets dropped
+ * @ecn_marks: total number of packets marked with ECN CE
+ * @overlimit: number of drops due to queue space overflow
+ * @overmemory: number of drops due to memory limit overflow
+ * @collisions: number of hash collisions
+ * @tx_bytes: total number of bytes dequeued
+ * @tx_packets: total number of packets dequeued
+ * @max_flows: maximum number of flows supported
+ */
+struct cfg80211_txq_stats {
+	u32 filled;
+	u32 backlog_bytes;
+	u32 backlog_packets;
+	u32 flows;
+	u32 drops;
+	u32 ecn_marks;
+	u32 overlimit;
+	u32 overmemory;
+	u32 collisions;
+	u32 tx_bytes;
+	u32 tx_packets;
+	u32 max_flows;
+};
+
 /**
  * struct cfg80211_tid_stats - per-TID statistics
  * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to
@@ -1088,6 +1119,7 @@ struct sta_bss_parameters {
  * @tx_msdu_retries: number of retries (not counting the first) for
  *	transmitted MSDUs
  * @tx_msdu_failed: number of failed transmitted MSDUs
+ * @txq_stats: TXQ statistics
  */
 struct cfg80211_tid_stats {
 	u32 filled;
@@ -1095,6 +1127,7 @@ struct cfg80211_tid_stats {
 	u64 tx_msdu;
 	u64 tx_msdu_retries;
 	u64 tx_msdu_failed;
+	struct cfg80211_txq_stats txq_stats;
 };
 
 #define IEEE80211_MAX_CHAINS	4
@@ -2204,6 +2237,9 @@ enum cfg80211_connect_params_changed {
  * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed
  * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed
  * @WIPHY_PARAM_DYN_ACK: dynack has been enabled
+ * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed
+ * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed
+ * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum
  */
 enum wiphy_params_flags {
 	WIPHY_PARAM_RETRY_SHORT		= 1 << 0,
@@ -2212,6 +2248,9 @@ enum wiphy_params_flags {
 	WIPHY_PARAM_RTS_THRESHOLD	= 1 << 3,
 	WIPHY_PARAM_COVERAGE_CLASS	= 1 << 4,
 	WIPHY_PARAM_DYN_ACK		= 1 << 5,
+	WIPHY_PARAM_TXQ_LIMIT		= 1 << 6,
+	WIPHY_PARAM_TXQ_MEMORY_LIMIT	= 1 << 7,
+	WIPHY_PARAM_TXQ_QUANTUM		= 1 << 8,
 };
 
 /**
@@ -2964,6 +3003,9 @@ struct cfg80211_external_auth_params {
  *
  * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
  *
+ * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this
+ *      function should return phy stats, and interface stats otherwise.
+ *
  * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake.
  *	If not deleted through @del_pmk the PMK remains valid until disconnect
  *	upon which the driver should clear it.
@@ -3265,6 +3307,10 @@ struct cfg80211_ops {
 					    struct net_device *dev,
 					    const bool enabled);
 
+	int	(*get_txq_stats)(struct wiphy *wiphy,
+				 struct wireless_dev *wdev,
+				 struct cfg80211_txq_stats *txqstats);
+
 	int	(*set_pmk)(struct wiphy *wiphy, struct net_device *dev,
 			   const struct cfg80211_pmk_conf *conf);
 	int	(*del_pmk)(struct wiphy *wiphy, struct net_device *dev,
@@ -3943,6 +3989,10 @@ struct wiphy {
 
 	u8 nan_supported_bands;
 
+	u32 txq_limit;
+	u32 txq_memory_limit;
+	u32 txq_quantum;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8e55b63ed3ff..5e67e3444aba 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2226,6 +2226,16 @@ enum nl80211_commands {
  * @NL80211_ATTR_NSS: Station's New/updated  RX_NSS value notified using this
  *	u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED.
  *
+ * @NL80211_ATTR_TXQ_STATS: TXQ statistics (nested attribute, see &enum
+ *      nl80211_txq_stats)
+ * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy.
+ *      The smaller of this and the memory limit is enforced.
+ * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory memory limit (in bytes) for the
+ *      TXQ queues for this phy. The smaller of this and the packet limit is
+ *      enforced.
+ * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes
+ *      a flow is assigned on each round of the DRR scheduler.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2660,6 +2670,11 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_CONTROL_PORT_OVER_NL80211,
 
+	NL80211_ATTR_TXQ_STATS,
+	NL80211_ATTR_TXQ_LIMIT,
+	NL80211_ATTR_TXQ_MEMORY_LIMIT,
+	NL80211_ATTR_TXQ_QUANTUM,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3040,6 +3055,7 @@ enum nl80211_sta_info {
  * @NL80211_TID_STATS_TX_MSDU_FAILED: number of failed transmitted
  *	MSDUs (u64)
  * @NL80211_TID_STATS_PAD: attribute used for padding for 64-bit alignment
+ * @NL80211_TID_STATS_TXQ_STATS: TXQ stats (nested attribute)
  * @NUM_NL80211_TID_STATS: number of attributes here
  * @NL80211_TID_STATS_MAX: highest numbered attribute here
  */
@@ -3050,12 +3066,51 @@ enum nl80211_tid_stats {
 	NL80211_TID_STATS_TX_MSDU_RETRIES,
 	NL80211_TID_STATS_TX_MSDU_FAILED,
 	NL80211_TID_STATS_PAD,
+	NL80211_TID_STATS_TXQ_STATS,
 
 	/* keep last */
 	NUM_NL80211_TID_STATS,
 	NL80211_TID_STATS_MAX = NUM_NL80211_TID_STATS - 1
 };
 
+/**
+ * enum nl80211_txq_stats - per TXQ statistics attributes
+ * @__NL80211_TXQ_STATS_INVALID: attribute number 0 is reserved
+ * @NUM_NL80211_TXQ_STATS: number of attributes here
+ * @NL80211_TXQ_STATS_BACKLOG_BYTES: number of bytes currently backlogged
+ * @NL80211_TXQ_STATS_BACKLOG_PACKETS: number of packets currently
+ *      backlogged
+ * @NL80211_TXQ_STATS_FLOWS: total number of new flows seen
+ * @NL80211_TXQ_STATS_DROPS: total number of packet drops
+ * @NL80211_TXQ_STATS_ECN_MARKS: total number of packet ECN marks
+ * @NL80211_TXQ_STATS_OVERLIMIT: number of drops due to queue space overflow
+ * @NL80211_TXQ_STATS_OVERMEMORY: number of drops due to memory limit overflow
+ *      (only for per-phy stats)
+ * @NL80211_TXQ_STATS_COLLISIONS: number of hash collisions
+ * @NL80211_TXQ_STATS_TX_BYTES: total number of bytes dequeued from TXQ
+ * @NL80211_TXQ_STATS_TX_PACKETS: total number of packets dequeued from TXQ
+ * @NL80211_TXQ_STATS_MAX_FLOWS: number of flow buckets for PHY
+ * @NL80211_TXQ_STATS_MAX: highest numbered attribute here
+ */
+enum nl80211_txq_stats {
+	__NL80211_TXQ_STATS_INVALID,
+	NL80211_TXQ_STATS_BACKLOG_BYTES,
+	NL80211_TXQ_STATS_BACKLOG_PACKETS,
+	NL80211_TXQ_STATS_FLOWS,
+	NL80211_TXQ_STATS_DROPS,
+	NL80211_TXQ_STATS_ECN_MARKS,
+	NL80211_TXQ_STATS_OVERLIMIT,
+	NL80211_TXQ_STATS_OVERMEMORY,
+	NL80211_TXQ_STATS_COLLISIONS,
+	NL80211_TXQ_STATS_TX_BYTES,
+	NL80211_TXQ_STATS_TX_PACKETS,
+	NL80211_TXQ_STATS_MAX_FLOWS,
+
+	/* keep last */
+	NUM_NL80211_TXQ_STATS,
+	NL80211_TXQ_STATS_MAX = NUM_NL80211_TXQ_STATS - 1
+};
+
 /**
  * enum nl80211_mpath_flags - nl80211 mesh path flags
  *
@@ -5072,6 +5127,8 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack
  *	rssi if firmware support, this flag is to intimate about ack rssi
  *	support to nl80211.
+ * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate
+ *      TXQs.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -5105,6 +5162,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_DFS_OFFLOAD,
 	NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211,
 	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT,
+	NL80211_EXT_FEATURE_TXQS,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 08408520c3f8..1afeff94af8b 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -71,11 +71,15 @@ static void ieee80211_get_stats(struct net_device *dev,
 	struct ieee80211_channel *channel;
 	struct sta_info *sta;
 	struct ieee80211_local *local = sdata->local;
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct survey_info survey;
 	int i, q;
 #define STA_STATS_SURVEY_LEN 7
 
+	sinfo = kmalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return;
+
 	memset(data, 0, sizeof(u64) * STA_STATS_LEN);
 
 #define ADD_STA_STATS(sta)					\
@@ -86,8 +90,8 @@ static void ieee80211_get_stats(struct net_device *dev,
 		data[i++] += sta->rx_stats.fragments;		\
 		data[i++] += sta->rx_stats.dropped;		\
 								\
-		data[i++] += sinfo.tx_packets;			\
-		data[i++] += sinfo.tx_bytes;			\
+		data[i++] += sinfo->tx_packets;			\
+		data[i++] += sinfo->tx_bytes;			\
 		data[i++] += sta->status_stats.filtered;	\
 		data[i++] += sta->status_stats.retry_failed;	\
 		data[i++] += sta->status_stats.retry_count;	\
@@ -107,8 +111,8 @@ static void ieee80211_get_stats(struct net_device *dev,
 		if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
 			goto do_survey;
 
-		memset(&sinfo, 0, sizeof(sinfo));
-		sta_set_sinfo(sta, &sinfo);
+		memset(sinfo, 0, sizeof(*sinfo));
+		sta_set_sinfo(sta, sinfo);
 
 		i = 0;
 		ADD_STA_STATS(sta);
@@ -116,17 +120,17 @@ static void ieee80211_get_stats(struct net_device *dev,
 		data[i++] = sta->sta_state;
 
 
-		if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+		if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))
 			data[i] = 100000 *
-				cfg80211_calculate_bitrate(&sinfo.txrate);
+				cfg80211_calculate_bitrate(&sinfo->txrate);
 		i++;
-		if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
+		if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))
 			data[i] = 100000 *
-				cfg80211_calculate_bitrate(&sinfo.rxrate);
+				cfg80211_calculate_bitrate(&sinfo->rxrate);
 		i++;
 
-		if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
-			data[i] = (u8)sinfo.signal_avg;
+		if (sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
+			data[i] = (u8)sinfo->signal_avg;
 		i++;
 	} else {
 		list_for_each_entry(sta, &local->sta_list, list) {
@@ -134,14 +138,16 @@ static void ieee80211_get_stats(struct net_device *dev,
 			if (sta->sdata->dev != dev)
 				continue;
 
-			memset(&sinfo, 0, sizeof(sinfo));
-			sta_set_sinfo(sta, &sinfo);
+			memset(sinfo, 0, sizeof(*sinfo));
+			sta_set_sinfo(sta, sinfo);
 			i = 0;
 			ADD_STA_STATS(sta);
 		}
 	}
 
 do_survey:
+	kfree(sinfo);
+
 	i = STA_STATS_LEN - STA_STATS_SURVEY_LEN;
 	/* Get survey stats for current channel */
 	survey.filled = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6b942a68d1c8..f7715b85fd2b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -424,6 +424,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
 	[NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
 	[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
+
+	[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
+	[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
+	[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
 };
 
 /* policy for the key attributes */
@@ -774,6 +778,39 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 	return -ENOBUFS;
 }
 
+static bool nl80211_put_txq_stats(struct sk_buff *msg,
+				  struct cfg80211_txq_stats *txqstats,
+				  int attrtype)
+{
+	struct nlattr *txqattr;
+
+#define PUT_TXQVAL_U32(attr, memb) do {					  \
+	if (txqstats->filled & BIT(NL80211_TXQ_STATS_ ## attr) &&	  \
+	    nla_put_u32(msg, NL80211_TXQ_STATS_ ## attr, txqstats->memb)) \
+		return false;						  \
+	} while (0)
+
+	txqattr = nla_nest_start(msg, attrtype);
+	if (!txqattr)
+		return false;
+
+	PUT_TXQVAL_U32(BACKLOG_BYTES, backlog_bytes);
+	PUT_TXQVAL_U32(BACKLOG_PACKETS, backlog_packets);
+	PUT_TXQVAL_U32(FLOWS, flows);
+	PUT_TXQVAL_U32(DROPS, drops);
+	PUT_TXQVAL_U32(ECN_MARKS, ecn_marks);
+	PUT_TXQVAL_U32(OVERLIMIT, overlimit);
+	PUT_TXQVAL_U32(OVERMEMORY, overmemory);
+	PUT_TXQVAL_U32(COLLISIONS, collisions);
+	PUT_TXQVAL_U32(TX_BYTES, tx_bytes);
+	PUT_TXQVAL_U32(TX_PACKETS, tx_packets);
+	PUT_TXQVAL_U32(MAX_FLOWS, max_flows);
+	nla_nest_end(msg, txqattr);
+
+#undef PUT_TXQVAL_U32
+	return true;
+}
+
 /* netlink command implementations */
 
 struct key_parse {
@@ -1973,6 +2010,28 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 				rdev->wiphy.nan_supported_bands))
 			goto nla_put_failure;
 
+		if (wiphy_ext_feature_isset(&rdev->wiphy,
+					    NL80211_EXT_FEATURE_TXQS)) {
+			struct cfg80211_txq_stats txqstats = {};
+			int res;
+
+			res = rdev_get_txq_stats(rdev, NULL, &txqstats);
+			if (!res &&
+			    !nl80211_put_txq_stats(msg, &txqstats,
+						   NL80211_ATTR_TXQ_STATS))
+				goto nla_put_failure;
+
+			if (nla_put_u32(msg, NL80211_ATTR_TXQ_LIMIT,
+					rdev->wiphy.txq_limit))
+				goto nla_put_failure;
+			if (nla_put_u32(msg, NL80211_ATTR_TXQ_MEMORY_LIMIT,
+					rdev->wiphy.txq_memory_limit))
+				goto nla_put_failure;
+			if (nla_put_u32(msg, NL80211_ATTR_TXQ_QUANTUM,
+					rdev->wiphy.txq_quantum))
+				goto nla_put_failure;
+		}
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -2350,6 +2409,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	u8 retry_short = 0, retry_long = 0;
 	u32 frag_threshold = 0, rts_threshold = 0;
 	u8 coverage_class = 0;
+	u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0;
 
 	ASSERT_RTNL();
 
@@ -2556,10 +2616,38 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		changed |= WIPHY_PARAM_DYN_ACK;
 	}
 
+	if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+		txq_limit = nla_get_u32(
+			info->attrs[NL80211_ATTR_TXQ_LIMIT]);
+		changed |= WIPHY_PARAM_TXQ_LIMIT;
+	}
+
+	if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+		txq_memory_limit = nla_get_u32(
+			info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
+		changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
+	}
+
+	if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+		txq_quantum = nla_get_u32(
+			info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
+		changed |= WIPHY_PARAM_TXQ_QUANTUM;
+	}
+
 	if (changed) {
 		u8 old_retry_short, old_retry_long;
 		u32 old_frag_threshold, old_rts_threshold;
 		u8 old_coverage_class;
+		u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
 
 		if (!rdev->ops->set_wiphy_params)
 			return -EOPNOTSUPP;
@@ -2569,6 +2657,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		old_frag_threshold = rdev->wiphy.frag_threshold;
 		old_rts_threshold = rdev->wiphy.rts_threshold;
 		old_coverage_class = rdev->wiphy.coverage_class;
+		old_txq_limit = rdev->wiphy.txq_limit;
+		old_txq_memory_limit = rdev->wiphy.txq_memory_limit;
+		old_txq_quantum = rdev->wiphy.txq_quantum;
 
 		if (changed & WIPHY_PARAM_RETRY_SHORT)
 			rdev->wiphy.retry_short = retry_short;
@@ -2580,6 +2671,12 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.rts_threshold = rts_threshold;
 		if (changed & WIPHY_PARAM_COVERAGE_CLASS)
 			rdev->wiphy.coverage_class = coverage_class;
+		if (changed & WIPHY_PARAM_TXQ_LIMIT)
+			rdev->wiphy.txq_limit = txq_limit;
+		if (changed & WIPHY_PARAM_TXQ_MEMORY_LIMIT)
+			rdev->wiphy.txq_memory_limit = txq_memory_limit;
+		if (changed & WIPHY_PARAM_TXQ_QUANTUM)
+			rdev->wiphy.txq_quantum = txq_quantum;
 
 		result = rdev_set_wiphy_params(rdev, changed);
 		if (result) {
@@ -2588,6 +2685,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.frag_threshold = old_frag_threshold;
 			rdev->wiphy.rts_threshold = old_rts_threshold;
 			rdev->wiphy.coverage_class = old_coverage_class;
+			rdev->wiphy.txq_limit = old_txq_limit;
+			rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
+			rdev->wiphy.txq_quantum = old_txq_quantum;
 			return result;
 		}
 	}
@@ -2709,6 +2809,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	}
 	wdev_unlock(wdev);
 
+	if (rdev->ops->get_txq_stats) {
+		struct cfg80211_txq_stats txqstats = {};
+		int ret = rdev_get_txq_stats(rdev, wdev, &txqstats);
+
+		if (ret == 0 &&
+		    !nl80211_put_txq_stats(msg, &txqstats,
+					   NL80211_ATTR_TXQ_STATS))
+			goto nla_put_failure;
+	}
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -4582,6 +4692,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 			PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed);
 
 #undef PUT_TIDVAL_U64
+			if ((tidstats->filled &
+			     BIT(NL80211_TID_STATS_TXQ_STATS)) &&
+			    !nl80211_put_txq_stats(msg, &tidstats->txq_stats,
+						   NL80211_TID_STATS_TXQ_STATS))
+				goto nla_put_failure;
+
 			nla_nest_end(msg, tidattr);
 		}
 
@@ -4606,13 +4722,17 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 static int nl80211_dump_station(struct sk_buff *skb,
 				struct netlink_callback *cb)
 {
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct cfg80211_registered_device *rdev;
 	struct wireless_dev *wdev;
 	u8 mac_addr[ETH_ALEN];
 	int sta_idx = cb->args[2];
 	int err;
 
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
+
 	rtnl_lock();
 	err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
 	if (err)
@@ -4629,9 +4749,9 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	}
 
 	while (1) {
-		memset(&sinfo, 0, sizeof(sinfo));
+		memset(sinfo, 0, sizeof(*sinfo));
 		err = rdev_dump_station(rdev, wdev->netdev, sta_idx,
-					mac_addr, &sinfo);
+					mac_addr, sinfo);
 		if (err == -ENOENT)
 			break;
 		if (err)
@@ -4641,7 +4761,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 				NETLINK_CB(cb->skb).portid,
 				cb->nlh->nlmsg_seq, NLM_F_MULTI,
 				rdev, wdev->netdev, mac_addr,
-				&sinfo) < 0)
+				sinfo) < 0)
 			goto out;
 
 		sta_idx++;
@@ -4652,6 +4772,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	err = skb->len;
  out_err:
 	rtnl_unlock();
+	kfree(sinfo);
 
 	return err;
 }
@@ -4660,37 +4781,49 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct sk_buff *msg;
 	u8 *mac_addr = NULL;
 	int err;
 
-	memset(&sinfo, 0, sizeof(sinfo));
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
 
-	if (!info->attrs[NL80211_ATTR_MAC])
-		return -EINVAL;
+	if (!info->attrs[NL80211_ATTR_MAC]) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	if (!rdev->ops->get_station)
-		return -EOPNOTSUPP;
+	if (!rdev->ops->get_station) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
 
-	err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
+	err = rdev_get_station(rdev, dev, mac_addr, sinfo);
 	if (err)
-		return err;
+		goto out;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
+	if (!msg) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
 				 info->snd_portid, info->snd_seq, 0,
-				 rdev, dev, mac_addr, &sinfo) < 0) {
+				 rdev, dev, mac_addr, sinfo) < 0) {
 		nlmsg_free(msg);
-		return -ENOBUFS;
+		err = -ENOBUFS;
+		goto out;
 	}
 
-	return genlmsg_reply(msg, info);
+	err = genlmsg_reply(msg, info);
+out:
+	kfree(sinfo);
+	return err;
 }
 
 int cfg80211_check_station_change(struct wiphy *wiphy,
@@ -9954,18 +10087,26 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 	 */
 	if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss &&
 	    rdev->ops->get_station) {
-		struct station_info sinfo = {};
+		struct station_info *sinfo;
 		u8 *mac_addr;
 
+		sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+		if (!sinfo)
+			return -ENOMEM;
+
 		mac_addr = wdev->current_bss->pub.bssid;
 
-		err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
-		if (err)
+		err = rdev_get_station(rdev, dev, mac_addr, sinfo);
+		if (err) {
+			kfree(sinfo);
 			return err;
+		}
 
-		if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
+		if (sinfo->filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
 			wdev->cqm_config->last_rssi_event_value =
-				(s8) sinfo.rx_beacon_signal_avg;
+				(s8)sinfo->rx_beacon_signal_avg;
+
+		kfree(sinfo);
 	}
 
 	last = wdev->cqm_config->last_rssi_event_value;
@@ -14499,25 +14640,32 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
 	struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	struct sk_buff *msg;
-	struct station_info empty_sinfo = {};
+	struct station_info *empty_sinfo = NULL;
 
-	if (!sinfo)
-		sinfo = &empty_sinfo;
+	if (!sinfo) {
+		empty_sinfo = kzalloc(sizeof(*empty_sinfo), GFP_KERNEL);
+		if (!empty_sinfo)
+			return;
+		sinfo = empty_sinfo;
+	}
 
 	trace_cfg80211_del_sta(dev, mac_addr);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
 	if (!msg)
-		return;
+		goto out;
 
 	if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
 				 rdev, dev, mac_addr, sinfo) < 0) {
 		nlmsg_free(msg);
-		return;
+		goto out;
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				NL80211_MCGRP_MLME, gfp);
+
+out:
+	kfree(empty_sinfo);
 }
 EXPORT_SYMBOL(cfg80211_del_sta_sinfo);
 
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 87479a53411b..364f5d67f05b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -586,6 +586,18 @@ rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
+		   struct wireless_dev *wdev,
+		   struct cfg80211_txq_stats *txqstats)
+{
+	int ret;
+	trace_rdev_get_txq_stats(&rdev->wiphy, wdev);
+	ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
 {
 	trace_rdev_rfkill_poll(&rdev->wiphy);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 55fb279a5196..2b417a2fe63f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3243,6 +3243,20 @@ TRACE_EVENT(rdev_set_multicast_to_unicast,
 		  WIPHY_PR_ARG, NETDEV_PR_ARG,
 		  BOOL_TO_STR(__entry->enabled))
 );
+
+TRACE_EVENT(rdev_get_txq_stats,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+	TP_ARGS(wiphy, wdev),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
+);
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 05186a47878f..9e002df0f8d8 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1254,7 +1254,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
-	struct station_info sinfo = {};
+	struct station_info *sinfo;
 	u8 addr[ETH_ALEN];
 	int err;
 
@@ -1274,16 +1274,23 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 	if (err)
 		return err;
 
-	err = rdev_get_station(rdev, dev, addr, &sinfo);
-	if (err)
-		return err;
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
 
-	if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)))
-		return -EOPNOTSUPP;
+	err = rdev_get_station(rdev, dev, addr, sinfo);
+	if (err)
+		goto out;
 
-	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
+	if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
 
-	return 0;
+	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo->txrate);
+out:
+	kfree(sinfo);
+	return err;
 }
 
 /* Get wireless statistics.  Called by /proc/net/wireless and by SIOCGIWSTATS */
-- 
cgit v1.2.3-55-g7522


From 78958fca7ead2f81b60a6827881c4866d1ed0c52 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 4 May 2018 14:49:51 -0700
Subject: bpf: btf: Introduce BTF ID

This patch gives an ID to each loaded BTF.  The ID is allocated by
the idr like the existing prog-id and map-id.

The bpf_put(map->btf) is moved to __bpf_map_put() so that the
userspace can stop seeing the BTF ID ASAP when the last BTF
refcnt is gone.

It also makes BTF accessible from userspace through the
1. new BPF_BTF_GET_FD_BY_ID command.  It is limited to CAP_SYS_ADMIN
   which is inline with the BPF_BTF_LOAD cmd and the existing
   BPF_[MAP|PROG]_GET_FD_BY_ID cmd.
2. new btf_id (and btf_key_id + btf_value_id) in "struct bpf_map_info"

Once the BTF ID handler is accessible from userspace, freeing a BTF
object has to go through a rcu period.  The BPF_BTF_GET_FD_BY_ID cmd
can then be done under a rcu_read_lock() instead of taking
spin_lock.
[Note: A similar rcu usage can be done to the existing
       bpf_prog_get_fd_by_id() in a follow up patch]

When processing the BPF_BTF_GET_FD_BY_ID cmd,
refcount_inc_not_zero() is needed because the BTF object
could be already in the rcu dead row .  btf_get() is
removed since its usage is currently limited to btf.c
alone.  refcount_inc() is used directly instead.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h      |   2 +
 include/uapi/linux/bpf.h |   5 +++
 kernel/bpf/btf.c         | 108 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c     |  24 ++++++++++-
 4 files changed, 128 insertions(+), 11 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a966dc6d61ee..e076c4697049 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -44,5 +44,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
 					u32 *ret_size);
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
+int btf_get_fd_by_id(u32 id);
+u32 btf_id(const struct btf *btf);
 
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 93d5a4eeec2a..6106f23a9a8a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -96,6 +96,7 @@ enum bpf_cmd {
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
+	BPF_BTF_GET_FD_BY_ID,
 };
 
 enum bpf_map_type {
@@ -344,6 +345,7 @@ union bpf_attr {
 			__u32		start_id;
 			__u32		prog_id;
 			__u32		map_id;
+			__u32		btf_id;
 		};
 		__u32		next_id;
 		__u32		open_flags;
@@ -2130,6 +2132,9 @@ struct bpf_map_info {
 	__u32 ifindex;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 btf_id;
+	__u32 btf_key_id;
+	__u32 btf_value_id;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index fa0dce0452e7..40950b6bf395 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -11,6 +11,7 @@
 #include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
+#include <linux/idr.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 
@@ -179,6 +180,9 @@
 	     i < btf_type_vlen(struct_type);				\
 	     i++, member++)
 
+static DEFINE_IDR(btf_idr);
+static DEFINE_SPINLOCK(btf_idr_lock);
+
 struct btf {
 	union {
 		struct btf_header *hdr;
@@ -193,6 +197,8 @@ struct btf {
 	u32 types_size;
 	u32 data_size;
 	refcount_t refcnt;
+	u32 id;
+	struct rcu_head rcu;
 };
 
 enum verifier_phase {
@@ -598,6 +604,42 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 	return 0;
 }
 
+static int btf_alloc_id(struct btf *btf)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_bh(&btf_idr_lock);
+	id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		btf->id = id;
+	spin_unlock_bh(&btf_idr_lock);
+	idr_preload_end();
+
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+	unsigned long flags;
+
+	/*
+	 * In map-in-map, calling map_delete_elem() on outer
+	 * map will call bpf_map_put on the inner map.
+	 * It will then eventually call btf_free_id()
+	 * on the inner map.  Some of the map_delete_elem()
+	 * implementation may have irq disabled, so
+	 * we need to use the _irqsave() version instead
+	 * of the _bh() version.
+	 */
+	spin_lock_irqsave(&btf_idr_lock, flags);
+	idr_remove(&btf_idr, btf->id);
+	spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
 static void btf_free(struct btf *btf)
 {
 	kvfree(btf->types);
@@ -607,15 +649,19 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
-static void btf_get(struct btf *btf)
+static void btf_free_rcu(struct rcu_head *rcu)
 {
-	refcount_inc(&btf->refcnt);
+	struct btf *btf = container_of(rcu, struct btf, rcu);
+
+	btf_free(btf);
 }
 
 void btf_put(struct btf *btf)
 {
-	if (btf && refcount_dec_and_test(&btf->refcnt))
-		btf_free(btf);
+	if (btf && refcount_dec_and_test(&btf->refcnt)) {
+		btf_free_id(btf);
+		call_rcu(&btf->rcu, btf_free_rcu);
+	}
 }
 
 static int env_resolve_init(struct btf_verifier_env *env)
@@ -2006,10 +2052,15 @@ const struct file_operations btf_fops = {
 	.release	= btf_release,
 };
 
+static int __btf_new_fd(struct btf *btf)
+{
+	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
 int btf_new_fd(const union bpf_attr *attr)
 {
 	struct btf *btf;
-	int fd;
+	int ret;
 
 	btf = btf_parse(u64_to_user_ptr(attr->btf),
 			attr->btf_size, attr->btf_log_level,
@@ -2018,12 +2069,23 @@ int btf_new_fd(const union bpf_attr *attr)
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
-	fd = anon_inode_getfd("btf", &btf_fops, btf,
-			      O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
+	ret = btf_alloc_id(btf);
+	if (ret) {
+		btf_free(btf);
+		return ret;
+	}
+
+	/*
+	 * The BTF ID is published to the userspace.
+	 * All BTF free must go through call_rcu() from
+	 * now on (i.e. free by calling btf_put()).
+	 */
+
+	ret = __btf_new_fd(btf);
+	if (ret < 0)
 		btf_put(btf);
 
-	return fd;
+	return ret;
 }
 
 struct btf *btf_get_by_fd(int fd)
@@ -2042,7 +2104,7 @@ struct btf *btf_get_by_fd(int fd)
 	}
 
 	btf = f.file->private_data;
-	btf_get(btf);
+	refcount_inc(&btf->refcnt);
 	fdput(f);
 
 	return btf;
@@ -2062,3 +2124,29 @@ int btf_get_info_by_fd(const struct btf *btf,
 
 	return 0;
 }
+
+int btf_get_fd_by_id(u32 id)
+{
+	struct btf *btf;
+	int fd;
+
+	rcu_read_lock();
+	btf = idr_find(&btf_idr, id);
+	if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+		btf = ERR_PTR(-ENOENT);
+	rcu_read_unlock();
+
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = __btf_new_fd(btf);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+u32 btf_id(const struct btf *btf)
+{
+	return btf->id;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9b87198deea2..31c4092da277 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -252,7 +252,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_uncharge_memlock(map);
 	security_bpf_map_free(map);
-	btf_put(map->btf);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -273,6 +272,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 	if (atomic_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map, do_idr_lock);
+		btf_put(map->btf);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
@@ -2002,6 +2002,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (map->btf) {
+		info.btf_id = btf_id(map->btf);
+		info.btf_key_id = map->btf_key_id;
+		info.btf_value_id = map->btf_value_id;
+	}
+
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_info_fill(&info, map);
 		if (err)
@@ -2059,6 +2065,19 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	return btf_new_fd(attr);
 }
 
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_get_fd_by_id(attr->btf_id);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2142,6 +2161,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_LOAD:
 		err = bpf_btf_load(&attr);
 		break;
+	case BPF_BTF_GET_FD_BY_ID:
+		err = bpf_btf_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3-55-g7522


From 62dab84c81a487d946a5fc37c6df541dd95cca38 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Fri, 4 May 2018 14:49:52 -0700
Subject: bpf: btf: Add struct bpf_btf_info

During BPF_OBJ_GET_INFO_BY_FD on a btf_fd, the current bpf_attr's
info.info is directly filled with the BTF binary data.  It is
not extensible.  In this case, we want to add BTF ID.

This patch adds "struct bpf_btf_info" which has the BTF ID as
one of its member.  The BTF binary data itself is exposed through
the "btf" and "btf_size" members.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  6 ++++++
 kernel/bpf/btf.c         | 26 +++++++++++++++++++++-----
 kernel/bpf/syscall.c     | 17 ++++++++++++++++-
 3 files changed, 43 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6106f23a9a8a..d615c777b573 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2137,6 +2137,12 @@ struct bpf_map_info {
 	__u32 btf_value_id;
 } __attribute__((aligned(8)));
 
+struct bpf_btf_info {
+	__aligned_u64 btf;
+	__u32 btf_size;
+	__u32 id;
+} __attribute__((aligned(8)));
+
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
  * by user and intended to be used by socket (e.g. to bind to, depends on
  * attach attach type).
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 40950b6bf395..ded10ab47b8a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2114,12 +2114,28 @@ int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
 		       union bpf_attr __user *uattr)
 {
-	void __user *udata = u64_to_user_ptr(attr->info.info);
-	u32 copy_len = min_t(u32, btf->data_size,
-			     attr->info.info_len);
+	struct bpf_btf_info __user *uinfo;
+	struct bpf_btf_info info = {};
+	u32 info_copy, btf_copy;
+	void __user *ubtf;
+	u32 uinfo_len;
 
-	if (copy_to_user(udata, btf->data, copy_len) ||
-	    put_user(btf->data_size, &uattr->info.info_len))
+	uinfo = u64_to_user_ptr(attr->info.info);
+	uinfo_len = attr->info.info_len;
+
+	info_copy = min_t(u32, uinfo_len, sizeof(info));
+	if (copy_from_user(&info, uinfo, info_copy))
+		return -EFAULT;
+
+	info.id = btf->id;
+	ubtf = u64_to_user_ptr(info.btf);
+	btf_copy = min_t(u32, btf->data_size, info.btf_size);
+	if (copy_to_user(ubtf, btf->data, btf_copy))
+		return -EFAULT;
+	info.btf_size = btf->data_size;
+
+	if (copy_to_user(uinfo, &info, info_copy) ||
+	    put_user(info_copy, &uattr->info.info_len))
 		return -EFAULT;
 
 	return 0;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 31c4092da277..e2aeb5e89f44 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2021,6 +2021,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	return 0;
 }
 
+static int bpf_btf_get_info_by_fd(struct btf *btf,
+				  const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	u32 info_len = attr->info.info_len;
+	int err;
+
+	err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+	if (err)
+		return err;
+
+	return btf_get_info_by_fd(btf, attr, uattr);
+}
+
 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
 
 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -2044,7 +2059,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
 					     uattr);
 	else if (f.file->f_op == &btf_fops)
-		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
+		err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
 
-- 
cgit v1.2.3-55-g7522


From 87f5fc7e48dd3175b30dd03b41564e1a8e136323 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 9 May 2018 20:34:26 -0700
Subject: bpf: Provide helper to do forwarding lookups in kernel FIB table

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection to allow compare
against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720

These number are single CPU core forwarding on a Broadwell
E5-1650 v4 @ 3.60GHz.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  81 +++++++++++++-
 net/core/filter.c        | 267 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 347 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d615c777b573..02e4112510f8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1828,6 +1828,33 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (ie., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst,
+ *		ipv6_dst or mpls_out based on family, smac is set to mac
+ *		address of egress device, dmac is set to nexthop mac address,
+ *		rt_metric is set to metric from route.
+ *
+ *             *plen* argument is the size of the passed in struct.
+ *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *
+ *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
+ *             full lookup using FIB rules
+ *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
+ *             perspective (default is ingress)
+ *
+ *             *ctx* is either **struct xdp_md** for XDP programs or
+ *             **struct sk_buff** tc cls_act programs.
+ *
+ *     Return
+ *             Egress device index on success, 0 if packet needs to continue
+ *             up the stack for further processing or a negative error in case
+ *             of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1898,7 +1925,8 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(fib_lookup),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2321,4 +2349,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		__u32		ipv6_src[4];  /* in6_addr; network order */
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		__u32		ipv6_dst[4];  /* in6_addr; network order */
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[6];     /* ETH_ALEN */
+	__u8	dmac[6];     /* ETH_ALEN */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 0baa715e4699..ca60d2872da4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -60,6 +60,10 @@
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,265 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	params->h_vlan_TCI = 0;
+	params->h_vlan_proto = 0;
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(dst) || rt6_need_strict(src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	idev = __in6_dev_get_safely(dev);
+	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = *dst;
+	fl6.saddr = *src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		*dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+	}
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+	.func		= bpf_xdp_fib_lookup,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+	.func		= bpf_skb_fib_lookup,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4181,6 +4444,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4206,6 +4471,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_xdp_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-55-g7522


From 2724273e8fd00b512596a77ee063f49b25f36507 Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy
Date: Wed, 2 May 2018 15:17:17 +0530
Subject: vmcore: add API to collect hardware dump in second kernel

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /proc/vmcore are as follows:

1. During probe (before hardware is initialized), device drivers
register to the vmcore module (via vmcore_add_device_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. vmcore module allocates the buffer with requested size. It adds
an Elf note and invokes the device driver's registered callback
function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to vmcore module.

Ensure that the device dump buffer size is always aligned to page size
so that it can be mmaped.

Also, rename alloc_elfnotes_buf() to vmcore_alloc_buf() to make it more
generic and reserve NT_VMCOREDD note type to indicate vmcore device
dump.

Suggested-by: Eric Biederman <ebiederm@xmission.com>.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/Kconfig             |  15 +++++
 fs/proc/vmcore.c            | 135 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/crash_dump.h  |  18 ++++++
 include/linux/kcore.h       |   6 ++
 include/uapi/linux/elf.h    |   1 +
 include/uapi/linux/vmcore.h |  18 ++++++
 6 files changed, 184 insertions(+), 9 deletions(-)
 create mode 100644 include/uapi/linux/vmcore.h

(limited to 'include/uapi')

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..0eaeb41453f5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -43,6 +43,21 @@ config PROC_VMCORE
         help
         Exports the dump image of crashed kernel in ELF format.
 
+config PROC_VMCORE_DEVICE_DUMP
+	bool "Device Hardware/Firmware Log Collection"
+	depends on PROC_VMCORE
+	default n
+	help
+	  After kernel panic, device drivers can collect the device
+	  specific snapshot of their hardware or firmware before the
+	  underlying devices are initialized in crash recovery kernel.
+	  Note that the device driver must be present in the crash
+	  recovery kernel's initramfs to collect its underlying device
+	  snapshot.
+
+	  If you say Y here, the collected device dumps will be added
+	  as ELF notes to /proc/vmcore.
+
 config PROC_SYSCTL
 	bool "Sysctl support (/proc/sys)" if EXPERT
 	depends on PROC_FS
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a45f0af22a60..abb3dba0fa49 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/crash_dump.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
@@ -44,6 +45,12 @@ static u64 vmcore_size;
 
 static struct proc_dir_entry *proc_vmcore;
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+static DEFINE_MUTEX(vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
 /*
  * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
  * The called function has to take care of module refcounting.
@@ -302,10 +309,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
 };
 
 /**
- * alloc_elfnotes_buf - allocate buffer for ELF note segment in
- *                      vmalloc memory
- *
- * @notes_sz: size of buffer
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @sizez: size of buffer
  *
  * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
  * the buffer to user-space by means of remap_vmalloc_range().
@@ -313,12 +318,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
  * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
  * disabled and there's no need to allow users to mmap the buffer.
  */
-static inline char *alloc_elfnotes_buf(size_t notes_sz)
+static inline char *vmcore_alloc_buf(size_t size)
 {
 #ifdef CONFIG_MMU
-	return vmalloc_user(notes_sz);
+	return vmalloc_user(size);
 #else
-	return vzalloc(notes_sz);
+	return vzalloc(size);
 #endif
 }
 
@@ -665,7 +670,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	*notes_buf = vmcore_alloc_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
@@ -851,7 +856,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	*notes_buf = vmcore_alloc_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
@@ -1145,6 +1150,115 @@ static int __init parse_crash_elf_headers(void)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_write_header - Write vmcore device dump header at the
+ * beginning of the dump's buffer.
+ * @buf: Output buffer where the note is written
+ * @data: Dump info
+ * @size: Size of the dump
+ *
+ * Fills beginning of the dump's buffer with vmcore device dump header.
+ */
+static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
+				  u32 size)
+{
+	struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
+
+	vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
+	vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
+	vdd_hdr->n_type = NT_VMCOREDD;
+
+	strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
+		sizeof(vdd_hdr->name));
+	memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info.
+ *
+ * Allocate a buffer and invoke the calling driver's dump collect routine.
+ * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * dump and add the dump to global list.
+ */
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	struct vmcoredd_node *dump;
+	void *buf = NULL;
+	size_t data_size;
+	int ret;
+
+	if (!data || !strlen(data->dump_name) ||
+	    !data->vmcoredd_callback || !data->size)
+		return -EINVAL;
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Keep size of the buffer page aligned so that it can be mmaped */
+	data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
+			    PAGE_SIZE);
+
+	/* Allocate buffer for driver's to write their dumps */
+	buf = vmcore_alloc_buf(data_size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	vmcoredd_write_header(buf, data, data_size -
+			      sizeof(struct vmcoredd_header));
+
+	/* Invoke the driver's dump collection routing */
+	ret = data->vmcoredd_callback(data, buf +
+				      sizeof(struct vmcoredd_header));
+	if (ret)
+		goto out_err;
+
+	dump->buf = buf;
+	dump->size = data_size;
+
+	/* Add the dump to driver sysfs list */
+	mutex_lock(&vmcoredd_mutex);
+	list_add_tail(&dump->list, &vmcoredd_list);
+	mutex_unlock(&vmcoredd_mutex);
+
+	return 0;
+
+out_err:
+	if (buf)
+		vfree(buf);
+
+	if (dump)
+		vfree(dump);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Free all dumps in vmcore device dump list */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+	mutex_lock(&vmcoredd_mutex);
+	while (!list_empty(&vmcoredd_list)) {
+		struct vmcoredd_node *dump;
+
+		dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+					list);
+		list_del(&dump->list);
+		vfree(dump->buf);
+		vfree(dump);
+	}
+	mutex_unlock(&vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
 /* Init function for vmcore module. */
 static int __init vmcore_init(void)
 {
@@ -1192,4 +1306,7 @@ void vmcore_cleanup(void)
 		kfree(m);
 	}
 	free_elfcorebuf();
+
+	/* clear vmcore device dump list */
+	vmcore_free_device_dumps();
 }
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index f7ac2aa93269..3e4ba9d753c8 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -5,6 +5,7 @@
 #include <linux/kexec.h>
 #include <linux/proc_fs.h>
 #include <linux/elf.h>
+#include <uapi/linux/vmcore.h>
 
 #include <asm/pgtable.h> /* for pgprot_t */
 
@@ -93,4 +94,21 @@ static inline bool is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
 
 extern unsigned long saved_max_pfn;
+
+/* Device Dump information to be filled by drivers */
+struct vmcoredd_data {
+	char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */
+	unsigned int size;                       /* Size of the dump */
+	/* Driver's registered callback to be invoked to collect dump */
+	int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf);
+};
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+int vmcore_add_device_dump(struct vmcoredd_data *data);
+#else
+static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 80db19d3a505..8de55e4b5ee9 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -28,6 +28,12 @@ struct vmcore {
 	loff_t offset;
 };
 
+struct vmcoredd_node {
+	struct list_head list;	/* List of dumps */
+	void *buf;		/* Buffer containing device's dump */
+	unsigned int size;	/* Size of the buffer */
+};
+
 #ifdef CONFIG_PROC_KCORE
 extern void kclist_add(struct kcore_list *, void *, size_t, int type);
 #else
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index e2535d6dcec7..4e12c423b9fe 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -421,6 +421,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
+#define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h
new file mode 100644
index 000000000000..022619668e0e
--- /dev/null
+++ b/include/uapi/linux/vmcore.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_VMCORE_H
+#define _UAPI_VMCORE_H
+
+#include <linux/types.h>
+
+#define VMCOREDD_NOTE_NAME "LINUX"
+#define VMCOREDD_MAX_NAME_BYTES 44
+
+struct vmcoredd_header {
+	__u32 n_namesz; /* Name size */
+	__u32 n_descsz; /* Content size */
+	__u32 n_type;   /* NT_VMCOREDD */
+	__u8 name[8];   /* LINUX\0\0\0 */
+	__u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */
+};
+
+#endif /* _UAPI_VMCORE_H */
-- 
cgit v1.2.3-55-g7522


From 81c7288b170a59d01427fa33d6591da1dbf59277 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner
Date: Sun, 13 May 2018 17:44:27 -0300
Subject: sched: cls: enable verbose logging

Currently, when the rule is not to be exclusively executed by the
hardware, extack is not passed along and offloading failures don't
get logged. The idea was that hardware failures are okay because the
rule will get executed in software then and this way it doesn't confuse
unware users.

But this is not helpful in case one needs to understand why a certain
rule failed to get offloaded. Considering it may have been a temporary
failure, like resources exceeded or so, reproducing it later and knowing
that it is triggering the same reason may be challenging.

The ultimate goal is to improve Open vSwitch debuggability when using
flower offloading.

This patch adds a new flag to enable verbose logging. With the flag set,
extack will be passed to the driver, which will be able to log the
error. As the operation itself probably won't fail (not because of this,
at least), current iproute will already log it as a Warning.

The flag is generic, so it can be reused later. No need to restrict it
just for HW offloading. The command line will follow the syntax that
tc-ebpf already uses, tc ... [ verbose ] ... , and extend its meaning.

For example:
# ./tc qdisc add dev p7p1 ingress
# ./tc filter add dev p7p1 parent ffff: protocol ip prio 1 \
	flower verbose \
	src_mac ed:13:db:00:00:00 dst_mac 01:80:c2:00:00:d0 \
	src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop
Warning: TC offload is disabled on net device.
# echo $?
0
# ./tc filter add dev p7p1 parent ffff: protocol ip prio 1 \
	flower \
	src_mac ff:13:db:00:00:00 dst_mac 01:80:c2:00:00:d0 \
	src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop
# echo $?
0

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        | 6 ++++--
 include/uapi/linux/pkt_cls.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e828d31be5da..0005f0b40fe9 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -683,9 +683,11 @@ static inline bool tc_skip_sw(u32 flags)
 /* SKIP_HW and SKIP_SW are mutually exclusive flags. */
 static inline bool tc_flags_valid(u32 flags)
 {
-	if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))
+	if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW |
+		      TCA_CLS_FLAGS_VERBOSE))
 		return false;
 
+	flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW;
 	if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)))
 		return false;
 
@@ -705,7 +707,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
-	if (tc_skip_sw(flags))
+	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
 		cls_common->extack = extack;
 }
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index be05e66c167b..84e4c1d0f874 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -129,6 +129,7 @@ enum {
 #define TCA_CLS_FLAGS_SKIP_SW	(1 << 1) /* don't use filter in SW */
 #define TCA_CLS_FLAGS_IN_HW	(1 << 2) /* filter is offloaded to HW */
 #define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */
+#define TCA_CLS_FLAGS_VERBOSE	(1 << 4) /* verbose logging */
 
 /* U32 filters */
 
-- 
cgit v1.2.3-55-g7522


From 81110384441a59cff47430f20f049e69b98c17f4 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Mon, 14 May 2018 10:00:17 -0700
Subject: bpf: sockmap, add hash map support

Sockmap is currently backed by an array and enforces keys to be
four bytes. This works well for many use cases and was originally
modeled after devmap which also uses four bytes keys. However,
this has become limiting in larger use cases where a hash would
be more appropriate. For example users may want to use the 5-tuple
of the socket as the lookup key.

To support this add hash support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h       |   8 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  54 ++++-
 kernel/bpf/core.c         |   1 +
 kernel/bpf/sockmap.c      | 494 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c     |  14 +-
 net/core/filter.c         |  58 ++++++
 7 files changed, 611 insertions(+), 19 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a38e474bf7ee..ed0122b45b63 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
 static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
@@ -675,6 +676,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
 	return NULL;
 }
 
+static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
+						    void *key)
+{
+	return NULL;
+}
+
 static inline int sock_map_prog(struct bpf_map *map,
 				struct bpf_prog *prog,
 				u32 type)
@@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
+extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d7df1b323082..b67f8793de0d 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 02e4112510f8..d94d333a8225 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
 };
 
 enum bpf_prog_type {
@@ -1828,7 +1829,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- *
  * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
  *	Description
  *		Do FIB lookup in kernel tables using parameters in *params*.
@@ -1855,6 +1855,53 @@ union bpf_attr {
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
  *             of failure.
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1926,7 +1973,10 @@ union bpf_attr {
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
 	FN(skb_load_bytes_relative),	\
-	FN(fib_lookup),
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d0d7d9462368..2194c6a9df42 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1707,6 +1707,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index beab9ec9b023..56879c9fd3a4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -60,6 +60,28 @@ struct bpf_stab {
 	struct bpf_sock_progs progs;
 };
 
+struct bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct bucket *buckets;
+	atomic_t count;
+	u32 n_buckets;
+	u32 elem_size;
+	struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+	struct rcu_head rcu;
+	struct hlist_node hash_node;
+	u32 hash;
+	struct sock *sk;
+	char key[0];
+};
+
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
@@ -67,6 +89,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
 	struct list_head list;
 	struct sock **entry;
+	struct htab_elem *hash_link;
+	struct bpf_htab *htab;
 };
 
 struct smap_psock {
@@ -195,6 +219,12 @@ out:
 	rcu_read_unlock();
 }
 
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+	atomic_dec(&htab->count);
+	kfree_rcu(l, rcu);
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
@@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	}
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		osk = cmpxchg(e->entry, sk, NULL);
-		if (osk == sk) {
-			list_del(&e->list);
-			smap_release_sock(psock, sk);
+		if (e->entry) {
+			osk = cmpxchg(e->entry, sk, NULL);
+			if (osk == sk) {
+				list_del(&e->list);
+				smap_release_sock(psock, sk);
+			}
+		} else {
+			hlist_del_rcu(&e->hash_link->hash_node);
+			smap_release_sock(psock, e->hash_link->sk);
+			free_htab_elem(e->htab, e->hash_link);
 		}
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -1527,12 +1563,14 @@ free_stab:
 	return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+			     struct sock **entry,
+			     struct htab_elem *hash_link)
 {
 	struct smap_psock_map_entry *e, *tmp;
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry) {
+		if (e->entry == entry || e->hash_link == hash_link) {
 			list_del(&e->list);
 			break;
 		}
@@ -1570,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_list_remove(psock, &stab->sock_map[i], NULL);
 			smap_release_sock(psock, sock);
 		}
 		write_unlock_bh(&sock->sk_callback_lock);
@@ -1629,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (psock->bpf_parse)
 		smap_stop_sock(psock, sock);
-	smap_list_remove(psock, &stab->sock_map[k]);
+	smap_list_remove(psock, &stab->sock_map[k], NULL);
 	smap_release_sock(psock, sock);
 out:
 	write_unlock_bh(&sock->sk_callback_lock);
@@ -1746,10 +1784,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		new = true;
 	}
 
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e) {
-		err = -ENOMEM;
-		goto out_progs;
+	if (map_link) {
+		e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+		if (!e) {
+			err = -ENOMEM;
+			goto out_progs;
+		}
 	}
 
 	/* 3. At this point we have a reference to a valid psock that is
@@ -1783,6 +1823,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
+	kfree(e);
 	smap_release_sock(psock, sock);
 out_progs:
 	if (verdict)
@@ -1829,7 +1870,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
 		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_list_remove(opsock, &stab->sock_map[i], NULL);
 		smap_release_sock(opsock, osock);
 		write_unlock_bh(&osock->sk_callback_lock);
 	}
@@ -1846,6 +1887,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 
 		progs = &stab->progs;
+	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
 	} else {
 		return -EINVAL;
 	}
@@ -1906,11 +1951,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 
 static void sock_map_release(struct bpf_map *map)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	progs = &stab->progs;
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
+	}
+
 	orig = xchg(&progs->bpf_parse, NULL);
 	if (orig)
 		bpf_prog_put(orig);
@@ -1923,6 +1976,390 @@ static void sock_map_release(struct bpf_map *map)
 		bpf_prog_put(orig);
 }
 
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int i, err;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->value_size != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&htab->map, attr);
+
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8);
+	err = -EINVAL;
+	if (htab->n_buckets == 0 ||
+	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
+		goto free_htab;
+
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_htab;
+
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = bpf_map_area_alloc(
+				htab->n_buckets * sizeof(struct bucket),
+				htab->map.numa_node);
+	if (!htab->buckets)
+		goto free_htab;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
+
+	return &htab->map;
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware we can still get a socket state event updates,
+	 * and data ready callabacks that reference the psock from sk_user_data
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expire to ensure psock is really safe to remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			struct sock *sock = l->sk;
+			struct smap_psock *psock;
+
+			hlist_del_rcu(&l->hash_node);
+			write_lock_bh(&sock->sk_callback_lock);
+			psock = smap_psock_sk(sock);
+			/* This check handles a racing sock event that can get
+			 * the sk_callback_lock before this case but after xchg
+			 * causing the refcnt to hit zero and sock user data
+			 * (psock) to be null and queued for garbage collection.
+			 */
+			if (likely(psock)) {
+				smap_list_remove(psock, NULL, l);
+				smap_release_sock(psock, sock);
+			}
+			write_unlock_bh(&sock->sk_callback_lock);
+			kfree(l);
+		}
+	}
+	rcu_read_unlock();
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+					      void *key, u32 key_size, u32 hash,
+					      struct sock *sk,
+					      struct htab_elem *old_elem)
+{
+	struct htab_elem *l_new;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+		if (!old_elem) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+	}
+	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+			     htab->map.numa_node);
+	if (!l_new)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(l_new->key, key, key_size);
+	l_new->sk = sk;
+	l_new->hash = hash;
+	return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+				  void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l, *next_l;
+	struct hlist_head *h;
+	u32 hash, key_size;
+	int i = 0;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+	if (!key)
+		goto find_first_elem;
+	hash = htab_map_hash(key, key_size);
+	h = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(h, hash, key, key_size);
+	if (!l)
+		goto find_first_elem;
+	next_l = hlist_entry_safe(
+		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+		     struct htab_elem, hash_node);
+	if (next_l) {
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		h = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(
+				rcu_dereference_raw(hlist_first_rcu(h)),
+				struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* iterated over all buckets and all elements */
+	return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				     struct bpf_map *map,
+				     void *key, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_sock_progs *progs = &htab->progs;
+	struct htab_elem *l_new = NULL, *l_old;
+	struct smap_psock_map_entry *e = NULL;
+	struct hlist_head *head;
+	struct smap_psock *psock;
+	u32 key_size, hash;
+	struct sock *sock;
+	struct bucket *b;
+	int err;
+
+	sock = skops->sk;
+
+	if (sock->sk_type != SOCK_STREAM ||
+	    sock->sk_protocol != IPPROTO_TCP)
+		return -EOPNOTSUPP;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e)
+		return -ENOMEM;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+	if (err)
+		goto err;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_bh(&b->lock);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+	if (l_old && map_flags == BPF_NOEXIST) {
+		err = -EEXIST;
+		goto bucket_err;
+	}
+	if (!l_old && map_flags == BPF_EXIST) {
+		err = -ENOENT;
+		goto bucket_err;
+	}
+
+	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+	if (IS_ERR(l_new)) {
+		err = PTR_ERR(l_new);
+		goto bucket_err;
+	}
+
+	psock = smap_psock_sk(sock);
+	if (unlikely(!psock)) {
+		err = -EINVAL;
+		goto bucket_err;
+	}
+
+	e->hash_link = l_new;
+	e->htab = container_of(map, struct bpf_htab, map);
+	list_add_tail(&e->list, &psock->maps);
+
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		psock = smap_psock_sk(l_old->sk);
+
+		hlist_del_rcu(&l_old->hash_node);
+		smap_list_remove(psock, NULL, l_old);
+		smap_release_sock(psock, l_old->sk);
+		free_htab_elem(htab, l_old);
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return 0;
+bucket_err:
+	raw_spin_unlock_bh(&b->lock);
+err:
+	kfree(e);
+	psock = smap_psock_sk(sock);
+	if (psock)
+		smap_release_sock(psock, sock);
+	return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_sock_ops_kern skops;
+	u32 fd = *(u32 *)value;
+	struct socket *socket;
+	int err;
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	skops.sk = socket->sk;
+	if (!skops.sk) {
+		fput(socket->file);
+		return -EINVAL;
+	}
+
+	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+	fput(socket->file);
+	return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct bucket *b;
+	struct htab_elem *l;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (l) {
+		struct sock *sock = l->sk;
+		struct smap_psock *psock;
+
+		hlist_del_rcu(&l->hash_node);
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, NULL, l);
+			smap_release_sock(psock, sock);
+		}
+		write_unlock_bh(&sock->sk_callback_lock);
+		free_htab_elem(htab, l);
+		ret = 0;
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return ret;
+}
+
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 key_size, hash;
+	struct bucket *b;
+	struct sock *sk;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	sk = l ? l->sk : NULL;
+	raw_spin_unlock_bh(&b->lock);
+	return sk;
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -1933,6 +2370,15 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_release_uref = sock_map_release,
 };
 
+const struct bpf_map_ops sock_hash_ops = {
+	.map_alloc = sock_hash_alloc,
+	.map_free = sock_hash_free,
+	.map_lookup_elem = sock_map_lookup,
+	.map_get_next_key = sock_hash_get_next_key,
+	.map_update_elem = sock_hash_update_elem,
+	.map_delete_elem = sock_hash_delete_elem,
+};
+
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
 	   struct bpf_map *, map, void *, key, u64, flags)
 {
@@ -1950,3 +2396,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
 	.arg3_type	= ARG_PTR_TO_MAP_KEY,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+	.func		= bpf_sock_hash_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d92d9c37affd..a9e4b1372da6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_msg_redirect_map)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_SOCKHASH:
+		if (func_id != BPF_FUNC_sk_redirect_hash &&
+		    func_id != BPF_FUNC_sock_hash_update &&
+		    func_id != BPF_FUNC_map_delete_elem &&
+		    func_id != BPF_FUNC_msg_redirect_hash)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_sk_redirect_map:
 	case BPF_FUNC_msg_redirect_map:
+	case BPF_FUNC_sock_map_update:
 		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
 			goto error;
 		break;
-	case BPF_FUNC_sock_map_update:
-		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+	case BPF_FUNC_sk_redirect_hash:
+	case BPF_FUNC_msg_redirect_hash:
+	case BPF_FUNC_sock_hash_update:
+		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
 			goto error;
 		break;
 	default:
diff --git a/net/core/filter.c b/net/core/filter.c
index 61a3ed6bac25..6d0d1560bd70 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2074,6 +2074,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+	.func           = bpf_sk_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -2108,6 +2135,31 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	msg->flags = flags;
+	msg->sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+	.func           = bpf_msg_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -4502,6 +4554,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
+	case BPF_FUNC_sock_hash_update:
+		return &bpf_sock_hash_update_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4513,6 +4567,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
 		return &bpf_msg_redirect_map_proto;
+	case BPF_FUNC_msg_redirect_hash:
+		return &bpf_msg_redirect_hash_proto;
 	case BPF_FUNC_msg_apply_bytes:
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
@@ -4544,6 +4600,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_sk_redirect_map:
 		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_sk_redirect_hash:
+		return &bpf_sk_redirect_hash_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-55-g7522


From 01cd267bff52619a53fa05c930ea5ed53493d21a Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 8 May 2018 10:05:38 +0200
Subject: netfilter: fix fallout from xt/nf osf separation

Stephen Rothwell says:
  today's linux-next build (x86_64 allmodconfig) produced this warning:
  ./usr/include/linux/netfilter/nf_osf.h:25: found __[us]{8,16,32,64} type without #include <linux/types.h>

Fix that up and also move kernel-private struct out of uapi (it was not
exposed in any released kernel version).

tested via allmodconfig build + make headers_check.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h      | 6 ++++++
 include/uapi/linux/netfilter/nf_osf.h | 8 ++------
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
index a2b39602e87d..0e114c492fb8 100644
--- a/include/linux/netfilter/nf_osf.h
+++ b/include/linux/netfilter/nf_osf.h
@@ -21,6 +21,12 @@ enum osf_fmatch_states {
 	FMATCH_OPT_WRONG,
 };
 
+struct nf_osf_finger {
+	struct rcu_head			rcu_head;
+	struct list_head		finger_entry;
+	struct nf_osf_user_finger	finger;
+};
+
 bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 		  int hooknum, struct net_device *in, struct net_device *out,
 		  const struct nf_osf_info *info, struct net *net,
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index 45376eae31ef..8f2f2f403183 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -1,6 +1,8 @@
 #ifndef _NF_OSF_H
 #define _NF_OSF_H
 
+#include <linux/types.h>
+
 #define MAXGENRELEN	32
 
 #define NF_OSF_GENRE	(1 << 0)
@@ -57,12 +59,6 @@ struct nf_osf_user_finger {
 	struct nf_osf_opt	opt[MAX_IPOPTLEN];
 };
 
-struct nf_osf_finger {
-	struct rcu_head			rcu_head;
-	struct list_head		finger_entry;
-	struct nf_osf_user_finger	finger;
-};
-
 struct nf_osf_nlmsg {
 	struct nf_osf_user_finger	f;
 	struct iphdr			ip;
-- 
cgit v1.2.3-55-g7522


From b9ccc07e3f31ad8073697982bac014fbceef7ecb Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana
Date: Fri, 11 May 2018 00:14:48 +0200
Subject: netfilter: nft_hash: add map lookups for hashing operations

This patch creates new attributes to accept a map as argument and
then perform the lookup with the generated hash accordingly.

Both current hash functions are supported: Jenkins and Symmetric Hash.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +
 net/netfilter/nft_hash.c                 | 131 ++++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ce031cf72288..9c71f024f9cc 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -856,6 +856,8 @@ enum nft_hash_types {
  * @NFTA_HASH_SEED: seed value (NLA_U32)
  * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32)
  * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types)
+ * @NFTA_HASH_SET_NAME: name of the map to lookup (NLA_STRING)
+ * @NFTA_HASH_SET_ID: id of the map (NLA_U32)
  */
 enum nft_hash_attributes {
 	NFTA_HASH_UNSPEC,
@@ -866,6 +868,8 @@ enum nft_hash_attributes {
 	NFTA_HASH_SEED,
 	NFTA_HASH_OFFSET,
 	NFTA_HASH_TYPE,
+	NFTA_HASH_SET_NAME,
+	NFTA_HASH_SET_ID,
 	__NFTA_HASH_MAX,
 };
 #define NFTA_HASH_MAX	(__NFTA_HASH_MAX - 1)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index e235c17f1b8b..f0fc21f88775 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -25,6 +25,7 @@ struct nft_jhash {
 	u32			modulus;
 	u32			seed;
 	u32			offset;
+	struct nft_set		*map;
 };
 
 static void nft_jhash_eval(const struct nft_expr *expr,
@@ -35,14 +36,39 @@ static void nft_jhash_eval(const struct nft_expr *expr,
 	const void *data = &regs->data[priv->sreg];
 	u32 h;
 
-	h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+	h = reciprocal_scale(jhash(data, priv->len, priv->seed),
+			     priv->modulus);
+
 	regs->data[priv->dreg] = h + priv->offset;
 }
 
+static void nft_jhash_map_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	const void *data = &regs->data[priv->sreg];
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = reciprocal_scale(jhash(data, priv->len, priv->seed),
+					priv->modulus) + priv->offset;
+
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
+}
+
 struct nft_symhash {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	u32			offset;
+	struct nft_set		*map;
 };
 
 static void nft_symhash_eval(const struct nft_expr *expr,
@@ -58,6 +84,28 @@ static void nft_symhash_eval(const struct nft_expr *expr,
 	regs->data[priv->dreg] = h + priv->offset;
 }
 
+static void nft_symhash_map_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nft_symhash *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = reciprocal_scale(__skb_get_hash_symmetric(skb),
+				  priv->modulus) + priv->offset;
+
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
+}
+
 static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
 	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
 	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
@@ -66,6 +114,9 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
 	[NFTA_HASH_SEED]	= { .type = NLA_U32 },
 	[NFTA_HASH_OFFSET]	= { .type = NLA_U32 },
 	[NFTA_HASH_TYPE]	= { .type = NLA_U32 },
+	[NFTA_HASH_SET_NAME]	= { .type = NLA_STRING,
+				    .len = NFT_SET_MAXNAMELEN - 1 },
+	[NFTA_HASH_SET_ID]	= { .type = NLA_U32 },
 };
 
 static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -115,6 +166,23 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_jhash_map_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_jhash_init(ctx, expr, tb);
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_HASH_SET_NAME],
+					  tb[NFTA_HASH_SET_ID], genmask);
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_symhash_init(const struct nft_ctx *ctx,
 			    const struct nft_expr *expr,
 			    const struct nlattr * const tb[])
@@ -141,6 +209,23 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_symhash_map_init(const struct nft_ctx *ctx,
+				const struct nft_expr *expr,
+				const struct nlattr * const tb[])
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_symhash_init(ctx, expr, tb);
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_HASH_SET_NAME],
+					  tb[NFTA_HASH_SET_ID], genmask);
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_jhash_dump(struct sk_buff *skb,
 			  const struct nft_expr *expr)
 {
@@ -168,6 +253,18 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_jhash_map_dump(struct sk_buff *skb,
+			       const struct nft_expr *expr)
+{
+	const struct nft_jhash *priv = nft_expr_priv(expr);
+
+	if (nft_jhash_dump(skb, expr) ||
+	    nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+		return -1;
+
+	return 0;
+}
+
 static int nft_symhash_dump(struct sk_buff *skb,
 			    const struct nft_expr *expr)
 {
@@ -188,6 +285,18 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_symhash_map_dump(struct sk_buff *skb,
+				const struct nft_expr *expr)
+{
+	const struct nft_symhash *priv = nft_expr_priv(expr);
+
+	if (nft_symhash_dump(skb, expr) ||
+	    nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+		return -1;
+
+	return 0;
+}
+
 static struct nft_expr_type nft_hash_type;
 static const struct nft_expr_ops nft_jhash_ops = {
 	.type		= &nft_hash_type,
@@ -197,6 +306,14 @@ static const struct nft_expr_ops nft_jhash_ops = {
 	.dump		= nft_jhash_dump,
 };
 
+static const struct nft_expr_ops nft_jhash_map_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
+	.eval		= nft_jhash_map_eval,
+	.init		= nft_jhash_map_init,
+	.dump		= nft_jhash_map_dump,
+};
+
 static const struct nft_expr_ops nft_symhash_ops = {
 	.type		= &nft_hash_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
@@ -205,6 +322,14 @@ static const struct nft_expr_ops nft_symhash_ops = {
 	.dump		= nft_symhash_dump,
 };
 
+static const struct nft_expr_ops nft_symhash_map_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
+	.eval		= nft_symhash_map_eval,
+	.init		= nft_symhash_map_init,
+	.dump		= nft_symhash_map_dump,
+};
+
 static const struct nft_expr_ops *
 nft_hash_select_ops(const struct nft_ctx *ctx,
 		    const struct nlattr * const tb[])
@@ -217,8 +342,12 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
 	type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
 	switch (type) {
 	case NFT_HASH_SYM:
+		if (tb[NFTA_HASH_SET_NAME])
+			return &nft_symhash_map_ops;
 		return &nft_symhash_ops;
 	case NFT_HASH_JENKINS:
+		if (tb[NFTA_HASH_SET_NAME])
+			return &nft_jhash_map_ops;
 		return &nft_jhash_ops;
 	default:
 		break;
-- 
cgit v1.2.3-55-g7522


From dac09149d992995adbef0f472093fbb6940a8653 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Fri, 18 May 2018 14:00:21 +0200
Subject: xsk: clean up SPDX headers

Clean up SPDX-License-Identifier and removing licensing leftovers.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h      | 13 ++-----------
 include/uapi/linux/if_xdp.h | 13 ++-----------
 kernel/bpf/xskmap.c         |  9 ---------
 net/xdp/xdp_umem.c          |  9 ---------
 net/xdp/xdp_umem.h          | 13 ++-----------
 net/xdp/xdp_umem_props.h    | 13 ++-----------
 net/xdp/xsk.c               |  9 ---------
 net/xdp/xsk_queue.c         |  9 ---------
 net/xdp/xsk_queue.h         | 13 ++-----------
 samples/bpf/xdpsock_user.c  | 12 +-----------
 10 files changed, 11 insertions(+), 102 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 185f4928fbda..7a647c56ec15 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * AF_XDP internal functions
+/* SPDX-License-Identifier: GPL-2.0 */
+/* AF_XDP internal functions
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_XDP_SOCK_H
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 77b88c4efe98..56db977221d2 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -1,17 +1,8 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
- *
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
  * if_xdp: XDP socket user-space interface
  * Copyright(c) 2018 Intel Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
  * Author(s): Björn Töpel <bjorn.topel@intel.com>
  *	      Magnus Karlsson <magnus.karlsson@intel.com>
  */
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index cb3a12137404..b3c557476a8d 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XSKMAP used for AF_XDP sockets
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/bpf.h>
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 2b47a1dd7c6c..df4ea97c433b 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XDP user-space packet buffer
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/init.h>
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 7e0b2fab8522..70fe225baa51 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * XDP user-space packet buffer
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef XDP_UMEM_H_
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
index 77fb5daf29f3..2cf8ec485fd2 100644
--- a/net/xdp/xdp_umem_props.h
+++ b/net/xdp/xdp_umem_props.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * XDP user-space packet buffer
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef XDP_UMEM_PROPS_H_
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 009c5af5bba5..b8d1cb4d78c0 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -5,15 +5,6 @@
  * applications.
  * Copyright(c) 2018 Intel Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
  * Author(s): Björn Töpel <bjorn.topel@intel.com>
  *	      Magnus Karlsson <magnus.karlsson@intel.com>
  */
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index d012e5e23591..9f605d22dad4 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XDP user-space ring structure
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/slab.h>
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 7aa9a535db0e..928d464e57b9 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * XDP user-space ring structure
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space ring structure
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_XSK_QUEUE_H
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 7fe60f6f7d53..60a882a2296c 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -1,15 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright(c) 2017 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
+/* Copyright(c) 2017 - 2018 Intel Corporation. */
 
 #include <assert.h>
 #include <errno.h>
-- 
cgit v1.2.3-55-g7522


From 200d95f4575934e49f872109cce18c5e72383eb8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 17 May 2018 14:47:27 -0700
Subject: tcp: add TCPAckCompressed SNMP counter

This counter tracks number of ACK packets that the host has not sent,
thanks to ACK compression.

Sample output :

$ nstat -n;sleep 1;nstat|egrep "IpInReceives|IpOutRequests|TcpInSegs|TcpOutSegs|TcpExtTCPAckCompressed"
IpInReceives                    123250             0.0
IpOutRequests                   3684               0.0
TcpInSegs                       123251             0.0
TcpOutSegs                      3684               0.0
TcpExtTCPAckCompressed          119252             0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 +
 net/ipv4/proc.c           | 1 +
 net/ipv4/tcp_output.c     | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index d02e859301ff..750d89120335 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -278,6 +278,7 @@ enum
 	LINUX_MIB_TCPMTUPSUCCESS,		/* TCPMTUPSuccess */
 	LINUX_MIB_TCPDELIVERED,			/* TCPDelivered */
 	LINUX_MIB_TCPDELIVEREDCE,		/* TCPDeliveredCE */
+	LINUX_MIB_TCPACKCOMPRESSED,		/* TCPAckCompressed */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 261b71d0ccc5..6c1ff89a60fa 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
 	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
 	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
+	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7ee98aad82b7..437bb7ceba7f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -165,6 +165,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (unlikely(tp->compressed_ack)) {
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+			      tp->compressed_ack);
 		tp->compressed_ack = 0;
 		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
 			__sock_put(sk);
-- 
cgit v1.2.3-55-g7522


From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Thu, 17 May 2018 14:16:58 -0700
Subject: bpf: allow sk_msg programs to read sock fields

Currently sk_msg programs only have access to the raw data. However,
it is often useful when building policies to have the policies specific
to the socket endpoint. This allows using the socket tuple as input
into filters, etc.

This patch adds ctx access to the sock fields.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h   |   1 +
 include/uapi/linux/bpf.h |   8 ++++
 kernel/bpf/sockmap.c     |   1 +
 net/core/filter.c        | 114 +++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 121 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9dbcb9d55921..d358d1815c16 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -517,6 +517,7 @@ struct sk_msg_buff {
 	bool sg_copy[MAX_SKB_FRAGS];
 	__u32 flags;
 	struct sock *sk_redir;
+	struct sock *sk;
 	struct sk_buff *skb;
 	struct list_head list;
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333a8225..97446bbe2ca5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2176,6 +2176,14 @@ enum sk_action {
 struct sk_msg_md {
 	void *data;
 	void *data_end;
+
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
 };
 
 #define BPF_TAG_SIZE	8
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index c682669570dc..3b28955a6383 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -523,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
 	}
 
 	bpf_compute_data_pointers_sg(md);
+	md->sk = sk;
 	rc = (*prog->bpf_func)(md, prog->insnsi);
 	psock->apply_bytes = md->apply_bytes;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6d0d1560bd70..aec5ebafb262 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5148,18 +5148,23 @@ static bool sk_msg_is_valid_access(int off, int size,
 	switch (off) {
 	case offsetof(struct sk_msg_md, data):
 		info->reg_type = PTR_TO_PACKET;
+		if (size != sizeof(__u64))
+			return false;
 		break;
 	case offsetof(struct sk_msg_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
+		if (size != sizeof(__u64))
+			return false;
 		break;
+	default:
+		if (size != sizeof(__u32))
+			return false;
 	}
 
 	if (off < 0 || off >= sizeof(struct sk_msg_md))
 		return false;
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u64))
-		return false;
 
 	return true;
 }
@@ -5835,7 +5840,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 	case offsetof(struct bpf_sock_ops, local_ip4):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_rcv_saddr) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
 					      struct bpf_sock_ops_kern, sk),
@@ -6152,6 +6158,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				     struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
@@ -6164,6 +6171,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_msg_buff, data_end));
 		break;
+	case offsetof(struct sk_msg_md, family):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_family));
+		break;
+
+	case offsetof(struct sk_msg_md, remote_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_daddr));
+		break;
+
+	case offsetof(struct sk_msg_md, local_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_rcv_saddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_rcv_saddr));
+		break;
+
+	case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+	     offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_daddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_daddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct sk_msg_md, local_ip6[0]) ...
+	     offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct sk_msg_md, local_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_rcv_saddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct sk_msg_md, remote_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+		break;
+
+	case offsetof(struct sk_msg_md, local_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_num));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3-55-g7522


From b9ffcbaf56d3040efee64d3818688083c29b2a44 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Fri, 18 May 2018 09:29:00 +0200
Subject: devlink: introduce devlink_port_attrs_set

Change existing setter for split port information into more generic
attrs setter. Alongside with that, allow to set port number and subport
number for split ports.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c       |  7 ++--
 drivers/net/ethernet/mellanox/mlxsw/core.h       |  3 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c   |  4 +--
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c   |  2 +-
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c |  4 +--
 include/net/devlink.h                            | 20 +++++++----
 include/uapi/linux/devlink.h                     |  3 ++
 net/core/devlink.c                               | 46 ++++++++++++++++++------
 8 files changed, 64 insertions(+), 25 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index e13ac3b8dff7..958769689ca2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1714,15 +1714,16 @@ EXPORT_SYMBOL(mlxsw_core_port_fini);
 
 void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 			     void *port_driver_priv, struct net_device *dev,
-			     bool split, u32 split_group)
+			     u32 port_number, bool split,
+			     u32 split_port_subnumber)
 {
 	struct mlxsw_core_port *mlxsw_core_port =
 					&mlxsw_core->ports[local_port];
 	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
 
 	mlxsw_core_port->port_driver_priv = port_driver_priv;
-	if (split)
-		devlink_port_split_set(devlink_port, split_group);
+	devlink_port_attrs_set(devlink_port, port_number,
+			       split, split_port_subnumber);
 	devlink_port_type_eth_set(devlink_port, dev);
 }
 EXPORT_SYMBOL(mlxsw_core_port_eth_set);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 092d39399f3c..e65287c15afd 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -201,7 +201,8 @@ int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port);
 void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port);
 void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 			     void *port_driver_priv, struct net_device *dev,
-			     bool split, u32 split_group);
+			     u32 port_number, bool split,
+			     u32 split_port_subnumber);
 void mlxsw_core_port_ib_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 			    void *port_driver_priv);
 void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 94132f6cec61..96f6d8af1fd8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2927,8 +2927,8 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
 	}
 
 	mlxsw_core_port_eth_set(mlxsw_sp->core, mlxsw_sp_port->local_port,
-				mlxsw_sp_port, dev, mlxsw_sp_port->split,
-				module);
+				mlxsw_sp_port, dev, module + 1,
+				mlxsw_sp_port->split, lane / width);
 	mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw, 0);
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index a655c5850aa6..551e05067b1e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -1149,7 +1149,7 @@ static int __mlxsw_sx_port_eth_create(struct mlxsw_sx *mlxsw_sx, u8 local_port,
 	}
 
 	mlxsw_core_port_eth_set(mlxsw_sx->core, mlxsw_sx_port->local_port,
-				mlxsw_sx_port, dev, false, 0);
+				mlxsw_sx_port, dev, module + 1, false, 0);
 	mlxsw_sx->ports[local_port] = mlxsw_sx_port;
 	return 0;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index eb0fc614673d..d7a768ff3112 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -175,8 +175,8 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port)
 		return ret;
 
 	devlink_port_type_eth_set(&port->dl_port, port->netdev);
-	if (eth_port.is_split)
-		devlink_port_split_set(&port->dl_port, eth_port.label_port);
+	devlink_port_attrs_set(&port->dl_port, eth_port.label_port,
+			       eth_port.is_split, eth_port.label_subport);
 
 	devlink = priv_to_devlink(app->pf);
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 2e4f71e16e95..d6ca92266709 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -35,6 +35,13 @@ struct devlink {
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
+struct devlink_port_attrs {
+	bool set;
+	u32 port_number; /* same value as "split group" */
+	bool split;
+	u32 split_subport_number;
+};
+
 struct devlink_port {
 	struct list_head list;
 	struct devlink *devlink;
@@ -43,8 +50,7 @@ struct devlink_port {
 	enum devlink_port_type type;
 	enum devlink_port_type desired_type;
 	void *type_dev;
-	bool split;
-	u32 split_group;
+	struct devlink_port_attrs attrs;
 };
 
 struct devlink_sb_pool_info {
@@ -367,8 +373,9 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port,
 void devlink_port_type_ib_set(struct devlink_port *devlink_port,
 			      struct ib_device *ibdev);
 void devlink_port_type_clear(struct devlink_port *devlink_port);
-void devlink_port_split_set(struct devlink_port *devlink_port,
-			    u32 split_group);
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    u32 port_number, bool split,
+			    u32 split_subport_number);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
@@ -466,8 +473,9 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
 {
 }
 
-static inline void devlink_port_split_set(struct devlink_port *devlink_port,
-					  u32 split_group)
+static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
+					  u32 port_number, bool split,
+					  u32 split_subport_number)
 {
 }
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 1df65a4c2044..15b031a5ee7a 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -224,6 +224,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
+	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798..8fde7d2df9b0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -453,6 +453,25 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
 				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
 }
 
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+				     struct devlink_port *devlink_port)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+	if (!attrs->set)
+		return 0;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+		return -EMSGSIZE;
+	if (!attrs->split)
+		return 0;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+			attrs->split_subport_number))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 				struct devlink_port *devlink_port,
 				enum devlink_command cmd, u32 portid,
@@ -492,9 +511,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 				   ibdev->name))
 			goto nla_put_failure;
 	}
-	if (devlink_port->split &&
-	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
-			devlink_port->split_group))
+	if (devlink_nl_port_attrs_put(msg, devlink_port))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
@@ -2971,19 +2988,28 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 
 /**
- *	devlink_port_split_set - Set port is split
+ *	devlink_port_attrs_set - Set port attributes
  *
  *	@devlink_port: devlink port
- *	@split_group: split group - identifies group split port is part of
+ *	@port_number: number of the port that is facing user, for example
+ *	              the front panel port number
+ *	@split: indicates if this is split port
+ *	@split_subport_number: if the port is split, this is the number
+ *	                       of subport.
  */
-void devlink_port_split_set(struct devlink_port *devlink_port,
-			    u32 split_group)
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    u32 port_number, bool split,
+			    u32 split_subport_number)
 {
-	devlink_port->split = true;
-	devlink_port->split_group = split_group;
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+	attrs->set = true;
+	attrs->port_number = port_number;
+	attrs->split = split;
+	attrs->split_subport_number = split_subport_number;
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 }
-EXPORT_SYMBOL_GPL(devlink_port_split_set);
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
 
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
-- 
cgit v1.2.3-55-g7522


From 5ec1380a21bb6cd2ba89e31c44dfcc150f9ef792 Mon Sep 17 00:00:00 2001
From: Jiri Pirko
Date: Fri, 18 May 2018 09:29:01 +0200
Subject: devlink: extend attrs_set for setting port flavours

Devlink ports can have specific flavour according to the purpose of use.
This patch extend attrs_set so the driver can say which flavour port
has. Initial flavours are:
physical, cpu, dsa
User can query this to see right away what is the purpose of each port.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c       |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c |  5 +++--
 include/net/devlink.h                            |  3 +++
 include/uapi/linux/devlink.h                     | 11 +++++++++++
 net/core/devlink.c                               |  5 +++++
 5 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 958769689ca2..a720aa11bcc0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1722,8 +1722,8 @@ void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
 
 	mlxsw_core_port->port_driver_priv = port_driver_priv;
-	devlink_port_attrs_set(devlink_port, port_number,
-			       split, split_port_subnumber);
+	devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+			       port_number, split, split_port_subnumber);
 	devlink_port_type_eth_set(devlink_port, dev);
 }
 EXPORT_SYMBOL(mlxsw_core_port_eth_set);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index d7a768ff3112..b1e67cf4257a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -175,8 +175,9 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port)
 		return ret;
 
 	devlink_port_type_eth_set(&port->dl_port, port->netdev);
-	devlink_port_attrs_set(&port->dl_port, eth_port.label_port,
-			       eth_port.is_split, eth_port.label_subport);
+	devlink_port_attrs_set(&port->dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+			       eth_port.label_port, eth_port.is_split,
+			       eth_port.label_subport);
 
 	devlink = priv_to_devlink(app->pf);
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index d6ca92266709..6eae15cf0f57 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -37,6 +37,7 @@ struct devlink {
 
 struct devlink_port_attrs {
 	bool set;
+	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
 	bool split;
 	u32 split_subport_number;
@@ -374,6 +375,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port,
 			      struct ib_device *ibdev);
 void devlink_port_type_clear(struct devlink_port *devlink_port);
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
@@ -474,6 +476,7 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
 }
 
 static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
+					  enum devlink_port_flavour flavour,
 					  u32 port_number, bool split,
 					  u32 split_subport_number)
 {
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 15b031a5ee7a..75cb5450c851 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -132,6 +132,16 @@ enum devlink_eswitch_encap_mode {
 	DEVLINK_ESWITCH_ENCAP_MODE_BASIC,
 };
 
+enum devlink_port_flavour {
+	DEVLINK_PORT_FLAVOUR_PHYSICAL, /* Any kind of a port physically
+					* facing the user.
+					*/
+	DEVLINK_PORT_FLAVOUR_CPU, /* CPU port */
+	DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
+				   * interconnect port.
+				   */
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -224,6 +234,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
+	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
 	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
 	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8fde7d2df9b0..af90d237cbc2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -460,6 +460,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 
 	if (!attrs->set)
 		return 0;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+		return -EMSGSIZE;
 	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
 		return -EMSGSIZE;
 	if (!attrs->split)
@@ -2991,6 +2993,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
  *	devlink_port_attrs_set - Set port attributes
  *
  *	@devlink_port: devlink port
+ *	@flavour: flavour of the port
  *	@port_number: number of the port that is facing user, for example
  *	              the front panel port number
  *	@split: indicates if this is split port
@@ -2998,12 +3001,14 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
  *	                       of subport.
  */
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number)
 {
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 
 	attrs->set = true;
+	attrs->flavour = flavour;
 	attrs->port_number = port_number;
 	attrs->split = split;
 	attrs->split_subport_number = split_subport_number;
-- 
cgit v1.2.3-55-g7522


From ad75646c68a6e344a3609f40825855a0e742d04d Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Tue, 22 May 2018 09:34:57 +0200
Subject: xsk: fill hole in struct sockaddr_xdp

Move the sxdp_flags up, avoiding a hole in the uapi structure.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_xdp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 56db977221d2..c46609a00168 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -17,10 +17,10 @@
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
+	__u16 sxdp_flags;
 	__u32 sxdp_ifindex;
 	__u32 sxdp_queue_id;
 	__u32 sxdp_shared_umem_fd;
-	__u16 sxdp_flags;
 };
 
 /* XDP socket options */
-- 
cgit v1.2.3-55-g7522


From b3a9e0be436960072ddf327d9d82f50ee2f620e0 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Tue, 22 May 2018 09:34:59 +0200
Subject: xsk: remove explicit ring structure from uapi

In this commit we remove the explicit ring structure from the the
uapi. It is tricky for an uapi to depend on a certain L1 cache line
size, since it can differ for variants of the same architecture. Now,
we let the user application determine the offsets of the producer,
consumer and descriptors by asking the socket via getsockopt.

A typical flow would be (Rx ring):

  struct xdp_mmap_offsets off;
  struct xdp_desc *ring;
  u32 *prod, *cons;
  void *map;
  ...

  getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);

  map = mmap(NULL, off.rx.desc +
		   NUM_DESCS * sizeof(struct xdp_desc),
		   PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, sfd,
		   XDP_PGOFF_RX_RING);
  prod = map + off.rx.producer;
  cons = map + off.rx.consumer;
  ring = map + off.rx.desc;

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_xdp.h | 44 ++++++++++++++++++++++----------------------
 net/xdp/xsk.c               | 29 +++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 17 +++++++++++++++++
 3 files changed, 68 insertions(+), 22 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index c46609a00168..4737cfe222f5 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -23,13 +23,27 @@ struct sockaddr_xdp {
 	__u32 sxdp_shared_umem_fd;
 };
 
+struct xdp_ring_offset {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+struct xdp_mmap_offsets {
+	struct xdp_ring_offset rx;
+	struct xdp_ring_offset tx;
+	struct xdp_ring_offset fr; /* Fill */
+	struct xdp_ring_offset cr; /* Completion */
+};
+
 /* XDP socket options */
-#define XDP_RX_RING			1
-#define XDP_TX_RING			2
-#define XDP_UMEM_REG			3
-#define XDP_UMEM_FILL_RING		4
-#define XDP_UMEM_COMPLETION_RING	5
-#define XDP_STATISTICS			6
+#define XDP_MMAP_OFFSETS		1
+#define XDP_RX_RING			2
+#define XDP_TX_RING			3
+#define XDP_UMEM_REG			4
+#define XDP_UMEM_FILL_RING		5
+#define XDP_UMEM_COMPLETION_RING	6
+#define XDP_STATISTICS			7
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -50,6 +64,7 @@ struct xdp_statistics {
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
 #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
 
+/* Rx/Tx descriptor */
 struct xdp_desc {
 	__u32 idx;
 	__u32 len;
@@ -58,21 +73,6 @@ struct xdp_desc {
 	__u8 padding[5];
 };
 
-struct xdp_ring {
-	__u32 producer __attribute__((aligned(64)));
-	__u32 consumer __attribute__((aligned(64)));
-};
-
-/* Used for the RX and TX queues for packets */
-struct xdp_rxtx_ring {
-	struct xdp_ring ptrs;
-	struct xdp_desc desc[0] __attribute__((aligned(64)));
-};
-
-/* Used for the fill and completion queues for buffers */
-struct xdp_umem_ring {
-	struct xdp_ring ptrs;
-	__u32 desc[0] __attribute__((aligned(64)));
-};
+/* UMEM descriptor is __u32 */
 
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 29707354cf78..378dd9287da5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -489,6 +489,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 
 		return 0;
 	}
+	case XDP_MMAP_OFFSETS:
+	{
+		struct xdp_mmap_offsets off;
+
+		if (len < sizeof(off))
+			return -EINVAL;
+
+		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
+		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
+
+		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
+		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
+
+		len = sizeof(off);
+		if (copy_to_user(optval, &off, len))
+			return -EFAULT;
+		if (put_user(len, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
 	default:
 		break;
 	}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 62e43be407d8..cb8e5be35110 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -13,6 +13,23 @@
 
 #define RX_BATCH_SIZE 16
 
+struct xdp_ring {
+	u32 producer ____cacheline_aligned_in_smp;
+	u32 consumer ____cacheline_aligned_in_smp;
+};
+
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+	struct xdp_ring ptrs;
+	struct xdp_desc desc[0] ____cacheline_aligned_in_smp;
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	u32 desc[0] ____cacheline_aligned_in_smp;
+};
+
 struct xsk_queue {
 	struct xdp_umem_props umem_props;
 	u32 ring_mask;
-- 
cgit v1.2.3-55-g7522


From e841b7b11ebd0b359e07bb3d7caf15dca1a80b72 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel
Date: Tue, 22 May 2018 10:19:07 +0200
Subject: nl80211: add FILS related parameters to ROAM event

In case of FILS shared key offload the parameters can change
upon roaming of which user-space needs to be notified.

Reviewed-by: Jithu Jance <jithu.jance@broadcom.com>
Reviewed-by: Eylon Pedinovsky <eylon.pedinovsky@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  2 ++
 include/uapi/linux/nl80211.h |  3 ++-
 net/wireless/nl80211.c       | 16 +++++++++++++--
 net/wireless/sme.c           | 48 +++++++++++++++++++++++++++++++++++++-------
 4 files changed, 59 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0e4a2a04d55d..1e7c0df4fe33 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5624,6 +5624,7 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
  * @req_ie_len: association request IEs length
  * @resp_ie: association response IEs (may be %NULL)
  * @resp_ie_len: assoc response IEs length
+ * @fils: FILS related roaming information.
  */
 struct cfg80211_roam_info {
 	struct ieee80211_channel *channel;
@@ -5633,6 +5634,7 @@ struct cfg80211_roam_info {
 	size_t req_ie_len;
 	const u8 *resp_ie;
 	size_t resp_ie_len;
+	struct cfg80211_fils_resp_params fils;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 83ed1dd757e6..0a412335d56b 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -215,7 +215,8 @@
  * as specified in IETF RFC 6696.
  *
  * When FILS shared key authentication is completed, driver needs to provide the
- * below additional parameters to userspace.
+ * below additional parameters to userspace, which can be either after setting
+ * up a connection or after roaming.
  *	%NL80211_ATTR_FILS_KEK - used for key renewal
  *	%NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges
  *	%NL80211_ATTR_PMKID - used to identify the PMKSA used/generated
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3ab443b13bb0..ae57f9712d7d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14264,7 +14264,9 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	void *hdr;
 	const u8 *bssid = info->bss ? info->bss->bssid : info->bssid;
 
-	msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len, gfp);
+	msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len +
+			info->fils.kek_len + info->fils.pmk_len +
+			(info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
 	if (!msg)
 		return;
 
@@ -14282,7 +14284,17 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 		     info->req_ie)) ||
 	    (info->resp_ie &&
 	     nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len,
-		     info->resp_ie)))
+		     info->resp_ie)) ||
+	    (info->fils.update_erp_next_seq_num &&
+	     nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
+			 info->fils.erp_next_seq_num)) ||
+	    (info->fils.kek &&
+	     nla_put(msg, NL80211_ATTR_FILS_KEK, info->fils.kek_len,
+		     info->fils.kek)) ||
+	    (info->fils.pmk &&
+	     nla_put(msg, NL80211_ATTR_PMK, info->fils.pmk_len, info->fils.pmk)) ||
+	    (info->fils.pmkid &&
+	     nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 73881fb7f86d..d536b07582f8 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -932,6 +932,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct cfg80211_event *ev;
 	unsigned long flags;
+	u8 *next;
 
 	if (!info->bss) {
 		info->bss = cfg80211_get_bss(wdev->wiphy, info->channel,
@@ -944,19 +945,52 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	if (WARN_ON(!info->bss))
 		return;
 
-	ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len, gfp);
+	ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len +
+		     info->fils.kek_len + info->fils.pmk_len +
+		     (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
 	if (!ev) {
 		cfg80211_put_bss(wdev->wiphy, info->bss);
 		return;
 	}
 
 	ev->type = EVENT_ROAMED;
-	ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev);
-	ev->rm.req_ie_len = info->req_ie_len;
-	memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
-	ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + info->req_ie_len;
-	ev->rm.resp_ie_len = info->resp_ie_len;
-	memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len);
+	next = ((u8 *)ev) + sizeof(*ev);
+	if (info->req_ie_len) {
+		ev->rm.req_ie = next;
+		ev->rm.req_ie_len = info->req_ie_len;
+		memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
+		next += info->req_ie_len;
+	}
+	if (info->resp_ie_len) {
+		ev->rm.resp_ie = next;
+		ev->rm.resp_ie_len = info->resp_ie_len;
+		memcpy((void *)ev->rm.resp_ie, info->resp_ie,
+		       info->resp_ie_len);
+		next += info->resp_ie_len;
+	}
+	if (info->fils.kek_len) {
+		ev->rm.fils.kek = next;
+		ev->rm.fils.kek_len = info->fils.kek_len;
+		memcpy((void *)ev->rm.fils.kek, info->fils.kek,
+		       info->fils.kek_len);
+		next += info->fils.kek_len;
+	}
+	if (info->fils.pmk_len) {
+		ev->rm.fils.pmk = next;
+		ev->rm.fils.pmk_len = info->fils.pmk_len;
+		memcpy((void *)ev->rm.fils.pmk, info->fils.pmk,
+		       info->fils.pmk_len);
+		next += info->fils.pmk_len;
+	}
+	if (info->fils.pmkid) {
+		ev->rm.fils.pmkid = next;
+		memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid,
+		       WLAN_PMKID_LEN);
+		next += WLAN_PMKID_LEN;
+	}
+	ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num;
+	if (info->fils.update_erp_next_seq_num)
+		ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num;
 	ev->rm.bss = info->bss;
 
 	spin_lock_irqsave(&wdev->event_lock, flags);
-- 
cgit v1.2.3-55-g7522


From 7f9a3e150ec7d3596386449c15aefb59904a1266 Mon Sep 17 00:00:00 2001
From: Vidyullatha Kanchanapally
Date: Tue, 22 May 2018 10:19:08 +0200
Subject: nl80211: Update ERP info using NL80211_CMD_UPDATE_CONNECT_PARAMS

Use NL80211_CMD_UPDATE_CONNECT_PARAMS to update new ERP information,
Association IEs and the Authentication type to driver / firmware which
will be used in subsequent roamings.

Signed-off-by: Vidyullatha Kanchanapally <vidyullatha@codeaurora.org>
[arend: extended fils-sk kernel doc and added check in wiphy_register()]
Reviewed-by: Jithu Jance <jithu.jance@broadcom.com>
Reviewed-by: Eylon Pedinovsky <eylon.pedinovsky@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  5 +++++
 include/uapi/linux/nl80211.h |  3 ++-
 net/wireless/core.c          |  4 ++++
 net/wireless/nl80211.c       | 52 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1e7c0df4fe33..5fbfe61f41c6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2225,9 +2225,14 @@ struct cfg80211_connect_params {
  * have to be updated as part of update_connect_params() call.
  *
  * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated
+ * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm,
+ *	username, erp sequence number and rrk) are updated
+ * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated
  */
 enum cfg80211_connect_params_changed {
 	UPDATE_ASSOC_IES		= BIT(0),
+	UPDATE_FILS_ERP_INFO		= BIT(1),
+	UPDATE_AUTH_TYPE		= BIT(2),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 0a412335d56b..06f9af23156b 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -204,7 +204,8 @@
  * FILS shared key authentication offload should be able to construct the
  * authentication and association frames for FILS shared key authentication and
  * eventually do a key derivation as per IEEE 802.11ai. The below additional
- * parameters should be given to driver in %NL80211_CMD_CONNECT.
+ * parameters should be given to driver in %NL80211_CMD_CONNECT and/or in
+ * %NL80211_CMD_UPDATE_CONNECT_PARAMS.
  *	%NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai
  *	%NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai
  *	%NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message
diff --git a/net/wireless/core.c b/net/wireless/core.c
index c0fd8a85e7f7..5fe35aafdd9c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -725,6 +725,10 @@ int wiphy_register(struct wiphy *wiphy)
 		    (!rdev->ops->set_pmk || !rdev->ops->del_pmk)))
 		return -EINVAL;
 
+	if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
+		    rdev->ops->update_connect_params))
+		return -EINVAL;
+
 	if (wiphy->addresses)
 		memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ae57f9712d7d..bdf73b24cc09 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9429,6 +9429,8 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	bool fils_sk_offload;
+	u32 auth_type;
 	u32 changed = 0;
 	int ret;
 
@@ -9443,6 +9445,56 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 		changed |= UPDATE_ASSOC_IES;
 	}
 
+	fils_sk_offload = wiphy_ext_feature_isset(&rdev->wiphy,
+						  NL80211_EXT_FEATURE_FILS_SK_OFFLOAD);
+
+	/*
+	 * when driver supports fils-sk offload all attributes must be
+	 * provided. So the else covers "fils-sk-not-all" and
+	 * "no-fils-sk-any".
+	 */
+	if (fils_sk_offload &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_REALM] &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
+		connect.fils_erp_username =
+			nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
+		connect.fils_erp_username_len =
+			nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
+		connect.fils_erp_realm =
+			nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
+		connect.fils_erp_realm_len =
+			nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
+		connect.fils_erp_next_seq_num =
+			nla_get_u16(
+			   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]);
+		connect.fils_erp_rrk =
+			nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
+		connect.fils_erp_rrk_len =
+			nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
+		changed |= UPDATE_FILS_ERP_INFO;
+	} else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] ||
+		   info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
+		   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
+		   info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
+		return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
+		auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
+		if (!nl80211_valid_auth_type(rdev, auth_type,
+					     NL80211_CMD_CONNECT))
+			return -EINVAL;
+
+		if (auth_type == NL80211_AUTHTYPE_FILS_SK &&
+		    fils_sk_offload && !(changed & UPDATE_FILS_ERP_INFO))
+			return -EINVAL;
+
+		connect.auth_type = auth_type;
+		changed |= UPDATE_AUTH_TYPE;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	if (!wdev->current_bss)
 		ret = -ENOLINK;
-- 
cgit v1.2.3-55-g7522


From f80442a4cd1864154beaa060bb483a2c9f7d811f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 22 May 2018 14:57:18 -0700
Subject: bpf: btf: Change how section is supported in btf_header

There are currently unused section descriptions in the btf_header.  Those
sections are here to support future BTF use cases.  For example, the
func section (func_off) is to support function signature (e.g. the BPF
prog function signature).

Instead of spelling out all potential sections up-front in the btf_header.
This patch makes changes to btf_header such that extending it (e.g. adding
a section) is possible later.  The unused ones can be removed for now and
they can be added back later.

This patch:
1. adds a hdr_len to the btf_header.  It will allow adding
sections (and other info like parent_label and parent_name)
later.  The check is similar to the existing bpf_attr.
If a user passes in a longer hdr_len, the kernel
ensures the extra tailing bytes are 0.

2. allows the section order in the BTF object to be
different from its sec_off order in btf_header.

3. each sec_off is followed by a sec_len.  It must not have gap or
overlapping among sections.

The string section is ensured to be at the end due to the 4 bytes
alignment requirement of the type section.

The above changes will allow enough flexibility to
add new sections (and other info) to the btf_header later.

This patch also removes an unnecessary !err check
at the end of btf_parse().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h |   8 +-
 kernel/bpf/btf.c         | 209 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 160 insertions(+), 57 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index bcb56ee47014..4fa479741a02 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -12,15 +12,11 @@ struct btf_header {
 	__u16	magic;
 	__u8	version;
 	__u8	flags;
-
-	__u32	parent_label;
-	__u32	parent_name;
+	__u32	hdr_len;
 
 	/* All offsets are in bytes relative to the end of this header */
-	__u32	label_off;	/* offset of label section	*/
-	__u32	object_off;	/* offset of data object section*/
-	__u32	func_off;	/* offset of function section	*/
 	__u32	type_off;	/* offset of type section	*/
+	__u32	type_len;	/* length of type section	*/
 	__u32	str_off;	/* offset of string section	*/
 	__u32	str_len;	/* length of string section	*/
 };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ded10ab47b8a..75da6cbae47d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -12,6 +12,7 @@
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/sort.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 
@@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr);
 static DEFINE_SPINLOCK(btf_idr_lock);
 
 struct btf {
-	union {
-		struct btf_header *hdr;
-		void *data;
-	};
+	void *data;
 	struct btf_type **types;
 	u32 *resolved_ids;
 	u32 *resolved_sizes;
 	const char *strings;
 	void *nohdr_data;
+	struct btf_header hdr;
 	u32 nr_types;
 	u32 types_size;
 	u32 data_size;
@@ -228,6 +227,11 @@ enum resolve_mode {
 
 #define MAX_RESOLVE_DEPTH 32
 
+struct btf_sec_info {
+	u32 off;
+	u32 len;
+};
+
 struct btf_verifier_env {
 	struct btf *btf;
 	u8 *visit_states;
@@ -418,14 +422,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
 	return !BTF_STR_TBL_ELF_ID(offset) &&
-		BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+		BTF_STR_OFFSET(offset) < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!BTF_STR_OFFSET(offset))
 		return "(anon)";
-	else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+	else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
 		return &btf->strings[BTF_STR_OFFSET(offset)];
 	else
 		return "(invalid-name-offset)";
@@ -536,7 +540,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	__btf_verifier_log(log, "\n");
 }
 
-static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+				 u32 btf_data_size)
 {
 	struct bpf_verifier_log *log = &env->log;
 	const struct btf *btf = env->btf;
@@ -545,19 +550,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env)
 	if (!bpf_verifier_log_needed(log))
 		return;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
 	__btf_verifier_log(log, "version: %u\n", hdr->version);
 	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
-	__btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
-	__btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
-	__btf_verifier_log(log, "label_off: %u\n", hdr->label_off);
-	__btf_verifier_log(log, "object_off: %u\n", hdr->object_off);
-	__btf_verifier_log(log, "func_off: %u\n", hdr->func_off);
+	__btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
 	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+	__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
 	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
 	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
-	__btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size);
+	__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
 }
 
 static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
@@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	struct btf_header *hdr;
 	void *cur, *end;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	cur = btf->nohdr_data + hdr->type_off;
-	end = btf->nohdr_data + hdr->str_off;
+	end = btf->nohdr_data + hdr->type_len;
 
 	env->log_type_id = 1;
 	while (cur < end) {
@@ -1866,8 +1868,20 @@ static int btf_check_all_types(struct btf_verifier_env *env)
 
 static int btf_parse_type_sec(struct btf_verifier_env *env)
 {
+	const struct btf_header *hdr = &env->btf->hdr;
 	int err;
 
+	/* Type section must align to 4 bytes */
+	if (hdr->type_off & (sizeof(u32) - 1)) {
+		btf_verifier_log(env, "Unaligned type_off");
+		return -EINVAL;
+	}
+
+	if (!hdr->type_len) {
+		btf_verifier_log(env, "No type found");
+		return -EINVAL;
+	}
+
 	err = btf_check_all_metas(env);
 	if (err)
 		return err;
@@ -1881,10 +1895,15 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
 	struct btf *btf = env->btf;
 	const char *start, *end;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	start = btf->nohdr_data + hdr->str_off;
 	end = start + hdr->str_len;
 
+	if (end != btf->data + btf->data_size) {
+		btf_verifier_log(env, "String section is not at the end");
+		return -EINVAL;
+	}
+
 	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
 	    start[0] || end[-1]) {
 		btf_verifier_log(env, "Invalid string section");
@@ -1896,20 +1915,122 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
 	return 0;
 }
 
-static int btf_parse_hdr(struct btf_verifier_env *env)
+static const size_t btf_sec_info_offset[] = {
+	offsetof(struct btf_header, type_off),
+	offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+	const struct btf_sec_info *x = a;
+	const struct btf_sec_info *y = b;
+
+	return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+			      u32 btf_data_size)
 {
+	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
+	struct btf_sec_info secs[nr_secs];
+	u32 total, expected_total, i;
 	const struct btf_header *hdr;
-	struct btf *btf = env->btf;
-	u32 meta_left;
+	const struct btf *btf;
+
+	btf = env->btf;
+	hdr = &btf->hdr;
 
-	if (btf->data_size < sizeof(*hdr)) {
+	/* Populate the secs from hdr */
+	for (i = 0; i < nr_secs; i++)
+		secs[i] = *(struct btf_sec_info *)((void *)hdr +
+						   btf_sec_info_offset[i]);
+
+	sort(secs, nr_secs, sizeof(struct btf_sec_info),
+	     btf_sec_info_cmp, NULL);
+
+	/* Check for gaps and overlap among sections */
+	total = 0;
+	expected_total = btf_data_size - hdr->hdr_len;
+	for (i = 0; i < nr_secs; i++) {
+		if (expected_total < secs[i].off) {
+			btf_verifier_log(env, "Invalid section offset");
+			return -EINVAL;
+		}
+		if (total < secs[i].off) {
+			/* gap */
+			btf_verifier_log(env, "Unsupported section found");
+			return -EINVAL;
+		}
+		if (total > secs[i].off) {
+			btf_verifier_log(env, "Section overlap found");
+			return -EINVAL;
+		}
+		if (expected_total - total < secs[i].len) {
+			btf_verifier_log(env,
+					 "Total section length too long");
+			return -EINVAL;
+		}
+		total += secs[i].len;
+	}
+
+	/* There is data other than hdr and known sections */
+	if (expected_total != total) {
+		btf_verifier_log(env, "Unsupported section found");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
+			 u32 btf_data_size)
+{
+	const struct btf_header *hdr;
+	u32 hdr_len, hdr_copy;
+	/*
+	 * Minimal part of the "struct btf_header" that
+	 * contains the hdr_len.
+	 */
+	struct btf_min_header {
+		u16	magic;
+		u8	version;
+		u8	flags;
+		u32	hdr_len;
+	} __user *min_hdr;
+	struct btf *btf;
+	int err;
+
+	btf = env->btf;
+	min_hdr = btf_data;
+
+	if (btf_data_size < sizeof(*min_hdr)) {
+		btf_verifier_log(env, "hdr_len not found");
+		return -EINVAL;
+	}
+
+	if (get_user(hdr_len, &min_hdr->hdr_len))
+		return -EFAULT;
+
+	if (btf_data_size < hdr_len) {
 		btf_verifier_log(env, "btf_header not found");
 		return -EINVAL;
 	}
 
-	btf_verifier_log_hdr(env);
+	err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
+	if (err) {
+		if (err == -E2BIG)
+			btf_verifier_log(env, "Unsupported btf_header");
+		return err;
+	}
+
+	hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+	if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
+		return -EFAULT;
+
+	hdr = &btf->hdr;
+
+	btf_verifier_log_hdr(env, btf_data_size);
 
-	hdr = btf->hdr;
 	if (hdr->magic != BTF_MAGIC) {
 		btf_verifier_log(env, "Invalid magic");
 		return -EINVAL;
@@ -1925,26 +2046,14 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 		return -ENOTSUPP;
 	}
 
-	meta_left = btf->data_size - sizeof(*hdr);
-	if (!meta_left) {
+	if (btf_data_size == hdr->hdr_len) {
 		btf_verifier_log(env, "No data");
 		return -EINVAL;
 	}
 
-	if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off ||
-	    /* Type section must align to 4 bytes */
-	    hdr->type_off & (sizeof(u32) - 1)) {
-		btf_verifier_log(env, "Invalid type_off");
-		return -EINVAL;
-	}
-
-	if (meta_left < hdr->str_off ||
-	    meta_left - hdr->str_off < hdr->str_len) {
-		btf_verifier_log(env, "Invalid str_off or str_len");
-		return -EINVAL;
-	}
-
-	btf->nohdr_data = btf->hdr + 1;
+	err = btf_check_sec_info(env, btf_data_size);
+	if (err)
+		return err;
 
 	return 0;
 }
@@ -1987,6 +2096,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 		err = -ENOMEM;
 		goto errout;
 	}
+	env->btf = btf;
+
+	err = btf_parse_hdr(env, btf_data, btf_data_size);
+	if (err)
+		goto errout;
 
 	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!data) {
@@ -1996,18 +2110,13 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	btf->data = data;
 	btf->data_size = btf_data_size;
+	btf->nohdr_data = btf->data + btf->hdr.hdr_len;
 
 	if (copy_from_user(data, btf_data, btf_data_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
 
-	env->btf = btf;
-
-	err = btf_parse_hdr(env);
-	if (err)
-		goto errout;
-
 	err = btf_parse_str_sec(env);
 	if (err)
 		goto errout;
@@ -2016,16 +2125,14 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 	if (err)
 		goto errout;
 
-	if (!err && log->level && bpf_verifier_log_full(log)) {
+	if (log->level && bpf_verifier_log_full(log)) {
 		err = -ENOSPC;
 		goto errout;
 	}
 
-	if (!err) {
-		btf_verifier_env_free(env);
-		refcount_set(&btf->refcnt, 1);
-		return btf;
-	}
+	btf_verifier_env_free(env);
+	refcount_set(&btf->refcnt, 1);
+	return btf;
 
 errout:
 	btf_verifier_env_free(env);
-- 
cgit v1.2.3-55-g7522


From aea2f7b8911617d7a8c23fb73d69e78764f91b58 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 22 May 2018 14:57:20 -0700
Subject: bpf: btf: Remove unused bits from uapi/linux/btf.h

This patch does the followings:
1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k.  We can
   raise it later.

2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID.  They are
   currently encoded at the highest bit of a u32.
   It is because the current use case does not require supporting
   parent type (i.e type_id referring to a type in another BTF file).
   It also does not support referring to a string in ELF.

   The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced
   by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are
   defined in btf.c instead of uapi/linux/btf.h.

3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough.
   There is unused bits headroom if we ever needed it later.

4. The root bit in BTF_INFO is also removed because it is not
   used in the current use case.

5. Remove BTF_INT_VARARGS since func type is not supported now.
   The BTF_INT_ENCODING is limited to 4 bits instead of 8 bits.

The above can be added back later because the verifier
ensures the unused bits are zeros.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h | 29 +++++++++------------------
 kernel/bpf/btf.c         | 52 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 44 insertions(+), 37 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 4fa479741a02..0b5ddbe135a4 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -22,28 +22,19 @@ struct btf_header {
 };
 
 /* Max # of type identifier */
-#define BTF_MAX_TYPE	0x7fffffff
+#define BTF_MAX_TYPE	0x0000ffff
 /* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x7fffffff
+#define BTF_MAX_NAME_OFFSET	0x0000ffff
 /* Max # of struct/union/enum members or func args */
 #define BTF_MAX_VLEN	0xffff
 
-/* The type id is referring to a parent BTF */
-#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
-#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
-
-/* String is in the ELF string section */
-#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
-#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
-
 struct btf_type {
 	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
-	 * bits 24-28: kind (e.g. int, ptr, array...etc)
-	 * bits 29-30: unused
-	 * bits    31: root
+	 * bits 24-27: kind (e.g. int, ptr, array...etc)
+	 * bits 28-31: unused
 	 */
 	__u32 info;
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
@@ -58,8 +49,7 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
-#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
 
 #define BTF_KIND_UNKN		0	/* Unknown	*/
@@ -84,15 +74,14 @@ struct btf_type {
 /* BTF_KIND_INT is followed by a u32 and the following
  * is the 32 bits arrangement:
  */
-#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
 #define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
-#define BTF_INT_SIGNED	0x1
-#define BTF_INT_CHAR	0x2
-#define BTF_INT_BOOL	0x4
-#define BTF_INT_VARARGS	0x8
+#define BTF_INT_SIGNED	(1 << 0)
+#define BTF_INT_CHAR	(1 << 1)
+#define BTF_INT_BOOL	(1 << 2)
 
 /* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
  * The exact number of btf_enum is stored in the vlen (of the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e388a6598de2..9cbeabb5aca3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -163,13 +163,16 @@
 #define BITS_ROUNDUP_BYTES(bits) \
 	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
 /* 16MB for 64k structs and each has 16 members and
  * a few MB spaces for the string section.
  * The hard limit is S32_MAX.
  */
 #define BTF_MAX_SIZE (16 * 1024 * 1024)
-/* 64k. We can raise it later. The hard limit is S32_MAX. */
-#define BTF_MAX_NR_TYPES 65535
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
@@ -383,8 +386,6 @@ static const char *btf_int_encoding_str(u8 encoding)
 		return "CHAR";
 	else if (encoding == BTF_INT_BOOL)
 		return "BOOL";
-	else if (encoding == BTF_INT_VARARGS)
-		return "VARARGS";
 	else
 		return "UNKN";
 }
@@ -421,16 +422,16 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
-	return !BTF_STR_TBL_ELF_ID(offset) &&
-		BTF_STR_OFFSET(offset) < btf->hdr.str_len;
+	return BTF_STR_OFFSET_VALID(offset) &&
+		offset < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
-	if (!BTF_STR_OFFSET(offset))
+	if (!offset)
 		return "(anon)";
-	else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
-		return &btf->strings[BTF_STR_OFFSET(offset)];
+	else if (offset < btf->hdr.str_len)
+		return &btf->strings[offset];
 	else
 		return "(invalid-name-offset)";
 }
@@ -598,13 +599,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 		struct btf_type **new_types;
 		u32 expand_by, new_size;
 
-		if (btf->types_size == BTF_MAX_NR_TYPES) {
+		if (btf->types_size == BTF_MAX_TYPE) {
 			btf_verifier_log(env, "Exceeded max num of types");
 			return -E2BIG;
 		}
 
 		expand_by = max_t(u32, btf->types_size >> 2, 16);
-		new_size = min_t(u32, BTF_MAX_NR_TYPES,
+		new_size = min_t(u32, BTF_MAX_TYPE,
 				 btf->types_size + expand_by);
 
 		new_types = kvzalloc(new_size * sizeof(*new_types),
@@ -934,6 +935,12 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 	}
 
 	int_data = btf_type_int(t);
+	if (int_data & ~BTF_INT_MASK) {
+		btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+				       int_data);
+		return -EINVAL;
+	}
+
 	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
 
 	if (nr_bits > BITS_PER_U64) {
@@ -947,12 +954,17 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	/*
+	 * Only one of the encoding bits is allowed and it
+	 * should be sufficient for the pretty print purpose (i.e. decoding).
+	 * Multiple bits can be allowed later if it is found
+	 * to be insufficient.
+	 */
 	encoding = BTF_INT_ENCODING(int_data);
 	if (encoding &&
 	    encoding != BTF_INT_SIGNED &&
 	    encoding != BTF_INT_CHAR &&
-	    encoding != BTF_INT_BOOL &&
-	    encoding != BTF_INT_VARARGS) {
+	    encoding != BTF_INT_BOOL) {
 		btf_verifier_log_type(env, t, "Unsupported encoding");
 		return -ENOTSUPP;
 	}
@@ -1126,7 +1138,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (BTF_TYPE_PARENT(t->type)) {
+	if (!BTF_TYPE_ID_VALID(t->type)) {
 		btf_verifier_log_type(env, t, "Invalid type_id");
 		return -EINVAL;
 	}
@@ -1333,12 +1345,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 	/* Array elem type and index type cannot be in type void,
 	 * so !array->type and !array->index_type are not allowed.
 	 */
-	if (!array->type || BTF_TYPE_PARENT(array->type)) {
+	if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
 		btf_verifier_log_type(env, t, "Invalid elem");
 		return -EINVAL;
 	}
 
-	if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+	if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
 		btf_verifier_log_type(env, t, "Invalid index");
 		return -EINVAL;
 	}
@@ -1507,7 +1519,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 		}
 
 		/* A member cannot be in type void */
-		if (!member->type || BTF_TYPE_PARENT(member->type)) {
+		if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid type_id");
 			return -EINVAL;
@@ -1760,6 +1772,12 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
 	}
 	meta_left -= sizeof(*t);
 
+	if (t->info & ~BTF_INFO_MASK) {
+		btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+				 env->log_type_id, t->info);
+		return -EINVAL;
+	}
+
 	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
 	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
 		btf_verifier_log(env, "[%u] Invalid kind:%u",
-- 
cgit v1.2.3-55-g7522


From 9b2cf328b2eccf761537a06bef914d2a0700fba7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Tue, 22 May 2018 14:57:21 -0700
Subject: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info

In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id"
could cause confusion because the "id" of "btf_id" means the BPF obj id
given to the BTF object while
"btf_key_id" and "btf_value_id" means the BTF type id within
that BTF object.

To make it clear, btf_key_id and btf_value_id are
renamed to btf_key_type_id and btf_value_type_id.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  4 ++--
 include/uapi/linux/bpf.h |  8 ++++----
 kernel/bpf/arraymap.c    |  2 +-
 kernel/bpf/syscall.c     | 18 +++++++++---------
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f6fe3c719ca8..1795eeee846c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -69,8 +69,8 @@ struct bpf_map {
 	u32 pages;
 	u32 id;
 	int numa_node;
-	u32 btf_key_id;
-	u32 btf_value_id;
+	u32 btf_key_type_id;
+	u32 btf_value_type_id;
 	struct btf *btf;
 	bool unpriv_array;
 	/* 55 bytes hole */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bbe2ca5..c3e502d06bc3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -284,8 +284,8 @@ union bpf_attr {
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
 		__u32	btf_fd;		/* fd pointing to a BTF type data */
-		__u32	btf_key_id;	/* BTF type_id of the key */
-		__u32	btf_value_id;	/* BTF type_id of the value */
+		__u32	btf_key_type_id;	/* BTF type_id of the key */
+		__u32	btf_value_type_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -2219,8 +2219,8 @@ struct bpf_map_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 btf_id;
-	__u32 btf_key_id;
-	__u32 btf_value_id;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0fd8d8f1a398..544e58f5f642 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
 	}
 
 	seq_printf(m, "%u: ", *(u32 *)key);
-	btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
 	seq_puts(m, "\n");
 
 	rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2b29ef84ded3..0b4c94551001 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -422,7 +422,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -457,10 +457,10 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->usercnt, 1);
 
 	if (bpf_map_support_seq_show(map) &&
-	    (attr->btf_key_id || attr->btf_value_id)) {
+	    (attr->btf_key_type_id || attr->btf_value_type_id)) {
 		struct btf *btf;
 
-		if (!attr->btf_key_id || !attr->btf_value_id) {
+		if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
 			err = -EINVAL;
 			goto free_map_nouncharge;
 		}
@@ -471,16 +471,16 @@ static int map_create(union bpf_attr *attr)
 			goto free_map_nouncharge;
 		}
 
-		err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
-					      attr->btf_value_id);
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+					      attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
 			goto free_map_nouncharge;
 		}
 
 		map->btf = btf;
-		map->btf_key_id = attr->btf_key_id;
-		map->btf_value_id = attr->btf_value_id;
+		map->btf_key_type_id = attr->btf_key_type_id;
+		map->btf_value_type_id = attr->btf_value_type_id;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -2013,8 +2013,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 
 	if (map->btf) {
 		info.btf_id = btf_id(map->btf);
-		info.btf_key_id = map->btf_key_id;
-		info.btf_value_id = map->btf_value_id;
+		info.btf_key_type_id = map->btf_key_type_id;
+		info.btf_value_type_id = map->btf_value_type_id;
 	}
 
 	if (bpf_map_is_dev_bound(map)) {
-- 
cgit v1.2.3-55-g7522


From d2ba09c17a0647f899d6c20a11bab9e6d3382f07 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 21 May 2018 19:22:30 -0700
Subject: net: add skeleton of bpfilter kernel module

bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
and user mode helper code that is embedded into bpfilter.ko

The steps to build bpfilter.ko are the following:
- main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
- with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
  is converted into bpfilter_umh.o object file
  with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
  Example:
  $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
  0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
  0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
  0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
- bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko

bpfilter_kern.c is a normal kernel module code that calls
the fork_usermode_blob() helper to execute part of its own data
as a user mode process.

Notice that _binary_net_bpfilter_bpfilter_umh_start - end
is placed into .init.rodata section, so it's freed as soon as __init
function of bpfilter.ko is finished.
As part of __init the bpfilter.ko does first request/reply action
via two unix pipe provided by fork_usermode_blob() helper to
make sure that umh is healthy. If not it will kill it via pid.

Later bpfilter_process_sockopt() will be called from bpfilter hooks
in get/setsockopt() to pass iptable commands into umh via bpfilter.ko

If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
kill umh as well.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpfilter.h      |  15 ++++++
 include/uapi/linux/bpfilter.h |  21 ++++++++
 net/Kconfig                   |   2 +
 net/Makefile                  |   1 +
 net/bpfilter/Kconfig          |  16 ++++++
 net/bpfilter/Makefile         |  30 ++++++++++++
 net/bpfilter/bpfilter_kern.c  | 111 ++++++++++++++++++++++++++++++++++++++++++
 net/bpfilter/main.c           |  63 ++++++++++++++++++++++++
 net/bpfilter/msgfmt.h         |  17 +++++++
 net/ipv4/Makefile             |   2 +
 net/ipv4/bpfilter/Makefile    |   2 +
 net/ipv4/bpfilter/sockopt.c   |  42 ++++++++++++++++
 net/ipv4/ip_sockglue.c        |  17 +++++++
 13 files changed, 339 insertions(+)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

(limited to 'include/uapi')

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
new file mode 100644
index 000000000000..687b1760bb9f
--- /dev/null
+++ b/include/linux/bpfilter.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPFILTER_H
+#define _LINUX_BPFILTER_H
+
+#include <uapi/linux/bpfilter.h>
+
+struct sock;
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
+			    unsigned int optlen);
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
+			    int *optlen);
+extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+				       char __user *optval,
+				       unsigned int optlen, bool is_set);
+#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
new file mode 100644
index 000000000000..2ec3cc99ea4c
--- /dev/null
+++ b/include/uapi/linux/bpfilter.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_LINUX_BPFILTER_H
+#define _UAPI_LINUX_BPFILTER_H
+
+#include <linux/if.h>
+
+enum {
+	BPFILTER_IPT_SO_SET_REPLACE = 64,
+	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
+	BPFILTER_IPT_SET_MAX,
+};
+
+enum {
+	BPFILTER_IPT_SO_GET_INFO = 64,
+	BPFILTER_IPT_SO_GET_ENTRIES = 65,
+	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
+	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
+	BPFILTER_IPT_GET_MAX,
+};
+
+#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index df8d45ef47d8..ba554cedb615 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -202,6 +202,8 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
+source "net/bpfilter/Kconfig"
+
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 77aaddedbd29..bdaf53925acd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
+obj-$(CONFIG_BPFILTER)		+= bpfilter/
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..60725c5f79db
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,16 @@
+menuconfig BPFILTER
+	bool "BPF based packet filtering framework (BPFILTER)"
+	default n
+	depends on NET && BPF
+	help
+	  This builds experimental bpfilter framework that is aiming to
+	  provide netfilter compatible functionality via BPF
+
+if BPFILTER
+config BPFILTER_UMH
+	tristate "bpfilter kernel module with user mode helper"
+	default m
+	help
+	  This builds bpfilter kernel module with embedded user mode helper
+endif
+
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
new file mode 100644
index 000000000000..2af752c8ef5e
--- /dev/null
+++ b/net/bpfilter/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux BPFILTER layer.
+#
+
+hostprogs-y := bpfilter_umh
+bpfilter_umh-objs := main.o
+HOSTCFLAGS += -I. -Itools/include/
+ifeq ($(CONFIG_BPFILTER_UMH), y)
+# builtin bpfilter_umh should be compiled with -static
+# since rootfs isn't mounted at the time of __init
+# function is called and do_execv won't find elf interpreter
+HOSTLDFLAGS += -static
+endif
+
+# a bit of elf magic to convert bpfilter_umh binary into a binary blob
+# inside bpfilter_umh.o elf file referenced by
+# _binary_net_bpfilter_bpfilter_umh_start symbol
+# which bpfilter_kern.c passes further into umh blob loader at run-time
+quiet_cmd_copy_umh = GEN $@
+      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
+      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
+      --rename-section .data=.init.rodata $< $@
+
+$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
+	$(call cmd,copy_umh)
+
+obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
+bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
new file mode 100644
index 000000000000..7596314b61c7
--- /dev/null
+++ b/net/bpfilter/bpfilter_kern.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/umh.h>
+#include <linux/bpfilter.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include "msgfmt.h"
+
+#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
+#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
+
+extern char UMH_start;
+extern char UMH_end;
+
+static struct umh_info info;
+/* since ip_getsockopt() can run in parallel, serialize access to umh */
+static DEFINE_MUTEX(bpfilter_lock);
+
+static void shutdown_umh(struct umh_info *info)
+{
+	struct task_struct *tsk;
+
+	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
+	if (tsk)
+		force_sig(SIGKILL, tsk);
+	fput(info->pipe_to_umh);
+	fput(info->pipe_from_umh);
+}
+
+static void __stop_umh(void)
+{
+	if (bpfilter_process_sockopt) {
+		bpfilter_process_sockopt = NULL;
+		shutdown_umh(&info);
+	}
+}
+
+static void stop_umh(void)
+{
+	mutex_lock(&bpfilter_lock);
+	__stop_umh();
+	mutex_unlock(&bpfilter_lock);
+}
+
+static int __bpfilter_process_sockopt(struct sock *sk, int optname,
+				      char __user *optval,
+				      unsigned int optlen, bool is_set)
+{
+	struct mbox_request req;
+	struct mbox_reply reply;
+	loff_t pos;
+	ssize_t n;
+	int ret;
+
+	req.is_set = is_set;
+	req.pid = current->pid;
+	req.cmd = optname;
+	req.addr = (long)optval;
+	req.len = optlen;
+	mutex_lock(&bpfilter_lock);
+	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
+	if (n != sizeof(req)) {
+		pr_err("write fail %zd\n", n);
+		__stop_umh();
+		ret = -EFAULT;
+		goto out;
+	}
+	pos = 0;
+	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
+	if (n != sizeof(reply)) {
+		pr_err("read fail %zd\n", n);
+		__stop_umh();
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = reply.status;
+out:
+	mutex_unlock(&bpfilter_lock);
+	return ret;
+}
+
+static int __init load_umh(void)
+{
+	int err;
+
+	/* fork usermode process */
+	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
+	if (err)
+		return err;
+	pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
+
+	/* health check that usermode process started correctly */
+	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
+		stop_umh();
+		return -EFAULT;
+	}
+	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+	return 0;
+}
+
+static void __exit fini_umh(void)
+{
+	stop_umh();
+}
+module_init(load_umh);
+module_exit(fini_umh);
+MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
new file mode 100644
index 000000000000..81bbc1684896
--- /dev/null
+++ b/net/bpfilter/main.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "include/uapi/linux/bpf.h"
+#include <asm/unistd.h>
+#include "msgfmt.h"
+
+int debug_fd;
+
+static int handle_get_cmd(struct mbox_request *cmd)
+{
+	switch (cmd->cmd) {
+	case 0:
+		return 0;
+	default:
+		break;
+	}
+	return -ENOPROTOOPT;
+}
+
+static int handle_set_cmd(struct mbox_request *cmd)
+{
+	return -ENOPROTOOPT;
+}
+
+static void loop(void)
+{
+	while (1) {
+		struct mbox_request req;
+		struct mbox_reply reply;
+		int n;
+
+		n = read(0, &req, sizeof(req));
+		if (n != sizeof(req)) {
+			dprintf(debug_fd, "invalid request %d\n", n);
+			return;
+		}
+
+		reply.status = req.is_set ?
+			handle_set_cmd(&req) :
+			handle_get_cmd(&req);
+
+		n = write(1, &reply, sizeof(reply));
+		if (n != sizeof(reply)) {
+			dprintf(debug_fd, "reply failed %d\n", n);
+			return;
+		}
+	}
+}
+
+int main(void)
+{
+	debug_fd = open("/dev/console", 00000002 | 00000100);
+	dprintf(debug_fd, "Started bpfilter\n");
+	loop();
+	close(debug_fd);
+	return 0;
+}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
new file mode 100644
index 000000000000..98d121c62945
--- /dev/null
+++ b/net/bpfilter/msgfmt.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_BPFILTER_MSGFMT_H
+#define _NET_BPFILTER_MSGFMT_H
+
+struct mbox_request {
+	__u64 addr;
+	__u32 len;
+	__u32 is_set;
+	__u32 cmd;
+	__u32 pid;
+};
+
+struct mbox_reply {
+	__u32 status;
+};
+
+#endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520f9133..7018f91c5a39 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,6 +16,8 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
 	     metrics.o
 
+obj-$(CONFIG_BPFILTER) += bpfilter/
+
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 000000000000..ce262d76cc48
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BPFILTER) += sockopt.o
+
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 000000000000..42a96d2d8d05
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+#include <uapi/linux/bpf.h>
+#include <linux/wait.h>
+#include <linux/kmod.h>
+
+int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+				char __user *optval,
+				unsigned int optlen, bool is_set);
+EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
+
+int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
+			  unsigned int optlen, bool is_set)
+{
+	if (!bpfilter_process_sockopt) {
+		int err = request_module("bpfilter");
+
+		if (err)
+			return err;
+		if (!bpfilter_process_sockopt)
+			return -ECHILD;
+	}
+	return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
+}
+
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
+			    unsigned int optlen)
+{
+	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
+}
+
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
+			    int __user *optlen)
+{
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	return bpfilter_mbox_request(sk, optname, optval, len, false);
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5ad2d8ed3a3f..e0791faacb24 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,6 +47,8 @@
 #include <linux/errqueue.h>
 #include <linux/uaccess.h>
 
+#include <linux/bpfilter.h>
+
 /*
  *	SOL_IP control messages.
  */
@@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
 		return -ENOPROTOOPT;
 
 	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+	    optname < BPFILTER_IPT_SET_MAX)
+		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
 	int err;
 
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
 		MSG_CMSG_COMPAT);
 
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
-- 
cgit v1.2.3-55-g7522


From 404eb77ea766260c45cb05c4a8043b13bd7142d5 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu
Date: Tue, 22 May 2018 14:03:27 -0700
Subject: ipv4: support sport, dport and ip_proto in RTM_GETROUTE

This is a followup to fib rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h               |   3 +
 include/uapi/linux/rtnetlink.h |   3 +
 net/ipv4/Makefile              |   2 +-
 net/ipv4/fib_frontend.c        |   3 +
 net/ipv4/netlink.c             |  23 +++++++
 net/ipv4/route.c               | 146 ++++++++++++++++++++++++++++++-----------
 6 files changed, 140 insertions(+), 40 deletions(-)
 create mode 100644 net/ipv4/netlink.c

(limited to 'include/uapi')

diff --git a/include/net/ip.h b/include/net/ip.h
index bada1f1f871e..0d2281b4b27a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -664,4 +664,7 @@ extern int sysctl_icmp_msgs_burst;
 int ip_misc_proc_init(void);
 #endif
 
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack);
+
 #endif	/* _IP_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9b15005955fa..cabb210c93af 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -327,6 +327,9 @@ enum rtattr_type_t {
 	RTA_PAD,
 	RTA_UID,
 	RTA_TTL_PROPAGATE,
+	RTA_IP_PROTO,
+	RTA_SPORT,
+	RTA_DPORT,
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7018f91c5a39..eec9569ffa5c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o
+	     metrics.o netlink.o
 
 obj-$(CONFIG_BPFILTER) += bpfilter/
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d622112bf95..897ae92dff0f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -649,6 +649,9 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_ENCAP]		= { .type = NLA_NESTED },
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 000000000000..f86bb4f06609
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack)
+{
+	*ip_proto = nla_get_u8(attr);
+
+	switch (*ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		return 0;
+	default:
+		NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2cfa1b518f8d..0e401dc4e1bd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2574,11 +2574,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 /* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
-			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-			u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+			struct sk_buff *skb, u32 portid, u32 seq)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
@@ -2674,7 +2673,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			}
 		} else
 #endif
-			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
 				goto nla_put_failure;
 	}
 
@@ -2689,43 +2688,93 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+						   u8 ip_proto, __be16 sport,
+						   __be16 dport)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* Reserve room for dummy headers, this skb can pass
+	 * through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	iph = skb_put(skb, sizeof(struct iphdr));
+	iph->protocol = ip_proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	iph->version = 0x4;
+	iph->frag_off = 0;
+	iph->ihl = 0x5;
+	skb_set_transport_header(skb, skb->len);
+
+	switch (iph->protocol) {
+	case IPPROTO_UDP: {
+		struct udphdr *udph;
+
+		udph = skb_put_zero(skb, sizeof(struct udphdr));
+		udph->source = sport;
+		udph->dest = dport;
+		udph->len = sizeof(struct udphdr);
+		udph->check = 0;
+		break;
+	}
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph;
+
+		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+		tcph->source	= sport;
+		tcph->dest	= dport;
+		tcph->doff	= sizeof(struct tcphdr) / 4;
+		tcph->rst = 1;
+		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+					    src, dst, 0);
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr *icmph;
+
+		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+		icmph->type = ICMP_ECHO;
+		icmph->code = 0;
+	}
+	}
+
+	return skb;
+}
+
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(in_skb->sk);
-	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
+	u32 table_id = RT_TABLE_MAIN;
+	__be16 sport = 0, dport = 0;
 	struct fib_result res = {};
+	u8 ip_proto = IPPROTO_UDP;
 	struct rtable *rt = NULL;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
 	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
+	kuid_t uid;
 	u32 iif;
 	int err;
 	int mark;
-	struct sk_buff *skb;
-	u32 table_id = RT_TABLE_MAIN;
-	kuid_t uid;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
 			  extack);
 	if (err < 0)
-		goto errout;
+		return err;
 
 	rtm = nlmsg_data(nlh);
-
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!skb) {
-		err = -ENOBUFS;
-		goto errout;
-	}
-
-	/* Reserve room for dummy headers, this skb can pass
-	   through good chunk of routing engine.
-	 */
-	skb_reset_mac_header(skb);
-	skb_reset_network_header(skb);
-
 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2735,14 +2784,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		uid = (iif ? INVALID_UID : current_uid());
 
-	/* Bugfix: need to give ip_route_input enough of an IP header to
-	 * not gag.
-	 */
-	ip_hdr(skb)->protocol = IPPROTO_UDP;
-	ip_hdr(skb)->saddr = src;
-	ip_hdr(skb)->daddr = dst;
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &ip_proto, extack);
+		if (err)
+			return err;
+	}
+
+	if (tb[RTA_SPORT])
+		sport = nla_get_be16(tb[RTA_SPORT]);
 
-	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+	if (tb[RTA_DPORT])
+		dport = nla_get_be16(tb[RTA_DPORT]);
+
+	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+	if (!skb)
+		return -ENOBUFS;
 
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = dst;
@@ -2751,6 +2808,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_uid = uid;
+	if (sport)
+		fl4.fl4_sport = sport;
+	if (dport)
+		fl4.fl4_dport = dport;
+	fl4.flowi4_proto = ip_proto;
 
 	rcu_read_lock();
 
@@ -2760,10 +2822,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		dev = dev_get_by_index_rcu(net, iif);
 		if (!dev) {
 			err = -ENODEV;
-			goto errout_free;
+			goto errout_rcu;
 		}
 
-		skb->protocol	= htons(ETH_P_IP);
+		fl4.flowi4_iif = iif; /* for rt_fill_info */
 		skb->dev	= dev;
 		skb->mark	= mark;
 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2783,7 +2845,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	if (err)
-		goto errout_free;
+		goto errout_rcu;
 
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
@@ -2791,34 +2853,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
 		table_id = res.table ? res.table->tb_id : 0;
 
+	/* reset skb for netlink reply msg */
+	skb_trim(skb, 0);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_header(skb);
+
 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
 		if (!res.fi) {
 			err = fib_props[res.type].error;
 			if (!err)
 				err = -EHOSTUNREACH;
-			goto errout_free;
+			goto errout_rcu;
 		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
 				    rt->rt_type, res.prefix, res.prefixlen,
 				    fl4.flowi4_tos, res.fi, 0);
 	} else {
-		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 	}
 	if (err < 0)
-		goto errout_free;
+		goto errout_rcu;
 
 	rcu_read_unlock();
 
 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
-	return err;
 
 errout_free:
+	return err;
+errout_rcu:
 	rcu_read_unlock();
 	kfree_skb(skb);
-	goto errout;
+	goto errout_free;
 }
 
 void ip_rt_multicast_event(struct in_device *in_dev)
-- 
cgit v1.2.3-55-g7522


From dbecd7388476aedeb66389febea84d5450d28773 Mon Sep 17 00:00:00 2001
From: Sandipan Das
Date: Thu, 24 May 2018 12:26:48 +0530
Subject: bpf: get kernel symbol addresses via syscall

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of kernel symbol addresses for all functions in a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address
of the corresponding kernel symbol for a callee function
and resolve the symbol's name. The address is determined
by adding the value of the call instruction's imm field
to __bpf_call_base. This offset gets assigned to the imm
field by the verifier.

For some architectures, such as powerpc64, the imm field
is not large enough to hold this offset.

We resolve this by:

[1] Assigning the subprog id to the imm field of a call
    instruction in the verifier instead of the offset of
    the callee's symbol's address from __bpf_call_base.

[2] Determining the address of a callee's corresponding
    symbol by using the imm field as an index for the
    list of kernel symbol addresses now available from
    the program info.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 25 +++++++++++++++++++++++++
 kernel/bpf/verifier.c    |  7 +------
 3 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d06bc3..0be90965867d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b4c94551001..068a4fc79ddb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1981,6 +1982,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u64 __user *user_ksyms;
+			ulong ksym_addr;
+			u32 i;
+
+			/* copy the address of the kernel symbol
+			 * corresponding to each function
+			 */
+			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+			for (i = 0; i < ulen; i++) {
+				ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+				ksym_addr &= PAGE_MASK;
+				if (put_user((u64) ksym_addr, &user_ksyms[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_ksyms = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 559cb74ba29e..8c4d9d0fd3ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
-- 
cgit v1.2.3-55-g7522


From 815581c11cc29f74af252b6306ea1ec94160231a Mon Sep 17 00:00:00 2001
From: Sandipan Das
Date: Thu, 24 May 2018 12:26:52 +0530
Subject: bpf: get JITed image lengths of functions via syscall

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of the JITed image lengths of each function for a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

This can be used by userspace applications like bpftool
to split up the contiguous JITed dump, also obtained via
the system call, into more relatable chunks corresponding
to each function.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0be90965867d..344d2ddcef49 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c8e987a612b5..788456c18617 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2037,6 +2037,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_func_lens;
+	info.nr_jited_func_lens = prog->aux->func_cnt;
+	if (info.nr_jited_func_lens && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u32 __user *user_lens;
+			u32 func_len, i;
+
+			/* copy the JITed image lengths for each function */
+			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+			user_lens = u64_to_user_ptr(info.jited_func_lens);
+			for (i = 0; i < ulen; i++) {
+				func_len = prog->aux->func[i]->jited_len;
+				if (put_user(func_len, &user_lens[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_func_lens = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3-55-g7522


From fe94cc290f535709d3c5ebd1e472dfd0aec7ee79 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux
Date: Sun, 20 May 2018 14:58:14 +0100
Subject: bpf: Add IPv6 Segment Routing helpers

The BPF seg6local hook should be powerful enough to enable users to
implement most of the use-cases one could think of. After some thinking,
we figured out that the following actions should be possible on a SRv6
packet, requiring 3 specific helpers :
    - bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH
    - bpf_lwt_seg6_adjust_srh: Allow to grow or shrink a SRH
                               (to add/delete TLVs)
    - bpf_lwt_seg6_action: Apply some SRv6 network programming actions
                           (specifically End.X, End.T, End.B6 and
                            End.B6.Encap)

The specifications of these helpers are provided in the patch (see
include/uapi/linux/bpf.h).

The non-sensitive fields of the SRH are the following : flags, tag and
TLVs. The other fields can not be modified, to maintain the SRH
integrity. Flags, tag and TLVs can easily be modified as their validity
can be checked afterwards via seg6_validate_srh. It is not allowed to
modify the segments directly. If one wants to add segments on the path,
he should stack a new SRH using the End.B6 action via
bpf_lwt_seg6_action.

Growing, shrinking or editing TLVs via the helpers will flag the SRH as
invalid, and it will have to be re-validated before re-entering the IPv6
layer. This flag is stored in a per-CPU buffer, along with the current
header length in bytes.

Storing the SRH len in bytes in the control block is mandatory when using
bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH
len rounded to 8 bytes (a padding TLV can be inserted to ensure the 8-bytes
boundary). When adding/deleting TLVs within the BPF program, the SRH may
temporary be in an invalid state where its length cannot be rounded to 8
bytes without remainder, hence the need to store the length in bytes
separately. The caller of the BPF program can then ensure that the SRH's
final length is valid using this value. Again, a final SRH modified by a
BPF program which doesn’t respect the 8-bytes boundary will be discarded
as it will be considered as invalid.

Finally, a fourth helper is provided, bpf_lwt_push_encap, which is
available from the LWT BPF IN hook, but not from the seg6local BPF one.
This helper allows to encapsulate a Segment Routing Header (either with
a new outer IPv6 header, or by inlining it directly in the existing IPv6
header) into a non-SRv6 packet. This helper is required if we want to
offer the possibility to dynamically encapsulate a SRH for non-SRv6 packet,
as the BPF seg6local hook only works on traffic already containing a SRH.
This is the BPF equivalent of the seg6 LWT infrastructure, which achieves
the same purpose but with a static SRH per route.

These helpers require CONFIG_IPV6=y (and not =m).

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/seg6_local.h |   8 ++
 include/uapi/linux/bpf.h |  96 +++++++++++++++-
 net/core/filter.c        | 285 +++++++++++++++++++++++++++++++++++++++++++----
 net/ipv6/Kconfig         |   5 +
 net/ipv6/seg6_local.c    |   2 +
 5 files changed, 372 insertions(+), 24 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
index 57498b23085d..661fd5b4d3e0 100644
--- a/include/net/seg6_local.h
+++ b/include/net/seg6_local.h
@@ -15,10 +15,18 @@
 #ifndef _NET_SEG6_LOCAL_H
 #define _NET_SEG6_LOCAL_H
 
+#include <linux/percpu.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
 
 extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 			       u32 tbl_id);
 
+struct seg6_bpf_srh_state {
+	bool valid;
+	u16 hdrlen;
+};
+
+DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 344d2ddcef49..fdaf6a0bfa5b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1902,6 +1902,90 @@ union bpf_attr {
  *		egress otherwise). This is the only flag supported for now.
  *	Return
  *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1976,7 +2060,11 @@ union bpf_attr {
 	FN(fib_lookup),			\
 	FN(sock_hash_update),		\
 	FN(msg_redirect_hash),		\
-	FN(sk_redirect_hash),
+	FN(sk_redirect_hash),		\
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2043,6 +2131,12 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index ba3ff5aa575a..2e05dcfda6d7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -64,6 +64,10 @@
 #include <net/ip_fib.h>
 #include <net/flow.h>
 #include <net/arp.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3363,28 +3367,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
-bool bpf_helper_changes_pkt_data(void *func)
-{
-	if (func == bpf_skb_vlan_push ||
-	    func == bpf_skb_vlan_pop ||
-	    func == bpf_skb_store_bytes ||
-	    func == bpf_skb_change_proto ||
-	    func == bpf_skb_change_head ||
-	    func == bpf_skb_change_tail ||
-	    func == bpf_skb_adjust_room ||
-	    func == bpf_skb_pull_data ||
-	    func == bpf_clone_redirect ||
-	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta ||
-	    func == bpf_msg_pull_data ||
-	    func == bpf_xdp_adjust_tail)
-		return true;
-
-	return false;
-}
-
 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
 				  unsigned long off, unsigned long len)
 {
@@ -4360,6 +4342,264 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+	int err;
+	struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+	if (!seg6_validate_srh(srh, len))
+		return -EINVAL;
+
+	switch (type) {
+	case BPF_LWT_ENCAP_SEG6_INLINE:
+		if (skb->protocol != htons(ETH_P_IPV6))
+			return -EBADMSG;
+
+		err = seg6_do_srh_inline(skb, srh);
+		break;
+	case BPF_LWT_ENCAP_SEG6:
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+		err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	bpf_compute_data_pointers(skb);
+	if (err)
+		return err;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+	   u32, len)
+{
+	switch (type) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	case BPF_LWT_ENCAP_SEG6:
+	case BPF_LWT_ENCAP_SEG6_INLINE:
+		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
+	.func		= bpf_lwt_push_encap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+	   const void *, from, u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_tlvs, *srh_end, *ptr;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+	srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
+
+	ptr = skb->data + offset;
+	if (ptr >= srh_tlvs && ptr + len <= srh_end)
+		srh_state->valid = 0;
+	else if (ptr < (void *)&srh->flags ||
+		 ptr + len > (void *)&srh->segments)
+		return -EFAULT;
+
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
+		return -EFAULT;
+
+	memcpy(skb->data + offset, from, len);
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+	.func		= bpf_lwt_seg6_store_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+	   u32, action, void *, param, u32, param_len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int err;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	if (!srh_state->valid) {
+		if (unlikely((srh_state->hdrlen & 7) != 0))
+			return -EBADMSG;
+
+		srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
+		if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+			return -EBADMSG;
+
+		srh_state->valid = 1;
+	}
+
+	switch (action) {
+	case SEG6_LOCAL_ACTION_END_X:
+		if (param_len != sizeof(struct in6_addr))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+	case SEG6_LOCAL_ACTION_END_T:
+		if (param_len != sizeof(int))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+	case SEG6_LOCAL_ACTION_END_B6:
+		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
+					  param, param_len);
+		if (!err)
+			srh_state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
+					  param, param_len);
+		if (!err)
+			srh_state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	default:
+		return -EINVAL;
+	}
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+	.func		= bpf_lwt_seg6_action,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+	   s32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_end, *srh_tlvs, *ptr;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6hdr *hdr;
+	int srhoff = 0;
+	int ret;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
+			((srh->first_segment + 1) << 4));
+	srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
+			srh_state->hdrlen);
+	ptr = skb->data + offset;
+
+	if (unlikely(ptr < srh_tlvs || ptr > srh_end))
+		return -EFAULT;
+	if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
+		return -EFAULT;
+
+	if (len > 0) {
+		ret = skb_cow_head(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		ret = bpf_skb_net_hdr_push(skb, offset, len);
+	} else {
+		ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+	}
+
+	bpf_compute_data_pointers(skb);
+	if (unlikely(ret < 0))
+		return ret;
+
+	hdr = (struct ipv6hdr *)skb->data;
+	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+	srh_state->hdrlen += len;
+	srh_state->valid = 0;
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
+	.func		= bpf_lwt_seg6_adjust_srh,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
+{
+	if (func == bpf_skb_vlan_push ||
+	    func == bpf_skb_vlan_pop ||
+	    func == bpf_skb_store_bytes ||
+	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_head ||
+	    func == bpf_skb_change_tail ||
+	    func == bpf_skb_adjust_room ||
+	    func == bpf_skb_pull_data ||
+	    func == bpf_clone_redirect ||
+	    func == bpf_l3_csum_replace ||
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data ||
+	    func == bpf_xdp_adjust_tail ||
+	    func == bpf_lwt_push_encap ||
+	    func == bpf_lwt_seg6_store_bytes ||
+	    func == bpf_lwt_seg6_adjust_srh ||
+	    func == bpf_lwt_seg6_action
+	    )
+		return true;
+
+	return false;
+}
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4774,7 +5014,6 @@ static bool lwt_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
-
 /* Attach type specific accesses */
 static bool __sock_filter_check_attach_type(int off,
 					    enum bpf_access_type access_type,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 11e4e80cf7e9..0eff75525da1 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC
 
 	  If unsure, say N.
 
+config IPV6_SEG6_BPF
+	def_bool y
+	depends on IPV6_SEG6_LWTUNNEL
+	depends on IPV6 = y
+
 endif # IPV6
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index e9b23fb924ad..ae68c1ef8fb0 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -449,6 +449,8 @@ drop:
 	return err;
 }
 
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
-- 
cgit v1.2.3-55-g7522


From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux
Date: Sun, 20 May 2018 14:58:16 +0100
Subject: ipv6: sr: Add seg6local action End.BPF

This patch adds the End.BPF action to the LWT seg6local infrastructure.
This action works like any other seg6local End action, meaning that an IPv6
header with SRH is needed, whose DA has to be equal to the SID of the
action. It will also advance the SRH to the next segment, the BPF program
does not have to take care of this.

Since the BPF program may not be a source of instability in the kernel, it
is important to ensure that the integrity of the packet is maintained
before yielding it back to the IPv6 layer. The hook hence keeps track if
the SRH has been altered through the helpers, and re-validates its
content if needed with seg6_validate_srh. The state kept for validation is
stored in a per-CPU buffer. The BPF program is not allowed to directly
write into the packet, and only some fields of the SRH can be altered
through the helper bpf_lwt_seg6_store_bytes.

Performances profiling has shown that the SRH re-validation does not induce
a significant overhead. If the altered SRH is deemed as invalid, the packet
is dropped.

This validation is also done before executing any action through
bpf_lwt_seg6_action, and will not be performed again if the SRH is not
modified after calling the action.

The BPF program may return 3 types of return codes:
    - BPF_OK: the End.BPF action will look up the next destination through
             seg6_lookup_nexthop.
    - BPF_REDIRECT: if an action has been executed through the
          bpf_lwt_seg6_action helper, the BPF program should return this
          value, as the skb's destination is already set and the default
          lookup should not be performed.
    - BPF_DROP : the packet will be dropped.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h       |   1 +
 include/uapi/linux/bpf.h        |   1 +
 include/uapi/linux/seg6_local.h |  12 +++
 kernel/bpf/verifier.c           |   1 +
 net/core/filter.c               |  25 ++++++
 net/ipv6/seg6_local.c           | 168 +++++++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.c          |   1 +
 7 files changed, 207 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index aa5c8b878474..b161e506dcfc 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -12,6 +12,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fdaf6a0bfa5b..e95fec90c2c1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -141,6 +141,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h
index ef2d8c3e76c1..edc138bdc56d 100644
--- a/include/uapi/linux/seg6_local.h
+++ b/include/uapi/linux/seg6_local.h
@@ -25,6 +25,7 @@ enum {
 	SEG6_LOCAL_NH6,
 	SEG6_LOCAL_IIF,
 	SEG6_LOCAL_OIF,
+	SEG6_LOCAL_BPF,
 	__SEG6_LOCAL_MAX,
 };
 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
@@ -59,10 +60,21 @@ enum {
 	SEG6_LOCAL_ACTION_END_AS	= 13,
 	/* forward to SR-unaware VNF with masquerading */
 	SEG6_LOCAL_ACTION_END_AM	= 14,
+	/* custom BPF action */
+	SEG6_LOCAL_ACTION_END_BPF	= 15,
 
 	__SEG6_LOCAL_ACTION_MAX,
 };
 
 #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1)
 
+enum {
+	SEG6_LOCAL_BPF_PROG_UNSPEC,
+	SEG6_LOCAL_BPF_PROG,
+	SEG6_LOCAL_BPF_PROG_NAME,
+	__SEG6_LOCAL_BPF_PROG_MAX,
+};
+
+#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
+
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c4d9d0fd3ab..967cacf286ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5dc44309d124..aa114c4acb25 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4921,6 +4921,21 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_lwt_seg6_store_bytes:
+		return &bpf_lwt_seg6_store_bytes_proto;
+	case BPF_FUNC_lwt_seg6_action:
+		return &bpf_lwt_seg6_action_proto;
+	case BPF_FUNC_lwt_seg6_adjust_srh:
+		return &bpf_lwt_seg6_adjust_srh_proto;
+	default:
+		return lwt_out_func_proto(func_id, prog);
+	}
+}
+
 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
 				    const struct bpf_prog *prog,
 				    struct bpf_insn_access_aux *info)
@@ -6629,6 +6644,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+	.get_func_proto		= lwt_seg6local_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
 const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index ae68c1ef8fb0..cd6e4cab63f6 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1,8 +1,9 @@
 /*
  *  SR-IPv6 implementation
  *
- *  Author:
+ *  Authors:
  *  David Lebrun <david.lebrun@uclouvain.be>
+ *  eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
  *
  *
  *  This program is free software; you can redistribute it and/or
@@ -32,6 +33,7 @@
 #endif
 #include <net/seg6_local.h>
 #include <linux/etherdevice.h>
+#include <linux/bpf.h>
 
 struct seg6_local_lwt;
 
@@ -42,6 +44,11 @@ struct seg6_action_desc {
 	int static_headroom;
 };
 
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
 struct seg6_local_lwt {
 	int action;
 	struct ipv6_sr_hdr *srh;
@@ -50,6 +57,7 @@ struct seg6_local_lwt {
 	struct in6_addr nh6;
 	int iif;
 	int oif;
+	struct bpf_lwt_prog bpf;
 
 	int headroom;
 	struct seg6_action_desc *desc;
@@ -451,6 +459,69 @@ drop:
 
 DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
 
+static int input_action_end_bpf(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	struct seg6_bpf_srh_state local_srh_state;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int ret;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+	/* preempt_disable is needed to protect the per-CPU buffer srh_state,
+	 * which is also accessed by the bpf_lwt_seg6_* helpers
+	 */
+	preempt_disable();
+	srh_state->hdrlen = srh->hdrlen << 3;
+	srh_state->valid = 1;
+
+	rcu_read_lock();
+	bpf_compute_data_pointers(skb);
+	ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
+	rcu_read_unlock();
+
+	local_srh_state = *srh_state;
+	preempt_enable();
+
+	switch (ret) {
+	case BPF_OK:
+	case BPF_REDIRECT:
+		break;
+	case BPF_DROP:
+		goto drop;
+	default:
+		pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
+		goto drop;
+	}
+
+	if (unlikely((local_srh_state.hdrlen & 7) != 0))
+		goto drop;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		goto drop;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
+
+	if (!local_srh_state.valid &&
+	    unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+		goto drop;
+
+	if (ret != BPF_REDIRECT)
+		seg6_lookup_nexthop(skb, NULL, 0);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
@@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = {
 		.attrs		= (1 << SEG6_LOCAL_SRH),
 		.input		= input_action_end_b6_encap,
 		.static_headroom	= sizeof(struct ipv6hdr),
-	}
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_BPF,
+		.attrs		= (1 << SEG6_LOCAL_BPF),
+		.input		= input_action_end_bpf,
+	},
+
 };
 
 static struct seg6_action_desc *__get_action_desc(int action)
@@ -542,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
 				    .len = sizeof(struct in6_addr) },
 	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
 	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_BPF]	= { .type = NLA_NESTED },
 };
 
 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -719,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
 	return 0;
 }
 
+#define MAX_PROG_NAME 256
+static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
+	[SEG6_LOCAL_BPF_PROG]	   = { .type = NLA_U32, },
+	[SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				       .len = MAX_PROG_NAME },
+};
+
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX,
+			       attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME])
+		return -EINVAL;
+
+	slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL);
+	if (!slwt->bpf.name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]);
+	p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
+	if (IS_ERR(p)) {
+		kfree(slwt->bpf.name);
+		return PTR_ERR(p);
+	}
+
+	slwt->bpf.prog = p;
+	return 0;
+}
+
+static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *nest;
+
+	if (!slwt->bpf.prog)
+		return 0;
+
+	nest = nla_nest_start(skb, SEG6_LOCAL_BPF);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id))
+		return -EMSGSIZE;
+
+	if (slwt->bpf.name &&
+	    nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	if (!a->bpf.name && !b->bpf.name)
+		return 0;
+
+	if (!a->bpf.name || !b->bpf.name)
+		return 1;
+
+	return strcmp(a->bpf.name, b->bpf.name);
+}
+
 struct seg6_action_param {
 	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
 	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -749,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
 	[SEG6_LOCAL_OIF]	= { .parse = parse_nla_oif,
 				    .put = put_nla_oif,
 				    .cmp = cmp_nla_oif },
+
+	[SEG6_LOCAL_BPF]	= { .parse = parse_nla_bpf,
+				    .put = put_nla_bpf,
+				    .cmp = cmp_nla_bpf },
+
 };
 
 static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -834,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
 	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
 
 	kfree(slwt->srh);
+
+	if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
+		kfree(slwt->bpf.name);
+		bpf_prog_put(slwt->bpf.prog);
+	}
+
+	return;
 }
 
 static int seg6_local_fill_encap(struct sk_buff *skb,
@@ -886,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
 	if (attrs & (1 << SEG6_LOCAL_OIF))
 		nlsize += nla_total_size(4);
 
+	if (attrs & (1 << SEG6_LOCAL_BPF))
+		nlsize += nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) +
+		       nla_total_size(4);
+
 	return nlsize;
 }
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e5cd4a958846..d07e44474848 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1456,6 +1456,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
 	case BPF_PROG_TYPE_LWT_XMIT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
-- 
cgit v1.2.3-55-g7522


From e549f6f9c098067a99e9de8ac84f5cc2c07ae5c6 Mon Sep 17 00:00:00 2001
From: Huy Nguyen
Date: Thu, 22 Feb 2018 11:57:10 -0600
Subject: net/dcb: Add dcbnl buffer attribute

In this patch, we add dcbnl buffer attribute to allow user
change the NIC's buffer configuration such as priority
to buffer mapping and buffer size of individual buffer.

This attribute combined with pfc attribute allows advanced user to
fine tune the qos setting for specific priority queue. For example,
user can give dedicated buffer for one or more priorities or user
can give large buffer to certain priorities.

The dcb buffer configuration will be controlled by lldptool.
lldptool -T -i eth2 -V BUFFER prio 0,2,5,7,1,2,3,6
  maps priorities 0,1,2,3,4,5,6,7 to receive buffer 0,2,5,7,1,2,3,6
lldptool -T -i eth2 -V BUFFER size 87296,87296,0,87296,0,0,0,0
  sets receive buffer size for buffer 0,1,2,3,4,5,6,7 respectively

After discussion on mailing list with Jakub, Jiri, Ido and John, we agreed to
choose dcbnl over devlink interface since this feature is intended to set
port attributes which are governed by the netdev instance of that port, where
devlink API is more suitable for global ASIC configurations.

We present an use case scenario where dcbnl buffer attribute configured
by advance user helps reduce the latency of messages of different sizes.

Scenarios description:
On ConnectX-5, we run latency sensitive traffic with
small/medium message sizes ranging from 64B to 256KB and bandwidth sensitive
traffic with large messages sizes 512KB and 1MB. We group small, medium,
and large message sizes to their own pfc enables priorities as follow.
  Priorities 1 & 2 (64B, 256B and 1KB)
  Priorities 3 & 4 (4KB, 8KB, 16KB, 64KB, 128KB and 256KB)
  Priorities 5 & 6 (512KB and 1MB)

By default, ConnectX-5 maps all pfc enabled priorities to a single
lossless fixed buffer size of 50% of total available buffer space. The
other 50% is assigned to lossy buffer. Using dcbnl buffer attribute,
we create three equal size lossless buffers. Each buffer has 25% of total
available buffer space. Thus, the lossy buffer size reduces to 25%. Priority
to lossless  buffer mappings are set as follow.
  Priorities 1 & 2 on lossless buffer #1
  Priorities 3 & 4 on lossless buffer #2
  Priorities 5 & 6 on lossless buffer #3

We observe improvements in latency for small and medium message sizes
as follows. Please note that the large message sizes bandwidth performance is
reduced but the total bandwidth remains the same.
  256B message size (42 % latency reduction)
  4K message size (21% latency reduction)
  64K message size (16% latency reduction)

CC: Ido Schimmel <idosch@idosch.org>
CC: Jakub Kicinski <jakub.kicinski@netronome.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Or Gerlitz <gerlitz.or@gmail.com>
CC: Parav Pandit <parav@mellanox.com>
CC: Aron Silverton <aron.silverton@oracle.com>
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/dcbnl.h        |  4 ++++
 include/uapi/linux/dcbnl.h | 11 +++++++++++
 net/dcb/dcbnl.c            | 20 ++++++++++++++++++++
 3 files changed, 35 insertions(+)

(limited to 'include/uapi')

diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index 207d9ba1f92c..0e5e91be2d30 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -101,6 +101,10 @@ struct dcbnl_rtnl_ops {
 	/* CEE peer */
 	int (*cee_peer_getpg) (struct net_device *, struct cee_pg *);
 	int (*cee_peer_getpfc) (struct net_device *, struct cee_pfc *);
+
+	/* buffer settings */
+	int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *);
+	int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *);
 };
 
 #endif /* __NET_DCBNL_H__ */
diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index 2c0c6453c3f4..60aa2e446698 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -163,6 +163,16 @@ struct ieee_pfc {
 	__u64	indications[IEEE_8021QAZ_MAX_TCS];
 };
 
+#define IEEE_8021Q_MAX_PRIORITIES 8
+#define DCBX_MAX_BUFFERS  8
+struct dcbnl_buffer {
+	/* priority to buffer mapping */
+	__u8    prio2buffer[IEEE_8021Q_MAX_PRIORITIES];
+	/* buffer size in Bytes */
+	__u32   buffer_size[DCBX_MAX_BUFFERS];
+	__u32   total_size;
+};
+
 /* CEE DCBX std supported values */
 #define CEE_DCBX_MAX_PGS	8
 #define CEE_DCBX_MAX_PRIO	8
@@ -406,6 +416,7 @@ enum ieee_attrs {
 	DCB_ATTR_IEEE_MAXRATE,
 	DCB_ATTR_IEEE_QCN,
 	DCB_ATTR_IEEE_QCN_STATS,
+	DCB_ATTR_DCB_BUFFER,
 	__DCB_ATTR_IEEE_MAX
 };
 #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1)
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index bae7d78aa068..d2f4e0c1faaf 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -176,6 +176,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
 	[DCB_ATTR_IEEE_MAXRATE]   = {.len = sizeof(struct ieee_maxrate)},
 	[DCB_ATTR_IEEE_QCN]         = {.len = sizeof(struct ieee_qcn)},
 	[DCB_ATTR_IEEE_QCN_STATS]   = {.len = sizeof(struct ieee_qcn_stats)},
+	[DCB_ATTR_DCB_BUFFER]       = {.len = sizeof(struct dcbnl_buffer)},
 };
 
 /* DCB number of traffic classes nested attributes. */
@@ -1094,6 +1095,16 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 			return -EMSGSIZE;
 	}
 
+	if (ops->dcbnl_getbuffer) {
+		struct dcbnl_buffer buffer;
+
+		memset(&buffer, 0, sizeof(buffer));
+		err = ops->dcbnl_getbuffer(netdev, &buffer);
+		if (!err &&
+		    nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
+			return -EMSGSIZE;
+	}
+
 	app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
 	if (!app)
 		return -EMSGSIZE;
@@ -1453,6 +1464,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
 			goto err;
 	}
 
+	if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
+		struct dcbnl_buffer *buffer =
+			nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
+
+		err = ops->dcbnl_setbuffer(netdev, buffer);
+		if (err)
+			goto err;
+	}
+
 	if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
 		struct nlattr *attr;
 		int rem;
-- 
cgit v1.2.3-55-g7522


From 41bdc4b40ed6fb26c6acc655ed9a243a348709c9 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 24 May 2018 11:21:09 -0700
Subject: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/trace_events.h |  17 ++++++
 include/uapi/linux/bpf.h     |  26 +++++++++
 kernel/bpf/syscall.c         | 131 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c     |  48 ++++++++++++++++
 kernel/trace/trace_kprobe.c  |  29 ++++++++++
 kernel/trace/trace_uprobe.c  |  22 ++++++++
 6 files changed, 273 insertions(+)

(limited to 'include/uapi')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3eff564c..d34144a3c5b5 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
 {
 	return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+					  u32 *prog_id, u32 *fd_type,
+					  const char **buf, u64 *probe_offset,
+					  u64 *probe_addr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+			       u32 *fd_type, const char **symbol,
+			       u64 *probe_offset, u64 *probe_addr,
+			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+			       u32 *fd_type, const char **filename,
+			       u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e95fec90c2c1..9b8c6e310e9a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -380,6 +381,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2557,4 +2574,13 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 788456c18617..388d4feda348 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,7 +18,9 @@
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
@@ -2178,6 +2180,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	return btf_get_fd_by_id(attr->btf_id);
 }
 
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr,
+				    u32 prog_id, u32 fd_type,
+				    const char *buf, u64 probe_offset,
+				    u64 probe_addr)
+{
+	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+	u32 len = buf ? strlen(buf) : 0, input_len;
+	int err = 0;
+
+	if (put_user(len, &uattr->task_fd_query.buf_len))
+		return -EFAULT;
+	input_len = attr->task_fd_query.buf_len;
+	if (input_len && ubuf) {
+		if (!len) {
+			/* nothing to copy, just make ubuf NULL terminated */
+			char zero = '\0';
+
+			if (put_user(zero, ubuf))
+				return -EFAULT;
+		} else if (input_len >= len + 1) {
+			/* ubuf can hold the string with NULL terminator */
+			if (copy_to_user(ubuf, buf, len + 1))
+				return -EFAULT;
+		} else {
+			/* ubuf cannot hold the string with NULL terminator,
+			 * do a partial copy with NULL terminator.
+			 */
+			char zero = '\0';
+
+			err = -ENOSPC;
+			if (copy_to_user(ubuf, buf, input_len - 1))
+				return -EFAULT;
+			if (put_user(zero, ubuf + input_len - 1))
+				return -EFAULT;
+		}
+	}
+
+	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+		return -EFAULT;
+
+	return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	pid_t pid = attr->task_fd_query.pid;
+	u32 fd = attr->task_fd_query.fd;
+	const struct perf_event *event;
+	struct files_struct *files;
+	struct task_struct *task;
+	struct file *file;
+	int err;
+
+	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->task_fd_query.flags != 0)
+		return -EINVAL;
+
+	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+	if (!files)
+		return -ENOENT;
+
+	err = 0;
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (!file)
+		err = -EBADF;
+	else
+		get_file(file);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (err)
+		goto out;
+
+	if (file->f_op == &bpf_raw_tp_fops) {
+		struct bpf_raw_tracepoint *raw_tp = file->private_data;
+		struct bpf_raw_event_map *btp = raw_tp->btp;
+
+		err = bpf_task_fd_query_copy(attr, uattr,
+					     raw_tp->prog->aux->id,
+					     BPF_FD_TYPE_RAW_TRACEPOINT,
+					     btp->tp->name, 0, 0);
+		goto put_file;
+	}
+
+	event = perf_get_event(file);
+	if (!IS_ERR(event)) {
+		u64 probe_offset, probe_addr;
+		u32 prog_id, fd_type;
+		const char *buf;
+
+		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+					      &buf, &probe_offset,
+					      &probe_addr);
+		if (!err)
+			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+						     fd_type, buf,
+						     probe_offset,
+						     probe_addr);
+		goto put_file;
+	}
+
+	err = -ENOTSUPP;
+put_file:
+	fput(file);
+out:
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2264,6 +2392,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
 		break;
+	case BPF_TASK_FD_QUERY:
+		err = bpf_task_fd_query(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ce2cbbff27e4..81fdf2fc94ac 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
+#include <linux/syscalls.h>
 #include <linux/error-injection.h>
 
 #include "trace_probe.h"
@@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 	mutex_unlock(&bpf_event_mutex);
 	return err;
 }
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr)
+{
+	bool is_tracepoint, is_syscall_tp;
+	struct bpf_prog *prog;
+	int flags, err = 0;
+
+	prog = event->prog;
+	if (!prog)
+		return -ENOENT;
+
+	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+		return -EOPNOTSUPP;
+
+	*prog_id = prog->aux->id;
+	flags = event->tp_event->flags;
+	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+	if (is_tracepoint || is_syscall_tp) {
+		*buf = is_tracepoint ? event->tp_event->tp->name
+				     : event->tp_event->name;
+		*fd_type = BPF_FD_TYPE_TRACEPOINT;
+		*probe_offset = 0x0;
+		*probe_addr = 0x0;
+	} else {
+		/* kprobe/uprobe */
+		err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_KPROBE)
+			err = bpf_get_kprobe_info(event, fd_type, buf,
+						  probe_offset, probe_addr,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_UPROBE)
+			err = bpf_get_uprobe_info(event, fd_type, buf,
+						  probe_offset,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+	}
+
+	return err;
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 02aed76e0978..daa81571b22a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **symbol, u64 *probe_offset,
+			u64 *probe_addr, bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_kprobe *tk;
+
+	if (perf_type_tracepoint)
+		tk = find_trace_kprobe(pevent, group);
+	else
+		tk = event->tp_event->data;
+	if (!tk)
+		return -EINVAL;
+
+	*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+					      : BPF_FD_TYPE_KPROBE;
+	if (tk->symbol) {
+		*symbol = tk->symbol;
+		*probe_offset = tk->rp.kp.offset;
+		*probe_addr = 0;
+	} else {
+		*symbol = NULL;
+		*probe_offset = 0;
+		*probe_addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 /*
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ac892878dbe6..bf89a51e740d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 {
 	__uprobe_perf_func(tu, func, regs, ucb, dsize);
 }
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **filename, u64 *probe_offset,
+			bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_uprobe *tu;
+
+	if (perf_type_tracepoint)
+		tu = find_probe_event(pevent, group);
+	else
+		tu = event->tp_event->data;
+	if (!tu)
+		return -EINVAL;
+
+	*fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+				    : BPF_FD_TYPE_UPROBE;
+	*filename = tu->filename;
+	*probe_offset = tu->offset;
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
-- 
cgit v1.2.3-55-g7522


From 7d850abd5f4edb1b1ca4b4141a4453305736f564 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Thu, 24 May 2018 11:56:48 +0300
Subject: net: bridge: add support for port isolation

This patch adds support for a new port flag - BR_ISOLATED. If it is set
then isolated ports cannot communicate between each other, but they can
still communicate with non-isolated ports. The same can be achieved via
ACLs but they can't scale with large number of ports and also the
complexity of the rules grows. This feature can be used to achieve
isolated vlan functionality (similar to pvlan) as well, though currently
it will be port-wide (for all vlans on the port). The new test in
should_deliver uses data that is already cache hot and the new boolean
is used to avoid an additional source port test in should_deliver.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    | 1 +
 include/uapi/linux/if_link.h | 1 +
 net/bridge/br_forward.c      | 3 ++-
 net/bridge/br_input.c        | 1 +
 net/bridge/br_netlink.c      | 9 ++++++++-
 net/bridge/br_private.h      | 9 +++++++++
 net/bridge/br_sysfs_if.c     | 2 ++
 7 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 585d27182425..7843b98e1c6e 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -50,6 +50,7 @@ struct br_ip_list {
 #define BR_VLAN_TUNNEL		BIT(13)
 #define BR_BCAST_FLOOD		BIT(14)
 #define BR_NEIGH_SUPPRESS	BIT(15)
+#define BR_ISOLATED		BIT(16)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b85266420bfb..cf01b6824244 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -333,6 +333,7 @@ enum {
 	IFLA_BRPORT_BCAST_FLOOD,
 	IFLA_BRPORT_GROUP_FWD_MASK,
 	IFLA_BRPORT_NEIGH_SUPPRESS,
+	IFLA_BRPORT_ISOLATED,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7a7fd672ccf2..9019f326fe81 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
 	vg = nbp_vlan_group_rcu(p);
 	return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
 		br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
-		nbp_switchdev_allowed_egress(p, skb);
+		nbp_switchdev_allowed_egress(p, skb) &&
+		!br_skb_isolated(p, skb);
 }
 
 int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7f98a7d25866..72074276c088 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		goto drop;
 
 	BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+	BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
 
 	if (IS_ENABLED(CONFIG_INET) &&
 	    (skb->protocol == htons(ETH_P_ARP) ||
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 015f465c514b..9f5eb05b0373 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
 		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
 		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
+		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 							BR_VLAN_TUNNEL)) ||
 	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
 	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
-		       !!(p->flags & BR_NEIGH_SUPPRESS)))
+		       !!(p->flags & BR_NEIGH_SUPPRESS)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
 	[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
 	[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
+	[IFLA_BRPORT_ISOLATED]	= { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	if (err)
 		return err;
 
+	err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+	if (err)
+		return err;
+
 	br_port_flags_change(p, old_flags ^ p->flags);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 742f40aefdaf..11520ed528b0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -423,6 +423,7 @@ struct br_input_skb_cb {
 #endif
 
 	bool proxyarp_replied;
+	bool src_port_isolated;
 
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
 	bool vlan_filtered;
@@ -574,6 +575,14 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
 void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
 
+/* return true if both source port and dest port are isolated */
+static inline bool br_skb_isolated(const struct net_bridge_port *to,
+				   const struct sk_buff *skb)
+{
+	return BR_INPUT_SKB_CB(skb)->src_port_isolated &&
+	       (to->flags & BR_ISOLATED);
+}
+
 /* br_if.c */
 void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
 int br_add_bridge(struct net *net, const char *name);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index fd31ad83ec7b..f99c5bf5c906 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
 BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
 BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
 BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
+BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_broadcast_flood,
 	&brport_attr_group_fwd_mask,
 	&brport_attr_neigh_suppress,
+	&brport_attr_isolated,
 	NULL
 };
 
-- 
cgit v1.2.3-55-g7522


From 5972be6b2495c6bffbf444497517fd1c070eef78 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei
Date: Thu, 24 May 2018 17:56:42 -0700
Subject: openvswitch: Add conntrack limit netlink definition

Define netlink messages and attributes to support user kernel
communication that uses the conntrack limit feature.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 713e56ce681f..863aabaa5cc9 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -937,4 +937,32 @@ enum ovs_meter_band_type {
 
 #define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
 
+/* Conntrack limit */
+#define OVS_CT_LIMIT_FAMILY  "ovs_ct_limit"
+#define OVS_CT_LIMIT_MCGROUP "ovs_ct_limit"
+#define OVS_CT_LIMIT_VERSION 0x1
+
+enum ovs_ct_limit_cmd {
+	OVS_CT_LIMIT_CMD_UNSPEC,
+	OVS_CT_LIMIT_CMD_SET,		/* Add or modify ct limit. */
+	OVS_CT_LIMIT_CMD_DEL,		/* Delete ct limit. */
+	OVS_CT_LIMIT_CMD_GET		/* Get ct limit. */
+};
+
+enum ovs_ct_limit_attr {
+	OVS_CT_LIMIT_ATTR_UNSPEC,
+	OVS_CT_LIMIT_ATTR_ZONE_LIMIT,	/* Nested struct ovs_zone_limit. */
+	__OVS_CT_LIMIT_ATTR_MAX
+};
+
+#define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
+
+#define OVS_ZONE_LIMIT_DEFAULT_ZONE -1
+
+struct ovs_zone_limit {
+	int zone_id;
+	__u32 limit;
+	__u32 count;
+};
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
cgit v1.2.3-55-g7522


From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov
Date: Fri, 25 May 2018 08:55:23 -0700
Subject: bpf: Hooks for sys_sendmsg

In addition to already existing BPF hooks for sys_bind and sys_connect,
the patch provides new hooks for sys_sendmsg.

It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
that provides access to socket itlself (properties like family, type,
protocol) and user-passed `struct sockaddr *` so that BPF program can
override destination IP and port for system calls such as sendto(2) or
sendmsg(2) and/or assign source IP to the socket.

The hooks are implemented as two new attach types:
`BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
UDPv6 correspondingly.

UDPv4 and UDPv6 separate attach types for same reason as sys_bind and
sys_connect hooks, i.e. to prevent reading from / writing to e.g.
user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.

The difference with already existing hooks is sys_sendmsg are
implemented only for unconnected UDP.

For TCP it doesn't make sense to change user-provided `struct sockaddr *`
at sendto(2)/sendmsg(2) time since socket either was already connected
and has source/destination set or wasn't connected and call to
sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.

Connected UDP is already handled by sys_connect hooks that can override
source/destination at connect time and use fast-path later, i.e. these
hooks don't affect UDP fast-path.

Rewriting source IP is implemented differently than that in sys_connect
hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
just bind socket to desired local IP address since source IP can be set
on per-packet basis by using ancillary data (cmsg(3)). So no matter if
socket is bound or not, source IP has to be rewritten on every call to
sys_sendmsg.

To do so two new fields are added to UAPI `struct bpf_sock_addr`;
* `msg_src_ip4` to set source IPv4 for UDPv4;
* `msg_src_ip6` to set source IPv6 for UDPv6.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 23 +++++++++++++++++------
 include/linux/filter.h     |  1 +
 include/uapi/linux/bpf.h   |  8 ++++++++
 kernel/bpf/cgroup.c        | 11 ++++++++++-
 kernel/bpf/syscall.c       |  8 ++++++++
 net/core/filter.c          | 39 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/udp.c             | 20 ++++++++++++++++++--
 net/ipv6/udp.c             | 24 ++++++++++++++++++++++++
 8 files changed, 125 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index de8e89a3758b..975fb4cf1bb7 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type);
+				      enum bpf_attach_type type,
+				      void *t_ctx);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
@@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  NULL);	       \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)			       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)	{					       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  t_ctx);	       \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
+
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
+
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
@@ -198,6 +207,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d358d1815c16..d90abdaea94b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern {
 	 * only two (src and dst) are available at convert_ctx_access time
 	 */
 	u64 tmp_reg;
+	void *t_ctx;	/* Attach type specific context. */
 };
 
 struct bpf_sock_ops_kern {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9b8c6e310e9a..cc68787f2d97 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -160,6 +160,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2363,6 +2365,12 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43171a0bb02b..f7c00bd6f8e4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
  * @type: The type of program to be exectuted
+ * @t_ctx: Pointer to attach type specific context
  *
  * socket is expected to be of type INET or INET6.
  *
@@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type)
+				      enum bpf_attach_type type,
+				      void *t_ctx)
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
 		.uaddr = uaddr,
+		.t_ctx = t_ctx,
 	};
+	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
+	if (!ctx.uaddr) {
+		memset(&unspec, 0, sizeof(unspec));
+		ctx.uaddr = (struct sockaddr *)&unspec;
+	}
+
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 388d4feda348..e254526d6744 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1249,6 +1249,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
+		case BPF_CGROUP_UDP6_SENDMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -1565,6 +1567,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1635,6 +1639,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1692,6 +1698,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
diff --git a/net/core/filter.c b/net/core/filter.c
index acf1f4fb99d1..24e6ce8be567 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5299,6 +5299,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET4_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
 			break;
 		default:
 			return false;
@@ -5308,6 +5309,24 @@ static bool sock_addr_is_valid_access(int off, int size,
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP6_SENDMSG:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_UDP4_SENDMSG:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_UDP6_SENDMSG:
 			break;
 		default:
 			return false;
@@ -5318,6 +5337,9 @@ static bool sock_addr_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
 	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_READ) {
 			bpf_ctx_record_field_size(info, size_default);
@@ -6072,6 +6094,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
 					SK_FL_PROTO_SHIFT);
 		break;
+
+	case offsetof(struct bpf_sock_addr, msg_src_ip4):
+		/* Treat t_ctx as struct in_addr for msg_src_ip4. */
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+			s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+		break;
+
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
+		off = si->off;
+		off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+		/* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d71f1f3e1155..3c27d00b5730 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -901,6 +901,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct udp_sock *up = udp_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
 	struct flowi4 fl4_stack;
 	struct flowi4 *fl4;
 	int ulen = len;
@@ -955,8 +956,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	/*
 	 *	Get and verify the address.
 	 */
-	if (msg->msg_name) {
-		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+	if (usin) {
 		if (msg->msg_namelen < sizeof(*usin))
 			return -EINVAL;
 		if (usin->sin_family != AF_INET) {
@@ -1010,6 +1010,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		rcu_read_unlock();
 	}
 
+	if (cgroup_bpf_enabled && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+					    (struct sockaddr *)usin, &ipc.addr);
+		if (err)
+			goto out_free;
+		if (usin) {
+			if (usin->sin_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_free;
+			}
+			daddr = usin->sin_addr.s_addr;
+			dport = usin->sin_port;
+		}
+	}
+
 	saddr = ipc.addr;
 	ipc.addr = faddr = daddr;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 426c9d2b418d..9f729a7b8cf0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1316,6 +1316,29 @@ do_udp_sendmsg:
 		fl6.saddr = np->saddr;
 	fl6.fl6_sport = inet->inet_sport;
 
+	if (cgroup_bpf_enabled && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
+					   (struct sockaddr *)sin6, &fl6.saddr);
+		if (err)
+			goto out_no_dst;
+		if (sin6) {
+			if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+				/* BPF program rewrote IPv6-only by IPv4-mapped
+				 * IPv6. It's currently unsupported.
+				 */
+				err = -ENOTSUPP;
+				goto out_no_dst;
+			}
+			if (sin6->sin6_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_no_dst;
+			}
+			fl6.fl6_dport = sin6->sin6_port;
+			fl6.daddr = sin6->sin6_addr;
+		}
+	}
+
 	final_p = fl6_update_dst(&fl6, opt, &final);
 	if (final_p)
 		connected = false;
@@ -1395,6 +1418,7 @@ do_append_data:
 
 out:
 	dst_release(dst);
+out_no_dst:
 	fl6_sock_release(flowlabel);
 	txopt_put(opt_to_free);
 	if (!err)
-- 
cgit v1.2.3-55-g7522


From 9805069d14c1b0b66b1600ea60cfc08f94841bd8 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala
Date: Thu, 24 May 2018 09:55:16 -0700
Subject: virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit

This feature bit can be used by hypervisor to indicate virtio_net device to
act as a standby for another device with the same MAC address.

VIRTIO_NET_F_STANDBY is defined as bit 62 as it is a device feature bit.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c        | 2 +-
 include/uapi/linux/virtio_net.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b2647dd5d302..fe4534ee50a1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3030,7 +3030,7 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
-	VIRTIO_NET_F_SPEED_DUPLEX
+	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 5de6ed37695b..a3715a3224c1 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
 
+#define VIRTIO_NET_F_STANDBY	  62	/* Act as standby for another device
+					 * with the same MAC.
+					 */
 #define VIRTIO_NET_F_SPEED_DUPLEX 63	/* Device set linkspeed and duplex */
 
 #ifndef VIRTIO_NET_NO_LEGACY
-- 
cgit v1.2.3-55-g7522


From 620dee9415fae09003d012cc7c23b011e5ebb43d Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Sun, 27 May 2018 08:09:56 -0700
Subject: net: Add IFA_RT_PRIORITY address attribute

Currently, if two interfaces have addresses in the same connected route,
then the order of the prefix route entries is determined by the order in
which the addresses are assigned or the links brought up.

Add IFA_RT_PRIORITY to allow user to specify the metric of the prefix
route associated with an address giving interface managers better
control of the order of prefix routes.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index 2ef053d265de..ebaf5701c9db 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -33,6 +33,7 @@ enum {
 	IFA_CACHEINFO,
 	IFA_MULTICAST,
 	IFA_FLAGS,
+	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
 	__IFA_MAX,
 };
 
-- 
cgit v1.2.3-55-g7522


From 7a279e93330bd4f0074973293f1cd1aacf7c5fa6 Mon Sep 17 00:00:00 2001
From: Quentin Monnet
Date: Tue, 29 May 2018 12:27:44 +0100
Subject: bpf: clean up eBPF helpers documentation

These are minor edits for the eBPF helpers documentation in
include/uapi/linux/bpf.h.

The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends
with a non-escaped underscore that gets interpreted by rst2man and
produces the following message in the resulting manual page:

    DOCUTILS SYSTEM MESSAGES
           System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514)
                  Unknown target name: "bpf_fib_lookup".

Other edits consist in:

- Improving formatting for flag values for "bpf_fib_lookup()" helper.
- Emphasising a parameter name in description of the return value for
  "bpf_get_stack()" helper.
- Removing unnecessary blank lines between "Description" and "Return"
  sections for the few helpers that would use it, for consistency.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h       | 21 ++++++++++-----------
 tools/include/uapi/linux/bpf.h | 21 ++++++++++-----------
 2 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cc68787f2d97..3f556b35ac8d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1010,7 +1010,6 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
@@ -1821,10 +1820,9 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
- * 		a non-negative value equal to or less than size on success, or
- * 		a negative error in case of failure.
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
  *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
@@ -1845,7 +1843,6 @@ union bpf_attr {
  * 		in socket filters where *skb*\ **->data** does not always point
  * 		to the start of the mac header and where "direct packet access"
  * 		is not available.
- *
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1861,16 +1858,18 @@ union bpf_attr {
  *		rt_metric is set to metric from route.
  *
  *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
  *
- *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
- *             full lookup using FIB rules
- *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
- *             perspective (default is ingress)
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
- *
  *     Return
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cc68787f2d97..3f556b35ac8d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1010,7 +1010,6 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
@@ -1821,10 +1820,9 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
- * 		a non-negative value equal to or less than size on success, or
- * 		a negative error in case of failure.
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
  *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
@@ -1845,7 +1843,6 @@ union bpf_attr {
  * 		in socket filters where *skb*\ **->data** does not always point
  * 		to the start of the mac header and where "direct packet access"
  * 		is not available.
- *
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1861,16 +1858,18 @@ union bpf_attr {
  *		rt_metric is set to metric from route.
  *
  *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
  *
- *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
- *             full lookup using FIB rules
- *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
- *             perspective (default is ingress)
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
- *
  *     Return
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
-- 
cgit v1.2.3-55-g7522


From fa898d769b264ede3c10cc30a537316c6a946956 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Tue, 29 May 2018 10:58:07 -0700
Subject: bpf: Drop mpls from bpf_fib_lookup

MPLS support will not be submitted this dev cycle, but in working on it
I do see a few changes are needed to the API. For now, drop mpls from the
API. Since the fields in question are unions, the mpls fields can be added
back later without affecting the uapi.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f556b35ac8d..33f37eb0b6bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1852,10 +1852,10 @@ union bpf_attr {
  *		If lookup is successful and result shows packet is to be
  *		forwarded, the neighbor tables are searched for the nexthop.
  *		If successful (ie., FIB lookup shows forwarding and nexthop
- *		is resolved), the nexthop address is returned in ipv4_dst,
- *		ipv6_dst or mpls_out based on family, smac is set to mac
- *		address of egress device, dmac is set to nexthop mac address,
- *		rt_metric is set to metric from route.
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only).
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -2537,8 +2537,10 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
 struct bpf_fib_lookup {
-	/* input */
-	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
 
 	/* set if lookup is to consider L4 data - e.g., FIB rules */
 	__u8	l4_protocol;
@@ -2554,22 +2556,20 @@ struct bpf_fib_lookup {
 		__u8	tos;		/* AF_INET  */
 		__be32	flowlabel;	/* AF_INET6 */
 
-		/* output: metric of fib result */
-		__u32 rt_metric;
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
 	};
 
 	union {
-		__be32		mpls_in;
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
 	};
 
-	/* input to bpf_fib_lookup, *dst is destination address.
-	 * output: bpf_fib_lookup sets to gateway address
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
 	 */
 	union {
-		/* return for MPLS lookups */
-		__be32		mpls_out[4];  /* support up to 4 labels */
 		__be32		ipv4_dst;
 		__u32		ipv6_dst[4];  /* in6_addr; network order */
 	};
-- 
cgit v1.2.3-55-g7522


From f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936 Mon Sep 17 00:00:00 2001
From: Sean Young
Date: Sun, 27 May 2018 12:24:09 +0100
Subject: media: rc: introduce BPF_PROG_LIRC_MODE2

Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call
rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
that the last key should be repeated.

The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
the target_fd must be the /dev/lircN device.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/media/rc/Kconfig        |  13 ++
 drivers/media/rc/Makefile       |   1 +
 drivers/media/rc/bpf-lirc.c     | 313 ++++++++++++++++++++++++++++++++++++++++
 drivers/media/rc/lirc_dev.c     |  30 ++++
 drivers/media/rc/rc-core-priv.h |  21 +++
 drivers/media/rc/rc-ir-raw.c    |  12 +-
 include/linux/bpf_lirc.h        |  29 ++++
 include/linux/bpf_types.h       |   3 +
 include/uapi/linux/bpf.h        |  53 ++++++-
 kernel/bpf/syscall.c            |   7 +
 10 files changed, 479 insertions(+), 3 deletions(-)
 create mode 100644 drivers/media/rc/bpf-lirc.c
 create mode 100644 include/linux/bpf_lirc.h

(limited to 'include/uapi')

diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index eb2c3b6eca7f..d5b35a6ba899 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -25,6 +25,19 @@ config LIRC
 	   passes raw IR to and from userspace, which is needed for
 	   IR transmitting (aka "blasting") and for the lirc daemon.
 
+config BPF_LIRC_MODE2
+	bool "Support for eBPF programs attached to lirc devices"
+	depends on BPF_SYSCALL
+	depends on RC_CORE=y
+	depends on LIRC
+	help
+	   Allow attaching eBPF programs to a lirc device using the bpf(2)
+	   syscall command BPF_PROG_ATTACH. This is supported for raw IR
+	   receivers.
+
+	   These eBPF programs can be used to decode IR into scancodes, for
+	   IR protocols not supported by the kernel decoders.
+
 menuconfig RC_DECODERS
 	bool "Remote controller decoders"
 	depends on RC_CORE
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
index 2e1c87066f6c..e0340d043fe8 100644
--- a/drivers/media/rc/Makefile
+++ b/drivers/media/rc/Makefile
@@ -5,6 +5,7 @@ obj-y += keymaps/
 obj-$(CONFIG_RC_CORE) += rc-core.o
 rc-core-y := rc-main.o rc-ir-raw.o
 rc-core-$(CONFIG_LIRC) += lirc_dev.o
+rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o
 obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
 obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
 obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
new file mode 100644
index 000000000000..40826bba06b6
--- /dev/null
+++ b/drivers/media/rc/bpf-lirc.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+// bpf-lirc.c - handles bpf
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_lirc.h>
+#include "rc-core-priv.h"
+
+/*
+ * BPF interface for raw IR
+ */
+const struct bpf_prog_ops lirc_mode2_prog_ops = {
+};
+
+BPF_CALL_1(bpf_rc_repeat, u32*, sample)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	rc_repeat(ctrl->dev);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_repeat_proto = {
+	.func	   = bpf_rc_repeat,
+	.gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+};
+
+/*
+ * Currently rc-core does not support 64-bit scancodes, but there are many
+ * known protocols with more than 32 bits. So, define the interface as u64
+ * as a future-proof.
+ */
+BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode,
+	   u32, toggle)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_keydown_proto = {
+	.func	   = bpf_rc_keydown,
+	.gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_rc_repeat:
+		return &rc_repeat_proto;
+	case BPF_FUNC_rc_keydown:
+		return &rc_keydown_proto;
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_prandom_u32:
+		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+		/* fall through */
+	default:
+		return NULL;
+	}
+}
+
+static bool lirc_mode2_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	/* We have one field of u32 */
+	return type == BPF_READ && off == 0 && size == sizeof(u32);
+}
+
+const struct bpf_verifier_ops lirc_mode2_verifier_ops = {
+	.get_func_proto  = lirc_mode2_func_proto,
+	.is_valid_access = lirc_mode2_is_valid_access
+};
+
+#define BPF_MAX_PROGS 64
+
+static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	struct ir_raw_event_ctrl *raw;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) {
+		ret = -E2BIG;
+		goto unlock;
+	}
+
+	old_array = raw->progs;
+	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	if (ret < 0)
+		goto unlock;
+
+	rcu_assign_pointer(raw->progs, new_array);
+	bpf_prog_array_free(old_array);
+
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	struct ir_raw_event_ctrl *raw;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	old_array = raw->progs;
+	ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array);
+	/*
+	 * Do not use bpf_prog_array_delete_safe() as we would end up
+	 * with a dummy entry in the array, and the we would free the
+	 * dummy in lirc_bpf_free()
+	 */
+	if (ret)
+		goto unlock;
+
+	rcu_assign_pointer(raw->progs, new_array);
+	bpf_prog_array_free(old_array);
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
+{
+	struct ir_raw_event_ctrl *raw = rcdev->raw;
+
+	raw->bpf_sample = sample;
+
+	if (raw->progs)
+		BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN);
+}
+
+/*
+ * This should be called once the rc thread has been stopped, so there can be
+ * no concurrent bpf execution.
+ */
+void lirc_bpf_free(struct rc_dev *rcdev)
+{
+	struct bpf_prog **progs;
+
+	if (!rcdev->raw->progs)
+		return;
+
+	progs = rcu_dereference(rcdev->raw->progs)->progs;
+	while (*progs)
+		bpf_prog_put(*progs++);
+
+	bpf_prog_array_free(rcdev->raw->progs);
+}
+
+int lirc_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_LIRC_MODE2);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = lirc_bpf_attach(rcdev, prog);
+	if (ret)
+		bpf_prog_put(prog);
+
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int lirc_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_LIRC_MODE2);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = lirc_bpf_detach(rcdev, prog);
+
+	bpf_prog_put(prog);
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	struct bpf_prog_array __rcu *progs;
+	struct rc_dev *rcdev;
+	u32 cnt, flags = 0;
+	int ret;
+
+	if (attr->query.query_flags)
+		return -EINVAL;
+
+	rcdev = rc_dev_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(rcdev))
+		return PTR_ERR(rcdev);
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW) {
+		ret = -EINVAL;
+		goto put;
+	}
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		goto put;
+
+	progs = rcdev->raw->progs;
+	cnt = progs ? bpf_prog_array_length(progs) : 0;
+
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	if (attr->query.prog_cnt != 0 && prog_ids && cnt)
+		ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt);
+
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+put:
+	put_device(&rcdev->dev);
+
+	return ret;
+}
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 24e9fbb80e81..da7013a12a58 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/device.h>
+#include <linux/file.h>
 #include <linux/idr.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
@@ -104,6 +105,12 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev)
 			TO_US(ev.duration), TO_STR(ev.pulse));
 	}
 
+	/*
+	 * bpf does not care about the gap generated above; that exists
+	 * for backwards compatibility
+	 */
+	lirc_bpf_run(dev, sample);
+
 	spin_lock_irqsave(&dev->lirc_fh_lock, flags);
 	list_for_each_entry(fh, &dev->lirc_fh, list) {
 		if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports)
@@ -816,4 +823,27 @@ void __exit lirc_dev_exit(void)
 	unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX);
 }
 
+struct rc_dev *rc_dev_get_from_fd(int fd)
+{
+	struct fd f = fdget(fd);
+	struct lirc_fh *fh;
+	struct rc_dev *dev;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &lirc_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fh = f.file->private_data;
+	dev = fh->rc;
+
+	get_device(&dev->dev);
+	fdput(f);
+
+	return dev;
+}
+
 MODULE_ALIAS("lirc_dev");
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index e0e6a17460f6..eb004757038b 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -13,6 +13,7 @@
 #define	MAX_IR_EVENT_SIZE	512
 
 #include <linux/slab.h>
+#include <uapi/linux/bpf.h>
 #include <media/rc-core.h>
 
 /**
@@ -57,6 +58,11 @@ struct ir_raw_event_ctrl {
 	/* raw decoder state follows */
 	struct ir_raw_event prev_ev;
 	struct ir_raw_event this_ev;
+
+#ifdef CONFIG_BPF_LIRC_MODE2
+	u32				bpf_sample;
+	struct bpf_prog_array __rcu	*progs;
+#endif
 	struct nec_dec {
 		int state;
 		unsigned count;
@@ -126,6 +132,9 @@ struct ir_raw_event_ctrl {
 	} imon;
 };
 
+/* Mutex for locking raw IR processing and handler change */
+extern struct mutex ir_raw_handler_lock;
+
 /* macros for IR decoders */
 static inline bool geq_margin(unsigned d1, unsigned d2, unsigned margin)
 {
@@ -288,6 +297,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev);
 void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc);
 int ir_lirc_register(struct rc_dev *dev);
 void ir_lirc_unregister(struct rc_dev *dev);
+struct rc_dev *rc_dev_get_from_fd(int fd);
 #else
 static inline int lirc_dev_init(void) { return 0; }
 static inline void lirc_dev_exit(void) {}
@@ -299,4 +309,15 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; }
 static inline void ir_lirc_unregister(struct rc_dev *dev) { }
 #endif
 
+/*
+ * bpf interface
+ */
+#ifdef CONFIG_BPF_LIRC_MODE2
+void lirc_bpf_free(struct rc_dev *dev);
+void lirc_bpf_run(struct rc_dev *dev, u32 sample);
+#else
+static inline void lirc_bpf_free(struct rc_dev *dev) { }
+static inline void lirc_bpf_run(struct rc_dev *dev, u32 sample) { }
+#endif
+
 #endif /* _RC_CORE_PRIV */
diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c
index 374f83105a23..7675b7ee5bc7 100644
--- a/drivers/media/rc/rc-ir-raw.c
+++ b/drivers/media/rc/rc-ir-raw.c
@@ -14,7 +14,7 @@
 static LIST_HEAD(ir_raw_client_list);
 
 /* Used to handle IR raw handler extensions */
-static DEFINE_MUTEX(ir_raw_handler_lock);
+DEFINE_MUTEX(ir_raw_handler_lock);
 static LIST_HEAD(ir_raw_handler_list);
 static atomic64_t available_protocols = ATOMIC64_INIT(0);
 
@@ -621,9 +621,17 @@ void ir_raw_event_unregister(struct rc_dev *dev)
 	list_for_each_entry(handler, &ir_raw_handler_list, list)
 		if (handler->raw_unregister)
 			handler->raw_unregister(dev);
-	mutex_unlock(&ir_raw_handler_lock);
+
+	lirc_bpf_free(dev);
 
 	ir_raw_event_free(dev);
+
+	/*
+	 * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so
+	 * ensure that the raw member is null on unlock; this is how
+	 * "device gone" is checked.
+	 */
+	mutex_unlock(&ir_raw_handler_lock);
 }
 
 /*
diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h
new file mode 100644
index 000000000000..5f8a4283092d
--- /dev/null
+++ b/include/linux/bpf_lirc.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_LIRC_H
+#define _BPF_LIRC_H
+
+#include <uapi/linux/bpf.h>
+
+#ifdef CONFIG_BPF_LIRC_MODE2
+int lirc_prog_attach(const union bpf_attr *attr);
+int lirc_prog_detach(const union bpf_attr *attr);
+int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int lirc_prog_attach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int lirc_prog_detach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int lirc_prog_query(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+#endif
+
+#endif /* _BPF_LIRC_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b161e506dcfc..c5700c2d5549 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_BPF_LIRC_MODE2
+BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 33f37eb0b6bf..64ac0f7a689e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,6 +143,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
 };
 
 enum bpf_attach_type {
@@ -162,6 +163,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_POST_BIND,
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2005,6 +2007,53 @@ union bpf_attr {
  * 		direct packet access.
  *	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2083,7 +2132,9 @@ union bpf_attr {
 	FN(lwt_push_encap),		\
 	FN(lwt_seg6_store_bytes),	\
 	FN(lwt_seg6_adjust_srh),	\
-	FN(lwt_seg6_action),
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e254526d6744..7365d79ae00d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
 #include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
@@ -1582,6 +1583,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_attach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1654,6 +1657,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1703,6 +1708,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
+	case BPF_LIRC_MODE2:
+		return lirc_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-55-g7522


From 35aada99b5bfa7a9360f9653377688556f71edcb Mon Sep 17 00:00:00 2001
From: Donald Sharp
Date: Wed, 30 May 2018 08:27:32 -0400
Subject: rtnetlink: Add more well known protocol values

FRRouting installs routes into the kernel associated with
the originating protocol.  Add these values to the well
known values in rtnetlink.h.

Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cabb210c93af..7d8502313c99 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -254,6 +254,11 @@ enum {
 #define RTPROT_DHCP	16      /* DHCP client */
 #define RTPROT_MROUTED	17      /* Multicast daemon */
 #define RTPROT_BABEL	42      /* Babel daemon */
+#define RTPROT_BGP	186     /* BGP Routes */
+#define RTPROT_ISIS	187     /* ISIS Routes */
+#define RTPROT_OSPF	188     /* OSPF Routes */
+#define RTPROT_RIP	189     /* RIP Routes */
+#define RTPROT_EIGRP	192     /* EIGRP Routes */
 
 /* rtm_scope
 
-- 
cgit v1.2.3-55-g7522


From 554ced0a6e2946562c20d9fffdbaf2aa7da36b1b Mon Sep 17 00:00:00 2001
From: Máté Eckl
Date: Mon, 28 May 2018 09:15:33 +0200
Subject: netfilter: nf_tables: add support for native socket matching

Now it can only match the transparent flag of an ip/ipv6 socket.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  25 ++++++
 net/netfilter/Kconfig                    |   9 ++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_socket.c               | 143 +++++++++++++++++++++++++++++++
 4 files changed, 178 insertions(+)
 create mode 100644 net/netfilter/nft_socket.c

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9c71f024f9cc..3d46c82a5ebd 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -904,6 +904,31 @@ enum nft_rt_attributes {
 };
 #define NFTA_RT_MAX		(__NFTA_RT_MAX - 1)
 
+/**
+ * enum nft_socket_attributes - nf_tables socket expression netlink attributes
+ *
+ * @NFTA_SOCKET_KEY: socket key to match
+ * @NFTA_SOCKET_DREG: destination register
+ */
+enum nft_socket_attributes {
+	NFTA_SOCKET_UNSPEC,
+	NFTA_SOCKET_KEY,
+	NFTA_SOCKET_DREG,
+	__NFTA_SOCKET_MAX
+};
+#define NFTA_SOCKET_MAX		(__NFTA_SOCKET_MAX - 1)
+
+/*
+ * enum nft_socket_keys - nf_tables socket expression keys
+ *
+ * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_
+ */
+enum nft_socket_keys {
+	NFT_SOCKET_TRANSPARENT,
+	__NFT_SOCKET_MAX
+};
+#define NFT_SOCKET_MAX	(__NFT_SOCKET_MAX - 1)
+
 /**
  * enum nft_ct_keys - nf_tables ct expression keys
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 3ec8886850b2..276e1e32f44e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -613,6 +613,15 @@ config NFT_FIB_INET
 	  The lookup will be delegated to the IPv4 or IPv6 FIB depending
 	  on the protocol of the packet.
 
+config NFT_SOCKET
+	tristate "Netfilter nf_tables socket match support"
+	depends on IPV6 || IPV6=n
+	select NF_SOCKET_IPV4
+	select NF_SOCKET_IPV6 if IPV6
+	help
+	  This option allows matching for the presence or absence of a
+	  corresponding socket and its attributes.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 9b3434360d49..eec169555731 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_NFT_FIB)		+= nft_fib.o
 obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NF_OSF)		+= nf_osf.o
+obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
new file mode 100644
index 000000000000..d86337068ecb
--- /dev/null
+++ b/net/netfilter/nft_socket.c
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_socket.h>
+#include <net/inet_sock.h>
+
+struct nft_socket {
+	enum nft_socket_keys		key:8;
+	union {
+		enum nft_registers	dreg:8;
+	};
+};
+
+static void nft_socket_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_socket *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	struct sock *sk = skb->sk;
+	u32 *dest = &regs->data[priv->dreg];
+
+	if (!sk)
+		switch(nft_pf(pkt)) {
+		case NFPROTO_IPV4:
+			sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
+			break;
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+		case NFPROTO_IPV6:
+			sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
+			break;
+#endif
+		default:
+			WARN_ON_ONCE(1);
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+
+	if(!sk) {
+		nft_reg_store8(dest, 0);
+		return;
+	}
+
+	/* So that subsequent socket matching not to require other lookups. */
+	skb->sk = sk;
+
+	switch(priv->key) {
+	case NFT_SOCKET_TRANSPARENT:
+		nft_reg_store8(dest, nf_sk_is_transparent(sk));
+		break;
+	default:
+		WARN_ON(1);
+		regs->verdict.code = NFT_BREAK;
+	}
+}
+
+static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
+	[NFTA_SOCKET_KEY]		= { .type = NLA_U32 },
+	[NFTA_SOCKET_DREG]		= { .type = NLA_U32 },
+};
+
+static int nft_socket_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_socket *priv = nft_expr_priv(expr);
+	unsigned int len;
+
+	if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
+		return -EINVAL;
+
+	switch(ctx->family) {
+	case NFPROTO_IPV4:
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+	case NFPROTO_IPV6:
+#endif
+	case NFPROTO_INET:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+	switch(priv->key) {
+	case NFT_SOCKET_TRANSPARENT:
+		len = sizeof(u8);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, len);
+}
+
+static int nft_socket_dump(struct sk_buff *skb,
+			   const struct nft_expr *expr)
+{
+	const struct nft_socket *priv = nft_expr_priv(expr);
+
+	if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+		return -1;
+	if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
+		return -1;
+	return 0;
+}
+
+static struct nft_expr_type nft_socket_type;
+static const struct nft_expr_ops nft_socket_ops = {
+	.type		= &nft_socket_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_socket)),
+	.eval		= nft_socket_eval,
+	.init		= nft_socket_init,
+	.dump		= nft_socket_dump,
+};
+
+static struct nft_expr_type nft_socket_type __read_mostly = {
+	.name		= "socket",
+	.ops		= &nft_socket_ops,
+	.policy		= nft_socket_policy,
+	.maxattr	= NFTA_SOCKET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_socket_module_init(void)
+{
+	return nft_register_expr(&nft_socket_type);
+}
+
+static void __exit nft_socket_module_exit(void)
+{
+	nft_unregister_expr(&nft_socket_type);
+}
+
+module_init(nft_socket_module_init);
+module_exit(nft_socket_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables socket match module");
-- 
cgit v1.2.3-55-g7522


From 1a893b44de4528887e7dabcdce7151ca2a8ee238 Mon Sep 17 00:00:00 2001
From: Phil Sutter
Date: Wed, 30 May 2018 11:06:22 +0200
Subject: netfilter: nf_tables: Add audit support to log statement

This extends log statement to support the behaviour achieved with
AUDIT target in iptables.

Audit logging is enabled via a pseudo log level 8. In this case any
other settings like log prefix are ignored since audit log format is
fixed.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  5 ++
 net/netfilter/nft_log.c                  | 92 +++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 3d46c82a5ebd..5c7eb9b9f6d6 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1080,6 +1080,11 @@ enum nft_log_attributes {
 };
 #define NFTA_LOG_MAX		(__NFTA_LOG_MAX - 1)
 
+/**
+ * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging
+ */
+#define LOGLEVEL_AUDIT		8
+
 /**
  * enum nft_queue_attributes - nf_tables queue expression netlink attributes
  *
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index a27be36dc0af..7eef1cffbf1b 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -9,12 +9,15 @@
  * Development of this code funded by Astaro AG (http://www.astaro.com/)
  */
 
+#include <linux/audit.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 #include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
 	char			*prefix;
 };
 
+static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct iphdr _iph;
+	const struct iphdr *ih;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph);
+	if (!ih)
+		return false;
+
+	audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+			 &ih->saddr, &ih->daddr, ih->protocol);
+
+	return true;
+}
+
+static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct ipv6hdr _ip6h;
+	const struct ipv6hdr *ih;
+	u8 nexthdr;
+	__be16 frag_off;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+	if (!ih)
+		return false;
+
+	nexthdr = ih->nexthdr;
+	ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off);
+
+	audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+			 &ih->saddr, &ih->daddr, nexthdr);
+
+	return true;
+}
+
+static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
+{
+	struct sk_buff *skb = pkt->skb;
+	struct audit_buffer *ab;
+	int fam = -1;
+
+	if (!audit_enabled)
+		return;
+
+	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+	if (!ab)
+		return;
+
+	audit_log_format(ab, "mark=%#x", skb->mark);
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_BRIDGE:
+		switch (eth_hdr(skb)->h_proto) {
+		case htons(ETH_P_IP):
+			fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+			break;
+		case htons(ETH_P_IPV6):
+			fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+			break;
+		}
+		break;
+	case NFPROTO_IPV4:
+		fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+		break;
+	case NFPROTO_IPV6:
+		fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+		break;
+	}
+
+	if (fam == -1)
+		audit_log_format(ab, " saddr=? daddr=? proto=-1");
+
+	audit_log_end(ab);
+}
+
 static void nft_log_eval(const struct nft_expr *expr,
 			 struct nft_regs *regs,
 			 const struct nft_pktinfo *pkt)
 {
 	const struct nft_log *priv = nft_expr_priv(expr);
 
+	if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
+	    priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+		nft_log_eval_audit(pkt);
+		return;
+	}
+
 	nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
 		      nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
 		      priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		} else {
 			li->u.log.level = LOGLEVEL_WARNING;
 		}
-		if (li->u.log.level > LOGLEVEL_DEBUG) {
+		if (li->u.log.level > LOGLEVEL_AUDIT) {
 			err = -EINVAL;
 			goto err1;
 		}
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		break;
 	}
 
+	if (li->u.log.level == LOGLEVEL_AUDIT)
+		return 0;
+
 	err = nf_logger_find_get(ctx->family, li->type);
 	if (err < 0)
 		goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
 	if (priv->prefix != nft_log_null_prefix)
 		kfree(priv->prefix);
 
+	if (li->u.log.level == LOGLEVEL_AUDIT)
+		return;
+
 	nf_logger_put(ctx->family, li->type);
 }
 
-- 
cgit v1.2.3-55-g7522


From d32de98ea70fe7cf606f3809f0970b31c115764b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Thu, 31 May 2018 01:58:00 +0200
Subject: netfilter: nft_fwd_netdev: allow to forward packets via neighbour
 layer

This allows us to forward packets from the netdev family via neighbour
layer, so you don't need an explicit link-layer destination when using
this expression from rules. The ttl/hop_limit field is decremented.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +
 net/netfilter/nft_fwd_netdev.c           | 146 ++++++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5c7eb9b9f6d6..a089af092a29 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1260,10 +1260,14 @@ enum nft_dup_attributes {
  * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes
  *
  * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register)
+ * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_register)
+ * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto)
  */
 enum nft_fwd_attributes {
 	NFTA_FWD_UNSPEC,
 	NFTA_FWD_SREG_DEV,
+	NFTA_FWD_SREG_ADDR,
+	NFTA_FWD_NFPROTO,
 	__NFTA_FWD_MAX
 };
 #define NFTA_FWD_MAX	(__NFTA_FWD_MAX - 1)
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index ce13a50b9189..8abb9891cdf2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,8 +12,12 @@
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_dup_netdev.h>
+#include <net/neighbour.h>
+#include <net/ip.h>
 
 struct nft_fwd_netdev {
 	enum nft_registers	sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
 	[NFTA_FWD_SREG_DEV]	= { .type = NLA_U32 },
+	[NFTA_FWD_SREG_ADDR]	= { .type = NLA_U32 },
+	[NFTA_FWD_NFPROTO]	= { .type = NLA_U32 },
 };
 
 static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ nla_put_failure:
 	return -1;
 }
 
+struct nft_fwd_neigh {
+	enum nft_registers	sreg_dev:8;
+	enum nft_registers	sreg_addr:8;
+	u8			nfproto;
+};
+
+static void nft_fwd_neigh_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	void *addr = &regs->data[priv->sreg_addr];
+	int oif = regs->data[priv->sreg_dev];
+	unsigned int verdict = NF_STOLEN;
+	struct sk_buff *skb = pkt->skb;
+	struct net_device *dev;
+	int neigh_table;
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4: {
+		struct iphdr *iph;
+
+		if (skb->protocol != htons(ETH_P_IP)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(*iph))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		iph = ip_hdr(skb);
+		ip_decrease_ttl(iph);
+		neigh_table = NEIGH_ARP_TABLE;
+		break;
+		}
+	case NFPROTO_IPV6: {
+		struct ipv6hdr *ip6h;
+
+		if (skb->protocol != htons(ETH_P_IPV6)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		ip6h = ipv6_hdr(skb);
+		ip6h->hop_limit--;
+		neigh_table = NEIGH_ND_TABLE;
+		break;
+		}
+	default:
+		verdict = NFT_BREAK;
+		goto out;
+	}
+
+	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+	if (dev == NULL)
+		return;
+
+	skb->dev = dev;
+	neigh_xmit(neigh_table, dev, addr, skb);
+out:
+	regs->verdict.code = verdict;
+}
+
+static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	unsigned int addr_len;
+	int err;
+
+	if (!tb[NFTA_FWD_SREG_DEV] ||
+	    !tb[NFTA_FWD_SREG_ADDR] ||
+	    !tb[NFTA_FWD_NFPROTO])
+		return -EINVAL;
+
+	priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+	priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
+	priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4:
+		addr_len = sizeof(struct in_addr);
+		break;
+	case NFPROTO_IPV6:
+		addr_len = sizeof(struct in6_addr);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+	if (err < 0)
+		return err;
+
+	return nft_validate_register_load(priv->sreg_addr, addr_len);
+}
+
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) ||
+	    nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) ||
+	    nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static struct nft_expr_type nft_fwd_netdev_type;
+static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
+	.type		= &nft_fwd_netdev_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
+	.eval		= nft_fwd_neigh_eval,
+	.init		= nft_fwd_neigh_init,
+	.dump		= nft_fwd_neigh_dump,
+};
+
 static const struct nft_expr_ops nft_fwd_netdev_ops = {
 	.type		= &nft_fwd_netdev_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
 	.dump		= nft_fwd_netdev_dump,
 };
 
+static const struct nft_expr_ops *
+nft_fwd_select_ops(const struct nft_ctx *ctx,
+		   const struct nlattr * const tb[])
+{
+	if (tb[NFTA_FWD_SREG_ADDR])
+		return &nft_fwd_neigh_netdev_ops;
+	if (tb[NFTA_FWD_SREG_DEV])
+		return &nft_fwd_netdev_ops;
+
+        return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
 	.family		= NFPROTO_NETDEV,
 	.name		= "fwd",
-	.ops		= &nft_fwd_netdev_ops,
+	.select_ops	= nft_fwd_select_ops,
 	.policy		= nft_fwd_netdev_policy,
 	.maxattr	= NFTA_FWD_MAX,
 	.owner		= THIS_MODULE,
-- 
cgit v1.2.3-55-g7522


From 290180e2448c02d6b391455937098882a73a9494 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Sat, 2 Jun 2018 21:38:51 +0200
Subject: netfilter: nf_tables: add connlimit support

This features which allows you to limit the maximum number of
connections per arbitrary key. The connlimit expression is stateful,
therefore it can be used from meters to dynamically populate a set, this
provides a mapping to the iptables' connlimit match. This patch also
comes that allows you define static connlimit policies.

This extension depends on the nf_conncount infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  21 ++-
 net/netfilter/Kconfig                    |   9 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_connlimit.c            | 297 +++++++++++++++++++++++++++++++
 4 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 net/netfilter/nft_connlimit.c

(limited to 'include/uapi')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a089af092a29..ae00a3c49b8a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1043,6 +1043,24 @@ enum nft_limit_attributes {
 };
 #define NFTA_LIMIT_MAX		(__NFTA_LIMIT_MAX - 1)
 
+enum nft_connlimit_flags {
+	NFT_CONNLIMIT_F_INV	= (1 << 0),
+};
+
+/**
+ * enum nft_connlimit_attributes - nf_tables connlimit expression netlink attributes
+ *
+ * @NFTA_CONNLIMIT_COUNT: number of connections (NLA_U32)
+ * @NFTA_CONNLIMIT_FLAGS: flags (NLA_U32: enum nft_connlimit_flags)
+ */
+enum nft_connlimit_attributes {
+	NFTA_CONNLIMIT_UNSPEC,
+	NFTA_CONNLIMIT_COUNT,
+	NFTA_CONNLIMIT_FLAGS,
+	__NFTA_CONNLIMIT_MAX
+};
+#define NFTA_CONNLIMIT_MAX	(__NFTA_CONNLIMIT_MAX - 1)
+
 /**
  * enum nft_counter_attributes - nf_tables counter expression netlink attributes
  *
@@ -1357,7 +1375,8 @@ enum nft_ct_helper_attributes {
 #define NFT_OBJECT_QUOTA	2
 #define NFT_OBJECT_CT_HELPER	3
 #define NFT_OBJECT_LIMIT	4
-#define __NFT_OBJECT_MAX	5
+#define NFT_OBJECT_CONNLIMIT	5
+#define __NFT_OBJECT_MAX	6
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 41240abd755f..dbd7d1fad277 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -517,6 +517,15 @@ config NFT_COUNTER
 	  This option adds the "counter" expression that you can use to
 	  include packet and byte counters in a rule.
 
+config NFT_CONNLIMIT
+	tristate "Netfilter nf_tables connlimit module"
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_CONNCOUNT
+	help
+	  This option adds the "connlimit" expression that you can use to
+	  ratelimit rule matchings per connections.
+
 config NFT_LOG
 	tristate "Netfilter nf_tables log module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index eec169555731..44449389e527 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -80,6 +80,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
+obj-$(CONFIG_NFT_CONNLIMIT)	+= nft_connlimit.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
new file mode 100644
index 000000000000..50c068d660e5
--- /dev/null
+++ b/net/netfilter/nft_connlimit.c
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_count.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+struct nft_connlimit {
+	spinlock_t		lock;
+	struct hlist_head	hhead;
+	u32			limit;
+	bool			invert;
+};
+
+static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
+					 struct nft_regs *regs,
+					 const struct nft_pktinfo *pkt,
+					 const struct nft_set_ext *ext)
+{
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+	const struct nf_conntrack_tuple *tuple_ptr;
+	struct nf_conntrack_tuple tuple;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int count;
+	bool addit;
+
+	tuple_ptr = &tuple;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (ct != NULL) {
+		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+		zone = nf_ct_zone(ct);
+	} else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
+				      nft_pf(pkt), nft_net(pkt), &tuple)) {
+		regs->verdict.code = NF_DROP;
+		return;
+	}
+
+	spin_lock_bh(&priv->lock);
+	count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
+				    &addit);
+
+	if (!addit)
+		goto out;
+
+	if (!nf_conncount_add(&priv->hhead, tuple_ptr)) {
+		regs->verdict.code = NF_DROP;
+		spin_unlock_bh(&priv->lock);
+		return;
+	}
+	count++;
+out:
+	spin_unlock_bh(&priv->lock);
+
+	if ((count > priv->limit) ^ priv->invert) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+}
+
+static int nft_connlimit_do_init(const struct nft_ctx *ctx,
+				 const struct nlattr * const tb[],
+				 struct nft_connlimit *priv)
+{
+	bool invert = false;
+	u32 flags, limit;
+
+	if (!tb[NFTA_CONNLIMIT_COUNT])
+		return -EINVAL;
+
+	limit = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_COUNT]));
+
+	if (tb[NFTA_CONNLIMIT_FLAGS]) {
+		flags = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_FLAGS]));
+		if (flags & ~NFT_CONNLIMIT_F_INV)
+			return -EOPNOTSUPP;
+		if (flags & NFT_CONNLIMIT_F_INV)
+			invert = true;
+	}
+
+	spin_lock_init(&priv->lock);
+	INIT_HLIST_HEAD(&priv->hhead);
+	priv->limit	= limit;
+	priv->invert	= invert;
+
+	return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
+				     struct nft_connlimit *priv)
+{
+	nf_ct_netns_put(ctx->net, ctx->family);
+	nf_conncount_cache_free(&priv->hhead);
+}
+
+static int nft_connlimit_do_dump(struct sk_buff *skb,
+				 struct nft_connlimit *priv)
+{
+	if (nla_put_be32(skb, NFTA_CONNLIMIT_COUNT, htonl(priv->limit)))
+		goto nla_put_failure;
+	if (priv->invert &&
+	    nla_put_be32(skb, NFTA_CONNLIMIT_FLAGS, htonl(NFT_CONNLIMIT_F_INV)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static inline void nft_connlimit_obj_eval(struct nft_object *obj,
+					struct nft_regs *regs,
+					const struct nft_pktinfo *pkt)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	nft_connlimit_do_eval(priv, regs, pkt, NULL);
+}
+
+static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
+				const struct nlattr * const tb[],
+				struct nft_object *obj)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	return nft_connlimit_do_init(ctx, tb, priv);
+}
+
+static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
+				      struct nft_object *obj)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	nft_connlimit_do_destroy(ctx, priv);
+}
+
+static int nft_connlimit_obj_dump(struct sk_buff *skb,
+				  struct nft_object *obj, bool reset)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	return nft_connlimit_do_dump(skb, priv);
+}
+
+static const struct nla_policy nft_connlimit_policy[NFTA_CONNLIMIT_MAX + 1] = {
+	[NFTA_CONNLIMIT_COUNT]	= { .type = NLA_U32 },
+	[NFTA_CONNLIMIT_FLAGS]	= { .type = NLA_U32 },
+};
+
+static struct nft_object_type nft_connlimit_obj_type;
+static const struct nft_object_ops nft_connlimit_obj_ops = {
+	.type		= &nft_connlimit_obj_type,
+	.size		= sizeof(struct nft_connlimit),
+	.eval		= nft_connlimit_obj_eval,
+	.init		= nft_connlimit_obj_init,
+	.destroy	= nft_connlimit_obj_destroy,
+	.dump		= nft_connlimit_obj_dump,
+};
+
+static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_CONNLIMIT,
+	.ops		= &nft_connlimit_obj_ops,
+	.maxattr	= NFTA_CONNLIMIT_MAX,
+	.policy		= nft_connlimit_policy,
+	.owner		= THIS_MODULE,
+};
+
+static void nft_connlimit_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	nft_connlimit_do_eval(priv, regs, pkt, NULL);
+}
+
+static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	return nft_connlimit_do_dump(skb, priv);
+}
+
+static int nft_connlimit_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	return nft_connlimit_do_init(ctx, tb, priv);
+}
+
+static void nft_connlimit_destroy(const struct nft_ctx *ctx,
+				const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	nft_connlimit_do_destroy(ctx, priv);
+}
+
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	struct nft_connlimit *priv_dst = nft_expr_priv(dst);
+	struct nft_connlimit *priv_src = nft_expr_priv(src);
+
+	spin_lock_init(&priv_dst->lock);
+	INIT_HLIST_HEAD(&priv_dst->hhead);
+	priv_dst->limit	 = priv_src->limit;
+	priv_dst->invert = priv_src->invert;
+
+	return 0;
+}
+
+static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
+					const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	nf_conncount_cache_free(&priv->hhead);
+}
+
+static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+	bool addit, ret;
+
+	spin_lock_bh(&priv->lock);
+	nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
+
+	ret = hlist_empty(&priv->hhead);
+	spin_unlock_bh(&priv->lock);
+
+	return ret;
+}
+
+static struct nft_expr_type nft_connlimit_type;
+static const struct nft_expr_ops nft_connlimit_ops = {
+	.type		= &nft_connlimit_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_connlimit)),
+	.eval		= nft_connlimit_eval,
+	.init		= nft_connlimit_init,
+	.destroy	= nft_connlimit_destroy,
+	.clone		= nft_connlimit_clone,
+	.destroy_clone	= nft_connlimit_destroy_clone,
+	.dump		= nft_connlimit_dump,
+	.gc		= nft_connlimit_gc,
+};
+
+static struct nft_expr_type nft_connlimit_type __read_mostly = {
+	.name		= "connlimit",
+	.ops		= &nft_connlimit_ops,
+	.policy		= nft_connlimit_policy,
+	.maxattr	= NFTA_CONNLIMIT_MAX,
+	.flags		= NFT_EXPR_STATEFUL | NFT_EXPR_GC,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_connlimit_module_init(void)
+{
+	int err;
+
+	err = nft_register_obj(&nft_connlimit_obj_type);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_connlimit_type);
+	if (err < 0)
+		goto err1;
+
+	return 0;
+err1:
+	nft_unregister_obj(&nft_connlimit_obj_type);
+	return err;
+}
+
+static void __exit nft_connlimit_module_exit(void)
+{
+	nft_unregister_expr(&nft_connlimit_type);
+	nft_unregister_obj(&nft_connlimit_obj_type);
+}
+
+module_init(nft_connlimit_module_init);
+module_exit(nft_connlimit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso");
+MODULE_ALIAS_NFT_EXPR("connlimit");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
-- 
cgit v1.2.3-55-g7522


From cb20b08ead401fd17627a36f035c0bf5bfee5567 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sat, 2 Jun 2018 23:06:36 +0200
Subject: bpf: add bpf_skb_cgroup_id helper

Add a new bpf_skb_cgroup_id() helper that allows to retrieve the
cgroup id from the skb's socket. This is useful in particular to
enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in
cgroup v2 by allowing ID based matching on egress. This can in
particular be used in combination with applying policy e.g. from
map lookups, and also complements the older bpf_skb_under_cgroup()
interface. In user space the cgroup id for a given path can be
retrieved through the f_handle as demonstrated in [0] recently.

  [0] https://lkml.org/lkml/2018/5/22/1190

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 19 ++++++++++++++++++-
 net/core/filter.c        | 29 +++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 64ac0f7a689e..661318141f72 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2054,6 +2054,22 @@ union bpf_attr {
  *
  *	Return
  *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2134,7 +2150,8 @@ union bpf_attr {
 	FN(lwt_seg6_adjust_srh),	\
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
-	FN(rc_keydown),
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 885fb0e2f264..edbfaa613b6d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3661,6 +3661,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+#ifdef CONFIG_SOCK_CGROUP_DATA
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+	struct cgroup *cgrp;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	return cgrp->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+	.func           = bpf_skb_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
+#endif
+
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 				  unsigned long off, unsigned long len)
 {
@@ -4747,12 +4768,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 #ifdef CONFIG_XFRM
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
-	case BPF_FUNC_fib_lookup:
-		return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	case BPF_FUNC_skb_cgroup_id:
+		return &bpf_skb_cgroup_id_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-55-g7522


From 1fbc2e0cfcf9677683aea2b1f9ea76fbdc6fb6d1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sat, 2 Jun 2018 23:06:37 +0200
Subject: bpf: make sure to clear unused fields in tunnel/xfrm state fetch

Since the remaining bits are not filled in struct bpf_tunnel_key
resp. struct bpf_xfrm_state and originate from uninitialized stack
space, we should make sure to clear them before handing control
back to the program.

Also add a padding element to struct bpf_xfrm_state for future use
similar as we have in struct bpf_tunnel_key and clear it as well.

  struct bpf_xfrm_state {
      __u32                      reqid;            /*     0     4 */
      __u32                      spi;              /*     4     4 */
      __u16                      family;           /*     8     2 */

      /* XXX 2 bytes hole, try to pack */

      union {
          __u32              remote_ipv4;          /*           4 */
          __u32              remote_ipv6[4];       /*          16 */
      };                                           /*    12    16 */

      /* size: 28, cachelines: 1, members: 4 */
      /* sum members: 26, holes: 1, sum holes: 2 */
      /* last cacheline: 28 bytes */
  };

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 3 ++-
 net/core/filter.c        | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 661318141f72..f0b6608b1f1c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2268,7 +2268,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
-	__u16 tunnel_ext;
+	__u16 tunnel_ext;	/* Padding, future use. */
 	__u32 tunnel_label;
 };
 
@@ -2279,6 +2279,7 @@ struct bpf_xfrm_state {
 	__u32 reqid;
 	__u32 spi;	/* Stored in network byte order */
 	__u16 family;
+	__u16 ext;	/* Padding, future use. */
 	union {
 		__u32 remote_ipv4;	/* Stored in network byte order */
 		__u32 remote_ipv6[4];	/* Stored in network byte order */
diff --git a/net/core/filter.c b/net/core/filter.c
index edbfaa613b6d..28e864777c0f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3445,6 +3445,7 @@ set_compat:
 	to->tunnel_id = be64_to_cpu(info->key.tun_id);
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
+	to->tunnel_ext = 0;
 
 	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3452,6 +3453,8 @@ set_compat:
 		to->tunnel_label = be32_to_cpu(info->key.label);
 	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+		to->tunnel_label = 0;
 	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
 	to->reqid = x->props.reqid;
 	to->spi = x->id.spi;
 	to->family = x->props.family;
+	to->ext = 0;
+
 	if (to->family == AF_INET6) {
 		memcpy(to->remote_ipv6, x->props.saddr.a6,
 		       sizeof(to->remote_ipv6));
 	} else {
 		to->remote_ipv4 = x->props.saddr.a4;
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
 	}
 
 	return 0;
-- 
cgit v1.2.3-55-g7522


From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Sun, 3 Jun 2018 15:59:41 -0700
Subject: bpf: implement bpf_get_current_cgroup_id() helper

bpf has been used extensively for tracing. For example, bcc
contains an almost full set of bpf-based tools to trace kernel
and user functions/events. Most tracing tools are currently
either filtered based on pid or system-wide.

Containers have been used quite extensively in industry and
cgroup is often used together to provide resource isolation
and protection. Several processes may run inside the same
container. It is often desirable to get container-level tracing
results as well, e.g. syscall count, function count, I/O
activity, etc.

This patch implements a new helper, bpf_get_current_cgroup_id(),
which will return cgroup id based on the cgroup within which
the current task is running.

The later patch will provide an example to show that
userspace can get the same cgroup id so it could
configure a filter or policy in the bpf program based on
task cgroup id.

The helper is currently implemented for tracing. It can
be added to other program types as well when needed.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  8 +++++++-
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 15 +++++++++++++++
 kernel/trace/bpf_trace.c |  2 ++
 5 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbe297436e5d..995c3b1e59bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
+extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f0b6608b1f1c..18712b0dbfe7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2070,6 +2070,11 @@ union bpf_attr {
  * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2151,7 +2156,8 @@ union bpf_attr {
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
-	FN(skb_cgroup_id),
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 527587de8a67..9f1493705f40 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1765,6 +1765,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..73065e2d23c2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE,
 };
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+	struct cgroup *cgrp = task_dfl_cgroup(current);
+
+	return cgrp->kn->id.id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+	.func		= bpf_get_current_cgroup_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+#endif
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 752992ce3513..e2ab5b7f29d2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -564,6 +564,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_read_str:
 		return &bpf_probe_read_str_proto;
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-55-g7522


From bd3a08aaa9a383ffbbd5b788b797ae6e64eaa7a1 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Sun, 3 Jun 2018 08:15:19 -0700
Subject: bpf: flowlabel in bpf_fib_lookup should be flowinfo

As Michal noted the flow struct takes both the flow label and priority.
Update the bpf_fib_lookup API to note that it is flowinfo and not just
the flow label.

Cc: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h   | 2 +-
 net/core/filter.c          | 2 +-
 samples/bpf/xdp_fwd_kern.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 18712b0dbfe7..eeb6237be5c2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2629,7 +2629,7 @@ struct bpf_fib_lookup {
 	union {
 		/* inputs to lookup */
 		__u8	tos;		/* AF_INET  */
-		__be32	flowlabel;	/* AF_INET6 */
+		__be32	flowinfo;	/* AF_INET6, flow_label + priority */
 
 		/* output: metric of fib result (IPv4/IPv6 only) */
 		__u32	rt_metric;
diff --git a/net/core/filter.c b/net/core/filter.c
index a72ea9f61010..3d9ba7e5965a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4221,7 +4221,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		fl6.flowi6_oif = 0;
 		strict = RT6_LOOKUP_F_HAS_SADDR;
 	}
-	fl6.flowlabel = params->flowlabel;
+	fl6.flowlabel = params->flowinfo;
 	fl6.flowi6_scope = 0;
 	fl6.flowi6_flags = 0;
 	fl6.mp_hash = 0;
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 4a6be0f87505..6673cdb9f55c 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -88,7 +88,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 			return XDP_PASS;
 
 		fib_params.family	= AF_INET6;
-		fib_params.flowlabel	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
+		fib_params.flowinfo	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
 		fib_params.l4_protocol	= ip6h->nexthdr;
 		fib_params.sport	= 0;
 		fib_params.dport	= 0;
-- 
cgit v1.2.3-55-g7522


From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Mon, 4 Jun 2018 13:57:13 +0200
Subject: xsk: new descriptor addressing scheme

Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data.  Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).

By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.

In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.

In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.

We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.

To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.

On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.

Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.

This commit changes the uapi of AF_XDP sockets, and updates the
documentation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/af_xdp.rst | 101 +++++++++++++++++++++---------------
 include/uapi/linux/if_xdp.h         |  12 ++---
 net/xdp/xdp_umem.c                  |  33 ++++++------
 net/xdp/xdp_umem.h                  |  27 +++-------
 net/xdp/xdp_umem_props.h            |   4 +-
 net/xdp/xsk.c                       |  30 ++++++-----
 net/xdp/xsk_queue.c                 |   2 +-
 net/xdp/xsk_queue.h                 |  43 +++++++--------
 8 files changed, 123 insertions(+), 129 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 91928d9ee4bf..ff929cfab4f4 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -12,7 +12,7 @@ packet processing.
 
 This document assumes that the reader is familiar with BPF and XDP. If
 not, the Cilium project has an excellent reference guide at
-http://cilium.readthedocs.io/en/doc-1.0/bpf/.
+http://cilium.readthedocs.io/en/latest/bpf/.
 
 Using the XDP_REDIRECT action from an XDP program, the program can
 redirect ingress frames to other XDP enabled netdevs, using the
@@ -33,22 +33,22 @@ for a while due to a possible retransmit, the descriptor that points
 to that packet can be changed to point to another and reused right
 away. This again avoids copying data.
 
-The UMEM consists of a number of equally size frames and each frame
-has a unique frame id. A descriptor in one of the rings references a
-frame by referencing its frame id. The user space allocates memory for
-this UMEM using whatever means it feels is most appropriate (malloc,
-mmap, huge pages, etc). This memory area is then registered with the
-kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two
-rings: the FILL ring and the COMPLETION ring. The fill ring is used by
-the application to send down frame ids for the kernel to fill in with
-RX packet data. References to these frames will then appear in the RX
-ring once each packet has been received. The completion ring, on the
-other hand, contains frame ids that the kernel has transmitted
-completely and can now be used again by user space, for either TX or
-RX. Thus, the frame ids appearing in the completion ring are ids that
-were previously transmitted using the TX ring. In summary, the RX and
-FILL rings are used for the RX path and the TX and COMPLETION rings
-are used for the TX path.
+The UMEM consists of a number of equally sized chunks. A descriptor in
+one of the rings references a frame by referencing its addr. The addr
+is simply an offset within the entire UMEM region. The user space
+allocates memory for this UMEM using whatever means it feels is most
+appropriate (malloc, mmap, huge pages, etc). This memory area is then
+registered with the kernel using the new setsockopt XDP_UMEM_REG. The
+UMEM also has two rings: the FILL ring and the COMPLETION ring. The
+fill ring is used by the application to send down addr for the kernel
+to fill in with RX packet data. References to these frames will then
+appear in the RX ring once each packet has been received. The
+completion ring, on the other hand, contains frame addr that the
+kernel has transmitted completely and can now be used again by user
+space, for either TX or RX. Thus, the frame addrs appearing in the
+completion ring are addrs that were previously transmitted using the
+TX ring. In summary, the RX and FILL rings are used for the RX path
+and the TX and COMPLETION rings are used for the TX path.
 
 The socket is then finally bound with a bind() call to a device and a
 specific queue id on that device, and it is not until bind is
@@ -59,13 +59,13 @@ wants to do this, it simply skips the registration of the UMEM and its
 corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
 call and submits the XSK of the process it would like to share UMEM
 with as well as its own newly created XSK socket. The new process will
-then receive frame id references in its own RX ring that point to this
-shared UMEM. Note that since the ring structures are single-consumer /
-single-producer (for performance reasons), the new process has to
-create its own socket with associated RX and TX rings, since it cannot
-share this with the other process. This is also the reason that there
-is only one set of FILL and COMPLETION rings per UMEM. It is the
-responsibility of a single process to handle the UMEM.
+then receive frame addr references in its own RX ring that point to
+this shared UMEM. Note that since the ring structures are
+single-consumer / single-producer (for performance reasons), the new
+process has to create its own socket with associated RX and TX rings,
+since it cannot share this with the other process. This is also the
+reason that there is only one set of FILL and COMPLETION rings per
+UMEM. It is the responsibility of a single process to handle the UMEM.
 
 How is then packets distributed from an XDP program to the XSKs? There
 is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
@@ -102,10 +102,10 @@ UMEM
 
 UMEM is a region of virtual contiguous memory, divided into
 equal-sized frames. An UMEM is associated to a netdev and a specific
-queue id of that netdev. It is created and configured (frame size,
-frame headroom, start address and size) by using the XDP_UMEM_REG
-setsockopt system call. A UMEM is bound to a netdev and queue id, via
-the bind() system call.
+queue id of that netdev. It is created and configured (chunk size,
+headroom, start address and size) by using the XDP_UMEM_REG setsockopt
+system call. A UMEM is bound to a netdev and queue id, via the bind()
+system call.
 
 An AF_XDP is socket linked to a single UMEM, but one UMEM can have
 multiple AF_XDP sockets. To share an UMEM created via one socket A,
@@ -147,13 +147,17 @@ UMEM Fill Ring
 ~~~~~~~~~~~~~~
 
 The Fill ring is used to transfer ownership of UMEM frames from
-user-space to kernel-space. The UMEM indicies are passed in the
-ring. As an example, if the UMEM is 64k and each frame is 4k, then the
-UMEM has 16 frames and can pass indicies between 0 and 15.
+user-space to kernel-space. The UMEM addrs are passed in the ring. As
+an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
+16 chunks and can pass addrs between 0 and 64k.
 
 Frames passed to the kernel are used for the ingress path (RX rings).
 
-The user application produces UMEM indicies to this ring.
+The user application produces UMEM addrs to this ring. Note that the
+kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
+log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
+and 3000 refers to the same chunk.
+
 
 UMEM Completetion Ring
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -165,16 +169,15 @@ used.
 Frames passed from the kernel to user-space are frames that has been
 sent (TX ring) and can be used by user-space again.
 
-The user application consumes UMEM indicies from this ring.
+The user application consumes UMEM addrs from this ring.
 
 
 RX Ring
 ~~~~~~~
 
 The RX ring is the receiving side of a socket. Each entry in the ring
-is a struct xdp_desc descriptor. The descriptor contains UMEM index
-(idx), the length of the data (len), the offset into the frame
-(offset).
+is a struct xdp_desc descriptor. The descriptor contains UMEM offset
+(addr) and the length of the data (len).
 
 If no frames have been passed to kernel via the Fill ring, no
 descriptors will (or can) appear on the RX ring.
@@ -221,38 +224,50 @@ side is xdpsock_user.c and the XDP side xdpsock_kern.c.
 
 Naive ring dequeue and enqueue could look like this::
 
+    // struct xdp_rxtx_ring {
+    // 	__u32 *producer;
+    // 	__u32 *consumer;
+    // 	struct xdp_desc *desc;
+    // };
+
+    // struct xdp_umem_ring {
+    // 	__u32 *producer;
+    // 	__u32 *consumer;
+    // 	__u64 *desc;
+    // };
+
     // typedef struct xdp_rxtx_ring RING;
     // typedef struct xdp_umem_ring RING;
 
     // typedef struct xdp_desc RING_TYPE;
-    // typedef __u32 RING_TYPE;
+    // typedef __u64 RING_TYPE;
 
     int dequeue_one(RING *ring, RING_TYPE *item)
     {
-        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+        __u32 entries = *ring->producer - *ring->consumer;
 
         if (entries == 0)
             return -1;
 
         // read-barrier!
 
-        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
-        ring->ptrs.consumer++;
+        *item = ring->desc[*ring->consumer & (RING_SIZE - 1)];
+        (*ring->consumer)++;
         return 0;
     }
 
     int enqueue_one(RING *ring, const RING_TYPE *item)
     {
-        u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+        u32 free_entries = RING_SIZE - (*ring->producer - *ring->consumer);
 
         if (free_entries == 0)
             return -1;
 
-        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+        ring->desc[*ring->producer & (RING_SIZE - 1)] = *item;
 
         // write-barrier!
 
-        ring->ptrs.producer++;
+        (*ring->producer)++;
         return 0;
     }
 
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 4737cfe222f5..e411d6f9ac65 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -48,8 +48,8 @@ struct xdp_mmap_offsets {
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
 	__u64 len; /* Length of packet data area */
-	__u32 frame_size; /* Frame size */
-	__u32 frame_headroom; /* Frame head room */
+	__u32 chunk_size;
+	__u32 headroom;
 };
 
 struct xdp_statistics {
@@ -66,13 +66,11 @@ struct xdp_statistics {
 
 /* Rx/Tx descriptor */
 struct xdp_desc {
-	__u32 idx;
+	__u64 addr;
 	__u32 len;
-	__u16 offset;
-	__u8 flags;
-	__u8 padding[5];
+	__u32 options;
 };
 
-/* UMEM descriptor is __u32 */
+/* UMEM descriptor is __u64 */
 
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 87998818116f..9ad791ff4739 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -14,7 +14,7 @@
 
 #include "xdp_umem.h"
 
-#define XDP_UMEM_MIN_FRAME_SIZE 2048
+#define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
@@ -151,12 +151,12 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
 
 static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
-	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
+	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
-	unsigned int nframes, nfpp;
 	int size_chk, err;
 
-	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
 		 * - huge pages, or*
 		 * - using an IOMMU, or
@@ -166,7 +166,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 		return -EINVAL;
 	}
 
-	if (!is_power_of_2(frame_size))
+	if (!is_power_of_2(chunk_size))
 		return -EINVAL;
 
 	if (!PAGE_ALIGNED(addr)) {
@@ -179,33 +179,30 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if ((addr + size) < addr)
 		return -EINVAL;
 
-	nframes = (unsigned int)div_u64(size, frame_size);
-	if (nframes == 0 || nframes > UINT_MAX)
+	chunks = (unsigned int)div_u64(size, chunk_size);
+	if (chunks == 0)
 		return -EINVAL;
 
-	nfpp = PAGE_SIZE / frame_size;
-	if (nframes < nfpp || nframes % nfpp)
+	chunks_per_page = PAGE_SIZE / chunk_size;
+	if (chunks < chunks_per_page || chunks % chunks_per_page)
 		return -EINVAL;
 
-	frame_headroom = ALIGN(frame_headroom, 64);
+	headroom = ALIGN(headroom, 64);
 
-	size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
+	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
 	if (size_chk < 0)
 		return -EINVAL;
 
 	umem->pid = get_task_pid(current, PIDTYPE_PID);
-	umem->size = (size_t)size;
 	umem->address = (unsigned long)addr;
-	umem->props.frame_size = frame_size;
-	umem->props.nframes = nframes;
-	umem->frame_headroom = frame_headroom;
+	umem->props.chunk_mask = ~((u64)chunk_size - 1);
+	umem->props.size = size;
+	umem->headroom = headroom;
+	umem->chunk_size_nohr = chunk_size - headroom;
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
 
-	umem->frame_size_log2 = ilog2(frame_size);
-	umem->nfpp_mask = nfpp - 1;
-	umem->nfpplog2 = ilog2(nfpp);
 	refcount_set(&umem->users, 1);
 
 	err = xdp_umem_account_pages(umem);
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 0881cf456230..aeadd1bcb72d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -18,35 +18,20 @@ struct xdp_umem {
 	struct xsk_queue *cq;
 	struct page **pgs;
 	struct xdp_umem_props props;
-	u32 npgs;
-	u32 frame_headroom;
-	u32 nfpp_mask;
-	u32 nfpplog2;
-	u32 frame_size_log2;
+	u32 headroom;
+	u32 chunk_size_nohr;
 	struct user_struct *user;
 	struct pid *pid;
 	unsigned long address;
-	size_t size;
 	refcount_t users;
 	struct work_struct work;
+	u32 npgs;
 };
 
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx)
-{
-	u64 pg, off;
-	char *data;
-
-	pg = idx >> umem->nfpplog2;
-	off = (idx & umem->nfpp_mask) << umem->frame_size_log2;
-
-	data = page_address(umem->pgs[pg]);
-	return data + off;
-}
-
-static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem,
-						    u32 idx)
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return xdp_umem_get_data(umem, idx) + umem->frame_headroom;
+	return page_address(umem->pgs[addr >> PAGE_SHIFT]) +
+		(addr & (PAGE_SIZE - 1));
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
index 2cf8ec485fd2..40eab10dfc49 100644
--- a/net/xdp/xdp_umem_props.h
+++ b/net/xdp/xdp_umem_props.h
@@ -7,8 +7,8 @@
 #define XDP_UMEM_PROPS_H_
 
 struct xdp_umem_props {
-	u32 frame_size;
-	u32 nframes;
+	u64 chunk_mask;
+	u64 size;
 };
 
 #endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 966307ce4b8e..4688c750df1d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,24 +41,27 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	u32 id, len = xdp->data_end - xdp->data;
+	u32 len = xdp->data_end - xdp->data;
 	void *buffer;
+	u64 addr;
 	int err;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	if (!xskq_peek_id(xs->umem->fq, &id)) {
+	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	    len > xs->umem->chunk_size_nohr) {
 		xs->rx_dropped++;
 		return -ENOSPC;
 	}
 
-	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
+	addr += xs->umem->headroom;
+
+	buffer = xdp_umem_get_data(xs->umem, addr);
 	memcpy(buffer, xdp->data, len);
-	err = xskq_produce_batch_desc(xs->rx, id, len,
-				      xs->umem->frame_headroom);
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
 	if (!err)
-		xskq_discard_id(xs->umem->fq);
+		xskq_discard_addr(xs->umem->fq);
 	else
 		xs->rx_dropped++;
 
@@ -95,10 +98,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
-	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
+	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 
-	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));
+	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
 
 	sock_wfree(skb);
 }
@@ -123,14 +126,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 
 	while (xskq_peek_desc(xs->tx, &desc)) {
 		char *buffer;
-		u32 id, len;
+		u64 addr;
+		u32 len;
 
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
 			goto out;
 		}
 
-		if (xskq_reserve_id(xs->umem->cq)) {
+		if (xskq_reserve_addr(xs->umem->cq)) {
 			err = -EAGAIN;
 			goto out;
 		}
@@ -153,8 +157,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		}
 
 		skb_put(skb, len);
-		id = desc.idx;
-		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
+		addr = desc.addr;
+		buffer = xdp_umem_get_data(xs->umem, addr);
 		err = skb_store_bits(skb, 0, buffer, len);
 		if (unlikely(err)) {
 			kfree_skb(skb);
@@ -164,7 +168,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		skb->dev = xs->dev;
 		skb->priority = sk->sk_priority;
 		skb->mark = sk->sk_mark;
-		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
+		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
 		skb->destructor = xsk_destruct_skb;
 
 		err = dev_direct_xmit(skb, xs->queue_id);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index ebe85e59507e..6c32e92e98fc 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -17,7 +17,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
 
 static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 {
-	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64);
 }
 
 static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index b5924e7aeb2b..337e5ad3b10e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -27,7 +27,7 @@ struct xdp_rxtx_ring {
 /* Used for the fill and completion queues for buffers */
 struct xdp_umem_ring {
 	struct xdp_ring ptrs;
-	u32 desc[0] ____cacheline_aligned_in_smp;
+	u64 desc[0] ____cacheline_aligned_in_smp;
 };
 
 struct xsk_queue {
@@ -76,24 +76,25 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 
 /* UMEM queue */
 
-static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
+static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
-	if (unlikely(idx >= q->umem_props.nframes)) {
+	if (addr >= q->umem_props.size) {
 		q->invalid_descs++;
 		return false;
 	}
+
 	return true;
 }
 
-static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		*id = READ_ONCE(ring->desc[idx]);
-		if (xskq_is_valid_id(q, *id))
-			return id;
+		*addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
+		if (xskq_is_valid_addr(q, *addr))
+			return addr;
 
 		q->cons_tail++;
 	}
@@ -101,7 +102,7 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
 	return NULL;
 }
 
-static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
 {
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
@@ -111,19 +112,19 @@ static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
 		smp_rmb();
 	}
 
-	return xskq_validate_id(q, id);
+	return xskq_validate_addr(q, addr);
 }
 
-static inline void xskq_discard_id(struct xsk_queue *q)
+static inline void xskq_discard_addr(struct xsk_queue *q)
 {
 	q->cons_tail++;
 }
 
-static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
+static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
-	ring->desc[q->prod_tail++ & q->ring_mask] = id;
+	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
 
 	/* Order producer and data */
 	smp_wmb();
@@ -132,7 +133,7 @@ static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
 	return 0;
 }
 
-static inline int xskq_reserve_id(struct xsk_queue *q)
+static inline int xskq_reserve_addr(struct xsk_queue *q)
 {
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
 		return -ENOSPC;
@@ -145,16 +146,11 @@ static inline int xskq_reserve_id(struct xsk_queue *q)
 
 static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
 {
-	u32 buff_len;
-
-	if (unlikely(d->idx >= q->umem_props.nframes)) {
-		q->invalid_descs++;
+	if (!xskq_is_valid_addr(q, d->addr))
 		return false;
-	}
 
-	buff_len = q->umem_props.frame_size;
-	if (unlikely(d->len > buff_len || d->len == 0 ||
-		     d->offset > buff_len || d->offset + d->len > buff_len)) {
+	if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
+	    (d->addr & q->umem_props.chunk_mask)) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -199,7 +195,7 @@ static inline void xskq_discard_desc(struct xsk_queue *q)
 }
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
-					  u32 id, u32 len, u16 offset)
+					  u64 addr, u32 len)
 {
 	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 	unsigned int idx;
@@ -208,9 +204,8 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
 		return -ENOSPC;
 
 	idx = (q->prod_head++) & q->ring_mask;
-	ring->desc[idx].idx = id;
+	ring->desc[idx].addr = addr;
 	ring->desc[idx].len = len;
-	ring->desc[idx].offset = offset;
 
 	return 0;
 }
-- 
cgit v1.2.3-55-g7522


From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001
From: Björn Töpel
Date: Mon, 4 Jun 2018 14:05:55 +0200
Subject: xsk: add zero-copy support for Rx

Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and
wireup ndo_bpf call in bind.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h      |  6 +++
 include/uapi/linux/if_xdp.h |  4 +-
 net/xdp/xdp_umem.c          | 77 ++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  3 ++
 net/xdp/xsk.c               | 96 +++++++++++++++++++++++++++++++++++----------
 5 files changed, 165 insertions(+), 21 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index caf343a7e224..d93d3aac3fc9 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -22,6 +22,7 @@ struct xdp_umem_props {
 
 struct xdp_umem_page {
 	void *addr;
+	dma_addr_t dma;
 };
 
 struct xdp_umem {
@@ -38,6 +39,9 @@ struct xdp_umem {
 	struct work_struct work;
 	struct page **pgs;
 	u32 npgs;
+	struct net_device *dev;
+	u16 queue_id;
+	bool zc;
 };
 
 struct xdp_sock {
@@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
+void xsk_umem_discard_addr(struct xdp_umem *umem);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e411d6f9ac65..1fa0e977ea8d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -13,7 +13,9 @@
 #include <linux/types.h>
 
 /* Options for the sxdp_flags field */
-#define XDP_SHARED_UMEM 1
+#define XDP_SHARED_UMEM	(1 << 0)
+#define XDP_COPY	(1 << 1) /* Force copy-mode */
+#define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index aca826011f6c..f729d79b8d91 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -17,6 +17,81 @@
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags)
+{
+	bool force_zc, force_copy;
+	struct netdev_bpf bpf;
+	int err;
+
+	force_zc = flags & XDP_ZEROCOPY;
+	force_copy = flags & XDP_COPY;
+
+	if (force_zc && force_copy)
+		return -EINVAL;
+
+	if (force_copy)
+		return 0;
+
+	dev_hold(dev);
+
+	if (dev->netdev_ops->ndo_bpf) {
+		bpf.command = XDP_QUERY_XSK_UMEM;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? -ENOTSUPP : 0;
+		}
+
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = umem;
+		bpf.xsk.queue_id = queue_id;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? err : 0; /* fail or fallback */
+		}
+
+		umem->dev = dev;
+		umem->queue_id = queue_id;
+		umem->zc = true;
+		return 0;
+	}
+
+	dev_put(dev);
+	return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+}
+
+void xdp_umem_clear_dev(struct xdp_umem *umem)
+{
+	struct netdev_bpf bpf;
+	int err;
+
+	if (umem->dev) {
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = NULL;
+		bpf.xsk.queue_id = umem->queue_id;
+
+		rtnl_lock();
+		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
+		rtnl_unlock();
+
+		if (err)
+			WARN(1, "failed to disable umem!\n");
+
+		dev_put(umem->dev);
+		umem->dev = NULL;
+	}
+}
+
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
 	unsigned int i;
@@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
 	struct task_struct *task;
 	struct mm_struct *mm;
 
+	xdp_umem_clear_dev(umem);
+
 	if (umem->fq) {
 		xskq_destroy(umem->fq);
 		umem->fq = NULL;
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 40e8fa4a92af..674508a32a4d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags);
+void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4688c750df1d..ab64bd8260ea 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 {
-	return !!xs->rx;
+	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
+		READ_ONCE(xs->umem->fq);
 }
 
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+{
+	return xskq_peek_addr(umem->fq, addr);
+}
+EXPORT_SYMBOL(xsk_umem_peek_addr);
+
+void xsk_umem_discard_addr(struct xdp_umem *umem)
+{
+	xskq_discard_addr(umem->fq);
+}
+EXPORT_SYMBOL(xsk_umem_discard_addr);
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	u32 len = xdp->data_end - xdp->data;
 	void *buffer;
 	u64 addr;
 	int err;
 
-	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
-		return -EINVAL;
-
 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
 	    len > xs->umem->chunk_size_nohr) {
 		xs->rx_dropped++;
@@ -60,25 +69,41 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	buffer = xdp_umem_get_data(xs->umem, addr);
 	memcpy(buffer, xdp->data, len);
 	err = xskq_produce_batch_desc(xs->rx, addr, len);
-	if (!err)
+	if (!err) {
 		xskq_discard_addr(xs->umem->fq);
-	else
-		xs->rx_dropped++;
+		xdp_return_buff(xdp);
+		return 0;
+	}
 
+	xs->rx_dropped++;
 	return err;
 }
 
-int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	int err;
+	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
 
-	err = __xsk_rcv(xs, xdp);
-	if (likely(!err))
+	if (err) {
 		xdp_return_buff(xdp);
+		xs->rx_dropped++;
+	}
 
 	return err;
 }
 
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len;
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+		return -EINVAL;
+
+	len = xdp->data_end - xdp->data;
+
+	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
+		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+}
+
 void xsk_flush(struct xdp_sock *xs)
 {
 	xskq_produce_flush_desc(xs->rx);
@@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs)
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
+	u32 len = xdp->data_end - xdp->data;
+	void *buffer;
+	u64 addr;
 	int err;
 
-	err = __xsk_rcv(xs, xdp);
-	if (!err)
+	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	    len > xs->umem->chunk_size_nohr) {
+		xs->rx_dropped++;
+		return -ENOSPC;
+	}
+
+	addr += xs->umem->headroom;
+
+	buffer = xdp_umem_get_data(xs->umem, addr);
+	memcpy(buffer, xdp->data, len);
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
+	if (!err) {
+		xskq_discard_addr(xs->umem->fq);
 		xsk_flush(xs);
+		return 0;
+	}
 
+	xs->rx_dropped++;
 	return err;
 }
 
@@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 	struct net_device *dev;
+	u32 flags, qid;
 	int err = 0;
 
 	if (addr_len < sizeof(struct sockaddr_xdp))
@@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_unlock;
 	}
 
-	if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
-	    (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
+	qid = sxdp->sxdp_queue_id;
+
+	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
+	    (xs->tx && qid >= dev->real_num_tx_queues)) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
 
-	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
+	flags = sxdp->sxdp_flags;
+
+	if (flags & XDP_SHARED_UMEM) {
 		struct xdp_sock *umem_xs;
 		struct socket *sock;
 
+		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+			/* Cannot specify flags for shared sockets. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
 		if (xs->umem) {
 			/* We have already our own. */
 			err = -EINVAL;
@@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 			err = -EBADF;
 			sockfd_put(sock);
 			goto out_unlock;
-		} else if (umem_xs->dev != dev ||
-			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
+		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
 			err = -EINVAL;
 			sockfd_put(sock);
 			goto out_unlock;
@@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		/* This xsk has its own umem. */
 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
 		xskq_set_umem(xs->umem->cq, &xs->umem->props);
+
+		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
+		if (err)
+			goto out_unlock;
 	}
 
 	xs->dev = dev;
-- 
cgit v1.2.3-55-g7522