diff options
author | David S. Miller | 2018-12-19 20:21:45 +0100 |
---|---|---|
committer | David S. Miller | 2018-12-19 20:21:45 +0100 |
commit | 4a54877ee767fe70a6966352c788fc5f405aa3c6 (patch) | |
tree | 6c37280ad8ba6e1d028962bfb2e3ace1f590d3fc /include/linux | |
parent | Merge branch 'dpaa2-eth-add-QBMAN-statistics' (diff) | |
parent | net: switch secpath to use skb extension infrastructure (diff) | |
download | kernel-qcow2-linux-4a54877ee767fe70a6966352c788fc5f405aa3c6.tar.gz kernel-qcow2-linux-4a54877ee767fe70a6966352c788fc5f405aa3c6.tar.xz kernel-qcow2-linux-4a54877ee767fe70a6966352c788fc5f405aa3c6.zip |
Merge branch 'sk_buff-add-extension-infrastructure'
Florian Westphal says:
====================
sk_buff: add extension infrastructure
TL;DR:
- objdiff shows no change if CONFIG_XFRM=n && BR_NETFILTER=n
- small size reduction when one or both options are set
- no changes in ipsec performance
Changes since v1:
- Allocate entire extension space from a kmem_cache.
- Avoid atomic_dec_and_test operation on skb_ext_put() for refcnt == 1 case.
(similar to kfree_skbmem() fclone_ref use).
This adds an optional extension infrastructure, with ispec (xfrm) and
bridge netfilter as first users.
The third (future) user is Multipath TCP which is still out-of-tree.
MPTCP needs to map logical mptcp sequence numbers to the tcp sequence
numbers used by individual subflows.
This DSS mapping is read/written from tcp option space on receive and
written to tcp option space on transmitted tcp packets that are part of
and MPTCP connection.
Extending skb_shared_info or adding a private data field to skb fclones
doesn't work for incoming skb, so a different DSS propagation method would
be required for the receive side.
mptcp has same requirements as secpath/bridge netfilter:
1. extension memory is released when the sk_buff is free'd.
2. data is shared after cloning an skb (clone inherits extension)
3. adding extension to an skb will COW the extension buffer if needed.
Two new members are added to sk_buff:
1. 'active_extensions' byte (filling a hole), telling which extensions
are available for this skb.
This has two purposes.
a) avoids the need to initialize the pointer.
b) allows to "delete" an extension by clearing its bit
value in ->active_extensions.
While it would be possible to store the active_extensions byte
in the extension struct instead of sk_buff, there is one problem
with this:
When an extension has to be disabled, we can always clear the
bit in skb->active_extensions. But in case it would be stored in the
extension buffer itself, we might have to COW it first, if
we are dealing with a cloned skb. On kmalloc failure we would
be unable to turn an extension off.
2. extension pointer, located at the end of the sk_buff.
If the active_extensions byte is 0, the pointer is undefined,
it is not initialized on skb allocation.
This adds extra code to skb clone and free paths (to deal with
refcount/free of extension area) but this replaces similar code that
manages skb->nf_bridge and skb->sp structs in the followup patches of
the series.
It is possible to add support for extensions that are not preseved on
clones/copies:
1. define a bitmask of all extensions that need copy/cow on clone
2. change __skb_ext_copy() to check
->active_extensions & SKB_EXT_PRESERVE_ON_CLONE
3. set clone->active_extensions to 0 if test is false.
This isn't done here because all extensions that get added here
need the copy/cow semantics.
Last patch converts skb->sp, secpath information gets stored as
new SKB_EXT_SEC_PATH, so the 'sp' pointer is removed from skbuff.
Extra code added to skb clone and free paths (to deal with refcount/free
of extension area) replaces the existing code that does the same for
skb->nf_bridge and skb->secpath.
I don't see any other in-tree users that could benefit from this
infrastructure, it doesn't make sense to add an extension just for the sake
of a single flag bit (like skb->nf_trace).
Adding a new extension is a good fit if all of the following are true:
1. Data is related to the skb/packet aggregate
2. Data should be freed when the skb is free'd
3. Data is not going to be relevant/needed in normal case (udp, tcp,
forwarding workloads, ...)
4. There are no fancy action(s) needed on clone/free, such as callbacks
into kernel modules.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/netfilter_bridge.h | 33 | ||||
-rw-r--r-- | include/linux/skbuff.h | 152 |
2 files changed, 146 insertions, 39 deletions
diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index fa0686500970..5f2614d02e03 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -17,43 +17,58 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb) skb_dst_drop(skb); } +static inline struct nf_bridge_info * +nf_bridge_info_get(const struct sk_buff *skb) +{ + return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); +} + +static inline bool nf_bridge_info_exists(const struct sk_buff *skb) +{ + return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); +} + static inline int nf_bridge_get_physinif(const struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (skb->nf_bridge == NULL) + if (!nf_bridge) return 0; - nf_bridge = skb->nf_bridge; return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (skb->nf_bridge == NULL) + if (!nf_bridge) return 0; - nf_bridge = skb->nf_bridge; return nf_bridge->physoutdev ? nf_bridge->physoutdev->ifindex : 0; } static inline struct net_device * nf_bridge_get_physindev(const struct sk_buff *skb) { - return skb->nf_bridge ? skb->nf_bridge->physindev : NULL; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + return nf_bridge ? nf_bridge->physindev : NULL; } static inline struct net_device * nf_bridge_get_physoutdev(const struct sk_buff *skb) { - return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + return nf_bridge ? nf_bridge->physoutdev : NULL; } static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { - return skb->nf_bridge && skb->nf_bridge->in_prerouting; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + return nf_bridge && nf_bridge->in_prerouting; } #else #define br_drop_fake_rtable(skb) do { } while (0) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b1831a5ca173..3f741b04e55d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -245,6 +245,7 @@ struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; +struct skb_ext; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -254,7 +255,6 @@ struct nf_conntrack { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { - refcount_t use; enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, @@ -636,6 +636,7 @@ typedef unsigned char *sk_buff_data_t; * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -665,6 +666,7 @@ typedef unsigned char *sk_buff_data_t; * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c + * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { @@ -712,15 +714,9 @@ struct sk_buff { struct list_head tcp_tsorted_anchor; }; -#ifdef CONFIG_XFRM - struct sec_path *sp; -#endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge; -#endif unsigned int len, data_len; __u16 mac_len, @@ -747,7 +743,9 @@ struct sk_buff { head_frag:1, xmit_more:1, pfmemalloc:1; - +#ifdef CONFIG_SKB_EXTENSIONS + __u8 active_extensions; +#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -869,6 +867,11 @@ struct sk_buff { *data; unsigned int truesize; refcount_t users; + +#ifdef CONFIG_SKB_EXTENSIONS + /* only useable after checking ->active_extensions != 0 */ + struct skb_ext *extensions; +#endif }; #ifdef __KERNEL__ @@ -3896,18 +3899,108 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif + +#ifdef CONFIG_SKB_EXTENSIONS +enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) + SKB_EXT_BRIDGE_NF, +#endif +#ifdef CONFIG_XFRM + SKB_EXT_SEC_PATH, +#endif + SKB_EXT_NUM, /* must be last */ +}; + +/** + * struct skb_ext - sk_buff extensions + * @refcnt: 1 on allocation, deallocated on 0 + * @offset: offset to add to @data to obtain extension address + * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units + * @data: start of extension data, variable sized + * + * Note: offsets/lengths are stored in chunks of 8 bytes, this allows + * to use 'u8' types while allowing up to 2kb worth of extension data. + */ +struct skb_ext { + refcount_t refcnt; + u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ + u8 chunks; /* same */ + char data[0] __aligned(8); +}; + +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_put(struct skb_ext *ext); + +static inline void skb_ext_put(struct sk_buff *skb) +{ + if (skb->active_extensions) + __skb_ext_put(skb->extensions); +} + +static inline void skb_ext_get(struct sk_buff *skb) +{ + if (skb->active_extensions) { + struct skb_ext *ext = skb->extensions; + + if (ext) + refcount_inc(&ext->refcnt); + } +} + +static inline void __skb_ext_copy(struct sk_buff *dst, + const struct sk_buff *src) +{ + dst->active_extensions = src->active_extensions; + + if (src->active_extensions) { + struct skb_ext *ext = src->extensions; + + refcount_inc(&ext->refcnt); + dst->extensions = ext; + } +} + +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) +{ + skb_ext_put(dst); + __skb_ext_copy(dst, src); +} + +static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) +{ + return !!ext->offset[i]; +} + +static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) { - if (nf_bridge && refcount_dec_and_test(&nf_bridge->use)) - kfree(nf_bridge); + return skb->active_extensions & (1 << id); } -static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge) + +static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { - if (nf_bridge) - refcount_inc(&nf_bridge->use); + if (skb_ext_exist(skb, id)) + __skb_ext_del(skb, id); } -#endif /* CONFIG_BRIDGE_NETFILTER */ + +static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) { + struct skb_ext *ext = skb->extensions; + + return (void *)ext + (ext->offset[id] << 3); + } + + return NULL; +} +#else +static inline void skb_ext_put(struct sk_buff *skb) {} +static inline void skb_ext_get(struct sk_buff *skb) {} +static inline void skb_ext_del(struct sk_buff *skb, int unused) {} +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} +#endif /* CONFIG_SKB_EXTENSIONS */ + static inline void nf_reset(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) @@ -3915,8 +4008,7 @@ static inline void nf_reset(struct sk_buff *skb) skb->_nfct = 0; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); #endif } @@ -3934,7 +4026,7 @@ static inline void ipvs_reset(struct sk_buff *skb) #endif } -/* Note: This doesn't put any conntrack and bridge info in dst. */ +/* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { @@ -3942,10 +4034,6 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - dst->nf_bridge = src->nf_bridge; - nf_bridge_get(src->nf_bridge); -#endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; @@ -3957,9 +4045,6 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(dst->nf_bridge); -#endif __nf_copy(dst, src, true); } @@ -3981,12 +4066,19 @@ static inline void skb_init_secmark(struct sk_buff *skb) { } #endif +static inline int secpath_exists(const struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + return skb_ext_exist(skb, SKB_EXT_SEC_PATH); +#else + return 0; +#endif +} + static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && -#if IS_ENABLED(CONFIG_XFRM) - !skb->sp && -#endif + !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); @@ -4032,10 +4124,10 @@ static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) return skb->dst_pending_confirm != 0; } -static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp; + return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif |