diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile | 3 | ||||
-rw-r--r-- | net/core/datagram.c | 55 | ||||
-rw-r--r-- | net/core/dev.c | 21 | ||||
-rw-r--r-- | net/core/dst.c | 6 | ||||
-rw-r--r-- | net/core/fib_notifier.c | 164 | ||||
-rw-r--r-- | net/core/fib_rules.c | 69 | ||||
-rw-r--r-- | net/core/filter.c | 280 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 59 | ||||
-rw-r--r-- | net/core/lwtunnel.c | 28 | ||||
-rw-r--r-- | net/core/neighbour.c | 10 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 222 | ||||
-rw-r--r-- | net/core/net-traces.c | 1 | ||||
-rw-r--r-- | net/core/net_namespace.c | 5 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 249 | ||||
-rw-r--r-- | net/core/skbuff.c | 364 | ||||
-rw-r--r-- | net/core/sock.c | 47 |
16 files changed, 1197 insertions, 386 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index d501c4278015..56d771a887b6 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,8 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o tso.o sock_reuseport.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ + fib_notifier.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3..2f3277945d35 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,27 +573,12 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -/** - * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter - * @skb: buffer to copy - * @from: the source to copy from - * - * The function will first copy up to headlen, and then pin the userspace - * pages and build frags through them. - * - * Returns 0, -EFAULT or -EMSGSIZE. - */ -int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) { - int len = iov_iter_count(from); - int copy = min_t(int, skb_headlen(skb), len); - int frag = 0; + int frag = skb_shinfo(skb)->nr_frags; - /* copy up to skb headlen */ - if (skb_copy_datagram_from_iter(skb, 0, from, copy)) - return -EFAULT; - - while (iov_iter_count(from)) { + while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; size_t start; ssize_t copied; @@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - copied = iov_iter_get_pages(from, pages, ~0U, + copied = iov_iter_get_pages(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; iov_iter_advance(from, copied); + length -= copied; truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - refcount_add(truesize, &skb->sk->sk_wmem_alloc); + if (sk && sk->sk_type == SOCK_STREAM) { + sk->sk_wmem_queued += truesize; + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } while (copied) { int size = min_t(int, copied, PAGE_SIZE - start); skb_fill_page_desc(skb, frag++, pages[n], start, size); @@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } return 0; } +EXPORT_SYMBOL(__zerocopy_sg_from_iter); + +/** + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter + * @skb: buffer to copy + * @from: the source to copy from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + */ +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; + + return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); +} EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, diff --git a/net/core/dev.c b/net/core/dev.c index 8ea6b4b42611..40b28e417072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@ -2731,8 +2731,7 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + return skb->ip_summed != CHECKSUM_PARTIAL; return skb->ip_summed == CHECKSUM_NONE; } @@ -3920,7 +3919,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3941,13 +3940,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) kfree_skb(skb); } } +EXPORT_SYMBOL_GPL(generic_xdp_tx); static struct static_key generic_xdp_needed __read_mostly; -static int do_xdp_generic(struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); int err; @@ -3972,6 +3970,7 @@ out_redir: kfree_skb(skb); return XDP_DROP; } +EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { @@ -3982,7 +3981,8 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4412,7 +4412,7 @@ skip_classify: } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@ -4503,7 +4503,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); if (ret != XDP_PASS) { rcu_read_unlock(); diff --git a/net/core/dst.c b/net/core/dst.c index 00aa972ad1a1..d6ead757c258 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -55,7 +55,7 @@ const struct dst_metrics dst_default_metrics = { * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - .refcnt = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -213,7 +213,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - atomic_set(&p->refcnt, 1); + refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; @@ -225,7 +225,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { - if (atomic_dec_and_test(&old_p->refcnt)) + if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c new file mode 100644 index 000000000000..292aab83702f --- /dev/null +++ b/net/core/fib_notifier.c @@ -0,0 +1,164 @@ +#include <linux/rtnetlink.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <net/net_namespace.h> +#include <net/fib_notifier.h> + +static ATOMIC_NOTIFIER_HEAD(fib_chain); + +int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return nb->notifier_call(nb, event_type, info); +} +EXPORT_SYMBOL(call_fib_notifier); + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return atomic_notifier_call_chain(&fib_chain, event_type, info); +} +EXPORT_SYMBOL(call_fib_notifiers); + +static unsigned int fib_seq_sum(void) +{ + struct fib_notifier_ops *ops; + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) { + list_for_each_entry(ops, &net->fib_notifier_ops, list) + fib_seq += ops->fib_seq_read(net); + } + rtnl_unlock(); + + return fib_seq; +} + +static int fib_net_dump(struct net *net, struct notifier_block *nb) +{ + struct fib_notifier_ops *ops; + + list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { + int err = ops->fib_dump(net, nb); + + if (err) + return err; + } + + return 0; +} + +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + int err; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + rcu_read_lock(); + for_each_net_rcu(net) { + err = fib_net_dump(net, nb); + if (err) + goto err_fib_net_dump; + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; + +err_fib_net_dump: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); + +static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, + struct net *net) +{ + struct fib_notifier_ops *o; + + list_for_each_entry(o, &net->fib_notifier_ops, list) + if (ops->family == o->family) + return -EEXIST; + list_add_tail_rcu(&ops->list, &net->fib_notifier_ops); + return 0; +} + +struct fib_notifier_ops * +fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net) +{ + struct fib_notifier_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (!ops) + return ERR_PTR(-ENOMEM); + + err = __fib_notifier_ops_register(ops, net); + if (err) + goto err_register; + + return ops; + +err_register: + kfree(ops); + return ERR_PTR(err); +} +EXPORT_SYMBOL(fib_notifier_ops_register); + +void fib_notifier_ops_unregister(struct fib_notifier_ops *ops) +{ + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); +} +EXPORT_SYMBOL(fib_notifier_ops_unregister); + +static int __net_init fib_notifier_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->fib_notifier_ops); + return 0; +} + +static struct pernet_operations fib_notifier_net_ops = { + .init = fib_notifier_net_init, +}; + +static int __init fib_notifier_init(void) +{ + return register_pernet_subsys(&fib_notifier_net_ops); +} + +subsys_initcall(fib_notifier_init); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index fdcb1bcd2afa..9a6d97c1d810 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -299,6 +299,67 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); +static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule, int family) +{ + struct fib_rule_notifier_info info = { + .info.family = family, + .rule = rule, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_fib_rule_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule, + struct fib_rules_ops *ops) +{ + struct fib_rule_notifier_info info = { + .info.family = ops->family, + .rule = rule, + }; + + ops->fib_rules_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +/* Called with rcu_read_lock() */ +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +{ + struct fib_rules_ops *ops; + struct fib_rule *rule; + + ops = lookup_rules_ops(net, family); + if (!ops) + return -EAFNOSUPPORT; + list_for_each_entry_rcu(rule, &ops->rules_list, list) + call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, + family); + rules_ops_put(ops); + + return 0; +} +EXPORT_SYMBOL_GPL(fib_rules_dump); + +unsigned int fib_rules_seq_read(struct net *net, int family) +{ + unsigned int fib_rules_seq; + struct fib_rules_ops *ops; + + ASSERT_RTNL(); + + ops = lookup_rules_ops(net, family); + if (!ops) + return 0; + fib_rules_seq = ops->fib_rules_seq; + rules_ops_put(ops); + + return fib_rules_seq; +} +EXPORT_SYMBOL_GPL(fib_rules_seq_read); + static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rules_ops *ops) { @@ -548,6 +609,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -687,6 +749,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); @@ -963,9 +1026,9 @@ static struct pernet_operations fib_rules_net_ops = { static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) diff --git a/net/core/filter.c b/net/core/filter.c index 7e9708653c6f..fa2115695037 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -514,14 +514,27 @@ do_pass: break; } - /* Convert JEQ into JNE when 'jump_true' is next insn. */ - if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { - insn->code = BPF_JMP | BPF_JNE | bpf_src; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + target = i + fp->jf + 1; BPF_EMIT_JMP; break; } - +jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; @@ -1845,6 +1858,45 @@ static const struct bpf_func_proto bpf_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return SK_ABORTED; + + ri->ifindex = key; + ri->flags = flags; + ri->map = map; + + return SK_REDIRECT; +} + +struct sock *do_sk_redirect_map(void) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct sock *sk = NULL; + + if (ri->map) { + sk = __sock_map_lookup_elem(ri->map, ri->ifindex); + + ri->ifindex = 0; + ri->map = NULL; + /* we do not clear flags for future lookup */ + } + + return sk; +} + +static const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2483,14 +2535,16 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_map *map = ri->map; u32 index = ri->ifindex; struct net_device *fwd; - int err = -EINVAL; + int err; ri->ifindex = 0; ri->map = NULL; fwd = __dev_map_lookup_elem(map, index); - if (!fwd) + if (!fwd) { + err = -EINVAL; goto out; + } if (ri->map_to_flush && (ri->map_to_flush != map)) xdp_do_flush_map(); @@ -2500,7 +2554,7 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->map_to_flush = map; out: - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); return err; } @@ -2509,21 +2563,24 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, { struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct net_device *fwd; + u32 index = ri->ifindex; + int err; if (ri->map) return xdp_do_redirect_map(dev, xdp, xdp_prog); - fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; - ri->map = NULL; if (unlikely(!fwd)) { - bpf_warn_invalid_xdp_redirect(ri->ifindex); - return -EINVAL; + bpf_warn_invalid_xdp_redirect(index); + err = -EINVAL; + goto out; } - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); - - return __bpf_tx_xdp(fwd, NULL, xdp, 0); + err = __bpf_tx_xdp(fwd, NULL, xdp, 0); +out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -2531,11 +2588,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned int len; + u32 index = ri->ifindex; - dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + dev = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; if (unlikely(!dev)) { - bpf_warn_invalid_xdp_redirect(ri->ifindex); + bpf_warn_invalid_xdp_redirect(index); goto err; } @@ -3214,6 +3272,32 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_sock_map_update: + return &bpf_sock_map_update_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + case BPF_FUNC_sk_redirect_map: + return &bpf_sk_redirect_map_proto; default: return bpf_base_func_proto(func_id); } @@ -3271,6 +3355,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): + case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) @@ -3299,6 +3387,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -3320,6 +3409,7 @@ static bool lwt_is_valid_access(int off, int size, { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -3370,8 +3460,8 @@ static bool sock_filter_is_valid_access(int off, int size, return true; } -static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, - const struct bpf_prog *prog) +static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; @@ -3398,7 +3488,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); - *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ @@ -3409,6 +3499,12 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) @@ -3433,6 +3529,8 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; } return bpf_skb_is_valid_access(off, size, type, info); @@ -3510,6 +3608,41 @@ static bool sock_ops_is_valid_access(int off, int size, return __is_valid_sock_ops_access(off, size); } +static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); +} + +static bool sk_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + return false; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -3675,6 +3808,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else @@ -3690,9 +3824,110 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else + *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; + case offsetof(struct __sk_buff, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_family, + 2, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_daddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_rcv_saddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip6[0]) ... + offsetof(struct __sk_buff, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, remote_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, local_ip6[0]) ... + offsetof(struct __sk_buff, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, local_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_dport, + 2, target_size)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct __sk_buff, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_num, 2, target_size)); + break; } return insn - insn_buf; @@ -3977,6 +4212,13 @@ const struct bpf_verifier_ops sock_ops_prog_ops = { .convert_ctx_access = sock_ops_convert_ctx_access, }; +const struct bpf_verifier_ops sk_skb_prog_ops = { + .get_func_proto = sk_skb_func_proto, + .is_valid_access = sk_skb_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, + .gen_prologue = sk_skb_prologue, +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index fc5fc4594c90..e2eaa1ff948d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -4,6 +4,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> +#include <net/dsa.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> @@ -440,6 +441,19 @@ bool __skb_flow_dissect(const struct sk_buff *skb, skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); +#if IS_ENABLED(CONFIG_NET_DSA) + if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) { + const struct dsa_device_ops *ops; + int offset; + + ops = skb->dev->dsa_ptr->tag_ops; + if (ops->flow_dissect && + !ops->flow_dissect(skb, &proto, &offset)) { + hlen -= offset; + nhoff += offset; + } + } +#endif } /* It is ensured by skb_flow_dissector_init() that control key will @@ -998,51 +1012,6 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) -{ - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - memcpy(&keys.addrs.v6addrs.src, &fl6->saddr, - sizeof(keys.addrs.v6addrs.src)); - memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr, - sizeof(keys.addrs.v6addrs.dst)); - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys.ports.src = fl6->fl6_sport; - keys.ports.dst = fl6->fl6_dport; - keys.keyid.keyid = fl6->fl6_gre_key; - keys.tags.flow_label = (__force u32)fl6->flowlabel; - keys.basic.ip_proto = fl6->flowi6_proto; - - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), - flow_keys_have_l4(&keys)); - - return skb->hash; -} -EXPORT_SYMBOL(__skb_get_hash_flowi6); - -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) -{ - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - keys.addrs.v4addrs.src = fl4->saddr; - keys.addrs.v4addrs.dst = fl4->daddr; - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys.ports.src = fl4->fl4_sport; - keys.ports.dst = fl4->fl4_dport; - keys.keyid.keyid = fl4->fl4_gre_key; - keys.basic.ip_proto = fl4->flowi4_proto; - - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), - flow_keys_have_l4(&keys)); - - return skb->hash; -} -EXPORT_SYMBOL(__skb_get_hash_flowi4); - u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index d9cb3532f1dd..0b171756453c 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -44,6 +44,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; + case LWTUNNEL_ENCAP_SEG6_LOCAL: + return "SEG6LOCAL"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: @@ -65,7 +67,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) return lws; } -EXPORT_SYMBOL(lwtunnel_state_alloc); +EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; @@ -80,7 +82,7 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, &lwtun_encaps[num], NULL, ops) ? 0 : -1; } -EXPORT_SYMBOL(lwtunnel_encap_add_ops); +EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, unsigned int encap_type) @@ -99,7 +101,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, return ret; } -EXPORT_SYMBOL(lwtunnel_encap_del_ops); +EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); int lwtunnel_build_state(u16 encap_type, struct nlattr *encap, unsigned int family, @@ -138,7 +140,7 @@ int lwtunnel_build_state(u16 encap_type, return ret; } -EXPORT_SYMBOL(lwtunnel_build_state); +EXPORT_SYMBOL_GPL(lwtunnel_build_state); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { @@ -175,7 +177,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) return ret; } -EXPORT_SYMBOL(lwtunnel_valid_encap_type); +EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, struct netlink_ext_ack *extack) @@ -205,7 +207,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, return 0; } -EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr); +EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { @@ -219,7 +221,7 @@ void lwtstate_free(struct lwtunnel_state *lws) } module_put(ops->owner); } -EXPORT_SYMBOL(lwtstate_free); +EXPORT_SYMBOL_GPL(lwtstate_free); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { @@ -259,7 +261,7 @@ nla_put_failure: return (ret == -EOPNOTSUPP ? 0 : ret); } -EXPORT_SYMBOL(lwtunnel_fill_encap); +EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { @@ -281,7 +283,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) return ret; } -EXPORT_SYMBOL(lwtunnel_get_encap_size); +EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { @@ -309,7 +311,7 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) return ret; } -EXPORT_SYMBOL(lwtunnel_cmp_encap); +EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -343,7 +345,7 @@ drop: return ret; } -EXPORT_SYMBOL(lwtunnel_output); +EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { @@ -378,7 +380,7 @@ drop: return ret; } -EXPORT_SYMBOL(lwtunnel_xmit); +EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { @@ -412,4 +414,4 @@ drop: return ret; } -EXPORT_SYMBOL(lwtunnel_input); +EXPORT_SYMBOL_GPL(lwtunnel_input); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d0713627deb6..16a1a4c4eb57 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3261,13 +3261,13 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - NULL); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL); + 0); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b4f9922b6f23..927a6dcbad96 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -97,7 +97,8 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, return restart_syscall(); if (dev_isalive(netdev)) { - if ((ret = (*set)(netdev, new)) == 0) + ret = (*set)(netdev, new); + if (ret == 0) ret = len; } rtnl_unlock(); @@ -160,6 +161,7 @@ static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); + if (dev_isalive(ndev)) return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); return -EINVAL; @@ -170,7 +172,7 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier) { if (!netif_running(dev)) return -EINVAL; - return dev_change_carrier(dev, (bool) new_carrier); + return dev_change_carrier(dev, (bool)new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, @@ -183,9 +185,10 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - if (netif_running(netdev)) { + + if (netif_running(netdev)) return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); - } + return -EINVAL; } static DEVICE_ATTR_RW(carrier); @@ -290,6 +293,7 @@ static ssize_t carrier_changes_show(struct device *dev, char *buf) { struct net_device *netdev = to_net_dev(dev); + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_changes)); } @@ -299,7 +303,7 @@ static DEVICE_ATTR_RO(carrier_changes); static int change_mtu(struct net_device *dev, unsigned long new_mtu) { - return dev_set_mtu(dev, (int) new_mtu); + return dev_set_mtu(dev, (int)new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, @@ -311,7 +315,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int) new_flags); + return dev_change_flags(dev, (unsigned int)new_flags); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, @@ -362,8 +366,8 @@ static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) } static ssize_t gro_flush_timeout_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) + struct device_attribute *attr, + const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -412,7 +416,7 @@ static DEVICE_ATTR_RW(ifalias); static int change_group(struct net_device *dev, unsigned long new_group) { - dev_set_group(dev, (int) new_group); + dev_set_group(dev, (int)new_group); return 0; } @@ -426,7 +430,7 @@ static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); static int change_proto_down(struct net_device *dev, unsigned long proto_down) { - return dev_change_proto_down(dev, (bool) proto_down); + return dev_change_proto_down(dev, (bool)proto_down); } static ssize_t proto_down_store(struct device *dev, @@ -508,7 +512,7 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); -static struct attribute *net_class_attrs[] = { +static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, &dev_attr_dev_id.attr, @@ -549,14 +553,14 @@ static ssize_t netstat_show(const struct device *d, ssize_t ret = -EINVAL; WARN_ON(offset > sizeof(struct rtnl_link_stats64) || - offset % sizeof(u64) != 0); + offset % sizeof(u64) != 0); read_lock(&dev_base_lock); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset)); + ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } read_unlock(&dev_base_lock); return ret; @@ -565,7 +569,7 @@ static ssize_t netstat_show(const struct device *d, /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ static ssize_t name##_show(struct device *d, \ - struct device_attribute *attr, char *buf) \ + struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ @@ -597,7 +601,7 @@ NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); NETSTAT_ENTRY(rx_nohandler); -static struct attribute *netstat_attrs[] = { +static struct attribute *netstat_attrs[] __ro_after_init = { &dev_attr_rx_packets.attr, &dev_attr_tx_packets.attr, &dev_attr_rx_bytes.attr, @@ -625,7 +629,6 @@ static struct attribute *netstat_attrs[] = { NULL }; - static const struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, @@ -647,33 +650,33 @@ static const struct attribute_group wireless_group = { #endif /* CONFIG_SYSFS */ #ifdef CONFIG_SYSFS -#define to_rx_queue_attr(_attr) container_of(_attr, \ - struct rx_queue_attribute, attr) +#define to_rx_queue_attr(_attr) \ + container_of(_attr, struct rx_queue_attribute, attr) #define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops rx_queue_sysfs_ops = { @@ -682,8 +685,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { }; #ifdef CONFIG_RPS -static ssize_t show_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, char *buf) +static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; @@ -706,8 +708,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, } static ssize_t store_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, - const char *buf, size_t len) + const char *buf, size_t len) { struct rps_map *old_map, *map; cpumask_var_t mask; @@ -727,8 +728,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } map = kzalloc(max_t(unsigned int, - RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), - GFP_KERNEL); + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); if (!map) { free_cpumask_var(mask); return -ENOMEM; @@ -738,9 +739,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, for_each_cpu_and(cpu, mask, cpu_online_mask) map->cpus[i++] = cpu; - if (i) + if (i) { map->len = i; - else { + } else { kfree(map); map = NULL; } @@ -765,7 +766,6 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf) { struct rps_dev_flow_table *flow_table; @@ -788,8 +788,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu) } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, - const char *buf, size_t len) + const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; @@ -831,8 +830,9 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, table->mask = mask; for (count = 0; count <= mask; count++) table->flows[count].cpu = RPS_NO_CPU; - } else + } else { table = NULL; + } spin_lock(&rps_dev_flow_lock); old_table = rcu_dereference_protected(queue->rps_flow_table, @@ -846,16 +846,15 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return len; } -static struct rx_queue_attribute rps_cpus_attribute = - __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); - +static struct rx_queue_attribute rps_cpus_attribute __ro_after_init + = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); -static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = - __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, - show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init + = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); #endif /* CONFIG_RPS */ -static struct attribute *rx_queue_default_attrs[] = { +static struct attribute *rx_queue_default_attrs[] __ro_after_init = { #ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, @@ -870,7 +869,6 @@ static void rx_queue_release(struct kobject *kobj) struct rps_map *map; struct rps_dev_flow_table *flow_table; - map = rcu_dereference_protected(queue->rps_map, 1); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); @@ -900,7 +898,7 @@ static const void *rx_queue_namespace(struct kobject *kobj) return ns; } -static struct kobj_type rx_queue_ktype = { +static struct kobj_type rx_queue_ktype __ro_after_init = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_attrs = rx_queue_default_attrs, @@ -915,23 +913,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, - "rx-%u", index); + "rx-%u", index); if (error) - goto exit; + return error; if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); - if (error) - goto exit; + if (error) { + kobject_put(kobj); + return error; + } } kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); return error; -exit: - kobject_put(kobj); - return error; } #endif /* CONFIG_SYSFS */ @@ -976,39 +973,40 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf); + ssize_t (*show)(struct netdev_queue *queue, char *buf); ssize_t (*store)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, const char *buf, size_t len); + const char *buf, size_t len); }; -#define to_netdev_queue_attr(_attr) container_of(_attr, \ - struct netdev_queue_attribute, attr) +#define to_netdev_queue_attr(_attr) \ + container_of(_attr, struct netdev_queue_attribute, attr) #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) static ssize_t netdev_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + const struct netdev_queue_attribute *attribute + = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + const struct netdev_queue_attribute *attribute + = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1016,9 +1014,7 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t show_trans_timeout(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - char *buf) +static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { unsigned long trans_timeout; @@ -1040,8 +1036,7 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t show_traffic_class(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, +static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; @@ -1055,16 +1050,14 @@ static ssize_t show_traffic_class(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t show_tx_maxrate(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, +static ssize_t tx_maxrate_show(struct netdev_queue *queue, char *buf) { return sprintf(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t set_tx_maxrate(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct netdev_queue *queue, + const char *buf, size_t len) { struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); @@ -1089,16 +1082,15 @@ static ssize_t set_tx_maxrate(struct netdev_queue *queue, return err; } -static struct netdev_queue_attribute queue_tx_maxrate = - __ATTR(tx_maxrate, S_IRUGO | S_IWUSR, - show_tx_maxrate, set_tx_maxrate); +static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init + = __ATTR_RW(tx_maxrate); #endif -static struct netdev_queue_attribute queue_trans_timeout = - __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); +static struct netdev_queue_attribute queue_trans_timeout __ro_after_init + = __ATTR_RO(tx_timeout); -static struct netdev_queue_attribute queue_traffic_class = - __ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL); +static struct netdev_queue_attribute queue_traffic_class __ro_after_init + = __ATTR_RO(traffic_class); #ifdef CONFIG_BQL /* @@ -1115,9 +1107,9 @@ static ssize_t bql_set(const char *buf, const size_t count, unsigned int value; int err; - if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) + if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) { value = DQL_MAX_LIMIT; - else { + } else { err = kstrtouint(buf, 10, &value); if (err < 0) return err; @@ -1131,7 +1123,6 @@ static ssize_t bql_set(const char *buf, const size_t count, } static ssize_t bql_show_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf) { struct dql *dql = &queue->dql; @@ -1140,7 +1131,6 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue, } static ssize_t bql_set_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct dql *dql = &queue->dql; @@ -1156,12 +1146,11 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue, return len; } -static struct netdev_queue_attribute bql_hold_time_attribute = - __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, - bql_set_hold_time); +static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init + = __ATTR(hold_time, S_IRUGO | S_IWUSR, + bql_show_hold_time, bql_set_hold_time); static ssize_t bql_show_inflight(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf) { struct dql *dql = &queue->dql; @@ -1169,33 +1158,31 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue, return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); } -static struct netdev_queue_attribute bql_inflight_attribute = +static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ } \ \ -static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \ - __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \ - bql_set_ ## NAME); +static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ + = __ATTR(NAME, S_IRUGO | S_IWUSR, \ + bql_show_ ## NAME, bql_set_ ## NAME) -BQL_ATTR(limit, limit) -BQL_ATTR(limit_max, max_limit) -BQL_ATTR(limit_min, min_limit) +BQL_ATTR(limit, limit); +BQL_ATTR(limit_max, max_limit); +BQL_ATTR(limit_min, min_limit); -static struct attribute *dql_attrs[] = { +static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_attribute.attr, &bql_limit_max_attribute.attr, &bql_limit_min_attribute.attr, @@ -1211,8 +1198,8 @@ static const struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static ssize_t show_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) +static ssize_t xps_cpus_show(struct netdev_queue *queue, + char *buf) { struct net_device *dev = queue->dev; int cpu, len, num_tc = 1, tc = 0; @@ -1258,9 +1245,8 @@ static ssize_t show_xps_map(struct netdev_queue *queue, return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t store_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct netdev_queue *queue, + const char *buf, size_t len) { struct net_device *dev = queue->dev; unsigned long index; @@ -1288,11 +1274,11 @@ static ssize_t store_xps_map(struct netdev_queue *queue, return err ? : len; } -static struct netdev_queue_attribute xps_cpus_attribute = - __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init + = __ATTR_RW(xps_cpus); #endif /* CONFIG_XPS */ -static struct attribute *netdev_queue_default_attrs[] = { +static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_trans_timeout.attr, &queue_traffic_class.attr, #ifdef CONFIG_XPS @@ -1322,7 +1308,7 @@ static const void *netdev_queue_namespace(struct kobject *kobj) return ns; } -static struct kobj_type netdev_queue_ktype = { +static struct kobj_type netdev_queue_ktype __ro_after_init = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_attrs = netdev_queue_default_attrs, @@ -1337,23 +1323,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, - "tx-%u", index); + "tx-%u", index); if (error) - goto exit; + return error; #ifdef CONFIG_BQL error = sysfs_create_group(kobj, &dql_group); - if (error) - goto exit; + if (error) { + kobject_put(kobj); + return error; + } #endif kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); return 0; -exit: - kobject_put(kobj); - return error; } #endif /* CONFIG_SYSFS */ @@ -1395,7 +1380,7 @@ static int register_queue_kobjects(struct net_device *dev) #ifdef CONFIG_SYSFS dev->queues_kset = kset_create_and_add("queues", - NULL, &dev->dev.kobj); + NULL, &dev->dev.kobj); if (!dev->queues_kset) return -ENOMEM; real_rx = dev->real_num_rx_queues; @@ -1463,7 +1448,7 @@ static const void *net_netlink_ns(struct sock *sk) return sock_net(sk); } -struct kobj_ns_type_operations net_ns_type_operations = { +const struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, @@ -1485,7 +1470,8 @@ static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) /* pass ifindex to uevent. * ifindex is useful as it won't change (interface name may change) - * and is what RtNetlink uses natively. */ + * and is what RtNetlink uses natively. + */ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); exit: @@ -1513,7 +1499,7 @@ static const void *net_namespace(struct device *d) return dev_net(dev); } -static struct class net_class = { +static struct class net_class __ro_after_init = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, @@ -1560,7 +1546,7 @@ EXPORT_SYMBOL(of_find_net_device_by_node); */ void netdev_unregister_kobject(struct net_device *ndev) { - struct device *dev = &(ndev->dev); + struct device *dev = &ndev->dev; if (!atomic_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); @@ -1577,7 +1563,7 @@ void netdev_unregister_kobject(struct net_device *ndev) /* Create sysfs entries for network device. */ int netdev_register_kobject(struct net_device *ndev) { - struct device *dev = &(ndev->dev); + struct device *dev = &ndev->dev; const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; @@ -1620,14 +1606,14 @@ int netdev_register_kobject(struct net_device *ndev) return error; } -int netdev_class_create_file_ns(struct class_attribute *class_attr, +int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { return class_create_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_create_file_ns); -void netdev_class_remove_file_ns(struct class_attribute *class_attr, +void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns) { class_remove_file_ns(&net_class, class_attr, ns); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 92da5e4ceb4f..4f1468ccd056 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -32,6 +32,7 @@ #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/fib.h> +#include <trace/events/qdisc.h> #if IS_ENABLED(CONFIG_IPV6) #include <trace/events/fib6.h> EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 8726d051f31d..6cfdc7c84c48 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -855,9 +855,10 @@ static int __init net_ns_init(void) register_pernet_subsys(&net_ns_ops); - rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, + RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - NULL); + RTNL_FLAG_DOIT_UNLOCKED); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9201e3621351..a78fd61da0ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,7 +62,7 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; - rtnl_calcit_func calcit; + unsigned int flags; }; static DEFINE_MUTEX(rtnl_mutex); @@ -127,7 +127,8 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; +static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -143,58 +144,13 @@ static inline int rtm_msgindex(int msgtype) return msgindex; } -static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) -{ - struct rtnl_link *tab; - - if (protocol <= RTNL_FAMILY_MAX) - tab = rtnl_msg_handlers[protocol]; - else - tab = NULL; - - if (tab == NULL || tab[msgindex].doit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; - - return tab[msgindex].doit; -} - -static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) -{ - struct rtnl_link *tab; - - if (protocol <= RTNL_FAMILY_MAX) - tab = rtnl_msg_handlers[protocol]; - else - tab = NULL; - - if (tab == NULL || tab[msgindex].dumpit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; - - return tab[msgindex].dumpit; -} - -static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) -{ - struct rtnl_link *tab; - - if (protocol <= RTNL_FAMILY_MAX) - tab = rtnl_msg_handlers[protocol]; - else - tab = NULL; - - if (tab == NULL || tab[msgindex].calcit == NULL) - tab = rtnl_msg_handlers[PF_UNSPEC]; - - return tab[msgindex].calcit; -} - /** * __rtnl_register - Register a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @calcit: Function pointer to calc size of dump message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the @@ -208,7 +164,7 @@ static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) */ int __rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, - rtnl_calcit_func calcit) + unsigned int flags) { struct rtnl_link *tab; int msgindex; @@ -216,23 +172,20 @@ int __rtnl_register(int protocol, int msgtype, BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rtnl_msg_handlers[protocol]; + tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); if (tab == NULL) return -ENOBUFS; - rtnl_msg_handlers[protocol] = tab; + rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } if (doit) tab[msgindex].doit = doit; - if (dumpit) tab[msgindex].dumpit = dumpit; - - if (calcit) - tab[msgindex].calcit = calcit; + tab[msgindex].flags |= flags; return 0; } @@ -249,9 +202,9 @@ EXPORT_SYMBOL_GPL(__rtnl_register); */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, - rtnl_calcit_func calcit) + unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0) + if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) panic("Unable to register rtnetlink message handler, " "protocol = %d, message type = %d\n", protocol, msgtype); @@ -267,17 +220,23 @@ EXPORT_SYMBOL_GPL(rtnl_register); */ int rtnl_unregister(int protocol, int msgtype) { + struct rtnl_link *handlers; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - if (rtnl_msg_handlers[protocol] == NULL) + rtnl_lock(); + handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!handlers) { + rtnl_unlock(); return -ENOENT; + } - rtnl_msg_handlers[protocol][msgindex].doit = NULL; - rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; - rtnl_msg_handlers[protocol][msgindex].calcit = NULL; + handlers[msgindex].doit = NULL; + handlers[msgindex].dumpit = NULL; + handlers[msgindex].flags = 0; + rtnl_unlock(); return 0; } @@ -292,10 +251,20 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { + struct rtnl_link *handlers; + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); - kfree(rtnl_msg_handlers[protocol]); - rtnl_msg_handlers[protocol] = NULL; + rtnl_lock(); + handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + rtnl_unlock(); + + synchronize_net(); + + while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) + schedule(); + kfree(handlers); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -433,16 +402,24 @@ static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; + size_t size = 0; - master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) - return 0; + goto out; + ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) - return 0; + goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ - return nla_total_size(sizeof(struct nlattr)) + + size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); + +out: + rcu_read_unlock(); + return size; } static size_t rtnl_link_get_size(const struct net_device *dev) @@ -1644,8 +1621,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_idx = cb->args[1]; - cb->seq = net->dev_base_seq; - /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's @@ -1691,8 +1666,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) goto out_err; } - - nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } @@ -1702,6 +1675,8 @@ out: out_err: cb->args[1] = idx; cb->args[0] = h; + cb->seq = net->dev_base_seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } @@ -2831,11 +2806,13 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ - list_for_each_entry(dev, &net->dev_base_head, dev_list) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } + rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } @@ -2847,19 +2824,29 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (s_idx == 0) s_idx = 1; + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { int type = cb->nlh->nlmsg_type-RTM_BASE; + struct rtnl_link *handlers; + rtnl_dumpit_func dumpit; + if (idx < s_idx || idx == PF_PACKET) continue; - if (rtnl_msg_handlers[idx] == NULL || - rtnl_msg_handlers[idx][type].dumpit == NULL) + + handlers = rtnl_dereference(rtnl_msg_handlers[idx]); + if (!handlers) continue; + + dumpit = READ_ONCE(handlers[type].dumpit); + if (!dumpit) + continue; + if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } - if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) + if (dumpit(skb, cb)) break; } cb->family = idx; @@ -4162,11 +4149,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct rtnl_link *handlers; + int err = -EOPNOTSUPP; rtnl_doit_func doit; + unsigned int flags; int kind; int family; int type; - int err; type = nlh->nlmsg_type; if (type > RTM_MAX) @@ -4184,20 +4173,40 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; + if (family >= ARRAY_SIZE(rtnl_msg_handlers)) + family = PF_UNSPEC; + + rcu_read_lock(); + handlers = rcu_dereference(rtnl_msg_handlers[family]); + if (!handlers) { + family = PF_UNSPEC; + handlers = rcu_dereference(rtnl_msg_handlers[family]); + } + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; - rtnl_calcit_func calcit; u16 min_dump_alloc = 0; - dumpit = rtnl_get_dumpit(family, type); - if (dumpit == NULL) - return -EOPNOTSUPP; - calcit = rtnl_get_calcit(family, type); - if (calcit) - min_dump_alloc = calcit(skb, nlh); + dumpit = READ_ONCE(handlers[type].dumpit); + if (!dumpit) { + family = PF_UNSPEC; + handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]); + if (!handlers) + goto err_unlock; + + dumpit = READ_ONCE(handlers[type].dumpit); + if (!dumpit) + goto err_unlock; + } + + refcount_inc(&rtnl_msg_handlers_ref[family]); + + if (type == RTM_GETLINK - RTM_BASE) + min_dump_alloc = rtnl_calcit(skb, nlh); + + rcu_read_unlock(); - __rtnl_unlock(); rtnl = net->rtnl; { struct netlink_dump_control c = { @@ -4206,22 +4215,47 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, }; err = netlink_dump_start(rtnl, skb, nlh, &c); } - rtnl_lock(); + refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } - doit = rtnl_get_doit(family, type); - if (doit == NULL) - return -EOPNOTSUPP; + doit = READ_ONCE(handlers[type].doit); + if (!doit) { + family = PF_UNSPEC; + handlers = rcu_dereference(rtnl_msg_handlers[family]); + } + + flags = READ_ONCE(handlers[type].flags); + if (flags & RTNL_FLAG_DOIT_UNLOCKED) { + refcount_inc(&rtnl_msg_handlers_ref[family]); + doit = READ_ONCE(handlers[type].doit); + rcu_read_unlock(); + if (doit) + err = doit(skb, nlh, extack); + refcount_dec(&rtnl_msg_handlers_ref[family]); + return err; + } - return doit(skb, nlh, extack); + rcu_read_unlock(); + + rtnl_lock(); + handlers = rtnl_dereference(rtnl_msg_handlers[family]); + if (handlers) { + doit = READ_ONCE(handlers[type].doit); + if (doit) + err = doit(skb, nlh, extack); + } + rtnl_unlock(); + return err; + +err_unlock: + rcu_read_unlock(); + return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { - rtnl_lock(); netlink_rcv_skb(skb, &rtnetlink_rcv_msg); - rtnl_unlock(); } static int rtnetlink_bind(struct net *net, int group) @@ -4294,29 +4328,34 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { + int i; + + for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++) + refcount_set(&rtnl_msg_handlers_ref[i], 1); + if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, rtnl_calcit); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL); + rtnl_dump_ifinfo, 0); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); - rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, NULL); + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); + rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); + rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, - NULL); + 0); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0f0933b338d7..917da73d3ab3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = shinfo->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } - if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); + skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -695,14 +684,7 @@ EXPORT_SYMBOL(kfree_skb_list); */ void skb_tx_error(struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; - } + skb_zcopy_clear(skb, true); } EXPORT_SYMBOL(skb_tx_error); @@ -915,6 +897,273 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +{ + unsigned long max_pg, num_pg, new_pg, old_pg; + struct user_struct *user; + + if (capable(CAP_IPC_LOCK) || !size) + return 0; + + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ + max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + user = mmp->user ? : current_user(); + + do { + old_pg = atomic_long_read(&user->locked_vm); + new_pg = old_pg + num_pg; + if (new_pg > max_pg) + return -ENOBUFS; + } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != + old_pg); + + if (!mmp->user) { + mmp->user = get_uid(user); + mmp->num_pg = num_pg; + } else { + mmp->num_pg += num_pg; + } + + return 0; +} + +static void mm_unaccount_pinned_pages(struct mmpin *mmp) +{ + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + } +} + +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +{ + struct ubuf_info *uarg; + struct sk_buff *skb; + + WARN_ON_ONCE(!in_task()); + + if (!sock_flag(sk, SOCK_ZEROCOPY)) + return NULL; + + skb = sock_omalloc(sk, 0, GFP_KERNEL); + if (!skb) + return NULL; + + BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); + uarg = (void *)skb->cb; + uarg->mmp.user = NULL; + + if (mm_account_pinned_pages(&uarg->mmp, size)) { + kfree_skb(skb); + return NULL; + } + + uarg->callback = sock_zerocopy_callback; + uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; + uarg->len = 1; + uarg->bytelen = size; + uarg->zerocopy = 1; + atomic_set(&uarg->refcnt, 0); + sock_hold(sk); + + return uarg; +} +EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); + +static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +{ + return container_of((void *)uarg, struct sk_buff, cb); +} + +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) +{ + if (uarg) { + const u32 byte_limit = 1 << 19; /* limit to a few TSO */ + u32 bytelen, next; + + /* realloc only when socket is locked (TCP, UDP cork), + * so uarg->len and sk_zckey access is serialized + */ + if (!sock_owned_by_user(sk)) { + WARN_ON_ONCE(1); + return NULL; + } + + bytelen = uarg->bytelen + size; + if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { + /* TCP can create new skb to attach new uarg */ + if (sk->sk_type == SOCK_STREAM) + goto new_alloc; + return NULL; + } + + next = (u32)atomic_read(&sk->sk_zckey); + if ((u32)(uarg->id + uarg->len) == next) { + if (mm_account_pinned_pages(&uarg->mmp, size)) + return NULL; + uarg->len++; + uarg->bytelen = bytelen; + atomic_set(&sk->sk_zckey, ++next); + return uarg; + } + } + +new_alloc: + return sock_zerocopy_alloc(sk, size); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); + +static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 old_lo, old_hi; + u64 sum_len; + + old_lo = serr->ee.ee_info; + old_hi = serr->ee.ee_data; + sum_len = old_hi - old_lo + 1ULL + len; + + if (sum_len >= (1ULL << 32)) + return false; + + if (lo != old_hi + 1) + return false; + + serr->ee.ee_data += len; + return true; +} + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +{ + struct sk_buff *tail, *skb = skb_from_uarg(uarg); + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + struct sk_buff_head *q; + unsigned long flags; + u32 lo, hi; + u16 len; + + mm_unaccount_pinned_pages(&uarg->mmp); + + /* if !len, there was only 1 call, and it was aborted + * so do not queue a completion notification + */ + if (!uarg->len || sock_flag(sk, SOCK_DEAD)) + goto release; + + len = uarg->len; + lo = uarg->id; + hi = uarg->id + len - 1; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; + serr->ee.ee_data = hi; + serr->ee.ee_info = lo; + if (!success) + serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || + !skb_zerocopy_notify_extend(tail, lo, len)) { + __skb_queue_tail(q, skb); + skb = NULL; + } + spin_unlock_irqrestore(&q->lock, flags); + + sk->sk_error_report(sk); + +release: + consume_skb(skb); + sock_put(sk); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_callback); + +void sock_zerocopy_put(struct ubuf_info *uarg) +{ + if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + if (uarg->callback) + uarg->callback(uarg, uarg->zerocopy); + else + consume_skb(skb_from_uarg(uarg)); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put); + +void sock_zerocopy_put_abort(struct ubuf_info *uarg) +{ + if (uarg) { + struct sock *sk = skb_from_uarg(uarg)->sk; + + atomic_dec(&sk->sk_zckey); + uarg->len--; + + /* sock_zerocopy_put expects a ref. Most sockets take one per + * skb, which is zero on abort. tcp_sendmsg holds one extra, to + * avoid an skb send inside the main loop triggering uarg free. + */ + if (sk->sk_type != SOCK_STREAM) + atomic_inc(&uarg->refcnt); + + sock_zerocopy_put(uarg); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); + +extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg) +{ + struct ubuf_info *orig_uarg = skb_zcopy(skb); + struct iov_iter orig_iter = msg->msg_iter; + int err, orig_len = skb->len; + + /* An skb can only point to one uarg. This edge case happens when + * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); + if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + /* Streams do not free skb on error. Reset to prev state. */ + msg->msg_iter = orig_iter; + ___pskb_trim(skb, orig_len); + return err; + } + + skb_zcopy_set(skb, uarg); + return skb->len - orig_len; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); + +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, + gfp_t gfp_mask) +{ + if (skb_zcopy(orig)) { + if (skb_zcopy(nskb)) { + /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ + if (!gfp_mask) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + if (skb_uarg(nskb) == skb_uarg(orig)) + return 0; + if (skb_copy_ubufs(nskb, GFP_ATOMIC)) + return -EIO; + } + skb_zcopy_set(nskb, skb_uarg(orig)); + } + return 0; +} + /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify @@ -932,17 +1181,19 @@ EXPORT_SYMBOL_GPL(skb_morph); */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - int i; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; + int i, new_frags; + u32 d_off; - for (i = 0; i < num_frags; i++) { - skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - u32 p_off, p_len, copied; - struct page *p; - u8 *vaddr; + if (!num_frags) + return 0; + if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) + return -EINVAL; + + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); if (!page) { while (head) { @@ -952,33 +1203,51 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } return -ENOMEM; } + set_page_private(page, (unsigned long)head); + head = page; + } + + page = head; + d_off = 0; + for (i = 0; i < num_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f), p, p_off, p_len, copied) { + u32 copy, done = 0; vaddr = kmap_atomic(p); - memcpy(page_address(page) + copied, vaddr + p_off, - p_len); + + while (done < p_len) { + if (d_off == PAGE_SIZE) { + d_off = 0; + page = (struct page *)page_private(page); + } + copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); + memcpy(page_address(page) + d_off, + vaddr + p_off + done, copy); + done += copy; + d_off += copy; + } kunmap_atomic(vaddr); } - - set_page_private(page, (unsigned long)head); - head = page; } /* skb frags release userspace buffers */ for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg, false); - /* skb frags point to kernel buffers */ - for (i = num_frags - 1; i >= 0; i--) { - __skb_fill_page_desc(skb, i, head, 0, - skb_shinfo(skb)->frags[i].size); + for (i = 0; i < new_frags - 1; i++) { + __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); head = (struct page *)page_private(head); } + __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + skb_shinfo(skb)->nr_frags = new_frags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); @@ -1139,7 +1408,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_orphan_frags(skb, gfp_mask)) { + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; @@ -1216,9 +1486,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, * be since all we did is relocate the values */ if (skb_cloned(skb)) { - /* copy this zero copy skb frags */ if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; + if (skb_zcopy(skb)) + atomic_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); @@ -1713,6 +1984,9 @@ end: skb->tail += delta; skb->data_len -= delta; + if (!skb->data_len) + skb_zcopy_clear(skb, false); + return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); @@ -2011,7 +2285,7 @@ do_frag_list: slen = min_t(int, len, skb_headlen(skb) - offset); kv.iov_base = skb->data + offset; - kv.iov_len = len; + kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); @@ -2468,6 +2742,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_tx_error(from); return -ENOMEM; } + skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { if (!len) @@ -2765,6 +3040,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2808,6 +3084,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_headlen(skb)) return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; todo = shiftlen; from = 0; @@ -3381,6 +3659,8 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) + goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -4504,6 +4784,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_has_frag_list(to) || skb_has_frag_list(from)) return false; + if (skb_zcopy(to) || skb_zcopy(from)) + return false; if (skb_headlen(from) != 0) { struct page *page; diff --git a/net/core/sock.c b/net/core/sock.c index 564f835f408a..0f04d8bff607 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1055,6 +1055,20 @@ set_rcvbuf: if (val == 1) dst_negative_advice(sk); break; + + case SO_ZEROCOPY: + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + ret = -ENOTSUPP; + else if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + else if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -1383,6 +1397,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val64 = sock_gen_cookie(sk); break; + case SO_ZEROCOPY: + v.val = sock_flag(sk, SOCK_ZEROCOPY); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -1670,6 +1688,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); @@ -1923,6 +1942,33 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } EXPORT_SYMBOL(sock_wmalloc); +static void sock_ofree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_omem_alloc); +} + +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority) +{ + struct sk_buff *skb; + + /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ + if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > + sysctl_optmem_max) + return NULL; + + skb = alloc_skb(size, priority); + if (!skb) + return NULL; + + atomic_add(skb->truesize, &sk->sk_omem_alloc); + skb->sk = sk; + skb->destructor = sock_ofree; + return skb; +} + /* * Allocate a memory block from the socket's option memory buffer. */ @@ -2695,6 +2741,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; + atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; |