From a53b42c11815d2357e31a9403ae3950517525894 Mon Sep 17 00:00:00 2001 From: Tan Hu Date: Wed, 25 Jul 2018 15:23:07 +0800 Subject: ipvs: fix race between ip_vs_conn_new() and ip_vs_del_dest() We came across infinite loop in ipvs when using ipvs in docker env. When ipvs receives new packets and cannot find an ipvs connection, it will create a new connection, then if the dest is unavailable (i.e. IP_VS_DEST_F_AVAILABLE), the packet will be dropped sliently. But if the dropped packet is the first packet of this connection, the connection control timer never has a chance to start and the ipvs connection cannot be released. This will lead to memory leak, or infinite loop in cleanup_net() when net namespace is released like this: ip_vs_conn_net_cleanup at ffffffffa0a9f31a [ip_vs] __ip_vs_cleanup at ffffffffa0a9f60a [ip_vs] ops_exit_list at ffffffff81567a49 cleanup_net at ffffffff81568b40 process_one_work at ffffffff810a851b worker_thread at ffffffff810a9356 kthread at ffffffff810b0b6f ret_from_fork at ffffffff81697a18 race condition: CPU1 CPU2 ip_vs_in() ip_vs_conn_new() ip_vs_del_dest() __ip_vs_unlink_dest() ~IP_VS_DEST_F_AVAILABLE cp->dest && !IP_VS_DEST_F_AVAILABLE __ip_vs_conn_put ... cleanup_net ---> infinite looping Fix this by checking whether the timer already started. Signed-off-by: Tan Hu Reviewed-by: Jiang Biao Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 0679dd101e72..7ca926a03b81 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1972,13 +1972,20 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ - if (sysctl_expire_nodest_conn(ipvs)) { + __u32 flags = cp->flags; + + /* when timer already started, silently drop the packet.*/ + if (timer_pending(&cp->timer)) + __ip_vs_conn_put(cp); + else + ip_vs_conn_put(cp); + + if (sysctl_expire_nodest_conn(ipvs) && + !(flags & IP_VS_CONN_F_ONE_PACKET)) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); + return NF_DROP; } -- cgit v1.2.3-55-g7522 From 14d32b2525dd3a21c29b0d9edf1748f5dabc1bbc Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 31 Jul 2018 18:03:32 +0200 Subject: jiffies: add utility function to calculate delta in ms add jiffies_delta_to_msecs() helper func to calculate the delta between two times and eventually 0 if negative. Suggested-by: Eric Dumazet Signed-off-by: Matteo Croce Reviewed-by: Eric Dumazet Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/linux/jiffies.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index a27cf6652327..fa928242567d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -447,6 +447,11 @@ static inline clock_t jiffies_delta_to_clock_t(long delta) return jiffies_to_clock_t(max(0L, delta)); } +static inline unsigned int jiffies_delta_to_msecs(long delta) +{ + return jiffies_to_msecs(max(0L, delta)); +} + extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); -- cgit v1.2.3-55-g7522 From b71ed54dc2a1dbb4328f7d7c98d712747aee92ba Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 31 Jul 2018 18:03:33 +0200 Subject: ipvs: don't show negative times in ip_vs_conn Since commit 500462a9de65 ("timers: Switch to a non-cascading wheel"), timers duration can last even 12.5% more than the scheduled interval. IPVS has two handlers, /proc/net/ip_vs_conn and /proc/net/ip_vs_conn_sync, which shows the remaining time before that a connection expires. The default expire time for a connection is 60 seconds, and the expiration timer can fire even 4 seconds later than the scheduled time. The expiration time is calculated subtracting jiffies to the scheduled expiration time, and it's shown as a huge number when the timer fires late, since both values are unsigned. This can confuse script and tools which relies on it, like ipvsadm: root@mcroce-redhat:~# while ipvsadm -lc |grep SYN_RECV; do sleep 1 ; done TCP 00:05 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 00:04 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 00:03 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 00:02 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 00:01 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 00:00 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 68719476:44 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 68719476:43 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 68719476:42 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 68719476:41 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 68719476:40 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 TCP 68719476:39 SYN_RECV [fc00:1::1]:55732 [fc00:1::2]:8000 [fc00:2000::1]:8000 Signed-off-by: Matteo Croce Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 0edc62910ebf..5b2b17867cb1 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1117,24 +1117,28 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " - "%s %04X %-11s %7lu%s\n", + "%s %04X %-11s %7u%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), ip_vs_state_name(cp), - (cp->timer.expires-jiffies)/HZ, pe_data); + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000, + pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %s %04X %-11s %7lu%s\n", + " %s %04X %-11s %7u%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), ip_vs_state_name(cp), - (cp->timer.expires-jiffies)/HZ, pe_data); + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000, + pe_data); } return 0; } @@ -1179,26 +1183,28 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " - "%s %04X %-11s %-6s %7lu\n", + "%s %04X %-11s %-6s %7u\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X " - "%s %04X %-11s %-6s %7lu\n", + "%s %04X %-11s %-6s %7u\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); + jiffies_delta_to_msecs(cp->timer.expires - + jiffies) / 1000); } return 0; } -- cgit v1.2.3-55-g7522 From da786717e0894886301ed2536843c13f9e8fd53e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 25 Jul 2018 21:38:43 +0200 Subject: netfilter: ip6t_rpfilter: set F_IFACE for linklocal addresses Roman reports that DHCPv6 client no longer sees replies from server due to ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP rule. We need to set the F_IFACE flag for linklocal addresses, they are scoped per-device. Fixes: 47b7e7f82802 ("netfilter: don't set F_IFACE on ipv6 fib lookups") Reported-by: Roman Mamedov Tested-by: Roman Mamedov Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_rpfilter.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 0fe61ede77c6..c3c6b09acdc4 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr) return addr_type & IPV6_ADDR_UNICAST; } +static bool rpfilter_addr_linklocal(const struct in6_addr *addr) +{ + int addr_type = ipv6_addr_type(addr); + return addr_type & IPV6_ADDR_LINKLOCAL; +} + static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, const struct net_device *dev, u8 flags) { @@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, } fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - if ((flags & XT_RPFILTER_LOOSE) == 0) + + if (rpfilter_addr_linklocal(&iph->saddr)) { + lookup_flags |= RT6_LOOKUP_F_IFACE; + fl6.flowi6_oif = dev->ifindex; + } else if ((flags & XT_RPFILTER_LOOSE) == 0) fl6.flowi6_oif = dev->ifindex; rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); -- cgit v1.2.3-55-g7522 From 4ef360dd6a65f6ef337645e1b65e744034754b19 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 26 Jul 2018 00:39:51 +0900 Subject: netfilter: nft_set: fix allocation size overflow in privsize callback. In order to determine allocation size of set, ->privsize is invoked. At this point, both desc->size and size of each data structure of set are used. desc->size means number of element that is given by user. desc->size is u32 type. so that upperlimit of set element is 4294967295. but return type of ->privsize is also u32. hence overflow can occurred. test commands: %nft add table ip filter %nft add set ip filter hash1 { type ipv4_addr \; size 4294967295 \; } %nft list ruleset splat looks like: [ 1239.202910] kasan: CONFIG_KASAN_INLINE enabled [ 1239.208788] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 1239.217625] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 1239.219329] CPU: 0 PID: 1603 Comm: nft Not tainted 4.18.0-rc5+ #7 [ 1239.229091] RIP: 0010:nft_hash_walk+0x1d2/0x310 [nf_tables_set] [ 1239.229091] Code: 84 d2 7f 10 4c 89 e7 89 44 24 38 e8 d8 5a 17 e0 8b 44 24 38 48 8d 7b 10 41 0f b6 0c 24 48 89 fa 48 89 fe 48 c1 ea 03 83 e6 07 <42> 0f b6 14 3a 40 38 f2 7f 1a 84 d2 74 16 [ 1239.229091] RSP: 0018:ffff8801118cf358 EFLAGS: 00010246 [ 1239.229091] RAX: 0000000000000000 RBX: 0000000000020400 RCX: 0000000000000001 [ 1239.229091] RDX: 0000000000004082 RSI: 0000000000000000 RDI: 0000000000020410 [ 1239.229091] RBP: ffff880114d5a988 R08: 0000000000007e94 R09: ffff880114dd8030 [ 1239.229091] R10: ffff880114d5a988 R11: ffffed00229bb006 R12: ffff8801118cf4d0 [ 1239.229091] R13: ffff8801118cf4d8 R14: 0000000000000000 R15: dffffc0000000000 [ 1239.229091] FS: 00007f5a8fe0b700(0000) GS:ffff88011b600000(0000) knlGS:0000000000000000 [ 1239.229091] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1239.229091] CR2: 00007f5a8ecc27b0 CR3: 000000010608e000 CR4: 00000000001006f0 [ 1239.229091] Call Trace: [ 1239.229091] ? nft_hash_remove+0xf0/0xf0 [nf_tables_set] [ 1239.229091] ? memset+0x1f/0x40 [ 1239.229091] ? __nla_reserve+0x9f/0xb0 [ 1239.229091] ? memcpy+0x34/0x50 [ 1239.229091] nf_tables_dump_set+0x9a1/0xda0 [nf_tables] [ 1239.229091] ? __kmalloc_reserve.isra.29+0x2e/0xa0 [ 1239.229091] ? nft_chain_hash_obj+0x630/0x630 [nf_tables] [ 1239.229091] ? nf_tables_commit+0x2c60/0x2c60 [nf_tables] [ 1239.229091] netlink_dump+0x470/0xa20 [ 1239.229091] __netlink_dump_start+0x5ae/0x690 [ 1239.229091] nft_netlink_dump_start_rcu+0xd1/0x160 [nf_tables] [ 1239.229091] nf_tables_getsetelem+0x2e5/0x4b0 [nf_tables] [ 1239.229091] ? nft_get_set_elem+0x440/0x440 [nf_tables] [ 1239.229091] ? nft_chain_hash_obj+0x630/0x630 [nf_tables] [ 1239.229091] ? nf_tables_dump_obj_done+0x70/0x70 [nf_tables] [ 1239.229091] ? nla_parse+0xab/0x230 [ 1239.229091] ? nft_get_set_elem+0x440/0x440 [nf_tables] [ 1239.229091] nfnetlink_rcv_msg+0x7f0/0xab0 [nfnetlink] [ 1239.229091] ? nfnetlink_bind+0x1d0/0x1d0 [nfnetlink] [ 1239.229091] ? debug_show_all_locks+0x290/0x290 [ 1239.229091] ? sched_clock_cpu+0x132/0x170 [ 1239.229091] ? find_held_lock+0x39/0x1b0 [ 1239.229091] ? sched_clock_local+0x10d/0x130 [ 1239.229091] netlink_rcv_skb+0x211/0x320 [ 1239.229091] ? nfnetlink_bind+0x1d0/0x1d0 [nfnetlink] [ 1239.229091] ? netlink_ack+0x7b0/0x7b0 [ 1239.229091] ? ns_capable_common+0x6e/0x110 [ 1239.229091] nfnetlink_rcv+0x2d1/0x310 [nfnetlink] [ 1239.229091] ? nfnetlink_rcv_batch+0x10f0/0x10f0 [nfnetlink] [ 1239.229091] ? netlink_deliver_tap+0x829/0x930 [ 1239.229091] ? lock_acquire+0x265/0x2e0 [ 1239.229091] netlink_unicast+0x406/0x520 [ 1239.509725] ? netlink_attachskb+0x5b0/0x5b0 [ 1239.509725] ? find_held_lock+0x39/0x1b0 [ 1239.509725] netlink_sendmsg+0x987/0xa20 [ 1239.509725] ? netlink_unicast+0x520/0x520 [ 1239.509725] ? _copy_from_user+0xa9/0xc0 [ 1239.509725] __sys_sendto+0x21a/0x2c0 [ 1239.509725] ? __ia32_sys_getpeername+0xa0/0xa0 [ 1239.509725] ? retint_kernel+0x10/0x10 [ 1239.509725] ? sched_clock_cpu+0x132/0x170 [ 1239.509725] ? find_held_lock+0x39/0x1b0 [ 1239.509725] ? lock_downgrade+0x540/0x540 [ 1239.509725] ? up_read+0x1c/0x100 [ 1239.509725] ? __do_page_fault+0x763/0x970 [ 1239.509725] ? retint_user+0x18/0x18 [ 1239.509725] __x64_sys_sendto+0x177/0x180 [ 1239.509725] do_syscall_64+0xaa/0x360 [ 1239.509725] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 1239.509725] RIP: 0033:0x7f5a8f468e03 [ 1239.509725] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb d0 0f 1f 84 00 00 00 00 00 83 3d 49 c9 2b 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 [ 1239.509725] RSP: 002b:00007ffd78d0b778 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 1239.509725] RAX: ffffffffffffffda RBX: 00007ffd78d0c890 RCX: 00007f5a8f468e03 [ 1239.509725] RDX: 0000000000000034 RSI: 00007ffd78d0b7e0 RDI: 0000000000000003 [ 1239.509725] RBP: 00007ffd78d0b7d0 R08: 00007f5a8f15c160 R09: 000000000000000c [ 1239.509725] R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffd78d0b7e0 [ 1239.509725] R13: 0000000000000034 R14: 00007f5a8f9aff60 R15: 00005648040094b0 [ 1239.509725] Modules linked in: nf_tables_set nf_tables nfnetlink ip_tables x_tables [ 1239.670713] ---[ end trace 39375adcda140f11 ]--- [ 1239.676016] RIP: 0010:nft_hash_walk+0x1d2/0x310 [nf_tables_set] [ 1239.682834] Code: 84 d2 7f 10 4c 89 e7 89 44 24 38 e8 d8 5a 17 e0 8b 44 24 38 48 8d 7b 10 41 0f b6 0c 24 48 89 fa 48 89 fe 48 c1 ea 03 83 e6 07 <42> 0f b6 14 3a 40 38 f2 7f 1a 84 d2 74 16 [ 1239.705108] RSP: 0018:ffff8801118cf358 EFLAGS: 00010246 [ 1239.711115] RAX: 0000000000000000 RBX: 0000000000020400 RCX: 0000000000000001 [ 1239.719269] RDX: 0000000000004082 RSI: 0000000000000000 RDI: 0000000000020410 [ 1239.727401] RBP: ffff880114d5a988 R08: 0000000000007e94 R09: ffff880114dd8030 [ 1239.735530] R10: ffff880114d5a988 R11: ffffed00229bb006 R12: ffff8801118cf4d0 [ 1239.743658] R13: ffff8801118cf4d8 R14: 0000000000000000 R15: dffffc0000000000 [ 1239.751785] FS: 00007f5a8fe0b700(0000) GS:ffff88011b600000(0000) knlGS:0000000000000000 [ 1239.760993] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1239.767560] CR2: 00007f5a8ecc27b0 CR3: 000000010608e000 CR4: 00000000001006f0 [ 1239.775679] Kernel panic - not syncing: Fatal exception [ 1239.776630] Kernel Offset: 0x1f000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 1239.776630] Rebooting in 5 seconds.. Fixes: 20a69341f2d0 ("netfilter: nf_tables: add netlink set API") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_set_bitmap.c | 6 +++--- net/netfilter/nft_set_hash.c | 8 ++++---- net/netfilter/nft_set_rbtree.c | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index dc417ef0a0c5..552bfbef1bf1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -274,7 +274,7 @@ enum nft_set_class { * @space: memory class */ struct nft_set_estimate { - unsigned int size; + u64 size; enum nft_set_class lookup; enum nft_set_class space; }; @@ -336,7 +336,7 @@ struct nft_set_ops { const struct nft_set_elem *elem, unsigned int flags); - unsigned int (*privsize)(const struct nlattr * const nla[], + u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 67cdd5c4f4f5..3008f93469c4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3354,7 +3354,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_set *set; struct nft_ctx ctx; char *name; - unsigned int size; + u64 size; u64 timeout; u32 ktype, dtype, flags, policy, gc_int, objtype; struct nft_set_desc desc; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 128bc16f52dd..f866bd41e5d2 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -248,13 +248,13 @@ static inline u32 nft_bitmap_size(u32 klen) return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1; } -static inline u32 nft_bitmap_total_size(u32 klen) +static inline u64 nft_bitmap_total_size(u32 klen) { return sizeof(struct nft_bitmap) + nft_bitmap_size(klen); } -static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[], - const struct nft_set_desc *desc) +static u64 nft_bitmap_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 90c3e7e6cacb..015124e649cb 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -341,8 +341,8 @@ schedule: nft_set_gc_interval(set)); } -static unsigned int nft_rhash_privsize(const struct nlattr * const nla[], - const struct nft_set_desc *desc) +static u64 nft_rhash_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { return sizeof(struct nft_rhash); } @@ -585,8 +585,8 @@ cont: } } -static unsigned int nft_hash_privsize(const struct nlattr * const nla[], - const struct nft_set_desc *desc) +static u64 nft_hash_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { return sizeof(struct nft_hash) + nft_hash_buckets(desc->size) * sizeof(struct hlist_head); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9873d734b494..55e2d9215c0d 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -411,8 +411,8 @@ static void nft_rbtree_gc(struct work_struct *work) nft_set_gc_interval(set)); } -static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[], - const struct nft_set_desc *desc) +static u64 nft_rbtree_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) { return sizeof(struct nft_rbtree); } -- cgit v1.2.3-55-g7522 From 3e673b23b541b8e7f773b2d378d6eb99831741cd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 31 Jul 2018 13:41:23 +0200 Subject: netfilter: fix memory leaks on netlink_dump_start error Shaochun Chen points out we leak dumper filter state allocations stored in dump_control->data in case there is an error before netlink sets cb_running (after which ->done will be called at some point). In order to fix this, add .start functions and move allocations there. Same pattern as used in commit 90fd131afc565159c9e0ea742f082b337e10f8c6 ("netfilter: nf_tables: move dumper state allocation into ->start"). Reported-by: shaochun chen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 26 +++++++++++++++++--------- net/netfilter/nfnetlink_acct.c | 29 +++++++++++++---------------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f981bfa8db72..036207ecaf16 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -846,6 +846,21 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[]) #endif } +static int ctnetlink_start(struct netlink_callback *cb) +{ + const struct nlattr * const *cda = cb->data; + struct ctnetlink_filter *filter = NULL; + + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + filter = ctnetlink_alloc_filter(cda); + if (IS_ERR(filter)) + return PTR_ERR(filter); + } + + cb->data = filter; + return 0; +} + static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; @@ -1290,19 +1305,12 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { + .start = ctnetlink_start, .dump = ctnetlink_dump_table, .done = ctnetlink_done, + .data = (void *)cda, }; - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { - struct ctnetlink_filter *filter; - - filter = ctnetlink_alloc_filter(cda); - if (IS_ERR(filter)) - return PTR_ERR(filter); - - c.data = filter; - } return netlink_dump_start(ctnl, skb, nlh, &c); } diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index a0e5adf0b3b6..8fa8bf7c48e6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -238,29 +238,33 @@ static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, }; -static struct nfacct_filter * -nfacct_filter_alloc(const struct nlattr * const attr) +static int nfnl_acct_start(struct netlink_callback *cb) { - struct nfacct_filter *filter; + const struct nlattr *const attr = cb->data; struct nlattr *tb[NFACCT_FILTER_MAX + 1]; + struct nfacct_filter *filter; int err; + if (!attr) + return 0; + err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy, NULL); if (err < 0) - return ERR_PTR(err); + return err; if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) - return ERR_PTR(-EINVAL); + return -EINVAL; filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); if (!filter) - return ERR_PTR(-ENOMEM); + return -ENOMEM; filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); + cb->data = filter; - return filter; + return 0; } static int nfnl_acct_get(struct net *net, struct sock *nfnl, @@ -275,18 +279,11 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, + .start = nfnl_acct_start, .done = nfnl_acct_done, + .data = (void *)tb[NFACCT_FILTER], }; - if (tb[NFACCT_FILTER]) { - struct nfacct_filter *filter; - - filter = nfacct_filter_alloc(tb[NFACCT_FILTER]); - if (IS_ERR(filter)) - return PTR_ERR(filter); - - c.data = filter; - } return netlink_dump_start(nfnl, skb, nlh, &c); } -- cgit v1.2.3-55-g7522 From d209df3e7f7002d9099fdb0f6df0f972b4386a63 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Aug 2018 21:44:40 +0200 Subject: netfilter: nf_tables: fix register ordering We must register nfnetlink ops last, as that exposes nf_tables to userspace. Without this, we could theoretically get nfnetlink request before net->nft state has been initialized. Fixes: 99633ab29b213 ("netfilter: nf_tables: complete net namespace support") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 29 ++++++++++++++++++++++------- net/netfilter/nft_chain_filter.c | 2 +- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 552bfbef1bf1..0f39ac487012 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1374,6 +1374,6 @@ struct nft_trans_flowtable { (((struct nft_trans_flowtable *)trans->data)->flowtable) int __init nft_chain_filter_init(void); -void __exit nft_chain_filter_fini(void); +void nft_chain_filter_fini(void); #endif /* _NET_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3008f93469c4..80636cc59686 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7273,21 +7273,36 @@ static int __init nf_tables_module_init(void) { int err; - nft_chain_filter_init(); + err = register_pernet_subsys(&nf_tables_net_ops); + if (err < 0) + return err; + + err = nft_chain_filter_init(); + if (err < 0) + goto err1; err = nf_tables_core_module_init(); if (err < 0) - return err; + goto err2; - err = nfnetlink_subsys_register(&nf_tables_subsys); + err = register_netdevice_notifier(&nf_tables_flowtable_notifier); if (err < 0) - goto err; + goto err3; - register_netdevice_notifier(&nf_tables_flowtable_notifier); + /* must be last */ + err = nfnetlink_subsys_register(&nf_tables_subsys); + if (err < 0) + goto err4; - return register_pernet_subsys(&nf_tables_net_ops); -err: + return err; +err4: + unregister_netdevice_notifier(&nf_tables_flowtable_notifier); +err3: nf_tables_core_module_exit(); +err2: + nft_chain_filter_fini(); +err1: + unregister_pernet_subsys(&nf_tables_net_ops); return err; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index ea5b7c4944f6..9d07b277b9ee 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -392,7 +392,7 @@ int __init nft_chain_filter_init(void) return 0; } -void __exit nft_chain_filter_fini(void) +void nft_chain_filter_fini(void) { nft_chain_filter_bridge_fini(); nft_chain_filter_inet_fini(); -- cgit v1.2.3-55-g7522 From 6a48de0144767f2c6880540c0a4ac6741e3c440b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Aug 2018 21:44:41 +0200 Subject: netfilter: nf_tables: don't prevent event handler from device cleanup on netns exit When a netnsamespace exits, the nf_tables pernet_ops will remove all rules. However, there is one caveat: Base chains that register ingress hooks will cause use-after-free: device is already gone at that point. The device event handlers prevent this from happening: netns exit synthesizes unregister events for all devices. However, an improper fix for a race condition made the notifiers a no-op in case they get called from netns exit path, so revert that part. This is safe now as the previous patch fixed nf_tables pernet ops and device notifier initialisation ordering. Fixes: 0a2cf5ee432c2 ("netfilter: nf_tables: close race between netns exit and rmmod") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 ++----- net/netfilter/nft_chain_filter.c | 12 +++++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 80636cc59686..1dca5683f59f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5925,10 +5925,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, if (event != NETDEV_UNREGISTER) return 0; - net = maybe_get_net(dev_net(dev)); - if (!net) - return 0; - + net = dev_net(dev); mutex_lock(&net->nft.commit_mutex); list_for_each_entry(table, &net->nft.tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { @@ -5936,7 +5933,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, } } mutex_unlock(&net->nft.commit_mutex); - put_net(net); + return NOTIFY_DONE; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 9d07b277b9ee..3fd540b2c6ba 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -293,6 +293,13 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, if (strcmp(basechain->dev_name, dev->name) != 0) return; + /* UNREGISTER events are also happpening on netns exit. + * + * Altough nf_tables core releases all tables/chains, only + * this event handler provides guarantee that + * basechain.ops->dev is still accessible, so we cannot + * skip exiting net namespaces. + */ __nft_release_basechain(ctx); break; case NETDEV_CHANGENAME: @@ -318,10 +325,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - ctx.net = maybe_get_net(ctx.net); - if (!ctx.net) - return NOTIFY_DONE; - mutex_lock(&ctx.net->nft.commit_mutex); list_for_each_entry(table, &ctx.net->nft.tables, list) { if (table->family != NFPROTO_NETDEV) @@ -338,7 +341,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, } } mutex_unlock(&ctx.net->nft.commit_mutex); - put_net(ctx.net); return NOTIFY_DONE; } -- cgit v1.2.3-55-g7522 From 1c117d3b721a2678020d522ff1bf33984d4f0a5a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Aug 2018 12:30:09 +0200 Subject: netfilter: conntrack: fix removal of conntrack entries when l4tracker is removed nf_ct_l4proto_unregister_one() leaves conntracks added by to-be-removed tracker behind, nf_ct_l4proto_unregister has to iterate for each protocol to be removed. v2: call nf_ct_iterate_destroy without holding nf_ct_proto_mutex. Fixes: 2c41f33c1b703 ("netfilter: move table iteration out of netns exit paths") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 30070732ee50..9f14b0df6960 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -312,7 +312,9 @@ void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) __nf_ct_l4proto_unregister_one(l4proto); mutex_unlock(&nf_ct_proto_mutex); - synchronize_rcu(); + synchronize_net(); + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); } EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one); @@ -333,14 +335,17 @@ static void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], unsigned int num_proto) { + int i; + mutex_lock(&nf_ct_proto_mutex); - while (num_proto-- != 0) - __nf_ct_l4proto_unregister_one(l4proto[num_proto]); + for (i = 0; i < num_proto; i++) + __nf_ct_l4proto_unregister_one(l4proto[i]); mutex_unlock(&nf_ct_proto_mutex); synchronize_net(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); + + for (i = 0; i < num_proto; i++) + nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto[i]); } static int -- cgit v1.2.3-55-g7522 From a148ce15375fc664ad64762c751c0c2aecb2cafe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 7 Aug 2018 21:54:00 +0200 Subject: netfilter: x_tables: do not fail xt_alloc_table_info too easilly eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_table_info()") has unintentionally fortified xt_alloc_table_info allocation when __GFP_RETRY has been dropped from the vmalloc fallback. Later on there was a syzbot report that this can lead to OOM killer invocations when tables are too large and 0537250fdc6c ("netfilter: x_tables: make allocation less aggressive") has been merged to restore the original behavior. Georgi Nikolov however noticed that he is not able to install his iptables anymore so this can be seen as a regression. The primary argument for 0537250fdc6c was that this allocation path shouldn't really trigger the OOM killer and kill innocent tasks. On the other hand the interface requires root and as such should allow what the admin asks for. Root inside a namespaces makes this more complicated because those might be not trusted in general. If they are not then such namespaces should be restricted anyway. Therefore drop the __GFP_NORETRY and replace it by __GFP_ACCOUNT to enfore memcg constrains on it. Fixes: 0537250fdc6c ("netfilter: x_tables: make allocation less aggressive") Reported-by: Georgi Nikolov Suggested-by: Vlastimil Babka Acked-by: Florian Westphal Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d0d8397c9588..aecadd471e1d 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1178,12 +1178,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE) return NULL; - /* __GFP_NORETRY is not fully supported by kvmalloc but it should - * work reasonably well if sz is too large and bail out rather - * than shoot all processes down before realizing there is nothing - * more to reclaim. - */ - info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY); + info = kvmalloc(sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; -- cgit v1.2.3-55-g7522 From 1bfc2bc7ad8ac045045e99e33aa1ba727d2d291d Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Fri, 10 Aug 2018 12:58:39 +0200 Subject: netfilter: doc: Add nf_tables part in tproxy.txt Recently, transparent proxy support has been added to nf_tables so that this document should be updated with the new information. - Nft commands are added as alternatives to iptables ones. - The link for a patched iptables is removed as it is already part of the mainline iptables implementation (and the link is dead). - tcprdr is added as an example implementation of a transparent proxy Cc: "David S. Miller" Cc: Jonathan Corbet Cc: Florian Westphal Cc: KOVACS Krisztian Cc: Pablo Neira Ayuso Cc: linux-doc@vger.kernel.org Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/tproxy.txt | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt index ec11429e1d42..b9a188823d9f 100644 --- a/Documentation/networking/tproxy.txt +++ b/Documentation/networking/tproxy.txt @@ -5,19 +5,28 @@ This feature adds Linux 2.2-like transparent proxy support to current kernels. To use it, enable the socket match and the TPROXY target in your kernel config. You will need policy routing too, so be sure to enable that as well. +From Linux 4.18 transparent proxy support is also available in nf_tables. 1. Making non-local sockets work ================================ The idea is that you identify packets with destination address matching a local -socket on your box, set the packet mark to a certain value, and then match on that -value using policy routing to have those packets delivered locally: +socket on your box, set the packet mark to a certain value: # iptables -t mangle -N DIVERT # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT # iptables -t mangle -A DIVERT -j MARK --set-mark 1 # iptables -t mangle -A DIVERT -j ACCEPT +Alternatively you can do this in nft with the following commands: + +# nft add table filter +# nft add chain filter divert "{ type filter hook prerouting priority -150; }" +# nft add rule filter divert meta l4proto tcp socket transparent 1 meta mark set 1 accept + +And then match on that value using policy routing to have those packets +delivered locally: + # ip rule add fwmark 1 lookup 100 # ip route add local 0.0.0.0/0 dev lo table 100 @@ -57,17 +66,28 @@ add rules like this to the iptables ruleset above: # iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ --tproxy-mark 0x1/0x1 --on-port 50080 +Or the following rule to nft: + +# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept + Note that for this to work you'll have to modify the proxy to enable (SOL_IP, IP_TRANSPARENT) for the listening socket. +As an example implementation, tcprdr is available here: +https://git.breakpoint.cc/cgit/fw/tcprdr.git/ +This tool is written by Florian Westphal and it was used for testing during the +nf_tables implementation. -3. Iptables extensions -====================== +3. Iptables and nf_tables extensions +==================================== -To use tproxy you'll need to have the 'socket' and 'TPROXY' modules -compiled for iptables. A patched version of iptables is available -here: http://git.balabit.hu/?p=bazsi/iptables-tproxy.git +To use tproxy you'll need to have the following modules compiled for iptables: + - NETFILTER_XT_MATCH_SOCKET + - NETFILTER_XT_TARGET_TPROXY +Or the floowing modules for nf_tables: + - NFT_SOCKET + - NFT_TPROXY 4. Application support ====================== -- cgit v1.2.3-55-g7522 From 3206c516ce4e4b56d7b99341814c261ec190f6df Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Fri, 10 Aug 2018 22:52:37 +0530 Subject: netfilter: nft_ct: make l3 protocol field optional for timeout object If l3 protocol value is not specified for ct timeout object then use the value from nft_ctx protocol family. Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 4855d4ce1c8f..26a8baebd072 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -832,12 +832,13 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, __u8 l4num; int ret; - if (!tb[NFTA_CT_TIMEOUT_L3PROTO] || - !tb[NFTA_CT_TIMEOUT_L4PROTO] || + if (!tb[NFTA_CT_TIMEOUT_L4PROTO] || !tb[NFTA_CT_TIMEOUT_DATA]) return -EINVAL; - l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); + if (tb[NFTA_CT_TIMEOUT_L3PROTO]) + l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); + l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; -- cgit v1.2.3-55-g7522 From cdb2f401246ef70b37eeedd9c6f9c28d611d8255 Mon Sep 17 00:00:00 2001 From: Dmitry V. Levin Date: Mon, 13 Aug 2018 18:43:36 +0300 Subject: netfilter: uapi: fix linux/netfilter/nf_osf.h userspace compilation errors Move inclusion of and from linux/netfilter/xt_osf.h to linux/netfilter/nf_osf.h to fix the following linux/netfilter/nf_osf.h userspace compilation errors: /usr/include/linux/netfilter/nf_osf.h:59:24: error: 'MAX_IPOPTLEN' undeclared here (not in a function) struct nf_osf_opt opt[MAX_IPOPTLEN]; /usr/include/linux/netfilter/nf_osf.h:64:17: error: field 'ip' has incomplete type struct iphdr ip; /usr/include/linux/netfilter/nf_osf.h:65:18: error: field 'tcp' has incomplete type struct tcphdr tcp; Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf") Signed-off-by: Dmitry V. Levin Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_osf.h | 2 ++ include/uapi/linux/netfilter/xt_osf.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h index 76a3527df5dd..272bc3195f2d 100644 --- a/include/uapi/linux/netfilter/nfnetlink_osf.h +++ b/include/uapi/linux/netfilter/nfnetlink_osf.h @@ -2,6 +2,8 @@ #define _NF_OSF_H #include +#include +#include #define MAXGENRELEN 32 diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index 24102b5286ec..6e466236ca4b 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -21,8 +21,6 @@ #define _XT_OSF_H #include -#include -#include #include #define XT_OSF_GENRE NF_OSF_GENRE -- cgit v1.2.3-55-g7522 From 90d827f06bebbd9aded00c152e6c9eb2db4db1a3 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Tue, 14 Aug 2018 22:09:10 +0200 Subject: netfilter: nft_tproxy: Fix missing-braces warning This patch fixes a warning reported by the kbuild test robot (from linux-next tree): net/netfilter/nft_tproxy.c: In function 'nft_tproxy_eval_v6': >> net/netfilter/nft_tproxy.c:85:9: warning: missing braces around initializer [-Wmissing-braces] struct in6_addr taddr = {0}; ^ net/netfilter/nft_tproxy.c:85:9: warning: (near initialization for 'taddr.in6_u') [-Wmissing-braces] This warning is actually caused by a gcc bug already resolved in newer versions (kbuild used 4.9) so this kind of initialization is omitted and memset is used instead. Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tproxy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index eff99dffc842..f92a82c73880 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -82,13 +82,15 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); - struct in6_addr taddr = {0}; + struct in6_addr taddr; int thoff = pkt->xt.thoff; struct udphdr _hdr, *hp; __be16 tport = 0; struct sock *sk; int l4proto; + memset(&taddr, 0, sizeof(taddr)); + if (!pkt->tprot_set) { regs->verdict.code = NFT_BREAK; return; -- cgit v1.2.3-55-g7522 From feb9f55c33e5114127238a2c87c069b4f30d1f23 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Aug 2018 15:37:23 +0200 Subject: netfilter: nft_dynset: allow dynamic updates of non-anonymous set This check is superfluous since it breaks valid configurations, remove it. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_dynset.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 81184c244d1a..6e91a37d57f2 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -187,8 +187,6 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_EXPR] != NULL) { if (!(set->flags & NFT_SET_EVAL)) return -EINVAL; - if (!nft_set_is_anonymous(set)) - return -EOPNOTSUPP; priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); if (IS_ERR(priv->expr)) -- cgit v1.2.3-55-g7522 From a51c76b4dfb30496dc65396a957ef0f06af7fb22 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 14 Aug 2018 17:28:26 +0800 Subject: cls_matchall: fix tcf_unbind_filter missing Fix tcf_unbind_filter missing in cls_matchall as this will trigger WARN_ON() in cbq_destroy_class(). Fixes: fd62d9f5c575f ("net/sched: matchall: Fix configuration race") Reported-by: Li Shuang Signed-off-by: Hangbin Liu Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index af16f36ed578..856fa79d4ffd 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -113,6 +113,8 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) if (!head) return; + tcf_unbind_filter(tp, &head->res); + if (!tc_skip_hw(head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head, extack); -- cgit v1.2.3-55-g7522 From 5f34f69ede27dea1f167a0a6d8996f0194b27ba8 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Tue, 14 Aug 2018 13:33:37 +0100 Subject: dt-bindings: net: ravb: Add support for r8a774a1 SoC Document RZ/G2M (R8A774A1) SoC bindings. Signed-off-by: Fabrizio Castro Reviewed-by: Biju Das Reviewed-by: Sergei Shtylyov Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/renesas,ravb.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index 19740d01cab0..da249b7c406c 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -16,6 +16,7 @@ Required properties: - "renesas,etheravb-rcar-gen2" as a fallback for the above R-Car Gen2 and RZ/G1 devices. + - "renesas,etheravb-r8a774a1" for the R8A774A1 SoC. - "renesas,etheravb-r8a7795" for the R8A7795 SoC. - "renesas,etheravb-r8a7796" for the R8A7796 SoC. - "renesas,etheravb-r8a77965" for the R8A77965 SoC. @@ -24,7 +25,7 @@ Required properties: - "renesas,etheravb-r8a77990" for the R8A77990 SoC. - "renesas,etheravb-r8a77995" for the R8A77995 SoC. - "renesas,etheravb-rcar-gen3" as a fallback for the above - R-Car Gen3 devices. + R-Car Gen3 and RZ/G2 devices. When compatible with the generic version, nodes must list the SoC-specific version corresponding to the platform first followed by -- cgit v1.2.3-55-g7522 From 32039eac4c48f70dfaad36a84f30356ededc7ea8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Aug 2018 20:29:56 +0300 Subject: net: sched: act_ife: always release ife action on init error Action init API was changed to always take reference to action, even when overwriting existing action. Substitute conditional action release, which was executed only if action is newly created, with unconditional release in tcf_ife_init() error handling code to prevent double free or memory leak in case of overwrite. Fixes: 4e8ddd7f1758 ("net: sched: don't release reference on action overwrite") Reported-by: Cong Wang Signed-off-by: Vlad Buslov Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index fdb928ca81bb..d1081bdf1bdb 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -551,9 +551,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, NULL, NULL); if (err) { metadata_parse_err: - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); - if (exists) spin_unlock_bh(&ife->tcf_lock); tcf_idr_release(*a, bind); @@ -574,11 +571,10 @@ metadata_parse_err: */ err = use_all_metadata(ife); if (err) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); - if (exists) spin_unlock_bh(&ife->tcf_lock); + tcf_idr_release(*a, bind); + kfree(p); return err; } -- cgit v1.2.3-55-g7522 From ff93bca769925a2d8fd7f910cdf543d992e17f07 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 14 Aug 2018 15:21:31 -0700 Subject: ila: make lockdep happy again Previously, alloc_ila_locks() and bucket_table_alloc() call spin_lock_init() separately, therefore they have two different lock names and lock class keys. However, after commit b893281715ab ("ila: Call library function alloc_bucket_locks") they both call helper alloc_bucket_spinlocks() which now only has one lock name and lock class key. This causes a few bogus lockdep warnings as reported by syzbot. Fix this by making alloc_bucket_locks() a macro and pass declaration name as lock name and a static lock class key inside the macro. Fixes: b893281715ab ("ila: Call library function alloc_bucket_locks") Reported-by: Cc: Tom Herbert Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/spinlock.h | 17 ++++++++++++++--- lib/bucket_locks.c | 11 +++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 3190997df9ca..e089157dcf97 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -451,9 +451,20 @@ extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) -int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, - size_t max_size, unsigned int cpu_mult, - gfp_t gfp); +int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, + size_t max_size, unsigned int cpu_mult, + gfp_t gfp, const char *name, + struct lock_class_key *key); + +#define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp) \ + ({ \ + static struct lock_class_key key; \ + int ret; \ + \ + ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size, \ + cpu_mult, gfp, #locks, &key); \ + ret; \ + }) void free_bucket_spinlocks(spinlock_t *locks); diff --git a/lib/bucket_locks.c b/lib/bucket_locks.c index ade3ce6c4af6..64b92e1dbace 100644 --- a/lib/bucket_locks.c +++ b/lib/bucket_locks.c @@ -11,8 +11,9 @@ * to a power of 2 to be suitable as a hash table. */ -int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, - size_t max_size, unsigned int cpu_mult, gfp_t gfp) +int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, + size_t max_size, unsigned int cpu_mult, gfp_t gfp, + const char *name, struct lock_class_key *key) { spinlock_t *tlocks = NULL; unsigned int i, size; @@ -33,8 +34,10 @@ int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, tlocks = kvmalloc_array(size, sizeof(spinlock_t), gfp); if (!tlocks) return -ENOMEM; - for (i = 0; i < size; i++) + for (i = 0; i < size; i++) { spin_lock_init(&tlocks[i]); + lockdep_init_map(&tlocks[i].dep_map, name, key, 0); + } } *locks = tlocks; @@ -42,7 +45,7 @@ int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, return 0; } -EXPORT_SYMBOL(alloc_bucket_spinlocks); +EXPORT_SYMBOL(__alloc_bucket_spinlocks); void free_bucket_spinlocks(spinlock_t *locks) { -- cgit v1.2.3-55-g7522 From 7797b93b756dabffd1a8db3e7e7b778fb07ef0a6 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Wed, 15 Aug 2018 17:07:29 +0900 Subject: veth: Free queues on link delete David Ahern reported memory leak in veth. ======================================================================= $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800354d5c00 (size 1024): comm "ip", pid 836, jiffies 4294722952 (age 25.904s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x147/0x3ac [veth] ... unreferenced object 0xffff88002e009c00 (size 1024): comm "ip", pid 836, jiffies 4294722958 (age 25.898s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<(____ptrval____)>] kmemleak_alloc+0x70/0x94 [<(____ptrval____)>] slab_post_alloc_hook+0x42/0x52 [<(____ptrval____)>] __kmalloc+0x101/0x142 [<(____ptrval____)>] kmalloc_array.constprop.20+0x1e/0x26 [veth] [<(____ptrval____)>] veth_newlink+0x219/0x3ac [veth] ======================================================================= veth_rq allocated in veth_newlink() was not freed on dellink. We need to free up them after veth_close() so that any packets will not reference the queues afterwards. Thus free them in veth_dev_free() in the same way as freeing stats structure (vstats). Also move queues allocation to veth_dev_init() to be in line with stats allocation. Fixes: 638264dc90227 ("veth: Support per queue XDP ring") Reported-by: David Ahern Signed-off-by: Toshiaki Makita Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/veth.c | 70 +++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index e3202af72df5..8d679c8b7f25 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -789,16 +789,48 @@ static int is_valid_veth_mtu(int mtu) return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; } +static int veth_alloc_queues(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); + if (!priv->rq) + return -ENOMEM; + + for (i = 0; i < dev->num_rx_queues; i++) + priv->rq[i].dev = dev; + + return 0; +} + +static void veth_free_queues(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + kfree(priv->rq); +} + static int veth_dev_init(struct net_device *dev) { + int err; + dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats); if (!dev->vstats) return -ENOMEM; + + err = veth_alloc_queues(dev); + if (err) { + free_percpu(dev->vstats); + return err; + } + return 0; } static void veth_dev_free(struct net_device *dev) { + veth_free_queues(dev); free_percpu(dev->vstats); } @@ -1040,31 +1072,13 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int veth_alloc_queues(struct net_device *dev) -{ - struct veth_priv *priv = netdev_priv(dev); - - priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); - if (!priv->rq) - return -ENOMEM; - - return 0; -} - -static void veth_free_queues(struct net_device *dev) -{ - struct veth_priv *priv = netdev_priv(dev); - - kfree(priv->rq); -} - static struct rtnl_link_ops veth_link_ops; static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - int err, i; + int err; struct net_device *peer; struct veth_priv *priv; char ifname[IFNAMSIZ]; @@ -1117,12 +1131,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, return PTR_ERR(peer); } - err = veth_alloc_queues(peer); - if (err) { - put_net(net); - goto err_peer_alloc_queues; - } - if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); @@ -1151,10 +1159,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, * should be re-allocated */ - err = veth_alloc_queues(dev); - if (err) - goto err_alloc_queues; - if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); @@ -1174,28 +1178,20 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, */ priv = netdev_priv(dev); - for (i = 0; i < dev->real_num_rx_queues; i++) - priv->rq[i].dev = dev; rcu_assign_pointer(priv->peer, peer); priv = netdev_priv(peer); - for (i = 0; i < peer->real_num_rx_queues; i++) - priv->rq[i].dev = peer; rcu_assign_pointer(priv->peer, dev); return 0; err_register_dev: - veth_free_queues(dev); -err_alloc_queues: /* nothing to do */ err_configure_peer: unregister_netdevice(peer); return err; err_register_peer: - veth_free_queues(peer); -err_peer_alloc_queues: free_netdev(peer); return err; } -- cgit v1.2.3-55-g7522 From 45316818371d1fb3ecbffa4d4a2d0550e1822e92 Mon Sep 17 00:00:00 2001 From: Lad, Prabhakar Date: Wed, 15 Aug 2018 16:51:23 +0100 Subject: net: dsa: add support for ksz9897 ethernet switch ksz9477 is superset of ksz9xx series, driver just works out of the box for ksz9897 chip with this patch. Signed-off-by: Lad, Prabhakar Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/ksz.txt | 4 +++- drivers/net/dsa/microchip/ksz_common.c | 9 +++++++++ drivers/net/dsa/microchip/ksz_spi.c | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt index a700943218ca..ac145b885e95 100644 --- a/Documentation/devicetree/bindings/net/dsa/ksz.txt +++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt @@ -4,7 +4,9 @@ Microchip KSZ Series Ethernet switches Required properties: - compatible: For external switch chips, compatible string must be exactly one - of: "microchip,ksz9477" + of the following: + - "microchip,ksz9477" + - "microchip,ksz9897" See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional required and optional properties. diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 7210c49b7922..54e0ca6ed730 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -1102,6 +1102,15 @@ static const struct ksz_chip_data ksz_switch_chips[] = { .cpu_ports = 0x7F, /* can be configured as cpu port */ .port_cnt = 7, /* total physical port count */ }, + { + .chip_id = 0x00989700, + .dev_name = "KSZ9897", + .num_vlans = 4096, + .num_alus = 4096, + .num_statics = 16, + .cpu_ports = 0x7F, /* can be configured as cpu port */ + .port_cnt = 7, /* total physical port count */ + }, }; static int ksz_switch_init(struct ksz_device *dev) diff --git a/drivers/net/dsa/microchip/ksz_spi.c b/drivers/net/dsa/microchip/ksz_spi.c index c51946983bed..8c1778b42701 100644 --- a/drivers/net/dsa/microchip/ksz_spi.c +++ b/drivers/net/dsa/microchip/ksz_spi.c @@ -195,6 +195,7 @@ static int ksz_spi_remove(struct spi_device *spi) static const struct of_device_id ksz_dt_ids[] = { { .compatible = "microchip,ksz9477" }, + { .compatible = "microchip,ksz9897" }, {}, }; MODULE_DEVICE_TABLE(of, ksz_dt_ids); -- cgit v1.2.3-55-g7522 From 5e22002aa8809e2efab2da95855f73f63e14a36c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 15 Aug 2018 12:14:05 -0700 Subject: isdn: Disable IIOCDBGVAR It was possible to directly leak the kernel address where the isdn_dev structure pointer was stored. This is a kernel ASLR bypass for anyone with access to the ioctl. The code had been present since the beginning of git history, though this shouldn't ever be needed for normal operation, therefore remove it. Reported-by: Al Viro Cc: Karsten Keil Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- drivers/isdn/i4l/isdn_common.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c index 7a501dbe7123..6a5b3f00f9ad 100644 --- a/drivers/isdn/i4l/isdn_common.c +++ b/drivers/isdn/i4l/isdn_common.c @@ -1640,13 +1640,7 @@ isdn_ioctl(struct file *file, uint cmd, ulong arg) } else return -EINVAL; case IIOCDBGVAR: - if (arg) { - if (copy_to_user(argp, &dev, sizeof(ulong))) - return -EFAULT; - return 0; - } else - return -EINVAL; - break; + return -EINVAL; default: if ((cmd & IIOCDRVCTL) == IIOCDRVCTL) cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK; -- cgit v1.2.3-55-g7522 From 70fd8036d0d77bd93ae12185c66504a409d5b54e Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Wed, 15 Aug 2018 23:29:53 +0300 Subject: Documentation: networking: ti-cpsw: correct cbs parameters for Eth1 100Mb If set cbs parameters calculated for 1000Mb, but use on 100Mb port w/o h/w offload (for cpsw offload it doesn't matter), it works incorrectly. According to the example and testing board, second port is 100Mb interface. Correct them on recalculated for 100Mb interface. It allows to use the same command for CBS software implementation for board in example. Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- Documentation/networking/ti-cpsw.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/ti-cpsw.txt b/Documentation/networking/ti-cpsw.txt index 67039205bd69..d4d4c0751a09 100644 --- a/Documentation/networking/ti-cpsw.txt +++ b/Documentation/networking/ti-cpsw.txt @@ -469,17 +469,18 @@ $ tc -g class show dev eth1 14) // Set rate for class A - 31 Mbit (tc0, txq2) using CBS Qdisc for Eth1 -// here only idle slope is important, others ignored +// here only idle slope is important, others ignored, but calculated +// for interface speed - 100Mb for eth1 port. // Set it +1 Mb for reserve (important!) -$ tc qdisc add dev eth1 parent 100:3 cbs locredit -1453 \ -hicredit 47 sendslope -969000 idleslope 31000 offload 1 +$ tc qdisc add dev eth1 parent 100:3 cbs locredit -1035 \ +hicredit 465 sendslope -69000 idleslope 31000 offload 1 net eth1: set FIFO3 bw = 31 15) // Set rate for class B - 11 Mbit (tc1, txq3) using CBS Qdisc for Eth1 // Set it +1 Mb for reserve (important!) -$ tc qdisc add dev eth1 parent 100:4 cbs locredit -1483 \ -hicredit 34 sendslope -989000 idleslope 11000 offload 1 +$ tc qdisc add dev eth1 parent 100:4 cbs locredit -1335 \ +hicredit 405 sendslope -89000 idleslope 11000 offload 1 net eth1: set FIFO2 bw = 11 16) -- cgit v1.2.3-55-g7522 From 54c73f8651500181a948aa6767ae8c2bcfbe9f35 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Thu, 16 Aug 2018 12:02:20 +0300 Subject: net/mlx5e: Delete unneeded function argument priv argument is not used by the function, delete it. Fixes: a89842811ea98 ("net/mlx5e: Merge per priority stats groups") Signed-off-by: Yuval Shaia Reviewed-by: Leon Romanovsky Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 12fdf5c92b67..6839481f7697 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -848,7 +848,7 @@ static const struct counter_desc pport_per_prio_traffic_stats_desc[] = { #define NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS ARRAY_SIZE(pport_per_prio_traffic_stats_desc) -static int mlx5e_grp_per_prio_traffic_get_num_stats(struct mlx5e_priv *priv) +static int mlx5e_grp_per_prio_traffic_get_num_stats(void) { return NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS * NUM_PPORT_PRIO; } @@ -1006,7 +1006,7 @@ static int mlx5e_grp_per_prio_pfc_fill_stats(struct mlx5e_priv *priv, static int mlx5e_grp_per_prio_get_num_stats(struct mlx5e_priv *priv) { - return mlx5e_grp_per_prio_traffic_get_num_stats(priv) + + return mlx5e_grp_per_prio_traffic_get_num_stats() + mlx5e_grp_per_prio_pfc_get_num_stats(priv); } -- cgit v1.2.3-55-g7522 From 21b172ee11b6ec260bd7e6a27b11a8a8d392fce5 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 13 Aug 2018 12:21:58 +0300 Subject: net/xdp: Fix suspicious RCU usage warning Fix the warning below by calling rhashtable_lookup_fast. Also, make some code movements for better quality and human readability. [ 342.450870] WARNING: suspicious RCU usage [ 342.455856] 4.18.0-rc2+ #17 Tainted: G O [ 342.462210] ----------------------------- [ 342.467202] ./include/linux/rhashtable.h:481 suspicious rcu_dereference_check() usage! [ 342.476568] [ 342.476568] other info that might help us debug this: [ 342.476568] [ 342.486978] [ 342.486978] rcu_scheduler_active = 2, debug_locks = 1 [ 342.495211] 4 locks held by modprobe/3934: [ 342.500265] #0: 00000000e23116b2 (mlx5_intf_mutex){+.+.}, at: mlx5_unregister_interface+0x18/0x90 [mlx5_core] [ 342.511953] #1: 00000000ca16db96 (rtnl_mutex){+.+.}, at: unregister_netdev+0xe/0x20 [ 342.521109] #2: 00000000a46e2c4b (&priv->state_lock){+.+.}, at: mlx5e_close+0x29/0x60 [mlx5_core] [ 342.531642] #3: 0000000060c5bde3 (mem_id_lock){+.+.}, at: xdp_rxq_info_unreg+0x93/0x6b0 [ 342.541206] [ 342.541206] stack backtrace: [ 342.547075] CPU: 12 PID: 3934 Comm: modprobe Tainted: G O 4.18.0-rc2+ #17 [ 342.556621] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 1.5.4 10/002/2015 [ 342.565606] Call Trace: [ 342.568861] dump_stack+0x78/0xb3 [ 342.573086] xdp_rxq_info_unreg+0x3f5/0x6b0 [ 342.578285] ? __call_rcu+0x220/0x300 [ 342.582911] mlx5e_free_rq+0x38/0xc0 [mlx5_core] [ 342.588602] mlx5e_close_channel+0x20/0x120 [mlx5_core] [ 342.594976] mlx5e_close_channels+0x26/0x40 [mlx5_core] [ 342.601345] mlx5e_close_locked+0x44/0x50 [mlx5_core] [ 342.607519] mlx5e_close+0x42/0x60 [mlx5_core] [ 342.613005] __dev_close_many+0xb1/0x120 [ 342.617911] dev_close_many+0xa2/0x170 [ 342.622622] rollback_registered_many+0x148/0x460 [ 342.628401] ? __lock_acquire+0x48d/0x11b0 [ 342.633498] ? unregister_netdev+0xe/0x20 [ 342.638495] rollback_registered+0x56/0x90 [ 342.643588] unregister_netdevice_queue+0x7e/0x100 [ 342.649461] unregister_netdev+0x18/0x20 [ 342.654362] mlx5e_remove+0x2a/0x50 [mlx5_core] [ 342.659944] mlx5_remove_device+0xe5/0x110 [mlx5_core] [ 342.666208] mlx5_unregister_interface+0x39/0x90 [mlx5_core] [ 342.673038] cleanup+0x5/0xbfc [mlx5_core] [ 342.678094] __x64_sys_delete_module+0x16b/0x240 [ 342.683725] ? do_syscall_64+0x1c/0x210 [ 342.688476] do_syscall_64+0x5a/0x210 [ 342.693025] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 8d5d88527587 ("xdp: rhashtable with allocator ID to pointer mapping") Signed-off-by: Tariq Toukan Suggested-by: Daniel Borkmann Cc: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/xdp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 3dd99e1c04f5..89b6785cef2a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -98,23 +98,15 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; int id = xdp_rxq->mem.id; - int err; if (id == 0) return; mutex_lock(&mem_id_lock); - xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - return; - } - - err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); - WARN_ON(err); - - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); mutex_unlock(&mem_id_lock); } -- cgit v1.2.3-55-g7522 From 817b89beb9d8876450fcde9155e17425c329569d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 15 Aug 2018 16:57:14 +0200 Subject: samples/bpf: all XDP samples should unload xdp/bpf prog on SIGTERM It is common XDP practice to unload/deattach the XDP bpf program, when the XDP sample program is Ctrl-C interrupted (SIGINT) or killed (SIGTERM). The samples/bpf programs xdp_redirect_cpu and xdp_rxq_info, forgot to trap signal SIGTERM (which is the default signal used by the kill command). This was discovered by Red Hat QA, which automated scripts depend on killing the XDP sample program after a timeout period. Fixes: fad3917e361b ("samples/bpf: add cpumap sample program xdp_redirect_cpu") Fixes: 0fca931a6f21 ("samples/bpf: program demonstrating access to xdp_rxq_info") Reported-by: Jean-Tsung Hsiao Signed-off-by: Jesper Dangaard Brouer Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_redirect_cpu_user.c | 3 ++- samples/bpf/xdp_rxq_info_user.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 9a6c7e0a6dd1..2d23054aaccf 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -679,8 +679,9 @@ int main(int argc, char **argv) return EXIT_FAIL_OPTION; } - /* Remove XDP program when program is interrupted */ + /* Remove XDP program when program is interrupted or killed */ signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); if (bpf_set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) { fprintf(stderr, "link set xdp fd failed\n"); diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 248a7eab9531..ef26f882f92f 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -567,8 +567,9 @@ int main(int argc, char **argv) exit(EXIT_FAIL_BPF); } - /* Remove XDP program when program is interrupted */ + /* Remove XDP program when program is interrupted or killed */ signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { fprintf(stderr, "link set xdp fd failed\n"); -- cgit v1.2.3-55-g7522 From 965931e3a803a506482616f89239eff6901c17b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Aug 2018 11:01:12 -0700 Subject: bpf: fix a rcu usage warning in bpf_prog_array_copy_core() Commit 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") refactored the bpf_prog_array_copy_core() to accommodate new structure bpf_prog_array_item which contains bpf_prog array itself. In the old code, we had perf_event_query_prog_array(): mutex_lock(...) bpf_prog_array_copy_call(): prog = rcu_dereference_check(array, 1)->progs bpf_prog_array_copy_core(prog, ...) mutex_unlock(...) With the above commit, we had perf_event_query_prog_array(): mutex_lock(...) bpf_prog_array_copy_call(): bpf_prog_array_copy_core(array, ...): item = rcu_dereference(array)->items; ... mutex_unlock(...) The new code will trigger a lockdep rcu checking warning. The fix is to change rcu_dereference() to rcu_dereference_check() to prevent such a warning. Reported-by: syzbot+6e72317008eef84a216b@syzkaller.appspotmail.com Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage") Cc: Roman Gushchin Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4d09e610777f..3f5bf1af0826 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1579,7 +1579,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference(array)->items; + item = rcu_dereference_check(array, 1)->items; for (; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; -- cgit v1.2.3-55-g7522 From 037b0b86ecf5646f8eae777d8b52ff8b401692ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:06 +0200 Subject: tcp, ulp: add alias for all ulp modules Lets not turn the TCP ULP lookup into an arbitrary module loader as we only intend to load ULP modules through this mechanism, not other unrelated kernel modules: [root@bar]# cat foo.c #include #include #include #include int main(void) { int sock = socket(PF_INET, SOCK_STREAM, 0); setsockopt(sock, IPPROTO_TCP, TCP_ULP, "sctp", sizeof("sctp")); return 0; } [root@bar]# gcc foo.c -O2 -Wall [root@bar]# lsmod | grep sctp [root@bar]# ./a.out [root@bar]# lsmod | grep sctp sctp 1077248 4 libcrc32c 16384 3 nf_conntrack,nf_nat,sctp [root@bar]# Fix it by adding module alias to TCP ULP modules, so probing module via request_module() will be limited to tcp-ulp-[name]. The existing modules like kTLS will load fine given tcp-ulp-tls alias, but others will fail to load: [root@bar]# lsmod | grep sctp [root@bar]# ./a.out [root@bar]# lsmod | grep sctp [root@bar]# Sockmap is not affected from this since it's either built-in or not. Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 ++++ net/ipv4/tcp_ulp.c | 2 +- net/tls/tls_main.c | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d196901c9dba..770917d0caa7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2065,6 +2065,10 @@ int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); +#define MODULE_ALIAS_TCP_ULP(name) \ + __MODULE_INFO(alias, alias_userspace, name); \ + __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 622caa4039e0..7dd44b6156c7 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -51,7 +51,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); - request_module("%s", name); + request_module("tcp-ulp-%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b09867c8b817..93c0c225ab34 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -45,6 +45,7 @@ MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_TCP_ULP("tls"); enum { TLSV4, -- cgit v1.2.3-55-g7522 From 90545cdc3f2b2ea700e24335610cd181e73756da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:07 +0200 Subject: tcp, ulp: fix leftover icsk_ulp_ops preventing sock from reattach I found that in BPF sockmap programs once we either delete a socket from the map or we updated a map slot and the old socket was purged from the map that these socket can never get reattached into a map even though their related psock has been dropped entirely at that point. Reason is that tcp_cleanup_ulp() leaves the old icsk->icsk_ulp_ops intact, so that on the next tcp_set_ulp_id() the kernel returns an -EEXIST thinking there is still some active ULP attached. BPF sockmap is the only one that has this issue as the other user, kTLS, only calls tcp_cleanup_ulp() from tcp_v4_destroy_sock() whereas sockmap semantics allow dropping the socket from the map with all related psock state being cleaned up. Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/ipv4/tcp_ulp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 7dd44b6156c7..a5995bb2eaca 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -129,6 +129,8 @@ void tcp_cleanup_ulp(struct sock *sk) if (icsk->icsk_ulp_ops->release) icsk->icsk_ulp_ops->release(sk); module_put(icsk->icsk_ulp_ops->owner); + + icsk->icsk_ulp_ops = NULL; } /* Change upper layer protocol for socket */ -- cgit v1.2.3-55-g7522 From d40b0116c94bd8fc2b63aae35ce8e66bb53bba42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:08 +0200 Subject: bpf, sockmap: fix leakage of smap_psock_map_entry While working on sockmap I noticed that we do not always kfree the struct smap_psock_map_entry list elements which track psocks attached to maps. In the case of sock_hash_ctx_update_elem(), these map entries are allocated outside of __sock_map_ctx_update_elem() with their linkage to the socket hash table filled. In the case of sock array, the map entries are allocated inside of __sock_map_ctx_update_elem() and added with their linkage to the psock->maps. Both additions are under psock->maps_lock each. Now, we drop these elements from their psock->maps list in a few occasions: i) in sock array via smap_list_map_remove() when an entry is either deleted from the map from user space, or updated via user space or BPF program where we drop the old socket at that map slot, or the sock array is freed via sock_map_free() and drops all its elements; ii) for sock hash via smap_list_hash_remove() in exactly the same occasions as just described for sock array; iii) in the bpf_tcp_close() where we remove the elements from the list via psock_map_pop() and iterate over them dropping themselves from either sock array or sock hash; and last but not least iv) once again in smap_gc_work() which is a callback for deferring the work once the psock refcount hit zero and thus the socket is being destroyed. Problem is that the only case where we kfree() the list entry is in case iv), which at that point should have an empty list in normal cases. So in cases from i) to iii) we unlink the elements without freeing where they go out of reach from us. Hence fix is to properly kfree() them as well to stop the leakage. Given these are all handled under psock->maps_lock there is no need for deferred RCU freeing. I later also ran with kmemleak detector and it confirmed the finding as well where in the state before the fix the object goes unreferenced while after the patch no kmemleak report related to BPF showed up. [...] unreferenced object 0xffff880378eadae0 (size 64): comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 50 4d 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 PMu]............ backtrace: [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210 [<0000000045dd6d3c>] bpf_sock_map_update+0x29/0x60 [<00000000877723aa>] ___bpf_prog_run+0x1e1f/0x4960 [<000000002ef89e83>] 0xffffffffffffffff unreferenced object 0xffff880378ead240 (size 64): comm "test_sockmap", pid 2225, jiffies 4294720701 (age 43.504s) hex dump (first 32 bytes): 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 00 44 75 5d 03 88 ff ff 00 00 00 00 00 00 00 00 .Du]............ backtrace: [<000000005225ac3c>] sock_map_ctx_update_elem.isra.21+0xd8/0x210 [<0000000030e37a3a>] sock_map_update_elem+0x125/0x240 [<000000002e5ce36e>] map_update_elem+0x4eb/0x7b0 [<00000000db453cc9>] __x64_sys_bpf+0x1f9/0x360 [<0000000000763660>] do_syscall_64+0x9a/0x300 [<00000000422a2bb2>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<000000002ef89e83>] 0xffffffffffffffff [...] Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Fixes: 54fedb42c653 ("bpf: sockmap, fix smap_list_map_remove when psock is in many maps") Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0c1a696b041b..94a324bc6afc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -370,6 +370,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } raw_spin_unlock_bh(&b->lock); } + kfree(e); e = psock_map_pop(sk, psock); } rcu_read_unlock(); @@ -1675,8 +1676,10 @@ static void smap_list_map_remove(struct smap_psock *psock, spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) + if (e->entry == entry) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } @@ -1690,8 +1693,10 @@ static void smap_list_hash_remove(struct smap_psock *psock, list_for_each_entry_safe(e, tmp, &psock->maps, list) { struct htab_elem *c = rcu_dereference(e->hash_link); - if (c == hash_link) + if (c == hash_link) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } -- cgit v1.2.3-55-g7522 From 166ab6f0a0702fdd4d865ad5090bf3094ed83428 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:09 +0200 Subject: bpf, sockmap: fix map elem deletion race with smap_stop_sock The smap_start_sock() and smap_stop_sock() are each protected under the sock->sk_callback_lock from their call-sites except in the case of sock_map_delete_elem() where we drop the old socket from the map slot. This is racy because the same sock could be part of multiple sock maps, so we run smap_stop_sock() in parallel, and given at that point psock->strp_enabled might be true on both CPUs, we might for example wrongly restore the sk->sk_data_ready / sk->sk_write_space. Therefore, hold the sock->sk_callback_lock as well on delete. Looks like 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") had this right, but later on e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") removed it again from delete leaving this smap_stop_sock() instance unprotected. Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 94a324bc6afc..921cb6b8c862 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1786,8 +1786,11 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!psock) goto out; - if (psock->bpf_parse) + if (psock->bpf_parse) { + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); + } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: -- cgit v1.2.3-55-g7522 From 585f5a6252ee43ec8feeee07387e3fcc7e8bb292 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 16 Aug 2018 21:49:10 +0200 Subject: bpf, sockmap: fix sock_map_ctx_update_elem race with exist/noexist The current code in sock_map_ctx_update_elem() allows for BPF_EXIST and BPF_NOEXIST map update flags. While on array-like maps this approach is rather uncommon, e.g. bpf_fd_array_map_update_elem() and others enforce map update flags to be BPF_ANY such that xchg() can be used directly, the current implementation in sock map does not guarantee that such operation with BPF_EXIST / BPF_NOEXIST is atomic. The initial test does a READ_ONCE(stab->sock_map[i]) to fetch the socket from the slot which is then tested for NULL / non-NULL. However later after __sock_map_ctx_update_elem(), the actual update is done through osock = xchg(&stab->sock_map[i], sock). Problem is that in the meantime a different CPU could have updated / deleted a socket on that specific slot and thus flag contraints won't hold anymore. I've been thinking whether best would be to just break UAPI and do an enforcement of BPF_ANY to check if someone actually complains, however trouble is that already in BPF kselftest we use BPF_NOEXIST for the map update, and therefore it might have been copied into applications already. The fix to keep the current behavior intact would be to add a map lock similar to the sock hash bucket lock only for covering the whole map. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 106 +++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 49 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 921cb6b8c862..98e621a29e8e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -58,6 +58,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; struct bpf_sock_progs progs; + raw_spinlock_t lock; }; struct bucket { @@ -89,9 +90,9 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; + struct bpf_map *map; struct sock **entry; struct htab_elem __rcu *hash_link; - struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); while (e) { if (e->entry) { - osk = cmpxchg(e->entry, sk, NULL); + struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); + + raw_spin_lock_bh(&stab->lock); + osk = *e->entry; if (osk == sk) { + *e->entry = NULL; smap_release_sock(psock, sk); } + raw_spin_unlock_bh(&stab->lock); } else { struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = rcu_dereference(e->htab); + struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); struct hlist_head *head; struct htab_elem *l; struct bucket *b; @@ -1642,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -1716,14 +1723,15 @@ static void sock_map_free(struct bpf_map *map) * and a grace period expire to ensure psock is really safe to remove. */ rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct smap_psock *psock; struct sock *sock; - sock = xchg(&stab->sock_map[i], NULL); + sock = stab->sock_map[i]; if (!sock) continue; - + stab->sock_map[i] = NULL; psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1735,6 +1743,7 @@ static void sock_map_free(struct bpf_map *map) smap_release_sock(psock, sock); } } + raw_spin_unlock_bh(&stab->lock); rcu_read_unlock(); sock_map_remove_complete(stab); @@ -1778,14 +1787,16 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - sock = xchg(&stab->sock_map[k], NULL); + raw_spin_lock_bh(&stab->lock); + sock = stab->sock_map[k]; + stab->sock_map[k] = NULL; + raw_spin_unlock_bh(&stab->lock); if (!sock) return -EINVAL; psock = smap_psock_sk(sock); if (!psock) - goto out; - + return 0; if (psock->bpf_parse) { write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); @@ -1793,7 +1804,6 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); -out: return 0; } @@ -1829,11 +1839,9 @@ out: static int __sock_map_ctx_update_elem(struct bpf_map *map, struct bpf_sock_progs *progs, struct sock *sock, - struct sock **map_link, void *key) { struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; int err = 0; @@ -1906,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - if (map_link) { - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_free; - } - } - /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ @@ -1935,17 +1935,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); } - /* 4. Place psock in sockmap for use and stop any programs on - * the old sock assuming its not the same sock we are replacing - * it with. Because we can only have a single set of programs if - * old_sock has a strp we can stop it. - */ - if (map_link) { - e->entry = map_link; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - } return err; out_free: smap_release_sock(psock, sock); @@ -1956,7 +1945,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - kfree(e); return err; } @@ -1966,36 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock; + struct sock *osock, *sock = skops->sk; + struct smap_psock_map_entry *e; + struct smap_psock *psock; u32 i = *(u32 *)key; int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; - sock = skops->sk; - err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], - key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto out; - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); + /* psock guaranteed to be present. */ + psock = smap_psock_sk(sock); + raw_spin_lock_bh(&stab->lock); + osock = stab->sock_map[i]; + if (osock && flags == BPF_NOEXIST) { + err = -EEXIST; + goto out_unlock; + } + if (!osock && flags == BPF_EXIST) { + err = -ENOENT; + goto out_unlock; + } + + e->entry = &stab->sock_map[i]; + e->map = map; + spin_lock_bh(&psock->maps_lock); + list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); - smap_list_map_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); + stab->sock_map[i] = sock; + if (osock) { + psock = smap_psock_sk(osock); + smap_list_map_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, osock); } + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + smap_release_sock(psock, sock); + raw_spin_unlock_bh(&stab->lock); out: + kfree(e); return err; } @@ -2358,7 +2367,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, b = __select_bucket(htab, hash); head = &b->head; - err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto err; @@ -2384,8 +2393,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, } rcu_assign_pointer(e->hash_link, l_new); - rcu_assign_pointer(e->htab, - container_of(map, struct bpf_htab, map)); + e->map = map; spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); spin_unlock_bh(&psock->maps_lock); -- cgit v1.2.3-55-g7522 From a85da34e97f553d27b520757ffc9b31232694a78 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 17 Aug 2018 08:54:15 -0700 Subject: tools/bpf: fix bpf selftest test_cgroup_storage failure The bpf selftest test_cgroup_storage failed in one of our production test servers. # sudo ./test_cgroup_storage Failed to create map: Operation not permitted It turns out this is due to insufficient locked memory with system default 16KB. Similar to other self tests, let us arm the process with unlimited locked memory. With this change, the test passed. # sudo ./test_cgroup_storage test_cgroup_storage:PASS Fixes: 68cfa3ac6b8d ("selftests/bpf: add a cgroup storage test") Cc: Roman Gushchin Signed-off-by: Yonghong Song Acked-by: Roman Gushchin Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_cgroup_storage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index dc83fb2d3f27..4e196e3bfecf 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -5,6 +5,7 @@ #include #include +#include "bpf_rlimit.h" #include "cgroup_helpers.h" char bpf_log_buf[BPF_LOG_BUF_SIZE]; -- cgit v1.2.3-55-g7522 From bfdd19ad80f203f42f05fd32a31c678c9c524ef9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 17 Aug 2018 20:42:16 +0200 Subject: r8169: add missing Kconfig dependency Now that we switched the r8169 driver to use phylib, there's a dependency on the Realtek PHY drivers. This dependency was missing in Kconfig. Reported-by: Jouni Mettälä Fixes: f1e911d5d0df ("r8169: add basic phylib support") Signed-off-by: Heiner Kallweit Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/Kconfig b/drivers/net/ethernet/realtek/Kconfig index e1cd934c2e4f..96d1b9c08f1a 100644 --- a/drivers/net/ethernet/realtek/Kconfig +++ b/drivers/net/ethernet/realtek/Kconfig @@ -100,6 +100,7 @@ config R8169 select FW_LOADER select CRC32 select PHYLIB + select REALTEK_PHY ---help--- Say Y here if you have a Realtek 8169 PCI Gigabit Ethernet adapter. -- cgit v1.2.3-55-g7522 From f6069b9aa9934ede26f41ac0781fce241279ad43 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 17 Aug 2018 23:26:14 +0200 Subject: bpf: fix redirect to map under tail calls Commits 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner pointer on bpf_prog_realloc") tried to mitigate that buggy programs using bpf_redirect_map() helper call do not leave stale maps behind. Idea was to add a map_owner cookie into the per CPU struct redirect_info which was set to prog->aux by the prog making the helper call as a proof that the map is not stale since the prog is implicitly holding a reference to it. This owner cookie could later on get compared with the program calling into BPF whether they match and therefore the redirect could proceed with processing the map safely. In (obvious) hindsight, this approach breaks down when tail calls are involved since the original caller's prog->aux pointer does not have to match the one from one of the progs out of the tail call chain, and therefore the xdp buffer will be dropped instead of redirected. A way around that would be to fix the issue differently (which also allows to remove related work in fast path at the same time): once the life-time of a redirect map has come to its end we use it's map free callback where we need to wait on synchronize_rcu() for current outstanding xdp buffers and remove such a map pointer from the redirect info if found to be present. At that time no program is using this map anymore so we simply invalidate the map pointers to NULL iff they previously pointed to that instance while making sure that the redirect path only reads out the map once. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") Reported-by: Sebastiano Miano Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +- include/trace/events/xdp.h | 5 ++-- kernel/bpf/cpumap.c | 2 ++ kernel/bpf/devmap.c | 1 + kernel/bpf/verifier.c | 21 -------------- kernel/bpf/xskmap.c | 1 + net/core/filter.c | 68 ++++++++++++++++++++-------------------------- 7 files changed, 38 insertions(+), 63 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 5d565c50bcb2..6791a0ac0139 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,7 +543,6 @@ struct bpf_redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - unsigned long map_owner; u32 kern_flags; }; @@ -781,6 +780,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +void bpf_clear_redirect_map(struct bpf_map *map); + static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 1ecf4c67fcf7..e95cb86b65cf 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -147,9 +147,8 @@ struct _bpf_dtab_netdev { #define devmap_ifindex(fwd, map) \ (!fwd ? 0 : \ - (!map ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 620bc5024d7d..24aac0d0f412 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map) * It does __not__ ensure pending flush operations (if any) are * complete. */ + + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ac1df79f3788..141710b82a6c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca90679a7fe5..92246117d2b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5844,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_redirect_map) { - /* Note, we cannot use prog directly as imm as subsequent - * rewrites would still change the prog pointer. The only - * stable address we can use is aux, which also works with - * prog clones during blinding. - */ - u64 addr = (unsigned long)prog->aux; - struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), - *insn, - }; - cnt = ARRAY_SIZE(r4_ld); - - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4ddf61e158f6..9f8463afda9c 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map) struct xsk_map *m = container_of(map, struct xsk_map, map); int i; + bpf_clear_redirect_map(map); synchronize_net(); for (i = 0; i < map->max_entries; i++) { diff --git a/net/core/filter.c b/net/core/filter.c index fd423ce3da34..c25eb36f1320 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3246,31 +3246,33 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) } } -static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, - unsigned long aux) +void bpf_clear_redirect_map(struct bpf_map *map) { - return (unsigned long)xdp_prog->aux != aux; + struct bpf_redirect_info *ri; + int cpu; + + for_each_possible_cpu(cpu) { + ri = per_cpu_ptr(&bpf_redirect_info, cpu); + /* Avoid polluting remote cacheline due to writes if + * not needed. Once we pass this test, we need the + * cmpxchg() to make sure it hasn't been changed in + * the meantime by remote CPU. + */ + if (unlikely(READ_ONCE(ri->map) == map)) + cmpxchg(&ri->map, map, NULL); + } } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; - - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } + WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { @@ -3296,12 +3298,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); struct net_device *fwd; u32 index = ri->ifindex; int err; - if (ri->map) - return xdp_do_redirect_map(dev, xdp, xdp_prog); + if (map) + return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; @@ -3325,24 +3328,17 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, + struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - unsigned long map_owner = ri->map_owner; - struct bpf_map *map = ri->map; u32 index = ri->ifindex; void *fwd = NULL; int err = 0; ri->ifindex = 0; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; @@ -3379,13 +3375,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; - if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); - + if (map) + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, + map); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { @@ -3416,8 +3413,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; - ri->map_owner = 0; + WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; } @@ -3430,8 +3426,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - unsigned long, map_owner) +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, + u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -3440,15 +3436,11 @@ BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; - ri->map = map; - ri->map_owner = map_owner; + WRITE_ONCE(ri->map, map); return XDP_REDIRECT; } -/* Note, arg4 is hidden from users and populated by the verifier - * with the right pointer. - */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, -- cgit v1.2.3-55-g7522 From bb107456ba5a85e50567d56b0c8050c2ad199e9b Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Sat, 18 Aug 2018 22:43:48 +0800 Subject: ip6_vti: simplify stats handling in vti6_xmit Same as ip_vti, use iptunnel_xmit_stats to updates stats in tunnel xmit code path. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c72ae3a4fe09..65d4a8044cb0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -503,17 +503,9 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) skb->dev = skb_dst(skb)->dev; err = dst_output(t->net, skb->sk, skb); - if (net_xmit_eval(err) == 0) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += pkt_len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } + if (net_xmit_eval(err) == 0) + err = pkt_len; + iptunnel_xmit_stats(dev, err); return 0; tx_err_link_failure: -- cgit v1.2.3-55-g7522 From 653cd284a8a857ddfcf24f5bc3bd204a229f6c9f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Aug 2018 21:46:16 +0300 Subject: net: sched: always disable bh when taking tcf_lock Recently, ops->init() and ops->dump() of all actions were modified to always obtain tcf_lock when accessing private action state. Actions that don't depend on tcf_lock for synchronization with their data path use non-bh locking API. However, tcf_lock is also used to protect rate estimator stats in softirq context by timer callback. Change ops->init() and ops->dump() of all actions to disable bh when using tcf_lock to prevent deadlock reported by following lockdep warning: [ 105.470398] ================================ [ 105.475014] WARNING: inconsistent lock state [ 105.479628] 4.18.0-rc8+ #664 Not tainted [ 105.483897] -------------------------------- [ 105.488511] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. [ 105.494871] swapper/16/0 [HC0[0]:SC1[1]:HE1:SE0] takes: [ 105.500449] 00000000f86c012e (&(&p->tcfa_lock)->rlock){+.?.}, at: est_fetch_counters+0x3c/0xa0 [ 105.509696] {SOFTIRQ-ON-W} state was registered at: [ 105.514925] _raw_spin_lock+0x2c/0x40 [ 105.519022] tcf_bpf_init+0x579/0x820 [act_bpf] [ 105.523990] tcf_action_init_1+0x4e4/0x660 [ 105.528518] tcf_action_init+0x1ce/0x2d0 [ 105.532880] tcf_exts_validate+0x1d8/0x200 [ 105.537416] fl_change+0x55a/0x268b [cls_flower] [ 105.542469] tc_new_tfilter+0x748/0xa20 [ 105.546738] rtnetlink_rcv_msg+0x56a/0x6d0 [ 105.551268] netlink_rcv_skb+0x18d/0x200 [ 105.555628] netlink_unicast+0x2d0/0x370 [ 105.559990] netlink_sendmsg+0x3b9/0x6a0 [ 105.564349] sock_sendmsg+0x6b/0x80 [ 105.568271] ___sys_sendmsg+0x4a1/0x520 [ 105.572547] __sys_sendmsg+0xd7/0x150 [ 105.576655] do_syscall_64+0x72/0x2c0 [ 105.580757] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 105.586243] irq event stamp: 489296 [ 105.590084] hardirqs last enabled at (489296): [] _raw_spin_unlock_irq+0x29/0x40 [ 105.599765] hardirqs last disabled at (489295): [] _raw_spin_lock_irq+0x15/0x50 [ 105.609277] softirqs last enabled at (489292): [] irq_enter+0x83/0xa0 [ 105.618001] softirqs last disabled at (489293): [] irq_exit+0x140/0x190 [ 105.626813] other info that might help us debug this: [ 105.633976] Possible unsafe locking scenario: [ 105.640526] CPU0 [ 105.643325] ---- [ 105.646125] lock(&(&p->tcfa_lock)->rlock); [ 105.650747] [ 105.653717] lock(&(&p->tcfa_lock)->rlock); [ 105.658514] *** DEADLOCK *** [ 105.665349] 1 lock held by swapper/16/0: [ 105.669629] #0: 00000000a640ad99 ((&est->timer)){+.-.}, at: call_timer_fn+0x10b/0x550 [ 105.678200] stack backtrace: [ 105.683194] CPU: 16 PID: 0 Comm: swapper/16 Not tainted 4.18.0-rc8+ #664 [ 105.690249] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 105.698626] Call Trace: [ 105.701421] [ 105.703791] dump_stack+0x92/0xeb [ 105.707461] print_usage_bug+0x336/0x34c [ 105.711744] mark_lock+0x7c9/0x980 [ 105.715500] ? print_shortest_lock_dependencies+0x2e0/0x2e0 [ 105.721424] ? check_usage_forwards+0x230/0x230 [ 105.726315] __lock_acquire+0x923/0x26f0 [ 105.730597] ? debug_show_all_locks+0x240/0x240 [ 105.735478] ? mark_lock+0x493/0x980 [ 105.739412] ? check_chain_key+0x140/0x1f0 [ 105.743861] ? __lock_acquire+0x836/0x26f0 [ 105.748323] ? lock_acquire+0x12e/0x290 [ 105.752516] lock_acquire+0x12e/0x290 [ 105.756539] ? est_fetch_counters+0x3c/0xa0 [ 105.761084] _raw_spin_lock+0x2c/0x40 [ 105.765099] ? est_fetch_counters+0x3c/0xa0 [ 105.769633] est_fetch_counters+0x3c/0xa0 [ 105.773995] est_timer+0x87/0x390 [ 105.777670] ? est_fetch_counters+0xa0/0xa0 [ 105.782210] ? lock_acquire+0x12e/0x290 [ 105.786410] call_timer_fn+0x161/0x550 [ 105.790512] ? est_fetch_counters+0xa0/0xa0 [ 105.795055] ? del_timer_sync+0xd0/0xd0 [ 105.799249] ? __lock_is_held+0x93/0x110 [ 105.803531] ? mark_held_locks+0x20/0xe0 [ 105.807813] ? _raw_spin_unlock_irq+0x29/0x40 [ 105.812525] ? est_fetch_counters+0xa0/0xa0 [ 105.817069] ? est_fetch_counters+0xa0/0xa0 [ 105.821610] run_timer_softirq+0x3c4/0x9f0 [ 105.826064] ? lock_acquire+0x12e/0x290 [ 105.830257] ? __bpf_trace_timer_class+0x10/0x10 [ 105.835237] ? __lock_is_held+0x25/0x110 [ 105.839517] __do_softirq+0x11d/0x7bf [ 105.843542] irq_exit+0x140/0x190 [ 105.847208] smp_apic_timer_interrupt+0xac/0x3b0 [ 105.852182] apic_timer_interrupt+0xf/0x20 [ 105.856628] [ 105.859081] RIP: 0010:cpuidle_enter_state+0xd8/0x4d0 [ 105.864395] Code: 46 ff 48 89 44 24 08 0f 1f 44 00 00 31 ff e8 cf ec 46 ff 80 7c 24 07 00 0f 85 1d 02 00 00 e8 9f 90 4b ff fb 66 0f 1f 44 00 00 <4c> 8b 6c 24 08 4d 29 fd 0f 80 36 03 00 00 4c 89 e8 48 ba cf f7 53 [ 105.884288] RSP: 0018:ffff8803ad94fd20 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 [ 105.892494] RAX: 0000000000000000 RBX: ffffe8fb300829c0 RCX: ffffffffb41e19e1 [ 105.899988] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8803ad9358ac [ 105.907503] RBP: ffffffffb6636300 R08: 0000000000000004 R09: 0000000000000000 [ 105.914997] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000004 [ 105.922487] R13: ffffffffb6636140 R14: ffffffffb66362d8 R15: 000000188d36091b [ 105.929988] ? trace_hardirqs_on_caller+0x141/0x2d0 [ 105.935232] do_idle+0x28e/0x320 [ 105.938817] ? arch_cpu_idle_exit+0x40/0x40 [ 105.943361] ? mark_lock+0x8c1/0x980 [ 105.947295] ? _raw_spin_unlock_irqrestore+0x32/0x60 [ 105.952619] cpu_startup_entry+0xc2/0xd0 [ 105.956900] ? cpu_in_idle+0x20/0x20 [ 105.960830] ? _raw_spin_unlock_irqrestore+0x32/0x60 [ 105.966146] ? trace_hardirqs_on_caller+0x141/0x2d0 [ 105.971391] start_secondary+0x2b5/0x360 [ 105.975669] ? set_cpu_sibling_map+0x1330/0x1330 [ 105.980654] secondary_startup_64+0xa5/0xb0 Taking tcf_lock in sample action with bh disabled causes lockdep to issue a warning regarding possible irq lock inversion dependency between tcf_lock, and psample_groups_lock that is taken when holding tcf_lock in sample init: [ 162.108959] Possible interrupt unsafe locking scenario: [ 162.116386] CPU0 CPU1 [ 162.121277] ---- ---- [ 162.126162] lock(psample_groups_lock); [ 162.130447] local_irq_disable(); [ 162.136772] lock(&(&p->tcfa_lock)->rlock); [ 162.143957] lock(psample_groups_lock); [ 162.150813] [ 162.153808] lock(&(&p->tcfa_lock)->rlock); [ 162.158608] *** DEADLOCK *** In order to prevent potential lock inversion dependency between tcf_lock and psample_groups_lock, extract call to psample_group_get() from tcf_lock protected section in sample action init function. Fixes: 4e232818bd32 ("net: sched: act_mirred: remove dependency on rtnl lock") Fixes: 764e9a24480f ("net: sched: act_vlan: remove dependency on rtnl lock") Fixes: 729e01260989 ("net: sched: act_tunnel_key: remove dependency on rtnl lock") Fixes: d77284956656 ("net: sched: act_sample: remove dependency on rtnl lock") Fixes: e8917f437006 ("net: sched: act_gact: remove dependency on rtnl lock") Fixes: b6a2b971c0b0 ("net: sched: act_csum: remove dependency on rtnl lock") Fixes: 2142236b4584 ("net: sched: act_bpf: remove dependency on rtnl lock") Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 10 +++++----- net/sched/act_csum.c | 10 +++++----- net/sched/act_gact.c | 10 +++++----- net/sched/act_mirred.c | 16 ++++++++-------- net/sched/act_sample.c | 25 ++++++++++++++----------- net/sched/act_tunnel_key.c | 10 +++++----- net/sched/act_vlan.c | 10 +++++----- 7 files changed, 47 insertions(+), 44 deletions(-) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9b30e62805c7..d30b23e42436 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -147,7 +147,7 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, struct tcf_t tm; int ret; - spin_lock(&prog->tcf_lock); + spin_lock_bh(&prog->tcf_lock); opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -164,11 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, TCA_ACT_BPF_PAD)) goto nla_put_failure; - spin_unlock(&prog->tcf_lock); + spin_unlock_bh(&prog->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&prog->tcf_lock); + spin_unlock_bh(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } @@ -340,7 +340,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog = to_bpf(*act); - spin_lock(&prog->tcf_lock); + spin_lock_bh(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); @@ -352,7 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->tcf_action = parm->action; rcu_assign_pointer(prog->filter, cfg.filter); - spin_unlock(&prog->tcf_lock); + spin_unlock_bh(&prog->tcf_lock); if (res == ACT_P_CREATED) { tcf_idr_insert(tn, *act); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5596fae4e478..e698d3fe2080 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -96,11 +96,11 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } params_new->update_flags = parm->update_flags; - spin_lock(&p->tcf_lock); + spin_lock_bh(&p->tcf_lock); p->tcf_action = parm->action; rcu_swap_protected(p->params, params_new, lockdep_is_held(&p->tcf_lock)); - spin_unlock(&p->tcf_lock); + spin_unlock_bh(&p->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); @@ -604,7 +604,7 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, }; struct tcf_t t; - spin_lock(&p->tcf_lock); + spin_lock_bh(&p->tcf_lock); params = rcu_dereference_protected(p->params, lockdep_is_held(&p->tcf_lock)); opt.action = p->tcf_action; @@ -616,12 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; - spin_unlock(&p->tcf_lock); + spin_unlock_bh(&p->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&p->tcf_lock); + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 52a3e474d822..6a3f25a8ffb3 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -113,7 +113,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(*a); - spin_lock(&gact->tcf_lock); + spin_lock_bh(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { @@ -126,7 +126,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock(&gact->tcf_lock); + spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -183,7 +183,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock(&gact->tcf_lock); + spin_lock_bh(&gact->tcf_lock); opt.action = gact->tcf_action; if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -202,12 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; - spin_unlock(&gact->tcf_lock); + spin_unlock_bh(&gact->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&gact->tcf_lock); + spin_unlock_bh(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8ec216001077..38fd20f10f67 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -159,14 +159,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } m = to_mirred(*a); - spin_lock(&m->tcf_lock); + spin_lock_bh(&m->tcf_lock); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (parm->ifindex) { dev = dev_get_by_index(net, parm->ifindex); if (!dev) { - spin_unlock(&m->tcf_lock); + spin_unlock_bh(&m->tcf_lock); tcf_idr_release(*a, bind); return -ENODEV; } @@ -177,7 +177,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } - spin_unlock(&m->tcf_lock); + spin_unlock_bh(&m->tcf_lock); if (ret == ACT_P_CREATED) { spin_lock(&mirred_list_lock); @@ -305,7 +305,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct net_device *dev; struct tcf_t t; - spin_lock(&m->tcf_lock); + spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; opt.eaction = m->tcfm_eaction; dev = tcf_mirred_dev_dereference(m); @@ -318,12 +318,12 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; - spin_unlock(&m->tcf_lock); + spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&m->tcf_lock); + spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } @@ -356,7 +356,7 @@ static int mirred_device_event(struct notifier_block *unused, if (event == NETDEV_UNREGISTER) { spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { - spin_lock(&m->tcf_lock); + spin_lock_bh(&m->tcf_lock); if (tcf_mirred_dev_dereference(m) == dev) { dev_put(dev); /* Note : no rcu grace period necessary, as @@ -364,7 +364,7 @@ static int mirred_device_event(struct notifier_block *unused, */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } - spin_unlock(&m->tcf_lock); + spin_unlock_bh(&m->tcf_lock); } spin_unlock(&mirred_list_lock); } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 81071afe1b43..207b4132d1b0 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -44,6 +44,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; struct tc_sample *parm; + u32 psample_group_num; struct tcf_sample *s; bool exists = false; int ret, err; @@ -78,25 +79,27 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, tcf_idr_release(*a, bind); return -EEXIST; } - s = to_sample(*a); - spin_lock(&s->tcf_lock); - s->tcf_action = parm->action; - s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); - s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); - psample_group = psample_group_get(net, s->psample_group_num); + psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); + psample_group = psample_group_get(net, psample_group_num); if (!psample_group) { - spin_unlock(&s->tcf_lock); tcf_idr_release(*a, bind); return -ENOMEM; } + + s = to_sample(*a); + + spin_lock_bh(&s->tcf_lock); + s->tcf_action = parm->action; + s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); + s->psample_group_num = psample_group_num; RCU_INIT_POINTER(s->psample_group, psample_group); if (tb[TCA_SAMPLE_TRUNC_SIZE]) { s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } - spin_unlock(&s->tcf_lock); + spin_unlock_bh(&s->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -183,7 +186,7 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock(&s->tcf_lock); + spin_lock_bh(&s->tcf_lock); opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -201,12 +204,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; - spin_unlock(&s->tcf_lock); + spin_unlock_bh(&s->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&s->tcf_lock); + spin_unlock_bh(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ba2ae9f75ef5..8f09cf08d8fe 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -354,11 +354,11 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - spin_lock(&t->tcf_lock); + spin_lock_bh(&t->tcf_lock); t->tcf_action = parm->action; rcu_swap_protected(t->params, params_new, lockdep_is_held(&t->tcf_lock)); - spin_unlock(&t->tcf_lock); + spin_unlock_bh(&t->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); @@ -485,7 +485,7 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t tm; - spin_lock(&t->tcf_lock); + spin_lock_bh(&t->tcf_lock); params = rcu_dereference_protected(t->params, lockdep_is_held(&t->tcf_lock)); opt.action = t->tcf_action; @@ -520,12 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; - spin_unlock(&t->tcf_lock); + spin_unlock_bh(&t->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&t->tcf_lock); + spin_unlock_bh(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index d1f5028384c9..209e70ad2c09 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -213,10 +213,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; - spin_lock(&v->tcf_lock); + spin_lock_bh(&v->tcf_lock); v->tcf_action = parm->action; rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); - spin_unlock(&v->tcf_lock); + spin_unlock_bh(&v->tcf_lock); if (p) kfree_rcu(p, rcu); @@ -249,7 +249,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock(&v->tcf_lock); + spin_lock_bh(&v->tcf_lock); opt.action = v->tcf_action; p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); opt.v_action = p->tcfv_action; @@ -268,12 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; - spin_unlock(&v->tcf_lock); + spin_unlock_bh(&v->tcf_lock); return skb->len; nla_put_failure: - spin_unlock(&v->tcf_lock); + spin_unlock_bh(&v->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3-55-g7522 From 0b3e776e2e809b808875b10e35d1047c5c548da4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2018 19:49:49 +0200 Subject: net: lan743x_ptp: convert to ktime_get_clocktai_ts64 timekeeping_clocktai64() has been renamed to ktime_get_clocktai_ts64() for consistency with the other ktime_get_* access functions. Rename the new caller that has come up as well. Question: this is the only ptp driver that sets the hardware time to the current system time in TAI. Why does it do that? Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan743x_ptp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index 64dba96edc79..ccdf9123f26f 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -831,8 +831,7 @@ static void lan743x_ptp_sync_to_system_clock(struct lan743x_adapter *adapter) { struct timespec64 ts; - memset(&ts, 0, sizeof(ts)); - timekeeping_clocktai64(&ts); + ktime_get_clocktai_ts64(&ts); lan743x_ptp_clock_set(adapter, ts.tv_sec, ts.tv_nsec, 0); } -- cgit v1.2.3-55-g7522 From 7bb05b85bc2d1a1b647b91424b2ed4a18e6ecd81 Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Fri, 17 Aug 2018 13:07:35 +0800 Subject: r8169: don't use MSI-X on RTL8106e Found the ethernet network on ASUS X441UAR doesn't come back on resume from suspend when using MSI-X. The chip is RTL8106e - version 39. [ 21.848357] libphy: r8169: probed [ 21.848473] r8169 0000:02:00.0 eth0: RTL8106e, 0c:9d:92:32:67:b4, XID 44900000, IRQ 127 [ 22.518860] r8169 0000:02:00.0 enp2s0: renamed from eth0 [ 29.458041] Generic PHY r8169-200:00: attached PHY driver [Generic PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE) [ 63.227398] r8169 0000:02:00.0 enp2s0: Link is Up - 100Mbps/Full - flow control off [ 124.514648] Generic PHY r8169-200:00: attached PHY driver [Generic PHY] (mii_bus:phy_addr=r8169-200:00, irq=IGNORE) Here is the ethernet controller in detail: 02:00.0 Ethernet controller [0200]: Realtek Semiconductor Co., Ltd. RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller [10ec:8136] (rev 07) Subsystem: ASUSTeK Computer Inc. RTL810xE PCI Express Fast Ethernet controller [1043:200f] Flags: bus master, fast devsel, latency 0, IRQ 16 I/O ports at e000 [size=256] Memory at ef100000 (64-bit, non-prefetchable) [size=4K] Memory at e0000000 (64-bit, prefetchable) [size=16K] Capabilities: Kernel driver in use: r8169 Kernel modules: r8169 Falling back to MSI fixes the issue. Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling") Signed-off-by: Jian-Hong Pan Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 0d9c3831838f..0efa977c422d 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -7071,17 +7071,20 @@ static int rtl_alloc_irq(struct rtl8169_private *tp) { unsigned int flags; - if (tp->mac_version <= RTL_GIGA_MAC_VER_06) { + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06: RTL_W8(tp, Cfg9346, Cfg9346_Unlock); RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable); RTL_W8(tp, Cfg9346, Cfg9346_Lock); flags = PCI_IRQ_LEGACY; - } else if (tp->mac_version == RTL_GIGA_MAC_VER_40) { + break; + case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_40: /* This version was reported to have issues with resume * from suspend when using MSI-X */ flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI; - } else { + break; + default: flags = PCI_IRQ_ALL_TYPES; } -- cgit v1.2.3-55-g7522 From cd1aa9c2c665cafbd05b83507d3f1096f3912aa4 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Sun, 19 Aug 2018 15:05:04 +0800 Subject: ip_vti: fix a null pointer deferrence when create vti fallback tunnel After set fb_tunnels_only_for_init_net to 1, the itn->fb_tunnel_dev will be NULL and will cause following crash: [ 2742.849298] BUG: unable to handle kernel NULL pointer dereference at 0000000000000941 [ 2742.851380] PGD 800000042c21a067 P4D 800000042c21a067 PUD 42aaed067 PMD 0 [ 2742.852818] Oops: 0002 [#1] SMP PTI [ 2742.853570] CPU: 7 PID: 2484 Comm: unshare Kdump: loaded Not tainted 4.18.0-rc8+ #2 [ 2742.855163] Hardware name: Fedora Project OpenStack Nova, BIOS seabios-1.7.5-11.el7 04/01/2014 [ 2742.856970] RIP: 0010:vti_init_net+0x3a/0x50 [ip_vti] [ 2742.858034] Code: 90 83 c0 48 c7 c2 20 a1 83 c0 48 89 fb e8 6e 3b f6 ff 85 c0 75 22 8b 0d f4 19 00 00 48 8b 93 00 14 00 00 48 8b 14 ca 48 8b 12 82 41 09 00 00 04 c6 82 38 09 00 00 45 5b c3 66 0f 1f 44 00 00 [ 2742.861940] RSP: 0018:ffff9be28207fde0 EFLAGS: 00010246 [ 2742.863044] RAX: 0000000000000000 RBX: ffff8a71ebed4980 RCX: 0000000000000013 [ 2742.864540] RDX: 0000000000000000 RSI: 0000000000000013 RDI: ffff8a71ebed4980 [ 2742.866020] RBP: ffff8a71ea717000 R08: ffffffffc083903c R09: ffff8a71ea717000 [ 2742.867505] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a71ebed4980 [ 2742.868987] R13: 0000000000000013 R14: ffff8a71ea5b49c0 R15: 0000000000000000 [ 2742.870473] FS: 00007f02266c9740(0000) GS:ffff8a71ffdc0000(0000) knlGS:0000000000000000 [ 2742.872143] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2742.873340] CR2: 0000000000000941 CR3: 000000042bc20006 CR4: 00000000001606e0 [ 2742.874821] Call Trace: [ 2742.875358] ops_init+0x38/0xf0 [ 2742.876078] setup_net+0xd9/0x1f0 [ 2742.876789] copy_net_ns+0xb7/0x130 [ 2742.877538] create_new_namespaces+0x11a/0x1d0 [ 2742.878525] unshare_nsproxy_namespaces+0x55/0xa0 [ 2742.879526] ksys_unshare+0x1a7/0x330 [ 2742.880313] __x64_sys_unshare+0xe/0x20 [ 2742.881131] do_syscall_64+0x5b/0x180 [ 2742.881933] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reproduce: echo 1 > /proc/sys/net/core/fb_tunnels_only_for_init_net modprobe ip_vti unshare -n Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces") Cc: Eric Dumazet Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 3f091ccad9af..f38cb21d773d 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -438,7 +438,8 @@ static int __net_init vti_init_net(struct net *net) if (err) return err; itn = net_generic(net, vti_net_id); - vti_fb_tunnel_init(itn->fb_tunnel_dev); + if (itn->fb_tunnel_dev) + vti_fb_tunnel_init(itn->fb_tunnel_dev); return 0; } -- cgit v1.2.3-55-g7522 From e2948e5af8eeb6c945000772b7613b0323a0a203 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Sun, 19 Aug 2018 15:05:05 +0800 Subject: ip6_vti: fix creating fallback tunnel device for vti6 When set fb_tunnels_only_for_init_net to 1, don't create fallback tunnel device for vti6 when a new namespace is created. Tested: [root@builder2 ~]# modprobe ip6_tunnel [root@builder2 ~]# modprobe ip6_vti [root@builder2 ~]# echo 1 > /proc/sys/net/core/fb_tunnels_only_for_init_net [root@builder2 ~]# unshare -n [root@builder2 ~]# ip link 1: lo: mtu 65536 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 65d4a8044cb0..38dec9da90d3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1106,6 +1106,8 @@ static int __net_init vti6_init_net(struct net *net) ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", NET_NAME_UNKNOWN, vti6_dev_setup); -- cgit v1.2.3-55-g7522