From 0ab03c2b1478f2438d2c80204f7fef65b1bca9cf Mon Sep 17 00:00:00 2001
From: Jan Engelhardt
Date: Fri, 7 Jan 2011 03:15:05 +0000
Subject: netlink: test for all flags of the NLM_F_DUMP composite

NLM_F_DUMP is composed of two bits, NLM_F_ROOT | NLM_F_MATCH, so
"if (x & NLM_F_DUMP)" tests for _either_ of the bits being set.
Because NLM_F_MATCH's value overlaps with NLM_F_EXCL, non-dump
requests with NLM_F_EXCL set are mistaken for dump requests.

Substitute the condition to test for _all_ bits being set.

Signed-off-by: Jan Engelhardt
Acked-by: Pablo Neira Ayuso
Signed-off-by: David S. Miller
---
 net/netfilter/nf_conntrack_netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 0cdba50c0d69..746140264b2d 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -928,7 +928,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 	u16 zone;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP)
+	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP)
 		return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table,
 					  ctnetlink_done);
 
@@ -1790,7 +1790,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 	u16 zone;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
 		return netlink_dump_start(ctnl, skb, nlh,
 					  ctnetlink_exp_dump_table,
 					  ctnetlink_exp_done);
--
cgit v1.2.3-55-g7522
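A minimal user-space sketch (not part of the patch) of why the plain "&" test misfires: the flag values come from <linux/netlink.h>, and a create request carrying NLM_F_EXCL already satisfies one of the two NLM_F_DUMP bits.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>	/* NLM_F_REQUEST, NLM_F_CREATE, NLM_F_EXCL, NLM_F_DUMP */

int main(void)
{
	/* A typical "create, fail if it already exists" request -- not a dump. */
	unsigned int flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;

	/* NLM_F_EXCL shares its value with NLM_F_MATCH, one half of NLM_F_DUMP,
	 * so the single-bit test fires even though this is not a dump request. */
	printf("x & NLM_F_DUMP                 -> %s\n",
	       (flags & NLM_F_DUMP) ? "treated as dump (wrong)" : "not a dump");

	/* Requiring both bits (NLM_F_ROOT | NLM_F_MATCH) gives the right answer. */
	printf("(x & NLM_F_DUMP) == NLM_F_DUMP -> %s\n",
	       ((flags & NLM_F_DUMP) == NLM_F_DUMP) ? "dump" : "not a dump (correct)");

	return 0;
}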
From 83723d60717f8da0f53f91cf42a845ed56c09662 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 10 Jan 2011 20:11:38 +0100
Subject: netfilter: x_tables: dont block BH while reading counters

Using "iptables -L" with a lot of rules incurs too much BH latency.
Jesper mentioned ~6 ms and worried about frame drops.

Switch to a per-cpu seqlock scheme, so that taking a snapshot of the
counters doesn't need to block BH (neither on this cpu nor on the
other cpus).

This adds two increments of the seqlock sequence per ipt_do_table()
call, a reasonable cost for allowing "iptables -L" to run without
blocking BH processing.

Reported-by: Jesper Dangaard Brouer
Signed-off-by: Eric Dumazet
CC: Patrick McHardy
Acked-by: Stephen Hemminger
Acked-by: Jesper Dangaard Brouer
Signed-off-by: Pablo Neira Ayuso
---
 include/linux/netfilter/x_tables.h | 10 ++++-----
 net/ipv4/netfilter/arp_tables.c    | 45 ++++++++++++--------------------------
 net/ipv4/netfilter/ip_tables.c     | 45 ++++++++++++--------------------------
 net/ipv6/netfilter/ip6_tables.c    | 45 ++++++++++++--------------------------
 net/netfilter/x_tables.c           |  3 ++-
 5 files changed, 49 insertions(+), 99 deletions(-)

(limited to 'net/netfilter')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec051440..6712e713b299 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -472,7 +472,7 @@ extern void xt_free_table_info(struct xt_table_info *info);
  * necessary for reading the counters.
  */
 struct xt_info_lock {
-	spinlock_t lock;
+	seqlock_t lock;
 	unsigned char readers;
 };
 DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
@@ -497,7 +497,7 @@ static inline void xt_info_rdlock_bh(void)
 	local_bh_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(!lock->readers++))
-		spin_lock(&lock->lock);
+		write_seqlock(&lock->lock);
 }
 
 static inline void xt_info_rdunlock_bh(void)
@@ -505,7 +505,7 @@ static inline void xt_info_rdunlock_bh(void)
 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
 
 	if (likely(!--lock->readers))
-		spin_unlock(&lock->lock);
+		write_sequnlock(&lock->lock);
 	local_bh_enable();
 }
 
@@ -516,12 +516,12 @@ static inline void xt_info_rdunlock_bh(void)
  */
 static inline void xt_info_wrlock(unsigned int cpu)
 {
-	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+	write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 static inline void xt_info_wrunlock(unsigned int cpu)
 {
-	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+	write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 /*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340a28d5..e855fffaed95 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -710,42 +710,25 @@ static void get_counters(const struct xt_table_info *t,
 	struct arpt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +742,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	 * about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1007,7 +990,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d633b3b6..652efea013dc 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -884,42 +884,25 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU.
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +915,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 455582384ece..7d227c644f72 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -897,42 +897,25 @@ get_counters(const struct xt_table_info *t,
 	struct ip6t_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -945,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1216,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 80463507420e..c94237631077 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1325,7 +1325,8 @@ static int __init xt_init(void)
 
 	for_each_possible_cpu(i) {
 		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
-		spin_lock_init(&lock->lock);
+
+		seqlock_init(&lock->lock);
 		lock->readers = 0;
 	}
--
cgit v1.2.3-55-g7522
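The read/retry loop added to get_counters() above is the standard seqlock idiom. The following stand-alone kernel-style sketch (illustrative names only, not part of the patch) shows both halves: the packet path bumping a per-cpu counter pair under write_seqlock(), and a reader taking a consistent 64-bit snapshot of another cpu's counters without disabling BH anywhere.

#include <linux/percpu.h>
#include <linux/seqlock.h>
#include <linux/types.h>

struct demo_counters {
	u64 bcnt;	/* bytes */
	u64 pcnt;	/* packets */
};

/* Each seqlock must still be set up with seqlock_init() at init time (not shown). */
static DEFINE_PER_CPU(seqlock_t, demo_lock);
static DEFINE_PER_CPU(struct demo_counters, demo_cnt);

/* Hot path (e.g. per packet): runs on the local cpu with BH already disabled. */
static void demo_account(unsigned int bytes)
{
	seqlock_t *lock = this_cpu_ptr(&demo_lock);
	struct demo_counters *c = this_cpu_ptr(&demo_cnt);

	write_seqlock(lock);
	c->bcnt += bytes;
	c->pcnt++;
	write_sequnlock(lock);
}

/* Slow path (e.g. "iptables -L"): snapshot another cpu's counters, retrying
 * until the sequence is stable, so the 64-bit reads are never torn. */
static void demo_snapshot(unsigned int cpu, u64 *bcnt, u64 *pcnt)
{
	seqlock_t *lock = &per_cpu(demo_lock, cpu);
	struct demo_counters *c = &per_cpu(demo_cnt, cpu);
	unsigned int start;

	do {
		start = read_seqbegin(lock);
		*bcnt = c->bcnt;
		*pcnt = c->pcnt;
	} while (read_seqretry(lock, start));
}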
From 13ee6ac579574a2a95e982b19920fd2495dce8cd Mon Sep 17 00:00:00 2001
From: Stephen Hemminger
Date: Tue, 11 Jan 2011 23:54:42 +0100
Subject: netfilter: fix race in conntrack between dump_table and destroy

The netlink interface to dump the connection tracking table has a race
when entries are deleted at the same time. A customer reported a crash
whose backtrace showed that ctnetlink_dump_table was running while a
conntrack entry was being destroyed
(see https://bugzilla.vyatta.com/show_bug.cgi?id=6402).

According to the RCU documentation, when using hlist_nulls the reader
must handle the case of seeing a deleted entry and must not proceed
further down the linked list. The old code would continue anyway, which
caused the scan to walk into the free list.

This patch uses locking (rather than RCU) for this operation, which is
guaranteed safe and no longer requires taking a reference on each entry
during the dump.

Signed-off-by: Stephen Hemminger
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_conntrack_netlink.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 746140264b2d..5cb8d3027b18 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -645,25 +645,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 
-	rcu_read_lock();
+	spin_lock_bh(&nf_conntrack_lock);
 	last = (struct nf_conn *)cb->args[1];
 	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
-		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[cb->args[0]],
+		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
					   hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 				continue;
 			ct = nf_ct_tuplehash_to_ctrack(h);
-			if (!atomic_inc_not_zero(&ct->ct_general.use))
-				continue;
 			/* Dump entries of a given L3 protocol number.
 			 * If it is not specified, ie. l3proto == 0,
 			 * then dump everything. */
 			if (l3proto && nf_ct_l3num(ct) != l3proto)
-				goto releasect;
+				continue;
 			if (cb->args[1]) {
 				if (ct != last)
-					goto releasect;
+					continue;
 				cb->args[1] = 0;
 			}
 			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
@@ -681,8 +679,6 @@ restart:
 				if (acct)
 					memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]));
 			}
-releasect:
-			nf_ct_put(ct);
 		}
 		if (cb->args[1]) {
 			cb->args[1] = 0;
@@ -690,7 +686,7 @@ releasect:
 		}
 	}
 out:
-	rcu_read_unlock();
+	spin_unlock_bh(&nf_conntrack_lock);
 
 	if (last)
 		nf_ct_put(last);
--
cgit v1.2.3-55-g7522

From f31e8d4982653b39fe312f9938be0f49dd9ab5fa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Thu, 13 Jan 2011 14:19:55 +0100
Subject: netfilter: ctnetlink: fix loop in ctnetlink_get_conntrack()

This patch fixes a loop in ctnetlink_get_conntrack() that can be
triggered if you use the same socket to receive events and to perform
a GET operation. Under heavy load, netlink_unicast() may return
-EAGAIN; that error code is reserved in nfnetlink for module
load-on-demand. Instead, we return -ENOBUFS, which is the appropriate
error code to propagate to user-space.

Reported-by: Holger Eitzenberger
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_conntrack_netlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5cb8d3027b18..2b7eef37875c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -972,7 +972,8 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 free:
 	kfree_skb(skb2);
 out:
-	return err;
+	/* this avoids a loop in nfnetlink. */
+	return err == -EAGAIN ? -ENOBUFS : err;
 }
 
 #ifdef CONFIG_NF_NAT_NEEDED
--
cgit v1.2.3-55-g7522
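For background on the dump_table race fixed two patches above: had the dump stayed on RCU, the reader would have needed the hlist_nulls discipline that regular conntrack lookups follow, i.e. check the nulls value at the end of the chain and restart if the entry it traversed was freed and recycled onto another bucket. A hedged sketch of that discipline (illustrative structure and function names; the caller is assumed to hold rcu_read_lock()):

#include <linux/rculist_nulls.h>

struct demo_entry {
	struct hlist_nulls_node hnode;
	unsigned int key;
};

/* RCU lookup over an hlist_nulls hash chain: if the walk ends on a nulls
 * marker that does not name the bucket we started from, an entry we
 * traversed was freed and reused on another chain, so restart the walk. */
static struct demo_entry *demo_lookup(struct hlist_nulls_head *hash,
				      unsigned int bucket, unsigned int key)
{
	struct demo_entry *e;
	struct hlist_nulls_node *n;

begin:
	hlist_nulls_for_each_entry_rcu(e, n, &hash[bucket], hnode) {
		if (e->key == key)
			return e;
	}
	if (get_nulls_value(n) != bucket)
		goto begin;

	return NULL;
}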