From efcdbf24fd5daa88060869e51ed49f68b7ac8708 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Mon, 30 Jan 2012 14:16:06 -0800 Subject: net: Disambiguate kernel message Some of our machines were reporting: TCP: too many of orphaned sockets even when the number of orphaned sockets was well below the limit. We print a different message depending on whether we're out of TCP memory or there are too many orphaned sockets. Also move the check out of line and cleanup the messages that were printed. Signed-off-by: Arun Sharma Suggested-by: Mohan Srinivasan Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: David Miller Cc: Glauber Costa Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 19 +++++++++++++++---- net/ipv4/tcp_timer.c | 5 +---- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 06373b4a449a..a34f5cfdd44c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1876,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how) } EXPORT_SYMBOL(tcp_shutdown); +bool tcp_check_oom(struct sock *sk, int shift) +{ + bool too_many_orphans, out_of_socket_memory; + + too_many_orphans = tcp_too_many_orphans(sk, shift); + out_of_socket_memory = tcp_out_of_memory(sk); + + if (too_many_orphans && net_ratelimit()) + pr_info("TCP: too many orphaned sockets\n"); + if (out_of_socket_memory && net_ratelimit()) + pr_info("TCP: out of memory -- consider tuning tcp_mem\n"); + return too_many_orphans || out_of_socket_memory; +} + void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -2015,10 +2029,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, 0)) { - if (net_ratelimit()) - printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a516d1e399df..cd2e0723266d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (sk->sk_err_soft) shift++; - if (tcp_too_many_orphans(sk, shift)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - + if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || -- cgit v1.2.3-55-g7522 From 786f528119722f564a22ad953411374e06116333 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 1 Feb 2012 09:32:25 +0000 Subject: ethtool: Null-terminate filename passed to ethtool_ops::flash_device The parameters for ETHTOOL_FLASHDEV include a filename, which ought to be null-terminated. Currently the only driver that implements ethtool_ops::flash_device attempts to add a null terminator if necessary, but does it wrongly. Do it in the ethtool core instead. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_ethtool.c | 6 +----- net/core/ethtool.c | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 6db6b6ae5e9b..802e5ddef8a8 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -716,12 +716,8 @@ static int be_do_flash(struct net_device *netdev, struct ethtool_flash *efl) { struct be_adapter *adapter = netdev_priv(netdev); - char file_name[ETHTOOL_FLASH_MAX_FILENAME]; - file_name[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; - strcpy(file_name, efl->data); - - return be_load_fw(adapter, file_name); + return be_load_fw(adapter, efl->data); } static int diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 369b41894527..3f79db1b612a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1190,6 +1190,8 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev, if (!dev->ethtool_ops->flash_device) return -EOPNOTSUPP; + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + return dev->ethtool_ops->flash_device(dev, &efl); } -- cgit v1.2.3-55-g7522 From 07ae2dfcf4f7143ce191c6436da1c33f179af0d6 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 1 Feb 2012 18:48:09 +0200 Subject: mac80211: timeout a single frame in the rx reorder buffer The current code checks for stored_mpdu_num > 1, causing the reorder_timer to be triggered indefinitely, but the frame is never timed-out (until the next packet is received) Signed-off-by: Eliad Peller Cc: Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 751409120769..5a5e504a8ffb 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -611,7 +611,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { + tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any * frames in the reorder buffer have timed out. -- cgit v1.2.3-55-g7522 From c43b874d5d714f271b80d4c3f49e05d0cbf51ed2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 2 Feb 2012 00:07:00 +0000 Subject: tcp: properly initialize tcp memory limits Commit 4acb4190 tries to fix the using uninitialized value introduced by commit 3dc43e3, but it would make the per-socket memory limits too small. This patch fixes this and also remove the redundant codes introduced in 4acb4190. Signed-off-by: Jason Wang Acked-by: Glauber Costa Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 6 ------ net/ipv4/tcp.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4cb9cd2f2c39..7a7724da9bff 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; - unsigned long limit; table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -815,11 +814,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; tcp_init_mem(net); - limit = nr_free_buffer_pages() / 8; - limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a34f5cfdd44c..37755ccc0e96 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3229,7 +3229,6 @@ __setup("thash_entries=", set_thash_entries); void tcp_init_mem(struct net *net) { - /* Set per-socket limits to no more than 1/128 the pressure threshold */ unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; @@ -3298,7 +3297,8 @@ void __init tcp_init(void) sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(&init_net); - limit = nr_free_buffer_pages() / 8; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); -- cgit v1.2.3-55-g7522 From b01377a4200d0dfc7b04a8daabb4739727353703 Mon Sep 17 00:00:00 2001 From: sjur.brandeland@stericsson.com Date: Thu, 2 Feb 2012 01:21:02 +0000 Subject: caif: Bugfix list_del_rcu race in cfmuxl_ctrlcmd. Always use cfmuxl_remove_uplayer when removing a up-layer. cfmuxl_ctrlcmd() can be called independently and in parallel with cfmuxl_remove_uplayer(). The race between them could cause list_del_rcu to be called on a node which has been already taken out from the list. That lead to a (rare) crash on accessing poisoned node->prev inside list_del_rcu. This fix ensures that deletion are done holding the same lock. Reported-by: Dmitry Tarnyagin Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/cfmuxl.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index b36f24a4c8e7..94b08612a4d8 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -248,7 +248,6 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; - int idx; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { @@ -257,14 +256,9 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && - layer->id != 0) { - - idx = layer->id % UP_CACHE_SIZE; - spin_lock_bh(&muxl->receive_lock); - RCU_INIT_POINTER(muxl->up_cache[idx], NULL); - list_del_rcu(&layer->node); - spin_unlock_bh(&muxl->receive_lock); - } + layer->id != 0) + cfmuxl_remove_uplayer(layr, layer->id); + /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } -- cgit v1.2.3-55-g7522 From ba7605745d5c99f0e71b3ec6c7cb5ed6afe540ad Mon Sep 17 00:00:00 2001 From: Dmitry Tarnyagin Date: Thu, 2 Feb 2012 01:21:03 +0000 Subject: caif: Bugfix double kfree_skb upon xmit failure SKB is freed twice upon send error. The Network stack consumes SKB even when it returns error code. Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a98628086452..a97d97a3a512 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -539,8 +539,10 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); - if (cf_sk->layer.dn == NULL) + if (cf_sk->layer.dn == NULL) { + kfree_skb(skb); return -EINVAL; + } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } @@ -683,10 +685,10 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); - if (err < 0) { - kfree_skb(skb); + if (err < 0) + /* skb is already freed */ goto pipe_err; - } + sent += size; } -- cgit v1.2.3-55-g7522 From e0aac52e17a3db68fe2ceae281780a70fc69957f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 27 Jan 2012 10:45:27 +0900 Subject: ipvs: fix matching of fwmark templates during scheduling Commit f11017ec2d1859c661f4e2b12c4a8d250e1f47cf (2.6.37) moved the fwmark variable in subcontext that is invalidated before reaching the ip_vs_ct_in_get call. As vaddr is provided as pointer in the param structure make sure the fwmark variable is in same context. As the fwmark templates can not be matched, more and more template connections are created and the controlled connections can not go to single real server. Signed-off-by: Julian Anastasov Cc: stable@vger.kernel.org Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 611c3359b94d..2555816e7788 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -232,6 +232,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ @@ -267,7 +268,6 @@ ip_vs_sched_persist(struct ip_vs_service *svc, { int protocol = iph.protocol; const union nf_inet_addr *vaddr = &iph.daddr; - const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; __be16 vport = 0; if (dst_port == svc->port) { -- cgit v1.2.3-55-g7522 From 5962b35c1de3254a2f03b95efd3b7854b874d7b7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 3 Feb 2012 05:18:43 +0000 Subject: netprio_cgroup: Fix obo in get_prioidx It was recently pointed out to me that the get_prioidx function sets a bit in the prioidx map prior to checking to see if the index being set is out of bounds. This patch corrects that, avoiding the possiblity of us writing beyond the end of the array Signed-off-by: Neil Horman Reported-by: Stanislaw Gruszka CC: Stanislaw Gruszka CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd4826b75..9ae183a9a381 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -58,11 +58,12 @@ static int get_prioidx(u32 *prio) spin_lock_irqsave(&prioidx_map_lock, flags); prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { + spin_unlock_irqrestore(&prioidx_map_lock, flags); + return -ENOSPC; + } set_bit(prioidx, prioidx_map); spin_unlock_irqrestore(&prioidx_map_lock, flags); - if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) - return -ENOSPC; - atomic_set(&max_prioidx, prioidx); *prio = prioidx; return 0; -- cgit v1.2.3-55-g7522 From e2446eaab5585555a38ea0df4e01ff313dbb4ac9 Mon Sep 17 00:00:00 2001 From: Shawn Lu Date: Sat, 4 Feb 2012 12:38:09 +0000 Subject: tcp_v4_send_reset: binding oif to iif in no sock case Binding RST packet outgoing interface to incoming interface for tcp v4 when there is no socket associate with it. when sk is not NULL, using sk->sk_bound_dev_if instead. (suggested by Eric Dumazet). This has few benefits: 1. tcp_v6_send_reset already did that. 2. This helps tcp connect with SO_BINDTODEVICE set. When connection is lost, we still able to sending out RST using same interface. 3. we are sending reply, it is most likely to be succeed if iif is used Signed-off-by: Shawn Lu Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 337ba4cca052..94d683a61cba 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -651,6 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. + * routing might fail in this case. using iif for oif to + * make sure we can deliver it + */ + arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; -- cgit v1.2.3-55-g7522 From 6d25886ee2fbc05a7bf4dae5f5ae345cb73df2fd Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Tue, 7 Feb 2012 07:39:11 +0000 Subject: net: Fix build regression when INET_UDP_DIAG=y and IPV6=m Tested-by: Anisse Astier Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index aa2a2c79776f..d183262943d9 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -409,7 +409,7 @@ config INET_TCP_DIAG config INET_UDP_DIAG tristate "UDP: socket monitoring interface" - depends on INET_DIAG + depends on INET_DIAG && (IPV6 || IPV6=n) default n ---help--- Support for UDP socket monitoring interface used by the ss tool. -- cgit v1.2.3-55-g7522 From 5ca3b72c5da47d95b83857b768def6172fbc080a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Feb 2012 08:51:50 +0000 Subject: gro: more generic L2 header check Shlomo Pongratz reported GRO L2 header check was suited for Ethernet only, and failed on IB/ipoib traffic. He provided a patch faking a zeroed header to let GRO aggregates frames. Roland Dreier, Herbert Xu, and others suggested we change GRO L2 header check to be more generic, ie not assuming L2 header is 14 bytes, but taking into account hard_header_len. __napi_gro_receive() has special handling for the common case (Ethernet) to avoid a memcmp() call and use an inline optimized function instead. Signed-off-by: Eric Dumazet Reported-by: Shlomo Pongratz Cc: Roland Dreier Cc: Or Gerlitz Cc: Herbert Xu Tested-by: Sean Hefty Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..6ca32f6b3105 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3500,14 +3500,20 @@ static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.3-55-g7522 From 16bda13d90c8d5da243e2cfa1677e62ecce26860 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Mon, 6 Feb 2012 15:14:37 -0500 Subject: net: Make qdisc_skb_cb upper size bound explicit. Just like skb->cb[], so that qdisc_skb_cb can be encapsulated inside of other data structures. This is intended to be used by IPoIB so that it can remember addressing information stored at hard_header_ops->create() time that it can fetch when the packet gets to the transmit routine. Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 ++++++++- net/sched/sch_choke.c | 3 +-- net/sched/sch_netem.c | 3 +-- net/sched/sch_sfb.c | 3 +-- net/sched/sch_sfq.c | 5 ++--- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f6bb08b73ca4..55ce96b53b09 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -220,9 +220,16 @@ struct tcf_proto { struct qdisc_skb_cb { unsigned int pkt_len; - long data[]; + unsigned char data[24]; }; +static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) +{ + struct qdisc_skb_cb *qcb; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); + BUILD_BUG_ON(sizeof(qcb->data) < sz); +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index e465064d39a3..7e267d7b9c75 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -148,8 +148,7 @@ struct choke_skb_cb { static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2776012132ea..e83d61ca78ca 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -130,8 +130,7 @@ struct netem_skb_cb { static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 96e42cae4c7a..d7eea99333e9 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -94,8 +94,7 @@ struct sfb_skb_cb { static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 67494aef9acf..60d47180f043 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -166,9 +166,8 @@ struct sfq_skb_cb { static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; + qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } static unsigned int sfq_hash(const struct sfq_sched_data *q, -- cgit v1.2.3-55-g7522 From a8db7b2d197a0d624baab83f0c810b0edbc4ffd0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 6 Feb 2012 13:23:10 +0100 Subject: netfilter: nf_queue: fix queueing of bridged gro skbs When trying to nf_queue GRO/GSO skbs, nf_queue uses skb_gso_segment to split the skb. However, if nf_queue is called via bridge netfilter, the mac header won't be preserved -- packets will thus contain a bogus mac header. Fix this by setting skb->data to the mac header when skb->nf_bridge is set and restoring skb->data afterwards for all segments. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b3a7db678b8d..ce60cf0f6c11 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -203,6 +203,27 @@ err: return status; } +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. + */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + int nf_queue(struct sk_buff *skb, struct list_head *elem, u_int8_t pf, unsigned int hook, @@ -212,7 +233,7 @@ int nf_queue(struct sk_buff *skb, unsigned int queuenum) { struct sk_buff *segs; - int err; + int err = -EINVAL; unsigned int queued; if (!skb_is_gso(skb)) @@ -228,23 +249,25 @@ int nf_queue(struct sk_buff *skb, break; } + nf_bridge_adjust_skb_data(skb); segs = skb_gso_segment(skb, 0); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ECANCELED to mean * 'ignore this hook'. */ if (IS_ERR(segs)) - return -EINVAL; - + goto out_err; queued = 0; err = 0; do { struct sk_buff *nskb = segs->next; segs->next = NULL; - if (err == 0) + if (err == 0) { + nf_bridge_adjust_segmented_data(segs); err = __nf_queue(segs, elem, pf, hook, indev, outdev, okfn, queuenum); + } if (err == 0) queued++; else @@ -252,11 +275,12 @@ int nf_queue(struct sk_buff *skb, segs = nskb; } while (segs); - /* also free orig skb if only some segments were queued */ - if (unlikely(err && queued)) - err = 0; - if (err == 0) + if (queued) { kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); return err; } -- cgit v1.2.3-55-g7522 From b57e6b560fc2a2742910ac5ca0eb2c46e45aeac2 Mon Sep 17 00:00:00 2001 From: Mohammed Shafi Shajakhan Date: Thu, 9 Feb 2012 19:59:43 +0530 Subject: mac80211: Fix a rwlock bad magic bug read_lock(&tpt_trig->trig.leddev_list_lock) is accessed via the path ieee80211_open (->) ieee80211_do_open (->) ieee80211_mod_tpt_led_trig (->) ieee80211_start_tpt_led_trig (->) tpt_trig_timer before initializing it. the intilization of this read/write lock happens via the path ieee80211_led_init (->) led_trigger_register, but we are doing 'ieee80211_led_init' after 'ieeee80211_if_add' where we register netdev_ops. so we access leddev_list_lock before initializing it and causes the following bug in chrome laptops with AR928X cards with the following script while true do sudo modprobe -v ath9k sleep 3 sudo modprobe -r ath9k sleep 3 done BUG: rwlock bad magic on CPU#1, wpa_supplicant/358, f5b9eccc Pid: 358, comm: wpa_supplicant Not tainted 3.0.13 #1 Call Trace: [<8137b9df>] rwlock_bug+0x3d/0x47 [<81179830>] do_raw_read_lock+0x19/0x29 [<8137f063>] _raw_read_lock+0xd/0xf [] tpt_trig_timer+0xc3/0x145 [mac80211] [] ieee80211_mod_tpt_led_trig+0x152/0x174 [mac80211] [] ieee80211_do_open+0x11e/0x42e [mac80211] [] ? ieee80211_check_concurrent_iface+0x26/0x13c [mac80211] [] ieee80211_open+0x48/0x4c [mac80211] [<812dbed8>] __dev_open+0x82/0xab [<812dc0c9>] __dev_change_flags+0x9c/0x113 [<812dc1ae>] dev_change_flags+0x18/0x44 [<8132144f>] devinet_ioctl+0x243/0x51a [<81321ba9>] inet_ioctl+0x93/0xac [<812cc951>] sock_ioctl+0x1c6/0x1ea [<812cc78b>] ? might_fault+0x20/0x20 [<810b1ebb>] do_vfs_ioctl+0x46e/0x4a2 [<810a6ebb>] ? fget_light+0x2f/0x70 [<812ce549>] ? sys_recvmsg+0x3e/0x48 [<810b1f35>] sys_ioctl+0x46/0x69 [<8137fa77>] sysenter_do_call+0x12/0x2 Cc: Cc: Gary Morain Cc: Paul Stewart Cc: Abhijit Pradhan Cc: Vasanthakumar Thiagarajan Cc: Rajkumar Manoharan Acked-by: Johannes Berg Tested-by: Mohammed Shafi Shajakhan Signed-off-by: Mohammed Shafi Shajakhan Signed-off-by: John W. Linville --- net/mac80211/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0a0d94ad9b08..b142bd4c2390 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -910,6 +910,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", result); + ieee80211_led_init(local); + rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, @@ -931,8 +933,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); - ieee80211_led_init(local); - local->network_latency_notifier.notifier_call = ieee80211_max_network_latency; result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, -- cgit v1.2.3-55-g7522 From a87dfe14a78501c931a4d5481efff6a809aa907d Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:36 +0000 Subject: netprio_cgroup: fix an off-by-one bug # mount -t cgroup xxx /mnt # mkdir /mnt/tmp # cat /mnt/tmp/net_prio.ifpriomap lo 0 eth0 0 virbr0 0 # echo 'lo 999' > /mnt/tmp/net_prio.ifpriomap # cat /mnt/tmp/net_prio.ifpriomap lo 999 eth0 0 virbr0 4101267344 We got weired output, because we exceeded the boundary of the array. We may even crash the kernel.. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9ae183a9a381..72c638780805 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -108,7 +108,7 @@ static void extend_netdev_table(struct net_device *dev, u32 new_len) static void update_netdev_tables(void) { struct net_device *dev; - u32 max_len = atomic_read(&max_prioidx); + u32 max_len = atomic_read(&max_prioidx) + 1; struct netprio_map *map; rtnl_lock(); -- cgit v1.2.3-55-g7522 From f5c38208d32412d72b97a4f0d44af0eb39feb20b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:37 +0000 Subject: netprio_cgroup: don't allocate prio table when a device is registered So we delay the allocation till the priority is set through cgroup, and this makes skb_update_priority() faster when it's not set. This also eliminates an off-by-one bug similar with the one fixed in the previous patch. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 72c638780805..4dacc44637ef 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -271,7 +271,6 @@ static int netprio_device_event(struct notifier_block *unused, { struct net_device *dev = ptr; struct netprio_map *old; - u32 max_len = atomic_read(&max_prioidx); /* * Note this is called with rtnl_lock held so we have update side @@ -279,11 +278,6 @@ static int netprio_device_event(struct notifier_block *unused, */ switch (event) { - - case NETDEV_REGISTER: - if (max_len) - extend_netdev_table(dev, max_len); - break; case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); -- cgit v1.2.3-55-g7522 From 2b73bc65e2771372c818db7955709c8caedbf8b9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:38 +0000 Subject: netprio_cgroup: fix wrong memory access when NETPRIO_CGROUP=m When the netprio_cgroup module is not loaded, net_prio_subsys_id is -1, and so sock_update_prioidx() accesses cgroup_subsys array with negative index subsys[-1]. Make the code resembles cls_cgroup code, which is bug free. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- include/net/netprio_cgroup.h | 48 ++++++++++++++++++++++++++++++++++++-------- net/core/sock.c | 7 ++----- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 7b2d43139c8e..d58fdec47597 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -37,19 +37,51 @@ extern int net_prio_subsys_id; extern void sock_update_netprioidx(struct sock *sk); -static inline struct cgroup_netprio_state - *task_netprio_state(struct task_struct *p) +#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) + +static inline u32 task_netprioidx(struct task_struct *p) { -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) - return container_of(task_subsys_state(p, net_prio_subsys_id), - struct cgroup_netprio_state, css); -#else - return NULL; -#endif + struct cgroup_netprio_state *state; + u32 idx; + + rcu_read_lock(); + state = container_of(task_subsys_state(p, net_prio_subsys_id), + struct cgroup_netprio_state, css); + idx = state->prioidx; + rcu_read_unlock(); + return idx; +} + +#elif IS_MODULE(CONFIG_NETPRIO_CGROUP) + +static inline u32 task_netprioidx(struct task_struct *p) +{ + struct cgroup_netprio_state *state; + int subsys_id; + u32 idx = 0; + + rcu_read_lock(); + subsys_id = rcu_dereference_index_check(net_prio_subsys_id, + rcu_read_lock_held()); + if (subsys_id >= 0) { + state = container_of(task_subsys_state(p, subsys_id), + struct cgroup_netprio_state, css); + idx = state->prioidx; + } + rcu_read_unlock(); + return idx; } #else +static inline u32 task_netprioidx(struct task_struct *p) +{ + return 0; +} + +#endif /* CONFIG_NETPRIO_CGROUP */ + +#else #define sock_update_netprioidx(sk) #endif diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..02f8dfe320b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1171,13 +1171,10 @@ EXPORT_SYMBOL(sock_update_classid); void sock_update_netprioidx(struct sock *sk) { - struct cgroup_netprio_state *state; if (in_interrupt()) return; - rcu_read_lock(); - state = task_netprio_state(current); - sk->sk_cgrp_prioidx = state ? state->prioidx : 0; - rcu_read_unlock(); + + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif -- cgit v1.2.3-55-g7522 From 5dc7883f2a7c25f8df40d7479687153558cd531b Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 9 Feb 2012 21:15:25 +0000 Subject: ipv4: Fix wrong order of ip_rt_get_source() and update iph->daddr. This patch fix a bug which introduced by commit ac8a4810 (ipv4: Save nexthop address of LSRR/SSRR option to IPCB.).In that patch, we saved the nexthop of SRR in ip_option->nexthop and update iph->daddr until we get to ip_forward_options(), but we need to update it before ip_rt_get_source(), otherwise we may get a wrong src. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1e60f7679075..42dd1a90edea 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], skb, rt); ip_hdr(skb)->daddr = opt->nexthop; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); -- cgit v1.2.3-55-g7522 From 70620c46ac2b45c24b0f22002fdf5ddd1f7daf81 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 10 Feb 2012 04:07:11 +0000 Subject: net: Don't proxy arp respond if iif == rt->dst.dev if private VLAN is disabled Commit 653241 (net: RFC3069, private VLAN proxy arp support) changed the behavior of arp proxy to send arp replies back out on the interface the request came in even if the private VLAN feature is disabled. Previously we checked rt->dst.dev != skb->dev for in scenarios, when proxy arp is enabled on for the netdevice and also when individual proxy neighbour entries have been added. This patch adds the check back for the pneigh_lookup() scenario. Signed-off-by: Thomas Graf Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59402be133f0..63e49890ad31 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,7 +863,8 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + (rt->dst.dev != dev && + pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); -- cgit v1.2.3-55-g7522 From cc9a672ee522d4805495b98680f4a3db5d0a0af9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 12 Feb 2012 18:37:09 +0000 Subject: tcp: allow tcp_sacktag_one() to tag ranges not aligned with skbs This commit allows callers of tcp_sacktag_one() to pass in sequence ranges that do not align with skb boundaries, as tcp_shifted_skb() needs to do in an upcoming fix in this patch series. In fact, now tcp_sacktag_one() does not need to depend on an input skb at all, which makes its semantics and dependencies more clear. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 976034f82320..4e8a81fda65c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1307,25 +1307,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, - struct tcp_sacktag_state *state, +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, int dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); - u8 sacked = TCP_SKB_CB(skb)->sacked; int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { @@ -1344,13 +1345,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ - if (before(TCP_SKB_CB(skb)->seq, + if (before(start_seq, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + if (!after(end_seq, tp->frto_highmark)) state->flag |= FLAG_ONLY_ORIG_SACKED; } @@ -1368,8 +1369,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->lost_skb_hint)->seq)) + before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) @@ -1425,7 +1425,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* We discard results */ - tcp_sacktag_one(skb, sk, state, dup_sack, pcount); + tcp_sacktag_one(sk, state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1664,10 +1668,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, break; if (in_sack) { - TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, - state, - dup_sack, - tcp_skb_pcount(skb)); + TCP_SKB_CB(skb)->sacked = + tcp_sacktag_one(sk, + state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb)); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) -- cgit v1.2.3-55-g7522 From daef52bab1fd26e24e8e9578f8fb33ba1d0cb412 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 12 Feb 2012 18:37:10 +0000 Subject: tcp: fix range tcp_shifted_skb() passes to tcp_sacktag_one() Fix the newly-SACKed range to be the range of newly-shifted bytes. Previously - since 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 - tcp_shifted_skb() incorrectly called tcp_sacktag_one() with the start and end sequence numbers of the skb it passes in set to the range just beyond the range that is newly-SACKed. This commit also removes a special-case adjustment to lost_cnt_hint in tcp_shifted_skb() since the pre-existing adjustment of lost_cnt_hint in tcp_sacktag_one() now properly handles this things now that the correct start sequence number is passed in. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4e8a81fda65c..8116d06e042c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1388,6 +1388,9 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, @@ -1395,12 +1398,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ + u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); - if (skb == tp->lost_skb_hint) - tp->lost_cnt_hint += pcount; - TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1424,12 +1426,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* We discard results */ - tcp_sacktag_one(sk, state, - TCP_SKB_CB(skb)->sacked, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - dup_sack, pcount); + /* Adjust counters and hints for the newly sacked sequence range but + * discard the return value since prev is already marked. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); -- cgit v1.2.3-55-g7522 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- kernel/pid.c | 4 ++-- mm/page_alloc.c | 1 + net/ipv4/tcp.c | 5 +++-- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc02..fe19ac13f75f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad7..d3ebdbe723d0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deaccb..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f7..a13ded1938f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e96..22ef5f9fd2ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); -- cgit v1.2.3-55-g7522 From 0af2a0d0576205dda778d25c6c344fc6508fc81d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 13 Feb 2012 20:22:08 +0000 Subject: tcp: fix tcp_shifted_skb() adjustment of lost_cnt_hint for FACK This commit ensures that lost_cnt_hint is correctly updated in tcp_shifted_skb() for FACK TCP senders. The lost_cnt_hint adjustment in tcp_sacktag_one() only applies to non-FACK senders, so FACK senders need their own adjustment. This applies the spirit of 1e5289e121372a3494402b1b131b41bfe1cf9b7f - except now that the sequence range passed into tcp_sacktag_one() is correct we need only have a special case adjustment for FACK. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8116d06e042c..53c8ce4046b2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1403,6 +1403,10 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); + /* Adjust hint for FACK. Non-FACK is handled in tcp_sacktag_one(). */ + if (tcp_is_fack(tp) && (skb == tp->lost_skb_hint)) + tp->lost_cnt_hint += pcount; + TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; -- cgit v1.2.3-55-g7522 From f65bd5ec47a4461bc575d5d34902fd18b6ec5542 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Mon, 13 Feb 2012 20:19:14 +0000 Subject: RxRPC: Fix kcalloc parameters swapped The first parameter should be "number of elements" and the second parameter should be "element size". Signed-off-by: Axel Lin Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/ar-key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 4cba13e46ffd..ae3a035f5390 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -232,7 +232,7 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, if (toklen <= (n_parts + 1) * 4) return -EINVAL; - princ->name_parts = kcalloc(sizeof(char *), n_parts, GFP_KERNEL); + princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); if (!princ->name_parts) return -ENOMEM; @@ -355,7 +355,7 @@ static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, _debug("n_elem %d", n_elem); - td = kcalloc(sizeof(struct krb5_tagged_data), n_elem, + td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), GFP_KERNEL); if (!td) return -ENOMEM; -- cgit v1.2.3-55-g7522 From 58e05f357a039a94aa36475f8c110256f693a239 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Feb 2012 10:11:59 +0000 Subject: netpoll: netpoll_poll_dev() should access dev->flags commit 5a698af53f (bond: service netpoll arp queue on master device) tested IFF_SLAVE flag against dev->priv_flags instead of dev->flags Signed-off-by: Eric Dumazet Cc: WANG Cong Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 556b08298669..ddefc513b44a 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -194,7 +194,7 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); - if (dev->priv_flags & IFF_SLAVE) { + if (dev->flags & IFF_SLAVE) { if (dev->npinfo) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; -- cgit v1.2.3-55-g7522 From 18daf1644e634bae951a6e3d4d19d89170209762 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Fri, 13 Jan 2012 15:11:30 +0100 Subject: Bluetooth: Fix l2cap conn failures for ssp devices Commit 330605423c fixed l2cap conn establishment for non-ssp remote devices by not setting HCI_CONN_ENCRYPT_PEND every time conn security is tested (which was always returning failure on any subsequent security checks). However, this broke l2cap conn establishment for ssp remote devices when an ACL link was already established at SDP-level security. This fix ensures that encryption must be pending whenever authentication is also pending. Signed-off-by: Peter Hurley Tested-by: Daniel Wagner Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/hci_conn.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3db432473ad5..07bc69ed9498 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -635,6 +635,10 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; + + /* encrypt must be pending if auth is also pending */ + set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); -- cgit v1.2.3-55-g7522 From 4aa832c27edf902130f8bace1d42cf22468823fa Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 8 Jan 2012 22:51:16 +0200 Subject: Bluetooth: Remove bogus inline declaration from l2cap_chan_connect As reported by Dan Carpenter this function causes a Sparse warning and shouldn't be declared inline: include/net/bluetooth/l2cap.h:837:30 error: marked inline, but without a definition" Reported-by: Dan Carpenter Signed-off-by: Johan Hedberg Acked-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 2 +- net/bluetooth/l2cap_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 68f589150692..124f7cf45bd7 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -834,7 +834,7 @@ int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(struct sock *sk); void l2cap_chan_close(struct l2cap_chan *chan, int reason); void l2cap_chan_destroy(struct l2cap_chan *chan); -inline int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, +int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst); int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, u32 priority); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index faf0b11ac1d3..980abdb30671 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1120,7 +1120,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr return c1; } -inline int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst) +int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst) { struct sock *sk = chan->sk; bdaddr_t *src = &bt_sk(sk)->src; -- cgit v1.2.3-55-g7522 From a63752552b95624a9f1dfa3d763870f72f964ad0 Mon Sep 17 00:00:00 2001 From: Andrzej Kaczmarek Date: Wed, 4 Jan 2012 12:10:41 +0100 Subject: Bluetooth: Fix sk_sndtimeo initialization for L2CAP socket sk_sndtime value should be specified in jiffies thus initial value needs to be converted from miliseconds. Otherwise this timeout is unreliable when CONFIG_HZ is not set to 1000. Signed-off-by: Andrzej Kaczmarek Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/l2cap_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index c61d967012b2..c57027f7606f 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1002,7 +1002,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p INIT_LIST_HEAD(&bt_sk(sk)->accept_q); sk->sk_destruct = l2cap_sock_destruct; - sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; + sk->sk_sndtimeo = msecs_to_jiffies(L2CAP_CONN_TIMEOUT); sock_reset_flag(sk, SOCK_ZAPPED); -- cgit v1.2.3-55-g7522 From 6e1da683f79a22fafaada62d547138daaa9e3456 Mon Sep 17 00:00:00 2001 From: Andrzej Kaczmarek Date: Wed, 4 Jan 2012 12:10:42 +0100 Subject: Bluetooth: l2cap_set_timer needs jiffies as timeout value After moving L2CAP timers to workqueues l2cap_set_timer expects timeout value to be specified in jiffies but constants defined in miliseconds are used. This makes timeouts unreliable when CONFIG_HZ is not set to 1000. __set_chan_timer macro still uses jiffies as input to avoid multiple conversions from/to jiffies for sk_sndtimeo value which is already specified in jiffies. Signed-off-by: Andrzej Kaczmarek Ackec-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/l2cap.h | 6 +++--- net/bluetooth/l2cap_core.c | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 124f7cf45bd7..26486e182f55 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -626,13 +626,13 @@ static inline void l2cap_clear_timer(struct l2cap_chan *chan, #define __set_chan_timer(c, t) l2cap_set_timer(c, &c->chan_timer, (t)) #define __clear_chan_timer(c) l2cap_clear_timer(c, &c->chan_timer) #define __set_retrans_timer(c) l2cap_set_timer(c, &c->retrans_timer, \ - L2CAP_DEFAULT_RETRANS_TO); + msecs_to_jiffies(L2CAP_DEFAULT_RETRANS_TO)); #define __clear_retrans_timer(c) l2cap_clear_timer(c, &c->retrans_timer) #define __set_monitor_timer(c) l2cap_set_timer(c, &c->monitor_timer, \ - L2CAP_DEFAULT_MONITOR_TO); + msecs_to_jiffies(L2CAP_DEFAULT_MONITOR_TO)); #define __clear_monitor_timer(c) l2cap_clear_timer(c, &c->monitor_timer) #define __set_ack_timer(c) l2cap_set_timer(c, &chan->ack_timer, \ - L2CAP_DEFAULT_ACK_TO); + msecs_to_jiffies(L2CAP_DEFAULT_ACK_TO)); #define __clear_ack_timer(c) l2cap_clear_timer(c, &c->ack_timer) static inline int __seq_offset(struct l2cap_chan *chan, __u16 seq1, __u16 seq2) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 980abdb30671..dcccae04ae08 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2970,7 +2970,8 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr default: sk->sk_err = ECONNRESET; - __set_chan_timer(chan, L2CAP_DISC_REJ_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_REJ_TIMEOUT)); l2cap_send_disconn_req(conn, chan, ECONNRESET); goto done; } @@ -4478,7 +4479,8 @@ static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt) if (encrypt == 0x00) { if (chan->sec_level == BT_SECURITY_MEDIUM) { __clear_chan_timer(chan); - __set_chan_timer(chan, L2CAP_ENC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_ENC_TIMEOUT)); } else if (chan->sec_level == BT_SECURITY_HIGH) l2cap_chan_close(chan, ECONNREFUSED); } else { @@ -4546,7 +4548,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) L2CAP_CONN_REQ, sizeof(req), &req); } else { __clear_chan_timer(chan); - __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_TIMEOUT)); } } else if (chan->state == BT_CONNECT2) { struct l2cap_conn_rsp rsp; @@ -4566,7 +4569,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } } else { l2cap_state_change(chan, BT_DISCONN); - __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_TIMEOUT)); res = L2CAP_CR_SEC_BLOCK; stat = L2CAP_CS_NO_INFO; } -- cgit v1.2.3-55-g7522 From b5a30dda6598af216c070165ece6068f9f00f33a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sun, 22 Jan 2012 00:28:34 +0200 Subject: Bluetooth: silence lockdep warning Since bluetooth uses multiple protocols types, to avoid lockdep warnings, we need to use different lockdep classes (one for each protocol type). This is already done in bt_sock_create but it misses a couple of cases when new connections are created. This patch corrects that to fix the following warning: <4>[ 1864.732366] ======================================================= <4>[ 1864.733030] [ INFO: possible circular locking dependency detected ] <4>[ 1864.733544] 3.0.16-mid3-00007-gc9a0f62 #3 <4>[ 1864.733883] ------------------------------------------------------- <4>[ 1864.734408] t.android.btclc/4204 is trying to acquire lock: <4>[ 1864.734869] (rfcomm_mutex){+.+.+.}, at: [] rfcomm_dlc_close+0x15/0x30 <4>[ 1864.735541] <4>[ 1864.735549] but task is already holding lock: <4>[ 1864.736045] (sk_lock-AF_BLUETOOTH){+.+.+.}, at: [] lock_sock+0xa/0xc <4>[ 1864.736732] <4>[ 1864.736740] which lock already depends on the new lock. <4>[ 1864.736750] <4>[ 1864.737428] <4>[ 1864.737437] the existing dependency chain (in reverse order) is: <4>[ 1864.738016] <4>[ 1864.738023] -> #1 (sk_lock-AF_BLUETOOTH){+.+.+.}: <4>[ 1864.738549] [] lock_acquire+0x104/0x140 <4>[ 1864.738977] [] lock_sock_nested+0x58/0x68 <4>[ 1864.739411] [] l2cap_sock_sendmsg+0x3e/0x76 <4>[ 1864.739858] [] __sock_sendmsg+0x50/0x59 <4>[ 1864.740279] [] sock_sendmsg+0x94/0xa8 <4>[ 1864.740687] [] kernel_sendmsg+0x28/0x37 <4>[ 1864.741106] [] rfcomm_send_frame+0x30/0x38 <4>[ 1864.741542] [] rfcomm_send_ua+0x58/0x5a <4>[ 1864.741959] [] rfcomm_run+0x441/0xb52 <4>[ 1864.742365] [] kthread+0x63/0x68 <4>[ 1864.742742] [] kernel_thread_helper+0x6/0xd <4>[ 1864.743187] <4>[ 1864.743193] -> #0 (rfcomm_mutex){+.+.+.}: <4>[ 1864.743667] [] __lock_acquire+0x988/0xc00 <4>[ 1864.744100] [] lock_acquire+0x104/0x140 <4>[ 1864.744519] [] __mutex_lock_common+0x3b/0x33f <4>[ 1864.744975] [] mutex_lock_nested+0x2d/0x36 <4>[ 1864.745412] [] rfcomm_dlc_close+0x15/0x30 <4>[ 1864.745842] [] __rfcomm_sock_close+0x5f/0x6b <4>[ 1864.746288] [] rfcomm_sock_shutdown+0x2f/0x62 <4>[ 1864.746737] [] sys_socketcall+0x1db/0x422 <4>[ 1864.747165] [] syscall_call+0x7/0xb Signed-off-by: Octavian Purdila Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 2 ++ net/bluetooth/af_bluetooth.c | 12 +++++------- net/bluetooth/l2cap_sock.c | 2 ++ net/bluetooth/rfcomm/sock.c | 2 ++ 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index abaad6ed9b83..4a82ca0bb0b2 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -256,4 +256,6 @@ void l2cap_exit(void); int sco_init(void); void sco_exit(void); +void bt_sock_reclassify_lock(struct sock *sk, int proto); + #endif /* __BLUETOOTH_H */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ef92864ac625..72eb187a5f60 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -71,19 +71,16 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { "slock-AF_BLUETOOTH-BTPROTO_AVDTP", }; -static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) +void bt_sock_reclassify_lock(struct sock *sk, int proto) { - struct sock *sk = sock->sk; - - if (!sk) - return; - + BUG_ON(!sk); BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } +EXPORT_SYMBOL(bt_sock_reclassify_lock); int bt_sock_register(int proto, const struct net_proto_family *ops) { @@ -145,7 +142,8 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto, if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { err = bt_proto[proto]->create(net, sock, proto, kern); - bt_sock_reclassify_lock(sock, proto); + if (!err) + bt_sock_reclassify_lock(sock->sk, proto); module_put(bt_proto[proto]->owner); } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index c57027f7606f..401d9428ae4c 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -849,6 +849,8 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(void *data) if (!sk) return NULL; + bt_sock_reclassify_lock(sk, BTPROTO_L2CAP); + l2cap_sock_init(sk, parent); return l2cap_pi(sk)->chan; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f066678faeee..22169c3f1482 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -956,6 +956,8 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * if (!sk) goto done; + bt_sock_reclassify_lock(sk, BTPROTO_RFCOMM); + rfcomm_sock_init(sk, parent); bacpy(&bt_sk(sk)->src, &src); bacpy(&bt_sk(sk)->dst, &dst); -- cgit v1.2.3-55-g7522 From cf33e77b76d7439f21a23a94eab4ab3b405a6a7d Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Fri, 27 Jan 2012 19:32:39 +0200 Subject: Bluetooth: Fix RFCOMM session reference counting issue There is an imbalance in the rfcomm_session_hold / rfcomm_session_put operations which causes the following crash: [ 685.010159] BUG: unable to handle kernel paging request at 6b6b6b6b [ 685.010169] IP: [] rfcomm_process_dlcs+0x1b/0x15e [ 685.010181] *pdpt = 000000002d665001 *pde = 0000000000000000 [ 685.010191] Oops: 0000 [#1] PREEMPT SMP [ 685.010247] [ 685.010255] Pid: 947, comm: krfcommd Tainted: G C 3.0.16-mid8-dirty #44 [ 685.010266] EIP: 0060:[] EFLAGS: 00010246 CPU: 1 [ 685.010274] EIP is at rfcomm_process_dlcs+0x1b/0x15e [ 685.010281] EAX: e79f551c EBX: 6b6b6b6b ECX: 00000007 EDX: e79f40b4 [ 685.010288] ESI: e79f4060 EDI: ed4e1f70 EBP: ed4e1f68 ESP: ed4e1f50 [ 685.010295] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 [ 685.010303] Process krfcommd (pid: 947, ti=ed4e0000 task=ed43e5e0 task.ti=ed4e0000) [ 685.010308] Stack: [ 685.010312] ed4e1f68 c149eb53 e5925150 e79f4060 ed500000 ed4e1f70 ed4e1f80 c149ec10 [ 685.010331] 00000000 ed43e5e0 00000000 ed4e1f90 ed4e1f9c c149ec87 0000bf54 00000000 [ 685.010348] 00000000 ee03bf54 c149ec37 ed4e1fe4 c104fe01 00000000 00000000 00000000 [ 685.010367] Call Trace: [ 685.010376] [] ? rfcomm_process_rx+0x6e/0x74 [ 685.010387] [] rfcomm_process_sessions+0xb7/0xde [ 685.010398] [] rfcomm_run+0x50/0x6d [ 685.010409] [] ? rfcomm_process_sessions+0xde/0xde [ 685.010419] [] kthread+0x63/0x68 [ 685.010431] [] ? __init_kthread_worker+0x42/0x42 [ 685.010442] [] kernel_thread_helper+0x6/0xd This issue has been brought up earlier here: https://lkml.org/lkml/2011/5/21/127 The issue appears to be the rfcomm_session_put in rfcomm_recv_ua. This operation doesn't seem be to required as for the non-initiator case we have the rfcomm_process_rx doing an explicit put and in the initiator case the last dlc_unlink will drive the reference counter to 0. There have been several attempts to fix these issue: 6c2718d Bluetooth: Do not call rfcomm_session_put() for RFCOMM UA on closed socket 683d949 Bluetooth: Never deallocate a session when some DLC points to it but AFAICS they do not fix the issue just make it harder to reproduce. Signed-off-by: Octavian Purdila Signed-off-by: Gopala Krishna Murala Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/rfcomm/core.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 501649bf5596..8a602388f1e7 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1164,12 +1164,18 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) break; case BT_DISCONN: - /* When socket is closed and we are not RFCOMM - * initiator rfcomm_process_rx already calls - * rfcomm_session_put() */ - if (s->sock->sk->sk_state != BT_CLOSED) - if (list_empty(&s->dlcs)) - rfcomm_session_put(s); + /* rfcomm_session_put is called later so don't do + * anything here otherwise we will mess up the session + * reference counter: + * + * (a) when we are the initiator dlc_unlink will drive + * the reference counter to 0 (there is no initial put + * after session_add) + * + * (b) when we are not the initiator rfcomm_rx_process + * will explicitly call put to balance the initial hold + * done after session add. + */ break; } } -- cgit v1.2.3-55-g7522 From ca0d6c7ece0e78268cd7c5c378d6b1b610625085 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 3 Feb 2012 21:29:40 +0200 Subject: Bluetooth: Add missing QUIRK_NO_RESET test to hci_dev_do_close We should only perform a reset in hci_dev_do_close if the HCI_QUIRK_NO_RESET flag is set (since in such a case a reset will not be performed when initializing the device). Signed-off-by: Johan Hedberg Acked-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 9de93714213a..5aeb62491198 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -640,7 +640,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!test_bit(HCI_RAW, &hdev->flags)) { + if (!test_bit(HCI_RAW, &hdev->flags) && + test_bit(HCI_QUIRK_NO_RESET, &hdev->quirks)) { set_bit(HCI_INIT, &hdev->flags); __hci_request(hdev, hci_reset_req, 0, msecs_to_jiffies(250)); -- cgit v1.2.3-55-g7522 From 6de32750822d00bfa92c341166132b0714c5b559 Mon Sep 17 00:00:00 2001 From: Ulisses Furquim Date: Mon, 30 Jan 2012 18:26:28 -0200 Subject: Bluetooth: Remove usage of __cancel_delayed_work() __cancel_delayed_work() is being used in some paths where we cannot sleep waiting for the delayed work to finish. However, that function might return while the timer is running and the work will be queued again. Replace the calls with safer cancel_delayed_work() version which spins until the timer handler finishes on other CPUs and cancels the delayed work. Signed-off-by: Ulisses Furquim Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/l2cap.h | 4 ++-- net/bluetooth/l2cap_core.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 26486e182f55..b1664ed884e6 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -611,7 +611,7 @@ static inline void l2cap_set_timer(struct l2cap_chan *chan, { BT_DBG("chan %p state %d timeout %ld", chan, chan->state, timeout); - if (!__cancel_delayed_work(work)) + if (!cancel_delayed_work(work)) l2cap_chan_hold(chan); schedule_delayed_work(work, timeout); } @@ -619,7 +619,7 @@ static inline void l2cap_set_timer(struct l2cap_chan *chan, static inline void l2cap_clear_timer(struct l2cap_chan *chan, struct delayed_work *work) { - if (__cancel_delayed_work(work)) + if (cancel_delayed_work(work)) l2cap_chan_put(chan); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index dcccae04ae08..ec10c698b891 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2574,7 +2574,7 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && cmd->ident == conn->info_ident) { - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work(&conn->info_timer); conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; @@ -3121,7 +3121,7 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) return 0; - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work(&conn->info_timer); if (result != L2CAP_IR_SUCCESS) { conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; @@ -4501,7 +4501,7 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) if (hcon->type == LE_LINK) { smp_distribute_keys(conn, 0); - __cancel_delayed_work(&conn->security_timer); + cancel_delayed_work(&conn->security_timer); } rcu_read_lock(); -- cgit v1.2.3-55-g7522 From 24d2b8c0ac5c8ec41c26ed432238b0e027184882 Mon Sep 17 00:00:00 2001 From: Ulisses Furquim Date: Mon, 30 Jan 2012 18:26:29 -0200 Subject: Bluetooth: Fix possible use after free in delete path We need to use the _sync() version for cancelling the info and security timer in the L2CAP connection delete path. Otherwise the delayed work handler might run after the connection object is freed. Signed-off-by: Ulisses Furquim Acked-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- net/bluetooth/l2cap_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ec10c698b891..32d338c30e65 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1018,10 +1018,10 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) hci_chan_del(conn->hchan); if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work_sync(&conn->info_timer); if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->pend)) { - __cancel_delayed_work(&conn->security_timer); + cancel_delayed_work_sync(&conn->security_timer); smp_chan_destroy(conn); } -- cgit v1.2.3-55-g7522 From 4b5a433ae5348c23caa0b5f0a2fca7c342acb200 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 20 Jan 2012 13:55:23 +0100 Subject: mac80211: call rate control only after init There are situations where we don't have the necessary rate control information yet for station entries, e.g. when associating. This currently doesn't really happen due to the dummy station handling; explicitly disabling rate control when it's not initialised will allow us to remove dummy stations. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/debugfs_sta.c | 4 ++-- net/mac80211/rate.c | 2 +- net/mac80211/rate.h | 1 + net/mac80211/sta_info.h | 2 ++ 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 2406b3e7393f..d86217d56bd7 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -63,14 +63,14 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, test_sta_flag(sta, WLAN_STA_##flg) ? #flg "\n" : "" int res = scnprintf(buf, sizeof(buf), - "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", TEST(AUTH), TEST(ASSOC), TEST(PS_STA), TEST(PS_DRIVER), TEST(AUTHORIZED), TEST(SHORT_PREAMBLE), TEST(WME), TEST(WDS), TEST(CLEAR_PS_FILT), TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL), TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), - TEST(TDLS_PEER_AUTH)); + TEST(TDLS_PEER_AUTH), TEST(RATE_CONTROL)); #undef TEST return simple_read_from_buffer(userbuf, count, ppos, buf, res); } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 5a5a7767d541..ad64f4d5271a 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -336,7 +336,7 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, int i; u32 mask; - if (sta) { + if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { ista = &sta->sta; priv_sta = sta->rate_ctrl_priv; } diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 168427b0ffdc..2b83f32144fd 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -62,6 +62,7 @@ static inline void rate_control_rate_init(struct sta_info *sta) sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; ref->ops->rate_init(ref->priv, sband, ista, priv_sta); + set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } static inline void rate_control_rate_update(struct ieee80211_local *local, diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 6f77f12dc3fc..bfed851d0d36 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -52,6 +52,7 @@ * @WLAN_STA_SP: Station is in a service period, so don't try to * reply to other uAPSD trigger frames or PS-Poll. * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame. + * @WLAN_STA_RATE_CONTROL: rate control was initialized for this station. */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, @@ -71,6 +72,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, + WLAN_STA_RATE_CONTROL, }; enum ieee80211_sta_state { -- cgit v1.2.3-55-g7522 From 216c57b214bd621335ff698b475f6db2802502dc Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 8 Feb 2012 19:17:11 +0100 Subject: mac80211: do not call rate control .tx_status before .rate_init Most rate control implementations assume .get_rate and .tx_status are only called once the per-station data has been fully initialized. minstrel_ht crashes if this assumption is violated. Signed-off-by: Felix Fietkau Tested-by: Arend van Spriel Signed-off-by: John W. Linville --- net/mac80211/rate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 2b83f32144fd..80cfc006dd74 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -41,7 +41,7 @@ static inline void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; - if (!ref) + if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); -- cgit v1.2.3-55-g7522 From cd961c2ca98efbe7d738ca8720673fc03538b2b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Feb 2012 20:28:25 +0000 Subject: netem: fix dequeue commit 50612537e9 (netem: fix classful handling) added two errors in netem_dequeue() 1) After checking skb at the head of tfifo queue for time constraints, it dequeues tail skb, thus adding unwanted reordering. 2) qdisc stats are updated twice per packet (one when packet dequeued from tfifo, once when delivered) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e83d61ca78ca..5da548fa7ae9 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -501,9 +501,8 @@ tfifo_dequeue: /* if more time remaining? */ if (cb->time_to_send <= psched_get_time()) { - skb = qdisc_dequeue_tail(sch); - if (unlikely(!skb)) - goto qdisc_dequeue; + __skb_unlink(skb, &sch->q); + sch->qstats.backlog -= qdisc_pkt_len(skb); #ifdef CONFIG_NET_CLS_ACT /* @@ -539,7 +538,6 @@ deliver: qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); } -qdisc_dequeue: if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) -- cgit v1.2.3-55-g7522 From af14cca162ddcdea017b648c21b9b091e4bf1fa4 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 21 Feb 2012 14:53:57 +0100 Subject: netfilter: ctnetlink: fix soft lockup when netlink adds new entries Marcell Zambo and Janos Farago noticed and reported that when new conntrack entries are added via netlink and the conntrack table gets full, soft lockup happens. This is because the nf_conntrack_lock is held while nf_conntrack_alloc is called, which is in turn wants to lock nf_conntrack_lock while evicting entries from the full table. The patch fixes the soft lockup with limiting the holding of the nf_conntrack_lock to the minimum, where it's absolutely required. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 43 ++++++++++++++---------------------- 1 file changed, 16 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b033c0c9..cc705175765c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1469,7 +1466,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); + spin_lock_bh(&nf_conntrack_lock); nf_conntrack_hash_insert(ct); + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); rcu_read_unlock(); return ct; @@ -1490,6 +1490,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1512,25 +1513,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); + spin_unlock_bh(&nf_conntrack_lock); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1543,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1564,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } -- cgit v1.2.3-55-g7522 From 84338a6c9dbb6ff3de4749864020f8f25d86fc81 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 21 Feb 2012 16:04:13 -0500 Subject: neighbour: Fixed race condition at tbl->nht When the fixed race condition happens: 1. While function neigh_periodic_work scans the neighbor hash table pointed by field tbl->nht, it unlocks and locks tbl->lock between buckets in order to call cond_resched. 2. Assume that function neigh_periodic_work calls cond_resched, that is, the lock tbl->lock is available, and function neigh_hash_grow runs. 3. Once function neigh_hash_grow finishes, and RCU calls neigh_hash_free_rcu, the original struct neigh_hash_table that function neigh_periodic_work was using doesn't exist anymore. 4. Once back at neigh_periodic_work, whenever the old struct neigh_hash_table is accessed, things can go badly. Signed-off-by: Michel Machado Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e287346e0934..2a83914b0277 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -826,6 +826,8 @@ next_elt: write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); } /* Cycle through all hash buckets every base_reachable_time/2 ticks. * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 -- cgit v1.2.3-55-g7522 From 115c9b81928360d769a76c632bae62d15206a94a Mon Sep 17 00:00:00 2001 From: Greg Rose Date: Tue, 21 Feb 2012 16:54:48 -0500 Subject: rtnetlink: Fix problem with buffer allocation Implement a new netlink attribute type IFLA_EXT_MASK. The mask is a 32 bit value that can be used to indicate to the kernel that certain extended ifinfo values are requested by the user application. At this time the only mask value defined is RTEXT_FILTER_VF to indicate that the user wants the ifinfo dump to send information about the VFs belonging to the interface. This patch fixes a bug in which certain applications do not have large enough buffers to accommodate the extra information returned by the kernel with large numbers of SR-IOV virtual functions. Those applications will not send the new netlink attribute with the interface info dump request netlink messages so they will not get unexpectedly large request buffers returned by the kernel. Modifies the rtnl_calcit function to traverse the list of net devices and compute the minimum buffer size that can hold the info dumps of all matching devices based upon the filter passed in via the new netlink attribute filter mask. If no filter mask is sent then the buffer allocation defaults to NLMSG_GOODSIZE. With this change it is possible to add yet to be defined netlink attributes to the dump request which should make it fairly extensible in the future. Signed-off-by: Greg Rose Signed-off-by: David S. Miller --- include/linux/if_link.h | 1 + include/linux/rtnetlink.h | 3 ++ include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 78 ++++++++++++++++++++++++++++++++++------------- 4 files changed, 62 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index c52d4b5f872a..4b24ff453aee 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -137,6 +137,7 @@ enum { IFLA_AF_SPEC, IFLA_GROUP, /* Group the device belongs to */ IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ __IFLA_MAX }; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 8e872ead88b5..577592ea0ea0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -602,6 +602,9 @@ struct tcamsg { #define TCA_ACT_TAB 1 /* attr type must be >=1 */ #define TCAA_MAX 1 +/* New extended info filters for IFLA_EXT_MASK */ +#define RTEXT_FILTER_VF (1 << 0) + /* End of information exported to user level */ #ifdef __KERNEL__ diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 678f1ffaf843..370293901971 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -6,7 +6,7 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, void *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); -typedef u16 (*rtnl_calcit_func)(struct sk_buff *); +typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); extern int __rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 65aebd450027..606a6e8f3671 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -60,7 +60,6 @@ struct rtnl_link { }; static DEFINE_MUTEX(rtnl_mutex); -static u16 min_ifinfo_dump_size; void rtnl_lock(void) { @@ -724,10 +723,11 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) } /* All VF info */ -static inline int rtnl_vfinfo_size(const struct net_device *dev) +static inline int rtnl_vfinfo_size(const struct net_device *dev, + u32 ext_filter_mask) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { - + if (dev->dev.parent && dev_is_pci(dev->dev.parent) && + (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(sizeof(struct nlattr)); size += nla_total_size(num_vfs * sizeof(struct nlattr)); @@ -766,7 +766,8 @@ static size_t rtnl_port_size(const struct net_device *dev) return port_self_size; } -static noinline size_t if_nlmsg_size(const struct net_device *dev) +static noinline size_t if_nlmsg_size(const struct net_device *dev, + u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ @@ -784,8 +785,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ - + nla_total_size(4) /* IFLA_NUM_VF */ - + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + nla_total_size(ext_filter_mask + & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ @@ -868,7 +870,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags) + unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -941,10 +943,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; copy_rtnl_link_stats64(nla_data(attr), stats); - if (dev->dev.parent) + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent + && (ext_filter_mask & RTEXT_FILTER_VF)) { int i; struct nlattr *vfinfo, *vf; @@ -1048,6 +1051,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct hlist_head *head; struct hlist_node *node; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; s_h = cb->args[0]; s_idx = cb->args[1]; @@ -1055,6 +1060,12 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->dev_base_seq; + nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + ifla_policy); + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; @@ -1064,7 +1075,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI) <= 0) + NLM_F_MULTI, + ext_filter_mask) <= 0) goto out; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -1100,6 +1112,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, + [IFLA_EXT_MASK] = { .type = NLA_U32 }, }; EXPORT_SYMBOL(ifla_policy); @@ -1509,8 +1522,6 @@ errout: if (send_addr_notify) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); - min_ifinfo_dump_size = max_t(u16, if_nlmsg_size(dev), - min_ifinfo_dump_size); return err; } @@ -1842,6 +1853,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) struct net_device *dev = NULL; struct sk_buff *nskb; int err; + u32 ext_filter_mask = 0; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1850,6 +1862,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); @@ -1861,12 +1876,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (dev == NULL) return -ENODEV; - nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, - nlh->nlmsg_seq, 0, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -1877,8 +1892,31 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) return err; } -static u16 rtnl_calcit(struct sk_buff *skb) +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + u16 min_ifinfo_dump_size = 0; + + nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ifla_policy); + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + if (!ext_filter_mask) + return NLMSG_GOODSIZE; + /* + * traverse the list of net devices and compute the minimum + * buffer size based upon the filter mask. + */ + list_for_each_entry(dev, &net->dev_base_head, dev_list) { + min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, + if_nlmsg_size(dev, + ext_filter_mask)); + } + return min_ifinfo_dump_size; } @@ -1913,13 +1951,11 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) int err = -ENOBUFS; size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL); + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); if (skb == NULL) goto errout; - min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size); - - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1977,7 +2013,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; calcit = rtnl_get_calcit(family, type); if (calcit) - min_dump_alloc = calcit(skb); + min_dump_alloc = calcit(skb, nlh); __rtnl_unlock(); rtnl = net->rtnl; -- cgit v1.2.3-55-g7522 From a5e7424d424f6398a198ead79d99e0a3c2f24ce8 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 21 Feb 2012 17:59:19 -0500 Subject: ipv4: ping: Fix recvmsg MSG_OOB error handling. Don't return an uninitialized variable as the error, return -EOPNOTSUPP instead. Reported-by: Dave Jones Signed-off-by: David S. Miller --- net/ipv4/ping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index aea5a199c37a..b072386cee21 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -630,6 +630,7 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; -- cgit v1.2.3-55-g7522 From 597cdbc2239e6019bbb2dd73b266f436166f0427 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Feb 2012 10:46:49 +0000 Subject: atm: clip: remove clip_tbl Commit 32092ecf0644 (atm: clip: Use device neigh support on top of "arp_tbl".) introduced a bug since clip_tbl is zeroed : Crash occurs in __neigh_for_each_release() idle_timer_check() must use instead arp_tbl and neigh_check_cb() should ignore non clip neighbours. Idea from David Miller. Reported-by: Meelis Roos Signed-off-by: Eric Dumazet Tested-by: Meelis Roos Signed-off-by: David S. Miller --- net/atm/clip.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index c12c2582457c..127fe70a1baa 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -46,8 +46,8 @@ static struct net_device *clip_devs; static struct atm_vcc *atmarpd; -static struct neigh_table clip_tbl; static struct timer_list idle_timer; +static const struct neigh_ops clip_neigh_ops; static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { @@ -123,6 +123,8 @@ static int neigh_check_cb(struct neighbour *n) struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; + if (n->ops != &clip_neigh_ops) + return 0; for (cv = entry->vccs; cv; cv = cv->next) { unsigned long exp = cv->last_use + cv->idle_timeout; @@ -154,10 +156,10 @@ static int neigh_check_cb(struct neighbour *n) static void idle_timer_check(unsigned long dummy) { - write_lock(&clip_tbl.lock); - __neigh_for_each_release(&clip_tbl, neigh_check_cb); + write_lock(&arp_tbl.lock); + __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); - write_unlock(&clip_tbl.lock); + write_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) -- cgit v1.2.3-55-g7522 From 5095d64db1b978bdb31d30fed9e47dbf04f729be Mon Sep 17 00:00:00 2001 From: RongQing.Li Date: Tue, 21 Feb 2012 22:10:49 +0000 Subject: ipv6: ip6_route_output() never returns NULL. ip6_route_output() never returns NULL, so it is wrong to check if the return value is NULL. Signed-off-by: RongQing.Li Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 4 +++- net/ipv6/ndisc.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index c7e95c8c579f..5aa3981a3922 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1926,8 +1926,10 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, }; dst = ip6_route_output(net, NULL, &fl6); - if (!dst) + if (dst->error) { + dst_release(dst); goto out_free; + } skb_dst_drop(skb); skb_dst_set(skb, dst); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d8f02ef88e59..c964958ac470 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1545,9 +1545,10 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL) + if (dst->error) { + dst_release(dst); return; - + } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; -- cgit v1.2.3-55-g7522 From 5d38b1f8cf8798d4df7809b3f3e38fad4d923e85 Mon Sep 17 00:00:00 2001 From: RongQing.Li Date: Tue, 21 Feb 2012 22:10:51 +0000 Subject: netfilter: ip6_route_output() never returns NULL. ip6_route_output() never returns NULL, so it is wrong to check if the return value is NULL. Signed-off-by: RongQing.Li Signed-off-by: David S. Miller --- net/netfilter/xt_TEE.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3aae66facf9f..4d5057902839 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -152,9 +152,10 @@ tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL) + if (dst->error) { + dst_release(dst); return false; - + } skb_dst_drop(skb); skb_dst_set(skb, dst); skb->dev = dst->dev; -- cgit v1.2.3-55-g7522 From 03606895cd98c0a628b17324fd7b5ff15db7e3cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Feb 2012 10:55:02 +0000 Subject: ipsec: be careful of non existing mac headers Niccolo Belli reported ipsec crashes in case we handle a frame without mac header (atm in his case) Before copying mac header, better make sure it is present. Bugzilla reference: https://bugzilla.kernel.org/show_bug.cgi?id=42809 Reported-by: Niccolò Belli Tested-by: Niccolò Belli Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 ++++++++++ net/ipv4/xfrm4_mode_beet.c | 5 +---- net/ipv4/xfrm4_mode_tunnel.c | 6 ++---- net/ipv6/xfrm6_mode_beet.c | 6 +----- net/ipv6/xfrm6_mode_tunnel.c | 6 ++---- 5 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50db9b04a552..ae86adee3746 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1465,6 +1465,16 @@ static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_mac_header_rebuild(struct sk_buff *skb) +{ + if (skb_mac_header_was_set(skb)) { + const unsigned char *old_mac = skb_mac_header(skb); + + skb_set_mac_header(skb, -skb->mac_len); + memmove(skb_mac_header(skb), old_mac, skb->mac_len); + } +} + static inline int skb_checksum_start_offset(const struct sk_buff *skb) { return skb->csum_start - skb_headroom(skb); diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 63418185f524..e3db3f915114 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); - - memmove(skb->data - skb->mac_len, skb_mac_header(skb), - skb->mac_len); - skb_set_mac_header(skb, -skb->mac_len); + skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 534972e114ac..ed4bf11ef9f4 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -66,7 +66,6 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - const unsigned char *old_mac; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -84,10 +83,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index a81ce9450750..9949a356d62c 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -80,7 +80,6 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; - const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); int err; @@ -90,10 +89,7 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) __skb_push(skb, size); skb_reset_network_header(skb); - - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 261e6e6f487e..9f2095b19ad0 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -63,7 +63,6 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; - const unsigned char *old_mac; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) goto out; @@ -80,10 +79,9 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: -- cgit v1.2.3-55-g7522 From 279072882dc0149e5740dace075e1a49f087046d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Feb 2012 12:18:38 +0100 Subject: Revert "netfilter: ctnetlink: fix soft lockup when netlink adds new entries" This reverts commit af14cca162ddcdea017b648c21b9b091e4bf1fa4. This patch contains a race condition between packets and ctnetlink in the conntrack addition. A new patch to fix this issue follows up. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 43 ++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index cc705175765c..9307b033c0c9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1367,12 +1367,15 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); + spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { + spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } + spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1466,10 +1469,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); - spin_lock_bh(&nf_conntrack_lock); nf_conntrack_hash_insert(ct); - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); rcu_read_unlock(); return ct; @@ -1490,7 +1490,6 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1513,22 +1512,25 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, zone, &otuple); + h = __nf_conntrack_find(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, zone, &rtuple); - spin_unlock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(net, zone, &rtuple); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { + struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) - return PTR_ERR(ct); - + if (IS_ERR(ct)) { + err = PTR_ERR(ct); + goto out_unlock; + } err = 0; + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1543,19 +1545,23 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } + } else + spin_unlock_bh(&nf_conntrack_lock); return err; } /* implicit 'else' */ + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ err = -EEXIST; - ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_lock); + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { + nf_conntrack_get(&ct->ct_general); + spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1564,10 +1570,15 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - } + nf_ct_put(ct); + } else + spin_unlock_bh(&nf_conntrack_lock); + + return err; } - nf_ct_put(ct); +out_unlock: + spin_unlock_bh(&nf_conntrack_lock); return err; } -- cgit v1.2.3-55-g7522 From 7d367e06688dc7a2cc98c2ace04e1296e1d987e2 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 24 Feb 2012 11:45:49 +0100 Subject: netfilter: ctnetlink: fix soft lockup when netlink adds new entries (v2) Marcell Zambo and Janos Farago noticed and reported that when new conntrack entries are added via netlink and the conntrack table gets full, soft lockup happens. This is because the nf_conntrack_lock is held while nf_conntrack_alloc is called, which is in turn wants to lock nf_conntrack_lock while evicting entries from the full table. The patch fixes the soft lockup with limiting the holding of the nf_conntrack_lock to the minimum, where it's absolutely required. It required to extend (and thus change) nf_conntrack_hash_insert so that it makes sure conntrack and ctnetlink do not add the same entry twice to the conntrack table. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 38 +++++++++++++++++++++++++---- net/netfilter/nf_conntrack_netlink.c | 46 +++++++++++++----------------------- 3 files changed, 51 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8a2b0ae7dbd2..ab86036bbf0c 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -209,7 +209,7 @@ extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); -extern void nf_conntrack_hash_insert(struct nf_conn *ct); +extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); extern void nf_ct_insert_dying_list(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 76613f5a55c0..ed86a3be678e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &net->ct.hash[repl_hash]); } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; u16 zone; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + nf_conntrack_get(&ct->ct_general); __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); /* Confirm a connection given skb; places it in hash table */ int diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b033c0c9..30c9d4ca0218 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1367,15 +1367,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1468,8 +1465,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = ktime_to_ns(ktime_get_real()); - add_timer(&ct->timeout); - nf_conntrack_hash_insert(ct); + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + rcu_read_unlock(); return ct; @@ -1490,6 +1489,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1510,27 +1510,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; } - spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1540,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1561,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } -- cgit v1.2.3-55-g7522 From bff528578fc3c4a14227b052a313109b5ffb73da Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 24 Feb 2012 08:08:20 +0000 Subject: gre: fix spelling in comments The original spelling and bad word choice makes these comments hard to read. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6b3ca5ba4450..38673d2860e2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -65,7 +65,7 @@ it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), and silently drop packet when it expires. It is a good - solution, but it supposes maintaing new variable in ALL + solution, but it supposes maintaining new variable in ALL skb, even if no tunneling is used. Current solution: xmit_recursion breaks dead loops. This is a percpu @@ -91,14 +91,14 @@ One of them is to parse packet trying to detect inner encapsulation made by our node. It is difficult or even impossible, especially, - taking into account fragmentation. TO be short, tt is not solution at all. + taking into account fragmentation. TO be short, ttl is not solution at all. Current solution: The solution was UNEXPECTEDLY SIMPLE. We force DF flag on tunnels with preconfigured hop limit, that is ALL. :-) Well, it does not remove the problem completely, but exponential growth of network traffic is changed to linear (branches, that exceed pmtu are pruned) and tunnel mtu - fastly degrades to value <68, where looping stops. + rapidly degrades to value <68, where looping stops. Yes, it is not good if there exists a router in the loop, which does not force DF, even when encapsulating packets have DF set. But it is not our problem! Nobody could accuse us, we made @@ -457,8 +457,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) GRE tunnels with enabled checksum. Tell them "thank you". Well, I wonder, rfc1812 was written by Cisco employee, - what the hell these idiots break standrads established - by themself??? + what the hell these idiots break standards established + by themselves??? */ const struct iphdr *iph = (const struct iphdr *)skb->data; -- cgit v1.2.3-55-g7522