From d48102007d068df7ba3055cdc1723e12db1ba30f Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Wed, 18 May 2005 22:51:45 -0700 Subject: [XFRM]: skb_cow_data() does not set proper owner for new skbs. It looks like skb_cow_data() does not set proper owner for newly created skb. If we have several fragments for skb and some of them are shared(?) or cloned (like in async IPsec) there might be a situation when we require recreating skb and thus using skb_copy() for it. Newly created skb has neither a destructor nor a socket assotiated with it, which must be copied from the old skb. As far as I can see, current code sets destructor and socket for the first one skb only and uses truesize of the first skb only to increment sk_wmem_alloc value. If above "analysis" is correct then attached patch fixes that. Signed-off-by: Evgeniy Polyakov Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_algo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 080aae243ce0..2f4531fcaca2 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -698,7 +698,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; if (skb1->sk) - skb_set_owner_w(skb, skb1->sk); + skb_set_owner_w(skb2, skb1->sk); /* Looking around. Are we still alive? * OK, link new skb, drop old one */ -- cgit v1.2.3-55-g7522 From 2fdba6b085eb7068e9594cfa55ffe40466184b4d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 May 2005 22:52:33 -0700 Subject: [IPV4/IPV6] Ensure all frag_list members have NULL sk Having frag_list members which holds wmem of an sk leads to nightmares with partially cloned frag skb's. The reason is that once you unleash a skb with a frag_list that has individual sk ownerships into the stack you can never undo those ownerships safely as they may have been cloned by things like netfilter. Since we have to undo them in order to make skb_linearize happy this approach leads to a dead-end. So let's go the other way and make this an invariant: For any skb on a frag_list, skb->sk must be NULL. That is, the socket ownership always belongs to the head skb. It turns out that the implementation is actually pretty simple. The above invariant is actually violated in the following patch for a short duration inside ip_fragment. This is OK because the offending frag_list member is either destroyed at the end of the slow path without being sent anywhere, or it is detached from the frag_list before being sent. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 ++++++++ net/ipv6/ip6_output.c | 14 ++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index daebd93fd8a0..760dc8238d65 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -490,6 +490,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } /* Everything is OK. Generate! 
*/ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0f0711417c9d..b78a53586804 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -552,13 +552,17 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb_headroom(frag) < hlen) goto slow_path; - /* Correct socket ownership. */ - if (frag->sk == NULL) - goto slow_path; - /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } err = 0; @@ -1116,12 +1120,10 @@ int ip6_push_pending_frames(struct sock *sk) tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; -#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */ skb->truesize += tmp_skb->truesize; __sock_put(tmp_skb->sk); tmp_skb->destructor = NULL; tmp_skb->sk = NULL; -#endif } ipv6_addr_copy(final_dst, &fl->fl6_dst); -- cgit v1.2.3-55-g7522 From f81a0bffa116ea22149aa7cfb0b4ee09096d9d92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 May 2005 12:26:43 -0700 Subject: [AF_UNIX]: Use lookup_create(). currently it opencodes it, but that's in the way of chaning the lookup_hash interface. I'd prefer to disallow modular af_unix over exporting lookup_create, but I'll leave that to you. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/namei.c | 1 + net/unix/af_unix.c | 28 +++------------------------- 2 files changed, 4 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/fs/namei.c b/fs/namei.c index defe6781e003..dd78f01b6de8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1580,6 +1580,7 @@ enoent: fail: return dentry; } +EXPORT_SYMBOL_GPL(lookup_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c478fc8db776..c420eba4876b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -770,33 +770,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd); if (err) goto out_mknod_parent; - /* - * Yucky last component or no last component at all? - * (foo/., foo/.., /////) - */ - err = -EEXIST; - if (nd.last_type != LAST_NORM) - goto out_mknod; - /* - * Lock the directory. - */ - down(&nd.dentry->d_inode->i_sem); - /* - * Do the final lookup. - */ - dentry = lookup_hash(&nd.last, nd.dentry); + + dentry = lookup_create(&nd, 0); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_mknod_unlock; - err = -ENOENT; - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (nd.last.name[nd.last.len] && !dentry->d_inode) - goto out_mknod_dput; + /* * All right, let's create it. */ @@ -845,7 +824,6 @@ out_mknod_dput: dput(dentry); out_mknod_unlock: up(&nd.dentry->d_inode->i_sem); -out_mknod: path_release(&nd); out_mknod_parent: if (err==-EEXIST) -- cgit v1.2.3-55-g7522 From d9fa0f392b20b2b8e3df379c44194492a2446c6e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 19 May 2005 12:29:59 -0700 Subject: [IP_VS]: Remove extra __ip_vs_conn_put() for incoming ICMP. Remove extra __ip_vs_conn_put for incoming ICMP in direct routing mode. Mark de Vries reports that IPVS connections are not leaked anymore. 
Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_xmit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index faa6176bbeb1..de21da00057f 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -508,7 +508,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); - __ip_vs_conn_put(cp); goto out; } -- cgit v1.2.3-55-g7522 From 8be58932ca596972e4953ae980d8bc286857cae8 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Thu, 19 May 2005 12:36:33 -0700 Subject: [NETFILTER]: Do not be clever about SKB ownership in ip_ct_gather_frags(). Just do an skb_orphan() and be done with it. Based upon discussions with Herbert Xu on netdev. Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 28d9425d5c39..09e824622977 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -940,37 +940,25 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { - struct sock *sk = skb->sk; #ifdef CONFIG_NETFILTER_DEBUG unsigned int olddebug = skb->nf_debug; #endif - if (sk) { - sock_hold(sk); - skb_orphan(skb); - } + skb_orphan(skb); local_bh_disable(); skb = ip_defrag(skb, user); local_bh_enable(); - if (!skb) { - if (sk) - sock_put(sk); - return skb; - } - - if (sk) { - skb_set_owner_w(skb, sk); - sock_put(sk); - } - - ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; + if (skb) { + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; #ifdef CONFIG_NETFILTER_DEBUG - /* Packet path as if nothing had happened. */ - skb->nf_debug = olddebug; + /* Packet path as if nothing had happened. */ + skb->nf_debug = olddebug; #endif + } + return skb; } -- cgit v1.2.3-55-g7522 From b9e9dead05b19e7f52c9aa00cd3a5b7ac4fcacf4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 May 2005 12:39:04 -0700 Subject: [IPSEC]: Fixed alg_key_len usage in attach_one_algo The variable alg_key_len is in bits and not bytes. The function attach_one_algo is currently using it as if it were in bytes. This causes it to read memory which may not be there. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/xfrm/xfrm_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5ddda2c98af9..15ba08602aa1 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -162,6 +162,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, struct rtattr *rta = u_arg; struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; + int len; if (!rta) return 0; @@ -173,11 +174,12 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, return -ENOSYS; *props = algo->desc.sadb_alg_id; - p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL); + len = sizeof(*ualg) + (ualg->alg_key_len + 7U) / 8; + p = kmalloc(len, GFP_KERNEL); if (!p) return -ENOMEM; - memcpy(p, ualg, sizeof(*ualg) + ualg->alg_key_len); + memcpy(p, ualg, len); *algpp = p; return 0; } -- cgit v1.2.3-55-g7522 From 31c26852cb2ac77f1d4acb37bcf31f165fd5eb68 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 May 2005 12:39:49 -0700 Subject: [IPSEC]: Verify key payload in verify_one_algo We need to verify that the payload contains enough data so that attach_one_algo can copy alg_key_len bits from the payload. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 15ba08602aa1..97509011c274 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -34,14 +34,21 @@ static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type) { struct rtattr *rt = xfrma[type - 1]; struct xfrm_algo *algp; + int len; if (!rt) return 0; - if ((rt->rta_len - sizeof(*rt)) < sizeof(*algp)) + len = (rt->rta_len - sizeof(*rt)) - sizeof(*algp); + if (len < 0) return -EINVAL; algp = RTA_DATA(rt); + + len -= (algp->alg_key_len + 7U) / 8; + if (len < 0) + return -EINVAL; + switch (type) { case XFRMA_ALG_AUTH: if (!algp->alg_key_len && -- cgit v1.2.3-55-g7522 From db61ecc3352d72513c1b07805bd6f760e30c001b Mon Sep 17 00:00:00 2001 From: Tommy S. Christensen Date: Thu, 19 May 2005 12:46:59 -0700 Subject: [NETLINK]: Fix race with recvmsg(). This bug causes: assertion (!atomic_read(&sk->sk_rmem_alloc)) failed at net/netlink/af_netlink.c (122) What's happening is that: 1) The skb is sent to socket 1. 2) Someone does a recvmsg on socket 1 and drops the ref on the skb. Note that the rmalloc is not returned at this point since the skb is still referenced. 3) The same skb is now sent to socket 2. This version of the fix resurrects the skb_orphan call that was moved out, last time we had 'shared-skb troubles'. It is practically a no-op in the common case, but still prevents the possible race with recvmsg. Signed-off-by: Tommy S. Christensen Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 733bf52cef3e..b40c9a976969 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -697,6 +697,7 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { + skb_orphan(skb); skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); -- cgit v1.2.3-55-g7522 From 68acc024ea7391e03c2c695ba0b9fb31baa974bf Mon Sep 17 00:00:00 2001 From: Tommy S. 
Christensen Date: Thu, 19 May 2005 13:06:35 -0700 Subject: [NETLINK]: Move broadcast skb_orphan to the skb_get path. Cloned packets don't need the orphan call. Signed-off-by: Tommy S. Christensen Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b40c9a976969..4b91f4b84cb7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -697,7 +697,6 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { - skb_orphan(skb); skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); @@ -736,11 +735,15 @@ static inline int do_one_broadcast(struct sock *sk, sock_hold(sk); if (p->skb2 == NULL) { - if (atomic_read(&p->skb->users) != 1) { + if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { - p->skb2 = p->skb; - atomic_inc(&p->skb->users); + p->skb2 = skb_get(p->skb); + /* + * skb ownership may have been set when + * delivered to a previous socket. + */ + skb_orphan(p->skb2); } } if (p->skb2 == NULL) { -- cgit v1.2.3-55-g7522 From aa1c6a6f7f0518b42994d02756a41cbfdcac1916 Mon Sep 17 00:00:00 2001 From: Tommy S. Christensen Date: Thu, 19 May 2005 13:07:32 -0700 Subject: [NETLINK]: Defer socket destruction a bit In netlink_broadcast() we're sending shared skb's to netlink listeners when possible (saves some copying). This is OK, since we hold the only other reference to the skb. However, this implies that we must drop our reference on the skb, before allowing a receiving socket to disappear. Otherwise, the socket buffer accounting is disrupted. Signed-off-by: Tommy S. Christensen Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4b91f4b84cb7..e41ce458c2a9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -789,11 +789,12 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); + kfree_skb(skb); + netlink_unlock_table(); if (info.skb2) kfree_skb(info.skb2); - kfree_skb(skb); if (info.delivered) { if (info.congested && (allocation & __GFP_WAIT)) -- cgit v1.2.3-55-g7522 From 314324121f9b94b2ca657a494cf2b9cb0e4a28cc Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Mon, 23 May 2005 12:03:06 -0700 Subject: [TCP]: Fix stretch ACK performance killer when doing ucopy. When we are doing ucopy, we try to defer the ACK generation to cleanup_rbuf(). This works most of the time very well, but if the ucopy prequeue is large, this ACKing behavior kills performance. With TSO, it is possible to fill the prequeue so large that by the time the ACK is sent and gets back to the sender, most of the window has emptied of data and performance suffers significantly. This behavior does help in some cases, so we should think about re-enabling this trick in the future, using some kind of limit in order to avoid the bug case. Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 79835a67a274..5bad504630a3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4355,16 +4355,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - if (eaten) { - if (tcp_in_quickack_mode(tp)) { - tcp_send_ack(sk); - } else { - tcp_send_delayed_ack(sk); - } - } else { - __tcp_ack_snd_check(sk, 0); - } - + __tcp_ack_snd_check(sk, 0); no_ack: if (eaten) __kfree_skb(skb); -- cgit v1.2.3-55-g7522 From 180e42503300629692b513daeb55a6bb0b51500c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 23 May 2005 13:11:07 -0700 Subject: [IPV6]: Fix xfrm tunnel oops with large packets Signed-off-by: Herbert Xu Acked-by: Hideaki YOSHIFUJI Signed-off-by: David S. Miller --- net/ipv6/xfrm6_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 601a148f60f3..6b9867717d11 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -84,6 +84,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) mtu = IPV6_MIN_MTU; if (skb->len > mtu) { + skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); ret = -EMSGSIZE; } -- cgit v1.2.3-55-g7522 From 0afb51e72855971dba83b3c6b70c547c2d1161fd Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 26 May 2005 12:53:49 -0700 Subject: [PKT_SCHED]: netem: reinsert for duplication Handle duplication of packets in netem by re-inserting at top of qdisc tree. This avoid problems with qlen accounting with nested qdisc. This recursion requires no additional locking but will potentially increase stack depth. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 53 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e0c9fbe73b15..5c0f0c209a4c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -203,42 +203,47 @@ static int netem_run(struct Qdisc *sch) return 0; } +/* + * Insert one skb into qdisc. + * Note: parent depends on return value to account for queue length. + * NET_XMIT_DROP: queue length didn't change. + * NET_XMIT_SUCCESS: one skb was queued. + */ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb2; int ret; + int count = 1; pr_debug("netem_enqueue skb=%p\n", skb); + /* Random duplication */ + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) + ++count; + /* Random packet drop 0 => none, ~0 => all */ - if (q->loss && q->loss >= get_crandom(&q->loss_cor)) { - pr_debug("netem_enqueue: random loss\n"); + if (q->loss && q->loss >= get_crandom(&q->loss_cor)) + --count; + + if (count == 0) { sch->qstats.drops++; kfree_skb(skb); - return 0; /* lie about loss so TCP doesn't know */ + return NET_XMIT_DROP; } - /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) { - struct sk_buff *skb2; - - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) { - struct Qdisc *qp; - - /* Since one packet can generate two packets in the - * queue, the parent's qlen accounting gets confused, - * so fix it. 
- */ - qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent)); - if (qp) - qp->q.qlen++; - - sch->q.qlen++; - sch->bstats.bytes += skb2->len; - sch->bstats.packets++; - } else - sch->qstats.drops++; + /* + * If we need to duplicate packet, then re-insert at top of the + * qdisc tree, since parent queuer expects that only one + * skb will be queued. + */ + if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { + struct Qdisc *rootq = sch->dev->qdisc; + u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ + q->duplicate = 0; + + rootq->enqueue(skb2, rootq); + q->duplicate = dupsave; } /* If doing simple delay then gap == 0 so all packets -- cgit v1.2.3-55-g7522 From 0f9f32ac65ee4a452a912a8440cebbc4dff73852 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 26 May 2005 12:55:01 -0700 Subject: [PKT_SCHED] netem: use only inner qdisc -- no private skbuff queue Netem works better if there if packets are just queued in the inner discipline rather than having a separate delayed queue. Change to use the dequeue/requeue to peek like TBF does. By doing this potential qlen problems with the old method are avoided. The problems happened when the netem_run that moved packets from the inner discipline to the nested discipline failed (because inner queue was full). This happened in dequeue, so the effective qlen of the netem would be decreased (because of the drop), but there was no way to keep the outer qdisc (caller of netem dequeue) in sync. The problem window is still there since this patch doesn't address the issue of requeue failing in netem_dequeue, but that shouldn't happen since the sequence dequeue/requeue should always work. Long term correct fix is to implement qdisc->peek in all the qdisc's to allow for this (needed by several other qdisc's as well). Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 124 +++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5c0f0c209a4c..48360f7eec5d 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -53,7 +53,6 @@ struct netem_sched_data { struct Qdisc *qdisc; - struct sk_buff_head delayed; struct timer_list timer; u32 latency; @@ -137,72 +136,6 @@ static long tabledist(unsigned long mu, long sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -/* Put skb in the private delayed queue. */ -static int netem_delay(struct Qdisc *sch, struct sk_buff *skb) -{ - struct netem_sched_data *q = qdisc_priv(sch); - psched_tdiff_t td; - psched_time_t now; - - PSCHED_GET_TIME(now); - td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); - - /* Always queue at tail to keep packets in order */ - if (likely(q->delayed.qlen < q->limit)) { - struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; - - PSCHED_TADD2(now, td, cb->time_to_send); - - pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb, - now, cb->time_to_send); - - __skb_queue_tail(&q->delayed, skb); - return NET_XMIT_SUCCESS; - } - - pr_debug("netem_delay: queue over limit %d\n", q->limit); - sch->qstats.overlimits++; - kfree_skb(skb); - return NET_XMIT_DROP; -} - -/* - * Move a packet that is ready to send from the delay holding - * list to the underlying qdisc. 
- */ -static int netem_run(struct Qdisc *sch) -{ - struct netem_sched_data *q = qdisc_priv(sch); - struct sk_buff *skb; - psched_time_t now; - - PSCHED_GET_TIME(now); - - skb = skb_peek(&q->delayed); - if (skb) { - const struct netem_skb_cb *cb - = (const struct netem_skb_cb *)skb->cb; - long delay - = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); - pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); - - /* if more time remaining? */ - if (delay > 0) { - mod_timer(&q->timer, jiffies + delay); - return 1; - } - - __skb_unlink(skb, &q->delayed); - - if (q->qdisc->enqueue(skb, q->qdisc)) { - sch->q.qlen--; - sch->qstats.drops++; - } - } - - return 0; -} - /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -212,6 +145,7 @@ static int netem_run(struct Qdisc *sch) static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; struct sk_buff *skb2; int ret; int count = 1; @@ -246,18 +180,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->duplicate = dupsave; } - /* If doing simple delay then gap == 0 so all packets - * go into the delayed holding queue - * otherwise if doing out of order only "1 out of gap" - * packets will be delayed. + /* + * Do re-ordering by putting one out of N packets at the front + * of the queue. + * gap == 0 is special case for no-reordering. */ - if (q->counter < q->gap) { + if (q->gap == 0 || q->counter != q->gap) { + psched_time_t now; + PSCHED_GET_TIME(now); + PSCHED_TADD2(now, + tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist), + cb->time_to_send); + ++q->counter; ret = q->qdisc->enqueue(skb, q->qdisc); } else { q->counter = 0; - ret = netem_delay(sch, skb); - netem_run(sch); + PSCHED_GET_TIME(cb->time_to_send); + ret = q->qdisc->ops->requeue(skb, q->qdisc); } if (likely(ret == NET_XMIT_SUCCESS)) { @@ -301,22 +241,33 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - int pending; - - pending = netem_run(sch); skb = q->qdisc->dequeue(q->qdisc); if (skb) { - pr_debug("netem_dequeue: return skb=%p\n", skb); - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - } - else if (pending) { - pr_debug("netem_dequeue: throttling\n"); + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + psched_time_t now; + long delay; + + /* if more time remaining? 
*/ + PSCHED_GET_TIME(now); + delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); + if (delay <= 0) { + pr_debug("netem_dequeue: return skb=%p\n", skb); + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + mod_timer(&q->timer, jiffies + delay); sch->flags |= TCQ_F_THROTTLED; - } - return skb; + if (q->qdisc->ops->requeue(skb, q->qdisc) != 0) + sch->qstats.drops++; + } + + return NULL; } static void netem_watchdog(unsigned long arg) @@ -333,8 +284,6 @@ static void netem_reset(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); - skb_queue_purge(&q->delayed); - sch->q.qlen = 0; sch->flags &= ~TCQ_F_THROTTLED; del_timer_sync(&q->timer); @@ -460,7 +409,6 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) if (!opt) return -EINVAL; - skb_queue_head_init(&q->delayed); init_timer(&q->timer); q->timer.function = netem_watchdog; q->timer.data = (unsigned long) sch; -- cgit v1.2.3-55-g7522 From 0dca51d362b8e4af6b0dbc9e54d1e5165341918a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 26 May 2005 12:55:48 -0700 Subject: [PKT_SCHED] netem: allow random reordering (with fix) Here is a fixed up version of the reorder feature of netem. It is the same as the earlier patch plus with the bugfix from Julio merged in. Has expected backwards compatibility behaviour. Go ahead and merge this one, the TCP strangeness I was seeing was due to the reordering bug, and previous version of TSO patch. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 9 +++++++- net/sched/sch_netem.c | 54 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 73d84c071cb1..1d9da36eb9db 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -427,6 +427,7 @@ enum TCA_NETEM_UNSPEC, TCA_NETEM_CORR, TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, __TCA_NETEM_MAX, }; @@ -437,7 +438,7 @@ struct tc_netem_qopt __u32 latency; /* added delay (us) */ __u32 limit; /* fifo limit (packets) */ __u32 loss; /* random packet loss (0=none ~0=100%) */ - __u32 gap; /* re-ordering gap (0 for delay all) */ + __u32 gap; /* re-ordering gap (0 for none) */ __u32 duplicate; /* random packet dup (0=none ~0=100%) */ __u32 jitter; /* random jitter in latency (us) */ }; @@ -449,6 +450,12 @@ struct tc_netem_corr __u32 dup_corr; /* duplicate correlation */ }; +struct tc_netem_reorder +{ + __u32 probability; + __u32 correlation; +}; + #define NETEM_DIST_SCALE 8192 #endif diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 48360f7eec5d..bb9bf8d5003c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -62,11 +62,12 @@ struct netem_sched_data { u32 gap; u32 jitter; u32 duplicate; + u32 reorder; struct crndstate { unsigned long last; unsigned long rho; - } delay_cor, loss_cor, dup_cor; + } delay_cor, loss_cor, dup_cor, reorder_cor; struct disttable { u32 size; @@ -180,23 +181,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->duplicate = dupsave; } - /* - * Do re-ordering by putting one out of N packets at the front - * of the queue. - * gap == 0 is special case for no-reordering. 
- */ - if (q->gap == 0 || q->counter != q->gap) { + if (q->gap == 0 /* not doing reordering */ + || q->counter < q->gap /* inside last reordering gap */ + || q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; PSCHED_GET_TIME(now); - PSCHED_TADD2(now, - tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist), + PSCHED_TADD2(now, tabledist(q->latency, q->jitter, + &q->delay_cor, q->delay_dist), cb->time_to_send); - ++q->counter; ret = q->qdisc->enqueue(skb, q->qdisc); } else { - q->counter = 0; + /* + * Do re-ordering by putting one out of N packets at the front + * of the queue. + */ PSCHED_GET_TIME(cb->time_to_send); + q->counter = 0; ret = q->qdisc->ops->requeue(skb, q->qdisc); } @@ -351,6 +352,19 @@ static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) return 0; } +static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_reorder *r = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*r)) + return -EINVAL; + + q->reorder = r->probability; + init_crandom(&q->reorder_cor, r->correlation); + return 0; +} + static int netem_change(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -371,9 +385,15 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) q->jitter = qopt->jitter; q->limit = qopt->limit; q->gap = qopt->gap; + q->counter = 0; q->loss = qopt->loss; q->duplicate = qopt->duplicate; + /* for compatiablity with earlier versions. + * if gap is set, need to assume 100% probablity + */ + q->reorder = ~0; + /* Handle nested options after initial queue options. * Should have put all options in nested format but too late now. */ @@ -395,6 +415,11 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) if (ret) return ret; } + if (tb[TCA_NETEM_REORDER-1]) { + ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); + if (ret) + return ret; + } } @@ -412,7 +437,6 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) init_timer(&q->timer); q->timer.function = netem_watchdog; q->timer.data = (unsigned long) sch; - q->counter = 0; q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); if (!q->qdisc) { @@ -444,6 +468,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct rtattr *rta = (struct rtattr *) b; struct tc_netem_qopt qopt; struct tc_netem_corr cor; + struct tc_netem_reorder reorder; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -457,6 +482,11 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + + reorder.probability = q->reorder; + reorder.correlation = q->reorder_cor.rho; + RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + rta->rta_len = skb->tail - b; return skb->len; -- cgit v1.2.3-55-g7522 From 92d63decc0b6a5d600f792fcf5f3ff9718c09a3d Mon Sep 17 00:00:00 2001 From: Hideaki YOSHIFUJI Date: Thu, 26 May 2005 12:58:04 -0700 Subject: From: Kazunori Miyazawa [XFRM] Call dst_check() with appropriate cookie This fixes infinite loop issue with IPv6 tunnel mode. Signed-off-by: Kazunori Miyazawa Signed-off-by: Hideaki YOSHIFUJI Signed-off-by: David S. 
Miller --- include/net/xfrm.h | 2 ++ net/ipv6/xfrm6_policy.c | 4 ++++ net/xfrm/xfrm_policy.c | 4 ++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e142a256d5dc..d675836ba6c3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -515,6 +515,8 @@ struct xfrm_dst struct dst_entry *route; u32 route_mtu_cached; u32 child_mtu_cached; + u32 route_cookie; + u32 path_cookie; }; static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 4429b1a1fe5f..cf1d91e74c82 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -113,6 +113,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int xdst = (struct xfrm_dst *)dst1; xdst->route = &rt->u.dst; + if (rt->rt6i_node) + xdst->route_cookie = rt->rt6i_node->fn_sernum; dst1->next = dst_prev; dst_prev = dst1; @@ -137,6 +139,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int dst_prev->child = &rt->u.dst; dst->path = &rt->u.dst; + if (rt->rt6i_node) + ((struct xfrm_dst *)dst)->path_cookie = rt->rt6i_node->fn_sernum; *dst_p = dst; dst = dst_prev; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 55ed979db144..d07f5ce31824 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1136,7 +1136,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family) struct xfrm_dst *last; u32 mtu; - if (!dst_check(dst->path, 0) || + if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; @@ -1156,7 +1156,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family) xdst->child_mtu_cached = mtu; } - if (!dst_check(xdst->route, 0)) + if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { -- cgit v1.2.3-55-g7522 From c6b3365391c626206f6789354794a81a010cb7a1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 26 May 2005 12:59:05 -0700 Subject: [TOKENRING]: be'ify trh_hdr, trllc, rif_cache_s Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/if_tr.h | 6 +++--- net/802/tr.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/if_tr.h b/include/linux/if_tr.h index a2b01e1e72f2..6688b414c529 100644 --- a/include/linux/if_tr.h +++ b/include/linux/if_tr.h @@ -36,8 +36,8 @@ struct trh_hdr { __u8 fc; /* frame control field */ __u8 daddr[TR_ALEN]; /* destination address */ __u8 saddr[TR_ALEN]; /* source address */ - __u16 rcf; /* route control field */ - __u16 rseg[8]; /* routing registers */ + __be16 rcf; /* route control field */ + __be16 rseg[8]; /* routing registers */ }; #ifdef __KERNEL__ @@ -55,7 +55,7 @@ struct trllc { __u8 ssap; /* source SAP */ __u8 llc; /* LLC control field */ __u8 protid[3]; /* protocol id */ - __u16 ethertype; /* ether type field */ + __be16 ethertype; /* ether type field */ }; /* Token-Ring statistics collection data. 
*/ diff --git a/net/802/tr.c b/net/802/tr.c index 85293ccf7efc..3bfb86260f27 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -50,8 +50,8 @@ static void rif_check_expire(unsigned long dummy); struct rif_cache_s { unsigned char addr[TR_ALEN]; int iface; - __u16 rcf; - __u16 rseg[8]; + __be16 rcf; + __be16 rseg[8]; struct rif_cache_s *next; unsigned long last_used; unsigned char local_ring; -- cgit v1.2.3-55-g7522 From c8b35d2a29ec3c93e3b9c1e70d649a77a214b1c1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 26 May 2005 12:59:42 -0700 Subject: [TOKENRING]: net/802/tr.c: s/struct rif_cache_s/struct rif_cache/ "_s" suffix is certainly of hungarian origin. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/802/tr.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/802/tr.c b/net/802/tr.c index 3bfb86260f27..a755e880f4ba 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -47,12 +47,12 @@ static void rif_check_expire(unsigned long dummy); * Each RIF entry we learn is kept this way */ -struct rif_cache_s { +struct rif_cache { unsigned char addr[TR_ALEN]; int iface; __be16 rcf; __be16 rseg[8]; - struct rif_cache_s *next; + struct rif_cache *next; unsigned long last_used; unsigned char local_ring; }; @@ -64,7 +64,7 @@ struct rif_cache_s { * up a lot. */ -static struct rif_cache_s *rif_table[RIF_TABLE_SIZE]; +static struct rif_cache *rif_table[RIF_TABLE_SIZE]; static DEFINE_SPINLOCK(rif_lock); @@ -249,7 +249,7 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device * { int slack; unsigned int hash; - struct rif_cache_s *entry; + struct rif_cache *entry; unsigned char *olddata; static const unsigned char mcast_func_addr[] = {0xC0,0x00,0x00,0x04,0x00,0x00}; @@ -337,7 +337,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0], static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev) { unsigned int hash, rii_p = 0; - struct rif_cache_s *entry; + struct rif_cache *entry; spin_lock_bh(&rif_lock); @@ -373,7 +373,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n", * FIXME: We ought to keep some kind of cache size * limiting and adjust the timers to suit. 
*/ - entry=kmalloc(sizeof(struct rif_cache_s),GFP_ATOMIC); + entry=kmalloc(sizeof(struct rif_cache),GFP_ATOMIC); if(!entry) { @@ -435,7 +435,7 @@ static void rif_check_expire(unsigned long dummy) spin_lock_bh(&rif_lock); for(i =0; i < RIF_TABLE_SIZE; i++) { - struct rif_cache_s *entry, **pentry; + struct rif_cache *entry, **pentry; pentry = rif_table+i; while((entry=*pentry) != NULL) { @@ -467,10 +467,10 @@ static void rif_check_expire(unsigned long dummy) #ifdef CONFIG_PROC_FS -static struct rif_cache_s *rif_get_idx(loff_t pos) +static struct rif_cache *rif_get_idx(loff_t pos) { int i; - struct rif_cache_s *entry; + struct rif_cache *entry; loff_t off = 0; for(i = 0; i < RIF_TABLE_SIZE; i++) @@ -493,7 +493,7 @@ static void *rif_seq_start(struct seq_file *seq, loff_t *pos) static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int i; - struct rif_cache_s *ent = v; + struct rif_cache *ent = v; ++*pos; @@ -522,7 +522,7 @@ static void rif_seq_stop(struct seq_file *seq, void *v) static int rif_seq_show(struct seq_file *seq, void *v) { int j, rcf_len, segment, brdgnmb; - struct rif_cache_s *entry = v; + struct rif_cache *entry = v; if (v == SEQ_START_TOKEN) seq_puts(seq, -- cgit v1.2.3-55-g7522 From d8a33ac435c43a1a404b2ec560ef1d1536710c36 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 May 2005 14:13:47 -0700 Subject: [BRIDGE]: features change notification Resend of earlier patch (no changes) from Catalin used to provide device feature change notification. Signed-off-by: Catalin BOIE Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/linux/notifier.h | 1 + net/core/dev.c | 12 ++++++++++++ net/core/ethtool.c | 8 +++++++- 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b25bd02720d3..d8c65ecef9d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -913,6 +913,7 @@ extern void dev_mc_discard(struct net_device *dev); extern void dev_set_promiscuity(struct net_device *dev, int inc); extern void dev_set_allmulti(struct net_device *dev, int inc); extern void netdev_state_change(struct net_device *dev); +extern void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ extern void dev_load(const char *name); extern void dev_mcast_init(void); diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 9303a003e9ab..5937dd6053c3 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -56,6 +56,7 @@ extern int notifier_call_chain(struct notifier_block **n, unsigned long val, voi #define NETDEV_CHANGEADDR 0x0008 #define NETDEV_GOING_DOWN 0x0009 #define NETDEV_CHANGENAME 0x000A +#define NETDEV_FEAT_CHANGE 0x000B #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN diff --git a/net/core/dev.c b/net/core/dev.c index d4d9e2680adb..f15a3ffff635 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -760,6 +760,18 @@ int dev_change_name(struct net_device *dev, char *newname) return err; } +/** + * netdev_features_change - device changes fatures + * @dev: device to cause notification + * + * Called to indicate a device has changed features. 
+ */ +void netdev_features_change(struct net_device *dev) +{ + notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + /** * netdev_state_change - device changes state * @dev: device to cause notification diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f05fde97c43d..252bfc6f03f4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -682,6 +682,7 @@ int dev_ethtool(struct ifreq *ifr) void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; + int old_features; /* * XXX: This can be pushed down into the ethtool_* handlers that @@ -703,6 +704,8 @@ int dev_ethtool(struct ifreq *ifr) if ((rc = dev->ethtool_ops->begin(dev)) < 0) return rc; + old_features = dev->features; + switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); @@ -712,7 +715,6 @@ int dev_ethtool(struct ifreq *ifr) break; case ETHTOOL_GDRVINFO: rc = ethtool_get_drvinfo(dev, useraddr); - break; case ETHTOOL_GREGS: rc = ethtool_get_regs(dev, useraddr); @@ -801,6 +803,10 @@ int dev_ethtool(struct ifreq *ifr) if(dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + return rc; ioctl: -- cgit v1.2.3-55-g7522 From 81e8157583c559c27aac75c708d40a35f563d734 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 May 2005 14:14:35 -0700 Subject: [BRIDGE]: make dev->features unsigned The features field in netdevice is really a bitmask, and bitmask's should be unsigned. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/ethtool.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8c65ecef9d9..470af8c1a4a0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -401,7 +401,7 @@ struct net_device } reg_state; /* Net device features */ - int features; + unsigned long features; #define NETIF_F_SG 1 /* Scatter/gather IO. */ #define NETIF_F_IP_CSUM 2 /* Can checksum only TCP/UDP over IPv4. */ #define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 252bfc6f03f4..2a56a5218363 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -682,7 +682,7 @@ int dev_ethtool(struct ifreq *ifr) void __user *useraddr = ifr->ifr_data; u32 ethcmd; int rc; - int old_features; + unsigned long old_features; /* * XXX: This can be pushed down into the ethtool_* handlers that -- cgit v1.2.3-55-g7522 From 81d35307dd468b92fe8c58797abb13c62e3e64dd Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 May 2005 14:15:17 -0700 Subject: [BRIDGE]: set features based on enslaved devices Make features of the bridge pseudo-device be a subset of the underlying devices. Motivated by Xen and others who use bridging to do failover. Signed-off-by: Catalin BOIE Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/bridge/br_device.c | 15 +++++++-------- net/bridge/br_if.c | 23 +++++++++++++++++++++++ net/bridge/br_notify.c | 9 +++++++++ net/bridge/br_private.h | 1 + 4 files changed, 40 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index d9b72fde433c..f564ee99782d 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -21,10 +21,7 @@ static struct net_device_stats *br_dev_get_stats(struct net_device *dev) { - struct net_bridge *br; - - br = dev->priv; - + struct net_bridge *br = netdev_priv(dev); return &br->statistics; } @@ -54,9 +51,11 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev) static int br_dev_open(struct net_device *dev) { - netif_start_queue(dev); + struct net_bridge *br = netdev_priv(dev); - br_stp_enable_bridge(dev->priv); + br_features_recompute(br); + netif_start_queue(dev); + br_stp_enable_bridge(br); return 0; } @@ -67,7 +66,7 @@ static void br_dev_set_multicast_list(struct net_device *dev) static int br_dev_stop(struct net_device *dev) { - br_stp_disable_bridge(dev->priv); + br_stp_disable_bridge(netdev_priv(dev)); netif_stop_queue(dev); @@ -76,7 +75,7 @@ static int br_dev_stop(struct net_device *dev) static int br_change_mtu(struct net_device *dev, int new_mtu) { - if ((new_mtu < 68) || new_mtu > br_min_mtu(dev->priv)) + if (new_mtu < 68 || new_mtu > br_min_mtu(netdev_priv(dev))) return -EINVAL; dev->mtu = new_mtu; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 69872bf3b87e..91bb895375f4 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -314,6 +314,28 @@ int br_min_mtu(const struct net_bridge *br) return mtu; } +/* + * Recomputes features using slave's features + */ +void br_features_recompute(struct net_bridge *br) +{ + struct net_bridge_port *p; + unsigned long features, checksum; + + features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_HIGHDMA | NETIF_F_TSO; + checksum = NETIF_F_IP_CSUM; /* least commmon subset */ + + list_for_each_entry(p, &br->port_list, list) { + if (!(p->dev->features + & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))) + checksum = 0; + features &= p->dev->features; + } + + br->dev->features = features | checksum | NETIF_F_LLTX; +} + /* called with RTNL */ int br_add_if(struct net_bridge *br, struct net_device *dev) { @@ -368,6 +390,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) spin_lock_bh(&br->lock); br_stp_recalculate_bridge_id(br); + br_features_recompute(br); spin_unlock_bh(&br->lock); return 0; diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index f8fb49e34764..917311c6828b 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -65,6 +65,15 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v } break; + case NETDEV_FEAT_CHANGE: + if (br->dev->flags & IFF_UP) + br_features_recompute(br); + + /* could do recursive feature change notification + * but who would care?? 
+ */ + break; + case NETDEV_DOWN: if (br->dev->flags & IFF_UP) br_stp_disable_port(p); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 54d63f1372a0..bdf95a74d8cd 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -174,6 +174,7 @@ extern int br_add_if(struct net_bridge *br, extern int br_del_if(struct net_bridge *br, struct net_device *dev); extern int br_min_mtu(const struct net_bridge *br); +extern void br_features_recompute(struct net_bridge *br); /* br_input.c */ extern int br_handle_frame_finish(struct sk_buff *skb); -- cgit v1.2.3-55-g7522 From 85967bb46dd1f8f2c49b85a313866c00ac0c9b59 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 May 2005 14:15:55 -0700 Subject: [BRIDGE]: prevent bad forwarding table updates Avoid poisoning of the bridge forwarding table by frames that have been dropped by filtering. This prevents spoofed source addresses on hostile side of bridge from causing packet leakage, a small but possible security risk. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_input.c | 6 ++++-- net/bridge/br_stp_bpdu.c | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 2b1cce46cab4..2aa5dda24a08 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -54,6 +54,9 @@ int br_handle_frame_finish(struct sk_buff *skb) struct net_bridge_fdb_entry *dst; int passedup = 0; + /* insert into forwarding database after filtering to avoid spoofing */ + br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + if (br->dev->flags & IFF_PROMISC) { struct sk_buff *skb2; @@ -108,8 +111,7 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto err; - if (p->state == BR_STATE_LEARNING || - p->state == BR_STATE_FORWARDING) + if (p->state == BR_STATE_LEARNING) br_fdb_update(p->br, p, eth_hdr(skb)->h_source); if (p->br->stp_enabled && diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index b91a875aca01..d071f1c9ad0b 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -140,6 +140,9 @@ int br_stp_handle_bpdu(struct sk_buff *skb) struct net_bridge *br = p->br; unsigned char *buf; + /* insert into forwarding database after filtering to avoid spoofing */ + br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + /* need at least the 802 and STP headers */ if (!pskb_may_pull(skb, sizeof(header)+1) || memcmp(skb->data, header, sizeof(header))) -- cgit v1.2.3-55-g7522 From 7ce54e3f428b33af714271140601c87b8bf2c544 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 29 May 2005 14:16:48 -0700 Subject: [BRIDGE]: receive path optimization This improves the bridge local receive path by avoiding going through another softirq. The bridge receive path is already being called from a netif_receive_skb() there is no point in going through another receiveq round trip. Recursion is limited because bridge can never be a port of a bridge so handle_bridge() always returns. Signed-off-by: David S. 
Miller --- net/bridge/br_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 2aa5dda24a08..8f5f2e730992 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -26,7 +26,7 @@ static int br_pass_frame_up_finish(struct sk_buff *skb) #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 0; #endif - netif_rx(skb); + netif_receive_skb(skb); return 0; } -- cgit v1.2.3-55-g7522 From 8f937c6099858eee15fae14009dcbd05177fa91d Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sun, 29 May 2005 20:23:46 -0700 Subject: [IPV4]: Primary and secondary addresses Add an option to make secondary IP addresses get promoted when primary IP addresses are removed from the device. It defaults to off to preserve existing behavior. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 2 ++ include/linux/sysctl.h | 1 + net/ipv4/devinet.c | 34 +++++++++++++++++++++++++++++----- 3 files changed, 32 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 6fafb27877a7..7e1e15f934f3 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -29,6 +29,7 @@ struct ipv4_devconf int no_xfrm; int no_policy; int force_igmp_version; + int promote_secondaries; void *sysctl; }; @@ -71,6 +72,7 @@ struct in_device #define IN_DEV_SEC_REDIRECTS(in_dev) (ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) #define IN_DEV_IDTAG(in_dev) ((in_dev)->cnf.tag) #define IN_DEV_MEDIUM_ID(in_dev) ((in_dev)->cnf.medium_id) +#define IN_DEV_PROMOTE_SECONDARIES(in_dev) (ipv4_devconf.promote_secondaries || (in_dev)->cnf.promote_secondaries) #define IN_DEV_RX_REDIRECTS(in_dev) \ ((IN_DEV_FORWARD(in_dev) && \ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 772998147e3e..23032d9d6071 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -399,6 +399,7 @@ enum NET_IPV4_CONF_FORCE_IGMP_VERSION=17, NET_IPV4_CONF_ARP_ANNOUNCE=18, NET_IPV4_CONF_ARP_IGNORE=19, + NET_IPV4_CONF_PROMOTE_SECONDARIES=20, __NET_IPV4_CONF_MAX }; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3cc96730c4ed..478a30179a52 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -233,11 +233,14 @@ int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { + struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa1 = *ifap; ASSERT_RTNL(); - /* 1. Deleting primary ifaddr forces deletion all secondaries */ + /* 1. Deleting primary ifaddr forces deletion all secondaries + * unless alias promotion is set + **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr *ifa; @@ -251,11 +254,16 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, continue; } - *ifap1 = ifa->ifa_next; + if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) { + *ifap1 = ifa->ifa_next; - rtmsg_ifa(RTM_DELADDR, ifa); - notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); - inet_free_ifa(ifa); + rtmsg_ifa(RTM_DELADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } else { + promote = ifa; + break; + } } } @@ -281,6 +289,13 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, if (!in_dev->ifa_list) inetdev_destroy(in_dev); } + + if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) { + /* not sure if we should send a delete notify first? 
*/ + promote->ifa_flags &= ~IFA_F_SECONDARY; + rtmsg_ifa(RTM_NEWADDR, promote); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); + } } static int inet_insert_ifa(struct in_ifaddr *ifa) @@ -1384,6 +1399,15 @@ static struct devinet_sysctl_table { .proc_handler = &ipv4_doint_and_flush, .strategy = &ipv4_doint_and_flush_strategy, }, + { + .ctl_name = NET_IPV4_CONF_PROMOTE_SECONDARIES, + .procname = "promote_secondaries", + .data = &ipv4_devconf.promote_secondaries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, }, .devinet_dev = { { -- cgit v1.2.3-55-g7522 From 37e20a66db02eff9adbeee043af990cca85d0034 Mon Sep 17 00:00:00 2001 From: Pravin B. Shelar Date: Sun, 29 May 2005 20:26:44 -0700 Subject: [IPV4]: Kill MULTIPATHHOLDROUTE flag. It cannot work properly, so just ignore it in drr and rr multipath algorithms just like the random multipath algorithm does. Suggested by Herbert Xu. Signed-off by: Pravin B. Shelar Signed-off-by: David S. Miller --- include/net/route.h | 3 --- net/ipv4/multipath_drr.c | 18 +----------------- net/ipv4/multipath_rr.c | 20 -------------------- 3 files changed, 1 insertion(+), 40 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index efe92b239ef1..d34ca8fc6756 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -181,9 +181,6 @@ static inline int ip_route_newports(struct rtable **rp, u16 sport, u16 dport, memcpy(&fl, &(*rp)->fl, sizeof(fl)); fl.fl_ip_sport = sport; fl.fl_ip_dport = dport; -#if defined(CONFIG_IP_ROUTE_MULTIPATH_CACHED) - fl.flags |= FLOWI_FLAG_MULTIPATHOLDROUTE; -#endif ip_rt_put(*rp); *rp = NULL; return ip_route_output_flow(rp, &fl, sk, 0); diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index 9349686131fc..cf2e6bcf7973 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c @@ -57,7 +57,6 @@ struct multipath_device { static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES]; static DEFINE_SPINLOCK(state_lock); -static struct rtable *last_selection = NULL; static int inline __multipath_findslot(void) { @@ -111,11 +110,6 @@ struct notifier_block drr_dev_notifier = { .notifier_call = drr_dev_event, }; -static void drr_remove(struct rtable *rt) -{ - if (last_selection == rt) - last_selection = NULL; -} static void drr_safe_inc(atomic_t *usecount) { @@ -144,14 +138,6 @@ static void drr_select_route(const struct flowi *flp, int devidx = -1; int cur_min_devidx = -1; - /* if necessary and possible utilize the old alternative */ - if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && - last_selection != NULL) { - result = last_selection; - *rp = result; - return; - } - /* 1. make sure all alt. nexthops have the same GC related data */ /* 2. 
determine the new candidate to be returned */ result = NULL; @@ -229,12 +215,10 @@ static void drr_select_route(const struct flowi *flp, } *rp = result; - last_selection = result; } static struct ip_mp_alg_ops drr_ops = { .mp_alg_select_route = drr_select_route, - .mp_alg_remove = drr_remove, }; static int __init drr_init(void) @@ -244,7 +228,7 @@ static int __init drr_init(void) if (err) return err; - err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR); + err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR); if (err) goto fail; diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c index 554a82568160..061b6b253982 100644 --- a/net/ipv4/multipath_rr.c +++ b/net/ipv4/multipath_rr.c @@ -47,29 +47,12 @@ #include #include -#define MULTIPATH_MAX_CANDIDATES 40 - -static struct rtable* last_used = NULL; - -static void rr_remove(struct rtable *rt) -{ - if (last_used == rt) - last_used = NULL; -} - static void rr_select_route(const struct flowi *flp, struct rtable *first, struct rtable **rp) { struct rtable *nh, *result, *min_use_cand = NULL; int min_use = -1; - /* if necessary and possible utilize the old alternative */ - if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && - last_used != NULL) { - result = last_used; - goto out; - } - /* 1. make sure all alt. nexthops have the same GC related data * 2. determine the new candidate to be returned */ @@ -90,15 +73,12 @@ static void rr_select_route(const struct flowi *flp, if (!result) result = first; -out: - last_used = result; result->u.dst.__use++; *rp = result; } static struct ip_mp_alg_ops rr_ops = { .mp_alg_select_route = rr_select_route, - .mp_alg_remove = rr_remove, }; static int __init rr_init(void) -- cgit v1.2.3-55-g7522 From 69f6a0fafcdf0bfe85af182695d6d38f80f9d549 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Sun, 29 May 2005 20:27:24 -0700 Subject: [NET]: Add ethtool support for NETIF_F_HW_CSUM. Signed-off-by: Jon Mason Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 1 + net/core/ethtool.c | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c85b210490ea..a0ab26aab450 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -256,6 +256,7 @@ struct net_device; u32 ethtool_op_get_link(struct net_device *dev); u32 ethtool_op_get_tx_csum(struct net_device *dev); int ethtool_op_set_tx_csum(struct net_device *dev, u32 data); +int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data); u32 ethtool_op_get_sg(struct net_device *dev); int ethtool_op_set_sg(struct net_device *dev, u32 data); u32 ethtool_op_get_tso(struct net_device *dev); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 2a56a5218363..8ec484894d68 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -29,7 +29,7 @@ u32 ethtool_op_get_link(struct net_device *dev) u32 ethtool_op_get_tx_csum(struct net_device *dev) { - return (dev->features & NETIF_F_IP_CSUM) != 0; + return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0; } int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) @@ -42,6 +42,15 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) return 0; } +int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_HW_CSUM; + else + dev->features &= ~NETIF_F_HW_CSUM; + + return 0; +} u32 ethtool_op_get_sg(struct net_device *dev) { return (dev->features & NETIF_F_SG) != 0; @@ -823,3 +832,4 @@ EXPORT_SYMBOL(ethtool_op_get_tx_csum); EXPORT_SYMBOL(ethtool_op_set_sg); EXPORT_SYMBOL(ethtool_op_set_tso); EXPORT_SYMBOL(ethtool_op_set_tx_csum); +EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); -- cgit v1.2.3-55-g7522
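A driver whose hardware can checksum arbitrary protocols would typically advertise NETIF_F_HW_CSUM and hook the new helper into its ethtool operations. The sketch below is illustrative only: the foo_* names and the choice of the other ops are invented; only ethtool_op_get_tx_csum() and ethtool_op_set_tx_hw_csum() come from the patch.

#include <linux/netdevice.h>
#include <linux/ethtool.h>

/* Hypothetical driver glue.  ethtool_op_get_tx_csum() now reports the
 * checksum feature as enabled for NETIF_F_HW_CSUM as well, and
 * ethtool_op_set_tx_hw_csum() toggles NETIF_F_HW_CSUM instead of
 * NETIF_F_IP_CSUM when the user runs "ethtool -K ethX tx on|off".
 */
static struct ethtool_ops foo_ethtool_ops = {
        .get_tx_csum    = ethtool_op_get_tx_csum,
        .set_tx_csum    = ethtool_op_set_tx_hw_csum,
        .get_sg         = ethtool_op_get_sg,
        .set_sg         = ethtool_op_set_sg,
};

static void foo_init_netdev(struct net_device *dev)
{
        dev->features |= NETIF_F_HW_CSUM;       /* hardware checksums any protocol */
        dev->ethtool_ops = &foo_ethtool_ops;
}

Drivers that can only checksum TCP and UDP over IPv4 keep using ethtool_op_set_tx_csum(), which toggles NETIF_F_IP_CSUM instead.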
From 6c94d3611be61e4cff33b311f1a626d93d1d3e92 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Sun, 29 May 2005 20:28:01 -0700 Subject: [IPV6]: Clear up user copy warning in flowlabel code. We are intentionally ignoring the copy_to_user() value, make it clear to the compiler too. Noted by Jeff Garzik. Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a93f6dc51979..0e5f7499debb 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -535,10 +535,12 @@ release: if (err) goto done; - /* Do not check for fault */ - if (!freq.flr_label) - copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, - &fl->label, sizeof(fl->label)); + if (!freq.flr_label) { + if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, + &fl->label, sizeof(fl->label))) { + /* Intentionally ignore fault. */ + } + } sfl1->fl = fl; sfl1->next = np->ipv6_fl_list; -- cgit v1.2.3-55-g7522
From d1102b59ca7b3a3c58912330a4ae38f549c8d569 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Sun, 29 May 2005 20:28:25 -0700 Subject: [NET]: Use %lx for netdev->features sysfs formatting. Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 060f703659e8..910eb4c05a47 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -21,6 +21,7 @@ #define to_net_dev(class) container_of(class, struct net_device, class_dev) static const char fmt_hex[] = "%#x\n"; +static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; @@ -91,7 +92,7 @@ static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL) \ NETDEVICE_ATTR(addr_len, fmt_dec); NETDEVICE_ATTR(iflink, fmt_dec); NETDEVICE_ATTR(ifindex, fmt_dec); -NETDEVICE_ATTR(features, fmt_hex); +NETDEVICE_ATTR(features, fmt_long_hex); NETDEVICE_ATTR(type, fmt_dec); /* use same locking rules as GIFHWADDR ioctl's */ -- cgit v1.2.3-55-g7522
From 9bb7bc942d3da606f184ac6a4dfc7e4d470c831b Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 30 May 2005 15:35:26 -0700 Subject: [NETFILTER]: Fix deadlock with ip_queue and tcp local input path. When we have ip_queue being used from LOCAL_IN, then we end up with a situation where the verdicts coming back from userspace traverse the TCP input path from syscall context. While this seems to work most of the time, there's an ugly deadlock: syscall context is interrupted by the timer interrupt. When the timer interrupt leaves, the timer softirq gets scheduled and calls tcp_delack_timer() and the like. They themselves do bh_lock_sock(sk), which is already held from somewhere else -> boom. I've now tested the suggested solution by Patrick McHardy and Herbert Xu to simply use local_bh_{en,dis}able(). Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index e5746b674413..eda1fba431a4 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -3,6 +3,7 @@ * communicating with userspace via netlink. * * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +18,7 @@ * 2005-01-10: Added /proc counter for dropped packets; fixed so * packets aren't delivered to user space if they're going * to be dropped. + * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) * */ #include @@ -71,7 +73,15 @@ static DECLARE_MUTEX(ipqnl_sem); static void ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) { + /* TCP input path (and probably other bits) assume to be called + * from softirq context, not from syscall, like ipq_issue_verdict is + * called. TCP input path deadlocks with locks taken from timer + * softirq, e.g. We therefore emulate this by local_bh_disable() */ + + local_bh_disable(); nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); + kfree(entry); } -- cgit v1.2.3-55-g7522
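The rule behind this fix can be sketched on its own: any code that takes bh_lock_sock() must keep softirqs disabled on the local CPU while it holds the lock, because timer softirqs such as tcp_delack_timer() take the same lock. Softirq context gets that guarantee implicitly; process (syscall) context has to ask for it, which is exactly what the local_bh_disable()/local_bh_enable() pair around nf_reinject() does. The helper below is purely illustrative and not part of the patch.

#include <linux/interrupt.h>
#include <net/sock.h>

/* Illustrative only: run softirq-style socket input from process context.
 * Without local_bh_disable(), a timer interrupt arriving while the socket
 * lock is held would run the timer softirq on its way out, and
 * tcp_delack_timer() would then spin on the lock this CPU already holds.
 */
static void socket_input_from_process_context(struct sock *sk)
{
        local_bh_disable();             /* no softirqs on this CPU from here on */
        bh_lock_sock(sk);
        /* ... protocol input processing that assumes softirq context ... */
        bh_unlock_sock(sk);
        local_bh_enable();              /* pending softirqs (timers) run now */
}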
From 208d89843b7b03978d8e748b8b991c1be81c4f43 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 30 May 2005 15:50:15 -0700 Subject: [IPV4]: Fix BUG() in 2.6.x, udp_poll(), fragments + CONFIG_HIGHMEM Steven Hand wrote: > > Reconstructed forward trace: > > net/ipv4/udp.c:1334 spin_lock_irq() > net/ipv4/udp.c:1336 udp_checksum_complete() > net/core/skbuff.c:1069 skb_shinfo(skb)->nr_frags > 1 > net/core/skbuff.c:1086 kunmap_skb_frag() > net/core/skbuff.h:1087 local_bh_enable() > kernel/softirq.c:0140 WARN_ON(irqs_disabled()); The receive queue lock is never taken in IRQs (and should never be) so we can simply substitute bh for irq. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a6952e3fee9..7c24e64b443f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) unsigned long amount; amount = 0; - spin_lock_irq(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* @@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) */ amount = skb->len - sizeof(struct udphdr); } - spin_unlock_irq(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } @@ -848,12 +848,12 @@ csum_copy_err: /* Clear queue. */ if (flags&MSG_PEEK) { int clear = 0; - spin_lock_irq(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_receive_queue.lock); if (skb == skb_peek(&sk->sk_receive_queue)) { __skb_unlink(skb, &sk->sk_receive_queue); clear = 1; } - spin_unlock_irq(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk->sk_receive_queue.lock); if (clear) kfree_skb(skb); } @@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; - spin_lock_irq(&rcvq->lock); + spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL) { if (udp_checksum_complete(skb)) { UDP_INC_STATS_BH(UDP_MIB_INERRORS); @@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) break; } } - spin_unlock_irq(&rcvq->lock); + spin_unlock_bh(&rcvq->lock); /* nothing to see, move along */ if (skb == NULL) -- cgit v1.2.3-55-g7522
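The udp_poll()/udp_ioctl() change above applies the same context rule from the lock side: the receive queue is only ever touched from process context and from the network RX softirq, so the _bh spinlock variants are strong enough, and the _irq variants were actively harmful because they keep hardirqs masked while nested helpers such as kunmap_skb_frag() finish with local_bh_enable(), which warns when it finds interrupts disabled. A hedged sketch of the pattern (illustrative function, not from the patch):

#include <linux/skbuff.h>
#include <net/sock.h>

/* Illustrative only: inspect a socket's receive queue from process context.
 * spin_lock_bh() keeps the RX softirq off this CPU while the queue is
 * walked, but leaves hardirqs enabled, so callees that end with
 * local_bh_enable() do not trip WARN_ON(irqs_disabled()).
 */
static unsigned int queued_payload_bytes(struct sock *sk)
{
        struct sk_buff *skb;
        unsigned int bytes = 0;

        spin_lock_bh(&sk->sk_receive_queue.lock);
        skb_queue_walk(&sk->sk_receive_queue, skb)
                bytes += skb->len;
        spin_unlock_bh(&sk->sk_receive_queue.lock);

        return bytes;
}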
From 0451eb074eef30240c6c06dacf2911bee26831e1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:15:58 -0700 Subject: [PKT_SCHED]: Fix dsmark to count ignored indices while walking Unused indices which are ignored while walking must still be counted to avoid dumping the same index twice. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 8a3db9d95bab..acbe9d2b3e15 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -163,14 +163,15 @@ static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { if (p->mask[i] == 0xff && !p->value[i]) - continue; + goto ignore; if (walker->count >= walker->skip) { if (walker->fn(sch, i+1, walker) < 0) { walker->stop = 1; break; } } - walker->count++; +ignore: + walker->count++; } } -- cgit v1.2.3-55-g7522
From 486b53e59ca8cd07d91ad88375c1c884b15cc9bd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:16:52 -0700 Subject: [PKT_SCHED]: make dsmark try using pfifo instead of noop while grafting Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index acbe9d2b3e15..a8c948f62978 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -73,8 +73,13 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg, DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, old); - if (!new) - new = &noop_qdisc; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (new == NULL) + new = &noop_qdisc; + } + sch_tree_lock(sch); *old = xchg(&p->q,new); if (*old) -- cgit v1.2.3-55-g7522
From 08e9cd1fc559c00bc05df3fc551efe3b87c57ee3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:17:28 -0700 Subject: [PKT_SCHED]: Disable dsmark debugging messages by default Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index a8c948f62978..d8bd2a569c7c 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -18,7 +18,7 @@ #include -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) #else #define DPRINTK(format,args...) -- cgit v1.2.3-55-g7522
From 36839836e8132731e0cadddce452423036a1d5b3 Mon Sep 17 00:00:00 2001 From: Edgar E Iglesias Date: Tue, 31 May 2005 17:08:05 -0700 Subject: [IPSEC]: Fix esp_decap_data size verification in esp4. Signed-off-by: Edgar E Iglesias Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 053a883247ba..eae84cc39d3f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -478,7 +478,7 @@ static int __init esp4_init(void) { struct xfrm_decap_state decap; - if (sizeof(struct esp_decap_data) < + if (sizeof(struct esp_decap_data) > sizeof(decap.decap_data)) { extern void decap_data_too_small(void); -- cgit v1.2.3-55-g7522
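The esp4 check being fixed here is a link-time assertion: both sizeof() values are compile-time constants, so when the condition is false the compiler discards the branch together with the call to decap_data_too_small(), which is declared but never defined anywhere; if struct esp_decap_data ever outgrows the decap_data scratch area, the call survives and the build fails with an undefined reference instead of the overflow being discovered at run time. A self-contained sketch of the idiom with invented names (BUILD_BUG_ON(), where available, serves the same purpose):

#include <linux/init.h>
#include <linux/module.h>

struct scratch_area {
        unsigned char data[16];         /* fixed-size scratch buffer */
};

struct decap_state {
        unsigned long seq;
        unsigned char iv[8];            /* must fit inside scratch_area.data */
};

static int __init size_check_init(void)
{
        /* Dead code when decap_state fits, an unresolved symbol when it
         * does not: the mistake surfaces at link time, not at run time.
         */
        if (sizeof(struct decap_state) > sizeof(((struct scratch_area *)0)->data)) {
                extern void decap_state_too_large(void);
                decap_state_too_large();
        }
        return 0;
}

module_init(size_check_init);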