Diffstat (limited to 'net')
-rw-r--r--net/802/psnap.c13
-rw-r--r--net/802/tr.c4
-rw-r--r--net/8021q/vlan.c2
-rw-r--r--net/8021q/vlan_core.c12
-rw-r--r--net/8021q/vlan_dev.c3
-rw-r--r--net/9p/trans_fd.c2
-rw-r--r--net/Kconfig15
-rw-r--r--net/Makefile1
-rw-r--r--net/appletalk/ddp.c6
-rw-r--r--net/atm/clip.c9
-rw-r--r--net/atm/lec.c2
-rw-r--r--net/atm/mpc.c32
-rw-r--r--net/atm/mpc.h5
-rw-r--r--net/ax25/af_ax25.c19
-rw-r--r--net/bluetooth/af_bluetooth.c17
-rw-r--r--net/bluetooth/cmtp/core.c3
-rw-r--r--net/bluetooth/hci_conn.c64
-rw-r--r--net/bluetooth/hci_core.c3
-rw-r--r--net/bluetooth/hci_event.c26
-rw-r--r--net/bluetooth/l2cap.c602
-rw-r--r--net/bluetooth/rfcomm/core.c179
-rw-r--r--net/bluetooth/rfcomm/sock.c189
-rw-r--r--net/bluetooth/sco.c57
-rw-r--r--net/bridge/br_netlink.c3
-rw-r--r--net/can/af_can.c3
-rw-r--r--net/core/Makefile3
-rw-r--r--net/core/datagram.c2
-rw-r--r--net/core/dev.c110
-rw-r--r--net/core/drop_monitor.c263
-rw-r--r--net/core/ethtool.c58
-rw-r--r--net/core/fib_rules.c3
-rw-r--r--net/core/neighbour.c15
-rw-r--r--net/core/net-sysfs.c6
-rw-r--r--net/core/net-traces.c29
-rw-r--r--net/core/net_namespace.c89
-rw-r--r--net/core/pktgen.c18
-rw-r--r--net/core/rtnetlink.c9
-rw-r--r--net/core/skbuff.c33
-rw-r--r--net/core/sock.c9
-rw-r--r--net/core/sysctl_net_core.c1
-rw-r--r--net/dccp/ackvec.h3
-rw-r--r--net/dccp/dccp.h5
-rw-r--r--net/dccp/output.c37
-rw-r--r--net/decnet/af_decnet.c23
-rw-r--r--net/decnet/dn_dev.c6
-rw-r--r--net/decnet/dn_route.c6
-rw-r--r--net/decnet/dn_table.c3
-rw-r--r--net/decnet/sysctl_net_decnet.c2
-rw-r--r--net/dsa/Kconfig6
-rw-r--r--net/dsa/dsa.c177
-rw-r--r--net/dsa/dsa_priv.h97
-rw-r--r--net/dsa/mv88e6060.c12
-rw-r--r--net/dsa/mv88e6123_61_65.c92
-rw-r--r--net/dsa/mv88e6131.c96
-rw-r--r--net/dsa/slave.c34
-rw-r--r--net/dsa/tag_dsa.c32
-rw-r--r--net/dsa/tag_edsa.c32
-rw-r--r--net/dsa/tag_trailer.c12
-rw-r--r--net/econet/af_econet.c2
-rw-r--r--net/ipv4/Kconfig52
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/arp.c11
-rw-r--r--net/ipv4/cipso_ipv4.c9
-rw-r--r--net/ipv4/devinet.c3
-rw-r--r--net/ipv4/fib_frontend.c2
-rw-r--r--net/ipv4/fib_semantics.c5
-rw-r--r--net/ipv4/icmp.c2
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/ip_fragment.c3
-rw-r--r--net/ipv4/ip_gre.c5
-rw-r--r--net/ipv4/ipip.c7
-rw-r--r--net/ipv4/tcp.c62
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_cong.c21
-rw-r--r--net/ipv4/tcp_cubic.c11
-rw-r--r--net/ipv4/tcp_htcp.c3
-rw-r--r--net/ipv4/tcp_input.c207
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c95
-rw-r--r--net/ipv4/tcp_probe.c5
-rw-r--r--net/ipv4/tcp_scalable.c12
-rw-r--r--net/ipv4/tcp_timer.c23
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_yeah.c9
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv6/addrconf.c90
-rw-r--r--net/ipv6/af_inet6.c26
-rw-r--r--net/ipv6/inet6_hashtables.c4
-rw-r--r--net/ipv6/ipv6_sockglue.c3
-rw-r--r--net/ipv6/ndisc.c6
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c5
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c8
-rw-r--r--net/ipv6/reassembly.c7
-rw-r--r--net/ipv6/route.c5
-rw-r--r--net/ipv6/sit.c7
-rw-r--r--net/ipv6/tcp_ipv6.c6
-rw-r--r--net/ipv6/xfrm6_state.c2
-rw-r--r--net/ipx/af_ipx.c16
-rw-r--r--net/irda/irda_device.c5
-rw-r--r--net/irda/irlan/irlan_eth.c19
-rw-r--r--net/irda/irmod.c2
-rw-r--r--net/iucv/af_iucv.c3
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/llc/af_llc.c6
-rw-r--r--net/llc/llc_conn.c3
-rw-r--r--net/llc/llc_core.c4
-rw-r--r--net/mac80211/Makefile1
-rw-r--r--net/mac80211/agg-rx.c6
-rw-r--r--net/mac80211/agg-tx.c191
-rw-r--r--net/mac80211/cfg.c52
-rw-r--r--net/mac80211/debugfs_netdev.c48
-rw-r--r--net/mac80211/ht.c19
-rw-r--r--net/mac80211/ibss.c907
-rw-r--r--net/mac80211/ieee80211_i.h141
-rw-r--r--net/mac80211/iface.c91
-rw-r--r--net/mac80211/key.c2
-rw-r--r--net/mac80211/main.c24
-rw-r--r--net/mac80211/mlme.c1709
-rw-r--r--net/mac80211/rate.h12
-rw-r--r--net/mac80211/rx.c37
-rw-r--r--net/mac80211/scan.c66
-rw-r--r--net/mac80211/spectmgmt.c26
-rw-r--r--net/mac80211/sta_info.c15
-rw-r--r--net/mac80211/sta_info.h5
-rw-r--r--net/mac80211/tx.c31
-rw-r--r--net/mac80211/util.c254
-rw-r--r--net/mac80211/wext.c290
-rw-r--r--net/mac80211/wme.c170
-rw-r--r--net/mac80211/wme.h6
-rw-r--r--net/netfilter/nf_conntrack_core.c2
-rw-r--r--net/netfilter/nf_conntrack_netlink.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c4
-rw-r--r--net/netfilter/nfnetlink_log.c8
-rw-r--r--net/netfilter/x_tables.c199
-rw-r--r--net/netfilter/xt_recent.c2
-rw-r--r--net/netlink/af_netlink.c55
-rw-r--r--net/netrom/af_netrom.c11
-rw-r--r--net/packet/af_packet.c5
-rw-r--r--net/phonet/af_phonet.c3
-rw-r--r--net/phonet/pn_netlink.c5
-rw-r--r--net/rds/Kconfig14
-rw-r--r--net/rds/Makefile14
-rw-r--r--net/rds/af_rds.c586
-rw-r--r--net/rds/bind.c199
-rw-r--r--net/rds/cong.c404
-rw-r--r--net/rds/connection.c487
-rw-r--r--net/rds/ib.c323
-rw-r--r--net/rds/ib.h367
-rw-r--r--net/rds/ib_cm.c726
-rw-r--r--net/rds/ib_rdma.c641
-rw-r--r--net/rds/ib_recv.c869
-rw-r--r--net/rds/ib_ring.c168
-rw-r--r--net/rds/ib_send.c874
-rw-r--r--net/rds/ib_stats.c95
-rw-r--r--net/rds/ib_sysctl.c137
-rw-r--r--net/rds/info.c241
-rw-r--r--net/rds/info.h30
-rw-r--r--net/rds/iw.c333
-rw-r--r--net/rds/iw.h395
-rw-r--r--net/rds/iw_cm.c750
-rw-r--r--net/rds/iw_rdma.c888
-rw-r--r--net/rds/iw_recv.c869
-rw-r--r--net/rds/iw_ring.c169
-rw-r--r--net/rds/iw_send.c975
-rw-r--r--net/rds/iw_stats.c95
-rw-r--r--net/rds/iw_sysctl.c137
-rw-r--r--net/rds/loop.c188
-rw-r--r--net/rds/loop.h9
-rw-r--r--net/rds/message.c402
-rw-r--r--net/rds/page.c221
-rw-r--r--net/rds/rdma.c679
-rw-r--r--net/rds/rdma.h84
-rw-r--r--net/rds/rdma_transport.c214
-rw-r--r--net/rds/rdma_transport.h28
-rw-r--r--net/rds/rds.h686
-rw-r--r--net/rds/recv.c542
-rw-r--r--net/rds/send.c1003
-rw-r--r--net/rds/stats.c148
-rw-r--r--net/rds/sysctl.c122
-rw-r--r--net/rds/threads.c265
-rw-r--r--net/rds/transport.c117
-rw-r--r--net/sched/act_police.c13
-rw-r--r--net/sched/sch_cbq.c7
-rw-r--r--net/sched/sch_drr.c13
-rw-r--r--net/sched/sch_hfsc.c7
-rw-r--r--net/sched/sch_htb.c7
-rw-r--r--net/sched/sch_tbf.c9
-rw-r--r--net/sctp/debug.c4
-rw-r--r--net/sctp/endpointola.c3
-rw-r--r--net/sctp/output.c5
-rw-r--r--net/sctp/outqueue.c6
-rw-r--r--net/sctp/protocol.c16
-rw-r--r--net/sctp/sm_make_chunk.c33
-rw-r--r--net/sctp/sm_sideeffect.c86
-rw-r--r--net/sctp/sm_statefuns.c22
-rw-r--r--net/sctp/socket.c161
-rw-r--r--net/sctp/transport.c7
-rw-r--r--net/tipc/bcast.c4
-rw-r--r--net/tipc/bcast.h2
-rw-r--r--net/tipc/dbg.c2
-rw-r--r--net/tipc/node.c2
-rw-r--r--net/unix/af_unix.c3
-rw-r--r--net/wanrouter/wanmain.c8
-rw-r--r--net/wanrouter/wanproc.c2
-rw-r--r--net/wireless/Kconfig10
-rw-r--r--net/wireless/core.c116
-rw-r--r--net/wireless/core.h39
-rw-r--r--net/wireless/lib80211_crypt_ccmp.c2
-rw-r--r--net/wireless/lib80211_crypt_tkip.c4
-rw-r--r--net/wireless/nl80211.c148
-rw-r--r--net/wireless/nl80211.h9
-rw-r--r--net/wireless/reg.c1034
-rw-r--r--net/wireless/reg.h36
-rw-r--r--net/wireless/scan.c64
-rw-r--r--net/wireless/sysfs.c9
-rw-r--r--net/wireless/wext-compat.c97
-rw-r--r--net/x25/af_x25.c13
-rw-r--r--net/xfrm/xfrm_state.c90
219 files changed, 21713 insertions, 3691 deletions
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 6ed711748f26..6fea0750662b 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -29,7 +29,7 @@ static struct llc_sap *snap_sap;
/*
* Find a snap client by matching the 5 bytes.
*/
-static struct datalink_proto *find_snap_client(unsigned char *desc)
+static struct datalink_proto *find_snap_client(const unsigned char *desc)
{
struct datalink_proto *proto = NULL, *p;
@@ -95,15 +95,16 @@ static int snap_request(struct datalink_proto *dl,
EXPORT_SYMBOL(register_snap_client);
EXPORT_SYMBOL(unregister_snap_client);
-static char snap_err_msg[] __initdata =
+static const char snap_err_msg[] __initconst =
KERN_CRIT "SNAP - unable to register with 802.2\n";
static int __init snap_init(void)
{
snap_sap = llc_sap_open(0xAA, snap_rcv);
-
- if (!snap_sap)
+ if (!snap_sap) {
printk(snap_err_msg);
+ return -EBUSY;
+ }
return 0;
}
@@ -121,7 +122,7 @@ module_exit(snap_exit);
/*
* Register SNAP clients. We don't yet use this for IP.
*/
-struct datalink_proto *register_snap_client(unsigned char *desc,
+struct datalink_proto *register_snap_client(const unsigned char *desc,
int (*rcvfunc)(struct sk_buff *,
struct net_device *,
struct packet_type *,
@@ -136,7 +137,7 @@ struct datalink_proto *register_snap_client(unsigned char *desc,
proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
if (proto) {
- memcpy(proto->type, desc,5);
+ memcpy(proto->type, desc, 5);
proto->rcvfunc = rcvfunc;
proto->header_length = 5 + 3; /* snap + 802.2 */
proto->request = snap_request;
diff --git a/net/802/tr.c b/net/802/tr.c
index 158150fee462..e7eb13084d71 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -486,6 +486,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
}
static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&rif_lock)
{
spin_lock_irq(&rif_lock);
@@ -517,6 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void rif_seq_stop(struct seq_file *seq, void *v)
+ __releases(&rif_lock)
{
spin_unlock_irq(&rif_lock);
}
@@ -668,3 +670,5 @@ module_init(rif_init);
EXPORT_SYMBOL(tr_type_trans);
EXPORT_SYMBOL(alloc_trdev);
+
+MODULE_LICENSE("GPL");
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 4163ea65bf41..2b7390e377b3 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -51,7 +51,7 @@ const char vlan_version[] = DRV_VERSION;
static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
static const char vlan_buggyright[] = "David S. Miller <davem@redhat.com>";
-static struct packet_type vlan_packet_type = {
+static struct packet_type vlan_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_8021Q),
.func = vlan_skb_recv, /* VLAN receive method */
};
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 70435af153f2..654e45f5719d 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -1,12 +1,16 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
+#include <linux/netpoll.h>
#include "vlan.h"
/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */
int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
u16 vlan_tci, int polling)
{
+ if (netpoll_rx(skb))
+ return NET_RX_DROP;
+
if (skb_bond_should_drop(skb))
goto drop;
@@ -94,12 +98,15 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
return dev_gro_receive(napi, skb);
drop:
- return 2;
+ return GRO_DROP;
}
int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct sk_buff *skb)
{
+ if (netpoll_rx_on(skb))
+ return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
+
skb_gro_reset_offset(skb);
return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
@@ -114,6 +121,9 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
if (!skb)
return NET_RX_DROP;
+ if (netpoll_rx_on(skb))
+ return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
+
return napi_frags_finish(napi, skb,
vlan_gro_common(napi, grp, vlan_tci, skb));
}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4a19acd3a32b..1b34135cf990 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
int err = 0;
if (netif_device_present(real_dev) && ops->ndo_neigh_setup)
- err = ops->ndo_neigh_setup(dev, pa);
+ err = ops->ndo_neigh_setup(real_dev, pa);
return err;
}
@@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev)
dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
dev->netdev_ops = &vlan_netdev_ops;
}
+ netdev_resync_ops(dev);
if (is_vlan_dev(real_dev))
subclass = 1;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 1df0356f242b..c613ed08a5ee 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -417,7 +417,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len)
oldfs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos);
+ ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos);
set_fs(oldfs);
if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
diff --git a/net/Kconfig b/net/Kconfig
index a12bae0e3fe9..93998a9c39c2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -24,9 +24,6 @@ if NET
menu "Networking options"
-config COMPAT_NET_DEV_OPS
- def_bool y
-
source "net/packet/Kconfig"
source "net/unix/Kconfig"
source "net/xfrm/Kconfig"
@@ -171,6 +168,7 @@ endif
source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
+source "net/rds/Kconfig"
source "net/tipc/Kconfig"
source "net/atm/Kconfig"
source "net/802/Kconfig"
@@ -221,6 +219,17 @@ config NET_TCPPROBE
To compile this code as a module, choose M here: the
module will be called tcp_probe.
+config NET_DROP_MONITOR
+ boolean "Network packet drop alerting service"
+ depends on INET && EXPERIMENTAL && TRACEPOINTS
+ ---help---
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. If you don't need network drop alerts, or if you are ok
+ just checking the various proc files and other utilities for
+ drop statistics, say N here.
+
endmenu
endmenu
diff --git a/net/Makefile b/net/Makefile
index 0fcce89d7169..9e00a55a901b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -49,6 +49,7 @@ obj-y += 8021q/
endif
obj-$(CONFIG_IP_DCCP) += dccp/
obj-$(CONFIG_IP_SCTP) += sctp/
+obj-$(CONFIG_RDS) += rds/
obj-y += wireless/
obj-$(CONFIG_MAC80211) += mac80211/
obj-$(CONFIG_TIPC) += tipc/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 510a6782da8f..3e0671df3a3f 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1860,12 +1860,12 @@ static struct notifier_block ddp_notifier = {
.notifier_call = ddp_device_event,
};
-static struct packet_type ltalk_packet_type = {
+static struct packet_type ltalk_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_LOCALTALK),
.func = ltalk_rcv,
};
-static struct packet_type ppptalk_packet_type = {
+static struct packet_type ppptalk_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_PPPTALK),
.func = atalk_rcv,
};
@@ -1877,7 +1877,7 @@ EXPORT_SYMBOL(aarp_send_ddp);
EXPORT_SYMBOL(atrtr_get_dev);
EXPORT_SYMBOL(atalk_find_dev_addr);
-static char atalk_err_snap[] __initdata =
+static const char atalk_err_snap[] __initconst =
KERN_CRIT "Unable to register DDP with SNAP.\n";
/* Called by proto.c on kernel start up */
diff --git a/net/atm/clip.c b/net/atm/clip.c
index da42fd06b61f..3dc0a3a42a57 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -552,10 +552,13 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
return error;
}
+static const struct net_device_ops clip_netdev_ops = {
+ .ndo_start_xmit = clip_start_xmit,
+};
+
static void clip_setup(struct net_device *dev)
{
- dev->hard_start_xmit = clip_start_xmit;
- /* sg_xmit ... */
+ dev->netdev_ops = &clip_netdev_ops;
dev->type = ARPHRD_ATM;
dev->hard_header_len = RFC1483LLC_LEN;
dev->mtu = RFC1626_MTU;
@@ -615,7 +618,7 @@ static int clip_device_event(struct notifier_block *this, unsigned long event,
}
/* ignore non-CLIP devices */
- if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit)
+ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
return NOTIFY_DONE;
switch (event) {
diff --git a/net/atm/lec.c b/net/atm/lec.c
index c0cba9a037e8..199b6bb79f42 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -502,7 +502,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
priv->lane2_ops = NULL;
if (priv->lane_version > 1)
priv->lane2_ops = &lane2_ops;
- if (dev->change_mtu(dev, mesg->content.config.mtu))
+ if (dev_set_mtu(dev, mesg->content.config.mtu))
printk("%s: change_mtu to %d failed\n", dev->name,
mesg->content.config.mtu);
priv->is_proxy = mesg->content.config.is_proxy;
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 039d5cc72c3d..e5bf11453a18 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -286,33 +286,32 @@ static void start_mpc(struct mpoa_client *mpc, struct net_device *dev)
{
dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name);
- if (dev->hard_start_xmit == NULL) {
- printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n",
- dev->name);
- return;
+ if (!dev->netdev_ops)
+ printk("mpoa: (%s) start_mpc not starting\n", dev->name);
+ else {
+ mpc->old_ops = dev->netdev_ops;
+ mpc->new_ops = *mpc->old_ops;
+ mpc->new_ops.ndo_start_xmit = mpc_send_packet;
+ dev->netdev_ops = &mpc->new_ops;
}
- mpc->old_hard_start_xmit = dev->hard_start_xmit;
- dev->hard_start_xmit = mpc_send_packet;
-
- return;
}
static void stop_mpc(struct mpoa_client *mpc)
{
-
+ struct net_device *dev = mpc->dev;
dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name);
/* Lets not nullify lec device's dev->hard_start_xmit */
- if (mpc->dev->hard_start_xmit != mpc_send_packet) {
+ if (dev->netdev_ops != &mpc->new_ops) {
dprintk(" mpc already stopped, not fatal\n");
return;
}
dprintk("\n");
- mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit;
- mpc->old_hard_start_xmit = NULL;
- /* close_shortcuts(mpc); ??? FIXME */
- return;
+ dev->netdev_ops = mpc->old_ops;
+ mpc->old_ops = NULL;
+
+ /* close_shortcuts(mpc); ??? FIXME */
}
static const char *mpoa_device_type_string(char type) __attribute__ ((unused));
@@ -531,7 +530,6 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
*/
static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
{
- int retval;
struct mpoa_client *mpc;
struct ethhdr *eth;
int i = 0;
@@ -561,9 +559,7 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
}
non_ip:
- retval = mpc->old_hard_start_xmit(skb,dev);
-
- return retval;
+ return mpc->old_ops->ndo_start_xmit(skb,dev);
}
static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
index 24c386c35f57..0919a88bbc70 100644
--- a/net/atm/mpc.h
+++ b/net/atm/mpc.h
@@ -15,7 +15,7 @@ struct mpoa_client {
struct mpoa_client *next;
struct net_device *dev; /* lec in question */
int dev_num; /* e.g. 2 for lec2 */
- int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev);
+
struct atm_vcc *mpoad_vcc; /* control channel to mpoad */
uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */
uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
@@ -31,6 +31,9 @@ struct mpoa_client {
uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
int number_of_mps_macs; /* number of the above MAC addresses */
struct mpc_parameters parameters; /* parameters for this client */
+
+ const struct net_device_ops *old_ops;
+ struct net_device_ops new_ops;
};
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d127fd3ba5c6..7da5ebb84e97 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1435,6 +1435,11 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
size_t size;
int lv, err, addr_len = msg->msg_namelen;
+ /* AX.25 empty data frame has no meaning : don't send */
+ if (len == 0) {
+ return (0);
+ }
+
if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
return -EINVAL;
@@ -1529,10 +1534,8 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
dp = ax25->digipeat;
}
- SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n");
-
/* Build a packet */
- SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n");
+ SOCK_DEBUG(sk, "AX.25: sendto: Addresses built. Building packet.\n");
/* Assume the worst case */
size = len + ax25->ax25_dev->dev->hard_header_len;
@@ -1636,6 +1639,13 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
skb_reset_transport_header(skb);
copied = skb->len;
+ /* AX.25 empty data frame has no meaning : ignore it */
+ if (copied == 0) {
+ err = copied;
+ skb_free_datagram(sk, skb);
+ goto out;
+ }
+
if (copied > size) {
copied = size;
msg->msg_flags |= MSG_TRUNC;
@@ -1985,9 +1995,8 @@ static const struct proto_ops ax25_proto_ops = {
/*
* Called by socket.c on kernel start up
*/
-static struct packet_type ax25_packet_type = {
+static struct packet_type ax25_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_AX25),
- .dev = NULL, /* All devices */
.func = ax25_kiss_rcv,
};
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 744ed3f07ef3..02b9baa1930b 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -41,14 +41,13 @@
#include <net/bluetooth/bluetooth.h>
-#define VERSION "2.14"
+#define VERSION "2.15"
/* Bluetooth sockets */
#define BT_MAX_PROTO 8
static struct net_proto_family *bt_proto[BT_MAX_PROTO];
static DEFINE_RWLOCK(bt_proto_lock);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key bt_lock_key[BT_MAX_PROTO];
static const char *bt_key_strings[BT_MAX_PROTO] = {
"sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP",
@@ -86,11 +85,6 @@ static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
bt_slock_key_strings[proto], &bt_slock_key[proto],
bt_key_strings[proto], &bt_lock_key[proto]);
}
-#else
-static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
-{
-}
-#endif
int bt_sock_register(int proto, struct net_proto_family *ops)
{
@@ -217,7 +211,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
continue;
}
- if (sk->sk_state == BT_CONNECTED || !newsock) {
+ if (sk->sk_state == BT_CONNECTED || !newsock ||
+ bt_sk(parent)->defer_setup) {
bt_accept_unlink(sk);
if (newsock)
sock_graft(sk, newsock);
@@ -232,7 +227,7 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
EXPORT_SYMBOL(bt_accept_dequeue);
int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t len, int flags)
+ struct msghdr *msg, size_t len, int flags)
{
int noblock = flags & MSG_DONTWAIT;
struct sock *sk = sock->sk;
@@ -277,7 +272,9 @@ static inline unsigned int bt_accept_poll(struct sock *parent)
list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
- if (sk->sk_state == BT_CONNECTED)
+ if (sk->sk_state == BT_CONNECTED ||
+ (bt_sk(parent)->defer_setup &&
+ sk->sk_state == BT_CONNECT2))
return POLLIN | POLLRDNORM;
}
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index c9cac7719efe..0073ec8495da 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -126,8 +126,7 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const
session->reassembly[id] = nskb;
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
}
static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index a4a789f24c8d..1181db08d9de 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -123,6 +123,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle)
conn->state = BT_CONNECT;
conn->out = 1;
+ conn->attempt++;
+
cp.handle = cpu_to_le16(handle);
cp.pkt_type = cpu_to_le16(conn->pkt_type);
@@ -139,6 +141,8 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle)
conn->state = BT_CONNECT;
conn->out = 1;
+ conn->attempt++;
+
cp.handle = cpu_to_le16(handle);
cp.pkt_type = cpu_to_le16(conn->pkt_type);
@@ -155,6 +159,7 @@ static void hci_conn_timeout(unsigned long arg)
{
struct hci_conn *conn = (void *) arg;
struct hci_dev *hdev = conn->hdev;
+ __u8 reason;
BT_DBG("conn %p state %d", conn, conn->state);
@@ -173,7 +178,8 @@ static void hci_conn_timeout(unsigned long arg)
break;
case BT_CONFIG:
case BT_CONNECTED:
- hci_acl_disconn(conn, 0x13);
+ reason = hci_proto_disconn_ind(conn);
+ hci_acl_disconn(conn, reason);
break;
default:
conn->state = BT_CLOSED;
@@ -216,12 +222,13 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst)
break;
case SCO_LINK:
if (lmp_esco_capable(hdev))
- conn->pkt_type = hdev->esco_type & SCO_ESCO_MASK;
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
else
conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
break;
case ESCO_LINK:
- conn->pkt_type = hdev->esco_type;
+ conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
break;
}
@@ -280,6 +287,8 @@ int hci_conn_del(struct hci_conn *conn)
skb_queue_purge(&conn->data_q);
+ hci_conn_del_sysfs(conn);
+
return 0;
}
@@ -325,7 +334,7 @@ EXPORT_SYMBOL(hci_get_route);
/* Create SCO or ACL connection.
* Device _must_ be locked */
-struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type)
+struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 sec_level, __u8 auth_type)
{
struct hci_conn *acl;
struct hci_conn *sco;
@@ -340,6 +349,7 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8
hci_conn_hold(acl);
if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
+ acl->sec_level = sec_level;
acl->auth_type = auth_type;
hci_acl_connect(acl);
}
@@ -385,51 +395,59 @@ int hci_conn_check_link_mode(struct hci_conn *conn)
EXPORT_SYMBOL(hci_conn_check_link_mode);
/* Authenticate remote device */
-int hci_conn_auth(struct hci_conn *conn)
+static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
{
BT_DBG("conn %p", conn);
- if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) {
- if (!(conn->auth_type & 0x01)) {
- conn->auth_type |= 0x01;
- conn->link_mode &= ~HCI_LM_AUTH;
- }
- }
-
- if (conn->link_mode & HCI_LM_AUTH)
+ if (sec_level > conn->sec_level)
+ conn->sec_level = sec_level;
+ else if (conn->link_mode & HCI_LM_AUTH)
return 1;
+ conn->auth_type = auth_type;
+
if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
struct hci_cp_auth_requested cp;
cp.handle = cpu_to_le16(conn->handle);
hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
sizeof(cp), &cp);
}
+
return 0;
}
-EXPORT_SYMBOL(hci_conn_auth);
-/* Enable encryption */
-int hci_conn_encrypt(struct hci_conn *conn)
+/* Enable security */
+int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
{
BT_DBG("conn %p", conn);
+ if (sec_level == BT_SECURITY_SDP)
+ return 1;
+
+ if (sec_level == BT_SECURITY_LOW) {
+ if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0)
+ return hci_conn_auth(conn, sec_level, auth_type);
+ else
+ return 1;
+ }
+
if (conn->link_mode & HCI_LM_ENCRYPT)
- return hci_conn_auth(conn);
+ return hci_conn_auth(conn, sec_level, auth_type);
if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
return 0;
- if (hci_conn_auth(conn)) {
+ if (hci_conn_auth(conn, sec_level, auth_type)) {
struct hci_cp_set_conn_encrypt cp;
cp.handle = cpu_to_le16(conn->handle);
cp.encrypt = 1;
hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT,
sizeof(cp), &cp);
}
+
return 0;
}
-EXPORT_SYMBOL(hci_conn_encrypt);
+EXPORT_SYMBOL(hci_conn_security);
/* Change link key */
int hci_conn_change_link_key(struct hci_conn *conn)
@@ -442,12 +460,13 @@ int hci_conn_change_link_key(struct hci_conn *conn)
hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY,
sizeof(cp), &cp);
}
+
return 0;
}
EXPORT_SYMBOL(hci_conn_change_link_key);
/* Switch role */
-int hci_conn_switch_role(struct hci_conn *conn, uint8_t role)
+int hci_conn_switch_role(struct hci_conn *conn, __u8 role)
{
BT_DBG("conn %p", conn);
@@ -460,6 +479,7 @@ int hci_conn_switch_role(struct hci_conn *conn, uint8_t role)
cp.role = role;
hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp);
}
+
return 0;
}
EXPORT_SYMBOL(hci_conn_switch_role);
@@ -542,9 +562,7 @@ void hci_conn_hash_flush(struct hci_dev *hdev)
c->state = BT_CLOSED;
- hci_conn_del_sysfs(c);
-
- hci_proto_disconn_ind(c, 0x16);
+ hci_proto_disconn_cfm(c, 0x16);
hci_conn_del(c);
}
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ba78cc1eb8d9..cd061510b6bd 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1565,8 +1565,7 @@ static void hci_cmd_task(unsigned long arg)
/* Send queued commands */
if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) {
- if (hdev->sent_cmd)
- kfree_skb(hdev->sent_cmd);
+ kfree_skb(hdev->sent_cmd);
if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) {
atomic_dec(&hdev->cmd_cnt);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f91ba690f5d2..55534244c3a0 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,6 +484,15 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb
if (hdev->features[4] & LMP_EV5)
hdev->esco_type |= (ESCO_EV5);
+ if (hdev->features[5] & LMP_EDR_ESCO_2M)
+ hdev->esco_type |= (ESCO_2EV3);
+
+ if (hdev->features[5] & LMP_EDR_ESCO_3M)
+ hdev->esco_type |= (ESCO_3EV3);
+
+ if (hdev->features[5] & LMP_EDR_3S_ESCO)
+ hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
+
BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name,
hdev->features[0], hdev->features[1],
hdev->features[2], hdev->features[3],
@@ -914,7 +923,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s
if (ev->status) {
hci_proto_connect_cfm(conn, ev->status);
hci_conn_del(conn);
- }
+ } else if (ev->link_type != ACL_LINK)
+ hci_proto_connect_cfm(conn, ev->status);
unlock:
hci_dev_unlock(hdev);
@@ -1009,9 +1019,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff
if (conn) {
conn->state = BT_CLOSED;
- hci_conn_del_sysfs(conn);
-
- hci_proto_disconn_ind(conn, ev->reason);
+ hci_proto_disconn_cfm(conn, ev->reason);
hci_conn_del(conn);
}
@@ -1600,7 +1608,8 @@ static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_b
if (conn->state == BT_CONFIG) {
if (!ev->status && hdev->ssp_mode > 0 &&
- conn->ssp_mode > 0 && conn->out) {
+ conn->ssp_mode > 0 && conn->out &&
+ conn->sec_level != BT_SECURITY_SDP) {
struct hci_cp_auth_requested cp;
cp.handle = ev->handle;
hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED,
@@ -1637,6 +1646,13 @@ static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_bu
conn->type = SCO_LINK;
}
+ if (conn->out && ev->status == 0x1c && conn->attempt < 2) {
+ conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+ (hdev->esco_type & EDR_ESCO_MASK);
+ hci_setup_sync(conn, conn->link->handle);
+ goto unlock;
+ }
+
if (!ev->status) {
conn->handle = __le16_to_cpu(ev->handle);
conn->state = BT_CONNECTED;
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index b93748e224ff..ca4d3b40d5ce 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -50,9 +50,10 @@
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
-#define VERSION "2.11"
+#define VERSION "2.13"
-static u32 l2cap_feat_mask = 0x0000;
+static u32 l2cap_feat_mask = 0x0080;
+static u8 l2cap_fixed_chan[8] = { 0x02, };
static const struct proto_ops l2cap_sock_ops;
@@ -77,9 +78,10 @@ static void l2cap_sock_timeout(unsigned long arg)
bh_lock_sock(sk);
- if (sk->sk_state == BT_CONNECT &&
- (l2cap_pi(sk)->link_mode & (L2CAP_LM_AUTH |
- L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)))
+ if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONFIG)
+ reason = ECONNREFUSED;
+ else if (sk->sk_state == BT_CONNECT &&
+ l2cap_pi(sk)->sec_level != BT_SECURITY_SDP)
reason = ECONNREFUSED;
else
reason = ETIMEDOUT;
@@ -204,6 +206,8 @@ static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct so
BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid);
+ conn->disc_reason = 0x13;
+
l2cap_pi(sk)->conn = conn;
if (sk->sk_type == SOCK_SEQPACKET) {
@@ -259,18 +263,35 @@ static void l2cap_chan_del(struct sock *sk, int err)
}
/* Service level security */
-static inline int l2cap_check_link_mode(struct sock *sk)
+static inline int l2cap_check_security(struct sock *sk)
{
struct l2cap_conn *conn = l2cap_pi(sk)->conn;
+ __u8 auth_type;
- if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) ||
- (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE))
- return hci_conn_encrypt(conn->hcon);
+ if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
+ auth_type = HCI_AT_NO_BONDING_MITM;
+ else
+ auth_type = HCI_AT_NO_BONDING;
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH)
- return hci_conn_auth(conn->hcon);
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
+ } else {
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
+ }
- return 1;
+ return hci_conn_security(conn->hcon, l2cap_pi(sk)->sec_level,
+ auth_type);
}
static inline u8 l2cap_get_ident(struct l2cap_conn *conn)
@@ -312,7 +333,10 @@ static void l2cap_do_start(struct sock *sk)
struct l2cap_conn *conn = l2cap_pi(sk)->conn;
if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) {
- if (l2cap_check_link_mode(sk)) {
+ if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
+ return;
+
+ if (l2cap_check_security(sk)) {
struct l2cap_conn_req req;
req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
req.psm = l2cap_pi(sk)->psm;
@@ -356,7 +380,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
}
if (sk->sk_state == BT_CONNECT) {
- if (l2cap_check_link_mode(sk)) {
+ if (l2cap_check_security(sk)) {
struct l2cap_conn_req req;
req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
req.psm = l2cap_pi(sk)->psm;
@@ -371,10 +395,18 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
- if (l2cap_check_link_mode(sk)) {
- sk->sk_state = BT_CONFIG;
- rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ if (l2cap_check_security(sk)) {
+ if (bt_sk(sk)->defer_setup) {
+ struct sock *parent = bt_sk(sk)->parent;
+ rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+ rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND);
+ parent->sk_data_ready(parent, 0);
+
+ } else {
+ sk->sk_state = BT_CONFIG;
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ }
} else {
rsp.result = cpu_to_le16(L2CAP_CR_PEND);
rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND);
@@ -426,7 +458,7 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
read_lock(&l->lock);
for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE)
+ if (l2cap_pi(sk)->force_reliable)
sk->sk_err = err;
}
@@ -437,6 +469,7 @@ static void l2cap_info_timeout(unsigned long arg)
{
struct l2cap_conn *conn = (void *) arg;
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
conn->info_ident = 0;
l2cap_conn_start(conn);
@@ -470,6 +503,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status)
spin_lock_init(&conn->lock);
rwlock_init(&conn->chan_list.lock);
+ conn->disc_reason = 0x13;
+
return conn;
}
@@ -483,8 +518,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
- if (conn->rx_skb)
- kfree_skb(conn->rx_skb);
+ kfree_skb(conn->rx_skb);
/* Kill channels */
while ((sk = conn->chan_list.head)) {
@@ -608,7 +642,6 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
case BT_CONNECTED:
case BT_CONFIG:
- case BT_CONNECT2:
if (sk->sk_type == SOCK_SEQPACKET) {
struct l2cap_conn *conn = l2cap_pi(sk)->conn;
struct l2cap_disconn_req req;
@@ -624,6 +657,27 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
l2cap_chan_del(sk, reason);
break;
+ case BT_CONNECT2:
+ if (sk->sk_type == SOCK_SEQPACKET) {
+ struct l2cap_conn *conn = l2cap_pi(sk)->conn;
+ struct l2cap_conn_rsp rsp;
+ __u16 result;
+
+ if (bt_sk(sk)->defer_setup)
+ result = L2CAP_CR_SEC_BLOCK;
+ else
+ result = L2CAP_CR_BAD_PSM;
+
+ rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
+ rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
+ rsp.result = cpu_to_le16(result);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
+ L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+ } else
+ l2cap_chan_del(sk, reason);
+ break;
+
case BT_CONNECT:
case BT_DISCONN:
l2cap_chan_del(sk, reason);
@@ -653,13 +707,19 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
if (parent) {
sk->sk_type = parent->sk_type;
+ bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup;
+
pi->imtu = l2cap_pi(parent)->imtu;
pi->omtu = l2cap_pi(parent)->omtu;
- pi->link_mode = l2cap_pi(parent)->link_mode;
+ pi->sec_level = l2cap_pi(parent)->sec_level;
+ pi->role_switch = l2cap_pi(parent)->role_switch;
+ pi->force_reliable = l2cap_pi(parent)->force_reliable;
} else {
pi->imtu = L2CAP_DEFAULT_MTU;
pi->omtu = 0;
- pi->link_mode = 0;
+ pi->sec_level = BT_SECURITY_LOW;
+ pi->role_switch = 0;
+ pi->force_reliable = 0;
}
/* Default config options */
@@ -723,17 +783,24 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol)
return 0;
}
-static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
struct sock *sk = sock->sk;
- int err = 0;
+ struct sockaddr_l2 la;
+ int len, err = 0;
- BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm);
+ BT_DBG("sk %p", sk);
if (!addr || addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid)
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -741,7 +808,7 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_
goto done;
}
- if (la->l2_psm && btohs(la->l2_psm) < 0x1001 &&
+ if (la.l2_psm && btohs(la.l2_psm) < 0x1001 &&
!capable(CAP_NET_BIND_SERVICE)) {
err = -EACCES;
goto done;
@@ -749,14 +816,17 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_
write_lock_bh(&l2cap_sk_list.lock);
- if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) {
+ if (la.l2_psm && __l2cap_get_sock_by_addr(la.l2_psm, &la.l2_bdaddr)) {
err = -EADDRINUSE;
} else {
/* Save source address */
- bacpy(&bt_sk(sk)->src, &la->l2_bdaddr);
- l2cap_pi(sk)->psm = la->l2_psm;
- l2cap_pi(sk)->sport = la->l2_psm;
+ bacpy(&bt_sk(sk)->src, &la.l2_bdaddr);
+ l2cap_pi(sk)->psm = la.l2_psm;
+ l2cap_pi(sk)->sport = la.l2_psm;
sk->sk_state = BT_BOUND;
+
+ if (btohs(la.l2_psm) == 0x0001 || btohs(la.l2_psm) == 0x0003)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
}
write_unlock_bh(&l2cap_sk_list.lock);
@@ -776,7 +846,8 @@ static int l2cap_do_connect(struct sock *sk)
__u8 auth_type;
int err = 0;
- BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm);
+ BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst),
+ l2cap_pi(sk)->psm);
if (!(hdev = hci_get_route(dst, src)))
return -EHOSTUNREACH;
@@ -785,21 +856,42 @@ static int l2cap_do_connect(struct sock *sk)
err = -ENOMEM;
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH ||
- l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT ||
- l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) {
- if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001))
+ if (sk->sk_type == SOCK_RAW) {
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_DEDICATED_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_DEDICATED_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
+ } else if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
auth_type = HCI_AT_NO_BONDING_MITM;
else
- auth_type = HCI_AT_GENERAL_BONDING_MITM;
- } else {
- if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001))
auth_type = HCI_AT_NO_BONDING;
- else
+
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
+ } else {
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
+ }
}
- hcon = hci_connect(hdev, ACL_LINK, dst, auth_type);
+ hcon = hci_connect(hdev, ACL_LINK, dst,
+ l2cap_pi(sk)->sec_level, auth_type);
if (!hcon)
goto done;
@@ -835,20 +927,25 @@ done:
static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
{
- struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
struct sock *sk = sock->sk;
- int err = 0;
-
- lock_sock(sk);
+ struct sockaddr_l2 la;
+ int len, err = 0;
BT_DBG("sk %p", sk);
- if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) {
- err = -EINVAL;
- goto done;
- }
+ if (!addr || addr->sa_family != AF_BLUETOOTH)
+ return -EINVAL;
+
+ memset(&la, 0, sizeof(la));
+ len = min_t(unsigned int, sizeof(la), alen);
+ memcpy(&la, addr, len);
+
+ if (la.l2_cid)
+ return -EINVAL;
+
+ lock_sock(sk);
- if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) {
+ if (sk->sk_type == SOCK_SEQPACKET && !la.l2_psm) {
err = -EINVAL;
goto done;
}
@@ -875,8 +972,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
}
/* Set destination address and psm */
- bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr);
- l2cap_pi(sk)->psm = la->l2_psm;
+ bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
+ l2cap_pi(sk)->psm = la.l2_psm;
if ((err = l2cap_do_connect(sk)))
goto done;
@@ -1000,12 +1097,16 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l
addr->sa_family = AF_BLUETOOTH;
*len = sizeof(struct sockaddr_l2);
- if (peer)
+ if (peer) {
+ la->l2_psm = l2cap_pi(sk)->psm;
bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst);
- else
+ la->l2_cid = htobs(l2cap_pi(sk)->dcid);
+ } else {
+ la->l2_psm = l2cap_pi(sk)->sport;
bacpy(&la->l2_bdaddr, &bt_sk(sk)->src);
+ la->l2_cid = htobs(l2cap_pi(sk)->scid);
+ }
- la->l2_psm = l2cap_pi(sk)->psm;
return 0;
}
@@ -1106,11 +1207,38 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
return err;
}
-static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) {
+ struct l2cap_conn_rsp rsp;
+
+ sk->sk_state = BT_CONFIG;
+
+ rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
+ rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
+ rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+ l2cap_send_cmd(l2cap_pi(sk)->conn, l2cap_pi(sk)->ident,
+ L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+
+ release_sock(sk);
+ return 0;
+ }
+
+ release_sock(sk);
+
+ return bt_sock_recvmsg(iocb, sock, msg, len, flags);
+}
+
+static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen)
{
struct sock *sk = sock->sk;
struct l2cap_options opts;
- int err = 0, len;
+ int len, err = 0;
u32 opt;
BT_DBG("sk %p", sk);
@@ -1140,7 +1268,15 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
break;
}
- l2cap_pi(sk)->link_mode = opt;
+ if (opt & L2CAP_LM_AUTH)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_LOW;
+ if (opt & L2CAP_LM_ENCRYPT)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & L2CAP_LM_SECURE)
+ l2cap_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+ l2cap_pi(sk)->role_switch = (opt & L2CAP_LM_MASTER);
+ l2cap_pi(sk)->force_reliable = (opt & L2CAP_LM_RELIABLE);
break;
default:
@@ -1152,12 +1288,77 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
return err;
}
-static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ len = min_t(unsigned int, sizeof(sec), optlen);
+ if (copy_from_user((char *) &sec, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (sec.level < BT_SECURITY_LOW ||
+ sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
+ }
+
+ l2cap_pi(sk)->sec_level = sec.level;
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ bt_sk(sk)->defer_setup = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct l2cap_options opts;
struct l2cap_conninfo cinfo;
int len, err = 0;
+ u32 opt;
BT_DBG("sk %p", sk);
@@ -1180,12 +1381,36 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch
break;
case L2CAP_LM:
- if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval))
+ switch (l2cap_pi(sk)->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = L2CAP_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+ L2CAP_LM_SECURE;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (l2cap_pi(sk)->role_switch)
+ opt |= L2CAP_LM_MASTER;
+
+ if (l2cap_pi(sk)->force_reliable)
+ opt |= L2CAP_LM_RELIABLE;
+
+ if (put_user(opt, (u32 __user *) optval))
err = -EFAULT;
break;
case L2CAP_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
+ if (sk->sk_state != BT_CONNECTED &&
+ !(sk->sk_state == BT_CONNECT2 &&
+ bt_sk(sk)->defer_setup)) {
err = -ENOTCONN;
break;
}
@@ -1208,6 +1433,60 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch
return err;
}
+static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_L2CAP)
+ return l2cap_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = l2cap_pi(sk)->sec_level;
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
static int l2cap_sock_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
@@ -1270,11 +1549,6 @@ static void l2cap_chan_ready(struct sock *sk)
*/
parent->sk_data_ready(parent, 0);
}
-
- if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) {
- struct l2cap_conn *conn = l2cap_pi(sk)->conn;
- hci_conn_change_link_key(conn->hcon);
- }
}
/* Copy frame to all raw sockets on that connection */
@@ -1549,8 +1823,11 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd
if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) &&
cmd->ident == conn->info_ident) {
- conn->info_ident = 0;
del_timer(&conn->info_timer);
+
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
l2cap_conn_start(conn);
}
@@ -1580,6 +1857,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
/* Check if the ACL is secure enough (if not SDP) */
if (psm != cpu_to_le16(0x0001) &&
!hci_conn_check_link_mode(conn->hcon)) {
+ conn->disc_reason = 0x05;
result = L2CAP_CR_SEC_BLOCK;
goto response;
}
@@ -1621,11 +1899,18 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
l2cap_pi(sk)->ident = cmd->ident;
- if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) {
- if (l2cap_check_link_mode(sk)) {
- sk->sk_state = BT_CONFIG;
- result = L2CAP_CR_SUCCESS;
- status = L2CAP_CS_NO_INFO;
+ if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) {
+ if (l2cap_check_security(sk)) {
+ if (bt_sk(sk)->defer_setup) {
+ sk->sk_state = BT_CONNECT2;
+ result = L2CAP_CR_PEND;
+ status = L2CAP_CS_AUTHOR_PEND;
+ parent->sk_data_ready(parent, 0);
+ } else {
+ sk->sk_state = BT_CONFIG;
+ result = L2CAP_CR_SUCCESS;
+ status = L2CAP_CS_NO_INFO;
+ }
} else {
sk->sk_state = BT_CONNECT2;
result = L2CAP_CR_PEND;
@@ -1695,11 +1980,14 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
l2cap_pi(sk)->dcid = dcid;
l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
+ l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND;
+
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
l2cap_build_conf_req(sk, req), req);
break;
case L2CAP_CR_PEND:
+ l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND;
break;
default:
@@ -1908,6 +2196,14 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm
put_unaligned(cpu_to_le32(l2cap_feat_mask), (__le32 *) rsp->data);
l2cap_send_cmd(conn, cmd->ident,
L2CAP_INFO_RSP, sizeof(buf), buf);
+ } else if (type == L2CAP_IT_FIXED_CHAN) {
+ u8 buf[12];
+ struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+ rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+ rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+ memcpy(buf + 4, l2cap_fixed_chan, 8);
+ l2cap_send_cmd(conn, cmd->ident,
+ L2CAP_INFO_RSP, sizeof(buf), buf);
} else {
struct l2cap_info_rsp rsp;
rsp.type = cpu_to_le16(type);
@@ -1929,14 +2225,31 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm
BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
- conn->info_ident = 0;
-
del_timer(&conn->info_timer);
- if (type == L2CAP_IT_FEAT_MASK)
+ if (type == L2CAP_IT_FEAT_MASK) {
conn->feat_mask = get_unaligned_le32(rsp->data);
- l2cap_conn_start(conn);
+ if (conn->feat_mask & 0x0080) {
+ struct l2cap_info_req req;
+ req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+
+ conn->info_ident = l2cap_get_ident(conn);
+
+ l2cap_send_cmd(conn, conn->info_ident,
+ L2CAP_INFO_REQ, sizeof(req), &req);
+ } else {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
+ } else if (type == L2CAP_IT_FIXED_CHAN) {
+ conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+ conn->info_ident = 0;
+
+ l2cap_conn_start(conn);
+ }
return 0;
}
@@ -2143,10 +2456,15 @@ static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
continue;
if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) {
- lm1 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode);
+ lm1 |= HCI_LM_ACCEPT;
+ if (l2cap_pi(sk)->role_switch)
+ lm1 |= HCI_LM_MASTER;
exact++;
- } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
- lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode);
+ } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
+ lm2 |= HCI_LM_ACCEPT;
+ if (l2cap_pi(sk)->role_switch)
+ lm2 |= HCI_LM_MASTER;
+ }
}
read_unlock(&l2cap_sk_list.lock);
@@ -2172,89 +2490,48 @@ static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
return 0;
}
-static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason)
+static int l2cap_disconn_ind(struct hci_conn *hcon)
{
- BT_DBG("hcon %p reason %d", hcon, reason);
+ struct l2cap_conn *conn = hcon->l2cap_data;
- if (hcon->type != ACL_LINK)
- return 0;
+ BT_DBG("hcon %p", hcon);
- l2cap_conn_del(hcon, bt_err(reason));
+ if (hcon->type != ACL_LINK || !conn)
+ return 0x13;
- return 0;
+ return conn->disc_reason;
}
-static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status)
+static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
{
- struct l2cap_chan_list *l;
- struct l2cap_conn *conn = hcon->l2cap_data;
- struct sock *sk;
+ BT_DBG("hcon %p reason %d", hcon, reason);
- if (!conn)
+ if (hcon->type != ACL_LINK)
return 0;
- l = &conn->chan_list;
-
- BT_DBG("conn %p", conn);
-
- read_lock(&l->lock);
-
- for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
- bh_lock_sock(sk);
-
- if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) &&
- !(hcon->link_mode & HCI_LM_ENCRYPT) &&
- !status) {
- bh_unlock_sock(sk);
- continue;
- }
-
- if (sk->sk_state == BT_CONNECT) {
- if (!status) {
- struct l2cap_conn_req req;
- req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
- req.psm = l2cap_pi(sk)->psm;
-
- l2cap_pi(sk)->ident = l2cap_get_ident(conn);
-
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_REQ, sizeof(req), &req);
- } else {
- l2cap_sock_clear_timer(sk);
- l2cap_sock_set_timer(sk, HZ / 10);
- }
- } else if (sk->sk_state == BT_CONNECT2) {
- struct l2cap_conn_rsp rsp;
- __u16 result;
+ l2cap_conn_del(hcon, bt_err(reason));
- if (!status) {
- sk->sk_state = BT_CONFIG;
- result = L2CAP_CR_SUCCESS;
- } else {
- sk->sk_state = BT_DISCONN;
- l2cap_sock_set_timer(sk, HZ / 10);
- result = L2CAP_CR_SEC_BLOCK;
- }
+ return 0;
+}
- rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
- rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
- rsp.result = cpu_to_le16(result);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
- l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
- L2CAP_CONN_RSP, sizeof(rsp), &rsp);
- }
+static inline void l2cap_check_encryption(struct sock *sk, u8 encrypt)
+{
+ if (sk->sk_type != SOCK_SEQPACKET)
+ return;
- bh_unlock_sock(sk);
+ if (encrypt == 0x00) {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) {
+ l2cap_sock_clear_timer(sk);
+ l2cap_sock_set_timer(sk, HZ * 5);
+ } else if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
+ __l2cap_sock_close(sk, ECONNREFUSED);
+ } else {
+ if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM)
+ l2cap_sock_clear_timer(sk);
}
-
- read_unlock(&l->lock);
-
- return 0;
}
-static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
+static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
{
struct l2cap_chan_list *l;
struct l2cap_conn *conn = hcon->l2cap_data;
@@ -2270,15 +2547,16 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
read_lock(&l->lock);
for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
bh_lock_sock(sk);
- if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) &&
- (sk->sk_state == BT_CONNECTED ||
- sk->sk_state == BT_CONFIG) &&
- !status && encrypt == 0x00) {
- __l2cap_sock_close(sk, ECONNREFUSED);
+ if (l2cap_pi(sk)->conf_state & L2CAP_CONF_CONNECT_PEND) {
+ bh_unlock_sock(sk);
+ continue;
+ }
+
+ if (!status && (sk->sk_state == BT_CONNECTED ||
+ sk->sk_state == BT_CONFIG)) {
+ l2cap_check_encryption(sk, encrypt);
bh_unlock_sock(sk);
continue;
}
@@ -2376,7 +2654,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
goto drop;
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
- skb->len);
+ skb->len);
conn->rx_len = len - skb->len;
} else {
BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
@@ -2398,7 +2676,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
}
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
- skb->len);
+ skb->len);
conn->rx_len -= skb->len;
if (!conn->rx_len) {
@@ -2424,10 +2702,10 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
sk_for_each(sk, node, &l2cap_sk_list.head) {
struct l2cap_pinfo *pi = l2cap_pi(sk);
- str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n",
+ str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
sk->sk_state, btohs(pi->psm), pi->scid, pi->dcid,
- pi->imtu, pi->omtu, pi->link_mode);
+ pi->imtu, pi->omtu, pi->sec_level);
}
read_unlock_bh(&l2cap_sk_list.lock);
@@ -2447,7 +2725,7 @@ static const struct proto_ops l2cap_sock_ops = {
.accept = l2cap_sock_accept,
.getname = l2cap_sock_getname,
.sendmsg = l2cap_sock_sendmsg,
- .recvmsg = bt_sock_recvmsg,
+ .recvmsg = l2cap_sock_recvmsg,
.poll = bt_sock_poll,
.ioctl = bt_sock_ioctl,
.mmap = sock_no_mmap,
@@ -2469,8 +2747,8 @@ static struct hci_proto l2cap_hci_proto = {
.connect_ind = l2cap_connect_ind,
.connect_cfm = l2cap_connect_cfm,
.disconn_ind = l2cap_disconn_ind,
- .auth_cfm = l2cap_auth_cfm,
- .encrypt_cfm = l2cap_encrypt_cfm,
+ .disconn_cfm = l2cap_disconn_cfm,
+ .security_cfm = l2cap_security_cfm,
.recv_acldata = l2cap_recv_acldata
};
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index acd84fd524b8..1d0fb0f23c63 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -46,7 +46,7 @@
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/rfcomm.h>
-#define VERSION "1.10"
+#define VERSION "1.11"
static int disable_cfc = 0;
static int channel_mtu = -1;
@@ -223,19 +223,25 @@ static int rfcomm_l2sock_create(struct socket **sock)
return err;
}
-static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d)
+static inline int rfcomm_check_security(struct rfcomm_dlc *d)
{
struct sock *sk = d->session->sock->sk;
+ __u8 auth_type;
- if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) {
- if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon))
- return 1;
- } else if (d->link_mode & RFCOMM_LM_AUTH) {
- if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon))
- return 1;
+ switch (d->sec_level) {
+ case BT_SECURITY_HIGH:
+ auth_type = HCI_AT_GENERAL_BONDING_MITM;
+ break;
+ case BT_SECURITY_MEDIUM:
+ auth_type = HCI_AT_GENERAL_BONDING;
+ break;
+ default:
+ auth_type = HCI_AT_NO_BONDING;
+ break;
}
- return 0;
+ return hci_conn_security(l2cap_pi(sk)->conn->hcon, d->sec_level,
+ auth_type);
}
/* ---- RFCOMM DLCs ---- */
@@ -388,10 +394,10 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst,
d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc;
if (s->state == BT_CONNECTED) {
- if (rfcomm_check_link_mode(d))
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- else
+ if (rfcomm_check_security(d))
rfcomm_send_pn(s, 1, d);
+ else
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
}
rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
@@ -421,9 +427,16 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
d, d->state, d->dlci, err, s);
switch (d->state) {
- case BT_CONNECTED:
- case BT_CONFIG:
case BT_CONNECT:
+ case BT_CONFIG:
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ break;
+ }
+ /* Fall through */
+
+ case BT_CONNECTED:
d->state = BT_DISCONN;
if (skb_queue_empty(&d->tx_queue)) {
rfcomm_send_disc(s, d->dlci);
@@ -434,6 +447,15 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
}
break;
+ case BT_OPEN:
+ case BT_CONNECT2:
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+ rfcomm_schedule(RFCOMM_SCHED_AUTH);
+ break;
+ }
+ /* Fall through */
+
default:
rfcomm_dlc_clear_timer(d);
@@ -636,6 +658,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
bacpy(&addr.l2_bdaddr, src);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = 0;
+ addr.l2_cid = 0;
*err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
if (*err < 0)
goto failed;
@@ -657,6 +680,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
bacpy(&addr.l2_bdaddr, dst);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = htobs(RFCOMM_PSM);
+ addr.l2_cid = 0;
*err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
if (*err == 0 || *err == -EINPROGRESS)
return s;
@@ -1162,7 +1186,7 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
return 0;
}
-static void rfcomm_dlc_accept(struct rfcomm_dlc *d)
+void rfcomm_dlc_accept(struct rfcomm_dlc *d)
{
struct sock *sk = d->session->sock->sk;
@@ -1175,12 +1199,31 @@ static void rfcomm_dlc_accept(struct rfcomm_dlc *d)
d->state_change(d, 0);
rfcomm_dlc_unlock(d);
- if (d->link_mode & RFCOMM_LM_MASTER)
+ if (d->role_switch)
hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00);
rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
}
+static void rfcomm_check_accept(struct rfcomm_dlc *d)
+{
+ if (rfcomm_check_security(d)) {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
+ } else {
+ set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ }
+}
+
static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
{
struct rfcomm_dlc *d;
@@ -1203,11 +1246,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
if (d) {
if (d->state == BT_OPEN) {
/* DLC was previously opened by PN request */
- if (rfcomm_check_link_mode(d)) {
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- } else
- rfcomm_dlc_accept(d);
+ rfcomm_check_accept(d);
}
return 0;
}
@@ -1219,11 +1258,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
d->addr = __addr(s->initiator, dlci);
rfcomm_dlc_link(s, d);
- if (rfcomm_check_link_mode(d)) {
- set_bit(RFCOMM_AUTH_PENDING, &d->flags);
- rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- } else
- rfcomm_dlc_accept(d);
+ rfcomm_check_accept(d);
} else {
rfcomm_send_dm(s, dlci);
}
@@ -1637,11 +1672,12 @@ static void rfcomm_process_connect(struct rfcomm_session *s)
d = list_entry(p, struct rfcomm_dlc, list);
if (d->state == BT_CONFIG) {
d->mtu = s->mtu;
- if (rfcomm_check_link_mode(d)) {
+ if (rfcomm_check_security(d)) {
+ rfcomm_send_pn(s, 1, d);
+ } else {
set_bit(RFCOMM_AUTH_PENDING, &d->flags);
rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
- } else
- rfcomm_send_pn(s, 1, d);
+ }
}
}
}
@@ -1717,11 +1753,17 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
if (d->out) {
rfcomm_send_pn(s, 1, d);
rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
- } else
- rfcomm_dlc_accept(d);
- if (d->link_mode & RFCOMM_LM_SECURE) {
- struct sock *sk = s->sock->sk;
- hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon);
+ } else {
+ if (d->defer_setup) {
+ set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+ rfcomm_dlc_lock(d);
+ d->state = BT_CONNECT2;
+ d->state_change(d, 0);
+ rfcomm_dlc_unlock(d);
+ } else
+ rfcomm_dlc_accept(d);
}
continue;
} else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) {
@@ -1734,6 +1776,9 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
continue;
}
+ if (test_bit(RFCOMM_SEC_PENDING, &d->flags))
+ continue;
+
if (test_bit(RFCOMM_TX_THROTTLED, &s->flags))
continue;
@@ -1876,6 +1921,7 @@ static int rfcomm_add_listener(bdaddr_t *ba)
bacpy(&addr.l2_bdaddr, ba);
addr.l2_family = AF_BLUETOOTH;
addr.l2_psm = htobs(RFCOMM_PSM);
+ addr.l2_cid = 0;
err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
if (err < 0) {
BT_ERR("Bind failed %d", err);
@@ -1947,42 +1993,7 @@ static int rfcomm_run(void *unused)
return 0;
}
-static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status)
-{
- struct rfcomm_session *s;
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
-
- BT_DBG("conn %p status 0x%02x", conn, status);
-
- s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst);
- if (!s)
- return;
-
- rfcomm_session_hold(s);
-
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
-
- if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) &&
- !(conn->link_mode & HCI_LM_ENCRYPT) && !status)
- continue;
-
- if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
- continue;
-
- if (!status)
- set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
- else
- set_bit(RFCOMM_AUTH_REJECT, &d->flags);
- }
-
- rfcomm_session_put(s);
-
- rfcomm_schedule(RFCOMM_SCHED_AUTH);
-}
-
-static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
+static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
{
struct rfcomm_session *s;
struct rfcomm_dlc *d;
@@ -1999,18 +2010,29 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
list_for_each_safe(p, n, &s->dlcs) {
d = list_entry(p, struct rfcomm_dlc, list);
- if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) &&
- (d->state == BT_CONNECTED ||
- d->state == BT_CONFIG) &&
- !status && encrypt == 0x00) {
- __rfcomm_dlc_close(d, ECONNREFUSED);
- continue;
+ if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
+ rfcomm_dlc_clear_timer(d);
+ if (status || encrypt == 0x00) {
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
+ }
+
+ if (d->state == BT_CONNECTED && !status && encrypt == 0x00) {
+ if (d->sec_level == BT_SECURITY_MEDIUM) {
+ set_bit(RFCOMM_SEC_PENDING, &d->flags);
+ rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+ continue;
+ } else if (d->sec_level == BT_SECURITY_HIGH) {
+ __rfcomm_dlc_close(d, ECONNREFUSED);
+ continue;
+ }
}
if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
continue;
- if (!status && encrypt)
+ if (!status)
set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
else
set_bit(RFCOMM_AUTH_REJECT, &d->flags);
@@ -2023,8 +2045,7 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
static struct hci_cb rfcomm_cb = {
.name = "RFCOMM",
- .auth_cfm = rfcomm_auth_cfm,
- .encrypt_cfm = rfcomm_encrypt_cfm
+ .security_cfm = rfcomm_security_cfm
};
static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf)
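The rfcomm_session_create() and rfcomm_add_listener() hunks above now zero the new l2_cid member of struct sockaddr_l2 before binding/connecting the internal L2CAP socket. Userspace L2CAP callers want the same treatment; a hedged sketch, assuming the BlueZ <bluetooth/l2cap.h> header, with a made-up peer address and PSM:

/* Sketch: zero the whole sockaddr_l2 (including the new l2_cid field)
 * before connecting, as the RFCOMM core now does in-kernel.
 * Assumes BlueZ headers; "01:02:03:04:05:06" and PSM 0x1001 are examples. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/l2cap.h>

int main(void)
{
	struct sockaddr_l2 addr;
	int fd;

	fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);

	memset(&addr, 0, sizeof(addr));		/* leaves l2_cid at 0 */
	addr.l2_family = AF_BLUETOOTH;
	addr.l2_psm = htobs(0x1001);		/* example PSM */
	str2ba("01:02:03:04:05:06", &addr.l2_bdaddr);

	connect(fd, (struct sockaddr *)&addr, sizeof(addr));

	close(fd);
	return 0;
}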
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d3fc6fca38d0..7f482784e9f7 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -261,12 +261,19 @@ static void rfcomm_sock_init(struct sock *sk, struct sock *parent)
if (parent) {
sk->sk_type = parent->sk_type;
- pi->link_mode = rfcomm_pi(parent)->link_mode;
+ pi->dlc->defer_setup = bt_sk(parent)->defer_setup;
+
+ pi->sec_level = rfcomm_pi(parent)->sec_level;
+ pi->role_switch = rfcomm_pi(parent)->role_switch;
} else {
- pi->link_mode = 0;
+ pi->dlc->defer_setup = 0;
+
+ pi->sec_level = BT_SECURITY_LOW;
+ pi->role_switch = 0;
}
- pi->dlc->link_mode = pi->link_mode;
+ pi->dlc->sec_level = pi->sec_level;
+ pi->dlc->role_switch = pi->role_switch;
}
static struct proto rfcomm_proto = {
@@ -406,7 +413,8 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr);
rfcomm_pi(sk)->channel = sa->rc_channel;
- d->link_mode = rfcomm_pi(sk)->link_mode;
+ d->sec_level = rfcomm_pi(sk)->sec_level;
+ d->role_switch = rfcomm_pi(sk)->role_switch;
err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel);
if (!err)
@@ -554,6 +562,9 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb;
int sent = 0;
+ if (test_bit(RFCOMM_DEFER_SETUP, &d->flags))
+ return -ENOTCONN;
+
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -570,8 +581,11 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE,
msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
+ if (!skb) {
+ if (sent == 0)
+ sent = err;
break;
+ }
skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
@@ -630,10 +644,16 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags)
{
struct sock *sk = sock->sk;
+ struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
int err = 0;
size_t target, copied = 0;
long timeo;
+ if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+ rfcomm_dlc_accept(d);
+ return 0;
+ }
+
if (flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -710,7 +730,7 @@ out:
return copied ? : err;
}
-static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen)
{
struct sock *sk = sock->sk;
int err = 0;
@@ -727,7 +747,14 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
break;
}
- rfcomm_pi(sk)->link_mode = opt;
+ if (opt & RFCOMM_LM_AUTH)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW;
+ if (opt & RFCOMM_LM_ENCRYPT)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+ if (opt & RFCOMM_LM_SECURE)
+ rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+ rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER);
break;
default:
@@ -739,12 +766,76 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
return err;
}
-static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+ u32 opt;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = BT_SECURITY_LOW;
+
+ len = min_t(unsigned int, sizeof(sec), optlen);
+ if (copy_from_user((char *) &sec, optval, len)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (sec.level > BT_SECURITY_HIGH) {
+ err = -EINVAL;
+ break;
+ }
+
+ rfcomm_pi(sk)->sec_level = sec.level;
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (get_user(opt, (u32 __user *) optval)) {
+ err = -EFAULT;
+ break;
+ }
+
+ bt_sk(sk)->defer_setup = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct sock *l2cap_sk;
struct rfcomm_conninfo cinfo;
int len, err = 0;
+ u32 opt;
BT_DBG("sk %p", sk);
@@ -755,12 +846,32 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
switch (optname) {
case RFCOMM_LM:
- if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval))
+ switch (rfcomm_pi(sk)->sec_level) {
+ case BT_SECURITY_LOW:
+ opt = RFCOMM_LM_AUTH;
+ break;
+ case BT_SECURITY_MEDIUM:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;
+ break;
+ case BT_SECURITY_HIGH:
+ opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+ RFCOMM_LM_SECURE;
+ break;
+ default:
+ opt = 0;
+ break;
+ }
+
+ if (rfcomm_pi(sk)->role_switch)
+ opt |= RFCOMM_LM_MASTER;
+
+ if (put_user(opt, (u32 __user *) optval))
err = -EFAULT;
break;
case RFCOMM_CONNINFO:
- if (sk->sk_state != BT_CONNECTED) {
+ if (sk->sk_state != BT_CONNECTED &&
+ !rfcomm_pi(sk)->dlc->defer_setup) {
err = -ENOTCONN;
break;
}
@@ -785,6 +896,60 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
return err;
}
+static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct bt_security sec;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_RFCOMM)
+ return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (level != SOL_BLUETOOTH)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SECURITY:
+ if (sk->sk_type != SOCK_STREAM) {
+ err = -EINVAL;
+ break;
+ }
+
+ sec.level = rfcomm_pi(sk)->sec_level;
+
+ len = min_t(unsigned int, len, sizeof(sec));
+ if (copy_to_user(optval, (char *) &sec, len))
+ err = -EFAULT;
+
+ break;
+
+ case BT_DEFER_SETUP:
+ if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
+ err = -EFAULT;
+
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk __maybe_unused = sock->sk;
@@ -888,6 +1053,10 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
done:
bh_unlock_sock(parent);
+
+ if (bt_sk(parent)->defer_setup)
+ parent->sk_state_change(parent);
+
return result;
}
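On the socket side, the old RFCOMM_LM link-mode bits are remapped onto BT_SECURITY levels, and BT_DEFER_SETUP lets a listener postpone the final channel setup until the first read on the accepted socket (which is why rfcomm_sock_recvmsg() now calls rfcomm_dlc_accept() and returns 0 for a deferred connection). A minimal userspace sketch, assuming the BlueZ headers; older libbluetooth versions may not expose BT_SECURITY/BT_DEFER_SETUP, hence the fallback defines:

/* Sketch: RFCOMM server using BT_SECURITY and BT_DEFER_SETUP.
 * Assumes BlueZ headers; the fallback constants/struct mirror the
 * kernel definitions of this generation. Error handling trimmed. */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/rfcomm.h>

#ifndef BT_SECURITY
#define BT_SECURITY		4
struct bt_security { uint8_t level; };
#define BT_SECURITY_MEDIUM	2
#endif
#ifndef BT_DEFER_SETUP
#define BT_DEFER_SETUP		7
#endif

int main(void)
{
	struct sockaddr_rc la, ra;
	struct bt_security sec = { .level = BT_SECURITY_MEDIUM };
	socklen_t alen = sizeof(ra);
	uint32_t defer = 1;
	int srv, cli;
	char c;

	srv = socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM);

	/* Require encryption (medium security) instead of RFCOMM_LM bits */
	setsockopt(srv, SOL_BLUETOOTH, BT_SECURITY, &sec, sizeof(sec));

	memset(&la, 0, sizeof(la));
	la.rc_family = AF_BLUETOOTH;
	bacpy(&la.rc_bdaddr, BDADDR_ANY);
	la.rc_channel = 11;			/* example channel */
	bind(srv, (struct sockaddr *)&la, sizeof(la));

	/* BT_DEFER_SETUP is only accepted on a bound or listening socket */
	setsockopt(srv, SOL_BLUETOOTH, BT_DEFER_SETUP, &defer, sizeof(defer));
	listen(srv, 1);

	cli = accept(srv, (struct sockaddr *)&ra, &alen);

	/* With deferred setup the first read authorizes the connection;
	 * it returns 0 and only then is the RFCOMM channel established. */
	read(cli, &c, 1);

	close(cli);
	close(srv);
	return 0;
}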
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 46fd8bf9a690..51ae0c3e470a 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -195,7 +195,7 @@ static int sco_connect(struct sock *sk)
else
type = SCO_LINK;
- hcon = hci_connect(hdev, type, dst, HCI_AT_NO_BONDING);
+ hcon = hci_connect(hdev, type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
if (!hcon)
goto done;
@@ -668,7 +668,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
return err;
}
-static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct sco_options opts;
@@ -723,6 +723,31 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
return err;
}
+static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ int len, err = 0;
+
+ BT_DBG("sk %p", sk);
+
+ if (level == SOL_SCO)
+ return sco_sock_getsockopt_old(sock, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ switch (optname) {
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
static int sco_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -832,10 +857,30 @@ done:
/* ----- SCO interface with lower layer (HCI) ----- */
static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type)
{
+ register struct sock *sk;
+ struct hlist_node *node;
+ int lm = 0;
+
+ if (type != SCO_LINK && type != ESCO_LINK)
+ return 0;
+
BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
- /* Always accept connection */
- return HCI_LM_ACCEPT;
+ /* Find listening sockets */
+ read_lock(&sco_sk_list.lock);
+ sk_for_each(sk, node, &sco_sk_list.head) {
+ if (sk->sk_state != BT_LISTEN)
+ continue;
+
+ if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) ||
+ !bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
+ lm |= HCI_LM_ACCEPT;
+ break;
+ }
+ }
+ read_unlock(&sco_sk_list.lock);
+
+ return lm;
}
static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
@@ -857,7 +902,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
return 0;
}
-static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason)
+static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
{
BT_DBG("hcon %p reason %d", hcon, reason);
@@ -940,7 +985,7 @@ static struct hci_proto sco_hci_proto = {
.id = HCI_PROTO_SCO,
.connect_ind = sco_connect_ind,
.connect_cfm = sco_connect_cfm,
- .disconn_ind = sco_disconn_ind,
+ .disconn_cfm = sco_disconn_cfm,
.recv_scodata = sco_recv_scodata
};
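sco_connect_ind() no longer accepts every incoming SCO/eSCO request: it only returns HCI_LM_ACCEPT when some SCO socket is listening on the local adapter address or on BDADDR_ANY. A hedged sketch of such a listener, assuming the BlueZ <bluetooth/sco.h> header:

/* Sketch: listening SCO socket so incoming SCO/eSCO requests are accepted.
 * Assumes BlueZ headers; error handling trimmed. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/sco.h>

int main(void)
{
	struct sockaddr_sco la, ra;
	socklen_t alen = sizeof(ra);
	int srv, cli;

	srv = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO);

	memset(&la, 0, sizeof(la));
	la.sco_family = AF_BLUETOOTH;
	bacpy(&la.sco_bdaddr, BDADDR_ANY);	/* matches any local adapter */
	bind(srv, (struct sockaddr *)&la, sizeof(la));
	listen(srv, 1);

	/* Without a socket like this in BT_LISTEN state, sco_connect_ind()
	 * now returns 0 and the incoming connection is rejected. */
	cli = accept(srv, (struct sockaddr *)&ra, &alen);

	close(cli);
	close(srv);
	return 0;
}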
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ba7be195803c..fcffb3fb1177 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
diff --git a/net/can/af_can.c b/net/can/af_can.c
index d90e8dd975fc..547bafc79e28 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -273,8 +273,7 @@ int can_send(struct sk_buff *skb, int loop)
err = net_xmit_errno(err);
if (err) {
- if (newskb)
- kfree_skb(newskb);
+ kfree_skb(newskb);
return err;
}
diff --git a/net/core/Makefile b/net/core/Makefile
index 26a37cb31923..796f46eece5f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_NET_DMA) += user_dma.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 5e2ac0c4b07c..d0de644b378d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
- kfree_skb(skb);
+ consume_skb(skb);
sk_mem_reclaim_partial(sk);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index d393fc997cd9..052dd478d3e1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,14 +135,6 @@
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
-enum {
- GRO_MERGED,
- GRO_MERGED_FREE,
- GRO_HELD,
- GRO_NORMAL,
- GRO_DROP,
-};
-
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -1672,23 +1664,12 @@ static int dev_gso_segment(struct sk_buff *skb)
return 0;
}
-static void tstamp_tx(struct sk_buff *skb)
-{
- union skb_shared_tx *shtx =
- skb_tx(skb);
- if (unlikely(shtx->software &&
- !shtx->in_progress)) {
- skb_tstamp_tx(skb, NULL);
- }
-}
-
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
const struct net_device_ops *ops = dev->netdev_ops;
int rc;
- prefetch(&dev->netdev_ops->ndo_start_xmit);
if (likely(!skb->next)) {
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
@@ -1715,8 +1696,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
* the skb destructor before the call and restoring it
* afterwards, then doing the skb_orphan() ourselves?
*/
- if (likely(!rc))
- tstamp_tx(skb);
return rc;
}
@@ -1732,7 +1711,6 @@ gso:
skb->next = nskb;
return rc;
}
- tstamp_tx(skb);
if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -1745,17 +1723,11 @@ out_kfree_skb:
}
static u32 skb_tx_hashrnd;
-static int skb_tx_hashrnd_initialized = 0;
-static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
+u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
u32 hash;
- if (unlikely(!skb_tx_hashrnd_initialized)) {
- get_random_bytes(&skb_tx_hashrnd, 4);
- skb_tx_hashrnd_initialized = 1;
- }
-
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
} else if (skb->sk && skb->sk->sk_hash) {
@@ -1767,6 +1739,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
+EXPORT_SYMBOL(skb_tx_hash);
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
struct sk_buff *skb)
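skb_tx_hash() is now exported and its random seed is set once at late_initcall time instead of lazily; queue selection itself is a multiply-and-shift that maps a 32-bit hash onto [0, real_num_tx_queues) without a division. A standalone sketch of just that arithmetic (the jhash step and skb_tx_hashrnd are assumed, not shown):

/* Sketch of the multiply-shift queue selection used by skb_tx_hash():
 * queue = (hash * nqueues) / 2^32, computed in 64 bits. */
#include <stdint.h>
#include <stdio.h>

static uint16_t pick_tx_queue(uint32_t hash, uint16_t real_num_tx_queues)
{
	return (uint16_t)(((uint64_t)hash * real_num_tx_queues) >> 32);
}

int main(void)
{
	/* Any 32-bit hash lands in [0, nqueues); e.g. with 8 queues: */
	printf("%u\n", pick_tx_queue(0x00000000u, 8));	/* 0 */
	printf("%u\n", pick_tx_queue(0x80000000u, 8));	/* 4 */
	printf("%u\n", pick_tx_queue(0xffffffffu, 8));	/* 7 */
	return 0;
}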
@@ -2273,12 +2246,6 @@ int netif_receive_skb(struct sk_buff *skb)
rcu_read_lock();
- /* Don't receive packets in an exiting network namespace */
- if (!net_alive(dev_net(skb->dev))) {
- kfree_skb(skb);
- goto out;
- }
-
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -2499,6 +2466,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff *p;
+ if (netpoll_rx_on(skb))
+ return GRO_NORMAL;
+
for (p = napi->gro_list; p; p = p->next) {
NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
skb_mac_header(p), skb_gro_mac_header(skb));
@@ -2657,9 +2627,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
local_irq_disable();
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
- __napi_complete(napi);
local_irq_enable();
- break;
+ napi_complete(napi);
+ goto out;
}
local_irq_enable();
@@ -2668,6 +2638,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
napi_gro_flush(napi);
+out:
return work;
}
@@ -2741,7 +2712,7 @@ void netif_napi_del(struct napi_struct *napi)
struct sk_buff *skb, *next;
list_del_init(&napi->dev_list);
- kfree(napi->skb);
+ kfree_skb(napi->skb);
for (skb = napi->gro_list; skb; skb = next) {
next = skb->next;
@@ -4355,6 +4326,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
}
EXPORT_SYMBOL(netdev_fix_features);
+/* Some devices need to (re-)set their netdev_ops inside
+ * ->init() or similar. If that happens, we have to setup
+ * the compat pointers again.
+ */
+void netdev_resync_ops(struct net_device *dev)
+{
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ dev->init = ops->ndo_init;
+ dev->uninit = ops->ndo_uninit;
+ dev->open = ops->ndo_open;
+ dev->change_rx_flags = ops->ndo_change_rx_flags;
+ dev->set_rx_mode = ops->ndo_set_rx_mode;
+ dev->set_multicast_list = ops->ndo_set_multicast_list;
+ dev->set_mac_address = ops->ndo_set_mac_address;
+ dev->validate_addr = ops->ndo_validate_addr;
+ dev->do_ioctl = ops->ndo_do_ioctl;
+ dev->set_config = ops->ndo_set_config;
+ dev->change_mtu = ops->ndo_change_mtu;
+ dev->neigh_setup = ops->ndo_neigh_setup;
+ dev->tx_timeout = ops->ndo_tx_timeout;
+ dev->get_stats = ops->ndo_get_stats;
+ dev->vlan_rx_register = ops->ndo_vlan_rx_register;
+ dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
+ dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ dev->poll_controller = ops->ndo_poll_controller;
+#endif
+#endif
+}
+EXPORT_SYMBOL(netdev_resync_ops);
+
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -4399,27 +4403,7 @@ int register_netdevice(struct net_device *dev)
* This is temporary until all network devices are converted.
*/
if (dev->netdev_ops) {
- const struct net_device_ops *ops = dev->netdev_ops;
-
- dev->init = ops->ndo_init;
- dev->uninit = ops->ndo_uninit;
- dev->open = ops->ndo_open;
- dev->change_rx_flags = ops->ndo_change_rx_flags;
- dev->set_rx_mode = ops->ndo_set_rx_mode;
- dev->set_multicast_list = ops->ndo_set_multicast_list;
- dev->set_mac_address = ops->ndo_set_mac_address;
- dev->validate_addr = ops->ndo_validate_addr;
- dev->do_ioctl = ops->ndo_do_ioctl;
- dev->set_config = ops->ndo_set_config;
- dev->change_mtu = ops->ndo_change_mtu;
- dev->tx_timeout = ops->ndo_tx_timeout;
- dev->get_stats = ops->ndo_get_stats;
- dev->vlan_rx_register = ops->ndo_vlan_rx_register;
- dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
- dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
-#ifdef CONFIG_NET_POLL_CONTROLLER
- dev->poll_controller = ops->ndo_poll_controller;
-#endif
+ netdev_resync_ops(dev);
} else {
char drivername[64];
pr_info("%s (%s): not using net_device_ops yet\n",
@@ -5291,6 +5275,14 @@ out:
subsys_initcall(net_dev_init);
+static int __init initialize_hashrnd(void)
+{
+ get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+ return 0;
+}
+
+late_initcall_sync(initialize_hashrnd);
+
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 000000000000..9fd0dc3cca99
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,263 @@
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <net/genetlink.h>
+
+#include <trace/skb.h>
+
+#include <asm/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+static void send_dm_alert(struct work_struct *unused);
+
+
+/*
+ * Globals, our netlink socket pointer
+ * and the work handle that will send up
+ * netlink alerts
+ */
+struct sock *dm_sock;
+
+struct per_cpu_dm_data {
+ struct work_struct dm_alert_work;
+ struct sk_buff *skb;
+ atomic_t dm_hit_count;
+ struct timer_list send_timer;
+};
+
+static struct genl_family net_drop_monitor_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = "NET_DM",
+ .version = 1,
+ .maxattr = NET_DM_CMD_MAX,
+};
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+
+static int dm_hit_limit = 64;
+static int dm_delay = 1;
+
+
+static void reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+ size_t al;
+ struct net_dm_alert_msg *msg;
+
+ al = sizeof(struct net_dm_alert_msg);
+ al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ data->skb = genlmsg_new(al, GFP_KERNEL);
+ genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
+ 0, NET_DM_CMD_ALERT);
+ msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg));
+ memset(msg, 0, al);
+ atomic_set(&data->dm_hit_count, dm_hit_limit);
+}
+
+static void send_dm_alert(struct work_struct *unused)
+{
+ struct sk_buff *skb;
+ struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+ /*
+ * Grab the skb we're about to send
+ */
+ skb = data->skb;
+
+ /*
+ * Replace it with a new one
+ */
+ reset_per_cpu_data(data);
+
+ /*
+ * Ship it!
+ */
+ genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
+
+}
+
+/*
+ * This is the timer function to delay the sending of an alert
+ * in the event that more drops will arrive during the
+ * hysteresis period. Note that it operates under the timer interrupt
+ * so we don't need to disable preemption here
+ */
+static void sched_send_work(unsigned long unused)
+{
+ struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+ schedule_work(&data->dm_alert_work);
+}
+
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ struct net_dm_alert_msg *msg;
+ struct nlmsghdr *nlh;
+ int i;
+ struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+
+ if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
+ /*
+ * we're already at zero, discard this hit
+ */
+ goto out;
+ }
+
+ nlh = (struct nlmsghdr *)data->skb->data;
+ msg = genlmsg_data(nlmsg_data(nlh));
+ for (i = 0; i < msg->entries; i++) {
+ if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
+ msg->points[i].count++;
+ goto out;
+ }
+ }
+
+ /*
+ * We need to create a new entry
+ */
+ __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+ memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
+ msg->points[msg->entries].count = 1;
+ msg->entries++;
+
+ if (!timer_pending(&data->send_timer)) {
+ data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer_on(&data->send_timer, smp_processor_id());
+ }
+
+out:
+ return;
+}
+
+static int set_all_monitor_traces(int state)
+{
+ int rc = 0;
+
+ switch (state) {
+ case TRACE_ON:
+ rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ break;
+ case TRACE_OFF:
+ rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+
+ tracepoint_synchronize_unregister();
+ break;
+ default:
+ rc = 1;
+ break;
+ }
+
+ if (rc)
+ return -EINPROGRESS;
+ return rc;
+}
+
+
+static int net_dm_cmd_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return -ENOTSUPP;
+}
+
+static int net_dm_cmd_trace(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ switch (info->genlhdr->cmd) {
+ case NET_DM_CMD_START:
+ return set_all_monitor_traces(TRACE_ON);
+ break;
+ case NET_DM_CMD_STOP:
+ return set_all_monitor_traces(TRACE_OFF);
+ break;
+ }
+
+ return -ENOTSUPP;
+}
+
+
+static struct genl_ops dropmon_ops[] = {
+ {
+ .cmd = NET_DM_CMD_CONFIG,
+ .doit = net_dm_cmd_config,
+ },
+ {
+ .cmd = NET_DM_CMD_START,
+ .doit = net_dm_cmd_trace,
+ },
+ {
+ .cmd = NET_DM_CMD_STOP,
+ .doit = net_dm_cmd_trace,
+ },
+};
+
+static int __init init_net_drop_monitor(void)
+{
+ int cpu;
+ int rc, i, ret;
+ struct per_cpu_dm_data *data;
+	printk(KERN_INFO "Initializing network drop monitor service\n");
+
+ if (sizeof(void *) > 8) {
+ printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n");
+ return -ENOSPC;
+ }
+
+ if (genl_register_family(&net_drop_monitor_family) < 0) {
+ printk(KERN_ERR "Could not create drop monitor netlink family\n");
+ return -EFAULT;
+ }
+
+ rc = -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) {
+ ret = genl_register_ops(&net_drop_monitor_family,
+ &dropmon_ops[i]);
+ if (ret) {
+ printk(KERN_CRIT "failed to register operation %d\n",
+ dropmon_ops[i].cmd);
+ goto out_unreg;
+ }
+ }
+
+ rc = 0;
+
+ for_each_present_cpu(cpu) {
+ data = &per_cpu(dm_cpu_data, cpu);
+ reset_per_cpu_data(data);
+ INIT_WORK(&data->dm_alert_work, send_dm_alert);
+ init_timer(&data->send_timer);
+ data->send_timer.data = cpu;
+ data->send_timer.function = sched_send_work;
+ }
+ goto out;
+
+out_unreg:
+ genl_unregister_family(&net_drop_monitor_family);
+out:
+ return rc;
+}
+
+late_initcall(init_net_drop_monitor);
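The alert path multicasts NET_DM_CMD_ALERT messages on the NET_DM generic netlink family; because the payload is reserved with __nla_reserve_nohdr(), a receiver finds a bare struct net_dm_alert_msg followed by one net_dm_drop_point per distinct drop location directly after the genetlink header. A minimal decoding sketch, assuming <linux/net_dropmon.h> and that buf already points past the netlink and genetlink headers:

/* Sketch: walk a NET_DM_CMD_ALERT payload. Assumes <linux/net_dropmon.h>;
 * buf points just past the nlmsghdr + genlmsghdr of the received message. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <linux/net_dropmon.h>

static void dump_alert(const void *buf)
{
	const struct net_dm_alert_msg *msg = buf;
	uint32_t i;

	for (i = 0; i < msg->entries; i++) {
		uint64_t pc = 0;

		/* pc[] holds the raw kfree_skb() return address */
		memcpy(&pc, msg->points[i].pc, sizeof(pc));
		printf("drop point %#llx hit %u time(s)\n",
		       (unsigned long long)pc, msg->points[i].count);
	}
}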
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 947710a36ced..244ca56dffac 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -209,34 +209,62 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
return 0;
}
-static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr)
+static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
{
struct ethtool_rxnfc cmd;
- if (!dev->ethtool_ops->set_rxhash)
+ if (!dev->ethtool_ops->set_rxnfc)
return -EOPNOTSUPP;
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
- return dev->ethtool_ops->set_rxhash(dev, &cmd);
+ return dev->ethtool_ops->set_rxnfc(dev, &cmd);
}
-static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr)
+static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
{
struct ethtool_rxnfc info;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int ret;
+ void *rule_buf = NULL;
- if (!dev->ethtool_ops->get_rxhash)
+ if (!ops->get_rxnfc)
return -EOPNOTSUPP;
if (copy_from_user(&info, useraddr, sizeof(info)))
return -EFAULT;
- dev->ethtool_ops->get_rxhash(dev, &info);
+ if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+ if (info.rule_cnt > 0) {
+ rule_buf = kmalloc(info.rule_cnt * sizeof(u32),
+ GFP_USER);
+ if (!rule_buf)
+ return -ENOMEM;
+ }
+ }
+ ret = ops->get_rxnfc(dev, &info, rule_buf);
+ if (ret < 0)
+ goto err_out;
+
+ ret = -EFAULT;
if (copy_to_user(useraddr, &info, sizeof(info)))
- return -EFAULT;
- return 0;
+ goto err_out;
+
+ if (rule_buf) {
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ if (copy_to_user(useraddr, rule_buf,
+ info.rule_cnt * sizeof(u32)))
+ goto err_out;
+ }
+ ret = 0;
+
+err_out:
+ if (rule_buf)
+ kfree(rule_buf);
+
+ return ret;
}
static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
@@ -901,6 +929,10 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GFLAGS:
case ETHTOOL_GPFLAGS:
case ETHTOOL_GRXFH:
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
break;
default:
if (!capable(CAP_NET_ADMIN))
@@ -1052,10 +1084,16 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
dev->ethtool_ops->set_priv_flags);
break;
case ETHTOOL_GRXFH:
- rc = ethtool_get_rxhash(dev, useraddr);
+ case ETHTOOL_GRXRINGS:
+ case ETHTOOL_GRXCLSRLCNT:
+ case ETHTOOL_GRXCLSRULE:
+ case ETHTOOL_GRXCLSRLALL:
+ rc = ethtool_get_rxnfc(dev, useraddr);
break;
case ETHTOOL_SRXFH:
- rc = ethtool_set_rxhash(dev, useraddr);
+ case ETHTOOL_SRXCLSRLDEL:
+ case ETHTOOL_SRXCLSRLINS:
+ rc = ethtool_set_rxnfc(dev, useraddr);
break;
case ETHTOOL_GGRO:
rc = ethtool_get_gro(dev, useraddr);
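All of ETHTOOL_GRXRINGS, ETHTOOL_GRXCLSRLCNT, ETHTOOL_GRXCLSRULE and ETHTOOL_GRXCLSRLALL now funnel through the single get_rxnfc hook, with GRXCLSRLALL additionally copying rule_cnt rule locations back after the fixed struct. From userspace these are ordinary SIOCETHTOOL requests; a hedged sketch querying the RX ring count ("eth0" is just an example name, and struct ethtool_rxnfc must be available in the installed <linux/ethtool.h>):

/* Sketch: query the RX ring count through the rxnfc path (ETHTOOL_GRXRINGS). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_rxnfc nfc;
	struct ifreq ifr;
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* example device */

	memset(&nfc, 0, sizeof(nfc));
	nfc.cmd = ETHTOOL_GRXRINGS;
	ifr.ifr_data = (void *)&nfc;

	if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("%s: %llu RX rings\n", ifr.ifr_name,
		       (unsigned long long)nfc.data);

	close(fd);
	return 0;
}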
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 32b3a0152d7a..98691e1466b8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule,
goto errout;
}
- err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, ops->nlgroup, err);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 278a142d1047..a1cbce7fdae5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -871,8 +871,7 @@ static void neigh_timer_handler(unsigned long arg)
write_unlock(&neigh->lock);
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
} else {
out:
write_unlock(&neigh->lock);
@@ -908,8 +907,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
@@ -1656,7 +1654,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+ if (ndm->ndm_flags & NTF_USE) {
+ neigh_event_send(neigh, NULL);
+ err = 0;
+ } else
+ err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
neigh_release(neigh);
goto out_dev_put;
}
@@ -2534,7 +2536,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6ac29a46e23e..2da59a0ac4ac 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -77,7 +77,9 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
if (endp == buf)
goto err;
- rtnl_lock();
+ if (!rtnl_trylock())
+ return -ERESTARTSYS;
+
if (dev_isalive(net)) {
if ((ret = (*set)(net, new)) == 0)
ret = len;
@@ -496,7 +498,7 @@ int netdev_register_kobject(struct net_device *net)
dev->groups = groups;
BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ);
- dev_set_name(dev, net->name);
+ dev_set_name(dev, "%s", net->name);
#ifdef CONFIG_SYSFS
*groups++ = &netstat_group;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 000000000000..c8fb45665e4f
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,29 @@
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <trace/skb.h>
+
+#include <asm/unaligned.h>
+#include <asm/bitops.h>
+
+
+DEFINE_TRACE(kfree_skb);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
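DEFINE_TRACE()/EXPORT_TRACEPOINT_SYMBOL_GPL() make the kfree_skb tracepoint available to other code through the generated register_trace_kfree_skb()/unregister_trace_kfree_skb() helpers, which is exactly how drop_monitor.c above attaches. A minimal module-style sketch of the same pattern (API as of this kernel generation; the printk is only for illustration):

/* Minimal module-style sketch of attaching to the kfree_skb tracepoint,
 * mirroring the registration done by drop_monitor.c. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <trace/skb.h>

static void count_kfree_skb(struct sk_buff *skb, void *location)
{
	/* location is the caller of kfree_skb(); just log it here */
	printk(KERN_DEBUG "skb %p freed from %pS\n", skb, location);
}

static int __init skbtrace_init(void)
{
	return register_trace_kfree_skb(count_kfree_skb);
}

static void __exit skbtrace_exit(void)
{
	unregister_trace_kfree_skb(count_kfree_skb);
	tracepoint_synchronize_unregister();
}

module_init(skbtrace_init);
module_exit(skbtrace_exit);
MODULE_LICENSE("GPL");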
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 55151faaf90c..e3bebd36f053 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -32,24 +32,14 @@ static __net_init int setup_net(struct net *net)
{
/* Must be called with net_mutex held */
struct pernet_operations *ops;
- int error;
- struct net_generic *ng;
+ int error = 0;
atomic_set(&net->count, 1);
+
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&net->use_count, 0);
#endif
- error = -ENOMEM;
- ng = kzalloc(sizeof(struct net_generic) +
- INITIAL_NET_GEN_PTRS * sizeof(void *), GFP_KERNEL);
- if (ng == NULL)
- goto out;
-
- ng->len = INITIAL_NET_GEN_PTRS;
- rcu_assign_pointer(net->gen, ng);
-
- error = 0;
list_for_each_entry(ops, &pernet_list, list) {
if (ops->init) {
error = ops->init(net);
@@ -70,24 +60,50 @@ out_undo:
}
rcu_barrier();
- kfree(ng);
goto out;
}
+static struct net_generic *net_alloc_generic(void)
+{
+ struct net_generic *ng;
+ size_t generic_size = sizeof(struct net_generic) +
+ INITIAL_NET_GEN_PTRS * sizeof(void *);
+
+ ng = kzalloc(generic_size, GFP_KERNEL);
+ if (ng)
+ ng->len = INITIAL_NET_GEN_PTRS;
+
+ return ng;
+}
+
#ifdef CONFIG_NET_NS
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;
static struct net *net_alloc(void)
{
- return kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ struct net *net = NULL;
+ struct net_generic *ng;
+
+ ng = net_alloc_generic();
+ if (!ng)
+ goto out;
+
+ net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+ if (!net)
+ goto out_free;
+
+ rcu_assign_pointer(net->gen, ng);
+out:
+ return net;
+
+out_free:
+ kfree(ng);
+ goto out;
}
static void net_free(struct net *net)
{
- if (!net)
- return;
-
#ifdef NETNS_REFCNT_DEBUG
if (unlikely(atomic_read(&net->use_count) != 0)) {
printk(KERN_EMERG "network namespace not free! Usage: %d\n",
@@ -112,27 +128,28 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
err = -ENOMEM;
new_net = net_alloc();
if (!new_net)
- goto out;
+ goto out_err;
mutex_lock(&net_mutex);
err = setup_net(new_net);
- if (err)
- goto out_unlock;
-
- rtnl_lock();
- list_add_tail(&new_net->list, &net_namespace_list);
- rtnl_unlock();
-
-
-out_unlock:
+ if (!err) {
+ rtnl_lock();
+ list_add_tail(&new_net->list, &net_namespace_list);
+ rtnl_unlock();
+ }
mutex_unlock(&net_mutex);
+
+ if (err)
+ goto out_free;
out:
put_net(old_net);
- if (err) {
- net_free(new_net);
- new_net = ERR_PTR(err);
- }
return new_net;
+
+out_free:
+ net_free(new_net);
+out_err:
+ new_net = ERR_PTR(err);
+ goto out;
}
static void cleanup_net(struct work_struct *work)
@@ -140,9 +157,6 @@ static void cleanup_net(struct work_struct *work)
struct pernet_operations *ops;
struct net *net;
- /* Be very certain incoming network packets will not find us */
- rcu_barrier();
-
net = container_of(work, struct net, work);
mutex_lock(&net_mutex);
@@ -188,6 +202,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
static int __init net_ns_init(void)
{
+ struct net_generic *ng;
int err;
printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
@@ -202,6 +217,12 @@ static int __init net_ns_init(void)
panic("Could not create netns workq");
#endif
+ ng = net_alloc_generic();
+ if (!ng)
+ panic("Could not allocate generic netns");
+
+ rcu_assign_pointer(init_net.gen, ng);
+
mutex_lock(&net_mutex);
err = setup_net(&init_net);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 65498483325a..32d419f5ac98 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3275,8 +3275,7 @@ static void pktgen_stop(struct pktgen_thread *t)
list_for_each_entry(pkt_dev, &t->if_list, list) {
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
}
@@ -3303,8 +3302,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
if (!cur->removal_mark)
continue;
- if (cur->skb)
- kfree_skb(cur->skb);
+ kfree_skb(cur->skb);
cur->skb = NULL;
pktgen_remove_device(t, cur);
@@ -3328,8 +3326,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
- if (cur->skb)
- kfree_skb(cur->skb);
+ kfree_skb(cur->skb);
cur->skb = NULL;
pktgen_remove_device(t, cur);
@@ -3393,8 +3390,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (!netif_running(odev)) {
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
goto out;
}
@@ -3415,8 +3411,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
if ((++pkt_dev->clone_count >= pkt_dev->clone_skb)
|| (!pkt_dev->skb)) {
/* build a new pkt */
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = fill_packet(odev, pkt_dev);
if (pkt_dev->skb == NULL) {
@@ -3498,8 +3493,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* Done with this */
pktgen_stop_device(pkt_dev);
- if (pkt_dev->skb)
- kfree_skb(pkt_dev->skb);
+ kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
}
out:;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 790dd205bb5d..d78030f88bd0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
return nlmsg_unicast(rtnl, skb, pid);
}
-int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
- struct nlmsghdr *nlh, gfp_t flags)
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+ struct nlmsghdr *nlh, gfp_t flags)
{
struct sock *rtnl = net->rtnl;
int report = 0;
@@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
if (nlh)
report = nlmsg_report(nlh);
- return nlmsg_notify(rtnl, skb, pid, group, report, flags);
+ nlmsg_notify(rtnl, skb, pid, group, report, flags);
}
void rtnl_set_sk_err(struct net *net, u32 group, int error)
@@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e5a8351ff12d..6acbf9e79eb1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,6 +65,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
+#include <trace/skb.h>
#include "kmap_skb.h"
@@ -146,14 +147,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
}
EXPORT_SYMBOL(skb_under_panic);
-void skb_truesize_bug(struct sk_buff *skb)
-{
- WARN(net_ratelimit(), KERN_ERR "SKB BUG: Invalid truesize (%u) "
- "len=%u, sizeof(sk_buff)=%Zd\n",
- skb->truesize, skb->len, sizeof(struct sk_buff));
-}
-EXPORT_SYMBOL(skb_truesize_bug);
-
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
@@ -450,11 +443,32 @@ void kfree_skb(struct sk_buff *skb)
smp_rmb();
else if (likely(!atomic_dec_and_test(&skb->users)))
return;
+ trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);
/**
+ * consume_skb - free an skbuff
+ * @skb: buffer to free
+ *
+ * Drop a ref to the buffer and free it if the usage count has hit zero
+ * Functions identically to kfree_skb, but kfree_skb assumes that the frame
+ * is being dropped after a failure and notes that
+ */
+void consume_skb(struct sk_buff *skb)
+{
+ if (unlikely(!skb))
+ return;
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
+
+/**
* skb_recycle_check - check if skb can be reused for receive
* @skb: buffer
* @skb_size: minimum receive buffer size
@@ -1216,8 +1230,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
insp = list;
}
if (!pskb_pull(list, eat)) {
- if (clone)
- kfree_skb(clone);
+ kfree_skb(clone);
return NULL;
}
break;
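consume_skb() frees a buffer exactly like kfree_skb() but is meant for the "packet was handled fine" path, so that only genuine drops hit the new kfree_skb tracepoint and show up in drop monitoring (skb_free_datagram() above is switched over for the same reason). A hedged, driver-style sketch of the intended split; hw_queue_full() and hw_queue_skb() are made-up stand-ins, not real APIs:

/* Illustrative kernel-style sketch of the kfree_skb()/consume_skb() split.
 * hw_queue_full()/hw_queue_skb() are placeholders for real hardware access. */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static bool hw_queue_full(struct net_device *dev) { return false; }
static void hw_queue_skb(struct net_device *dev, struct sk_buff *skb) { }

static int example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (hw_queue_full(dev)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);		/* real drop: fires the kfree_skb tracepoint */
		return NETDEV_TX_OK;
	}

	hw_queue_skb(dev, skb);
	return NETDEV_TX_OK;
}

static void example_tx_complete(struct sk_buff *skb)
{
	consume_skb(skb);		/* normal completion: stays out of drop reports */
}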
diff --git a/net/core/sock.c b/net/core/sock.c
index 40887e76652c..0620046e4eba 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -150,7 +150,7 @@ static const char *af_family_key_strings[AF_MAX+1] = {
"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
"sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
"sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
+ "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
"sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
@@ -165,7 +165,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
"slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
"slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" ,
+ "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
"slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
"slock-27" , "slock-28" , "slock-AF_CAN" ,
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
@@ -180,7 +180,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
"clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
"clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
- "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" ,
+ "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
"clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
"clock-27" , "clock-28" , "clock-AF_CAN" ,
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
@@ -725,7 +725,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len < 0)
return -EINVAL;
- v.val = 0;
+ memset(&v, 0, sizeof(v));
switch(optname) {
case SO_DEBUG:
@@ -1185,7 +1185,6 @@ void sock_rfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- skb_truesize_check(skb);
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_uncharge(skb->sk, skb->truesize);
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 83d3398559ea..7db1de0497c6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,6 +11,7 @@
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/init.h>
+#include <net/ip.h>
#include <net/sock.h>
static struct ctl_table net_core_table[] = {
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 45f95e55f873..7ea557b7c6b1 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -20,6 +20,9 @@
/* We can spread an ack vector across multiple options */
#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN 16
+
#define DCCP_ACKVEC_STATE_RECEIVED 0
#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 08a569ff02d1..d6bc47363b1c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -63,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
* - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
* Hence a safe upper bound for the maximum option length is 1020-28 = 992
*/
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int))
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
#define DCCP_MAX_PACKET_HDR 28
#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
+
#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 22a618af4893..36bcc00654d3 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
u32 ccmps = dccp_determine_ccmps(dp);
- int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
+ u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
/* Account for header lengths and IPv4/v6 option overhead */
cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
/*
- * FIXME: this should come from the CCID infrastructure, where, say,
- * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
- * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
- * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
- * make it a multiple of 4
+ * Leave enough headroom for common DCCP header options.
+ * This only considers options which may appear on DCCP-Data packets, as
+ * per table 3 in RFC 4340, 5.8. When running out of space for other
+ * options (eg. Ack Vector which can take up to 255 bytes), it is better
+ * to schedule a separate Ack. Thus we leave headroom for the following:
+ * - 1 byte for Slow Receiver (11.6)
+ * - 6 bytes for Timestamp (13.1)
+ * - 10 bytes for Timestamp Echo (13.3)
+ * - 8 bytes for NDP count (7.7, when activated)
+ * - 6 bytes for Data Checksum (9.3)
+ * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
*/
-
- cur_mps -= roundup(5 + 6 + 10 + 6 + 6 + 6, 4);
+ cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+ (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
@@ -270,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block)
const int len = skb->len;
if (sk->sk_state == DCCP_PARTOPEN) {
- /* See 8.1.5. Handshake Completion */
+ const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+ /*
+ * See 8.1.5 - Handshake Completion.
+ *
+ * For robustness we resend Confirm options until the client has
+ * entered OPEN. During the initial feature negotiation, the MPS
+ * is smaller than usual, reduced by the Change/Confirm options.
+ */
+ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+ DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+ dccp_send_ack(sk);
+ dccp_feat_list_purge(&dp->dccps_featneg);
+ }
+
inet_csk_schedule_ack(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
inet_csk(sk)->icsk_rto,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 12bf7d4c16c6..9647d911f916 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1246,11 +1246,12 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case TIOCINQ:
lock_sock(sk);
- if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) {
+ skb = skb_peek(&scp->other_receive_queue);
+ if (skb) {
amount = skb->len;
} else {
- struct sk_buff *skb = sk->sk_receive_queue.next;
- for(;;) {
+ skb = sk->sk_receive_queue.next;
+ for (;;) {
if (skb ==
(struct sk_buff *)&sk->sk_receive_queue)
break;
@@ -1579,16 +1580,16 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us
default:
#ifdef CONFIG_NETFILTER
{
- int val, len;
+ int ret, len;
if(get_user(len, optlen))
return -EFAULT;
- val = nf_getsockopt(sk, PF_DECnet, optname,
+ ret = nf_getsockopt(sk, PF_DECnet, optname,
optval, &len);
- if (val >= 0)
- val = put_user(len, optlen);
- return val;
+ if (ret >= 0)
+ ret = put_user(len, optlen);
+ return ret;
}
#endif
case DSO_STREAM:
@@ -2071,8 +2072,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
}
out:
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
release_sock(sk);
@@ -2112,9 +2112,8 @@ static struct notifier_block dn_dev_notifier = {
extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
-static struct packet_type dn_dix_packet_type = {
+static struct packet_type dn_dix_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_DNA_RT),
- .dev = NULL, /* All devices */
.func = dn_route_rcv,
};
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index daf2b98b15fe..1c6a5bb6f0c8 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -684,7 +684,6 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
return -ENODEV;
if ((dn_db = dev->dn_ptr) == NULL) {
- int err;
dn_db = dn_dev_create(dev, &err);
if (!dn_db)
return err;
@@ -769,7 +768,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
+ rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err);
@@ -1322,6 +1322,7 @@ static inline int is_dn_dev(struct net_device *dev)
}
static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&dev_base_lock)
{
int i;
struct net_device *dev;
@@ -1364,6 +1365,7 @@ static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void dn_dev_seq_stop(struct seq_file *seq, void *v)
+ __releases(&dev_base_lock)
{
read_unlock(&dev_base_lock);
}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5130dee0b384..0cc4394117df 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -380,7 +380,6 @@ static int dn_return_short(struct sk_buff *skb)
unsigned char *ptr;
__le16 *src;
__le16 *dst;
- __le16 tmp;
/* Add back headers */
skb_push(skb, skb->data - skb_network_header(skb));
@@ -399,10 +398,7 @@ static int dn_return_short(struct sk_buff *skb)
ptr += 2;
*ptr = 0; /* Zero hop count */
- /* Swap source and destination */
- tmp = *src;
- *src = *dst;
- *dst = tmp;
+ swap(*src, *dst);
skb->pkt_type = PACKET_OUTGOING;
dn_rt_finish_output(skb, NULL, NULL);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 69ad9280c693..67054b0d550f 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
+ rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err);
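This hunk and the dn_ifaddr_notify() one above (plus the ipv4 devinet/fib_semantics hunks below) stop assigning rtnl_notify()'s result and return right after the call, leaving the errout label to the allocation/fill failure paths only; this tracks a tree-wide change in which rtnl_notify()'s return value is no longer meaningful to callers. A sketch of the resulting pattern, with placeholder message-filling and an illustrative multicast group:

static void example_notify(struct net *net, u32 pid, struct nlmsghdr *nlh)
{
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	/* ... fill the netlink message; on failure set err and goto errout ... */

	rtnl_notify(skb, net, pid, RTNLGRP_LINK, nlh, GFP_KERNEL);
	return;
errout:
	rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}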
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 965397af9a80..5bcd592ae6dd 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -179,7 +179,7 @@ static int dn_node_address_handler(ctl_table *table, int write,
}
if (write) {
- int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
+ len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
if (copy_from_user(addr, buffer, len))
return -EFAULT;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 49211b35725b..c51b55400dc5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -41,13 +41,13 @@ config NET_DSA_MV88E6XXX_NEED_PPU
default n
config NET_DSA_MV88E6131
- bool "Marvell 88E6131 ethernet switch chip support"
+ bool "Marvell 88E6095/6095F/6131 ethernet switch chip support"
select NET_DSA_MV88E6XXX
select NET_DSA_MV88E6XXX_NEED_PPU
select NET_DSA_TAG_DSA
---help---
- This enables support for the Marvell 88E6131 ethernet switch
- chip.
+ This enables support for the Marvell 88E6095/6095F/6131
+ ethernet switch chips.
config NET_DSA_MV88E6123_61_65
bool "Marvell 88E6123/6161/6165 ethernet switch chip support"
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 33e99462023a..71489f69a42c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1,6 +1,6 @@
/*
* net/dsa/dsa.c - Hardware switch handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -67,12 +67,13 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name)
/* basic switch operations **************************************************/
static struct dsa_switch *
-dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
- struct mii_bus *bus, struct net_device *dev)
+dsa_switch_setup(struct dsa_switch_tree *dst, int index,
+ struct device *parent, struct mii_bus *bus)
{
+ struct dsa_chip_data *pd = dst->pd->chip + index;
+ struct dsa_switch_driver *drv;
struct dsa_switch *ds;
int ret;
- struct dsa_switch_driver *drv;
char *name;
int i;
@@ -81,11 +82,12 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
*/
drv = dsa_switch_probe(bus, pd->sw_addr, &name);
if (drv == NULL) {
- printk(KERN_ERR "%s: could not detect attached switch\n",
- dev->name);
+ printk(KERN_ERR "%s[%d]: could not detect attached switch\n",
+ dst->master_netdev->name, index);
return ERR_PTR(-EINVAL);
}
- printk(KERN_INFO "%s: detected a %s switch\n", dev->name, name);
+ printk(KERN_INFO "%s[%d]: detected a %s switch\n",
+ dst->master_netdev->name, index, name);
/*
@@ -95,18 +97,16 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
if (ds == NULL)
return ERR_PTR(-ENOMEM);
- ds->pd = pd;
- ds->master_netdev = dev;
- ds->master_mii_bus = bus;
-
+ ds->dst = dst;
+ ds->index = index;
+ ds->pd = dst->pd->chip + index;
ds->drv = drv;
- ds->tag_protocol = drv->tag_protocol;
+ ds->master_mii_bus = bus;
/*
* Validate supplied switch configuration.
*/
- ds->cpu_port = -1;
for (i = 0; i < DSA_MAX_PORTS; i++) {
char *name;
@@ -115,32 +115,28 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
continue;
if (!strcmp(name, "cpu")) {
- if (ds->cpu_port != -1) {
+ if (dst->cpu_switch != -1) {
printk(KERN_ERR "multiple cpu ports?!\n");
ret = -EINVAL;
goto out;
}
- ds->cpu_port = i;
+ dst->cpu_switch = index;
+ dst->cpu_port = i;
+ } else if (!strcmp(name, "dsa")) {
+ ds->dsa_port_mask |= 1 << i;
} else {
- ds->valid_port_mask |= 1 << i;
+ ds->phys_port_mask |= 1 << i;
}
}
- if (ds->cpu_port == -1) {
- printk(KERN_ERR "no cpu port?!\n");
- ret = -EINVAL;
- goto out;
- }
-
/*
- * If we use a tagging format that doesn't have an ethertype
- * field, make sure that all packets from this point on get
- * sent to the tag format's receive function. (Which will
- * discard received packets until we set ds->ports[] below.)
+ * If the CPU connects to this switch, set the switch tree
+ * tagging protocol to the preferred tagging format of this
+ * switch.
*/
- wmb();
- dev->dsa_ptr = (void *)ds;
+ if (ds->dst->cpu_switch == index)
+ ds->dst->tag_protocol = drv->tag_protocol;
/*
@@ -150,7 +146,7 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
if (ret < 0)
goto out;
- ret = drv->set_addr(ds, dev->dev_addr);
+ ret = drv->set_addr(ds, dst->master_netdev->dev_addr);
if (ret < 0)
goto out;
@@ -169,18 +165,18 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
/*
* Create network devices for physical switch ports.
*/
- wmb();
for (i = 0; i < DSA_MAX_PORTS; i++) {
struct net_device *slave_dev;
- if (!(ds->valid_port_mask & (1 << i)))
+ if (!(ds->phys_port_mask & (1 << i)))
continue;
slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]);
if (slave_dev == NULL) {
- printk(KERN_ERR "%s: can't create dsa slave "
- "device for port %d(%s)\n",
- dev->name, i, pd->port_names[i]);
+ printk(KERN_ERR "%s[%d]: can't create dsa "
+ "slave device for port %d(%s)\n",
+ dst->master_netdev->name,
+ index, i, pd->port_names[i]);
continue;
}
@@ -192,7 +188,6 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
out_free:
mdiobus_free(ds->slave_mii_bus);
out:
- dev->dsa_ptr = NULL;
kfree(ds);
return ERR_PTR(ret);
}
@@ -212,35 +207,42 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
*/
bool dsa_uses_dsa_tags(void *dsa_ptr)
{
- struct dsa_switch *ds = dsa_ptr;
+ struct dsa_switch_tree *dst = dsa_ptr;
- return !!(ds->tag_protocol == htons(ETH_P_DSA));
+ return !!(dst->tag_protocol == htons(ETH_P_DSA));
}
bool dsa_uses_trailer_tags(void *dsa_ptr)
{
- struct dsa_switch *ds = dsa_ptr;
+ struct dsa_switch_tree *dst = dsa_ptr;
- return !!(ds->tag_protocol == htons(ETH_P_TRAILER));
+ return !!(dst->tag_protocol == htons(ETH_P_TRAILER));
}
/* link polling *************************************************************/
static void dsa_link_poll_work(struct work_struct *ugly)
{
- struct dsa_switch *ds;
+ struct dsa_switch_tree *dst;
+ int i;
+
+ dst = container_of(ugly, struct dsa_switch_tree, link_poll_work);
- ds = container_of(ugly, struct dsa_switch, link_poll_work);
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
- ds->drv->poll_link(ds);
- mod_timer(&ds->link_poll_timer, round_jiffies(jiffies + HZ));
+ if (ds != NULL && ds->drv->poll_link != NULL)
+ ds->drv->poll_link(ds);
+ }
+
+ mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ));
}
-static void dsa_link_poll_timer(unsigned long _ds)
+static void dsa_link_poll_timer(unsigned long _dst)
{
- struct dsa_switch *ds = (void *)_ds;
+ struct dsa_switch_tree *dst = (void *)_dst;
- schedule_work(&ds->link_poll_work);
+ schedule_work(&dst->link_poll_work);
}
@@ -303,18 +305,14 @@ static int dsa_probe(struct platform_device *pdev)
static int dsa_version_printed;
struct dsa_platform_data *pd = pdev->dev.platform_data;
struct net_device *dev;
- struct mii_bus *bus;
- struct dsa_switch *ds;
+ struct dsa_switch_tree *dst;
+ int i;
if (!dsa_version_printed++)
printk(KERN_NOTICE "Distributed Switch Architecture "
"driver version %s\n", dsa_driver_version);
- if (pd == NULL || pd->mii_bus == NULL || pd->netdev == NULL)
- return -EINVAL;
-
- bus = dev_to_mii_bus(pd->mii_bus);
- if (bus == NULL)
+ if (pd == NULL || pd->netdev == NULL)
return -EINVAL;
dev = dev_to_net_device(pd->netdev);
@@ -326,36 +324,79 @@ static int dsa_probe(struct platform_device *pdev)
return -EEXIST;
}
- ds = dsa_switch_setup(&pdev->dev, pd, bus, dev);
- if (IS_ERR(ds)) {
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (dst == NULL) {
dev_put(dev);
- return PTR_ERR(ds);
+ return -ENOMEM;
}
- if (ds->drv->poll_link != NULL) {
- INIT_WORK(&ds->link_poll_work, dsa_link_poll_work);
- init_timer(&ds->link_poll_timer);
- ds->link_poll_timer.data = (unsigned long)ds;
- ds->link_poll_timer.function = dsa_link_poll_timer;
- ds->link_poll_timer.expires = round_jiffies(jiffies + HZ);
- add_timer(&ds->link_poll_timer);
+ platform_set_drvdata(pdev, dst);
+
+ dst->pd = pd;
+ dst->master_netdev = dev;
+ dst->cpu_switch = -1;
+ dst->cpu_port = -1;
+
+ for (i = 0; i < pd->nr_chips; i++) {
+ struct mii_bus *bus;
+ struct dsa_switch *ds;
+
+ bus = dev_to_mii_bus(pd->chip[i].mii_bus);
+ if (bus == NULL) {
+ printk(KERN_ERR "%s[%d]: no mii bus found for "
+ "dsa switch\n", dev->name, i);
+ continue;
+ }
+
+ ds = dsa_switch_setup(dst, i, &pdev->dev, bus);
+ if (IS_ERR(ds)) {
+ printk(KERN_ERR "%s[%d]: couldn't create dsa switch "
+ "instance (error %ld)\n", dev->name, i,
+ PTR_ERR(ds));
+ continue;
+ }
+
+ dst->ds[i] = ds;
+ if (ds->drv->poll_link != NULL)
+ dst->link_poll_needed = 1;
}
- platform_set_drvdata(pdev, ds);
+ /*
+ * If we use a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point on get
+ * sent to the tag format's receive function.
+ */
+ wmb();
+ dev->dsa_ptr = (void *)dst;
+
+ if (dst->link_poll_needed) {
+ INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
+ init_timer(&dst->link_poll_timer);
+ dst->link_poll_timer.data = (unsigned long)dst;
+ dst->link_poll_timer.function = dsa_link_poll_timer;
+ dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
+ add_timer(&dst->link_poll_timer);
+ }
return 0;
}
static int dsa_remove(struct platform_device *pdev)
{
- struct dsa_switch *ds = platform_get_drvdata(pdev);
+ struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+ int i;
- if (ds->drv->poll_link != NULL)
- del_timer_sync(&ds->link_poll_timer);
+ if (dst->link_poll_needed)
+ del_timer_sync(&dst->link_poll_timer);
flush_scheduled_work();
- dsa_switch_destroy(ds);
+ for (i = 0; i < dst->pd->nr_chips; i++) {
+ struct dsa_switch *ds = dst->ds[i];
+
+ if (ds != NULL)
+ dsa_switch_destroy(ds);
+ }
return 0;
}
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7063378a1ebf..41055f33d28a 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -1,6 +1,6 @@
/*
* net/dsa/dsa_priv.h - Hardware switch handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,42 +19,107 @@
struct dsa_switch {
/*
- * Configuration data for the platform device that owns
- * this dsa switch instance.
+ * Parent switch tree, and switch index.
*/
- struct dsa_platform_data *pd;
+ struct dsa_switch_tree *dst;
+ int index;
/*
- * References to network device and mii bus to use.
+ * Configuration data for this switch.
*/
- struct net_device *master_netdev;
- struct mii_bus *master_mii_bus;
+ struct dsa_chip_data *pd;
/*
- * The used switch driver and frame tagging type.
+ * The used switch driver.
*/
struct dsa_switch_driver *drv;
- __be16 tag_protocol;
+
+ /*
+ * Reference to mii bus to use.
+ */
+ struct mii_bus *master_mii_bus;
/*
* Slave mii_bus and devices for the individual ports.
*/
- int cpu_port;
- u32 valid_port_mask;
- struct mii_bus *slave_mii_bus;
- struct net_device *ports[DSA_MAX_PORTS];
+ u32 dsa_port_mask;
+ u32 phys_port_mask;
+ struct mii_bus *slave_mii_bus;
+ struct net_device *ports[DSA_MAX_PORTS];
+};
+
+struct dsa_switch_tree {
+ /*
+ * Configuration data for the platform device that owns
+ * this dsa switch tree instance.
+ */
+ struct dsa_platform_data *pd;
+
+ /*
+ * Reference to network device to use, and which tagging
+ * protocol to use.
+ */
+ struct net_device *master_netdev;
+ __be16 tag_protocol;
+
+ /*
+ * The switch and port to which the CPU is attached.
+ */
+ s8 cpu_switch;
+ s8 cpu_port;
/*
* Link state polling.
*/
- struct work_struct link_poll_work;
- struct timer_list link_poll_timer;
+ int link_poll_needed;
+ struct work_struct link_poll_work;
+ struct timer_list link_poll_timer;
+
+ /*
+ * Data for the individual switch chips.
+ */
+ struct dsa_switch *ds[DSA_MAX_SWITCHES];
};
+static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
+{
+ return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port);
+}
+
+static inline u8 dsa_upstream_port(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+
+ /*
+ * If this is the root switch (i.e. the switch that connects
+ * to the CPU), return the cpu port number on this switch.
+ * Else return the (DSA) port number that connects to the
+ * switch that is one hop closer to the cpu.
+ */
+ if (dst->cpu_switch == ds->index)
+ return dst->cpu_port;
+ else
+ return ds->pd->rtable[dst->cpu_switch];
+}
+
struct dsa_slave_priv {
+ /*
+ * The linux network interface corresponding to this
+ * switch port.
+ */
struct net_device *dev;
+
+ /*
+ * Which switch this port is a part of, and the port index
+ * for this port.
+ */
struct dsa_switch *parent;
- int port;
+ u8 port;
+
+ /*
+ * The phylib phy_device pointer for the PHY connected
+ * to this port.
+ */
struct phy_device *phy;
};
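With the per-tree fields split out into struct dsa_switch_tree, each chip reaches the CPU through dsa_upstream_port(): the chip that carries the CPU port answers with dst->cpu_port, any other chip answers with its routing-table entry (pd->rtable[]) for the CPU's switch index. A hypothetical two-chip layout, with values invented purely for illustration:

/*
 * Example (made-up numbers): chip 0 has the CPU on its port 10, and its
 * port 8 is a DSA link to chip 1's port 9.
 *
 *	dst->cpu_switch = 0, dst->cpu_port = 10
 *	chip 0: pd->rtable[1] = 8	("to reach switch 1, go out port 8")
 *	chip 1: pd->rtable[0] = 9	("to reach switch 0, go out port 9")
 *
 *	dsa_upstream_port(chip 0) == 10	(root chip: the CPU port itself)
 *	dsa_upstream_port(chip 1) == 9	(the DSA port one hop closer to the CPU)
 */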
diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c
index 85081ae9fe89..83277f463af7 100644
--- a/net/dsa/mv88e6060.c
+++ b/net/dsa/mv88e6060.c
@@ -1,6 +1,6 @@
/*
* net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -81,7 +81,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds)
/*
* Reset the switch.
*/
- REG_WRITE(REG_GLOBAL, 0x0A, 0xa130);
+ REG_WRITE(REG_GLOBAL, 0x0a, 0xa130);
/*
* Wait up to one second for reset to complete.
@@ -128,7 +128,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
* state to Forwarding. Additionally, if this is the CPU
* port, enable Ingress and Egress Trailer tagging mode.
*/
- REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x4103 : 0x0003);
+ REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003);
/*
* Port based VLAN map: give each port its own address
@@ -138,9 +138,9 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
*/
REG_WRITE(addr, 0x06,
((p & 0xf) << 12) |
- ((p == ds->cpu_port) ?
- ds->valid_port_mask :
- (1 << ds->cpu_port)));
+ (dsa_is_cpu_port(ds, p) ?
+ ds->phys_port_mask :
+ (1 << ds->dst->cpu_port)));
/*
* Port Association Vector: when learning source addresses
diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c
index 100318722214..52faaa21a4d9 100644
--- a/net/dsa/mv88e6123_61_65.c
+++ b/net/dsa/mv88e6123_61_65.c
@@ -1,6 +1,6 @@
/*
* net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -98,17 +98,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
return ret;
/*
- * Configure the cpu port, and configure the cpu port as the
- * port to which ingress and egress monitor frames are to be
- * sent.
+ * Configure the upstream port, and configure the upstream
+ * port as the port to which ingress and egress monitor frames
+ * are to be sent.
*/
- REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1110));
+ REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110));
/*
* Disable remote management for now, and set the switch's
- * DSA device number to zero.
+ * DSA device number.
*/
- REG_WRITE(REG_GLOBAL, 0x1c, 0x0000);
+ REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f);
/*
* Send all frames with destination addresses matching
@@ -133,10 +133,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
/*
- * Map all DSA device IDs to the CPU port.
+ * Program the DSA routing table.
*/
- for (i = 0; i < 32; i++)
- REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port);
+ for (i = 0; i < 32; i++) {
+ int nexthop;
+
+ nexthop = 0x1f;
+ if (i != ds->index && i < ds->dst->pd->nr_chips)
+ nexthop = ds->pd->rtable[i] & 0x1f;
+
+ REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
+ }
/*
* Clear all trunk masks.
@@ -176,12 +183,18 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
{
int addr = REG_PORT(p);
+ u16 val;
/*
* MAC Forcing register: don't force link, speed, duplex
- * or flow control state to any particular values.
+ * or flow control state to any particular values on physical
+ * ports, but force the CPU port and all DSA ports to 1000 Mb/s
+ * full duplex.
*/
- REG_WRITE(addr, 0x01, 0x0003);
+ if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
+ REG_WRITE(addr, 0x01, 0x003e);
+ else
+ REG_WRITE(addr, 0x01, 0x0003);
/*
* Do not limit the period of time that this port can be
@@ -192,37 +205,50 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
/*
* Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
- * configure the requested (DSA/EDSA) tagging mode if this is
- * the CPU port, disable Header mode, enable IGMP/MLD snooping,
- * disable VLAN tunneling, determine priority by looking at
- * 802.1p and IP priority fields (IP prio has precedence), and
- * set STP state to Forwarding. Finally, if this is the CPU
- * port, additionally enable forwarding of unknown unicast and
- * multicast addresses.
- */
- REG_WRITE(addr, 0x04,
- (p == ds->cpu_port) ?
- (ds->tag_protocol == htons(ETH_P_DSA)) ?
- 0x053f : 0x373f :
- 0x0433);
+ * disable Header mode, enable IGMP/MLD snooping, disable VLAN
+ * tunneling, determine priority by looking at 802.1p and IP
+ * priority fields (IP prio has precedence), and set STP state
+ * to Forwarding.
+ *
+ * If this is the CPU link, use DSA or EDSA tagging depending
+ * on which tagging mode was configured.
+ *
+ * If this is a link to another switch, use DSA tagging mode.
+ *
+ * If this is the upstream port for this switch, enable
+ * forwarding of unknown unicasts and multicasts.
+ */
+ val = 0x0433;
+ if (dsa_is_cpu_port(ds, p)) {
+ if (ds->dst->tag_protocol == htons(ETH_P_EDSA))
+ val |= 0x3300;
+ else
+ val |= 0x0100;
+ }
+ if (ds->dsa_port_mask & (1 << p))
+ val |= 0x0100;
+ if (p == dsa_upstream_port(ds))
+ val |= 0x000c;
+ REG_WRITE(addr, 0x04, val);
/*
* Port Control 1: disable trunking. Also, if this is the
* CPU port, enable learn messages to be sent to this port.
*/
- REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000);
+ REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
/*
* Port based VLAN map: give each port its own address
* database, allow the CPU port to talk to each of the 'real'
* ports, and allow each of the 'real' ports to only talk to
- * the CPU port.
- */
- REG_WRITE(addr, 0x06,
- ((p & 0xf) << 12) |
- ((p == ds->cpu_port) ?
- ds->valid_port_mask :
- (1 << ds->cpu_port)));
+ * the upstream port.
+ */
+ val = (p & 0xf) << 12;
+ if (dsa_is_cpu_port(ds, p))
+ val |= ds->phys_port_mask;
+ else
+ val |= 1 << dsa_upstream_port(ds);
+ REG_WRITE(addr, 0x06, val);
/*
* Default VLAN ID and priority: don't set a default VLAN
diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c
index 70fae2444cb6..bb2b41bc854e 100644
--- a/net/dsa/mv88e6131.c
+++ b/net/dsa/mv88e6131.c
@@ -1,6 +1,6 @@
/*
- * net/dsa/mv88e6131.c - Marvell 88e6131 switch chip support
- * Copyright (c) 2008 Marvell Semiconductor
+ * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -21,6 +21,8 @@ static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr)
ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
if (ret >= 0) {
ret &= 0xfff0;
+ if (ret == 0x0950)
+ return "Marvell 88E6095/88E6095F";
if (ret == 0x1060)
return "Marvell 88E6131";
}
@@ -36,7 +38,7 @@ static int mv88e6131_switch_reset(struct dsa_switch *ds)
/*
* Set all ports to the disabled state.
*/
- for (i = 0; i < 8; i++) {
+ for (i = 0; i < 11; i++) {
ret = REG_READ(REG_PORT(i), 0x04);
REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
}
@@ -100,17 +102,17 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
REG_WRITE(REG_GLOBAL, 0x19, 0x8100);
/*
- * Disable ARP mirroring, and configure the cpu port as the
- * port to which ingress and egress monitor frames are to be
- * sent.
+ * Disable ARP mirroring, and configure the upstream port as
+ * the port to which ingress and egress monitor frames are to
+ * be sent.
*/
- REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1100) | 0x00f0);
+ REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0);
/*
* Disable cascade port functionality, and set the switch's
- * DSA device number to zero.
+ * DSA device number.
*/
- REG_WRITE(REG_GLOBAL, 0x1c, 0xe000);
+ REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f));
/*
* Send all frames with destination addresses matching
@@ -127,16 +129,23 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
/*
- * Map all DSA device IDs to the CPU port.
+ * Program the DSA routing table.
*/
- for (i = 0; i < 32; i++)
- REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port);
+ for (i = 0; i < 32; i++) {
+ int nexthop;
+
+ nexthop = 0x1f;
+ if (i != ds->index && i < ds->dst->pd->nr_chips)
+ nexthop = ds->pd->rtable[i] & 0x1f;
+
+ REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
+ }
/*
* Clear all trunk masks.
*/
for (i = 0; i < 8; i++)
- REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff);
+ REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff);
/*
* Clear all trunk mappings.
@@ -156,12 +165,18 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
{
int addr = REG_PORT(p);
+ u16 val;
/*
* MAC Forcing register: don't force link, speed, duplex
- * or flow control state to any particular values.
+ * or flow control state to any particular values on physical
+ * ports, but force the CPU port and all DSA ports to 1000 Mb/s
+ * full duplex.
*/
- REG_WRITE(addr, 0x01, 0x0003);
+ if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
+ REG_WRITE(addr, 0x01, 0x003e);
+ else
+ REG_WRITE(addr, 0x01, 0x0003);
/*
* Port Control: disable Core Tag, disable Drop-on-Lock,
@@ -169,29 +184,40 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
* enable IGMP/MLD snoop, disable DoubleTag, disable VLAN
* tunneling, determine priority by looking at 802.1p and
* IP priority fields (IP prio has precedence), and set STP
- * state to Forwarding. Finally, if this is the CPU port,
- * additionally enable DSA tagging and forwarding of unknown
- * unicast addresses.
+ * state to Forwarding.
+ *
+ * If this is the upstream port for this switch, enable
+ * forwarding of unknown unicasts, and enable DSA tagging
+ * mode.
+ *
+ * If this is the link to another switch, use DSA tagging
+ * mode, but do not enable forwarding of unknown unicasts.
*/
- REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x0537 : 0x0433);
+ val = 0x0433;
+ if (p == dsa_upstream_port(ds))
+ val |= 0x0104;
+ if (ds->dsa_port_mask & (1 << p))
+ val |= 0x0100;
+ REG_WRITE(addr, 0x04, val);
/*
* Port Control 1: disable trunking. Also, if this is the
* CPU port, enable learn messages to be sent to this port.
*/
- REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000);
+ REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
/*
* Port based VLAN map: give each port its own address
* database, allow the CPU port to talk to each of the 'real'
* ports, and allow each of the 'real' ports to only talk to
- * the CPU port.
+ * the upstream port.
*/
- REG_WRITE(addr, 0x06,
- ((p & 0xf) << 12) |
- ((p == ds->cpu_port) ?
- ds->valid_port_mask :
- (1 << ds->cpu_port)));
+ val = (p & 0xf) << 12;
+ if (dsa_is_cpu_port(ds, p))
+ val |= ds->phys_port_mask;
+ else
+ val |= 1 << dsa_upstream_port(ds);
+ REG_WRITE(addr, 0x06, val);
/*
* Default VLAN ID and priority: don't set a default VLAN
@@ -207,13 +233,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
* untagged frames on this port, do a destination address
* lookup on received packets as usual, don't send a copy
* of all transmitted/received frames on this port to the
- * CPU, and configure the CPU port number. Also, if this
- * is the CPU port, enable forwarding of unknown multicast
- * addresses.
+ * CPU, and configure the upstream port number.
+ *
+ * If this is the upstream port for this switch, enable
+ * forwarding of unknown multicast addresses.
*/
- REG_WRITE(addr, 0x08,
- ((p == ds->cpu_port) ? 0x00c0 : 0x0080) |
- ds->cpu_port);
+ val = 0x0080 | dsa_upstream_port(ds);
+ if (p == dsa_upstream_port(ds))
+ val |= 0x0040;
+ REG_WRITE(addr, 0x08, val);
/*
* Rate Control: disable ingress rate limiting.
@@ -268,7 +296,7 @@ static int mv88e6131_setup(struct dsa_switch *ds)
if (ret < 0)
return ret;
- for (i = 0; i < 6; i++) {
+ for (i = 0; i < 11; i++) {
ret = mv88e6131_setup_port(ds, i);
if (ret < 0)
return ret;
@@ -279,7 +307,7 @@ static int mv88e6131_setup(struct dsa_switch *ds)
static int mv88e6131_port_to_phy_addr(int port)
{
- if (port >= 0 && port != 3 && port <= 7)
+ if (port >= 0 && port <= 11)
return port;
return -1;
}
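The register 0x06 writes in both setup_port() routines build the port-based VLAN map from the port's own address-database number in the top nibble plus the set of ports it is allowed to talk to. A worked example with invented numbers, ignoring any chip-specific reserved bits:

/*
 * Illustration only:
 *
 *	physical port p = 2, upstream port 10:
 *		val = ((2 & 0xf) << 12) | (1 << 10) = 0x2000 | 0x0400 = 0x2400
 *
 *	CPU port p = 10 with phys_port_mask = 0x00ff:
 *		val = ((10 & 0xf) << 12) | 0x00ff   = 0xa000 | 0x00ff = 0xa0ff
 */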
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a68fd79e9eca..ed131181215d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1,6 +1,6 @@
/*
* net/dsa/slave.c - Slave device handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
{
struct dsa_switch *ds = bus->priv;
- if (ds->valid_port_mask & (1 << addr))
+ if (ds->phys_port_mask & (1 << addr))
return ds->drv->phy_read(ds, addr, reg);
return 0xffff;
@@ -29,7 +29,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
{
struct dsa_switch *ds = bus->priv;
- if (ds->valid_port_mask & (1 << addr))
+ if (ds->phys_port_mask & (1 << addr))
return ds->drv->phy_write(ds, addr, reg, val);
return 0;
@@ -43,15 +43,24 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds)
ds->slave_mii_bus->write = dsa_slave_phy_write;
snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x",
ds->master_mii_bus->id, ds->pd->sw_addr);
- ds->slave_mii_bus->parent = &(ds->master_mii_bus->dev);
+ ds->slave_mii_bus->parent = &ds->master_mii_bus->dev;
}
/* slave device handling ****************************************************/
+static int dsa_slave_init(struct net_device *dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+
+ dev->iflink = p->parent->dst->master_netdev->ifindex;
+
+ return 0;
+}
+
static int dsa_slave_open(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
int err;
if (!(master->flags & IFF_UP))
@@ -89,7 +98,7 @@ out:
static int dsa_slave_close(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
dev_mc_unsync(master, dev);
dev_unicast_unsync(master, dev);
@@ -107,7 +116,7 @@ static int dsa_slave_close(struct net_device *dev)
static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
if (change & IFF_ALLMULTI)
dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -118,7 +127,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
static void dsa_slave_set_rx_mode(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
dev_mc_sync(master, dev);
dev_unicast_sync(master, dev);
@@ -127,7 +136,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
{
struct dsa_slave_priv *p = netdev_priv(dev);
- struct net_device *master = p->parent->master_netdev;
+ struct net_device *master = p->parent->dst->master_netdev;
struct sockaddr *addr = a;
int err;
@@ -288,6 +297,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
#ifdef CONFIG_NET_DSA_TAG_DSA
static const struct net_device_ops dsa_netdev_ops = {
+ .ndo_init = dsa_slave_init,
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
.ndo_start_xmit = dsa_xmit,
@@ -300,6 +310,7 @@ static const struct net_device_ops dsa_netdev_ops = {
#endif
#ifdef CONFIG_NET_DSA_TAG_EDSA
static const struct net_device_ops edsa_netdev_ops = {
+ .ndo_init = dsa_slave_init,
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
.ndo_start_xmit = edsa_xmit,
@@ -312,6 +323,7 @@ static const struct net_device_ops edsa_netdev_ops = {
#endif
#ifdef CONFIG_NET_DSA_TAG_TRAILER
static const struct net_device_ops trailer_netdev_ops = {
+ .ndo_init = dsa_slave_init,
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
.ndo_start_xmit = trailer_xmit,
@@ -328,7 +340,7 @@ struct net_device *
dsa_slave_create(struct dsa_switch *ds, struct device *parent,
int port, char *name)
{
- struct net_device *master = ds->master_netdev;
+ struct net_device *master = ds->dst->master_netdev;
struct net_device *slave_dev;
struct dsa_slave_priv *p;
int ret;
@@ -343,7 +355,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
slave_dev->tx_queue_len = 0;
- switch (ds->tag_protocol) {
+ switch (ds->dst->tag_protocol) {
#ifdef CONFIG_NET_DSA_TAG_DSA
case htons(ETH_P_DSA):
slave_dev->netdev_ops = &dsa_netdev_ops;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 63e532a69fdb..8fa25bafe6ca 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -1,6 +1,6 @@
/*
* net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -36,7 +36,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
* Construct tagged FROM_CPU DSA tag from 802.1q tag.
*/
dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x60;
+ dsa_header[0] = 0x60 | p->parent->index;
dsa_header[1] = p->port << 3;
/*
@@ -57,7 +57,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
* Construct untagged FROM_CPU DSA tag.
*/
dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x40;
+ dsa_header[0] = 0x40 | p->parent->index;
dsa_header[1] = p->port << 3;
dsa_header[2] = 0x00;
dsa_header[3] = 0x00;
@@ -65,7 +65,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
skb->protocol = htons(ETH_P_DSA);
- skb->dev = p->parent->master_netdev;
+ skb->dev = p->parent->dst->master_netdev;
dev_queue_xmit(skb);
return NETDEV_TX_OK;
@@ -78,11 +78,13 @@ out_free:
static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct dsa_switch *ds = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
u8 *dsa_header;
+ int source_device;
int source_port;
- if (unlikely(ds == NULL))
+ if (unlikely(dst == NULL))
goto out_drop;
skb = skb_unshare(skb, GFP_ATOMIC);
@@ -98,16 +100,24 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
dsa_header = skb->data - 2;
/*
- * Check that frame type is either TO_CPU or FORWARD, and
- * that the source device is zero.
+ * Check that frame type is either TO_CPU or FORWARD.
*/
- if ((dsa_header[0] & 0xdf) != 0x00 && (dsa_header[0] & 0xdf) != 0xc0)
+ if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
goto out_drop;
/*
- * Check that the source port is a registered DSA port.
+ * Determine source device and port.
*/
+ source_device = dsa_header[0] & 0x1f;
source_port = (dsa_header[1] >> 3) & 0x1f;
+
+ /*
+ * Check that the source device exists and that the source
+ * port is a registered DSA port.
+ */
+ if (source_device >= dst->pd->nr_chips)
+ goto out_drop;
+ ds = dst->ds[source_device];
if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
goto out_drop;
@@ -175,7 +185,7 @@ out:
return 0;
}
-static struct packet_type dsa_packet_type = {
+static struct packet_type dsa_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_DSA),
.func = dsa_rcv,
};
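The transmit and receive paths above now carry a switch index as well as a port: the low five bits of the first tag byte hold the source/target device, bits 7:3 of the second byte hold the port, and the top two bits of the first byte give the frame type (0x0 TO_CPU, 0x3 FORWARD). A small parsing sketch of just those fields, assuming a pointer to the start of the 4-byte DSA tag; the struct is a helper type for illustration only:

#include <linux/types.h>

struct dsa_tag_fields {
	u8 frame_type;		/* 0 = TO_CPU, 3 = FORWARD */
	u8 source_device;	/* switch index, 0..31 */
	u8 source_port;		/* port on that switch, 0..31 */
};

static void dsa_parse_tag(const u8 *dsa_header, struct dsa_tag_fields *f)
{
	f->frame_type	 = dsa_header[0] >> 6;
	f->source_device = dsa_header[0] & 0x1f;
	f->source_port	 = (dsa_header[1] >> 3) & 0x1f;
}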
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6197f9a7ef42..815607bd286f 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -1,6 +1,6 @@
/*
* net/dsa/tag_edsa.c - Ethertype DSA tagging
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -45,7 +45,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[1] = ETH_P_EDSA & 0xff;
edsa_header[2] = 0x00;
edsa_header[3] = 0x00;
- edsa_header[4] = 0x60;
+ edsa_header[4] = 0x60 | p->parent->index;
edsa_header[5] = p->port << 3;
/*
@@ -70,7 +70,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
edsa_header[1] = ETH_P_EDSA & 0xff;
edsa_header[2] = 0x00;
edsa_header[3] = 0x00;
- edsa_header[4] = 0x40;
+ edsa_header[4] = 0x40 | p->parent->index;
edsa_header[5] = p->port << 3;
edsa_header[6] = 0x00;
edsa_header[7] = 0x00;
@@ -78,7 +78,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
skb->protocol = htons(ETH_P_EDSA);
- skb->dev = p->parent->master_netdev;
+ skb->dev = p->parent->dst->master_netdev;
dev_queue_xmit(skb);
return NETDEV_TX_OK;
@@ -91,11 +91,13 @@ out_free:
static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct dsa_switch *ds = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
u8 *edsa_header;
+ int source_device;
int source_port;
- if (unlikely(ds == NULL))
+ if (unlikely(dst == NULL))
goto out_drop;
skb = skb_unshare(skb, GFP_ATOMIC);
@@ -111,16 +113,24 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
edsa_header = skb->data + 2;
/*
- * Check that frame type is either TO_CPU or FORWARD, and
- * that the source device is zero.
+ * Check that frame type is either TO_CPU or FORWARD.
*/
- if ((edsa_header[0] & 0xdf) != 0x00 && (edsa_header[0] & 0xdf) != 0xc0)
+ if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0)
goto out_drop;
/*
- * Check that the source port is a registered DSA port.
+ * Determine source device and port.
*/
+ source_device = edsa_header[0] & 0x1f;
source_port = (edsa_header[1] >> 3) & 0x1f;
+
+ /*
+ * Check that the source device exists and that the source
+ * port is a registered DSA port.
+ */
+ if (source_device >= dst->pd->nr_chips)
+ goto out_drop;
+ ds = dst->ds[source_device];
if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
goto out_drop;
@@ -194,7 +204,7 @@ out:
return 0;
}
-static struct packet_type edsa_packet_type = {
+static struct packet_type edsa_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_EDSA),
.func = edsa_rcv,
};
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index d7e7f424ff0c..1c3e30c38b86 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -1,6 +1,6 @@
/*
* net/dsa/tag_trailer.c - Trailer tag format handling
- * Copyright (c) 2008 Marvell Semiconductor
+ * Copyright (c) 2008-2009 Marvell Semiconductor
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -59,7 +59,7 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
nskb->protocol = htons(ETH_P_TRAILER);
- nskb->dev = p->parent->master_netdev;
+ nskb->dev = p->parent->dst->master_netdev;
dev_queue_xmit(nskb);
return NETDEV_TX_OK;
@@ -68,12 +68,14 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct dsa_switch *ds = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = dev->dsa_ptr;
+ struct dsa_switch *ds;
u8 *trailer;
int source_port;
- if (unlikely(ds == NULL))
+ if (unlikely(dst == NULL))
goto out_drop;
+ ds = dst->ds[0];
skb = skb_unshare(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -111,7 +113,7 @@ out:
return 0;
}
-static struct packet_type trailer_packet_type = {
+static struct packet_type trailer_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_TRAILER),
.func = trailer_rcv,
};
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 7bf35582f656..6f479fa522c3 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -1102,7 +1102,7 @@ drop:
return NET_RX_DROP;
}
-static struct packet_type econet_packet_type = {
+static struct packet_type econet_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ECONET),
.func = econet_rcv,
};
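Every packet_type definition touched by this series gains __read_mostly. The annotation (from <linux/cache.h>) only affects placement: the object is grouped with other rarely-written data, so hot, read-only structures are less likely to share a cache line with frequently-modified ones. Approximately:

/*
 * from <linux/cache.h>, architecture permitting:
 *	#define __read_mostly __attribute__((__section__(".data.read_mostly")))
 * on architectures without such a section the macro expands to nothing.
 */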
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 691268f3a359..b2cf91e4ccaa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER
at boot time after the /proc file system has been mounted.
- If you turn on IP forwarding, you will also get the rp_filter, which
+ If you turn on IP forwarding, you should consider the rp_filter, which
automatically rejects incoming packets if the routing table entry
for their source address doesn't match the network interface they're
arriving on. This has security advantages because it prevents the
@@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- or
+ and
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
If unsure, say N here.
-choice
+choice
prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
depends on IP_ADVANCED_ROUTER
default ASK_IP_FIB_HASH
@@ -59,27 +63,29 @@ choice
config ASK_IP_FIB_HASH
bool "FIB_HASH"
---help---
- Current FIB is very proven and good enough for most users.
+ Current FIB is very proven and good enough for most users.
config IP_FIB_TRIE
bool "FIB_TRIE"
---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
+ Use new experimental LC-trie as FIB lookup algorithm.
+ This improves lookup performance if you have a large
+ number of routes.
+
+ LC-trie is a longest matching prefix lookup algorithm which
+ performs better than FIB_HASH for large routing tables.
+ But, it consumes more memory and is more complex.
+
+ LC-trie is described in:
+
+ IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
+ June 1999
+
+ An experimental study of compression methods for dynamic tries
+ Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+
endchoice
config IP_FIB_HASH
@@ -191,7 +197,7 @@ config IP_PNP_RARP
<file:Documentation/filesystems/nfsroot.txt> for details.
# not yet ready..
-# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
+# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
@@ -361,7 +367,7 @@ config INET_IPCOMP
---help---
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
-
+
If unsure, say Y.
config INET_XFRM_TUNNEL
@@ -415,7 +421,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
-
+
If unsure, say Y.
config INET_TCP_DIAG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 627be4dc7fb0..d5aaabbb7cb3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1500,7 +1500,7 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
+static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 3f6b7354699b..f11931c18381 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb)
* cache.
*/
- /* Special case: IPv4 duplicate address detection packet (RFC2131) */
- if (sip == 0) {
+ /*
+ * Special case: IPv4 duplicate address detection packet (RFC2131)
+ * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
+ */
+ if (sip == 0 || tip == sip) {
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type(net, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
@@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb)
out:
if (in_dev)
in_dev_put(in_dev);
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -1225,7 +1228,7 @@ void arp_ifdown(struct net_device *dev)
* Called once on startup.
*/
-static struct packet_type arp_packet_type = {
+static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 6bb2635b5ded..7bc992976d29 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -3,11 +3,16 @@
*
* This is an implementation of the CIPSO 2.2 protocol as specified in
* draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
- * FIPS-188, copies of both documents can be found in the Documentation
- * directory. While CIPSO never became a full IETF RFC standard many vendors
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
* have chosen to adopt the protocol and over the years it has become a
* de-facto standard for labeled networking.
*
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
* Author: Paul Moore <paul.moore@hp.com>
*
*/
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d519a6a66726..126bb911880f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 741e4fa3e474..cafcc49d0993 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
fib_res_put(&res);
if (no_addr)
goto last_resort;
- if (rpf)
+ if (rpf == 1)
goto e_inval;
fl.oif = dev->ifindex;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4817dea3bc73..f831df500907 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
- info->nlh, GFP_KERNEL);
+ rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+ info->nlh, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 382800a62b31..3f50807237e0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1207,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
int __init icmp_init(void)
{
- return register_pernet_device(&icmp_sk_ops);
+ return register_pernet_subsys(&icmp_sk_ops);
}
EXPORT_SYMBOL(icmp_err_convert);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6c52e08f786e..eaf3e2c8646a 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
+ __releases(&f->lock)
{
struct inet_frag_queue *q;
struct hlist_node *n;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6659ac000eeb..7985346653bd 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -463,6 +463,7 @@ err:
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
struct net_device *dev)
{
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
struct sk_buff *fp, *head = qp->q.fragments;
int len;
@@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
iph = ip_hdr(head);
iph->frag_off = 0;
iph->tot_len = htons(len);
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS);
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 07a188afb3ac..e62510d5ea5a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -491,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -803,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
#endif
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 5079dfbc6f38..9054139795af 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
@@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
@@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = {
.priority = 1,
};
-static char banner[] __initdata =
+static const char banner[] __initconst =
KERN_INFO "IPv4 over IPv4 tunneling driver\n";
static void ipip_destroy_tunnels(struct ipip_net *ipn)
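Both tunnel drivers replace open-coded jiffies subtraction in their error-burst throttling with time_before(), which stays correct across a jiffies wrap because the comparison happens on the signed difference. Approximately, from <linux/jiffies.h> (the real macros also type-check their arguments):

#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

/* so the new test asks: is jiffies still before err_time + IPTUNNEL_ERR_TIMEO? */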
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 90b2f3c192ff..2451aeb5ac23 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -661,6 +661,47 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
return NULL;
}
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+ int large_allowed)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 xmit_size_goal, old_size_goal;
+
+ xmit_size_goal = mss_now;
+
+ if (large_allowed && sk_can_gso(sk)) {
+ xmit_size_goal = ((sk->sk_gso_max_size - 1) -
+ inet_csk(sk)->icsk_af_ops->net_header_len -
+ inet_csk(sk)->icsk_ext_hdr_len -
+ tp->tcp_header_len);
+
+ xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
+
+ /* We try hard to avoid divides here */
+ old_size_goal = tp->xmit_size_goal_segs * mss_now;
+
+ if (likely(old_size_goal <= xmit_size_goal &&
+ old_size_goal + mss_now > xmit_size_goal)) {
+ xmit_size_goal = old_size_goal;
+ } else {
+ tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
+ xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
+ }
+ }
+
+ return max(xmit_size_goal, mss_now);
+}
+
+static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+ int mss_now;
+
+ mss_now = tcp_current_mss(sk);
+ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+
+ return mss_now;
+}
+
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
size_t psize, int flags)
{
@@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ goto out_err;
while (psize > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
@@ -761,8 +801,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
@@ -844,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
@@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
+ goto out_err;
while (--iovlen >= 0) {
int seglen = iov->iov_len;
@@ -1007,8 +1045,7 @@ wait_for_memory:
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
- mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
- size_goal = tp->xmit_size_goal;
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
}
}
@@ -1045,8 +1082,7 @@ out_err:
*/
static int tcp_recv_urg(struct sock *sk, long timeo,
- struct msghdr *msg, int len, int flags,
- int *addr_len)
+ struct msghdr *msg, int len, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1661,7 +1697,7 @@ out:
return err;
recv_urg:
- err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+ err = tcp_recv_urg(sk, timeo, msg, len, flags);
goto out;
}
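The new tcp_xmit_size_goal() caches its result as a whole number of segments in tp->xmit_size_goal_segs, so the three send paths that used to recompute tp->xmit_size_goal now call tcp_send_mss() and, while mss_now is stable, reuse the cached goal without dividing. A rough worked example, with the header subtractions ignored for simplicity:

/*
 * Illustration only: mss_now = 1448, GSO budget sk_gso_max_size - 1 = 65535.
 *
 *	xmit_size_goal_segs = 65535 / 1448 = 45
 *	xmit_size_goal      = 45 * 1448    = 65160 bytes
 *
 * On the next call with the same mss_now, old_size_goal = 65160 already
 * satisfies old <= goal < old + mss_now, so the divide is skipped.
 */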
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 7eb7636db0d0..3b53fd1af23f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tcp_slow_start(tp);
else {
bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, ca->cnt);
}
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4ec5b4e97c4e..e92beb9e55e0 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp)
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+{
+ if (tp->snd_cwnd_cnt >= w) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else {
+ tp->snd_cwnd_cnt++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
/*
* TCP Reno congestion control
* This is special case used for fallback as well.
@@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tp->snd_cwnd++;
}
} else {
- /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
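tcp_cong_avoid_ai() factors out the additive-increase step that Reno, BIC and CUBIC each open-coded: counting ACKs in snd_cwnd_cnt and adding one whole segment to snd_cwnd once w of them have arrived, i.e. roughly snd_cwnd += 1/w per ACK, capped by snd_cwnd_clamp. The callers differ only in w:

/*
 * Reno (above):	tcp_cong_avoid_ai(tp, tp->snd_cwnd);	-> +1/cwnd per ACK
 * BIC/CUBIC (below):	tcp_cong_avoid_ai(tp, ca->cnt);		-> +1/cnt per ACK
 */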
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ee467ec40c4f..71d5f2f29fa6 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
tcp_slow_start(tp);
} else {
bictcp_update(ca, tp->snd_cwnd);
-
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= ca->cnt) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, ca->cnt);
}
}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 937549b8a921..26d5c7fc7de5 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
return;
/* achieved throughput calculations */
- if (icsk->icsk_ca_state != TCP_CA_Open &&
- icsk->icsk_ca_state != TCP_CA_Disorder) {
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
ca->packetcount = 0;
ca->lasttime = now;
return;
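The H-TCP change folds two state comparisons into one mask test: the TCPF_CA_* values are the one-bit-per-state companions of the TCP_CA_* enum (defined alongside it), so shifting 1 by the current state and masking checks membership in a set of states at once. Roughly:

/*
 * With TCPF_CA_Open == (1 << TCP_CA_Open) and likewise for Disorder,
 *
 *	!((1 << state) & (TCPF_CA_Open | TCPF_CA_Disorder))
 *
 * is true exactly when state is neither Open nor Disorder.
 */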
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6961d75c7ea..2bc8e27a163d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -64,6 +64,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/kernel.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
@@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk)
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (after(received_upto, ack_seq) &&
- (tcp_is_fack(tp) ||
- !before(received_upto,
- ack_seq + tp->reordering * tp->mss_cache))) {
+ /* TODO: We would like to get rid of tcp_is_fack(tp) only
+ * constraint here (see above) but figuring out that at
+ * least tp->reordering SACK blocks reside between ack_seq
+ * and received_upto is not easy task to do cheaply with
+ * the available datastructures.
+ *
+ * Whether FACK should check here for tp->reordering segs
+ * in-between one could argue for either way (it would be
+ * rather simple to implement as we could count fack_count
+ * during the walk and do tp->fackets_out - fack_count).
+ */
+ if (after(received_upto, ack_seq)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1374,7 +1383,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
- unsigned int pcount, int shifted, int mss)
+ unsigned int pcount, int shifted, int mss,
+ int dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1410,7 +1420,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
/* We discard results */
- tcp_sacktag_one(skb, sk, state, 0, pcount);
+ tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1561,7 +1571,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!skb_shift(prev, skb, len))
goto fallback;
- if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss))
+ if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
goto out;
/* Hole filled allows collapsing with the next as well, this is very
@@ -1580,7 +1590,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
len = skb->len;
if (skb_shift(prev, skb, len)) {
pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss);
+ tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
}
out:
@@ -1793,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
for (i = used_sacks - 1; i > 0; i--) {
for (j = 0; j < i; j++) {
if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
- struct tcp_sack_block tmp;
-
- tmp = sp[j];
- sp[j] = sp[j + 1];
- sp[j + 1] = tmp;
+ swap(sp[j], sp[j + 1]);
/* Track where the first SACK block goes to */
if (j == first_sack_index)
@@ -2452,6 +2458,44 @@ static int tcp_time_to_recover(struct sock *sk)
return 0;
}
+/* New heuristic: this is possible only now that we restart the timer each
+ * time something is ACKed. Hence, we can detect timed-out packets during
+ * fast retransmit without falling back to slow start.
+ *
+ * The usefulness of this as-is is questionable, since we would need to know
+ * which segment is the next to time out, and that is relatively expensive to
+ * find in the general case unless we add a data structure just for that. The
+ * current approach certainly won't find the right one very often, and when it
+ * finally does find _something_ it usually marks a large part of the window
+ * right away (because a retransmission with a larger timestamp blocks the
+ * loop from advancing). -ij
+ */
+static void tcp_timeout_skbs(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
+ return;
+
+ skb = tp->scoreboard_skb_hint;
+ if (tp->scoreboard_skb_hint == NULL)
+ skb = tcp_write_queue_head(sk);
+
+ tcp_for_write_queue_from(skb, sk) {
+ if (skb == tcp_send_head(sk))
+ break;
+ if (!tcp_skb_timedout(sk, skb))
+ break;
+
+ tcp_skb_mark_lost(tp, skb);
+ }
+
+ tp->scoreboard_skb_hint = skb;
+
+ tcp_verify_left_out(tp);
+}
+
/* Mark head of queue up as lost. With RFC3517 SACK, the packets are
 * marked against the sacked "cnt"; otherwise against the facked "cnt".
 */
@@ -2524,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
tcp_mark_head_lost(sk, sacked_upto);
}
- /* New heuristics: it is possible only after we switched
- * to restart timer each time when something is ACKed.
- * Hence, we can detect timed out packets during fast
- * retransmit without falling to slow start.
- */
- if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
- struct sk_buff *skb;
-
- skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
- : tcp_write_queue_head(sk);
-
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (!tcp_skb_timedout(sk, skb))
- break;
-
- tcp_skb_mark_lost(tp, skb);
- }
-
- tp->scoreboard_skb_hint = skb;
-
- tcp_verify_left_out(tp);
- }
+ tcp_timeout_skbs(sk);
}
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2812,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
-static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2840,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
u32 prior_lost = tp->lost_out;
tcp_for_write_queue(skb, sk) {
@@ -3177,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u32 end_seq;
u32 acked_pcount;
u8 sacked = scb->sacked;
@@ -3192,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
break;
fully_acked = 0;
- end_seq = tp->snd_una;
} else {
acked_pcount = tcp_skb_pcount(skb);
- end_seq = scb->end_seq;
- }
-
- /* MTU probing checks */
- if (fully_acked && icsk->icsk_mtup.probe_size &&
- !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
- tcp_mtup_probe_success(sk, skb);
}
if (sacked & TCPCB_RETRANS) {
@@ -3266,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
+ if (unlikely(icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+ tcp_mtup_probe_success(sk);
+ }
+
tcp_ack_update_rtt(sk, flag, seq_rtt);
tcp_rearm_rto(sk);
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
} else {
+ int delta;
+
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
- /* No need to care for underflows here because
- * the lost_skb_hint gets NULLed if we're past it
- * (or something non-trivial happened)
- */
- if (tcp_is_fack(tp))
- tp->lost_cnt_hint -= pkts_acked;
- else
- tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
+ delta = tcp_is_fack(tp) ? pkts_acked :
+ prior_sacked - tp->sacked_out;
+ tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
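
The hunk above replaces two unguarded subtractions from lost_cnt_hint with a single delta clamped by min(), so the hint can no longer underflow. A small sketch of that saturating decrement:

#include <stdio.h>

/* Sketch of the clamped decrement: subtracting min(val, delta) keeps the
 * hint from wrapping below zero, whichever branch computed delta. */
static unsigned int toy_dec_clamped(unsigned int val, unsigned int delta)
{
	return val - (delta < val ? delta : val);
}

int main(void)
{
	printf("%u\n", toy_dec_clamped(7, 3));   /* 4 */
	printf("%u\n", toy_dec_clamped(2, 5));   /* 0, not a wrapped huge value */
	return 0;
}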
@@ -3395,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
@@ -3571,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
int prior_packets;
int frto_cwnd = 0;
- /* If the ack is newer than sent or older than previous acks
+ /* If the ack is older than previous acks
* then we can probably ignore it.
*/
- if (after(ack, tp->snd_nxt))
- goto uninteresting_ack;
-
if (before(ack, prior_snd_una))
goto old_ack;
+ /* If the ack includes data we haven't sent yet, discard
+ * this segment (RFC793 Section 3.9).
+ */
+ if (after(ack, tp->snd_nxt))
+ goto invalid_ack;
+
if (after(ack, prior_snd_una))
flag |= FLAG_SND_UNA_ADVANCED;
@@ -3600,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
- tcp_update_wl(tp, ack, ack_seq);
+ tcp_update_wl(tp, ack_seq);
tp->snd_una = ack;
flag |= FLAG_WIN_UPDATE;
@@ -3669,6 +3686,10 @@ no_queue:
tcp_ack_probe(sk);
return 1;
+invalid_ack:
+ SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ return -1;
+
old_ack:
if (TCP_SKB_CB(skb)->sacked) {
tcp_sacktag_write_queue(sk, skb, prior_snd_una);
@@ -3676,8 +3697,7 @@ old_ack:
tcp_try_keep_open(sk);
}
-uninteresting_ack:
- SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+ SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
@@ -3865,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* Not only, also it occurs for expired timestamps.
*/
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
- get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+ if (tcp_paws_check(&tp->rx_opt, 0))
tcp_store_ts_recent(tp);
}
}
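
These timestamp hunks swap the open-coded comparisons for tcp_paws_check() (and its tcp_paws_reject() counterpart used further down and in tcp_minisocks.c). Below is a rough user-space sketch of the core wrap-safe comparison only; the real helper also allows a ts_recent older than 24 days, which is omitted here, and the window value is an assumption.

#include <stdio.h>
#include <stdint.h>

#define TOY_PAWS_WINDOW 1   /* assumed to mirror TCP_PAWS_WINDOW */

/* Core of the PAWS test: the incoming timestamp may lag the stored one by at
 * most 'win' ticks, compared through a wrap-safe signed difference. */
static int toy_paws_ok(uint32_t ts_recent, uint32_t rcv_tsval, int win)
{
	return (int32_t)(ts_recent - rcv_tsval) <= win;
}

int main(void)
{
	printf("%d\n", toy_paws_ok(1000, 1005, 0));                /* newer: 1 */
	printf("%d\n", toy_paws_ok(1000,  999, TOY_PAWS_WINDOW));  /* 1 behind: 1 */
	printf("%d\n", toy_paws_ok(1000,  900, TOY_PAWS_WINDOW));  /* too old: 0 */
	return 0;
}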
@@ -3918,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
- get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(sk, skb));
+
+ return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
+ !tcp_disordered_ack(sk, skb);
}
/* Check segment sequence number for validity.
@@ -4078,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1;
}
}
@@ -4133,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
- tp->rx_opt.dsack;
for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
sp[i] = sp[i + 1];
continue;
@@ -4143,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
}
}
-static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
- struct tcp_sack_block *sack2)
-{
- __u32 tmp;
-
- tmp = sack1->start_seq;
- sack1->start_seq = sack2->start_seq;
- sack2->start_seq = tmp;
-
- tmp = sack1->end_seq;
- sack1->end_seq = sack2->end_seq;
- sack2->end_seq = tmp;
-}
-
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4171,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
if (tcp_sack_extend(sp, seq, end_seq)) {
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
- tcp_sack_swap(sp, sp - 1);
+ swap(*sp, *(sp - 1));
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
@@ -4197,7 +4199,6 @@ new_sack:
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
}
/* RCV.NXT advances, some SACKs should be eaten. */
@@ -4211,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
if (skb_queue_empty(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
}
@@ -4232,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
this_sack++;
sp++;
}
- if (num_sacks != tp->rx_opt.num_sacks) {
- tp->rx_opt.num_sacks = num_sacks;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
- tp->rx_opt.dsack;
- }
+ tp->rx_opt.num_sacks = num_sacks;
}
/* This one checks to see if we can put data from the
@@ -4312,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
TCP_ECN_accept_cwr(tp, skb);
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
@@ -4434,8 +4427,6 @@ drop:
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = 1;
tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
tp->selective_acks[0].end_seq =
TCP_SKB_CB(skb)->end_seq;
@@ -5156,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+ !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
@@ -5309,8 +5301,8 @@ slow_path:
return -res;
step5:
- if (th->ack)
- tcp_ack(sk, skb, FLAG_SLOWPATH);
+ if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+ goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
@@ -5408,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5509,7 +5501,7 @@ discard:
/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
- tcp_paws_check(&tp->rx_opt, 0))
+ tcp_paws_reject(&tp->rx_opt, 0))
goto discard_and_undo;
if (th->syn) {
@@ -5647,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* step 5: check the ACK field */
if (th->ack) {
- int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
+ int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
switch (sk->sk_state) {
case TCP_SYN_RECV:
@@ -5669,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) <<
tp->rx_opt.snd_wscale;
- tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
- TCP_SKB_CB(skb)->seq);
+ tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
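
The tcp_ack() changes in this file reorder the validity checks: an ACK below SND.UNA is still treated as old, while an ACK beyond SND.NXT is now discarded as invalid per RFC 793 section 3.9 instead of being logged as merely uninteresting. A minimal sketch of the wrap-safe classification, with toy_* helpers standing in for the kernel's before()/after():

#include <stdio.h>
#include <stdint.h>

/* Wrap-safe sequence comparisons behind the reordered checks: an ACK below
 * SND.UNA is old, an ACK above SND.NXT acknowledges data that was never sent
 * and is discarded (RFC 793, section 3.9). */
static int toy_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int toy_after(uint32_t a, uint32_t b)  { return toy_before(b, a); }

static const char *toy_classify_ack(uint32_t ack, uint32_t snd_una, uint32_t snd_nxt)
{
	if (toy_before(ack, snd_una))
		return "old ack";
	if (toy_after(ack, snd_nxt))
		return "invalid ack (beyond SND.NXT)";
	return "acceptable ack";
}

int main(void)
{
	uint32_t una = 1000, nxt = 2000;

	printf("%s\n", toy_classify_ack( 900, una, nxt));
	printf("%s\n", toy_classify_ack(1500, una, nxt));
	printf("%s\n", toy_classify_ack(2500, una, nxt));
	return 0;
}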
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6b962f56ab4..d0a314879d81 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
- if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
- /* Some OSes (unknown ones, but I see them on web server, which
- * contains information interesting only for windows'
- * users) do not send their stamp in SYN. It is easy case.
- * We simply do not advertise TS support.
- */
- tmp_opt.saw_tstamp = 0;
- tmp_opt.tstamp_ok = 0;
- }
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
@@ -2443,7 +2434,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
inet_hashinfo_init(&tcp_hashinfo);
- if (register_pernet_device(&tcp_sk_ops))
+ if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f67effbb102b..43bbba7926ee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_prequeue_init(newtp);
- tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
+ tcp_init_wl(newtp, treq->rcv_isn);
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
@@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.saw_tstamp = 0;
newtp->rx_opt.dsack = 0;
- newtp->rx_opt.eff_sacks = 0;
-
newtp->rx_opt.num_sacks = 0;
+
newtp->urg_data = 0;
if (sock_flag(newsk, SOCK_KEEPOPEN))
@@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* from another data.
*/
tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
- paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+ paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dda42f0bd7a3..c1f259d2d33b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(sp[this_sack].end_seq);
}
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
}
}
@@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
unsigned size = 0;
+ unsigned int eff_sacks;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
size += TCPOLEN_TSTAMP_ALIGNED;
}
- if (unlikely(tp->rx_opt.eff_sacks)) {
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, tp->rx_opt.eff_sacks,
+ min_t(unsigned, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe */
- if (unlikely(tcp_urg_mode(tp) &&
- between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
- th->urg_ptr = htons(tp->snd_up - tcb->seq);
- th->urg = 1;
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+ th->urg_ptr = htons(tp->snd_up - tcb->seq);
+ th->urg = 1;
+ } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+ th->urg_ptr = 0xFFFF;
+ th->urg = 1;
+ }
}
tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
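
The new urgent-pointer logic above only advertises URG while the segment's sequence is still before SND.UP, and clamps the 16-bit pointer to 0xFFFF when SND.UP lies more than 64K ahead. A sketch of that arithmetic with toy helpers (not the kernel API), mirroring the added branch:

#include <stdio.h>
#include <stdint.h>

/* The urgent pointer is a 16-bit offset from the segment's sequence number,
 * so when SND.UP is more than 64K ahead the header can only advertise 0xFFFF
 * (and only while that still points past data we have queued). */
static int toy_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int toy_after(uint32_t a, uint32_t b)  { return toy_before(b, a); }

static int toy_urg_pointer(uint32_t seq, uint32_t snd_up, uint32_t snd_nxt,
			   uint16_t *urg_ptr)
{
	if (!toy_before(seq, snd_up))
		return 0;                        /* urgent data already covered */
	if (toy_before(snd_up, seq + 0x10000)) {
		*urg_ptr = (uint16_t)(snd_up - seq);
		return 1;
	}
	if (toy_after(seq + 0xFFFF, snd_nxt)) {
		*urg_ptr = 0xFFFF;               /* SND.UP too far ahead to encode */
		return 1;
	}
	return 0;
}

int main(void)
{
	uint16_t up;

	if (toy_urg_pointer(1000, 1500, 200000, &up))
		printf("urg_ptr=%u\n", up);      /* 500 */
	if (toy_urg_pointer(1000, 200000, 50000, &up))
		printf("urg_ptr=%u\n", up);      /* 65535 */
	return 0;
}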
@@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
struct sk_buff *buff;
int nsize, old_factor;
int nlen;
- u16 flags;
+ u8 flags;
BUG_ON(len > skb->len);
- tcp_clear_retrans_hints_partial(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
tcp_verify_left_out(tp);
}
tcp_adjust_fackets_out(sk, skb, diff);
+
+ if (tp->lost_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+ (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
+ tp->lost_cnt_hint -= diff;
}
/* Link BUFF into the send queue. */
@@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
return 0;
}
@@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.probe_size = 0;
}
-/* Bound MSS / TSO packet size with the half of the window */
-static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
-{
- if (tp->max_window && pktsize > (tp->max_window >> 1))
- return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
- else
- return pktsize;
-}
-
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
-unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
+unsigned int tcp_current_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- u16 xmit_size_goal;
- int doing_tso = 0;
unsigned header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
mss_now = tp->mss_cache;
- if (large_allowed && sk_can_gso(sk))
- doing_tso = 1;
-
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
@@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
mss_now -= delta;
}
- xmit_size_goal = mss_now;
-
- if (doing_tso) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
-
- xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
- xmit_size_goal -= (xmit_size_goal % mss_now);
- }
- tp->xmit_size_goal = xmit_size_goal;
-
return mss_now;
}
@@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk)
struct sk_buff *skb = tcp_send_head(sk);
return (skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
tp->nonagle : TCP_NAGLE_PUSH)));
}
@@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
{
struct sk_buff *buff;
int nlen = skb->len - len;
- u16 flags;
+ u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
@@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
if (limit >= sk->sk_gso_max_size)
goto send_now;
+ /* A non-tail skb won't get any more data; is it already fully sendable? */
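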
+ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+ goto send_now;
+
if (sysctl_tcp_tso_win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk)
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
tp->snd_cwnd < 11 ||
- tp->rx_opt.eff_sacks)
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
/* Very simple search strategy: just double the MSS. */
- mss_now = tcp_current_mss(sk, 0);
+ mss_now = tcp_current_mss(sk);
probe_size = 2 * tp->mss_cache;
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
@@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
int skb_size, next_skb_size;
- u16 flags;
skb_size = skb->len;
next_skb_size = next_skb->len;
- flags = TCP_SKB_CB(skb)->flags;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
/* Update sequence range on original skb. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
- /* Merge over control information. */
- flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags = flags;
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
@@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
- cur_mss = tcp_current_mss(sk, 0);
+ cur_mss = tcp_current_mss(sk);
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
@@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (skb->len > cur_mss) {
if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
+ } else {
+ tcp_init_tso_segs(sk, skb, cur_mss);
}
tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -2023,7 +2007,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
- /* First pass: retransmit lost packets. */
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
@@ -2062,7 +2045,7 @@ begin_fwd:
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
- if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
+ if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
@@ -2101,7 +2084,7 @@ void tcp_send_fin(struct sock *sk)
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
- mss_now = tcp_current_mss(sk, 1);
+ mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -2326,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk)
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
- tcp_init_wl(tp, tp->write_seq, 0);
+ tcp_init_wl(tp, 0);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
@@ -2513,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk)
if ((skb = tcp_send_head(sk)) != NULL &&
before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
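
With rx_opt.eff_sacks gone, tcp_established_options() now derives the SACK-block count as num_sacks + dsack and bounds it by whatever option space remains. A sketch of that bound; the TCP option-length constants are restated here from the headers of this era and should be treated as assumptions.

#include <stdio.h>

#define TOY_MAX_TCP_OPTION_SPACE       40
#define TOY_TCPOLEN_TSTAMP_ALIGNED     12
#define TOY_TCPOLEN_SACK_BASE_ALIGNED   4
#define TOY_TCPOLEN_SACK_PERBLOCK       8

/* eff_sacks is computed on the fly as num_sacks + dsack; the number of SACK
 * blocks actually emitted is bounded by the option space left over. */
static unsigned int toy_sack_blocks(unsigned int num_sacks, unsigned int dsack,
				    unsigned int size_used)
{
	unsigned int eff_sacks = num_sacks + dsack;
	unsigned int remaining = TOY_MAX_TCP_OPTION_SPACE - size_used;
	unsigned int fit = (remaining - TOY_TCPOLEN_SACK_BASE_ALIGNED) /
			   TOY_TCPOLEN_SACK_PERBLOCK;

	return eff_sacks < fit ? eff_sacks : fit;
}

int main(void)
{
	/* timestamps already used 12 of the 40 option bytes: only 3 blocks fit */
	printf("%u\n", toy_sack_blocks(4, 1, TOY_TCPOLEN_TSTAMP_ALIGNED));
	return 0;
}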
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 25524d4e372a..59f5b5e7c566 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n)
static ssize_t tcpprobe_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- int error = 0, cnt = 0;
+ int error = 0;
+ size_t cnt = 0;
- if (!buf || len < 0)
+ if (!buf)
return -EINVAL;
while (cnt < len) {
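
The tcpprobe_read() cleanup above drops the "len < 0" check because len is a size_t: the comparison is performed unsigned and can never be true. A two-line demonstration:

#include <stdio.h>
#include <stddef.h>

/* A would-be negative size_t just shows up as a huge positive value, so the
 * "len < 0" guard was dead code. */
int main(void)
{
	size_t len = (size_t)-1;

	printf("len < 0 evaluates to %d\n", (int)(len < 0));   /* always 0 */
	printf("len is actually %zu\n", len);                  /* huge value */
	return 0;
}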
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 2747ec7bfb63..a76513779e2b 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,6 +1,6 @@
/* Tom Kelly's Scalable TCP
*
- * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ * See http://www.deneholme.net/tom/scalable/
*
* John Heffner <jheffner@sc.edu>
*/
@@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (tp->snd_cwnd <= tp->snd_ssthresh)
tcp_slow_start(tp);
- else {
- tp->snd_cwnd_cnt++;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
- }
+ else
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
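
Scalable TCP now reuses tcp_cong_avoid_ai() with the count capped at TCP_SCALABLE_AI_CNT, so for large windows cwnd grows by roughly cwnd/50 segments per RTT rather than Reno's single segment. A sketch of the resulting growth rate; the value 50 is assumed from TCP_SCALABLE_AI_CNT of this era.

#include <stdio.h>

#define TOY_SCALABLE_AI_CNT 50U   /* assumed value of TCP_SCALABLE_AI_CNT */

/* With the count capped, cwnd grows by one segment per min(cwnd, 50) ACKs,
 * i.e. by about cwnd/50 segments per RTT once the window is large. */
static unsigned int toy_growth_per_rtt(unsigned int cwnd)
{
	unsigned int w = cwnd < TOY_SCALABLE_AI_CNT ? cwnd : TOY_SCALABLE_AI_CNT;

	return cwnd / w;   /* ACKs per RTT divided by the AI threshold */
}

int main(void)
{
	printf("cwnd=40  -> +%u per RTT\n", toy_growth_per_rtt(40));    /* 1 */
	printf("cwnd=500 -> +%u per RTT\n", toy_growth_per_rtt(500));   /* 10 */
	return 0;
}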
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0170e914f1b0..b144a26359bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
int mib_idx;
- if (icsk->icsk_ca_state == TCP_CA_Disorder ||
- icsk->icsk_ca_state == TCP_CA_Recovery) {
- if (tcp_is_sack(tp)) {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
- else
- mib_idx = LINUX_MIB_TCPSACKFAILURES;
- } else {
- if (icsk->icsk_ca_state == TCP_CA_Recovery)
- mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
- else
- mib_idx = LINUX_MIB_TCPRENOFAILURES;
- }
+ if (icsk->icsk_ca_state == TCP_CA_Disorder) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
+ } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+ else
+ mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
} else {
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index d08b2e855c22..e9bbff746488 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
/* In the "non-congestive state", increase cwnd
* every rtt.
*/
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
} else {
/* In the "congestive state", increase cwnd
* every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9ec843a9bbb2..66b6821b984e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
} else {
/* Reno */
-
- if (tp->snd_cwnd_cnt < tp->snd_cwnd)
- tp->snd_cwnd_cnt++;
-
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4bd178a111d5..05b7abb99f69 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
sk = sknext;
} while (sknext);
} else
- kfree_skb(skb);
+ consume_skb(skb);
spin_unlock(&hslot->lock);
return 0;
}
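
The switch from kfree_skb() to consume_skb() above matters for the new drop monitor (net/core/drop_monitor.c in the diffstat): both release the buffer, but only the kfree_skb() path is meant to be reported as a drop. A toy illustration of keeping the two release paths distinct; the toy_* names are made up and not the kernel API.

#include <stdio.h>
#include <stdlib.h>

/* Consume-style release for buffers that reached the end of their normal
 * life, drop-style release for buffers discarded on an error path (only the
 * latter would be traced by a drop monitor). */
static void toy_consume(void *p)
{
	free(p);
}

static void toy_drop(void *p, const char *reason)
{
	fprintf(stderr, "dropped: %s\n", reason);
	free(p);
}

int main(void)
{
	void *a = malloc(32);
	void *b = malloc(32);

	toy_consume(a);                  /* normal completion */
	toy_drop(b, "no receiver");      /* error path, counted as a drop */
	return 0;
}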
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 03e2a1ad71e9..8499da9e76a2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -40,6 +40,7 @@
#include <linux/errno.h>
#include <linux/types.h>
+#include <linux/kernel.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
@@ -493,15 +494,17 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
read_unlock(&dev_base_lock);
}
-static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
+static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
{
struct net *net;
net = (struct net *)table->extra2;
if (p == &net->ipv6.devconf_dflt->forwarding)
- return;
+ return 0;
+
+ if (!rtnl_trylock())
+ return -ERESTARTSYS;
- rtnl_lock();
if (p == &net->ipv6.devconf_all->forwarding) {
__s32 newf = net->ipv6.devconf_all->forwarding;
net->ipv6.devconf_dflt->forwarding = newf;
@@ -512,6 +515,7 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
if (*p)
rt6_purge_dflt_routers(net);
+ return 1;
}
#endif
@@ -587,6 +591,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
{
struct inet6_ifaddr *ifa = NULL;
struct rt6_info *rt;
+ struct net *net = dev_net(idev->dev);
int hash;
int err = 0;
int addr_type = ipv6_addr_type(addr);
@@ -603,6 +608,11 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
goto out2;
}
+ if (idev->cnf.disable_ipv6 || net->ipv6.devconf_all->disable_ipv6) {
+ err = -EACCES;
+ goto out2;
+ }
+
write_lock(&addrconf_hash_lock);
/* Ignore adding duplicate addresses on an interface */
@@ -1206,16 +1216,12 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
}
break;
} else if (minihiscore < miniscore) {
- struct ipv6_saddr_score *tmp;
-
if (hiscore->ifa)
in6_ifa_put(hiscore->ifa);
in6_ifa_hold(score->ifa);
- tmp = hiscore;
- hiscore = score;
- score = tmp;
+ swap(hiscore, score);
/* restore our iterator */
score->ifa = hiscore->ifa;
@@ -1430,6 +1436,11 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
+
+ if (net_ratelimit())
+ printk(KERN_INFO "%s: IPv6 duplicate address detected!\n",
+ ifp->idev->dev->name);
+
if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
struct in6_addr addr;
@@ -1440,11 +1451,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
ipv6_addr_equal(&ifp->addr, &addr)) {
/* DAD failed for link-local based on MAC address */
idev->cnf.disable_ipv6 = 1;
+
+ printk(KERN_INFO "%s: IPv6 being disabled!\n",
+ ifp->idev->dev->name);
}
}
- if (net_ratelimit())
- printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
addrconf_dad_stop(ifp);
}
@@ -2599,9 +2611,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
ASSERT_RTNL();
- if ((dev->flags & IFF_LOOPBACK) && how == 1)
- how = 0;
-
rt6_ifdown(net, dev);
neigh_ifdown(&nd_tbl, dev);
@@ -2823,11 +2832,6 @@ static void addrconf_dad_timer(unsigned long data)
read_unlock_bh(&idev->lock);
goto out;
}
- if (idev->cnf.accept_dad > 1 && idev->cnf.disable_ipv6) {
- read_unlock_bh(&idev->lock);
- addrconf_dad_failure(ifp);
- return;
- }
spin_lock_bh(&ifp->lock);
if (ifp->probes == 0) {
/*
@@ -3638,7 +3642,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
@@ -3849,7 +3854,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
@@ -3919,7 +3925,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
@@ -3974,7 +3981,7 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
if (write)
- addrconf_fixup_forwarding(ctl, valp, val);
+ ret = addrconf_fixup_forwarding(ctl, valp, val);
return ret;
}
@@ -4010,8 +4017,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table,
}
*valp = new;
- addrconf_fixup_forwarding(table, valp, val);
- return 1;
+ return addrconf_fixup_forwarding(table, valp, val);
}
static struct addrconf_sysctl_table
@@ -4437,25 +4443,6 @@ int unregister_inet6addr_notifier(struct notifier_block *nb)
EXPORT_SYMBOL(unregister_inet6addr_notifier);
-static void addrconf_net_exit(struct net *net)
-{
- struct net_device *dev;
-
- rtnl_lock();
- /* clean dev list */
- for_each_netdev(net, dev) {
- if (__in6_dev_get(dev) == NULL)
- continue;
- addrconf_ifdown(dev, 1);
- }
- addrconf_ifdown(net->loopback_dev, 2);
- rtnl_unlock();
-}
-
-static struct pernet_operations addrconf_net_ops = {
- .exit = addrconf_net_exit,
-};
-
/*
* Init / cleanup code
*/
@@ -4497,10 +4484,6 @@ int __init addrconf_init(void)
if (err)
goto errlo;
- err = register_pernet_device(&addrconf_net_ops);
- if (err)
- return err;
-
register_netdevice_notifier(&ipv6_dev_notf);
addrconf_verify(0);
@@ -4530,15 +4513,22 @@ errlo:
void addrconf_cleanup(void)
{
struct inet6_ifaddr *ifa;
+ struct net_device *dev;
int i;
unregister_netdevice_notifier(&ipv6_dev_notf);
- unregister_pernet_device(&addrconf_net_ops);
-
unregister_pernet_subsys(&addrconf_ops);
rtnl_lock();
+ /* clean dev list */
+ for_each_netdev(&init_net, dev) {
+ if (__in6_dev_get(dev) == NULL)
+ continue;
+ addrconf_ifdown(dev, 1);
+ }
+ addrconf_ifdown(init_net.loopback_dev, 2);
+
/*
* Check hash table.
*/
@@ -4559,6 +4549,4 @@ void addrconf_cleanup(void)
del_timer(&addr_chk_timer);
rtnl_unlock();
-
- unregister_pernet_subsys(&addrconf_net_ops);
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fa2ac7ee662f..fbf533cc9dce 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -72,6 +72,10 @@ MODULE_LICENSE("GPL");
static struct list_head inetsw6[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw6_lock);
+static int disable_ipv6 = 0;
+module_param_named(disable, disable_ipv6, int, 0);
+MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional");
+
static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -889,7 +893,7 @@ out_unlock:
return err;
}
-static struct packet_type ipv6_packet_type = {
+static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.func = ipv6_rcv,
.gso_send_check = ipv6_gso_send_check,
@@ -1001,10 +1005,21 @@ static int __init inet6_init(void)
{
struct sk_buff *dummy_skb;
struct list_head *r;
- int err;
+ int err = 0;
BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
+ /* Register the socket-side information for inet6_create. */
+ for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
+ INIT_LIST_HEAD(r);
+
+ if (disable_ipv6) {
+ printk(KERN_INFO
+ "IPv6: Loaded, but administratively disabled, "
+ "reboot required to enable\n");
+ goto out;
+ }
+
err = proto_register(&tcpv6_prot, 1);
if (err)
goto out;
@@ -1022,10 +1037,6 @@ static int __init inet6_init(void)
goto out_unregister_udplite_proto;
- /* Register the socket-side information for inet6_create. */
- for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
- INIT_LIST_HEAD(r);
-
/* We MUST register RAW sockets before we create the ICMP6,
* IGMP6, or NDISC control sockets.
*/
@@ -1191,6 +1202,9 @@ module_init(inet6_init);
static void __exit inet6_exit(void)
{
+ if (disable_ipv6)
+ return;
+
/* First of all disallow new sockets creation. */
sock_unregister(PF_INET6);
/* Disallow any further netlink messages */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 8fe267feb81e..1bcc3431859e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -258,11 +258,11 @@ unique:
if (twp != NULL) {
*twp = tw;
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw != NULL) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
inet_twsk_put(tw);
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 40f324655e24..d31df0f4bc9a 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -218,8 +218,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (opt)
sock_kfree_s(sk, opt, opt->tot_len);
pktopt = xchg(&np->pktoptions, NULL);
- if (pktopt)
- kfree_skb(pktopt);
+ kfree_skb(pktopt);
sk->sk_destruct = inet_sock_destruct;
/*
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3cd83b85e9ef..9f061d1adbc2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
&ipv6_hdr(ra)->saddr);
nlmsg_end(skb, nlh);
- err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL,
- GFP_ATOMIC);
- if (err < 0)
- goto errout;
-
+ rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
return;
nla_put_failure:
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 165b256a6fa0..41b8a956e1be 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -205,8 +205,9 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
- nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
- "nf_ct_icmpv6: ICMPv6 checksum failed\n");
+ if (LOG_INVALID(net, IPPROTO_ICMPV6))
+ nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmpv6: ICMPv6 checksum failed ");
return -NF_ACCEPT;
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index ed4d79a9e4a6..058a5e4a60c3 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -528,14 +528,14 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
if (!ipv6_ext_hdr(nexthdr)) {
return -1;
}
- if (len < (int)sizeof(struct ipv6_opt_hdr)) {
- pr_debug("too short\n");
- return -1;
- }
if (nexthdr == NEXTHDR_NONE) {
pr_debug("next header is none\n");
return -1;
}
+ if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+ pr_debug("too short\n");
+ return -1;
+ }
if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
BUG();
if (nexthdr == NEXTHDR_AUTH)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3c575118fca5..e9ac7a12f595 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -452,6 +452,7 @@ err:
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
struct net_device *dev)
{
+ struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
struct sk_buff *fp, *head = fq->q.fragments;
int payload_len;
unsigned int nhoff;
@@ -551,8 +552,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
head->csum);
rcu_read_lock();
- IP6_INC_STATS_BH(dev_net(dev),
- __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
return 1;
@@ -566,8 +566,7 @@ out_oom:
printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
out_fail:
rcu_read_lock();
- IP6_INC_STATS_BH(dev_net(dev),
- __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+ IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
rcu_read_unlock();
return -1;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c3d486a3edad..1394ddb6e35c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
- info->nlh, gfp_any());
+ rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, gfp_any());
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d3467e563f02..664ab82e03b2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -188,9 +188,9 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
}
nt = netdev_priv(dev);
- ipip6_tunnel_init(dev);
nt->parms = *parms;
+ ipip6_tunnel_init(dev);
if (parms->i_flags & SIT_ISATAP)
dev->priv_flags |= IFF_ISATAP;
@@ -454,7 +454,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
goto out;
- if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+ if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
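
The error-timeout tests in this file now use time_before(), the standard wrap-safe way to order jiffies-style counters through a signed difference. A self-contained sketch of the idiom with a toy macro (the kernel's lives in linux/jiffies.h):

#include <stdio.h>

/* Two wrapping tick counters ordered via a signed difference, so the result
 * stays correct across a counter wrap. */
#define toy_ticks_before(a, b)  ((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long just_before_wrap = (unsigned long)-10;
	unsigned long just_after_wrap  = 5;

	/* a plain "<" gets the ordering backwards across the wrap */
	printf("plain <          : %d\n", just_before_wrap < just_after_wrap ? 1 : 0);
	printf("toy_ticks_before : %d\n", toy_ticks_before(just_before_wrap, just_after_wrap));
	return 0;
}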
@@ -658,7 +658,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
if (tunnel->err_count > 0) {
- if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+ if (time_before(jiffies,
+ tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
dst_link_failure(skb);
} else
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 00f1269e11e9..4b5aa1854260 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -533,8 +533,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
- if (inet6_rsk(req)->pktopts)
- kfree_skb(inet6_rsk(req)->pktopts);
+ kfree_skb(inet6_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
@@ -1611,8 +1610,7 @@ ipv6_pktoptions:
}
}
- if (opt_skb)
- kfree_skb(opt_skb);
+ kfree_skb(opt_skb);
return 0;
}
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 0e685b05496e..f417b77fa0e1 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -69,7 +69,7 @@ __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
for (i = 0; i < n; i++) {
dst[count[class[i] - 1]++] = src[i];
- src[i] = 0;
+ src[i] = NULL;
}
return 0;
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 43d0ffc6d565..1627050e29fd 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1958,12 +1958,12 @@ static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
SOCKOPS_WRAP(ipx_dgram, PF_IPX);
-static struct packet_type ipx_8023_packet_type = {
+static struct packet_type ipx_8023_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_802_3),
.func = ipx_rcv,
};
-static struct packet_type ipx_dix_packet_type = {
+static struct packet_type ipx_dix_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPX),
.func = ipx_rcv,
};
@@ -1975,15 +1975,15 @@ static struct notifier_block ipx_dev_notifier = {
extern struct datalink_proto *make_EII_client(void);
extern void destroy_EII_client(struct datalink_proto *);
-static unsigned char ipx_8022_type = 0xE0;
-static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
-static char ipx_EII_err_msg[] __initdata =
+static const unsigned char ipx_8022_type = 0xE0;
+static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
+static const char ipx_EII_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with Ethernet II\n";
-static char ipx_8023_err_msg[] __initdata =
+static const char ipx_8023_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with 802.3\n";
-static char ipx_llc_err_msg[] __initdata =
+static const char ipx_llc_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with 802.2\n";
-static char ipx_snap_err_msg[] __initdata =
+static const char ipx_snap_err_msg[] __initconst =
KERN_CRIT "IPX: Unable to register with SNAP\n";
static int __init ipx_init(void)
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index ea319e3ddc18..bf92e1473447 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -149,13 +149,14 @@ int irda_device_is_receiving(struct net_device *dev)
IRDA_DEBUG(2, "%s()\n", __func__);
- if (!dev->do_ioctl) {
+ if (!dev->netdev_ops->ndo_do_ioctl) {
IRDA_ERROR("%s: do_ioctl not impl. by device driver\n",
__func__);
return -1;
}
- ret = dev->do_ioctl(dev, (struct ifreq *) &req, SIOCGRECEIVING);
+ ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req,
+ SIOCGRECEIVING);
if (ret < 0)
return ret;
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 05112be99569..724bcf951b80 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -45,6 +45,16 @@ static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev);
static void irlan_eth_set_multicast_list( struct net_device *dev);
static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
+static const struct net_device_ops irlan_eth_netdev_ops = {
+ .ndo_open = irlan_eth_open,
+ .ndo_stop = irlan_eth_close,
+ .ndo_start_xmit = irlan_eth_xmit,
+ .ndo_get_stats = irlan_eth_get_stats,
+ .ndo_set_multicast_list = irlan_eth_set_multicast_list,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
/*
* Function irlan_eth_setup (dev)
*
@@ -53,14 +63,11 @@ static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
*/
static void irlan_eth_setup(struct net_device *dev)
{
- dev->open = irlan_eth_open;
- dev->stop = irlan_eth_close;
- dev->hard_start_xmit = irlan_eth_xmit;
- dev->get_stats = irlan_eth_get_stats;
- dev->set_multicast_list = irlan_eth_set_multicast_list;
+ ether_setup(dev);
+
+ dev->netdev_ops = &irlan_eth_netdev_ops;
dev->destructor = free_netdev;
- ether_setup(dev);
/*
* Lets do all queueing in IrTTP instead of this device driver.
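
The irlan_eth conversion above moves the per-device function pointers into a single const net_device_ops table. Below is a stand-alone sketch of that ops-table pattern using made-up toy_* types; it illustrates the shape of the change, not the kernel structures.

#include <stdio.h>

struct toy_dev;

/* One shared, const table of operations instead of function pointers patched
 * into every device instance. */
struct toy_dev_ops {
	int (*open)(struct toy_dev *dev);
	int (*xmit)(struct toy_dev *dev, const char *frame);
};

struct toy_dev {
	const char *name;
	const struct toy_dev_ops *ops;
};

static int toy_open(struct toy_dev *dev)
{
	printf("%s: up\n", dev->name);
	return 0;
}

static int toy_xmit(struct toy_dev *dev, const char *frame)
{
	printf("%s: tx '%s'\n", dev->name, frame);
	return 0;
}

static const struct toy_dev_ops toy_ops = {
	.open = toy_open,
	.xmit = toy_xmit,
};

int main(void)
{
	struct toy_dev dev = { .name = "irlan0", .ops = &toy_ops };

	dev.ops->open(&dev);
	dev.ops->xmit(&dev, "hello");
	return 0;
}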
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 1bb607f2f5c7..303a68d92731 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(irda_debug);
/* Packet type handler.
* Tell the kernel how IrDA packets should be handled.
*/
-static struct packet_type irda_packet_type = {
+static struct packet_type irda_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IRDA),
.func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */
};
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index eb8a2a0b6eb7..49e786535dc8 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1171,8 +1171,7 @@ static void iucv_callback_txdone(struct iucv_path *path,
spin_unlock_irqrestore(&list->lock, flags);
- if (this)
- kfree_skb(this);
+ kfree_skb(this);
}
BUG_ON(!this);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 7dcbde3ea7d9..643c1be2d02e 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -313,8 +313,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
if (one_sk != NULL)
err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
- if (skb2)
- kfree_skb(skb2);
+ kfree_skb(skb2);
kfree_skb(skb);
return err;
}
@@ -3573,8 +3572,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb,
out:
if (err && hdr && pfkey_error(hdr, err, sk) == 0)
err = 0;
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
return err ? : len;
}
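
These two hunks, like the similar ones in net/ipv6/ipv6_sockglue.c, net/ipv6/tcp_ipv6.c, net/iucv/af_iucv.c and net/llc/llc_conn.c, drop the "if (ptr)" guard because kfree_skb() already returns early on a NULL skb, just as free() does. A minimal sketch of the pattern:

#include <stdio.h>
#include <stdlib.h>

/* Like free(), the release function bails out on NULL, so callers don't need
 * to test the pointer first. */
static void toy_free_skb(void *skb)
{
	if (!skb)
		return;          /* NULL is an explicit no-op */
	free(skb);
}

int main(void)
{
	char *skb = malloc(16);

	toy_free_skb(skb);       /* normal release */
	toy_free_skb(NULL);      /* harmless without a guard at the call site */
	printf("done\n");
	return 0;
}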
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 56fd85ab358e..febae702685c 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1118,11 +1118,11 @@ static const struct proto_ops llc_ui_ops = {
.sendpage = sock_no_sendpage,
};
-static char llc_proc_err_msg[] __initdata =
+static const char llc_proc_err_msg[] __initconst =
KERN_CRIT "LLC: Unable to register the proc_fs entries\n";
-static char llc_sysctl_err_msg[] __initdata =
+static const char llc_sysctl_err_msg[] __initconst =
KERN_CRIT "LLC: Unable to register the sysctl entries\n";
-static char llc_sock_err_msg[] __initdata =
+static const char llc_sock_err_msg[] __initconst =
KERN_CRIT "LLC: Unable to register the network family\n";
static int __init llc2_init(void)
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 5c6d89c6d51d..3477624a4906 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -332,8 +332,7 @@ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked)
for (i = 0; i < pdu_pos && i < q_len; i++) {
skb = skb_dequeue(&llc->pdu_unack_q);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
nbr_acked++;
}
out:
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index a7fe1adc378d..ff4c0ab96a69 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -147,12 +147,12 @@ void llc_sap_close(struct llc_sap *sap)
kfree(sap);
}
-static struct packet_type llc_packet_type = {
+static struct packet_type llc_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_802_2),
.func = llc_rcv,
};
-static struct packet_type llc_tr_packet_type = {
+static struct packet_type llc_tr_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_TR_802_2),
.func = llc_rcv,
};
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 3503a3d21318..0e3ab88bb706 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -9,6 +9,7 @@ mac80211-y := \
wpa.o \
scan.o \
ht.o agg-tx.o agg-rx.o \
+ ibss.o \
mlme.o \
iface.o \
rate.o \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 3112bfd441b6..a95affc94629 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -129,7 +129,6 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
u8 dialog_token, u16 status, u16 policy,
u16 buf_size, u16 timeout)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
@@ -151,8 +150,9 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
- else
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 1232d9f01ca9..1df116d4d6e7 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -49,7 +49,6 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
u16 agg_size, u16 timeout)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u16 capab;
@@ -69,8 +68,8 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
- else
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
@@ -132,9 +131,24 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
state = &sta->ampdu_mlme.tid_state_tx[tid];
- if (local->hw.ampdu_queues)
- ieee80211_stop_queue(&local->hw, sta->tid_to_tx_q[tid]);
+ if (local->hw.ampdu_queues) {
+ if (initiator) {
+ /*
+ * Stop the AC queue to avoid sending unaggregated
+ * frames before the delba goes out.
+ */
+ ieee80211_stop_queue_by_reason(&local->hw,
+ local->hw.queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
+ /*
+ * Pretend the driver woke the queue, just in case
+ * it disabled it before the session was stopped.
+ */
+ ieee80211_wake_queue(
+ &local->hw, local->hw.queues + sta->tid_to_tx_q[tid]);
+ }
*state = HT_AGG_STATE_REQ_STOP_BA_MSK |
(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
@@ -144,8 +158,6 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
/* HW shall not deny going back to legacy */
if (WARN_ON(ret)) {
*state = HT_AGG_STATE_OPERATIONAL;
- if (local->hw.ampdu_queues)
- ieee80211_wake_queue(&local->hw, sta->tid_to_tx_q[tid]);
}
return ret;
@@ -189,14 +201,19 @@ static void sta_addba_resp_timer_expired(unsigned long data)
spin_unlock_bh(&sta->lock);
}
+static inline int ieee80211_ac_from_tid(int tid)
+{
+ return ieee802_1d_to_ac[tid & 7];
+}
+
int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
{
struct ieee80211_local *local = hw_to_local(hw);
struct sta_info *sta;
struct ieee80211_sub_if_data *sdata;
- u16 start_seq_num;
u8 *state;
- int ret = 0;
+ int i, qn = -1, ret = 0;
+ u16 start_seq_num;
if (WARN_ON(!local->ops->ampdu_action))
return -EINVAL;
@@ -209,6 +226,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
ra, tid);
#endif /* CONFIG_MAC80211_HT_DEBUG */
+ if (hw->ampdu_queues && ieee80211_ac_from_tid(tid) == 0) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+ printk(KERN_DEBUG "rejecting on voice AC\n");
+#endif
+ return -EINVAL;
+ }
+
rcu_read_lock();
sta = sta_info_get(local, ra);
@@ -217,7 +241,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
printk(KERN_DEBUG "Could not find the station\n");
#endif
ret = -ENOENT;
- goto exit;
+ goto unlock;
}
/*
@@ -230,11 +254,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
sta->sdata->vif.type != NL80211_IFTYPE_AP) {
ret = -EINVAL;
- goto exit;
+ goto unlock;
}
spin_lock_bh(&sta->lock);
+ sdata = sta->sdata;
+
/* we have tried too many times, receiver does not want A-MPDU */
if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
ret = -EBUSY;
@@ -252,6 +278,42 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
goto err_unlock_sta;
}
+ if (hw->ampdu_queues) {
+ spin_lock(&local->queue_stop_reason_lock);
+ /* reserve a new queue for this session */
+ for (i = 0; i < local->hw.ampdu_queues; i++) {
+ if (local->ampdu_ac_queue[i] < 0) {
+ qn = i;
+ local->ampdu_ac_queue[qn] =
+ ieee80211_ac_from_tid(tid);
+ break;
+ }
+ }
+ spin_unlock(&local->queue_stop_reason_lock);
+
+ if (qn < 0) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+ printk(KERN_DEBUG "BA request denied - "
+ "queue unavailable for tid %d\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+ ret = -ENOSPC;
+ goto err_unlock_sta;
+ }
+
+ /*
+ * If we successfully allocate the session, we can't have
+ * anything going on on the queue this TID maps into, so
+ * stop it for now. This is a "virtual" stop using the same
+ * mechanism that drivers will use.
+ *
+ * XXX: queue up frames for this session in the sta_info
+ * struct instead to avoid hitting all other STAs.
+ */
+ ieee80211_stop_queue_by_reason(
+ &local->hw, hw->queues + qn,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
+
/* prepare A-MPDU MLME for Tx aggregation */
sta->ampdu_mlme.tid_tx[tid] =
kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
@@ -262,8 +324,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
tid);
#endif
ret = -ENOMEM;
- goto err_unlock_sta;
+ goto err_return_queue;
}
+
/* Tx timer */
sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function =
sta_addba_resp_timer_expired;
@@ -271,49 +334,25 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
(unsigned long)&sta->timer_to_tid[tid];
init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
- if (hw->ampdu_queues) {
- /* create a new queue for this aggregation */
- ret = ieee80211_ht_agg_queue_add(local, sta, tid);
-
- /* case no queue is available to aggregation
- * don't switch to aggregation */
- if (ret) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
- printk(KERN_DEBUG "BA request denied - "
- "queue unavailable for tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
- goto err_unlock_queue;
- }
- }
- sdata = sta->sdata;
-
/* Ok, the Addba frame hasn't been sent yet, but if the driver calls the
* call back right away, it must see that the flow has begun */
*state |= HT_ADDBA_REQUESTED_MSK;
- /* This is slightly racy because the queue isn't stopped */
start_seq_num = sta->tid_seq[tid];
ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
&sta->sta, tid, &start_seq_num);
if (ret) {
- /* No need to requeue the packets in the agg queue, since we
- * held the tx lock: no packet could be enqueued to the newly
- * allocated queue */
- if (hw->ampdu_queues)
- ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "BA request denied - HW unavailable for"
" tid %d\n", tid);
#endif /* CONFIG_MAC80211_HT_DEBUG */
*state = HT_AGG_STATE_IDLE;
- goto err_unlock_queue;
+ goto err_free;
}
+ sta->tid_to_tx_q[tid] = qn;
- /* Will put all the packets in the new SW queue */
- if (hw->ampdu_queues)
- ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
spin_unlock_bh(&sta->lock);
/* send an addBA request */
@@ -322,7 +361,6 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
sta->ampdu_mlme.dialog_token_allocator;
sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
-
ieee80211_send_addba_request(sta->sdata, ra, tid,
sta->ampdu_mlme.tid_tx[tid]->dialog_token,
sta->ampdu_mlme.tid_tx[tid]->ssn,
@@ -334,15 +372,24 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
#endif
- goto exit;
+ goto unlock;
-err_unlock_queue:
+ err_free:
kfree(sta->ampdu_mlme.tid_tx[tid]);
sta->ampdu_mlme.tid_tx[tid] = NULL;
- ret = -EBUSY;
-err_unlock_sta:
+ err_return_queue:
+ if (qn >= 0) {
+ /* We failed, so start queue again right away. */
+ ieee80211_wake_queue_by_reason(hw, hw->queues + qn,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ /* give queue back to pool */
+ spin_lock(&local->queue_stop_reason_lock);
+ local->ampdu_ac_queue[qn] = -1;
+ spin_unlock(&local->queue_stop_reason_lock);
+ }
+ err_unlock_sta:
spin_unlock_bh(&sta->lock);
-exit:
+ unlock:
rcu_read_unlock();
return ret;
}
@@ -375,7 +422,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
state = &sta->ampdu_mlme.tid_state_tx[tid];
spin_lock_bh(&sta->lock);
- if (!(*state & HT_ADDBA_REQUESTED_MSK)) {
+ if (WARN_ON(!(*state & HT_ADDBA_REQUESTED_MSK))) {
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "addBA was not requested yet, state is %d\n",
*state);
@@ -385,7 +432,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
return;
}
- WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK);
+ if (WARN_ON(*state & HT_ADDBA_DRV_READY_MSK))
+ goto out;
*state |= HT_ADDBA_DRV_READY_MSK;
@@ -393,9 +441,18 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
#ifdef CONFIG_MAC80211_HT_DEBUG
printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid);
#endif
- if (hw->ampdu_queues)
- ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+ if (hw->ampdu_queues) {
+ /*
+ * Wake up this queue, we stopped it earlier,
+ * this will in turn wake the entire AC.
+ */
+ ieee80211_wake_queue_by_reason(hw,
+ hw->queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
}
+
+ out:
spin_unlock_bh(&sta->lock);
rcu_read_unlock();
}
@@ -485,7 +542,6 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
struct ieee80211_local *local = hw_to_local(hw);
struct sta_info *sta;
u8 *state;
- int agg_queue;
if (tid >= STA_TID_NUM) {
#ifdef CONFIG_MAC80211_HT_DEBUG
@@ -527,19 +583,19 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
ieee80211_send_delba(sta->sdata, ra, tid,
WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
- if (hw->ampdu_queues) {
- agg_queue = sta->tid_to_tx_q[tid];
- ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
+ spin_lock_bh(&sta->lock);
- /* We just requeued the all the frames that were in the
- * removed queue, and since we might miss a softirq we do
- * netif_schedule_queue. ieee80211_wake_queue is not used
- * here as this queue is not necessarily stopped
+ if (*state & HT_AGG_STATE_INITIATOR_MSK &&
+ hw->ampdu_queues) {
+ /*
+ * Wake up this queue, we stopped it earlier,
+ * this will in turn wake the entire AC.
*/
- netif_schedule_queue(netdev_get_tx_queue(local->mdev,
- agg_queue));
+ ieee80211_wake_queue_by_reason(hw,
+ hw->queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
}
- spin_lock_bh(&sta->lock);
+
*state = HT_AGG_STATE_IDLE;
sta->ampdu_mlme.addba_req_num[tid] = 0;
kfree(sta->ampdu_mlme.tid_tx[tid]);
@@ -613,12 +669,21 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
#endif /* CONFIG_MAC80211_HT_DEBUG */
if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
== WLAN_STATUS_SUCCESS) {
+ u8 curstate = *state;
+
*state |= HT_ADDBA_RECEIVED_MSK;
- sta->ampdu_mlme.addba_req_num[tid] = 0;
- if (*state == HT_AGG_STATE_OPERATIONAL &&
- local->hw.ampdu_queues)
- ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+ if (hw->ampdu_queues && *state != curstate &&
+ *state == HT_AGG_STATE_OPERATIONAL) {
+ /*
+ * Wake up this queue, we stopped it earlier,
+ * this will in turn wake the entire AC.
+ */
+ ieee80211_wake_queue_by_reason(hw,
+ hw->queues + sta->tid_to_tx_q[tid],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+ }
+ sta->ampdu_mlme.addba_req_num[tid] = 0;
if (local->ops->ampdu_action) {
(void)local->ops->ampdu_action(hw,
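/*
 * Illustrative sketch, not part of this patch: the aggregation state
 * tested above is a bitmask, and the session counts as operational only
 * once the addBA request was sent, the driver reported ready and the
 * peer's addBA response arrived. The bit values here are assumptions
 * made for the example.
 */
#define EXAMPLE_ADDBA_REQUESTED	0x01	/* addBA request has been sent */
#define EXAMPLE_ADDBA_DRV_READY	0x02	/* driver finished the TX_START action */
#define EXAMPLE_ADDBA_RECEIVED	0x04	/* peer answered with a successful response */
#define EXAMPLE_AGG_OPERATIONAL	(EXAMPLE_ADDBA_REQUESTED | \
				 EXAMPLE_ADDBA_DRV_READY | \
				 EXAMPLE_ADDBA_RECEIVED)

static int example_becomes_operational(unsigned char oldstate, unsigned char newstate)
{
	/* wake the stopped queue only on the transition into the operational state */
	return oldstate != newstate && newstate == EXAMPLE_AGG_OPERATIONAL;
}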
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c8d969be440b..58693e52d458 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -341,11 +341,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
sinfo->filled = STATION_INFO_INACTIVE_TIME |
STATION_INFO_RX_BYTES |
STATION_INFO_TX_BYTES |
+ STATION_INFO_RX_PACKETS |
+ STATION_INFO_TX_PACKETS |
STATION_INFO_TX_BITRATE;
sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
sinfo->rx_bytes = sta->rx_bytes;
sinfo->tx_bytes = sta->tx_bytes;
+ sinfo->rx_packets = sta->rx_packets;
+ sinfo->tx_packets = sta->tx_packets;
if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
sinfo->filled |= STATION_INFO_SIGNAL;
@@ -447,7 +451,8 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata,
* This is a kludge. beacon interval should really be part
* of the beacon information.
*/
- if (params->interval) {
+ if (params->interval && (sdata->local->hw.conf.beacon_int !=
+ params->interval)) {
sdata->local->hw.conf.beacon_int = params->interval;
err = ieee80211_hw_config(sdata->local,
IEEE80211_CONF_CHANGE_BEACON_INTERVAL);
@@ -1180,45 +1185,45 @@ static int set_mgmt_extra_ie_sta(struct ieee80211_sub_if_data *sdata,
u8 subtype, u8 *ies, size_t ies_len)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
switch (subtype) {
case IEEE80211_STYPE_PROBE_REQ >> 4:
if (local->ops->hw_scan)
break;
- kfree(ifsta->ie_probereq);
- ifsta->ie_probereq = ies;
- ifsta->ie_probereq_len = ies_len;
+ kfree(ifmgd->ie_probereq);
+ ifmgd->ie_probereq = ies;
+ ifmgd->ie_probereq_len = ies_len;
return 0;
case IEEE80211_STYPE_PROBE_RESP >> 4:
- kfree(ifsta->ie_proberesp);
- ifsta->ie_proberesp = ies;
- ifsta->ie_proberesp_len = ies_len;
+ kfree(ifmgd->ie_proberesp);
+ ifmgd->ie_proberesp = ies;
+ ifmgd->ie_proberesp_len = ies_len;
return 0;
case IEEE80211_STYPE_AUTH >> 4:
- kfree(ifsta->ie_auth);
- ifsta->ie_auth = ies;
- ifsta->ie_auth_len = ies_len;
+ kfree(ifmgd->ie_auth);
+ ifmgd->ie_auth = ies;
+ ifmgd->ie_auth_len = ies_len;
return 0;
case IEEE80211_STYPE_ASSOC_REQ >> 4:
- kfree(ifsta->ie_assocreq);
- ifsta->ie_assocreq = ies;
- ifsta->ie_assocreq_len = ies_len;
+ kfree(ifmgd->ie_assocreq);
+ ifmgd->ie_assocreq = ies;
+ ifmgd->ie_assocreq_len = ies_len;
return 0;
case IEEE80211_STYPE_REASSOC_REQ >> 4:
- kfree(ifsta->ie_reassocreq);
- ifsta->ie_reassocreq = ies;
- ifsta->ie_reassocreq_len = ies_len;
+ kfree(ifmgd->ie_reassocreq);
+ ifmgd->ie_reassocreq = ies;
+ ifmgd->ie_reassocreq_len = ies_len;
return 0;
case IEEE80211_STYPE_DEAUTH >> 4:
- kfree(ifsta->ie_deauth);
- ifsta->ie_deauth = ies;
- ifsta->ie_deauth_len = ies_len;
+ kfree(ifmgd->ie_deauth);
+ ifmgd->ie_deauth = ies;
+ ifmgd->ie_deauth_len = ies_len;
return 0;
case IEEE80211_STYPE_DISASSOC >> 4:
- kfree(ifsta->ie_disassoc);
- ifsta->ie_disassoc = ies;
- ifsta->ie_disassoc_len = ies_len;
+ kfree(ifmgd->ie_disassoc);
+ ifmgd->ie_disassoc = ies;
+ ifmgd->ie_disassoc_len = ies_len;
return 0;
}
@@ -1248,7 +1253,6 @@ static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy,
switch (sdata->vif.type) {
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
ret = set_mgmt_extra_ie_sta(sdata, params->subtype,
ies, ies_len);
break;
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index c54219301724..e3420329f4e6 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -94,31 +94,31 @@ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC);
IEEE80211_IF_FILE(force_unicast_rateidx, force_unicast_rateidx, DEC);
IEEE80211_IF_FILE(max_ratectrl_rateidx, max_ratectrl_rateidx, DEC);
-/* STA/IBSS attributes */
-IEEE80211_IF_FILE(state, u.sta.state, DEC);
-IEEE80211_IF_FILE(bssid, u.sta.bssid, MAC);
-IEEE80211_IF_FILE(prev_bssid, u.sta.prev_bssid, MAC);
-IEEE80211_IF_FILE(ssid_len, u.sta.ssid_len, SIZE);
-IEEE80211_IF_FILE(aid, u.sta.aid, DEC);
-IEEE80211_IF_FILE(ap_capab, u.sta.ap_capab, HEX);
-IEEE80211_IF_FILE(capab, u.sta.capab, HEX);
-IEEE80211_IF_FILE(extra_ie_len, u.sta.extra_ie_len, SIZE);
-IEEE80211_IF_FILE(auth_tries, u.sta.auth_tries, DEC);
-IEEE80211_IF_FILE(assoc_tries, u.sta.assoc_tries, DEC);
-IEEE80211_IF_FILE(auth_algs, u.sta.auth_algs, HEX);
-IEEE80211_IF_FILE(auth_alg, u.sta.auth_alg, DEC);
-IEEE80211_IF_FILE(auth_transaction, u.sta.auth_transaction, DEC);
+/* STA attributes */
+IEEE80211_IF_FILE(state, u.mgd.state, DEC);
+IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
+IEEE80211_IF_FILE(prev_bssid, u.mgd.prev_bssid, MAC);
+IEEE80211_IF_FILE(ssid_len, u.mgd.ssid_len, SIZE);
+IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
+IEEE80211_IF_FILE(ap_capab, u.mgd.ap_capab, HEX);
+IEEE80211_IF_FILE(capab, u.mgd.capab, HEX);
+IEEE80211_IF_FILE(extra_ie_len, u.mgd.extra_ie_len, SIZE);
+IEEE80211_IF_FILE(auth_tries, u.mgd.auth_tries, DEC);
+IEEE80211_IF_FILE(assoc_tries, u.mgd.assoc_tries, DEC);
+IEEE80211_IF_FILE(auth_algs, u.mgd.auth_algs, HEX);
+IEEE80211_IF_FILE(auth_alg, u.mgd.auth_alg, DEC);
+IEEE80211_IF_FILE(auth_transaction, u.mgd.auth_transaction, DEC);
static ssize_t ieee80211_if_fmt_flags(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
{
return scnprintf(buf, buflen, "%s%s%s%s%s%s%s\n",
- sdata->u.sta.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "",
- sdata->u.sta.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "",
+ sdata->u.mgd.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "",
sdata->vif.bss_conf.use_cts_prot ? "CTS prot\n" : "");
}
__IEEE80211_IF_FILE(flags);
@@ -283,9 +283,11 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
#endif
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
add_sta_files(sdata);
break;
+ case NL80211_IFTYPE_ADHOC:
+ /* XXX */
+ break;
case NL80211_IFTYPE_AP:
add_ap_files(sdata);
break;
@@ -418,9 +420,11 @@ static void del_files(struct ieee80211_sub_if_data *sdata)
#endif
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
del_sta_files(sdata);
break;
+ case NL80211_IFTYPE_ADHOC:
+ /* XXX */
+ break;
case NL80211_IFTYPE_AP:
del_ap_files(sdata);
break;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 82ea0b63a386..4e3c72f20de7 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -17,6 +17,7 @@
#include <net/wireless.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
+#include "rate.h"
void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
struct ieee80211_ht_cap *ht_cap_ie,
@@ -93,7 +94,9 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_bss_ht_conf ht;
+ struct sta_info *sta;
u32 changed = 0;
bool enable_ht = true, ht_changed;
enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
@@ -136,6 +139,16 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
if (ht_changed) {
/* channel_type change automatically detected */
ieee80211_hw_config(local, 0);
+
+ rcu_read_lock();
+
+ sta = sta_info_get(local, ifmgd->bssid);
+ if (sta)
+ rate_control_rate_update(local, sband, sta,
+ IEEE80211_RC_HT_CHANGED);
+
+ rcu_read_unlock();
+
}
/* disable HT */
@@ -169,7 +182,6 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
u16 initiator, u16 reason_code)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u16 params;
@@ -190,8 +202,9 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
- else
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
new file mode 100644
index 000000000000..f4becc12904e
--- /dev/null
+++ b/net/mac80211/ibss.c
@@ -0,0 +1,907 @@
+/*
+ * IBSS mode implementation
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "rate.h"
+
+#define IEEE80211_SCAN_INTERVAL (2 * HZ)
+#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
+#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
+
+#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
+#define IEEE80211_IBSS_MERGE_DELAY 0x400000
+#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
+
+#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
+
+
+static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ u16 auth_alg, auth_transaction, status_code;
+
+ if (len < 24 + 6)
+ return;
+
+ auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
+ auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
+ status_code = le16_to_cpu(mgmt->u.auth.status_code);
+
+ /*
+ * IEEE 802.11 standard does not require authentication in IBSS
+ * networks and most implementations do not seem to use it.
+ * However, try to reply to authentication attempts if someone
+ * has actually implemented this.
+ */
+ if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1)
+ ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0,
+ sdata->u.ibss.bssid, 0);
+}
+
+static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid, const int beacon_int,
+ const int freq,
+ const size_t supp_rates_len,
+ const u8 *supp_rates,
+ const u16 capability, u64 tsf)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ int res = 0, rates, i, j;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos;
+ struct ieee80211_supported_band *sband;
+ union iwreq_data wrqu;
+
+ if (local->ops->reset_tsf) {
+ /* Reset own TSF to allow time synchronization to work. */
+ local->ops->reset_tsf(local_to_hw(local));
+ }
+
+ if ((ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) &&
+ memcmp(ifibss->bssid, bssid, ETH_ALEN) == 0)
+ return res;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
+ "response\n", sdata->dev->name);
+ return -ENOMEM;
+ }
+
+ if (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) {
+ /* Remove possible STA entries from other IBSS networks. */
+ sta_info_flush_delayed(sdata);
+ }
+
+ memcpy(ifibss->bssid, bssid, ETH_ALEN);
+ res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
+ if (res)
+ return res;
+
+ local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10;
+
+ sdata->drop_unencrypted = capability &
+ WLAN_CAPABILITY_PRIVACY ? 1 : 0;
+
+ res = ieee80211_set_freq(sdata, freq);
+
+ if (res)
+ return res;
+
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+ /* Build IBSS probe response */
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ mgmt = (struct ieee80211_mgmt *)
+ skb_put(skb, 24 + sizeof(mgmt->u.beacon));
+ memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_RESP);
+ memset(mgmt->da, 0xff, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
+ mgmt->u.beacon.beacon_int =
+ cpu_to_le16(local->hw.conf.beacon_int);
+ mgmt->u.beacon.timestamp = cpu_to_le64(tsf);
+ mgmt->u.beacon.capab_info = cpu_to_le16(capability);
+
+ pos = skb_put(skb, 2 + ifibss->ssid_len);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = ifibss->ssid_len;
+ memcpy(pos, ifibss->ssid, ifibss->ssid_len);
+
+ rates = supp_rates_len;
+ if (rates > 8)
+ rates = 8;
+ pos = skb_put(skb, 2 + rates);
+ *pos++ = WLAN_EID_SUPP_RATES;
+ *pos++ = rates;
+ memcpy(pos, supp_rates, rates);
+
+ if (sband->band == IEEE80211_BAND_2GHZ) {
+ pos = skb_put(skb, 2 + 1);
+ *pos++ = WLAN_EID_DS_PARAMS;
+ *pos++ = 1;
+ *pos++ = ieee80211_frequency_to_channel(freq);
+ }
+
+ pos = skb_put(skb, 2 + 2);
+ *pos++ = WLAN_EID_IBSS_PARAMS;
+ *pos++ = 2;
+ /* FIX: set ATIM window based on scan results */
+ *pos++ = 0;
+ *pos++ = 0;
+
+ if (supp_rates_len > 8) {
+ rates = supp_rates_len - 8;
+ pos = skb_put(skb, 2 + rates);
+ *pos++ = WLAN_EID_EXT_SUPP_RATES;
+ *pos++ = rates;
+ memcpy(pos, &supp_rates[8], rates);
+ }
+
+ ifibss->probe_resp = skb;
+
+ ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON |
+ IEEE80211_IFCC_BEACON_ENABLED);
+
+
+ rates = 0;
+ for (i = 0; i < supp_rates_len; i++) {
+ int bitrate = (supp_rates[i] & 0x7f) * 5;
+ for (j = 0; j < sband->n_bitrates; j++)
+ if (sband->bitrates[j].bitrate == bitrate)
+ rates |= BIT(j);
+ }
+
+ ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates);
+
+ ifibss->flags |= IEEE80211_IBSS_PREV_BSSID_SET;
+ ifibss->state = IEEE80211_IBSS_MLME_JOINED;
+ mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
+
+ memset(&wrqu, 0, sizeof(wrqu));
+ memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+ wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
+
+ return res;
+}
+
+static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_bss *bss)
+{
+ return __ieee80211_sta_join_ibss(sdata,
+ bss->cbss.bssid,
+ bss->cbss.beacon_interval,
+ bss->cbss.channel->center_freq,
+ bss->supp_rates_len, bss->supp_rates,
+ bss->cbss.capability,
+ bss->cbss.tsf);
+}
+
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee802_11_elems *elems,
+ bool beacon)
+{
+ struct ieee80211_local *local = sdata->local;
+ int freq;
+ struct ieee80211_bss *bss;
+ struct sta_info *sta;
+ struct ieee80211_channel *channel;
+ u64 beacon_timestamp, rx_timestamp;
+ u32 supp_rates = 0;
+ enum ieee80211_band band = rx_status->band;
+
+ if (elems->ds_params && elems->ds_params_len == 1)
+ freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
+ else
+ freq = rx_status->freq;
+
+ channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+ if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+ return;
+
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
+ memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
+ supp_rates = ieee80211_sta_get_rates(local, elems, band);
+
+ rcu_read_lock();
+
+ sta = sta_info_get(local, mgmt->sa);
+ if (sta) {
+ u32 prev_rates;
+
+ prev_rates = sta->sta.supp_rates[band];
+ /* make sure mandatory rates are always added */
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(local, band);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ if (sta->sta.supp_rates[band] != prev_rates)
+ printk(KERN_DEBUG "%s: updated supp_rates set "
+ "for %pM based on beacon info (0x%llx | "
+ "0x%llx -> 0x%llx)\n",
+ sdata->dev->name,
+ sta->sta.addr,
+ (unsigned long long) prev_rates,
+ (unsigned long long) supp_rates,
+ (unsigned long long) sta->sta.supp_rates[band]);
+#endif
+ } else
+ ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
+
+ rcu_read_unlock();
+ }
+
+ bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+ channel, beacon);
+ if (!bss)
+ return;
+
+ /* was just updated in ieee80211_bss_info_update */
+ beacon_timestamp = bss->cbss.tsf;
+
+ /* check if we need to merge IBSS */
+
+ /* merge only on beacons (???) */
+ if (!beacon)
+ goto put_bss;
+
+ /* we use a fixed BSSID */
+ if (sdata->u.ibss.flags & IEEE80211_IBSS_BSSID_SET)
+ goto put_bss;
+
+ /* not an IBSS */
+ if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS))
+ goto put_bss;
+
+ /* different channel */
+ if (bss->cbss.channel != local->oper_channel)
+ goto put_bss;
+
+ /* different SSID */
+ if (elems->ssid_len != sdata->u.ibss.ssid_len ||
+ memcmp(elems->ssid, sdata->u.ibss.ssid,
+ sdata->u.ibss.ssid_len))
+ goto put_bss;
+
+ /* same BSSID */
+ if (memcmp(bss->cbss.bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0)
+ goto put_bss;
+
+ if (rx_status->flag & RX_FLAG_TSFT) {
+ /*
+ * For correct IBSS merging we need mactime; since mactime is
+ * defined as the time the first data symbol of the frame hits
+ * the PHY, and the timestamp of the beacon is defined as "the
+ * time that the data symbol containing the first bit of the
+ * timestamp is transmitted to the PHY plus the transmitting
+ * STA's delays through its local PHY from the MAC-PHY
+ * interface to its interface with the WM" (802.11 11.1.2)
+ * - equals the time this bit arrives at the receiver - we have
+ * to take into account the offset between the two.
+ *
+ * E.g. at 1 MBit that means mactime is 192 usec earlier
+ * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
+ */
+ int rate;
+
+ if (rx_status->flag & RX_FLAG_HT)
+ rate = 65; /* TODO: HT rates */
+ else
+ rate = local->hw.wiphy->bands[band]->
+ bitrates[rx_status->rate_idx].bitrate;
+
+ rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
+ } else if (local && local->ops && local->ops->get_tsf)
+ /* second best option: get current TSF */
+ rx_timestamp = local->ops->get_tsf(local_to_hw(local));
+ else
+ /* can't merge without knowing the TSF */
+ rx_timestamp = -1LLU;
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
+ "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
+ mgmt->sa, mgmt->bssid,
+ (unsigned long long)rx_timestamp,
+ (unsigned long long)beacon_timestamp,
+ (unsigned long long)(rx_timestamp - beacon_timestamp),
+ jiffies);
+#endif
+
+ /* give slow hardware some time to do the TSF sync */
+ if (rx_timestamp < IEEE80211_IBSS_MERGE_DELAY)
+ goto put_bss;
+
+ if (beacon_timestamp > rx_timestamp) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: beacon TSF higher than "
+ "local TSF - IBSS merge with BSSID %pM\n",
+ sdata->dev->name, mgmt->bssid);
+#endif
+ ieee80211_sta_join_ibss(sdata, bss);
+ ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
+ }
+
+ put_bss:
+ ieee80211_rx_bss_put(local, bss);
+}
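/*
 * Illustrative sketch, not part of this patch: the offset added to
 * mactime above covers the 24 header bytes transmitted before the
 * timestamp field. With bitrates kept in units of 100 kbit/s, the
 * offset in microseconds is 24 * 8 * 10 / rate; at 1 Mbit/s
 * (rate == 10) that gives the 192 usec mentioned in the comment.
 */
static unsigned int example_mactime_offset_usec(int rate_100kbps)
{
	return 24 * 8 * 10 / rate_100kbps;	/* 192 at 1 Mbit/s, 3 at 54 Mbit/s */
}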
+
+/*
+ * Add a new IBSS station. This is also called by the RX code when,
+ * in IBSS mode, a frame is received from a yet-unknown station, so
+ * it must be callable in atomic context.
+ */
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
+ u8 *bssid, u8 *addr, u32 supp_rates)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+ int band = local->hw.conf.channel->band;
+
+ /* TODO: Could consider removing the least recently used entry and
+ * allowing a new one to be added. */
+ if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "%s: No room for a new IBSS STA "
+ "entry %pM\n", sdata->dev->name, addr);
+ }
+ return NULL;
+ }
+
+ if (compare_ether_addr(bssid, sdata->u.ibss.bssid))
+ return NULL;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+ printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
+ wiphy_name(local->hw.wiphy), addr, sdata->dev->name);
+#endif
+
+ sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
+ if (!sta)
+ return NULL;
+
+ set_sta_flags(sta, WLAN_STA_AUTHORIZED);
+
+ /* make sure mandatory rates are always added */
+ sta->sta.supp_rates[band] = supp_rates |
+ ieee80211_mandatory_rates(local, band);
+
+ rate_control_rate_init(sta);
+
+ if (sta_info_insert(sta))
+ return NULL;
+
+ return sta;
+}
+
+static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ int active = 0;
+ struct sta_info *sta;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sta->sdata == sdata &&
+ time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
+ jiffies)) {
+ active++;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return active;
+}
+
+
+static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
+
+ ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
+ if (ieee80211_sta_active_ibss(sdata))
+ return;
+
+ if ((ifibss->flags & IEEE80211_IBSS_BSSID_SET) &&
+ (!(ifibss->flags & IEEE80211_IBSS_AUTO_CHANNEL_SEL)))
+ return;
+
+ printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
+ "IBSS networks with same SSID (merge)\n", sdata->dev->name);
+
+ /* XXX maybe racy? */
+ if (sdata->local->scan_req)
+ return;
+
+ memcpy(sdata->local->int_scan_req.ssids[0].ssid,
+ ifibss->ssid, IEEE80211_MAX_SSID_LEN);
+ sdata->local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len;
+ ieee80211_request_scan(sdata, &sdata->local->int_scan_req);
+}
+
+static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ u8 *pos;
+ u8 bssid[ETH_ALEN];
+ u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
+ u16 capability;
+ int i;
+
+ if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) {
+ memcpy(bssid, ifibss->bssid, ETH_ALEN);
+ } else {
+ /* Generate a random, non-broadcast, locally administered BSSID. Mix in
+ * our own MAC address to make sure that devices that do not have a proper
+ * random number generator still get different BSSIDs. */
+ get_random_bytes(bssid, ETH_ALEN);
+ for (i = 0; i < ETH_ALEN; i++)
+ bssid[i] ^= sdata->dev->dev_addr[i];
+ bssid[0] &= ~0x01;
+ bssid[0] |= 0x02;
+ }
+
+ printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
+ sdata->dev->name, bssid);
+
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+ if (local->hw.conf.beacon_int == 0)
+ local->hw.conf.beacon_int = 100;
+
+ capability = WLAN_CAPABILITY_IBSS;
+
+ if (sdata->default_key)
+ capability |= WLAN_CAPABILITY_PRIVACY;
+ else
+ sdata->drop_unencrypted = 0;
+
+ pos = supp_rates;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ int rate = sband->bitrates[i].bitrate;
+ *pos++ = (u8) (rate / 5);
+ }
+
+ return __ieee80211_sta_join_ibss(sdata,
+ bssid, local->hw.conf.beacon_int,
+ local->hw.conf.channel->center_freq,
+ sband->n_bitrates, supp_rates,
+ capability, 0);
+}
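/*
 * Illustrative sketch, not part of this patch: the Supported Rates
 * element built above uses units of 500 kbit/s, while the bitrate
 * tables use 100 kbit/s, hence the division by 5 here and the
 * multiplication by 5 when the element is parsed again; the top bit
 * (0x80) marks a basic rate and is masked off with 0x7f on parsing.
 */
static unsigned char example_bitrate_to_rate_ie(int bitrate_100kbps)
{
	return (unsigned char)(bitrate_100kbps / 5);	/* e.g. 540 -> 108 for 54 Mbit/s */
}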
+
+static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss *bss;
+ const u8 *bssid = NULL;
+ int active_ibss;
+
+ if (ifibss->ssid_len == 0)
+ return -EINVAL;
+
+ active_ibss = ieee80211_sta_active_ibss(sdata);
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
+ sdata->dev->name, active_ibss);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ if (active_ibss)
+ return 0;
+
+ if (ifibss->flags & IEEE80211_IBSS_BSSID_SET)
+ bssid = ifibss->bssid;
+ bss = (void *)cfg80211_get_bss(local->hw.wiphy, NULL, bssid,
+ ifibss->ssid, ifibss->ssid_len,
+ WLAN_CAPABILITY_IBSS,
+ WLAN_CAPABILITY_IBSS);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ if (bss)
+ printk(KERN_DEBUG " sta_find_ibss: selected %pM current "
+ "%pM\n", bss->cbss.bssid, ifibss->bssid);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ if (bss &&
+ (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) ||
+ memcmp(ifibss->bssid, bss->cbss.bssid, ETH_ALEN))) {
+ int ret;
+
+ printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
+ " based on configured SSID\n",
+ sdata->dev->name, bss->cbss.bssid);
+
+ ret = ieee80211_sta_join_ibss(sdata, bss);
+ ieee80211_rx_bss_put(local, bss);
+ return ret;
+ } else if (bss)
+ ieee80211_rx_bss_put(local, bss);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG " did not try to join ibss\n");
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ /* Selected IBSS not found in current scan results - try to scan */
+ if (ifibss->state == IEEE80211_IBSS_MLME_JOINED &&
+ !ieee80211_sta_active_ibss(sdata)) {
+ mod_timer(&ifibss->timer, jiffies +
+ IEEE80211_IBSS_MERGE_INTERVAL);
+ } else if (time_after(jiffies, local->last_scan_completed +
+ IEEE80211_SCAN_INTERVAL)) {
+ printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
+ "join\n", sdata->dev->name);
+
+ /* XXX maybe racy? */
+ if (local->scan_req)
+ return -EBUSY;
+
+ memcpy(local->int_scan_req.ssids[0].ssid,
+ ifibss->ssid, IEEE80211_MAX_SSID_LEN);
+ local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len;
+ return ieee80211_request_scan(sdata, &local->int_scan_req);
+ } else if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) {
+ int interval = IEEE80211_SCAN_INTERVAL;
+
+ if (time_after(jiffies, ifibss->ibss_join_req +
+ IEEE80211_IBSS_JOIN_TIMEOUT)) {
+ if (!(local->oper_channel->flags &
+ IEEE80211_CHAN_NO_IBSS))
+ return ieee80211_sta_create_ibss(sdata);
+ printk(KERN_DEBUG "%s: IBSS not allowed on"
+ " %d MHz\n", sdata->dev->name,
+ local->hw.conf.channel->center_freq);
+
+ /* No IBSS found - scan at a slower interval and continue
+ * scanning. */
+ interval = IEEE80211_SCAN_INTERVAL_SLOW;
+ }
+
+ ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+ mod_timer(&ifibss->timer, jiffies + interval);
+ return 0;
+ }
+
+ return 0;
+}
+
+static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+ int tx_last_beacon;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *resp;
+ u8 *pos, *end;
+
+ if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
+ len < 24 + 2 || !ifibss->probe_resp)
+ return;
+
+ if (local->ops->tx_last_beacon)
+ tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local));
+ else
+ tx_last_beacon = 1;
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
+ " (tx_last_beacon=%d)\n",
+ sdata->dev->name, mgmt->sa, mgmt->da,
+ mgmt->bssid, tx_last_beacon);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+ if (!tx_last_beacon)
+ return;
+
+ if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 &&
+ memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
+ return;
+
+ end = ((u8 *) mgmt) + len;
+ pos = mgmt->u.probe_req.variable;
+ if (pos[0] != WLAN_EID_SSID ||
+ pos + 2 + pos[1] > end) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
+ "from %pM\n",
+ sdata->dev->name, mgmt->sa);
+#endif
+ return;
+ }
+ if (pos[1] != 0 &&
+ (pos[1] != ifibss->ssid_len ||
+ memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len) != 0)) {
+ /* Ignore ProbeReq for foreign SSID */
+ return;
+ }
+
+ /* Reply with ProbeResp */
+ skb = skb_copy(ifibss->probe_resp, GFP_KERNEL);
+ if (!skb)
+ return;
+
+ resp = (struct ieee80211_mgmt *) skb->data;
+ memcpy(resp->da, mgmt->sa, ETH_ALEN);
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+ printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
+ sdata->dev->name, resp->da);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+ ieee80211_tx_skb(sdata, skb, 0);
+}
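/*
 * Illustrative sketch, not part of this patch: the SSID check above
 * walks a raw information element, which is laid out as a one-byte
 * element ID, a one-byte length and 'len' bytes of data; a zero-length
 * SSID in a probe request is the wildcard SSID.
 */
struct example_ie {
	unsigned char id;	/* e.g. WLAN_EID_SSID is element ID 0 */
	unsigned char len;	/* number of data bytes following */
	unsigned char data[];	/* for the SSID element: the SSID itself */
};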
+
+static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ size_t baselen;
+ struct ieee802_11_elems elems;
+
+ if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
+ return; /* ignore ProbeResp to foreign address */
+
+ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+ &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false);
+}
+
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt,
+ size_t len,
+ struct ieee80211_rx_status *rx_status)
+{
+ size_t baselen;
+ struct ieee802_11_elems elems;
+
+ /* Process beacon from the current BSS */
+ baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
+ if (baselen > len)
+ return;
+
+ ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems);
+
+ ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
+}
+
+static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ struct ieee80211_rx_status *rx_status;
+ struct ieee80211_mgmt *mgmt;
+ u16 fc;
+
+ rx_status = (struct ieee80211_rx_status *) skb->cb;
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ fc = le16_to_cpu(mgmt->frame_control);
+
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_REQ:
+ ieee80211_rx_mgmt_probe_req(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_PROBE_RESP:
+ ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_AUTH:
+ ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len);
+ break;
+ }
+
+ kfree_skb(skb);
+}
+
+static void ieee80211_ibss_work(struct work_struct *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data, u.ibss.work);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_ibss *ifibss;
+ struct sk_buff *skb;
+
+ if (!netif_running(sdata->dev))
+ return;
+
+ if (local->sw_scanning || local->hw_scanning)
+ return;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_ADHOC))
+ return;
+ ifibss = &sdata->u.ibss;
+
+ while ((skb = skb_dequeue(&ifibss->skb_queue)))
+ ieee80211_ibss_rx_queued_mgmt(sdata, skb);
+
+ if (!test_and_clear_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request))
+ return;
+
+ switch (ifibss->state) {
+ case IEEE80211_IBSS_MLME_SEARCH:
+ ieee80211_sta_find_ibss(sdata);
+ break;
+ case IEEE80211_IBSS_MLME_JOINED:
+ ieee80211_sta_merge_ibss(sdata);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+
+static void ieee80211_ibss_timer(unsigned long data)
+{
+ struct ieee80211_sub_if_data *sdata =
+ (struct ieee80211_sub_if_data *) data;
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+ struct ieee80211_local *local = sdata->local;
+
+ set_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request);
+ queue_work(local->hw.workqueue, &ifibss->work);
+}
+
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ INIT_WORK(&ifibss->work, ieee80211_ibss_work);
+ setup_timer(&ifibss->timer, ieee80211_ibss_timer,
+ (unsigned long) sdata);
+ skb_queue_head_init(&ifibss->skb_queue);
+
+ ifibss->flags |= IEEE80211_IBSS_AUTO_BSSID_SEL |
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+}
+
+int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ ifibss->flags &= ~IEEE80211_IBSS_PREV_BSSID_SET;
+
+ if (ifibss->ssid_len)
+ ifibss->flags |= IEEE80211_IBSS_SSID_SET;
+ else
+ ifibss->flags &= ~IEEE80211_IBSS_SSID_SET;
+
+ ifibss->ibss_join_req = jiffies;
+ ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+
+ return ieee80211_sta_find_ibss(sdata);
+}
+
+int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ if (len > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
+
+ if (ifibss->ssid_len != len || memcmp(ifibss->ssid, ssid, len) != 0) {
+ memset(ifibss->ssid, 0, sizeof(ifibss->ssid));
+ memcpy(ifibss->ssid, ssid, len);
+ ifibss->ssid_len = len;
+ }
+
+ return ieee80211_ibss_commit(sdata);
+}
+
+int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ memcpy(ssid, ifibss->ssid, ifibss->ssid_len);
+ *len = ifibss->ssid_len;
+
+ return 0;
+}
+
+int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
+{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+ if (is_valid_ether_addr(bssid)) {
+ memcpy(ifibss->bssid, bssid, ETH_ALEN);
+ ifibss->flags |= IEEE80211_IBSS_BSSID_SET;
+ } else {
+ memset(ifibss->bssid, 0, ETH_ALEN);
+ ifibss->flags &= ~IEEE80211_IBSS_BSSID_SET;
+ }
+
+ if (netif_running(sdata->dev)) {
+ if (ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID)) {
+ printk(KERN_DEBUG "%s: Failed to config new BSSID to "
+ "the low-level driver\n", sdata->dev->name);
+ }
+ }
+
+ return ieee80211_ibss_commit(sdata);
+}
+
+/* scan finished notification */
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
+{
+ struct ieee80211_sub_if_data *sdata = local->scan_sdata;
+ struct ieee80211_if_ibss *ifibss;
+
+ if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ ifibss = &sdata->u.ibss;
+ if ((!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) ||
+ !ieee80211_sta_active_ibss(sdata))
+ ieee80211_sta_find_ibss(sdata);
+ }
+}
+
+ieee80211_rx_result
+ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgmt *mgmt;
+ u16 fc;
+
+ if (skb->len < 24)
+ return RX_DROP_MONITOR;
+
+ mgmt = (struct ieee80211_mgmt *) skb->data;
+ fc = le16_to_cpu(mgmt->frame_control);
+
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_RESP:
+ case IEEE80211_STYPE_BEACON:
+ memcpy(skb->cb, rx_status, sizeof(*rx_status));
+ case IEEE80211_STYPE_PROBE_REQ:
+ case IEEE80211_STYPE_AUTH:
+ skb_queue_tail(&sdata->u.ibss.skb_queue, skb);
+ queue_work(local->hw.workqueue, &sdata->u.ibss.work);
+ return RX_QUEUED;
+ }
+
+ return RX_DROP_MONITOR;
+}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2cb743ed9f9c..fbb91f1aebb2 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -239,7 +239,7 @@ struct mesh_preq_queue {
u8 flags;
};
-/* flags used in struct ieee80211_if_sta.flags */
+/* flags used in struct ieee80211_if_managed.flags */
#define IEEE80211_STA_SSID_SET BIT(0)
#define IEEE80211_STA_BSSID_SET BIT(1)
#define IEEE80211_STA_PREV_BSSID_SET BIT(2)
@@ -262,31 +262,30 @@ struct mesh_preq_queue {
#define IEEE80211_STA_REQ_AUTH 2
#define IEEE80211_STA_REQ_RUN 3
-/* STA/IBSS MLME states */
-enum ieee80211_sta_mlme_state {
- IEEE80211_STA_MLME_DISABLED,
- IEEE80211_STA_MLME_DIRECT_PROBE,
- IEEE80211_STA_MLME_AUTHENTICATE,
- IEEE80211_STA_MLME_ASSOCIATE,
- IEEE80211_STA_MLME_ASSOCIATED,
- IEEE80211_STA_MLME_IBSS_SEARCH,
- IEEE80211_STA_MLME_IBSS_JOINED,
-};
-
/* bitfield of allowed auth algs */
#define IEEE80211_AUTH_ALG_OPEN BIT(0)
#define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1)
#define IEEE80211_AUTH_ALG_LEAP BIT(2)
-struct ieee80211_if_sta {
+struct ieee80211_if_managed {
struct timer_list timer;
struct timer_list chswitch_timer;
struct work_struct work;
struct work_struct chswitch_work;
+
u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
+
u8 ssid[IEEE80211_MAX_SSID_LEN];
- enum ieee80211_sta_mlme_state state;
size_t ssid_len;
+
+ enum {
+ IEEE80211_STA_MLME_DISABLED,
+ IEEE80211_STA_MLME_DIRECT_PROBE,
+ IEEE80211_STA_MLME_AUTHENTICATE,
+ IEEE80211_STA_MLME_ASSOCIATE,
+ IEEE80211_STA_MLME_ASSOCIATED,
+ } state;
+
u16 aid;
u16 ap_capab, capab;
u8 *extra_ie; /* to be added to the end of AssocReq */
@@ -319,10 +318,6 @@ struct ieee80211_if_sta {
IEEE80211_MFP_REQUIRED
} mfp; /* management frame protection */
- unsigned long ibss_join_req;
- struct sk_buff *probe_resp; /* ProbeResp template for IBSS */
- u32 supp_rates_bits[IEEE80211_NUM_BANDS];
-
int wmm_last_param_set;
/* Extra IE data for management frames */
@@ -342,6 +337,42 @@ struct ieee80211_if_sta {
size_t ie_disassoc_len;
};
+enum ieee80211_ibss_flags {
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL = BIT(0),
+ IEEE80211_IBSS_AUTO_BSSID_SEL = BIT(1),
+ IEEE80211_IBSS_BSSID_SET = BIT(2),
+ IEEE80211_IBSS_PREV_BSSID_SET = BIT(3),
+ IEEE80211_IBSS_SSID_SET = BIT(4),
+};
+
+enum ieee80211_ibss_request {
+ IEEE80211_IBSS_REQ_RUN = 0,
+};
+
+struct ieee80211_if_ibss {
+ struct timer_list timer;
+ struct work_struct work;
+
+ struct sk_buff_head skb_queue;
+
+ u8 ssid[IEEE80211_MAX_SSID_LEN];
+ u8 ssid_len;
+
+ u32 flags;
+
+ u8 bssid[ETH_ALEN];
+
+ unsigned long request;
+
+ unsigned long ibss_join_req;
+ struct sk_buff *probe_resp; /* ProbeResp template for IBSS */
+
+ enum {
+ IEEE80211_IBSS_MLME_SEARCH,
+ IEEE80211_IBSS_MLME_JOINED,
+ } state;
+};
+
struct ieee80211_if_mesh {
struct work_struct work;
struct timer_list housekeeping_timer;
@@ -445,7 +476,8 @@ struct ieee80211_sub_if_data {
struct ieee80211_if_ap ap;
struct ieee80211_if_wds wds;
struct ieee80211_if_vlan vlan;
- struct ieee80211_if_sta sta;
+ struct ieee80211_if_managed mgd;
+ struct ieee80211_if_ibss ibss;
#ifdef CONFIG_MAC80211_MESH
struct ieee80211_if_mesh mesh;
#endif
@@ -564,12 +596,10 @@ enum {
enum queue_stop_reason {
IEEE80211_QUEUE_STOP_REASON_DRIVER,
IEEE80211_QUEUE_STOP_REASON_PS,
- IEEE80211_QUEUE_STOP_REASON_CSA
+ IEEE80211_QUEUE_STOP_REASON_CSA,
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
};
-/* maximum number of hardware queues we support. */
-#define QD_MAX_QUEUES (IEEE80211_MAX_AMPDU_QUEUES + IEEE80211_MAX_QUEUES)
-
struct ieee80211_master_priv {
struct ieee80211_local *local;
};
@@ -582,9 +612,15 @@ struct ieee80211_local {
const struct ieee80211_ops *ops;
- unsigned long queue_pool[BITS_TO_LONGS(QD_MAX_QUEUES)];
- unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES];
+ /* AC queue corresponding to each AMPDU queue */
+ s8 ampdu_ac_queue[IEEE80211_MAX_AMPDU_QUEUES];
+ unsigned int amdpu_ac_stop_refcnt[IEEE80211_MAX_AMPDU_QUEUES];
+
+ unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES +
+ IEEE80211_MAX_AMPDU_QUEUES];
+ /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */
spinlock_t queue_stop_reason_lock;
+
struct net_device *mdev; /* wmaster# - "master" 802.11 device */
int open_count;
int monitors, cooked_mntrs;
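/*
 * Illustrative sketch, not part of this patch: ampdu_ac_queue[] above
 * records, for each hardware A-MPDU queue, the AC queue it currently
 * serves, with -1 meaning the entry is free. Finding a queue for a new
 * session is then a linear scan; the helper below is an assumption for
 * illustration only.
 */
static int example_grab_ampdu_queue(signed char *map, int n_queues, signed char ac)
{
	int i;

	for (i = 0; i < n_queues; i++) {
		if (map[i] < 0) {	/* free entry */
			map[i] = ac;	/* reserve it for this AC */
			return i;	/* becomes the "qn" used by the caller */
		}
	}
	return -1;			/* no A-MPDU queue available */
}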
@@ -888,34 +924,41 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
u32 changed);
void ieee80211_configure_filter(struct ieee80211_local *local);
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
/* wireless extensions */
extern const struct iw_handler_def ieee80211_iw_handler_def;
-/* STA/IBSS code */
+/* STA code */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
-void ieee80211_scan_work(struct work_struct *work);
-void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
- struct ieee80211_rx_status *rx_status);
+ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status);
+int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata);
int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
-void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta);
-struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
- u8 *bssid, u8 *addr, u32 supp_rates);
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata);
int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason);
int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason);
-u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
-u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
- struct ieee802_11_elems *elems,
- enum ieee80211_band band);
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
- u8 *ssid, size_t ssid_len);
void ieee80211_send_pspoll(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
+/* IBSS code */
+int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata);
+int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
+int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
+int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
+ieee80211_rx_result
+ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status);
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
+ u8 *bssid, u8 *addr, u32 supp_rates);
+
/* scan/BSS handling */
+void ieee80211_scan_work(struct work_struct *work);
int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
struct cfg80211_scan_request *req);
int ieee80211_scan_results(struct ieee80211_local *local,
@@ -929,6 +972,7 @@ int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata,
char *ie, size_t len);
void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
+void ieee80211_scan_failed(struct ieee80211_local *local);
int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
struct cfg80211_scan_request *req);
struct ieee80211_bss *
@@ -1042,6 +1086,25 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
enum queue_stop_reason reason);
void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
enum queue_stop_reason reason);
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason);
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason);
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+ u16 transaction, u16 auth_alg,
+ u8 *extra, size_t extra_len,
+ const u8 *bssid, int encrypt);
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+ u8 *ssid, size_t ssid_len,
+ u8 *ie, size_t ie_len);
+
+void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
+ const size_t supp_rates_len,
+ const u8 *supp_rates);
+u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band band);
#ifdef CONFIG_MAC80211_NOINLINE
#define debug_noinline noinline
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index df94b9365264..f9f27b9cadbe 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -236,7 +236,10 @@ static int ieee80211_open(struct net_device *dev)
break;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_ADHOC:
- sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+ else
+ sdata->u.ibss.flags &= ~IEEE80211_IBSS_PREV_BSSID_SET;
/* fall through */
default:
conf.vif = &sdata->vif;
@@ -321,11 +324,10 @@ static int ieee80211_open(struct net_device *dev)
* yet be effective. Trigger execution of ieee80211_sta_work
* to fix this.
*/
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
- queue_work(local->hw.workqueue, &ifsta->work);
- }
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ queue_work(local->hw.workqueue, &sdata->u.mgd.work);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ queue_work(local->hw.workqueue, &sdata->u.ibss.work);
netif_tx_start_all_queues(dev);
@@ -368,6 +370,18 @@ static int ieee80211_stop(struct net_device *dev)
rcu_read_unlock();
/*
+ * Announce that we are leaving the network, in case we are a
+ * station interface type. This must be done before removing
+ * all associated stations via sta_info_flush, otherwise the STA
+ * information will be gone and no announcement can be made.
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->u.mgd.state != IEEE80211_STA_MLME_DISABLED)
+ ieee80211_sta_deauthenticate(sdata,
+ WLAN_REASON_DEAUTH_LEAVING);
+ }
+
+ /*
* Remove all stations associated with this interface.
*
* This must be done before calling ops->remove_interface()
@@ -452,15 +466,9 @@ static int ieee80211_stop(struct net_device *dev)
netif_addr_unlock_bh(local->mdev);
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
- /* Announce that we are leaving the network. */
- if (sdata->u.sta.state != IEEE80211_STA_MLME_DISABLED)
- ieee80211_sta_deauthenticate(sdata,
- WLAN_REASON_DEAUTH_LEAVING);
-
- memset(sdata->u.sta.bssid, 0, ETH_ALEN);
- del_timer_sync(&sdata->u.sta.chswitch_timer);
- del_timer_sync(&sdata->u.sta.timer);
+ memset(sdata->u.mgd.bssid, 0, ETH_ALEN);
+ del_timer_sync(&sdata->u.mgd.chswitch_timer);
+ del_timer_sync(&sdata->u.mgd.timer);
/*
* If the timer fired while we waited for it, it will have
* requeued the work. Now the work will be running again
@@ -468,8 +476,8 @@ static int ieee80211_stop(struct net_device *dev)
* whether the interface is running, which, at this point,
* it no longer is.
*/
- cancel_work_sync(&sdata->u.sta.work);
- cancel_work_sync(&sdata->u.sta.chswitch_work);
+ cancel_work_sync(&sdata->u.mgd.work);
+ cancel_work_sync(&sdata->u.mgd.chswitch_work);
/*
* When we get here, the interface is marked down.
* Call synchronize_rcu() to wait for the RX path
@@ -477,13 +485,22 @@ static int ieee80211_stop(struct net_device *dev)
* frames at this very time on another CPU.
*/
synchronize_rcu();
- skb_queue_purge(&sdata->u.sta.skb_queue);
+ skb_queue_purge(&sdata->u.mgd.skb_queue);
- sdata->u.sta.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED |
+ sdata->u.mgd.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED |
IEEE80211_STA_TKIP_WEP_USED);
- kfree(sdata->u.sta.extra_ie);
- sdata->u.sta.extra_ie = NULL;
- sdata->u.sta.extra_ie_len = 0;
+ kfree(sdata->u.mgd.extra_ie);
+ sdata->u.mgd.extra_ie = NULL;
+ sdata->u.mgd.extra_ie_len = 0;
+ /* fall through */
+ case NL80211_IFTYPE_ADHOC:
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ memset(sdata->u.ibss.bssid, 0, ETH_ALEN);
+ del_timer_sync(&sdata->u.ibss.timer);
+ cancel_work_sync(&sdata->u.ibss.work);
+ synchronize_rcu();
+ skb_queue_purge(&sdata->u.ibss.skb_queue);
+ }
/* fall through */
case NL80211_IFTYPE_MESH_POINT:
if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -629,19 +646,20 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
if (ieee80211_vif_is_mesh(&sdata->vif))
mesh_rmc_free(sdata);
break;
- case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_ADHOC:
- kfree(sdata->u.sta.extra_ie);
- kfree(sdata->u.sta.assocreq_ies);
- kfree(sdata->u.sta.assocresp_ies);
- kfree_skb(sdata->u.sta.probe_resp);
- kfree(sdata->u.sta.ie_probereq);
- kfree(sdata->u.sta.ie_proberesp);
- kfree(sdata->u.sta.ie_auth);
- kfree(sdata->u.sta.ie_assocreq);
- kfree(sdata->u.sta.ie_reassocreq);
- kfree(sdata->u.sta.ie_deauth);
- kfree(sdata->u.sta.ie_disassoc);
+ kfree_skb(sdata->u.ibss.probe_resp);
+ break;
+ case NL80211_IFTYPE_STATION:
+ kfree(sdata->u.mgd.extra_ie);
+ kfree(sdata->u.mgd.assocreq_ies);
+ kfree(sdata->u.mgd.assocresp_ies);
+ kfree(sdata->u.mgd.ie_probereq);
+ kfree(sdata->u.mgd.ie_proberesp);
+ kfree(sdata->u.mgd.ie_auth);
+ kfree(sdata->u.mgd.ie_assocreq);
+ kfree(sdata->u.mgd.ie_reassocreq);
+ kfree(sdata->u.mgd.ie_deauth);
+ kfree(sdata->u.mgd.ie_disassoc);
break;
case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_AP_VLAN:
@@ -708,9 +726,11 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
INIT_LIST_HEAD(&sdata->u.ap.vlans);
break;
case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_ADHOC:
ieee80211_sta_setup_sdata(sdata);
break;
+ case NL80211_IFTYPE_ADHOC:
+ ieee80211_ibss_setup_sdata(sdata);
+ break;
case NL80211_IFTYPE_MESH_POINT:
if (ieee80211_vif_is_mesh(&sdata->vif))
ieee80211_mesh_init_sdata(sdata);
@@ -798,6 +818,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
memcpy(ndev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy));
+ ndev->features |= NETIF_F_NETNS_LOCAL;
/* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */
sdata = netdev_priv(ndev);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 19b480de4bbc..687acf23054d 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -400,7 +400,7 @@ void ieee80211_key_link(struct ieee80211_key *key,
*/
/* same here, the AP could be using QoS */
- ap = sta_info_get(key->local, key->sdata->u.sta.bssid);
+ ap = sta_info_get(key->local, key->sdata->u.mgd.bssid);
if (ap) {
if (test_sta_flags(ap, WLAN_STA_WME))
key->conf.flags |=
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 5667f4e8067f..f38db4d37e5d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -169,9 +169,10 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
memset(&conf, 0, sizeof(conf));
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- conf.bssid = sdata->u.sta.bssid;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ conf.bssid = sdata->u.mgd.bssid;
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ conf.bssid = sdata->u.ibss.bssid;
else if (sdata->vif.type == NL80211_IFTYPE_AP)
conf.bssid = sdata->dev->dev_addr;
else if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -210,7 +211,7 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
!!rcu_dereference(sdata->u.ap.beacon);
break;
case NL80211_IFTYPE_ADHOC:
- conf.enable_beacon = !!sdata->u.sta.probe_resp;
+ conf.enable_beacon = !!sdata->u.ibss.probe_resp;
break;
case NL80211_IFTYPE_MESH_POINT:
conf.enable_beacon = true;
@@ -705,7 +706,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
const struct ieee80211_ops *ops)
{
struct ieee80211_local *local;
- int priv_size;
+ int priv_size, i;
struct wiphy *wiphy;
/* Ensure 32-byte alignment of our private data and hw private data.
@@ -779,6 +780,11 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
setup_timer(&local->dynamic_ps_timer,
ieee80211_dynamic_ps_timer, (unsigned long) local);
+ for (i = 0; i < IEEE80211_MAX_AMPDU_QUEUES; i++)
+ local->ampdu_ac_queue[i] = -1;
+ /* using an s8 won't work with more than that */
+ BUILD_BUG_ON(IEEE80211_MAX_AMPDU_QUEUES > 127);
+
sta_info_init(local);
tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
@@ -855,6 +861,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
/* mac80211 always supports monitor */
local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+ local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
+ else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
+ local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;
+
result = wiphy_register(local->hw.wiphy);
if (result < 0)
goto fail_wiphy_register;
@@ -872,7 +883,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv),
"wmaster%d", ieee80211_master_setup,
- ieee80211_num_queues(hw));
+ hw->queues);
if (!mdev)
goto fail_mdev_alloc;
@@ -916,6 +927,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
SET_NETDEV_DEV(local->mdev, wiphy_dev(local->hw.wiphy));
+ local->mdev->features |= NETIF_F_NETNS_LOCAL;
result = register_netdevice(local->mdev);
if (result < 0)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index fbb766afe599..841b8450b3de 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -15,11 +15,8 @@
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
-#include <linux/wireless.h>
-#include <linux/random.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
-#include <net/iw_handler.h>
#include <net/mac80211.h>
#include <asm/unaligned.h>
@@ -35,15 +32,6 @@
#define IEEE80211_MONITORING_INTERVAL (2 * HZ)
#define IEEE80211_PROBE_INTERVAL (60 * HZ)
#define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ)
-#define IEEE80211_SCAN_INTERVAL (2 * HZ)
-#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
-#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
-
-#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
-#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
-
-#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
-
/* utils */
static int ecw2cw(int ecw)
@@ -92,43 +80,6 @@ static int ieee80211_compatible_rates(struct ieee80211_bss *bss,
return count;
}
-/* also used by mesh code */
-u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
- struct ieee802_11_elems *elems,
- enum ieee80211_band band)
-{
- struct ieee80211_supported_band *sband;
- struct ieee80211_rate *bitrates;
- size_t num_rates;
- u32 supp_rates;
- int i, j;
- sband = local->hw.wiphy->bands[band];
-
- if (!sband) {
- WARN_ON(1);
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
- }
-
- bitrates = sband->bitrates;
- num_rates = sband->n_bitrates;
- supp_rates = 0;
- for (i = 0; i < elems->supp_rates_len +
- elems->ext_supp_rates_len; i++) {
- u8 rate = 0;
- int own_rate;
- if (i < elems->supp_rates_len)
- rate = elems->supp_rates[i];
- else if (elems->ext_supp_rates)
- rate = elems->ext_supp_rates
- [i - elems->supp_rates_len];
- own_rate = 5 * (rate & 0x7f);
- for (j = 0; j < num_rates; j++)
- if (bitrates[j].bitrate == own_rate)
- supp_rates |= BIT(j);
- }
- return supp_rates;
-}
-
/* frame sending functions */
static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len)
@@ -137,113 +88,9 @@ static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len)
memcpy(skb_put(skb, ies_len), ies, ies_len);
}
-/* also used by scanning code */
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
- u8 *ssid, size_t ssid_len)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_supported_band *sband;
- struct sk_buff *skb;
- struct ieee80211_mgmt *mgmt;
- u8 *pos, *supp_rates, *esupp_rates = NULL;
- int i;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 +
- sdata->u.sta.ie_probereq_len);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
- "request\n", sdata->dev->name);
- return;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
- memset(mgmt, 0, 24);
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_PROBE_REQ);
- memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- if (dst) {
- memcpy(mgmt->da, dst, ETH_ALEN);
- memcpy(mgmt->bssid, dst, ETH_ALEN);
- } else {
- memset(mgmt->da, 0xff, ETH_ALEN);
- memset(mgmt->bssid, 0xff, ETH_ALEN);
- }
- pos = skb_put(skb, 2 + ssid_len);
- *pos++ = WLAN_EID_SSID;
- *pos++ = ssid_len;
- memcpy(pos, ssid, ssid_len);
-
- supp_rates = skb_put(skb, 2);
- supp_rates[0] = WLAN_EID_SUPP_RATES;
- supp_rates[1] = 0;
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
- for (i = 0; i < sband->n_bitrates; i++) {
- struct ieee80211_rate *rate = &sband->bitrates[i];
- if (esupp_rates) {
- pos = skb_put(skb, 1);
- esupp_rates[1]++;
- } else if (supp_rates[1] == 8) {
- esupp_rates = skb_put(skb, 3);
- esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
- esupp_rates[1] = 1;
- pos = &esupp_rates[2];
- } else {
- pos = skb_put(skb, 1);
- supp_rates[1]++;
- }
- *pos = rate->bitrate / 5;
- }
-
- add_extra_ies(skb, sdata->u.sta.ie_probereq,
- sdata->u.sta.ie_probereq_len);
-
- ieee80211_tx_skb(sdata, skb, 0);
-}
-
-static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- int transaction, u8 *extra, size_t extra_len,
- int encrypt)
-{
- struct ieee80211_local *local = sdata->local;
- struct sk_buff *skb;
- struct ieee80211_mgmt *mgmt;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom +
- sizeof(*mgmt) + 6 + extra_len +
- sdata->u.sta.ie_auth_len);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
- "frame\n", sdata->dev->name);
- return;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
- memset(mgmt, 0, 24 + 6);
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_AUTH);
- if (encrypt)
- mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
- memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
- memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
- mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
- mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
- ifsta->auth_transaction = transaction + 1;
- mgmt->u.auth.status_code = cpu_to_le16(0);
- if (extra)
- memcpy(skb_put(skb, extra_len), extra, extra_len);
- add_extra_ies(skb, sdata->u.sta.ie_auth, sdata->u.sta.ie_auth_len);
-
- ieee80211_tx_skb(sdata, skb, encrypt);
-}
-
-static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
@@ -256,17 +103,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
u32 rates = 0;
size_t e_ies_len;
- if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
- e_ies = sdata->u.sta.ie_reassocreq;
- e_ies_len = sdata->u.sta.ie_reassocreq_len;
+ if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) {
+ e_ies = sdata->u.mgd.ie_reassocreq;
+ e_ies_len = sdata->u.mgd.ie_reassocreq_len;
} else {
- e_ies = sdata->u.sta.ie_assocreq;
- e_ies_len = sdata->u.sta.ie_assocreq_len;
+ e_ies = sdata->u.mgd.ie_assocreq;
+ e_ies_len = sdata->u.mgd.ie_assocreq_len;
}
skb = dev_alloc_skb(local->hw.extra_tx_headroom +
- sizeof(*mgmt) + 200 + ifsta->extra_ie_len +
- ifsta->ssid_len + e_ies_len);
+ sizeof(*mgmt) + 200 + ifmgd->extra_ie_len +
+ ifmgd->ssid_len + e_ies_len);
if (!skb) {
printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
"frame\n", sdata->dev->name);
@@ -276,7 +123,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
- capab = ifsta->capab;
+ capab = ifmgd->capab;
if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) {
if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
@@ -285,9 +132,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
}
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (bss) {
if (bss->cbss.capability & WLAN_CAPABILITY_PRIVACY)
capab |= WLAN_CAPABILITY_PRIVACY;
@@ -312,18 +159,18 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
memset(mgmt, 0, 24);
- memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN);
memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN);
- if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
+ if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) {
skb_put(skb, 10);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_REASSOC_REQ);
mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
mgmt->u.reassoc_req.listen_interval =
cpu_to_le16(local->hw.conf.listen_interval);
- memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid,
+ memcpy(mgmt->u.reassoc_req.current_ap, ifmgd->prev_bssid,
ETH_ALEN);
} else {
skb_put(skb, 4);
@@ -335,10 +182,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
}
/* SSID */
- ies = pos = skb_put(skb, 2 + ifsta->ssid_len);
+ ies = pos = skb_put(skb, 2 + ifmgd->ssid_len);
*pos++ = WLAN_EID_SSID;
- *pos++ = ifsta->ssid_len;
- memcpy(pos, ifsta->ssid, ifsta->ssid_len);
+ *pos++ = ifmgd->ssid_len;
+ memcpy(pos, ifmgd->ssid, ifmgd->ssid_len);
/* add all rates which were marked to be used above */
supp_rates_len = rates_len;
@@ -393,12 +240,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
}
}
- if (ifsta->extra_ie) {
- pos = skb_put(skb, ifsta->extra_ie_len);
- memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len);
+ if (ifmgd->extra_ie) {
+ pos = skb_put(skb, ifmgd->extra_ie_len);
+ memcpy(pos, ifmgd->extra_ie, ifmgd->extra_ie_len);
}
- if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
+ if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) {
pos = skb_put(skb, 9);
*pos++ = WLAN_EID_VENDOR_SPECIFIC;
*pos++ = 7; /* len */
@@ -418,11 +265,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
* mode (11a/b/g) if any one of these ciphers is
* configured as pairwise.
*/
- if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
+ if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) &&
sband->ht_cap.ht_supported &&
(ht_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_INFORMATION)) &&
ht_ie[1] >= sizeof(struct ieee80211_ht_info) &&
- (!(ifsta->flags & IEEE80211_STA_TKIP_WEP_USED))) {
+ (!(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))) {
struct ieee80211_ht_info *ht_info =
(struct ieee80211_ht_info *)(ht_ie + 2);
u16 cap = sband->ht_cap.cap;
@@ -459,11 +306,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
add_extra_ies(skb, e_ies, e_ies_len);
- kfree(ifsta->assocreq_ies);
- ifsta->assocreq_ies_len = (skb->data + skb->len) - ies;
- ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL);
- if (ifsta->assocreq_ies)
- memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
+ kfree(ifmgd->assocreq_ies);
+ ifmgd->assocreq_ies_len = (skb->data + skb->len) - ies;
+ ifmgd->assocreq_ies = kmalloc(ifmgd->assocreq_ies_len, GFP_KERNEL);
+ if (ifmgd->assocreq_ies)
+ memcpy(ifmgd->assocreq_ies, ies, ifmgd->assocreq_ies_len);
ieee80211_tx_skb(sdata, skb, 0);
}
@@ -473,18 +320,18 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
u16 stype, u16 reason)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u8 *ies;
size_t ies_len;
if (stype == IEEE80211_STYPE_DEAUTH) {
- ies = sdata->u.sta.ie_deauth;
- ies_len = sdata->u.sta.ie_deauth_len;
+ ies = sdata->u.mgd.ie_deauth;
+ ies_len = sdata->u.mgd.ie_deauth_len;
} else {
- ies = sdata->u.sta.ie_disassoc;
- ies_len = sdata->u.sta.ie_disassoc_len;
+ ies = sdata->u.mgd.ie_disassoc;
+ ies_len = sdata->u.mgd.ie_disassoc_len;
}
skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) +
@@ -498,9 +345,9 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
memset(mgmt, 0, 24);
- memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN);
memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+ memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN);
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
skb_put(skb, 2);
/* u.deauth.reason_code == u.disassoc.reason_code */
@@ -508,13 +355,13 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
add_extra_ies(skb, ies, ies_len);
- ieee80211_tx_skb(sdata, skb, ifsta->flags & IEEE80211_STA_MFP_ENABLED);
+ ieee80211_tx_skb(sdata, skb, ifmgd->flags & IEEE80211_STA_MFP_ENABLED);
}
void ieee80211_send_pspoll(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_pspoll *pspoll;
struct sk_buff *skb;
u16 fc;
@@ -531,43 +378,20 @@ void ieee80211_send_pspoll(struct ieee80211_local *local,
memset(pspoll, 0, sizeof(*pspoll));
fc = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM;
pspoll->frame_control = cpu_to_le16(fc);
- pspoll->aid = cpu_to_le16(ifsta->aid);
+ pspoll->aid = cpu_to_le16(ifmgd->aid);
/* aid in PS-Poll has its two MSBs each set to 1 */
pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14);
- memcpy(pspoll->bssid, ifsta->bssid, ETH_ALEN);
+ memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN);
memcpy(pspoll->ta, sdata->dev->dev_addr, ETH_ALEN);
ieee80211_tx_skb(sdata, skb, 0);
-
- return;
}
/* MLME */
-static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
- const size_t supp_rates_len,
- const u8 *supp_rates)
-{
- struct ieee80211_local *local = sdata->local;
- int i, have_higher_than_11mbit = 0;
-
- /* cf. IEEE 802.11 9.2.12 */
- for (i = 0; i < supp_rates_len; i++)
- if ((supp_rates[i] & 0x7f) * 5 > 110)
- have_higher_than_11mbit = 1;
-
- if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
- have_higher_than_11mbit)
- sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
- else
- sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
-
- ieee80211_set_wmm_default(sdata);
-}
-
static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
- struct ieee80211_if_sta *ifsta,
+ struct ieee80211_if_managed *ifmgd,
u8 *wmm_param, size_t wmm_param_len)
{
struct ieee80211_tx_queue_params params;
@@ -575,7 +399,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
int count;
u8 *pos;
- if (!(ifsta->flags & IEEE80211_STA_WMM_ENABLED))
+ if (!(ifmgd->flags & IEEE80211_STA_WMM_ENABLED))
return;
if (!wmm_param)
@@ -584,18 +408,15 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1)
return;
count = wmm_param[6] & 0x0f;
- if (count == ifsta->wmm_last_param_set)
+ if (count == ifmgd->wmm_last_param_set)
return;
- ifsta->wmm_last_param_set = count;
+ ifmgd->wmm_last_param_set = count;
pos = wmm_param + 8;
left = wmm_param_len - 8;
memset(&params, 0, sizeof(params));
- if (!local->ops->conf_tx)
- return;
-
local->wmm_acm = 0;
for (; left >= 4; left -= 4, pos += 4) {
int aci = (pos[0] >> 5) & 0x03;
@@ -603,26 +424,26 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
int queue;
switch (aci) {
- case 1:
+ case 1: /* AC_BK */
queue = 3;
if (acm)
- local->wmm_acm |= BIT(0) | BIT(3);
+ local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
break;
- case 2:
+ case 2: /* AC_VI */
queue = 1;
if (acm)
- local->wmm_acm |= BIT(4) | BIT(5);
+ local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
break;
- case 3:
+ case 3: /* AC_VO */
queue = 0;
if (acm)
- local->wmm_acm |= BIT(6) | BIT(7);
+ local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
break;
- case 0:
+ case 0: /* AC_BE */
default:
queue = 2;
if (acm)
- local->wmm_acm |= BIT(1) | BIT(2);
+ local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
break;
}
@@ -636,9 +457,8 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
local->mdev->name, queue, aci, acm, params.aifs, params.cw_min,
params.cw_max, params.txop);
#endif
- /* TODO: handle ACM (block TX, fallback to next lowest allowed
- * AC for now) */
- if (local->ops->conf_tx(local_to_hw(local), queue, &params)) {
+ if (local->ops->conf_tx &&
+ local->ops->conf_tx(local_to_hw(local), queue, &params)) {
printk(KERN_DEBUG "%s: failed to set TX queue "
"parameters for queue %d\n", local->mdev->name, queue);
}
@@ -671,7 +491,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
#endif
u32 changed = 0;
bool use_protection;
@@ -694,7 +514,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
printk(KERN_DEBUG "%s: CTS protection %s (BSSID=%pM)\n",
sdata->dev->name,
use_protection ? "enabled" : "disabled",
- ifsta->bssid);
+ ifmgd->bssid);
}
#endif
bss_conf->use_cts_prot = use_protection;
@@ -708,7 +528,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
" (BSSID=%pM)\n",
sdata->dev->name,
use_short_preamble ? "short" : "long",
- ifsta->bssid);
+ ifmgd->bssid);
}
#endif
bss_conf->use_short_preamble = use_short_preamble;
@@ -722,7 +542,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
" (BSSID=%pM)\n",
sdata->dev->name,
use_short_slot ? "short" : "long",
- ifsta->bssid);
+ ifmgd->bssid);
}
#endif
bss_conf->use_short_slot = use_short_slot;
@@ -732,57 +552,57 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
return changed;
}
-static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata)
{
union iwreq_data wrqu;
+
memset(&wrqu, 0, sizeof(wrqu));
- if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
- memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN);
+ if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED)
+ memcpy(wrqu.ap_addr.sa_data, sdata->u.mgd.bssid, ETH_ALEN);
wrqu.ap_addr.sa_family = ARPHRD_ETHER;
wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
}
-static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
char *buf;
size_t len;
int i;
union iwreq_data wrqu;
- if (!ifsta->assocreq_ies && !ifsta->assocresp_ies)
+ if (!ifmgd->assocreq_ies && !ifmgd->assocresp_ies)
return;
- buf = kmalloc(50 + 2 * (ifsta->assocreq_ies_len +
- ifsta->assocresp_ies_len), GFP_KERNEL);
+ buf = kmalloc(50 + 2 * (ifmgd->assocreq_ies_len +
+ ifmgd->assocresp_ies_len), GFP_KERNEL);
if (!buf)
return;
len = sprintf(buf, "ASSOCINFO(");
- if (ifsta->assocreq_ies) {
+ if (ifmgd->assocreq_ies) {
len += sprintf(buf + len, "ReqIEs=");
- for (i = 0; i < ifsta->assocreq_ies_len; i++) {
+ for (i = 0; i < ifmgd->assocreq_ies_len; i++) {
len += sprintf(buf + len, "%02x",
- ifsta->assocreq_ies[i]);
+ ifmgd->assocreq_ies[i]);
}
}
- if (ifsta->assocresp_ies) {
- if (ifsta->assocreq_ies)
+ if (ifmgd->assocresp_ies) {
+ if (ifmgd->assocreq_ies)
len += sprintf(buf + len, " ");
len += sprintf(buf + len, "RespIEs=");
- for (i = 0; i < ifsta->assocresp_ies_len; i++) {
+ for (i = 0; i < ifmgd->assocresp_ies_len; i++) {
len += sprintf(buf + len, "%02x",
- ifsta->assocresp_ies[i]);
+ ifmgd->assocresp_ies[i]);
}
}
len += sprintf(buf + len, ")");
if (len > IW_CUSTOM_MAX) {
len = sprintf(buf, "ASSOCRESPIE=");
- for (i = 0; i < ifsta->assocresp_ies_len; i++) {
+ for (i = 0; i < ifmgd->assocresp_ies_len; i++) {
len += sprintf(buf + len, "%02x",
- ifsta->assocresp_ies[i]);
+ ifmgd->assocresp_ies[i]);
}
}
@@ -797,20 +617,20 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
u32 bss_info_changed)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_conf *conf = &local_to_hw(local)->conf;
struct ieee80211_bss *bss;
bss_info_changed |= BSS_CHANGED_ASSOC;
- ifsta->flags |= IEEE80211_STA_ASSOCIATED;
+ ifmgd->flags |= IEEE80211_STA_ASSOCIATED;
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
conf->channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (bss) {
/* set timing information */
sdata->vif.bss_conf.beacon_int = bss->cbss.beacon_interval;
@@ -823,11 +643,11 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
ieee80211_rx_bss_put(local, bss);
}
- ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
- memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN);
- ieee80211_sta_send_associnfo(sdata, ifsta);
+ ifmgd->flags |= IEEE80211_STA_PREV_BSSID_SET;
+ memcpy(ifmgd->prev_bssid, sdata->u.mgd.bssid, ETH_ALEN);
+ ieee80211_sta_send_associnfo(sdata);
- ifsta->last_probe = jiffies;
+ ifmgd->last_probe = jiffies;
ieee80211_led_assoc(local, 1);
sdata->vif.bss_conf.assoc = 1;
@@ -856,70 +676,74 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
netif_tx_start_all_queues(sdata->dev);
netif_carrier_on(sdata->dev);
- ieee80211_sta_send_apinfo(sdata, ifsta);
+ ieee80211_sta_send_apinfo(sdata);
}
-static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata)
{
- ifsta->direct_probe_tries++;
- if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ ifmgd->direct_probe_tries++;
+ if (ifmgd->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
printk(KERN_DEBUG "%s: direct probe to AP %pM timed out\n",
- sdata->dev->name, ifsta->bssid);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_sta_send_apinfo(sdata, ifsta);
+ sdata->dev->name, ifmgd->bssid);
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_sta_send_apinfo(sdata);
/*
* Most likely AP is not in the range so remove the
* bss information associated to the AP
*/
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
return;
}
printk(KERN_DEBUG "%s: direct probe to AP %pM try %d\n",
- sdata->dev->name, ifsta->bssid,
- ifsta->direct_probe_tries);
+ sdata->dev->name, ifmgd->bssid,
+ ifmgd->direct_probe_tries);
- ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+ ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
- set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request);
+ set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifmgd->request);
/* Direct probe is sent to broadcast address as some APs
* will not answer to direct packet in unassociated state.
*/
ieee80211_send_probe_req(sdata, NULL,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len, NULL, 0);
- mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
+ mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
}
-static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata)
{
- ifsta->auth_tries++;
- if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ ifmgd->auth_tries++;
+ if (ifmgd->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
printk(KERN_DEBUG "%s: authentication with AP %pM"
" timed out\n",
- sdata->dev->name, ifsta->bssid);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_sta_send_apinfo(sdata, ifsta);
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ sdata->dev->name, ifmgd->bssid);
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_sta_send_apinfo(sdata);
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
return;
}
- ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+ ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
printk(KERN_DEBUG "%s: authenticate with AP %pM\n",
- sdata->dev->name, ifsta->bssid);
+ sdata->dev->name, ifmgd->bssid);
- ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0);
+ ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, NULL, 0,
+ ifmgd->bssid, 0);
+ ifmgd->auth_transaction = 2;
- mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
+ mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
}
/*
@@ -927,27 +751,28 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
* if self disconnected or a reason code from the AP.
*/
static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta, bool deauth,
- bool self_disconnected, u16 reason)
+ bool deauth, bool self_disconnected,
+ u16 reason)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct sta_info *sta;
u32 changed = 0, config_changed = 0;
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
rcu_read_unlock();
return;
}
if (deauth) {
- ifsta->direct_probe_tries = 0;
- ifsta->auth_tries = 0;
+ ifmgd->direct_probe_tries = 0;
+ ifmgd->auth_tries = 0;
}
- ifsta->assoc_scan_tries = 0;
- ifsta->assoc_tries = 0;
+ ifmgd->assoc_scan_tries = 0;
+ ifmgd->assoc_tries = 0;
netif_tx_stop_all_queues(sdata->dev);
netif_carrier_off(sdata->dev);
@@ -963,20 +788,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
IEEE80211_STYPE_DISASSOC, reason);
}
- ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
+ ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED;
changed |= ieee80211_reset_erp_info(sdata);
ieee80211_led_assoc(local, 0);
changed |= BSS_CHANGED_ASSOC;
sdata->vif.bss_conf.assoc = false;
- ieee80211_sta_send_apinfo(sdata, ifsta);
+ ieee80211_sta_send_apinfo(sdata);
if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) {
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
}
rcu_read_unlock();
@@ -999,7 +824,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
rcu_read_unlock();
return;
@@ -1020,27 +845,27 @@ static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
return 1;
}
-static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_bss *bss;
int bss_privacy;
int wep_privacy;
int privacy_invoked;
- if (!ifsta || (ifsta->flags & IEEE80211_STA_MIXED_CELL))
+ if (!ifmgd || (ifmgd->flags & IEEE80211_STA_MIXED_CELL))
return 0;
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (!bss)
return 0;
bss_privacy = !!(bss->cbss.capability & WLAN_CAPABILITY_PRIVACY);
wep_privacy = !!ieee80211_sta_wep_configured(sdata);
- privacy_invoked = !!(ifsta->flags & IEEE80211_STA_PRIVACY_INVOKED);
+ privacy_invoked = !!(ifmgd->flags & IEEE80211_STA_PRIVACY_INVOKED);
ieee80211_rx_bss_put(local, bss);
@@ -1050,41 +875,42 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
return 1;
}
-static void ieee80211_associate(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_associate(struct ieee80211_sub_if_data *sdata)
{
- ifsta->assoc_tries++;
- if (ifsta->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ ifmgd->assoc_tries++;
+ if (ifmgd->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) {
printk(KERN_DEBUG "%s: association with AP %pM"
" timed out\n",
- sdata->dev->name, ifsta->bssid);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
- ieee80211_sta_send_apinfo(sdata, ifsta);
- ieee80211_rx_bss_remove(sdata, ifsta->bssid,
+ sdata->dev->name, ifmgd->bssid);
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
+ ieee80211_sta_send_apinfo(sdata);
+ ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
return;
}
- ifsta->state = IEEE80211_STA_MLME_ASSOCIATE;
+ ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE;
printk(KERN_DEBUG "%s: associate with AP %pM\n",
- sdata->dev->name, ifsta->bssid);
- if (ieee80211_privacy_mismatch(sdata, ifsta)) {
+ sdata->dev->name, ifmgd->bssid);
+ if (ieee80211_privacy_mismatch(sdata)) {
printk(KERN_DEBUG "%s: mismatch in privacy configuration and "
"mixed-cell disabled - abort association\n", sdata->dev->name);
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
return;
}
- ieee80211_send_assoc(sdata, ifsta);
+ ieee80211_send_assoc(sdata);
- mod_timer(&ifsta->timer, jiffies + IEEE80211_ASSOC_TIMEOUT);
+ mod_timer(&ifmgd->timer, jiffies + IEEE80211_ASSOC_TIMEOUT);
}
-static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_associated(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct sta_info *sta;
int disassoc;
@@ -1094,38 +920,40 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
* for better APs. */
/* TODO: remove expired BSSes */
- ifsta->state = IEEE80211_STA_MLME_ASSOCIATED;
+ ifmgd->state = IEEE80211_STA_MLME_ASSOCIATED;
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
printk(KERN_DEBUG "%s: No STA entry for own AP %pM\n",
- sdata->dev->name, ifsta->bssid);
+ sdata->dev->name, ifmgd->bssid);
disassoc = 1;
} else {
disassoc = 0;
if (time_after(jiffies,
sta->last_rx + IEEE80211_MONITORING_INTERVAL)) {
- if (ifsta->flags & IEEE80211_STA_PROBEREQ_POLL) {
+ if (ifmgd->flags & IEEE80211_STA_PROBEREQ_POLL) {
printk(KERN_DEBUG "%s: No ProbeResp from "
"current AP %pM - assume out of "
"range\n",
- sdata->dev->name, ifsta->bssid);
+ sdata->dev->name, ifmgd->bssid);
disassoc = 1;
} else
- ieee80211_send_probe_req(sdata, ifsta->bssid,
- ifsta->ssid,
- ifsta->ssid_len);
- ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL;
+ ieee80211_send_probe_req(sdata, ifmgd->bssid,
+ ifmgd->ssid,
+ ifmgd->ssid_len,
+ NULL, 0);
+ ifmgd->flags ^= IEEE80211_STA_PROBEREQ_POLL;
} else {
- ifsta->flags &= ~IEEE80211_STA_PROBEREQ_POLL;
- if (time_after(jiffies, ifsta->last_probe +
+ ifmgd->flags &= ~IEEE80211_STA_PROBEREQ_POLL;
+ if (time_after(jiffies, ifmgd->last_probe +
IEEE80211_PROBE_INTERVAL)) {
- ifsta->last_probe = jiffies;
- ieee80211_send_probe_req(sdata, ifsta->bssid,
- ifsta->ssid,
- ifsta->ssid_len);
+ ifmgd->last_probe = jiffies;
+ ieee80211_send_probe_req(sdata, ifmgd->bssid,
+ ifmgd->ssid,
+ ifmgd->ssid_len,
+ NULL, 0);
}
}
}
@@ -1133,25 +961,25 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
if (disassoc)
- ieee80211_set_disassoc(sdata, ifsta, true, true,
+ ieee80211_set_disassoc(sdata, true, true,
WLAN_REASON_PREV_AUTH_NOT_VALID);
else
- mod_timer(&ifsta->timer, jiffies +
+ mod_timer(&ifmgd->timer, jiffies +
IEEE80211_MONITORING_INTERVAL);
}
-static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name);
- ifsta->flags |= IEEE80211_STA_AUTHENTICATED;
- ieee80211_associate(sdata, ifsta);
+ ifmgd->flags |= IEEE80211_STA_AUTHENTICATED;
+ ieee80211_associate(sdata);
}
static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
@@ -1162,59 +990,37 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems);
if (!elems.challenge)
return;
- ieee80211_send_auth(sdata, ifsta, 3, elems.challenge - 2,
- elems.challenge_len + 2, 1);
-}
-
-static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- struct ieee80211_mgmt *mgmt,
- size_t len)
-{
- u16 auth_alg, auth_transaction, status_code;
-
- if (len < 24 + 6)
- return;
-
- auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
- auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
- status_code = le16_to_cpu(mgmt->u.auth.status_code);
-
- /*
- * IEEE 802.11 standard does not require authentication in IBSS
- * networks and most implementations do not seem to use it.
- * However, try to reply to authentication attempts if someone
- * has actually implemented this.
- */
- if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1)
- ieee80211_send_auth(sdata, ifsta, 2, NULL, 0, 0);
+ ieee80211_send_auth(sdata, 3, sdata->u.mgd.auth_alg,
+ elems.challenge - 2, elems.challenge_len + 2,
+ sdata->u.mgd.bssid, 1);
+ sdata->u.mgd.auth_transaction = 4;
}
static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u16 auth_alg, auth_transaction, status_code;
- if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE)
+ if (ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE)
return;
if (len < 24 + 6)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0)
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0)
return;
- if (memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
+ if (memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0)
return;
auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
status_code = le16_to_cpu(mgmt->u.auth.status_code);
- if (auth_alg != ifsta->auth_alg ||
- auth_transaction != ifsta->auth_transaction)
+ if (auth_alg != ifmgd->auth_alg ||
+ auth_transaction != ifmgd->auth_transaction)
return;
if (status_code != WLAN_STATUS_SUCCESS) {
@@ -1223,15 +1029,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
const int num_algs = ARRAY_SIZE(algs);
int i, pos;
algs[0] = algs[1] = algs[2] = 0xff;
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN)
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN)
algs[0] = WLAN_AUTH_OPEN;
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
algs[1] = WLAN_AUTH_SHARED_KEY;
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP)
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP)
algs[2] = WLAN_AUTH_LEAP;
- if (ifsta->auth_alg == WLAN_AUTH_OPEN)
+ if (ifmgd->auth_alg == WLAN_AUTH_OPEN)
pos = 0;
- else if (ifsta->auth_alg == WLAN_AUTH_SHARED_KEY)
+ else if (ifmgd->auth_alg == WLAN_AUTH_SHARED_KEY)
pos = 1;
else
pos = 2;
@@ -1239,101 +1045,101 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
pos++;
if (pos >= num_algs)
pos = 0;
- if (algs[pos] == ifsta->auth_alg ||
+ if (algs[pos] == ifmgd->auth_alg ||
algs[pos] == 0xff)
continue;
if (algs[pos] == WLAN_AUTH_SHARED_KEY &&
!ieee80211_sta_wep_configured(sdata))
continue;
- ifsta->auth_alg = algs[pos];
+ ifmgd->auth_alg = algs[pos];
break;
}
}
return;
}
- switch (ifsta->auth_alg) {
+ switch (ifmgd->auth_alg) {
case WLAN_AUTH_OPEN:
case WLAN_AUTH_LEAP:
- ieee80211_auth_completed(sdata, ifsta);
+ ieee80211_auth_completed(sdata);
break;
case WLAN_AUTH_SHARED_KEY:
- if (ifsta->auth_transaction == 4)
- ieee80211_auth_completed(sdata, ifsta);
+ if (ifmgd->auth_transaction == 4)
+ ieee80211_auth_completed(sdata);
else
- ieee80211_auth_challenge(sdata, ifsta, mgmt, len);
+ ieee80211_auth_challenge(sdata, mgmt, len);
break;
}
}
static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u16 reason_code;
if (len < 24 + 2)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN))
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN))
return;
reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
- if (ifsta->flags & IEEE80211_STA_AUTHENTICATED)
+ if (ifmgd->flags & IEEE80211_STA_AUTHENTICATED)
printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n",
sdata->dev->name, reason_code);
- if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE ||
- ifsta->state == IEEE80211_STA_MLME_ASSOCIATE ||
- ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
- ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
- mod_timer(&ifsta->timer, jiffies +
+ if (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE ||
+ ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE ||
+ ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) {
+ ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+ mod_timer(&ifmgd->timer, jiffies +
IEEE80211_RETRY_AUTH_INTERVAL);
}
- ieee80211_set_disassoc(sdata, ifsta, true, false, 0);
- ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED;
+ ieee80211_set_disassoc(sdata, true, false, 0);
+ ifmgd->flags &= ~IEEE80211_STA_AUTHENTICATED;
}
static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u16 reason_code;
if (len < 24 + 2)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN))
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN))
return;
reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
- if (ifsta->flags & IEEE80211_STA_ASSOCIATED)
+ if (ifmgd->flags & IEEE80211_STA_ASSOCIATED)
printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n",
sdata->dev->name, reason_code);
- if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) {
- ifsta->state = IEEE80211_STA_MLME_ASSOCIATE;
- mod_timer(&ifsta->timer, jiffies +
+ if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) {
+ ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE;
+ mod_timer(&ifmgd->timer, jiffies +
IEEE80211_RETRY_AUTH_INTERVAL);
}
- ieee80211_set_disassoc(sdata, ifsta, false, false, reason_code);
+ ieee80211_set_disassoc(sdata, false, false, reason_code);
}
static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
struct ieee80211_mgmt *mgmt,
size_t len,
int reassoc)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
struct sta_info *sta;
@@ -1350,13 +1156,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
/* AssocResp and ReassocResp have identical structure, so process both
* of them in this function. */
- if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATE)
+ if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE)
return;
if (len < 24 + 6)
return;
- if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0)
+ if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0)
return;
capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
@@ -1381,7 +1187,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
"comeback duration %u TU (%u ms)\n",
sdata->dev->name, tu, ms);
if (ms > IEEE80211_ASSOC_TIMEOUT)
- mod_timer(&ifsta->timer,
+ mod_timer(&ifmgd->timer,
jiffies + msecs_to_jiffies(ms));
return;
}
@@ -1392,7 +1198,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
/* if this was a reassociation, ensure we try a "full"
* association next time. This works around some broken APs
* which do not correctly reject reassociation requests. */
- ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+ ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
return;
}
@@ -1408,23 +1214,23 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
}
printk(KERN_DEBUG "%s: associated\n", sdata->dev->name);
- ifsta->aid = aid;
- ifsta->ap_capab = capab_info;
+ ifmgd->aid = aid;
+ ifmgd->ap_capab = capab_info;
- kfree(ifsta->assocresp_ies);
- ifsta->assocresp_ies_len = len - (pos - (u8 *) mgmt);
- ifsta->assocresp_ies = kmalloc(ifsta->assocresp_ies_len, GFP_KERNEL);
- if (ifsta->assocresp_ies)
- memcpy(ifsta->assocresp_ies, pos, ifsta->assocresp_ies_len);
+ kfree(ifmgd->assocresp_ies);
+ ifmgd->assocresp_ies_len = len - (pos - (u8 *) mgmt);
+ ifmgd->assocresp_ies = kmalloc(ifmgd->assocresp_ies_len, GFP_KERNEL);
+ if (ifmgd->assocresp_ies)
+ memcpy(ifmgd->assocresp_ies, pos, ifmgd->assocresp_ies_len);
rcu_read_lock();
/* Add STA entry for the AP */
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
newsta = true;
- sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC);
+ sta = sta_info_alloc(sdata, ifmgd->bssid, GFP_ATOMIC);
if (!sta) {
printk(KERN_DEBUG "%s: failed to alloc STA entry for"
" the AP\n", sdata->dev->name);
@@ -1497,7 +1303,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
else
sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
- if (elems.ht_cap_elem)
+ /* If TKIP/WEP is used, no need to parse AP's HT capabilities */
+ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))
ieee80211_ht_cap_ie_to_sta_ht_cap(sband,
elems.ht_cap_elem, &sta->sta.ht_cap);
@@ -1505,7 +1312,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
rate_control_rate_init(sta);
- if (ifsta->flags & IEEE80211_STA_MFP_ENABLED)
+ if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED)
set_sta_flags(sta, WLAN_STA_MFP);
if (elems.wmm_param)
@@ -1524,11 +1331,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
rcu_read_unlock();
if (elems.wmm_param)
- ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
+ ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
elems.wmm_param_len);
if (elems.ht_info_elem && elems.wmm_param &&
- (ifsta->flags & IEEE80211_STA_WMM_ENABLED))
+ (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) &&
+ !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))
changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem,
ap_ht_cap_flags);
@@ -1536,163 +1344,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
* ieee80211_set_associated() will tell the driver */
bss_conf->aid = aid;
bss_conf->assoc_capability = capab_info;
- ieee80211_set_associated(sdata, ifsta, changed);
+ ieee80211_set_associated(sdata, changed);
- ieee80211_associated(sdata, ifsta);
+ ieee80211_associated(sdata);
}
-static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- const u8 *bssid, const int beacon_int,
- const int freq,
- const size_t supp_rates_len,
- const u8 *supp_rates,
- const u16 capability)
-{
- struct ieee80211_local *local = sdata->local;
- int res = 0, rates, i, j;
- struct sk_buff *skb;
- struct ieee80211_mgmt *mgmt;
- u8 *pos;
- struct ieee80211_supported_band *sband;
- union iwreq_data wrqu;
-
- if (local->ops->reset_tsf) {
- /* Reset own TSF to allow time synchronization work. */
- local->ops->reset_tsf(local_to_hw(local));
- }
-
- if ((ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) &&
- memcmp(ifsta->bssid, bssid, ETH_ALEN) == 0)
- return res;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 +
- sdata->u.sta.ie_proberesp_len);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
- "response\n", sdata->dev->name);
- return -ENOMEM;
- }
-
- if (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) {
- /* Remove possible STA entries from other IBSS networks. */
- sta_info_flush_delayed(sdata);
- }
-
- memcpy(ifsta->bssid, bssid, ETH_ALEN);
- res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
- if (res)
- return res;
-
- local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10;
-
- sdata->drop_unencrypted = capability &
- WLAN_CAPABILITY_PRIVACY ? 1 : 0;
-
- res = ieee80211_set_freq(sdata, freq);
-
- if (res)
- return res;
-
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
- /* Build IBSS probe response */
-
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- mgmt = (struct ieee80211_mgmt *)
- skb_put(skb, 24 + sizeof(mgmt->u.beacon));
- memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_PROBE_RESP);
- memset(mgmt->da, 0xff, ETH_ALEN);
- memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
- mgmt->u.beacon.beacon_int =
- cpu_to_le16(local->hw.conf.beacon_int);
- mgmt->u.beacon.capab_info = cpu_to_le16(capability);
-
- pos = skb_put(skb, 2 + ifsta->ssid_len);
- *pos++ = WLAN_EID_SSID;
- *pos++ = ifsta->ssid_len;
- memcpy(pos, ifsta->ssid, ifsta->ssid_len);
-
- rates = supp_rates_len;
- if (rates > 8)
- rates = 8;
- pos = skb_put(skb, 2 + rates);
- *pos++ = WLAN_EID_SUPP_RATES;
- *pos++ = rates;
- memcpy(pos, supp_rates, rates);
-
- if (sband->band == IEEE80211_BAND_2GHZ) {
- pos = skb_put(skb, 2 + 1);
- *pos++ = WLAN_EID_DS_PARAMS;
- *pos++ = 1;
- *pos++ = ieee80211_frequency_to_channel(freq);
- }
-
- pos = skb_put(skb, 2 + 2);
- *pos++ = WLAN_EID_IBSS_PARAMS;
- *pos++ = 2;
- /* FIX: set ATIM window based on scan results */
- *pos++ = 0;
- *pos++ = 0;
-
- if (supp_rates_len > 8) {
- rates = supp_rates_len - 8;
- pos = skb_put(skb, 2 + rates);
- *pos++ = WLAN_EID_EXT_SUPP_RATES;
- *pos++ = rates;
- memcpy(pos, &supp_rates[8], rates);
- }
-
- add_extra_ies(skb, sdata->u.sta.ie_proberesp,
- sdata->u.sta.ie_proberesp_len);
-
- ifsta->probe_resp = skb;
-
- ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON |
- IEEE80211_IFCC_BEACON_ENABLED);
-
-
- rates = 0;
- for (i = 0; i < supp_rates_len; i++) {
- int bitrate = (supp_rates[i] & 0x7f) * 5;
- for (j = 0; j < sband->n_bitrates; j++)
- if (sband->bitrates[j].bitrate == bitrate)
- rates |= BIT(j);
- }
- ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates;
-
- ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates);
-
- ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
- ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED;
- mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
-
- ieee80211_led_assoc(local, true);
-
- memset(&wrqu, 0, sizeof(wrqu));
- memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
- wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
-
- return res;
-}
-
-static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- struct ieee80211_bss *bss)
-{
- return __ieee80211_sta_join_ibss(sdata, ifsta,
- bss->cbss.bssid,
- bss->cbss.beacon_interval,
- bss->cbss.channel->center_freq,
- bss->supp_rates_len, bss->supp_rates,
- bss->cbss.capability);
-}
-
static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
size_t len,
@@ -1703,11 +1360,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_local *local = sdata->local;
int freq;
struct ieee80211_bss *bss;
- struct sta_info *sta;
struct ieee80211_channel *channel;
- u64 beacon_timestamp, rx_timestamp;
- u32 supp_rates = 0;
- enum ieee80211_band band = rx_status->band;
if (elems->ds_params && elems->ds_params_len == 1)
freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
@@ -1719,133 +1372,18 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
return;
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
- memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
- supp_rates = ieee80211_sta_get_rates(local, elems, band);
-
- rcu_read_lock();
-
- sta = sta_info_get(local, mgmt->sa);
- if (sta) {
- u32 prev_rates;
-
- prev_rates = sta->sta.supp_rates[band];
- /* make sure mandatory rates are always added */
- sta->sta.supp_rates[band] = supp_rates |
- ieee80211_mandatory_rates(local, band);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- if (sta->sta.supp_rates[band] != prev_rates)
- printk(KERN_DEBUG "%s: updated supp_rates set "
- "for %pM based on beacon info (0x%llx | "
- "0x%llx -> 0x%llx)\n",
- sdata->dev->name,
- sta->sta.addr,
- (unsigned long long) prev_rates,
- (unsigned long long) supp_rates,
- (unsigned long long) sta->sta.supp_rates[band]);
-#endif
- } else {
- ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
- }
-
- rcu_read_unlock();
- }
-
bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
channel, beacon);
if (!bss)
return;
if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) &&
- (memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0)) {
+ (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN) == 0)) {
struct ieee80211_channel_sw_ie *sw_elem =
(struct ieee80211_channel_sw_ie *)elems->ch_switch_elem;
ieee80211_process_chanswitch(sdata, sw_elem, bss);
}
- /* was just updated in ieee80211_bss_info_update */
- beacon_timestamp = bss->cbss.tsf;
-
- if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
- goto put_bss;
-
- /* check if we need to merge IBSS */
-
- /* merge only on beacons (???) */
- if (!beacon)
- goto put_bss;
-
- /* we use a fixed BSSID */
- if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET)
- goto put_bss;
-
- /* not an IBSS */
- if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS))
- goto put_bss;
-
- /* different channel */
- if (bss->cbss.channel != local->oper_channel)
- goto put_bss;
-
- /* different SSID */
- if (elems->ssid_len != sdata->u.sta.ssid_len ||
- memcmp(elems->ssid, sdata->u.sta.ssid,
- sdata->u.sta.ssid_len))
- goto put_bss;
-
- if (rx_status->flag & RX_FLAG_TSFT) {
- /*
- * For correct IBSS merging we need mactime; since mactime is
- * defined as the time the first data symbol of the frame hits
- * the PHY, and the timestamp of the beacon is defined as "the
- * time that the data symbol containing the first bit of the
- * timestamp is transmitted to the PHY plus the transmitting
- * STA's delays through its local PHY from the MAC-PHY
- * interface to its interface with the WM" (802.11 11.1.2)
- * - equals the time this bit arrives at the receiver - we have
- * to take into account the offset between the two.
- *
- * E.g. at 1 MBit that means mactime is 192 usec earlier
- * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
- */
- int rate;
-
- if (rx_status->flag & RX_FLAG_HT)
- rate = 65; /* TODO: HT rates */
- else
- rate = local->hw.wiphy->bands[band]->
- bitrates[rx_status->rate_idx].bitrate;
-
- rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
- } else if (local && local->ops && local->ops->get_tsf)
- /* second best option: get current TSF */
- rx_timestamp = local->ops->get_tsf(local_to_hw(local));
- else
- /* can't merge without knowing the TSF */
- rx_timestamp = -1LLU;
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
- "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
- mgmt->sa, mgmt->bssid,
- (unsigned long long)rx_timestamp,
- (unsigned long long)beacon_timestamp,
- (unsigned long long)(rx_timestamp - beacon_timestamp),
- jiffies);
-#endif
-
- if (beacon_timestamp > rx_timestamp) {
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: beacon TSF higher than "
- "local TSF - IBSS merge with BSSID %pM\n",
- sdata->dev->name, mgmt->bssid);
-#endif
- ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss);
- ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
- }
-
- put_bss:
ieee80211_rx_bss_put(local, bss);
}
@@ -1857,7 +1395,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
{
size_t baselen;
struct ieee802_11_elems elems;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
return; /* ignore ProbeResp to foreign address */
@@ -1873,20 +1410,19 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
/* direct probe may be part of the association flow */
if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE,
- &ifsta->request)) {
+ &sdata->u.mgd.request)) {
printk(KERN_DEBUG "%s direct probe responded\n",
sdata->dev->name);
- ieee80211_authenticate(sdata, ifsta);
+ ieee80211_authenticate(sdata);
}
}
-
static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
size_t len,
struct ieee80211_rx_status *rx_status)
{
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
size_t baselen;
struct ieee802_11_elems elems;
struct ieee80211_local *local = sdata->local;
@@ -1905,21 +1441,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return;
- ifsta = &sdata->u.sta;
- if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED) ||
- memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0)
+ ifmgd = &sdata->u.mgd;
+
+ if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED) ||
+ memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0)
return;
if (rx_status->freq != local->hw.conf.channel->center_freq)
return;
- ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param,
+ ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
elems.wmm_param_len);
- if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK &&
- local->hw.conf.flags & IEEE80211_CONF_PS) {
- directed_tim = ieee80211_check_tim(&elems, ifsta->aid);
+ if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
+ directed_tim = ieee80211_check_tim(&elems, ifmgd->aid);
if (directed_tim) {
if (local->hw.conf.dynamic_ps_timeout > 0) {
@@ -1954,14 +1490,15 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
erp_valid, erp_value);
- if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param) {
+ if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
+ !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) {
struct sta_info *sta;
struct ieee80211_supported_band *sband;
u16 ap_ht_cap_flags;
rcu_read_lock();
- sta = sta_info_get(local, ifsta->bssid);
+ sta = sta_info_get(local, ifmgd->bssid);
if (!sta) {
rcu_read_unlock();
return;
@@ -1997,85 +1534,16 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
ieee80211_bss_info_change_notify(sdata, changed);
}
-
-static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta,
- struct ieee80211_mgmt *mgmt,
- size_t len)
+ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_rx_status *rx_status)
{
struct ieee80211_local *local = sdata->local;
- int tx_last_beacon;
- struct sk_buff *skb;
- struct ieee80211_mgmt *resp;
- u8 *pos, *end;
-
- if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED ||
- len < 24 + 2 || !ifsta->probe_resp)
- return;
-
- if (local->ops->tx_last_beacon)
- tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local));
- else
- tx_last_beacon = 1;
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
- " (tx_last_beacon=%d)\n",
- sdata->dev->name, mgmt->sa, mgmt->da,
- mgmt->bssid, tx_last_beacon);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- if (!tx_last_beacon)
- return;
-
- if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0 &&
- memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
- return;
-
- end = ((u8 *) mgmt) + len;
- pos = mgmt->u.probe_req.variable;
- if (pos[0] != WLAN_EID_SSID ||
- pos + 2 + pos[1] > end) {
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
- "from %pM\n",
- sdata->dev->name, mgmt->sa);
-#endif
- return;
- }
- if (pos[1] != 0 &&
- (pos[1] != ifsta->ssid_len ||
- memcmp(pos + 2, ifsta->ssid, ifsta->ssid_len) != 0)) {
- /* Ignore ProbeReq for foreign SSID */
- return;
- }
-
- /* Reply with ProbeResp */
- skb = skb_copy(ifsta->probe_resp, GFP_KERNEL);
- if (!skb)
- return;
-
- resp = (struct ieee80211_mgmt *) skb->data;
- memcpy(resp->da, mgmt->sa, ETH_ALEN);
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
- sdata->dev->name, resp->da);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
- ieee80211_tx_skb(sdata, skb, 0);
-}
-
-void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
- struct ieee80211_rx_status *rx_status)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta;
struct ieee80211_mgmt *mgmt;
u16 fc;
if (skb->len < 24)
- goto fail;
-
- ifsta = &sdata->u.sta;
+ return RX_DROP_MONITOR;
mgmt = (struct ieee80211_mgmt *) skb->data;
fc = le16_to_cpu(mgmt->frame_control);
@@ -2090,147 +1558,68 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *
case IEEE80211_STYPE_REASSOC_RESP:
case IEEE80211_STYPE_DEAUTH:
case IEEE80211_STYPE_DISASSOC:
- skb_queue_tail(&ifsta->skb_queue, skb);
- queue_work(local->hw.workqueue, &ifsta->work);
- return;
+ skb_queue_tail(&sdata->u.mgd.skb_queue, skb);
+ queue_work(local->hw.workqueue, &sdata->u.mgd.work);
+ return RX_QUEUED;
}
- fail:
- kfree_skb(skb);
+ return RX_DROP_MONITOR;
}
static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb)
{
struct ieee80211_rx_status *rx_status;
- struct ieee80211_if_sta *ifsta;
struct ieee80211_mgmt *mgmt;
u16 fc;
- ifsta = &sdata->u.sta;
-
rx_status = (struct ieee80211_rx_status *) skb->cb;
mgmt = (struct ieee80211_mgmt *) skb->data;
fc = le16_to_cpu(mgmt->frame_control);
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- switch (fc & IEEE80211_FCTL_STYPE) {
- case IEEE80211_STYPE_PROBE_REQ:
- ieee80211_rx_mgmt_probe_req(sdata, ifsta, mgmt,
- skb->len);
- break;
- case IEEE80211_STYPE_PROBE_RESP:
- ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_BEACON:
- ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_AUTH:
- ieee80211_rx_mgmt_auth_ibss(sdata, ifsta, mgmt,
- skb->len);
- break;
- }
- } else { /* NL80211_IFTYPE_STATION */
- switch (fc & IEEE80211_FCTL_STYPE) {
- case IEEE80211_STYPE_PROBE_RESP:
- ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_BEACON:
- ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
- rx_status);
- break;
- case IEEE80211_STYPE_AUTH:
- ieee80211_rx_mgmt_auth(sdata, ifsta, mgmt, skb->len);
- break;
- case IEEE80211_STYPE_ASSOC_RESP:
- ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt,
- skb->len, 0);
- break;
- case IEEE80211_STYPE_REASSOC_RESP:
- ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt,
- skb->len, 1);
- break;
- case IEEE80211_STYPE_DEAUTH:
- ieee80211_rx_mgmt_deauth(sdata, ifsta, mgmt, skb->len);
- break;
- case IEEE80211_STYPE_DISASSOC:
- ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt,
- skb->len);
- break;
- }
+ switch (fc & IEEE80211_FCTL_STYPE) {
+ case IEEE80211_STYPE_PROBE_RESP:
+ ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_BEACON:
+ ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
+ rx_status);
+ break;
+ case IEEE80211_STYPE_AUTH:
+ ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_ASSOC_RESP:
+ ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 0);
+ break;
+ case IEEE80211_STYPE_REASSOC_RESP:
+ ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 1);
+ break;
+ case IEEE80211_STYPE_DEAUTH:
+ ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len);
+ break;
+ case IEEE80211_STYPE_DISASSOC:
+ ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len);
+ break;
}
kfree_skb(skb);
}
-
-static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
-{
- struct ieee80211_local *local = sdata->local;
- int active = 0;
- struct sta_info *sta;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
- if (sta->sdata == sdata &&
- time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
- jiffies)) {
- active++;
- break;
- }
- }
-
- rcu_read_unlock();
-
- return active;
-}
-
-
-static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
-{
- mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
-
- ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
- if (ieee80211_sta_active_ibss(sdata))
- return;
-
- if ((sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) &&
- (!(sdata->u.sta.flags & IEEE80211_STA_AUTO_CHANNEL_SEL)))
- return;
-
- printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
- "IBSS networks with same SSID (merge)\n", sdata->dev->name);
-
- /* XXX maybe racy? */
- if (sdata->local->scan_req)
- return;
-
- memcpy(sdata->local->int_scan_req.ssids[0].ssid,
- ifsta->ssid, IEEE80211_MAX_SSID_LEN);
- sdata->local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
- ieee80211_request_scan(sdata, &sdata->local->int_scan_req);
-}
-
-
static void ieee80211_sta_timer(unsigned long data)
{
struct ieee80211_sub_if_data *sdata =
(struct ieee80211_sub_if_data *) data;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
- set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
- queue_work(local->hw.workqueue, &ifsta->work);
+ set_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request);
+ queue_work(local->hw.workqueue, &ifmgd->work);
}
-static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
if (local->ops->reset_tsf) {
@@ -2238,191 +1627,39 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
local->ops->reset_tsf(local_to_hw(local));
}
- ifsta->wmm_last_param_set = -1; /* allow any WMM update */
+ ifmgd->wmm_last_param_set = -1; /* allow any WMM update */
- if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN)
- ifsta->auth_alg = WLAN_AUTH_OPEN;
- else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
- ifsta->auth_alg = WLAN_AUTH_SHARED_KEY;
- else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP)
- ifsta->auth_alg = WLAN_AUTH_LEAP;
+ if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN)
+ ifmgd->auth_alg = WLAN_AUTH_OPEN;
+ else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
+ ifmgd->auth_alg = WLAN_AUTH_SHARED_KEY;
+ else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP)
+ ifmgd->auth_alg = WLAN_AUTH_LEAP;
else
- ifsta->auth_alg = WLAN_AUTH_OPEN;
- ifsta->auth_transaction = -1;
- ifsta->flags &= ~IEEE80211_STA_ASSOCIATED;
- ifsta->assoc_scan_tries = 0;
- ifsta->direct_probe_tries = 0;
- ifsta->auth_tries = 0;
- ifsta->assoc_tries = 0;
+ ifmgd->auth_alg = WLAN_AUTH_OPEN;
+ ifmgd->auth_transaction = -1;
+ ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED;
+ ifmgd->assoc_scan_tries = 0;
+ ifmgd->direct_probe_tries = 0;
+ ifmgd->auth_tries = 0;
+ ifmgd->assoc_tries = 0;
netif_tx_stop_all_queues(sdata->dev);
netif_carrier_off(sdata->dev);
}
-static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_supported_band *sband;
- u8 *pos;
- u8 bssid[ETH_ALEN];
- u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
- u16 capability;
- int i;
-
- if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) {
- memcpy(bssid, ifsta->bssid, ETH_ALEN);
- } else {
- /* Generate random, not broadcast, locally administered BSSID. Mix in
- * own MAC address to make sure that devices that do not have proper
- * random number generator get different BSSID. */
- get_random_bytes(bssid, ETH_ALEN);
- for (i = 0; i < ETH_ALEN; i++)
- bssid[i] ^= sdata->dev->dev_addr[i];
- bssid[0] &= ~0x01;
- bssid[0] |= 0x02;
- }
-
- printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
- sdata->dev->name, bssid);
-
- sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
-
- if (local->hw.conf.beacon_int == 0)
- local->hw.conf.beacon_int = 100;
-
- capability = WLAN_CAPABILITY_IBSS;
-
- if (sdata->default_key)
- capability |= WLAN_CAPABILITY_PRIVACY;
- else
- sdata->drop_unencrypted = 0;
-
- pos = supp_rates;
- for (i = 0; i < sband->n_bitrates; i++) {
- int rate = sband->bitrates[i].bitrate;
- *pos++ = (u8) (rate / 5);
- }
-
- return __ieee80211_sta_join_ibss(sdata, ifsta,
- bssid, local->hw.conf.beacon_int,
- local->hw.conf.channel->center_freq,
- sband->n_bitrates, supp_rates,
- capability);
-}
-
-
-static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
-{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_bss *bss;
- int active_ibss;
-
- if (ifsta->ssid_len == 0)
- return -EINVAL;
-
- active_ibss = ieee80211_sta_active_ibss(sdata);
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
- sdata->dev->name, active_ibss);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- if (active_ibss)
- return 0;
-
- if (ifsta->flags & IEEE80211_STA_BSSID_SET)
- bss = ieee80211_rx_bss_get(local, ifsta->bssid, 0,
- ifsta->ssid, ifsta->ssid_len);
- else
- bss = (void *)cfg80211_get_ibss(local->hw.wiphy,
- NULL,
- ifsta->ssid, ifsta->ssid_len);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- if (bss)
- printk(KERN_DEBUG " sta_find_ibss: selected %pM current "
- "%pM\n", bss->cbss.bssid, ifsta->bssid);
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- if (bss &&
- (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) ||
- memcmp(ifsta->bssid, bss->cbss.bssid, ETH_ALEN))) {
- int ret;
-
- printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
- " based on configured SSID\n",
- sdata->dev->name, bss->cbss.bssid);
-
- ret = ieee80211_sta_join_ibss(sdata, ifsta, bss);
- ieee80211_rx_bss_put(local, bss);
- return ret;
- } else if (bss)
- ieee80211_rx_bss_put(local, bss);
-
-#ifdef CONFIG_MAC80211_IBSS_DEBUG
- printk(KERN_DEBUG " did not try to join ibss\n");
-#endif /* CONFIG_MAC80211_IBSS_DEBUG */
-
- /* Selected IBSS not found in current scan results - try to scan */
- if (ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED &&
- !ieee80211_sta_active_ibss(sdata)) {
- mod_timer(&ifsta->timer, jiffies +
- IEEE80211_IBSS_MERGE_INTERVAL);
- } else if (time_after(jiffies, local->last_scan_completed +
- IEEE80211_SCAN_INTERVAL)) {
- printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
- "join\n", sdata->dev->name);
-
- /* XXX maybe racy? */
- if (local->scan_req)
- return -EBUSY;
-
- memcpy(local->int_scan_req.ssids[0].ssid,
- ifsta->ssid, IEEE80211_MAX_SSID_LEN);
- local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
- return ieee80211_request_scan(sdata, &local->int_scan_req);
- } else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) {
- int interval = IEEE80211_SCAN_INTERVAL;
-
- if (time_after(jiffies, ifsta->ibss_join_req +
- IEEE80211_IBSS_JOIN_TIMEOUT)) {
- if ((ifsta->flags & IEEE80211_STA_CREATE_IBSS) &&
- (!(local->oper_channel->flags &
- IEEE80211_CHAN_NO_IBSS)))
- return ieee80211_sta_create_ibss(sdata, ifsta);
- if (ifsta->flags & IEEE80211_STA_CREATE_IBSS) {
- printk(KERN_DEBUG "%s: IBSS not allowed on"
- " %d MHz\n", sdata->dev->name,
- local->hw.conf.channel->center_freq);
- }
-
- /* No IBSS found - decrease scan interval and continue
- * scanning. */
- interval = IEEE80211_SCAN_INTERVAL_SLOW;
- }
-
- ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
- mod_timer(&ifsta->timer, jiffies + interval);
- return 0;
- }
-
- return 0;
-}
-
-
-static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_bss *bss;
- u8 *bssid = ifsta->bssid, *ssid = ifsta->ssid;
- u8 ssid_len = ifsta->ssid_len;
+ u8 *bssid = ifmgd->bssid, *ssid = ifmgd->ssid;
+ u8 ssid_len = ifmgd->ssid_len;
u16 capa_mask = WLAN_CAPABILITY_ESS;
u16 capa_val = WLAN_CAPABILITY_ESS;
struct ieee80211_channel *chan = local->oper_channel;
- if (ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL |
+ if (ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL |
IEEE80211_STA_AUTO_BSSID_SEL |
IEEE80211_STA_AUTO_CHANNEL_SEL)) {
capa_mask |= WLAN_CAPABILITY_PRIVACY;
@@ -2430,13 +1667,13 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
capa_val |= WLAN_CAPABILITY_PRIVACY;
}
- if (ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL)
+ if (ifmgd->flags & IEEE80211_STA_AUTO_CHANNEL_SEL)
chan = NULL;
- if (ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL)
+ if (ifmgd->flags & IEEE80211_STA_AUTO_BSSID_SEL)
bssid = NULL;
- if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) {
+ if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) {
ssid = NULL;
ssid_len = 0;
}
@@ -2447,16 +1684,16 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
if (bss) {
ieee80211_set_freq(sdata, bss->cbss.channel->center_freq);
- if (!(ifsta->flags & IEEE80211_STA_SSID_SET))
+ if (!(ifmgd->flags & IEEE80211_STA_SSID_SET))
ieee80211_sta_set_ssid(sdata, bss->ssid,
bss->ssid_len);
ieee80211_sta_set_bssid(sdata, bss->cbss.bssid);
ieee80211_sta_def_wmm_params(sdata, bss->supp_rates_len,
bss->supp_rates);
- if (sdata->u.sta.mfp == IEEE80211_MFP_REQUIRED)
- sdata->u.sta.flags |= IEEE80211_STA_MFP_ENABLED;
+ if (sdata->u.mgd.mfp == IEEE80211_MFP_REQUIRED)
+ sdata->u.mgd.flags |= IEEE80211_STA_MFP_ENABLED;
else
- sdata->u.sta.flags &= ~IEEE80211_STA_MFP_ENABLED;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED;
/* Send out direct probe if no probe resp was received or
* the one we have is outdated
@@ -2464,31 +1701,34 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
if (!bss->last_probe_resp ||
time_after(jiffies, bss->last_probe_resp
+ IEEE80211_SCAN_RESULT_EXPIRE))
- ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE;
+ ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
else
- ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
+ ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
ieee80211_rx_bss_put(local, bss);
- ieee80211_sta_reset_auth(sdata, ifsta);
+ ieee80211_sta_reset_auth(sdata);
return 0;
} else {
- if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
- ifsta->assoc_scan_tries++;
+ if (ifmgd->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
+ ifmgd->assoc_scan_tries++;
/* XXX maybe racy? */
if (local->scan_req)
return -1;
memcpy(local->int_scan_req.ssids[0].ssid,
- ifsta->ssid, IEEE80211_MAX_SSID_LEN);
- if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL)
+ ifmgd->ssid, IEEE80211_MAX_SSID_LEN);
+ if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL)
local->int_scan_req.ssids[0].ssid_len = 0;
else
- local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
- ieee80211_start_scan(sdata, &local->int_scan_req);
- ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE;
- set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
+ local->int_scan_req.ssids[0].ssid_len = ifmgd->ssid_len;
+
+ if (ieee80211_start_scan(sdata, &local->int_scan_req))
+ ieee80211_scan_failed(local);
+
+ ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
+ set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request);
} else {
- ifsta->assoc_scan_tries = 0;
- ifsta->state = IEEE80211_STA_MLME_DISABLED;
+ ifmgd->assoc_scan_tries = 0;
+ ifmgd->state = IEEE80211_STA_MLME_DISABLED;
}
}
return -1;
@@ -2498,9 +1738,9 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
static void ieee80211_sta_work(struct work_struct *work)
{
struct ieee80211_sub_if_data *sdata =
- container_of(work, struct ieee80211_sub_if_data, u.sta.work);
+ container_of(work, struct ieee80211_sub_if_data, u.mgd.work);
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
struct sk_buff *skb;
if (!netif_running(sdata->dev))
@@ -2509,60 +1749,60 @@ static void ieee80211_sta_work(struct work_struct *work)
if (local->sw_scanning || local->hw_scanning)
return;
- if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC))
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
return;
- ifsta = &sdata->u.sta;
+ ifmgd = &sdata->u.mgd;
- while ((skb = skb_dequeue(&ifsta->skb_queue)))
+ while ((skb = skb_dequeue(&ifmgd->skb_queue)))
ieee80211_sta_rx_queued_mgmt(sdata, skb);
- if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
- ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE &&
- ifsta->state != IEEE80211_STA_MLME_ASSOCIATE &&
- test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) {
- ieee80211_start_scan(sdata, local->scan_req);
+ if (ifmgd->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
+ ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE &&
+ ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE &&
+ test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request)) {
+ /*
+	 * ieee80211_start_scan() can still fail here even though
+	 * ieee80211_request_scan() (which queued ieee80211_sta_work) already
+	 * returned success, so call ieee80211_scan_failed() if the scan cannot
+	 * be started in order to notify the scan requester.
+ */
+ if (ieee80211_start_scan(sdata, local->scan_req))
+ ieee80211_scan_failed(local);
return;
}
- if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) {
- if (ieee80211_sta_config_auth(sdata, ifsta))
+ if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request)) {
+ if (ieee80211_sta_config_auth(sdata))
return;
- clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request);
- } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request))
+ clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request);
+ } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request))
return;
- switch (ifsta->state) {
+ switch (ifmgd->state) {
case IEEE80211_STA_MLME_DISABLED:
break;
case IEEE80211_STA_MLME_DIRECT_PROBE:
- ieee80211_direct_probe(sdata, ifsta);
+ ieee80211_direct_probe(sdata);
break;
case IEEE80211_STA_MLME_AUTHENTICATE:
- ieee80211_authenticate(sdata, ifsta);
+ ieee80211_authenticate(sdata);
break;
case IEEE80211_STA_MLME_ASSOCIATE:
- ieee80211_associate(sdata, ifsta);
+ ieee80211_associate(sdata);
break;
case IEEE80211_STA_MLME_ASSOCIATED:
- ieee80211_associated(sdata, ifsta);
- break;
- case IEEE80211_STA_MLME_IBSS_SEARCH:
- ieee80211_sta_find_ibss(sdata, ifsta);
- break;
- case IEEE80211_STA_MLME_IBSS_JOINED:
- ieee80211_sta_merge_ibss(sdata, ifsta);
+ ieee80211_associated(sdata);
break;
default:
WARN_ON(1);
break;
}
- if (ieee80211_privacy_mismatch(sdata, ifsta)) {
+ if (ieee80211_privacy_mismatch(sdata)) {
printk(KERN_DEBUG "%s: privacy configuration mismatch and "
"mixed-cell disabled - disassociate\n", sdata->dev->name);
- ieee80211_set_disassoc(sdata, ifsta, false, true,
+ ieee80211_set_disassoc(sdata, false, true,
WLAN_REASON_UNSPECIFIED);
}
}
@@ -2571,155 +1811,106 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
{
if (sdata->vif.type == NL80211_IFTYPE_STATION)
queue_work(sdata->local->hw.workqueue,
- &sdata->u.sta.work);
+ &sdata->u.mgd.work);
}
/* interface setup */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
- ifsta = &sdata->u.sta;
- INIT_WORK(&ifsta->work, ieee80211_sta_work);
- INIT_WORK(&ifsta->chswitch_work, ieee80211_chswitch_work);
- setup_timer(&ifsta->timer, ieee80211_sta_timer,
+ ifmgd = &sdata->u.mgd;
+ INIT_WORK(&ifmgd->work, ieee80211_sta_work);
+ INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
+ setup_timer(&ifmgd->timer, ieee80211_sta_timer,
(unsigned long) sdata);
- setup_timer(&ifsta->chswitch_timer, ieee80211_chswitch_timer,
+ setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer,
(unsigned long) sdata);
- skb_queue_head_init(&ifsta->skb_queue);
+ skb_queue_head_init(&ifmgd->skb_queue);
- ifsta->capab = WLAN_CAPABILITY_ESS;
- ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN |
+ ifmgd->capab = WLAN_CAPABILITY_ESS;
+ ifmgd->auth_algs = IEEE80211_AUTH_ALG_OPEN |
IEEE80211_AUTH_ALG_SHARED_KEY;
- ifsta->flags |= IEEE80211_STA_CREATE_IBSS |
+ ifmgd->flags |= IEEE80211_STA_CREATE_IBSS |
IEEE80211_STA_AUTO_BSSID_SEL |
IEEE80211_STA_AUTO_CHANNEL_SEL;
if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4)
- ifsta->flags |= IEEE80211_STA_WMM_ENABLED;
-}
-
-/*
- * Add a new IBSS station, will also be called by the RX code when,
- * in IBSS mode, receiving a frame from a yet-unknown station, hence
- * must be callable in atomic context.
- */
-struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
- u8 *bssid,u8 *addr, u32 supp_rates)
-{
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- int band = local->hw.conf.channel->band;
-
- /* TODO: Could consider removing the least recently used entry and
- * allow new one to be added. */
- if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
- if (net_ratelimit()) {
- printk(KERN_DEBUG "%s: No room for a new IBSS STA "
- "entry %pM\n", sdata->dev->name, addr);
- }
- return NULL;
- }
-
- if (compare_ether_addr(bssid, sdata->u.sta.bssid))
- return NULL;
-
-#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
- wiphy_name(local->hw.wiphy), addr, sdata->dev->name);
-#endif
-
- sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
- if (!sta)
- return NULL;
-
- set_sta_flags(sta, WLAN_STA_AUTHORIZED);
-
- /* make sure mandatory rates are always added */
- sta->sta.supp_rates[band] = supp_rates |
- ieee80211_mandatory_rates(local, band);
-
- rate_control_rate_init(sta);
-
- if (sta_info_insert(sta))
- return NULL;
-
- return sta;
+ ifmgd->flags |= IEEE80211_STA_WMM_ENABLED;
}
/* configuration hooks */
-void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_if_sta *ifsta)
+void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
- if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
return;
- if ((ifsta->flags & (IEEE80211_STA_BSSID_SET |
+ if ((ifmgd->flags & (IEEE80211_STA_BSSID_SET |
IEEE80211_STA_AUTO_BSSID_SEL)) &&
- (ifsta->flags & (IEEE80211_STA_SSID_SET |
+ (ifmgd->flags & (IEEE80211_STA_SSID_SET |
IEEE80211_STA_AUTO_SSID_SEL))) {
- if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED)
- ieee80211_set_disassoc(sdata, ifsta, true, true,
+ if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED)
+ ieee80211_set_disassoc(sdata, true, true,
WLAN_REASON_DEAUTH_LEAVING);
- set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request);
- queue_work(local->hw.workqueue, &ifsta->work);
+ set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request);
+ queue_work(local->hw.workqueue, &ifmgd->work);
}
}
-int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata)
{
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- if (len > IEEE80211_MAX_SSID_LEN)
- return -EINVAL;
+ ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
- ifsta = &sdata->u.sta;
+ if (ifmgd->ssid_len)
+ ifmgd->flags |= IEEE80211_STA_SSID_SET;
+ else
+ ifmgd->flags &= ~IEEE80211_STA_SSID_SET;
- if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) {
- memset(ifsta->ssid, 0, sizeof(ifsta->ssid));
- memcpy(ifsta->ssid, ssid, len);
- ifsta->ssid_len = len;
- }
+ return 0;
+}
- ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
+int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
+{
+ struct ieee80211_if_managed *ifmgd;
- if (len)
- ifsta->flags |= IEEE80211_STA_SSID_SET;
- else
- ifsta->flags &= ~IEEE80211_STA_SSID_SET;
+ if (len > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
+
+ ifmgd = &sdata->u.mgd;
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- ifsta->ibss_join_req = jiffies;
- ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
- return ieee80211_sta_find_ibss(sdata, ifsta);
+ if (ifmgd->ssid_len != len || memcmp(ifmgd->ssid, ssid, len) != 0) {
+ memset(ifmgd->ssid, 0, sizeof(ifmgd->ssid));
+ memcpy(ifmgd->ssid, ssid, len);
+ ifmgd->ssid_len = len;
}
- return 0;
+ return ieee80211_sta_commit(sdata);
}
int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
- memcpy(ssid, ifsta->ssid, ifsta->ssid_len);
- *len = ifsta->ssid_len;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ memcpy(ssid, ifmgd->ssid, ifmgd->ssid_len);
+ *len = ifmgd->ssid_len;
return 0;
}
int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
{
- struct ieee80211_if_sta *ifsta;
-
- ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
if (is_valid_ether_addr(bssid)) {
- memcpy(ifsta->bssid, bssid, ETH_ALEN);
- ifsta->flags |= IEEE80211_STA_BSSID_SET;
+ memcpy(ifmgd->bssid, bssid, ETH_ALEN);
+ ifmgd->flags |= IEEE80211_STA_BSSID_SET;
} else {
- memset(ifsta->bssid, 0, ETH_ALEN);
- ifsta->flags &= ~IEEE80211_STA_BSSID_SET;
+ memset(ifmgd->bssid, 0, ETH_ALEN);
+ ifmgd->flags &= ~IEEE80211_STA_BSSID_SET;
}
if (netif_running(sdata->dev)) {
@@ -2729,47 +1920,44 @@ int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
}
}
- return ieee80211_sta_set_ssid(sdata, ifsta->ssid, ifsta->ssid_len);
+ return ieee80211_sta_commit(sdata);
}
int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- kfree(ifsta->extra_ie);
+ kfree(ifmgd->extra_ie);
if (len == 0) {
- ifsta->extra_ie = NULL;
- ifsta->extra_ie_len = 0;
+ ifmgd->extra_ie = NULL;
+ ifmgd->extra_ie_len = 0;
return 0;
}
- ifsta->extra_ie = kmalloc(len, GFP_KERNEL);
- if (!ifsta->extra_ie) {
- ifsta->extra_ie_len = 0;
+ ifmgd->extra_ie = kmalloc(len, GFP_KERNEL);
+ if (!ifmgd->extra_ie) {
+ ifmgd->extra_ie_len = 0;
return -ENOMEM;
}
- memcpy(ifsta->extra_ie, ie, len);
- ifsta->extra_ie_len = len;
+ memcpy(ifmgd->extra_ie, ie, len);
+ ifmgd->extra_ie_len = len;
return 0;
}
int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
-
printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n",
sdata->dev->name, reason);
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EINVAL;
- ieee80211_set_disassoc(sdata, ifsta, true, true, reason);
+ ieee80211_set_disassoc(sdata, true, true, reason);
return 0;
}
int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
{
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n",
sdata->dev->name, reason);
@@ -2777,10 +1965,10 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EINVAL;
- if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED))
- return -1;
+ if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED))
+ return -ENOLINK;
- ieee80211_set_disassoc(sdata, ifsta, false, true, reason);
+ ieee80211_set_disassoc(sdata, false, true, reason);
return 0;
}
@@ -2788,14 +1976,6 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata = local->scan_sdata;
- struct ieee80211_if_sta *ifsta;
-
- if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- ifsta = &sdata->u.sta;
- if ((!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) ||
- !ieee80211_sta_active_ibss(sdata))
- ieee80211_sta_find_ibss(sdata, ifsta);
- }
/* Restart STA timers */
rcu_read_lock();
@@ -2842,3 +2022,36 @@ void ieee80211_dynamic_ps_timer(unsigned long data)
queue_work(local->hw.workqueue, &local->dynamic_ps_enable_work);
}
+
+void ieee80211_send_nullfunc(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ int powersave)
+{
+ struct sk_buff *skb;
+ struct ieee80211_hdr *nullfunc;
+ __le16 fc;
+
+ if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+ return;
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
+ "frame\n", sdata->dev->name);
+ return;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
+ memset(nullfunc, 0, 24);
+ fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
+ IEEE80211_FCTL_TODS);
+ if (powersave)
+ fc |= cpu_to_le16(IEEE80211_FCTL_PM);
+ nullfunc->frame_control = fc;
+ memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+ memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN);
+
+ ieee80211_tx_skb(sdata, skb, 0);
+}
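
A minimal, self-contained sketch (names are illustrative, not mac80211 symbols) of the deferred-failure pattern the new comment in ieee80211_sta_work() describes: the request path queues work and returns success, so the worker itself has to invoke a failure hook when the actual start later fails.

#include <stdio.h>

/* Illustrative only -- not kernel code. */
struct scan_request {
	int  (*start)(struct scan_request *req);   /* may fail when finally run */
	void (*failed)(struct scan_request *req);  /* notifies the requester    */
};

/* The worker runs later; the original request already returned 0,
 * so a start failure is only visible through the failure hook. */
static void scan_worker(struct scan_request *req)
{
	if (req->start(req))
		req->failed(req);
}

static int start_stub(struct scan_request *req)
{
	(void)req;
	return -1;                 /* pretend the scan could not be started */
}

static void failed_stub(struct scan_request *req)
{
	(void)req;
	puts("scan failed, requester notified");
}

int main(void)
{
	struct scan_request req = { start_stub, failed_stub };
	scan_worker(&req);
	return 0;
}
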
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 928da625e281..b9164c9a9563 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -62,6 +62,18 @@ static inline void rate_control_rate_init(struct sta_info *sta)
ref->ops->rate_init(ref->priv, sband, ista, priv_sta);
}
+static inline void rate_control_rate_update(struct ieee80211_local *local,
+ struct ieee80211_supported_band *sband,
+ struct sta_info *sta, u32 changed)
+{
+ struct rate_control_ref *ref = local->rate_ctrl;
+ struct ieee80211_sta *ista = &sta->sta;
+ void *priv_sta = sta->rate_ctrl_priv;
+
+ if (ref->ops->rate_update)
+ ref->ops->rate_update(ref->priv, sband, ista,
+ priv_sta, changed);
+}
static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
struct ieee80211_sta *sta,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1327d424bf31..66f7ecf51b92 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -838,7 +838,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
NL80211_IFTYPE_ADHOC);
- if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0)
+ if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0)
sta->last_rx = jiffies;
} else
if (!is_multicast_ether_addr(hdr->addr1) ||
@@ -1702,13 +1702,13 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
return;
}
- if (compare_ether_addr(mgmt->sa, sdata->u.sta.bssid) != 0 ||
- compare_ether_addr(mgmt->bssid, sdata->u.sta.bssid) != 0) {
+ if (compare_ether_addr(mgmt->sa, sdata->u.mgd.bssid) != 0 ||
+ compare_ether_addr(mgmt->bssid, sdata->u.mgd.bssid) != 0) {
/* Not from the current AP. */
return;
}
- if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATE) {
+ if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATE) {
/* Association in progress; ignore SA Query */
return;
}
@@ -1727,7 +1727,7 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
memset(resp, 0, 24);
memcpy(resp->da, mgmt->sa, ETH_ALEN);
memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(resp->bssid, sdata->u.sta.bssid, ETH_ALEN);
+ memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN);
resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
@@ -1745,7 +1745,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
{
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
struct ieee80211_bss *bss;
int len = rx->skb->len;
@@ -1803,6 +1802,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
case WLAN_CATEGORY_SPECTRUM_MGMT:
if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
return RX_DROP_MONITOR;
+
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ return RX_DROP_MONITOR;
+
switch (mgmt->u.action.u.measurement.action_code) {
case WLAN_ACTION_SPCT_MSR_REQ:
if (len < (IEEE80211_MIN_ACTION_SIZE +
@@ -1815,12 +1818,13 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
sizeof(mgmt->u.action.u.chan_switch)))
return RX_DROP_MONITOR;
- if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0)
+ if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN))
return RX_DROP_MONITOR;
- bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(local, sdata->u.mgd.bssid,
local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ sdata->u.mgd.ssid,
+ sdata->u.mgd.ssid_len);
if (!bss)
return RX_DROP_MONITOR;
@@ -1876,11 +1880,14 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
sdata->vif.type != NL80211_IFTYPE_ADHOC)
return RX_DROP_MONITOR;
- if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
- return RX_DROP_MONITOR;
- ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
- return RX_QUEUED;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
+ return RX_DROP_MONITOR;
+ return ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
+ }
+
+ return ieee80211_ibss_rx_mgmt(sdata, rx->skb, rx->status);
}
static void ieee80211_rx_michael_mic_report(struct net_device *dev,
@@ -2083,7 +2090,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_STATION:
if (!bssid)
return 0;
- if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) {
+ if (!ieee80211_bssid_match(bssid, sdata->u.mgd.bssid)) {
if (!(rx->flags & IEEE80211_RX_IN_SCAN))
return 0;
rx->flags &= ~IEEE80211_RX_RA_MATCH;
@@ -2101,7 +2108,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
if (ieee80211_is_beacon(hdr->frame_control)) {
return 1;
}
- else if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) {
+ else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
if (!(rx->flags & IEEE80211_RX_IN_SCAN))
return 0;
rx->flags &= ~IEEE80211_RX_RA_MATCH;
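
A small standalone sketch (hypothetical names) of the dispatch shape ieee80211_rx_h_mgmt takes above: management frames are routed to the managed-mode or IBSS MLME based on the interface type, and the handler's own result (queue vs. drop-to-monitor) is propagated to the RX path.

#include <stdio.h>

/* Illustrative only -- mirrors the shape of the rx.c change, not its code. */
enum rx_result { RX_QUEUED, RX_DROP_MONITOR };
enum iftype    { IFTYPE_STATION, IFTYPE_ADHOC, IFTYPE_OTHER };

static enum rx_result sta_rx_mgmt(void)  { return RX_QUEUED; }
static enum rx_result ibss_rx_mgmt(void) { return RX_QUEUED; }

static enum rx_result rx_h_mgmt(enum iftype type, int userspace_mlme)
{
	if (type != IFTYPE_STATION && type != IFTYPE_ADHOC)
		return RX_DROP_MONITOR;

	if (type == IFTYPE_STATION) {
		/* a userspace MLME handles these frames itself */
		if (userspace_mlme)
			return RX_DROP_MONITOR;
		return sta_rx_mgmt();
	}

	return ibss_rx_mgmt();
}

int main(void)
{
	printf("%d\n", rx_h_mgmt(IFTYPE_STATION, 0));  /* 0 == RX_QUEUED */
	return 0;
}
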
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f883ab9f1e6e..5030a3c87509 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -63,20 +63,15 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
{
struct ieee80211_bss *bss;
int clen;
- enum cfg80211_signal_type sigtype = CFG80211_SIGNAL_TYPE_NONE;
s32 signal = 0;
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
- sigtype = CFG80211_SIGNAL_TYPE_MBM;
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
signal = rx_status->signal * 100;
- } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
- sigtype = CFG80211_SIGNAL_TYPE_UNSPEC;
+ else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
signal = (rx_status->signal * 100) / local->hw.max_signal;
- }
bss = (void *)cfg80211_inform_bss_frame(local->hw.wiphy, channel,
- mgmt, len, signal, sigtype,
- GFP_ATOMIC);
+ mgmt, len, signal, GFP_ATOMIC);
if (!bss)
return NULL;
@@ -207,34 +202,16 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
return RX_QUEUED;
}
-void ieee80211_send_nullfunc(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- int powersave)
+void ieee80211_scan_failed(struct ieee80211_local *local)
{
- struct sk_buff *skb;
- struct ieee80211_hdr *nullfunc;
- __le16 fc;
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
- if (!skb) {
- printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
- "frame\n", sdata->dev->name);
+ if (WARN_ON(!local->scan_req))
return;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
- memset(nullfunc, 0, 24);
- fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
- IEEE80211_FCTL_TODS);
- if (powersave)
- fc |= cpu_to_le16(IEEE80211_FCTL_PM);
- nullfunc->frame_control = fc;
- memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN);
- memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
- memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
-
- ieee80211_tx_skb(sdata, skb, 0);
+
+ /* notify cfg80211 about the failed scan */
+ if (local->scan_req != &local->int_scan_req)
+ cfg80211_scan_done(local->scan_req, true);
+
+ local->scan_req = NULL;
}
void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
@@ -280,6 +257,9 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
netif_addr_unlock(local->mdev);
netif_tx_unlock_bh(local->mdev);
+ if (local->ops->sw_scan_complete)
+ local->ops->sw_scan_complete(local_to_hw(local));
+
mutex_lock(&local->iflist_mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
if (!netif_running(sdata->dev))
@@ -287,7 +267,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
/* Tell AP we're back */
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
- if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+ if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) {
ieee80211_send_nullfunc(local, sdata, 0);
netif_tx_wake_all_queues(sdata->dev);
}
@@ -305,6 +285,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
done:
ieee80211_mlme_notify_scan_completed(local);
+ ieee80211_ibss_notify_scan_completed(local);
ieee80211_mesh_notify_scan_completed(local);
}
EXPORT_SYMBOL(ieee80211_scan_completed);
@@ -367,7 +348,8 @@ void ieee80211_scan_work(struct work_struct *work)
ieee80211_send_probe_req(
sdata, NULL,
local->scan_req->ssids[i].ssid,
- local->scan_req->ssids[i].ssid_len);
+ local->scan_req->ssids[i].ssid_len,
+ local->scan_req->ie, local->scan_req->ie_len);
next_delay = IEEE80211_CHANNEL_TIME;
break;
}
@@ -428,6 +410,8 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
}
local->sw_scanning = true;
+ if (local->ops->sw_scan_start)
+ local->ops->sw_scan_start(local_to_hw(local));
mutex_lock(&local->iflist_mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
@@ -442,7 +426,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
IEEE80211_IFCC_BEACON_ENABLED);
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
- if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) {
+ if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) {
netif_tx_stop_all_queues(sdata->dev);
ieee80211_send_nullfunc(local, sdata, 1);
}
@@ -477,7 +461,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
struct cfg80211_scan_request *req)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_sta *ifsta;
+ struct ieee80211_if_managed *ifmgd;
if (!req)
return -EINVAL;
@@ -502,9 +486,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
return -EBUSY;
}
- ifsta = &sdata->u.sta;
- set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request);
- queue_work(local->hw.workqueue, &ifsta->work);
+ ifmgd = &sdata->u.mgd;
+ set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request);
+ queue_work(local->hw.workqueue, &ifmgd->work);
return 0;
}
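
The failure path added to scan.c only reports back to cfg80211 when the outstanding request did not originate internally. A reduced sketch of that guard, with made-up names:

#include <stddef.h>
#include <stdio.h>

/* Illustrative only. */
struct scan_req { const char *origin; };

static struct scan_req internal_req = { "internal" };
static struct scan_req *current_req;

static void report_done(struct scan_req *req, int aborted)
{
	printf("report to requester: %s (aborted=%d)\n", req->origin, aborted);
}

static void scan_failed(void)
{
	if (!current_req)               /* nothing outstanding: a bug */
		return;

	/* internal scans have no external requester to notify */
	if (current_req != &internal_req)
		report_done(current_req, 1);

	current_req = NULL;
}

int main(void)
{
	struct scan_req user_req = { "cfg80211" };

	current_req = &user_req;
	scan_failed();                  /* prints a notification */

	current_req = &internal_req;
	scan_failed();                  /* silent */
	return 0;
}
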
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 47bb2aed2813..5f7a2624ed74 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -88,16 +88,16 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
void ieee80211_chswitch_work(struct work_struct *work)
{
struct ieee80211_sub_if_data *sdata =
- container_of(work, struct ieee80211_sub_if_data, u.sta.chswitch_work);
+ container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work);
struct ieee80211_bss *bss;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
if (!netif_running(sdata->dev))
return;
- bss = ieee80211_rx_bss_get(sdata->local, ifsta->bssid,
+ bss = ieee80211_rx_bss_get(sdata->local, ifmgd->bssid,
sdata->local->hw.conf.channel->center_freq,
- ifsta->ssid, ifsta->ssid_len);
+ ifmgd->ssid, ifmgd->ssid_len);
if (!bss)
goto exit;
@@ -108,7 +108,7 @@ void ieee80211_chswitch_work(struct work_struct *work)
ieee80211_rx_bss_put(sdata->local, bss);
exit:
- ifsta->flags &= ~IEEE80211_STA_CSA_RECEIVED;
+ ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED;
ieee80211_wake_queues_by_reason(&sdata->local->hw,
IEEE80211_QUEUE_STOP_REASON_CSA);
}
@@ -117,9 +117,9 @@ void ieee80211_chswitch_timer(unsigned long data)
{
struct ieee80211_sub_if_data *sdata =
(struct ieee80211_sub_if_data *) data;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work);
+ queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work);
}
void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
@@ -127,14 +127,14 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
struct ieee80211_bss *bss)
{
struct ieee80211_channel *new_ch;
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num);
/* FIXME: Handle ADHOC later */
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return;
- if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATED)
+ if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATED)
return;
if (sdata->local->sw_scanning || sdata->local->hw_scanning)
@@ -143,7 +143,7 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
/* Disregard subsequent beacons if we are already running a timer
processing a CSA */
- if (ifsta->flags & IEEE80211_STA_CSA_RECEIVED)
+ if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED)
return;
new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq);
@@ -153,12 +153,12 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
sdata->local->csa_channel = new_ch;
if (sw_elem->count <= 1) {
- queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work);
+ queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work);
} else {
ieee80211_stop_queues_by_reason(&sdata->local->hw,
IEEE80211_QUEUE_STOP_REASON_CSA);
- ifsta->flags |= IEEE80211_STA_CSA_RECEIVED;
- mod_timer(&ifsta->chswitch_timer,
+ ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED;
+ mod_timer(&ifmgd->chswitch_timer,
jiffies +
msecs_to_jiffies(sw_elem->count *
bss->cbss.beacon_interval));
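
The channel-switch path above either switches immediately (count <= 1) or stops the queues and arms a timer for roughly count beacon intervals. A small sketch of that scheduling decision, with placeholder units:

#include <stdio.h>

/* Illustrative only. Beacon interval is in TUs (1 TU = 1024 us). */
static unsigned int csa_delay_ms(unsigned int count, unsigned int beacon_int_tu)
{
	return count * beacon_int_tu * 1024 / 1000;
}

static void process_chanswitch(unsigned int count, unsigned int beacon_int_tu)
{
	if (count <= 1) {
		printf("switch channel now\n");
		return;
	}
	printf("stop queues, switch in ~%u ms\n",
	       csa_delay_ms(count, beacon_int_tu));
}

int main(void)
{
	process_chanswitch(1, 100);
	process_chanswitch(5, 100);
	return 0;
}
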
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 634f65c0130e..4ba3c540fcf3 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -202,6 +202,18 @@ void sta_info_destroy(struct sta_info *sta)
/* Make sure timer won't free the tid_rx struct, see below */
if (tid_rx)
tid_rx->shutdown = true;
+
+ /*
+ * The stop callback cannot find this station any more, but
+ * it didn't complete its work -- start the queue if necessary
+ */
+ if (sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_INITIATOR_MSK &&
+ sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_REQ_STOP_BA_MSK &&
+ local->hw.ampdu_queues)
+ ieee80211_wake_queue_by_reason(&local->hw,
+ local->hw.queues + sta->tid_to_tx_q[i],
+ IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+
spin_unlock_bh(&sta->lock);
/*
@@ -275,8 +287,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
* enable session_timer's data differentiation. Refer to
* sta_rx_agg_session_timer_expired for usage */
sta->timer_to_tid[i] = i;
- /* tid to tx queue: initialize according to HW (0 is valid) */
- sta->tid_to_tx_q[i] = ieee80211_num_queues(&local->hw);
+ sta->tid_to_tx_q[i] = -1;
/* rx */
sta->ampdu_mlme.tid_state_rx[i] = HT_AGG_STATE_IDLE;
sta->ampdu_mlme.tid_rx[i] = NULL;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d9653231992f..1f45573c580c 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -90,6 +90,7 @@ struct tid_ampdu_tx {
* @buf_size: buffer size for incoming A-MPDUs
* @timeout: reset timer value (in TUs).
* @dialog_token: dialog token for aggregation session
+ * @shutdown: this session is being shut down due to STA removal
*/
struct tid_ampdu_rx {
struct sk_buff **reorder_buf;
@@ -200,7 +201,7 @@ struct sta_ampdu_mlme {
* @tid_seq: per-TID sequence numbers for sending to this STA
* @ampdu_mlme: A-MPDU state machine state
* @timer_to_tid: identity mapping to ID timers
- * @tid_to_tx_q: map tid to tx queue
+ * @tid_to_tx_q: map tid to tx queue (invalid == negative values)
* @llid: Local link ID
* @plid: Peer link ID
* @reason: Cancel reason on PLINK_HOLDING state
@@ -275,7 +276,7 @@ struct sta_info {
*/
struct sta_ampdu_mlme ampdu_mlme;
u8 timer_to_tid[STA_TID_NUM];
- u8 tid_to_tx_q[STA_TID_NUM];
+ s8 tid_to_tx_q[STA_TID_NUM];
#ifdef CONFIG_MAC80211_MESH
/*
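
The sta_info change above replaces the "number of queues" placeholder with a signed sentinel (invalid == negative). A minimal illustration of that idiom, using hypothetical helper names:

#include <stdio.h>

#define STA_TID_NUM 16

/* Illustrative only: -1 marks "no TX queue assigned yet" for a TID. */
static signed char tid_to_tx_q[STA_TID_NUM];

static void sta_init_tids(void)
{
	int i;

	for (i = 0; i < STA_TID_NUM; i++)
		tid_to_tx_q[i] = -1;    /* invalid until aggregation assigns one */
}

static int select_queue(int tid, int regular_queues)
{
	/* only remap when a valid aggregation queue has been assigned */
	if (tid_to_tx_q[tid] >= 0)
		return regular_queues + tid_to_tx_q[tid];
	return tid % 4;                 /* fall back to an AC-style mapping */
}

int main(void)
{
	sta_init_tids();
	tid_to_tx_q[5] = 2;
	printf("tid 5 -> queue %d\n", select_queue(5, 4));  /* 6 */
	printf("tid 0 -> queue %d\n", select_queue(0, 4));  /* 0 */
	return 0;
}
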
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 33926831c648..457238a2f3fc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -784,6 +784,8 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
skb_copy_queue_mapping(frag, first);
frag->do_not_encrypt = first->do_not_encrypt;
+ frag->dev = first->dev;
+ frag->iif = first->iif;
pos += copylen;
left -= copylen;
@@ -876,7 +878,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
return TX_CONTINUE;
}
-
/* actual transmit path */
/*
@@ -1016,12 +1017,20 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
tx->sta = sta_info_get(local, hdr->addr1);
if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+ unsigned long flags;
qc = ieee80211_get_qos_ctl(hdr);
tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+ spin_lock_irqsave(&tx->sta->lock, flags);
state = &tx->sta->ampdu_mlme.tid_state_tx[tid];
- if (*state == HT_AGG_STATE_OPERATIONAL)
+ if (*state == HT_AGG_STATE_OPERATIONAL) {
info->flags |= IEEE80211_TX_CTL_AMPDU;
+ if (local->hw.ampdu_queues)
+ skb_set_queue_mapping(
+ skb, tx->local->hw.queues +
+ tx->sta->tid_to_tx_q[tid]);
+ }
+ spin_unlock_irqrestore(&tx->sta->lock, flags);
}
if (is_multicast_ether_addr(hdr->addr1)) {
@@ -1085,7 +1094,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
int ret, i;
if (skb) {
- if (netif_subqueue_stopped(local->mdev, skb))
+ if (ieee80211_queue_stopped(&local->hw,
+ skb_get_queue_mapping(skb)))
return IEEE80211_TX_PENDING;
ret = local->ops->tx(local_to_hw(local), skb);
@@ -1101,8 +1111,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
info = IEEE80211_SKB_CB(tx->extra_frag[i]);
info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT |
IEEE80211_TX_CTL_FIRST_FRAGMENT);
- if (netif_subqueue_stopped(local->mdev,
- tx->extra_frag[i]))
+ if (ieee80211_queue_stopped(&local->hw,
+ skb_get_queue_mapping(tx->extra_frag[i])))
return IEEE80211_TX_FRAG_AGAIN;
ret = local->ops->tx(local_to_hw(local),
@@ -1625,7 +1635,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
case NL80211_IFTYPE_STATION:
fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
/* BSSID SA DA */
- memcpy(hdr.addr1, sdata->u.sta.bssid, ETH_ALEN);
+ memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
memcpy(hdr.addr3, skb->data, ETH_ALEN);
hdrlen = 24;
@@ -1634,7 +1644,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
/* DA SA BSSID */
memcpy(hdr.addr1, skb->data, ETH_ALEN);
memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
- memcpy(hdr.addr3, sdata->u.sta.bssid, ETH_ALEN);
+ memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN);
hdrlen = 24;
break;
default:
@@ -1920,7 +1930,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
struct ieee80211_tx_info *info;
struct ieee80211_sub_if_data *sdata = NULL;
struct ieee80211_if_ap *ap = NULL;
- struct ieee80211_if_sta *ifsta = NULL;
struct beacon_data *beacon;
struct ieee80211_supported_band *sband;
enum ieee80211_band band = local->hw.conf.channel->band;
@@ -1972,13 +1981,13 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
} else
goto out;
} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
struct ieee80211_hdr *hdr;
- ifsta = &sdata->u.sta;
- if (!ifsta->probe_resp)
+ if (!ifibss->probe_resp)
goto out;
- skb = skb_copy(ifsta->probe_resp, GFP_ATOMIC);
+ skb = skb_copy(ifibss->probe_resp, GFP_ATOMIC);
if (!skb)
goto out;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 73c7d7345abd..e0431a1d218b 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -344,15 +344,36 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
{
struct ieee80211_local *local = hw_to_local(hw);
- /* we don't need to track ampdu queues */
- if (queue < ieee80211_num_regular_queues(hw)) {
- __clear_bit(reason, &local->queue_stop_reasons[queue]);
+ if (queue >= hw->queues) {
+ if (local->ampdu_ac_queue[queue - hw->queues] < 0)
+ return;
+
+ /*
+ * for virtual aggregation queues, we need to refcount the
+ * internal mac80211 disable (multiple times!), keep track of
+ * driver disable _and_ make sure the regular queue is
+ * actually enabled.
+ */
+ if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION)
+ local->amdpu_ac_stop_refcnt[queue - hw->queues]--;
+ else
+ __clear_bit(reason, &local->queue_stop_reasons[queue]);
- if (local->queue_stop_reasons[queue] != 0)
- /* someone still has this queue stopped */
+ if (local->queue_stop_reasons[queue] ||
+ local->amdpu_ac_stop_refcnt[queue - hw->queues])
return;
+
+ /* now go on to treat the corresponding regular queue */
+ queue = local->ampdu_ac_queue[queue - hw->queues];
+ reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION;
}
+ __clear_bit(reason, &local->queue_stop_reasons[queue]);
+
+ if (local->queue_stop_reasons[queue] != 0)
+ /* someone still has this queue stopped */
+ return;
+
if (test_bit(queue, local->queues_pending)) {
set_bit(queue, local->queues_pending_run);
tasklet_schedule(&local->tx_pending_tasklet);
@@ -361,8 +382,8 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
}
}
-static void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
- enum queue_stop_reason reason)
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason)
{
struct ieee80211_local *local = hw_to_local(hw);
unsigned long flags;
@@ -384,15 +405,33 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
{
struct ieee80211_local *local = hw_to_local(hw);
- /* we don't need to track ampdu queues */
- if (queue < ieee80211_num_regular_queues(hw))
- __set_bit(reason, &local->queue_stop_reasons[queue]);
+ if (queue >= hw->queues) {
+ if (local->ampdu_ac_queue[queue - hw->queues] < 0)
+ return;
+
+ /*
+ * for virtual aggregation queues, we need to refcount the
+ * internal mac80211 disable (multiple times!), keep track of
+ * driver disable _and_ make sure the regular queue is
+ * actually enabled.
+ */
+ if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION)
+ local->amdpu_ac_stop_refcnt[queue - hw->queues]++;
+ else
+ __set_bit(reason, &local->queue_stop_reasons[queue]);
+
+ /* now go on to treat the corresponding regular queue */
+ queue = local->ampdu_ac_queue[queue - hw->queues];
+ reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION;
+ }
+
+ __set_bit(reason, &local->queue_stop_reasons[queue]);
netif_stop_subqueue(local->mdev, queue);
}
-static void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
- enum queue_stop_reason reason)
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+ enum queue_stop_reason reason)
{
struct ieee80211_local *local = hw_to_local(hw);
unsigned long flags;
@@ -418,7 +457,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
- for (i = 0; i < ieee80211_num_queues(hw); i++)
+ for (i = 0; i < hw->queues; i++)
__ieee80211_stop_queue(hw, i, reason);
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
@@ -434,6 +473,16 @@ EXPORT_SYMBOL(ieee80211_stop_queues);
int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
{
struct ieee80211_local *local = hw_to_local(hw);
+ unsigned long flags;
+
+ if (queue >= hw->queues) {
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ queue = local->ampdu_ac_queue[queue - hw->queues];
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ if (queue < 0)
+ return true;
+ }
+
return __netif_subqueue_stopped(local->mdev, queue);
}
EXPORT_SYMBOL(ieee80211_queue_stopped);
@@ -701,6 +750,27 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
local->ops->conf_tx(local_to_hw(local), i, &qparam);
}
+void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
+ const size_t supp_rates_len,
+ const u8 *supp_rates)
+{
+ struct ieee80211_local *local = sdata->local;
+ int i, have_higher_than_11mbit = 0;
+
+ /* cf. IEEE 802.11 9.2.12 */
+ for (i = 0; i < supp_rates_len; i++)
+ if ((supp_rates[i] & 0x7f) * 5 > 110)
+ have_higher_than_11mbit = 1;
+
+ if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+ have_higher_than_11mbit)
+ sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+ else
+ sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
+ ieee80211_set_wmm_default(sdata);
+}
+
void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
int encrypt)
{
@@ -767,3 +837,161 @@ u32 ieee80211_mandatory_rates(struct ieee80211_local *local,
mandatory_rates |= BIT(i);
return mandatory_rates;
}
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+ u16 transaction, u16 auth_alg,
+ u8 *extra, size_t extra_len,
+ const u8 *bssid, int encrypt)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ const u8 *ie_auth = NULL;
+ int ie_auth_len = 0;
+
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ ie_auth_len = sdata->u.mgd.ie_auth_len;
+ ie_auth = sdata->u.mgd.ie_auth;
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+ sizeof(*mgmt) + 6 + extra_len + ie_auth_len);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
+ "frame\n", sdata->dev->name);
+ return;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
+ memset(mgmt, 0, 24 + 6);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_AUTH);
+ if (encrypt)
+ mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+ memcpy(mgmt->da, bssid, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+ memcpy(mgmt->bssid, bssid, ETH_ALEN);
+ mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg);
+ mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
+ mgmt->u.auth.status_code = cpu_to_le16(0);
+ if (extra)
+ memcpy(skb_put(skb, extra_len), extra, extra_len);
+ if (ie_auth)
+ memcpy(skb_put(skb, ie_auth_len), ie_auth, ie_auth_len);
+
+ ieee80211_tx_skb(sdata, skb, encrypt);
+}
+
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+ u8 *ssid, size_t ssid_len,
+ u8 *ie, size_t ie_len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_supported_band *sband;
+ struct sk_buff *skb;
+ struct ieee80211_mgmt *mgmt;
+ u8 *pos, *supp_rates, *esupp_rates = NULL, *extra_preq_ie = NULL;
+ int i, extra_preq_ie_len = 0;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ extra_preq_ie_len = sdata->u.mgd.ie_probereq_len;
+ extra_preq_ie = sdata->u.mgd.ie_probereq;
+ break;
+ default:
+ break;
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 +
+ ie_len + extra_preq_ie_len);
+ if (!skb) {
+ printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
+ "request\n", sdata->dev->name);
+ return;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+ memset(mgmt, 0, 24);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_PROBE_REQ);
+ memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
+ if (dst) {
+ memcpy(mgmt->da, dst, ETH_ALEN);
+ memcpy(mgmt->bssid, dst, ETH_ALEN);
+ } else {
+ memset(mgmt->da, 0xff, ETH_ALEN);
+ memset(mgmt->bssid, 0xff, ETH_ALEN);
+ }
+ pos = skb_put(skb, 2 + ssid_len);
+ *pos++ = WLAN_EID_SSID;
+ *pos++ = ssid_len;
+ memcpy(pos, ssid, ssid_len);
+
+ supp_rates = skb_put(skb, 2);
+ supp_rates[0] = WLAN_EID_SUPP_RATES;
+ supp_rates[1] = 0;
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ struct ieee80211_rate *rate = &sband->bitrates[i];
+ if (esupp_rates) {
+ pos = skb_put(skb, 1);
+ esupp_rates[1]++;
+ } else if (supp_rates[1] == 8) {
+ esupp_rates = skb_put(skb, 3);
+ esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
+ esupp_rates[1] = 1;
+ pos = &esupp_rates[2];
+ } else {
+ pos = skb_put(skb, 1);
+ supp_rates[1]++;
+ }
+ *pos = rate->bitrate / 5;
+ }
+
+ if (ie)
+ memcpy(skb_put(skb, ie_len), ie, ie_len);
+ if (extra_preq_ie)
+ memcpy(skb_put(skb, extra_preq_ie_len), extra_preq_ie,
+ extra_preq_ie_len);
+
+ ieee80211_tx_skb(sdata, skb, 0);
+}
+
+u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+ struct ieee802_11_elems *elems,
+ enum ieee80211_band band)
+{
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_rate *bitrates;
+ size_t num_rates;
+ u32 supp_rates;
+ int i, j;
+ sband = local->hw.wiphy->bands[band];
+
+ if (!sband) {
+ WARN_ON(1);
+ sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+ }
+
+ bitrates = sband->bitrates;
+ num_rates = sband->n_bitrates;
+ supp_rates = 0;
+ for (i = 0; i < elems->supp_rates_len +
+ elems->ext_supp_rates_len; i++) {
+ u8 rate = 0;
+ int own_rate;
+ if (i < elems->supp_rates_len)
+ rate = elems->supp_rates[i];
+ else if (elems->ext_supp_rates)
+ rate = elems->ext_supp_rates
+ [i - elems->supp_rates_len];
+ own_rate = 5 * (rate & 0x7f);
+ for (j = 0; j < num_rates; j++)
+ if (bitrates[j].bitrate == own_rate)
+ supp_rates |= BIT(j);
+ }
+ return supp_rates;
+}
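
A compact, self-contained sketch of the wake/stop bookkeeping introduced in util.c: virtual (aggregation) queues keep a refcount for the aggregation stop reason, other reasons use a bitmap, and a wake only reaches the driver once both are clear on the regular queue the virtual queue maps to. All names here are illustrative, not the mac80211 structures.

#include <stdio.h>

#define REGULAR_QUEUES 4
#define AMPDU_QUEUES   4
#define REASON_AGG     0
#define REASON_CSA     1

/* Illustrative only. */
static unsigned long stop_reasons[REGULAR_QUEUES + AMPDU_QUEUES];
static int ampdu_stop_refcnt[AMPDU_QUEUES];
static int ampdu_to_regular[AMPDU_QUEUES] = { 0, 1, 2, 3 };

static void wake_queue(int queue, int reason)
{
	if (queue >= REGULAR_QUEUES) {
		int v = queue - REGULAR_QUEUES;

		/* aggregation stops are refcounted, others are bitmapped */
		if (reason == REASON_AGG)
			ampdu_stop_refcnt[v]--;
		else
			stop_reasons[queue] &= ~(1UL << reason);

		if (stop_reasons[queue] || ampdu_stop_refcnt[v])
			return;         /* still stopped for another reason */

		/* fall through to the regular queue this vqueue maps to */
		queue = ampdu_to_regular[v];
		reason = REASON_AGG;
	}

	stop_reasons[queue] &= ~(1UL << reason);
	if (!stop_reasons[queue])
		printf("queue %d woken\n", queue);
}

int main(void)
{
	ampdu_stop_refcnt[1] = 2;
	wake_queue(REGULAR_QUEUES + 1, REASON_AGG);   /* refcount 1, stays stopped */
	wake_queue(REGULAR_QUEUES + 1, REASON_AGG);   /* wakes regular queue 1 */
	return 0;
}
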
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 2b023dce8b24..935c63ed3dfa 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -132,139 +132,37 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev,
if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
return -EOPNOTSUPP;
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length);
if (ret)
return ret;
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
- ieee80211_sta_req_auth(sdata, &sdata->u.sta);
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
+ ieee80211_sta_req_auth(sdata);
return 0;
}
return -EOPNOTSUPP;
}
-static u8 ieee80211_get_wstats_flags(struct ieee80211_local *local)
-{
- u8 wstats_flags = 0;
-
- wstats_flags |= local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC |
- IEEE80211_HW_SIGNAL_DBM) ?
- IW_QUAL_QUAL_UPDATED : IW_QUAL_QUAL_INVALID;
- wstats_flags |= local->hw.flags & IEEE80211_HW_NOISE_DBM ?
- IW_QUAL_NOISE_UPDATED : IW_QUAL_NOISE_INVALID;
- if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
- wstats_flags |= IW_QUAL_DBM;
-
- return wstats_flags;
-}
-
-static int ieee80211_ioctl_giwrange(struct net_device *dev,
- struct iw_request_info *info,
- struct iw_point *data, char *extra)
-{
- struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
- struct iw_range *range = (struct iw_range *) extra;
- enum ieee80211_band band;
- int c = 0;
-
- data->length = sizeof(struct iw_range);
- memset(range, 0, sizeof(struct iw_range));
-
- range->we_version_compiled = WIRELESS_EXT;
- range->we_version_source = 21;
- range->retry_capa = IW_RETRY_LIMIT;
- range->retry_flags = IW_RETRY_LIMIT;
- range->min_retry = 0;
- range->max_retry = 255;
- range->min_rts = 0;
- range->max_rts = 2347;
- range->min_frag = 256;
- range->max_frag = 2346;
-
- range->encoding_size[0] = 5;
- range->encoding_size[1] = 13;
- range->num_encoding_sizes = 2;
- range->max_encoding_tokens = NUM_DEFAULT_KEYS;
-
- /* cfg80211 requires this, and enforces 0..100 */
- if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
- range->max_qual.level = 100;
- else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
- range->max_qual.level = -110;
- else
- range->max_qual.level = 0;
-
- if (local->hw.flags & IEEE80211_HW_NOISE_DBM)
- range->max_qual.noise = -110;
- else
- range->max_qual.noise = 0;
-
- range->max_qual.qual = 100;
- range->max_qual.updated = ieee80211_get_wstats_flags(local);
-
- range->avg_qual.qual = 50;
- /* not always true but better than nothing */
- range->avg_qual.level = range->max_qual.level / 2;
- range->avg_qual.noise = range->max_qual.noise / 2;
- range->avg_qual.updated = ieee80211_get_wstats_flags(local);
-
- range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 |
- IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
-
-
- for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
- int i;
- struct ieee80211_supported_band *sband;
-
- sband = local->hw.wiphy->bands[band];
-
- if (!sband)
- continue;
-
- for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
- struct ieee80211_channel *chan = &sband->channels[i];
-
- if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
- range->freq[c].i =
- ieee80211_frequency_to_channel(
- chan->center_freq);
- range->freq[c].m = chan->center_freq;
- range->freq[c].e = 6;
- c++;
- }
- }
- }
- range->num_channels = c;
- range->num_frequency = c;
-
- IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
- IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
- IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
-
- range->scan_capa |= IW_SCAN_CAPA_ESSID;
-
- return 0;
-}
-
-
static int ieee80211_ioctl_siwfreq(struct net_device *dev,
struct iw_request_info *info,
struct iw_freq *freq, char *extra)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC ||
- sdata->vif.type == NL80211_IFTYPE_STATION)
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL;
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL;
/* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */
if (freq->e == 0) {
if (freq->m < 0) {
- if (sdata->vif.type == NL80211_IFTYPE_ADHOC ||
- sdata->vif.type == NL80211_IFTYPE_STATION)
- sdata->u.sta.flags |=
+ if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ sdata->u.ibss.flags |=
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+ else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.flags |=
IEEE80211_STA_AUTO_CHANNEL_SEL;
return 0;
} else
@@ -301,32 +199,35 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev,
{
struct ieee80211_sub_if_data *sdata;
size_t len = data->length;
+ int ret;
/* iwconfig uses nul termination in SSID.. */
if (len > 0 && ssid[len - 1] == '\0')
len--;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- int ret;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
if (len > IEEE80211_MAX_SSID_LEN)
return -EINVAL;
- memcpy(sdata->u.sta.ssid, ssid, len);
- sdata->u.sta.ssid_len = len;
+ memcpy(sdata->u.mgd.ssid, ssid, len);
+ sdata->u.mgd.ssid_len = len;
return 0;
}
+
if (data->flags)
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_SSID_SEL;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL;
else
- sdata->u.sta.flags |= IEEE80211_STA_AUTO_SSID_SEL;
+ sdata->u.mgd.flags |= IEEE80211_STA_AUTO_SSID_SEL;
+
ret = ieee80211_sta_set_ssid(sdata, ssid, len);
if (ret)
return ret;
- ieee80211_sta_req_auth(sdata, &sdata->u.sta);
+
+ ieee80211_sta_req_auth(sdata);
return 0;
- }
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ return ieee80211_ibss_set_ssid(sdata, ssid, len);
return -EOPNOTSUPP;
}
@@ -340,8 +241,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
struct ieee80211_sub_if_data *sdata;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
int res = ieee80211_sta_get_ssid(sdata, ssid, &len);
if (res == 0) {
data->length = len;
@@ -349,6 +249,14 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
} else
data->flags = 0;
return res;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ int res = ieee80211_ibss_get_ssid(sdata, ssid, &len);
+ if (res == 0) {
+ data->length = len;
+ data->flags = 1;
+ } else
+ data->flags = 0;
+ return res;
}
return -EOPNOTSUPP;
@@ -362,26 +270,35 @@ static int ieee80211_ioctl_siwap(struct net_device *dev,
struct ieee80211_sub_if_data *sdata;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
int ret;
if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
- memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data,
+ memcpy(sdata->u.mgd.bssid, (u8 *) &ap_addr->sa_data,
ETH_ALEN);
return 0;
}
if (is_zero_ether_addr((u8 *) &ap_addr->sa_data))
- sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL |
+ sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL |
IEEE80211_STA_AUTO_CHANNEL_SEL;
else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data))
- sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL;
+ sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL;
else
- sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
if (ret)
return ret;
- ieee80211_sta_req_auth(sdata, &sdata->u.sta);
+ ieee80211_sta_req_auth(sdata);
return 0;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (is_zero_ether_addr((u8 *) &ap_addr->sa_data))
+ sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL |
+ IEEE80211_IBSS_AUTO_CHANNEL_SEL;
+ else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data))
+ sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL;
+ else
+ sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_BSSID_SEL;
+
+ return ieee80211_ibss_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
} else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
/*
* If it is necessary to update the WDS peer address
@@ -410,17 +327,20 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
struct ieee80211_sub_if_data *sdata;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
- if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED ||
- sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATED) {
ap_addr->sa_family = ARPHRD_ETHER;
- memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN);
- return 0;
- } else {
+ memcpy(&ap_addr->sa_data, sdata->u.mgd.bssid, ETH_ALEN);
+ } else
memset(&ap_addr->sa_data, 0, ETH_ALEN);
- return 0;
- }
+ return 0;
+ } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->u.ibss.state == IEEE80211_IBSS_MLME_JOINED) {
+ ap_addr->sa_family = ARPHRD_ETHER;
+ memcpy(&ap_addr->sa_data, sdata->u.ibss.bssid, ETH_ALEN);
+ } else
+ memset(&ap_addr->sa_data, 0, ETH_ALEN);
+ return 0;
} else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
ap_addr->sa_family = ARPHRD_ETHER;
memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN);
@@ -486,7 +406,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev,
rcu_read_lock();
- sta = sta_info_get(local, sdata->u.sta.bssid);
+ sta = sta_info_get(local, sdata->u.mgd.bssid);
if (sta && !(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS))
rate->value = sband->bitrates[sta->last_tx_rate.idx].bitrate;
@@ -687,8 +607,7 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev,
struct iw_mlme *mlme = (struct iw_mlme *) extra;
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EINVAL;
switch (mlme->cmd) {
@@ -784,8 +703,7 @@ static int ieee80211_ioctl_giwencode(struct net_device *dev,
erq->flags |= IW_ENCODE_ENABLED;
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
- struct ieee80211_if_sta *ifsta = &sdata->u.sta;
- switch (ifsta->auth_alg) {
+ switch (sdata->u.mgd.auth_alg) {
case WLAN_AUTH_OPEN:
case WLAN_AUTH_LEAP:
erq->flags |= IW_ENCODE_OPEN;
@@ -849,7 +767,7 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev,
ret = ieee80211_hw_config(local,
IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT);
- if (!(sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED))
+ if (!(sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED))
return ret;
if (conf->dynamic_ps_timeout > 0 &&
@@ -908,10 +826,10 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
if (data->value & (IW_AUTH_CIPHER_WEP40 |
IW_AUTH_CIPHER_WEP104 | IW_AUTH_CIPHER_TKIP))
- sdata->u.sta.flags |=
+ sdata->u.mgd.flags |=
IEEE80211_STA_TKIP_WEP_USED;
else
- sdata->u.sta.flags &=
+ sdata->u.mgd.flags &=
~IEEE80211_STA_TKIP_WEP_USED;
}
break;
@@ -922,21 +840,20 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
if (sdata->vif.type != NL80211_IFTYPE_STATION)
ret = -EINVAL;
else {
- sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
+ sdata->u.mgd.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
/*
* Privacy invoked by wpa_supplicant, store the
* value and allow associating to a protected
* network without having a key up front.
*/
if (data->value)
- sdata->u.sta.flags |=
+ sdata->u.mgd.flags |=
IEEE80211_STA_PRIVACY_INVOKED;
}
break;
case IW_AUTH_80211_AUTH_ALG:
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- sdata->u.sta.auth_algs = data->value;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sdata->u.mgd.auth_algs = data->value;
else
ret = -EOPNOTSUPP;
break;
@@ -945,17 +862,16 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
ret = -EOPNOTSUPP;
break;
}
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
switch (data->value) {
case IW_AUTH_MFP_DISABLED:
- sdata->u.sta.mfp = IEEE80211_MFP_DISABLED;
+ sdata->u.mgd.mfp = IEEE80211_MFP_DISABLED;
break;
case IW_AUTH_MFP_OPTIONAL:
- sdata->u.sta.mfp = IEEE80211_MFP_OPTIONAL;
+ sdata->u.mgd.mfp = IEEE80211_MFP_OPTIONAL;
break;
case IW_AUTH_MFP_REQUIRED:
- sdata->u.sta.mfp = IEEE80211_MFP_REQUIRED;
+ sdata->u.mgd.mfp = IEEE80211_MFP_REQUIRED;
break;
default:
ret = -EINVAL;
@@ -980,9 +896,9 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
rcu_read_lock();
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- sta = sta_info_get(local, sdata->u.sta.bssid);
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ sta = sta_info_get(local, sdata->u.mgd.bssid);
+
if (!sta) {
wstats->discard.fragment = 0;
wstats->discard.misc = 0;
@@ -991,10 +907,45 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
wstats->qual.noise = 0;
wstats->qual.updated = IW_QUAL_ALL_INVALID;
} else {
- wstats->qual.level = sta->last_signal;
- wstats->qual.qual = sta->last_qual;
- wstats->qual.noise = sta->last_noise;
- wstats->qual.updated = ieee80211_get_wstats_flags(local);
+ wstats->qual.updated = 0;
+ /*
+ * mirror what cfg80211 does for iwrange/scan results,
+ * otherwise userspace gets confused.
+ */
+ if (local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC |
+ IEEE80211_HW_SIGNAL_DBM)) {
+ wstats->qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ wstats->qual.updated |= IW_QUAL_QUAL_UPDATED;
+ } else {
+ wstats->qual.updated |= IW_QUAL_LEVEL_INVALID;
+ wstats->qual.updated |= IW_QUAL_QUAL_INVALID;
+ }
+
+ if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
+ wstats->qual.level = sta->last_signal;
+ wstats->qual.qual = sta->last_signal;
+ } else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
+ int sig = sta->last_signal;
+
+ wstats->qual.updated |= IW_QUAL_DBM;
+ wstats->qual.level = sig;
+ if (sig < -110)
+ sig = -110;
+ else if (sig > -40)
+ sig = -40;
+ wstats->qual.qual = sig + 110;
+ }
+
+ if (local->hw.flags & IEEE80211_HW_NOISE_DBM) {
+ /*
+ * This assumes that if driver reports noise, it also
+ * reports signal in dBm.
+ */
+ wstats->qual.noise = sta->last_noise;
+ wstats->qual.updated |= IW_QUAL_NOISE_UPDATED;
+ } else {
+ wstats->qual.updated |= IW_QUAL_NOISE_INVALID;
+ }
}
rcu_read_unlock();
@@ -1011,9 +962,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev,
switch (data->flags & IW_AUTH_INDEX) {
case IW_AUTH_80211_AUTH_ALG:
- if (sdata->vif.type == NL80211_IFTYPE_STATION ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC)
- data->value = sdata->u.sta.auth_algs;
+ if (sdata->vif.type == NL80211_IFTYPE_STATION)
+ data->value = sdata->u.mgd.auth_algs;
else
ret = -EOPNOTSUPP;
break;
@@ -1116,7 +1066,7 @@ static const iw_handler ieee80211_handler[] =
(iw_handler) NULL, /* SIOCSIWSENS */
(iw_handler) NULL, /* SIOCGIWSENS */
(iw_handler) NULL /* not used */, /* SIOCSIWRANGE */
- (iw_handler) ieee80211_ioctl_giwrange, /* SIOCGIWRANGE */
+ (iw_handler) cfg80211_wext_giwrange, /* SIOCGIWRANGE */
(iw_handler) NULL /* not used */, /* SIOCSIWPRIV */
(iw_handler) NULL /* kernel code */, /* SIOCGIWPRIV */
(iw_handler) NULL /* not used */, /* SIOCSIWSTATS */
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index ac71b38f7cb5..0b8ad1f4ecdd 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -99,10 +99,13 @@ static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb)
/* in case we are a client verify acm is not set for this ac */
while (unlikely(local->wmm_acm & BIT(skb->priority))) {
if (wme_downgrade_ac(skb)) {
- /* The old code would drop the packet in this
- * case.
+ /*
+ * This should not really happen. The AP has marked all
+ * lower ACs to require admission control which is not
+ * a reasonable configuration. Allow the frame to be
+ * transmitted using AC_BK as a workaround.
*/
- return 0;
+ break;
}
}
@@ -114,9 +117,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
{
struct ieee80211_master_priv *mpriv = netdev_priv(dev);
struct ieee80211_local *local = mpriv->local;
- struct ieee80211_hw *hw = &local->hw;
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- struct sta_info *sta;
u16 queue;
u8 tid;
@@ -124,29 +125,11 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
if (unlikely(queue >= local->hw.queues))
queue = local->hw.queues - 1;
- if (skb->requeue) {
- if (!hw->ampdu_queues)
- return queue;
-
- rcu_read_lock();
- sta = sta_info_get(local, hdr->addr1);
- tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
- if (sta) {
- int ampdu_queue = sta->tid_to_tx_q[tid];
-
- if ((ampdu_queue < ieee80211_num_queues(hw)) &&
- test_bit(ampdu_queue, local->queue_pool))
- queue = ampdu_queue;
- }
- rcu_read_unlock();
-
- return queue;
- }
-
- /* Now we know the 1d priority, fill in the QoS header if
- * there is one.
+ /*
+ * Now we know the 1d priority, fill in the QoS header if
+ * there is one (and we haven't done this before).
*/
- if (ieee80211_is_data_qos(hdr->frame_control)) {
+ if (!skb->requeue && ieee80211_is_data_qos(hdr->frame_control)) {
u8 *p = ieee80211_get_qos_ctl(hdr);
u8 ack_policy = 0;
tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
@@ -156,140 +139,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
/* qos header is 2 bytes, second reserved */
*p++ = ack_policy | tid;
*p = 0;
-
- if (!hw->ampdu_queues)
- return queue;
-
- rcu_read_lock();
-
- sta = sta_info_get(local, hdr->addr1);
- if (sta) {
- int ampdu_queue = sta->tid_to_tx_q[tid];
-
- if ((ampdu_queue < ieee80211_num_queues(hw)) &&
- test_bit(ampdu_queue, local->queue_pool))
- queue = ampdu_queue;
- }
-
- rcu_read_unlock();
}
return queue;
}
-
-int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid)
-{
- int i;
-
- /* XXX: currently broken due to cb/requeue use */
- return -EPERM;
-
- /* prepare the filter and save it for the SW queue
- * matching the received HW queue */
-
- if (!local->hw.ampdu_queues)
- return -EPERM;
-
- /* try to get a Qdisc from the pool */
- for (i = local->hw.queues; i < ieee80211_num_queues(&local->hw); i++)
- if (!test_and_set_bit(i, local->queue_pool)) {
- ieee80211_stop_queue(local_to_hw(local), i);
- sta->tid_to_tx_q[tid] = i;
-
- /* IF there are already pending packets
- * on this tid first we need to drain them
- * on the previous queue
- * since HT is strict in order */
-#ifdef CONFIG_MAC80211_HT_DEBUG
- if (net_ratelimit())
- printk(KERN_DEBUG "allocated aggregation queue"
- " %d tid %d addr %pM pool=0x%lX\n",
- i, tid, sta->sta.addr,
- local->queue_pool[0]);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
- return 0;
- }
-
- return -EAGAIN;
-}
-
-/**
- * the caller needs to hold netdev_get_tx_queue(local->mdev, X)->lock
- */
-void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid,
- u8 requeue)
-{
- int agg_queue = sta->tid_to_tx_q[tid];
- struct ieee80211_hw *hw = &local->hw;
-
- /* return the qdisc to the pool */
- clear_bit(agg_queue, local->queue_pool);
- sta->tid_to_tx_q[tid] = ieee80211_num_queues(hw);
-
- if (requeue) {
- ieee80211_requeue(local, agg_queue);
- } else {
- struct netdev_queue *txq;
- spinlock_t *root_lock;
- struct Qdisc *q;
-
- txq = netdev_get_tx_queue(local->mdev, agg_queue);
- q = rcu_dereference(txq->qdisc);
- root_lock = qdisc_lock(q);
-
- spin_lock_bh(root_lock);
- qdisc_reset(q);
- spin_unlock_bh(root_lock);
- }
-}
-
-void ieee80211_requeue(struct ieee80211_local *local, int queue)
-{
- struct netdev_queue *txq = netdev_get_tx_queue(local->mdev, queue);
- struct sk_buff_head list;
- spinlock_t *root_lock;
- struct Qdisc *qdisc;
- u32 len;
-
- rcu_read_lock_bh();
-
- qdisc = rcu_dereference(txq->qdisc);
- if (!qdisc || !qdisc->dequeue)
- goto out_unlock;
-
- skb_queue_head_init(&list);
-
- root_lock = qdisc_root_lock(qdisc);
- spin_lock(root_lock);
- for (len = qdisc->q.qlen; len > 0; len--) {
- struct sk_buff *skb = qdisc->dequeue(qdisc);
-
- if (skb)
- __skb_queue_tail(&list, skb);
- }
- spin_unlock(root_lock);
-
- for (len = list.qlen; len > 0; len--) {
- struct sk_buff *skb = __skb_dequeue(&list);
- u16 new_queue;
-
- BUG_ON(!skb);
- new_queue = ieee80211_select_queue(local->mdev, skb);
- skb_set_queue_mapping(skb, new_queue);
-
- txq = netdev_get_tx_queue(local->mdev, new_queue);
-
-
- qdisc = rcu_dereference(txq->qdisc);
- root_lock = qdisc_root_lock(qdisc);
-
- spin_lock(root_lock);
- qdisc_enqueue_root(skb, qdisc);
- spin_unlock(root_lock);
- }
-
-out_unlock:
- rcu_read_unlock_bh();
-}
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index bc62f28a4d3d..7520d2e014dc 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -21,11 +21,5 @@
extern const int ieee802_1d_to_ac[8];
u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb);
-int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid);
-void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local,
- struct sta_info *sta, u16 tid,
- u8 requeue);
-void ieee80211_requeue(struct ieee80211_local *local, int queue);
#endif /* _WME_H */
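
The classify80211() change above stops dropping frames when the AP has marked every usable access category as requiring admission control and instead lets them go out on AC_BK. A standalone sketch of the downgrade idea follows; the priority-to-AC grouping and the ACM mask here are assumptions for illustration, not the kernel's own tables:

/* Illustrative sketch only - not the kernel's table or function.
 * WMM groups the eight 802.1D priorities into four access categories;
 * when an AC is subject to admission control (ACM) the frame is pushed
 * down one category at a time until a permitted AC (or AC_BK) is reached. */
#include <stdio.h>

enum { AC_VO, AC_VI, AC_BE, AC_BK };

static const char *ac_name[] = { "VO", "VI", "BE", "BK" };

/* assumed 802.1D priority -> AC grouping (BK lowest, VO highest) */
static const int prio_to_ac[8] = {
	AC_BE, AC_BK, AC_BK, AC_BE, AC_VI, AC_VI, AC_VO, AC_VO
};

static int downgrade(int ac)
{
	return ac < AC_BK ? ac + 1 : AC_BK;	/* cannot go below AC_BK */
}

int main(void)
{
	unsigned int acm_mask = (1 << AC_VO) | (1 << AC_VI); /* example ACM bits */
	int prio;

	for (prio = 0; prio < 8; prio++) {
		int ac = prio_to_ac[prio];

		while ((acm_mask & (1 << ac)) && ac != AC_BK)
			ac = downgrade(ac);
		printf("priority %d -> AC_%s\n", prio, ac_name[ac]);
	}
	return 0;
}
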
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 55befe59e1c0..dfb447b584da 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -728,7 +728,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
NF_CT_ASSERT(skb->nfct);
ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
- if (ret < 0) {
+ if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1b75c9efb0eb..7a16bd462f82 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1763,6 +1763,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3, u32 pid, int report)
goto out;
}
+ exp->class = 0;
exp->expectfn = NULL;
exp->flags = 0;
exp->master = ct;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7d3944f02ea1..e46f3b79adb3 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -861,7 +861,7 @@ static int tcp_packet(struct nf_conn *ct,
*/
if (nf_ct_kill(ct))
return -NF_REPEAT;
- return -NF_DROP;
+ return NF_DROP;
}
/* Fall through */
case TCP_CONNTRACK_IGNORE:
@@ -894,7 +894,7 @@ static int tcp_packet(struct nf_conn *ct,
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: killing out of sync session ");
nf_ct_kill(ct);
- return -NF_DROP;
+ return NF_DROP;
}
ct->proto.tcp.last_index = index;
ct->proto.tcp.last_dir = dir;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3eae3fca29d8..fd326ac27ec8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -39,7 +39,7 @@
#endif
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
-#define NFULNL_TIMEOUT_DEFAULT HZ /* every second */
+#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */
@@ -590,8 +590,10 @@ nfulnl_log_packet(u_int8_t pf,
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
- if (qthreshold > li->u.ulog.qthreshold)
- qthreshold = li->u.ulog.qthreshold;
+ if (li->u.ulog.qthreshold)
+ if (qthreshold > li->u.ulog.qthreshold)
+ qthreshold = li->u.ulog.qthreshold;
+
switch (inst->copy_mode) {
case NFULNL_COPY_META:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index bfcac92d5563..509a95621f9f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -843,59 +843,143 @@ static const struct file_operations xt_table_ops = {
.release = seq_release_net,
};
-static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
+/*
+ * Traverse state for ip{,6}_{tables,matches} to help cross
+ * the multi-AF mutexes.
+ */
+struct nf_mttg_trav {
+ struct list_head *head, *curr;
+ uint8_t class, nfproto;
+};
+
+enum {
+ MTTG_TRAV_INIT,
+ MTTG_TRAV_NFP_UNSPEC,
+ MTTG_TRAV_NFP_SPEC,
+ MTTG_TRAV_DONE,
+};
+
+static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
+ bool is_target)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
+ static const uint8_t next_class[] = {
+ [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
+ [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
+ };
+ struct nf_mttg_trav *trav = seq->private;
+
+ switch (trav->class) {
+ case MTTG_TRAV_INIT:
+ trav->class = MTTG_TRAV_NFP_UNSPEC;
+ mutex_lock(&xt[NFPROTO_UNSPEC].mutex);
+ trav->head = trav->curr = is_target ?
+ &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match;
+ break;
+ case MTTG_TRAV_NFP_UNSPEC:
+ trav->curr = trav->curr->next;
+ if (trav->curr != trav->head)
+ break;
+ mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+ mutex_lock(&xt[trav->nfproto].mutex);
+ trav->head = trav->curr = is_target ?
+ &xt[trav->nfproto].target : &xt[trav->nfproto].match;
+ trav->class = next_class[trav->class];
+ break;
+ case MTTG_TRAV_NFP_SPEC:
+ trav->curr = trav->curr->next;
+ if (trav->curr != trav->head)
+ break;
+ /* fallthru, _stop will unlock */
+ default:
+ return NULL;
+ }
- mutex_lock(&xt[af].mutex);
- return seq_list_start(&xt[af].match, *pos);
+ if (ppos != NULL)
+ ++*ppos;
+ return trav;
}
-static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
+ bool is_target)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
+ struct nf_mttg_trav *trav = seq->private;
+ unsigned int j;
- return seq_list_next(v, &xt[af].match, pos);
+ trav->class = MTTG_TRAV_INIT;
+ for (j = 0; j < *pos; ++j)
+ if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL)
+ return NULL;
+ return trav;
}
-static void xt_match_seq_stop(struct seq_file *seq, void *v)
+static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
{
- struct proc_dir_entry *pde = seq->private;
- u_int16_t af = (unsigned long)pde->data;
+ struct nf_mttg_trav *trav = seq->private;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+ break;
+ case MTTG_TRAV_NFP_SPEC:
+ mutex_unlock(&xt[trav->nfproto].mutex);
+ break;
+ }
+}
- mutex_unlock(&xt[af].mutex);
+static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return xt_mttg_seq_start(seq, pos, false);
}
-static int xt_match_seq_show(struct seq_file *seq, void *v)
+static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- struct xt_match *match = list_entry(v, struct xt_match, list);
+ return xt_mttg_seq_next(seq, v, ppos, false);
+}
- if (strlen(match->name))
- return seq_printf(seq, "%s\n", match->name);
- else
- return 0;
+static int xt_match_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_mttg_trav *trav = seq->private;
+ const struct xt_match *match;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ case MTTG_TRAV_NFP_SPEC:
+ if (trav->curr == trav->head)
+ return 0;
+ match = list_entry(trav->curr, struct xt_match, list);
+ return (*match->name == '\0') ? 0 :
+ seq_printf(seq, "%s\n", match->name);
+ }
+ return 0;
}
static const struct seq_operations xt_match_seq_ops = {
.start = xt_match_seq_start,
.next = xt_match_seq_next,
- .stop = xt_match_seq_stop,
+ .stop = xt_mttg_seq_stop,
.show = xt_match_seq_show,
};
static int xt_match_open(struct inode *inode, struct file *file)
{
+ struct seq_file *seq;
+ struct nf_mttg_trav *trav;
int ret;
- ret = seq_open(file, &xt_match_seq_ops);
- if (!ret) {
- struct seq_file *seq = file->private_data;
+ trav = kmalloc(sizeof(*trav), GFP_KERNEL);
+ if (trav == NULL)
+ return -ENOMEM;
- seq->private = PDE(inode);
+ ret = seq_open(file, &xt_match_seq_ops);
+ if (ret < 0) {
+ kfree(trav);
+ return ret;
}
- return ret;
+
+ seq = file->private_data;
+ seq->private = trav;
+ trav->nfproto = (unsigned long)PDE(inode)->data;
+ return 0;
}
static const struct file_operations xt_match_ops = {
@@ -903,62 +987,63 @@ static const struct file_operations xt_match_ops = {
.open = xt_match_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
};
static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
-
- mutex_lock(&xt[af].mutex);
- return seq_list_start(&xt[af].target, *pos);
+ return xt_mttg_seq_start(seq, pos, true);
}
-static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private;
- u_int16_t af = (unsigned long)pde->data;
-
- return seq_list_next(v, &xt[af].target, pos);
-}
-
-static void xt_target_seq_stop(struct seq_file *seq, void *v)
-{
- struct proc_dir_entry *pde = seq->private;
- u_int16_t af = (unsigned long)pde->data;
-
- mutex_unlock(&xt[af].mutex);
+ return xt_mttg_seq_next(seq, v, ppos, true);
}
static int xt_target_seq_show(struct seq_file *seq, void *v)
{
- struct xt_target *target = list_entry(v, struct xt_target, list);
-
- if (strlen(target->name))
- return seq_printf(seq, "%s\n", target->name);
- else
- return 0;
+ const struct nf_mttg_trav *trav = seq->private;
+ const struct xt_target *target;
+
+ switch (trav->class) {
+ case MTTG_TRAV_NFP_UNSPEC:
+ case MTTG_TRAV_NFP_SPEC:
+ if (trav->curr == trav->head)
+ return 0;
+ target = list_entry(trav->curr, struct xt_target, list);
+ return (*target->name == '\0') ? 0 :
+ seq_printf(seq, "%s\n", target->name);
+ }
+ return 0;
}
static const struct seq_operations xt_target_seq_ops = {
.start = xt_target_seq_start,
.next = xt_target_seq_next,
- .stop = xt_target_seq_stop,
+ .stop = xt_mttg_seq_stop,
.show = xt_target_seq_show,
};
static int xt_target_open(struct inode *inode, struct file *file)
{
+ struct seq_file *seq;
+ struct nf_mttg_trav *trav;
int ret;
- ret = seq_open(file, &xt_target_seq_ops);
- if (!ret) {
- struct seq_file *seq = file->private_data;
+ trav = kmalloc(sizeof(*trav), GFP_KERNEL);
+ if (trav == NULL)
+ return -ENOMEM;
- seq->private = PDE(inode);
+ ret = seq_open(file, &xt_target_seq_ops);
+ if (ret < 0) {
+ kfree(trav);
+ return ret;
}
- return ret;
+
+ seq = file->private_data;
+ seq->private = trav;
+ trav->nfproto = (unsigned long)PDE(inode)->data;
+ return 0;
}
static const struct file_operations xt_target_ops = {
@@ -966,7 +1051,7 @@ static const struct file_operations xt_target_ops = {
.open = xt_target_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
};
#define FORMAT_TABLES "_tables_names"
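
The traversal rework above exists because these seq_file handlers now have to visit two lists, the NFPROTO_UNSPEC ("any family") extensions and the per-family ones, taking each list's mutex in turn; the per-open nf_mttg_trav records which leg the cursor is on so _stop() drops the correct mutex. From userspace the result is simply the list of registered matches and targets. A minimal reader sketch (the /proc paths are assumptions based on the FORMAT_* proc-name suffixes this file registers and may differ):

/* Userspace sketch: dump the extension lists exported by the seq_files above. */
#include <stdio.h>

static void dump(const char *path)
{
	char line[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("%s:\n", path);
	while (fgets(line, sizeof(line), f))
		printf("  %s", line);
	fclose(f);
}

int main(void)
{
	dump("/proc/net/ip_tables_matches");
	dump("/proc/net/ip_tables_targets");
	return 0;
}
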
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index fe80b614a400..791e030ea903 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -542,7 +542,7 @@ recent_mt_proc_write(struct file *file, const char __user *input,
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
- union nf_inet_addr addr;
+ union nf_inet_addr addr = {};
u_int16_t family;
bool add, succ;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5b33879c6422..b73d4e61c5ac 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -85,6 +85,7 @@ struct netlink_sock {
#define NETLINK_KERNEL_SOCKET 0x1
#define NETLINK_RECV_PKTINFO 0x2
+#define NETLINK_BROADCAST_SEND_ERROR 0x4
static inline struct netlink_sock *nlk_sk(struct sock *sk)
{
@@ -995,12 +996,15 @@ static inline int do_one_broadcast(struct sock *sk,
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
} else if (sk_filter(sk, p->skb2)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
netlink_overrun(sk);
- p->delivery_failure = 1;
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
} else {
p->congested |= val;
p->delivered = 1;
@@ -1045,10 +1049,9 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
netlink_unlock_table();
- if (info.skb2)
- kfree_skb(info.skb2);
+ kfree_skb(info.skb2);
- if (info.delivery_failure || info.failure)
+ if (info.delivery_failure)
return -ENOBUFS;
if (info.delivered) {
@@ -1088,6 +1091,13 @@ out:
return 0;
}
+/**
+ * netlink_set_err - report error to broadcast listeners
+ * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
+ * @pid: the PID of a process that we want to skip (if any)
+ * @group: the broadcast group that will notice the error
+ * @code: error code, must be negative (as usual in kernelspace)
+ */
void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
{
struct netlink_set_err_data info;
@@ -1097,7 +1107,8 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
info.exclude_sk = ssk;
info.pid = pid;
info.group = group;
- info.code = code;
+ /* sk->sk_err wants a positive error value */
+ info.code = -code;
read_lock(&nl_table_lock);
@@ -1164,6 +1175,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
err = 0;
break;
}
+ case NETLINK_BROADCAST_ERROR:
+ if (val)
+ nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
+ else
+ nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
@@ -1196,6 +1214,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
err = 0;
break;
+ case NETLINK_BROADCAST_ERROR:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
@@ -1522,8 +1550,7 @@ EXPORT_SYMBOL(netlink_set_nonroot);
static void netlink_destroy_callback(struct netlink_callback *cb)
{
- if (cb->skb)
- kfree_skb(cb->skb);
+ kfree_skb(cb->skb);
kfree(cb);
}
@@ -1740,12 +1767,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
exclude_pid = pid;
}
- /* errors reported via destination sk->sk_err */
- nlmsg_multicast(sk, skb, exclude_pid, group, flags);
+ /* errors reported via destination sk->sk_err, but propagate
+ * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
+ err = nlmsg_multicast(sk, skb, exclude_pid, group, flags);
}
- if (report)
- err = nlmsg_unicast(sk, skb, pid);
+ if (report) {
+ int err2;
+
+ err2 = nlmsg_unicast(sk, skb, pid);
+ if (!err || err == -ESRCH)
+ err = err2;
+ }
return err;
}
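
Taken together, the af_netlink.c changes make broadcast delivery failures opt-in: only listeners that set the new NETLINK_BROADCAST_ERROR option cause netlink_broadcast()/nlmsg_notify() to report -ENOBUFS back to the sender. A userspace sketch of a listener opting in (the fallback constant values below are assumptions; prefer the uapi headers where available):

/* Userspace sketch: a netlink listener requesting broadcast delivery errors. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif
#ifndef NETLINK_BROADCAST_ERROR
#define NETLINK_BROADCAST_ERROR 4
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, SOL_NETLINK, NETLINK_BROADCAST_ERROR,
		       &one, sizeof(one)) < 0)
		perror("setsockopt(NETLINK_BROADCAST_ERROR)");
	else
		printf("broadcast delivery errors will now be reported\n");
	close(fd);
	return 0;
}
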
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index cba7849de98e..6d9c58ec56ac 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1037,6 +1037,10 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
unsigned char *asmptr;
int size;
+ /* NetRom empty data frame has no meaning: don't send */
+ if (len == 0)
+ return 0;
+
if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
return -EINVAL;
@@ -1167,6 +1171,11 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
skb_reset_transport_header(skb);
copied = skb->len;
+ /* NetRom empty data frame has no meaning: ignore it */
+ if (copied == 0) {
+ goto out;
+ }
+
if (copied > size) {
copied = size;
msg->msg_flags |= MSG_TRUNC;
@@ -1182,7 +1191,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
msg->msg_namelen = sizeof(*sax);
- skb_free_datagram(sk, skb);
+out: skb_free_datagram(sk, skb);
release_sock(sk);
return copied;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1fc4a7885c41..74776de523ec 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -584,7 +584,7 @@ drop_n_restore:
skb->len = skb_len;
}
drop:
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
@@ -756,8 +756,7 @@ ring_is_full:
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, 0);
- if (copy_skb)
- kfree_skb(copy_skb);
+ kfree_skb(copy_skb);
goto drop_n_restore;
}
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 81795ea87794..a662e62a99cf 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -382,9 +382,8 @@ out:
return NET_RX_DROP;
}
-static struct packet_type phonet_packet_type = {
+static struct packet_type phonet_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_PHONET),
- .dev = NULL,
.func = phonet_rcv,
};
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index 1ceea1f92413..cec4e5951681 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr)
kfree_skb(skb);
goto errout;
}
- err = rtnl_notify(skb, dev_net(dev), 0,
- RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+ rtnl_notify(skb, dev_net(dev), 0,
+ RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+ return;
errout:
if (err < 0)
rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000000..796773b5df9b
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,14 @@
+
+config RDS
+ tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
+ depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
+ depends on INFINIBAND && INFINIBAND_ADDR_TRANS
+ ---help---
+ RDS provides reliable, sequenced delivery of datagrams
+ over Infiniband.
+
+config RDS_DEBUG
+ bool "Debugging messages"
+ depends on RDS
+ default n
+
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000000..51f27585fa08
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,14 @@
+obj-$(CONFIG_RDS) += rds.o
+rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
+ recv.o send.o stats.o sysctl.o threads.o transport.o \
+ loop.o page.o rdma.o \
+ rdma_transport.o \
+ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
+ ib_sysctl.o ib_rdma.o \
+ iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
+ iw_sysctl.o iw_rdma.o
+
+ifeq ($(CONFIG_RDS_DEBUG), y)
+EXTRA_CFLAGS += -DDEBUG
+endif
+
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000000..20cf16fc572f
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/poll.h>
+#include <linux/version.h>
+#include <net/sock.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "rdma_transport.h"
+
+/* this is just used for stats gathering :/ */
+static DEFINE_SPINLOCK(rds_sock_lock);
+static unsigned long rds_sock_count;
+static LIST_HEAD(rds_sock_list);
+DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path. sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ */
+static int rds_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs;
+ unsigned long flags;
+
+ if (sk == NULL)
+ goto out;
+
+ rs = rds_sk_to_rs(sk);
+
+ sock_orphan(sk);
+ /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
+ * that ensures the recv path has completed messing
+ * with the socket. */
+ rds_clear_recv_queue(rs);
+ rds_cong_remove_socket(rs);
+ rds_remove_bound(rs);
+ rds_send_drop_to(rs, NULL);
+ rds_rdma_drop_keys(rs);
+ rds_notify_queue_get(rs, NULL);
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+ list_del_init(&rs->rs_item);
+ rds_sock_count--;
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+
+ sock->sk = NULL;
+ sock_put(sk);
+out:
+ return 0;
+}
+
+/*
+ * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
+ * NB - normally, one would use sk_callback_lock for this, but we can
+ * get here from interrupts, whereas the network code grabs sk_callback_lock
+ * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
+ */
+void rds_wake_sk_sleep(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ __rds_wake_sk_sleep(rds_rs_to_sk(rs));
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+ /* racey, don't care */
+ if (peer) {
+ if (!rs->rs_conn_addr)
+ return -ENOTCONN;
+
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr;
+ } else {
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr;
+ }
+
+ sin->sin_family = AF_INET;
+
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * POLLIN is asserted if
+ * - there is data on the receive queue.
+ * - to signal that a previously congested destination may have become
+ * uncongested
+ * - A notification has been queued to the socket (this can be a congestion
+ * update, or a RDMA completion).
+ *
+ * POLLOUT is asserted if there is room on the send queue. This does not mean
+ * however, that the next sendmsg() call will succeed. If the application tries
+ * to send to a congested destination, the system call may still fail (and
+ * return ENOBUFS).
+ */
+static unsigned int rds_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ unsigned int mask = 0;
+ unsigned long flags;
+
+ poll_wait(file, sk->sk_sleep, wait);
+
+ poll_wait(file, &rds_poll_waitq, wait);
+
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!rs->rs_cong_monitor) {
+ /* When a congestion map was updated, we signal POLLIN for
+ * "historical" reasons. Applications can also poll for
+ * WRBAND instead. */
+ if (rds_cong_updated_since(&rs->rs_cong_track))
+ mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+ } else {
+ spin_lock(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ mask |= (POLLIN | POLLRDNORM);
+ spin_unlock(&rs->rs_lock);
+ }
+ if (!list_empty(&rs->rs_recv_queue)
+ || !list_empty(&rs->rs_notify_queue))
+ mask |= (POLLIN | POLLRDNORM);
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
+ mask |= (POLLOUT | POLLWRNORM);
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ return mask;
+}
+
+static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
+static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
+ int len)
+{
+ struct sockaddr_in sin;
+ int ret = 0;
+
+ /* racing with another thread binding seems ok here */
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (len < sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_from_user(&sin, optval, sizeof(sin))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rds_send_drop_to(rs, &sin);
+out:
+ return ret;
+}
+
+static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+ int optlen)
+{
+ int value;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (get_user(value, (int __user *) optval))
+ return -EFAULT;
+ *optvar = !!value;
+ return 0;
+}
+
+static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
+ int optlen)
+{
+ int ret;
+
+ ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+ if (ret == 0) {
+ if (rs->rs_cong_monitor) {
+ rds_cong_add_socket(rs);
+ } else {
+ rds_cong_remove_socket(rs);
+ rs->rs_cong_mask = 0;
+ rs->rs_cong_notify = 0;
+ }
+ }
+ return ret;
+}
+
+static int rds_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int optlen)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ int ret;
+
+ if (level != SOL_RDS) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ switch (optname) {
+ case RDS_CANCEL_SENT_TO:
+ ret = rds_cancel_sent_to(rs, optval, optlen);
+ break;
+ case RDS_GET_MR:
+ ret = rds_get_mr(rs, optval, optlen);
+ break;
+ case RDS_FREE_MR:
+ ret = rds_free_mr(rs, optval, optlen);
+ break;
+ case RDS_RECVERR:
+ ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
+ break;
+ case RDS_CONG_MONITOR:
+ ret = rds_cong_monitor(rs, optval, optlen);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ }
+out:
+ return ret;
+}
+
+static int rds_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+ int ret = -ENOPROTOOPT, len;
+
+ if (level != SOL_RDS)
+ goto out;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ switch (optname) {
+ case RDS_INFO_FIRST ... RDS_INFO_LAST:
+ ret = rds_info_getsockopt(sock, optname, optval,
+ optlen);
+ break;
+
+ case RDS_RECVERR:
+ if (len < sizeof(int))
+ ret = -EINVAL;
+ else
+ if (put_user(rs->rs_recverr, (int __user *) optval)
+ || put_user(sizeof(int), optlen))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+out:
+ return ret;
+
+}
+
+static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (addr_len != sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (sin->sin_family != AF_INET) {
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ goto out;
+ }
+
+ rs->rs_conn_addr = sin->sin_addr.s_addr;
+ rs->rs_conn_port = sin->sin_port;
+
+out:
+ release_sock(sk);
+ return ret;
+}
+
+static struct proto rds_proto = {
+ .name = "RDS",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct rds_sock),
+};
+
+static struct proto_ops rds_proto_ops = {
+ .family = AF_RDS,
+ .owner = THIS_MODULE,
+ .release = rds_release,
+ .bind = rds_bind,
+ .connect = rds_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = rds_getname,
+ .poll = rds_poll,
+ .ioctl = rds_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = rds_setsockopt,
+ .getsockopt = rds_getsockopt,
+ .sendmsg = rds_sendmsg,
+ .recvmsg = rds_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
+{
+ unsigned long flags;
+ struct rds_sock *rs;
+
+ sock_init_data(sock, sk);
+ sock->ops = &rds_proto_ops;
+ sk->sk_protocol = protocol;
+
+ rs = rds_sk_to_rs(sk);
+ spin_lock_init(&rs->rs_lock);
+ rwlock_init(&rs->rs_recv_lock);
+ INIT_LIST_HEAD(&rs->rs_send_queue);
+ INIT_LIST_HEAD(&rs->rs_recv_queue);
+ INIT_LIST_HEAD(&rs->rs_notify_queue);
+ INIT_LIST_HEAD(&rs->rs_cong_list);
+ spin_lock_init(&rs->rs_rdma_lock);
+ rs->rs_rdma_keys = RB_ROOT;
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+ list_add_tail(&rs->rs_item, &rds_sock_list);
+ rds_sock_count++;
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+
+ return 0;
+}
+
+static int rds_create(struct net *net, struct socket *sock, int protocol)
+{
+ struct sock *sk;
+
+ if (sock->type != SOCK_SEQPACKET || protocol)
+ return -ESOCKTNOSUPPORT;
+
+ sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ return __rds_create(sock, sk, protocol);
+}
+
+void rds_sock_addref(struct rds_sock *rs)
+{
+ sock_hold(rds_rs_to_sk(rs));
+}
+
+void rds_sock_put(struct rds_sock *rs)
+{
+ sock_put(rds_rs_to_sk(rs));
+}
+
+static struct net_proto_family rds_family_ops = {
+ .family = AF_RDS,
+ .create = rds_create,
+ .owner = THIS_MODULE,
+};
+
+static void rds_sock_inc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_sock *rs;
+ struct sock *sk;
+ struct rds_incoming *inc;
+ unsigned long flags;
+ unsigned int total = 0;
+
+ len /= sizeof(struct rds_info_message);
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sk = rds_rs_to_sk(rs);
+ read_lock(&rs->rs_recv_lock);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(inc, iter, inc->i_saddr,
+ rs->rs_bound_addr, 1);
+ }
+
+ read_unlock(&rs->rs_recv_lock);
+ }
+
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_sock_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_info_socket sinfo;
+ struct rds_sock *rs;
+ unsigned long flags;
+
+ len /= sizeof(struct rds_info_socket);
+
+ spin_lock_irqsave(&rds_sock_lock, flags);
+
+ if (len < rds_sock_count)
+ goto out;
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sinfo.sndbuf = rds_sk_sndbuf(rs);
+ sinfo.rcvbuf = rds_sk_rcvbuf(rs);
+ sinfo.bound_addr = rs->rs_bound_addr;
+ sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_port = rs->rs_bound_port;
+ sinfo.connected_port = rs->rs_conn_port;
+ sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+ rds_info_copy(iter, &sinfo, sizeof(sinfo));
+ }
+
+out:
+ lens->nr = rds_sock_count;
+ lens->each = sizeof(struct rds_info_socket);
+
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
+}
+
+static void __exit rds_exit(void)
+{
+ rds_rdma_exit();
+ sock_unregister(rds_family_ops.family);
+ proto_unregister(&rds_proto);
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_sysctl_exit();
+ rds_threads_exit();
+ rds_stats_exit();
+ rds_page_exit();
+ rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+}
+module_exit(rds_exit);
+
+static int __init rds_init(void)
+{
+ int ret;
+
+ ret = rds_conn_init();
+ if (ret)
+ goto out;
+ ret = rds_threads_init();
+ if (ret)
+ goto out_conn;
+ ret = rds_sysctl_init();
+ if (ret)
+ goto out_threads;
+ ret = rds_stats_init();
+ if (ret)
+ goto out_sysctl;
+ ret = proto_register(&rds_proto, 1);
+ if (ret)
+ goto out_stats;
+ ret = sock_register(&rds_family_ops);
+ if (ret)
+ goto out_proto;
+
+ rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
+ rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+
+ /* ib/iwarp transports currently compiled-in */
+ ret = rds_rdma_init();
+ if (ret)
+ goto out_sock;
+ goto out;
+
+out_sock:
+ sock_unregister(rds_family_ops.family);
+out_proto:
+ proto_unregister(&rds_proto);
+out_stats:
+ rds_stats_exit();
+out_sysctl:
+ rds_sysctl_exit();
+out_threads:
+ rds_threads_exit();
+out_conn:
+ rds_conn_exit();
+ rds_cong_exit();
+ rds_page_exit();
+out:
+ return ret;
+}
+module_init(rds_init);
+
+#define DRV_VERSION "4.0"
+#define DRV_RELDATE "Feb 12, 2009"
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
+ " v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NETPROTO(PF_RDS);
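
af_rds.c wires the new address family into the socket layer: SOCK_SEQPACKET only, bind() is mandatory before sending (rds_bind() refuses INADDR_ANY and picks the transport from the local address), there is no listen/accept, and connect() merely records the default destination. A minimal userspace sketch, with the numeric AF_RDS fallback and all addresses being assumptions:

/* Userspace sketch: bind an RDS socket to a local IB/iWARP address and
 * send one datagram to a peer. AF_RDS comes from the patched
 * <linux/socket.h>; 21 is only an assumed fallback. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef AF_RDS
#define AF_RDS 21
#endif

int main(void)
{
	struct sockaddr_in local = { 0 }, peer = { 0 };
	const char msg[] = "hello over RDS";
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

	if (fd < 0) {
		perror("socket(AF_RDS)");
		return 1;
	}

	local.sin_family = AF_INET;
	local.sin_addr.s_addr = inet_addr("192.0.2.1");	/* example local addr */
	local.sin_port = htons(4000);
	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
		perror("bind");
		return 1;
	}

	peer.sin_family = AF_INET;
	peer.sin_addr.s_addr = inet_addr("192.0.2.2");	/* example peer addr */
	peer.sin_port = htons(4000);
	if (sendto(fd, msg, sizeof(msg), 0,
		   (struct sockaddr *)&peer, sizeof(peer)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}
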
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000000..c17cc39160ce
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include "rds.h"
+
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
+ * particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
+ */
+static DEFINE_SPINLOCK(rds_bind_lock);
+static struct rb_root rds_bind_tree = RB_ROOT;
+
+static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
+ struct rds_sock *insert)
+{
+ struct rb_node **p = &rds_bind_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_sock *rs;
+ u64 cmp;
+ u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
+
+ while (*p) {
+ parent = *p;
+ rs = rb_entry(parent, struct rds_sock, rs_bound_node);
+
+ cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
+ be16_to_cpu(rs->rs_bound_port);
+
+ if (needle < cmp)
+ p = &(*p)->rb_left;
+ else if (needle > cmp)
+ p = &(*p)->rb_right;
+ else
+ return rs;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->rs_bound_node, parent, p);
+ rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+ }
+ return NULL;
+}
+
+/*
+ * Return the rds_sock bound at the given local address.
+ *
+ * The rx path can race with rds_release. We notice if rds_release() has
+ * marked this socket and don't return a rs ref to the rx path.
+ */
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+{
+ struct rds_sock *rs;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+ rs = rds_bind_tree_walk(addr, port, NULL);
+ if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
+ rds_sock_addref(rs);
+ else
+ rs = NULL;
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+ rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
+ ntohs(port));
+ return rs;
+}
+
+/* returns -ve errno or +ve port */
+static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+{
+ unsigned long flags;
+ int ret = -EADDRINUSE;
+ u16 rover, last;
+
+ if (*port != 0) {
+ rover = be16_to_cpu(*port);
+ last = rover;
+ } else {
+ rover = max_t(u16, net_random(), 2);
+ last = rover - 1;
+ }
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+
+ do {
+ if (rover == 0)
+ rover++;
+ if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
+ *port = cpu_to_be16(rover);
+ ret = 0;
+ break;
+ }
+ } while (rover++ != last);
+
+ if (ret == 0) {
+ rs->rs_bound_addr = addr;
+ rs->rs_bound_port = *port;
+ rds_sock_addref(rs);
+
+ rdsdebug("rs %p binding to %pI4:%d\n",
+ rs, &addr, (int)ntohs(*port));
+ }
+
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+ return ret;
+}
+
+void rds_remove_bound(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_bind_lock, flags);
+
+ if (rs->rs_bound_addr) {
+ rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port));
+
+ rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+ rds_sock_put(rs);
+ rs->rs_bound_addr = 0;
+ }
+
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
+}
+
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct sock *sk = sock->sk;
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct rds_transport *trans;
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (addr_len != sizeof(struct sockaddr_in) ||
+ sin->sin_family != AF_INET ||
+ rs->rs_bound_addr ||
+ sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ if (ret)
+ goto out;
+
+ trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+ if (trans == NULL) {
+ ret = -EADDRNOTAVAIL;
+ rds_remove_bound(rs);
+ goto out;
+ }
+
+ rs->rs_transport = trans;
+ ret = 0;
+
+out:
+ release_sock(sk);
+ return ret;
+}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000000..710e4599d76c
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+#include <asm-generic/bitops/le.h>
+
+#include "rds.h"
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the socket's SO_RCVBUF option value. Only the payload bytes in the
+ * message are accounted for. If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested. All sends attempted to this socket's
+ * address should either block or return -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs. An application encountering this "back-pressure" is
+ * considered a bug.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested. As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up. This avoids
+ * allocation in the interrupt handling path which queues messages on sockets.
+ * The dense bitmaps let transports send the entire bitmap on any bitmap change
+ * reasonably efficiently. This is much easier to implement than some
+ * finer-grained communication of per-port congestion. The sender does a very
+ * inexpensive bit test to check whether the port it's about to send to is
+ * congested or not.
+ */
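+
+/*
+ * Concretely, a bound port maps to a single bit in the map: the page index
+ * is port / RDS_CONG_MAP_PAGE_BITS and the offset within that page is
+ * port % RDS_CONG_MAP_PAGE_BITS, exactly as rds_cong_set_bit() does below.
+ */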
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rds_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t rds_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ */
+static LIST_HEAD(rds_cong_monitor);
+static DEFINE_RWLOCK(rds_cong_monitor_lock);
+
+/*
+ * Yes, a global lock. It's used so infrequently that it's worth keeping it
+ * global to simplify the locking. It's only used in the following
+ * circumstances:
+ *
+ * - on connection buildup to associate a conn with its maps
+ * - on map changes to inform conns of a new map to send
+ *
+ * It's sadly ordered under the socket callback lock and the connection lock.
+ * Receive paths can mark ports congested from interrupt context so the
+ * lock masks interrupts.
+ */
+static DEFINE_SPINLOCK(rds_cong_lock);
+static struct rb_root rds_cong_tree = RB_ROOT;
+
+static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+ struct rds_cong_map *insert)
+{
+ struct rb_node **p = &rds_cong_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_cong_map *map;
+
+ while (*p) {
+ parent = *p;
+ map = rb_entry(parent, struct rds_cong_map, m_rb_node);
+
+ if (addr < map->m_addr)
+ p = &(*p)->rb_left;
+ else if (addr > map->m_addr)
+ p = &(*p)->rb_right;
+ else
+ return map;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->m_rb_node, parent, p);
+ rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
+ }
+ return NULL;
+}
+
+/*
+ * There is only ever one bitmap for any address. Connections try to allocate
+ * these bitmaps in the process of getting pointers to them. The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ */
+static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+{
+ struct rds_cong_map *map;
+ struct rds_cong_map *ret = NULL;
+ unsigned long zp;
+ unsigned long i;
+ unsigned long flags;
+
+ map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
+ if (map == NULL)
+ return NULL;
+
+ map->m_addr = addr;
+ init_waitqueue_head(&map->m_waitq);
+ INIT_LIST_HEAD(&map->m_conn_list);
+
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ zp = get_zeroed_page(GFP_KERNEL);
+ if (zp == 0)
+ goto out;
+ map->m_page_addrs[i] = zp;
+ }
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ ret = rds_cong_tree_walk(addr, map);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (ret == NULL) {
+ ret = map;
+ map = NULL;
+ }
+
+out:
+ if (map) {
+ for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+ free_page(map->m_page_addrs[i]);
+ kfree(map);
+ }
+
+ rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+
+ return ret;
+}
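+
+/*
+ * Note the allocation pattern above: the map is allocated optimistically
+ * without the lock held, then rds_cong_tree_walk() inserts it under
+ * rds_cong_lock. If another CPU won the race and inserted a map for the
+ * same address first, ours is freed and the existing one is returned.
+ */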
+
+/*
+ * Put the conn on its local map's list. This is called when the conn is
+ * really added to the hash. It's nested under the rds_conn_lock, sadly.
+ */
+void rds_cong_add_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_remove_conn(struct rds_connection *conn)
+{
+ unsigned long flags;
+
+ rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ list_del_init(&conn->c_map_item);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+int rds_cong_get_maps(struct rds_connection *conn)
+{
+ conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
+ conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+
+ if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rds_cong_queue_updates(struct rds_cong_map *map)
+{
+ struct rds_connection *conn;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_cong_lock, flags);
+
+ list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
+ if (!test_and_set_bit(0, &conn->c_map_queued)) {
+ rds_stats_inc(s_cong_update_queued);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
+{
+ rdsdebug("waking map %p for %pI4\n",
+ map, &map->m_addr);
+ rds_stats_inc(s_cong_update_received);
+ atomic_inc(&rds_cong_generation);
+ if (waitqueue_active(&map->m_waitq))
+ wake_up(&map->m_waitq);
+ if (waitqueue_active(&rds_poll_waitq))
+ wake_up_all(&rds_poll_waitq);
+
+ if (portmask && !list_empty(&rds_cong_monitor)) {
+ unsigned long flags;
+ struct rds_sock *rs;
+
+ read_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
+ spin_lock(&rs->rs_lock);
+ rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+ rs->rs_cong_mask &= ~portmask;
+ spin_unlock(&rs->rs_lock);
+ if (rs->rs_cong_notify)
+ rds_wake_sk_sleep(rs);
+ }
+ read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+ }
+}
+
+int rds_cong_updated_since(unsigned long *recent)
+{
+ unsigned long gen = atomic_read(&rds_cong_generation);
+
+ if (likely(*recent == gen))
+ return 0;
+ *recent = gen;
+ return 1;
+}
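+
+/*
+ * A minimal sketch of how a poller might consume the generation counter
+ * (the name of the per-socket saved-generation field is assumed here):
+ *
+ *	if (rds_cong_updated_since(&rs->rs_cong_track))
+ *		mask |= (POLLIN | POLLRDNORM);
+ */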
+
+/*
+ * We're called under the locking that protects the socket's receive buffer
+ * consumption. This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ rdsdebug("setting congestion for %pI4:%u in map %p\n",
+ &map->m_addr, ntohs(port), map);
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ rdsdebug("clearing congestion for %pI4:%u in map %p\n",
+ &map->m_addr, ntohs(port), map);
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
+}
+
+static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
+{
+ unsigned long i;
+ unsigned long off;
+
+ i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+ off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+ return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_add_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ if (list_empty(&rs->rs_cong_list))
+ list_add(&rs->rs_cong_list, &rds_cong_monitor);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+}
+
+void rds_cong_remove_socket(struct rds_sock *rs)
+{
+ unsigned long flags;
+ struct rds_cong_map *map;
+
+ write_lock_irqsave(&rds_cong_monitor_lock, flags);
+ list_del_init(&rs->rs_cong_list);
+ write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+
+ /* update congestion map for now-closed port */
+ spin_lock_irqsave(&rds_cong_lock, flags);
+ map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+ spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+ if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+ rds_cong_clear_bit(map, rs->rs_bound_port);
+ rds_cong_queue_updates(map);
+ }
+}
+
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
+ struct rds_sock *rs)
+{
+ if (!rds_cong_test_bit(map, port))
+ return 0;
+ if (nonblock) {
+ if (rs && rs->rs_cong_monitor) {
+ unsigned long flags;
+
+ /* It would have been nice to have an atomic set_bit on
+ * a uint64_t. */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ /* Test again - a congestion update may have arrived in
+ * the meantime. */
+ if (!rds_cong_test_bit(map, port))
+ return 0;
+ }
+ rds_stats_inc(s_cong_send_error);
+ return -ENOBUFS;
+ }
+
+ rds_stats_inc(s_cong_send_blocked);
+ rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
+
+ return wait_event_interruptible(map->m_waitq,
+ !rds_cong_test_bit(map, port));
+}
+
+void rds_cong_exit(void)
+{
+ struct rb_node *node;
+ struct rds_cong_map *map;
+ unsigned long i;
+
+ while ((node = rb_first(&rds_cong_tree))) {
+ map = rb_entry(node, struct rds_cong_map, m_rb_node);
+ rdsdebug("freeing map %p\n", map);
+ rb_erase(&map->m_rb_node, &rds_cong_tree);
+ for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+ free_page(map->m_page_addrs[i]);
+ kfree(map);
+ }
+}
+
+/*
+ * Allocate an RDS message containing a congestion update.
+ */
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
+{
+ struct rds_cong_map *map = conn->c_lcong;
+ struct rds_message *rm;
+
+ rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
+ if (!IS_ERR(rm))
+ rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
+
+ return rm;
+}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000000..273f064930a8
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <net/inet_hashtables.h>
+
+#include "rds.h"
+#include "loop.h"
+#include "rdma.h"
+
+#define RDS_CONNECTION_HASH_BITS 12
+#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
+#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
+
+/* converting this to RCU is a chore for another day.. */
+static DEFINE_SPINLOCK(rds_conn_lock);
+static unsigned long rds_conn_count;
+static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
+static struct kmem_cache *rds_conn_slab;
+
+static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+{
+ /* Pass NULL, don't need struct net for hash */
+ unsigned long hash = inet_ehashfn(NULL,
+ be32_to_cpu(laddr), 0,
+ be32_to_cpu(faddr), 0);
+ return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
+}
+
+#define rds_conn_info_set(var, test, suffix) do { \
+ if (test) \
+ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
+} while (0)
+
+static inline int rds_conn_is_sending(struct rds_connection *conn)
+{
+ int ret = 0;
+
+ if (!mutex_trylock(&conn->c_send_lock))
+ ret = 1;
+ else
+ mutex_unlock(&conn->c_send_lock);
+
+ return ret;
+}
+
+static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+ __be32 laddr, __be32 faddr,
+ struct rds_transport *trans)
+{
+ struct rds_connection *conn, *ret = NULL;
+ struct hlist_node *pos;
+
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
+ if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
+ conn->c_trans == trans) {
+ ret = conn;
+ break;
+ }
+ }
+ rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
+ &laddr, &faddr);
+ return ret;
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future. It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+void rds_conn_reset(struct rds_connection *conn)
+{
+ rdsdebug("connection %pI4 to %pI4 reset\n",
+ &conn->c_laddr, &conn->c_faddr);
+
+ rds_stats_inc(s_conn_reset);
+ rds_send_reset(conn);
+ conn->c_flags = 0;
+
+ /* Do not clear next_rx_seq here, else we cannot distinguish
+ * retransmitted packets from new packets, and will hand all
+ * of them to the application. That is not consistent with the
+ * reliability guarantees of RDS. */
+}
+
+/*
+ * There is only ever one 'conn' for a given pair of addresses in the
+ * system at a time. They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created. They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int is_outgoing)
+{
+ struct rds_connection *conn, *tmp, *parent = NULL;
+ struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ conn = rds_conn_lookup(head, laddr, faddr, trans);
+ if (conn
+ && conn->c_loopback
+ && conn->c_trans != &rds_loop_transport
+ && !is_outgoing) {
+ /* This is a looped back IB connection, and we're
+ * called by the code handling the incoming connect.
+ * We need a second connection object into which we
+ * can stick the other QP. */
+ parent = conn;
+ conn = parent->c_passive;
+ }
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+ if (conn)
+ goto out;
+
+ conn = kmem_cache_alloc(rds_conn_slab, gfp);
+ if (conn == NULL) {
+ conn = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ memset(conn, 0, sizeof(*conn));
+
+ INIT_HLIST_NODE(&conn->c_hash_node);
+ conn->c_version = RDS_PROTOCOL_3_0;
+ conn->c_laddr = laddr;
+ conn->c_faddr = faddr;
+ spin_lock_init(&conn->c_lock);
+ conn->c_next_tx_seq = 1;
+
+ mutex_init(&conn->c_send_lock);
+ INIT_LIST_HEAD(&conn->c_send_queue);
+ INIT_LIST_HEAD(&conn->c_retrans);
+
+ ret = rds_cong_get_maps(conn);
+ if (ret) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ /*
+ * This is where a connection becomes loopback. If *any* RDS sockets
+	 * can bind to the destination address then we'd rather the messages
+	 * flow over loopback than through either transport.
+ */
+ if (rds_trans_get_preferred(faddr)) {
+ conn->c_loopback = 1;
+ if (is_outgoing && trans->t_prefer_loopback) {
+ /* "outgoing" connection - and the transport
+ * says it wants the connection handled by the
+ * loopback transport. This is what TCP does.
+ */
+ trans = &rds_loop_transport;
+ }
+ }
+
+ conn->c_trans = trans;
+
+ ret = trans->conn_alloc(conn, gfp);
+ if (ret) {
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(ret);
+ goto out;
+ }
+
+ atomic_set(&conn->c_state, RDS_CONN_DOWN);
+ conn->c_reconnect_jiffies = 0;
+ INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
+ INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
+ INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
+ INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
+ mutex_init(&conn->c_cm_lock);
+ conn->c_flags = 0;
+
+ rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
+ conn, &laddr, &faddr,
+ trans->t_name ? trans->t_name : "[unknown]",
+ is_outgoing ? "(outgoing)" : "");
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+ if (parent == NULL) {
+ tmp = rds_conn_lookup(head, laddr, faddr, trans);
+ if (tmp == NULL)
+ hlist_add_head(&conn->c_hash_node, head);
+ } else {
+ tmp = parent->c_passive;
+ if (!tmp)
+ parent->c_passive = conn;
+ }
+
+ if (tmp) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = tmp;
+ } else {
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
+ }
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+out:
+ return conn;
+}
+
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp)
+{
+ return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+}
+
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp)
+{
+ return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+}
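+
+/*
+ * As a rough usage sketch: a transport's incoming-connect handler calls
+ * rds_conn_create() (see rds_ib_cm_handle_connect() in ib_cm.c below),
+ * while locally initiated traffic uses rds_conn_create_outgoing(); both
+ * funnel into __rds_conn_create() above.
+ */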
+
+void rds_conn_destroy(struct rds_connection *conn)
+{
+ struct rds_message *rm, *rtmp;
+
+ rdsdebug("freeing conn %p for %pI4 -> "
+ "%pI4\n", conn, &conn->c_laddr,
+ &conn->c_faddr);
+
+ hlist_del_init(&conn->c_hash_node);
+
+ /* wait for the rds thread to shut it down */
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ cancel_delayed_work(&conn->c_conn_w);
+ queue_work(rds_wq, &conn->c_down_w);
+ flush_workqueue(rds_wq);
+
+ /* tear down queued messages */
+ list_for_each_entry_safe(rm, rtmp,
+ &conn->c_send_queue,
+ m_conn_item) {
+ list_del_init(&rm->m_conn_item);
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ rds_message_put(rm);
+ }
+ if (conn->c_xmit_rm)
+ rds_message_put(conn->c_xmit_rm);
+
+ conn->c_trans->conn_free(conn->c_transport_data);
+
+ /*
+ * The congestion maps aren't freed up here. They're
+ * freed by rds_cong_exit() after all the connections
+ * have been freed.
+ */
+ rds_cong_remove_conn(conn);
+
+ BUG_ON(!list_empty(&conn->c_retrans));
+ kmem_cache_free(rds_conn_slab, conn);
+
+ rds_conn_count--;
+}
+
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct list_head *list;
+ struct rds_connection *conn;
+ struct rds_message *rm;
+ unsigned long flags;
+ unsigned int total = 0;
+ size_t i;
+
+ len /= sizeof(struct rds_info_message);
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
+ if (want_send)
+ list = &conn->c_send_queue;
+ else
+ list = &conn->c_retrans;
+
+ spin_lock(&conn->c_lock);
+
+ /* XXX too lazy to maintain counts.. */
+ list_for_each_entry(rm, list, m_conn_item) {
+ total++;
+ if (total <= len)
+ rds_inc_info_copy(&rm->m_inc, iter,
+ conn->c_laddr,
+ conn->c_faddr, 0);
+ }
+
+ spin_unlock(&conn->c_lock);
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_conn_message_info(sock, len, iter, lens, 1);
+}
+
+static void rds_conn_message_info_retrans(struct socket *sock,
+ unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_conn_message_info(sock, len, iter, lens, 0);
+}
+
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ size_t item_len)
+{
+ uint64_t buffer[(item_len + 7) / 8];
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct hlist_node *tmp;
+ struct rds_connection *conn;
+ unsigned long flags;
+ size_t i;
+
+ spin_lock_irqsave(&rds_conn_lock, flags);
+
+ lens->nr = 0;
+ lens->each = item_len;
+
+ for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+ i++, head++) {
+ hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+
+ /* XXX no c_lock usage.. */
+ if (!visitor(conn, buffer))
+ continue;
+
+ /* We copy as much as we can fit in the buffer,
+ * but we count all items so that the caller
+ * can resize the buffer. */
+ if (len >= item_len) {
+ rds_info_copy(iter, buffer, item_len);
+ len -= item_len;
+ }
+ lens->nr++;
+ }
+ }
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
+}
+
+static int rds_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_connection *cinfo = buffer;
+
+ cinfo->next_tx_seq = conn->c_next_tx_seq;
+ cinfo->next_rx_seq = conn->c_next_rx_seq;
+ cinfo->laddr = conn->c_laddr;
+ cinfo->faddr = conn->c_faddr;
+ strncpy(cinfo->transport, conn->c_trans->t_name,
+ sizeof(cinfo->transport));
+ cinfo->flags = 0;
+
+ rds_conn_info_set(cinfo->flags,
+ rds_conn_is_sending(conn), SENDING);
+ /* XXX Future: return the state rather than these funky bits */
+ rds_conn_info_set(cinfo->flags,
+ atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+ CONNECTING);
+ rds_conn_info_set(cinfo->flags,
+ atomic_read(&conn->c_state) == RDS_CONN_UP,
+ CONNECTED);
+ return 1;
+}
+
+static void rds_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_conn_info_visitor,
+ sizeof(struct rds_info_connection));
+}
+
+int __init rds_conn_init(void)
+{
+ rds_conn_slab = kmem_cache_create("rds_connection",
+ sizeof(struct rds_connection),
+ 0, 0, NULL);
+ if (rds_conn_slab == NULL)
+ return -ENOMEM;
+
+ rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_register_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+
+ return 0;
+}
+
+void rds_conn_exit(void)
+{
+ rds_loop_exit();
+
+ WARN_ON(!hlist_empty(rds_conn_hash));
+
+ kmem_cache_destroy(rds_conn_slab);
+
+ rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+ rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
+ rds_conn_message_info_send);
+ rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
+ rds_conn_message_info_retrans);
+}
+
+/*
+ * Force a disconnect
+ */
+void rds_conn_drop(struct rds_connection *conn)
+{
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+
+ rds_conn_drop(conn);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 000000000000..06a7b798d9a7
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+
+#include "rds.h"
+#include "ib.h"
+
+unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
+module_param(fmr_message_size, int, 0444);
+MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+
+struct list_head rds_ib_devices;
+
+DEFINE_SPINLOCK(ib_nodev_conns_lock);
+LIST_HEAD(ib_nodev_conns);
+
+void rds_ib_add_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct ib_device_attr *dev_attr;
+
+ /* Only handle IB (no iWARP) devices */
+ if (device->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ rdsdebug("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+ if (!rds_ibdev)
+ goto free_attr;
+
+ spin_lock_init(&rds_ibdev->spinlock);
+
+ rds_ibdev->max_wrs = dev_attr->max_qp_wr;
+ rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+
+ rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
+ rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
+ rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
+	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr ?: 32;
+ rds_ibdev->max_fmrs = dev_attr->max_fmr ?
+ min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
+ fmr_pool_size;
+
+ rds_ibdev->dev = device;
+ rds_ibdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_ibdev->pd))
+ goto free_dev;
+
+ rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_ibdev->mr))
+ goto err_pd;
+
+ rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
+ if (IS_ERR(rds_ibdev->mr_pool)) {
+ rds_ibdev->mr_pool = NULL;
+ goto err_mr;
+ }
+
+ INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+ list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+ ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+
+ goto free_attr;
+
+err_mr:
+ ib_dereg_mr(rds_ibdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_ibdev->pd);
+free_dev:
+ kfree(rds_ibdev);
+free_attr:
+ kfree(dev_attr);
+}
+
+void rds_ib_remove_one(struct ib_device *device)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
+
+ rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+ if (!rds_ibdev)
+ return;
+
+ list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ }
+
+ rds_ib_remove_conns(rds_ibdev);
+
+ if (rds_ibdev->mr_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+
+ ib_dereg_mr(rds_ibdev->mr);
+
+ while (ib_dealloc_pd(rds_ibdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_ibdev->list);
+ kfree(rds_ibdev);
+}
+
+struct ib_client rds_ib_client = {
+ .name = "rds_ib",
+ .add = rds_ib_add_one,
+ .remove = rds_ib_remove_one
+};
+
+static int rds_ib_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_rdma_connection *iinfo = buffer;
+ struct rds_ib_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_ib_transport)
+ return 0;
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+ memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_ib_device *rds_ibdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_ibdev->max_sge;
+ rds_ib_get_mr_info(rds_ibdev, iinfo);
+ }
+ return 1;
+}
+
+static void rds_ib_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_ib_conn_info_visitor,
+ sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_ib_laddr_check(__be32 addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /* due to this, we will claim to support iWARP devices unless we
+ check node_type. */
+ if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI4 ret %d node type %d\n",
+ &addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+void rds_ib_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+ rds_ib_remove_nodev_conns();
+ ib_unregister_client(&rds_ib_client);
+ rds_ib_sysctl_exit();
+ rds_ib_recv_exit();
+ rds_trans_unregister(&rds_ib_transport);
+}
+
+struct rds_transport rds_ib_transport = {
+ .laddr_check = rds_ib_laddr_check,
+ .xmit_complete = rds_ib_xmit_complete,
+ .xmit = rds_ib_xmit,
+ .xmit_cong_map = NULL,
+ .xmit_rdma = rds_ib_xmit_rdma,
+ .recv = rds_ib_recv,
+ .conn_alloc = rds_ib_conn_alloc,
+ .conn_free = rds_ib_conn_free,
+ .conn_connect = rds_ib_conn_connect,
+ .conn_shutdown = rds_ib_conn_shutdown,
+ .inc_copy_to_user = rds_ib_inc_copy_to_user,
+ .inc_purge = rds_ib_inc_purge,
+ .inc_free = rds_ib_inc_free,
+ .cm_initiate_connect = rds_ib_cm_initiate_connect,
+ .cm_handle_connect = rds_ib_cm_handle_connect,
+ .cm_connect_complete = rds_ib_cm_connect_complete,
+ .stats_info_copy = rds_ib_stats_info_copy,
+ .exit = rds_ib_exit,
+ .get_mr = rds_ib_get_mr,
+ .sync_mr = rds_ib_sync_mr,
+ .free_mr = rds_ib_free_mr,
+ .flush_mrs = rds_ib_flush_mrs,
+ .t_owner = THIS_MODULE,
+ .t_name = "infiniband",
+};
+
+int __init rds_ib_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&rds_ib_devices);
+
+ ret = ib_register_client(&rds_ib_client);
+ if (ret)
+ goto out;
+
+ ret = rds_ib_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rds_ib_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rds_trans_register(&rds_ib_transport);
+ if (ret)
+ goto out_recv;
+
+ rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+
+ goto out;
+
+out_recv:
+ rds_ib_recv_exit();
+out_sysctl:
+ rds_ib_sysctl_exit();
+out_ibreg:
+ ib_unregister_client(&rds_ib_client);
+out:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 000000000000..8be563a1363a
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,367 @@
+#ifndef _RDS_IB_H
+#define _RDS_IB_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FMR_SIZE 256
+#define RDS_FMR_POOL_SIZE 4096
+
+#define RDS_IB_MAX_SGE 8
+#define RDS_IB_RECV_SGE 2
+
+#define RDS_IB_DEFAULT_RECV_WR 1024
+#define RDS_IB_DEFAULT_SEND_WR 256
+
+#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
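+/* i.e. bit N set means minor version N is supported; this mask is ANDed
+ * against the peer's dp_protocol_minor_mask in rds_ib_protocol_compatible()
+ * (see ib_cm.c below). */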
+
+extern struct list_head rds_ib_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
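+/* For example, assuming 4 KiB pages and (hypothetically) 2 KiB fragments,
+ * RDS_PAGE_LAST_OFF = ((4096 / 2048) - 1) * 2048 = 2048, i.e. the last
+ * fragment that still fits starts at offset 2048. */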
+struct rds_page_frag {
+ struct list_head f_item;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
+};
+
+struct rds_ib_incoming {
+ struct list_head ii_frags;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ u8 dp_protocol_major;
+ u8 dp_protocol_minor;
+ __be16 dp_protocol_minor_mask; /* bitmask */
+ __be32 dp_reserved1;
+ __be64 dp_ack_seq;
+ __be32 dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_ib_send_work {
+ struct rds_message *s_rm;
+ struct rds_rdma_op *s_op;
+ struct ib_send_wr s_wr;
+ struct ib_sge s_sge[RDS_IB_MAX_SGE];
+ unsigned long s_queued;
+};
+
+struct rds_ib_recv_work {
+ struct rds_ib_incoming *r_ibinc;
+ struct rds_page_frag *r_frag;
+ struct ib_recv_wr r_wr;
+ struct ib_sge r_sge[2];
+};
+
+struct rds_ib_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rds_ib_device;
+
+struct rds_ib_connection {
+
+ struct list_head ib_node;
+ struct rds_ib_device *rds_ibdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rds_ib_work_ring i_send_ring;
+ struct rds_message *i_rm;
+ struct rds_header *i_send_hdrs;
+ u64 i_send_hdrs_dma;
+ struct rds_ib_send_work *i_sends;
+
+ /* rx */
+ struct mutex i_recv_mutex;
+ struct rds_ib_work_ring i_recv_ring;
+ struct rds_ib_incoming *i_ibinc;
+ u32 i_recv_data_rem;
+ struct rds_header *i_recv_hdrs;
+ u64 i_recv_hdrs_dma;
+ struct rds_ib_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
+ u64 i_ack_recv; /* last ACK received */
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+ u64 i_ack_next; /* next ACK to send */
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ u64 i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+	 * Our algorithm uses a pair of variables that we need to access
+	 * atomically - one for the send credits, and one for the posted
+	 * recv credits we need to transfer to the remote.
+ * Rather than protect them using a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
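+/* Example: if atomic_read(&ic->i_credits) returns 0x00050003, then
+ * IB_GET_SEND_CREDITS() yields 3 send credits and IB_GET_POST_CREDITS()
+ * yields 5 posted-receive credits to advertise to the peer. */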
+
+struct rds_ib_ipaddr {
+ struct list_head list;
+ __be32 ipaddr;
+};
+
+struct rds_ib_device {
+ struct list_head list;
+ struct list_head ipaddr_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct rds_ib_mr_pool *mr_pool;
+ int fmr_page_shift;
+ int fmr_page_size;
+ u64 fmr_page_mask;
+ unsigned int fmr_max_remaps;
+ unsigned int max_fmrs;
+ int max_sge;
+ unsigned int max_wrs;
+ spinlock_t spinlock; /* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IB_ACK_WR_ID (~(u64) 0)
+
+struct rds_ib_statistics {
+ uint64_t s_ib_connect_raced;
+ uint64_t s_ib_listen_closed_stale;
+ uint64_t s_ib_tx_cq_call;
+ uint64_t s_ib_tx_cq_event;
+ uint64_t s_ib_tx_ring_full;
+ uint64_t s_ib_tx_throttle;
+ uint64_t s_ib_tx_sg_mapping_failure;
+ uint64_t s_ib_tx_stalled;
+ uint64_t s_ib_tx_credit_updates;
+ uint64_t s_ib_rx_cq_call;
+ uint64_t s_ib_rx_cq_event;
+ uint64_t s_ib_rx_ring_empty;
+ uint64_t s_ib_rx_refill_from_cq;
+ uint64_t s_ib_rx_refill_from_thread;
+ uint64_t s_ib_rx_alloc_limit;
+ uint64_t s_ib_rx_credit_updates;
+ uint64_t s_ib_ack_sent;
+ uint64_t s_ib_ack_send_failure;
+ uint64_t s_ib_ack_send_delayed;
+ uint64_t s_ib_ack_send_piggybacked;
+ uint64_t s_ib_ack_received;
+ uint64_t s_ib_rdma_mr_alloc;
+ uint64_t s_ib_rdma_mr_free;
+ uint64_t s_ib_rdma_mr_used;
+ uint64_t s_ib_rdma_mr_pool_flush;
+ uint64_t s_ib_rdma_mr_pool_wait;
+ uint64_t s_ib_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_ib_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_cpu(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
+
+static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_device(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
+
+
+/* ib.c */
+extern struct rds_transport rds_ib_transport;
+extern void rds_ib_add_one(struct ib_device *device);
+extern void rds_ib_remove_one(struct ib_device *device);
+extern struct ib_client rds_ib_client;
+
+extern unsigned int fmr_pool_size;
+extern unsigned int fmr_message_size;
+
+extern spinlock_t ib_nodev_conns_lock;
+extern struct list_head ib_nodev_conns;
+
+/* ib_cm.c */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_ib_conn_free(void *arg);
+int rds_ib_conn_connect(struct rds_connection *conn);
+void rds_ib_conn_shutdown(struct rds_connection *conn);
+void rds_ib_state_change(struct sock *sk);
+int __init rds_ib_listen_init(void);
+void rds_ib_listen_stop(void);
+void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_ib_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+
+#define rds_ib_conn_error(conn, fmt...) \
+ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
+
+/* ib_rdma.c */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_nodev_conns(void);
+void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+
+/* ib_recv.c */
+int __init rds_ib_recv_init(void);
+void rds_ib_recv_exit(void);
+int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_ib_inc_purge(struct rds_incoming *inc);
+void rds_ib_inc_free(struct rds_incoming *inc);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
+void rds_ib_attempt_ack(struct rds_ib_connection *ic);
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+
+/* ib_ring.c */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
+int rds_ib_ring_low(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_ib_ring_empty_wait;
+
+/* ib_send.c */
+void rds_ib_xmit_complete(struct rds_connection *conn);
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_send_init_ring(struct rds_ib_connection *ic);
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
+#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int __init rds_ib_sysctl_init(void);
+void rds_ib_sysctl_exit(void);
+extern unsigned long rds_ib_sysctl_max_send_wr;
+extern unsigned long rds_ib_sysctl_max_recv_wr;
+extern unsigned long rds_ib_sysctl_max_unsig_wrs;
+extern unsigned long rds_ib_sysctl_max_unsig_bytes;
+extern unsigned long rds_ib_sysctl_max_recv_allocation;
+extern unsigned int rds_ib_sysctl_flow_control;
+extern ctl_table rds_ib_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
+
+static inline void rds_ib_set_64bit(u64 *ptr, u64 val)
+{
+#if BITS_PER_LONG == 64
+ *ptr = val;
+#else
+ set_64bit(ptr, val);
+#endif
+}
+
+#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 000000000000..0532237bd128
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/vmalloc.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (rds_ib_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rds_ib_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack space
+ * by allocating this twice.
+ */
+static void
+rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
+{
+ int ret;
+
+ attr->min_rnr_timer = IB_RNR_TIMER_000_32;
+ ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
+ if (ret)
+ printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connections.
+ */
+void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ const struct rds_ib_connect_private *dp = NULL;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev;
+ struct ib_qp_attr qp_attr;
+ int err;
+
+ if (event->param.conn.private_data_len) {
+ dp = event->param.conn.private_data;
+
+ rds_ib_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
+
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_laddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+ /* Tune RNR behavior */
+ rds_ib_tune_rnr(ic, &qp_attr);
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ if (err)
+ printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
+
+ /* update ib_device with this local ipaddr & conn */
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+ if (err)
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
+ err = rds_ib_add_conn(rds_ibdev, conn);
+ if (err)
+ printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ rds_connect_complete(conn);
+}
+
+static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rds_ib_connect_private *dp,
+ u32 protocol_version)
+{
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+ conn_param->retry_count = 7;
+ conn_param->rnr_retry_count = 7;
+
+ if (dp) {
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ memset(dp, 0, sizeof(*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+ dp->dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof(*dp);
+ }
+}
+
+static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
+{
+ rdsdebug("event %u data %p\n", event->event, data);
+}
+
+static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rds_connection *conn = data;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ default:
+ printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
+ "on connection to %pI4\n", event->event,
+ &conn->c_faddr);
+ break;
+ }
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_ib_setup_qp(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rds_ib_device *rds_ibdev;
+ int ret;
+
+ /* rds_ib_add_one creates a rds_ib_device object per IB device,
+ * and allocates a protection domain, memory range and FMR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_ibdev at all.
+ */
+ rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
+ if (rds_ibdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
+ dev->name);
+ return -EOPNOTSUPP;
+ }
+
+ if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
+ rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
+ if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
+ rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_ibdev->pd;
+ ic->i_mr = rds_ibdev->mr;
+
+ ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
+ rds_ib_cq_event_handler, conn,
+ ic->i_send_ring.w_nr + 1, 0);
+ if (IS_ERR(ic->i_send_cq)) {
+ ret = PTR_ERR(ic->i_send_cq);
+ ic->i_send_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
+ rds_ib_cq_event_handler, conn,
+ ic->i_recv_ring.w_nr, 0);
+ if (IS_ERR(ic->i_recv_cq)) {
+ ret = PTR_ERR(ic->i_recv_cq);
+ ic->i_recv_cq = NULL;
+ rdsdebug("ib_create_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ /* XXX negotiate max send/recv with remote? */
+ memset(&attr, 0, sizeof(attr));
+ attr.event_handler = rds_ib_qp_event_handler;
+ attr.qp_context = conn;
+ /* + 1 to allow for the single ack message */
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+ attr.cap.max_send_sge = rds_ibdev->max_sge;
+ attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr.qp_type = IB_QPT_RC;
+ attr.send_cq = ic->i_send_cq;
+ attr.recv_cq = ic->i_recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_send_hdrs_dma, GFP_KERNEL);
+ if (ic->i_send_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent send failed\n");
+ goto out;
+ }
+
+ ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ if (ic->i_recv_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ goto out;
+ }
+
+ ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+ &ic->i_ack_dma, GFP_KERNEL);
+ if (ic->i_ack == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ goto out;
+ }
+
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
+ if (ic->i_sends == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto out;
+ }
+ rds_ib_send_init_ring(ic);
+
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
+ if (ic->i_recvs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto out;
+ }
+
+ rds_ib_recv_init_ring(ic);
+ rds_ib_recv_init_ack(ic);
+
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+ rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ return ret;
+}
+
+static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
+{
+ u16 common;
+ u32 version = 0;
+
+ /* rdma_cm private data is odd - when there is any private data in the
+ * request, we will be given a pretty large buffer without being told the
+ * original size. The only way to tell the difference is by looking at
+ * the contents, which are initialized to zero.
+ * If the protocol version fields aren't set, this is a connection attempt
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
+ * We really should have changed this for OFED 1.3 :-( */
+ if (dp->dp_protocol_major == 0)
+ return RDS_PROTOCOL_3_0;
+
+ common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
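+ /* The negotiated version is 3.<n>, where n is the position of the
+ * highest minor-version bit that both ends advertise in the mask. */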
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
+ return version;
+}
+
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
+ __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
+ const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+ struct rds_ib_connect_private dp_rep;
+ struct rds_connection *conn = NULL;
+ struct rds_ib_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ u32 version;
+ int err, destroy = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_ib_protocol_compatible(dp);
+ if (!version)
+ goto out;
+
+ rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
+ "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+ (unsigned long long)be64_to_cpu(lguid),
+ (unsigned long long)be64_to_cpu(fguid));
+
+ conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection exists, e.g. in the case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ rdsdebug("incoming connect while connecting\n");
+ rds_conn_drop(conn);
+ rds_ib_stats_inc(s_ib_listen_closed_stale);
+ } else
+ if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_ib_stats_inc(s_ib_connect_raced);
+ }
+ mutex_unlock(&conn->c_cm_lock);
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_ib_set_protocol(conn, version);
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ /* We got halfway through setting up the ib_connection; if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ err = rds_ib_setup_qp(conn);
+ if (err) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
+ rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ rdma_reject(cm_id, NULL, 0);
+ return destroy;
+}
+
+
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rds_connection *conn = cm_id->context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rds_ib_connect_private dp;
+ int ret;
+
+ /* If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0 */
+ rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
+
+ ret = rds_ib_setup_qp(conn);
+ if (ret) {
+ rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
+ goto out;
+ }
+
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret)
+ rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+ /* Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id. */
+ if (ret) {
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return ret;
+}
+
+int rds_ib_conn_connect(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ /* XXX I wonder what effect the port space has */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ rdsdebug("rdma_create_id() failed: %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+ ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_ib_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int err = 0;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("failed to disconnect, cm: %p err %d\n",
+ ic->i_cm_id, err);
+ }
+
+ wait_event(rds_ib_ring_empty_wait,
+ rds_ib_ring_empty(&ic->i_send_ring) &&
+ rds_ib_ring_empty(&ic->i_recv_ring));
+
+ if (ic->i_send_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma);
+
+ if (ic->i_recv_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma);
+
+ if (ic->i_ack)
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+
+ if (ic->i_sends)
+ rds_ib_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_ib_recv_clear_ring(ic);
+
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+ rdma_destroy_id(ic->i_cm_id);
+
+ /*
+ * Move connection back to the nodev list.
+ */
+ if (ic->rds_ibdev) {
+
+ spin_lock_irq(&ic->rds_ibdev->spinlock);
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+ spin_unlock_irq(&ic->rds_ibdev->spinlock);
+
+ spin_lock_irq(&ib_nodev_conns_lock);
+ list_add_tail(&ic->ib_node, &ib_nodev_conns);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+ ic->rds_ibdev = NULL;
+ }
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ BUG_ON(ic->rds_ibdev);
+
+ /* Clear pending transmit */
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_ib_set_64bit(&ic->i_ack_next, 0);
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+ rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+ if (ic->i_ibinc) {
+ rds_inc_put(&ic->i_ibinc->ii_inc);
+ ic->i_ibinc = NULL;
+ }
+
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+}
+
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_ib_connection *ic;
+ unsigned long flags;
+
+ /* XXX too lazy? */
+ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
+ if (ic == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ic->ib_node);
+ mutex_init(&ic->i_recv_mutex);
+
+ /*
+ * rds_ib_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+ rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ spin_lock_irqsave(&ib_nodev_conns_lock, flags);
+ list_add_tail(&ic->ib_node, &ib_nodev_conns);
+ spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
+
+
+ rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+ return 0;
+}
+
+void rds_ib_conn_free(void *arg)
+{
+ struct rds_ib_connection *ic = arg;
+ rdsdebug("ic %p\n", ic);
+ list_del(&ic->ib_node);
+ kfree(ic);
+}
+
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ rds_conn_drop(conn);
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 000000000000..69a6289ed672
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,641 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "ib.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_ib_mr {
+ struct rds_ib_device *device;
+ struct rds_ib_mr_pool *pool;
+ struct ib_fmr *fmr;
+ struct list_head list;
+ unsigned int remap_count;
+
+ struct scatterlist *sg;
+ unsigned int sg_len;
+ u64 *dma;
+ int sg_dma_len;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rds_ib_mr_pool {
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct work_struct flush_worker; /* flush worker */
+
+ spinlock_t list_lock; /* protect variables below */
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # of dirty MRs */
+ struct list_head drop_list; /* MRs that have reached their max_maps limit */
+ struct list_head free_list; /* unused MRs */
+ struct list_head clean_list; /* unused & unmapped MRs */
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ struct ib_fmr_attr fmr_attr;
+};
+
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
+
+static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr;
+
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ spin_unlock_irq(&rds_ibdev->spinlock);
+ return rds_ibdev;
+ }
+ }
+ spin_unlock_irq(&rds_ibdev->spinlock);
+ }
+
+ return NULL;
+}
+
+static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_ipaddr *i_ipaddr;
+
+ i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
+ if (!i_ipaddr)
+ return -ENOMEM;
+
+ i_ipaddr->ipaddr = ipaddr;
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ return 0;
+}
+
+static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_ipaddr *i_ipaddr, *next;
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+ if (i_ipaddr->ipaddr == ipaddr) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ break;
+ }
+ }
+ spin_unlock_irq(&rds_ibdev->spinlock);
+}
+
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+ struct rds_ib_device *rds_ibdev_old;
+
+ rds_ibdev_old = rds_ib_get_device(ipaddr);
+ if (rds_ibdev_old)
+ rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+}
+
+int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ BUG_ON(list_empty(&ib_nodev_conns));
+ BUG_ON(list_empty(&ic->ib_node));
+ list_del(&ic->ib_node);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ ic->rds_ibdev = rds_ibdev;
+
+ return 0;
+}
+
+void rds_ib_remove_nodev_conns(void)
+{
+ struct rds_ib_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&ib_nodev_conns_lock);
+ list_splice(&ib_nodev_conns, &tmp_list);
+ INIT_LIST_HEAD(&ib_nodev_conns);
+ spin_unlock_irq(&ib_nodev_conns_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_splice(&rds_ibdev->conn_list, &tmp_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+ spin_unlock_irq(&rds_ibdev->spinlock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->drop_list);
+ INIT_LIST_HEAD(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+ pool->fmr_attr.max_pages = fmr_message_size;
+ pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+ pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
+ pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
+
+ /* We never allow more than max_items MRs to be allocated.
+ * When we exceed more than max_items_soft, we start freeing
+ * items more aggressively.
+ * Make sure that max_items > max_items_soft > max_items / 2
+ */
+ pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
+ pool->max_items = rds_ibdev->max_fmrs;
+
+ return pool;
+}
+
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
+{
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ iinfo->rdma_mr_max = pool->max_items;
+ iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+}
+
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+ flush_workqueue(rds_wq);
+ rds_ib_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
+ list_del_init(&ibmr->list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ return ibmr;
+}
+
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+{
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ struct rds_ib_mr *ibmr = NULL;
+ int err = 0, iter = 0;
+
+ while (1) {
+ ibmr = rds_ib_reuse_fmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* No clean MRs - now we have the choice of either
+ * allocating a fresh MR up to the limit imposed by the
+ * driver, or flush any dirty unused MRs.
+ * We try to avoid stalling in the send path if possible,
+ * so we allocate as long as we're allowed to.
+ *
+ * We're fussy with enforcing the FMR limit, though. If the driver
+ * tells us we can't use more than N fmrs, we shouldn't start
+ * arguing with it */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+ rds_ib_flush_mr_pool(pool, 0);
+ }
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE),
+ &pool->fmr_attr);
+ if (IS_ERR(ibmr->fmr)) {
+ err = PTR_ERR(ibmr->fmr);
+ ibmr->fmr = NULL;
+ printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
+ goto out_no_cigar;
+ }
+
+ rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ if (ibmr->fmr)
+ ib_dealloc_fmr(ibmr->fmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct scatterlist *scat = sg;
+ u64 io_addr = 0;
+ u64 *dma_pages;
+ u32 len;
+ int page_cnt, sg_dma_len;
+ int i, j;
+ int ret;
+
+ sg_dma_len = ib_dma_map_sg(dev, sg, nents,
+ DMA_BIDIRECTIONAL);
+ if (unlikely(!sg_dma_len)) {
+ printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
+ return -EBUSY;
+ }
+
+ len = 0;
+ page_cnt = 0;
+
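+ /* Only the first entry may start, and only the last entry may end, off
+ * an FMR page boundary; anything else can't be described by a page list. */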
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ if (dma_addr & ~rds_ibdev->fmr_page_mask) {
+ if (i > 0)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+ if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
+ if (i < sg_dma_len - 1)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+
+ len += dma_len;
+ }
+
+ page_cnt += len >> rds_ibdev->fmr_page_shift;
+ if (page_cnt > fmr_message_size)
+ return -EINVAL;
+
+ dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+ if (!dma_pages)
+ return -ENOMEM;
+
+ page_cnt = 0;
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
+ dma_pages[page_cnt++] =
+ (dma_addr & rds_ibdev->fmr_page_mask) + j;
+ }
+
+ ret = ib_map_phys_fmr(ibmr->fmr,
+ dma_pages, page_cnt, io_addr);
+ if (ret)
+ goto out;
+
+ /* Success - we successfully remapped the MR, so we can
+ * safely tear down the old mapping. */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = scat;
+ ibmr->sg_len = nents;
+ ibmr->sg_dma_len = sg_dma_len;
+ ibmr->remap_count++;
+
+ rds_ib_stats_inc(s_ib_rdma_mr_used);
+ ret = 0;
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+void rds_ib_sync_mr(void *trans_private, int direction)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ switch (direction) {
+ case DMA_FROM_DEVICE:
+ ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
+ ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+ break;
+ case DMA_TO_DEVICE:
+ ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
+ ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+ break;
+ }
+}
+
+static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+
+ if (ibmr->sg_dma_len) {
+ ib_dma_unmap_sg(rds_ibdev->dev,
+ ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ ibmr->sg_dma_len = 0;
+ }
+
+ /* Release the s/g list */
+ if (ibmr->sg_len) {
+ unsigned int i;
+
+ for (i = 0; i < ibmr->sg_len; ++i) {
+ struct page *page = sg_page(&ibmr->sg[i]);
+
+ /* FIXME we need a way to tell a r/w MR
+ * from a r/o MR */
+ set_page_dirty(page);
+ put_page(page);
+ }
+ kfree(ibmr->sg);
+
+ ibmr->sg = NULL;
+ ibmr->sg_len = 0;
+ }
+}
+
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+ unsigned int pinned = ibmr->sg_len;
+
+ __rds_ib_teardown_mr(ibmr);
+ if (pinned) {
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ atomic_sub(pinned, &pool->free_pinned);
+ }
+}
+
+static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+{
+ struct rds_ib_mr *ibmr, *next;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+ unsigned long unpinned = 0;
+ unsigned long flags;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
+ int ret = 0;
+
+ rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+
+ mutex_lock(&pool->flush_lock);
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ /* Get the list of all MRs to be dropped. Ordering matters -
+ * we want to put drop_list ahead of free_list. */
+ list_splice_init(&pool->free_list, &unmap_list);
+ list_splice_init(&pool->drop_list, &unmap_list);
+ if (free_all)
+ list_splice_init(&pool->clean_list, &unmap_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ free_goal = rds_ib_flush_goal(pool, free_all);
+
+ if (list_empty(&unmap_list))
+ goto out;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, &unmap_list, list)
+ list_add(&ibmr->fmr->list, &fmr_list);
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+ unpinned += ibmr->sg_len;
+ __rds_ib_teardown_mr(ibmr);
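+ /* Destroy MRs that have hit their remap limit, and keep destroying
+ * until we reach the free goal; the rest go back on the clean list. */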
+ if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
+ rds_ib_stats_inc(s_ib_rdma_mr_free);
+ list_del(&ibmr->list);
+ ib_dealloc_fmr(ibmr->fmr);
+ kfree(ibmr);
+ nfreed++;
+ }
+ ncleaned++;
+ }
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ atomic_sub(unpinned, &pool->free_pinned);
+ atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(nfreed, &pool->item_count);
+
+out:
+ mutex_unlock(&pool->flush_lock);
+ return ret;
+}
+
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
+{
+ struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+
+ rds_ib_flush_mr_pool(pool, 0);
+}
+
+void rds_ib_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+ struct rds_ib_device *rds_ibdev = ibmr->device;
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ unsigned long flags;
+
+ rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+
+ /* Return it to the pool's free list */
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+ list_add(&ibmr->list, &pool->drop_list);
+ else
+ list_add(&ibmr->list, &pool->free_list);
+
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_ib_flush_mr_pool(pool, 0);
+ } else {
+ /* We get here if the user created a MR marked
+ * as use_once and invalidate at the same time. */
+ queue_work(rds_wq, &pool->flush_worker);
+ }
+ }
+}
+
+void rds_ib_flush_mrs(void)
+{
+ struct rds_ib_device *rds_ibdev;
+
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+ if (pool)
+ rds_ib_flush_mr_pool(pool, 0);
+ }
+}
+
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_mr *ibmr = NULL;
+ int ret;
+
+ rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+ if (!rds_ibdev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!rds_ibdev->mr_pool) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ibmr = rds_ib_alloc_fmr(rds_ibdev);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+ if (ret == 0)
+ *key_ret = ibmr->fmr->rkey;
+ else
+ printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
+
+ ibmr->device = rds_ibdev;
+
+ out:
+ if (ret) {
+ if (ibmr)
+ rds_ib_free_mr(ibmr, 0);
+ ibmr = ERR_PTR(ret);
+ }
+ return ibmr;
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 000000000000..5061b5502162
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static struct kmem_cache *rds_ib_incoming_slab;
+static struct kmem_cache *rds_ib_frag_slab;
+static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
+
+static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_ib_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page != NULL);
+ kmem_cache_free(rds_ib_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get send completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
+
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_recv_work *recv;
+ u32 i;
+
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ struct ib_sge *sge;
+
+ recv->r_ibinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.next = NULL;
+ recv->r_wr.wr_id = i;
+ recv->r_wr.sg_list = recv->r_sge;
+ recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+
+ sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge->addr = 0;
+ sge->length = RDS_FRAG_SIZE;
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+ }
+}
+
+static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ if (recv->r_ibinc) {
+ rds_inc_put(&recv->r_ibinc->ii_inc);
+ recv->r_ibinc = NULL;
+ }
+ if (recv->r_frag) {
+ rds_ib_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_ib_frag_drop_page(recv->r_frag);
+ rds_ib_frag_free(recv->r_frag);
+ recv->r_frag = NULL;
+ }
+}
+
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
+{
+ u32 i;
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
+
+ if (ic->i_frag.f_page)
+ rds_ib_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+
+ if (recv->r_ibinc == NULL) {
+ if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
+ rds_ib_stats_inc(s_ib_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
+ kptr_gfp);
+ if (recv->r_ibinc == NULL)
+ goto out;
+ atomic_inc(&rds_ib_allocation);
+ INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
+ rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
+ }
+
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
+
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (ic->i_frag.f_page == NULL)
+ goto out;
+ ic->i_frag.f_offset = 0;
+ }
+
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+ goto out;
+
+ /*
+ * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
+
+ sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->length = sizeof(struct rds_header);
+
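+ /* Each posted fragment holds its own reference on the page; ic->i_frag
+ * keeps the original reference until the whole page has been handed out. */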
+ get_page(recv->r_frag->f_page);
+
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_recv_work *recv;
+ struct ib_recv_wr *failed_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ u32 pos;
+
+ while ((prefill || rds_conn_up(conn))
+ && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ ret = -EINVAL;
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ if (ret) {
+ ret = -1;
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
+ recv->r_ibinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
+ if (ret) {
+ rds_ib_conn_error(conn, "recv post on "
+ "%pI4 returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ ret = -1;
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_ib_advertise_credits(conn, posted);
+
+ if (ret)
+ rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
+}
+
+void rds_ib_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_drop_page(frag);
+ rds_ib_frag_free(frag);
+ }
+}
+
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+ struct rds_ib_incoming *ibinc;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+ rds_ib_inc_purge(inc);
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, ibinc);
+ atomic_dec(&rds_ib_allocation);
+ BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+}
+
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
+{
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
+ while (copied < size && copied < len) {
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
+ /* XXX needs + offset for multiple recvs per page */
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+
+ iov_off += to_copy;
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
+{
+ struct ib_send_wr *wr = &ic->i_ack_wr;
+ struct ib_sge *sge = &ic->i_ack_sge;
+
+ sge->addr = ic->i_ack_dma;
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ wr->opcode = IB_WR_SEND;
+ wr->wr_id = RDS_IB_ACK_WR_ID;
+ wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+ int ack_required)
+{
+ rds_ib_set_64bit(&ic->i_ack_next, seq);
+ if (ack_required) {
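+ /* Order the update of i_ack_next before setting IB_ACK_REQUESTED;
+ * rds_ib_get_ack() clears the bit before reading the sequence. */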
+ smp_mb__before_clear_bit();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_clear_bit();
+
+ return ic->i_ack_next;
+}
+
+static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ struct ib_send_wr *failed_wr;
+ u64 seq;
+ int ret;
+
+ seq = rds_ib_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_ib_stats_inc(s_ib_ack_send_failure);
+ /* Need to finesse this later. */
+ BUG();
+ } else
+ rds_ib_stats_inc(s_ib_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_ib_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_ib_attempt_ack(struct rds_ib_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_ib_stats_inc(s_ib_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_ib_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_ib_stats_inc(s_ib_ack_send_piggybacked);
+ return rds_ib_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_ib_cong_recv(struct rds_connection *conn,
+ struct rds_ib_incoming *ibinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
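+ /* Walk the incoming fragments and the map's pages in lockstep,
+ * copying 64-bit words and noting which bits changed. */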
+ while (copied < RDS_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+
+ src = addr + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, ie
+ * bits that changed from 0 to 1. */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+
+ rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_ib_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void rds_ib_process_recv(struct rds_connection *conn,
+ struct rds_ib_recv_work *recv, u32 byte_len,
+ struct rds_ib_ack_state *state)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_incoming *ibinc = ic->i_ibinc;
+ struct rds_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
+ byte_len);
+
+ if (byte_len < sizeof(struct rds_header)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI4 didn't inclue a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ byte_len -= sizeof(struct rds_header);
+
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_ib_conn_error(conn, "incoming message "
+ "from %pI4 has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_ib_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+ /* This is an ACK-only packet. The fact that it gets
+ * special treatment here is that historically, ACKs
+ * were rather special beasts.
+ */
+ rds_ib_stats_inc(s_ib_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page. We can leave the frag, though; it will be
+ * reused.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_ib_frag_drop_page(recv->r_frag);
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message. Copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (ibinc == NULL) {
+ ibinc = recv->r_ibinc;
+ recv->r_ibinc = NULL;
+ ic->i_ibinc = ibinc;
+
+ hdr = &ibinc->ii_inc.i_hdr;
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+ rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &ibinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
+ rds_ib_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ return;
+ }
+ }
+
+ list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_ibinc = NULL;
+
+ if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_ib_cong_recv(conn, ibinc);
+ else {
+ rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &ibinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&ibinc->ii_inc);
+ }
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_ib_ack_state state = { 0, };
+ struct rds_ib_recv_work *recv;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_ib_stats_inc(s_ib_rx_cq_call);
+
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_ib_stats_inc(s_ib_rx_cq_event);
+
+ recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+
+ rds_ib_recv_unmap_page(ic, recv);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc.byte_len, &state);
+ } else {
+ rds_ib_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
+ }
+
+ rds_ib_ring_free(&ic->i_recv_ring, 1);
+ }
+
+ if (state.ack_next_valid)
+ rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_ib_ring_empty(&ic->i_recv_ring))
+ rds_ib_stats_inc(s_ib_rx_ring_empty);
+
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
+ if (rds_ib_ring_low(&ic->i_recv_ring))
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+int rds_ib_recv(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_ib_stats_inc(s_ib_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
+ if (rds_conn_up(conn))
+ rds_ib_attempt_ack(ic);
+
+ return ret;
+}
+
+int __init rds_ib_recv_init(void)
+{
+ struct sysinfo si;
+ int ret = -ENOMEM;
+
+ /* Default to roughly one third of all available RAM for recv memory */
+ si_meminfo(&si);
+ rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+ rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
+ sizeof(struct rds_ib_incoming),
+ 0, 0, NULL);
+ if (rds_ib_incoming_slab == NULL)
+ goto out;
+
+ rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
+ sizeof(struct rds_page_frag),
+ 0, 0, NULL);
+ if (rds_ib_frag_slab == NULL)
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ else
+ ret = 0;
+out:
+ return ret;
+}
+
+void rds_ib_recv_exit(void)
+{
+ kmem_cache_destroy(rds_ib_incoming_slab);
+ kmem_cache_destroy(rds_ib_frag_slab);
+}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 000000000000..99a6ccae964c
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
+
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
+
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_ib_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_used(ring) == 0;
+}
+
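+/*
+ * Allocate up to @val entries from the ring.  Returns the number actually
+ * allocated, which may be 0 if the ring is full, and stores the index of
+ * the first allocated entry in @pos; entries wrap around the end of the ring.
+ */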
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
+{
+ u32 ret = 0, avail;
+
+ avail = ring->w_nr - __rds_ib_ring_used(ring);
+
+ rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return ret;
+}
+
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add(val, &ring->w_free_ctr);
+
+ if (__rds_ib_ring_empty(ring) &&
+ waitqueue_active(&rds_ib_ring_empty_wait))
+ wake_up(&rds_ib_ring_empty_wait);
+}
+
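+/*
+ * Hand back entries obtained from rds_ib_ring_alloc() that were never
+ * posted to the hardware, e.g. when flow control trims the request or
+ * when posting the work requests fails.
+ */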
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_empty(ring);
+}
+
+int rds_ib_ring_low(struct rds_ib_work_ring *ring)
+{
+ return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
+}
+
+/*
+ * returns the oldest alloced ring entry. This will be the next one
+ * freed. This can't be called if there are none allocated.
+ */
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
+{
+ u32 ret;
+
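+ /* wr_id may have wrapped past the end of the ring relative to oldest */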
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+ wr_id, oldest);
+ return ret;
+}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 000000000000..cb6c52cb1c4c
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "ib.h"
+
+static void rds_ib_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
+{
+ int notify_status;
+
+ switch (wc_status) {
+ case IB_WC_WR_FLUSH_ERR:
+ return;
+
+ case IB_WC_SUCCESS:
+ notify_status = RDS_RDMA_SUCCESS;
+ break;
+
+ case IB_WC_REM_ACCESS_ERR:
+ notify_status = RDS_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDS_RDMA_OTHER_ERROR;
+ break;
+ }
+ rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+ struct rds_rdma_op *op)
+{
+ if (op->r_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
+ }
+}
+
+static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->m_rdma_op != NULL) {
+ rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_ib_send_rdma_complete(rm, wc_status);
+
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+ }
+
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
+
+ rds_message_put(rm);
+ send->s_rm = NULL;
+}
+
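+/*
+ * Initialize every send work request in the ring: the wr_id, the SGE list,
+ * and the header SGE that points at this slot's entry in the ring of
+ * pre-mapped headers.
+ */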
+void rds_ib_send_init_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ struct ib_sge *sge;
+
+ send->s_rm = NULL;
+ send->s_op = NULL;
+
+ send->s_wr.wr_id = i;
+ send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
+ send->s_wr.ex.imm_data = 0;
+
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
+ sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+ }
+}
+
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
+{
+ struct rds_ib_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
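+ /* opcode 0xdead marks entries already reaped by the completion handler */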
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_ib_send_unmap_rdma(ic, send->s_op);
+ }
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_ib_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i = 0;
+ int ret;
+
+ rdsdebug("cq %p conn %p\n", cq, conn);
+ rds_ib_stats_inc(s_ib_tx_cq_call);
+ ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_ib_stats_inc(s_ib_tx_cq_event);
+
+ if (wc.wr_id == RDS_IB_ACK_WR_ID) {
+ if (ic->i_ack_queued + HZ/2 < jiffies)
+ rds_ib_stats_inc(s_ib_tx_stalled);
+ rds_ib_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
+
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (send->s_queued + HZ/2 < jiffies)
+ rds_ib_stats_inc(s_ib_tx_stalled);
+
+ /* If an RDMA operation produced an error, signal this right
+ * away. If we don't, the subsequent SEND that goes with this
+ * RDMA will be canceled with ERR_WFLUSH, and the application
+ * will never learn that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_ib_send_rdma_complete(rm, wc.status);
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_ib_ring_free(&ic->i_send_ring, completed);
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_ib_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
+ }
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overruning the reciever's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants us to
+ * advertise the posted credits regardless of whether any send
+ * credits are available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
+
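+/*
+ * The peer has refilled its receive queue; add the newly advertised send
+ * credits and kick the send worker if we had stalled for lack of credits.
+ */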
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+ test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+ atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+ rds_ib_stats_inc(s_ib_rx_credit_updates);
+}
+
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ if (posted == 0)
+ return;
+
+ atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+ /* Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad inf)
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and have to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
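+/*
+ * Fill in one send work request: a data SGE for the payload (when there
+ * is one) plus the SGE pointing at the pre-mapped header for this slot.
+ */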
+static inline void
+rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
+{
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
+ }
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int sent;
+ int ret;
+ int flow_controlled = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ credit_alloc = work_alloc;
+ if (ic->i_flowctl) {
+ credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled++;
+ }
+ if (work_alloc == 0) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (ic->i_rm == NULL) {
+ /*
+ printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ } else {
+ rm->m_count = 0;
+ }
+
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+ rds_message_addref(rm);
+ ic->i_rm = rm;
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->m_rdma_op) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_ib_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ rds_ib_send_grab_credits(ic, 0, &posted, 1);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ } else if (ic->i_rm != rm)
+ BUG();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
+
+ /* handle a 0-len message */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
+
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
+
+ send = &ic->i_sends[pos];
+
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_ib_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (flow_controlled && i == (work_alloc-1))
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_ib_stats_inc(s_ib_tx_credit_updates);
+ }
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
+ }
+
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_ib_send_add_credits(conn, credit_alloc - i);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
+ }
+ /* Finesse this later */
+ BUG();
+ goto out;
+ }
+
+ ret = sent;
+out:
+ BUG_ON(adv_credits);
+ return ret;
+}
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_send_work *send = NULL;
+ struct rds_ib_send_work *first;
+ struct rds_ib_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct rds_ib_device *rds_ibdev;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->r_remote_addr;
+ u32 pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent;
+ int ret;
+ int num_sge;
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ op->r_mapped = 1;
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write, we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = ceil(op->r_count, rds_ibdev->max_sge);
+
+ work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_ib_stats_inc(s_ib_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &op->r_sg[0];
+ sent = 0;
+ num_sge = op->r_count;
+
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
+
+ send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_wr.wr.rdma.remote_addr = remote_addr;
+ send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_op = op;
+
+ if (num_sge > rds_ibdev->max_sge) {
+ send->s_wr.num_sge = rds_ibdev->max_sge;
+ num_sge -= rds_ibdev->max_sge;
+ } else {
+ send->s_wr.num_sge = num_sge;
+ }
+
+ send->s_wr.next = NULL;
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+ len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+ send->s_sge[j].addr =
+ ib_sg_dma_address(ic->i_cm_id->device, scat);
+ send->s_sge[j].length = len;
+ send->s_sge[j].lkey = ic->i_mr->lkey;
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+
+ remote_addr += len;
+ scat++;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+ send = ic->i_sends;
+ }
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->r_sg[op->r_count])
+ prev->s_wr.send_flags = IB_SEND_SIGNALED;
+
+ if (i < work_alloc) {
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+ goto out;
+ }
+
+ if (unlikely(failed_wr != &first->s_wr)) {
+ printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+ BUG_ON(failed_wr != &first->s_wr);
+ }
+
+
+out:
+ return ret;
+}
+
+void rds_ib_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
+ /* We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again. */
+ rds_ib_attempt_ack(ic);
+}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 000000000000..02e3e3d50d4a
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "ib.h"
+
+DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
+
+static char *rds_ib_stat_names[] = {
+ "ib_connect_raced",
+ "ib_listen_closed_stale",
+ "ib_tx_cq_call",
+ "ib_tx_cq_event",
+ "ib_tx_ring_full",
+ "ib_tx_throttle",
+ "ib_tx_sg_mapping_failure",
+ "ib_tx_stalled",
+ "ib_tx_credit_updates",
+ "ib_rx_cq_call",
+ "ib_rx_cq_event",
+ "ib_rx_ring_empty",
+ "ib_rx_refill_from_cq",
+ "ib_rx_refill_from_thread",
+ "ib_rx_alloc_limit",
+ "ib_rx_credit_updates",
+ "ib_ack_sent",
+ "ib_ack_send_failure",
+ "ib_ack_send_delayed",
+ "ib_ack_send_piggybacked",
+ "ib_ack_received",
+ "ib_rdma_mr_alloc",
+ "ib_rdma_mr_free",
+ "ib_rdma_mr_used",
+ "ib_rdma_mr_pool_flush",
+ "ib_rdma_mr_pool_wait",
+ "ib_rdma_mr_pool_depleted",
+};
+
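+/*
+ * Sum the per-cpu counters into a single snapshot and copy it out through
+ * the info iterator when the caller provided enough room; the return value
+ * is the number of counters.
+ */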
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_ib_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_ib_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
+ ARRAY_SIZE(rds_ib_stat_names));
+out:
+ return ARRAY_SIZE(rds_ib_stat_names);
+}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 000000000000..d87830db93a0
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "ib.h"
+
+static struct ctl_table_header *rds_ib_sysctl_hdr;
+
+unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
+unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
+unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_ib_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_ib_sysctl_flow_control = 1;
+
+ctl_table rds_ib_sysctl_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_send_wr",
+ .data = &rds_ib_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_wr",
+ .data = &rds_ib_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_wr_min,
+ .extra2 = &rds_ib_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_wr",
+ .data = &rds_ib_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_ib_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_allocation",
+ .data = &rds_ib_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "flow_control",
+ .data = &rds_ib_sysctl_flow_control,
+ .maxlen = sizeof(rds_ib_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_ib_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
+ { }
+};
+
+void rds_ib_sysctl_exit(void)
+{
+ if (rds_ib_sysctl_hdr)
+ unregister_sysctl_table(rds_ib_sysctl_hdr);
+}
+
+int __init rds_ib_sysctl_init(void)
+{
+ rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
+ if (rds_ib_sysctl_hdr == NULL)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 000000000000..1d885535214d
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time. The structs are only copied if the user-specified
+ * buffer is big enough. The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
+
+struct rds_info_iterator {
+ struct page **pages;
+ void *addr;
+ unsigned long offset;
+};
+
+static DEFINE_SPINLOCK(rds_info_lock);
+static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
+
+void rds_info_register_func(int optname, rds_info_func func)
+{
+ int offset = optname - RDS_INFO_FIRST;
+
+ BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+ spin_lock(&rds_info_lock);
+ BUG_ON(rds_info_funcs[offset] != NULL);
+ rds_info_funcs[offset] = func;
+ spin_unlock(&rds_info_lock);
+}
+
+void rds_info_deregister_func(int optname, rds_info_func func)
+{
+ int offset = optname - RDS_INFO_FIRST;
+
+ BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+ spin_lock(&rds_info_lock);
+ BUG_ON(rds_info_funcs[offset] != func);
+ rds_info_funcs[offset] = NULL;
+ spin_unlock(&rds_info_lock);
+}
+
+/*
+ * Typically we hold an atomic kmap across multiple rds_info_copy() calls
+ * because the kmap is so expensive. This must be called before using blocking
+ * operations while holding the mapping and as the iterator is torn down.
+ */
+void rds_info_iter_unmap(struct rds_info_iterator *iter)
+{
+ if (iter->addr != NULL) {
+ kunmap_atomic(iter->addr, KM_USER0);
+ iter->addr = NULL;
+ }
+}
+
+/*
+ * get_user_pages() called flush_dcache_page() on the pages for us.
+ */
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes)
+{
+ unsigned long this;
+
+ while (bytes) {
+ if (iter->addr == NULL)
+ iter->addr = kmap_atomic(*iter->pages, KM_USER0);
+
+ this = min(bytes, PAGE_SIZE - iter->offset);
+
+ rdsdebug("page %p addr %p offset %lu this %lu data %p "
+ "bytes %lu\n", *iter->pages, iter->addr,
+ iter->offset, this, data, bytes);
+
+ memcpy(iter->addr + iter->offset, data, this);
+
+ data += this;
+ bytes -= this;
+ iter->offset += this;
+
+ if (iter->offset == PAGE_SIZE) {
+ kunmap_atomic(iter->addr, KM_USER0);
+ iter->addr = NULL;
+ iter->offset = 0;
+ iter->pages++;
+ }
+ }
+}
+
+/*
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace. @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure, particularly -ENOSPC
+ * if the given userspace buffer was not large enough to fit the snapshot.
+ * On success it returns the positive number of bytes of each array element
+ * in the snapshot.
+ */
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen)
+{
+ struct rds_info_iterator iter;
+ struct rds_info_lengths lens;
+ unsigned long nr_pages = 0;
+ unsigned long start;
+ unsigned long i;
+ rds_info_func func;
+ struct page **pages = NULL;
+ int ret;
+ int len;
+ int total;
+
+ if (get_user(len, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* check for all kinds of wrapping and the like */
+ start = (unsigned long)optval;
+ if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* a 0 len call is just trying to probe its length */
+ if (len == 0)
+ goto call_func;
+
+ nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
+ >> PAGE_SHIFT;
+
+ pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
+ pages, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (ret != nr_pages) {
+ if (ret > 0)
+ nr_pages = ret;
+ else
+ nr_pages = 0;
+ ret = -EAGAIN; /* XXX ? */
+ goto out;
+ }
+
+ rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
+
+call_func:
+ func = rds_info_funcs[optname - RDS_INFO_FIRST];
+ if (func == NULL) {
+ ret = -ENOPROTOOPT;
+ goto out;
+ }
+
+ iter.pages = pages;
+ iter.addr = NULL;
+ iter.offset = start & (PAGE_SIZE - 1);
+
+ func(sock, len, &iter, &lens);
+ BUG_ON(lens.each == 0);
+
+ total = lens.nr * lens.each;
+
+ rds_info_iter_unmap(&iter);
+
+ if (total > len) {
+ len = total;
+ ret = -ENOSPC;
+ } else {
+ len = total;
+ ret = lens.each;
+ }
+
+ if (put_user(len, optlen))
+ ret = -EFAULT;
+
+out:
+ for (i = 0; pages != NULL && i < nr_pages; i++)
+ put_page(pages[i]);
+ kfree(pages);
+
+ return ret;
+}
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 000000000000..b6c052ca7d22
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_INFO_H
+#define _RDS_INFO_H
+
+struct rds_info_lengths {
+ unsigned int nr;
+ unsigned int each;
+};
+
+struct rds_info_iterator;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source. If the snapshot fits in @len then it
+ * should be copied using @iter. The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens);
+
+void rds_info_register_func(int optname, rds_info_func func);
+void rds_info_deregister_func(int optname, rds_info_func func);
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+ int __user *optlen);
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+ unsigned long bytes);
+void rds_info_iter_unmap(struct rds_info_iterator *iter);
+
+
+#endif
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 000000000000..1b56905c4c08
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+
+#include "rds.h"
+#include "iw.h"
+
+unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
+unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fastreg_pool_size, int, 0444);
+MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
+module_param(fastreg_message_size, int, 0444);
+MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
+
+struct list_head rds_iw_devices;
+
+DEFINE_SPINLOCK(iw_nodev_conns_lock);
+LIST_HEAD(iw_nodev_conns);
+
+void rds_iw_add_one(struct ib_device *device)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct ib_device_attr *dev_attr;
+
+ /* Only handle iwarp devices */
+ if (device->node_type != RDMA_NODE_RNIC)
+ return;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ rdsdebug("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
+ if (!rds_iwdev)
+ goto free_attr;
+
+ spin_lock_init(&rds_iwdev->spinlock);
+
+ rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+ rds_iwdev->max_wrs = dev_attr->max_qp_wr;
+ rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+
+ rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
+
+ rds_iwdev->dev = device;
+ rds_iwdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(rds_iwdev->pd))
+ goto free_dev;
+
+ if (!rds_iwdev->dma_local_lkey) {
+ if (device->node_type != RDMA_NODE_RNIC) {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ } else {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
+ }
+ if (IS_ERR(rds_iwdev->mr))
+ goto err_pd;
+ } else
+ rds_iwdev->mr = NULL;
+
+ rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
+ if (IS_ERR(rds_iwdev->mr_pool)) {
+ rds_iwdev->mr_pool = NULL;
+ goto err_mr;
+ }
+
+ INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
+ INIT_LIST_HEAD(&rds_iwdev->conn_list);
+ list_add_tail(&rds_iwdev->list, &rds_iw_devices);
+
+ ib_set_client_data(device, &rds_iw_client, rds_iwdev);
+
+ goto free_attr;
+
+err_mr:
+ if (rds_iwdev->mr)
+ ib_dereg_mr(rds_iwdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_iwdev->pd);
+free_dev:
+ kfree(rds_iwdev);
+free_attr:
+ kfree(dev_attr);
+}
+
+void rds_iw_remove_one(struct ib_device *device)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct rds_iw_cm_id *i_cm_id, *next;
+
+ rds_iwdev = ib_get_client_data(device, &rds_iw_client);
+ if (!rds_iwdev)
+ return;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
+ list_del(&i_cm_id->list);
+ kfree(i_cm_id);
+ }
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ rds_iw_remove_conns(rds_iwdev);
+
+ if (rds_iwdev->mr_pool)
+ rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
+
+ if (rds_iwdev->mr)
+ ib_dereg_mr(rds_iwdev->mr);
+
+ while (ib_dealloc_pd(rds_iwdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_iwdev->list);
+ kfree(rds_iwdev);
+}
+
+struct ib_client rds_iw_client = {
+ .name = "rds_iw",
+ .add = rds_iw_add_one,
+ .remove = rds_iw_remove_one
+};
+
+static int rds_iw_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds_info_rdma_connection *iinfo = buffer;
+ struct rds_iw_connection *ic;
+
+ /* We will only ever look at iWARP transports */
+ if (conn->c_trans != &rds_iw_transport)
+ return 0;
+
+ iinfo->src_addr = conn->c_laddr;
+ iinfo->dst_addr = conn->c_faddr;
+
+ memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+ memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_iw_device *rds_iwdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ iinfo->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo->max_send_sge = rds_iwdev->max_sge;
+ rds_iw_get_mr_info(rds_iwdev, iinfo);
+ }
+ return 1;
+}
+
+static void rds_iw_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds_iw_conn_info_visitor,
+ sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible. Sending and
+ * receiving should be device-agnostic. Transports would try and maintain
+ * connections between peers who have messages queued. Userspace would be
+ * allowed to influence which paths have priority. We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_iw_laddr_check(__be32 addr)
+{
+ int ret;
+ struct rdma_cm_id *cm_id;
+ struct sockaddr_in sin;
+
+ /* Create a CMA ID and try to bind it. This catches both
+ * IB and iWARP capable NICs.
+ */
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+
+ /* rdma_bind_addr will only succeed for IB & iWARP devices */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ /* due to this, we will claim to support IB devices unless we
+ check node_type. */
+ if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
+ ret = -EADDRNOTAVAIL;
+
+ rdsdebug("addr %pI4 ret %d node type %d\n",
+ &addr, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+
+ rdma_destroy_id(cm_id);
+
+ return ret;
+}
+
+void rds_iw_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+ rds_iw_remove_nodev_conns();
+ ib_unregister_client(&rds_iw_client);
+ rds_iw_sysctl_exit();
+ rds_iw_recv_exit();
+ rds_trans_unregister(&rds_iw_transport);
+}
+
+struct rds_transport rds_iw_transport = {
+ .laddr_check = rds_iw_laddr_check,
+ .xmit_complete = rds_iw_xmit_complete,
+ .xmit = rds_iw_xmit,
+ .xmit_cong_map = NULL,
+ .xmit_rdma = rds_iw_xmit_rdma,
+ .recv = rds_iw_recv,
+ .conn_alloc = rds_iw_conn_alloc,
+ .conn_free = rds_iw_conn_free,
+ .conn_connect = rds_iw_conn_connect,
+ .conn_shutdown = rds_iw_conn_shutdown,
+ .inc_copy_to_user = rds_iw_inc_copy_to_user,
+ .inc_purge = rds_iw_inc_purge,
+ .inc_free = rds_iw_inc_free,
+ .cm_initiate_connect = rds_iw_cm_initiate_connect,
+ .cm_handle_connect = rds_iw_cm_handle_connect,
+ .cm_connect_complete = rds_iw_cm_connect_complete,
+ .stats_info_copy = rds_iw_stats_info_copy,
+ .exit = rds_iw_exit,
+ .get_mr = rds_iw_get_mr,
+ .sync_mr = rds_iw_sync_mr,
+ .free_mr = rds_iw_free_mr,
+ .flush_mrs = rds_iw_flush_mrs,
+ .t_owner = THIS_MODULE,
+ .t_name = "iwarp",
+ .t_prefer_loopback = 1,
+};
+
+int __init rds_iw_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&rds_iw_devices);
+
+ ret = ib_register_client(&rds_iw_client);
+ if (ret)
+ goto out;
+
+ ret = rds_iw_sysctl_init();
+ if (ret)
+ goto out_ibreg;
+
+ ret = rds_iw_recv_init();
+ if (ret)
+ goto out_sysctl;
+
+ ret = rds_trans_register(&rds_iw_transport);
+ if (ret)
+ goto out_recv;
+
+ rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+
+ goto out;
+
+out_recv:
+ rds_iw_recv_exit();
+out_sysctl:
+ rds_iw_sysctl_exit();
+out_ibreg:
+ ib_unregister_client(&rds_iw_client);
+out:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 000000000000..0ddda34f2a1c
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
+#ifndef _RDS_IW_H
+#define _RDS_IW_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FASTREG_SIZE 20
+#define RDS_FASTREG_POOL_SIZE 2048
+
+#define RDS_IW_MAX_SGE 8
+#define RDS_IW_RECV_SGE 2
+
+#define RDS_IW_DEFAULT_RECV_WR 1024
+#define RDS_IW_DEFAULT_SEND_WR 256
+
+#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
+
+extern struct list_head rds_iw_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
+struct rds_page_frag {
+ struct list_head f_item;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
+};
+
+struct rds_iw_incoming {
+ struct list_head ii_frags;
+ struct rds_incoming ii_inc;
+};
+
+struct rds_iw_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ u8 dp_protocol_major;
+ u8 dp_protocol_minor;
+ __be16 dp_protocol_minor_mask; /* bitmask */
+ __be32 dp_reserved1;
+ __be64 dp_ack_seq;
+ __be32 dp_credit; /* non-zero enables flow ctl */
+};
+
+struct rds_iw_scatterlist {
+ struct scatterlist *list;
+ unsigned int len;
+ int dma_len;
+ unsigned int dma_npages;
+ unsigned int bytes;
+};
+
+struct rds_iw_mapping {
+ spinlock_t m_lock; /* protect the mapping struct */
+ struct list_head m_list;
+ struct rds_iw_mr *m_mr;
+ uint32_t m_rkey;
+ struct rds_iw_scatterlist m_sg;
+};
+
+struct rds_iw_send_work {
+ struct rds_message *s_rm;
+
+ /* We should really put these into a union: */
+ struct rds_rdma_op *s_op;
+ struct rds_iw_mapping *s_mapping;
+ struct ib_mr *s_mr;
+ struct ib_fast_reg_page_list *s_page_list;
+ unsigned char s_remap_count;
+
+ struct ib_send_wr s_wr;
+ struct ib_sge s_sge[RDS_IW_MAX_SGE];
+ unsigned long s_queued;
+};
+
+struct rds_iw_recv_work {
+ struct rds_iw_incoming *r_iwinc;
+ struct rds_page_frag *r_frag;
+ struct ib_recv_wr r_wr;
+ struct ib_sge r_sge[2];
+};
+
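+/*
+ * Work ring bookkeeping: w_alloc_ctr and w_free_ctr run freely and wrap;
+ * the number of in-flight entries is their difference (the same counter
+ * scheme described in ib_ring.c).
+ */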
+struct rds_iw_work_ring {
+ u32 w_nr;
+ u32 w_alloc_ptr;
+ u32 w_alloc_ctr;
+ u32 w_free_ptr;
+ atomic_t w_free_ctr;
+};
+
+struct rds_iw_device;
+
+struct rds_iw_connection {
+
+ struct list_head iw_node;
+ struct rds_iw_device *rds_iwdev;
+ struct rds_connection *conn;
+
+ /* alphabet soup, IBTA style */
+ struct rdma_cm_id *i_cm_id;
+ struct ib_pd *i_pd;
+ struct ib_mr *i_mr;
+ struct ib_cq *i_send_cq;
+ struct ib_cq *i_recv_cq;
+
+ /* tx */
+ struct rds_iw_work_ring i_send_ring;
+ struct rds_message *i_rm;
+ struct rds_header *i_send_hdrs;
+ u64 i_send_hdrs_dma;
+ struct rds_iw_send_work *i_sends;
+
+ /* rx */
+ struct mutex i_recv_mutex;
+ struct rds_iw_work_ring i_recv_ring;
+ struct rds_iw_incoming *i_iwinc;
+ u32 i_recv_data_rem;
+ struct rds_header *i_recv_hdrs;
+ u64 i_recv_hdrs_dma;
+ struct rds_iw_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
+ u64 i_ack_recv; /* last ACK received */
+
+ /* sending acks */
+ unsigned long i_ack_flags;
+ u64 i_ack_next; /* next ACK to send */
+ struct rds_header *i_ack;
+ struct ib_send_wr i_ack_wr;
+ struct ib_sge i_ack_sge;
+ u64 i_ack_dma;
+ unsigned long i_ack_queued;
+
+ /* Flow control related information
+ *
+ * Our algorithm uses a pair of variables that we need to access
+ * atomically - one for the send credits, and one for the posted
+ * recv credits we need to transfer to the remote.
+ * Rather than protect them with a slow spinlock, we put both into
+ * a single atomic_t and update it using cmpxchg.
+ */
+ atomic_t i_credits;
+
+ /* Protocol version specific information */
+ unsigned int i_flowctl:1; /* enable/disable flow ctl */
+ unsigned int i_dma_local_lkey:1;
+ unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
+ /* Batched completions */
+ unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v) ((v) >> 16)
+#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v) ((v) << 16)
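+
+/*
+ * Worked example (illustrative): if atomic_read(&ic->i_credits) returns
+ * 0x00030010, IB_GET_SEND_CREDITS() yields 16 send credits and
+ * IB_GET_POST_CREDITS() yields 3 posted recv credits; advertising those three
+ * credits to the peer then uses atomic_sub(IB_SET_POST_CREDITS(3), &ic->i_credits).
+ */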
+
+struct rds_iw_cm_id {
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+};
+
+struct rds_iw_device {
+ struct list_head list;
+ struct list_head cm_id_list;
+ struct list_head conn_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct rds_iw_mr_pool *mr_pool;
+ int page_shift;
+ int max_sge;
+ unsigned int max_wrs;
+ unsigned int dma_local_lkey:1;
+ spinlock_t spinlock; /* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT 0
+#define IB_ACK_REQUESTED 1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
+#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
+#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
+
+struct rds_iw_statistics {
+ uint64_t s_iw_connect_raced;
+ uint64_t s_iw_listen_closed_stale;
+ uint64_t s_iw_tx_cq_call;
+ uint64_t s_iw_tx_cq_event;
+ uint64_t s_iw_tx_ring_full;
+ uint64_t s_iw_tx_throttle;
+ uint64_t s_iw_tx_sg_mapping_failure;
+ uint64_t s_iw_tx_stalled;
+ uint64_t s_iw_tx_credit_updates;
+ uint64_t s_iw_rx_cq_call;
+ uint64_t s_iw_rx_cq_event;
+ uint64_t s_iw_rx_ring_empty;
+ uint64_t s_iw_rx_refill_from_cq;
+ uint64_t s_iw_rx_refill_from_thread;
+ uint64_t s_iw_rx_alloc_limit;
+ uint64_t s_iw_rx_credit_updates;
+ uint64_t s_iw_ack_sent;
+ uint64_t s_iw_ack_send_failure;
+ uint64_t s_iw_ack_send_delayed;
+ uint64_t s_iw_ack_send_piggybacked;
+ uint64_t s_iw_ack_received;
+ uint64_t s_iw_rdma_mr_alloc;
+ uint64_t s_iw_rdma_mr_free;
+ uint64_t s_iw_rdma_mr_used;
+ uint64_t s_iw_rdma_mr_pool_flush;
+ uint64_t s_iw_rdma_mr_pool_wait;
+ uint64_t s_iw_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_iw_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_cpu(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
+
+static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
+ struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+ unsigned int i;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ ib_dma_sync_single_for_device(dev,
+ ib_sg_dma_address(dev, &sg[i]),
+ ib_sg_dma_len(dev, &sg[i]),
+ direction);
+ }
+}
+#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
+
+static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
+{
+ return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
+}
+
+/* ib.c */
+extern struct rds_transport rds_iw_transport;
+extern void rds_iw_add_one(struct ib_device *device);
+extern void rds_iw_remove_one(struct ib_device *device);
+extern struct ib_client rds_iw_client;
+
+extern unsigned int fastreg_pool_size;
+extern unsigned int fastreg_message_size;
+
+extern spinlock_t iw_nodev_conns_lock;
+extern struct list_head iw_nodev_conns;
+
+/* ib_cm.c */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_iw_conn_free(void *arg);
+int rds_iw_conn_connect(struct rds_connection *conn);
+void rds_iw_conn_shutdown(struct rds_connection *conn);
+void rds_iw_state_change(struct sock *sk);
+int __init rds_iw_listen_init(void);
+void rds_iw_listen_stop(void);
+void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_iw_cm_connect_complete(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+
+#define rds_iw_conn_error(conn, fmt...) \
+ __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
+
+/* ib_rdma.c */
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_nodev_conns(void);
+void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_iw_sync_mr(void *trans_private, int dir);
+void rds_iw_free_mr(void *trans_private, int invalidate);
+void rds_iw_flush_mrs(void);
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+
+/* ib_recv.c */
+int __init rds_iw_recv_init(void);
+void rds_iw_recv_exit(void);
+int rds_iw_recv(struct rds_connection *conn);
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_iw_inc_purge(struct rds_incoming *inc);
+void rds_iw_inc_free(struct rds_incoming *inc);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
+void rds_iw_attempt_ack(struct rds_iw_connection *ic);
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
+
+/* ib_ring.c */
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
+int rds_iw_ring_low(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_iw_ring_empty_wait;
+
+/* ib_send.c */
+void rds_iw_xmit_complete(struct rds_connection *conn);
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_send_init_ring(struct rds_iw_connection *ic);
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
+ u32 *adv_credits, int need_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
+#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+
+/* ib_sysctl.c */
+int __init rds_iw_sysctl_init(void);
+void rds_iw_sysctl_exit(void);
+extern unsigned long rds_iw_sysctl_max_send_wr;
+extern unsigned long rds_iw_sysctl_max_recv_wr;
+extern unsigned long rds_iw_sysctl_max_unsig_wrs;
+extern unsigned long rds_iw_sysctl_max_unsig_bytes;
+extern unsigned long rds_iw_sysctl_max_recv_allocation;
+extern unsigned int rds_iw_sysctl_flow_control;
+extern ctl_table rds_iw_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
+
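+/*
+ * Descriptive note: on 64-bit hosts an aligned u64 store is a single write,
+ * so plain assignment suffices; on 32-bit hosts set_64bit() is used so a
+ * concurrent reader never observes a torn 64-bit value.
+ */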
+static inline void rds_iw_set_64bit(u64 *ptr, u64 val)
+{
+#if BITS_PER_LONG == 64
+ *ptr = val;
+#else
+ set_64bit(ptr, val);
+#endif
+}
+
+#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 000000000000..57ecb3d4b8a5
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/vmalloc.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+ conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (rds_iw_sysctl_flow_control && credits != 0) {
+ /* We're doing flow control */
+ ic->i_flowctl = 1;
+ rds_iw_send_add_credits(conn, credits);
+ } else {
+ ic->i_flowctl = 0;
+ }
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connections.
+ */
+void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+ const struct rds_iw_connect_private *dp = NULL;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_device *rds_iwdev;
+ int err;
+
+ if (event->param.conn.private_data_len) {
+ dp = event->param.conn.private_data;
+
+ rds_iw_set_protocol(conn,
+ RDS_PROTOCOL(dp->dp_protocol_major,
+ dp->dp_protocol_minor));
+ rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ }
+
+ /* update ib_device with this local ipaddr & conn */
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
+ if (err)
+ printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
+ err = rds_iw_add_conn(rds_iwdev, conn);
+ if (err)
+ printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
+ &conn->c_laddr, &conn->c_faddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
+
+ rds_connect_complete(conn);
+}
+
+static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
+ struct rdma_conn_param *conn_param,
+ struct rds_iw_connect_private *dp,
+ u32 protocol_version)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ memset(conn_param, 0, sizeof(struct rdma_conn_param));
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+
+ if (dp) {
+ memset(dp, 0, sizeof(*dp));
+ dp->dp_saddr = conn->c_laddr;
+ dp->dp_daddr = conn->c_faddr;
+ dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+ dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
+ dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
+
+ /* Advertise flow control */
+ if (ic->i_flowctl) {
+ unsigned int credits;
+
+ credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+ dp->dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ }
+
+ conn_param->private_data = dp;
+ conn_param->private_data_len = sizeof(*dp);
+ }
+}
+
+static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
+{
+ rdsdebug("event %u data %p\n", event->event, data);
+}
+
+static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
+{
+ struct rds_connection *conn = data;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+ break;
+ case IB_EVENT_QP_REQ_ERR:
+ case IB_EVENT_QP_FATAL:
+ default:
+ rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
+ event->event, &conn->c_laddr,
+ &conn->c_faddr);
+ break;
+ }
+}
+
+/*
+ * Create a QP
+ */
+static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
+ struct rds_iw_device *rds_iwdev,
+ struct rds_iw_work_ring *send_ring,
+ void (*send_cq_handler)(struct ib_cq *, void *),
+ struct rds_iw_work_ring *recv_ring,
+ void (*recv_cq_handler)(struct ib_cq *, void *),
+ void *context)
+{
+ struct ib_device *dev = rds_iwdev->dev;
+ unsigned int send_size, recv_size;
+ int ret;
+
+ /* The offset of 1 is to accommodate the additional ACK WR. */
+ send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
+ recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
+ rds_iw_ring_resize(send_ring, send_size - 1);
+ rds_iw_ring_resize(recv_ring, recv_size - 1);
+
+ memset(attr, 0, sizeof(*attr));
+ attr->event_handler = rds_iw_qp_event_handler;
+ attr->qp_context = context;
+ attr->cap.max_send_wr = send_size;
+ attr->cap.max_recv_wr = recv_size;
+ attr->cap.max_send_sge = rds_iwdev->max_sge;
+ attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
+ attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr->qp_type = IB_QPT_RC;
+
+ attr->send_cq = ib_create_cq(dev, send_cq_handler,
+ rds_iw_cq_event_handler,
+ context, send_size, 0);
+ if (IS_ERR(attr->send_cq)) {
+ ret = PTR_ERR(attr->send_cq);
+ attr->send_cq = NULL;
+ rdsdebug("ib_create_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
+ rds_iw_cq_event_handler,
+ context, recv_size, 0);
+ if (IS_ERR(attr->recv_cq)) {
+ ret = PTR_ERR(attr->recv_cq);
+ attr->recv_cq = NULL;
+ rdsdebug("ib_create_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+ goto out;
+ }
+
+ ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
+ if (ret) {
+ rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (attr->send_cq)
+ ib_destroy_cq(attr->send_cq);
+ if (attr->recv_cq)
+ ib_destroy_cq(attr->recv_cq);
+ }
+ return ret;
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_iw_setup_qp(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct ib_qp_init_attr attr;
+ struct rds_iw_device *rds_iwdev;
+ int ret;
+
+ /* rds_iw_add_one creates a rds_iw_device object per IB device,
+ * and allocates a protection domain, memory range and MR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_iwdev at all.
+ */
+ rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
+ if (rds_iwdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+ dev->name);
+ return -EOPNOTSUPP;
+ }
+
+ /* Protection domain and memory range */
+ ic->i_pd = rds_iwdev->pd;
+ ic->i_mr = rds_iwdev->mr;
+
+ ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
+ &ic->i_send_ring, rds_iw_send_cq_comp_handler,
+ &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
+ conn);
+ if (ret < 0)
+ goto out;
+
+ ic->i_send_cq = attr.send_cq;
+ ic->i_recv_cq = attr.recv_cq;
+
+ /*
+ * XXX this can fail if max_*_wr is too large? Are we supposed
+ * to back off until we get a value that the hardware can support?
+ */
+ ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+ if (ret) {
+ rdsdebug("rdma_create_qp failed: %d\n", ret);
+ goto out;
+ }
+
+ ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_send_hdrs_dma, GFP_KERNEL);
+ if (ic->i_send_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent send failed\n");
+ goto out;
+ }
+
+ ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ if (ic->i_recv_hdrs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ goto out;
+ }
+
+ ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+ &ic->i_ack_dma, GFP_KERNEL);
+ if (ic->i_ack == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ goto out;
+ }
+
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
+ if (ic->i_sends == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("send allocation failed\n");
+ goto out;
+ }
+ rds_iw_send_init_ring(ic);
+
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
+ if (ic->i_recvs == NULL) {
+ ret = -ENOMEM;
+ rdsdebug("recv allocation failed\n");
+ goto out;
+ }
+
+ rds_iw_recv_init_ring(ic);
+ rds_iw_recv_init_ack(ic);
+
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+ rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+ ic->i_send_cq, ic->i_recv_cq);
+
+out:
+ return ret;
+}
+
+static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
+{
+ u16 common;
+ u32 version = 0;
+
+ /* rdma_cm private data is odd - when there is any private data in the
+ * request, we will be given a pretty large buffer without telling us the
+ * original size. The only way to tell the difference is by looking at
+ * the contents, which are initialized to zero.
+ * If the protocol version fields aren't set, this is a connection attempt
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
+ * We really should have changed this for OFED 1.3 :-( */
+ if (dp->dp_protocol_major == 0)
+ return RDS_PROTOCOL_3_0;
+
+ common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
+ if (dp->dp_protocol_major == 3 && common) {
+ version = RDS_PROTOCOL_3_0;
+ while ((common >>= 1) != 0)
+ version++;
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
+ return version;
+}
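+
+/*
+ * Worked example (illustrative): a peer supporting minor versions 0 and 1
+ * advertises dp_protocol_minor_mask 0x0003; ANDed with
+ * RDS_IW_SUPPORTED_PROTOCOLS that leaves 0x0003, and the loop above bumps the
+ * version once per extra common bit, selecting the highest minor version both
+ * sides support.
+ */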
+
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ const struct rds_iw_connect_private *dp = event->param.conn.private_data;
+ struct rds_iw_connect_private dp_rep;
+ struct rds_connection *conn = NULL;
+ struct rds_iw_connection *ic = NULL;
+ struct rdma_conn_param conn_param;
+ struct rds_iw_device *rds_iwdev;
+ u32 version;
+ int err, destroy = 1;
+
+ /* Check whether the remote protocol version matches ours. */
+ version = rds_iw_protocol_compatible(dp);
+ if (!version)
+ goto out;
+
+ rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
+ &dp->dp_saddr, &dp->dp_daddr,
+ RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
+
+ conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
+ GFP_KERNEL);
+ if (IS_ERR(conn)) {
+ rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+ conn = NULL;
+ goto out;
+ }
+
+ /*
+ * The connection request may occur while the
+ * previous connection exists, e.g. in case of failover.
+ * But as connections may be initiated simultaneously
+ * by both hosts, we have a random backoff mechanism -
+ * see the comment above rds_queue_reconnect()
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ rdsdebug("incoming connect while connecting\n");
+ rds_conn_drop(conn);
+ rds_iw_stats_inc(s_iw_listen_closed_stale);
+ } else
+ if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+ /* Wait and see - our connect may still be succeeding */
+ rds_iw_stats_inc(s_iw_connect_raced);
+ }
+ mutex_unlock(&conn->c_cm_lock);
+ goto out;
+ }
+
+ ic = conn->c_transport_data;
+
+ rds_iw_set_protocol(conn, version);
+ rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+ /* If the peer gave us the last packet it saw, process this as if
+ * we had received a regular ACK. */
+ if (dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+ BUG_ON(cm_id->context);
+ BUG_ON(ic->i_cm_id);
+
+ ic->i_cm_id = cm_id;
+ cm_id->context = conn;
+
+ rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
+ ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+ /* We got halfway through setting up the ib_connection; if we
+ * fail now, we have to take the long route out of this mess. */
+ destroy = 0;
+
+ err = rds_iw_setup_qp(conn);
+ if (err) {
+ rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
+ goto out;
+ }
+
+ rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+ /* rdma_accept() calls rdma_reject() internally if it fails */
+ err = rdma_accept(cm_id, &conn_param);
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
+ rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ rdma_reject(cm_id, NULL, 0);
+ return destroy;
+}
+
+
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+ struct rds_connection *conn = cm_id->context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rdma_conn_param conn_param;
+ struct rds_iw_connect_private dp;
+ int ret;
+
+ /* If the peer doesn't do protocol negotiation, we must
+ * default to RDSv3.0 */
+ rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
+ ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
+
+ ret = rds_iw_setup_qp(conn);
+ if (ret) {
+ rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
+ goto out;
+ }
+
+ rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+ ret = rdma_connect(cm_id, &conn_param);
+ if (ret)
+ rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+ /* Beware - returning non-zero tells the rdma_cm to destroy
+ * the cm_id. We should certainly not do it as long as we still
+ * "own" the cm_id. */
+ if (ret) {
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (ic->i_cm_id == cm_id)
+ ret = 0;
+ }
+ return ret;
+}
+
+int rds_iw_conn_connect(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_device *rds_iwdev;
+ struct sockaddr_in src, dest;
+ int ret;
+
+ /* XXX I wonder what effect the port space has */
+ /* delegate cm event handler to rdma_transport */
+ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ RDMA_PS_TCP);
+ if (IS_ERR(ic->i_cm_id)) {
+ ret = PTR_ERR(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ rdsdebug("rdma_create_id() failed: %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+ src.sin_family = AF_INET;
+ src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+ src.sin_port = (__force u16)htons(0);
+
+ /* First, bind to the local address and device. */
+ ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
+ if (ret) {
+ rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
+ &conn->c_laddr, ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ goto out;
+ }
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+ ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+ dest.sin_family = AF_INET;
+ dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+ dest.sin_port = (__force u16)htons(RDS_PORT);
+
+ ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+ (struct sockaddr *)&dest,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ if (ret) {
+ rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+ ret);
+ rdma_destroy_id(ic->i_cm_id);
+ ic->i_cm_id = NULL;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * This is careful to clean up only those resources that were actually
+ * built up, so that it can be called at any point during startup. In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_iw_conn_shutdown(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ int err = 0;
+ struct ib_qp_attr qp_attr;
+
+ rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+ ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+ ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+ if (ic->i_cm_id) {
+ struct ib_device *dev = ic->i_cm_id->device;
+
+ rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+ err = rdma_disconnect(ic->i_cm_id);
+ if (err) {
+ /* Actually this may happen quite frequently, when
+ * an outgoing connect raced with an incoming connect.
+ */
+ rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+ " cm: %p err %d\n", ic->i_cm_id, err);
+ }
+
+ if (ic->i_cm_id->qp) {
+ qp_attr.qp_state = IB_QPS_ERR;
+ ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+ }
+
+ wait_event(rds_iw_ring_empty_wait,
+ rds_iw_ring_empty(&ic->i_send_ring) &&
+ rds_iw_ring_empty(&ic->i_recv_ring));
+
+ if (ic->i_send_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_send_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_send_hdrs,
+ ic->i_send_hdrs_dma);
+
+ if (ic->i_recv_hdrs)
+ ib_dma_free_coherent(dev,
+ ic->i_recv_ring.w_nr *
+ sizeof(struct rds_header),
+ ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma);
+
+ if (ic->i_ack)
+ ib_dma_free_coherent(dev, sizeof(struct rds_header),
+ ic->i_ack, ic->i_ack_dma);
+
+ if (ic->i_sends)
+ rds_iw_send_clear_ring(ic);
+ if (ic->i_recvs)
+ rds_iw_recv_clear_ring(ic);
+
+ if (ic->i_cm_id->qp)
+ rdma_destroy_qp(ic->i_cm_id);
+ if (ic->i_send_cq)
+ ib_destroy_cq(ic->i_send_cq);
+ if (ic->i_recv_cq)
+ ib_destroy_cq(ic->i_recv_cq);
+
+ /*
+ * If associated with an rds_iw_device:
+ * Move connection back to the nodev list.
+ * Remove cm_id from the device cm_id list.
+ */
+ if (ic->rds_iwdev) {
+
+ spin_lock_irq(&ic->rds_iwdev->spinlock);
+ BUG_ON(list_empty(&ic->iw_node));
+ list_del(&ic->iw_node);
+ spin_unlock_irq(&ic->rds_iwdev->spinlock);
+
+ spin_lock_irq(&iw_nodev_conns_lock);
+ list_add_tail(&ic->iw_node, &iw_nodev_conns);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+ rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+ ic->rds_iwdev = NULL;
+ }
+
+ rdma_destroy_id(ic->i_cm_id);
+
+ ic->i_cm_id = NULL;
+ ic->i_pd = NULL;
+ ic->i_mr = NULL;
+ ic->i_send_cq = NULL;
+ ic->i_recv_cq = NULL;
+ ic->i_send_hdrs = NULL;
+ ic->i_recv_hdrs = NULL;
+ ic->i_ack = NULL;
+ }
+ BUG_ON(ic->rds_iwdev);
+
+ /* Clear pending transmit */
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
+ }
+
+ /* Clear the ACK state */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_iw_set_64bit(&ic->i_ack_next, 0);
+ ic->i_ack_recv = 0;
+
+ /* Clear flow control state */
+ ic->i_flowctl = 0;
+ atomic_set(&ic->i_credits, 0);
+
+ rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+ rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+ if (ic->i_iwinc) {
+ rds_inc_put(&ic->i_iwinc->ii_inc);
+ ic->i_iwinc = NULL;
+ }
+
+ vfree(ic->i_sends);
+ ic->i_sends = NULL;
+ vfree(ic->i_recvs);
+ ic->i_recvs = NULL;
+ rdsdebug("shutdown complete\n");
+}
+
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_iw_connection *ic;
+ unsigned long flags;
+
+ /* XXX too lazy? */
+ ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
+ if (ic == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ic->iw_node);
+ mutex_init(&ic->i_recv_mutex);
+
+ /*
+ * rds_iw_conn_shutdown() waits for these to be emptied so they
+ * must be initialized before it can be called.
+ */
+ rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+ rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+ ic->conn = conn;
+ conn->c_transport_data = ic;
+
+ spin_lock_irqsave(&iw_nodev_conns_lock, flags);
+ list_add_tail(&ic->iw_node, &iw_nodev_conns);
+ spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
+
+
+ rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+ return 0;
+}
+
+void rds_iw_conn_free(void *arg)
+{
+ struct rds_iw_connection *ic = arg;
+ rdsdebug("ic %p\n", ic);
+ list_del(&ic->iw_node);
+ kfree(ic);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+ va_list ap;
+
+ rds_conn_drop(conn);
+
+ va_start(ap, fmt);
+ vprintk(fmt, ap);
+ va_end(ap);
+}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 000000000000..1c02a8f952d0
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "iw.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_iw_mr {
+ struct rds_iw_device *device;
+ struct rds_iw_mr_pool *pool;
+ struct rdma_cm_id *cm_id;
+
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *page_list;
+
+ struct rds_iw_mapping mapping;
+ unsigned char remap_count;
+};
+
+/*
+ * Our own little MR pool
+ */
+struct rds_iw_mr_pool {
+ struct rds_iw_device *device; /* back ptr to the device that owns us */
+
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct work_struct flush_worker; /* flush worker */
+
+ spinlock_t list_lock; /* protect variables below */
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # of dirty MRs */
+ struct list_head dirty_list; /* dirty mappings */
+ struct list_head clean_list; /* unused & unmapped MRs */
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_message_size; /* in pages */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ int max_pages;
+};
+
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents);
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+ struct list_head *unmap_list,
+ struct list_head *kill_list);
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
+{
+ struct rds_iw_device *iwdev;
+ struct rds_iw_cm_id *i_cm_id;
+
+ *rds_iwdev = NULL;
+ *cm_id = NULL;
+
+ list_for_each_entry(iwdev, &rds_iw_devices, list) {
+ spin_lock_irq(&iwdev->spinlock);
+ list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
+ struct sockaddr_in *src_addr, *dst_addr;
+
+ src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
+ dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
+
+ rdsdebug("local ipaddr = %x port %d, "
+ "remote ipaddr = %x port %d"
+ "..looking for %x port %d, "
+ "remote ipaddr = %x port %d\n",
+ src_addr->sin_addr.s_addr,
+ src_addr->sin_port,
+ dst_addr->sin_addr.s_addr,
+ dst_addr->sin_port,
+ rs->rs_bound_addr,
+ rs->rs_bound_port,
+ rs->rs_conn_addr,
+ rs->rs_conn_port);
+#ifdef WORKING_TUPLE_DETECTION
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+ src_addr->sin_port == rs->rs_bound_port &&
+ dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+ dst_addr->sin_port == rs->rs_conn_port) {
+#else
+ /* FIXME - needs to compare the local and remote
+ * ipaddr/port tuple, but the ipaddr is the only
+ * available information in the rds_sock (as the rest are
+ * zeroed). It doesn't appear to be properly populated
+ * during connection setup...
+ */
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
+#endif
+ spin_unlock_irq(&iwdev->spinlock);
+ *rds_iwdev = iwdev;
+ *cm_id = i_cm_id->cm_id;
+ return 0;
+ }
+ }
+ spin_unlock_irq(&iwdev->spinlock);
+ }
+
+ return 1;
+}
+
+static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct rds_iw_cm_id *i_cm_id;
+
+ i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
+ if (!i_cm_id)
+ return -ENOMEM;
+
+ i_cm_id->cm_id = cm_id;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ return 0;
+}
+
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct rds_iw_cm_id *i_cm_id;
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
+ if (i_cm_id->cm_id == cm_id) {
+ list_del(&i_cm_id->list);
+ kfree(i_cm_id);
+ break;
+ }
+ }
+ spin_unlock_irq(&rds_iwdev->spinlock);
+}
+
+
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+ struct sockaddr_in *src_addr, *dst_addr;
+ struct rds_iw_device *rds_iwdev_old;
+ struct rds_sock rs;
+ struct rdma_cm_id *pcm_id;
+ int rc;
+
+ src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
+ dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
+
+ rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+ rs.rs_bound_port = src_addr->sin_port;
+ rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+ rs.rs_conn_port = dst_addr->sin_port;
+
+ rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
+ if (rc)
+ rds_iw_remove_cm_id(rds_iwdev, cm_id);
+
+ return rds_iw_add_cm_id(rds_iwdev, cm_id);
+}
+
+int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* conn was previously on the nodev_conns_list */
+ spin_lock_irq(&iw_nodev_conns_lock);
+ BUG_ON(list_empty(&iw_nodev_conns));
+ BUG_ON(list_empty(&ic->iw_node));
+ list_del(&ic->iw_node);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ ic->rds_iwdev = rds_iwdev;
+
+ return 0;
+}
+
+void rds_iw_remove_nodev_conns(void)
+{
+ struct rds_iw_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&iw_nodev_conns_lock);
+ list_splice(&iw_nodev_conns, &tmp_list);
+ INIT_LIST_HEAD(&iw_nodev_conns);
+ spin_unlock_irq(&iw_nodev_conns_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_connection *ic, *_ic;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&rds_iwdev->spinlock);
+ list_splice(&rds_iwdev->conn_list, &tmp_list);
+ INIT_LIST_HEAD(&rds_iwdev->conn_list);
+ spin_unlock_irq(&rds_iwdev->spinlock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
+ rds_conn_destroy(ic->conn);
+ }
+}
+
+static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
+ struct scatterlist *list, unsigned int sg_len)
+{
+ sg->list = list;
+ sg->len = sg_len;
+ sg->dma_len = 0;
+ sg->dma_npages = 0;
+ sg->bytes = 0;
+}
+
+static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+ struct rds_iw_scatterlist *sg,
+ unsigned int dma_page_shift)
+{
+ struct ib_device *dev = rds_iwdev->dev;
+ u64 *dma_pages = NULL;
+ u64 dma_mask;
+ unsigned int dma_page_size;
+ int i, j, ret;
+
+ dma_page_size = 1 << dma_page_shift;
+ dma_mask = dma_page_size - 1;
+
+ WARN_ON(sg->dma_len);
+
+ sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+ if (unlikely(!sg->dma_len)) {
+ printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
+ return ERR_PTR(-EBUSY);
+ }
+
+ sg->bytes = 0;
+ sg->dma_npages = 0;
+
+ ret = -EINVAL;
+ for (i = 0; i < sg->dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+ u64 end_addr;
+
+ sg->bytes += dma_len;
+
+ end_addr = dma_addr + dma_len;
+ if (dma_addr & dma_mask) {
+ if (i > 0)
+ goto out_unmap;
+ dma_addr &= ~dma_mask;
+ }
+ if (end_addr & dma_mask) {
+ if (i < sg->dma_len - 1)
+ goto out_unmap;
+ end_addr = (end_addr + dma_mask) & ~dma_mask;
+ }
+
+ sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
+ }
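+
+ /* Illustrative note: only the first sg entry may start mid-page and only
+ * the last may end mid-page; the loop above rounds those outward to the
+ * dma page size, while any interior misalignment jumps to out_unmap and
+ * returns -EINVAL.
+ */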
+
+ /* Now gather the dma addrs into one list */
+ if (sg->dma_npages > fastreg_message_size)
+ goto out_unmap;
+
+ dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
+ if (!dma_pages) {
+ ret = -ENOMEM;
+ goto out_unmap;
+ }
+
+ for (i = j = 0; i < sg->dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+ u64 end_addr;
+
+ end_addr = dma_addr + dma_len;
+ dma_addr &= ~dma_mask;
+ for (; dma_addr < end_addr; dma_addr += dma_page_size)
+ dma_pages[j++] = dma_addr;
+ BUG_ON(j > sg->dma_npages);
+ }
+
+ return dma_pages;
+
+out_unmap:
+ ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+ sg->dma_len = 0;
+ kfree(dma_pages);
+ return ERR_PTR(ret);
+}
+
+
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->device = rds_iwdev;
+ INIT_LIST_HEAD(&pool->dirty_list);
+ INIT_LIST_HEAD(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
+
+ pool->max_message_size = fastreg_message_size;
+ pool->max_items = fastreg_pool_size;
+ pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
+ pool->max_pages = fastreg_message_size;
+
+ /* We never allow more than max_items MRs to be allocated.
+ * When we exceed max_items_soft, we start freeing
+ * items more aggressively.
+ * Make sure that max_items > max_items_soft > max_items / 2
+ */
+ pool->max_items_soft = pool->max_items * 3 / 4;
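+ /* Illustrative example: assuming fastreg_pool_size keeps its default of
+ * RDS_FASTREG_POOL_SIZE (2048), max_items_soft works out to 1536 and
+ * max_free_pinned to 512 * fastreg_message_size pages.
+ */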
+
+ return pool;
+}
+
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
+{
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+ iinfo->rdma_mr_max = pool->max_items;
+ iinfo->rdma_mr_size = pool->max_pages;
+}
+
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
+{
+ flush_workqueue(rds_wq);
+ rds_iw_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
+{
+ struct rds_iw_mr *ibmr = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
+ list_del_init(&ibmr->mapping.m_list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ return ibmr;
+}
+
+static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
+{
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+ struct rds_iw_mr *ibmr = NULL;
+ int err = 0, iter = 0;
+
+ while (1) {
+ ibmr = rds_iw_reuse_fmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ /* No clean MRs - now we have the choice of either
+ * allocating a fresh MR up to the limit imposed by the
+ * driver, or flushing any dirty unused MRs.
+ * We try to avoid stalling in the send path if possible,
+ * so we allocate as long as we're allowed to.
+ *
+ * We're fussy with enforcing the FMR limit, though. If the driver
+ * tells us we can't use more than N fmrs, we shouldn't start
+ * arguing with it */
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
+ rds_iw_flush_mr_pool(pool, 0);
+ }
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ spin_lock_init(&ibmr->mapping.m_lock);
+ INIT_LIST_HEAD(&ibmr->mapping.m_list);
+ ibmr->mapping.m_mr = ibmr;
+
+ err = rds_iw_init_fastreg(pool, ibmr);
+ if (err)
+ goto out_no_cigar;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_alloc);
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ rds_iw_destroy_fastreg(pool, ibmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+void rds_iw_sync_mr(void *trans_private, int direction)
+{
+ struct rds_iw_mr *ibmr = trans_private;
+ struct rds_iw_device *rds_iwdev = ibmr->device;
+
+ switch (direction) {
+ case DMA_FROM_DEVICE:
+ ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+ ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+ break;
+ case DMA_TO_DEVICE:
+ ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+ ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+ break;
+ }
+}
+
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+{
+ struct rds_iw_mr *ibmr, *next;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(kill_list);
+ unsigned long flags;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
+ int ret = 0;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
+
+ mutex_lock(&pool->flush_lock);
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ /* Get the list of all mappings to be destroyed */
+ list_splice_init(&pool->dirty_list, &unmap_list);
+ if (free_all)
+ list_splice_init(&pool->clean_list, &kill_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+
+ free_goal = rds_iw_flush_goal(pool, free_all);
+
+ /* Batched invalidate of dirty MRs.
+ * For FMR based MRs, the mappings on the unmap list are
+ * actually members of an ibmr (ibmr->mapping). They either
+ * migrate to the kill_list, or have been cleaned and should be
+ * moved to the clean_list.
+ * For fastregs, they will be dynamically allocated, and
+ * will be destroyed by the unmap function.
+ */
+ if (!list_empty(&unmap_list)) {
+ ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+ /* If we've been asked to destroy all MRs, move those
+ * that were simply cleaned to the kill list */
+ if (free_all)
+ list_splice_init(&unmap_list, &kill_list);
+ }
+
+ /* Destroy any MRs that are past their best before date */
+ list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
+ rds_iw_stats_inc(s_iw_rdma_mr_free);
+ list_del(&ibmr->mapping.m_list);
+ rds_iw_destroy_fastreg(pool, ibmr);
+ kfree(ibmr);
+ nfreed++;
+ }
+
+ /* Anything that remains are laundered ibmrs, which we can add
+ * back to the clean list. */
+ if (!list_empty(&unmap_list)) {
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+ }
+
+ atomic_sub(ncleaned, &pool->dirty_count);
+ atomic_sub(nfreed, &pool->item_count);
+
+ mutex_unlock(&pool->flush_lock);
+ return ret;
+}
+
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
+{
+ struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
+
+ rds_iw_flush_mr_pool(pool, 0);
+}
+
+void rds_iw_free_mr(void *trans_private, int invalidate)
+{
+ struct rds_iw_mr *ibmr = trans_private;
+ struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
+
+ rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
+ if (!pool)
+ return;
+
+ /* Return it to the pool's free list */
+ rds_iw_free_fastreg(pool, ibmr);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
+
+ if (invalidate) {
+ if (likely(!in_interrupt())) {
+ rds_iw_flush_mr_pool(pool, 0);
+ } else {
+ /* We get here if the user created a MR marked
+ * as use_once and invalidate at the same time. */
+ queue_work(rds_wq, &pool->flush_worker);
+ }
+ }
+}
+
+void rds_iw_flush_mrs(void)
+{
+ struct rds_iw_device *rds_iwdev;
+
+ list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
+ struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+ if (pool)
+ rds_iw_flush_mr_pool(pool, 0);
+ }
+}
+
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret)
+{
+ struct rds_iw_device *rds_iwdev;
+ struct rds_iw_mr *ibmr = NULL;
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
+ if (ret || !cm_id) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!rds_iwdev->mr_pool) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ibmr = rds_iw_alloc_mr(rds_iwdev);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ibmr->cm_id = cm_id;
+ ibmr->device = rds_iwdev;
+
+ ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+ if (ret == 0)
+ *key_ret = ibmr->mr->rkey;
+ else
+ printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
+
+out:
+ if (ret) {
+ if (ibmr)
+ rds_iw_free_mr(ibmr, 0);
+ ibmr = ERR_PTR(ret);
+ }
+ return ibmr;
+}
+
+/*
+ * iWARP fastreg handling
+ *
+ * The life cycle of a fastreg registration is a bit different from
+ * FMRs.
+ * The idea behind fastreg is to have one MR, to which we bind different
+ * mappings over time. To avoid stalling on the expensive map and invalidate
+ * operations, these operations are pipelined on the same send queue on
+ * which we want to send the message containing the r_key.
+ *
+ * This creates a bit of a problem for us, as we do not have the destination
+ * IP in GET_MR, so the connection must be set up prior to the GET_MR call for
+ * RDMA to be correctly set up. If a fastreg request is present, rds_iw_xmit
+ * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * before queuing the SEND. When completions for these arrive, they are
+ * dispatched so that the MR has a bit set showing that RDMA can be performed.
+ *
+ * There is another interesting aspect that's related to invalidation.
+ * The application can request that a mapping is invalidated in FREE_MR.
+ * The expectation there is that this invalidation step includes ALL
+ * PREVIOUSLY FREED MRs.
+ */
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ struct rds_iw_device *rds_iwdev = pool->device;
+ struct ib_fast_reg_page_list *page_list = NULL;
+ struct ib_mr *mr;
+ int err;
+
+ mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+ return err;
+ }
+
+ /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
+ * is not filled in.
+ */
+ page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
+ if (IS_ERR(page_list)) {
+ err = PTR_ERR(page_list);
+
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
+ ib_dereg_mr(mr);
+ return err;
+ }
+
+ ibmr->page_list = page_list;
+ ibmr->mr = mr;
+ return 0;
+}
+
+static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+{
+ struct rds_iw_mr *ibmr = mapping->m_mr;
+ struct ib_send_wr f_wr, *failed_wr;
+ int ret;
+
+ /*
+ * Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR. The key used is a rolling 8-bit
+ * counter, which should guarantee uniqueness.
+ */
+ ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
+ mapping->m_rkey = ibmr->mr->rkey;
+
+ memset(&f_wr, 0, sizeof(f_wr));
+ f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
+ f_wr.opcode = IB_WR_FAST_REG_MR;
+ f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+ f_wr.wr.fast_reg.rkey = mapping->m_rkey;
+ f_wr.wr.fast_reg.page_list = ibmr->page_list;
+ f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
+ f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
+ f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ f_wr.wr.fast_reg.iova_start = 0;
+ f_wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &f_wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
+ BUG_ON(failed_wr != &f_wr);
+ if (ret && printk_ratelimit())
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ __func__, __LINE__, ret);
+ return ret;
+}
+
+static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
+{
+ struct ib_send_wr s_wr, *failed_wr;
+ int ret = 0;
+
+ if (!ibmr->cm_id->qp || !ibmr->mr)
+ goto out;
+
+ memset(&s_wr, 0, sizeof(s_wr));
+ s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
+ s_wr.opcode = IB_WR_LOCAL_INV;
+ s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+ s_wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &s_wr;
+ ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+ if (ret && printk_ratelimit()) {
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ __func__, __LINE__, ret);
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr,
+ struct scatterlist *sg,
+ unsigned int sg_len)
+{
+ struct rds_iw_device *rds_iwdev = pool->device;
+ struct rds_iw_mapping *mapping = &ibmr->mapping;
+ u64 *dma_pages;
+ int i, ret = 0;
+
+ rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
+
+ dma_pages = rds_iw_map_scatterlist(rds_iwdev,
+ &mapping->m_sg,
+ rds_iwdev->page_shift);
+ if (IS_ERR(dma_pages)) {
+ ret = PTR_ERR(dma_pages);
+ dma_pages = NULL;
+ goto out;
+ }
+
+ if (mapping->m_sg.dma_len > pool->max_message_size) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ for (i = 0; i < mapping->m_sg.dma_npages; ++i)
+ ibmr->page_list->page_list[i] = dma_pages[i];
+
+ ret = rds_iw_rdma_build_fastreg(mapping);
+ if (ret)
+ goto out;
+
+ rds_iw_stats_inc(s_iw_rdma_mr_used);
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+/*
+ * "Free" a fastreg MR.
+ */
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ unsigned long flags;
+ int ret;
+
+ if (!ibmr->mapping.m_sg.dma_len)
+ return;
+
+ ret = rds_iw_rdma_fastreg_inv(ibmr);
+ if (ret)
+ return;
+
+ /* Try to post the LOCAL_INV WR to the queue. */
+ spin_lock_irqsave(&pool->list_lock, flags);
+
+ list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
+ atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+}
+
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+ struct list_head *unmap_list,
+ struct list_head *kill_list)
+{
+ struct rds_iw_mapping *mapping, *next;
+ unsigned int ncleaned = 0;
+ LIST_HEAD(laundered);
+
+ /* Batched invalidation of fastreg MRs.
+ * Why do we do it this way, even though we could pipeline unmap
+ * and remap? The reason is the application semantics - when the
+ * application requests an invalidation of MRs, it expects all
+ * previously released R_Keys to become invalid.
+ *
+ * If we implement MR reuse naively, we risk memory corruption
+ * (this has actually been observed). So the default behavior
+ * requires that a MR goes through an explicit unmap operation before
+ * we can reuse it again.
+ *
+ * We could probably improve on this a little, by allowing immediate
+ * reuse of an MR on the same socket (e.g. you could add a small
+ * cache of unused MRs to struct rds_socket - GET_MR could grab one
+ * of these without requiring an explicit invalidate).
+ */
+ while (!list_empty(unmap_list)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+ list_move(&mapping->m_list, &laundered);
+ ncleaned++;
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
+ }
+
+ /* Move all laundered mappings back to the unmap list.
+ * We do not kill any WRs right now - it doesn't seem the
+ * fastreg API has a max_remap limit. */
+ list_splice_init(&laundered, unmap_list);
+
+ return ncleaned;
+}
+
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
+ struct rds_iw_mr *ibmr)
+{
+ if (ibmr->page_list)
+ ib_free_fast_reg_page_list(ibmr->page_list);
+ if (ibmr->mr)
+ ib_dereg_mr(ibmr->mr);
+}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 000000000000..a1931f0027a2
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static struct kmem_cache *rds_iw_incoming_slab;
+static struct kmem_cache *rds_iw_frag_slab;
+static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
+
+static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_iw_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page != NULL);
+ kmem_cache_free(rds_iw_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get receive completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
+ struct rds_iw_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
+
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_recv_work *recv;
+ u32 i;
+
+ for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+ struct ib_sge *sge;
+
+ recv->r_iwinc = NULL;
+ recv->r_frag = NULL;
+
+ recv->r_wr.next = NULL;
+ recv->r_wr.wr_id = i;
+ recv->r_wr.sg_list = recv->r_sge;
+ recv->r_wr.num_sge = RDS_IW_RECV_SGE;
+
+ sge = rds_iw_data_sge(ic, recv->r_sge);
+ sge->addr = 0;
+ sge->length = RDS_FRAG_SIZE;
+ sge->lkey = 0;
+
+ sge = rds_iw_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = 0;
+ }
+}
+
+static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
+ struct rds_iw_recv_work *recv)
+{
+ if (recv->r_iwinc) {
+ rds_inc_put(&recv->r_iwinc->ii_inc);
+ recv->r_iwinc = NULL;
+ }
+ if (recv->r_frag) {
+ rds_iw_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_iw_frag_drop_page(recv->r_frag);
+ rds_iw_frag_free(recv->r_frag);
+ recv->r_frag = NULL;
+ }
+}
+
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
+{
+ u32 i;
+
+ for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+ rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
+
+ if (ic->i_frag.f_page)
+ rds_iw_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_iw_recv_refill_one(struct rds_connection *conn,
+ struct rds_iw_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
+ struct ib_sge *sge;
+ int ret = -ENOMEM;
+
+ if (recv->r_iwinc == NULL) {
+ if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
+ rds_iw_stats_inc(s_iw_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
+ kptr_gfp);
+ if (recv->r_iwinc == NULL)
+ goto out;
+ atomic_inc(&rds_iw_allocation);
+ INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
+ rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
+ }
+
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
+
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (ic->i_frag.f_page == NULL)
+ goto out;
+ ic->i_frag.f_offset = 0;
+ }
+
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+ goto out;
+
+ /*
+ * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
+
+ sge = rds_iw_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_iw_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->length = sizeof(struct rds_header);
+
+ get_page(recv->r_frag->f_page);
+
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
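The refill path above carves each receive page into RDS_FRAG_SIZE chunks, advancing i_frag.f_offset until RDS_PAGE_LAST_OFF is reached and then dropping the page reference. A minimal userspace sketch of that offset arithmetic, using assumed illustrative values for PAGE_SIZE and RDS_FRAG_SIZE (the real constants come from the kernel headers and rds.h):

#include <stdio.h>

#define PAGE_SIZE          16384   /* assumed for illustration: a 16K page */
#define RDS_FRAG_SIZE       4096   /* assumed fragment size */
#define RDS_PAGE_LAST_OFF  (PAGE_SIZE - RDS_FRAG_SIZE)

int main(void)
{
	unsigned int offset = 0;
	int frags = 0;

	/* Mirror of the offset stepping in rds_iw_recv_refill_one(): each
	 * posted recv consumes one fragment; the page is released once the
	 * last fragment (at RDS_PAGE_LAST_OFF) has been handed out. */
	for (;;) {
		printf("frag %d uses page offset %u\n", frags++, offset);
		if (offset < RDS_PAGE_LAST_OFF) {
			offset += RDS_FRAG_SIZE;
		} else {
			printf("page exhausted after %d frags, drop page ref\n",
			       frags);
			break;
		}
	}
	return 0;
}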
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_recv_work *recv;
+ struct ib_recv_wr *failed_wr;
+ unsigned int posted = 0;
+ int ret = 0;
+ u32 pos;
+
+ while ((prefill || rds_conn_up(conn))
+ && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ if (pos >= ic->i_recv_ring.w_nr) {
+ printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+ pos);
+ ret = -EINVAL;
+ break;
+ }
+
+ recv = &ic->i_recvs[pos];
+ ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+ if (ret) {
+ ret = -1;
+ break;
+ }
+
+ /* XXX when can this fail? */
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
+ recv->r_iwinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
+ if (ret) {
+ rds_iw_conn_error(conn, "recv post on "
+ "%pI4 returned %d, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ ret);
+ ret = -1;
+ break;
+ }
+
+ posted++;
+ }
+
+ /* We're doing flow control - update the window. */
+ if (ic->i_flowctl && posted)
+ rds_iw_advertise_credits(conn, posted);
+
+ if (ret)
+ rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
+}
+
+void rds_iw_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_iw_incoming *iwinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+ rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
+
+ list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_iw_frag_drop_page(frag);
+ rds_iw_frag_free(frag);
+ }
+}
+
+void rds_iw_inc_free(struct rds_incoming *inc)
+{
+ struct rds_iw_incoming *iwinc;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+
+ rds_iw_inc_purge(inc);
+ rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
+ BUG_ON(!list_empty(&iwinc->ii_frags));
+ kmem_cache_free(rds_iw_incoming_slab, iwinc);
+ atomic_dec(&rds_iw_allocation);
+ BUG_ON(atomic_read(&rds_iw_allocation) < 0);
+}
+
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
+{
+ struct rds_iw_incoming *iwinc;
+ struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
+ unsigned long to_copy;
+ unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
+ int copied = 0;
+ int ret;
+ u32 len;
+
+ iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+ frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+ len = be32_to_cpu(inc->i_hdr.h_len);
+
+ while (copied < size && copied < len) {
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
+ /* XXX needs + offset for multiple recvs per page */
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+
+ iov_off += to_copy;
+ frag_off += to_copy;
+ copied += to_copy;
+ }
+
+ return copied;
+}
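The copy loop above advances through two independent streams - the chain of RDS_FRAG_SIZE fragments and the caller's iovec array - taking the largest chunk that fits all three limits on every pass. A small standalone sketch of that min-of-three chunking, with made-up fragment and iovec sizes:

#include <stdio.h>

#define FRAG_SIZE 4096				/* stands in for RDS_FRAG_SIZE */

int main(void)
{
	unsigned long iov_len[] = { 1000, 6000, 3000 };	/* hypothetical iovec lengths */
	unsigned long msg_len = 9000;			/* hypothetical h_len */
	unsigned long copied = 0, frag_off = 0, iov_off = 0;
	int iov = 0;

	while (copied < msg_len) {
		unsigned long to_copy;

		if (frag_off == FRAG_SIZE)		/* next fragment: reset offset */
			frag_off = 0;
		while (iov_off == iov_len[iov]) {	/* move to the next iovec */
			iov_off = 0;
			iov++;
		}

		/* Chunk size is limited by the iovec, the fragment and the
		 * message, exactly as in rds_iw_inc_copy_to_user(). */
		to_copy = iov_len[iov] - iov_off;
		if (to_copy > FRAG_SIZE - frag_off)
			to_copy = FRAG_SIZE - frag_off;
		if (to_copy > msg_len - copied)
			to_copy = msg_len - copied;

		printf("copy %lu bytes (iov %d off %lu, frag off %lu)\n",
		       to_copy, iov, iov_off, frag_off);

		iov_off += to_copy;
		frag_off += to_copy;
		copied += to_copy;
	}
	return 0;
}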
+
+/* ic starts out kzalloc()ed */
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
+{
+ struct ib_send_wr *wr = &ic->i_ack_wr;
+ struct ib_sge *sge = &ic->i_ack_sge;
+
+ sge->addr = ic->i_ack_dma;
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+ wr->opcode = IB_WR_SEND;
+ wr->wr_id = RDS_IW_ACK_WR_ID;
+ wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received. The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory. This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed. This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue. To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time. This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight. This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame. This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do. The QP attribute specifically makes
+ * room for it beyond the ring size. Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+ int ack_required)
+{
+ rds_iw_set_64bit(&ic->i_ack_next, seq);
+ if (ack_required) {
+ smp_mb__before_clear_bit();
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ }
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ smp_mb__after_clear_bit();
+
+ return ic->i_ack_next;
+}
+
+static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
+{
+ struct rds_header *hdr = ic->i_ack;
+ struct ib_send_wr *failed_wr;
+ u64 seq;
+ int ret;
+
+ seq = rds_iw_get_ack(ic);
+
+ rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+ rds_message_populate_header(hdr, 0, 0, 0);
+ hdr->h_ack = cpu_to_be64(seq);
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ ic->i_ack_queued = jiffies;
+
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ if (unlikely(ret)) {
+ /* Failed to send. Release the WR, and
+ * force another ACK.
+ */
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+ rds_iw_stats_inc(s_iw_ack_send_failure);
+ /* Need to finesse this later. */
+ BUG();
+ } else
+ rds_iw_stats_inc(s_iw_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ * 1. We call rds_iw_attempt_ack from the recv completion handler
+ * to send an ACK-only frame.
+ * However, there can be only one such frame in the send queue
+ * at any time, so we may have to postpone it.
+ * 2. When another (data) packet is transmitted while there's
+ * an ACK in the queue, we piggyback the ACK sequence number
+ * on the data packet.
+ * 3. If the ACK WR is done sending, we get called from the
+ * send queue completion handler, and check whether there's
+ * another ACK pending (postponed because the WR was on the
+ * queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ * - i_ack_flags, which keeps track of whether the ACK WR
+ * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ * - i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_iw_attempt_ack(struct rds_iw_connection *ic)
+{
+ unsigned int adv_credits;
+
+ if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ return;
+
+ if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+ rds_iw_stats_inc(s_iw_ack_send_delayed);
+ return;
+ }
+
+ /* Can we get a send credit? */
+ if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
+ rds_iw_stats_inc(s_iw_tx_throttle);
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ return;
+ }
+
+ clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+ rds_iw_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
+{
+ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+ rds_iw_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
+{
+ if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+ rds_iw_stats_inc(s_iw_ack_send_piggybacked);
+ return rds_iw_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps. We could have posted the bitmaps and rdma written into
+ * them. But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient. By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_iw_cong_recv(struct rds_connection *conn,
+ struct rds_iw_incoming *iwinc)
+{
+ struct rds_cong_map *map;
+ unsigned int map_off;
+ unsigned int map_page;
+ struct rds_page_frag *frag;
+ unsigned long frag_off;
+ unsigned long to_copy;
+ unsigned long copied;
+ uint64_t uncongested = 0;
+ void *addr;
+
+ /* catch completely corrupt packets */
+ if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+ return;
+
+ map = conn->c_fcong;
+ map_page = 0;
+ map_off = 0;
+
+ frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+ frag_off = 0;
+
+ copied = 0;
+
+ while (copied < RDS_CONG_MAP_BYTES) {
+ uint64_t *src, *dst;
+ unsigned int k;
+
+ to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+
+ src = addr + frag_off;
+ dst = (void *)map->m_page_addrs[map_page] + map_off;
+ for (k = 0; k < to_copy; k += 8) {
+ /* Record ports that became uncongested, i.e.
+ * map bits that went from 1 (congested) to 0. */
+ uncongested |= ~(*src) & *dst;
+ *dst++ = *src++;
+ }
+ kunmap_atomic(addr, KM_SOFTIRQ0);
+
+ copied += to_copy;
+
+ map_off += to_copy;
+ if (map_off == PAGE_SIZE) {
+ map_off = 0;
+ map_page++;
+ }
+
+ frag_off += to_copy;
+ if (frag_off == RDS_FRAG_SIZE) {
+ frag = list_entry(frag->f_item.next,
+ struct rds_page_frag, f_item);
+ frag_off = 0;
+ }
+ }
+
+ /* the congestion map is in little endian order */
+ uncongested = le64_to_cpu(uncongested);
+
+ rds_cong_map_updated(map, uncongested);
+}
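A standalone illustration of the bit arithmetic above: ~(*src) & *dst picks out exactly the positions that were set in the old map but are clear in the new one, i.e. ports that just became uncongested (the values below are made up, and the set-bit-means-congested convention is an assumption based on how the result is fed to rds_cong_map_updated()):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical 8-port slices of the old and new congestion maps. */
	uint64_t old_map = 0xb1;	/* 1011 0001 */
	uint64_t new_map = 0x93;	/* 1001 0011 */
	uint64_t uncongested;

	/* Same expression as the copy loop: bits set in the old map but
	 * clear in the new one. */
	uncongested = ~new_map & old_map;

	printf("uncongested ports mask: 0x%llx\n",
	       (unsigned long long)uncongested);	/* prints 0x20: port 5 cleared */
	return 0;
}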
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_iw_ack_state {
+ u64 ack_next;
+ u64 ack_recv;
+ unsigned int ack_required:1;
+ unsigned int ack_next_valid:1;
+ unsigned int ack_recv_valid:1;
+};
+
+static void rds_iw_process_recv(struct rds_connection *conn,
+ struct rds_iw_recv_work *recv, u32 byte_len,
+ struct rds_iw_ack_state *state)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_incoming *iwinc = ic->i_iwinc;
+ struct rds_header *ihdr, *hdr;
+
+ /* XXX shut down the connection if port 0,0 are seen? */
+
+ rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
+ byte_len);
+
+ if (byte_len < sizeof(struct rds_header)) {
+ rds_iw_conn_error(conn, "incoming message "
+ "from %pI4 didn't include a "
+ "header, disconnecting and "
+ "reconnecting\n",
+ &conn->c_faddr);
+ return;
+ }
+ byte_len -= sizeof(struct rds_header);
+
+ ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+ /* Validate the checksum. */
+ if (!rds_message_verify_checksum(ihdr)) {
+ rds_iw_conn_error(conn, "incoming message "
+ "from %pI4 has corrupted header - "
+ "forcing a reconnect\n",
+ &conn->c_faddr);
+ rds_stats_inc(s_recv_drop_bad_checksum);
+ return;
+ }
+
+ /* Process the ACK sequence which comes with every packet */
+ state->ack_recv = be64_to_cpu(ihdr->h_ack);
+ state->ack_recv_valid = 1;
+
+ /* Process the credits update if there was one */
+ if (ihdr->h_credit)
+ rds_iw_send_add_credits(conn, ihdr->h_credit);
+
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+ /* This is an ACK-only packet. It gets special treatment
+ * here because historically, ACKs were rather special
+ * beasts.
+ */
+ rds_iw_stats_inc(s_iw_ack_received);
+
+ /*
+ * Usually the frags make their way on to incs and are then freed as
+ * the inc is freed. We don't go that route, so we have to drop the
+ * page ref ourselves. We can't just leave the page on the recv
+ * because that confuses the dma mapping of pages and each recv's use
+ * of a partial page. We can leave the frag, though, it will be
+ * reused.
+ *
+ * FIXME: Fold this into the code path below.
+ */
+ rds_iw_frag_drop_page(recv->r_frag);
+ return;
+ }
+
+ /*
+ * If we don't already have an inc on the connection then this
+ * fragment has a header and starts a message. Copy its header
+ * into the inc and save the inc so we can hang upcoming fragments
+ * off its list.
+ */
+ if (iwinc == NULL) {
+ iwinc = recv->r_iwinc;
+ recv->r_iwinc = NULL;
+ ic->i_iwinc = iwinc;
+
+ hdr = &iwinc->ii_inc.i_hdr;
+ memcpy(hdr, ihdr, sizeof(*hdr));
+ ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+ rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
+ ic->i_recv_data_rem, hdr->h_flags);
+ } else {
+ hdr = &iwinc->ii_inc.i_hdr;
+ /* We can't just use memcmp here; fragments of a
+ * single message may carry different ACKs */
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
+ rds_iw_conn_error(conn,
+ "fragment header mismatch; forcing reconnect\n");
+ return;
+ }
+ }
+
+ list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
+ recv->r_frag = NULL;
+
+ if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+ ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+ else {
+ ic->i_recv_data_rem = 0;
+ ic->i_iwinc = NULL;
+
+ if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ rds_iw_cong_recv(conn, iwinc);
+ else {
+ rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ &iwinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
+ state->ack_next = be64_to_cpu(hdr->h_sequence);
+ state->ack_next_valid = 1;
+ }
+
+ /* Evaluate the ACK_REQUIRED flag *after* we received
+ * the complete frame, and after bumping the next_rx
+ * sequence. */
+ if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+ rds_stats_inc(s_recv_ack_required);
+ state->ack_required = 1;
+ }
+
+ rds_inc_put(&iwinc->ii_inc);
+ }
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring. Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_ack_state state = { 0, };
+ struct rds_iw_recv_work *recv;
+
+ rdsdebug("conn %p cq %p\n", conn, cq);
+
+ rds_iw_stats_inc(s_iw_rx_cq_call);
+
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_iw_stats_inc(s_iw_rx_cq_event);
+
+ recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
+
+ rds_iw_recv_unmap_page(ic, recv);
+
+ /*
+ * Also process recvs in connecting state because it is possible
+ * to get a recv completion _before_ the rdmacm ESTABLISHED
+ * event is processed.
+ */
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_iw_process_recv(conn, recv, wc.byte_len, &state);
+ } else {
+ rds_iw_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
+ }
+
+ rds_iw_ring_free(&ic->i_recv_ring, 1);
+ }
+
+ if (state.ack_next_valid)
+ rds_iw_set_ack(ic, state.ack_next, state.ack_required);
+ if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+ rds_send_drop_acked(conn, state.ack_recv, NULL);
+ ic->i_ack_recv = state.ack_recv;
+ }
+ if (rds_conn_up(conn))
+ rds_iw_attempt_ack(ic);
+
+ /* If we ever end up with a really empty receive ring, we're
+ * in deep trouble, as the sender will definitely see RNR
+ * timeouts. */
+ if (rds_iw_ring_empty(&ic->i_recv_ring))
+ rds_iw_stats_inc(s_iw_rx_ring_empty);
+
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
+ if (rds_iw_ring_low(&ic->i_recv_ring))
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+int rds_iw_recv(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ int ret = 0;
+
+ rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_iw_stats_inc(s_iw_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
+ if (rds_conn_up(conn))
+ rds_iw_attempt_ack(ic);
+
+ return ret;
+}
+
+int __init rds_iw_recv_init(void)
+{
+ struct sysinfo si;
+ int ret = -ENOMEM;
+
+ /* Default to roughly one third of all available RAM for recv memory */
+ si_meminfo(&si);
+ rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+ rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
+ sizeof(struct rds_iw_incoming),
+ 0, 0, NULL);
+ if (rds_iw_incoming_slab == NULL)
+ goto out;
+
+ rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
+ sizeof(struct rds_page_frag),
+ 0, 0, NULL);
+ if (rds_iw_frag_slab == NULL)
+ kmem_cache_destroy(rds_iw_incoming_slab);
+ else
+ ret = 0;
+out:
+ return ret;
+}
+
+void rds_iw_recv_exit(void)
+{
+ kmem_cache_destroy(rds_iw_incoming_slab);
+ kmem_cache_destroy(rds_iw_frag_slab);
+}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 000000000000..d422d4b5deef
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
+
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
+{
+ memset(ring, 0, sizeof(*ring));
+ ring->w_nr = nr;
+ rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
+{
+ u32 diff;
+
+ /* This assumes that atomic_t has at least as many bits as u32 */
+ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+ BUG_ON(diff > ring->w_nr);
+
+ return diff;
+}
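The used-entry computation above relies on unsigned wrap-around: both counters are free-running u32 values, so their difference stays correct even after either one overflows. A tiny userspace demonstration of that property:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Free-running counters, as in struct rds_iw_work_ring; the free
	 * counter has already wrapped past UINT32_MAX in this example. */
	uint32_t alloc_ctr = 5;			/* wrapped: really 2^32 + 5 allocations */
	uint32_t free_ctr  = UINT32_MAX - 2;	/* 3 fewer frees */

	/* Same expression as __rds_iw_ring_used(): modular subtraction
	 * still yields the number of outstanding entries. */
	uint32_t used = alloc_ctr - free_ctr;

	printf("entries in use: %u\n", used);	/* prints 8 */
	return 0;
}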
+
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
+{
+ /* We only ever get called from the connection setup code,
+ * prior to creating the QP. */
+ BUG_ON(__rds_iw_ring_used(ring));
+ ring->w_nr = nr;
+}
+
+static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_used(ring) == 0;
+}
+
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
+{
+ u32 ret = 0, avail;
+
+ avail = ring->w_nr - __rds_iw_ring_used(ring);
+
+ rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+ ring->w_alloc_ptr, avail);
+
+ if (val && avail) {
+ ret = min(val, avail);
+ *pos = ring->w_alloc_ptr;
+
+ ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+ ring->w_alloc_ctr += ret;
+ }
+
+ return ret;
+}
+
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
+{
+ ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+ atomic_add(val, &ring->w_free_ctr);
+
+ if (__rds_iw_ring_empty(ring) &&
+ waitqueue_active(&rds_iw_ring_empty_wait))
+ wake_up(&rds_iw_ring_empty_wait);
+}
+
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
+{
+ ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+ ring->w_alloc_ctr -= val;
+}
+
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_empty(ring);
+}
+
+int rds_iw_ring_low(struct rds_iw_work_ring *ring)
+{
+ return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
+}
+
+
+/*
+ * returns the oldest alloced ring entry. This will be the next one
+ * freed. This can't be called if there are none allocated.
+ */
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
+{
+ return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
+{
+ u32 ret;
+
+ if (oldest <= (unsigned long long)wr_id)
+ ret = (unsigned long long)wr_id - oldest + 1;
+ else
+ ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+ rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+ wr_id, oldest);
+ return ret;
+}
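rds_iw_ring_completed() has to cope with the completed wr_id wrapping past the end of the ring relative to the oldest outstanding entry. A minimal sketch of both branches with made-up ring positions:

#include <stdio.h>
#include <stdint.h>

/* Same arithmetic as rds_iw_ring_completed(), with the ring size passed in. */
static uint32_t completed(uint32_t w_nr, uint32_t wr_id, uint32_t oldest)
{
	if (oldest <= wr_id)
		return wr_id - oldest + 1;
	/* wr_id wrapped around the end of the ring */
	return w_nr - oldest + wr_id + 1;
}

int main(void)
{
	/* Hypothetical 16-entry ring. */
	printf("%u\n", completed(16, 7, 5));	/* no wrap: entries 5..7  -> 3 */
	printf("%u\n", completed(16, 1, 14));	/* wrap: 14, 15, 0, 1     -> 4 */
	return 0;
}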
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 000000000000..22dd38ffd608
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+
+#include "rds.h"
+#include "rdma.h"
+#include "iw.h"
+
+static void rds_iw_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
+{
+ int notify_status;
+
+ switch (wc_status) {
+ case IB_WC_WR_FLUSH_ERR:
+ return;
+
+ case IB_WC_SUCCESS:
+ notify_status = RDS_RDMA_SUCCESS;
+ break;
+
+ case IB_WC_REM_ACCESS_ERR:
+ notify_status = RDS_RDMA_REMOTE_ERROR;
+ break;
+
+ default:
+ notify_status = RDS_RDMA_OTHER_ERROR;
+ break;
+ }
+ rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
+ struct rds_rdma_op *op)
+{
+ if (op->r_mapped) {
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
+ }
+}
+
+static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
+ struct rds_iw_send_work *send,
+ int wc_status)
+{
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->m_rdma_op != NULL) {
+ rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_iw_send_rdma_complete(rm, wc_status);
+
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+ }
+
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
+
+ rds_message_put(rm);
+ send->s_rm = NULL;
+}
+
+void rds_iw_send_init_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ struct ib_sge *sge;
+
+ send->s_rm = NULL;
+ send->s_op = NULL;
+ send->s_mapping = NULL;
+
+ send->s_wr.next = NULL;
+ send->s_wr.wr_id = i;
+ send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
+ send->s_wr.ex.imm_data = 0;
+
+ sge = rds_iw_data_sge(ic, send->s_sge);
+ sge->lkey = 0;
+
+ sge = rds_iw_header_sge(ic, send->s_sge);
+ sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = 0;
+
+ send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+ if (IS_ERR(send->s_mr)) {
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+ break;
+ }
+
+ send->s_page_list = ib_alloc_fast_reg_page_list(
+ ic->i_cm_id->device, fastreg_message_size);
+ if (IS_ERR(send->s_page_list)) {
+ printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
+ break;
+ }
+ }
+}
+
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
+{
+ struct rds_iw_send_work *send;
+ u32 i;
+
+ for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+ BUG_ON(!send->s_mr);
+ ib_dereg_mr(send->s_mr);
+ BUG_ON(!send->s_page_list);
+ ib_free_fast_reg_page_list(send->s_page_list);
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_iw_send_unmap_rdma(ic, send->s_op);
+ }
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path. As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+ struct rds_connection *conn = context;
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_send_work *send;
+ u32 completed;
+ u32 oldest;
+ u32 i;
+ int ret;
+
+ rdsdebug("cq %p conn %p\n", cq, conn);
+ rds_iw_stats_inc(s_iw_tx_cq_call);
+ ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+ be32_to_cpu(wc.ex.imm_data));
+ rds_iw_stats_inc(s_iw_tx_cq_event);
+
+ if (wc.status != IB_WC_SUCCESS) {
+ printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
+ break;
+ }
+
+ if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
+ ic->i_fastreg_posted = 0;
+ continue;
+ }
+
+ if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
+ ic->i_fastreg_posted = 1;
+ continue;
+ }
+
+ if (wc.wr_id == RDS_IW_ACK_WR_ID) {
+ if (ic->i_ack_queued + HZ/2 < jiffies)
+ rds_iw_stats_inc(s_iw_tx_stalled);
+ rds_iw_ack_send_complete(ic);
+ continue;
+ }
+
+ oldest = rds_iw_ring_oldest(&ic->i_send_ring);
+
+ completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+ for (i = 0; i < completed; i++) {
+ send = &ic->i_sends[oldest];
+
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_iw_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_FAST_REG_MR:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
+
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (send->s_queued + HZ/2 < jiffies)
+ rds_iw_stats_inc(s_iw_tx_stalled);
+
+ /* If a RDMA operation produced an error, signal this right
+ * away. If we don't, the subsequent SEND that goes with this
+ * RDMA will be canceled with ERR_WFLUSH, and the application
+ * will never learn that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_iw_send_rdma_complete(rm, wc.status);
+ }
+
+ oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+ }
+
+ rds_iw_ring_free(&ic->i_send_ring, completed);
+
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ /* We expect errors as the qp is drained during shutdown */
+ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+ rds_iw_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
+ }
+ }
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ * - send credits: this tells us how many WRs we're allowed
+ * to submit without overruning the reciever's queue. For
+ * each SEND WR we post, we decrement this by one.
+ *
+ * - posted credits: this tells us how many WRs we recently
+ * posted to the receive queue. This value is transferred
+ * to the peer as a "credit update" in a RDS header field.
+ * Every time we transmit credits to the peer, we subtract
+ * the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_iw_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter. Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
+ u32 wanted, u32 *adv_credits, int need_posted)
+{
+ unsigned int avail, posted, got = 0, advertise;
+ long oldval, newval;
+
+ *adv_credits = 0;
+ if (!ic->i_flowctl)
+ return wanted;
+
+try_again:
+ advertise = 0;
+ oldval = newval = atomic_read(&ic->i_credits);
+ posted = IB_GET_POST_CREDITS(oldval);
+ avail = IB_GET_SEND_CREDITS(oldval);
+
+ rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
+ wanted, avail, posted);
+
+ /* The last credit must be used to send a credit update. */
+ if (avail && !posted)
+ avail--;
+
+ if (avail < wanted) {
+ struct rds_connection *conn = ic->i_cm_id->context;
+
+ /* Oops, there aren't that many credits left! */
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ got = avail;
+ } else {
+ /* Sometimes you get what you want, lalala. */
+ got = wanted;
+ }
+ newval -= IB_SET_SEND_CREDITS(got);
+
+ /*
+ * If need_posted is non-zero, then the caller wants the
+ * posted credits advertised regardless of whether any send
+ * credits are available.
+ */
+ if (posted && (got || need_posted)) {
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
+ newval -= IB_SET_POST_CREDITS(advertise);
+ }
+
+ /* Finally bill everything */
+ if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+ goto try_again;
+
+ *adv_credits = advertise;
+ return got;
+}
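The grab-credits path works because both counters live in one atomic word, so a single atomic_cmpxchg can claim send credits and the pending posted-credit advertisement together. The IB_SET_*/IB_GET_* helpers are defined elsewhere (iw.h, not part of this hunk); the sketch below assumes a plausible packing - posted credits in the high 16 bits, send credits in the low 16 - purely to illustrate the idea:

#include <stdio.h>
#include <stdint.h>

/* Assumed packing, for illustration only: low half = send credits,
 * high half = posted (not yet advertised) credits. */
#define SET_SEND_CREDITS(v)	((v) & 0xffff)
#define SET_POST_CREDITS(v)	((v) << 16)
#define GET_SEND_CREDITS(v)	((v) & 0xffff)
#define GET_POST_CREDITS(v)	((v) >> 16)

int main(void)
{
	uint32_t credits = 0;

	/* Receive path: peer granted 10 send credits, and we posted 3
	 * new receive buffers we still have to advertise. */
	credits += SET_SEND_CREDITS(10);
	credits += SET_POST_CREDITS(3);

	printf("send=%u posted=%u\n",
	       GET_SEND_CREDITS(credits), GET_POST_CREDITS(credits));

	/* Send path: consume 2 send credits and advertise all posted
	 * credits in one update, as grab_credits does under cmpxchg. */
	credits -= SET_SEND_CREDITS(2);
	credits -= SET_POST_CREDITS(GET_POST_CREDITS(credits));

	printf("send=%u posted=%u\n",
	       GET_SEND_CREDITS(credits), GET_POST_CREDITS(credits));
	return 0;
}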
+
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (credits == 0)
+ return;
+
+ rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
+ credits,
+ IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+ test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+ atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+ WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+ rds_iw_stats_inc(s_iw_rx_credit_updates);
+}
+
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ if (posted == 0)
+ return;
+
+ atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+ /* Decide whether to send an update to the peer now.
+ * If we would send a credit update for every single buffer we
+ * post, we would end up with an ACK storm (ACK arrives,
+ * consumes buffer, we refill the ring, send ACK to remote
+ * advertising the newly posted buffer... ad inf)
+ *
+ * Performance pretty much depends on how often we send
+ * credit updates - too frequent updates mean lots of ACKs.
+ * Too infrequent updates, and the peer will run out of
+ * credits and has to throttle.
+ * For the time being, 16 seems to be a good compromise.
+ */
+ if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+ set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
+ struct rds_iw_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
+{
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_iw_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+
+ sge = rds_iw_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
+ }
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = rds_iw_local_dma_lkey(ic);
+}
+
+/*
+ * This can be called multiple times for a given message. The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests. We translate the scatterlist into a series
+ * of work requests that fragment the message. These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection. This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_device *dev = ic->i_cm_id->device;
+ struct rds_iw_send_work *send = NULL;
+ struct rds_iw_send_work *first;
+ struct rds_iw_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct scatterlist *scat;
+ u32 pos;
+ u32 i;
+ u32 work_alloc;
+ u32 credit_alloc;
+ u32 posted;
+ u32 adv_credits = 0;
+ int send_flags = 0;
+ int sent;
+ int ret;
+ int flow_controlled = 0;
+
+ BUG_ON(off % RDS_FRAG_SIZE);
+ BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+ /* Fastreg support */
+ if (rds_rdma_cookie_key(rm->m_rdma_cookie)
+ && !ic->i_fastreg_posted) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /* FIXME we may overallocate here */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+ i = 1;
+ else
+ i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc == 0) {
+ set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ credit_alloc = work_alloc;
+ if (ic->i_flowctl) {
+ credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
+ adv_credits += posted;
+ if (credit_alloc < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+ work_alloc = credit_alloc;
+ flow_controlled++;
+ }
+ if (work_alloc == 0) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_throttle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* map the message the first time we see it */
+ if (ic->i_rm == NULL) {
+ /*
+ printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
+ rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ } else {
+ rm->m_count = 0;
+ }
+
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+ rds_message_addref(rm);
+ ic->i_rm = rm;
+
+ /* Finalize the header */
+ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+ if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+ rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+ /* If it has a RDMA op, tell the peer we did it. This is
+ * used by the peer to release use-once RDMA MRs. */
+ if (rm->m_rdma_op) {
+ struct rds_ext_header_rdma ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ }
+ if (rm->m_rdma_cookie) {
+ rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ }
+
+ /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
+ * we should not do this unless we have a chance of at least
+ * sticking the header into the send ring. Which is why we
+ * should call rds_iw_ring_alloc first. */
+ rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
+ rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+ /*
+ * Update adv_credits since we reset the ACK_REQUIRED bit.
+ */
+ rds_iw_send_grab_credits(ic, 0, &posted, 1);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ } else if (ic->i_rm != rm)
+ BUG();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
+
+ /* Sometimes you want to put a fence between an RDMA
+ * READ and the following SEND.
+ * We could either do this all the time
+ * or when requested by the user. Right now, we let
+ * the application choose.
+ */
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+ send_flags = IB_SEND_FENCE;
+
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
+
+ /* handle a 0-len message */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
+
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
+
+ send = &ic->i_sends[pos];
+
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_iw_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /*
+ * Always signal the last one if we're stopping due to flow control.
+ */
+ if (flow_controlled && i == (work_alloc-1))
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ /* add credit and redo the header checksum */
+ hdr->h_credit = adv_credits;
+ rds_message_make_checksum(hdr);
+ adv_credits = 0;
+ rds_iw_stats_inc(s_iw_tx_credit_updates);
+ }
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+ prev = send;
+
+ pos = (pos + 1) % ic->i_send_ring.w_nr;
+ }
+
+ /* Account the RDS header in the number of bytes we sent, but just once.
+ * The caller has no concept of fragmentation. */
+ if (hdr_off == 0)
+ sent += sizeof(struct rds_header);
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
+ }
+
+ if (i < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+ if (ic->i_flowctl && i < credit_alloc)
+ rds_iw_send_add_credits(conn, credit_alloc - i);
+
+ /* XXX need to worry about failed_wr and partial sends. */
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
+ }
+ goto out;
+ }
+
+ ret = sent;
+out:
+ BUG_ON(adv_credits);
+ return ret;
+}
+
+static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
+{
+ BUG_ON(nent > send->s_page_list->max_page_list_len);
+ /*
+ * Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR.
+ */
+ send->s_wr.opcode = IB_WR_FAST_REG_MR;
+ send->s_wr.wr.fast_reg.length = len;
+ send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
+ send->s_wr.wr.fast_reg.page_list = send->s_page_list;
+ send->s_wr.wr.fast_reg.page_list_len = nent;
+ send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
+ send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
+ send->s_wr.wr.fast_reg.iova_start = sg_addr;
+
+ ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
+}
+
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+ struct rds_iw_send_work *send = NULL;
+ struct rds_iw_send_work *first;
+ struct rds_iw_send_work *prev;
+ struct ib_send_wr *failed_wr;
+ struct rds_iw_device *rds_iwdev;
+ struct scatterlist *scat;
+ unsigned long len;
+ u64 remote_addr = op->r_remote_addr;
+ u32 pos, fr_pos;
+ u32 work_alloc;
+ u32 i;
+ u32 j;
+ int sent;
+ int ret;
+ int num_sge;
+
+ rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
+ rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+
+ op->r_mapped = 1;
+ }
+
+ if (!op->r_write) {
+ /* Alloc space on the send queue for the fastreg */
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
+ if (work_alloc != 1) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * Instead of knowing how to return a partial rdma read/write we insist that there
+ * be enough work requests to send the entire message.
+ */
+ i = ceil(op->r_count, rds_iwdev->max_sge);
+
+ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+ if (work_alloc != i) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ rds_iw_stats_inc(s_iw_tx_ring_full);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ send = &ic->i_sends[pos];
+ if (!op->r_write) {
+ first = prev = &ic->i_sends[fr_pos];
+ } else {
+ first = send;
+ prev = NULL;
+ }
+ scat = &op->r_sg[0];
+ sent = 0;
+ num_sge = op->r_count;
+
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+ send->s_wr.send_flags = 0;
+ send->s_queued = jiffies;
+
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
+
+ /* To avoid needing extra plumbing to invalidate the fastreg_mr used
+ * for local access once RDS is finished with it, IB_WR_RDMA_READ_WITH_INV
+ * is used so the MR is invalidated automatically when the read completes.
+ */
+ if (op->r_write)
+ send->s_wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+
+ send->s_wr.wr.rdma.remote_addr = remote_addr;
+ send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_op = op;
+
+ if (num_sge > rds_iwdev->max_sge) {
+ send->s_wr.num_sge = rds_iwdev->max_sge;
+ num_sge -= rds_iwdev->max_sge;
+ } else
+ send->s_wr.num_sge = num_sge;
+
+ send->s_wr.next = NULL;
+
+ if (prev)
+ prev->s_wr.next = &send->s_wr;
+
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+ len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+
+ if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
+ send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ else {
+ send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
+ send->s_sge[j].length = len;
+ send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
+ }
+
+ sent += len;
+ rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+ remote_addr += len;
+
+ scat++;
+ }
+
+ if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
+ send->s_wr.num_sge = 1;
+ send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
+ send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
+ send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
+ }
+
+ rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+ &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+ prev = send;
+ if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+ send = ic->i_sends;
+ }
+
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->r_sg[op->r_count])
+ first->s_wr.send_flags = IB_SEND_SIGNALED;
+
+ if (i < work_alloc) {
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+ work_alloc = i;
+ }
+
+ /* On iWARP, local memory access by a remote system (i.e., RDMA Read) is not
+ * recommended. Putting the lkey on the wire is a security hole, as it can
+ * allow access to all of the memory on the remote system. Some adapters do
+ * not allow using the lkey for this at all. To work around this, use a
+ * fastreg_mr (or possibly a dma_mr).
+ */
+ if (!op->r_write) {
+ rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
+ op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ work_alloc++;
+ }
+
+ failed_wr = &first->s_wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+ first, &first->s_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_wr);
+ if (ret) {
+ printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
+ "returned %d\n", &conn->c_faddr, ret);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+ goto out;
+ }
+
+out:
+ return ret;
+}
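
The comment in rds_iw_xmit_rdma() above insists that the whole RDMA operation fit in the send ring, so the number of work requests allocated is the mapped scatterlist count divided by the device's max_sge, rounded up, and the copy loop then doles out at most max_sge entries per WR. A standalone sketch of that arithmetic, assuming ceil() is the usual round-up division helper from rds.h:

#include <stdio.h>

/* round-up division, as the rds ceil() helper is assumed to behave */
static unsigned int ceil_div(unsigned int x, unsigned int y)
{
	return (x + y - 1) / y;
}

int main(void)
{
	unsigned int r_count = 10;	/* mapped scatterlist entries for the op */
	unsigned int max_sge = 4;	/* device limit on SGEs per work request */
	unsigned int wrs = ceil_div(r_count, max_sge);
	unsigned int left = r_count, i;

	printf("%u SGEs, %u per WR -> %u WRs\n", r_count, max_sge, wrs);
	for (i = 0; i < wrs; i++) {
		unsigned int this_wr = left > max_sge ? max_sge : left;

		printf("  WR %u carries %u SGEs\n", i, this_wr);
		left -= this_wr;
	}
	return 0;
}
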
+
+void rds_iw_xmit_complete(struct rds_connection *conn)
+{
+ struct rds_iw_connection *ic = conn->c_transport_data;
+
+ /* We may have a pending ACK or window update we were unable
+ * to send previously (due to flow control). Try again. */
+ rds_iw_attempt_ack(ic);
+}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 000000000000..ccc7e8f0bf0e
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "iw.h"
+
+DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
+
+static char *rds_iw_stat_names[] = {
+ "iw_connect_raced",
+ "iw_listen_closed_stale",
+ "iw_tx_cq_call",
+ "iw_tx_cq_event",
+ "iw_tx_ring_full",
+ "iw_tx_throttle",
+ "iw_tx_sg_mapping_failure",
+ "iw_tx_stalled",
+ "iw_tx_credit_updates",
+ "iw_rx_cq_call",
+ "iw_rx_cq_event",
+ "iw_rx_ring_empty",
+ "iw_rx_refill_from_cq",
+ "iw_rx_refill_from_thread",
+ "iw_rx_alloc_limit",
+ "iw_rx_credit_updates",
+ "iw_ack_sent",
+ "iw_ack_send_failure",
+ "iw_ack_send_delayed",
+ "iw_ack_send_piggybacked",
+ "iw_ack_received",
+ "iw_rdma_mr_alloc",
+ "iw_rdma_mr_free",
+ "iw_rdma_mr_used",
+ "iw_rdma_mr_pool_flush",
+ "iw_rdma_mr_pool_wait",
+ "iw_rdma_mr_pool_depleted",
+};
+
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_iw_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+
+ if (avail < ARRAY_SIZE(rds_iw_stat_names))
+ goto out;
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
+ ARRAY_SIZE(rds_iw_stat_names));
+out:
+ return ARRAY_SIZE(rds_iw_stat_names);
+}
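
rds_iw_stats_info_copy() treats struct rds_iw_statistics as a flat array of 64-bit counters, one per entry of rds_iw_stat_names and in the same order, and folds the per-cpu copies together element by element. A standalone model of that folding with a made-up three-counter struct:

#include <stdio.h>
#include <stdint.h>

/* stand-in for a struct made solely of u64 counters, like rds_iw_statistics */
struct stats {
	uint64_t tx_ring_full;
	uint64_t rx_ring_empty;
	uint64_t ack_sent;
};

static const char *names[] = { "tx_ring_full", "rx_ring_empty", "ack_sent" };

int main(void)
{
	/* pretend these are the per-cpu copies */
	struct stats percpu[2] = { { 1, 2, 3 }, { 10, 20, 30 } };
	struct stats sum = { 0, 0, 0 };
	size_t i;
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		uint64_t *src = (uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)&sum;

		for (i = 0; i < sizeof(sum) / sizeof(uint64_t); i++)
			dst[i] += src[i];
	}

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%s %llu\n", names[i],
		       (unsigned long long)((uint64_t *)&sum)[i]);
	return 0;
}
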
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 000000000000..9590678cd616
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "iw.h"
+
+static struct ctl_table_header *rds_iw_sysctl_hdr;
+
+unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
+unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
+unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_iw_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_iw_sysctl_flow_control = 1;
+
+ctl_table rds_iw_sysctl_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_send_wr",
+ .data = &rds_iw_sysctl_max_send_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_wr_min,
+ .extra2 = &rds_iw_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_wr",
+ .data = &rds_iw_sysctl_max_recv_wr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_wr_min,
+ .extra2 = &rds_iw_sysctl_max_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_wr",
+ .data = &rds_iw_sysctl_max_unsig_wrs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
+ .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_iw_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_recv_allocation",
+ .data = &rds_iw_sysctl_max_recv_allocation,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "flow_control",
+ .data = &rds_iw_sysctl_flow_control,
+ .maxlen = sizeof(rds_iw_sysctl_flow_control),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_iw_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
+ { }
+};
+
+void rds_iw_sysctl_exit(void)
+{
+ if (rds_iw_sysctl_hdr)
+ unregister_sysctl_table(rds_iw_sysctl_hdr);
+}
+
+int __init rds_iw_sysctl_init(void)
+{
+ rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
+ if (rds_iw_sysctl_hdr == NULL)
+ return -ENOMEM;
+ return 0;
+}
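
register_sysctl_paths() publishes the table above under the net/rds/iw path, so each procname should appear as a file such as /proc/sys/net/rds/iw/max_unsignaled_wr, with the *_min/*_max variables wired into extra1/extra2 bounding what can be written. A small userspace sketch that reads one of those knobs; the path is inferred from the ctl_path and procname fields above, not stated elsewhere in this patch:

#include <stdio.h>

int main(void)
{
	/* path derived from ctl_path {net, rds, iw} + procname above */
	const char *path = "/proc/sys/net/rds/iw/max_unsignaled_wr";
	unsigned long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%lu", &val) == 1)
		printf("%s = %lu\n", path, val);
	fclose(f);
	return 0;
}
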
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 000000000000..4a61997f554d
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static DEFINE_SPINLOCK(loop_conns_lock);
+static LIST_HEAD(loop_conns);
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport. At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Usually a message transits both the sender and receiver's conns as it
+ * flows to the receiver. In the loopback case, though, the receive path
+ * is handed the sending conn so the sense of the addresses is reversed.
+ */
+static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg,
+ unsigned int off)
+{
+ BUG_ON(hdr_off || sg || off);
+
+ rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+ rds_message_addref(rm); /* for the inc */
+
+ rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+ GFP_KERNEL, KM_USER0);
+
+ rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
+ NULL);
+
+ rds_inc_put(&rm->m_inc);
+
+ return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
+}
+
+static int rds_loop_xmit_cong_map(struct rds_connection *conn,
+ struct rds_cong_map *map,
+ unsigned long offset)
+{
+ unsigned long i;
+
+ BUG_ON(offset);
+ BUG_ON(map != conn->c_lcong);
+
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ memcpy((void *)conn->c_fcong->m_page_addrs[i],
+ (void *)map->m_page_addrs[i], PAGE_SIZE);
+ }
+
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+
+ return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+}
+
+/* give the recv worker something to call that trivially succeeds */
+static int rds_loop_recv(struct rds_connection *conn)
+{
+ return 0;
+}
+
+struct rds_loop_connection {
+ struct list_head loop_node;
+ struct rds_connection *conn;
+};
+
+/*
+ * Even the loopback transport needs to keep track of its connections,
+ * so it can call rds_conn_destroy() on them on exit. N.B. there are
+ * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
+ * multiple loopback conns allocated, although rather useless.
+ */
+static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+ struct rds_loop_connection *lc;
+ unsigned long flags;
+
+ lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
+ if (lc == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&lc->loop_node);
+ lc->conn = conn;
+ conn->c_transport_data = lc;
+
+ spin_lock_irqsave(&loop_conns_lock, flags);
+ list_add_tail(&lc->loop_node, &loop_conns);
+ spin_unlock_irqrestore(&loop_conns_lock, flags);
+
+ return 0;
+}
+
+static void rds_loop_conn_free(void *arg)
+{
+ struct rds_loop_connection *lc = arg;
+ rdsdebug("lc %p\n", lc);
+ list_del(&lc->loop_node);
+ kfree(lc);
+}
+
+static int rds_loop_conn_connect(struct rds_connection *conn)
+{
+ rds_connect_complete(conn);
+ return 0;
+}
+
+static void rds_loop_conn_shutdown(struct rds_connection *conn)
+{
+}
+
+void rds_loop_exit(void)
+{
+ struct rds_loop_connection *lc, *_lc;
+ LIST_HEAD(tmp_list);
+
+ /* avoid calling conn_destroy with irqs off */
+ spin_lock_irq(&loop_conns_lock);
+ list_splice(&loop_conns, &tmp_list);
+ INIT_LIST_HEAD(&loop_conns);
+ spin_unlock_irq(&loop_conns_lock);
+
+ list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
+ WARN_ON(lc->conn->c_passive);
+ rds_conn_destroy(lc->conn);
+ }
+}
+
+/*
+ * This is missing .xmit_* because loop doesn't go through generic
+ * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
+ * .laddr_check are missing because transport.c doesn't iterate over
+ * rds_loop_transport.
+ */
+struct rds_transport rds_loop_transport = {
+ .xmit = rds_loop_xmit,
+ .xmit_cong_map = rds_loop_xmit_cong_map,
+ .recv = rds_loop_recv,
+ .conn_alloc = rds_loop_conn_alloc,
+ .conn_free = rds_loop_conn_free,
+ .conn_connect = rds_loop_conn_connect,
+ .conn_shutdown = rds_loop_conn_shutdown,
+ .inc_copy_to_user = rds_message_inc_copy_to_user,
+ .inc_purge = rds_message_inc_purge,
+ .inc_free = rds_message_inc_free,
+ .t_name = "loopback",
+};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 000000000000..f32b0939a04d
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
+#ifndef _RDS_LOOP_H
+#define _RDS_LOOP_H
+
+/* loop.c */
+extern struct rds_transport rds_loop_transport;
+
+void rds_loop_exit(void);
+
+#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 000000000000..5a15dc8d0cd7
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
+
+static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
+[RDS_EXTHDR_NONE] = 0,
+[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
+[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
+[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+};
+
+
+void rds_message_addref(struct rds_message *rm)
+{
+ rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+ atomic_inc(&rm->m_refcount);
+}
+
+/*
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void rds_message_purge(struct rds_message *rm)
+{
+ unsigned long i;
+
+ if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
+ return;
+
+ for (i = 0; i < rm->m_nents; i++) {
+ rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+ /* XXX will have to put_page for page refs */
+ __free_page(sg_page(&rm->m_sg[i]));
+ }
+ rm->m_nents = 0;
+
+ if (rm->m_rdma_op)
+ rds_rdma_free_op(rm->m_rdma_op);
+ if (rm->m_rdma_mr)
+ rds_mr_put(rm->m_rdma_mr);
+}
+
+void rds_message_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_purge(rm);
+}
+
+void rds_message_put(struct rds_message *rm)
+{
+ rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+
+ if (atomic_dec_and_test(&rm->m_refcount)) {
+ BUG_ON(!list_empty(&rm->m_sock_item));
+ BUG_ON(!list_empty(&rm->m_conn_item));
+ rds_message_purge(rm);
+
+ kfree(rm);
+ }
+}
+
+void rds_message_inc_free(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_put(rm);
+}
+
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq)
+{
+ hdr->h_flags = 0;
+ hdr->h_sport = sport;
+ hdr->h_dport = dport;
+ hdr->h_sequence = cpu_to_be64(seq);
+ hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+}
+
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len)
+{
+ unsigned int ext_len = sizeof(u8) + len;
+ unsigned char *dst;
+
+ /* For now, refuse to add more than one extension header */
+ if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
+ return 0;
+
+ if (type >= __RDS_EXTHDR_MAX
+ || len != rds_exthdr_size[type])
+ return 0;
+
+ if (ext_len >= RDS_HEADER_EXT_SPACE)
+ return 0;
+ dst = hdr->h_exthdr;
+
+ *dst++ = type;
+ memcpy(dst, data, len);
+
+ dst[len] = RDS_EXTHDR_NONE;
+ return 1;
+}
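
rds_message_add_extension() lays the extension out in h_exthdr as a one-byte type, the fixed-size payload, and a trailing RDS_EXTHDR_NONE terminator, which is exactly what rds_message_next_extension() walks below. A standalone sketch of that layout using an 8-byte payload like rds_ext_header_rdma_dest (two 32-bit fields); the numeric constants are illustrative and byte order is ignored here:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define EXTHDR_NONE      0
#define EXTHDR_RDMA_DEST 3	/* illustrative type value */
#define EXT_SPACE        16	/* illustrative h_exthdr size */

int main(void)
{
	uint8_t exthdr[EXT_SPACE] = { EXTHDR_NONE };
	uint32_t payload[2] = { 0x12345678u, 0x00000fedu };	/* rkey, offset */
	unsigned int len = sizeof(payload), i;
	uint8_t *dst = exthdr;

	/* same layout as rds_message_add_extension(): type, payload, NONE */
	*dst++ = EXTHDR_RDMA_DEST;
	memcpy(dst, payload, len);
	dst[len] = EXTHDR_NONE;

	for (i = 0; i < 1 + len + 1; i++)
		printf("%02x ", exthdr[i]);
	printf("\n");
	return 0;
}
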
+
+/*
+ * If a message has extension headers, retrieve them here.
+ * Call like this:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ * buflen = sizeof(buffer);
+ * type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
+ * if (type == RDS_EXTHDR_NONE)
+ * break;
+ * ...
+ * }
+ */
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen)
+{
+ unsigned int offset, ext_type, ext_len;
+ u8 *src = hdr->h_exthdr;
+
+ offset = *pos;
+ if (offset >= RDS_HEADER_EXT_SPACE)
+ goto none;
+
+ /* Get the extension type and length. For now, the
+ * length is implied by the extension type. */
+ ext_type = src[offset++];
+
+ if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
+ goto none;
+ ext_len = rds_exthdr_size[ext_type];
+ if (offset + ext_len > RDS_HEADER_EXT_SPACE)
+ goto none;
+
+ *pos = offset + ext_len;
+ if (ext_len < *buflen)
+ *buflen = ext_len;
+ memcpy(buf, src + offset, *buflen);
+ return ext_type;
+
+none:
+ *pos = RDS_HEADER_EXT_SPACE;
+ *buflen = 0;
+ return RDS_EXTHDR_NONE;
+}
+
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
+{
+ struct rds_ext_header_version ext_hdr;
+
+ ext_hdr.h_version = cpu_to_be32(version);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
+}
+
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
+{
+ struct rds_ext_header_version ext_hdr;
+ unsigned int pos = 0, len = sizeof(ext_hdr);
+
+ /* We assume the version extension is the only one present */
+ if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
+ return 0;
+ *version = be32_to_cpu(ext_hdr.h_version);
+ return 1;
+}
+
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
+{
+ struct rds_ext_header_rdma_dest ext_hdr;
+
+ ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
+ ext_hdr.h_rdma_offset = cpu_to_be32(offset);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+}
+
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+{
+ struct rds_message *rm;
+
+ rm = kzalloc(sizeof(struct rds_message) +
+ (nents * sizeof(struct scatterlist)), gfp);
+ if (!rm)
+ goto out;
+
+ if (nents)
+ sg_init_table(rm->m_sg, nents);
+ atomic_set(&rm->m_refcount, 1);
+ INIT_LIST_HEAD(&rm->m_sock_item);
+ INIT_LIST_HEAD(&rm->m_conn_item);
+ spin_lock_init(&rm->m_rs_lock);
+
+out:
+ return rm;
+}
+
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+ struct rds_message *rm;
+ unsigned int i;
+
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+ rm->m_nents = ceil(total_len, PAGE_SIZE);
+
+ for (i = 0; i < rm->m_nents; ++i) {
+ sg_set_page(&rm->m_sg[i],
+ virt_to_page(page_addrs[i]),
+ PAGE_SIZE, 0);
+ }
+
+ return rm;
+}
+
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len)
+{
+ unsigned long to_copy;
+ unsigned long iov_off;
+ unsigned long sg_off;
+ struct rds_message *rm;
+ struct iovec *iov;
+ struct scatterlist *sg;
+ int ret;
+
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+
+ /*
+ * now allocate and copy in the data payload.
+ */
+ sg = rm->m_sg;
+ iov = first_iov;
+ iov_off = 0;
+ sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
+
+ while (total_len) {
+ if (sg_page(sg) == NULL) {
+ ret = rds_page_remainder_alloc(sg, total_len,
+ GFP_HIGHUSER);
+ if (ret)
+ goto out;
+ rm->m_nents++;
+ sg_off = 0;
+ }
+
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
+ to_copy = min_t(size_t, to_copy, total_len);
+
+ rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ (void *)sg_page(sg), sg->offset, sg->length, sg_off);
+
+ ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret)
+ goto out;
+
+ iov_off += to_copy;
+ total_len -= to_copy;
+ sg_off += to_copy;
+
+ if (sg_off == sg->length)
+ sg++;
+ }
+
+ ret = 0;
+out:
+ if (ret) {
+ if (rm)
+ rds_message_put(rm);
+ rm = ERR_PTR(ret);
+ }
+ return rm;
+}
+
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size)
+{
+ struct rds_message *rm;
+ struct iovec *iov;
+ struct scatterlist *sg;
+ unsigned long to_copy;
+ unsigned long iov_off;
+ unsigned long vec_off;
+ int copied;
+ int ret;
+ u32 len;
+
+ rm = container_of(inc, struct rds_message, m_inc);
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ iov = first_iov;
+ iov_off = 0;
+ sg = rm->m_sg;
+ vec_off = 0;
+ copied = 0;
+
+ while (copied < size && copied < len) {
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
+ to_copy = min_t(unsigned long, to_copy, len - copied);
+
+ rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ sg_page(sg), sg->offset, sg->length, vec_off);
+
+ ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+
+ iov_off += to_copy;
+ vec_off += to_copy;
+ copied += to_copy;
+
+ if (vec_off == sg->length) {
+ vec_off = 0;
+ sg++;
+ }
+ }
+
+ return copied;
+}
+
+/*
+ * If the message is still on the send queue, wait until the transport
+ * is done with it. This is particularly important for RDMA operations.
+ */
+void rds_message_wait(struct rds_message *rm)
+{
+ wait_event(rds_message_flush_waitq,
+ !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
+}
+
+void rds_message_unmapped(struct rds_message *rm)
+{
+ clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ if (waitqueue_active(&rds_message_flush_waitq))
+ wake_up(&rds_message_flush_waitq);
+}
+
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 000000000000..c460743a89ad
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/highmem.h>
+
+#include "rds.h"
+
+struct rds_page_remainder {
+ struct page *r_page;
+ unsigned long r_offset;
+};
+
+DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
+
+/*
+ * returns 0 on success or -errno on failure.
+ *
+ * We don't have to worry about flush_dcache_page() as this only works
+ * with private pages. If, say, we were to do directed receive to pinned
+ * user pages we'd have to worry more about cache coherence. (Though
+ * the flush_dcache_page() in get_user_pages() would probably be enough).
+ */
+int rds_page_copy_user(struct page *page, unsigned long offset,
+ void __user *ptr, unsigned long bytes,
+ int to_user)
+{
+ unsigned long ret;
+ void *addr;
+
+ if (to_user)
+ rds_stats_add(s_copy_to_user, bytes);
+ else
+ rds_stats_add(s_copy_from_user, bytes);
+
+ addr = kmap_atomic(page, KM_USER0);
+ if (to_user)
+ ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
+ else
+ ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
+ kunmap_atomic(addr, KM_USER0);
+
+ if (ret) {
+ addr = kmap(page);
+ if (to_user)
+ ret = copy_to_user(ptr, addr + offset, bytes);
+ else
+ ret = copy_from_user(addr + offset, ptr, bytes);
+ kunmap(page);
+ if (ret)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Message allocation uses this to build up regions of a message.
+ *
+ * @bytes - the number of bytes needed.
+ * @gfp - the waiting behaviour of the allocation
+ *
+ * @gfp is always ORed with __GFP_HIGHMEM. Callers must be prepared to
+ * kmap the pages, etc.
+ *
+ * If @bytes is at least a full page then this just returns a page from
+ * alloc_page().
+ *
+ * If @bytes is a partial page then this stores the unused region of the
+ * page in a per-cpu structure. Future partial-page allocations may be
+ * satisfied from that cached region. This lets us waste less memory on
+ * small allocations with minimal complexity. It works because the transmit
+ * path passes read-only page regions down to devices. They hold a page
+ * reference until they are done with the region.
+ */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp)
+{
+ struct rds_page_remainder *rem;
+ unsigned long flags;
+ struct page *page;
+ int ret;
+
+ gfp |= __GFP_HIGHMEM;
+
+ /* jump straight to allocation if we're trying for a whole page or more */
+ if (bytes >= PAGE_SIZE) {
+ page = alloc_page(gfp);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ } else {
+ sg_set_page(scat, page, PAGE_SIZE, 0);
+ ret = 0;
+ }
+ goto out;
+ }
+
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ while (1) {
+ /* avoid a tiny region getting stuck by tossing it */
+ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
+ rds_stats_inc(s_page_remainder_miss);
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+
+ /* hand out a fragment from the cached page */
+ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
+ sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
+ get_page(sg_page(scat));
+
+ if (rem->r_offset != 0)
+ rds_stats_inc(s_page_remainder_hit);
+
+ rem->r_offset += bytes;
+ if (rem->r_offset == PAGE_SIZE) {
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ }
+ ret = 0;
+ break;
+ }
+
+ /* alloc if there is nothing for us to use */
+ local_irq_restore(flags);
+ put_cpu();
+
+ page = alloc_page(gfp);
+
+ rem = &per_cpu(rds_page_remainders, get_cpu());
+ local_irq_save(flags);
+
+ if (page == NULL) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* did someone race to fill the remainder before us? */
+ if (rem->r_page) {
+ __free_page(page);
+ continue;
+ }
+
+ /* otherwise install our page and loop around to alloc */
+ rem->r_page = page;
+ rem->r_offset = 0;
+ }
+
+ local_irq_restore(flags);
+ put_cpu();
+out:
+ rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
+ ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
+ ret ? 0 : scat->length);
+ return ret;
+}
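
As described in the comment above rds_page_remainder_alloc(), sub-page requests are carved out of one cached page until it is used up or too small, at which point the cache moves on to a fresh page. A userspace model of just that bookkeeping (per-cpu handling, IRQ masking, page refcounting and error handling omitted):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct remainder {
	char *page;
	unsigned long offset;
};

/* model of the sub-page path of rds_page_remainder_alloc() */
static char *frag_alloc(struct remainder *rem, unsigned long bytes)
{
	char *p;

	/* a cached page too small for this request is dropped; in the kernel,
	 * fragments already handed out keep it alive via their page refs */
	if (rem->page && bytes > PAGE_SIZE - rem->offset)
		rem->page = NULL;

	if (!rem->page) {
		rem->page = malloc(PAGE_SIZE);
		rem->offset = 0;
	}

	p = rem->page + rem->offset;
	rem->offset += bytes;
	if (rem->offset == PAGE_SIZE)
		rem->page = NULL;	/* fully consumed */
	return p;
}

int main(void)
{
	struct remainder rem = { NULL, 0 };
	char *a = frag_alloc(&rem, 1000);
	char *b = frag_alloc(&rem, 2000);	/* shares a's page */
	char *c = frag_alloc(&rem, 2000);	/* 2000 > 1096 left: new page */

	printf("b - a = %ld (same page as a)\n", (long)(b - a));
	printf("c starts a fresh page at %p\n", (void *)c);
	return 0;
}
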
+
+static int rds_page_remainder_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ struct rds_page_remainder *rem;
+ long cpu = (long)hcpu;
+
+ rem = &per_cpu(rds_page_remainders, cpu);
+
+ rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
+
+ switch (action) {
+ case CPU_DEAD:
+ if (rem->r_page)
+ __free_page(rem->r_page);
+ rem->r_page = NULL;
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block rds_page_remainder_nb = {
+ .notifier_call = rds_page_remainder_cpu_notify,
+};
+
+void rds_page_exit(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
+ (unsigned long)CPU_DEAD,
+ (void *)(long)i);
+}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 000000000000..eaeeb91e1119
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2007 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
+
+#include "rdma.h"
+
+/*
+ * XXX
+ * - build with sparse
+ * - should we limit the size of a mr region? let transport return failure?
+ * - should we detect duplicate keys on a socket? hmm.
+ * - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * get the number of pages by looking at the page indices that the start and
+ * end addresses fall in.
+ *
+ * Returns 0 if the vec is invalid. It is invalid if the number of bytes
+ * causes the address to wrap or overflows an unsigned int. This comes
+ * from being stored in the 'length' member of 'struct scatterlist'.
+ */
+static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
+{
+ if ((vec->addr + vec->bytes <= vec->addr) ||
+ (vec->bytes > (u64)UINT_MAX))
+ return 0;
+
+ return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (vec->addr >> PAGE_SHIFT);
+}
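
rds_pages_in_vec() counts the page indices spanned by [addr, addr + bytes), which is why an unaligned vector that straddles a page boundary costs an extra page. A standalone illustration of the same arithmetic, assuming 4 KiB pages:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* same computation as rds_pages_in_vec(), minus the overflow checks */
static unsigned int pages_in_vec(uint64_t addr, uint64_t bytes)
{
	return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (addr >> PAGE_SHIFT);
}

int main(void)
{
	/* 32 bytes that straddle the first page boundary: two pages */
	printf("%u\n", pages_in_vec(0x0ff0, 0x20));
	/* the same 32 bytes, page aligned: one page */
	printf("%u\n", pages_in_vec(0x1000, 0x20));
	return 0;
}
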
+
+static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
+ struct rds_mr *insert)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rds_mr *mr;
+
+ while (*p) {
+ parent = *p;
+ mr = rb_entry(parent, struct rds_mr, r_rb_node);
+
+ if (key < mr->r_key)
+ p = &(*p)->rb_left;
+ else if (key > mr->r_key)
+ p = &(*p)->rb_right;
+ else
+ return mr;
+ }
+
+ if (insert) {
+ rb_link_node(&insert->r_rb_node, parent, p);
+ rb_insert_color(&insert->r_rb_node, root);
+ atomic_inc(&insert->r_refcount);
+ }
+ return NULL;
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ */
+static void rds_destroy_mr(struct rds_mr *mr)
+{
+ struct rds_sock *rs = mr->r_sock;
+ void *trans_private = NULL;
+ unsigned long flags;
+
+ rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
+ mr->r_key, atomic_read(&mr->r_refcount));
+
+ if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
+ return;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ if (!RB_EMPTY_NODE(&mr->r_rb_node))
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ trans_private = mr->r_trans_private;
+ mr->r_trans_private = NULL;
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (trans_private)
+ mr->r_trans->free_mr(trans_private, mr->r_invalidate);
+}
+
+void __rds_put_mr_final(struct rds_mr *mr)
+{
+ rds_destroy_mr(mr);
+ kfree(mr);
+}
+
+/*
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ */
+void rds_rdma_drop_keys(struct rds_sock *rs)
+{
+ struct rds_mr *mr;
+ struct rb_node *node;
+
+ /* Release any MRs associated with this socket */
+ while ((node = rb_first(&rs->rs_rdma_keys))) {
+ mr = container_of(node, struct rds_mr, r_rb_node);
+ if (mr->r_trans == rs->rs_transport)
+ mr->r_invalidate = 0;
+ rds_mr_put(mr);
+ }
+
+ if (rs->rs_transport && rs->rs_transport->flush_mrs)
+ rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ */
+static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+ struct page **pages, int write)
+{
+ int ret;
+
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, user_addr,
+ nr_pages, write, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+
+ if (0 <= ret && (unsigned) ret < nr_pages) {
+ while (ret--)
+ put_page(pages[ret]);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
+ u64 *cookie_ret, struct rds_mr **mr_ret)
+{
+ struct rds_mr *mr = NULL, *found;
+ unsigned int nr_pages;
+ struct page **pages = NULL;
+ struct scatterlist *sg;
+ void *trans_private;
+ unsigned long flags;
+ rds_rdma_cookie_t cookie;
+ unsigned int nents;
+ long i;
+ int ret;
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (rs->rs_transport->get_mr == NULL) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ nr_pages = rds_pages_in_vec(&args->vec);
+ if (nr_pages == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
+ args->vec.addr, args->vec.bytes, nr_pages);
+
+ /* XXX clamp nr_pages to limit the size of this alloc? */
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
+ if (mr == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ atomic_set(&mr->r_refcount, 1);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ mr->r_trans = rs->rs_transport;
+ mr->r_sock = rs;
+
+ if (args->flags & RDS_RDMA_USE_ONCE)
+ mr->r_use_once = 1;
+ if (args->flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ if (args->flags & RDS_RDMA_READWRITE)
+ mr->r_write = 1;
+
+ /*
+ * Pin the pages that make up the user buffer and transfer the page
+ * pointers to the mr's sg array. We check to see if we've mapped
+ * the whole region after transferring the partial page references
+ * to the sg array so that we can have one page ref cleanup path.
+ *
+ * For now we have no flag that tells us whether the mapping is
+ * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
+ * the zero page.
+ */
+ ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+ if (ret < 0)
+ goto out;
+
+ nents = ret;
+ sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ if (sg == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ WARN_ON(!nents);
+ sg_init_table(sg, nents);
+
+ /* Stick all pages into the scatterlist */
+ for (i = 0 ; i < nents; i++)
+ sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+ rdsdebug("RDS: trans_private nents is %u\n", nents);
+
+ /* Obtain a transport specific MR. If this succeeds, the
+ * s/g list is now owned by the MR.
+ * Note that dma_map() implies that pending writes are
+ * flushed to RAM, so no dma_sync is needed here. */
+ trans_private = rs->rs_transport->get_mr(sg, nents, rs,
+ &mr->r_key);
+
+ if (IS_ERR(trans_private)) {
+ for (i = 0 ; i < nents; i++)
+ put_page(sg_page(&sg[i]));
+ kfree(sg);
+ ret = PTR_ERR(trans_private);
+ goto out;
+ }
+
+ mr->r_trans_private = trans_private;
+
+ rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
+ mr->r_key, (void *)(unsigned long) args->cookie_addr);
+
+ /* The user may pass us an unaligned address, but we can only
+ * map page aligned regions. So we keep the offset, and build
+ * a 64bit cookie containing <R_Key, offset> and pass that
+ * around. */
+ cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+ if (cookie_ret)
+ *cookie_ret = cookie;
+
+ if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Inserting the new MR into the rbtree bumps its
+ * reference count. */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ BUG_ON(found && found != mr);
+
+ rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
+ if (mr_ret) {
+ atomic_inc(&mr->r_refcount);
+ *mr_ret = mr;
+ }
+
+ ret = 0;
+out:
+ kfree(pages);
+ if (mr)
+ rds_mr_put(mr);
+ return ret;
+}
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_get_mr_args args;
+
+ if (optlen != sizeof(struct rds_get_mr_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
+ sizeof(struct rds_get_mr_args)))
+ return -EFAULT;
+
+ return __rds_rdma_map(rs, &args, NULL, NULL);
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ */
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+ struct rds_free_mr_args args;
+ struct rds_mr *mr;
+ unsigned long flags;
+
+ if (optlen != sizeof(struct rds_free_mr_args))
+ return -EINVAL;
+
+ if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
+ sizeof(struct rds_free_mr_args)))
+ return -EFAULT;
+
+ /* Special case - a null cookie means flush all unused MRs */
+ if (args.cookie == 0) {
+ if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
+ return -EINVAL;
+ rs->rs_transport->flush_mrs();
+ return 0;
+ }
+
+ /* Look up the MR given its R_key and remove it from the rbtree
+ * so nobody else finds it.
+ * This should also prevent races with rds_rdma_unuse.
+ */
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
+ if (mr) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ if (args.flags & RDS_RDMA_INVALIDATE)
+ mr->r_invalidate = 1;
+ }
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (!mr)
+ return -EINVAL;
+
+ /*
+ * call rds_destroy_mr() ourselves so that we're sure it's done by the time
+ * we return. If we let rds_mr_put() do it, it might not happen until
+ * someone else drops their ref.
+ */
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ return 0;
+}
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics
+ */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
+{
+ struct rds_mr *mr;
+ unsigned long flags;
+ int zot_me = 0;
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (mr && (mr->r_use_once || force)) {
+ rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+ RB_CLEAR_NODE(&mr->r_rb_node);
+ zot_me = 1;
+ } else if (mr)
+ atomic_inc(&mr->r_refcount);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ /* May have to issue a dma_sync on this memory region.
+ * Note we could avoid this if the operation was a RDMA READ,
+ * but at this point we can't tell. */
+ if (mr != NULL) {
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ if (zot_me)
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ }
+}
+
+void rds_rdma_free_op(struct rds_rdma_op *ro)
+{
+ unsigned int i;
+
+ for (i = 0; i < ro->r_nents; i++) {
+ struct page *page = sg_page(&ro->r_sg[i]);
+
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory */
+ if (!ro->r_write)
+ set_page_dirty(page);
+ put_page(page);
+ }
+
+ kfree(ro->r_notifier);
+ kfree(ro);
+}
+
+/*
+ * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ */
+static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
+ struct rds_rdma_args *args)
+{
+ struct rds_iovec vec;
+ struct rds_rdma_op *op = NULL;
+ unsigned int nr_pages;
+ unsigned int max_pages;
+ unsigned int nr_bytes;
+ struct page **pages = NULL;
+ struct rds_iovec __user *local_vec;
+ struct scatterlist *sg;
+ unsigned int nr;
+ unsigned int i, j;
+ int ret;
+
+
+ if (rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ if (args->nr_local > (u64)UINT_MAX) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
+ nr_pages = 0;
+ max_pages = 0;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ max_pages = max(nr, max_pages);
+ nr_pages += nr;
+ }
+
+ pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
+ if (op == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
+ op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
+ op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->r_recverr = rs->rs_recverr;
+ WARN_ON(!nr_pages);
+ sg_init_table(op->r_sg, nr_pages);
+
+ if (op->r_notify || op->r_recverr) {
+ /* We allocate an uninitialized notifier here, because
+ * we don't want to do that in the completion handler. We
+ * would have to use GFP_ATOMIC there, and don't want to deal
+ * with failed allocations.
+ */
+ op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+ if (!op->r_notifier) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ op->r_notifier->n_user_token = args->user_token;
+ op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+ }
+
+ /* The cookie contains the R_Key of the remote memory region, and
+ * optionally an offset into it. This is how we implement RDMA into
+ * unaligned memory.
+ * When setting up the RDMA, we need to add that offset to the
+ * destination address (which is really an offset into the MR)
+ * FIXME: We may want to move this into ib_rdma.c
+ */
+ op->r_key = rds_rdma_cookie_key(args->cookie);
+ op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+
+ nr_bytes = 0;
+
+ rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
+ (unsigned long long)args->nr_local,
+ (unsigned long long)args->remote_vec.addr,
+ op->r_key);
+
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rs->rs_user_addr = vec.addr;
+ rs->rs_user_bytes = vec.bytes;
+
+ /* did the user change the vec under us? */
+ if (nr > max_pages || op->r_nents + nr > nr_pages) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* If it's a WRITE operation, we want to pin the pages for reading.
+ * If it's a READ operation, we need to pin the pages for writing.
+ */
+ ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+ if (ret < 0)
+ goto out;
+
+ rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
+ nr_bytes, nr, vec.bytes, vec.addr);
+
+ nr_bytes += vec.bytes;
+
+ for (j = 0; j < nr; j++) {
+ unsigned int offset = vec.addr & ~PAGE_MASK;
+
+ sg = &op->r_sg[op->r_nents + j];
+ sg_set_page(sg, pages[j],
+ min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
+ offset);
+
+ rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
+ sg->offset, sg->length, vec.addr, vec.bytes);
+
+ vec.addr += sg->length;
+ vec.bytes -= sg->length;
+ }
+
+ op->r_nents += nr;
+ }
+
+
+ if (nr_bytes > args->remote_vec.bytes) {
+ rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
+ nr_bytes,
+ (unsigned int) args->remote_vec.bytes);
+ ret = -EINVAL;
+ goto out;
+ }
+ op->r_bytes = nr_bytes;
+
+ ret = 0;
+out:
+ kfree(pages);
+ if (ret) {
+ if (op)
+ rds_rdma_free_op(op);
+ op = ERR_PTR(ret);
+ }
+ return op;
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rds_rdma_op *op;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+ || rm->m_rdma_op != NULL)
+ return -EINVAL;
+
+ op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ rds_stats_inc(s_send_rdma);
+ rm->m_rdma_op = op;
+ return 0;
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ */
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ unsigned long flags;
+ struct rds_mr *mr;
+ u32 r_key;
+ int err = 0;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
+ || rm->m_rdma_cookie != 0)
+ return -EINVAL;
+
+ memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
+
+ /* We are reusing a previously mapped MR here. Most likely, the
+ * application has written to the buffer, so we need to explicitly
+ * flush those writes to RAM. Otherwise the HCA may not see them
+ * when doing a DMA from that buffer.
+ */
+ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
+
+ spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+ mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+ if (mr == NULL)
+ err = -EINVAL; /* invalid r_key */
+ else
+ atomic_inc(&mr->r_refcount);
+ spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+ if (mr) {
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+ rm->m_rdma_mr = mr;
+ }
+ return err;
+}
+
+/*
+ * The application passes us an address range it wants to enable RDMA
+ * to/from. We map the area, and save the <R_Key,offset> pair
+ * in rm->m_rdma_cookie. This causes it to be sent along to the peer
+ * in an extension header.
+ */
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
+ || rm->m_rdma_cookie != 0)
+ return -EINVAL;
+
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+}
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
new file mode 100644
index 000000000000..425512098b0b
--- /dev/null
+++ b/net/rds/rdma.h
@@ -0,0 +1,84 @@
+#ifndef _RDS_RDMA_H
+#define _RDS_RDMA_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
+
+#include "rds.h"
+
+struct rds_mr {
+ struct rb_node r_rb_node;
+ atomic_t r_refcount;
+ u32 r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /* This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD 0
+
+struct rds_rdma_op {
+ u32 r_key;
+ u64 r_remote_addr;
+ unsigned int r_write:1;
+ unsigned int r_fence:1;
+ unsigned int r_notify:1;
+ unsigned int r_recverr:1;
+ unsigned int r_mapped:1;
+ struct rds_notifier *r_notifier;
+ unsigned int r_bytes;
+ unsigned int r_nents;
+ unsigned int r_count;
+ struct scatterlist r_sg[0];
+};
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+ return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+ return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+ return cookie >> 32;
+}
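
The three helpers above pack the R_Key into the low 32 bits of the cookie and the byte offset into the high 32 bits; __rds_rdma_map() uses this to ship <R_Key, offset> for an unaligned user buffer. A quick standalone round-trip check mirroring those inlines:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t rds_rdma_cookie_t;

static rds_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
	return r_key | (((uint64_t) offset) << 32);
}

static uint32_t cookie_key(rds_rdma_cookie_t cookie)
{
	return (uint32_t) cookie;
}

static uint32_t cookie_offset(rds_rdma_cookie_t cookie)
{
	return (uint32_t) (cookie >> 32);
}

int main(void)
{
	uint32_t r_key = 0xdeadbe01u;
	uint32_t offset = 0x0ff0;	/* addr & ~PAGE_MASK */
	rds_rdma_cookie_t c = make_cookie(r_key, offset);

	printf("cookie %#llx key %#x offset %#x\n",
	       (unsigned long long) c, cookie_key(c), cookie_offset(c));
	return 0;
}
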
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_send_complete(struct rds_message *rm, int);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rds_put_mr_final(mr);
+}
+
+#endif
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 000000000000..7b19024f9706
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2009 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <rdma/rdma_cm.h>
+
+#include "rdma_transport.h"
+
+static struct rdma_cm_id *rds_iw_listen_id;
+
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ /* this can be null in the listening path */
+ struct rds_connection *conn = cm_id->context;
+ struct rds_transport *trans;
+ int ret = 0;
+
+ rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
+ event->event);
+
+ if (cm_id->device->node_type == RDMA_NODE_RNIC)
+ trans = &rds_iw_transport;
+ else
+ trans = &rds_ib_transport;
+
+ /* Prevent shutdown from tearing down the connection
+ * while we're executing. */
+ if (conn) {
+ mutex_lock(&conn->c_cm_lock);
+
+ /* If the connection is being shut down, bail out
+ * right away. We return 0 so cm_id doesn't get
+ * destroyed prematurely */
+ if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
+ /* Reject incoming connections while we're tearing
+ * down an existing one. */
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ ret = 1;
+ goto out;
+ }
+ }
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ ret = trans->cm_handle_connect(cm_id, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ /* XXX do we need to clean up if this fails? */
+ ret = rdma_resolve_route(cm_id,
+ RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ /* XXX worry about racing with listen acceptance */
+ ret = trans->cm_initiate_connect(cm_id);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ trans->cm_connect_complete(conn, event);
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ if (conn)
+ rds_conn_drop(conn);
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
+ "%pI4->%pI4\n", &conn->c_laddr,
+ &conn->c_faddr);
+ rds_conn_drop(conn);
+ break;
+
+ default:
+ /* things like device disconnect? */
+ printk(KERN_ERR "unknown event %u\n", event->event);
+ BUG();
+ break;
+ }
+
+out:
+ if (conn)
+ mutex_unlock(&conn->c_cm_lock);
+
+ rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+
+ return ret;
+}
+
+static int __init rds_rdma_listen_init(void)
+{
+ struct sockaddr_in sin;
+ struct rdma_cm_id *cm_id;
+ int ret;
+
+ cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ "rdma_create_id() returned %d\n", ret);
+ goto out;
+ }
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+ sin.sin_port = (__force u16)htons(RDS_PORT);
+
+ /*
+ * XXX I bet this binds the cm_id to a device. If we want to support
+ * fail-over we'll have to take this into consideration.
+ */
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ if (ret) {
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ "rdma_bind_addr() returned %d\n", ret);
+ goto out;
+ }
+
+ ret = rdma_listen(cm_id, 128);
+ if (ret) {
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
+ "rdma_listen() returned %d\n", ret);
+ goto out;
+ }
+
+ rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
+
+ rds_iw_listen_id = cm_id;
+ cm_id = NULL;
+out:
+ if (cm_id)
+ rdma_destroy_id(cm_id);
+ return ret;
+}
+
+static void rds_rdma_listen_stop(void)
+{
+ if (rds_iw_listen_id) {
+ rdsdebug("cm %p\n", rds_iw_listen_id);
+ rdma_destroy_id(rds_iw_listen_id);
+ rds_iw_listen_id = NULL;
+ }
+}
+
+int __init rds_rdma_init(void)
+{
+ int ret;
+
+ ret = rds_rdma_listen_init();
+ if (ret)
+ goto out;
+
+ ret = rds_iw_init();
+ if (ret)
+ goto err_iw_init;
+
+ ret = rds_ib_init();
+ if (ret)
+ goto err_ib_init;
+
+ goto out;
+
+err_ib_init:
+ rds_iw_exit();
+err_iw_init:
+ rds_rdma_listen_stop();
+out:
+ return ret;
+}
+
+void rds_rdma_exit(void)
+{
+ /* stop listening first to ensure no new connections are attempted */
+ rds_rdma_listen_stop();
+ rds_ib_exit();
+ rds_iw_exit();
+}
+
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 000000000000..2f2c7d976c21
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,28 @@
+#ifndef _RDMA_TRANSPORT_H
+#define _RDMA_TRANSPORT_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+
+#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
+
+int rds_rdma_conn_connect(struct rds_connection *conn);
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+
+/* from rdma_transport.c */
+int rds_rdma_init(void);
+void rds_rdma_exit(void);
+
+/* from ib.c */
+extern struct rds_transport rds_ib_transport;
+int rds_ib_init(void);
+void rds_ib_exit(void);
+
+/* from iw.c */
+extern struct rds_transport rds_iw_transport;
+int rds_iw_init(void);
+void rds_iw_exit(void);
+
+#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 000000000000..060400704979
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,686 @@
+#ifndef _RDS_RDS_H
+#define _RDS_RDS_H
+
+#include <net/sock.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mutex.h>
+#include <linux/rds.h>
+
+#include "info.h"
+
+/*
+ * RDS Network protocol version
+ */
+#define RDS_PROTOCOL_3_0 0x0300
+#define RDS_PROTOCOL_3_1 0x0301
+#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
+#define RDS_PROTOCOL(maj, min) (((maj) << 8) | (min))
+
+/*
+ * XXX randomly chosen, but at least seems to be unused:
+ * # 18464-18768 Unassigned
+ * We should do better. We want a reserved port to discourage unpriv'ed
+ * userspace from listening.
+ */
+#define RDS_PORT 18634
+
+#ifdef DEBUG
+#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
+#else
+/* sigh, pr_debug() causes unused variable warnings */
+static inline void __attribute__ ((format (printf, 1, 2)))
+rdsdebug(char *fmt, ...)
+{
+}
+#endif
+
+/* XXX is there one of these somewhere? */
+#define ceil(x, y) \
+ ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
+
+#define RDS_FRAG_SHIFT 12
+#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
+
+#define RDS_CONG_MAP_BYTES (65536 / 8)
+#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
+#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
+#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
+
+struct rds_cong_map {
+ struct rb_node m_rb_node;
+ __be32 m_addr;
+ wait_queue_head_t m_waitq;
+ struct list_head m_conn_list;
+ unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+ RDS_CONN_DOWN = 0,
+ RDS_CONN_CONNECTING,
+ RDS_CONN_DISCONNECTING,
+ RDS_CONN_UP,
+ RDS_CONN_ERROR,
+};
+
+/* Bits for c_flags */
+#define RDS_LL_SEND_FULL 0
+#define RDS_RECONNECT_PENDING 1
+
+struct rds_connection {
+ struct hlist_node c_hash_node;
+ __be32 c_laddr;
+ __be32 c_faddr;
+ unsigned int c_loopback:1;
+ struct rds_connection *c_passive;
+
+ struct rds_cong_map *c_lcong;
+ struct rds_cong_map *c_fcong;
+
+ struct mutex c_send_lock; /* protect send ring */
+ struct rds_message *c_xmit_rm;
+ unsigned long c_xmit_sg;
+ unsigned int c_xmit_hdr_off;
+ unsigned int c_xmit_data_off;
+ unsigned int c_xmit_rdma_sent;
+
+ spinlock_t c_lock; /* protect msg queues */
+ u64 c_next_tx_seq;
+ struct list_head c_send_queue;
+ struct list_head c_retrans;
+
+ u64 c_next_rx_seq;
+
+ struct rds_transport *c_trans;
+ void *c_transport_data;
+
+ atomic_t c_state;
+ unsigned long c_flags;
+ unsigned long c_reconnect_jiffies;
+ struct delayed_work c_send_w;
+ struct delayed_work c_recv_w;
+ struct delayed_work c_conn_w;
+ struct work_struct c_down_w;
+ struct mutex c_cm_lock; /* protect conn state & cm */
+
+ struct list_head c_map_item;
+ unsigned long c_map_queued;
+ unsigned long c_map_offset;
+ unsigned long c_map_bytes;
+
+ unsigned int c_unacked_packets;
+ unsigned int c_unacked_bytes;
+
+ /* Protocol version */
+ unsigned int c_version;
+};
+
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_MAX_ADV_CREDIT 127
+
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDS_HEADER_EXT_SPACE 16
+
+struct rds_header {
+ __be64 h_sequence;
+ __be64 h_ack;
+ __be32 h_len;
+ __be16 h_sport;
+ __be16 h_dport;
+ u8 h_flags;
+ u8 h_credit;
+ u8 h_padding[4];
+ __sum16 h_csum;
+
+ u8 h_exthdr[RDS_HEADER_EXT_SPACE];
+};
+
+/*
+ * Reserved - indicates end of extensions
+ */
+#define RDS_EXTHDR_NONE 0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ * NB: This is no longer true for IB, where we do a version
+ * negotiation during the connection setup phase (protocol
+ * version information is included in the RDMA CM private data).
+ */
+#define RDS_EXTHDR_VERSION 1
+struct rds_ext_header_version {
+ __be32 h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDS_EXTHDR_RDMA 2
+struct rds_ext_header_rdma {
+ __be32 h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDS_EXTHDR_RDMA_DEST 3
+struct rds_ext_header_rdma_dest {
+ __be32 h_rdma_rkey;
+ __be32 h_rdma_offset;
+};
+
+#define __RDS_EXTHDR_MAX 16 /* for now */
+
+struct rds_incoming {
+ atomic_t i_refcount;
+ struct list_head i_item;
+ struct rds_connection *i_conn;
+ struct rds_header i_hdr;
+ unsigned long i_rx_jiffies;
+ __be32 i_saddr;
+
+ rds_rdma_cookie_t i_rdma_cookie;
+};
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock. m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path. It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue. m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting. As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly. That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate. They can use this in a callback
+ * that they pass to rds_send_drop_acked() to see if each message has been
+ * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDS_MSG_ON_SOCK 1
+#define RDS_MSG_ON_CONN 2
+#define RDS_MSG_HAS_ACK_SEQ 3
+#define RDS_MSG_ACK_REQUIRED 4
+#define RDS_MSG_RETRANSMITTED 5
+#define RDS_MSG_MAPPED 6
+#define RDS_MSG_PAGEVEC 7
+
+struct rds_message {
+ atomic_t m_refcount;
+ struct list_head m_sock_item;
+ struct list_head m_conn_item;
+ struct rds_incoming m_inc;
+ u64 m_ack_seq;
+ __be32 m_daddr;
+ unsigned long m_flags;
+
+ /* Never access m_rs without holding m_rs_lock.
+ * Lock nesting is
+ * rm->m_rs_lock
+ * -> rs->rs_lock
+ */
+ spinlock_t m_rs_lock;
+ struct rds_sock *m_rs;
+ struct rds_rdma_op *m_rdma_op;
+ rds_rdma_cookie_t m_rdma_cookie;
+ struct rds_mr *m_rdma_mr;
+ unsigned int m_nents;
+ unsigned int m_count;
+ struct scatterlist m_sg[0];
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ */
+struct rds_notifier {
+ struct list_head n_list;
+ uint64_t n_user_token;
+ int n_status;
+};
+
+/**
+ * struct rds_transport - transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
+ * part of a message. The caller serializes on the send_sem so this
+ * doesn't need to be reentrant for a given conn. The header must be
+ * sent before the data payload. .xmit must be prepared to send a
+ * message with no data payload. .xmit should return the number of
+ * bytes that were sent down the connection, including header bytes.
+ * Returning 0 tells the caller that it doesn't need to perform any
+ * additional work now. This is usually the case when the transport has
+ * filled the sending queue for its connection and will handle
+ * triggering the rds thread to continue the send when space becomes
+ * available. Returning -EAGAIN tells the caller to retry the send
+ * immediately. Returning -ENOMEM tells the caller to retry the send at
+ * some point in the future.
+ *
+ * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
+ * it returns the connection can not call rds_recv_incoming().
+ * This will only be called once after conn_connect returns
+ * non-zero success. The caller serializes this with
+ * the send and connecting paths (xmit_* and conn_*). The
+ * transport is responsible for other serialization, including
+ * rds_recv_incoming(). This is called in process context but
+ * should try hard not to block.
+ *
+ * @xmit_cong_map: This asks the transport to send the local bitmap down the
+ * given connection. XXX get a better story about the bitmap
+ * flag and header.
+ */
+
+struct rds_transport {
+ char t_name[TRANSNAMSIZ];
+ struct list_head t_item;
+ struct module *t_owner;
+ unsigned int t_prefer_loopback:1;
+
+ int (*laddr_check)(__be32 addr);
+ int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
+ void (*conn_free)(void *data);
+ int (*conn_connect)(struct rds_connection *conn);
+ void (*conn_shutdown)(struct rds_connection *conn);
+ void (*xmit_prepare)(struct rds_connection *conn);
+ void (*xmit_complete)(struct rds_connection *conn);
+ int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
+ unsigned int hdr_off, unsigned int sg, unsigned int off);
+ int (*xmit_cong_map)(struct rds_connection *conn,
+ struct rds_cong_map *map, unsigned long offset);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+ int (*recv)(struct rds_connection *conn);
+ int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+ void (*inc_purge)(struct rds_incoming *inc);
+ void (*inc_free)(struct rds_incoming *inc);
+
+ int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+ void (*cm_connect_complete)(struct rds_connection *conn,
+ struct rdma_cm_event *event);
+
+ unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
+ unsigned int avail);
+ void (*exit)(void);
+ void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
+ struct rds_sock *rs, u32 *key_ret);
+ void (*sync_mr)(void *trans_private, int direction);
+ void (*free_mr)(void *trans_private, int invalidate);
+ void (*flush_mrs)(void);
+};
+
+struct rds_sock {
+ struct sock rs_sk;
+
+ u64 rs_user_addr;
+ u64 rs_user_bytes;
+
+ /*
+ * bound_addr used for both incoming and outgoing, no INADDR_ANY
+ * support.
+ */
+ struct rb_node rs_bound_node;
+ __be32 rs_bound_addr;
+ __be32 rs_conn_addr;
+ __be16 rs_bound_port;
+ __be16 rs_conn_port;
+
+ /*
+ * This is only used to communicate the transport between bind and
+ * initiating connections. All other trans use is referenced through
+ * the connection.
+ */
+ struct rds_transport *rs_transport;
+
+ /*
+ * rds_sendmsg caches the conn it used the last time around.
+ * This helps avoid costly lookups.
+ */
+ struct rds_connection *rs_conn;
+
+ /* flag indicating we were congested or not */
+ int rs_congested;
+
+ /* rs_lock protects all these adjacent members before the newline */
+ spinlock_t rs_lock;
+ struct list_head rs_send_queue;
+ u32 rs_snd_bytes;
+ int rs_rcv_bytes;
+ struct list_head rs_notify_queue; /* currently used for failed RDMAs */
+
+ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+ * to decide whether the application should be woken up.
+ * If not set, we use rs_cong_track to find out whether a cong map
+ * update arrived.
+ */
+ uint64_t rs_cong_mask;
+ uint64_t rs_cong_notify;
+ struct list_head rs_cong_list;
+ unsigned long rs_cong_track;
+
+ /*
+ * rs_recv_lock protects the receive queue, and is
+ * used to serialize with rds_release.
+ */
+ rwlock_t rs_recv_lock;
+ struct list_head rs_recv_queue;
+
+ /* just for stats reporting */
+ struct list_head rs_item;
+
+ /* these have their own lock */
+ spinlock_t rs_rdma_lock;
+ struct rb_root rs_rdma_keys;
+
+ /* Socket options - in case there will be more */
+ unsigned char rs_recverr,
+ rs_cong_monitor;
+};
+
+static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
+{
+ return container_of(sk, struct rds_sock, rs_sk);
+}
+static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
+{
+ return &rs->rs_sk;
+}
+
+/*
+ * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
+ * to account for overhead. We don't account for overhead, we just apply
+ * the number of payload bytes to the specified value.
+ */
+static inline int rds_sk_sndbuf(struct rds_sock *rs)
+{
+ return rds_rs_to_sk(rs)->sk_sndbuf / 2;
+}
+static inline int rds_sk_rcvbuf(struct rds_sock *rs)
+{
+ return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
+}
+
+struct rds_statistics {
+ uint64_t s_conn_reset;
+ uint64_t s_recv_drop_bad_checksum;
+ uint64_t s_recv_drop_old_seq;
+ uint64_t s_recv_drop_no_sock;
+ uint64_t s_recv_drop_dead_sock;
+ uint64_t s_recv_deliver_raced;
+ uint64_t s_recv_delivered;
+ uint64_t s_recv_queued;
+ uint64_t s_recv_immediate_retry;
+ uint64_t s_recv_delayed_retry;
+ uint64_t s_recv_ack_required;
+ uint64_t s_recv_rdma_bytes;
+ uint64_t s_recv_ping;
+ uint64_t s_send_queue_empty;
+ uint64_t s_send_queue_full;
+ uint64_t s_send_sem_contention;
+ uint64_t s_send_sem_queue_raced;
+ uint64_t s_send_immediate_retry;
+ uint64_t s_send_delayed_retry;
+ uint64_t s_send_drop_acked;
+ uint64_t s_send_ack_required;
+ uint64_t s_send_queued;
+ uint64_t s_send_rdma;
+ uint64_t s_send_rdma_bytes;
+ uint64_t s_send_pong;
+ uint64_t s_page_remainder_hit;
+ uint64_t s_page_remainder_miss;
+ uint64_t s_copy_to_user;
+ uint64_t s_copy_from_user;
+ uint64_t s_cong_update_queued;
+ uint64_t s_cong_update_received;
+ uint64_t s_cong_send_error;
+ uint64_t s_cong_send_blocked;
+};
+
+/* af_rds.c */
+void rds_sock_addref(struct rds_sock *rs);
+void rds_sock_put(struct rds_sock *rs);
+void rds_wake_sk_sleep(struct rds_sock *rs);
+static inline void __rds_wake_sk_sleep(struct sock *sk)
+{
+ wait_queue_head_t *waitq = sk->sk_sleep;
+
+ if (!sock_flag(sk, SOCK_DEAD) && waitq)
+ wake_up(waitq);
+}
+extern wait_queue_head_t rds_poll_waitq;
+
+
+/* bind.c */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+void rds_remove_bound(struct rds_sock *rs);
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+
+/* cong.c */
+int rds_cong_get_maps(struct rds_connection *conn);
+void rds_cong_add_conn(struct rds_connection *conn);
+void rds_cong_remove_conn(struct rds_connection *conn);
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
+void rds_cong_queue_updates(struct rds_cong_map *map);
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
+int rds_cong_updated_since(unsigned long *recent);
+void rds_cong_add_socket(struct rds_sock *);
+void rds_cong_remove_socket(struct rds_sock *);
+void rds_cong_exit(void);
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
+
+/* conn.c */
+int __init rds_conn_init(void);
+void rds_conn_exit(void);
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp);
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+ struct rds_transport *trans, gfp_t gfp);
+void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_reset(struct rds_connection *conn);
+void rds_conn_drop(struct rds_connection *conn);
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int (*visitor)(struct rds_connection *, void *),
+ size_t item_len);
+void __rds_conn_error(struct rds_connection *conn, const char *, ...)
+ __attribute__ ((format (printf, 2, 3)));
+#define rds_conn_error(conn, fmt...) \
+ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+
+static inline int
+rds_conn_transition(struct rds_connection *conn, int old, int new)
+{
+ return atomic_cmpxchg(&conn->c_state, old, new) == old;
+}
+
+static inline int
+rds_conn_state(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state);
+}
+
+static inline int
+rds_conn_up(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state) == RDS_CONN_UP;
+}
+
+static inline int
+rds_conn_connecting(struct rds_connection *conn)
+{
+ return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
+}
+
+/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len);
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+ __be16 dport, u64 seq);
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len);
+int rds_message_next_extension(struct rds_header *hdr,
+ unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size);
+void rds_message_inc_purge(struct rds_incoming *inc);
+void rds_message_inc_free(struct rds_incoming *inc);
+void rds_message_addref(struct rds_message *rm);
+void rds_message_put(struct rds_message *rm);
+void rds_message_wait(struct rds_message *rm);
+void rds_message_unmapped(struct rds_message *rm);
+
+static inline void rds_message_make_checksum(struct rds_header *hdr)
+{
+ hdr->h_csum = 0;
+ hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
+}
+
+static inline int rds_message_verify_checksum(const struct rds_header *hdr)
+{
+ return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
+}
+
+
+/* page.c */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+ gfp_t gfp);
+int rds_page_copy_user(struct page *page, unsigned long offset,
+ void __user *ptr, unsigned long bytes,
+ int to_user);
+#define rds_page_copy_to_user(page, offset, ptr, bytes) \
+ rds_page_copy_user(page, offset, ptr, bytes, 1)
+#define rds_page_copy_from_user(page, offset, ptr, bytes) \
+ rds_page_copy_user(page, offset, ptr, bytes, 0)
+void rds_page_exit(void);
+
+/* recv.c */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ __be32 saddr);
+void rds_inc_addref(struct rds_incoming *inc);
+void rds_inc_put(struct rds_incoming *inc);
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags);
+void rds_clear_recv_queue(struct rds_sock *rs);
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip);
+
+/* send.c */
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len);
+void rds_send_reset(struct rds_connection *conn);
+int rds_send_xmit(struct rds_connection *conn);
+struct sockaddr_in;
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked);
+int rds_send_acked_before(struct rds_connection *conn, u64 seq);
+void rds_send_remove_from_sock(struct list_head *messages, int status);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
+struct rds_message *rds_send_get_message(struct rds_connection *,
+ struct rds_rdma_op *);
+
+/* rdma.c */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+
+/* stats.c */
+DECLARE_PER_CPU(struct rds_statistics, rds_stats);
+#define rds_stats_inc_which(which, member) do { \
+ per_cpu(which, get_cpu()).member++; \
+ put_cpu(); \
+} while (0)
+#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
+#define rds_stats_add_which(which, member, count) do { \
+ per_cpu(which, get_cpu()).member += count; \
+ put_cpu(); \
+} while (0)
+#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
+int __init rds_stats_init(void);
+void rds_stats_exit(void);
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, char **names, size_t nr);
+
+/* sysctl.c */
+int __init rds_sysctl_init(void);
+void rds_sysctl_exit(void);
+extern unsigned long rds_sysctl_sndbuf_min;
+extern unsigned long rds_sysctl_sndbuf_default;
+extern unsigned long rds_sysctl_sndbuf_max;
+extern unsigned long rds_sysctl_reconnect_min_jiffies;
+extern unsigned long rds_sysctl_reconnect_max_jiffies;
+extern unsigned int rds_sysctl_max_unacked_packets;
+extern unsigned int rds_sysctl_max_unacked_bytes;
+extern unsigned int rds_sysctl_ping_enable;
+extern unsigned long rds_sysctl_trace_flags;
+extern unsigned int rds_sysctl_trace_level;
+
+/* threads.c */
+int __init rds_threads_init(void);
+void rds_threads_exit(void);
+extern struct workqueue_struct *rds_wq;
+void rds_connect_worker(struct work_struct *);
+void rds_shutdown_worker(struct work_struct *);
+void rds_send_worker(struct work_struct *);
+void rds_recv_worker(struct work_struct *);
+void rds_connect_complete(struct rds_connection *conn);
+
+/* transport.c */
+int rds_trans_register(struct rds_transport *trans);
+void rds_trans_unregister(struct rds_transport *trans);
+struct rds_transport *rds_trans_get_preferred(__be32 addr);
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail);
+int __init rds_trans_init(void);
+void rds_trans_exit(void);
+
+#endif
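
The kernel-doc block above struct rds_transport spells out the ->xmit return
convention that rds_send_xmit() in send.c relies on: return the number of
header plus payload bytes consumed, return 0 when the transport's queue is
full and it will reschedule the send itself, -EAGAIN to retry immediately and
-ENOMEM to retry later.  The fragment below is only a sketch of that contract;
example_ring_space() and example_ring_copy() are hypothetical stand-ins for a
transport's own queue accounting, not functions from this patch:

	/* Sketch of the ->xmit contract documented above; helpers are
	 * hypothetical and the ring accounting is transport specific. */
	static int example_xmit(struct rds_connection *conn, struct rds_message *rm,
				unsigned int hdr_off, unsigned int sg, unsigned int off)
	{
		unsigned int room = example_ring_space(conn);	/* hypothetical */
		unsigned int sent;

		if (room == 0)
			return 0;	/* queue full: no progress, caller stops */

		/* finish the header first, then payload from m_sg[sg] at 'off';
		 * 'sent' counts every byte accepted, header bytes included */
		sent = example_ring_copy(conn, rm, hdr_off, sg, off, room); /* hypothetical */
		if (!sent)
			return -ENOMEM;	/* transient failure: retry this send later */

		return sent;
	}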
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000000..f2118c51cfa3
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+ __be32 saddr)
+{
+ atomic_set(&inc->i_refcount, 1);
+ INIT_LIST_HEAD(&inc->i_item);
+ inc->i_conn = conn;
+ inc->i_saddr = saddr;
+ inc->i_rdma_cookie = 0;
+}
+
+void rds_inc_addref(struct rds_incoming *inc)
+{
+ rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+ atomic_inc(&inc->i_refcount);
+}
+
+void rds_inc_put(struct rds_incoming *inc)
+{
+ rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+ if (atomic_dec_and_test(&inc->i_refcount)) {
+ BUG_ON(!list_empty(&inc->i_item));
+
+ inc->i_conn->c_trans->inc_free(inc);
+ }
+}
+
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+ struct rds_cong_map *map,
+ int delta, __be16 port)
+{
+ int now_congested;
+
+ if (delta == 0)
+ return;
+
+ rs->rs_rcv_bytes += delta;
+ now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+ rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+ "now_cong %d delta %d\n",
+ rs, &rs->rs_bound_addr,
+ ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+ rds_sk_rcvbuf(rs), now_congested, delta);
+
+ /* wasn't -> am congested */
+ if (!rs->rs_congested && now_congested) {
+ rs->rs_congested = 1;
+ rds_cong_set_bit(map, port);
+ rds_cong_queue_updates(map);
+ }
+ /* was -> aren't congested */
+ /* Require more free space before reporting uncongested to prevent
+ bouncing cong/uncong state too often */
+ else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
+ rs->rs_congested = 0;
+ rds_cong_clear_bit(map, port);
+ rds_cong_queue_updates(map);
+ }
+
+ /* do nothing if no change in cong state */
+}
+
+/*
+ * Process all extension headers that come with this message.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+ struct rds_header *hdr = &inc->i_hdr;
+ unsigned int pos = 0, type, len;
+ union {
+ struct rds_ext_header_version version;
+ struct rds_ext_header_rdma rdma;
+ struct rds_ext_header_rdma_dest rdma_dest;
+ } buffer;
+
+ while (1) {
+ len = sizeof(buffer);
+ type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+ if (type == RDS_EXTHDR_NONE)
+ break;
+ /* Process extension header here */
+ switch (type) {
+ case RDS_EXTHDR_RDMA:
+ rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
+ break;
+
+ case RDS_EXTHDR_RDMA_DEST:
+ /* We ignore the size for now. We could stash it
+ * somewhere and use it for error checking. */
+ inc->i_rdma_cookie = rds_rdma_make_cookie(
+ be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
+ be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
+
+ break;
+ }
+ }
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time. This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival. It does mean
+ * that small messages will wait behind large ones. Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn. This lets loopback, which only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km)
+{
+ struct rds_sock *rs = NULL;
+ struct sock *sk;
+ unsigned long flags;
+
+ inc->i_conn = conn;
+ inc->i_rx_jiffies = jiffies;
+
+ rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+ "flags 0x%x rx_jiffies %lu\n", conn,
+ (unsigned long long)conn->c_next_rx_seq,
+ inc,
+ (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+ be32_to_cpu(inc->i_hdr.h_len),
+ be16_to_cpu(inc->i_hdr.h_sport),
+ be16_to_cpu(inc->i_hdr.h_dport),
+ inc->i_hdr.h_flags,
+ inc->i_rx_jiffies);
+
+ /*
+ * Sequence numbers should only increase. Messages get their
+ * sequence number as they're queued in a sending conn. They
+ * can be dropped, though, if the sending socket is closed before
+ * they hit the wire. So sequence numbers can skip forward
+ * under normal operation. They can also drop back in the conn
+ * failover case as previously sent messages are resent down the
+ * new instance of a conn. We drop those, otherwise we have
+ * to assume that the next valid seq does not come after a
+ * hole in the fragment stream.
+ *
+ * The headers don't give us a way to realize if fragments of
+ * a message have been dropped. We assume that frags that arrive
+ * to a flow are part of the current message on the flow that is
+ * being reassembled. This means that senders can't drop messages
+ * from the sending conn until all their frags are sent.
+ *
+ * XXX we could spend more on the wire to get more robust failure
+ * detection, arguably worth it to avoid data corruption.
+ */
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+ && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+ rds_stats_inc(s_recv_drop_old_seq);
+ goto out;
+ }
+ conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+
+ if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+ rds_stats_inc(s_recv_ping);
+ rds_send_pong(conn, inc->i_hdr.h_sport);
+ goto out;
+ }
+
+ rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+ if (rs == NULL) {
+ rds_stats_inc(s_recv_drop_no_sock);
+ goto out;
+ }
+
+ /* Process extension headers */
+ rds_recv_incoming_exthdrs(inc, rs);
+
+ /* We can be racing with rds_release() which marks the socket dead. */
+ sk = rds_rs_to_sk(rs);
+
+ /* serialize with rds_release -> sock_orphan */
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+ rds_stats_inc(s_recv_queued);
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ rds_inc_addref(inc);
+ list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+ __rds_wake_sk_sleep(sk);
+ } else {
+ rds_stats_inc(s_recv_drop_dead_sock);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+out:
+ if (rs)
+ rds_sock_put(rs);
+}
+
+/*
+ * be very careful here. This is being called as the condition in
+ * wait_event_*() needs to cope with being called many times.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+ unsigned long flags;
+
+ if (*inc == NULL) {
+ read_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&rs->rs_recv_queue)) {
+ *inc = list_entry(rs->rs_recv_queue.next,
+ struct rds_incoming,
+ i_item);
+ rds_inc_addref(*inc);
+ }
+ read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+ }
+
+ return *inc != NULL;
+}
+
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+ int drop)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ int ret = 0;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ if (!list_empty(&inc->i_item)) {
+ ret = 1;
+ if (drop) {
+ /* XXX make sure this i_conn is reliable */
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+ rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+ return ret;
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ struct rds_notifier *notifier;
+ struct rds_rdma_notify cmsg;
+ unsigned int count = 0, max_messages = ~0U;
+ unsigned long flags;
+ LIST_HEAD(copy);
+ int err = 0;
+
+
+ /* put_cmsg copies to user space and thus may sleep. We can't do this
+ * with rs_lock held, so first grab as many notifications as we can stuff
+ * in the user provided cmsg buffer. We don't try to copy more, to avoid
+ * losing notifications - except when the buffer is so small that it wouldn't
+ * even hold a single notification. Then we give the caller as much of this
+ * single msg as we can squeeze in, and set MSG_CTRUNC.
+ */
+ if (msghdr) {
+ max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+ if (!max_messages)
+ max_messages = 1;
+ }
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+ notifier = list_entry(rs->rs_notify_queue.next,
+ struct rds_notifier, n_list);
+ list_move(&notifier->n_list, &copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (!count)
+ return 0;
+
+ while (!list_empty(&copy)) {
+ notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+ if (msghdr) {
+ cmsg.user_token = notifier->n_user_token;
+ cmsg.status = notifier->n_status;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+ sizeof(cmsg), &cmsg);
+ if (err)
+ break;
+ }
+
+ list_del_init(&notifier->n_list);
+ kfree(notifier);
+ }
+
+ /* If we bailed out because of an error in put_cmsg,
+ * we may be left with one or more notifications that we
+ * didn't process. Return them to the head of the list. */
+ if (!list_empty(&copy)) {
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ list_splice(&copy, &rs->rs_notify_queue);
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+ }
+
+ return err;
+}
+
+/*
+ * Queue a congestion notification
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+ uint64_t notify = rs->rs_cong_notify;
+ unsigned long flags;
+ int err;
+
+ err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+ sizeof(notify), &notify);
+ if (err)
+ return err;
+
+ spin_lock_irqsave(&rs->rs_lock, flags);
+ rs->rs_cong_notify &= ~notify;
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ return 0;
+}
+
+/*
+ * Receive any control messages.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+ int ret = 0;
+
+ if (inc->i_rdma_cookie) {
+ ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+ sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ long timeo;
+ int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ struct sockaddr_in *sin;
+ struct rds_incoming *inc = NULL;
+
+ /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+
+ if (msg_flags & MSG_OOB)
+ goto out;
+
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ goto out;
+ }
+
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ goto out;
+ }
+
+ while (1) {
+ if (!rds_next_incoming(rs, &inc)) {
+ if (nonblock) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+ rds_next_incoming(rs, &inc),
+ timeo);
+ rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+ timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+ &inc->i_conn->c_faddr,
+ ntohs(inc->i_hdr.h_sport));
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+ size);
+ if (ret < 0)
+ break;
+
+ /*
+ * if the message we just copied isn't at the head of the
+ * recv queue then someone else raced us to return it, try
+ * to get the next message.
+ */
+ if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+ rds_inc_put(inc);
+ inc = NULL;
+ rds_stats_inc(s_recv_deliver_raced);
+ continue;
+ }
+
+ if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+ if (msg_flags & MSG_TRUNC)
+ ret = be32_to_cpu(inc->i_hdr.h_len);
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
+ if (rds_cmsg_recv(inc, msg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ rds_stats_inc(s_recv_delivered);
+
+ sin = (struct sockaddr_in *)msg->msg_name;
+ if (sin) {
+ sin->sin_family = AF_INET;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr = inc->i_saddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ }
+ break;
+ }
+
+ if (inc)
+ rds_inc_put(inc);
+
+out:
+ return ret;
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg. The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+ struct sock *sk = rds_rs_to_sk(rs);
+ struct rds_incoming *inc, *tmp;
+ unsigned long flags;
+
+ write_lock_irqsave(&rs->rs_recv_lock, flags);
+ list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
+ rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+ -be32_to_cpu(inc->i_hdr.h_len),
+ inc->i_hdr.h_dport);
+ list_del_init(&inc->i_item);
+ rds_inc_put(inc);
+ }
+ write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ __be32 saddr, __be32 daddr, int flip)
+{
+ struct rds_info_message minfo;
+
+ minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+ minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+
+ if (flip) {
+ minfo.laddr = daddr;
+ minfo.faddr = saddr;
+ minfo.lport = inc->i_hdr.h_dport;
+ minfo.fport = inc->i_hdr.h_sport;
+ } else {
+ minfo.laddr = saddr;
+ minfo.faddr = daddr;
+ minfo.lport = inc->i_hdr.h_sport;
+ minfo.fport = inc->i_hdr.h_dport;
+ }
+
+ rds_info_copy(iter, &minfo, sizeof(minfo));
+}
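
rds_notify_queue_get() above delivers failed-RDMA notifications to the
application as RDS_CMSG_RDMA_STATUS control messages, and rds_recvmsg()
returns them ahead of any data.  The fragment below is a rough userspace
sketch of draining that queue, assuming the uapi definitions from
linux/rds.h (SOL_RDS, RDS_CMSG_RDMA_STATUS, struct rds_rdma_notify); the
function name and buffer sizes are illustrative only:

	/* Userspace sketch (needs <sys/socket.h>, <string.h>, <linux/rds.h>);
	 * 'fd' is an already-bound RDS socket.  Error handling omitted. */
	static void drain_rdma_notifications(int fd)
	{
		char ctl[1024], dummy;
		struct iovec iov = { .iov_base = &dummy, .iov_len = sizeof(dummy) };
		struct msghdr msg = {
			.msg_iov = &iov, .msg_iovlen = 1,
			.msg_control = ctl, .msg_controllen = sizeof(ctl),
		};
		struct cmsghdr *c;
		struct rds_rdma_notify note;

		if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
			return;

		for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
			if (c->cmsg_level != SOL_RDS ||
			    c->cmsg_type != RDS_CMSG_RDMA_STATUS)
				continue;
			memcpy(&note, CMSG_DATA(c), sizeof(note));
			/* note.user_token names the RDMA op the application
			 * queued; note.status is its completion status. */
		}
	}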
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 000000000000..1b37364656f0
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1003 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+
+#include "rds.h"
+#include "rdma.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the soft lockup watchdog
+ * will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ */
+static int send_batch_count = 64;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+
+/*
+ * Reset the send state. Caller must hold c_send_lock when calling here.
+ */
+void rds_send_reset(struct rds_connection *conn)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+
+ if (conn->c_xmit_rm) {
+ /* Tell the user the RDMA op is no longer mapped by the
+ * transport. This isn't entirely true (it's flushed out
+ * independently) but as the connection is down, there's
+ * no ongoing RDMA to/from that memory */
+ rds_message_unmapped(conn->c_xmit_rm);
+ rds_message_put(conn->c_xmit_rm);
+ conn->c_xmit_rm = NULL;
+ }
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ conn->c_map_queued = 0;
+
+ conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+ /* Mark messages as retransmissions, and move them to the send q */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+ set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+ }
+ list_splice_init(&conn->c_retrans, &conn->c_send_queue);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ * Pro:
+ * - tx queueing is a simple fifo list
+ * - reassembly is optional and easily done by transports per conn
+ * - no per flow rx lookup at all, straight to the socket
+ * - less per-frag memory and wire overhead
+ * Con:
+ * - queued acks can be delayed behind large messages
+ * Depends:
+ * - small message latency is higher behind queued large messages
+ * - large message latency isn't starved by intervening small sends
+ */
+int rds_send_xmit(struct rds_connection *conn)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ unsigned int tmp;
+ unsigned int send_quota = send_batch_count;
+ struct scatterlist *sg;
+ int ret = 0;
+ int was_empty = 0;
+ LIST_HEAD(to_be_dropped);
+
+ /*
+ * sendmsg calls here after having queued its message on the send
+ * queue. We only have one task feeding the connection at a time. If
+ * another thread is already feeding the queue then we back off. This
+ * avoids blocking the caller and trading per-connection data between
+ * caches per message.
+ *
+ * The sem holder will issue a retry if they notice that someone queued
+ * a message after they stopped walking the send queue but before they
+ * dropped the sem.
+ */
+ if (!mutex_trylock(&conn->c_send_lock)) {
+ rds_stats_inc(s_send_sem_contention);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (conn->c_trans->xmit_prepare)
+ conn->c_trans->xmit_prepare(conn);
+
+ /*
+ * spin trying to push headers and data down the connection until
+ * the connection doesn't make forward progress.
+ */
+ while (--send_quota) {
+ /*
+ * See if need to send a congestion map update if we're
+ * between sending messages. The send_sem protects our sole
+ * use of c_map_offset and _bytes.
+ * Note this is used only by transports that define a special
+ * xmit_cong_map function. For all others, we allocate
+ * a cong_map message and treat it just like any other send.
+ */
+ if (conn->c_map_bytes) {
+ ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+ conn->c_map_offset);
+ if (ret <= 0)
+ break;
+
+ conn->c_map_offset += ret;
+ conn->c_map_bytes -= ret;
+ if (conn->c_map_bytes)
+ continue;
+ }
+
+ /* If we're done sending the current message, clear the
+ * offset and S/G temporaries.
+ */
+ rm = conn->c_xmit_rm;
+ if (rm != NULL &&
+ conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+ conn->c_xmit_sg == rm->m_nents) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
+
+ /* Release the reference to the previous message. */
+ rds_message_put(rm);
+ rm = NULL;
+ }
+
+ /* If we're asked to send a cong map update, do so.
+ */
+ if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (conn->c_trans->xmit_cong_map != NULL) {
+ conn->c_map_offset = 0;
+ conn->c_map_bytes = sizeof(struct rds_header) +
+ RDS_CONG_MAP_BYTES;
+ continue;
+ }
+
+ rm = rds_cong_update_alloc(conn);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ break;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Grab the next message from the send queue, if there is one.
+ *
+ * c_xmit_rm holds a ref while we're sending this message down
+ * the connection. We can use this ref while holding the
+ * send_sem.. rds_send_reset() is serialized with it.
+ */
+ if (rm == NULL) {
+ unsigned int len;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ if (!list_empty(&conn->c_send_queue)) {
+ rm = list_entry(conn->c_send_queue.next,
+ struct rds_message,
+ m_conn_item);
+ rds_message_addref(rm);
+
+ /*
+ * Move the message from the send queue to the retransmit
+ * list right away.
+ */
+ list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+ }
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ if (rm == NULL) {
+ was_empty = 1;
+ break;
+ }
+
+ /* Unfortunately, the way Infiniband deals with
+ * RDMA to a bad MR key is by moving the entire
+ * queue pair to error state. We could possibly
+ * recover from that, but right now we drop the
+ * connection.
+ * Therefore, we never retransmit messages with RDMA ops.
+ */
+ if (rm->m_rdma_op
+ && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ list_move(&rm->m_conn_item, &to_be_dropped);
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ rds_message_put(rm);
+ continue;
+ }
+
+ /* Require an ACK every once in a while */
+ len = ntohl(rm->m_inc.i_hdr.h_len);
+ if (conn->c_unacked_packets == 0
+ || conn->c_unacked_bytes < len) {
+ __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+ conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+ rds_stats_inc(s_send_ack_required);
+ } else {
+ conn->c_unacked_bytes -= len;
+ conn->c_unacked_packets--;
+ }
+
+ conn->c_xmit_rm = rm;
+ }
+
+ /*
+ * Try and send an rdma message. Let's see if we can
+ * keep this simple and require that the transport either
+ * send the whole rdma or none of it.
+ */
+ if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+ if (ret)
+ break;
+ conn->c_xmit_rdma_sent = 1;
+ /* The transport owns the mapped memory for now.
+ * You can't unmap it while it's on the send queue */
+ set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+ }
+
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+ conn->c_xmit_sg < rm->m_nents) {
+ ret = conn->c_trans->xmit(conn, rm,
+ conn->c_xmit_hdr_off,
+ conn->c_xmit_sg,
+ conn->c_xmit_data_off);
+ if (ret <= 0)
+ break;
+
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+ tmp = min_t(int, ret,
+ sizeof(struct rds_header) -
+ conn->c_xmit_hdr_off);
+ conn->c_xmit_hdr_off += tmp;
+ ret -= tmp;
+ }
+
+ sg = &rm->m_sg[conn->c_xmit_sg];
+ while (ret) {
+ tmp = min_t(int, ret, sg->length -
+ conn->c_xmit_data_off);
+ conn->c_xmit_data_off += tmp;
+ ret -= tmp;
+ if (conn->c_xmit_data_off == sg->length) {
+ conn->c_xmit_data_off = 0;
+ sg++;
+ conn->c_xmit_sg++;
+ BUG_ON(ret != 0 &&
+ conn->c_xmit_sg == rm->m_nents);
+ }
+ }
+ }
+ }
+
+ /* Nuke any messages we decided not to retransmit. */
+ if (!list_empty(&to_be_dropped))
+ rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+
+ if (conn->c_trans->xmit_complete)
+ conn->c_trans->xmit_complete(conn);
+
+ /*
+ * We might be racing with another sender who queued a message but
+ * backed off on noticing that we held the c_send_lock. If we check
+ * for queued messages after dropping the sem then either we'll
+ * see the queued message or the queuer will get the sem. If we
+ * notice the queued message then we trigger an immediate retry.
+ *
+ * We need to be careful only to do this when we stopped processing
+ * the send queue because it was empty. It's the only way we
+ * stop processing the loop when the transport hasn't taken
+ * responsibility for forward progress.
+ */
+ mutex_unlock(&conn->c_send_lock);
+
+ if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+ /* We exhausted the send quota, but there's work left to
+ * do. Return and (re-)schedule the send worker.
+ */
+ ret = -EAGAIN;
+ }
+
+ if (ret == 0 && was_empty) {
+ /* A simple bit test would be way faster than taking the
+ * spin lock */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (!list_empty(&conn->c_send_queue)) {
+ rds_stats_inc(s_send_sem_queue_raced);
+ ret = -EAGAIN;
+ }
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ }
+out:
+ return ret;
+}
+
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+ u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ assert_spin_locked(&rs->rs_lock);
+
+ BUG_ON(rs->rs_snd_bytes < len);
+ rs->rs_snd_bytes -= len;
+
+ if (rs->rs_snd_bytes == 0)
+ rds_stats_inc(s_send_queue_empty);
+}
+
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+ is_acked_func is_acked)
+{
+ if (is_acked)
+ return is_acked(rm, ack);
+ return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+}
+
+/*
+ * Returns true if there are no messages on the send and retransmit queues
+ * which have a sequence number greater than or equal to the given sequence
+ * number.
+ */
+int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+{
+ struct rds_message *rm, *tmp;
+ int ret = 1;
+
+ spin_lock(&conn->c_lock);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
+ }
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
+ }
+
+ spin_unlock(&conn->c_lock);
+
+ return ret;
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+ struct rds_sock *rs = NULL;
+ struct rds_rdma_op *ro;
+ struct rds_notifier *notifier;
+
+ spin_lock(&rm->m_rs_lock);
+
+ ro = rm->m_rdma_op;
+ if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+ && ro && ro->r_notify && ro->r_notifier) {
+ notifier = ro->r_notifier;
+ rs = rm->m_rs;
+ sock_hold(rds_rs_to_sk(rs));
+
+ notifier->n_status = status;
+ spin_lock(&rs->rs_lock);
+ list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+ spin_unlock(&rs->rs_lock);
+
+ ro->r_notifier = NULL;
+ }
+
+ spin_unlock(&rm->m_rs_lock);
+
+ if (rs) {
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+}
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ */
+static inline void
+__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+ struct rds_rdma_op *ro;
+
+ ro = rm->m_rdma_op;
+ if (ro && ro->r_notify && ro->r_notifier) {
+ ro->r_notifier->n_status = status;
+ list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
+ ro->r_notifier = NULL;
+ }
+
+ /* No need to wake the app - caller does this */
+}
+
+/*
+ * This is called from the IB send completion when we detect
+ * a RDMA operation that failed with remote access error.
+ * So speed is not an issue here.
+ */
+struct rds_message *rds_send_get_message(struct rds_connection *conn,
+ struct rds_rdma_op *op)
+{
+ struct rds_message *rm, *tmp, *found = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (rm->m_rdma_op == op) {
+ atomic_inc(&rm->m_refcount);
+ found = rm;
+ goto out;
+ }
+ }
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (rm->m_rdma_op == op) {
+ atomic_inc(&rm->m_refcount);
+ found = rm;
+ break;
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ return found;
+}
+
+/*
+ * This removes messages from the socket's list if they're on it. The list
+ * argument must be private to the caller; we must be able to modify it
+ * without locks. The messages must have a reference held for their
+ * position on the list. This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of whether it
+ * found the messages on the socket list or not.
+ */
+void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+ unsigned long flags = 0; /* silence gcc :P */
+ struct rds_sock *rs = NULL;
+ struct rds_message *rm;
+
+ local_irq_save(flags);
+ while (!list_empty(messages)) {
+ rm = list_entry(messages->next, struct rds_message,
+ m_conn_item);
+ list_del_init(&rm->m_conn_item);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the sock. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ *
+ * The message spinlock makes sure nobody clears rm->m_rs
+ * while we're messing with it. It does not prevent the
+ * message from being removed from the socket, though.
+ */
+ spin_lock(&rm->m_rs_lock);
+ if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+ goto unlock_and_drop;
+
+ if (rs != rm->m_rs) {
+ if (rs) {
+ spin_unlock(&rs->rs_lock);
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+ rs = rm->m_rs;
+ spin_lock(&rs->rs_lock);
+ sock_hold(rds_rs_to_sk(rs));
+ }
+
+ if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+ struct rds_rdma_op *ro = rm->m_rdma_op;
+ struct rds_notifier *notifier;
+
+ list_del_init(&rm->m_sock_item);
+ rds_send_sndbuf_remove(rs, rm);
+
+ if (ro && ro->r_notifier
+ && (status || ro->r_notify)) {
+ notifier = ro->r_notifier;
+ list_add_tail(&notifier->n_list,
+ &rs->rs_notify_queue);
+ if (!notifier->n_status)
+ notifier->n_status = status;
+ rm->m_rdma_op->r_notifier = NULL;
+ }
+ rds_message_put(rm);
+ rm->m_rs = NULL;
+ }
+
+unlock_and_drop:
+ spin_unlock(&rm->m_rs_lock);
+ rds_message_put(rm);
+ }
+
+ if (rs) {
+ spin_unlock(&rs->rs_lock);
+ rds_wake_sk_sleep(rs);
+ sock_put(rds_rs_to_sk(rs));
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number. Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction. Maybe it should bail if it sees SOCK_DEAD.
+ */
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+ is_acked_func is_acked)
+{
+ struct rds_message *rm, *tmp;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (!rds_send_is_acked(rm, ack, is_acked))
+ break;
+
+ list_move(&rm->m_conn_item, &list);
+ clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ }
+
+ /* order flag updates with spin locks */
+ if (!list_empty(&list))
+ smp_mb__after_clear_bit();
+
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ /* now remove the messages from the sock list as needed */
+ rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
+}
+
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+{
+ struct rds_message *rm, *tmp;
+ struct rds_connection *conn;
+ unsigned long flags;
+ LIST_HEAD(list);
+ int wake = 0;
+
+ /* get all the messages we're dropping under the rs lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
+ if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
+ dest->sin_port != rm->m_inc.i_hdr.h_dport))
+ continue;
+
+ wake = 1;
+ list_move(&rm->m_sock_item, &list);
+ rds_send_sndbuf_remove(rs, rm);
+ clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+
+ /* If this is a RDMA operation, notify the app. */
+ __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
+ }
+
+ /* order flag updates with the rs lock */
+ if (wake)
+ smp_mb__after_clear_bit();
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+ if (wake)
+ rds_wake_sk_sleep(rs);
+
+ conn = NULL;
+
+ /* now remove the messages from the conn list as needed */
+ list_for_each_entry(rm, &list, m_sock_item) {
+ /* We do this here rather than in the loop above, so that
+ * we don't have to nest m_rs_lock under rs->rs_lock */
+ spin_lock(&rm->m_rs_lock);
+ rm->m_rs = NULL;
+ spin_unlock(&rm->m_rs_lock);
+
+ /*
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the conn. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
+ */
+ if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+ continue;
+
+ if (conn != rm->m_inc.i_conn) {
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ conn = rm->m_inc.i_conn;
+ spin_lock_irqsave(&conn->c_lock, flags);
+ }
+
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ list_del_init(&rm->m_conn_item);
+ rds_message_put(rm);
+ }
+ }
+
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ while (!list_empty(&list)) {
+ rm = list_entry(list.next, struct rds_message, m_sock_item);
+ list_del_init(&rm->m_sock_item);
+
+ rds_message_wait(rm);
+ rds_message_put(rm);
+ }
+}
+
+/*
+ * We only want this to fire once, so we use the caller's 'queued'. It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDS_CANCEL_SENT_TO.
+ */
+static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+ struct rds_message *rm, __be16 sport,
+ __be16 dport, int *queued)
+{
+ unsigned long flags;
+ u32 len;
+
+ if (*queued)
+ goto out;
+
+ len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+ /* this is the only place which holds both the socket's rs_lock
+ * and the connection's c_lock */
+ spin_lock_irqsave(&rs->rs_lock, flags);
+
+ /*
+ * If we only queued a message when it fit completely, then with just a
+ * little space left in sndbuf we would queue nothing and userspace would
+ * get -EAGAIN while poll() still indicated send room. That can lead to
+ * bad behavior (spinning) if snd_bytes isn't freed up by incoming acks.
+ * So we check the *old* value of rs_snd_bytes here, allowing the last
+ * msg to exceed the buffer, after which poll() knows that no more data
+ * can be sent.
+ */
+ if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
+ rs->rs_snd_bytes += len;
+
+ /* Let the recv side know we are close to send space exhaustion.
+ * This is probably not the optimal way to do it, as it sets the
+ * flag on *all* messages once the queued bytes pass half of the
+ * send buffer.
+ */
+ if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
+ __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+ list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
+ set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_rs = rs;
+
+ /* The code ordering is a little weird, but we're
+ trying to minimize the time we hold c_lock */
+ rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
+ rm->m_inc.i_conn = conn;
+ rds_message_addref(rm);
+
+ spin_lock(&conn->c_lock);
+ rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ spin_unlock(&conn->c_lock);
+
+ rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
+ rm, len, rs, rs->rs_snd_bytes,
+ (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
+
+ *queued = 1;
+ }
+
+ spin_unlock_irqrestore(&rs->rs_lock, flags);
+out:
+ return *queued;
+}
+
+static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
+ struct msghdr *msg, int *allocated_mr)
+{
+ struct cmsghdr *cmsg;
+ int ret = 0;
+
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_RDS)
+ continue;
+
+ /* As a side effect, RDMA_DEST and RDMA_MAP will set
+ * rm->m_rdma_cookie and rm->m_rdma_mr.
+ */
+ switch (cmsg->cmsg_type) {
+ case RDS_CMSG_RDMA_ARGS:
+ ret = rds_cmsg_rdma_args(rs, rm, cmsg);
+ break;
+
+ case RDS_CMSG_RDMA_DEST:
+ ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
+ break;
+
+ case RDS_CMSG_RDMA_MAP:
+ ret = rds_cmsg_rdma_map(rs, rm, cmsg);
+ if (!ret)
+ *allocated_mr = 1;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len)
+{
+ struct sock *sk = sock->sk;
+ struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+ __be32 daddr;
+ __be16 dport;
+ struct rds_message *rm = NULL;
+ struct rds_connection *conn;
+ int ret = 0;
+ int queued = 0, allocated_mr = 0;
+ int nonblock = msg->msg_flags & MSG_DONTWAIT;
+ long timeo = sock_rcvtimeo(sk, nonblock);
+
+ /* Mirror Linux UDP behaviour for BSD error message compatibility */
+ /* XXX: Perhaps MSG_MORE someday */
+ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+ printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (msg->msg_namelen) {
+ /* XXX fail non-unicast destination IPs? */
+ if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+ ret = -EINVAL;
+ goto out;
+ }
+ daddr = usin->sin_addr.s_addr;
+ dport = usin->sin_port;
+ } else {
+ /* We only care about consistency with ->connect() */
+ lock_sock(sk);
+ daddr = rs->rs_conn_addr;
+ dport = rs->rs_conn_port;
+ release_sock(sk);
+ }
+
+ /* racing with another thread binding seems ok here */
+ if (daddr == 0 || rs->rs_bound_addr == 0) {
+ ret = -ENOTCONN; /* XXX not a great errno */
+ goto out;
+ }
+
+ rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ rm = NULL;
+ goto out;
+ }
+
+ rm->m_daddr = daddr;
+
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret)
+ goto out;
+
+ /* rds_conn_create has a spinlock that runs with IRQ off.
+ * Caching the conn in the socket helps a lot. */
+ if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+ conn = rs->rs_conn;
+ else {
+ conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+ rs->rs_transport,
+ sock->sk->sk_allocation);
+ if (IS_ERR(conn)) {
+ ret = PTR_ERR(conn);
+ goto out;
+ }
+ rs->rs_conn = conn;
+ }
+
+ if ((rm->m_rdma_cookie || rm->m_rdma_op)
+ && conn->c_trans->xmit_rdma == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+ rm->m_rdma_op, conn->c_trans->xmit_rdma);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+ if (ret)
+ goto out;
+
+ while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+ dport, &queued)) {
+ rds_stats_inc(s_send_queue_full);
+ /* XXX make sure this is reasonable */
+ if (payload_len > rds_sk_sndbuf(rs)) {
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ if (nonblock) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+ rds_send_queue_rm(rs, conn, rm,
+ rs->rs_bound_port,
+ dport,
+ &queued),
+ timeo);
+ rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+ if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+ continue;
+
+ ret = timeo;
+ if (ret == 0)
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
+ /*
+ * By now we've committed to the send. We reuse rds_send_worker()
+ * to retry sends in the rds thread if the transport asks us to.
+ */
+ rds_stats_inc(s_send_queued);
+
+ if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+ rds_send_worker(&conn->c_send_w.work);
+
+ rds_message_put(rm);
+ return payload_len;
+
+out:
+ /* If the user included an RDMA_MAP cmsg, we allocated an MR on the fly.
+ * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+ * or in any other way, we need to destroy the MR again. */
+ if (allocated_mr)
+ rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
+
+/*
+ * Reply to a ping packet.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+ struct rds_message *rm;
+ unsigned long flags;
+ int ret = 0;
+
+ rm = rds_message_alloc(0, GFP_ATOMIC);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rm->m_daddr = conn->c_faddr;
+
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+
+ ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+ if (ret)
+ goto out;
+
+ spin_lock_irqsave(&conn->c_lock, flags);
+ list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+ set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+ rds_message_addref(rm);
+ rm->m_inc.i_conn = conn;
+
+ rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+ conn->c_next_tx_seq);
+ conn->c_next_tx_seq++;
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+
+ rds_stats_inc(s_send_queued);
+ rds_stats_inc(s_send_pong);
+
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ rds_message_put(rm);
+ return 0;
+
+out:
+ if (rm)
+ rds_message_put(rm);
+ return ret;
+}
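
A note on the sndbuf accounting above: rds_send_queue_rm() tests the *old* value of rs_snd_bytes, so the message that crosses the limit is still queued and poll() then agrees with sendmsg() that no more data fits. A minimal userspace-style sketch of that admission rule, using hypothetical names and plain integers (not part of the patch):

#include <stdint.h>

/* Sketch only: admit a message while the buffer was not already full;
 * the final message may overshoot the limit, which is what lets poll()
 * report "no send room" instead of the caller spinning on -EAGAIN. */
static int sndbuf_admit(uint32_t *snd_bytes, uint32_t sndbuf, uint32_t len)
{
	if (*snd_bytes >= sndbuf)	/* already full: caller would get -EAGAIN */
		return 0;
	*snd_bytes += len;		/* may exceed sndbuf for the last message */
	return 1;
}
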
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 000000000000..637146893cf3
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+static char *rds_stat_names[] = {
+ "conn_reset",
+ "recv_drop_bad_checksum",
+ "recv_drop_old_seq",
+ "recv_drop_no_sock",
+ "recv_drop_dead_sock",
+ "recv_deliver_raced",
+ "recv_delivered",
+ "recv_queued",
+ "recv_immediate_retry",
+ "recv_delayed_retry",
+ "recv_ack_required",
+ "recv_rdma_bytes",
+ "recv_ping",
+ "send_queue_empty",
+ "send_queue_full",
+ "send_sem_contention",
+ "send_sem_queue_raced",
+ "send_immediate_retry",
+ "send_delayed_retry",
+ "send_drop_acked",
+ "send_ack_required",
+ "send_queued",
+ "send_rdma",
+ "send_rdma_bytes",
+ "send_pong",
+ "page_remainder_hit",
+ "page_remainder_miss",
+ "copy_to_user",
+ "copy_from_user",
+ "cong_update_queued",
+ "cong_update_received",
+ "cong_send_error",
+ "cong_send_blocked",
+};
+
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+ uint64_t *values, char **names, size_t nr)
+{
+ struct rds_info_counter ctr;
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
+ strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
+ ctr.value = values[i];
+
+ rds_info_copy(iter, &ctr, sizeof(ctr));
+ }
+}
+
+/*
+ * This gives global counters across all the transports. The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting. Some are pretty implementation-dependent
+ * and may change over time. That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte-granular
+ * length in userspace. It converts that to the number of stat entries that
+ * the rest of the functions operate on.
+ */
+static void rds_stats_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_statistics stats = {0, };
+ uint64_t *src;
+ uint64_t *sum;
+ size_t i;
+ int cpu;
+ unsigned int avail;
+
+ avail = len / sizeof(struct rds_info_counter);
+
+ if (avail < ARRAY_SIZE(rds_stat_names)) {
+ avail = 0;
+ goto trans;
+ }
+
+ for_each_online_cpu(cpu) {
+ src = (uint64_t *)&(per_cpu(rds_stats, cpu));
+ sum = (uint64_t *)&stats;
+ for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+ *(sum++) += *(src++);
+ }
+
+ rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
+ ARRAY_SIZE(rds_stat_names));
+ avail -= ARRAY_SIZE(rds_stat_names);
+
+trans:
+ lens->each = sizeof(struct rds_info_counter);
+ lens->nr = rds_trans_stats_info_copy(iter, avail) +
+ ARRAY_SIZE(rds_stat_names);
+}
+
+void rds_stats_exit(void)
+{
+ rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
+}
+
+int __init rds_stats_init(void)
+{
+ rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
+ return 0;
+}
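
For reference, rds_stats_info() above folds the per-CPU copies of struct rds_statistics into one snapshot by walking each copy as a flat array of u64 counters. A minimal userspace-style sketch of that summation, with a hypothetical counter struct instead of the kernel's per-CPU machinery (not part of the patch):

#include <stdint.h>
#include <stddef.h>

/* Sketch only: every field is a u64, so each per-CPU copy can be added
 * into the total by treating both structs as arrays of counters. */
struct counters {
	uint64_t send_queued;
	uint64_t send_pong;
	uint64_t recv_delivered;
};

static void counters_sum(const struct counters *percpu, int ncpus,
			 struct counters *total)
{
	int cpu;
	size_t i;

	for (cpu = 0; cpu < ncpus; cpu++) {
		const uint64_t *src = (const uint64_t *)&percpu[cpu];
		uint64_t *dst = (uint64_t *)total;

		for (i = 0; i < sizeof(*total) / sizeof(uint64_t); i++)
			dst[i] += src[i];
	}
}
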
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 000000000000..307dc5c1be15
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+unsigned int rds_sysctl_max_unacked_packets = 8;
+unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
+
+unsigned int rds_sysctl_ping_enable = 1;
+
+static ctl_table rds_sysctl_rds_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "reconnect_min_delay_ms",
+ .data = &rds_sysctl_reconnect_min_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min,
+ .extra2 = &rds_sysctl_reconnect_max_jiffies,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "reconnect_max_delay_ms",
+ .data = &rds_sysctl_reconnect_max_jiffies,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
+ .extra1 = &rds_sysctl_reconnect_min_jiffies,
+ .extra2 = &rds_sysctl_reconnect_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unacked_packets",
+ .data = &rds_sysctl_max_unacked_packets,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unacked_bytes",
+ .data = &rds_sysctl_max_unacked_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ping_enable",
+ .data = &rds_sysctl_ping_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { }
+};
+
+
+void rds_sysctl_exit(void)
+{
+ if (rds_sysctl_reg_table)
+ unregister_sysctl_table(rds_sysctl_reg_table);
+}
+
+int __init rds_sysctl_init(void)
+{
+ rds_sysctl_reconnect_min = msecs_to_jiffies(1);
+ rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+ rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
+ if (rds_sysctl_reg_table == NULL)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 000000000000..828a1bf9ea92
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+
+#include "rds.h"
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!), so we
+ * keep a thread around that can block on allocation when the receive
+ * fast path sees an allocation failure.
+ */
+
+/* Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ * ANY -> ERROR
+ * UP -> DISCONNECTING
+ * ERROR -> DISCONNECTING
+ * DISCONNECTING -> DOWN
+ * DOWN -> CONNECTING
+ * CONNECTING -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ * - Inside the shutdown worker; synchronizes with xmit path
+ * through c_send_lock, and with connection management callbacks
+ * via c_cm_lock.
+ *
+ * For receive callbacks, we rely on the underlying transport
+ * (TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct workqueue_struct *rds_wq;
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+ if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+ printk(KERN_WARNING "%s: Cannot transition to state UP, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
+ return;
+ }
+
+ rdsdebug("conn %p for %pI4 to %pI4 complete\n",
+ conn, &conn->c_laddr, &conn->c_faddr);
+
+ conn->c_reconnect_jiffies = 0;
+ set_bit(0, &conn->c_map_queued);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again. Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects. This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+static void rds_queue_reconnect(struct rds_connection *conn)
+{
+ unsigned long rand;
+
+ rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
+ conn, &conn->c_laddr, &conn->c_faddr,
+ conn->c_reconnect_jiffies);
+
+ set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+ if (conn->c_reconnect_jiffies == 0) {
+ conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+ return;
+ }
+
+ get_random_bytes(&rand, sizeof(rand));
+ rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+ rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+ conn, &conn->c_laddr, &conn->c_faddr);
+ queue_delayed_work(rds_wq, &conn->c_conn_w,
+ rand % conn->c_reconnect_jiffies);
+
+ conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+ rds_sysctl_reconnect_max_jiffies);
+}
+
+void rds_connect_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+ int ret;
+
+ clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+ if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+ ret = conn->c_trans->conn_connect(conn);
+ rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
+ conn, &conn->c_laddr, &conn->c_faddr, ret);
+
+ if (ret) {
+ if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
+ rds_queue_reconnect(conn);
+ else
+ rds_conn_error(conn, "RDS: connect failed\n");
+ }
+ }
+}
+
+void rds_shutdown_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+ /* shut it down unless it's down already */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, else we may be
+ * deadlocking with the CM handler. Instead, the CM event
+ * handler is supposed to check for state DISCONNECTING
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+ && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+ rds_conn_error(conn, "shutdown called in state %d\n",
+ atomic_read(&conn->c_state));
+ mutex_unlock(&conn->c_cm_lock);
+ return;
+ }
+ mutex_unlock(&conn->c_cm_lock);
+
+ mutex_lock(&conn->c_send_lock);
+ conn->c_trans->conn_shutdown(conn);
+ rds_conn_reset(conn);
+ mutex_unlock(&conn->c_send_lock);
+
+ if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ /* This can happen - e.g. when we're in the middle of tearing
+ * down the connection, and someone unloads the rds module.
+ * Quite reproducible with loopback connections.
+ * Mostly harmless.
+ */
+ rds_conn_error(conn,
+ "%s: failed to transition to state DOWN, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work(&conn->c_conn_w);
+ if (!hlist_unhashed(&conn->c_hash_node))
+ rds_queue_reconnect(conn);
+}
+
+void rds_send_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+ int ret;
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ ret = rds_send_xmit(conn);
+ rdsdebug("conn %p ret %d\n", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rds_stats_inc(s_send_immediate_retry);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ break;
+ case -ENOMEM:
+ rds_stats_inc(s_send_delayed_retry);
+ queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+ default:
+ break;
+ }
+ }
+}
+
+void rds_recv_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+ int ret;
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ ret = conn->c_trans->recv(conn);
+ rdsdebug("conn %p ret %d\n", conn, ret);
+ switch (ret) {
+ case -EAGAIN:
+ rds_stats_inc(s_recv_immediate_retry);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+ break;
+ case -ENOMEM:
+ rds_stats_inc(s_recv_delayed_retry);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+ default:
+ break;
+ }
+ }
+}
+
+void rds_threads_exit(void)
+{
+ destroy_workqueue(rds_wq);
+}
+
+int __init rds_threads_init(void)
+{
+ rds_wq = create_singlethread_workqueue("krdsd");
+ if (rds_wq == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
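
For reference, rds_queue_reconnect() above delays each retry by a random slice of an exponentially growing window, capped by the reconnect_max_delay_ms sysctl. A minimal sketch of that backoff computation, with hypothetical names and plain integers instead of jiffies (not part of the patch):

/* Sketch only: the first attempt fires immediately; later attempts wait a
 * random fraction of the current window, and the window doubles up to a
 * maximum, mirroring rds_queue_reconnect(). */
static unsigned long next_reconnect_delay(unsigned long *window,
					  unsigned long min_delay,
					  unsigned long max_delay,
					  unsigned long rand)
{
	unsigned long delay;

	if (*window == 0) {
		*window = min_delay;
		return 0;			/* reconnect right away */
	}

	delay = rand % *window;
	*window = *window * 2 < max_delay ? *window * 2 : max_delay;
	return delay;
}
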
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 000000000000..767da61ad2f3
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2006 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static LIST_HEAD(rds_transports);
+static DECLARE_RWSEM(rds_trans_sem);
+
+int rds_trans_register(struct rds_transport *trans)
+{
+ BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
+
+ down_write(&rds_trans_sem);
+
+ list_add_tail(&trans->t_item, &rds_transports);
+ printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+
+ up_write(&rds_trans_sem);
+
+ return 0;
+}
+
+void rds_trans_unregister(struct rds_transport *trans)
+{
+ down_write(&rds_trans_sem);
+
+ list_del_init(&trans->t_item);
+ printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
+
+ up_write(&rds_trans_sem);
+}
+
+struct rds_transport *rds_trans_get_preferred(__be32 addr)
+{
+ struct rds_transport *trans;
+ struct rds_transport *ret = NULL;
+
+ if (IN_LOOPBACK(ntohl(addr)))
+ return &rds_loop_transport;
+
+ down_read(&rds_trans_sem);
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->laddr_check(addr) == 0) {
+ ret = trans;
+ break;
+ }
+ }
+ up_read(&rds_trans_sem);
+
+ return ret;
+}
+
+/*
+ * This returns the number of stats entries in the snapshot and only
+ * copies them using the iter if there is enough space for them. The
+ * caller passes in the global stats so that we can size and copy while
+ * holding the lock.
+ */
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+ unsigned int avail)
+{
+ struct rds_transport *trans;
+ unsigned int total = 0;
+ unsigned int part;
+
+ rds_info_iter_unmap(iter);
+ down_read(&rds_trans_sem);
+
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->stats_info_copy == NULL)
+ continue;
+
+ part = trans->stats_info_copy(iter, avail);
+ avail -= min(avail, part);
+ total += part;
+ }
+
+ up_read(&rds_trans_sem);
+
+ return total;
+}
+
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5c72a116b1a4..f8f047b61245 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -183,13 +183,6 @@ override:
if (R_tab == NULL)
goto failure;
- if (!est && (ret == ACT_P_CREATED ||
- !gen_estimator_active(&police->tcf_bstats,
- &police->tcf_rate_est))) {
- err = -EINVAL;
- goto failure;
- }
-
if (parm->peakrate.rate) {
P_tab = qdisc_get_rtab(&parm->peakrate,
tb[TCA_POLICE_PEAKRATE]);
@@ -205,6 +198,12 @@ override:
&police->tcf_lock, est);
if (err)
goto failure_unlock;
+ } else if (tb[TCA_POLICE_AVRATE] &&
+ (ret == ACT_P_CREATED ||
+ !gen_estimator_active(&police->tcf_bstats,
+ &police->tcf_rate_est))) {
+ err = -EINVAL;
+ goto failure_unlock;
}
/* No failure allowed after this point */
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 9e43ed949167..d728d8111732 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1960,8 +1960,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
cbq_rmprio(q, cl);
sch_tree_unlock(sch);
- if (--cl->refcnt == 0)
- cbq_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
return 0;
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index f6b4fa97df70..7597fe146866 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -66,11 +66,15 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)*arg;
+ struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_DRR_MAX + 1];
u32 quantum;
int err;
- err = nla_parse_nested(tb, TCA_DRR_MAX, tca[TCA_OPTIONS], drr_policy);
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
if (err < 0)
return err;
@@ -151,8 +155,11 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
drr_purge_queue(cl);
qdisc_class_hash_remove(&q->clhash, &cl->common);
- if (--cl->refcnt == 0)
- drr_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 74226b265528..5022f9c1f34b 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1139,8 +1139,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
hfsc_purge_queue(sch, cl);
qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
- if (--cl->refcnt == 0)
- hfsc_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 355974f610c5..88cd02626621 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1275,8 +1275,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
if (last_child)
htb_parent_to_leaf(q, cl, new_q);
- if (--cl->refcnt == 0)
- htb_destroy_class(sch, cl);
+ BUG_ON(--cl->refcnt == 0);
+ /*
+ * This shouldn't happen: we "hold" one cops->get() when called
+ * from tc_ctl_tclass; the destroy method is done from cops->put().
+ */
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a2f93c09f3cc..e22dfe85e43e 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -236,7 +236,6 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
struct tc_tbf_qopt *qopt;
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
- struct qdisc_rate_table *tmp;
struct Qdisc *child = NULL;
int max_size,n;
@@ -295,13 +294,9 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
q->tokens = q->buffer;
q->ptokens = q->mtu;
- tmp = q->R_tab;
- q->R_tab = rtab;
- rtab = tmp;
+ swap(q->R_tab, rtab);
+ swap(q->P_tab, ptab);
- tmp = q->P_tab;
- q->P_tab = ptab;
- ptab = tmp;
sch_tree_unlock(sch);
err = 0;
done:
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 67715f4eb849..7ff548a30cfb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -86,6 +86,9 @@ const char *sctp_cname(const sctp_subtype_t cid)
case SCTP_CID_FWD_TSN:
return "FWD_TSN";
+ case SCTP_CID_AUTH:
+ return "AUTH";
+
default:
break;
}
@@ -135,6 +138,7 @@ static const char *sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = {
"PRIMITIVE_ABORT",
"PRIMITIVE_SEND",
"PRIMITIVE_REQUESTHEARTBEAT",
+ "PRIMITIVE_ASCONF",
};
/* Lookup primitive debug name. */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 4c8d9f45ce09..905fda582b92 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -111,7 +111,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
if (sctp_addip_enable) {
auth_chunks->chunks[0] = SCTP_CID_ASCONF;
auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK;
- auth_chunks->param_hdr.length += htons(2);
+ auth_chunks->param_hdr.length =
+ htons(sizeof(sctp_paramhdr_t) + 2);
}
}
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 07d58903a746..7d08f522ec84 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -49,13 +49,10 @@
#include <linux/ipv6.h>
#include <linux/init.h>
#include <net/inet_ecn.h>
+#include <net/ip.h>
#include <net/icmp.h>
#include <net/net_namespace.h>
-#ifndef TEST_FRAME
-#include <net/tcp.h>
-#endif /* TEST_FRAME (not defined) */
-
#include <linux/socket.h> /* for sa_family_t */
#include <net/sock.h>
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bc411c896216..d765fc53e74d 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -428,7 +428,8 @@ void sctp_retransmit_mark(struct sctp_outq *q,
* retransmitting due to T3 timeout.
*/
if (reason == SCTP_RTXR_T3_RTX &&
- (jiffies - chunk->sent_at) < transport->last_rto)
+ time_before(jiffies, chunk->sent_at +
+ transport->last_rto))
continue;
/* RFC 2960 6.2.1 Processing a Received SACK
@@ -1757,6 +1758,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
struct sctp_chunk *chunk;
struct list_head *lchunk, *temp;
+ if (!asoc->peer.prsctp_capable)
+ return;
+
/* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the
* received SACK.
*
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index c1e316ee7155..cb198af8887c 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -692,15 +692,20 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
static int sctp_ctl_sock_init(void)
{
int err;
- sa_family_t family;
+ sa_family_t family = PF_INET;
if (sctp_get_pf_specific(PF_INET6))
family = PF_INET6;
- else
- family = PF_INET;
err = inet_ctl_sock_create(&sctp_ctl_sock, family,
SOCK_SEQPACKET, IPPROTO_SCTP, &init_net);
+
+ /* If IPv6 socket could not be created, try the IPv4 socket */
+ if (err < 0 && family == PF_INET6)
+ err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET,
+ SOCK_SEQPACKET, IPPROTO_SCTP,
+ &init_net);
+
if (err < 0) {
printk(KERN_ERR
"SCTP: Failed to create the SCTP control socket.\n");
@@ -1297,9 +1302,8 @@ SCTP_STATIC __init int sctp_init(void)
out:
return status;
err_v6_add_protocol:
- sctp_v6_del_protocol();
-err_add_protocol:
sctp_v4_del_protocol();
+err_add_protocol:
inet_ctl_sock_destroy(sctp_ctl_sock);
err_ctl_sock_init:
sctp_v6_protosw_exit();
@@ -1310,7 +1314,6 @@ err_protosw_init:
sctp_v4_pf_exit();
sctp_v6_pf_exit();
sctp_sysctl_unregister();
- list_del(&sctp_af_inet.list);
free_pages((unsigned long)sctp_port_hashtable,
get_order(sctp_port_hashsize *
sizeof(struct sctp_bind_hashbucket)));
@@ -1358,7 +1361,6 @@ SCTP_STATIC __exit void sctp_exit(void)
sctp_v4_pf_exit();
sctp_sysctl_unregister();
- list_del(&sctp_af_inet.list);
free_pages((unsigned long)sctp_assoc_hashtable,
get_order(sctp_assoc_hashsize *
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index b40e95f9851b..6851ee94e974 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -224,7 +224,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
num_ext += 2;
}
- chunksize += sizeof(aiparam);
+ if (sp->adaptation_ind)
+ chunksize += sizeof(aiparam);
+
chunksize += vparam_len;
/* Account for AUTH related parameters */
@@ -304,10 +306,12 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
if (sctp_prsctp_enable)
sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
- aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
- aiparam.param_hdr.length = htons(sizeof(aiparam));
- aiparam.adaptation_ind = htonl(sp->adaptation_ind);
- sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ if (sp->adaptation_ind) {
+ aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+ aiparam.param_hdr.length = htons(sizeof(aiparam));
+ aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+ sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ }
/* Add SCTP-AUTH chunks to the parameter list */
if (sctp_auth_enable) {
@@ -332,6 +336,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
sctp_inithdr_t initack;
struct sctp_chunk *retval;
union sctp_params addrs;
+ struct sctp_sock *sp;
int addrs_len;
sctp_cookie_param_t *cookie;
int cookie_len;
@@ -366,22 +371,24 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
/* Calculate the total size of allocation, include the reserved
* space for reporting unknown parameters if it is specified.
*/
+ sp = sctp_sk(asoc->base.sk);
chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;
/* Tell peer that we'll do ECN only if peer advertised such cap. */
if (asoc->peer.ecn_capable)
chunksize += sizeof(ecap_param);
- if (sctp_prsctp_enable)
+ if (asoc->peer.prsctp_capable)
chunksize += sizeof(prsctp_param);
- if (sctp_addip_enable) {
+ if (asoc->peer.asconf_capable) {
extensions[num_ext] = SCTP_CID_ASCONF;
extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
num_ext += 2;
}
- chunksize += sizeof(aiparam);
+ if (sp->adaptation_ind)
+ chunksize += sizeof(aiparam);
if (asoc->peer.auth_capable) {
auth_random = (sctp_paramhdr_t *)asoc->c.auth_random;
@@ -432,10 +439,12 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
if (asoc->peer.prsctp_capable)
sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
- aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
- aiparam.param_hdr.length = htons(sizeof(aiparam));
- aiparam.adaptation_ind = htonl(sctp_sk(asoc->base.sk)->adaptation_ind);
- sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ if (sp->adaptation_ind) {
+ aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+ aiparam.param_hdr.length = htons(sizeof(aiparam));
+ aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+ sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+ }
if (asoc->peer.auth_capable) {
sctp_addto_chunk(retval, ntohs(auth_random->length),
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 0146cfb1f182..e2020eb2c8ca 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -434,7 +434,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
*
*/
static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
- struct sctp_transport *transport)
+ struct sctp_transport *transport,
+ int is_hb)
{
/* The check for association's overall error counter exceeding the
* threshold is done in the state function.
@@ -466,7 +467,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
* The first unacknowleged HB triggers it. We do this with a flag
* that indicates that we have an outstanding HB.
*/
- if (transport->hb_sent) {
+ if (!is_hb || transport->hb_sent) {
transport->last_rto = transport->rto;
transport->rto = min((transport->rto * 2), transport->asoc->rto_max);
}
@@ -657,20 +658,6 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
sctp_transport_hold(t);
}
-/* Helper function to do a transport reset at the expiry of the hearbeat
- * timer.
- */
-static void sctp_cmd_transport_reset(sctp_cmd_seq_t *cmds,
- struct sctp_association *asoc,
- struct sctp_transport *t)
-{
- sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
-
- /* Mark one strike against a transport. */
- sctp_do_8_2_transport_strike(asoc, t);
-
- t->hb_sent = 1;
-}
/* Helper function to process the process SACK command. */
static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds,
@@ -800,36 +787,48 @@ static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds,
struct sctp_association *asoc,
struct sctp_chunk *chunk)
{
- struct sctp_operr_chunk *operr_chunk;
struct sctp_errhdr *err_hdr;
+ struct sctp_ulpevent *ev;
- operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr;
- err_hdr = &operr_chunk->err_hdr;
+ while (chunk->chunk_end > chunk->skb->data) {
+ err_hdr = (struct sctp_errhdr *)(chunk->skb->data);
- switch (err_hdr->cause) {
- case SCTP_ERROR_UNKNOWN_CHUNK:
- {
- struct sctp_chunkhdr *unk_chunk_hdr;
+ ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
+ GFP_ATOMIC);
+ if (!ev)
+ return;
- unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable;
- switch (unk_chunk_hdr->type) {
- /* ADDIP 4.1 A9) If the peer responds to an ASCONF with an
- * ERROR chunk reporting that it did not recognized the ASCONF
- * chunk type, the sender of the ASCONF MUST NOT send any
- * further ASCONF chunks and MUST stop its T-4 timer.
- */
- case SCTP_CID_ASCONF:
- asoc->peer.asconf_capable = 0;
- sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
+ sctp_ulpq_tail_event(&asoc->ulpq, ev);
+
+ switch (err_hdr->cause) {
+ case SCTP_ERROR_UNKNOWN_CHUNK:
+ {
+ sctp_chunkhdr_t *unk_chunk_hdr;
+
+ unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable;
+ switch (unk_chunk_hdr->type) {
+ /* ADDIP 4.1 A9) If the peer responds to an ASCONF with
+ * an ERROR chunk reporting that it did not recognize
+ * the ASCONF chunk type, the sender of the ASCONF MUST
+ * NOT send any further ASCONF chunks and MUST stop its
+ * T-4 timer.
+ */
+ case SCTP_CID_ASCONF:
+ if (asoc->peer.asconf_capable == 0)
+ break;
+
+ asoc->peer.asconf_capable = 0;
+ sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+ break;
+ default:
+ break;
+ }
break;
+ }
default:
break;
}
- break;
- }
- default:
- break;
}
}
@@ -1459,12 +1458,19 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_STRIKE:
/* Mark one strike against a transport. */
- sctp_do_8_2_transport_strike(asoc, cmd->obj.transport);
+ sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
+ 0);
+ break;
+
+ case SCTP_CMD_TRANSPORT_IDLE:
+ t = cmd->obj.transport;
+ sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
break;
- case SCTP_CMD_TRANSPORT_RESET:
+ case SCTP_CMD_TRANSPORT_HB_SENT:
t = cmd->obj.transport;
- sctp_cmd_transport_reset(commands, asoc, t);
+ sctp_do_8_2_transport_strike(asoc, t, 1);
+ t->hb_sent = 1;
break;
case SCTP_CMD_TRANSPORT_ON:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3a0cd075914f..55a61aa69662 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -988,7 +988,9 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
/* Set transport error counter and association error counter
* when sending heartbeat.
*/
- sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET,
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE,
+ SCTP_TRANSPORT(transport));
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
SCTP_TRANSPORT(transport));
}
sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE,
@@ -3163,7 +3165,6 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
sctp_cmd_seq_t *commands)
{
struct sctp_chunk *chunk = arg;
- struct sctp_ulpevent *ev;
if (!sctp_vtag_verify(chunk, asoc))
return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
@@ -3173,21 +3174,10 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
return sctp_sf_violation_chunklen(ep, asoc, type, arg,
commands);
- while (chunk->chunk_end > chunk->skb->data) {
- ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
- GFP_ATOMIC);
- if (!ev)
- goto nomem;
+ sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
+ SCTP_CHUNK(chunk));
- sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
- SCTP_ULPEVENT(ev));
- sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
- SCTP_CHUNK(chunk));
- }
return SCTP_DISPOSITION_CONSUME;
-
-nomem:
- return SCTP_DISPOSITION_NOMEM;
}
/*
@@ -4967,7 +4957,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat(
* to that address and not acknowledged within one RTO.
*
*/
- sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET,
+ sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
SCTP_TRANSPORT(arg));
return SCTP_DISPOSITION_CONSUME;
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index dea864f5de54..5fb3a8c9792e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3069,9 +3069,6 @@ static int sctp_setsockopt_maxburst(struct sock *sk,
int val;
int assoc_id = 0;
- if (optlen < sizeof(int))
- return -EINVAL;
-
if (optlen == sizeof(int)) {
printk(KERN_WARNING
"SCTP: Use of int in max_burst socket option deprecated\n");
@@ -5283,16 +5280,14 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len,
struct sctp_sock *sp;
struct sctp_association *asoc;
- if (len < sizeof(int))
- return -EINVAL;
-
if (len == sizeof(int)) {
printk(KERN_WARNING
"SCTP: Use of int in max_burst socket option deprecated\n");
printk(KERN_WARNING
"SCTP: Use struct sctp_assoc_value instead\n");
params.assoc_id = 0;
- } else if (len == sizeof (struct sctp_assoc_value)) {
+ } else if (len >= sizeof(struct sctp_assoc_value)) {
+ len = sizeof(struct sctp_assoc_value);
if (copy_from_user(&params, optval, len))
return -EFAULT;
} else
@@ -5848,37 +5843,28 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
}
/*
- * 3.1.3 listen() - UDP Style Syntax
- *
- * By default, new associations are not accepted for UDP style sockets.
- * An application uses listen() to mark a socket as being able to
- * accept new associations.
+ * Move a socket to LISTENING state.
*/
-SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
+SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
{
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
+ struct crypto_hash *tfm = NULL;
- /* Only UDP style sockets that are not peeled off are allowed to
- * listen().
- */
- if (!sctp_style(sk, UDP))
- return -EINVAL;
-
- /* If backlog is zero, disable listening. */
- if (!backlog) {
- if (sctp_sstate(sk, CLOSED))
- return 0;
-
- sctp_unhash_endpoint(ep);
- sk->sk_state = SCTP_SS_CLOSED;
- return 0;
+ /* Allocate HMAC for generating cookie. */
+ if (!sctp_sk(sk)->hmac && sctp_hmac_alg) {
+ tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ if (net_ratelimit()) {
+ printk(KERN_INFO
+ "SCTP: failed to load transform for %s: %ld\n",
+ sctp_hmac_alg, PTR_ERR(tfm));
+ }
+ return -ENOSYS;
+ }
+ sctp_sk(sk)->hmac = tfm;
}
- /* Return if we are already listening. */
- if (sctp_sstate(sk, LISTENING))
- return 0;
-
/*
* If a bind() or sctp_bindx() is not called prior to a listen()
* call that allows new associations to be accepted, the system
@@ -5889,7 +5875,6 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
* extensions draft, but follows the practice as seen in TCP
* sockets.
*
- * Additionally, turn off fastreuse flag since we are not listening
*/
sk->sk_state = SCTP_SS_LISTENING;
if (!ep->base.bind_addr.port) {
@@ -5900,113 +5885,71 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
sk->sk_state = SCTP_SS_CLOSED;
return -EADDRINUSE;
}
- sctp_sk(sk)->bind_hash->fastreuse = 0;
}
- sctp_hash_endpoint(ep);
- return 0;
-}
-
-/*
- * 4.1.3 listen() - TCP Style Syntax
- *
- * Applications uses listen() to ready the SCTP endpoint for accepting
- * inbound associations.
- */
-SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog)
-{
- struct sctp_sock *sp = sctp_sk(sk);
- struct sctp_endpoint *ep = sp->ep;
-
- /* If backlog is zero, disable listening. */
- if (!backlog) {
- if (sctp_sstate(sk, CLOSED))
- return 0;
-
- sctp_unhash_endpoint(ep);
- sk->sk_state = SCTP_SS_CLOSED;
- return 0;
- }
-
- if (sctp_sstate(sk, LISTENING))
- return 0;
-
- /*
- * If a bind() or sctp_bindx() is not called prior to a listen()
- * call that allows new associations to be accepted, the system
- * picks an ephemeral port and will choose an address set equivalent
- * to binding with a wildcard address.
- *
- * This is not currently spelled out in the SCTP sockets
- * extensions draft, but follows the practice as seen in TCP
- * sockets.
- */
- sk->sk_state = SCTP_SS_LISTENING;
- if (!ep->base.bind_addr.port) {
- if (sctp_autobind(sk))
- return -EAGAIN;
- } else
- sctp_sk(sk)->bind_hash->fastreuse = 0;
-
sk->sk_max_ack_backlog = backlog;
sctp_hash_endpoint(ep);
return 0;
}
/*
+ * 4.1.3 / 5.1.3 listen()
+ *
+ * By default, new associations are not accepted for UDP style sockets.
+ * An application uses listen() to mark a socket as being able to
+ * accept new associations.
+ *
+ * On TCP style sockets, applications use listen() to ready the SCTP
+ * endpoint for accepting inbound associations.
+ *
+ * On both types of endpoints a backlog of '0' disables listening.
+ *
* Move a socket to LISTENING state.
*/
int sctp_inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
- struct crypto_hash *tfm = NULL;
+ struct sctp_endpoint *ep = sctp_sk(sk)->ep;
int err = -EINVAL;
if (unlikely(backlog < 0))
- goto out;
+ return err;
sctp_lock_sock(sk);
+ /* Peeled-off sockets are not allowed to listen(). */
+ if (sctp_style(sk, UDP_HIGH_BANDWIDTH))
+ goto out;
+
if (sock->state != SS_UNCONNECTED)
goto out;
- /* Allocate HMAC for generating cookie. */
- if (!sctp_sk(sk)->hmac && sctp_hmac_alg) {
- tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm)) {
- if (net_ratelimit()) {
- printk(KERN_INFO
- "SCTP: failed to load transform for %s: %ld\n",
- sctp_hmac_alg, PTR_ERR(tfm));
- }
- err = -ENOSYS;
+ /* If backlog is zero, disable listening. */
+ if (!backlog) {
+ if (sctp_sstate(sk, CLOSED))
goto out;
- }
- }
- switch (sock->type) {
- case SOCK_SEQPACKET:
- err = sctp_seqpacket_listen(sk, backlog);
- break;
- case SOCK_STREAM:
- err = sctp_stream_listen(sk, backlog);
- break;
- default:
- break;
+ err = 0;
+ sctp_unhash_endpoint(ep);
+ sk->sk_state = SCTP_SS_CLOSED;
+ if (sk->sk_reuse)
+ sctp_sk(sk)->bind_hash->fastreuse = 1;
+ goto out;
}
- if (err)
- goto cleanup;
+ /* If we are already listening, just update the backlog */
+ if (sctp_sstate(sk, LISTENING))
+ sk->sk_max_ack_backlog = backlog;
+ else {
+ err = sctp_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
- /* Store away the transform reference. */
- if (!sctp_sk(sk)->hmac)
- sctp_sk(sk)->hmac = tfm;
+ err = 0;
out:
sctp_release_sock(sk);
return err;
-cleanup:
- crypto_free_hash(tfm);
- goto out;
}
/*
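The listen() rework above collapses the separate SOCK_SEQPACKET and SOCK_STREAM paths into sctp_listen_start() plus a single sctp_inet_listen(), which also lets a backlog of 0 disable listening and lets an already-listening socket merely update its backlog. A minimal userspace sketch of the resulting behaviour, assuming an SCTP-capable kernel (port number is illustrative, error handling omitted):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
		struct sockaddr_in a = {
			.sin_family = AF_INET,
			.sin_port = htons(5000),	/* illustrative port */
			.sin_addr = { .s_addr = htonl(INADDR_ANY) },
		};

		bind(fd, (struct sockaddr *)&a, sizeof(a));
		listen(fd, 8);	/* start accepting associations, backlog 8 */
		listen(fd, 16);	/* already LISTENING: only sk_max_ack_backlog changes */
		listen(fd, 0);	/* backlog 0: endpoint unhashed, listening disabled */
		close(fd);
		return 0;
	}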
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 5c29b14ee9af..e5dde45c79d3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -543,8 +543,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
* congestion indications more than once every window of
* data (or more loosely more than once every round-trip time).
*/
- if ((jiffies - transport->last_time_ecne_reduced) >
- transport->rtt) {
+ if (time_after(jiffies, transport->last_time_ecne_reduced +
+ transport->rtt)) {
transport->ssthresh = max(transport->cwnd/2,
4*transport->asoc->pathmtu);
transport->cwnd = transport->ssthresh;
@@ -561,7 +561,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 * to be done every RTO interval, we do it every heartbeat
* interval.
*/
- if ((jiffies - transport->last_time_used) > transport->rto)
+ if (time_after(jiffies, transport->last_time_used +
+ transport->rto))
transport->cwnd = max(transport->cwnd/2,
4*transport->asoc->pathmtu);
break;
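The two transport.c hunks above replace open-coded jiffies arithmetic with the standard time_after() helper, which hides the wrap-safe signed comparison behind a readable name. A minimal userspace sketch of the comparison it performs (time_after_sketch is a local stand-in, not the kernel macro):

	#include <limits.h>
	#include <stdio.h>

	/* Simplified stand-in for the kernel's time_after(); assumes
	 * unsigned long timestamps, as jiffies is. */
	#define time_after_sketch(a, b)	((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long last = ULONG_MAX - 9;	/* stamped just before wraparound */
		unsigned long rto  = 25;
		unsigned long now  = 20;		/* the counter has since wrapped */

		/* 30 ticks have elapsed, which exceeds rto, so this prints 1 */
		printf("%d\n", time_after_sketch(now, last + rto));
		return 0;
	}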
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 3ddaff42d1bb..a3bfd4064912 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -119,7 +119,7 @@ static struct bclink *bclink = NULL;
static struct link *bcl = NULL;
static DEFINE_SPINLOCK(bc_lock);
-char tipc_bclink_name[] = "multicast-link";
+const char tipc_bclink_name[] = "multicast-link";
static u32 buf_seqno(struct sk_buff *buf)
@@ -800,7 +800,7 @@ int tipc_bclink_init(void)
tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
bcl->b_ptr = &bcbearer->bearer;
bcl->state = WORKING_WORKING;
- sprintf(bcl->name, tipc_bclink_name);
+ strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
if (BCLINK_LOG_BUF_SIZE) {
char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC);
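Replacing sprintf(bcl->name, tipc_bclink_name) with strlcpy() both stops the source string from being interpreted as a format string and bounds the copy to TIPC_MAX_LINK_NAME. A small sketch of the strlcpy() contract using a local stand-in (the kernel provides the real implementation):

	#include <stdio.h>
	#include <string.h>

	/* Local stand-in for the kernel's strlcpy(): copies at most
	 * size-1 bytes, always NUL-terminates, and returns strlen(src)
	 * so callers can detect truncation. */
	static size_t strlcpy_sketch(char *dst, const char *src, size_t size)
	{
		size_t len = strlen(src);

		if (size) {
			size_t n = (len < size - 1) ? len : size - 1;

			memcpy(dst, src, n);
			dst[n] = '\0';
		}
		return len;
	}

	int main(void)
	{
		char name[8];

		if (strlcpy_sketch(name, "multicast-link", sizeof(name)) >= sizeof(name))
			printf("truncated to \"%s\"\n", name);	/* "multica" */
		return 0;
	}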
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 2f2d731bc1c2..4c1771e95c99 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -70,7 +70,7 @@ struct port_list {
struct tipc_node;
-extern char tipc_bclink_name[];
+extern const char tipc_bclink_name[];
/**
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 29ecae851668..1885a7edb0c8 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -258,7 +258,7 @@ void tipc_printf(struct print_buf *pb, const char *fmt, ...)
}
if (pb->echo)
- printk(print_string);
+ printk("%s", print_string);
spin_unlock_bh(&print_lock);
}
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 20d98c56e152..2c24e7d6d950 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -703,7 +703,7 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
link_info.dest = htonl(tipc_own_addr & 0xfffff00);
link_info.up = htonl(1);
- sprintf(link_info.str, tipc_bclink_name);
+ strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME);
tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info));
/* Add TLVs for any other links in scope */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d1b89820ab4f..baac91049b0e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1178,8 +1178,7 @@ out_unlock:
unix_state_unlock(other);
out:
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
if (newsk)
unix_release_sock(newsk, 0);
if (other)
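The af_unix change works because kfree_skb() is defined to accept NULL and do nothing, so the surrounding check is redundant. A sketch of that convention with a stand-in type (not the real sk_buff):

	#include <stdlib.h>

	struct skb_sketch {
		unsigned char *data;
	};

	/* Mirrors the convention kfree_skb() follows: freeing NULL is a
	 * no-op, which lets call sites drop their "if (skb)" guards. */
	static void kfree_skb_sketch(struct skb_sketch *skb)
	{
		if (!skb)
			return;
		free(skb->data);
		free(skb);
	}

	int main(void)
	{
		kfree_skb_sketch(NULL);	/* safe: simply returns */
		return 0;
	}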
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 39701dec1dba..466e2d22d256 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -86,8 +86,10 @@ static int wanrouter_device_del_if(struct wan_device *wandev,
static struct wan_device *wanrouter_find_device(char *name);
static int wanrouter_delete_interface(struct wan_device *wandev, char *name);
-static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags);
-static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags);
+static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __acquires(lock);
+static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __releases(lock);
@@ -763,12 +765,14 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name)
}
static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __acquires(lock)
{
spin_lock_irqsave(lock, *smp_flags);
}
static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+ __releases(lock)
{
spin_unlock_irqrestore(lock, *smp_flags);
}
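The __acquires()/__releases() markers added here are sparse annotations rather than runtime code: they tell the `sparse` checker that a function takes or drops the named lock, so imbalanced lock/unlock paths can be flagged at check time. A sketch of how such annotations are conventionally defined (mirroring the spirit of include/linux/compiler.h; the exact attribute arguments are an assumption here):

	/* Expand to context-tracking attributes only for sparse
	 * (__CHECKER__); compile to nothing for a normal build. */
	#ifdef __CHECKER__
	# define __acquires(x)	__attribute__((context(x, 0, 1)))
	# define __releases(x)	__attribute__((context(x, 1, 0)))
	#else
	# define __acquires(x)
	# define __releases(x)
	#endif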
diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c
index 267f7ff49827..c44d96b3a437 100644
--- a/net/wanrouter/wanproc.c
+++ b/net/wanrouter/wanproc.c
@@ -80,6 +80,7 @@ static struct proc_dir_entry *proc_router;
* Iterator
*/
static void *r_start(struct seq_file *m, loff_t *pos)
+ __acquires(kernel_lock)
{
struct wan_device *wandev;
loff_t l = *pos;
@@ -101,6 +102,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
}
static void r_stop(struct seq_file *m, void *v)
+ __releases(kernel_lock)
{
unlock_kernel();
}
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index e28e2b8fa436..092ae6faccca 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -102,3 +102,13 @@ config LIB80211_CRYPT_CCMP
config LIB80211_CRYPT_TKIP
tristate
+
+config LIB80211_DEBUG
+ bool "lib80211 debugging messages"
+ depends on LIB80211
+ default n
+ ---help---
+ You can enable this if you want verbose debugging messages
+ from lib80211.
+
+ If unsure, say N.
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 0668b2bfc1da..17fe39049740 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -7,7 +7,6 @@
#include <linux/if.h>
#include <linux/module.h>
#include <linux/err.h>
-#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/nl80211.h>
#include <linux/debugfs.h>
@@ -31,18 +30,29 @@ MODULE_DESCRIPTION("wireless configuration support");
* only read the list, and that can happen quite
* often because we need to do it for each command */
LIST_HEAD(cfg80211_drv_list);
-DEFINE_MUTEX(cfg80211_drv_mutex);
+
+/*
+ * This is used to protect the cfg80211_drv_list, cfg80211_regdomain,
+ * country_ie_regdomain, the reg_beacon_list and the last regulatory
+ * request receipt (last_request).
+ */
+DEFINE_MUTEX(cfg80211_mutex);
/* for debugfs */
static struct dentry *ieee80211_debugfs_dir;
-/* requires cfg80211_drv_mutex to be held! */
-static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy)
+/* requires cfg80211_mutex to be held! */
+struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx)
{
struct cfg80211_registered_device *result = NULL, *drv;
+ if (!wiphy_idx_valid(wiphy_idx))
+ return NULL;
+
+ assert_cfg80211_lock();
+
list_for_each_entry(drv, &cfg80211_drv_list, list) {
- if (drv->idx == wiphy) {
+ if (drv->wiphy_idx == wiphy_idx) {
result = drv;
break;
}
@@ -51,17 +61,44 @@ static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy)
return result;
}
+int get_wiphy_idx(struct wiphy *wiphy)
+{
+ struct cfg80211_registered_device *drv;
+ if (!wiphy)
+ return WIPHY_IDX_STALE;
+ drv = wiphy_to_dev(wiphy);
+ return drv->wiphy_idx;
+}
+
/* requires cfg80211_drv_mutex to be held! */
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx)
+{
+ struct cfg80211_registered_device *drv;
+
+ if (!wiphy_idx_valid(wiphy_idx))
+ return NULL;
+
+ assert_cfg80211_lock();
+
+ drv = cfg80211_drv_by_wiphy_idx(wiphy_idx);
+ if (!drv)
+ return NULL;
+ return &drv->wiphy;
+}
+
+/* requires cfg80211_mutex to be held! */
static struct cfg80211_registered_device *
__cfg80211_drv_from_info(struct genl_info *info)
{
int ifindex;
- struct cfg80211_registered_device *bywiphy = NULL, *byifidx = NULL;
+ struct cfg80211_registered_device *bywiphyidx = NULL, *byifidx = NULL;
struct net_device *dev;
int err = -EINVAL;
+ assert_cfg80211_lock();
+
if (info->attrs[NL80211_ATTR_WIPHY]) {
- bywiphy = cfg80211_drv_by_wiphy(
+ bywiphyidx = cfg80211_drv_by_wiphy_idx(
nla_get_u32(info->attrs[NL80211_ATTR_WIPHY]));
err = -ENODEV;
}
@@ -78,14 +115,14 @@ __cfg80211_drv_from_info(struct genl_info *info)
err = -ENODEV;
}
- if (bywiphy && byifidx) {
- if (bywiphy != byifidx)
+ if (bywiphyidx && byifidx) {
+ if (bywiphyidx != byifidx)
return ERR_PTR(-EINVAL);
else
- return bywiphy; /* == byifidx */
+ return bywiphyidx; /* == byifidx */
}
- if (bywiphy)
- return bywiphy;
+ if (bywiphyidx)
+ return bywiphyidx;
if (byifidx)
return byifidx;
@@ -98,7 +135,7 @@ cfg80211_get_dev_from_info(struct genl_info *info)
{
struct cfg80211_registered_device *drv;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
drv = __cfg80211_drv_from_info(info);
/* if it is not an error we grab the lock on
@@ -107,7 +144,7 @@ cfg80211_get_dev_from_info(struct genl_info *info)
if (!IS_ERR(drv))
mutex_lock(&drv->mtx);
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return drv;
}
@@ -118,7 +155,7 @@ cfg80211_get_dev_from_ifindex(int ifindex)
struct cfg80211_registered_device *drv = ERR_PTR(-ENODEV);
struct net_device *dev;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
dev = dev_get_by_index(&init_net, ifindex);
if (!dev)
goto out;
@@ -129,7 +166,7 @@ cfg80211_get_dev_from_ifindex(int ifindex)
drv = ERR_PTR(-ENODEV);
dev_put(dev);
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return drv;
}
@@ -143,16 +180,16 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
char *newname)
{
struct cfg80211_registered_device *drv;
- int idx, taken = -1, result, digits;
+ int wiphy_idx, taken = -1, result, digits;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
/* prohibit calling the thing phy%d when %d is not its number */
- sscanf(newname, PHY_NAME "%d%n", &idx, &taken);
- if (taken == strlen(newname) && idx != rdev->idx) {
- /* count number of places needed to print idx */
+ sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken);
+ if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) {
+ /* count number of places needed to print wiphy_idx */
digits = 1;
- while (idx /= 10)
+ while (wiphy_idx /= 10)
digits++;
/*
* deny the name if it is phy<idx> where <idx> is printed
@@ -193,7 +230,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
result = 0;
out_unlock:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
if (result == 0)
nl80211_notify_dev_rename(rdev);
@@ -220,22 +257,22 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv)
drv->ops = ops;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
- drv->idx = wiphy_counter++;
+ drv->wiphy_idx = wiphy_counter++;
- if (unlikely(drv->idx < 0)) {
+ if (unlikely(!wiphy_idx_valid(drv->wiphy_idx))) {
wiphy_counter--;
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
/* ugh, wrapped! */
kfree(drv);
return NULL;
}
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
/* give it a proper name */
- dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->idx);
+ dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->wiphy_idx);
mutex_init(&drv->mtx);
mutex_init(&drv->devlist_mtx);
@@ -310,10 +347,10 @@ int wiphy_register(struct wiphy *wiphy)
/* check and set up bitrates */
ieee80211_set_bitrate_flags(wiphy);
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
/* set up regulatory info */
- wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE);
+ wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
res = device_add(&drv->wiphy.dev);
if (res)
@@ -328,9 +365,20 @@ int wiphy_register(struct wiphy *wiphy)
if (IS_ERR(drv->wiphy.debugfsdir))
drv->wiphy.debugfsdir = NULL;
+ if (wiphy->custom_regulatory) {
+ struct regulatory_request request;
+
+ request.wiphy_idx = get_wiphy_idx(wiphy);
+ request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
+ request.alpha2[0] = '9';
+ request.alpha2[1] = '9';
+
+ nl80211_send_reg_change_event(&request);
+ }
+
res = 0;
out_unlock:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return res;
}
EXPORT_SYMBOL(wiphy_register);
@@ -340,7 +388,7 @@ void wiphy_unregister(struct wiphy *wiphy)
struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy);
/* protect the device list */
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
BUG_ON(!list_empty(&drv->netdev_list));
@@ -366,7 +414,7 @@ void wiphy_unregister(struct wiphy *wiphy)
device_del(&drv->wiphy.dev);
debugfs_remove(drv->wiphy.debugfsdir);
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
}
EXPORT_SYMBOL(wiphy_unregister);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index e29ad4cd464f..6acd483a61f8 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -10,6 +10,7 @@
#include <linux/netdevice.h>
#include <linux/kref.h>
#include <linux/rbtree.h>
+#include <linux/mutex.h>
#include <net/genetlink.h>
#include <net/wireless.h>
#include <net/cfg80211.h>
@@ -37,7 +38,7 @@ struct cfg80211_registered_device {
enum environment_cap env;
/* wiphy index, internal only */
- int idx;
+ int wiphy_idx;
/* associate netdev list */
struct mutex devlist_mtx;
@@ -49,6 +50,7 @@ struct cfg80211_registered_device {
struct rb_root bss_tree;
u32 bss_generation;
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
+ unsigned long suspend_at;
/* must be last because of the way we do wiphy_priv(),
* and it should at least be aligned to NETDEV_ALIGN */
@@ -62,9 +64,27 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
return container_of(wiphy, struct cfg80211_registered_device, wiphy);
}
-extern struct mutex cfg80211_drv_mutex;
+/* Note 0 is valid, hence phy0 */
+static inline
+bool wiphy_idx_valid(int wiphy_idx)
+{
+ return (wiphy_idx >= 0);
+}
+
+extern struct mutex cfg80211_mutex;
extern struct list_head cfg80211_drv_list;
+static inline void assert_cfg80211_lock(void)
+{
+ WARN_ON(!mutex_is_locked(&cfg80211_mutex));
+}
+
+/*
+ * You can use this to mark a wiphy_idx as not having an associated wiphy.
+ * It guarantees cfg80211_drv_by_wiphy_idx(wiphy_idx) will return NULL
+ */
+#define WIPHY_IDX_STALE -1
+
struct cfg80211_internal_bss {
struct list_head list;
struct rb_node rbn;
@@ -74,6 +94,9 @@ struct cfg80211_internal_bss {
struct cfg80211_bss pub;
};
+struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx);
+int get_wiphy_idx(struct wiphy *wiphy);
+
/*
* This function returns a pointer to the driver
* that the genl_info item that is passed refers to.
@@ -81,13 +104,13 @@ struct cfg80211_internal_bss {
* the driver's mutex!
*
* This means that you need to call cfg80211_put_dev()
- * before being allowed to acquire &cfg80211_drv_mutex!
+ * before being allowed to acquire &cfg80211_mutex!
*
* This is necessary because we need to lock the global
* mutex to get an item off the list safely, and then
* we lock the drv mutex so it doesn't go away under us.
*
- * We don't want to keep cfg80211_drv_mutex locked
+ * We don't want to keep cfg80211_mutex locked
* for all the time in order to allow requests on
* other interfaces to go through at the same time.
*
@@ -97,6 +120,9 @@ struct cfg80211_internal_bss {
extern struct cfg80211_registered_device *
cfg80211_get_dev_from_info(struct genl_info *info);
+/* requires cfg80211_mutex to be held! */
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
+
/* identical to cfg80211_get_dev_from_info but only operate on ifindex */
extern struct cfg80211_registered_device *
cfg80211_get_dev_from_ifindex(int ifindex);
@@ -110,8 +136,11 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv,
char *newname);
void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
-void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby);
+void wiphy_update_regulatory(struct wiphy *wiphy,
+ enum nl80211_reg_initiator setby);
void cfg80211_bss_expire(struct cfg80211_registered_device *dev);
+void cfg80211_bss_age(struct cfg80211_registered_device *dev,
+ unsigned long age_secs);
#endif /* __NET_WIRELESS_CORE_H */
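The new wiphy_idx plumbing replaces raw struct wiphy pointers in regulatory requests with an integer index that is validated and resolved under cfg80211_mutex, so a request can safely outlive the wiphy it referred to. A minimal sketch of that lookup discipline, with *_sketch names standing in for the real helpers:

	#include <stddef.h>

	#define WIPHY_IDX_STALE_SKETCH	-1
	#define MAX_WIPHYS_SKETCH	4

	static void *wiphy_table_sketch[MAX_WIPHYS_SKETCH];

	/* 0 is a valid index (phy0), so only negative values are invalid. */
	static int wiphy_idx_valid_sketch(int idx)
	{
		return idx >= 0;
	}

	/* Caller is assumed to hold the registry lock, mirroring
	 * assert_cfg80211_lock(); a stale index always resolves to NULL. */
	static void *wiphy_idx_to_entry_sketch(int idx)
	{
		if (!wiphy_idx_valid_sketch(idx) || idx >= MAX_WIPHYS_SKETCH)
			return NULL;
		return wiphy_table_sketch[idx];
	}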
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index db428194c16a..2301dc1edc4c 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -337,6 +337,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
pos += 8;
if (ccmp_replay_check(pn, key->rx_pn)) {
+#ifdef CONFIG_LIB80211_DEBUG
if (net_ratelimit()) {
printk(KERN_DEBUG "CCMP: replay detected: STA=%pM "
"previous PN %02x%02x%02x%02x%02x%02x "
@@ -346,6 +347,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
key->rx_pn[3], key->rx_pn[4], key->rx_pn[5],
pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]);
}
+#endif
key->dot11RSNAStatsCCMPReplays++;
return -4;
}
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 7e8e22bfed90..c36287399d7e 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -465,12 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
pos += 8;
if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) {
+#ifdef CONFIG_LIB80211_DEBUG
if (net_ratelimit()) {
printk(KERN_DEBUG "TKIP: replay detected: STA=%pM"
" previous TSC %08x%04x received TSC "
"%08x%04x\n", hdr->addr2,
tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
}
+#endif
tkey->dot11RSNAStatsTKIPReplays++;
return -4;
}
@@ -505,10 +507,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
* it needs to be recalculated for the next packet. */
tkey->rx_phase1_done = 0;
}
+#ifdef CONFIG_LIB80211_DEBUG
if (net_ratelimit()) {
printk(KERN_DEBUG "TKIP: ICV error detected: STA="
"%pM\n", hdr->addr2);
}
+#endif
tkey->dot11RSNAStatsTKIPICVErrors++;
return -5;
}
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 298a4de59948..ab9d8f14e151 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7,7 +7,6 @@
#include <linux/if.h>
#include <linux/module.h>
#include <linux/err.h>
-#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/if_ether.h>
#include <linux/ieee80211.h>
@@ -142,7 +141,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
if (!hdr)
return -1;
- NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx);
+ NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx);
NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy));
NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS,
dev->wiphy.max_scan_ssids);
@@ -256,7 +255,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
int start = cb->args[0];
struct cfg80211_registered_device *dev;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
list_for_each_entry(dev, &cfg80211_drv_list, list) {
if (++idx <= start)
continue;
@@ -267,7 +266,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
break;
}
}
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
cb->args[0] = idx;
@@ -470,7 +469,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
struct cfg80211_registered_device *dev;
struct wireless_dev *wdev;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
list_for_each_entry(dev, &cfg80211_drv_list, list) {
if (wp_idx < wp_start) {
wp_idx++;
@@ -497,7 +496,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
wp_idx++;
}
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
cb->args[0] = wp_idx;
cb->args[1] = if_idx;
@@ -1206,6 +1205,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
nla_nest_end(msg, txrate);
}
+ if (sinfo->filled & STATION_INFO_RX_PACKETS)
+ NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS,
+ sinfo->rx_packets);
+ if (sinfo->filled & STATION_INFO_TX_PACKETS)
+ NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
+ sinfo->tx_packets);
nla_nest_end(msg, sinfoattr);
return genlmsg_end(msg, hdr);
@@ -1900,6 +1905,19 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
int r;
char *data = NULL;
+ /*
+	 * You should only get this when cfg80211 has not yet fully
+	 * initialized as a built-in, in the narrow window between
+	 * nl80211_init() and regulatory_init(), if that is even
+	 * possible.
+ */
+ mutex_lock(&cfg80211_mutex);
+ if (unlikely(!cfg80211_regdomain)) {
+ mutex_unlock(&cfg80211_mutex);
+ return -EINPROGRESS;
+ }
+ mutex_unlock(&cfg80211_mutex);
+
if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
return -EINVAL;
@@ -1910,14 +1928,9 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
if (is_world_regdom(data))
return -EINVAL;
#endif
- mutex_lock(&cfg80211_drv_mutex);
- r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, 0, ENVIRON_ANY);
- mutex_unlock(&cfg80211_drv_mutex);
- /* This means the regulatory domain was already set, however
- * we don't want to confuse userspace with a "successful error"
- * message so lets just treat it as a success */
- if (r == -EALREADY)
- r = 0;
+
+ r = regulatory_hint_user(data);
+
return r;
}
@@ -1937,6 +1950,11 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
if (err)
return err;
+ if (!drv->ops->get_mesh_params) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
/* Get the mesh params */
rtnl_lock();
err = drv->ops->get_mesh_params(&drv->wiphy, dev, &cur_params);
@@ -2046,6 +2064,11 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
+ if (!drv->ops->set_mesh_params) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
/* This makes sure that there aren't more than 32 mesh config
* parameters (otherwise our bitfield scheme would not work.) */
BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
@@ -2090,6 +2113,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
err = drv->ops->set_mesh_params(&drv->wiphy, dev, &cfg, mask);
rtnl_unlock();
+ out:
/* cleanup */
cfg80211_put_dev(drv);
dev_put(dev);
@@ -2106,7 +2130,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info)
unsigned int i;
int err = -EINVAL;
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
if (!cfg80211_regdomain)
goto out;
@@ -2169,7 +2193,7 @@ nla_put_failure:
genlmsg_cancel(msg, hdr);
err = -EMSGSIZE;
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return err;
}
@@ -2228,9 +2252,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
BUG_ON(rule_idx != num_rules);
- mutex_lock(&cfg80211_drv_mutex);
+ mutex_lock(&cfg80211_mutex);
r = set_regdom(rd);
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
return r;
bad_reg:
@@ -2286,6 +2310,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
struct wiphy *wiphy;
int err, tmp, n_ssids = 0, n_channels = 0, i;
enum ieee80211_band band;
+ size_t ie_len;
err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
if (err)
@@ -2327,9 +2352,15 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
goto out_unlock;
}
+ if (info->attrs[NL80211_ATTR_IE])
+ ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ else
+ ie_len = 0;
+
request = kzalloc(sizeof(*request)
+ sizeof(*ssid) * n_ssids
- + sizeof(channel) * n_channels, GFP_KERNEL);
+ + sizeof(channel) * n_channels
+ + ie_len, GFP_KERNEL);
if (!request) {
err = -ENOMEM;
goto out_unlock;
@@ -2340,6 +2371,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
if (n_ssids)
request->ssids = (void *)(request->channels + n_channels);
request->n_ssids = n_ssids;
+ if (ie_len) {
+ if (request->ssids)
+ request->ie = (void *)(request->ssids + n_ssids);
+ else
+ request->ie = (void *)(request->channels + n_channels);
+ }
if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
/* user specified, bail out if channel not found */
@@ -2380,6 +2417,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
}
}
+ if (info->attrs[NL80211_ATTR_IE]) {
+ request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+ memcpy(request->ie, nla_data(info->attrs[NL80211_ATTR_IE]),
+ request->ie_len);
+ }
+
request->ifidx = dev->ifindex;
request->wiphy = &drv->wiphy;
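The scan-request hunks above size a single kzalloc() for the request header plus its channel array, SSID array and the optional extra IE blob, then carve the ie pointer out of the tail just as the ssids pointer already was. A hedged userspace sketch of that single-allocation layout (field names are illustrative, not the cfg80211 structure):

	#include <stdlib.h>

	#define SSID_LEN 32

	struct scan_req_sketch {
		int *channels;
		char *ssids;		/* n_ssid entries of SSID_LEN bytes each */
		unsigned char *ie;
		size_t ie_len;
		/* trailing storage follows the struct itself */
	};

	static struct scan_req_sketch *scan_req_alloc(int n_chan, int n_ssid,
						      size_t ie_len)
	{
		struct scan_req_sketch *req;

		req = calloc(1, sizeof(*req) +
				n_chan * sizeof(int) +
				(size_t)n_ssid * SSID_LEN +
				ie_len);
		if (!req)
			return NULL;

		/* carve the variable-length regions out of the tail */
		req->channels = (int *)(req + 1);
		req->ssids    = (char *)(req->channels + n_chan);
		req->ie       = ie_len ?
			(unsigned char *)(req->ssids + (size_t)n_ssid * SSID_LEN) :
			NULL;
		req->ie_len   = ie_len;
		return req;	/* a single free(req) releases everything */
	}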
@@ -2432,7 +2475,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability);
NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq);
- switch (res->signal_type) {
+ switch (rdev->wiphy.signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal);
break;
@@ -2601,7 +2644,6 @@ static struct genl_ops nl80211_ops[] = {
.doit = nl80211_get_station,
.dumpit = nl80211_dump_station,
.policy = nl80211_policy,
- .flags = GENL_ADMIN_PERM,
},
{
.cmd = NL80211_CMD_SET_STATION,
@@ -2708,6 +2750,9 @@ static struct genl_multicast_group nl80211_config_mcgrp = {
static struct genl_multicast_group nl80211_scan_mcgrp = {
.name = "scan",
};
+static struct genl_multicast_group nl80211_regulatory_mcgrp = {
+ .name = "regulatory",
+};
/* notification functions */
@@ -2739,7 +2784,7 @@ static int nl80211_send_scan_donemsg(struct sk_buff *msg,
if (!hdr)
return -1;
- NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->idx);
+ NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
/* XXX: we should probably bounce back the request? */
@@ -2787,6 +2832,61 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL);
}
+/*
+ * This can happen on global regulatory changes or device specific settings
+ * based on custom world regulatory domains.
+ */
+void nl80211_send_reg_change_event(struct regulatory_request *request)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+	/* Userspace can always count on this attribute being set */
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator);
+
+ if (request->alpha2[0] == '0' && request->alpha2[1] == '0')
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_WORLD);
+ else if (request->alpha2[0] == '9' && request->alpha2[1] == '9')
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_CUSTOM_WORLD);
+ else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
+ request->intersect)
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_INTERSECTION);
+ else {
+ NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
+ NL80211_REGDOM_TYPE_COUNTRY);
+ NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2);
+ }
+
+ if (wiphy_idx_valid(request->wiphy_idx))
+ NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx);
+
+ if (genlmsg_end(msg, hdr) < 0) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_KERNEL);
+
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+}
+
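nl80211_send_reg_change_event() relies on a small alpha2 naming convention: "00" is the world regulatory domain, "99" a driver-built custom world domain, "98" (or an explicit intersect flag) an intersection, and anything else an ordinary country code. A compact sketch of that classification, using illustrative enum names that mirror NL80211_REGDOM_TYPE_*:

	enum regdom_type_sketch {
		REGDOM_TYPE_COUNTRY_SKETCH,
		REGDOM_TYPE_WORLD_SKETCH,
		REGDOM_TYPE_CUSTOM_WORLD_SKETCH,
		REGDOM_TYPE_INTERSECTION_SKETCH,
	};

	/* Maps the alpha2 conventions used by the event to a regdomain type. */
	static enum regdom_type_sketch classify_alpha2(const char *a, int intersect)
	{
		if (a[0] == '0' && a[1] == '0')
			return REGDOM_TYPE_WORLD_SKETCH;
		if (a[0] == '9' && a[1] == '9')
			return REGDOM_TYPE_CUSTOM_WORLD_SKETCH;
		if ((a[0] == '9' && a[1] == '8') || intersect)
			return REGDOM_TYPE_INTERSECTION_SKETCH;
		return REGDOM_TYPE_COUNTRY_SKETCH;
	}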
/* initialisation/exit functions */
int nl80211_init(void)
@@ -2811,6 +2911,10 @@ int nl80211_init(void)
if (err)
goto err_out;
+ err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp);
+ if (err)
+ goto err_out;
+
return 0;
err_out:
genl_unregister_family(&nl80211_fam);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index b565a5f84e97..e65a3c38c52f 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -11,6 +11,7 @@ extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
struct net_device *netdev);
extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
struct net_device *netdev);
+extern void nl80211_send_reg_change_event(struct regulatory_request *request);
#else
static inline int nl80211_init(void)
{
@@ -27,6 +28,14 @@ static inline void
nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
struct net_device *netdev)
{}
+static inline void nl80211_send_scan_aborted(
+ struct cfg80211_registered_device *rdev,
+ struct net_device *netdev)
+{}
+static inline void
+nl80211_send_reg_change_event(struct regulatory_request *request)
+{
+}
#endif /* CONFIG_NL80211 */
#endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 2323644330cd..eb8b8ed16155 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -41,6 +41,7 @@
#include <net/cfg80211.h>
#include "core.h"
#include "reg.h"
+#include "nl80211.h"
/* Receipt of information from last regulatory request */
static struct regulatory_request *last_request;
@@ -54,22 +55,63 @@ static u32 supported_bandwidths[] = {
MHZ_TO_KHZ(20),
};
-/* Central wireless core regulatory domains, we only need two,
+/*
+ * Central wireless core regulatory domains, we only need two,
* the current one and a world regulatory domain in case we have no
- * information to give us an alpha2 */
+ * information to give us an alpha2
+ */
const struct ieee80211_regdomain *cfg80211_regdomain;
-/* We use this as a place for the rd structure built from the
+/*
+ * We use this as a place for the rd structure built from the
* last parsed country IE to rest until CRDA gets back to us with
- * what it thinks should apply for the same country */
+ * what it thinks should apply for the same country
+ */
static const struct ieee80211_regdomain *country_ie_regdomain;
+/* Used to queue up regulatory hints */
+static LIST_HEAD(reg_requests_list);
+static spinlock_t reg_requests_lock;
+
+/* Used to queue up beacon hints for review */
+static LIST_HEAD(reg_pending_beacons);
+static spinlock_t reg_pending_beacons_lock;
+
+/* Used to keep track of processed beacon hints */
+static LIST_HEAD(reg_beacon_list);
+
+struct reg_beacon {
+ struct list_head list;
+ struct ieee80211_channel chan;
+};
+
/* We keep a static world regulatory domain in case of the absence of CRDA */
static const struct ieee80211_regdomain world_regdom = {
- .n_reg_rules = 1,
+ .n_reg_rules = 5,
.alpha2 = "00",
.reg_rules = {
- REG_RULE(2412-10, 2462+10, 40, 6, 20,
+ /* IEEE 802.11b/g, channels 1..11 */
+ REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
+ /* IEEE 802.11b/g, channels 12..13. No HT40
+ * channel fits here. */
+ REG_RULE(2467-10, 2472+10, 20, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN |
+ NL80211_RRF_NO_IBSS),
+ /* IEEE 802.11 channel 14 - Only JP enables
+ * this and for 802.11b only */
+ REG_RULE(2484-10, 2484+10, 20, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN |
+ NL80211_RRF_NO_IBSS |
+ NL80211_RRF_NO_OFDM),
+ /* IEEE 802.11a, channel 36..48 */
+ REG_RULE(5180-10, 5240+10, 40, 6, 20,
+ NL80211_RRF_PASSIVE_SCAN |
+ NL80211_RRF_NO_IBSS),
+
+	/* NB: 5260 MHz - 5700 MHz requires DFS */
+
+ /* IEEE 802.11a, channel 149..165 */
+ REG_RULE(5745-10, 5825+10, 40, 6, 20,
NL80211_RRF_PASSIVE_SCAN |
NL80211_RRF_NO_IBSS),
}
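The expanded static world regulatory domain spells its rules out with REG_RULE(start, end, bandwidth, gain, eirp, flags), where start and end are band edges in MHz: each entry widens the first and last channel centre frequency by half a 20 MHz channel, which is where the recurring "-10"/"+10" comes from. A tiny sketch of that edge arithmetic (hypothetical helper, not kernel API):

	/* Converts a span of 20 MHz channels, named by their centre
	 * frequencies, into the band edges a regulatory rule expects. */
	static void channel_span_to_edges(int first_center_mhz, int last_center_mhz,
					  int *start_mhz, int *end_mhz)
	{
		*start_mhz = first_center_mhz - 10;	/* lower edge of first channel */
		*end_mhz   = last_center_mhz + 10;	/* upper edge of last channel */
	}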
@@ -83,9 +125,11 @@ static char *ieee80211_regdom = "US";
module_param(ieee80211_regdom, charp, 0444);
MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
-/* We assume 40 MHz bandwidth for the old regulatory work.
+/*
+ * We assume 40 MHz bandwidth for the old regulatory work.
* We make emphasis we are using the exact same frequencies
- * as before */
+ * as before
+ */
static const struct ieee80211_regdomain us_regdom = {
.n_reg_rules = 6,
@@ -124,8 +168,10 @@ static const struct ieee80211_regdomain jp_regdom = {
static const struct ieee80211_regdomain eu_regdom = {
.n_reg_rules = 6,
- /* This alpha2 is bogus, we leave it here just for stupid
- * backward compatibility */
+ /*
+ * This alpha2 is bogus, we leave it here just for stupid
+ * backward compatibility
+ */
.alpha2 = "EU",
.reg_rules = {
/* IEEE 802.11b/g, channels 1..13 */
@@ -194,8 +240,10 @@ static void reset_regdomains(void)
cfg80211_regdomain = NULL;
}
-/* Dynamic world regulatory domain requested by the wireless
- * core upon initialization */
+/*
+ * Dynamic world regulatory domain requested by the wireless
+ * core upon initialization
+ */
static void update_world_regdomain(const struct ieee80211_regdomain *rd)
{
BUG_ON(!last_request);
@@ -236,8 +284,10 @@ static bool is_unknown_alpha2(const char *alpha2)
{
if (!alpha2)
return false;
- /* Special case where regulatory domain was built by driver
- * but a specific alpha2 cannot be determined */
+ /*
+ * Special case where regulatory domain was built by driver
+ * but a specific alpha2 cannot be determined
+ */
if (alpha2[0] == '9' && alpha2[1] == '9')
return true;
return false;
@@ -247,9 +297,11 @@ static bool is_intersected_alpha2(const char *alpha2)
{
if (!alpha2)
return false;
- /* Special case where regulatory domain is the
+ /*
+ * Special case where regulatory domain is the
* result of an intersection between two regulatory domain
- * structures */
+ * structures
+ */
if (alpha2[0] == '9' && alpha2[1] == '8')
return true;
return false;
@@ -274,8 +326,10 @@ static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
return false;
}
-static bool regdom_changed(const char *alpha2)
+static bool regdom_changes(const char *alpha2)
{
+ assert_cfg80211_lock();
+
if (!cfg80211_regdomain)
return true;
if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
@@ -302,8 +356,10 @@ static bool country_ie_integrity_changes(u32 checksum)
return false;
}
-/* This lets us keep regulatory code which is updated on a regulatory
- * basis in userspace. */
+/*
+ * This lets us keep regulatory code which is updated on a regulatory
+ * basis in userspace.
+ */
static int call_crda(const char *alpha2)
{
char country_env[9 + 2] = "COUNTRY=";
@@ -348,7 +404,8 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
- if (freq_diff <= 0 || freq_range->max_bandwidth_khz > freq_diff)
+ if (freq_range->end_freq_khz <= freq_range->start_freq_khz ||
+ freq_range->max_bandwidth_khz > freq_diff)
return false;
return true;
@@ -414,10 +471,12 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
#undef ONE_GHZ_IN_KHZ
}
-/* Converts a country IE to a regulatory domain. A regulatory domain
+/*
+ * Converts a country IE to a regulatory domain. A regulatory domain
* structure has a lot of information which the IE doesn't yet have,
* so for the other values we use upper max values as we will intersect
- * with our userspace regulatory agent to get lower bounds. */
+ * with our userspace regulatory agent to get lower bounds.
+ */
static struct ieee80211_regdomain *country_ie_2_rd(
u8 *country_ie,
u8 country_ie_len,
@@ -462,9 +521,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
*checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8);
- /* We need to build a reg rule for each triplet, but first we must
+ /*
+ * We need to build a reg rule for each triplet, but first we must
* calculate the number of reg rules we will need. We will need one
- * for each channel subband */
+ * for each channel subband
+ */
while (country_ie_len >= 3) {
int end_channel = 0;
struct ieee80211_country_ie_triplet *triplet =
@@ -502,9 +563,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
if (cur_sub_max_channel < cur_channel)
return NULL;
- /* Do not allow overlapping channels. Also channels
+ /*
+ * Do not allow overlapping channels. Also channels
* passed in each subband must be monotonically
- * increasing */
+ * increasing
+ */
if (last_sub_max_channel) {
if (cur_channel <= last_sub_max_channel)
return NULL;
@@ -512,10 +575,12 @@ static struct ieee80211_regdomain *country_ie_2_rd(
return NULL;
}
- /* When dot11RegulatoryClassesRequired is supported
+ /*
+ * When dot11RegulatoryClassesRequired is supported
* we can throw ext triplets as part of this soup,
* for now we don't care when those change as we
- * don't support them */
+ * don't support them
+ */
*checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) |
((cur_sub_max_channel ^ cur_sub_max_channel) << 16) |
((triplet->chans.max_power ^ cur_sub_max_channel) << 24);
@@ -526,8 +591,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
country_ie_len -= 3;
num_rules++;
- /* Note: this is not a IEEE requirement but
- * simply a memory requirement */
+ /*
+ * Note: this is not a IEEE requirement but
+ * simply a memory requirement
+ */
if (num_rules > NL80211_MAX_SUPP_REG_RULES)
return NULL;
}
@@ -555,8 +622,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
struct ieee80211_freq_range *freq_range = NULL;
struct ieee80211_power_rule *power_rule = NULL;
- /* Must parse if dot11RegulatoryClassesRequired is true,
- * we don't support this yet */
+ /*
+ * Must parse if dot11RegulatoryClassesRequired is true,
+ * we don't support this yet
+ */
if (triplet->ext.reg_extension_id >=
IEEE80211_COUNTRY_EXTENSION_ID) {
country_ie += 3;
@@ -578,10 +647,12 @@ static struct ieee80211_regdomain *country_ie_2_rd(
end_channel = triplet->chans.first_channel +
(4 * (triplet->chans.num_channels - 1));
- /* The +10 is since the regulatory domain expects
+ /*
+ * The +10 is since the regulatory domain expects
* the actual band edge, not the center of freq for
* its start and end freqs, assuming 20 MHz bandwidth on
- * the channels passed */
+ * the channels passed
+ */
freq_range->start_freq_khz =
MHZ_TO_KHZ(ieee80211_channel_to_frequency(
triplet->chans.first_channel) - 10);
@@ -589,9 +660,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
MHZ_TO_KHZ(ieee80211_channel_to_frequency(
end_channel) + 10);
- /* Large arbitrary values, we intersect later */
- /* Increment this if we ever support >= 40 MHz channels
- * in IEEE 802.11 */
+ /*
+ * These are large arbitrary values we use to intersect later.
+ * Increment this if we ever support >= 40 MHz channels
+ * in IEEE 802.11
+ */
freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40);
power_rule->max_antenna_gain = DBI_TO_MBI(100);
power_rule->max_eirp = DBM_TO_MBM(100);
@@ -607,8 +680,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
}
-/* Helper for regdom_intersect(), this does the real
- * mathematical intersection fun */
+/*
+ * Helper for regdom_intersect(), this does the real
+ * mathematical intersection fun
+ */
static int reg_rules_intersect(
const struct ieee80211_reg_rule *rule1,
const struct ieee80211_reg_rule *rule2,
@@ -686,11 +761,13 @@ static struct ieee80211_regdomain *regdom_intersect(
if (!rd1 || !rd2)
return NULL;
- /* First we get a count of the rules we'll need, then we actually
+ /*
+ * First we get a count of the rules we'll need, then we actually
+ * build them. This is so we can malloc() and free() a
* regdomain once. The reason we use reg_rules_intersect() here
* is it will return -EINVAL if the rule computed makes no sense.
- * All rules that do check out OK are valid. */
+ * All rules that do check out OK are valid.
+ */
for (x = 0; x < rd1->n_reg_rules; x++) {
rule1 = &rd1->reg_rules[x];
@@ -718,14 +795,18 @@ static struct ieee80211_regdomain *regdom_intersect(
rule1 = &rd1->reg_rules[x];
for (y = 0; y < rd2->n_reg_rules; y++) {
rule2 = &rd2->reg_rules[y];
- /* This time around instead of using the stack lets
+ /*
+		 * This time around, instead of using the stack, let's
* write to the target rule directly saving ourselves
- * a memcpy() */
+ * a memcpy()
+ */
intersected_rule = &rd->reg_rules[rule_idx];
r = reg_rules_intersect(rule1, rule2,
intersected_rule);
- /* No need to memset here the intersected rule here as
- * we're not using the stack anymore */
+ /*
+		 * No need to memset the intersected rule here as
+ * we're not using the stack anymore
+ */
if (r)
continue;
rule_idx++;
@@ -744,8 +825,10 @@ static struct ieee80211_regdomain *regdom_intersect(
return rd;
}
-/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
- * want to just have the channel structure use these */
+/*
+ * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
+ * want to just have the channel structure use these
+ */
static u32 map_regdom_flags(u32 rd_flags)
{
u32 channel_flags = 0;
@@ -771,10 +854,12 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
regd = custom_regd ? custom_regd : cfg80211_regdomain;
- /* Follow the driver's regulatory domain, if present, unless a country
- * IE has been processed or a user wants to help complaince further */
- if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE &&
- last_request->initiator != REGDOM_SET_BY_USER &&
+ /*
+ * Follow the driver's regulatory domain, if present, unless a country
+	 * IE has been processed or a user wants to help compliance further
+ */
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ last_request->initiator != NL80211_REGDOM_SET_BY_USER &&
wiphy->regd)
regd = wiphy->regd;
@@ -790,9 +875,11 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
fr = &rr->freq_range;
pr = &rr->power_rule;
- /* We only need to know if one frequency rule was
+ /*
+		 * We only need to know if one frequency rule was
+		 * in center_freq's band, that's enough, so let's
- * not overwrite it once found */
+ * not overwrite it once found
+ */
if (!band_rule_found)
band_rule_found = freq_in_rule_band(fr, center_freq);
@@ -829,6 +916,11 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
const struct ieee80211_power_rule *power_rule = NULL;
struct ieee80211_supported_band *sband;
struct ieee80211_channel *chan;
+ struct wiphy *request_wiphy = NULL;
+
+ assert_cfg80211_lock();
+
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
sband = wiphy->bands[band];
BUG_ON(chan_idx >= sband->n_channels);
@@ -840,7 +932,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
&max_bandwidth, &reg_rule);
if (r) {
- /* This means no regulatory rule was found in the country IE
+ /*
+ * This means no regulatory rule was found in the country IE
* with a frequency range on the center_freq's band, since
* IEEE-802.11 allows for a country IE to have a subset of the
* regulatory information provided in a country we ignore
@@ -851,7 +944,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
* http://tinyurl.com/11d-clarification
*/
if (r == -ERANGE &&
- last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
+ last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE) {
#ifdef CONFIG_CFG80211_REG_DEBUG
printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz "
"intact on %s - no rule found in band on "
@@ -859,10 +953,13 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
chan->center_freq, wiphy_name(wiphy));
#endif
} else {
- /* In this case we know the country IE has at least one reg rule
- * for the band so we respect its band definitions */
+ /*
+ * In this case we know the country IE has at least one reg rule
+ * for the band so we respect its band definitions
+ */
#ifdef CONFIG_CFG80211_REG_DEBUG
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
+ if (last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE)
printk(KERN_DEBUG "cfg80211: Disabling "
"channel %d MHz on %s due to "
"Country IE\n",
@@ -876,12 +973,14 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
power_rule = &reg_rule->power_rule;
- if (last_request->initiator == REGDOM_SET_BY_DRIVER &&
- last_request->wiphy && last_request->wiphy == wiphy &&
- last_request->wiphy->strict_regulatory) {
- /* This gaurantees the driver's requested regulatory domain
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ request_wiphy && request_wiphy == wiphy &&
+ request_wiphy->strict_regulatory) {
+ /*
+		 * This guarantees the driver's requested regulatory domain
* will always be used as a base for further regulatory
- * settings */
+ * settings
+ */
chan->flags = chan->orig_flags =
map_regdom_flags(reg_rule->flags);
chan->max_antenna_gain = chan->orig_mag =
@@ -915,39 +1014,147 @@ static void handle_band(struct wiphy *wiphy, enum ieee80211_band band)
handle_channel(wiphy, band, i);
}
-static bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby)
+static bool ignore_reg_update(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator)
{
if (!last_request)
return true;
- if (setby == REGDOM_SET_BY_CORE &&
+ if (initiator == NL80211_REGDOM_SET_BY_CORE &&
wiphy->custom_regulatory)
return true;
- /* wiphy->regd will be set once the device has its own
- * desired regulatory domain set */
+ /*
+ * wiphy->regd will be set once the device has its own
+ * desired regulatory domain set
+ */
if (wiphy->strict_regulatory && !wiphy->regd &&
!is_world_regdom(last_request->alpha2))
return true;
return false;
}
-static void update_all_wiphy_regulatory(enum reg_set_by setby)
+static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
{
struct cfg80211_registered_device *drv;
list_for_each_entry(drv, &cfg80211_drv_list, list)
- wiphy_update_regulatory(&drv->wiphy, setby);
+ wiphy_update_regulatory(&drv->wiphy, initiator);
}
-void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby)
+static void handle_reg_beacon(struct wiphy *wiphy,
+ unsigned int chan_idx,
+ struct reg_beacon *reg_beacon)
{
- enum ieee80211_band band;
+#ifdef CONFIG_CFG80211_REG_DEBUG
+#define REG_DEBUG_BEACON_FLAG(desc) \
+ printk(KERN_DEBUG "cfg80211: Enabling " desc " on " \
+ "frequency: %d MHz (Ch %d) on %s\n", \
+ reg_beacon->chan.center_freq, \
+ ieee80211_frequency_to_channel(reg_beacon->chan.center_freq), \
+ wiphy_name(wiphy));
+#else
+#define REG_DEBUG_BEACON_FLAG(desc) do {} while (0)
+#endif
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *chan;
+
+ assert_cfg80211_lock();
+
+ sband = wiphy->bands[reg_beacon->chan.band];
+ chan = &sband->channels[chan_idx];
+
+ if (likely(chan->center_freq != reg_beacon->chan.center_freq))
+ return;
+
+ if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) {
+ chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN;
+ REG_DEBUG_BEACON_FLAG("active scanning");
+ }
+
+ if (chan->flags & IEEE80211_CHAN_NO_IBSS) {
+ chan->flags &= ~IEEE80211_CHAN_NO_IBSS;
+ REG_DEBUG_BEACON_FLAG("beaconing");
+ }
+
+ chan->beacon_found = true;
+#undef REG_DEBUG_BEACON_FLAG
+}
+
+/*
+ * Called when a scan on a wiphy finds a beacon on
+ * a new channel
+ */
+static void wiphy_update_new_beacon(struct wiphy *wiphy,
+ struct reg_beacon *reg_beacon)
+{
+ unsigned int i;
+ struct ieee80211_supported_band *sband;
+
+ assert_cfg80211_lock();
- if (ignore_reg_update(wiphy, setby))
+ if (!wiphy->bands[reg_beacon->chan.band])
return;
+
+ sband = wiphy->bands[reg_beacon->chan.band];
+
+ for (i = 0; i < sband->n_channels; i++)
+ handle_reg_beacon(wiphy, i, reg_beacon);
+}
+
+/*
+ * Called upon reg changes or when a new wiphy is added
+ */
+static void wiphy_update_beacon_reg(struct wiphy *wiphy)
+{
+ unsigned int i;
+ struct ieee80211_supported_band *sband;
+ struct reg_beacon *reg_beacon;
+
+ assert_cfg80211_lock();
+
+ if (list_empty(&reg_beacon_list))
+ return;
+
+ list_for_each_entry(reg_beacon, &reg_beacon_list, list) {
+ if (!wiphy->bands[reg_beacon->chan.band])
+ continue;
+ sband = wiphy->bands[reg_beacon->chan.band];
+ for (i = 0; i < sband->n_channels; i++)
+ handle_reg_beacon(wiphy, i, reg_beacon);
+ }
+}
+
+static bool reg_is_world_roaming(struct wiphy *wiphy)
+{
+ if (is_world_regdom(cfg80211_regdomain->alpha2) ||
+ (wiphy->regd && is_world_regdom(wiphy->regd->alpha2)))
+ return true;
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+ wiphy->custom_regulatory)
+ return true;
+ return false;
+}
+
+/* Reap the advantages of previously found beacons */
+static void reg_process_beacons(struct wiphy *wiphy)
+{
+ if (!reg_is_world_roaming(wiphy))
+ return;
+ wiphy_update_beacon_reg(wiphy);
+}
+
+void wiphy_update_regulatory(struct wiphy *wiphy,
+ enum nl80211_reg_initiator initiator)
+{
+ enum ieee80211_band band;
+
+ if (ignore_reg_update(wiphy, initiator))
+ goto out;
for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
if (wiphy->bands[band])
handle_band(wiphy, band);
}
+out:
+ reg_process_beacons(wiphy);
if (wiphy->reg_notifier)
wiphy->reg_notifier(wiphy, last_request);
}
@@ -1033,81 +1240,98 @@ static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd,
return 0;
}
-/* Return value which can be used by ignore_request() to indicate
- * it has been determined we should intersect two regulatory domains */
+/*
+ * Return value which can be used by ignore_request() to indicate
+ * it has been determined we should intersect two regulatory domains
+ */
#define REG_INTERSECT 1
/* This has the logic which determines when a new request
* should be ignored. */
-static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
- const char *alpha2)
+static int ignore_request(struct wiphy *wiphy,
+ struct regulatory_request *pending_request)
{
+ struct wiphy *last_wiphy = NULL;
+
+ assert_cfg80211_lock();
+
/* All initial requests are respected */
if (!last_request)
return 0;
- switch (set_by) {
- case REGDOM_SET_BY_INIT:
+ switch (pending_request->initiator) {
+ case NL80211_REGDOM_SET_BY_CORE:
return -EINVAL;
- case REGDOM_SET_BY_CORE:
- /*
- * Always respect new wireless core hints, should only happen
- * when updating the world regulatory domain at init.
- */
- return 0;
- case REGDOM_SET_BY_COUNTRY_IE:
- if (unlikely(!is_an_alpha2(alpha2)))
+ case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+
+ last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+ if (unlikely(!is_an_alpha2(pending_request->alpha2)))
return -EINVAL;
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
- if (last_request->wiphy != wiphy) {
+ if (last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ if (last_wiphy != wiphy) {
/*
* Two cards with two APs claiming different
				 * Country IE alpha2s. We could
* intersect them, but that seems unlikely
* to be correct. Reject second one for now.
*/
- if (!alpha2_equal(alpha2,
- cfg80211_regdomain->alpha2))
+ if (regdom_changes(pending_request->alpha2))
return -EOPNOTSUPP;
return -EALREADY;
}
- /* Two consecutive Country IE hints on the same wiphy.
- * This should be picked up early by the driver/stack */
- if (WARN_ON(!alpha2_equal(cfg80211_regdomain->alpha2,
- alpha2)))
+ /*
+ * Two consecutive Country IE hints on the same wiphy.
+ * This should be picked up early by the driver/stack
+ */
+ if (WARN_ON(regdom_changes(pending_request->alpha2)))
return 0;
return -EALREADY;
}
return REG_INTERSECT;
- case REGDOM_SET_BY_DRIVER:
- if (last_request->initiator == REGDOM_SET_BY_CORE) {
+ case NL80211_REGDOM_SET_BY_DRIVER:
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) {
if (is_old_static_regdom(cfg80211_regdomain))
return 0;
- if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+ if (regdom_changes(pending_request->alpha2))
return 0;
return -EALREADY;
}
+
+ /*
+		 * This would happen if you unplug and plug your card
+		 * back in, or if you add a new device and the previously
+		 * loaded card already agrees on its regulatory domain.
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ !regdom_changes(pending_request->alpha2))
+ return -EALREADY;
+
return REG_INTERSECT;
- case REGDOM_SET_BY_USER:
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
+ case NL80211_REGDOM_SET_BY_USER:
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)
return REG_INTERSECT;
- /* If the user knows better the user should set the regdom
- * to their country before the IE is picked up */
- if (last_request->initiator == REGDOM_SET_BY_USER &&
+ /*
+ * If the user knows better the user should set the regdom
+ * to their country before the IE is picked up
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_USER &&
last_request->intersect)
return -EOPNOTSUPP;
- /* Process user requests only after previous user/driver/core
- * requests have been processed */
- if (last_request->initiator == REGDOM_SET_BY_CORE ||
- last_request->initiator == REGDOM_SET_BY_DRIVER ||
- last_request->initiator == REGDOM_SET_BY_USER) {
- if (!alpha2_equal(last_request->alpha2,
- cfg80211_regdomain->alpha2))
+ /*
+ * Process user requests only after previous user/driver/core
+ * requests have been processed
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE ||
+ last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER ||
+ last_request->initiator == NL80211_REGDOM_SET_BY_USER) {
+ if (regdom_changes(last_request->alpha2))
return -EAGAIN;
}
if (!is_old_static_regdom(cfg80211_regdomain) &&
- alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+ !regdom_changes(pending_request->alpha2))
return -EALREADY;
return 0;
@@ -1116,59 +1340,80 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
return -EINVAL;
}
-/* Caller must hold &cfg80211_drv_mutex */
-int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
- const char *alpha2,
- u32 country_ie_checksum,
- enum environment_cap env)
+/**
+ * __regulatory_hint - hint to the wireless core a regulatory domain
+ * @wiphy: if the hint comes from country information from an AP, this
+ * is required to be set to the wiphy that received the information
+ * @pending_request: the regulatory request currently being processed
+ *
+ * The Wireless subsystem can use this function to hint to the wireless core
+ * what it believes should be the current regulatory domain.
+ *
+ * Returns zero if all went fine, %-EALREADY if a regulatory domain had
+ * already been set or other standard error codes.
+ *
+ * Caller must hold &cfg80211_mutex
+ */
+static int __regulatory_hint(struct wiphy *wiphy,
+ struct regulatory_request *pending_request)
{
- struct regulatory_request *request;
bool intersect = false;
int r = 0;
- r = ignore_request(wiphy, set_by, alpha2);
+ assert_cfg80211_lock();
+
+ r = ignore_request(wiphy, pending_request);
if (r == REG_INTERSECT) {
- if (set_by == REGDOM_SET_BY_DRIVER) {
+ if (pending_request->initiator ==
+ NL80211_REGDOM_SET_BY_DRIVER) {
r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
- if (r)
+ if (r) {
+ kfree(pending_request);
return r;
+ }
}
intersect = true;
} else if (r) {
- /* If the regulatory domain being requested by the
+ /*
+ * If the regulatory domain being requested by the
* driver has already been set just copy it to the
- * wiphy */
- if (r == -EALREADY && set_by == REGDOM_SET_BY_DRIVER) {
+ * wiphy
+ */
+ if (r == -EALREADY &&
+ pending_request->initiator ==
+ NL80211_REGDOM_SET_BY_DRIVER) {
r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
- if (r)
+ if (r) {
+ kfree(pending_request);
return r;
+ }
r = -EALREADY;
goto new_request;
}
+ kfree(pending_request);
return r;
}
new_request:
- request = kzalloc(sizeof(struct regulatory_request),
- GFP_KERNEL);
- if (!request)
- return -ENOMEM;
+ kfree(last_request);
- request->alpha2[0] = alpha2[0];
- request->alpha2[1] = alpha2[1];
- request->initiator = set_by;
- request->wiphy = wiphy;
- request->intersect = intersect;
- request->country_ie_checksum = country_ie_checksum;
- request->country_ie_env = env;
+ last_request = pending_request;
+ last_request->intersect = intersect;
- kfree(last_request);
- last_request = request;
+ pending_request = NULL;
/* When r == REG_INTERSECT we do need to call CRDA */
- if (r < 0)
+ if (r < 0) {
+ /*
+		 * CRDA will not be called in this case, as the requested
+		 * regulatory domain has already been applied; just inform
+		 * userspace that the request has been processed.
+ */
+ if (r == -EALREADY)
+ nl80211_send_reg_change_event(last_request);
return r;
+ }
/*
* Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled
@@ -1180,34 +1425,194 @@ new_request:
*
* to intersect with the static rd
*/
- return call_crda(alpha2);
+ return call_crda(last_request->alpha2);
}
-void regulatory_hint(struct wiphy *wiphy, const char *alpha2)
+/* This currently only processes user and driver regulatory hints */
+static void reg_process_hint(struct regulatory_request *reg_request)
{
- int r;
- BUG_ON(!alpha2);
+ int r = 0;
+ struct wiphy *wiphy = NULL;
+
+ BUG_ON(!reg_request->alpha2);
+
+ mutex_lock(&cfg80211_mutex);
+
+ if (wiphy_idx_valid(reg_request->wiphy_idx))
+ wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
+
+ if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+ !wiphy) {
+ kfree(reg_request);
+ goto out;
+ }
- mutex_lock(&cfg80211_drv_mutex);
- r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER,
- alpha2, 0, ENVIRON_ANY);
+ r = __regulatory_hint(wiphy, reg_request);
/* This is required so that the orig_* parameters are saved */
- if (r == -EALREADY && wiphy->strict_regulatory)
- wiphy_update_regulatory(wiphy, REGDOM_SET_BY_DRIVER);
- mutex_unlock(&cfg80211_drv_mutex);
+ if (r == -EALREADY && wiphy && wiphy->strict_regulatory)
+ wiphy_update_regulatory(wiphy, reg_request->initiator);
+out:
+ mutex_unlock(&cfg80211_mutex);
+}
+
+/* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* */
+static void reg_process_pending_hints(void)
+{
+ struct regulatory_request *reg_request;
+
+ spin_lock(&reg_requests_lock);
+ while (!list_empty(&reg_requests_list)) {
+ reg_request = list_first_entry(&reg_requests_list,
+ struct regulatory_request,
+ list);
+ list_del_init(&reg_request->list);
+
+ spin_unlock(&reg_requests_lock);
+ reg_process_hint(reg_request);
+ spin_lock(&reg_requests_lock);
+ }
+ spin_unlock(&reg_requests_lock);
+}
+
+/* Processes beacon hints -- this has nothing to do with country IEs */
+static void reg_process_pending_beacon_hints(void)
+{
+ struct cfg80211_registered_device *drv;
+ struct reg_beacon *pending_beacon, *tmp;
+
+ mutex_lock(&cfg80211_mutex);
+
+ /* This goes through the _pending_ beacon list */
+ spin_lock_bh(&reg_pending_beacons_lock);
+
+ if (list_empty(&reg_pending_beacons)) {
+ spin_unlock_bh(&reg_pending_beacons_lock);
+ goto out;
+ }
+
+ list_for_each_entry_safe(pending_beacon, tmp,
+ &reg_pending_beacons, list) {
+
+ list_del_init(&pending_beacon->list);
+
+ /* Applies the beacon hint to current wiphys */
+ list_for_each_entry(drv, &cfg80211_drv_list, list)
+ wiphy_update_new_beacon(&drv->wiphy, pending_beacon);
+
+ /* Remembers the beacon hint for new wiphys or reg changes */
+ list_add_tail(&pending_beacon->list, &reg_beacon_list);
+ }
+
+ spin_unlock_bh(&reg_pending_beacons_lock);
+out:
+ mutex_unlock(&cfg80211_mutex);
+}
+
+static void reg_todo(struct work_struct *work)
+{
+ reg_process_pending_hints();
+ reg_process_pending_beacon_hints();
+}
+
+static DECLARE_WORK(reg_work, reg_todo);
+
+static void queue_regulatory_request(struct regulatory_request *request)
+{
+ spin_lock(&reg_requests_lock);
+ list_add_tail(&request->list, &reg_requests_list);
+ spin_unlock(&reg_requests_lock);
+
+ schedule_work(&reg_work);
+}
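All regulatory hints added here -- core, user, driver and the country-IE path in regulatory_hint_11d() -- funnel through queue_regulatory_request(): the entry points only allocate and fill a struct regulatory_request, while the real work happens later in reg_todo(), in process context, where cfg80211_mutex can be taken and CRDA can be invoked.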
+
+/* Core regulatory hint -- happens once during cfg80211_init() */
+static int regulatory_hint_core(const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ BUG_ON(last_request);
+
+ request = kzalloc(sizeof(struct regulatory_request),
+ GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_CORE;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+
+/* User hints */
+int regulatory_hint_user(const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ BUG_ON(!alpha2);
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wiphy_idx = WIPHY_IDX_STALE;
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_USER;
+
+ queue_regulatory_request(request);
+
+ return 0;
+}
+
+/* Driver hints */
+int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
+{
+ struct regulatory_request *request;
+
+ BUG_ON(!alpha2);
+ BUG_ON(!wiphy);
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wiphy_idx = get_wiphy_idx(wiphy);
+
+ /* Must have registered wiphy first */
+ BUG_ON(!wiphy_idx_valid(request->wiphy_idx));
+
+ request->alpha2[0] = alpha2[0];
+ request->alpha2[1] = alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_DRIVER;
+
+ queue_regulatory_request(request);
+
+ return 0;
}
EXPORT_SYMBOL(regulatory_hint);
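For context, a driver would typically issue such a hint once it has read its regulatory domain from EEPROM, and only after wiphy_register() (the BUG_ON above requires a registered wiphy). A minimal sketch, assuming a hypothetical struct mydrv_priv and mydrv_eeprom_alpha2() helper that are not part of this patch:

static int mydrv_hint_regdom(struct mydrv_priv *priv)
{
	char alpha2[2];

	/* Hypothetical helper filling alpha2[0..1] from EEPROM */
	mydrv_eeprom_alpha2(priv, alpha2);

	/*
	 * Queues an NL80211_REGDOM_SET_BY_DRIVER request; it is processed
	 * later from the regulatory workqueue, so nothing sleeps here
	 * beyond the GFP_KERNEL allocation inside regulatory_hint().
	 */
	return regulatory_hint(priv->wiphy, alpha2);
}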
static bool reg_same_country_ie_hint(struct wiphy *wiphy,
u32 country_ie_checksum)
{
- if (!last_request->wiphy)
+ struct wiphy *request_wiphy;
+
+ assert_cfg80211_lock();
+
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+ if (!request_wiphy)
return false;
- if (likely(last_request->wiphy != wiphy))
+
+ if (likely(request_wiphy != wiphy))
return !country_ie_integrity_changes(country_ie_checksum);
- /* We should not have let these through at this point, they
+ /*
+ * We should not have let these through at this point, they
* should have been picked up earlier by the first alpha2 check
- * on the device */
+ * on the device
+ */
if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum)))
return true;
return false;
@@ -1221,11 +1626,14 @@ void regulatory_hint_11d(struct wiphy *wiphy,
char alpha2[2];
u32 checksum = 0;
enum environment_cap env = ENVIRON_ANY;
+ struct regulatory_request *request;
- if (!last_request)
- return;
+ mutex_lock(&cfg80211_mutex);
- mutex_lock(&cfg80211_drv_mutex);
+ if (unlikely(!last_request)) {
+ mutex_unlock(&cfg80211_mutex);
+ return;
+ }
/* IE len must be evenly divisible by 2 */
if (country_ie_len & 0x01)
@@ -1234,9 +1642,11 @@ void regulatory_hint_11d(struct wiphy *wiphy,
if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
goto out;
- /* Pending country IE processing, this can happen after we
+ /*
+ * Pending country IE processing, this can happen after we
* call CRDA and wait for a response if a beacon was received before
- * we were able to process the last regulatory_hint_11d() call */
+ * we were able to process the last regulatory_hint_11d() call
+ */
if (country_ie_regdomain)
goto out;
@@ -1248,33 +1658,44 @@ void regulatory_hint_11d(struct wiphy *wiphy,
else if (country_ie[2] == 'O')
env = ENVIRON_OUTDOOR;
- /* We will run this for *every* beacon processed for the BSSID, so
+ /*
+ * We will run this for *every* beacon processed for the BSSID, so
* we optimize with an early check so we can exit early if we don't have to
- * do anything */
- if (likely(last_request->wiphy)) {
+ * do anything
+ */
+ if (likely(wiphy_idx_valid(last_request->wiphy_idx))) {
struct cfg80211_registered_device *drv_last_ie;
- drv_last_ie = wiphy_to_dev(last_request->wiphy);
+ drv_last_ie =
+ cfg80211_drv_by_wiphy_idx(last_request->wiphy_idx);
- /* Lets keep this simple -- we trust the first AP
- * after we intersect with CRDA */
- if (likely(last_request->wiphy == wiphy)) {
- /* Ignore IEs coming in on this wiphy with
- * the same alpha2 and environment cap */
+ /*
+ * Let's keep this simple -- we trust the first AP
+ * after we intersect with CRDA
+ */
+ if (likely(&drv_last_ie->wiphy == wiphy)) {
+ /*
+ * Ignore IEs coming in on this wiphy with
+ * the same alpha2 and environment cap
+ */
if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
alpha2) &&
env == drv_last_ie->env)) {
goto out;
}
- /* the wiphy moved on to another BSSID or the AP
+ /*
+ * the wiphy moved on to another BSSID or the AP
* was reconfigured. XXX: We need to deal with the
* case where the user suspends and goes to another
* country, and then gets IEs from an
- * AP with different settings */
+ * AP with different settings
+ */
goto out;
} else {
- /* Ignore IEs coming in on two separate wiphys with
- * the same alpha2 and environment cap */
+ /*
+ * Ignore IEs coming in on two separate wiphys with
+ * the same alpha2 and environment cap
+ */
if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
alpha2) &&
env == drv_last_ie->env)) {
@@ -1289,28 +1710,97 @@ void regulatory_hint_11d(struct wiphy *wiphy,
if (!rd)
goto out;
- /* This will not happen right now but we leave it here for the
+ /*
+ * This will not happen right now but we leave it here for the
+ * future when we want to add suspend/resume support and having
* the user move to another country after doing so, or having the user
- * move to another AP. Right now we just trust the first AP. This is why
- * this is marked as likley(). If we hit this before we add this support
- * we want to be informed of it as it would indicate a mistake in the
- * current design */
- if (likely(WARN_ON(reg_same_country_ie_hint(wiphy, checksum))))
- goto out;
+ * move to another AP. Right now we just trust the first AP.
+ *
+ * If we hit this before we add this support we want to be informed of
+ * it as it would indicate a mistake in the current design
+ */
+ if (WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))
+ goto free_rd_out;
+
+ request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+ if (!request)
+ goto free_rd_out;
- /* We keep this around for when CRDA comes back with a response so
- * we can intersect with that */
+ /*
+ * We keep this around for when CRDA comes back with a response so
+ * we can intersect with that
+ */
country_ie_regdomain = rd;
- __regulatory_hint(wiphy, REGDOM_SET_BY_COUNTRY_IE,
- country_ie_regdomain->alpha2, checksum, env);
+ request->wiphy_idx = get_wiphy_idx(wiphy);
+ request->alpha2[0] = rd->alpha2[0];
+ request->alpha2[1] = rd->alpha2[1];
+ request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
+ request->country_ie_checksum = checksum;
+ request->country_ie_env = env;
+ mutex_unlock(&cfg80211_mutex);
+
+ queue_regulatory_request(request);
+
+ return;
+
+free_rd_out:
+ kfree(rd);
out:
- mutex_unlock(&cfg80211_drv_mutex);
+ mutex_unlock(&cfg80211_mutex);
}
EXPORT_SYMBOL(regulatory_hint_11d);
+static bool freq_is_chan_12_13_14(u16 freq)
+{
+ if (freq == ieee80211_channel_to_frequency(12) ||
+ freq == ieee80211_channel_to_frequency(13) ||
+ freq == ieee80211_channel_to_frequency(14))
+ return true;
+ return false;
+}
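For reference, on the 2.4 GHz band these map to 2467 MHz (channel 12), 2472 MHz (channel 13) and 2484 MHz (channel 14); channels 1-11 are already permitted by the world regulatory domain, which is why only these three need beacon hints here.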
+
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+ struct ieee80211_channel *beacon_chan,
+ gfp_t gfp)
+{
+ struct reg_beacon *reg_beacon;
+
+ if (likely((beacon_chan->beacon_found ||
+ (beacon_chan->flags & IEEE80211_CHAN_RADAR) ||
+ (beacon_chan->band == IEEE80211_BAND_2GHZ &&
+ !freq_is_chan_12_13_14(beacon_chan->center_freq)))))
+ return 0;
+
+ reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp);
+ if (!reg_beacon)
+ return -ENOMEM;
+
+#ifdef CONFIG_CFG80211_REG_DEBUG
+ printk(KERN_DEBUG "cfg80211: Found new beacon on "
+ "frequency: %d MHz (Ch %d) on %s\n",
+ beacon_chan->center_freq,
+ ieee80211_frequency_to_channel(beacon_chan->center_freq),
+ wiphy_name(wiphy));
+#endif
+ memcpy(&reg_beacon->chan, beacon_chan,
+ sizeof(struct ieee80211_channel));
+
+ /*
+ * Since we can be called from BH or non-BH context
+ * we must use spin_lock_bh()
+ */
+ spin_lock_bh(&reg_pending_beacons_lock);
+ list_add_tail(&reg_beacon->list, &reg_pending_beacons);
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ schedule_work(&reg_work);
+
+ return 0;
+}
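The _bh locking above deserves a note: reg_pending_beacons_lock is taken both here, where scan processing may run in softirq (BH) context, and from the regulatory work in process context, so the process-context holder must keep bottom halves disabled or a softirq on the same CPU could interrupt it and spin on the lock forever. A minimal sketch of that rule with hypothetical names, not taken from this patch:

static DEFINE_SPINLOCK(shared_lock);
static LIST_HEAD(shared_list);

/* May be called from process context or from a softirq (BH) */
static void producer(struct list_head *entry)
{
	spin_lock_bh(&shared_lock);
	list_add_tail(entry, &shared_list);
	spin_unlock_bh(&shared_lock);
}

/* Runs in process context, e.g. from a workqueue */
static void consumer(void)
{
	spin_lock_bh(&shared_lock);	/* BHs stay disabled while held */
	while (!list_empty(&shared_list))
		list_del_init(shared_list.next);
	spin_unlock_bh(&shared_lock);
}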
+
static void print_rd_rules(const struct ieee80211_regdomain *rd)
{
unsigned int i;
@@ -1326,8 +1816,10 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
freq_range = &reg_rule->freq_range;
power_rule = &reg_rule->power_rule;
- /* There may not be documentation for max antenna gain
- * in certain regions */
+ /*
+ * There may not be documentation for max antenna gain
+ * in certain regions
+ */
if (power_rule->max_antenna_gain)
printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), "
"(%d mBi, %d mBm)\n",
@@ -1350,13 +1842,13 @@ static void print_regdomain(const struct ieee80211_regdomain *rd)
{
if (is_intersected_alpha2(rd->alpha2)) {
- struct wiphy *wiphy = NULL;
- struct cfg80211_registered_device *drv;
- if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
- if (last_request->wiphy) {
- wiphy = last_request->wiphy;
- drv = wiphy_to_dev(wiphy);
+ if (last_request->initiator ==
+ NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ struct cfg80211_registered_device *drv;
+ drv = cfg80211_drv_by_wiphy_idx(
+ last_request->wiphy_idx);
+ if (drv) {
printk(KERN_INFO "cfg80211: Current regulatory "
"domain updated by AP to: %c%c\n",
drv->country_ie_alpha2[0],
@@ -1422,7 +1914,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
{
const struct ieee80211_regdomain *intersected_rd = NULL;
struct cfg80211_registered_device *drv = NULL;
- struct wiphy *wiphy = NULL;
+ struct wiphy *request_wiphy;
/* Some basic sanity checks first */
if (is_world_regdom(rd->alpha2)) {
@@ -1439,23 +1931,27 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
if (!last_request)
return -EINVAL;
- /* Lets only bother proceeding on the same alpha2 if the current
+ /*
+ * Let's only bother proceeding on the same alpha2 if the current
* rd is non static (it means CRDA was present and was used last)
- * and the pending request came in from a country IE */
- if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) {
- /* If someone else asked us to change the rd lets only bother
- * checking if the alpha2 changes if CRDA was already called */
+ * and the pending request came in from a country IE
+ */
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+ /*
+ * If someone else asked us to change the rd, let's only bother
+ * checking if the alpha2 changes if CRDA was already called
+ */
if (!is_old_static_regdom(cfg80211_regdomain) &&
- !regdom_changed(rd->alpha2))
+ !regdom_changes(rd->alpha2))
return -EINVAL;
}
- wiphy = last_request->wiphy;
-
- /* Now lets set the regulatory domain, update all driver channels
+ /*
+ * Now let's set the regulatory domain, update all driver channels
* and finally inform them of what we have done, in case they want
* to review or adjust their own settings based on their own
- * internal EEPROM data */
+ * internal EEPROM data
+ */
if (WARN_ON(!reg_is_valid_request(rd->alpha2)))
return -EINVAL;
@@ -1467,21 +1963,25 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
return -EINVAL;
}
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
if (!last_request->intersect) {
int r;
- if (last_request->initiator != REGDOM_SET_BY_DRIVER) {
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) {
reset_regdomains();
cfg80211_regdomain = rd;
return 0;
}
- /* For a driver hint, lets copy the regulatory domain the
- * driver wanted to the wiphy to deal with conflicts */
+ /*
+ * For a driver hint, let's copy the regulatory domain the
+ * driver wanted to the wiphy to deal with conflicts
+ */
- BUG_ON(last_request->wiphy->regd);
+ BUG_ON(request_wiphy->regd);
- r = reg_copy_regd(&last_request->wiphy->regd, rd);
+ r = reg_copy_regd(&request_wiphy->regd, rd);
if (r)
return r;
@@ -1492,17 +1992,19 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
/* Intersection requires a bit more work */
- if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) {
+ if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
if (!intersected_rd)
return -EINVAL;
- /* We can trash what CRDA provided now.
+ /*
+ * We can trash what CRDA provided now.
* However if a driver requested this specific regulatory
- * domain we keep it for its private use */
- if (last_request->initiator == REGDOM_SET_BY_DRIVER)
- last_request->wiphy->regd = rd;
+ * domain we keep it for its private use
+ */
+ if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER)
+ request_wiphy->regd = rd;
else
kfree(rd);
@@ -1522,8 +2024,10 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
BUG_ON(!country_ie_regdomain);
if (rd != country_ie_regdomain) {
- /* Intersect what CRDA returned and our what we
- * had built from the Country IE received */
+ /*
+ * Intersect what CRDA returned with what we
+ * had built from the Country IE received
+ */
intersected_rd = regdom_intersect(rd, country_ie_regdomain);
@@ -1533,16 +2037,18 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
kfree(country_ie_regdomain);
country_ie_regdomain = NULL;
} else {
- /* This would happen when CRDA was not present and
+ /*
+ * This would happen when CRDA was not present and
* OLD_REGULATORY was enabled. We intersect our Country
- * IE rd and what was set on cfg80211 originally */
+ * IE rd and what was set on cfg80211 originally
+ */
intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
}
if (!intersected_rd)
return -EINVAL;
- drv = wiphy_to_dev(wiphy);
+ drv = wiphy_to_dev(request_wiphy);
drv->country_ie_alpha2[0] = rd->alpha2[0];
drv->country_ie_alpha2[1] = rd->alpha2[1];
@@ -1560,13 +2066,17 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
}
-/* Use this call to set the current regulatory domain. Conflicts with
+/*
+ * Use this call to set the current regulatory domain. Conflicts with
* multiple drivers can be ironed out later. Caller must've already
- * kmalloc'd the rd structure. Caller must hold cfg80211_drv_mutex */
+ * kmalloc'd the rd structure. Caller must hold cfg80211_mutex
+ */
int set_regdom(const struct ieee80211_regdomain *rd)
{
int r;
+ assert_cfg80211_lock();
+
/* Note that this doesn't update the wiphys, this is done below */
r = __set_regdom(rd);
if (r) {
@@ -1583,57 +2093,87 @@ int set_regdom(const struct ieee80211_regdomain *rd)
print_regdomain(cfg80211_regdomain);
+ nl80211_send_reg_change_event(last_request);
+
return r;
}
-/* Caller must hold cfg80211_drv_mutex */
+/* Caller must hold cfg80211_mutex */
void reg_device_remove(struct wiphy *wiphy)
{
+ struct wiphy *request_wiphy;
+
+ assert_cfg80211_lock();
+
+ request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
kfree(wiphy->regd);
- if (!last_request || !last_request->wiphy)
+ if (!last_request || !request_wiphy)
return;
- if (last_request->wiphy != wiphy)
+ if (request_wiphy != wiphy)
return;
- last_request->wiphy = NULL;
+ last_request->wiphy_idx = WIPHY_IDX_STALE;
last_request->country_ie_env = ENVIRON_ANY;
}
int regulatory_init(void)
{
- int err;
+ int err = 0;
reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
if (IS_ERR(reg_pdev))
return PTR_ERR(reg_pdev);
+ spin_lock_init(&reg_requests_lock);
+ spin_lock_init(&reg_pending_beacons_lock);
+
#ifdef CONFIG_WIRELESS_OLD_REGULATORY
cfg80211_regdomain = static_regdom(ieee80211_regdom);
printk(KERN_INFO "cfg80211: Using static regulatory domain info\n");
print_regdomain_info(cfg80211_regdomain);
- /* The old code still requests for a new regdomain and if
+ /*
+ * The old code still requests a new regdomain and if
* you have CRDA you get it updated, otherwise you get
* stuck with the static values. We ignore "EU" code as
- * that is not a valid ISO / IEC 3166 alpha2 */
+ * that is not a valid ISO / IEC 3166 alpha2
+ */
if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U')
- err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
- ieee80211_regdom, 0, ENVIRON_ANY);
+ err = regulatory_hint_core(ieee80211_regdom);
#else
cfg80211_regdomain = cfg80211_world_regdom;
- err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", 0, ENVIRON_ANY);
- if (err)
- printk(KERN_ERR "cfg80211: calling CRDA failed - "
- "unable to update world regulatory domain, "
- "using static definition\n");
+ err = regulatory_hint_core("00");
#endif
+ if (err) {
+ if (err == -ENOMEM)
+ return err;
+ /*
+ * N.B. kobject_uevent_env() can fail mainly when we're out of
+ * memory, which is handled and propagated appropriately above,
+ * but it can also fail during a netlink_broadcast() or during
+ * early boot for call_usermodehelper(). For now treat these
+ * errors as non-fatal.
+ */
+ printk(KERN_ERR "cfg80211: kobject_uevent_env() was unable "
+ "to call CRDA during init");
+#ifdef CONFIG_CFG80211_REG_DEBUG
+ /* We want to find out exactly why when debugging */
+ WARN_ON(err);
+#endif
+ }
return 0;
}
void regulatory_exit(void)
{
- mutex_lock(&cfg80211_drv_mutex);
+ struct regulatory_request *reg_request, *tmp;
+ struct reg_beacon *reg_beacon, *btmp;
+
+ cancel_work_sync(&reg_work);
+
+ mutex_lock(&cfg80211_mutex);
reset_regdomains();
@@ -1644,5 +2184,33 @@ void regulatory_exit(void)
platform_device_unregister(reg_pdev);
- mutex_unlock(&cfg80211_drv_mutex);
+ spin_lock_bh(&reg_pending_beacons_lock);
+ if (!list_empty(&reg_pending_beacons)) {
+ list_for_each_entry_safe(reg_beacon, btmp,
+ &reg_pending_beacons, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+ }
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ if (!list_empty(&reg_beacon_list)) {
+ list_for_each_entry_safe(reg_beacon, btmp,
+ &reg_beacon_list, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+ }
+
+ spin_lock(&reg_requests_lock);
+ if (!list_empty(&reg_requests_list)) {
+ list_for_each_entry_safe(reg_request, tmp,
+ &reg_requests_list, list) {
+ list_del(&reg_request->list);
+ kfree(reg_request);
+ }
+ }
+ spin_unlock(&reg_requests_lock);
+
+ mutex_unlock(&cfg80211_mutex);
}
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index fe8c83f34fb7..e37829a49dc4 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -6,6 +6,8 @@ extern const struct ieee80211_regdomain *cfg80211_regdomain;
bool is_world_regdom(const char *alpha2);
bool reg_is_valid_request(const char *alpha2);
+int regulatory_hint_user(const char *alpha2);
+
void reg_device_remove(struct wiphy *wiphy);
int regulatory_init(void);
@@ -14,26 +16,24 @@ void regulatory_exit(void);
int set_regdom(const struct ieee80211_regdomain *rd);
/**
- * __regulatory_hint - hint to the wireless core a regulatory domain
- * @wiphy: if the hint comes from country information from an AP, this
- * is required to be set to the wiphy that received the information
- * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain
- * should be in.
- * @country_ie_checksum: checksum of processed country IE, set this to 0
- * if the hint did not come from a country IE
- * @country_ie_env: the environment the IE told us we are in, %ENVIRON_*
- *
- * The Wireless subsystem can use this function to hint to the wireless core
- * what it believes should be the current regulatory domain by giving it an
- * ISO/IEC 3166 alpha2 country code it knows its regulatory domain should be
- * in.
+ * regulatory_hint_found_beacon - hints a beacon was found on a channel
+ * @wiphy: the wireless device on which the beacon was found
+ * @beacon_chan: the channel on which the beacon was found
+ * @gfp: context flags
*
- * Returns zero if all went fine, %-EALREADY if a regulatory domain had
- * already been set or other standard error codes.
+ * This informs the wireless core that a beacon from an AP was found on
+ * the channel provided. This allows the wireless core to make educated
+ * guesses on regulatory settings to help with world roaming. This is only used for
+ * world roaming -- when we do not know our current location. This is
+ * only useful on channels 12, 13 and 14 on the 2 GHz band as channels
+ * 1-11 are already enabled by the world regulatory domain; and on
+ * non-radar 5 GHz channels.
*
+ * Drivers do not need to call this; cfg80211 will do it for them after a scan
+ * on a newly found BSS.
*/
-extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
- const char *alpha2, u32 country_ie_checksum,
- enum environment_cap country_ie_env);
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+ struct ieee80211_channel *beacon_chan,
+ gfp_t gfp);
#endif /* __NET_WIRELESS_REG_H */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index b1893c863b97..280dbcd02c15 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -62,6 +62,18 @@ static void bss_release(struct kref *ref)
}
/* must hold dev->bss_lock! */
+void cfg80211_bss_age(struct cfg80211_registered_device *dev,
+ unsigned long age_secs)
+{
+ struct cfg80211_internal_bss *bss;
+ unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC);
+
+ list_for_each_entry(bss, &dev->bss_list, list) {
+ bss->ts -= age_jiffies;
+ }
+}
+
+/* must hold dev->bss_lock! */
void cfg80211_bss_expire(struct cfg80211_registered_device *dev)
{
struct cfg80211_internal_bss *bss, *tmp;
@@ -358,7 +370,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev,
found->pub.beacon_interval = res->pub.beacon_interval;
found->pub.tsf = res->pub.tsf;
found->pub.signal = res->pub.signal;
- found->pub.signal_type = res->pub.signal_type;
found->pub.capability = res->pub.capability;
found->ts = res->ts;
kref_put(&res->ref, bss_release);
@@ -380,8 +391,7 @@ struct cfg80211_bss *
cfg80211_inform_bss_frame(struct wiphy *wiphy,
struct ieee80211_channel *channel,
struct ieee80211_mgmt *mgmt, size_t len,
- s32 signal, enum cfg80211_signal_type sigtype,
- gfp_t gfp)
+ s32 signal, gfp_t gfp)
{
struct cfg80211_internal_bss *res;
size_t ielen = len - offsetof(struct ieee80211_mgmt,
@@ -389,7 +399,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
bool overwrite;
size_t privsz = wiphy->bss_priv_size;
- if (WARN_ON(sigtype == NL80211_BSS_SIGNAL_UNSPEC &&
+ if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC &&
(signal < 0 || signal > 100)))
return NULL;
@@ -403,7 +413,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN);
res->pub.channel = channel;
- res->pub.signal_type = sigtype;
res->pub.signal = signal;
res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
@@ -421,6 +430,9 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
if (!res)
return NULL;
+ if (res->pub.capability & WLAN_CAPABILITY_ESS)
+ regulatory_hint_found_beacon(wiphy, channel, gfp);
+
/* cfg80211_bss_update gives us a referenced result */
return &res->pub;
}
@@ -584,16 +596,25 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info,
}
}
+static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
+{
+ unsigned long end = jiffies;
+
+ if (end >= start)
+ return jiffies_to_msecs(end - start);
+
+ return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1);
+}
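elapsed_jiffies_msecs() converts the raw jiffies stamp in bss->ts into an age in milliseconds while accounting for the jiffies counter wrapping: when end < start it counts the ticks from start up to MAX_JIFFY_OFFSET plus the ticks accumulated after the wrap, and the caller below switches its printf format from %d to %u to match the unsigned result.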
static char *
-ieee80211_bss(struct iw_request_info *info,
- struct cfg80211_internal_bss *bss,
- char *current_ev, char *end_buf)
+ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
+ struct cfg80211_internal_bss *bss, char *current_ev,
+ char *end_buf)
{
struct iw_event iwe;
u8 *buf, *cfg, *p;
u8 *ie = bss->pub.information_elements;
- int rem = bss->pub.len_information_elements, i;
+ int rem = bss->pub.len_information_elements, i, sig;
bool ismesh = false;
memset(&iwe, 0, sizeof(iwe));
@@ -617,19 +638,28 @@ ieee80211_bss(struct iw_request_info *info,
current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
IW_EV_FREQ_LEN);
- if (bss->pub.signal_type != CFG80211_SIGNAL_TYPE_NONE) {
+ if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) {
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = IWEVQUAL;
iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED |
IW_QUAL_NOISE_INVALID |
- IW_QUAL_QUAL_INVALID;
- switch (bss->pub.signal_type) {
+ IW_QUAL_QUAL_UPDATED;
+ switch (wiphy->signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
- iwe.u.qual.level = bss->pub.signal / 100;
+ sig = bss->pub.signal / 100;
+ iwe.u.qual.level = sig;
iwe.u.qual.updated |= IW_QUAL_DBM;
+ if (sig < -110) /* rather bad */
+ sig = -110;
+ else if (sig > -40) /* perfect */
+ sig = -40;
+ /* will give a range of 0 .. 70 */
+ iwe.u.qual.qual = sig + 110;
break;
case CFG80211_SIGNAL_TYPE_UNSPEC:
iwe.u.qual.level = bss->pub.signal;
+ /* will give range 0 .. 100 */
+ iwe.u.qual.qual = bss->pub.signal;
break;
default:
/* not reached */
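As a worked example of the mBm branch above: a signal of -5500 mBm gives sig = -5500 / 100 = -55 dBm, which is inside the [-110, -40] clamp, so iwe.u.qual.level reports -55 and iwe.u.qual.qual becomes -55 + 110 = 55 on the 0..70 scale; in the UNSPEC case the driver's 0..100 value is passed through to both fields.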
@@ -763,8 +793,8 @@ ieee80211_bss(struct iw_request_info *info,
&iwe, buf);
memset(&iwe, 0, sizeof(iwe));
iwe.cmd = IWEVCUSTOM;
- sprintf(buf, " Last beacon: %dms ago",
- jiffies_to_msecs(jiffies - bss->ts));
+ sprintf(buf, " Last beacon: %ums ago",
+ elapsed_jiffies_msecs(bss->ts));
iwe.u.data.length = strlen(buf);
current_ev = iwe_stream_add_point(info, current_ev,
end_buf, &iwe, buf);
@@ -793,8 +823,8 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *dev,
spin_unlock_bh(&dev->bss_lock);
return -E2BIG;
}
- current_ev = ieee80211_bss(info, bss,
- current_ev, end_buf);
+ current_ev = ieee80211_bss(&dev->wiphy, info, bss,
+ current_ev, end_buf);
}
spin_unlock_bh(&dev->bss_lock);
return current_ev - buf;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 26a72b0797a0..efe3c5c92b2d 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -31,7 +31,7 @@ static ssize_t name ## _show(struct device *dev, \
return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \
}
-SHOW_FMT(index, "%d", idx);
+SHOW_FMT(index, "%d", wiphy_idx);
SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
static struct device_attribute ieee80211_dev_attrs[] = {
@@ -60,6 +60,8 @@ static int wiphy_suspend(struct device *dev, pm_message_t state)
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
+ rdev->suspend_at = get_seconds();
+
if (rdev->ops->suspend) {
rtnl_lock();
ret = rdev->ops->suspend(&rdev->wiphy);
@@ -74,6 +76,11 @@ static int wiphy_resume(struct device *dev)
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
+ /* Age scan results with time spent in suspend */
+ spin_lock_bh(&rdev->bss_lock);
+ cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
+ spin_unlock_bh(&rdev->bss_lock);
+
if (rdev->ops->resume) {
rtnl_lock();
ret = rdev->ops->resume(&rdev->wiphy);
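To make the suspend/resume aging concrete: wiphy_suspend() records get_seconds() in rdev->suspend_at, and on resume the difference is fed to cfg80211_bss_age(), which subtracts the equivalent number of jiffies from every stored bss->ts -- so a 30 second suspend makes each scan entry look 30 seconds older to the normal expiry logic instead of appearing artificially fresh.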
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 58e489fd4aed..b84a9b4fe96a 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -137,3 +137,100 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
return 0;
}
EXPORT_SYMBOL(cfg80211_wext_giwmode);
+
+
+int cfg80211_wext_giwrange(struct net_device *dev,
+ struct iw_request_info *info,
+ struct iw_point *data, char *extra)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct iw_range *range = (struct iw_range *) extra;
+ enum ieee80211_band band;
+ int c = 0;
+
+ if (!wdev)
+ return -EOPNOTSUPP;
+
+ data->length = sizeof(struct iw_range);
+ memset(range, 0, sizeof(struct iw_range));
+
+ range->we_version_compiled = WIRELESS_EXT;
+ range->we_version_source = 21;
+ range->retry_capa = IW_RETRY_LIMIT;
+ range->retry_flags = IW_RETRY_LIMIT;
+ range->min_retry = 0;
+ range->max_retry = 255;
+ range->min_rts = 0;
+ range->max_rts = 2347;
+ range->min_frag = 256;
+ range->max_frag = 2346;
+
+ range->encoding_size[0] = 5;
+ range->encoding_size[1] = 13;
+ range->num_encoding_sizes = 2;
+ range->max_encoding_tokens = 4;
+
+ range->max_qual.updated = IW_QUAL_NOISE_INVALID;
+
+ switch (wdev->wiphy->signal_type) {
+ case CFG80211_SIGNAL_TYPE_NONE:
+ break;
+ case CFG80211_SIGNAL_TYPE_MBM:
+ range->max_qual.level = -110;
+ range->max_qual.qual = 70;
+ range->avg_qual.qual = 35;
+ range->max_qual.updated |= IW_QUAL_DBM;
+ range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+ range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ break;
+ case CFG80211_SIGNAL_TYPE_UNSPEC:
+ range->max_qual.level = 100;
+ range->max_qual.qual = 100;
+ range->avg_qual.qual = 50;
+ range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+ range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+ break;
+ }
+
+ range->avg_qual.level = range->max_qual.level / 2;
+ range->avg_qual.noise = range->max_qual.noise / 2;
+ range->avg_qual.updated = range->max_qual.updated;
+
+ range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 |
+ IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
+
+ for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+ int i;
+ struct ieee80211_supported_band *sband;
+
+ sband = wdev->wiphy->bands[band];
+
+ if (!sband)
+ continue;
+
+ for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
+ struct ieee80211_channel *chan = &sband->channels[i];
+
+ if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
+ range->freq[c].i =
+ ieee80211_frequency_to_channel(
+ chan->center_freq);
+ range->freq[c].m = chan->center_freq;
+ range->freq[c].e = 6;
+ c++;
+ }
+ }
+ }
+ range->num_channels = c;
+ range->num_frequency = c;
+
+ IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
+ IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
+ IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
+
+ range->scan_capa |= IW_SCAN_CAPA_ESSID;
+
+ return 0;
+}
+EXPORT_SYMBOL(cfg80211_wext_giwrange);
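A note on the frequency table filled above: wireless-extensions express each frequency as a mantissa times ten to the exponent, so with range->freq[c].m set to the channel's center frequency in MHz and .e = 6 the value decodes as, for example, 2412 x 10^6 Hz = 2.412 GHz for channel 1, while .i carries the channel number itself.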
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 8f76f4009c24..9ca17b1ce52e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -951,10 +951,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
/*
* Incoming Call User Data.
*/
- if (skb->len >= 0) {
- skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
- makex25->calluserdata.cudlength = skb->len;
- }
+ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
+ makex25->calluserdata.cudlength = skb->len;
sk->sk_ack_backlog++;
@@ -1122,8 +1120,9 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
if (msg->msg_flags & MSG_OOB)
skb_queue_tail(&x25->interrupt_out_queue, skb);
else {
- len = x25_output(sk, skb);
- if (len < 0)
+ rc = x25_output(sk, skb);
+ len = rc;
+ if (rc < 0)
kfree_skb(skb);
else if (x25->qbitincl)
len++;
@@ -1608,7 +1607,7 @@ static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
SOCKOPS_WRAP(x25_proto, AF_X25);
-static struct packet_type x25_packet_type = {
+static struct packet_type x25_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_X25),
.func = x25_lapb_receive_frame,
};
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index e25ff62ab2a6..62a5425cc6aa 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -748,12 +748,51 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
schedule_work(&net->xfrm.state_hash_work);
}
+static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
+ struct flowi *fl, unsigned short family,
+ xfrm_address_t *daddr, xfrm_address_t *saddr,
+ struct xfrm_state **best, int *acq_in_progress,
+ int *error)
+{
+ /* Resolution logic:
+ * 1. There is a valid state with matching selector. Done.
+ * 2. Valid state with inappropriate selector. Skip.
+ *
+ * Entering area of "sysdeps".
+ *
+ * 3. If state is not valid, selector is temporary, it selects
+ * only session which triggered previous resolution. Key
+ * manager will do something to install a state with proper
+ * selector.
+ */
+ if (x->km.state == XFRM_STATE_VALID) {
+ if ((x->sel.family &&
+ !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
+ !security_xfrm_state_pol_flow_match(x, pol, fl))
+ return;
+
+ if (!*best ||
+ (*best)->km.dying > x->km.dying ||
+ ((*best)->km.dying == x->km.dying &&
+ (*best)->curlft.add_time < x->curlft.add_time))
+ *best = x;
+ } else if (x->km.state == XFRM_STATE_ACQ) {
+ *acq_in_progress = 1;
+ } else if (x->km.state == XFRM_STATE_ERROR ||
+ x->km.state == XFRM_STATE_EXPIRED) {
+ if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
+ security_xfrm_state_pol_flow_match(x, pol, fl))
+ *error = -ESRCH;
+ }
+}
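In the VALID branch above the preference order works out as: a state that is not dying beats one that is (lower km.dying wins), and between two states with the same dying status the one with the larger curlft.add_time -- i.e. the more recently added one -- is kept as *best.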
+
struct xfrm_state *
xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
struct flowi *fl, struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
unsigned short family)
{
+ static xfrm_address_t saddr_wildcard = { };
struct net *net = xp_net(pol);
unsigned int h;
struct hlist_node *entry;
@@ -773,40 +812,27 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
xfrm_state_addr_check(x, daddr, saddr, family) &&
tmpl->mode == x->props.mode &&
tmpl->id.proto == x->id.proto &&
- (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) {
- /* Resolution logic:
- 1. There is a valid state with matching selector.
- Done.
- 2. Valid state with inappropriate selector. Skip.
-
- Entering area of "sysdeps".
-
- 3. If state is not valid, selector is temporary,
- it selects only session which triggered
- previous resolution. Key manager will do
- something to install a state with proper
- selector.
- */
- if (x->km.state == XFRM_STATE_VALID) {
- if ((x->sel.family && !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
- !security_xfrm_state_pol_flow_match(x, pol, fl))
- continue;
- if (!best ||
- best->km.dying > x->km.dying ||
- (best->km.dying == x->km.dying &&
- best->curlft.add_time < x->curlft.add_time))
- best = x;
- } else if (x->km.state == XFRM_STATE_ACQ) {
- acquire_in_progress = 1;
- } else if (x->km.state == XFRM_STATE_ERROR ||
- x->km.state == XFRM_STATE_EXPIRED) {
- if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
- security_xfrm_state_pol_flow_match(x, pol, fl))
- error = -ESRCH;
- }
- }
+ (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+ xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+ &best, &acquire_in_progress, &error);
+ }
+ if (best)
+ goto found;
+
+ h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
+ hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
+ if (x->props.family == family &&
+ x->props.reqid == tmpl->reqid &&
+ !(x->props.flags & XFRM_STATE_WILDRECV) &&
+ xfrm_state_addr_check(x, daddr, saddr, family) &&
+ tmpl->mode == x->props.mode &&
+ tmpl->id.proto == x->id.proto &&
+ (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+ xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+ &best, &acquire_in_progress, &error);
}
+found:
x = best;
if (!x && !error && !acquire_in_progress) {
if (tmpl->id.spi &&