summary refs log tree commit diff stats
path: root/src/drivers/net
diff options
context:
space:
mode:
author Michael Brown 2007-10-29 18:21:58 +0100
committer Michael Brown 2007-10-29 18:21:58 +0100
commit 1620b3512c3ebd348e81bf31c31a5554479a175f (patch)
tree be870f107579350ea755e32a5ffe9d17f9ac0ff3 /src/drivers/net
parent Ensure that empty e820 regions are skipped even at the end of the (diff)
parent Change ROM names to lower case. (diff)
download ipxe-1620b3512c3ebd348e81bf31c31a5554479a175f.tar.gz
ipxe-1620b3512c3ebd348e81bf31c31a5554479a175f.tar.xz
ipxe-1620b3512c3ebd348e81bf31c31a5554479a175f.zip
Merge branch '3leaf'
Diffstat (limited to 'src/drivers/net')
-rw-r--r-- src/drivers/net/ipoib.c 930
1 files changed, 930 insertions, 0 deletions
diff --git a/src/drivers/net/ipoib.c b/src/drivers/net/ipoib.c
new file mode 100644
index 00000000..784c0720
--- /dev/null
+++ b/src/drivers/net/ipoib.c
@@ -0,0 +1,930 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <byteswap.h>
+#include <errno.h>
+#include "timer.h"
+#include <gpxe/if_arp.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/infiniband.h>
+#include <gpxe/ipoib.h>
+
+/** @file
+ *
+ * IP over Infiniband
+ */
+
+/** IPoIB MTU
+ *
+ * Also the size of each receive I/O buffer allocated by
+ * ipoib_refill_recv().
+ */
+#define IPOIB_MTU 2048
+
+/** Number of IPoIB data send work queue entries
+ *
+ * Passed to ipoib_create_qset() for the data queue set.
+ */
+#define IPOIB_DATA_NUM_SEND_WQES 2
+
+/** Number of IPoIB data receive work queue entries
+ *
+ * Passed to ipoib_create_qset() for the data queue set, where it
+ * also becomes the maximum receive fill level.
+ */
+#define IPOIB_DATA_NUM_RECV_WQES 4
+
+/** Number of IPoIB data completion entries
+ *
+ * Passed to ipoib_create_qset() for the data queue set.
+ */
+#define IPOIB_DATA_NUM_CQES 8
+
+/** Number of IPoIB metadata send work queue entries
+ *
+ * Passed to ipoib_create_qset() for the metadata queue set.
+ */
+#define IPOIB_META_NUM_SEND_WQES 2
+
+/** Number of IPoIB metadata receive work queue entries
+ *
+ * Passed to ipoib_create_qset() for the metadata queue set, where
+ * it also becomes the maximum receive fill level.
+ */
+#define IPOIB_META_NUM_RECV_WQES 2
+
+/** Number of IPoIB metadata completion entries
+ *
+ * Passed to ipoib_create_qset() for the metadata queue set.
+ */
+#define IPOIB_META_NUM_CQES 8
+
+/** An IPoIB queue set
+ *
+ * Groups a queue pair with its completion queue.
+ * ipoib_create_qset() passes the same completion queue to
+ * ib_create_qp() twice, so one queue carries all completions for
+ * the pair.
+ */
+struct ipoib_queue_set {
+ /** Completion queue */
+ struct ib_completion_queue *cq;
+ /** Queue pair */
+ struct ib_queue_pair *qp;
+ /** Receive work queue fill level
+  *
+  * Incremented by ipoib_refill_recv() for every buffer it posts
+  * and decremented by the receive completion handlers.
+  */
+ unsigned int recv_fill;
+ /** Receive work queue maximum fill level
+  *
+  * Set by ipoib_create_qset() to the number of receive work queue
+  * entries; ipoib_refill_recv() stops posting at this level.
+  */
+ unsigned int recv_max_fill;
+};
+
+/** An IPoIB device
+ *
+ * Stored as the private data of the network device allocated in
+ * ipoib_probe().
+ */
+struct ipoib_device {
+ /** Network device */
+ struct net_device *netdev;
+ /** Underlying Infiniband device */
+ struct ib_device *ibdev;
+ /** Data queue set */
+ struct ipoib_queue_set data;
+ /** Metadata queue set
+  *
+  * Used for path record and multicast member record requests and
+  * their replies.
+  */
+ struct ipoib_queue_set meta;
+ /** Broadcast GID
+  *
+  * Built by ipoib_probe() from ipv4_broadcast_gid with
+  * ibdev->pkey written into words[2].
+  */
+ struct ib_gid broadcast_gid;
+ /** Broadcast LID
+  *
+  * Taken from the mlid field of the multicast member record reply.
+  */
+ unsigned int broadcast_lid;
+ /** Joined to broadcast group
+  *
+  * Low four bits of scope__join_state from the multicast member
+  * record reply; non-zero ends the wait in
+  * ipoib_join_broadcast_group().
+  */
+ int broadcast_joined;
+ /** Data queue key
+  *
+  * Taken from the qkey field of the multicast member record reply
+  * and passed to ipoib_create_qset() for the data queue set.
+  */
+ unsigned long data_qkey;
+};
+
+/**
+ * IPoIB path cache entry
+ *
+ * This serves a similar role to the ARP cache for Ethernet. (ARP
+ * *is* used on IPoIB; we have two caches to maintain.)
+ *
+ * Entries are written by ipoib_recv_path_record() and looked up by
+ * GID in ipoib_find_cached_path().
+ */
+struct ipoib_cached_path {
+ /** Destination GID (copied from the path record's dgid) */
+ struct ib_gid gid;
+ /** Destination LID (path record dlid, in host byte order) */
+ unsigned int dlid;
+ /** Service level (low four bits of reserved__sl) */
+ unsigned int sl;
+ /** Rate (low six bits of rate_selector__rate) */
+ unsigned int rate;
+};
+
+/** Number of IPoIB path cache entries */
+#define IPOIB_NUM_CACHED_PATHS 2
+
+/** IPoIB path cache
+ *
+ * A single file-scope table; it is not part of struct ipoib_device.
+ */
+static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
+
+/** Oldest IPoIB path cache entry index
+ *
+ * This is the entry that ipoib_recv_path_record() overwrites next;
+ * the index wraps to zero after the last entry.
+ */
+static unsigned int ipoib_path_cache_idx = 0;
+
+/** TID half used to identify get path record replies
+ *
+ * Written to tid[0] of the request and matched against tid[0] of
+ * each reply in ipoib_meta_complete_recv().
+ */
+#define IPOIB_TID_GET_PATH_REC 0x11111111UL
+
+/** TID half used to identify multicast member record replies
+ *
+ * Written to tid[0] of the request and matched against tid[0] of
+ * each reply in ipoib_meta_complete_recv().
+ */
+#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL
+
+/** IPoIB metadata TID
+ *
+ * Written to tid[1] of each metadata request and then incremented.
+ */
+static uint32_t ipoib_meta_tid = 0;
+
+/** IPv4 broadcast GID
+ *
+ * Template only: ipoib_probe() copies it and overwrites words[2]
+ * with ibdev->pkey to form the per-device broadcast GID.
+ */
+static const struct ib_gid ipv4_broadcast_gid = {
+ { { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
+};
+
+/** Maximum time we will wait for the broadcast join to succeed
+ *
+ * ipoib_join_broadcast_group() polls once per millisecond for at
+ * most this many iterations.
+ */
+#define IPOIB_JOIN_MAX_DELAY_MS 1000
+
+/****************************************************************************
+ *
+ * IPoIB link layer
+ *
+ ****************************************************************************
+ */
+
+/** Broadcast QPN used in IPoIB MAC addresses
+ *
+ * This is guaranteed not to be a valid real QPN.  ipoib_transmit()
+ * treats a peer QPN equal to this value (in network byte order) as
+ * a request to send to the broadcast group.
+ */
+#define IPOIB_BROADCAST_QPN 0xffffffffUL
+
+/** Broadcast IPoIB address
+ *
+ * Only the QPN is initialised here; the remaining members are
+ * implicitly zero.
+ */
+static struct ipoib_mac ipoib_broadcast = {
+ .qpn = ntohl ( IPOIB_BROADCAST_QPN ),
+};
+
+/**
+ * Transmit IPoIB packet
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v net_protocol	Network-layer protocol
+ * @v ll_dest		Link-layer destination address
+ * @ret rc		Return status code from netdev_tx()
+ *
+ * Pushes an IPoIB link-layer header onto the front of the buffer,
+ * records the destination in its pseudo-header and the protocol in
+ * its real header, then hands the buffer to the network device.
+ */
+static int ipoib_tx ( struct io_buffer *iobuf, struct net_device *netdev,
+		      struct net_protocol *net_protocol,
+		      const void *ll_dest ) {
+	struct ipoib_hdr *hdr;
+
+	/* Reserve space for the link-layer header */
+	hdr = iob_push ( iobuf, sizeof ( *hdr ) );
+
+	/* Destination goes in the pseudo-header, protocol in the real one */
+	memcpy ( &hdr->pseudo.peer, ll_dest, sizeof ( hdr->pseudo.peer ) );
+	hdr->real.proto = net_protocol->net_proto;
+	hdr->real.reserved = 0;
+
+	/* Queue on the network device */
+	return netdev_tx ( netdev, iobuf );
+}
+
+/**
+ * Process received IPoIB packet
+ *
+ * @v iobuf	I/O buffer
+ * @v netdev	Network device
+ * @ret rc	Return status code
+ *
+ * Removes the IPoIB link-layer header and passes the remainder to
+ * the network-layer protocol named in that header.  A buffer too
+ * short to hold the header is freed and rejected with -EINVAL.
+ */
+static int ipoib_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
+	struct ipoib_hdr *hdr = iobuf->data;
+
+	/* Accept only buffers that hold a complete header */
+	if ( iob_len ( iobuf ) >= sizeof ( *hdr ) ) {
+		/* hdr keeps pointing at the header bytes after the pull */
+		iob_pull ( iobuf, sizeof ( *hdr ) );
+		return net_rx ( iobuf, netdev, hdr->real.proto,
+				&hdr->pseudo.peer );
+	}
+
+	/* Runt packet: report, discard and fail */
+	DBG ( "IPoIB packet too short for link-layer header\n" );
+	DBG_HD ( iobuf->data, iob_len ( iobuf ) );
+	free_iob ( iobuf );
+	return -EINVAL;
+}
+
+/**
+ * Transcribe IPoIB address
+ *
+ * @v ll_addr	Link-layer address
+ * @ret string	Link-layer address in human-readable format
+ *
+ * The result is five colon-separated eight-digit hex groups (QPN,
+ * then the four GID dwords).  It lives in a static buffer, so each
+ * call overwrites the string returned by the previous one.
+ */
+const char * ipoib_ntoa ( const void *ll_addr ) {
+	/* 5 groups of 8 digits + 4 separators + terminating NUL */
+	static char text[45];
+	const struct ipoib_mac *addr = ll_addr;
+
+	snprintf ( text, sizeof ( text ), "%08lx:%08lx:%08lx:%08lx:%08lx",
+		   htonl ( addr->qpn ),
+		   htonl ( addr->gid.u.dwords[0] ),
+		   htonl ( addr->gid.u.dwords[1] ),
+		   htonl ( addr->gid.u.dwords[2] ),
+		   htonl ( addr->gid.u.dwords[3] ) );
+
+	return text;
+}
+
+/** IPoIB protocol
+ *
+ * Link-layer protocol descriptor tying together the IPoIB
+ * transmit, receive and address-transcription routines defined
+ * above, with ipoib_broadcast as the broadcast address.
+ */
+struct ll_protocol ipoib_protocol __ll_protocol = {
+ .name = "IPoIB",
+ .ll_proto = htons ( ARPHRD_INFINIBAND ),
+ .ll_addr_len = IPOIB_ALEN,
+ .ll_header_len = IPOIB_HLEN,
+ .ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
+ .tx = ipoib_tx,
+ .rx = ipoib_rx,
+ .ntoa = ipoib_ntoa,
+};
+
+/****************************************************************************
+ *
+ * IPoIB network device
+ *
+ ****************************************************************************
+ */
+
+/**
+ * Destroy queue set
+ *
+ * @v ipoib	IPoIB device
+ * @v qset	Queue set
+ *
+ * Destroys whichever of the queue pair and completion queue exist
+ * and then zeroes the queue set, so calling this on an empty or
+ * partially created set is safe.
+ */
+static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
+				 struct ipoib_queue_set *qset ) {
+	/* Queue pair first, then the completion queue */
+	if ( qset->qp ) {
+		ib_destroy_qp ( ipoib->ibdev, qset->qp );
+	}
+	if ( qset->cq ) {
+		ib_destroy_cq ( ipoib->ibdev, qset->cq );
+	}
+
+	/* Forget the stale pointers and fill levels */
+	memset ( qset, 0, sizeof ( *qset ) );
+}
+
+/**
+ * Create queue set
+ *
+ * @v ipoib		IPoIB device
+ * @v qset		Queue set
+ * @v num_cqes		Number of completion queue entries
+ * @v num_send_wqes	Number of send work queue entries
+ * @v num_recv_wqes	Number of receive work queue entries
+ * @v qkey		Queue key
+ * @ret rc		Return status code
+ *
+ * Creates a completion queue and a queue pair that uses it for
+ * both directions.  On failure anything already created is
+ * destroyed and the queue set is left zeroed.
+ */
+static int ipoib_create_qset ( struct ipoib_device *ipoib,
+			       struct ipoib_queue_set *qset,
+			       unsigned int num_cqes,
+			       unsigned int num_send_wqes,
+			       unsigned int num_recv_wqes,
+			       unsigned long qkey ) {
+	struct ib_device *ibdev = ipoib->ibdev;
+	/* Both failure modes below are allocation failures */
+	int rc = -ENOMEM;
+
+	/* The receive ring may be filled up to its full size */
+	qset->recv_max_fill = num_recv_wqes;
+
+	/* Completion queue */
+	qset->cq = ib_create_cq ( ibdev, num_cqes );
+	if ( ! qset->cq ) {
+		DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
+		       ipoib );
+		goto fail;
+	}
+
+	/* Queue pair, sharing the completion queue for send and receive */
+	qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
+				  num_recv_wqes, qset->cq, qkey );
+	if ( ! qset->qp ) {
+		DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
+		       ipoib );
+		goto fail;
+	}
+
+	/* Completion handlers recover the net device from the queue pair */
+	qset->qp->owner_priv = ipoib->netdev;
+
+	return 0;
+
+ fail:
+	ipoib_destroy_qset ( ipoib, qset );
+	return rc;
+}
+
+/**
+ * Find path cache entry by GID
+ *
+ * @v gid	GID
+ * @ret entry	Path cache entry, or NULL
+ *
+ * Scans the whole path cache for an entry whose GID is
+ * byte-for-byte equal to @c gid.
+ */
+static struct ipoib_cached_path *
+ipoib_find_cached_path ( struct ib_gid *gid ) {
+	struct ipoib_cached_path *entry = ipoib_path_cache;
+	struct ipoib_cached_path *end =
+		( ipoib_path_cache + IPOIB_NUM_CACHED_PATHS );
+
+	for ( ; entry < end ; entry++ ) {
+		if ( memcmp ( &entry->gid, gid, sizeof ( *gid ) ) == 0 ) {
+			return entry;
+		}
+	}
+
+	/* Nothing matched */
+	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
+	      htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
+	      htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
+	return NULL;
+}
+
+/**
+ * Transmit path record request
+ *
+ * @v ipoib	IPoIB device
+ * @v gid	Destination GID
+ * @ret rc	Return status code
+ *
+ * Builds a "get path record" request for the path from this port's
+ * GID to @c gid and posts it on the metadata queue pair.  The reply
+ * is handled asynchronously by ipoib_meta_complete_recv().
+ */
+static int ipoib_get_path_record ( struct ipoib_device *ipoib,
+				   struct ib_gid *gid ) {
+	struct ib_device *ibdev = ipoib->ibdev;
+	struct ib_mad_path_record *request;
+	struct ib_address_vector av;
+	struct io_buffer *iobuf;
+	int rc;
+
+	/* Allocate a zeroed buffer holding exactly one request */
+	iobuf = alloc_iob ( sizeof ( *request ) );
+	if ( ! iobuf ) {
+		return -ENOMEM;
+	}
+	iob_put ( iobuf, sizeof ( *request ) );
+	request = iobuf->data;
+	memset ( request, 0, sizeof ( *request ) );
+
+	/* Management datagram header */
+	request->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+	request->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	request->mad_hdr.class_version = 2;
+	request->mad_hdr.method = IB_MGMT_METHOD_GET;
+	request->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
+	/* tid[0] marks the reply type, tid[1] is a running counter */
+	request->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
+	request->mad_hdr.tid[1] = ipoib_meta_tid++;
+
+	/* Query by destination and source GID */
+	request->sa_hdr.comp_mask[1] =
+		htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
+	memcpy ( &request->dgid, gid, sizeof ( request->dgid ) );
+	memcpy ( &request->sgid, &ibdev->port_gid,
+		 sizeof ( request->sgid ) );
+
+	/* Address the request to IB_SA_QPN at the device's sm_lid */
+	memset ( &av, 0, sizeof ( av ) );
+	av.dlid = ibdev->sm_lid;
+	av.dest_qp = IB_SA_QPN;
+	av.qkey = IB_GLOBAL_QKEY;
+
+	/* Post on the metadata queue pair */
+	rc = ib_post_send ( ibdev, ipoib->meta.qp, &av, iobuf );
+	if ( rc == 0 ) {
+		return 0;
+	}
+
+	/* The buffer was not accepted, so it is still ours to free */
+	DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
+	       ipoib, strerror ( rc ) );
+	free_iob ( iobuf );
+	return rc;
+}
+
+/**
+ * Transmit multicast group membership request
+ *
+ * @v ipoib	IPoIB device
+ * @v gid	Multicast GID
+ * @v join	Join (rather than leave) group
+ * @ret rc	Return status code
+ *
+ * Builds a multicast member record request for @c gid on behalf of
+ * this port's GID and posts it on the metadata queue pair.  @c join
+ * selects between the SET and DELETE management methods.  The reply
+ * is handled asynchronously by ipoib_meta_complete_recv().
+ */
+static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
+				    struct ib_gid *gid, int join ) {
+	struct ib_device *ibdev = ipoib->ibdev;
+	struct io_buffer *iobuf;
+	struct ib_mad_mc_member_record *mc_member_record;
+	struct ib_address_vector av;
+	int rc;
+
+	/* Allocate I/O buffer */
+	iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
+	if ( ! iobuf )
+		return -ENOMEM;
+	iob_put ( iobuf, sizeof ( *mc_member_record ) );
+	mc_member_record = iobuf->data;
+	memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
+
+	/* Construct multicast member record request */
+	mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+	mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	mc_member_record->mad_hdr.class_version = 2;
+	mc_member_record->mad_hdr.method =
+		( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
+	mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
+	/* tid[0] marks the reply type, tid[1] is a running counter */
+	mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
+	mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
+	mc_member_record->sa_hdr.comp_mask[1] =
+		htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+			IB_SA_MCMEMBER_REC_JOIN_STATE );
+	mc_member_record->scope__join_state = 1;
+	memcpy ( &mc_member_record->mgid, gid,
+		 sizeof ( mc_member_record->mgid ) );
+	memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
+		 sizeof ( mc_member_record->port_gid ) );
+
+	/* Construct address vector */
+	memset ( &av, 0, sizeof ( av ) );
+	av.dlid = ibdev->sm_lid;
+	av.dest_qp = IB_SA_QPN;
+	av.qkey = IB_GLOBAL_QKEY;
+
+	/* Post send request; on failure the buffer is still ours to free */
+	if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
+				   iobuf ) ) != 0 ) {
+		DBGC ( ipoib, "IPoIB %p could not send multicast member "
+		       "record: %s\n", ipoib, strerror ( rc ) );
+		free_iob ( iobuf );
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * Transmit packet via IPoIB network device
+ *
+ * @v netdev	Network device
+ * @v iobuf	I/O buffer
+ * @ret rc	Return status code
+ *
+ * Strips the pseudo-header, builds an address vector from it and
+ * posts the packet on the data queue pair.  A unicast packet whose
+ * destination GID is not in the path cache is dropped (completed
+ * without error) after a path record request has been sent, so a
+ * later retransmission can succeed.
+ *
+ * Every non-zero return leaves the buffer uncompleted.
+ * NOTE(review): this assumes the caller completes the buffer
+ * whenever the transmit method fails - confirm against netdev_tx().
+ */
+static int ipoib_transmit ( struct net_device *netdev,
+			    struct io_buffer *iobuf ) {
+	struct ipoib_device *ipoib = netdev->priv;
+	struct ib_device *ibdev = ipoib->ibdev;
+	struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
+	struct ib_address_vector av;
+	struct ib_gid *gid;
+	struct ipoib_cached_path *path;
+	int rc;
+
+	/* Sanity check */
+	if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
+		DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
+		return -EINVAL;
+	}
+	/* ipoib_pshdr keeps pointing at the stripped pseudo-header */
+	iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
+
+	/* Construct address vector */
+	memset ( &av, 0, sizeof ( av ) );
+	av.qkey = IB_GLOBAL_QKEY;
+	av.gid_present = 1;
+	if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
+		/* Broadcast address */
+		av.dest_qp = IB_BROADCAST_QPN;
+		av.dlid = ipoib->broadcast_lid;
+		gid = &ipoib->broadcast_gid;
+	} else {
+		/* Unicast - look in path cache */
+		path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
+		if ( ! path ) {
+			/* No path entry - get path record */
+			if ( ( rc = ipoib_get_path_record ( ipoib,
+					&ipoib_pshdr->peer.gid ) ) != 0 ) {
+				/* Fail like the other error paths:
+				 * do not complete the buffer here as
+				 * well as reporting an error.
+				 */
+				return rc;
+			}
+			/* Request sent; silently drop this packet */
+			netdev_tx_complete ( netdev, iobuf );
+			return 0;
+		}
+		av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
+		av.dlid = path->dlid;
+		av.rate = path->rate;
+		av.sl = path->sl;
+		gid = &ipoib_pshdr->peer.gid;
+	}
+	memcpy ( &av.gid, gid, sizeof ( av.gid ) );
+
+	return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
+}
+
+/**
+ * Handle IPoIB data send completion
+ *
+ * @v ibdev		Infiniband device
+ * @v qp		Queue pair
+ * @v completion	Completion
+ * @v iobuf		I/O buffer
+ *
+ * Completes the transmitted buffer on the owning network device,
+ * reporting -EIO if the completion carries a non-zero syndrome.
+ */
+static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
+				       struct ib_queue_pair *qp,
+				       struct ib_completion *completion,
+				       struct io_buffer *iobuf ) {
+	struct net_device *netdev = qp->owner_priv;
+	int rc;
+
+	if ( completion->syndrome ) {
+		rc = -EIO;
+	} else {
+		rc = 0;
+	}
+
+	netdev_tx_complete_err ( netdev, iobuf, rc );
+}
+
+/**
+ * Handle IPoIB data receive completion
+ *
+ * @v ibdev		Infiniband device
+ * @v qp		Queue pair
+ * @v completion	Completion
+ * @v iobuf		I/O buffer
+ *
+ * Strips the global route header, prepends a zeroed pseudo-header
+ * and passes the packet to the network device.  Completions with a
+ * non-zero syndrome and packets too short for the GRH or the real
+ * IPoIB header are reported via netdev_rx_err().  In every case the
+ * data receive fill level drops by one.
+ */
+static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
+				       struct ib_queue_pair *qp,
+				       struct ib_completion *completion,
+				       struct io_buffer *iobuf ) {
+	struct net_device *netdev = qp->owner_priv;
+	struct ipoib_device *ipoib = netdev->priv;
+	struct ipoib_pseudo_hdr *ipoib_pshdr;
+
+	if ( completion->syndrome ) {
+		netdev_rx_err ( netdev, iobuf, -EIO );
+		goto done;
+	}
+
+	iob_put ( iobuf, completion->len );
+	if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
+		DBGC ( ipoib, "IPoIB %p received data packet too short to "
+		       "contain GRH\n", ipoib );
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
+		netdev_rx_err ( netdev, iobuf, -EIO );
+		goto done;
+	}
+	iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
+
+	if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
+		DBGC ( ipoib, "IPoIB %p received data packet too short to "
+		       "contain IPoIB header\n", ipoib );
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
+		netdev_rx_err ( netdev, iobuf, -EIO );
+		goto done;
+	}
+
+	/* Restore room for the pseudo-header that ipoib_rx() expects.
+	 * The sender's address is not reconstructed yet (FIXME: fill
+	 * in a MAC address for the sake of AoE!), so zero the space
+	 * rather than passing up the stale bytes that precede the
+	 * payload.
+	 */
+	ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
+	memset ( ipoib_pshdr, 0, sizeof ( *ipoib_pshdr ) );
+
+	netdev_rx ( netdev, iobuf );
+
+ done:
+	ipoib->data.recv_fill--;
+}
+
+/**
+ * Handle IPoIB metadata send completion
+ *
+ * @v ibdev		Infiniband device
+ * @v qp		Queue pair
+ * @v completion	Completion
+ * @v iobuf		I/O buffer
+ *
+ * Logs a non-zero completion syndrome and frees the request
+ * buffer in every case.
+ */
+static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
+				       struct ib_queue_pair *qp,
+				       struct ib_completion *completion,
+				       struct io_buffer *iobuf ) {
+	struct net_device *netdev = qp->owner_priv;
+	struct ipoib_device *ipoib = netdev->priv;
+
+	/* Nothing is retried here; errors are only reported */
+	if ( completion->syndrome != 0 ) {
+		DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
+		       ipoib, completion->syndrome );
+	}
+
+	free_iob ( iobuf );
+}
+
+/**
+ * Handle received IPoIB path record
+ *
+ * @v ipoib		IPoIB device
+ * @v path_record	Path record
+ *
+ * Overwrites the oldest path cache entry with the destination GID,
+ * LID, service level and rate from the record, then advances the
+ * "oldest entry" index with wrap-around.
+ */
+static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
+				     struct ib_mad_path_record *path_record ) {
+	struct ipoib_cached_path *entry =
+		&ipoib_path_cache[ipoib_path_cache_idx];
+
+	/* Replace the oldest entry */
+	memcpy ( &entry->gid, &path_record->dgid, sizeof ( entry->gid ) );
+	entry->dlid = ntohs ( path_record->dlid );
+	entry->sl = ( path_record->reserved__sl & 0x0f );
+	entry->rate = ( path_record->rate_selector__rate & 0x3f );
+
+	DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
+	      htonl ( entry->gid.u.dwords[0] ),
+	      htonl ( entry->gid.u.dwords[1] ),
+	      htonl ( entry->gid.u.dwords[2] ),
+	      htonl ( entry->gid.u.dwords[3] ),
+	      entry->dlid, entry->sl, entry->rate );
+
+	/* Next entry becomes the oldest; wrap at the end of the table */
+	ipoib_path_cache_idx =
+		( ( ipoib_path_cache_idx + 1 ) % IPOIB_NUM_CACHED_PATHS );
+}
+
+/**
+ * Handle received IPoIB multicast membership record
+ *
+ * @v ipoib		IPoIB device
+ * @v mc_member_record	Multicast membership record
+ *
+ * Stores the join state, queue key and multicast LID from the
+ * record in the IPoIB device.
+ */
+static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
+			struct ib_mad_mc_member_record *mc_member_record ) {
+	const char *verb;
+
+	/* Only the low four bits of the field are the join state */
+	ipoib->broadcast_joined =
+		( mc_member_record->scope__join_state & 0x0f );
+	/* Convert the multi-byte fields to host byte order */
+	ipoib->data_qkey = ntohl ( mc_member_record->qkey );
+	ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
+
+	if ( ipoib->broadcast_joined ) {
+		verb = "joined";
+	} else {
+		verb = "left";
+	}
+	DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
+	       ipoib, verb, ipoib->data_qkey, ipoib->broadcast_lid );
+}
+
+/**
+ * Handle IPoIB metadata receive completion
+ *
+ * @v ibdev		Infiniband device
+ * @v qp		Queue pair
+ * @v completion	Completion
+ * @v iobuf		I/O buffer
+ *
+ * Validates the reply, strips the global route header and
+ * dispatches on tid[0] to the path record or multicast member
+ * record handler.  The buffer is always freed and the metadata
+ * receive fill level always drops by one.
+ */
+static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
+				       struct ib_queue_pair *qp,
+				       struct ib_completion *completion,
+				       struct io_buffer *iobuf ) {
+	struct net_device *netdev = qp->owner_priv;
+	struct ipoib_device *ipoib = netdev->priv;
+	union ib_mad *mad;
+
+	/* Failed completions carry nothing usable */
+	if ( completion->syndrome ) {
+		DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
+		       ipoib, completion->syndrome );
+		goto done;
+	}
+
+	/* Strip the global route header */
+	iob_put ( iobuf, completion->len );
+	if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
+		DBGC ( ipoib, "IPoIB %p received metadata packet too short "
+		       "to contain GRH\n", ipoib );
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
+		goto done;
+	}
+	iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
+
+	/* What remains must hold a complete MAD */
+	if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
+		DBGC ( ipoib, "IPoIB %p received metadata packet too short "
+		       "to contain reply\n", ipoib );
+		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
+		goto done;
+	}
+	mad = iobuf->data;
+
+	/* Ignore replies that report an error status */
+	if ( mad->mad_hdr.status != 0 ) {
+		DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
+		       ipoib, ntohs ( mad->mad_hdr.status ) );
+		goto done;
+	}
+
+	/* tid[0] echoes the marker placed in the matching request */
+	if ( mad->mad_hdr.tid[0] == IPOIB_TID_GET_PATH_REC ) {
+		ipoib_recv_path_record ( ipoib, &mad->path_record );
+	} else if ( mad->mad_hdr.tid[0] == IPOIB_TID_MC_MEMBER_REC ) {
+		ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
+	} else {
+		DBGC ( ipoib, "IPoIB %p unwanted response:\n",
+		       ipoib );
+		DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
+	}
+
+ done:
+	ipoib->meta.recv_fill--;
+	free_iob ( iobuf );
+}
+
+/**
+ * Refill IPoIB receive ring
+ *
+ * @v ipoib	IPoIB device
+ * @v qset	Queue set
+ *
+ * Posts fresh IPOIB_MTU-sized receive buffers until the queue set
+ * reaches its maximum fill level.  Stops early, without reporting
+ * an error, if a buffer cannot be allocated or posted.
+ */
+static void ipoib_refill_recv ( struct ipoib_device *ipoib,
+				struct ipoib_queue_set *qset ) {
+	struct io_buffer *iobuf;
+
+	for ( ; ; ) {
+		/* Ring already full? */
+		if ( qset->recv_fill >= qset->recv_max_fill ) {
+			return;
+		}
+
+		/* Out of memory: try again on a later call */
+		iobuf = alloc_iob ( IPOIB_MTU );
+		if ( ! iobuf ) {
+			return;
+		}
+
+		/* A rejected buffer is still ours to free */
+		if ( ib_post_recv ( ipoib->ibdev, qset->qp, iobuf ) != 0 ) {
+			free_iob ( iobuf );
+			return;
+		}
+
+		qset->recv_fill++;
+	}
+}
+
+/**
+ * Poll IPoIB network device
+ *
+ * @v netdev	Network device
+ *
+ * Polls the metadata and then the data completion queue, and
+ * afterwards tops up both receive rings.
+ */
+static void ipoib_poll ( struct net_device *netdev ) {
+	struct ipoib_device *ipoib = netdev->priv;
+
+	/* Process completions: metadata first, then data */
+	ib_poll_cq ( ipoib->ibdev, ipoib->meta.cq,
+		     ipoib_meta_complete_send, ipoib_meta_complete_recv );
+	ib_poll_cq ( ipoib->ibdev, ipoib->data.cq,
+		     ipoib_data_complete_send, ipoib_data_complete_recv );
+
+	/* Replace receive buffers consumed by those completions */
+	ipoib_refill_recv ( ipoib, &ipoib->meta );
+	ipoib_refill_recv ( ipoib, &ipoib->data );
+}
+
+/**
+ * Enable/disable interrupts on IPoIB network device
+ *
+ * @v netdev	Network device
+ * @v enable	Interrupts should be enabled
+ *
+ * Deliberately empty: both arguments are ignored.
+ */
+static void ipoib_irq ( struct net_device *netdev __unused,
+			int enable __unused ) {
+	/* Nothing to do */
+}
+
+/**
+ * Open IPoIB network device
+ *
+ * @v netdev	Network device
+ * @ret rc	Return status code
+ *
+ * Attaches the data queue pair to the broadcast multicast GID and
+ * fills both receive rings.
+ */
+static int ipoib_open ( struct net_device *netdev ) {
+	struct ipoib_device *ipoib = netdev->priv;
+	int rc;
+
+	/* Attach to broadcast multicast GID */
+	rc = ib_mcast_attach ( ipoib->ibdev, ipoib->data.qp,
+			       &ipoib->broadcast_gid );
+	if ( rc != 0 ) {
+		DBG ( "Could not attach to broadcast GID: %s\n",
+		      strerror ( rc ) );
+		return rc;
+	}
+
+	/* Post initial receive buffers on both queue sets */
+	ipoib_refill_recv ( ipoib, &ipoib->meta );
+	ipoib_refill_recv ( ipoib, &ipoib->data );
+
+	return 0;
+}
+
+/**
+ * Close IPoIB network device
+ *
+ * @v netdev	Network device
+ *
+ * Detaches the data queue pair from the broadcast multicast GID.
+ * Receive buffers that are already posted are left in place.
+ */
+static void ipoib_close ( struct net_device *netdev ) {
+	struct ipoib_device *ipoib = netdev->priv;
+
+	/* Undo the attach performed by ipoib_open() */
+	ib_mcast_detach ( ipoib->ibdev, ipoib->data.qp,
+			  &ipoib->broadcast_gid );
+
+	/* FIXME: should probably flush the receive ring */
+}
+
+/** IPoIB network device operations
+ *
+ * Installed on the network device by ipoib_probe() via
+ * netdev_init().
+ */
+static struct net_device_operations ipoib_operations = {
+ .open = ipoib_open,
+ .close = ipoib_close,
+ .transmit = ipoib_transmit,
+ .poll = ipoib_poll,
+ .irq = ipoib_irq,
+};
+
+/**
+ * Join IPoIB broadcast group
+ *
+ * @v ipoib	IPoIB device
+ * @ret rc	Return status code
+ *
+ * Sends a join request for the broadcast GID and then polls the
+ * metadata completion queue once per millisecond, for at most
+ * IPOIB_JOIN_MAX_DELAY_MS iterations, until the reply marks the
+ * group as joined.  Returns -ETIMEDOUT if that never happens.
+ */
+static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
+	struct ib_device *ibdev = ipoib->ibdev;
+	unsigned int elapsed_ms;
+	int rc;
+
+	/* The reply needs a posted receive buffer to land in */
+	ipoib_refill_recv ( ipoib, &ipoib->meta );
+
+	/* Send join request */
+	rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid, 1 );
+	if ( rc != 0 ) {
+		DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
+		       ipoib, strerror ( rc ) );
+		return rc;
+	}
+
+	/* Wait for the join to complete.  This is a long time to
+	 * block, but the queue key from the reply is needed to set
+	 * up the data queue pair, and the data queue pair is needed
+	 * before the MAC address is known.
+	 */
+	for ( elapsed_ms = 0 ; elapsed_ms < IPOIB_JOIN_MAX_DELAY_MS ;
+	      elapsed_ms++ ) {
+		mdelay ( 1 );
+		ib_poll_cq ( ibdev, ipoib->meta.cq, ipoib_meta_complete_send,
+			     ipoib_meta_complete_recv );
+		ipoib_refill_recv ( ipoib, &ipoib->meta );
+		if ( ipoib->broadcast_joined ) {
+			return 0;
+		}
+	}
+
+	DBGC ( ipoib, "IPoIB %p timed out waiting for broadcast join\n",
+	       ipoib );
+	return -ETIMEDOUT;
+}
+
+/**
+ * Probe IPoIB device
+ *
+ * @v ibdev	Infiniband device
+ * @ret rc	Return status code
+ *
+ * Allocates the network device, creates the metadata queue set,
+ * joins the broadcast group (which supplies the data queue key),
+ * creates the data queue set, derives the MAC address from the
+ * data QPN and port GID, and registers the network device.  On
+ * failure everything created so far is torn down again.
+ */
+int ipoib_probe ( struct ib_device *ibdev ) {
+	struct net_device *netdev;
+	struct ipoib_device *ipoib;
+	struct ipoib_mac *ll_addr;
+	int rc;
+
+	/* Allocate and initialise the network device */
+	netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
+	if ( ! netdev ) {
+		return -ENOMEM;
+	}
+	netdev_init ( netdev, &ipoib_operations );
+	ipoib = netdev->priv;
+	ib_set_ownerdata ( ibdev, netdev );
+	netdev->dev = ibdev->dev;
+	memset ( ipoib, 0, sizeof ( *ipoib ) );
+	ipoib->netdev = netdev;
+	ipoib->ibdev = ibdev;
+
+	/* Broadcast GID is the IPv4 template plus ibdev->pkey */
+	memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
+		 sizeof ( ipoib->broadcast_gid ) );
+	ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
+
+	/* Metadata queue set */
+	rc = ipoib_create_qset ( ipoib, &ipoib->meta,
+				 IPOIB_META_NUM_CQES,
+				 IPOIB_META_NUM_SEND_WQES,
+				 IPOIB_META_NUM_RECV_WQES,
+				 IB_GLOBAL_QKEY );
+	if ( rc != 0 ) {
+		DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
+		       ipoib, strerror ( rc ) );
+		goto fail_netdev;
+	}
+
+	/* Broadcast group join fills in the data queue key */
+	rc = ipoib_join_broadcast_group ( ipoib );
+	if ( rc != 0 ) {
+		DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
+		       ipoib, strerror ( rc ) );
+		goto fail_meta_qset;
+	}
+
+	/* Data queue set */
+	rc = ipoib_create_qset ( ipoib, &ipoib->data,
+				 IPOIB_DATA_NUM_CQES,
+				 IPOIB_DATA_NUM_SEND_WQES,
+				 IPOIB_DATA_NUM_RECV_WQES,
+				 ipoib->data_qkey );
+	if ( rc != 0 ) {
+		DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
+		       ipoib, strerror ( rc ) );
+		goto fail_meta_qset;
+	}
+
+	/* MAC address is the data QPN followed by the port GID */
+	ll_addr = ( ( struct ipoib_mac * ) netdev->ll_addr );
+	ll_addr->qpn = htonl ( ipoib->data.qp->qpn );
+	memcpy ( &ll_addr->gid, &ibdev->port_gid, sizeof ( ll_addr->gid ) );
+
+	/* Register network device */
+	rc = register_netdev ( netdev );
+	if ( rc != 0 ) {
+		goto fail_data_qset;
+	}
+
+	return 0;
+
+ fail_data_qset:
+	ipoib_destroy_qset ( ipoib, &ipoib->data );
+ fail_meta_qset:
+	ipoib_destroy_qset ( ipoib, &ipoib->meta );
+ fail_netdev:
+	netdev_nullify ( netdev );
+	netdev_put ( netdev );
+	return rc;
+}
+
+/**
+ * Remove IPoIB device
+ *
+ * @v ibdev	Infiniband device
+ *
+ * Unregisters the network device stored as the Infiniband device's
+ * owner data, destroys both queue sets and drops the network
+ * device reference.
+ */
+void ipoib_remove ( struct ib_device *ibdev ) {
+	struct net_device *netdev;
+	struct ipoib_device *ipoib;
+
+	/* Recover the objects created by ipoib_probe() */
+	netdev = ib_get_ownerdata ( ibdev );
+	ipoib = netdev->priv;
+
+	/* Tear down in the reverse order of creation */
+	unregister_netdev ( netdev );
+	ipoib_destroy_qset ( ipoib, &ipoib->data );
+	ipoib_destroy_qset ( ipoib, &ipoib->meta );
+	netdev_nullify ( netdev );
+	netdev_put ( netdev );
+}