summaryrefslogblamecommitdiffstats
path: root/src/drivers/net/eoib.c
blob: ba2912953031429c987ce09e76323f5e5908e9fa (plain) (tree)














































































































































































































































































                                                                                







                                                                             
























































                                                                               



                                                                  
                                      


                                                        




























































































































                                                                                

                                                                               




































































                                                                            



                                                                               



                                   
                                                                        
                                                                          


                                                                                


















































































































































































































                                                                               












































                                                                             



























































                                                                             
                                 
           
                                           




















                                                                         
/*
 * Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <ipxe/errortab.h>
#include <ipxe/malloc.h>
#include <ipxe/iobuf.h>
#include <ipxe/if_ether.h>
#include <ipxe/netdevice.h>
#include <ipxe/ethernet.h>
#include <ipxe/infiniband.h>
#include <ipxe/ib_mcast.h>
#include <ipxe/ib_pathrec.h>
#include <ipxe/eoib.h>

/** @file
 *
 * Ethernet over Infiniband
 *
 */

/** Number of EoIB send work queue entries */
#define EOIB_NUM_SEND_WQES 8

/** Number of EoIB receive work queue entries */
#define EOIB_NUM_RECV_WQES 4

/** Number of EoIB completion queue entries */
#define EOIB_NUM_CQES 16

/** Link status for "broadcast join in progress"
 *
 * Reported as the network device link status by
 * eoib_link_state_changed() when the Infiniband link itself reports
 * no error, until the broadcast group join completion handler
 * records the final join status.
 */
#define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
#define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
	( EINFO_EINPROGRESS, 0x01, "Joining" )

/** Human-readable message for the link status */
struct errortab eoib_errors[] __errortab = {
	__einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
};

/** List of EoIB devices */
static LIST_HEAD ( eoib_devices );

/* Forward declaration: eoib_discard() recognises EoIB network devices
 * by comparing against this operations table, which is defined
 * further down the file.
 */
static struct net_device_operations eoib_operations;

/****************************************************************************
 *
 * EoIB peer cache
 *
 ****************************************************************************
 */

/** An EoIB peer cache entry
 *
 * Associates a remote Ethernet MAC address with the Infiniband
 * address vector used to reach it.  Entries are created by the
 * transmit path for unknown destinations and filled in by the
 * receive path.
 */
struct eoib_peer {
	/** List of EoIB peer cache entries */
	struct list_head list;
	/** Ethernet MAC */
	uint8_t mac[ETH_ALEN];
	/** Infiniband address vector */
	struct ib_address_vector av;
};

/**
 * Look up a peer cache entry by Ethernet MAC
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @ret peer		EoIB peer, or NULL if not found
 *
 * A successful lookup moves the entry to the head of the peer list,
 * so the list stays ordered from most to least recently used.
 */
static struct eoib_peer * eoib_find_peer ( struct eoib_device *eoib,
					   const uint8_t *mac ) {
	struct eoib_peer *entry;

	list_for_each_entry ( entry, &eoib->peers, list ) {

		/* Skip entries for other MAC addresses */
		if ( memcmp ( mac, entry->mac, sizeof ( entry->mac ) ) != 0 )
			continue;

		/* Promote matching entry to start of list.  This is
		 * safe within the iterator only because we return
		 * immediately afterwards.
		 */
		list_del ( &entry->list );
		list_add ( &entry->list, &eoib->peers );
		return entry;
	}

	/* No matching entry */
	return NULL;
}

/**
 * Allocate a new peer cache entry
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @ret peer		EoIB peer, or NULL on error
 *
 * The new entry is zero-initialised apart from its MAC address (so
 * its address vector holds no GID yet) and is inserted at the head
 * of the peer list.
 */
static struct eoib_peer * eoib_create_peer ( struct eoib_device *eoib,
					     const uint8_t *mac ) {
	struct eoib_peer *entry;

	/* Allocate zeroed entry */
	entry = zalloc ( sizeof ( *entry ) );
	if ( ! entry )
		return NULL;

	/* Record MAC and add to start of peer list */
	memcpy ( entry->mac, mac, sizeof ( entry->mac ) );
	list_add ( &entry->list, &eoib->peers );

	return entry;
}

/**
 * Empty the EoIB peer cache
 *
 * @v eoib		EoIB device
 *
 * Every peer cache entry is unlinked from the device's peer list and
 * freed.
 */
static void eoib_flush_peers ( struct eoib_device *eoib ) {
	struct eoib_peer *entry;
	struct eoib_peer *next;

	/* The "safe" iterator is required since the current entry is
	 * freed inside the loop body.
	 */
	list_for_each_entry_safe ( entry, next, &eoib->peers, list ) {
		list_del ( &entry->list );
		free ( entry );
	}
}

/**
 * Release peer cache entries on request
 *
 * @ret discarded	Number of cached items discarded
 *
 * At most one entry (the one at the tail of the peer list, i.e. the
 * least recently used) is freed for each EoIB network device.
 */
static unsigned int eoib_discard ( void ) {
	struct net_device *netdev;
	struct eoib_device *eoib;
	struct eoib_peer *victim;
	unsigned int count = 0;

	for_each_netdev ( netdev ) {

		/* Only EoIB network devices carry a peer cache */
		if ( netdev->op == &eoib_operations ) {
			eoib = netdev->priv;

			/* Free the tail entry, if the list is
			 * non-empty.  The loop body runs at most
			 * once.
			 */
			list_for_each_entry_reverse ( victim, &eoib->peers,
						      list ) {
				list_del ( &victim->list );
				free ( victim );
				count++;
				break;
			}
		}
	}

	return count;
}

/** EoIB cache discarder
 *
 * Registers eoib_discard() in the cache discarder table, tagged
 * CACHE_EXPENSIVE.
 */
struct cache_discarder eoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
	.discard = eoib_discard,
};

/**
 * Choose destination address vector for a transmitted packet
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @ret av		Address vector, or NULL to send as broadcast
 *
 * A unicast address vector is returned only once the peer cache
 * entry for this MAC holds a recorded GID and a path to it has been
 * resolved.  In every other case the caller is told to broadcast.
 */
static struct ib_address_vector * eoib_tx_av ( struct eoib_device *eoib,
					       const uint8_t *mac ) {
	struct eoib_peer *entry;

	/* Broadcast and multicast MAC addresses always go via the
	 * broadcast group.
	 */
	if ( is_multicast_ether_addr ( mac ) ) {
		DBGCP ( eoib, "EoIB %s %s TX multicast\n",
			eoib->name, eth_ntoa ( mac ) );
		return NULL;
	}

	/* Unknown destination: start a peer cache entry and send
	 * this packet as a broadcast.  Failure to allocate the entry
	 * is tolerated; the packet is broadcast either way.
	 */
	entry = eoib_find_peer ( eoib, mac );
	if ( entry == NULL ) {
		DBGC ( eoib, "EoIB %s %s TX unknown\n",
		       eoib->name, eth_ntoa ( mac ) );
		eoib_create_peer ( eoib, mac );
		return NULL;
	}

	/* No GID and QPN recorded from a received packet yet: send
	 * this packet as a broadcast.
	 */
	if ( ! entry->av.gid_present ) {
		DBGCP ( eoib, "EoIB %s %s TX not yet recorded\n",
			eoib->name, eth_ntoa ( mac ) );
		return NULL;
	}

	/* Path to this peer not yet resolved: send this packet as a
	 * broadcast.
	 */
	if ( ib_resolve_path ( eoib->ibdev, &entry->av ) != 0 ) {
		DBGCP ( eoib, "EoIB %s %s TX not yet resolved\n",
			eoib->name, eth_ntoa ( mac ) );
		return NULL;
	}

	/* Force use of GRH even for local destinations */
	entry->av.gid_present = 1;

	/* Fully resolved peer: send this packet as a unicast */
	DBGCP ( eoib, "EoIB %s %s TX " IB_GID_FMT " QPN %#lx\n", eoib->name,
		eth_ntoa ( mac ), IB_GID_ARGS ( &entry->av.gid ),
		entry->av.qpn );
	return &entry->av;
}

/**
 * Record source address vector
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @v av		Source Infiniband address vector
 *
 * Updates the existing peer cache entry for this MAC (if any) from
 * the source address vector of a received packet.  No entry is
 * created here: a MAC with no cache entry is ignored.
 */
static void eoib_rx_av ( struct eoib_device *eoib, const uint8_t *mac,
			 const struct ib_address_vector *av ) {
	const union ib_gid *src_gid = &av->gid;
	unsigned long src_qpn = av->qpn;
	struct eoib_peer *entry;

	/* A source GID is required in order to record the peer */
	if ( ! av->gid_present ) {
		DBGC ( eoib, "EoIB %s %s RX with no GID\n",
		       eoib->name, eth_ntoa ( mac ) );
		return;
	}

	/* Ignore sources that have no peer cache entry */
	entry = eoib_find_peer ( eoib, mac );
	if ( entry == NULL ) {
		DBGCP ( eoib, "EoIB %s %s RX " IB_GID_FMT " (ignored)\n",
			eoib->name, eth_ntoa ( mac ),
			IB_GID_ARGS ( src_gid ) );
		return;
	}

	/* Some dubious EoIB implementations utilise an Ethernet-to-
	 * EoIB gateway that will send packets from the wrong QPN.
	 * For packets arriving from the gateway's GID, substitute
	 * the configured gateway QPN.
	 */
	if ( eoib_has_gateway ( eoib ) ) {
		if ( memcmp ( src_gid, &eoib->gateway.gid,
			      sizeof ( *src_gid ) ) == 0 ) {
			src_qpn = eoib->gateway.qpn;
		}
	}

	/* Nothing to do if the cached LID and QPN already match */
	if ( ( entry->av.lid == av->lid ) &&
	     ( entry->av.qpn == src_qpn ) ) {
		DBGCP ( eoib, "EoIB %s %s RX unchanged\n",
			eoib->name, eth_ntoa ( mac ) );
		return;
	}

	/* Store QPN, queue key (taken from the broadcast address
	 * vector) and GID in the peer cache entry.
	 */
	entry->av.qpn = src_qpn;
	entry->av.qkey = eoib->broadcast.qkey;
	entry->av.gid_present = 1;
	memcpy ( &entry->av.gid, src_gid, sizeof ( entry->av.gid ) );
	DBGC ( eoib, "EoIB %s %s RX " IB_GID_FMT " QPN %#lx\n", eoib->name,
	       eth_ntoa ( mac ), IB_GID_ARGS ( &entry->av.gid ),
	       entry->av.qpn );
}

/****************************************************************************
 *
 * EoIB network device
 *
 ****************************************************************************
 */

/**
 * Transmit packet via EoIB network device
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @ret rc		Return status code
 *
 * The Ethernet frame is prefixed with an EoIB header, padded to the
 * minimum Ethernet frame length, and posted either to the resolved
 * unicast peer or to the broadcast group.  Broadcasts are also
 * duplicated to the gateway when one is configured.
 */
static int eoib_transmit ( struct net_device *netdev,
			   struct io_buffer *iobuf ) {
	struct eoib_device *eoib = netdev->priv;
	struct eoib_header *eoib_hdr;
	struct ethhdr *ethhdr;
	struct ib_address_vector *dest;
	size_t min_len;

	/* Sanity checks */
	assert ( iob_len ( iobuf ) >= sizeof ( *ethhdr ) );
	assert ( iob_headroom ( iobuf ) >= sizeof ( *eoib_hdr ) );

	/* Look up destination address vector.  This must happen
	 * before the EoIB header is pushed, while the buffer still
	 * starts at the Ethernet header.
	 */
	ethhdr = iobuf->data;
	dest = eoib_tx_av ( eoib, ethhdr->h_dest );

	/* Prepend EoIB header */
	eoib_hdr = iob_push ( iobuf, sizeof ( *eoib_hdr ) );
	eoib_hdr->magic = htons ( EOIB_MAGIC );
	eoib_hdr->reserved = 0;

	/* Pad buffer to minimum Ethernet frame size (plus header) */
	min_len = ( sizeof ( *eoib_hdr ) + ETH_ZLEN );
	assert ( min_len <= IOB_ZLEN );
	if ( iob_len ( iobuf ) < min_len )
		iob_pad ( iobuf, min_len );

	/* With no unicast address, send as a broadcast, with a
	 * duplicate sent to the gateway if applicable.
	 */
	if ( dest == NULL ) {
		dest = &eoib->broadcast;
		if ( eoib_has_gateway ( eoib ) )
			eoib->duplicate ( eoib, iobuf );
	}

	/* Post send work queue entry */
	return ib_post_send ( eoib->ibdev, eoib->qp, dest, iobuf );
}

/**
 * Handle EoIB send completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v iobuf		I/O buffer
 * @v rc		Completion status code
 *
 * The EoIB device is recovered from the queue pair's owner data, and
 * the completion (with its status) is passed on to the associated
 * network device.
 */
static void eoib_complete_send ( struct ib_device *ibdev __unused,
				 struct ib_queue_pair *qp,
				 struct io_buffer *iobuf, int rc ) {
	struct eoib_device *eoib = ib_qp_get_ownerdata ( qp );

	netdev_tx_complete_err ( eoib->netdev, iobuf, rc );
}

/**
 * Handle EoIB receive completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v dest		Destination address vector, or NULL
 * @v source		Source address vector, or NULL
 * @v iobuf		I/O buffer
 * @v rc		Completion status code
 *
 * Failed completions, runt packets and packets lacking a source
 * address vector are reported as receive errors.  Otherwise the EoIB
 * header is stripped, the peer cache is updated from the source
 * address vector, and the Ethernet frame is handed to the network
 * stack.
 */
static void eoib_complete_recv ( struct ib_device *ibdev __unused,
				 struct ib_queue_pair *qp,
				 struct ib_address_vector *dest __unused,
				 struct ib_address_vector *source,
				 struct io_buffer *iobuf, int rc ) {
	struct eoib_device *eoib = ib_qp_get_ownerdata ( qp );
	struct net_device *netdev = eoib->netdev;
	struct eoib_header *eoib_hdr;
	struct ethhdr *ethhdr;
	size_t min_len = ( sizeof ( *eoib_hdr ) + sizeof ( *ethhdr ) );

	/* Record errors */
	if ( rc != 0 ) {
		netdev_rx_err ( netdev, iobuf, rc );
		return;
	}

	/* Packet must hold both the EoIB and Ethernet headers */
	if ( iob_len ( iobuf ) < min_len ) {
		DBGC ( eoib, "EoIB %s received packet too short to "
		       "contain EoIB and Ethernet headers\n", eoib->name );
		DBGC_HD ( eoib, iobuf->data, iob_len ( iobuf ) );
		netdev_rx_err ( netdev, iobuf, -EIO );
		return;
	}

	/* Source address vector is needed for the peer cache */
	if ( source == NULL ) {
		DBGC ( eoib, "EoIB %s received packet without address "
		       "vector\n", eoib->name );
		netdev_rx_err ( netdev, iobuf, -ENOTTY );
		return;
	}

	/* Strip EoIB header, leaving the Ethernet frame */
	iob_pull ( iobuf, sizeof ( *eoib_hdr ) );

	/* Update peer cache entry, if any, for the source MAC */
	ethhdr = iobuf->data;
	eoib_rx_av ( eoib, ethhdr->h_source, source );

	/* Hand off to network layer */
	netdev_rx ( netdev, iobuf );
}

/** EoIB completion operations
 *
 * Handlers invoked for send and receive completions on the
 * completion queue created in eoib_open().
 */
static struct ib_completion_queue_operations eoib_cq_op = {
	.complete_send = eoib_complete_send,
	.complete_recv = eoib_complete_recv,
};

/** EoIB queue pair operations
 *
 * Buffers for the queue pair are obtained using the generic
 * alloc_iob() allocator.
 */
static struct ib_queue_pair_operations eoib_qp_op = {
	.alloc_iob = alloc_iob,
};

/**
 * Poll EoIB network device
 *
 * @v netdev		Network device
 *
 * Polls the event queue of the underlying Infiniband device, then
 * polls the retry timers.
 */
static void eoib_poll ( struct net_device *netdev ) {
	struct eoib_device *eoib = netdev->priv;
	struct ib_device *ibdev = eoib->ibdev;

	/* Poll Infiniband device */
	ib_poll_eq ( ibdev );

	/* Poll the retry timers (required for EoIB multicast join) */
	retry_poll();
}

/**
 * Handle EoIB broadcast multicast group join completion
 *
 * @v membership	Multicast group membership
 * @v rc		Status code
 *
 * The membership structure is embedded within the EoIB device, which
 * is recovered via container_of().  The join status then becomes the
 * network device's link status.
 */
static void eoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
	struct eoib_device *eoib =
		container_of ( membership, struct eoib_device, membership );

	/* Record join status as link status */
	netdev_link_err ( eoib->netdev, rc );
}

/**
 * Start joining the EoIB broadcast multicast group
 *
 * @v eoib		EoIB device
 * @ret rc		Return status code
 *
 * Completion of the join is reported via eoib_join_complete().
 */
static int eoib_join_broadcast_group ( struct eoib_device *eoib ) {
	int rc;

	/* Join multicast group */
	rc = ib_mcast_join ( eoib->ibdev, eoib->qp, &eoib->membership,
			     &eoib->broadcast, eoib->mask,
			     eoib_join_complete );
	if ( rc != 0 ) {
		DBGC ( eoib, "EoIB %s could not join broadcast group: %s\n",
		       eoib->name, strerror ( rc ) );
	}

	/* Zero on success, otherwise the join error */
	return rc;
}

/**
 * Leave EoIB broadcast multicast group
 *
 * @v eoib		EoIB device
 *
 * Uses the device's queue pair; callers in this file invoke it only
 * while the queue pair exists.
 */
static void eoib_leave_broadcast_group ( struct eoib_device *eoib ) {

	/* Leave multicast group */
	ib_mcast_leave ( eoib->ibdev, eoib->qp, &eoib->membership );
}

/**
 * React to a change in Infiniband link state
 *
 * @v eoib		EoIB device
 *
 * Leaves any existing broadcast group, refreshes the broadcast GID
 * from the current partition key, mirrors the Infiniband link state
 * onto the network device, and (if the device is open with a usable
 * link and a queue pair) starts joining the broadcast group again.
 */
static void eoib_link_state_changed ( struct eoib_device *eoib ) {
	struct net_device *netdev = eoib->netdev;
	struct ib_device *ibdev = eoib->ibdev;
	int link_rc;
	int rc;

	/* Leave existing broadcast group */
	if ( eoib->qp )
		eoib_leave_broadcast_group ( eoib );

	/* Update broadcast GID based on potentially-new partition key */
	eoib->broadcast.gid.words[2] = htons ( ibdev->pkey | IB_PKEY_FULL );

	/* Reflect Infiniband link state in the net device link state.
	 * A working Infiniband link is reported as "joining" until
	 * the broadcast group join completes.
	 */
	link_rc = ib_link_rc ( ibdev );
	if ( link_rc == 0 )
		link_rc = -EINPROGRESS_JOINING;
	netdev_link_err ( netdev, link_rc );

	/* Attempt a join only when the device is open, the link is
	 * up, and the queue pair exists.
	 */
	if ( ! ib_is_open ( ibdev ) )
		return;
	if ( ! ib_link_ok ( ibdev ) )
		return;
	if ( ! eoib->qp )
		return;

	/* Join new broadcast group */
	rc = eoib_join_broadcast_group ( eoib );
	if ( rc != 0 ) {
		DBGC ( eoib, "EoIB %s could not rejoin broadcast group: "
		       "%s\n", eoib->name, strerror ( rc ) );
		netdev_link_err ( netdev, rc );
	}
}

/**
 * Open EoIB network device
 *
 * @v netdev		Network device
 * @ret rc		Return status code
 *
 * Opens the underlying Infiniband device, creates a completion queue
 * and a UD queue pair (send and receive completions share the one
 * completion queue), fills the receive ring, and then triggers a
 * link state update in order to start joining the broadcast group.
 * On failure, everything set up so far is torn down in reverse
 * order.
 */
static int eoib_open ( struct net_device *netdev ) {
	struct eoib_device *eoib = netdev->priv;
	struct ib_device *ibdev = eoib->ibdev;
	int rc;

	/* Open IB device */
	if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
		DBGC ( eoib, "EoIB %s could not open %s: %s\n",
		       eoib->name, ibdev->name, strerror ( rc ) );
		goto err_ib_open;
	}

	/* Allocate completion queue */
	if ( ( rc = ib_create_cq ( ibdev, EOIB_NUM_CQES, &eoib_cq_op,
				   &eoib->cq ) ) != 0 ) {
		DBGC ( eoib, "EoIB %s could not create completion queue: %s\n",
		       eoib->name, strerror ( rc ) );
		goto err_create_cq;
	}

	/* Allocate queue pair */
	if ( ( rc = ib_create_qp ( ibdev, IB_QPT_UD, EOIB_NUM_SEND_WQES,
				   eoib->cq, EOIB_NUM_RECV_WQES, eoib->cq,
				   &eoib_qp_op, netdev->name, &eoib->qp ) )!=0){
		DBGC ( eoib, "EoIB %s could not create queue pair: %s\n",
		       eoib->name, strerror ( rc ) );
		goto err_create_qp;
	}
	/* Allow completion handlers to find the EoIB device */
	ib_qp_set_ownerdata ( eoib->qp, eoib );

	/* Fill receive rings */
	ib_refill_recv ( ibdev, eoib->qp );

	/* Fake a link status change to join the broadcast group */
	eoib_link_state_changed ( eoib );

	return 0;

	/* Error unwinding.  The two statements before the first label
	 * are unreachable (they follow the "return 0" above); they
	 * show the teardown matching the final setup step.
	 */
	ib_destroy_qp ( ibdev, eoib->qp );
	eoib->qp = NULL;
 err_create_qp:
	ib_destroy_cq ( ibdev, eoib->cq );
	eoib->cq = NULL;
 err_create_cq:
	ib_close ( ibdev );
 err_ib_open:
	return rc;
}

/**
 * Close EoIB network device
 *
 * @v netdev		Network device
 *
 * Reverses eoib_open(): the peer cache is emptied, the broadcast
 * group is left, the queue pair and completion queue are destroyed
 * (and their pointers cleared), and the Infiniband device is closed.
 */
static void eoib_close ( struct net_device *netdev ) {
	struct eoib_device *eoib = netdev->priv;
	struct ib_device *ibdev = eoib->ibdev;

	/* Flush peer cache */
	eoib_flush_peers ( eoib );

	/* Leave broadcast group (while the queue pair still exists) */
	eoib_leave_broadcast_group ( eoib );

	/* Tear down the queues.  Clearing eoib->qp ensures that
	 * eoib_link_state_changed() will not touch the broadcast
	 * group while the device is closed.
	 */
	ib_destroy_qp ( ibdev, eoib->qp );
	eoib->qp = NULL;
	ib_destroy_cq ( ibdev, eoib->cq );
	eoib->cq = NULL;

	/* Close IB device */
	ib_close ( ibdev );
}

/** EoIB network device operations
 *
 * The address of this table is also used by eoib_discard() to
 * identify which network devices are EoIB devices.
 */
static struct net_device_operations eoib_operations = {
	.open		= eoib_open,
	.close		= eoib_close,
	.transmit	= eoib_transmit,
	.poll		= eoib_poll,
};

/**
 * Create EoIB device
 *
 * @v ibdev		Infiniband device
 * @v hw_addr		Ethernet MAC
 * @v broadcast		Broadcast address vector
 * @v name		Interface name (or NULL to use default)
 * @ret rc		Return status code
 *
 * Allocates an Ethernet network device whose private data is the
 * EoIB device, takes a reference to the Infiniband device, copies
 * the broadcast address vector and MAC address, adds the device to
 * the list of EoIB devices, and registers the network device.
 */
int eoib_create ( struct ib_device *ibdev, const uint8_t *hw_addr,
		  struct ib_address_vector *broadcast, const char *name ) {
	struct net_device *netdev;
	struct eoib_device *eoib;
	int rc;

	/* Allocate network device */
	netdev = alloc_etherdev ( sizeof ( *eoib ) );
	if ( ! netdev ) {
		rc = -ENOMEM;
		goto err_alloc;
	}
	netdev_init ( netdev, &eoib_operations );
	eoib = netdev->priv;
	netdev->dev = ibdev->dev;
	eoib->netdev = netdev;
	eoib->ibdev = ibdev_get ( ibdev );
	memcpy ( &eoib->broadcast, broadcast, sizeof ( eoib->broadcast ) );
	INIT_LIST_HEAD ( &eoib->peers );

	/* Set MAC address */
	memcpy ( netdev->hw_addr, hw_addr, ETH_ALEN );

	/* Set interface name, if applicable */
	if ( name )
		snprintf ( netdev->name, sizeof ( netdev->name ), "%s", name );
	/* The EoIB name aliases the network device's name buffer */
	eoib->name = netdev->name;

	/* Add to list of EoIB devices */
	list_add_tail ( &eoib->list, &eoib_devices );

	/* Register network device */
	if ( ( rc = register_netdev ( netdev ) ) != 0 )
		goto err_register;

	DBGC ( eoib, "EoIB %s created for %s MAC %s\n",
	       eoib->name, ibdev->name, eth_ntoa ( hw_addr ) );
	DBGC ( eoib, "EoIB %s broadcast GID " IB_GID_FMT "\n",
	       eoib->name, IB_GID_ARGS ( &broadcast->gid ) );
	return 0;

	/* Error unwinding.  The unregister_netdev() call is
	 * unreachable (it follows the "return 0" above); it shows the
	 * teardown matching the final setup step.
	 */
	unregister_netdev ( netdev );
 err_register:
	list_del ( &eoib->list );
	ibdev_put ( ibdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
 err_alloc:
	return rc;
}

/**
 * Locate an EoIB device by Infiniband device and MAC address
 *
 * @v ibdev		Infiniband device
 * @v hw_addr		Original Ethernet MAC
 * @ret eoib		EoIB device, or NULL if not found
 */
struct eoib_device * eoib_find ( struct ib_device *ibdev,
				 const uint8_t *hw_addr ) {
	struct eoib_device *candidate;

	list_for_each_entry ( candidate, &eoib_devices, list ) {

		/* Must be attached to the requested Infiniband device */
		if ( candidate->ibdev != ibdev )
			continue;

		/* Must carry the requested hardware address */
		if ( memcmp ( candidate->netdev->hw_addr, hw_addr,
			      ETH_ALEN ) != 0 )
			continue;

		return candidate;
	}

	return NULL;
}

/**
 * Remove EoIB device
 *
 * @v eoib		EoIB device
 *
 * Reverses eoib_create(): the network device is unregistered, the
 * EoIB device is removed from the list of EoIB devices, the
 * reference to the Infiniband device is dropped, and the network
 * device is released.
 */
void eoib_destroy ( struct eoib_device *eoib ) {
	struct net_device *netdev = eoib->netdev;

	/* Unregister network device */
	unregister_netdev ( netdev );

	/* Remove from list of EoIB devices */
	list_del ( &eoib->list );

	/* Drop reference to Infiniband device */
	ibdev_put ( eoib->ibdev );

	/* Free network device.  The debug message is emitted before
	 * the final netdev_put(), since eoib->name points into the
	 * network device.
	 */
	DBGC ( eoib, "EoIB %s destroyed\n", eoib->name );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}

/**
 * Probe EoIB device
 *
 * @v ibdev		Infiniband device
 * @ret rc		Return status code
 *
 * Always succeeds without creating anything: EoIB devices come into
 * existence only through explicit calls to eoib_create().
 */
static int eoib_probe ( struct ib_device *ibdev __unused ) {

	/* EoIB devices are not created automatically */
	return 0;
}

/**
 * Handle device or link status change
 *
 * @v ibdev		Infiniband device
 *
 * Propagates the change to every EoIB device attached to this
 * Infiniband device.
 */
static void eoib_notify ( struct ib_device *ibdev ) {
	struct eoib_device *eoib;

	list_for_each_entry ( eoib, &eoib_devices, list ) {

		/* Only devices on this Infiniband device are affected */
		if ( eoib->ibdev == ibdev )
			eoib_link_state_changed ( eoib );
	}
}

/**
 * Remove all EoIB devices attached to an Infiniband device
 *
 * @v ibdev		Infiniband device
 */
static void eoib_remove ( struct ib_device *ibdev ) {
	struct eoib_device *eoib;
	struct eoib_device *next;

	/* The "safe" iterator is required since eoib_destroy()
	 * unlinks the current entry from the list.
	 */
	list_for_each_entry_safe ( eoib, next, &eoib_devices, list ) {
		if ( eoib->ibdev == ibdev )
			eoib_destroy ( eoib );
	}
}

/** EoIB driver
 *
 * Infiniband upper-layer driver hooks: probing is a no-op, link
 * notifications are forwarded to attached EoIB devices, and removal
 * destroys them.
 */
struct ib_driver eoib_driver __ib_driver = {
	.name = "EoIB",
	.probe = eoib_probe,
	.notify = eoib_notify,
	.remove = eoib_remove,
};

/****************************************************************************
 *
 * EoIB heartbeat packets
 *
 ****************************************************************************
 */

/**
 * Silently ignore incoming EoIB heartbeat packets
 *
 * @v iobuf		I/O buffer
 * @v netdev		Network device
 * @v ll_dest		Link-layer destination address
 * @v ll_source		Link-layer source address
 * @v flags		Packet flags
 * @ret rc		Return status code
 *
 * The packet is freed and success is always returned.
 */
static int eoib_heartbeat_rx ( struct io_buffer *iobuf,
			       struct net_device *netdev __unused,
			       const void *ll_dest __unused,
			       const void *ll_source __unused,
			       unsigned int flags __unused ) {
	free_iob ( iobuf );
	return 0;
}

/**
 * Transcribe EoIB heartbeat address
 *
 * @v net_addr		EoIB heartbeat address
 * @ret string		"<EoIB>"
 *
 * This operation is meaningless for the EoIB heartbeat protocol; a
 * fixed placeholder string is returned regardless of the address.
 */
static const char * eoib_heartbeat_ntoa ( const void *net_addr __unused ) {
	return "<EoIB>";
}

/** EoIB heartbeat network protocol
 *
 * Registers a network-layer protocol whose protocol number is the
 * EoIB magic value, with a receive handler that simply discards the
 * packet.
 */
struct net_protocol eoib_heartbeat_protocol __net_protocol = {
	.name = "EoIB",
	.net_proto = htons ( EOIB_MAGIC ),
	.rx = eoib_heartbeat_rx,
	.ntoa = eoib_heartbeat_ntoa,
};

/****************************************************************************
 *
 * EoIB gateway
 *
 ****************************************************************************
 *
 * Some dubious EoIB implementations require all broadcast traffic to
 * be sent twice: once to the actual broadcast group, and once as a
 * unicast to the EoIB-to-Ethernet gateway.  This somewhat curious
 * design arises since the EoIB-to-Ethernet gateway hardware lacks the
 * ability to attach a queue pair to a multicast GID (or LID), and so
 * cannot receive traffic sent to the broadcast group.
 *
 */

/**
 * Transmit duplicate packet to the EoIB gateway
 *
 * @v eoib		EoIB device
 * @v original		Original I/O buffer
 *
 * A copy of the original packet (including its EoIB header, since
 * this is called after the header has been prepended) is sent as a
 * unicast to the gateway address vector.  Failures are recorded as
 * transmit errors on the network device; nothing is returned to the
 * caller, so the original transmission proceeds regardless.
 */
static void eoib_duplicate ( struct eoib_device *eoib,
			     struct io_buffer *original ) {
	struct net_device *netdev = eoib->netdev;
	struct ib_device *ibdev = eoib->ibdev;
	struct ib_address_vector *av = &eoib->gateway;
	size_t len = iob_len ( original );
	struct io_buffer *copy;
	int rc;

	/* Create copy of I/O buffer */
	copy = alloc_iob ( len );
	if ( ! copy ) {
		rc = -ENOMEM;
		goto err_alloc;
	}
	memcpy ( iob_put ( copy, len ), original->data, len );

	/* Append to network device's transmit queue, by linking the
	 * copy into the list that already contains the original.
	 */
	list_add_tail ( &copy->list, &original->list );

	/* Resolve path to gateway */
	if ( ( rc = ib_resolve_path ( ibdev, av ) ) != 0 ) {
		DBGC ( eoib, "EoIB %s no path to gateway: %s\n",
		       eoib->name, strerror ( rc ) );
		goto err_path;
	}

	/* Force use of GRH even for local destinations */
	av->gid_present = 1;

	/* Post send work queue entry */
	if ( ( rc = ib_post_send ( eoib->ibdev, eoib->qp, av, copy ) ) != 0 )
		goto err_post_send;

	return;

 err_post_send:
 err_path:
	/* Unlink the copy from the transmit queue before discarding */
	list_del ( &copy->list );
 err_alloc:
	/* Record the error; copy is NULL if allocation failed */
	netdev_tx_err ( netdev, copy, rc );
}

/**
 * Configure or clear the EoIB gateway
 *
 * @v eoib		EoIB device
 * @v av		Address vector, or NULL to clear gateway
 *
 * With a gateway configured, the address vector is copied into the
 * device and the duplicate-transmission hook is installed.  Clearing
 * the gateway removes the hook.
 */
void eoib_set_gateway ( struct eoib_device *eoib,
			struct ib_address_vector *av ) {

	/* Clear gateway if no address vector was supplied */
	if ( ! av ) {
		DBGC ( eoib, "EoIB %s not using gateway\n", eoib->name );
		eoib->duplicate = NULL;
		return;
	}

	/* Record gateway address vector and enable duplication */
	DBGC ( eoib, "EoIB %s using gateway " IB_GID_FMT "\n",
	       eoib->name, IB_GID_ARGS ( &av->gid ) );
	memcpy ( &eoib->gateway, av, sizeof ( eoib->gateway ) );
	eoib->duplicate = eoib_duplicate;
}