summaryrefslogblamecommitdiffstats
path: root/src/drivers/net/ipoib.c
blob: 33c7ddccdc4a2fc48c1e486c3647d67738c7dde3 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15














                                                                      

                                                                



                                                                    

   
                                       
 
                   
                   
                  
                   


                     
                          
                        
                        
                     

                          
                    




                            
                       
                       





                     













                                                                         
                                              
                             
 

                                                 
 
                                         
                         
 









                                           





                                           

                                    



                                       

                             

                                         

                               

  

                                           
                                                 

                                                                        

  
                                                   


                                                                       


                                                 
                                                       

  


                                   

                                                     

                                                                             
                    



                                                                             
                                 
                   



                                          

                             

  
   
                            
  


                                                        
   

                                                                                
                                
 



                                                               
                                             









                                                                
         
 

                                                    

                    
 
   
                               
  



                                           
   


                                                             
                                
 










                                                                         

         




                                                               
                                                         
                                                
 

                 
 
   
                    
  
                                    
   


                                                              
 



                                                                     


   
                                            
  
                                                        
   
                                                  
                                  




                                                                  







                                                                      





                                                                           

         

                         
 
                            
                                                                                

                                       
 





                                                                             

   





                                                                    



                                            
 




                                                              

 












                                                   
                                    


                                       
   
                        
  

                                                   
   









                                                          

 






                                                                             
   
                                   
  

                                                                             

                                          


















                                                                        
 


                                                                                




                                                                              

















                                                                                

 
   
                               
  



                                                                             
   












                                                                               

 
   
                                
  



                                                                    
   







                                                                 
 









                                                                           
         
 
















                                                                               
 
   
                            
  




                                                                      
   













                                                                       
         

 

















                                                                             
                              
                            
                                    
                                  
                              

                                       
                           
               
 
                          
                                                       



                                                                     


                                                                  
                                     

                                    

                                 
                                                            


                                               
                                          




















                                                                        

























                                                                                








                                                                            
 

                                                              


   
                               


                                         
                                  
                                              
   


                                                                     
                                                                
 
                                                            


   
                                  


                                         
                                                           
                                                      
                                  
                                              
   

                                                                   
                                                                 
                                                                   
                                                                     

                                                                
                                    


                                 
 
                           

                                                    
                       

         

                                                                
                                                                      


                                                                  
                       
         
                         




                                                                         
 





















                                                                          

                                                                       





                                                                      
         
 
                                       
                                    

 



                                                            

  






























                                                                           

                                                      
                                     

  
   







                                                      
                                    
                             


                                                                       


   

                                                        

                                                  
   



                                                                           





                                               







                                                                      
                                  
                                                            
                                                                 
                                                            
                                                                    
                                                                              


                                                
 









                                                                        


                                                        


   

                            
                                    
   


                                                                     


                                            

                                                      

                                                                    

                                                                
 

                                                                             

                                                     






                                                                           




                                                                        
                                                                         








                                                                           






                                                     
                                               

               
                            
                                                




                                                                     
                                       



                                                                            



                                   
                                                                         
                                                                             



                                                                           
                                   
         
                                                 
 
                                         
                                                         
 
                                
                                            
 
                                                                   
                                           
 
                 
 



                                           
                           
             
                  








                                                       
                                               


                                    
 

                                              
 
                                         
                                  

                                  
                                           
                         
                                           
                         

                             
                           







                                                        







                                          
                                                    

                                   







                                                      

                                               
                               
                             
                                         
 
                                      

                                                     
                                                           
 




                                                            

                                                         
 


                                                       





                                                       
                                     
                     
                                  





                                  















                                                                      



                                         
                                                      


                                   
 









                                                                       
 




                                             
                               

                               

















                                                                  
/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include <ipxe/errortab.h>
#include <ipxe/malloc.h>
#include <ipxe/if_arp.h>
#include <ipxe/arp.h>
#include <ipxe/if_ether.h>
#include <ipxe/ethernet.h>
#include <ipxe/ip.h>
#include <ipxe/iobuf.h>
#include <ipxe/netdevice.h>
#include <ipxe/infiniband.h>
#include <ipxe/ib_pathrec.h>
#include <ipxe/ib_mcast.h>
#include <ipxe/retry.h>
#include <ipxe/ipoib.h>

/** @file
 *
 * IP over Infiniband
 */

/* Disambiguate the various error causes
 *
 * Each ENXIO_* value below is a distinct variant of ENXIO (built via
 * __einfo_uniqify) with its own description string.  Within this
 * file they are returned by the transmit path when no IPoIB MAC is
 * known for a REMAC: ENXIO_ARP_REPLY by ipoib_translate_tx_arp(),
 * ENXIO_NON_IPV4 and ENXIO_ARP_SENT by ipoib_transmit().
 */
#define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )
#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )

/** Number of IPoIB send work queue entries */
#define IPOIB_NUM_SEND_WQES 8

/** Number of IPoIB receive work queue entries */
#define IPOIB_NUM_RECV_WQES 4

/** Number of IPoIB completion entries */
#define IPOIB_NUM_CQES 16

/** An IPoIB broadcast address
 *
 * Groups together everything describing the broadcast destination of
 * an IPoIB device.
 */
struct ipoib_broadcast {
	/** MAC address
	 *
	 * Returned by ipoib_find_remac() for any multicast or
	 * broadcast REMAC.
	 */
	struct ipoib_mac mac;
	/** Address vector
	 *
	 * Used by ipoib_transmit() as the destination for multicast
	 * and broadcast packets; its queue key is also reused for
	 * unicast destinations.
	 */
	struct ib_address_vector av;
	/** Multicast group membership */
	struct ib_mc_membership membership;
};

/** An IPoIB device
 *
 * Per-device state; reached from the network device via
 * netdev->priv and from the queue pair via ib_qp_get_ownerdata().
 */
struct ipoib_device {
	/** Network device */
	struct net_device *netdev;
	/** Underlying Infiniband device */
	struct ib_device *ibdev;
	/** List of IPoIB devices */
	struct list_head list;
	/** Completion queue */
	struct ib_completion_queue *cq;
	/** Queue pair */
	struct ib_queue_pair *qp;
	/** Local MAC */
	struct ipoib_mac mac;
	/** Broadcast address */
	struct ipoib_broadcast broadcast;
	/** REMAC cache
	 *
	 * List of struct ipoib_peer.  Entries are moved to the head
	 * of the list whenever they are looked up or updated, so the
	 * tail holds the least recently used entry.
	 */
	struct list_head peers;
};

/** Broadcast IPoIB address
 *
 * The QPN field is stored in network byte order.
 *
 * NOTE(review): the GID bytes look like the IPv4 broadcast multicast
 * group GID from RFC 4391 with the P_Key bytes left as zero; how (and
 * whether) the partition key is filled in is not visible in this part
 * of the file - confirm.
 */
static struct ipoib_mac ipoib_broadcast = {
	.flags__qpn = htonl ( IB_QPN_BROADCAST ),
	.gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
		       0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
};

/** Link status for "broadcast join in progress"
 *
 * A distinct variant of EINPROGRESS carrying the description
 * "Joining".
 */
#define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
#define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
	( EINFO_EINPROGRESS, 0x01, "Joining" )

/** Human-readable message for the link status
 *
 * Places the EINPROGRESS_JOINING description into the error table
 * (via __errortab).
 */
struct errortab ipoib_errors[] __errortab = {
	__einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
};

/** List of all IPoIB devices */
static LIST_HEAD ( ipoib_devices );

/* Network device operations for IPoIB devices.  Declared here, ahead
 * of its initialiser (which is not within this part of the file), so
 * that ipoib_discard_remac() can compare a network device's
 * operations pointer against it to recognise IPoIB devices.
 */
static struct net_device_operations ipoib_operations;

/****************************************************************************
 *
 * IPoIB REMAC cache
 *
 ****************************************************************************
 */

/** An IPoIB REMAC cache entry
 *
 * Maps a Remote Ethernet MAC (REMAC) to the full IPoIB MAC of the
 * same peer.  Entries are allocated by ipoib_map_remac() and linked
 * into the owning device's "peers" list.
 */
struct ipoib_peer {
	/** List of REMAC cache entries */
	struct list_head list;
	/** Remote Ethernet MAC */
	struct ipoib_remac remac;
	/** MAC address */
	struct ipoib_mac mac;
};

/**
 * Look up the IPoIB MAC corresponding to a REMAC
 *
 * @v ipoib		IPoIB device
 * @v remac		Remote Ethernet MAC
 * @ret mac		IPoIB MAC (or NULL if not found)
 *
 * A cache hit also promotes the matching entry to the head of the
 * REMAC cache.
 */
static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
					     const struct ipoib_remac *remac ) {
	struct ipoib_peer *entry;

	/* Broadcast and multicast REMACs all resolve to the broadcast
	 * MAC, since multicasts are transmitted as broadcasts for
	 * simplicity.
	 */
	if ( is_multicast_ether_addr ( remac ) )
		return &ipoib->broadcast.mac;

	/* Search the REMAC cache */
	list_for_each_entry ( entry, &ipoib->peers, list ) {

		/* Skip entries belonging to other REMACs */
		if ( memcmp ( remac, &entry->remac,
			      sizeof ( entry->remac ) ) != 0 )
			continue;

		/* Promote matching entry to the head of the list */
		list_del ( &entry->list );
		list_add ( &entry->list, &ipoib->peers );
		return &entry->mac;
	}

	/* No matching entry */
	DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
	       ipoib, eth_ntoa ( remac ) );
	return NULL;
}

/**
 * Record a REMAC to IPoIB MAC mapping in the REMAC cache
 *
 * @v ipoib		IPoIB device
 * @v remac		Remote Ethernet MAC
 * @v mac		IPoIB MAC
 * @ret rc		Return status code
 *
 * An existing entry for the REMAC is refreshed and promoted to the
 * head of the cache; otherwise a new entry is allocated and inserted
 * at the head.
 */
static int ipoib_map_remac ( struct ipoib_device *ipoib,
			     const struct ipoib_remac *remac,
			     const struct ipoib_mac *mac ) {
	struct ipoib_peer *entry;

	/* Refresh an existing cache entry, if there is one */
	list_for_each_entry ( entry, &ipoib->peers, list ) {

		/* Skip entries belonging to other REMACs */
		if ( memcmp ( remac, &entry->remac,
			      sizeof ( entry->remac ) ) != 0 )
			continue;

		/* Promote entry to the head of the list */
		list_del ( &entry->list );
		list_add ( &entry->list, &ipoib->peers );

		/* Overwrite the stored MAC */
		memcpy ( &entry->mac, mac, sizeof ( entry->mac ) );
		return 0;
	}

	/* Otherwise allocate and populate a fresh entry */
	entry = malloc ( sizeof ( *entry ) );
	if ( entry == NULL )
		return -ENOMEM;
	memcpy ( &entry->mac, mac, sizeof ( entry->mac ) );
	memcpy ( &entry->remac, remac, sizeof ( entry->remac ) );

	/* Insert at the head of the list */
	list_add ( &entry->list, &ipoib->peers );
	return 0;
}

/**
 * Empty the REMAC cache
 *
 * @v ipoib		IPoIB device
 *
 * Every cache entry is unlinked from the device's peer list and
 * freed.
 */
static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
	struct ipoib_peer *entry;
	struct ipoib_peer *next;

	/* The "safe" iterator is required because each entry is freed
	 * while walking the list.
	 */
	list_for_each_entry_safe ( entry, next, &ipoib->peers, list ) {
		list_del ( &entry->list );
		free ( entry );
	}
}

/**
 * Release some REMAC cache entries
 *
 * @ret discarded	Number of cached items discarded
 *
 * At most one entry (the one at the tail of the peer list) is
 * released for each IPoIB network device.
 */
static unsigned int ipoib_discard_remac ( void ) {
	struct net_device *netdev;
	struct ipoib_device *ipoib;
	struct ipoib_peer *victim;
	unsigned int count = 0;

	/* Visit every network device, acting only on IPoIB devices */
	for_each_netdev ( netdev ) {
		if ( netdev->op == &ipoib_operations ) {
			ipoib = netdev->priv;

			/* Free the entry at the tail of the list (the
			 * least recently used one), if the list is
			 * non-empty.
			 */
			list_for_each_entry_reverse ( victim, &ipoib->peers,
						      list ) {
				list_del ( &victim->list );
				free ( victim );
				count++;
				break;
			}
		}
	}

	return count;
}

/** IPoIB cache discarder
 *
 * Registers ipoib_discard_remac() as a cache discarder in the
 * CACHE_EXPENSIVE class.
 */
struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
	.discard = ipoib_discard_remac,
};

/****************************************************************************
 *
 * IPoIB link layer
 *
 ****************************************************************************
 */

/**
 * Derive the IPoIB link-layer address from the hardware address
 *
 * @v hw_addr		Hardware address
 * @v ll_addr		Link-layer address
 *
 * The hardware address is treated as eight GUID bytes.  Each bit of
 * IPOIB_GUID_MASK, most significant bit first, selects whether the
 * corresponding GUID byte is copied to the link-layer address.
 */
static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
	const uint8_t *src = hw_addr;
	uint8_t *dst = ll_addr;
	uint8_t mask = IPOIB_GUID_MASK;
	unsigned int index;

	/* Copy across only those GUID bytes selected by the mask */
	for ( index = 0 ; index < 8 ; index++ ) {
		if ( mask & ( 0x80 >> index ) )
			*(dst++) = src[index];
	}
}

/** IPoIB protocol
 *
 * The link layer presented to the network stack is Ethernet-shaped:
 * header push/pull, address formatting, multicast hashing and EUI-64
 * construction all use the Ethernet helpers, and the link-layer
 * address and header lengths are ETH_ALEN and ETH_HLEN.  The
 * hardware address is an Infiniband GUID, from which
 * ipoib_init_addr() derives the link-layer address.
 */
struct ll_protocol ipoib_protocol __ll_protocol = {
	.name		= "IPoIB",
	.ll_proto	= htons ( ARPHRD_ETHER ),
	.hw_addr_len	= sizeof ( union ib_guid ),
	.ll_addr_len	= ETH_ALEN,
	.ll_header_len	= ETH_HLEN,
	.push		= eth_push,
	.pull		= eth_pull,
	.init_addr	= ipoib_init_addr,
	.ntoa		= eth_ntoa,
	.mc_hash	= eth_mc_hash,
	.eth_addr	= eth_eth_addr,
	.eui64		= eth_eui64,
	.flags		= LL_NAME_ONLY,
};

/**
 * Allocate an IPoIB network device
 *
 * @v priv_size		Size of driver private data
 * @ret netdev		Network device, or NULL
 *
 * On success the device is configured with the IPoIB link-layer
 * protocol, the Ethernet broadcast address and the Infiniband maximum
 * payload size.
 */
struct net_device * alloc_ipoibdev ( size_t priv_size ) {
	struct net_device *netdev = alloc_netdev ( priv_size );

	/* Pass allocation failure straight back to the caller */
	if ( ! netdev )
		return NULL;

	/* Apply IPoIB link-layer parameters */
	netdev->ll_protocol = &ipoib_protocol;
	netdev->ll_broadcast = eth_broadcast;
	netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;

	return netdev;
}

/****************************************************************************
 *
 * IPoIB translation layer
 *
 ****************************************************************************
 */

/**
 * Translate transmitted ARP packet
 *
 * @v netdev		Network device
 * @v iobuf		Packet to be transmitted (with no link-layer headers)
 * @ret rc		Return status code
 *
 * Rewrites an ARP packet carrying eIPoIB (Ethernet-sized) link-layer
 * addresses into one carrying full IPoIB MAC addresses.  The packet
 * is expanded in place, so the I/O buffer must have enough tailroom.
 * Packets whose hardware address length is not ETH_ALEN are left
 * untouched.
 *
 * For ARP replies, the target hardware address (a REMAC) is looked
 * up in the REMAC cache and replaced by the corresponding IPoIB MAC;
 * the reply is rejected with -ENXIO_ARP_REPLY if no mapping exists.
 * For all other opcodes the target hardware address is zeroed.
 */
static int ipoib_translate_tx_arp ( struct net_device *netdev,
				    struct io_buffer *iobuf ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct arphdr *arphdr = iobuf->data;
	struct ipoib_mac *target_ha = NULL;
	void *sender_pa;
	void *target_pa;

	/* Do nothing unless ARP contains eIPoIB link-layer addresses */
	if ( arphdr->ar_hln != ETH_ALEN )
		return 0;

	/* Fail unless we have room to expand packet */
	if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
					      ETH_ALEN ) ) ) {
		DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
		       ipoib );
		return -ENOBUFS;
	}

	/* Look up REMAC, if applicable.  The opcode is held in
	 * network byte order (as are the other ARP header fields
	 * handled here), and the REMAC to be resolved is the target
	 * hardware address, not the target protocol address.
	 */
	if ( arphdr->ar_op == htons ( ARPOP_REPLY ) ) {
		target_ha = ipoib_find_remac ( ipoib, arp_target_ha ( arphdr ));
		if ( ! target_ha ) {
			DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
			       ipoib, eth_ntoa ( arp_target_ha ( arphdr ) ) );
			return -ENXIO_ARP_REPLY;
		}
	}

	/* Construct new packet.  The old protocol address locations
	 * are recorded before the hardware address length is changed.
	 * Since the packet is growing, the target protocol address
	 * must be moved before the sender protocol address so that
	 * nothing is overwritten before it has been copied.
	 */
	iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
	sender_pa = arp_sender_pa ( arphdr );
	target_pa = arp_target_pa ( arphdr );
	arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
	arphdr->ar_hln = sizeof ( ipoib->mac );
	memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
	memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
	memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
	memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
	if ( target_ha ) {
		memcpy ( arp_target_ha ( arphdr ), target_ha,
			 sizeof ( *target_ha ) );
	}

	return 0;
}

/**
 * Translate transmitted packet
 *
 * @v netdev		Network device
 * @v iobuf		Packet to be transmitted (with no link-layer headers)
 * @v net_proto		Network-layer protocol (in network byte order)
 * @ret rc		Return status code
 *
 * ARP packets are rewritten by ipoib_translate_tx_arp(), IPv4 packets
 * are passed through unmodified, and every other protocol is
 * rejected with -ENOTSUP.
 */
static int ipoib_translate_tx ( struct net_device *netdev,
				struct io_buffer *iobuf, uint16_t net_proto ) {

	/* ARP packets need their link-layer addresses rewritten */
	if ( net_proto == htons ( ETH_P_ARP ) )
		return ipoib_translate_tx_arp ( netdev, iobuf );

	/* IPv4 packets need no translation */
	if ( net_proto == htons ( ETH_P_IP ) )
		return 0;

	/* Cannot handle other traffic via eIPoIB */
	return -ENOTSUP;
}

/**
 * Translate received ARP packet
 *
 * @v netdev		Network device
 * @v iobuf		Received packet (with no link-layer headers)
 * @v remac		Constructed Remote Ethernet MAC
 * @ret rc		Return status code
 */
static int ipoib_translate_rx_arp ( struct net_device *netdev,
				    struct io_buffer *iobuf,
				    struct ipoib_remac *remac ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct arphdr *arphdr = iobuf->data;
	void *sender_pa;
	void *target_pa;
	int rc;

	/* Do nothing unless ARP contains IPoIB link-layer addresses */
	if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
		return 0;

	/* Create REMAC cache entry */
	if ( ( rc = ipoib_map_remac ( ipoib, remac,
				      arp_sender_ha ( arphdr ) ) ) != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
		       ipoib, strerror ( rc ) );
		return rc;
	}

	/* Construct new packet */
	sender_pa = arp_sender_pa ( arphdr );
	target_pa = arp_target_pa ( arphdr );
	arphdr->ar_hrd = htons ( ARPHRD_ETHER );
	arphdr->ar_hln = ETH_ALEN;
	memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
	memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
	memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
	memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
	if ( arphdr->ar_op == ARPOP_REPLY ) {
		/* Assume received replies were directed to us */
		memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
	}
	iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );

	return 0;
}

/**
 * Translate received packet
 *
 * @v netdev		Network device
 * @v iobuf		Received packet (with no link-layer headers)
 * @v remac		Constructed Remote Ethernet MAC
 * @v net_proto		Network-layer protocol (in network byte order)
 * @ret rc		Return status code
 *
 * ARP packets are rewritten by ipoib_translate_rx_arp(), IPv4 packets
 * are passed through unmodified, and every other protocol is
 * rejected with -ENOTSUP.
 */
static int ipoib_translate_rx ( struct net_device *netdev,
				struct io_buffer *iobuf,
				struct ipoib_remac *remac,
				uint16_t net_proto ) {

	/* ARP packets need their link-layer addresses rewritten */
	if ( net_proto == htons ( ETH_P_ARP ) )
		return ipoib_translate_rx_arp ( netdev, iobuf, remac );

	/* IPv4 packets need no translation */
	if ( net_proto == htons ( ETH_P_IP ) )
		return 0;

	/* Cannot handle other traffic via eIPoIB */
	return -ENOTSUP;
}

/****************************************************************************
 *
 * IPoIB network device
 *
 ****************************************************************************
 */

/**
 * Transmit packet via IPoIB network device
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @ret rc		Return status code
 *
 * The packet arrives with an Ethernet-style (eIPoIB) header whose
 * destination address is interpreted as a REMAC.  That header is
 * stripped, the destination is resolved to an Infiniband address
 * vector, the payload is translated if necessary, a real IPoIB
 * header is prepended and the packet is posted to the queue pair.
 *
 * Failure cases visible in this function:
 *  - -EINVAL if the buffer is shorter than an Ethernet header
 *  - -ENETUNREACH if the Infiniband link is not up
 *  - the ib_resolve_path() status if the path is not yet resolved
 *  - -ENXIO_NON_IPV4 if the REMAC is unknown and the packet is not a
 *    sufficiently long IPv4 packet
 *  - -ENXIO_ARP_SENT if the REMAC is unknown and an ARP request was
 *    issued instead of sending the packet
 *  - any error from arp_tx_request(), ipoib_translate_tx() or
 *    ib_post_send()
 */
static int ipoib_transmit ( struct net_device *netdev,
			    struct io_buffer *iobuf ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct ib_device *ibdev = ipoib->ibdev;
	struct ethhdr *ethhdr;
	struct iphdr *iphdr;
	struct ipoib_hdr *ipoib_hdr;
	struct ipoib_remac *remac;
	struct ipoib_mac *mac;
	struct ib_address_vector *dest;
	struct ib_address_vector av;
	uint16_t net_proto;
	int rc;

	/* Sanity check */
	if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
		DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
		return -EINVAL;
	}

	/* Attempting transmission while link is down will put the
	 * queue pair into an error state, so don't try it.
	 */
	if ( ! ib_link_ok ( ibdev ) )
		return -ENETUNREACH;

	/* Strip eIPoIB header.  The header pointers are captured
	 * before the pull and are still dereferenced afterwards
	 * (remac, and ethhdr->h_dest in the debug messages below).
	 */
	ethhdr = iobuf->data;
	remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
	net_proto = ethhdr->h_protocol;
	iob_pull ( iobuf, sizeof ( *ethhdr ) );

	/* Identify destination address */
	if ( is_multicast_ether_addr ( remac ) ) {

		/* Transmit multicasts as broadcasts, for simplicity */
		dest = &ipoib->broadcast.av;

	} else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {

		/* Construct address vector from IPoIB MAC.  The QPN
		 * is the masked low part of the MAC's combined
		 * flags/QPN field; the queue key is taken from the
		 * broadcast address vector.
		 */
		dest = &av;
		memset ( dest, 0, sizeof ( *dest ) );
		dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
		dest->qkey = ipoib->broadcast.av.qkey;
		dest->gid_present = 1;
		memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
		if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
			/* Path not resolved yet */
			return rc;
		}

	} else {

		/* Generate a new ARP request (if possible) to trigger
		 * population of the REMAC cache entry.
		 *
		 * This is possible only for IPv4 packets long enough
		 * to contain an IPv4 header, since the ARP request is
		 * built from that header's addresses.
		 */
		if ( ( net_proto != htons ( ETH_P_IP ) ) ||
		     ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
			DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
			       "packet type %04x\n", ipoib,
			       eth_ntoa ( ethhdr->h_dest ),
			       ntohs ( net_proto ) );
			return -ENXIO_NON_IPV4;
		}
		iphdr = iobuf->data;
		/* NOTE(review): each debug message below is split so
		 * that inet_ntoa() is called only once per DBGC() -
		 * presumably because it returns a shared static
		 * buffer; confirm.
		 */
		if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
					     &iphdr->dest, &iphdr->src ) ) !=0){
			DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
			       ipoib, eth_ntoa ( ethhdr->h_dest ),
			       inet_ntoa ( iphdr->dest ) );
			DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
			       strerror ( rc ) );
			return rc;
		}
		DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
		       eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
		DBGC  ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
		/* The original packet is not sent, even though the
		 * ARP request was issued successfully.
		 */
		return -ENXIO_ARP_SENT;
	}

	/* Translate packet if applicable */
	if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
		return rc;

	/* Prepend real IPoIB header */
	ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
	ipoib_hdr->proto = net_proto;
	ipoib_hdr->reserved = 0;

	/* Transmit packet */
	return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
}

/**
 * Handle IPoIB send completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v iobuf		I/O buffer
 * @v rc		Completion status code
 *
 * Reports the completion, together with its status, to the network
 * device that owns the queue pair.
 */
static void ipoib_complete_send ( struct ib_device *ibdev __unused,
				  struct ib_queue_pair *qp,
				  struct io_buffer *iobuf, int rc ) {
	struct ipoib_device *ipoib;
	struct net_device *netdev;

	/* Recover the owning IPoIB device and its network device */
	ipoib = ib_qp_get_ownerdata ( qp );
	netdev = ipoib->netdev;

	/* Hand the completed buffer back to the network device */
	netdev_tx_complete_err ( netdev, iobuf, rc );
}

/**
 * Handle IPoIB receive completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v dest		Destination address vector, or NULL
 * @v source		Source address vector, or NULL
 * @v iobuf		I/O buffer
 * @v rc		Completion status code
 *
 * Replaces the real IPoIB header with an eIPoIB Ethernet
 * pseudo-header and hands the packet to the network stack.  The
 * source MAC is synthesised from the sender's QPN and LID.  Packets
 * that completed in error, are too short to hold an IPoIB header,
 * lack a source address vector, or fail translation are reported via
 * netdev_rx_err() instead of being delivered.
 *
 * A missing destination address vector is tolerated: the packet is
 * then assumed to be addressed to the local Ethernet MAC.
 */
static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
				  struct ib_queue_pair *qp,
				  struct ib_address_vector *dest,
				  struct ib_address_vector *source,
				  struct io_buffer *iobuf, int rc ) {
	struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
	struct net_device *netdev = ipoib->netdev;
	struct ipoib_hdr *ipoib_hdr;
	struct ethhdr *ethhdr;
	struct ipoib_remac remac;
	uint16_t net_proto;

	/* Record errors */
	if ( rc != 0 ) {
		netdev_rx_err ( netdev, iobuf, rc );
		return;
	}

	/* Sanity check */
	if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
		DBGC ( ipoib, "IPoIB %p received packet too short to "
		       "contain IPoIB header\n", ipoib );
		DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
		netdev_rx_err ( netdev, iobuf, -EIO );
		return;
	}
	if ( ! source ) {
		DBGC ( ipoib, "IPoIB %p received packet without address "
		       "vector\n", ipoib );
		netdev_rx_err ( netdev, iobuf, -ENOTTY );
		return;
	}

	/* Strip real IPoIB header */
	ipoib_hdr = iobuf->data;
	net_proto = ipoib_hdr->proto;
	iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );

	/* Construct source address from remote QPN and LID */
	remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
	remac.lid = htons ( source->lid );

	/* Translate packet if applicable */
	if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
					 net_proto ) ) != 0 ) {
		netdev_rx_err ( netdev, iobuf, rc );
		return;
	}

	/* Prepend eIPoIB header */
	ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
	memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
	ethhdr->h_protocol = net_proto;

	/* Construct destination address
	 *
	 * The destination address vector is optional, so it must be
	 * checked before being dereferenced.
	 */
	if ( dest && dest->gid_present && IB_GID_MULTICAST ( &dest->gid ) ) {
		/* Multicast GID: use the Ethernet broadcast address */
		memcpy ( &ethhdr->h_dest, eth_broadcast,
			 sizeof ( ethhdr->h_dest ) );
	} else {
		/* Assume destination address is local Ethernet MAC */
		memcpy ( &ethhdr->h_dest, netdev->ll_addr,
			 sizeof ( ethhdr->h_dest ) );
	}

	/* Hand off to network layer */
	netdev_rx ( netdev, iobuf );
}

/**
 * IPoIB completion operations
 *
 * Send and receive completion handlers.  This table is passed to
 * ib_create_cq() when ipoib_open() creates the completion queue.
 */
static struct ib_completion_queue_operations ipoib_cq_op = {
	.complete_send = ipoib_complete_send,
	.complete_recv = ipoib_complete_recv,
};

/**
 * Allocate IPoIB receive I/O buffer
 *
 * @v len		Length of buffer
 * @ret iobuf		I/O buffer, or NULL
 *
 * Certain Infiniband adapters insist on 2kB-aligned receive buffers
 * and cannot be told to stop separating headers.  As a consequence
 * only the four-byte real IPoIB header precedes the payload, which
 * leaves too little room to insert the larger eIPoIB link-layer
 * pseudo-header in its place.
 *
 * The buffer is therefore allocated so that it begins a little ahead
 * of the natural alignment boundary, and the extra leading bytes are
 * reserved as headroom.
 */
static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
	/* Extra headroom: size difference between the two headers */
	const size_t headroom = ( sizeof ( struct ethhdr ) -
				  sizeof ( struct ipoib_hdr ) );
	struct io_buffer *iobuf;

	/* Allocate buffer starting "headroom" bytes before alignment */
	iobuf = alloc_iob_raw ( ( len + headroom ), len, -headroom );
	if ( ! iobuf )
		return NULL;

	/* Skip over the headroom so that data starts aligned */
	iob_reserve ( iobuf, headroom );
	return iobuf;
}

/**
 * IPoIB queue pair operations
 *
 * Supplies the receive buffer allocator.  This table is passed to
 * ib_create_qp() when ipoib_open() creates the queue pair.
 */
static struct ib_queue_pair_operations ipoib_qp_op = {
	.alloc_iob = ipoib_alloc_iob,
};

/**
 * Poll IPoIB network device
 *
 * @v netdev		Network device
 *
 * Polls the event queue of the underlying Infiniband device, then
 * runs the retry timers.
 */
static void ipoib_poll ( struct net_device *netdev ) {
	struct ipoib_device *ipoib = netdev->priv;

	/* Service the Infiniband device's event queue */
	ib_poll_eq ( ipoib->ibdev );

	/* Run the retry timers (needed for the IPoIB multicast join) */
	retry_poll();
}

/**
 * Handle IPv4 broadcast multicast group join completion
 *
 * @v membership	Multicast group membership
 * @v rc		Status code
 *
 * The outcome of the join is recorded as the link status of the
 * network device.
 */
void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
	struct ipoib_device *ipoib;

	/* The membership is embedded within the IPoIB device */
	ipoib = container_of ( membership, struct ipoib_device,
			       broadcast.membership );

	/* Link status mirrors the join result */
	netdev_link_err ( ipoib->netdev, rc );
}

/**
 * Join IPv4 broadcast multicast group
 *
 * @v ipoib		IPoIB device
 * @ret rc		Return status code
 *
 * Starts a multicast join for the device's broadcast address vector,
 * with ipoib_join_complete() as the completion handler.
 */
static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
	int rc;

	/* Initiate the join */
	rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
			     &ipoib->broadcast.membership,
			     &ipoib->broadcast.av, 0, ipoib_join_complete );
	if ( rc == 0 )
		return 0;

	/* Join could not be started */
	DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
	       ipoib, strerror ( rc ) );
	return rc;
}

/**
 * Leave IPv4 broadcast multicast group
 *
 * @v ipoib		IPoIB device
 */
static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
	struct ib_mc_membership *membership = &ipoib->broadcast.membership;

	/* Drop the broadcast group membership */
	ib_mcast_leave ( ipoib->ibdev, ipoib->qp, membership );
}

/**
 * Handle link status change
 *
 * @v ipoib		IPoIB device
 *
 * Leaves any existing broadcast group, refreshes the addresses that
 * depend on the GID prefix and partition key, updates the network
 * device link state, and (if the Infiniband device is open with link
 * up and a queue pair exists) starts a new broadcast group join.
 */
static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
	struct net_device *netdev = ipoib->netdev;
	struct ib_device *ibdev = ipoib->ibdev;
	int rc;

	/* Drop membership of the old broadcast group, if we have a QP */
	if ( ipoib->qp ) {
		ipoib_leave_broadcast_group ( ipoib );
	}

	/* The GID prefix may have changed: refresh our MAC address */
	memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
		 sizeof ( ipoib->mac.gid.s.prefix ) );

	/* The partition key may have changed: refresh broadcast MAC GID */
	ipoib->broadcast.mac.gid.words[2] =
		htons ( ibdev->pkey | IB_PKEY_FULL );

	/* Rebuild the broadcast address vector from the broadcast MAC */
	memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
	memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
		 sizeof ( ipoib->broadcast.av.gid ) );
	ipoib->broadcast.av.gid_present = 1;
	ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;

	/* Mirror the Infiniband link state; an up link is "joining" */
	rc = ib_link_rc ( ibdev );
	if ( rc == 0 )
		rc = -EINPROGRESS_JOINING;
	netdev_link_err ( netdev, rc );

	/* A join is possible only with an open device, link up and a QP */
	if ( ! ib_is_open ( ibdev ) )
		return;
	if ( ! ib_link_ok ( ibdev ) )
		return;
	if ( ! ipoib->qp )
		return;

	/* Join the new broadcast group */
	rc = ipoib_join_broadcast_group ( ipoib );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
		       "%s\n", ipoib, strerror ( rc ) );
		netdev_link_err ( netdev, rc );
	}
}

/**
 * Open IPoIB network device
 *
 * @v netdev		Network device
 * @ret rc		Return status code
 *
 * Opens the Infiniband device, creates the completion queue and queue
 * pair, records the QPN in the MAC address, fills the receive ring
 * and triggers a broadcast group join.  On failure, everything that
 * was set up is torn down again in reverse order.
 */
static int ipoib_open ( struct net_device *netdev ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct ib_device *ibdev = ipoib->ibdev;
	int rc;

	/* Bring up the underlying Infiniband device */
	rc = ib_open ( ibdev );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
		       ipoib, strerror ( rc ) );
		goto fail_open;
	}

	/* Create the completion queue */
	rc = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op,
			    &ipoib->cq );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not create completion queue: "
		       "%s\n", ipoib, strerror ( rc ) );
		goto fail_cq;
	}

	/* Create the queue pair; send and receive share the one CQ */
	rc = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
			    ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
			    &ipoib_qp_op, netdev->name, &ipoib->qp );
	if ( rc != 0 ) {
		DBGC ( ipoib, "IPoIB %p could not create queue pair: %s\n",
		       ipoib, strerror ( rc ) );
		goto fail_qp;
	}
	ib_qp_set_ownerdata ( ipoib->qp, ipoib );

	/* Record the QPN in our MAC address */
	ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );

	/* Populate the receive ring */
	ib_refill_recv ( ibdev, ipoib->qp );

	/* Pretend the link state changed, to join the broadcast group */
	ipoib_link_state_changed ( ipoib );

	return 0;

	/* Unwind in reverse order of construction */
	ib_destroy_qp ( ibdev, ipoib->qp );
 fail_qp:
	ib_destroy_cq ( ibdev, ipoib->cq );
 fail_cq:
	ib_close ( ibdev );
 fail_open:
	return rc;
}

/**
 * Close IPoIB network device
 *
 * @v netdev		Network device
 *
 * Flushes the REMAC cache, leaves the broadcast group, destroys the
 * queue pair and completion queue, and closes the Infiniband device.
 */
static void ipoib_close ( struct net_device *netdev ) {
	struct ipoib_device *ipoib = netdev->priv;
	struct ib_device *ibdev = ipoib->ibdev;

	/* Discard all cached REMAC entries */
	ipoib_flush_remac ( ipoib );

	/* Drop broadcast group membership */
	ipoib_leave_broadcast_group ( ipoib );

	/* The QPN is about to become invalid: clear it from the MAC */
	ipoib->mac.flags__qpn = 0;

	/* Destroy the queue pair, then the completion queue it uses */
	ib_destroy_qp ( ibdev, ipoib->qp );
	ipoib->qp = NULL;

	ib_destroy_cq ( ibdev, ipoib->cq );
	ipoib->cq = NULL;

	/* Finally, close the Infiniband device */
	ib_close ( ibdev );
}

/**
 * IPoIB network device operations
 *
 * Open, close, transmit and poll handlers.  This table is installed
 * on each network device by netdev_init() in ipoib_probe().
 */
static struct net_device_operations ipoib_operations = {
	.open		= ipoib_open,
	.close		= ipoib_close,
	.transmit	= ipoib_transmit,
	.poll		= ipoib_poll,
};

/**
 * Probe IPoIB device
 *
 * @v ibdev		Infiniband device
 * @ret rc		Return status code
 *
 * Allocates and initialises an IPoIB network device for the given
 * Infiniband device, adds it to the list of IPoIB devices and
 * registers it.  If registration fails, the device is removed from
 * the list and released again.
 */
static int ipoib_probe ( struct ib_device *ibdev ) {
	struct ipoib_device *ipoib;
	struct net_device *netdev;
	int rc;

	/* Allocate the network device with room for our private data */
	netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
	if ( netdev == NULL )
		return -ENOMEM;

	/* Initialise the network device and the IPoIB private data */
	netdev_init ( netdev, &ipoib_operations );
	ipoib = netdev->priv;
	netdev->dev = ibdev->dev;
	memset ( ipoib, 0, sizeof ( *ipoib ) );
	ipoib->netdev = netdev;
	ipoib->ibdev = ibdev;
	INIT_LIST_HEAD ( &ipoib->peers );

	/* Hardware address is the GUID; link-layer address is the LEMAC */
	memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
		 sizeof ( ibdev->gid.s.guid ) );
	memcpy ( netdev->ll_addr, ibdev->lemac, ETH_ALEN );

	/* Local IPoIB MAC address carries the GUID */
	memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
		 sizeof ( ipoib->mac.gid.s.guid ) );

	/* Start from the default broadcast MAC address */
	memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
		 sizeof ( ipoib->broadcast.mac ) );

	/* Track this device in the list of IPoIB devices */
	list_add_tail ( &ipoib->list, &ipoib_devices );

	/* Register the network device */
	rc = register_netdev ( netdev );
	if ( rc == 0 )
		return 0;

	/* Registration failed: undo the list insertion and free */
	list_del ( &ipoib->list );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
	return rc;
}

/**
 * Handle device or link status change
 *
 * @v ibdev		Infiniband device
 *
 * Every IPoIB device attached to the given Infiniband device is told
 * that its link state has changed.
 */
static void ipoib_notify ( struct ib_device *ibdev ) {
	struct ipoib_device *ipoib;

	/* Propagate the change to each IPoIB device on this IB device */
	list_for_each_entry ( ipoib, &ipoib_devices, list ) {
		if ( ipoib->ibdev == ibdev )
			ipoib_link_state_changed ( ipoib );
	}
}

/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 *
 * Unregisters and releases every IPoIB device attached to the given
 * Infiniband device.
 */
static void ipoib_remove ( struct ib_device *ibdev ) {
	struct ipoib_device *ipoib;
	struct ipoib_device *next;
	struct net_device *netdev;

	/* Entries are deleted while iterating, hence the "safe" walk */
	list_for_each_entry_safe ( ipoib, next, &ipoib_devices, list ) {
		if ( ipoib->ibdev == ibdev ) {
			netdev = ipoib->netdev;
			unregister_netdev ( netdev );
			list_del ( &ipoib->list );
			netdev_nullify ( netdev );
			netdev_put ( netdev );
		}
	}
}

/**
 * IPoIB driver
 *
 * Supplies the probe, notify and remove handlers defined in this
 * file to the Infiniband driver framework.
 */
struct ib_driver ipoib_driver __ib_driver = {
	.name = "IPoIB",
	.probe = ipoib_probe,
	.notify = ipoib_notify,
	.remove = ipoib_remove,
};

/**
 * Find IPoIB network device
 *
 * @v ibdev		Infiniband device
 * @ret netdev		IPoIB network device, or NULL if not found
 *
 * Returns the network device of the first entry in the list of IPoIB
 * devices that is attached to the given Infiniband device.
 */
struct net_device * ipoib_netdev ( struct ib_device *ibdev ) {
	struct ipoib_device *ipoib;

	/* Search the list of IPoIB devices for the first match */
	list_for_each_entry ( ipoib, &ipoib_devices, list ) {
		if ( ipoib->ibdev == ibdev )
			return ipoib->netdev;
	}

	/* No IPoIB device is attached to this Infiniband device */
	return NULL;
}