/* * Copyright (C) 2016 Michael Brown . * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. * * You can also choose to distribute this program under the terms of * the Unmodified Binary Distribution Licence (as given in the file * COPYING.UBDL), provided that you have satisfied its requirements. */ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); #include #include #include #include #include #include #include #include #include #include #include #include #include /** @file * * Ethernet over Infiniband * */ /** Number of EoIB send work queue entries */ #define EOIB_NUM_SEND_WQES 8 /** Number of EoIB receive work queue entries */ #define EOIB_NUM_RECV_WQES 4 /** Number of EoIB completion queue entries */ #define EOIB_NUM_CQES 16 /** Link status for "broadcast join in progress" */ #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING ) #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \ ( EINFO_EINPROGRESS, 0x01, "Joining" ) /** Human-readable message for the link status */ struct errortab eoib_errors[] __errortab = { __einfo_errortab ( EINFO_EINPROGRESS_JOINING ), }; /** List of EoIB devices */ static LIST_HEAD ( eoib_devices ); static struct net_device_operations eoib_operations; /**************************************************************************** * * EoIB peer cache * 
****************************************************************************
 */

/** An EoIB peer cache entry */
struct eoib_peer {
	/** List of EoIB peer cache entries */
	struct list_head list;
	/** Ethernet MAC */
	uint8_t mac[ETH_ALEN];
	/** Infiniband address vector */
	struct ib_address_vector av;
};

/**
 * Find EoIB peer cache entry
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @ret peer		EoIB peer, or NULL if not found
 *
 * A matching entry is moved to the start of the peer list before
 * being returned, so the list stays ordered from most to least
 * recently looked up.
 */
static struct eoib_peer * eoib_find_peer ( struct eoib_device *eoib,
					   const uint8_t *mac ) {
	struct eoib_peer *peer;

	/* Find peer cache entry */
	list_for_each_entry ( peer, &eoib->peers, list ) {
		if ( memcmp ( mac, peer->mac, sizeof ( peer->mac ) ) == 0 ) {
			/* Move peer to start of list.  Modifying the
			 * list here is safe only because we return
			 * immediately and never resume the iteration.
			 */
			list_del ( &peer->list );
			list_add ( &peer->list, &eoib->peers );
			return peer;
		}
	}

	return NULL;
}

/**
 * Create EoIB peer cache entry
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @ret peer		EoIB peer, or NULL on error
 *
 * Only the MAC is filled in here; the address vector is populated
 * later by eoib_rx_av().
 */
static struct eoib_peer * eoib_create_peer ( struct eoib_device *eoib,
					     const uint8_t *mac ) {
	struct eoib_peer *peer;

	/* Allocate and initialise peer cache entry */
	peer = zalloc ( sizeof ( *peer ) );
	if ( peer ) {
		memcpy ( peer->mac, mac, sizeof ( peer->mac ) );
		list_add ( &peer->list, &eoib->peers );
	}
	return peer;
}

/**
 * Flush EoIB peer cache
 *
 * @v eoib		EoIB device
 *
 * Removes and frees every entry on the device's peer list.
 */
static void eoib_flush_peers ( struct eoib_device *eoib ) {
	struct eoib_peer *peer;
	struct eoib_peer *tmp;

	/* The "safe" iterator is required since each entry is freed
	 * while the list is being walked.
	 */
	list_for_each_entry_safe ( peer, tmp, &eoib->peers, list ) {
		list_del ( &peer->list );
		free ( peer );
	}
}

/**
 * Discard some entries from the peer cache
 *
 * @ret discarded	Number of cached items discarded
 *
 * At most one entry is discarded per EoIB device on each call.
 */
static unsigned int eoib_discard ( void ) {
	struct net_device *netdev;
	struct eoib_device *eoib;
	struct eoib_peer *peer;
	unsigned int discarded = 0;

	/* Try to discard one cache entry for each EoIB device */
	for_each_netdev ( netdev ) {

		/* Skip non-EoIB devices (identified by their
		 * operations table, since netdev->priv is only an
		 * EoIB device for our own network devices).
		 */
		if ( netdev->op != &eoib_operations )
			continue;
		eoib = netdev->priv;

		/* Discard least recently used cache entry (if any).
		 * The loop body runs at most once: it is used only to
		 * pick the last list entry when the list is non-empty.
		 */
		list_for_each_entry_reverse ( peer, &eoib->peers, list ) {
			list_del ( &peer->list );
			free ( peer );
			discarded++;
			break;
		}
	}

	return discarded;
}

/** EoIB cache discarder */
struct cache_discarder eoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
	.discard = eoib_discard,
};

/**
 * Find destination address vector
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @ret av		Address vector, or NULL to send as broadcast
 *
 * A NULL return is not an error: it tells the caller to fall back to
 * the broadcast address vector.
 */
static struct ib_address_vector * eoib_tx_av ( struct eoib_device *eoib,
					       const uint8_t *mac ) {
	struct ib_device *ibdev = eoib->ibdev;
	struct eoib_peer *peer;
	int rc;

	/* If this is a broadcast or multicast MAC address, then send
	 * this packet as a broadcast.
	 */
	if ( is_multicast_ether_addr ( mac ) ) {
		DBGCP ( eoib, "EoIB %s %s TX multicast\n",
			eoib->name, eth_ntoa ( mac ) );
		return NULL;
	}

	/* If we have no peer cache entry, then create one and send
	 * this packet as a broadcast.  The result of the creation is
	 * deliberately not checked: on allocation failure we simply
	 * try again on the next transmission to this MAC.
	 */
	peer = eoib_find_peer ( eoib, mac );
	if ( ! peer ) {
		DBGC ( eoib, "EoIB %s %s TX unknown\n",
		       eoib->name, eth_ntoa ( mac ) );
		eoib_create_peer ( eoib, mac );
		return NULL;
	}

	/* If we have not yet recorded a received GID and QPN for this
	 * peer cache entry, then send this packet as a broadcast.
	 */
	if ( ! peer->av.gid_present ) {
		DBGCP ( eoib, "EoIB %s %s TX not yet recorded\n",
			eoib->name, eth_ntoa ( mac ) );
		return NULL;
	}

	/* If we have not yet resolved a path to this peer, then send
	 * this packet as a broadcast.  (The status code is only
	 * tested, never reported.)
	 */
	if ( ( rc = ib_resolve_path ( ibdev, &peer->av ) ) != 0 ) {
		DBGCP ( eoib, "EoIB %s %s TX not yet resolved\n",
			eoib->name, eth_ntoa ( mac ) );
		return NULL;
	}

	/* Force use of GRH even for local destinations */
	peer->av.gid_present = 1;

	/* We have a fully resolved peer: send this packet as a
	 * unicast.
	 */
	DBGCP ( eoib, "EoIB %s %s TX " IB_GID_FMT " QPN %#lx\n",
		eoib->name, eth_ntoa ( mac ), IB_GID_ARGS ( &peer->av.gid ),
		peer->av.qpn );
	return &peer->av;
}

/**
 * Record source address vector
 *
 * @v eoib		EoIB device
 * @v mac		Ethernet MAC
 * @v av		Source address vector of the received packet
 *
 * Only existing peer cache entries are updated.  Within this file,
 * entries are created solely by eoib_tx_av(), so packets from MACs
 * that we have never transmitted to are ignored here.
 */
static void eoib_rx_av ( struct eoib_device *eoib, const uint8_t *mac,
			 const struct ib_address_vector *av ) {
	const union ib_gid *gid = &av->gid;
	unsigned long qpn = av->qpn;
	struct eoib_peer *peer;

	/* Sanity checks */
	if ( ! av->gid_present ) {
		DBGC ( eoib, "EoIB %s %s RX with no GID\n",
		       eoib->name, eth_ntoa ( mac ) );
		return;
	}

	/* Find peer cache entry (if any) */
	peer = eoib_find_peer ( eoib, mac );
	if ( ! peer ) {
		DBGCP ( eoib, "EoIB %s %s RX " IB_GID_FMT " (ignored)\n",
			eoib->name, eth_ntoa ( mac ), IB_GID_ARGS ( gid ) );
		return;
	}

	/* Some dubious EoIB implementations utilise an Ethernet-to-
	 * EoIB gateway that will send packets from the wrong QPN.
	 * For packets whose source GID is the gateway's, record the
	 * configured gateway QPN instead of the received one.
	 */
	if ( eoib_has_gateway ( eoib ) &&
	     ( memcmp ( gid, &eoib->gateway.gid, sizeof ( *gid ) ) == 0 ) ) {
		qpn = eoib->gateway.qpn;
	}

	/* Do nothing if peer cache entry is complete and correct */
	if ( ( peer->av.lid == av->lid ) && ( peer->av.qpn == qpn ) ) {
		DBGCP ( eoib, "EoIB %s %s RX unchanged\n",
			eoib->name, eth_ntoa ( mac ) );
		return;
	}

	/* Update peer cache entry.  Note that the LID is not copied
	 * here; only the QPN, queue key and GID are recorded.
	 */
	peer->av.qpn = qpn;
	peer->av.qkey = eoib->broadcast.qkey;
	peer->av.gid_present = 1;
	memcpy ( &peer->av.gid, gid, sizeof ( peer->av.gid ) );
	DBGC ( eoib, "EoIB %s %s RX " IB_GID_FMT " QPN %#lx\n",
	       eoib->name, eth_ntoa ( mac ), IB_GID_ARGS ( &peer->av.gid ),
	       peer->av.qpn );
}

/****************************************************************************
 *
 * EoIB network device
 *
 ****************************************************************************
 */

/**
 * Transmit packet via EoIB network device
 *
 * @v netdev		Network device
 * @v iobuf		I/O buffer
 * @ret rc		Return status code
 *
 * The buffer must already start with an Ethernet header and must have
 * headroom for the EoIB header (both are asserted).
 */
static int eoib_transmit ( struct net_device *netdev,
			   struct io_buffer *iobuf ) {
	struct eoib_device *eoib = netdev->priv;
	struct eoib_header *eoib_hdr;
	struct ethhdr *ethhdr;
	struct ib_address_vector *av;
	size_t zlen;

	/* Sanity checks */
	assert ( iob_len ( iobuf ) >= sizeof ( *ethhdr ) );
	assert ( iob_headroom ( iobuf ) >= sizeof ( *eoib_hdr ) );

	/* Look up destination address vector (must happen before the
	 * EoIB header is prepended, while the buffer still starts
	 * with the Ethernet header).
	 */
	ethhdr = iobuf->data;
	av = eoib_tx_av ( eoib, ethhdr->h_dest );

	/* Prepend EoIB header */
	eoib_hdr = iob_push ( iobuf, sizeof ( *eoib_hdr ) );
	eoib_hdr->magic = htons ( EOIB_MAGIC );
	eoib_hdr->reserved = 0;

	/* Pad buffer to minimum Ethernet frame size */
	zlen = ( sizeof ( *eoib_hdr ) + ETH_ZLEN );
	assert ( zlen <= IOB_ZLEN );
	if ( iob_len ( iobuf ) < zlen )
		iob_pad ( iobuf, zlen );

	/* If we have no unicast address then send as a broadcast,
	 * with a duplicate sent to the gateway if applicable.
	 */
	if ( ! av ) {
		av = &eoib->broadcast;
		if ( eoib_has_gateway ( eoib ) )
			eoib->duplicate ( eoib, iobuf );
	}

	/* Post send work queue entry */
	return ib_post_send ( eoib->ibdev, eoib->qp, av, iobuf );
}

/**
 * Handle EoIB send completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v iobuf		I/O buffer
 * @v rc		Completion status code
 */
static void eoib_complete_send ( struct ib_device *ibdev __unused,
				 struct ib_queue_pair *qp,
				 struct io_buffer *iobuf, int rc ) {
	/* Owner data is set to the EoIB device in eoib_open() */
	struct eoib_device *eoib = ib_qp_get_ownerdata ( qp );

	netdev_tx_complete_err ( eoib->netdev, iobuf, rc );
}

/**
 * Handle EoIB receive completion
 *
 * @v ibdev		Infiniband device
 * @v qp		Queue pair
 * @v dest		Destination address vector, or NULL
 * @v source		Source address vector, or NULL
 * @v iobuf		I/O buffer
 * @v rc		Completion status code
 *
 * On every path the I/O buffer is handed on, either to
 * netdev_rx_err() or to netdev_rx().
 */
static void eoib_complete_recv ( struct ib_device *ibdev __unused,
				 struct ib_queue_pair *qp,
				 struct ib_address_vector *dest __unused,
				 struct ib_address_vector *source,
				 struct io_buffer *iobuf, int rc ) {
	struct eoib_device *eoib = ib_qp_get_ownerdata ( qp );
	struct net_device *netdev = eoib->netdev;
	struct eoib_header *eoib_hdr;
	struct ethhdr *ethhdr;

	/* Record errors */
	if ( rc != 0 ) {
		netdev_rx_err ( netdev, iobuf, rc );
		return;
	}

	/* Sanity check */
	if ( iob_len ( iobuf ) < ( sizeof ( *eoib_hdr ) +
				   sizeof ( *ethhdr ) )){
		DBGC ( eoib, "EoIB %s received packet too short to "
		       "contain EoIB and Ethernet headers\n",
		       eoib->name );
		DBGC_HD ( eoib, iobuf->data, iob_len ( iobuf ) );
		netdev_rx_err ( netdev, iobuf, -EIO );
		return;
	}
	if ( ! source ) {
		DBGC ( eoib, "EoIB %s received packet without address "
		       "vector\n", eoib->name );
		netdev_rx_err ( netdev, iobuf, -ENOTTY );
		return;
	}

	/* Strip EoIB header */
	iob_pull ( iobuf, sizeof ( *eoib_hdr ) );

	/* Update neighbour cache entry, if any */
	ethhdr = iobuf->data;
	eoib_rx_av ( eoib, ethhdr->h_source, source );

	/* Hand off to network layer */
	netdev_rx ( netdev, iobuf );
}

/** EoIB completion operations */
static struct ib_completion_queue_operations eoib_cq_op = {
	.complete_send = eoib_complete_send,
	.complete_recv = eoib_complete_recv,
};

/** EoIB queue pair operations */
static struct ib_queue_pair_operations eoib_qp_op = {
	.alloc_iob = alloc_iob,
};

/**
 * Poll EoIB network device
 *
 * @v netdev		Network device
 */
static void eoib_poll ( struct net_device *netdev ) {
	struct eoib_device *eoib = netdev->priv;
	struct ib_device *ibdev = eoib->ibdev;

	/* Poll Infiniband device */
	ib_poll_eq ( ibdev );

	/* Poll the retry timers (required for EoIB multicast join) */
	retry_poll();
}

/**
 * Handle EoIB broadcast multicast group join completion
 *
 * @v membership	Multicast group membership
 * @v rc		Status code
 */
static void eoib_join_complete ( struct ib_mc_membership *membership,
				 int rc ) {
	/* The membership structure is embedded in the EoIB device */
	struct eoib_device *eoib =
		container_of ( membership, struct eoib_device, membership );

	/* Record join status as link status */
	netdev_link_err ( eoib->netdev, rc );
}

/**
 * Join EoIB broadcast multicast group
 *
 * @v eoib		EoIB device
 * @ret rc		Return status code
 *
 * eoib_join_complete() is registered as the join completion handler.
 */
static int eoib_join_broadcast_group ( struct eoib_device *eoib ) {
	int rc;

	/* Join multicast group */
	if ( ( rc = ib_mcast_join ( eoib->ibdev, eoib->qp,
				    &eoib->membership, &eoib->broadcast,
				    eoib->mask, eoib_join_complete ) ) != 0 ) {
		DBGC ( eoib, "EoIB %s could not join broadcast group: %s\n",
		       eoib->name, strerror ( rc ) );
		return rc;
	}

	return 0;
}

/**
 * Leave EoIB broadcast multicast group
 *
 * @v eoib		EoIB device
 */
static void eoib_leave_broadcast_group ( struct eoib_device *eoib ) {

	/* Leave multicast group */
	ib_mcast_leave ( eoib->ibdev, eoib->qp, &eoib->membership );
}

/**
 * Handle link status change
 *
 * @v eoib		EoIB device
 *
 * Also called directly from eoib_open() to trigger the initial join.
 */
static void eoib_link_state_changed ( struct eoib_device *eoib ) {
	struct net_device *netdev = eoib->netdev;
	struct ib_device *ibdev = eoib->ibdev;
	int rc;

	/* Leave existing broadcast group (the queue pair exists only
	 * while the network device is open).
	 */
	if ( eoib->qp )
		eoib_leave_broadcast_group ( eoib );

	/* Update broadcast GID based on potentially-new partition key */
	eoib->broadcast.gid.words[2] = htons ( ibdev->pkey | IB_PKEY_FULL );

	/* Set net device link state to reflect Infiniband link state.
	 * When the Infiniband link itself is up, report "joining"
	 * until eoib_join_complete() records the real join status.
	 */
	rc = ib_link_rc ( ibdev );
	netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );

	/* Join new broadcast group */
	if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && eoib->qp &&
	     ( ( rc = eoib_join_broadcast_group ( eoib ) ) != 0 ) ) {
		DBGC ( eoib, "EoIB %s could not rejoin broadcast group: "
		       "%s\n", eoib->name, strerror ( rc ) );
		netdev_link_err ( netdev, rc );
		return;
	}
}

/**
 * Open EoIB network device
 *
 * @v netdev		Network device
 * @ret rc		Return status code
 *
 * A failure to join the broadcast group is not reported through the
 * return value; it is recorded as the link status by
 * eoib_link_state_changed().
 */
static int eoib_open ( struct net_device *netdev ) {
	struct eoib_device *eoib = netdev->priv;
	struct ib_device *ibdev = eoib->ibdev;
	int rc;

	/* Open IB device */
	if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
		DBGC ( eoib, "EoIB %s could not open %s: %s\n",
		       eoib->name, ibdev->name, strerror ( rc ) );
		goto err_ib_open;
	}

	/* Allocate completion queue */
	if ( ( rc = ib_create_cq ( ibdev, EOIB_NUM_CQES, &eoib_cq_op,
				   &eoib->cq ) ) != 0 ) {
		DBGC ( eoib, "EoIB %s could not create completion queue: %s\n",
		       eoib->name, strerror ( rc ) );
		goto err_create_cq;
	}

	/* Allocate queue pair (the same completion queue serves both
	 * the send and the receive work queues).
	 */
	if ( ( rc = ib_create_qp ( ibdev, IB_QPT_UD, EOIB_NUM_SEND_WQES,
				   eoib->cq, EOIB_NUM_RECV_WQES, eoib->cq,
				   &eoib_qp_op, netdev->name,
				   &eoib->qp ) )!=0){
		DBGC ( eoib, "EoIB %s could not create queue pair: %s\n",
		       eoib->name, strerror ( rc ) );
		goto err_create_qp;
	}
	ib_qp_set_ownerdata ( eoib->qp, eoib );

	/* Fill receive rings */
	ib_refill_recv ( ibdev, eoib->qp );

	/* Fake a link status change to join the broadcast group */
	eoib_link_state_changed ( eoib );

	return 0;

	/* The two statements below cannot be reached by falling
	 * through from the return above; they document the first
	 * rung of the unwind ladder.
	 */
	ib_destroy_qp ( ibdev, eoib->qp );
	eoib->qp = NULL;
 err_create_qp:
	ib_destroy_cq ( ibdev, eoib->cq );
	eoib->cq = NULL;
 err_create_cq:
	ib_close ( ibdev );
 err_ib_open:
	return rc;
}

/**
 * Close EoIB network device
 *
 * @v netdev		Network device
 *
 * Tears down, in reverse order, everything set up by eoib_open().
 */
static void eoib_close ( struct net_device *netdev ) {
	struct eoib_device *eoib = netdev->priv;
	struct ib_device *ibdev = eoib->ibdev;

	/* Flush peer cache */
	eoib_flush_peers ( eoib );

	/* Leave broadcast group */
	eoib_leave_broadcast_group ( eoib );

	/* Tear down the queues.  The pointers are cleared since
	 * eoib_link_state_changed() tests eoib->qp to decide whether
	 * the device is open.
	 */
	ib_destroy_qp ( ibdev, eoib->qp );
	eoib->qp = NULL;
	ib_destroy_cq ( ibdev, eoib->cq );
	eoib->cq = NULL;

	/* Close IB device */
	ib_close ( ibdev );
}

/** EoIB network device operations */
static struct net_device_operations eoib_operations = {
	.open		= eoib_open,
	.close		= eoib_close,
	.transmit	= eoib_transmit,
	.poll		= eoib_poll,
};

/**
 * Create EoIB device
 *
 * @v ibdev		Infiniband device
 * @v hw_addr		Ethernet MAC
 * @v broadcast		Broadcast address vector
 * @v name		Interface name (or NULL to use default)
 * @ret rc		Return status code
 *
 * The broadcast address vector is copied, and a reference to the
 * Infiniband device is held until eoib_destroy().
 */
int eoib_create ( struct ib_device *ibdev, const uint8_t *hw_addr,
		  struct ib_address_vector *broadcast, const char *name ) {
	struct net_device *netdev;
	struct eoib_device *eoib;
	int rc;

	/* Allocate network device */
	netdev = alloc_etherdev ( sizeof ( *eoib ) );
	if ( ! netdev ) {
		rc = -ENOMEM;
		goto err_alloc;
	}
	netdev_init ( netdev, &eoib_operations );
	eoib = netdev->priv;
	netdev->dev = ibdev->dev;
	eoib->netdev = netdev;
	eoib->ibdev = ibdev_get ( ibdev );
	memcpy ( &eoib->broadcast, broadcast, sizeof ( eoib->broadcast ) );
	INIT_LIST_HEAD ( &eoib->peers );

	/* Set MAC address */
	memcpy ( netdev->hw_addr, hw_addr, ETH_ALEN );

	/* Set interface name, if applicable */
	if ( name )
		snprintf ( netdev->name, sizeof ( netdev->name ), "%s", name );
	eoib->name = netdev->name;

	/* Add to list of EoIB devices */
	list_add_tail ( &eoib->list, &eoib_devices );

	/* Register network device */
	if ( ( rc = register_netdev ( netdev ) ) != 0 )
		goto err_register;

	DBGC ( eoib, "EoIB %s created for %s MAC %s\n",
	       eoib->name, ibdev->name, eth_ntoa ( hw_addr ) );
	DBGC ( eoib, "EoIB %s broadcast GID " IB_GID_FMT "\n",
	       eoib->name, IB_GID_ARGS ( &broadcast->gid ) );
	return 0;

	/* Not reachable by fall-through; first rung of the unwind
	 * ladder.
	 */
	unregister_netdev ( netdev );
 err_register:
	list_del ( &eoib->list );
	ibdev_put ( ibdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
 err_alloc:
	return rc;
}

/**
 * Find EoIB device
 *
 * @v ibdev		Infiniband device
 * @v hw_addr		Original Ethernet MAC
 * @ret eoib		EoIB device, or NULL if no device matches
 *
 * The MAC is compared against the network device's hw_addr field.
 */
struct eoib_device * eoib_find ( struct ib_device *ibdev,
				 const uint8_t *hw_addr ) {
	struct eoib_device *eoib;

	list_for_each_entry ( eoib, &eoib_devices, list ) {
		if ( ( eoib->ibdev == ibdev ) &&
		     ( memcmp ( eoib->netdev->hw_addr, hw_addr,
				ETH_ALEN ) == 0 ) )
			return eoib;
	}
	return NULL;
}

/**
 * Remove EoIB device
 *
 * @v eoib		EoIB device
 *
 * Reverses eoib_create().  The EoIB device lives in the network
 * device's private data, so it must not be used after this returns.
 */
void eoib_destroy ( struct eoib_device *eoib ) {
	struct net_device *netdev = eoib->netdev;

	/* Unregister network device */
	unregister_netdev ( netdev );

	/* Remove from list of network devices */
	list_del ( &eoib->list );

	/* Drop reference to Infiniband device */
	ibdev_put ( eoib->ibdev );

	/* Free network device */
	DBGC ( eoib, "EoIB %s destroyed\n", eoib->name );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}

/**
 * Probe EoIB device
 *
 * @v ibdev		Infiniband device
 * @ret rc
Return status code */ static int eoib_probe ( struct ib_device *ibdev __unused ) { /* EoIB devices are not created automatically */ return 0; } /** * Handle device or link status change * * @v ibdev Infiniband device */ static void eoib_notify ( struct ib_device *ibdev ) { struct eoib_device *eoib; /* Handle link status change for any attached EoIB devices */ list_for_each_entry ( eoib, &eoib_devices, list ) { if ( eoib->ibdev != ibdev ) continue; eoib_link_state_changed ( eoib ); } } /** * Remove EoIB device * * @v ibdev Infiniband device */ static void eoib_remove ( struct ib_device *ibdev ) { struct eoib_device *eoib; struct eoib_device *tmp; /* Remove any attached EoIB devices */ list_for_each_entry_safe ( eoib, tmp, &eoib_devices, list ) { if ( eoib->ibdev != ibdev ) continue; eoib_destroy ( eoib ); } } /** EoIB driver */ struct ib_driver eoib_driver __ib_driver = { .name = "EoIB", .probe = eoib_probe, .notify = eoib_notify, .remove = eoib_remove, }; /**************************************************************************** * * EoIB heartbeat packets * **************************************************************************** */ /** * Silently ignore incoming EoIB heartbeat packets * * @v iobuf I/O buffer * @v netdev Network device * @v ll_source Link-layer source address * @v flags Packet flags * @ret rc Return status code */ static int eoib_heartbeat_rx ( struct io_buffer *iobuf, struct net_device *netdev __unused, const void *ll_dest __unused, const void *ll_source __unused, unsigned int flags __unused ) { free_iob ( iobuf ); return 0; } /** * Transcribe EoIB heartbeat address * * @v net_addr EoIB heartbeat address * @ret string "" * * This operation is meaningless for the EoIB heartbeat protocol. 
*/ static const char * eoib_heartbeat_ntoa ( const void *net_addr __unused ) { return ""; } /** EoIB heartbeat network protocol */ struct net_protocol eoib_heartbeat_protocol __net_protocol = { .name = "EoIB", .net_proto = htons ( EOIB_MAGIC ), .rx = eoib_heartbeat_rx, .ntoa = eoib_heartbeat_ntoa, }; /**************************************************************************** * * EoIB gateway * **************************************************************************** * * Some dubious EoIB implementations require all broadcast traffic to * be sent twice: once to the actual broadcast group, and once as a * unicast to the EoIB-to-Ethernet gateway. This somewhat curious * design arises since the EoIB-to-Ethernet gateway hardware lacks the * ability to attach a queue pair to a multicast GID (or LID), and so * cannot receive traffic sent to the broadcast group. * */ /** * Transmit duplicate packet to the EoIB gateway * * @v eoib EoIB device * @v original Original I/O buffer */ static void eoib_duplicate ( struct eoib_device *eoib, struct io_buffer *original ) { struct net_device *netdev = eoib->netdev; struct ib_device *ibdev = eoib->ibdev; struct ib_address_vector *av = &eoib->gateway; size_t len = iob_len ( original ); struct io_buffer *copy; int rc; /* Create copy of I/O buffer */ copy = alloc_iob ( len ); if ( ! 
copy ) { rc = -ENOMEM; goto err_alloc; } memcpy ( iob_put ( copy, len ), original->data, len ); /* Append to network device's transmit queue */ list_add_tail ( ©->list, &original->list ); /* Resolve path to gateway */ if ( ( rc = ib_resolve_path ( ibdev, av ) ) != 0 ) { DBGC ( eoib, "EoIB %s no path to gateway: %s\n", eoib->name, strerror ( rc ) ); goto err_path; } /* Force use of GRH even for local destinations */ av->gid_present = 1; /* Post send work queue entry */ if ( ( rc = ib_post_send ( eoib->ibdev, eoib->qp, av, copy ) ) != 0 ) goto err_post_send; return; err_post_send: err_path: list_del ( ©->list ); err_alloc: netdev_tx_err ( netdev, copy, rc ); } /** * Set EoIB gateway * * @v eoib EoIB device * @v av Address vector, or NULL to clear gateway */ void eoib_set_gateway ( struct eoib_device *eoib, struct ib_address_vector *av ) { if ( av ) { DBGC ( eoib, "EoIB %s using gateway " IB_GID_FMT "\n", eoib->name, IB_GID_ARGS ( &av->gid ) ); memcpy ( &eoib->gateway, av, sizeof ( eoib->gateway ) ); eoib->duplicate = eoib_duplicate; } else { DBGC ( eoib, "EoIB %s not using gateway\n", eoib->name ); eoib->duplicate = NULL; } }