Diffstat (limited to 'src/drivers/net/gve.c')
-rw-r--r--	src/drivers/net/gve.c	2016
1 file changed, 2016 insertions, 0 deletions
diff --git a/src/drivers/net/gve.c b/src/drivers/net/gve.c
new file mode 100644
index 000000000..77eb4b674
--- /dev/null
+++ b/src/drivers/net/gve.c
@@ -0,0 +1,2016 @@
+/*
+ * Copyright (C) 2024 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <ipxe/netdevice.h>
+#include <ipxe/ethernet.h>
+#include <ipxe/if_ether.h>
+#include <ipxe/iobuf.h>
+#include <ipxe/dma.h>
+#include <ipxe/pci.h>
+#include <ipxe/fault.h>
+#include "gve.h"
+
+/** @file
+ *
+ * Google Virtual Ethernet network driver
+ *
+ */
+
+/* Disambiguate the various error causes */
+#define EINFO_EIO_ADMIN_UNSET \
+	__einfo_uniqify ( EINFO_EIO, 0x00, "Uncompleted" )
+#define EIO_ADMIN_UNSET \
+	__einfo_error ( EINFO_EIO_ADMIN_UNSET )
+#define EINFO_EIO_ADMIN_ABORTED \
+	__einfo_uniqify ( EINFO_EIO, 0x10, "Aborted" )
+#define EIO_ADMIN_ABORTED \
+	__einfo_error ( EINFO_EIO_ADMIN_ABORTED )
+#define EINFO_EIO_ADMIN_EXISTS \
+	__einfo_uniqify ( EINFO_EIO, 0x11, "Already exists" )
+#define EIO_ADMIN_EXISTS \
+	__einfo_error ( EINFO_EIO_ADMIN_EXISTS )
+#define EINFO_EIO_ADMIN_CANCELLED \
+	__einfo_uniqify ( EINFO_EIO, 0x12, "Cancelled" )
+#define EIO_ADMIN_CANCELLED \
+	__einfo_error ( EINFO_EIO_ADMIN_CANCELLED )
+#define EINFO_EIO_ADMIN_DATALOSS \
+	__einfo_uniqify ( EINFO_EIO, 0x13, "Data loss" )
+#define EIO_ADMIN_DATALOSS \
+	__einfo_error ( EINFO_EIO_ADMIN_DATALOSS )
+#define EINFO_EIO_ADMIN_DEADLINE \
+	__einfo_uniqify ( EINFO_EIO, 0x14, "Deadline exceeded" )
+#define EIO_ADMIN_DEADLINE \
+	__einfo_error ( EINFO_EIO_ADMIN_DEADLINE )
+#define EINFO_EIO_ADMIN_PRECONDITION \
+	__einfo_uniqify ( EINFO_EIO, 0x15, "Failed precondition" )
+#define EIO_ADMIN_PRECONDITION \
+	__einfo_error ( EINFO_EIO_ADMIN_PRECONDITION )
+#define EINFO_EIO_ADMIN_INTERNAL \
+	__einfo_uniqify ( EINFO_EIO, 0x16, "Internal error" )
+#define EIO_ADMIN_INTERNAL \
+	__einfo_error ( EINFO_EIO_ADMIN_INTERNAL )
+#define EINFO_EIO_ADMIN_INVAL \
+	__einfo_uniqify ( EINFO_EIO, 0x17, "Invalid argument" )
+#define EIO_ADMIN_INVAL \
+	__einfo_error ( EINFO_EIO_ADMIN_INVAL )
+#define EINFO_EIO_ADMIN_NOT_FOUND \
+	__einfo_uniqify ( EINFO_EIO, 0x18, "Not found" )
+#define EIO_ADMIN_NOT_FOUND \
+	__einfo_error ( EINFO_EIO_ADMIN_NOT_FOUND )
+#define EINFO_EIO_ADMIN_RANGE \
+	__einfo_uniqify ( EINFO_EIO, 0x19, "Out of range" )
+#define EIO_ADMIN_RANGE \
+	__einfo_error ( EINFO_EIO_ADMIN_RANGE )
+#define EINFO_EIO_ADMIN_PERM \
+	__einfo_uniqify (
EINFO_EIO, 0x1a, "Permission denied" ) +#define EIO_ADMIN_PERM \ + __einfo_error ( EINFO_EIO_ADMIN_PERM ) +#define EINFO_EIO_ADMIN_UNAUTH \ + __einfo_uniqify ( EINFO_EIO, 0x1b, "Unauthenticated" ) +#define EIO_ADMIN_UNAUTH \ + __einfo_error ( EINFO_EIO_ADMIN_UNAUTH ) +#define EINFO_EIO_ADMIN_RESOURCE \ + __einfo_uniqify ( EINFO_EIO, 0x1c, "Resource exhausted" ) +#define EIO_ADMIN_RESOURCE \ + __einfo_error ( EINFO_EIO_ADMIN_RESOURCE ) +#define EINFO_EIO_ADMIN_UNAVAIL \ + __einfo_uniqify ( EINFO_EIO, 0x1d, "Unavailable" ) +#define EIO_ADMIN_UNAVAIL \ + __einfo_error ( EINFO_EIO_ADMIN_UNAVAIL ) +#define EINFO_EIO_ADMIN_NOTSUP \ + __einfo_uniqify ( EINFO_EIO, 0x1e, "Unimplemented" ) +#define EIO_ADMIN_NOTSUP \ + __einfo_error ( EINFO_EIO_ADMIN_NOTSUP ) +#define EINFO_EIO_ADMIN_UNKNOWN \ + __einfo_uniqify ( EINFO_EIO, 0x1f, "Unknown error" ) +#define EIO_ADMIN_UNKNOWN \ + __einfo_error ( EINFO_EIO_ADMIN_UNKNOWN ) +#define EIO_ADMIN( status ) \ + EUNIQ ( EINFO_EIO, ( (status) & 0x1f ), \ + EIO_ADMIN_UNSET, EIO_ADMIN_ABORTED, EIO_ADMIN_EXISTS, \ + EIO_ADMIN_CANCELLED, EIO_ADMIN_DATALOSS, \ + EIO_ADMIN_DEADLINE, EIO_ADMIN_PRECONDITION, \ + EIO_ADMIN_INTERNAL, EIO_ADMIN_INVAL, \ + EIO_ADMIN_NOT_FOUND, EIO_ADMIN_RANGE, EIO_ADMIN_PERM, \ + EIO_ADMIN_UNAUTH, EIO_ADMIN_RESOURCE, \ + EIO_ADMIN_UNAVAIL, EIO_ADMIN_NOTSUP, EIO_ADMIN_UNKNOWN ) + +/****************************************************************************** + * + * Buffer layout + * + ****************************************************************************** + */ + +/** + * Get buffer offset (within queue page list allocation) + * + * @v queue Descriptor queue + * @v tag Buffer tag + * @ret addr Buffer address within queue page list address space + */ +static inline __attribute__ (( always_inline)) size_t +gve_offset ( struct gve_queue *queue, unsigned int tag ) { + + /* We allocate sufficient pages for the maximum fill level of + * buffers, and reuse the buffers in strict rotation as they + * are released by the hardware. + */ + assert ( tag < queue->fill ); + return ( tag * GVE_BUF_SIZE ); +} + +/** + * Get buffer address (within queue page list address space) + * + * @v queue Descriptor queue + * @v tag Buffer tag + * @ret addr Buffer address within queue page list address space + */ +static inline __attribute__ (( always_inline)) physaddr_t +gve_address ( struct gve_queue *queue, unsigned int tag ) { + + /* Pages are allocated as a single contiguous block */ + return ( queue->qpl.base + gve_offset ( queue, tag ) ); +} + +/** + * Get buffer address + * + * @v queue Descriptor queue + * @v tag Buffer tag + * @ret addr Buffer address + */ +static inline __attribute__ (( always_inline )) void * +gve_buffer ( struct gve_queue *queue, unsigned int tag ) { + + /* Pages are allocated as a single contiguous block */ + return ( queue->qpl.data + gve_offset ( queue, tag ) ); +} + +/****************************************************************************** + * + * Device reset + * + ****************************************************************************** + */ + +/** + * Reset hardware + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_reset ( struct gve_nic *gve ) { + uint32_t pfn; + unsigned int i; + + /* Skip reset if admin queue page frame number is already + * clear. Triggering a reset on an already-reset device seems + * to cause a delayed reset to be scheduled. 
This can cause + * the device to end up in a reset loop, where each attempt to + * recover from reset triggers another reset a few seconds + * later. + */ + pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN ); + if ( ! pfn ) { + DBGC ( gve, "GVE %p skipping reset\n", gve ); + return 0; + } + + /* Clear admin queue page frame number */ + writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN ); + wmb(); + + /* Wait for device to reset */ + for ( i = 0 ; i < GVE_RESET_MAX_WAIT_MS ; i++ ) { + + /* Delay */ + mdelay ( 1 ); + + /* Check for reset completion */ + pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN ); + if ( ! pfn ) + return 0; + } + + DBGC ( gve, "GVE %p reset timed out (PFN %#08x devstat %#08x)\n", + gve, bswap_32 ( pfn ), + bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) ); + return -ETIMEDOUT; +} + +/****************************************************************************** + * + * Admin queue + * + ****************************************************************************** + */ + +/** + * Get operating mode name (for debugging) + * + * @v mode Operating mode + * @ret name Mode name + */ +static inline const char * gve_mode_name ( unsigned int mode ) { + static char buf[ 8 /* "XXX-XXX" + NUL */ ]; + + snprintf ( buf, sizeof ( buf ), "%s-%s", + ( ( mode & GVE_MODE_DQO ) ? "DQO" : "GQI" ), + ( ( mode & GVE_MODE_QPL ) ? "QPL" : "RDA" ) ); + return buf; +} + +/** + * Allocate admin queue + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_admin_alloc ( struct gve_nic *gve ) { + struct dma_device *dma = gve->dma; + struct gve_admin *admin = &gve->admin; + struct gve_scratch *scratch = &gve->scratch; + size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) ); + size_t scratch_len = sizeof ( *scratch->buf ); + int rc; + + /* Allocate admin queue */ + admin->cmd = dma_alloc ( dma, &admin->map, admin_len, GVE_ALIGN ); + if ( ! admin->cmd ) { + rc = -ENOMEM; + goto err_admin; + } + + /* Allocate scratch buffer */ + scratch->buf = dma_alloc ( dma, &scratch->map, scratch_len, GVE_ALIGN ); + if ( ! 
scratch->buf ) { + rc = -ENOMEM; + goto err_scratch; + } + + DBGC ( gve, "GVE %p AQ at [%08lx,%08lx) scratch [%08lx,%08lx)\n", + gve, virt_to_phys ( admin->cmd ), + ( virt_to_phys ( admin->cmd ) + admin_len ), + virt_to_phys ( scratch->buf ), + ( virt_to_phys ( scratch->buf ) + scratch_len ) ); + return 0; + + dma_free ( &scratch->map, scratch->buf, scratch_len ); + err_scratch: + dma_free ( &admin->map, admin->cmd, admin_len ); + err_admin: + return rc; +} + +/** + * Free admin queue + * + * @v gve GVE device + */ +static void gve_admin_free ( struct gve_nic *gve ) { + struct gve_admin *admin = &gve->admin; + struct gve_scratch *scratch = &gve->scratch; + size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) ); + size_t scratch_len = sizeof ( *scratch->buf ); + + /* Free scratch buffer */ + dma_free ( &scratch->map, scratch->buf, scratch_len ); + + /* Free admin queue */ + dma_free ( &admin->map, admin->cmd, admin_len ); +} + +/** + * Enable admin queue + * + * @v gve GVE device + */ +static void gve_admin_enable ( struct gve_nic *gve ) { + struct gve_admin *admin = &gve->admin; + size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) ); + physaddr_t base; + + /* Reset queue */ + admin->prod = 0; + + /* Program queue addresses and capabilities */ + base = dma ( &admin->map, admin->cmd ); + writel ( bswap_32 ( base / GVE_PAGE_SIZE ), + gve->cfg + GVE_CFG_ADMIN_PFN ); + writel ( bswap_32 ( base & 0xffffffffUL ), + gve->cfg + GVE_CFG_ADMIN_BASE_LO ); + if ( sizeof ( base ) > sizeof ( uint32_t ) ) { + writel ( bswap_32 ( ( ( uint64_t ) base ) >> 32 ), + gve->cfg + GVE_CFG_ADMIN_BASE_HI ); + } else { + writel ( 0, gve->cfg + GVE_CFG_ADMIN_BASE_HI ); + } + writel ( bswap_16 ( admin_len ), gve->cfg + GVE_CFG_ADMIN_LEN ); + writel ( bswap_32 ( GVE_CFG_DRVSTAT_RUN ), gve->cfg + GVE_CFG_DRVSTAT ); +} + +/** + * Get next available admin queue command slot + * + * @v gve GVE device + * @ret cmd Admin queue command + */ +static union gve_admin_command * gve_admin_command ( struct gve_nic *gve ) { + struct gve_admin *admin = &gve->admin; + union gve_admin_command *cmd; + unsigned int index; + + /* Get next command slot */ + index = admin->prod; + cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ]; + + /* Initialise request */ + memset ( cmd, 0, sizeof ( *cmd ) ); + + return cmd; +} + +/** + * Wait for admin queue command to complete + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_admin_wait ( struct gve_nic *gve ) { + struct gve_admin *admin = &gve->admin; + uint32_t evt; + uint32_t pfn; + unsigned int i; + + /* Wait for any outstanding commands to complete */ + for ( i = 0 ; i < GVE_ADMIN_MAX_WAIT_MS ; i++ ) { + + /* Check event counter */ + rmb(); + evt = bswap_32 ( readl ( gve->cfg + GVE_CFG_ADMIN_EVT ) ); + if ( evt == admin->prod ) + return 0; + + /* Check for device reset */ + pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN ); + if ( ! pfn ) + break; + + /* Delay */ + mdelay ( 1 ); + } + + DBGC ( gve, "GVE %p AQ %#02x %s (completed %#02x, status %#08x)\n", + gve, admin->prod, ( pfn ? "timed out" : "saw reset" ), evt, + bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) ); + return ( pfn ? 
-ETIMEDOUT : -ECONNRESET ); +} + +/** + * Issue admin queue command + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_admin ( struct gve_nic *gve ) { + struct gve_admin *admin = &gve->admin; + union gve_admin_command *cmd; + unsigned int index; + uint32_t opcode; + uint32_t status; + int rc; + + /* Ensure admin queue is idle */ + if ( ( rc = gve_admin_wait ( gve ) ) != 0 ) + return rc; + + /* Get next command slot */ + index = admin->prod; + cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ]; + opcode = cmd->hdr.opcode; + DBGC2 ( gve, "GVE %p AQ %#02x command %#04x request:\n", + gve, index, opcode ); + DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) ); + + /* Increment producer counter */ + admin->prod++; + + /* Ring doorbell */ + wmb(); + writel ( bswap_32 ( admin->prod ), gve->cfg + GVE_CFG_ADMIN_DB ); + + /* Wait for command to complete */ + if ( ( rc = gve_admin_wait ( gve ) ) != 0 ) + return rc; + + /* Check command status */ + status = be32_to_cpu ( cmd->hdr.status ); + if ( status != GVE_ADMIN_STATUS_OK ) { + rc = -EIO_ADMIN ( status ); + DBGC ( gve, "GVE %p AQ %#02x command %#04x failed: %#08x\n", + gve, index, opcode, status ); + DBGC_HDA ( gve, 0, cmd, sizeof ( *cmd ) ); + DBGC ( gve, "GVE %p AQ error: %s\n", gve, strerror ( rc ) ); + return rc; + } + + DBGC2 ( gve, "GVE %p AQ %#02x command %#04x result:\n", + gve, index, opcode ); + DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) ); + return 0; +} + +/** + * Issue simple admin queue command + * + * @v gve GVE device + * @v opcode Operation code + * @v id ID parameter (or zero if not applicable) + * @ret rc Return status code + * + * Several admin queue commands take either an empty parameter list or + * a single 32-bit ID parameter. + */ +static int gve_admin_simple ( struct gve_nic *gve, unsigned int opcode, + unsigned int id ) { + union gve_admin_command *cmd; + int rc; + + /* Construct request */ + cmd = gve_admin_command ( gve ); + cmd->hdr.opcode = opcode; + cmd->simple.id = cpu_to_be32 ( id ); + + /* Issue command */ + if ( ( rc = gve_admin ( gve ) ) != 0 ) + return rc; + + return 0; +} + +/** + * Get device descriptor + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_describe ( struct gve_nic *gve ) { + struct net_device *netdev = gve->netdev; + struct gve_device_descriptor *desc = &gve->scratch.buf->desc; + union gve_admin_command *cmd; + struct gve_option *opt; + unsigned int count; + unsigned int id; + size_t offset; + size_t max; + size_t len; + int rc; + + /* Construct request */ + cmd = gve_admin_command ( gve ); + cmd->hdr.opcode = GVE_ADMIN_DESCRIBE; + cmd->desc.addr = cpu_to_be64 ( dma ( &gve->scratch.map, desc ) ); + cmd->desc.ver = cpu_to_be32 ( GVE_ADMIN_DESCRIBE_VER ); + cmd->desc.len = cpu_to_be32 ( sizeof ( *desc ) ); + + /* Issue command */ + if ( ( rc = gve_admin ( gve ) ) != 0 ) + return rc; + DBGC2 ( gve, "GVE %p device descriptor:\n", gve ); + DBGC2_HDA ( gve, 0, desc, sizeof ( *desc ) ); + + /* Extract queue parameters */ + gve->events.count = be16_to_cpu ( desc->counters ); + gve->tx.count = be16_to_cpu ( desc->tx_count ); + gve->rx.count = be16_to_cpu ( desc->rx_count ); + DBGC ( gve, "GVE %p using %d TX, %d RX, %d events\n", + gve, gve->tx.count, gve->rx.count, gve->events.count ); + + /* Extract network parameters */ + build_assert ( sizeof ( desc->mac ) == ETH_ALEN ); + memcpy ( netdev->hw_addr, &desc->mac, sizeof ( desc->mac ) ); + netdev->mtu = be16_to_cpu ( desc->mtu ); + netdev->max_pkt_len = ( netdev->mtu + ETH_HLEN ); + DBGC ( gve, "GVE %p MAC %s 
(\"%s\") MTU %zd\n", + gve, eth_ntoa ( netdev->hw_addr ), + inet_ntoa ( desc->mac.in ), netdev->mtu ); + + /* Parse options */ + count = be16_to_cpu ( desc->opt_count ); + max = be16_to_cpu ( desc->len ); + gve->options = 0; + for ( offset = offsetof ( typeof ( *desc ), opts ) ; count ; + count--, offset += len ) { + + /* Check space for option header */ + if ( ( offset + sizeof ( *opt ) ) > max ) { + DBGC ( gve, "GVE %p underlength option at +%#02zx:\n", + gve, offset ); + DBGC_HDA ( gve, 0, desc, sizeof ( *desc ) ); + return -EINVAL; + } + opt = ( ( ( void * ) desc ) + offset ); + + /* Check space for option body */ + len = ( sizeof ( *opt ) + be16_to_cpu ( opt->len ) ); + if ( ( offset + len ) > max ) { + DBGC ( gve, "GVE %p malformed option at +%#02zx:\n", + gve, offset ); + DBGC_HDA ( gve, 0, desc, sizeof ( *desc ) ); + return -EINVAL; + } + + /* Record option as supported */ + id = be16_to_cpu ( opt->id ); + if ( id < ( 8 * sizeof ( gve->options ) ) ) + gve->options |= ( 1 << id ); + } + DBGC ( gve, "GVE %p supports options %#08x\n", gve, gve->options ); + + /* Select preferred operating mode */ + if ( gve->options & ( 1 << GVE_OPT_GQI_QPL ) ) { + /* GQI-QPL: in-order queues, queue page list addressing */ + gve->mode = GVE_MODE_QPL; + } else if ( gve->options & ( 1 << GVE_OPT_GQI_RDA ) ) { + /* GQI-RDA: in-order queues, raw DMA addressing */ + gve->mode = 0; + } else if ( gve->options & ( 1 << GVE_OPT_DQO_QPL ) ) { + /* DQO-QPL: out-of-order queues, queue page list addressing */ + gve->mode = ( GVE_MODE_DQO | GVE_MODE_QPL ); + } else if ( gve->options & ( 1 << GVE_OPT_DQO_RDA ) ) { + /* DQO-RDA: out-of-order queues, raw DMA addressing */ + gve->mode = GVE_MODE_DQO; + } else { + /* No options matched: assume the original GQI-QPL mode */ + gve->mode = GVE_MODE_QPL; + } + DBGC ( gve, "GVE %p using %s mode\n", + gve, gve_mode_name ( gve->mode ) ); + + return 0; +} + +/** + * Configure device resources + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_configure ( struct gve_nic *gve ) { + struct gve_events *events = &gve->events; + struct gve_irqs *irqs = &gve->irqs; + union gve_admin_command *cmd; + uint32_t doorbell; + unsigned int db_off; + unsigned int i; + int rc; + + /* Construct request */ + cmd = gve_admin_command ( gve ); + cmd->hdr.opcode = GVE_ADMIN_CONFIGURE; + cmd->conf.events = + cpu_to_be64 ( dma ( &events->map, events->event ) ); + cmd->conf.irqs = + cpu_to_be64 ( dma ( &irqs->map, irqs->irq ) ); + cmd->conf.num_events = cpu_to_be32 ( events->count ); + cmd->conf.num_irqs = cpu_to_be32 ( GVE_IRQ_COUNT ); + cmd->conf.irq_stride = cpu_to_be32 ( sizeof ( irqs->irq[0] ) ); + cmd->conf.format = GVE_FORMAT ( gve->mode ); + + /* Issue command */ + if ( ( rc = gve_admin ( gve ) ) != 0 ) + return rc; + + /* Disable all interrupts */ + doorbell = ( ( gve->mode & GVE_MODE_DQO ) ? 
+ 0 : bswap_32 ( GVE_GQI_IRQ_DISABLE ) ); + for ( i = 0 ; i < GVE_IRQ_COUNT ; i++ ) { + db_off = ( be32_to_cpu ( irqs->irq[i].db_idx ) * + sizeof ( uint32_t ) ); + DBGC ( gve, "GVE %p IRQ %d doorbell +%#04x\n", gve, i, db_off ); + irqs->db[i] = ( gve->db + db_off ); + writel ( doorbell, irqs->db[i] ); + } + + return 0; +} + +/** + * Deconfigure device resources + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_deconfigure ( struct gve_nic *gve ) { + int rc; + + /* Issue command (with meaningless ID) */ + if ( ( rc = gve_admin_simple ( gve, GVE_ADMIN_DECONFIGURE, 0 ) ) != 0 ) + return rc; + + return 0; +} + +/** + * Register queue page list + * + * @v gve GVE device + * @v qpl Queue page list + * @ret rc Return status code + */ +static int gve_register ( struct gve_nic *gve, struct gve_qpl *qpl ) { + struct gve_pages *pages = &gve->scratch.buf->pages; + union gve_admin_command *cmd; + void *addr; + unsigned int i; + int rc; + + /* Do nothing if using raw DMA addressing */ + if ( ! ( gve->mode & GVE_MODE_QPL ) ) + return 0; + + /* Build page address list */ + for ( i = 0 ; i < qpl->count ; i++ ) { + addr = ( qpl->data + ( i * GVE_PAGE_SIZE ) ); + pages->addr[i] = cpu_to_be64 ( dma ( &qpl->map, addr ) ); + } + + /* Construct request */ + cmd = gve_admin_command ( gve ); + cmd->hdr.opcode = GVE_ADMIN_REGISTER; + cmd->reg.id = cpu_to_be32 ( qpl->id ); + cmd->reg.count = cpu_to_be32 ( qpl->count ); + cmd->reg.addr = cpu_to_be64 ( dma ( &gve->scratch.map, pages ) ); + cmd->reg.size = cpu_to_be64 ( GVE_PAGE_SIZE ); + + /* Issue command */ + if ( ( rc = gve_admin ( gve ) ) != 0 ) + return rc; + + return 0; +} + +/** + * Unregister page list + * + * @v gve GVE device + * @v qpl Queue page list + * @ret rc Return status code + */ +static int gve_unregister ( struct gve_nic *gve, struct gve_qpl *qpl ) { + int rc; + + /* Do nothing if using raw DMA addressing */ + if ( ! 
( gve->mode & GVE_MODE_QPL ) ) + return 0; + + /* Issue command */ + if ( ( rc = gve_admin_simple ( gve, GVE_ADMIN_UNREGISTER, + qpl->id ) ) != 0 ) { + return rc; + } + + return 0; +} + +/** + * Construct command to create transmit queue + * + * @v queue Transmit queue + * @v qpl Queue page list ID + * @v cmd Admin queue command + */ +static void gve_create_tx_param ( struct gve_queue *queue, uint32_t qpl, + union gve_admin_command *cmd ) { + struct gve_admin_create_tx *create = &cmd->create_tx; + const struct gve_queue_type *type = queue->type; + + /* Construct request parameters */ + create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) ); + create->desc = + cpu_to_be64 ( dma ( &queue->desc_map, queue->desc.raw ) ); + create->qpl_id = cpu_to_be32 ( qpl ); + create->notify_id = cpu_to_be32 ( type->irq ); + create->desc_count = cpu_to_be16 ( queue->count ); + if ( queue->cmplt.raw ) { + create->cmplt = cpu_to_be64 ( dma ( &queue->cmplt_map, + queue->cmplt.raw ) ); + create->cmplt_count = cpu_to_be16 ( queue->count ); + } +} + +/** + * Construct command to create receive queue + * + * @v queue Receive queue + * @v qpl Queue page list ID + * @v cmd Admin queue command + */ +static void gve_create_rx_param ( struct gve_queue *queue, uint32_t qpl, + union gve_admin_command *cmd ) { + struct gve_admin_create_rx *create = &cmd->create_rx; + const struct gve_queue_type *type = queue->type; + + /* Construct request parameters */ + create->notify_id = cpu_to_be32 ( type->irq ); + create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) ); + create->desc = + cpu_to_be64 ( dma ( &queue->desc_map, queue->desc.raw ) ); + create->cmplt = + cpu_to_be64 ( dma ( &queue->cmplt_map, queue->cmplt.raw ) ); + create->qpl_id = cpu_to_be32 ( qpl ); + create->desc_count = cpu_to_be16 ( queue->count ); + create->bufsz = cpu_to_be16 ( GVE_BUF_SIZE ); + create->cmplt_count = cpu_to_be16 ( queue->count ); +} + +/** + * Create transmit or receive queue + * + * @v gve GVE device + * @v queue Descriptor queue + * @ret rc Return status code + */ +static int gve_create_queue ( struct gve_nic *gve, struct gve_queue *queue ) { + const struct gve_queue_type *type = queue->type; + const struct gve_queue_stride *stride = &queue->stride; + union gve_admin_command *cmd; + struct gve_buffer *buf; + unsigned int db_off; + unsigned int evt_idx; + unsigned int tag; + unsigned int i; + uint32_t qpl; + int rc; + + /* Reset queue */ + queue->prod = 0; + queue->cons = 0; + queue->done = 0; + memset ( queue->desc.raw, 0, ( queue->count * stride->desc ) ); + memset ( queue->cmplt.raw, 0, ( queue->count * stride->cmplt ) ); + for ( i = 0 ; i < queue->fill ; i++ ) + queue->tag[i] = i; + + /* Pre-populate descriptor offsets for in-order queues */ + if ( ! ( gve->mode & GVE_MODE_DQO ) ) { + buf = ( queue->desc.raw + stride->desc - sizeof ( *buf ) ); + for ( i = 0 ; i < queue->count ; i++ ) { + tag = ( i & ( queue->fill - 1 ) ); + buf->addr = cpu_to_be64 ( gve_address ( queue, tag ) ); + buf = ( ( ( void * ) buf ) + stride->desc ); + } + } + + /* Construct request */ + cmd = gve_admin_command ( gve ); + cmd->hdr.opcode = type->create; + qpl = ( ( gve->mode & GVE_MODE_QPL ) ? 
type->qpl : GVE_RAW_QPL ); + type->param ( queue, qpl, cmd ); + + /* Issue command */ + if ( ( rc = gve_admin ( gve ) ) != 0 ) + return rc; + + /* Record indices */ + db_off = ( be32_to_cpu ( queue->res->db_idx ) * sizeof ( uint32_t ) ); + evt_idx = be32_to_cpu ( queue->res->evt_idx ); + DBGC ( gve, "GVE %p %s doorbell +%#04x event counter %d\n", + gve, type->name, db_off, evt_idx ); + queue->db = ( gve->db + db_off ); + assert ( evt_idx < gve->events.count ); + queue->event = &gve->events.event[evt_idx]; + assert ( queue->event->count == 0 ); + + /* Unmask dummy interrupt */ + pci_msix_unmask ( &gve->msix, type->irq ); + + /* Rearm queue interrupt if applicable */ + if ( gve->mode & GVE_MODE_DQO ) + writel ( GVE_DQO_IRQ_REARM, gve->irqs.db[type->irq] ); + + return 0; +} + +/** + * Destroy transmit or receive queue + * + * @v gve GVE device + * @v queue Descriptor queue + * @ret rc Return status code + */ +static int gve_destroy_queue ( struct gve_nic *gve, struct gve_queue *queue ) { + const struct gve_queue_type *type = queue->type; + int rc; + + /* Mask dummy interrupt */ + pci_msix_mask ( &gve->msix, type->irq ); + + /* Issue command */ + if ( ( rc = gve_admin_simple ( gve, type->destroy, 0 ) ) != 0 ) + return rc; + + return 0; +} + +/****************************************************************************** + * + * Network device interface + * + ****************************************************************************** + */ + +/** + * Allocate shared queue resources + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_alloc_shared ( struct gve_nic *gve ) { + struct dma_device *dma = gve->dma; + struct gve_irqs *irqs = &gve->irqs; + struct gve_events *events = &gve->events; + size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) ); + size_t events_len = ( gve->events.count * sizeof ( events->event[0] ) ); + int rc; + + /* Allocate interrupt channels */ + irqs->irq = dma_alloc ( dma, &irqs->map, irqs_len, GVE_ALIGN ); + if ( ! irqs->irq ) { + rc = -ENOMEM; + goto err_irqs; + } + DBGC ( gve, "GVE %p IRQs at [%08lx,%08lx)\n", + gve, virt_to_phys ( irqs->irq ), + ( virt_to_phys ( irqs->irq ) + irqs_len ) ); + + /* Allocate event counters */ + events->event = dma_alloc ( dma, &events->map, events_len, GVE_ALIGN ); + if ( ! 
events->event ) { + rc = -ENOMEM; + goto err_events; + } + DBGC ( gve, "GVE %p events at [%08lx,%08lx)\n", + gve, virt_to_phys ( events->event ), + ( virt_to_phys ( events->event ) + events_len ) ); + + return 0; + + dma_free ( &events->map, events->event, events_len ); + err_events: + dma_free ( &irqs->map, irqs->irq, irqs_len ); + err_irqs: + return rc; +} + +/** + * Free shared queue resources + * + * @v gve GVE device + */ +static void gve_free_shared ( struct gve_nic *gve ) { + struct gve_irqs *irqs = &gve->irqs; + struct gve_events *events = &gve->events; + size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) ); + size_t events_len = ( gve->events.count * sizeof ( events->event[0] ) ); + + /* Free event counters */ + dma_free ( &events->map, events->event, events_len ); + + /* Free interrupt channels */ + dma_free ( &irqs->map, irqs->irq, irqs_len ); +} + +/** + * Allocate queue page list + * + * @v gve GVE device + * @v qpl Queue page list + * @v id Queue page list ID + * @v buffers Number of data buffers + * @ret rc Return status code + */ +static int gve_alloc_qpl ( struct gve_nic *gve, struct gve_qpl *qpl, + uint32_t id, unsigned int buffers ) { + size_t len; + + /* Record ID */ + qpl->id = id; + + /* Calculate number of pages required */ + build_assert ( GVE_BUF_SIZE <= GVE_PAGE_SIZE ); + qpl->count = ( ( buffers + GVE_BUF_PER_PAGE - 1 ) / GVE_BUF_PER_PAGE ); + assert ( qpl->count <= GVE_QPL_MAX ); + + /* Allocate pages (as a single block) */ + len = ( qpl->count * GVE_PAGE_SIZE ); + qpl->data = dma_umalloc ( gve->dma, &qpl->map, len, GVE_ALIGN ); + if ( ! qpl->data ) + return -ENOMEM; + qpl->base = ( ( gve->mode == GVE_MODE_QPL ) ? + 0 : dma ( &qpl->map, qpl->data ) ); + + DBGC ( gve, "GVE %p QPL %#08x at [%08lx,%08lx)\n", + gve, qpl->id, virt_to_phys ( qpl->data ), + ( virt_to_phys ( qpl->data ) + len ) ); + return 0; +} + +/** + * Free queue page list + * + * @v gve GVE device + * @v qpl Queue page list + */ +static void gve_free_qpl ( struct gve_nic *nic __unused, + struct gve_qpl *qpl ) { + size_t len = ( qpl->count * GVE_PAGE_SIZE ); + + /* Free pages */ + dma_ufree ( &qpl->map, qpl->data, len ); +} + +/** + * Calculate next receive sequence number + * + * @v seq Current sequence number, or zero to start sequence + * @ret next Next sequence number + */ +static inline __attribute__ (( always_inline )) unsigned int +gve_next ( unsigned int seq ) { + + /* The receive completion sequence number is a modulo 7 + * counter that cycles through the non-zero three-bit values 1 + * to 7 inclusive. + * + * Since 7 is coprime to 2^n, this ensures that the sequence + * number changes each time that a new completion is written + * to memory. + * + * Since the counter takes only non-zero values, this ensures + * that the sequence number changes whenever a new completion + * is first written to a zero-initialised completion ring. + */ + seq = ( ( seq + 1 ) & GVE_GQI_RX_SEQ_MASK ); + return ( seq ? 
seq : 1 ); +} + +/** + * Allocate descriptor queue + * + * @v gve GVE device + * @v queue Descriptor queue + * @ret rc Return status code + */ +static int gve_alloc_queue ( struct gve_nic *gve, struct gve_queue *queue ) { + const struct gve_queue_type *type = queue->type; + struct gve_queue_stride *stride = &queue->stride; + struct dma_device *dma = gve->dma; + size_t desc_len; + size_t cmplt_len; + size_t res_len; + int rc; + + /* Sanity checks */ + if ( ( queue->count == 0 ) || + ( queue->count & ( queue->count - 1 ) ) ) { + DBGC ( gve, "GVE %p %s invalid queue size %d\n", + gve, type->name, queue->count ); + rc = -EINVAL; + goto err_sanity; + } + + /* Set queue strides and calculate total lengths */ + *stride = ( ( gve->mode & GVE_MODE_DQO ) ? + type->stride.dqo : type->stride.gqi ); + desc_len = ( queue->count * stride->desc ); + cmplt_len = ( queue->count * stride->cmplt ); + res_len = sizeof ( *queue->res ); + + /* Calculate maximum fill level */ + assert ( ( type->fill & ( type->fill - 1 ) ) == 0 ); + queue->fill = type->fill; + if ( queue->fill > queue->count ) + queue->fill = queue->count; + DBGC ( gve, "GVE %p %s using QPL %#08x with %d/%d descriptors\n", + gve, type->name, type->qpl, queue->fill, queue->count ); + + /* Allocate queue page list */ + if ( ( rc = gve_alloc_qpl ( gve, &queue->qpl, type->qpl, + queue->fill ) ) != 0 ) + goto err_qpl; + + /* Allocate descriptors */ + queue->desc.raw = dma_umalloc ( dma, &queue->desc_map, desc_len, + GVE_ALIGN ); + if ( ! queue->desc.raw ) { + rc = -ENOMEM; + goto err_desc; + } + DBGC ( gve, "GVE %p %s descriptors at [%08lx,%08lx)\n", + gve, type->name, virt_to_phys ( queue->desc.raw ), + ( virt_to_phys ( queue->desc.raw ) + desc_len ) ); + + /* Allocate completions */ + if ( cmplt_len ) { + queue->cmplt.raw = dma_umalloc ( dma, &queue->cmplt_map, + cmplt_len, GVE_ALIGN ); + if ( ! queue->cmplt.raw ) { + rc = -ENOMEM; + goto err_cmplt; + } + DBGC ( gve, "GVE %p %s completions at [%08lx,%08lx)\n", + gve, type->name, virt_to_phys ( queue->cmplt.raw ), + ( virt_to_phys ( queue->cmplt.raw ) + cmplt_len ) ); + } + + /* Allocate queue resources */ + queue->res = dma_alloc ( dma, &queue->res_map, res_len, GVE_ALIGN ); + if ( ! 
queue->res ) { + rc = -ENOMEM; + goto err_res; + } + memset ( queue->res, 0, res_len ); + + return 0; + + dma_free ( &queue->res_map, queue->res, res_len ); + err_res: + if ( cmplt_len ) + dma_ufree ( &queue->cmplt_map, queue->cmplt.raw, cmplt_len ); + err_cmplt: + dma_ufree ( &queue->desc_map, queue->desc.raw, desc_len ); + err_desc: + gve_free_qpl ( gve, &queue->qpl ); + err_qpl: + err_sanity: + return rc; +} + +/** + * Free descriptor queue + * + * @v gve GVE device + * @v queue Descriptor queue + */ +static void gve_free_queue ( struct gve_nic *gve, struct gve_queue *queue ) { + const struct gve_queue_stride *stride = &queue->stride; + size_t desc_len = ( queue->count * stride->desc ); + size_t cmplt_len = ( queue->count * stride->cmplt ); + size_t res_len = sizeof ( *queue->res ); + + /* Free queue resources */ + dma_free ( &queue->res_map, queue->res, res_len ); + + /* Free completions, if applicable */ + if ( cmplt_len ) + dma_ufree ( &queue->cmplt_map, queue->cmplt.raw, cmplt_len ); + + /* Free descriptors */ + dma_ufree ( &queue->desc_map, queue->desc.raw, desc_len ); + + /* Free queue page list */ + gve_free_qpl ( gve, &queue->qpl ); +} + +/** + * Cancel any pending transmissions + * + * @v gve GVE device + */ +static void gve_cancel_tx ( struct gve_nic *gve ) { + struct net_device *netdev = gve->netdev; + struct io_buffer *iobuf; + unsigned int i; + + /* Cancel any pending transmissions */ + for ( i = 0 ; i < ( sizeof ( gve->tx_iobuf ) / + sizeof ( gve->tx_iobuf[0] ) ) ; i++ ) { + iobuf = gve->tx_iobuf[i]; + gve->tx_iobuf[i] = NULL; + if ( iobuf ) + netdev_tx_complete_err ( netdev, iobuf, -ECANCELED ); + } +} + +/** + * Start up device + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_start ( struct gve_nic *gve ) { + struct gve_queue *tx = &gve->tx; + struct gve_queue *rx = &gve->rx; + int rc; + + /* Cancel any pending transmissions */ + gve_cancel_tx ( gve ); + + /* Reset receive sequence */ + gve->seq = gve_next ( 0 ); + + /* Configure device resources */ + if ( ( rc = gve_configure ( gve ) ) != 0 ) + goto err_configure; + + /* Register transmit queue page list */ + if ( ( rc = gve_register ( gve, &tx->qpl ) ) != 0 ) + goto err_register_tx; + + /* Register receive queue page list */ + if ( ( rc = gve_register ( gve, &rx->qpl ) ) != 0 ) + goto err_register_rx; + + /* Create transmit queue */ + if ( ( rc = gve_create_queue ( gve, tx ) ) != 0 ) + goto err_create_tx; + + /* Create receive queue */ + if ( ( rc = gve_create_queue ( gve, rx ) ) != 0 ) + goto err_create_rx; + + return 0; + + gve_destroy_queue ( gve, rx ); + err_create_rx: + gve_destroy_queue ( gve, tx ); + err_create_tx: + gve_unregister ( gve, &rx->qpl ); + err_register_rx: + gve_unregister ( gve, &tx->qpl ); + err_register_tx: + gve_deconfigure ( gve ); + err_configure: + return rc; +} + +/** + * Stop device + * + * @v gve GVE device + */ +static void gve_stop ( struct gve_nic *gve ) { + struct gve_queue *tx = &gve->tx; + struct gve_queue *rx = &gve->rx; + + /* Destroy queues */ + gve_destroy_queue ( gve, rx ); + gve_destroy_queue ( gve, tx ); + + /* Unregister page lists */ + gve_unregister ( gve, &rx->qpl ); + gve_unregister ( gve, &tx->qpl ); + + /* Deconfigure device */ + gve_deconfigure ( gve ); +} + +/** + * Device startup process + * + * @v gve GVE device + */ +static void gve_startup ( struct gve_nic *gve ) { + struct net_device *netdev = gve->netdev; + int rc; + + /* Reset device */ + if ( ( rc = gve_reset ( gve ) ) != 0 ) + goto err_reset; + + /* Enable admin queue */ + 
gve_admin_enable ( gve ); + + /* Start device */ + if ( ( rc = gve_start ( gve ) ) != 0 ) + goto err_start; + + /* Reset retry count */ + gve->retries = 0; + + /* (Ab)use link status to report startup status */ + netdev_link_up ( netdev ); + + return; + + gve_stop ( gve ); + err_start: + err_reset: + DBGC ( gve, "GVE %p startup failed: %s\n", gve, strerror ( rc ) ); + netdev_link_err ( netdev, rc ); + if ( gve->retries++ < GVE_RESET_MAX_RETRY ) + process_add ( &gve->startup ); +} + +/** + * Trigger startup process + * + * @v gve GVE device + */ +static void gve_restart ( struct gve_nic *gve ) { + struct net_device *netdev = gve->netdev; + + /* Mark link down to inhibit polling and transmit activity */ + netdev_link_down ( netdev ); + + /* Schedule startup process */ + process_add ( &gve->startup ); +} + +/** + * Reset recovery watchdog + * + * @v timer Reset recovery watchdog timer + * @v over Failure indicator + */ +static void gve_watchdog ( struct retry_timer *timer, int over __unused ) { + struct gve_nic *gve = container_of ( timer, struct gve_nic, watchdog ); + uint32_t activity; + uint32_t pfn; + int rc; + + /* Reschedule watchdog */ + start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT ); + + /* Reset device (for test purposes) if applicable */ + if ( ( rc = inject_fault ( VM_MIGRATED_RATE ) ) != 0 ) { + DBGC ( gve, "GVE %p synthesising host reset\n", gve ); + writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN ); + } + + /* Check for activity since last timer invocation */ + activity = ( gve->tx.cons + gve->rx.cons ); + if ( activity != gve->activity ) { + gve->activity = activity; + return; + } + + /* Check for reset */ + pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN ); + if ( pfn ) { + DBGC2 ( gve, "GVE %p idle but not in reset\n", gve ); + return; + } + + /* Schedule restart */ + DBGC ( gve, "GVE %p watchdog detected reset by host\n", gve ); + gve_restart ( gve ); +} + +/** + * Open network device + * + * @v netdev Network device + * @ret rc Return status code + */ +static int gve_open ( struct net_device *netdev ) { + struct gve_nic *gve = netdev->priv; + struct gve_queue *tx = &gve->tx; + struct gve_queue *rx = &gve->rx; + int rc; + + /* Allocate shared queue resources */ + if ( ( rc = gve_alloc_shared ( gve ) ) != 0 ) + goto err_alloc_shared; + + /* Allocate and prepopulate transmit queue */ + if ( ( rc = gve_alloc_queue ( gve, tx ) ) != 0 ) + goto err_alloc_tx; + + /* Allocate and prepopulate receive queue */ + if ( ( rc = gve_alloc_queue ( gve, rx ) ) != 0 ) + goto err_alloc_rx; + + /* Trigger startup */ + gve_restart ( gve ); + + /* Start reset recovery watchdog timer */ + start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT ); + + return 0; + + gve_free_queue ( gve, rx ); + err_alloc_rx: + gve_free_queue ( gve, tx ); + err_alloc_tx: + gve_free_shared ( gve ); + err_alloc_shared: + return rc; +} + +/** + * Close network device + * + * @v netdev Network device + */ +static void gve_close ( struct net_device *netdev ) { + struct gve_nic *gve = netdev->priv; + struct gve_queue *tx = &gve->tx; + struct gve_queue *rx = &gve->rx; + + /* Stop reset recovery timer */ + stop_timer ( &gve->watchdog ); + + /* Terminate startup process */ + process_del ( &gve->startup ); + + /* Stop and reset device */ + gve_stop ( gve ); + gve_reset ( gve ); + + /* Cancel any pending transmissions */ + gve_cancel_tx ( gve ); + + /* Free queues */ + gve_free_queue ( gve, rx ); + gve_free_queue ( gve, tx ); + + /* Free shared queue resources */ + gve_free_shared ( gve ); +} + +/** + * Transmit packet + * + 
* @v netdev Network device + * @v iobuf I/O buffer + * @ret rc Return status code + */ +static int gve_transmit ( struct net_device *netdev, struct io_buffer *iobuf ) { + struct gve_nic *gve = netdev->priv; + struct gve_queue *tx = &gve->tx; + struct gve_gqi_tx_descriptor *gqi; + struct gve_dqo_tx_descriptor *dqo; + unsigned int count; + unsigned int index; + unsigned int tag; + unsigned int chain; + uint32_t doorbell; + size_t frag_len; + size_t offset; + size_t next; + size_t len; + + /* Do nothing if queues are not yet set up */ + if ( ! netdev_link_ok ( netdev ) ) + return -ENETDOWN; + + /* Defer packet if there is no space in the transmit ring */ + len = iob_len ( iobuf ); + count = ( ( len + GVE_BUF_SIZE - 1 ) / GVE_BUF_SIZE ); + if ( ( ( tx->prod - tx->cons ) + count ) > tx->fill ) { + netdev_tx_defer ( netdev, iobuf ); + return 0; + } + + /* Copy packet to queue pages and populate descriptors */ + for ( offset = 0, chain = 0 ; ; offset = next, chain = tag ) { + + /* Identify next available buffer */ + index = ( tx->prod++ & ( tx->count - 1 ) ); + tag = tx->tag[ index % GVE_TX_FILL ]; + + /* Sanity check */ + assert ( gve->tx_iobuf[tag] == NULL ); + + /* Copy packet fragment */ + frag_len = ( len - offset ); + if ( frag_len > GVE_BUF_SIZE ) + frag_len = GVE_BUF_SIZE; + memcpy ( gve_buffer ( tx, tag ), + ( iobuf->data + offset ), frag_len ); + next = ( offset + frag_len ); + + /* Populate descriptor */ + if ( gve->mode & GVE_MODE_DQO ) { + + /* Out-of-order descriptor */ + dqo = &tx->desc.tx.dqo[index]; + dqo->buf.addr = + cpu_to_le64 ( gve_address ( tx, tag ) ); + if ( next == len ) { + dqo->type = ( GVE_DQO_TX_TYPE_PACKET | + GVE_DQO_TX_TYPE_LAST ); + dqo->tag.id = tag; + dqo->tag.count = count; + } else { + dqo->type = GVE_DQO_TX_TYPE_PACKET; + dqo->tag.id = 0; + dqo->tag.count = 0; + } + dqo->len = cpu_to_le16 ( frag_len ); + gve->tx_chain[tag] = chain; + + } else { + + /* In-order descriptor */ + gqi = &tx->desc.tx.gqi[index]; + if ( offset ) { + gqi->type = GVE_GQI_TX_TYPE_CONT; + gqi->count = 0; + gqi->total = 0; + } else { + gqi->type = GVE_GQI_TX_TYPE_START; + gqi->count = count; + gqi->total = cpu_to_be16 ( len ); + } + gqi->len = cpu_to_be16 ( frag_len ); + + } + DBGC2 ( gve, "GVE %p TXD %#04x %#02x:%#02x len %#04zx/%#04zx " + "at %#08lx\n", gve, index, tag, count, frag_len, len, + gve_address ( tx, tag ) ); + + /* Record I/O buffer against final descriptor */ + if ( next == len ) { + gve->tx_iobuf[tag] = iobuf; + break; + } + } + assert ( ( tx->prod - tx->cons ) <= tx->fill ); + + /* Ring doorbell */ + doorbell = tx->prod; + if ( gve->mode & GVE_MODE_DQO ) { + doorbell &= ( tx->count - 1 ); + } else { + doorbell = bswap_32 ( doorbell ); + } + wmb(); + writel ( doorbell, tx->db ); + + return 0; +} + +/** + * Poll for completed transmissions + * + * @v netdev Network device + */ +static void gve_poll_tx ( struct net_device *netdev ) { + struct gve_nic *gve = netdev->priv; + struct gve_queue *tx = &gve->tx; + struct gve_dqo_tx_completion *dqo; + struct io_buffer *iobuf; + unsigned int index; + unsigned int gen; + unsigned int bit; + unsigned int tag; + uint32_t count; + + /* Process transmit completions */ + if ( gve->mode & GVE_MODE_DQO ) { + + /* Out-of-order completions */ + while ( 1 ) { + + /* Read next possible completion */ + gen = ( tx->done & tx->count ); + index = ( tx->done & ( tx->count - 1 ) ); + dqo = &tx->cmplt.tx.dqo[index]; + + /* Check generation bit */ + bit = ( dqo->flags & GVE_DQO_TXF_GEN ); + if ( ( !! bit ) == ( !! 
gen ) ) + break; + rmb(); + tx->done++; + + /* Ignore non-packet completions */ + if ( ( ! ( dqo->flags & GVE_DQO_TXF_PKT ) ) || + ( dqo->tag.count < 0 ) ) { + DBGC2 ( gve, "GVE %p TXC %#04x flags %#02x " + "ignored\n", gve, index, dqo->flags ); + continue; + } + + /* Parse completion */ + tag = dqo->tag.id; + count = dqo->tag.count; + iobuf = gve->tx_iobuf[tag]; + gve->tx_iobuf[tag] = NULL; + assert ( iobuf != NULL ); + + /* Return completed descriptors to ring */ + while ( count-- ) { + DBGC2 ( gve, "GVE %p TXC %#04x %#02x:%#02x " + "complete\n", gve, index, tag, + dqo->tag.count ); + tx->tag[ tx->cons++ % GVE_TX_FILL ] = tag; + tag = gve->tx_chain[tag]; + } + + /* Hand off to network stack */ + if ( iobuf ) + netdev_tx_complete ( netdev, iobuf ); + } + + } else { + + /* Read event counter */ + count = be32_to_cpu ( tx->event->count ); + + /* Process transmit completions */ + while ( count != tx->cons ) { + DBGC2 ( gve, "GVE %p TXC %#04x complete\n", + gve, tx->cons ); + tag = ( tx->cons % GVE_TX_FILL ); + iobuf = gve->tx_iobuf[tag]; + gve->tx_iobuf[tag] = NULL; + tx->cons++; + if ( iobuf ) + netdev_tx_complete ( netdev, iobuf ); + } + } +} + +/** + * Poll for received packets + * + * @v netdev Network device + */ +static void gve_poll_rx ( struct net_device *netdev ) { + struct gve_nic *gve = netdev->priv; + struct gve_queue *rx = &gve->rx; + struct gve_gqi_rx_completion *gqi; + struct gve_dqo_rx_completion *dqo; + struct io_buffer *iobuf; + unsigned int index; + unsigned int gen; + unsigned int bit; + unsigned int seq; + unsigned int tag; + uint32_t done; + size_t total; + size_t len; + int rc; + + /* Process receive completions */ + done = rx->done; + seq = gve->seq; + total = 0; + while ( 1 ) { + + /* Read next possible completion */ + rc = 0; + gen = ( done & rx->count ); + index = ( done++ & ( rx->count - 1 ) ); + if ( gve->mode & GVE_MODE_DQO ) { + + /* Out-of-order completion */ + dqo = &rx->cmplt.rx.dqo[index]; + + /* Check generation bit */ + bit = ( dqo->len & cpu_to_le16 ( GVE_DQO_RXL_GEN ) ); + if ( ( !! bit ) == ( !! gen ) ) + break; + rmb(); + + /* Parse completion */ + len = ( le16_to_cpu ( dqo->len ) & + ( GVE_BUF_SIZE - 1 ) ); + tag = dqo->tag; + DBGC2 ( gve, "GVE %p RXC %#04x %#02x:%#02x len %#04zx " + "at %#08zx\n", gve, index, tag, dqo->flags, + len, gve_offset ( rx, tag ) ); + + /* Accumulate a complete packet */ + if ( dqo->status & GVE_DQO_RXS_ERROR ) { + rc = -EIO; + total = 0; + } else { + total += len; + if ( ! ( dqo->flags & GVE_DQO_RXF_LAST ) ) + continue; + } + + } else { + + /* In-order completion */ + gqi = &rx->cmplt.rx.gqi[index]; + + /* Check sequence number */ + if ( ( gqi->seq & GVE_GQI_RX_SEQ_MASK ) != seq ) + break; + rmb(); + seq = gve_next ( seq ); + + /* Parse completion */ + len = be16_to_cpu ( gqi->len ); + tag = ( index % GVE_RX_FILL ); + DBGC2 ( gve, "GVE %p RXC %#04x %#02x:%#02x len %#04zx " + "at %#08zx\n", gve, index, gqi->seq, + gqi->flags, len, gve_offset ( rx, tag ) ); + + /* Accumulate a complete packet */ + if ( gqi->flags & GVE_GQI_RXF_ERROR ) { + rc = -EIO; + total = 0; + } else { + total += len; + if ( gqi->flags & GVE_GQI_RXF_MORE ) + continue; + } + gve->seq = seq; + } + + /* Allocate and populate I/O buffer */ + iobuf = ( total ? 
alloc_iob ( total ) : NULL ); + for ( ; rx->done != done ; rx->done++ ) { + + /* Re-read completion and return tag to ring */ + index = ( rx->done & ( rx->count - 1 ) ); + if ( gve->mode & GVE_MODE_DQO ) { + dqo = &rx->cmplt.rx.dqo[index]; + tag = dqo->tag; + len = ( le16_to_cpu ( dqo->len ) & + ( GVE_BUF_SIZE - 1 ) ); + rx->tag[ rx->cons++ % GVE_RX_FILL ] = tag; + } else { + gqi = &rx->cmplt.rx.gqi[index]; + tag = ( index % GVE_RX_FILL ); + len = be16_to_cpu ( gqi->len ); + assert ( rx->cons == rx->done ); + rx->cons++; + } + + /* Copy data */ + if ( iobuf ) { + memcpy ( iob_put ( iobuf, len ), + gve_buffer ( rx, tag ), len ); + } + } + assert ( ( iobuf == NULL ) || ( iob_len ( iobuf ) == total ) ); + total = 0; + + /* Hand off packet to network stack */ + if ( iobuf ) { + if ( ! ( gve->mode & GVE_MODE_DQO ) ) + iob_pull ( iobuf, GVE_GQI_RX_PAD ); + netdev_rx ( netdev, iobuf ); + } else { + netdev_rx_err ( netdev, NULL, ( rc ? rc : -ENOMEM ) ); + } + } +} + +/** + * Refill receive queue + * + * @v netdev Network device + */ +static void gve_refill_rx ( struct net_device *netdev ) { + struct gve_nic *gve = netdev->priv; + struct gve_queue *rx = &gve->rx; + struct gve_dqo_rx_descriptor *dqo; + unsigned int refill; + unsigned int index; + unsigned int tag; + uint32_t doorbell; + + /* Calculate refill quantity */ + doorbell = ( rx->cons + rx->fill ); + refill = ( doorbell - rx->prod ); + if ( ! refill ) + return; + + /* Refill ring */ + if ( gve->mode & GVE_MODE_DQO ) { + + /* Out-of-order descriptors */ + while ( refill-- ) { + + /* Identify next available buffer */ + index = ( rx->prod++ & ( rx->count - 1 ) ); + tag = rx->tag[ index % GVE_RX_FILL ]; + + /* Populate descriptor */ + dqo = &rx->desc.rx.dqo[index]; + dqo->tag = tag; + dqo->buf.addr = + cpu_to_le64 ( gve_address ( rx, tag ) ); + DBGC2 ( gve, "GVE %p RXD %#04x:%#02x at %#08llx\n", + gve, index, dqo->tag, + ( ( unsigned long long ) + le64_to_cpu ( dqo->buf.addr ) ) ); + } + wmb(); + assert ( rx->prod == doorbell ); + + } else { + + /* The in-order receive descriptors are prepopulated + * at the time of creating the receive queue (pointing + * to the preallocated queue pages). Refilling is + * therefore just a case of ringing the doorbell if + * the device is not yet aware of any available + * descriptors. + */ + rx->prod += refill; + assert ( rx->prod == doorbell ); + DBGC2 ( gve, "GVE %p RXD %#04x ready\n", gve, rx->prod ); + + /* Doorbell is big-endian */ + doorbell = bswap_32 ( doorbell ); + } + + /* Ring doorbell */ + writel ( doorbell, rx->db ); +} + +/** + * Poll for completed and received packets + * + * @v netdev Network device + */ +static void gve_poll ( struct net_device *netdev ) { + struct gve_nic *gve = netdev->priv; + + /* Do nothing if queues are not yet set up */ + if ( ! 
netdev_link_ok ( netdev ) ) + return; + + /* Poll for transmit completions */ + gve_poll_tx ( netdev ); + + /* Poll for receive completions */ + gve_poll_rx ( netdev ); + + /* Refill receive queue */ + gve_refill_rx ( netdev ); + + /* Rearm queue interrupts if applicable */ + if ( gve->mode & GVE_MODE_DQO ) { + writel ( GVE_DQO_IRQ_REARM, gve->irqs.db[GVE_TX_IRQ] ); + writel ( GVE_DQO_IRQ_REARM, gve->irqs.db[GVE_RX_IRQ] ); + } +} + +/** GVE network device operations */ +static struct net_device_operations gve_operations = { + .open = gve_open, + .close = gve_close, + .transmit = gve_transmit, + .poll = gve_poll, +}; + +/****************************************************************************** + * + * PCI interface + * + ****************************************************************************** + */ + +/** Transmit descriptor queue type */ +static const struct gve_queue_type gve_tx_type = { + .name = "TX", + .param = gve_create_tx_param, + .qpl = GVE_TX_QPL, + .irq = GVE_TX_IRQ, + .fill = GVE_TX_FILL, + .stride = { + .gqi = { + .desc = sizeof ( struct gve_gqi_tx_descriptor ), + }, + .dqo = { + .desc = sizeof ( struct gve_dqo_tx_descriptor ), + .cmplt = sizeof ( struct gve_dqo_tx_completion ), + }, + }, + .create = GVE_ADMIN_CREATE_TX, + .destroy = GVE_ADMIN_DESTROY_TX, +}; + +/** Receive descriptor queue type */ +static const struct gve_queue_type gve_rx_type = { + .name = "RX", + .param = gve_create_rx_param, + .qpl = GVE_RX_QPL, + .irq = GVE_RX_IRQ, + .fill = GVE_RX_FILL, + .stride = { + .gqi = { + .desc = sizeof ( struct gve_gqi_rx_descriptor ), + .cmplt = sizeof ( struct gve_gqi_rx_completion ), + }, + .dqo = { + .desc = sizeof ( struct gve_dqo_rx_descriptor ), + .cmplt = sizeof ( struct gve_dqo_rx_completion ), + }, + }, + .create = GVE_ADMIN_CREATE_RX, + .destroy = GVE_ADMIN_DESTROY_RX, +}; + +/** + * Set up admin queue and get device description + * + * @v gve GVE device + * @ret rc Return status code + */ +static int gve_setup ( struct gve_nic *gve ) { + unsigned int i; + int rc; + + /* Attempt several times, since the device may decide to add + * in a few spurious resets. + */ + for ( i = 0 ; i < GVE_RESET_MAX_RETRY ; i++ ) { + + /* Reset device */ + if ( ( rc = gve_reset ( gve ) ) != 0 ) + continue; + + /* Enable admin queue */ + gve_admin_enable ( gve ); + + /* Fetch MAC address */ + if ( ( rc = gve_describe ( gve ) ) != 0 ) + continue; + + /* Success */ + return 0; + } + + DBGC ( gve, "GVE %p failed to get device description: %s\n", + gve, strerror ( rc ) ); + return rc; +} + +/** Device startup process descriptor */ +static struct process_descriptor gve_startup_desc = + PROC_DESC_ONCE ( struct gve_nic, startup, gve_startup ); + +/** + * Probe PCI device + * + * @v pci PCI device + * @ret rc Return status code + */ +static int gve_probe ( struct pci_device *pci ) { + struct net_device *netdev; + struct gve_nic *gve; + unsigned long cfg_start; + unsigned long db_start; + unsigned long db_size; + int rc; + + /* Allocate and initialise net device */ + netdev = alloc_etherdev ( sizeof ( *gve ) ); + if ( ! 
netdev ) {
+		rc = -ENOMEM;
+		goto err_alloc;
+	}
+	netdev_init ( netdev, &gve_operations );
+	gve = netdev->priv;
+	pci_set_drvdata ( pci, netdev );
+	netdev->dev = &pci->dev;
+	memset ( gve, 0, sizeof ( *gve ) );
+	gve->netdev = netdev;
+	gve->tx.type = &gve_tx_type;
+	gve->rx.type = &gve_rx_type;
+	gve->tx.tag = gve->tx_tag;
+	gve->rx.tag = gve->rx_tag;
+	process_init_stopped ( &gve->startup, &gve_startup_desc,
+			       &netdev->refcnt );
+	timer_init ( &gve->watchdog, gve_watchdog, &netdev->refcnt );
+
+	/* Fix up PCI device */
+	adjust_pci_device ( pci );
+
+	/* Check PCI revision */
+	pci_read_config_byte ( pci, PCI_REVISION, &gve->revision );
+	DBGC ( gve, "GVE %p is revision %#02x\n", gve, gve->revision );
+
+	/* Map configuration registers */
+	cfg_start = pci_bar_start ( pci, GVE_CFG_BAR );
+	gve->cfg = pci_ioremap ( pci, cfg_start, GVE_CFG_SIZE );
+	if ( ! gve->cfg ) {
+		rc = -ENODEV;
+		goto err_cfg;
+	}
+
+	/* Map doorbell registers */
+	db_start = pci_bar_start ( pci, GVE_DB_BAR );
+	db_size = pci_bar_size ( pci, GVE_DB_BAR );
+	gve->db = pci_ioremap ( pci, db_start, db_size );
+	if ( ! gve->db ) {
+		rc = -ENODEV;
+		goto err_db;
+	}
+
+	/* Configure DMA */
+	gve->dma = &pci->dma;
+	dma_set_mask_64bit ( gve->dma );
+	assert ( netdev->dma == NULL );
+
+	/* Configure dummy MSI-X interrupt */
+	if ( ( rc = pci_msix_enable ( pci, &gve->msix ) ) != 0 )
+		goto err_msix;
+
+	/* Allocate admin queue */
+	if ( ( rc = gve_admin_alloc ( gve ) ) != 0 )
+		goto err_admin;
+
+	/* Set up the device */
+	if ( ( rc = gve_setup ( gve ) ) != 0 )
+		goto err_setup;
+
+	/* Register network device */
+	if ( ( rc = register_netdev ( netdev ) ) != 0 )
+		goto err_register_netdev;
+
+	return 0;
+
+	unregister_netdev ( netdev );
+ err_register_netdev:
+ err_setup:
+	gve_reset ( gve );
+	gve_admin_free ( gve );
+ err_admin:
+	pci_msix_disable ( pci, &gve->msix );
+ err_msix:
+	iounmap ( gve->db );
+ err_db:
+	iounmap ( gve->cfg );
+ err_cfg:
+	netdev_nullify ( netdev );
+	netdev_put ( netdev );
+ err_alloc:
+	return rc;
+}
+
+/**
+ * Remove PCI device
+ *
+ * @v pci		PCI device
+ */
+static void gve_remove ( struct pci_device *pci ) {
+	struct net_device *netdev = pci_get_drvdata ( pci );
+	struct gve_nic *gve = netdev->priv;
+
+	/* Unregister network device */
+	unregister_netdev ( netdev );
+
+	/* Reset device */
+	gve_reset ( gve );
+
+	/* Free admin queue */
+	gve_admin_free ( gve );
+
+	/* Disable dummy MSI-X interrupt */
+	pci_msix_disable ( pci, &gve->msix );
+
+	/* Unmap registers */
+	iounmap ( gve->db );
+	iounmap ( gve->cfg );
+
+	/* Free network device */
+	netdev_nullify ( netdev );
+	netdev_put ( netdev );
+}
+
+/** GVE PCI device IDs */
+static struct pci_device_id gve_nics[] = {
+	PCI_ROM ( 0x1ae0, 0x0042, "gve", "gVNIC", 0 ),
+};
+
+/** GVE PCI driver */
+struct pci_driver gve_driver __pci_driver = {
+	.ids = gve_nics,
+	.id_count = ( sizeof ( gve_nics ) / sizeof ( gve_nics[0] ) ),
+	.probe = gve_probe,
+	.remove = gve_remove,
+};
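
The GQI receive path in this patch relies on the modulo-7 sequence counter implemented by gve_next(): a three-bit field that cycles through 1 to 7, so that the sequence value changes on every completion write and never takes the zero value found in a freshly cleared completion ring. Below is a minimal standalone sketch of that counter; the mask semantics mirror GVE_GQI_RX_SEQ_MASK from the patch, while the surrounding test harness is illustrative only and not part of iPXE.

/* Standalone illustration of the gve_next() sequence counter.  The
 * mask value mirrors GVE_GQI_RX_SEQ_MASK (a three-bit sequence
 * field); everything else here is a test harness, not iPXE code.
 */
#include <assert.h>
#include <stdio.h>

#define SEQ_MASK 0x7	/* three-bit sequence field, as in the patch */

static unsigned int next_seq ( unsigned int seq ) {
	/* Advance modulo 8, then map the value 0 back to 1 so that
	 * the counter cycles through the seven non-zero values.
	 */
	seq = ( ( seq + 1 ) & SEQ_MASK );
	return ( seq ? seq : 1 );
}

int main ( void ) {
	unsigned int seq = next_seq ( 0 );	/* start of sequence: 1 */
	unsigned int i;

	for ( i = 0 ; i < 16 ; i++ ) {
		assert ( seq != 0 );	/* never the zero reset value */
		printf ( "%u ", seq );
		seq = next_seq ( seq );
	}
	printf ( "\n" );
	return 0;
}

Built with any C compiler, this prints 1 2 3 4 5 6 7 1 2 ...: because 7 is coprime to any power-of-two ring size, the value written to a given completion slot differs on every pass around the ring, and it never collides with the zero-initialised contents of a new ring.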

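A similar isolated check applies to the fixed-size buffer arithmetic shared by gve_offset(), gve_alloc_qpl() and gve_transmit(): buffers of GVE_BUF_SIZE are reused in strict rotation at fixed offsets within the queue page list, pages are allocated to cover the maximum fill level, and a packet consumes one descriptor per buffer-sized fragment. The sketch below reproduces those formulas in isolation; the GVE_PAGE_SIZE and GVE_BUF_SIZE values and the fill level are assumptions chosen for illustration, not values quoted from gve.h.

/* Standalone sketch of the buffer arithmetic used by gve_offset(),
 * gve_alloc_qpl() and gve_transmit().  The page and buffer sizes are
 * assumed values for illustration; the formulas match the patch.
 */
#include <stdio.h>

#define GVE_PAGE_SIZE 4096	/* assumed: one queue page */
#define GVE_BUF_SIZE 2048	/* assumed: one fixed-size buffer */
#define GVE_BUF_PER_PAGE ( GVE_PAGE_SIZE / GVE_BUF_SIZE )

int main ( void ) {
	unsigned int fill = 64;	/* assumed queue fill level */
	size_t len = 3000;	/* example packet length */
	unsigned int pages;
	unsigned int count;
	unsigned int tag;

	/* gve_offset(): buffer tag N always lives at a fixed offset */
	for ( tag = 0 ; tag < 4 ; tag++ )
		printf ( "tag %u -> offset %#x\n", tag,
			 tag * GVE_BUF_SIZE );

	/* gve_alloc_qpl(): pages needed for the maximum fill level */
	pages = ( ( fill + GVE_BUF_PER_PAGE - 1 ) / GVE_BUF_PER_PAGE );
	printf ( "%u buffers -> %u pages\n", fill, pages );

	/* gve_transmit(): descriptors consumed by one packet */
	count = ( ( len + GVE_BUF_SIZE - 1 ) / GVE_BUF_SIZE );
	printf ( "len %zu -> %u descriptors\n", len, count );
	return 0;
}

With the assumed 2048-byte buffers, a 3000-byte frame occupies two descriptors, which is why gve_transmit() defers a packet unless the ring has at least that many free slots below the fill level.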