summaryrefslogtreecommitdiffstats
path: root/contrib/libvhost-user/libvhost-user.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/libvhost-user/libvhost-user.c')
-rw-r--r--contrib/libvhost-user/libvhost-user.c468
1 files changed, 401 insertions, 67 deletions
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 3f14b4138b..e08d6c7b97 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -41,6 +41,8 @@
#endif
#include "qemu/atomic.h"
+#include "qemu/osdep.h"
+#include "qemu/memfd.h"
#include "libvhost-user.h"
@@ -53,6 +55,18 @@
_min1 < _min2 ? _min1 : _min2; })
#endif
+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+/* Align each region to cache line size in inflight buffer */
+#define INFLIGHT_ALIGNMENT 64
+
+/* The version of inflight buffer */
+#define INFLIGHT_VERSION 1
+
#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
/* The version of the protocol we support */
@@ -66,6 +80,20 @@
} \
} while (0)
+static inline
+bool has_feature(uint64_t features, unsigned int fbit)
+{
+ assert(fbit < 64);
+ return !!(features & (1ULL << fbit));
+}
+
+static inline
+bool vu_has_feature(VuDev *dev,
+ unsigned int fbit)
+{
+ return has_feature(dev->features, fbit);
+}
+
static const char *
vu_request_to_string(unsigned int req)
{
@@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
REQ(VHOST_USER_POSTCOPY_ADVISE),
REQ(VHOST_USER_POSTCOPY_LISTEN),
REQ(VHOST_USER_POSTCOPY_END),
+ REQ(VHOST_USER_GET_INFLIGHT_FD),
+ REQ(VHOST_USER_SET_INFLIGHT_FD),
REQ(VHOST_USER_MAX),
};
#undef REQ
@@ -890,6 +920,91 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
return true;
}
+static int
+inflight_desc_compare(const void *a, const void *b)
+{
+ VuVirtqInflightDesc *desc0 = (VuVirtqInflightDesc *)a,
+ *desc1 = (VuVirtqInflightDesc *)b;
+
+ if (desc1->counter > desc0->counter &&
+ (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
+ return 1;
+ }
+
+ return -1;
+}
+
+static int
+vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
+{
+ int i = 0;
+
+ if (!has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+ return 0;
+ }
+
+ if (unlikely(!vq->inflight)) {
+ return -1;
+ }
+
+ if (unlikely(!vq->inflight->version)) {
+ /* initialize the buffer */
+ vq->inflight->version = INFLIGHT_VERSION;
+ return 0;
+ }
+
+ vq->used_idx = vq->vring.used->idx;
+ vq->resubmit_num = 0;
+ vq->resubmit_list = NULL;
+ vq->counter = 0;
+
+ if (unlikely(vq->inflight->used_idx != vq->used_idx)) {
+ vq->inflight->desc[vq->inflight->last_batch_head].inflight = 0;
+
+ barrier();
+
+ vq->inflight->used_idx = vq->used_idx;
+ }
+
+ for (i = 0; i < vq->inflight->desc_num; i++) {
+ if (vq->inflight->desc[i].inflight == 1) {
+ vq->inuse++;
+ }
+ }
+
+ vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
+
+ if (vq->inuse) {
+ vq->resubmit_list = malloc(sizeof(VuVirtqInflightDesc) * vq->inuse);
+ if (!vq->resubmit_list) {
+ return -1;
+ }
+
+ for (i = 0; i < vq->inflight->desc_num; i++) {
+ if (vq->inflight->desc[i].inflight) {
+ vq->resubmit_list[vq->resubmit_num].index = i;
+ vq->resubmit_list[vq->resubmit_num].counter =
+ vq->inflight->desc[i].counter;
+ vq->resubmit_num++;
+ }
+ }
+
+ if (vq->resubmit_num > 1) {
+ qsort(vq->resubmit_list, vq->resubmit_num,
+ sizeof(VuVirtqInflightDesc), inflight_desc_compare);
+ }
+ vq->counter = vq->resubmit_list[0].counter + 1;
+ }
+
+ /* in case of I/O hang after reconnecting */
+ if (eventfd_write(vq->kick_fd, 1)) {
+ return -1;
+ }
+
+ return 0;
+}
+
static bool
vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
{
@@ -907,10 +1022,8 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->vq[index].kick_fd = -1;
}
- if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
- dev->vq[index].kick_fd = vmsg->fds[0];
- DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
- }
+ dev->vq[index].kick_fd = vmsg->fds[0];
+ DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
dev->vq[index].started = true;
if (dev->iface->queue_set_started) {
@@ -925,6 +1038,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->vq[index].kick_fd, index);
}
+ if (vu_check_queue_inflights(dev, &dev->vq[index])) {
+ vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
+ }
+
return false;
}
@@ -995,8 +1112,11 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->vq[index].call_fd = -1;
}
- if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
- dev->vq[index].call_fd = vmsg->fds[0];
+ dev->vq[index].call_fd = vmsg->fds[0];
+
+ /* in case of I/O hang after reconnecting */
+ if (eventfd_write(vmsg->fds[0], 1)) {
+ return -1;
}
DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
@@ -1020,9 +1140,7 @@ vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->vq[index].err_fd = -1;
}
- if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
- dev->vq[index].err_fd = vmsg->fds[0];
- }
+ dev->vq[index].err_fd = vmsg->fds[0];
return false;
}
@@ -1215,6 +1333,116 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
return true;
}
+static inline uint64_t
+vu_inflight_queue_size(uint16_t queue_size)
+{
+ return ALIGN_UP(sizeof(VuDescStateSplit) * queue_size +
+ sizeof(uint16_t), INFLIGHT_ALIGNMENT);
+}
+
+static bool
+vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+ int fd;
+ void *addr;
+ uint64_t mmap_size;
+ uint16_t num_queues, queue_size;
+
+ if (vmsg->size != sizeof(vmsg->payload.inflight)) {
+ vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
+ vmsg->payload.inflight.mmap_size = 0;
+ return true;
+ }
+
+ num_queues = vmsg->payload.inflight.num_queues;
+ queue_size = vmsg->payload.inflight.queue_size;
+
+ DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+ DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
+
+ mmap_size = vu_inflight_queue_size(queue_size) * num_queues;
+
+ addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
+ F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+ &fd, NULL);
+
+ if (!addr) {
+ vu_panic(dev, "Failed to alloc vhost inflight area");
+ vmsg->payload.inflight.mmap_size = 0;
+ return true;
+ }
+
+ memset(addr, 0, mmap_size);
+
+ dev->inflight_info.addr = addr;
+ dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
+ dev->inflight_info.fd = vmsg->fds[0] = fd;
+ vmsg->fd_num = 1;
+ vmsg->payload.inflight.mmap_offset = 0;
+
+ DPRINT("send inflight mmap_size: %"PRId64"\n",
+ vmsg->payload.inflight.mmap_size);
+ DPRINT("send inflight mmap offset: %"PRId64"\n",
+ vmsg->payload.inflight.mmap_offset);
+
+ return true;
+}
+
+static bool
+vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+ int fd, i;
+ uint64_t mmap_size, mmap_offset;
+ uint16_t num_queues, queue_size;
+ void *rc;
+
+ if (vmsg->fd_num != 1 ||
+ vmsg->size != sizeof(vmsg->payload.inflight)) {
+ vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
+ vmsg->size, vmsg->fd_num);
+ return false;
+ }
+
+ fd = vmsg->fds[0];
+ mmap_size = vmsg->payload.inflight.mmap_size;
+ mmap_offset = vmsg->payload.inflight.mmap_offset;
+ num_queues = vmsg->payload.inflight.num_queues;
+ queue_size = vmsg->payload.inflight.queue_size;
+
+ DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
+ DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
+ DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+ DPRINT("set_inflight_fd queue_size: %"PRId16"\n", queue_size);
+
+ rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, mmap_offset);
+
+ if (rc == MAP_FAILED) {
+ vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
+ return false;
+ }
+
+ if (dev->inflight_info.fd) {
+ close(dev->inflight_info.fd);
+ }
+
+ if (dev->inflight_info.addr) {
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+ }
+
+ dev->inflight_info.fd = fd;
+ dev->inflight_info.addr = rc;
+ dev->inflight_info.size = mmap_size;
+
+ for (i = 0; i < num_queues; i++) {
+ dev->vq[i].inflight = (VuVirtqInflight *)rc;
+ dev->vq[i].inflight->desc_num = queue_size;
+ rc = (void *)((char *)rc + vu_inflight_queue_size(queue_size));
+ }
+
+ return false;
+}
+
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
@@ -1285,13 +1513,18 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
case VHOST_USER_SET_CONFIG:
return vu_set_config(dev, vmsg);
case VHOST_USER_NONE:
- break;
+ /* if you need processing before exit, override iface->process_msg */
+ exit(0);
case VHOST_USER_POSTCOPY_ADVISE:
return vu_set_postcopy_advise(dev, vmsg);
case VHOST_USER_POSTCOPY_LISTEN:
return vu_set_postcopy_listen(dev, vmsg);
case VHOST_USER_POSTCOPY_END:
return vu_set_postcopy_end(dev, vmsg);
+ case VHOST_USER_GET_INFLIGHT_FD:
+ return vu_get_inflight_fd(dev, vmsg);
+ case VHOST_USER_SET_INFLIGHT_FD:
+ return vu_set_inflight_fd(dev, vmsg);
default:
vmsg_close_fds(vmsg);
vu_panic(dev, "Unhandled request: %d", vmsg->request);
@@ -1359,8 +1592,24 @@ vu_deinit(VuDev *dev)
close(vq->err_fd);
vq->err_fd = -1;
}
+
+ if (vq->resubmit_list) {
+ free(vq->resubmit_list);
+ vq->resubmit_list = NULL;
+ }
+
+ vq->inflight = NULL;
}
+ if (dev->inflight_info.addr) {
+ munmap(dev->inflight_info.addr, dev->inflight_info.size);
+ dev->inflight_info.addr = NULL;
+ }
+
+ if (dev->inflight_info.fd > 0) {
+ close(dev->inflight_info.fd);
+ dev->inflight_info.fd = -1;
+ }
vu_close_log(dev);
if (dev->slave_fd != -1) {
@@ -1687,20 +1936,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
return vring_avail_idx(vq) == vq->last_avail_idx;
}
-static inline
-bool has_feature(uint64_t features, unsigned int fbit)
-{
- assert(fbit < 64);
- return !!(features & (1ULL << fbit));
-}
-
-static inline
-bool vu_has_feature(VuDev *dev,
- unsigned int fbit)
-{
- return has_feature(dev->features, fbit);
-}
-
static bool
vring_notify(VuDev *dev, VuVirtq *vq)
{
@@ -1829,12 +2064,6 @@ virtqueue_map_desc(VuDev *dev,
*p_num_sg = num_sg;
}
-/* Round number down to multiple */
-#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
-
-/* Round number up to multiple */
-#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
-
static void *
virtqueue_alloc_element(size_t sz,
unsigned out_num, unsigned in_num)
@@ -1853,49 +2082,20 @@ virtqueue_alloc_element(size_t sz,
return elem;
}
-void *
-vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
+static void *
+vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
{
- unsigned int i, head, max, desc_len;
+ struct vring_desc *desc = vq->vring.desc;
uint64_t desc_addr, read_len;
+ unsigned int desc_len;
+ unsigned int max = vq->vring.num;
+ unsigned int i = idx;
VuVirtqElement *elem;
- unsigned out_num, in_num;
+ unsigned int out_num = 0, in_num = 0;
struct iovec iov[VIRTQUEUE_MAX_SIZE];
struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
- struct vring_desc *desc;
int rc;
- if (unlikely(dev->broken) ||
- unlikely(!vq->vring.avail)) {
- return NULL;
- }
-
- if (vu_queue_empty(dev, vq)) {
- return NULL;
- }
- /* Needed after virtio_queue_empty(), see comment in
- * virtqueue_num_heads(). */
- smp_rmb();
-
- /* When we start there are none of either input nor output. */
- out_num = in_num = 0;
-
- max = vq->vring.num;
- if (vq->inuse >= vq->vring.num) {
- vu_panic(dev, "Virtqueue size exceeded");
- return NULL;
- }
-
- if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
- return NULL;
- }
-
- if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
- vring_set_avail_event(vq, vq->last_avail_idx);
- }
-
- i = head;
- desc = vq->vring.desc;
if (desc[i].flags & VRING_DESC_F_INDIRECT) {
if (desc[i].len % sizeof(struct vring_desc)) {
vu_panic(dev, "Invalid size for indirect buffer table");
@@ -1947,12 +2147,13 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
} while (rc == VIRTQUEUE_READ_DESC_MORE);
if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+ vu_panic(dev, "read descriptor error");
return NULL;
}
/* Now copy what we have collected and mapped */
elem = virtqueue_alloc_element(sz, out_num, in_num);
- elem->index = head;
+ elem->index = idx;
for (i = 0; i < out_num; i++) {
elem->out_sg[i] = iov[i];
}
@@ -1960,11 +2161,142 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
elem->in_sg[i] = iov[out_num + i];
}
+ return elem;
+}
+
+static int
+vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+ if (!has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+ return 0;
+ }
+
+ if (unlikely(!vq->inflight)) {
+ return -1;
+ }
+
+ vq->inflight->desc[desc_idx].counter = vq->counter++;
+ vq->inflight->desc[desc_idx].inflight = 1;
+
+ return 0;
+}
+
+static int
+vu_queue_inflight_pre_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+ if (!has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+ return 0;
+ }
+
+ if (unlikely(!vq->inflight)) {
+ return -1;
+ }
+
+ vq->inflight->last_batch_head = desc_idx;
+
+ return 0;
+}
+
+static int
+vu_queue_inflight_post_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+ if (!has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+ return 0;
+ }
+
+ if (unlikely(!vq->inflight)) {
+ return -1;
+ }
+
+ barrier();
+
+ vq->inflight->desc[desc_idx].inflight = 0;
+
+ barrier();
+
+ vq->inflight->used_idx = vq->used_idx;
+
+ return 0;
+}
+
+void *
+vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
+{
+ int i;
+ unsigned int head;
+ VuVirtqElement *elem;
+
+ if (unlikely(dev->broken) ||
+ unlikely(!vq->vring.avail)) {
+ return NULL;
+ }
+
+ if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
+ i = (--vq->resubmit_num);
+ elem = vu_queue_map_desc(dev, vq, vq->resubmit_list[i].index, sz);
+
+ if (!vq->resubmit_num) {
+ free(vq->resubmit_list);
+ vq->resubmit_list = NULL;
+ }
+
+ return elem;
+ }
+
+ if (vu_queue_empty(dev, vq)) {
+ return NULL;
+ }
+ /*
+ * Needed after virtio_queue_empty(), see comment in
+ * virtqueue_num_heads().
+ */
+ smp_rmb();
+
+ if (vq->inuse >= vq->vring.num) {
+ vu_panic(dev, "Virtqueue size exceeded");
+ return NULL;
+ }
+
+ if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) {
+ return NULL;
+ }
+
+ if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
+ vring_set_avail_event(vq, vq->last_avail_idx);
+ }
+
+ elem = vu_queue_map_desc(dev, vq, head, sz);
+
+ if (!elem) {
+ return NULL;
+ }
+
vq->inuse++;
+ vu_queue_inflight_get(dev, vq, head);
+
return elem;
}
+static void
+vu_queue_detach_element(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
+ size_t len)
+{
+ vq->inuse--;
+ /* unmap, when DMA support is added */
+}
+
+void
+vu_queue_unpop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem,
+ size_t len)
+{
+ vq->last_avail_idx--;
+ vu_queue_detach_element(dev, vq, elem, len);
+}
+
bool
vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num)
{
@@ -2106,5 +2438,7 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
const VuVirtqElement *elem, unsigned int len)
{
vu_queue_fill(dev, vq, elem, len, 0);
+ vu_queue_inflight_pre_put(dev, vq, elem->index);
vu_queue_flush(dev, vq, 1);
+ vu_queue_inflight_post_put(dev, vq, elem->index);
}