From e70372fcaffc99444edce400a5178cb196cddaf7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Feb 2018 10:39:32 -0500 Subject: lockable: add QemuLockable QemuLockable is a polymorphic lock type that takes an object and knows which function to use for locking and unlocking. The implementation could use C11 _Generic, but since the support is not very widespread I am instead using __builtin_choose_expr and __builtin_types_compatible_p, which are already used by include/qemu/atomic.h. QemuLockable can be used to implement lock guards, or to pass around a lock in such a way that a function can release it and re-acquire it. The next patch will do this for CoQueue. Signed-off-by: Paolo Bonzini Message-Id: <20180203153935.8056-3-pbonzini@redhat.com> Reviewed-by: Richard Henderson Reviewed-by: Stefan Hajnoczi Reviewed-by: Fam Zheng Signed-off-by: Fam Zheng --- include/qemu/compiler.h | 39 ++++++++++++++++++++ include/qemu/coroutine.h | 4 +- include/qemu/lockable.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ include/qemu/thread.h | 5 +-- include/qemu/typedefs.h | 4 ++ 5 files changed, 143 insertions(+), 5 deletions(-) create mode 100644 include/qemu/lockable.h (limited to 'include') diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h index 5fcc4f7ec7..2cbe6a4f16 100644 --- a/include/qemu/compiler.h +++ b/include/qemu/compiler.h @@ -114,5 +114,44 @@ #ifndef __has_feature #define __has_feature(x) 0 /* compatibility with non-clang compilers */ #endif +/* Implement C11 _Generic via GCC builtins. Example: + * + * QEMU_GENERIC(x, (float, sinf), (long double, sinl), sin) (x) + * + * The first argument is the discriminator. The last is the default value. + * The middle ones are tuples in "(type, expansion)" format. + */ + +/* First, find out the number of generic cases. */ +#define QEMU_GENERIC(x, ...) \ + QEMU_GENERIC_(typeof(x), __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +/* There will be extra arguments, but they are not used. */ +#define QEMU_GENERIC_(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, count, ...) \ + QEMU_GENERIC##count(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) + +/* Two more helper macros, this time to extract items from a parenthesized + * list. + */ +#define QEMU_FIRST_(a, b) a +#define QEMU_SECOND_(a, b) b + +/* ... and a final one for the common part of the "recursion". */ +#define QEMU_GENERIC_IF(x, type_then, else_) \ + __builtin_choose_expr(__builtin_types_compatible_p(x, \ + QEMU_FIRST_ type_then), \ + QEMU_SECOND_ type_then, else_) + +/* CPP poor man's "recursion". */ +#define QEMU_GENERIC1(x, a0, ...) (a0) +#define QEMU_GENERIC2(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC1(x, __VA_ARGS__)) +#define QEMU_GENERIC3(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC2(x, __VA_ARGS__)) +#define QEMU_GENERIC4(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC3(x, __VA_ARGS__)) +#define QEMU_GENERIC5(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC4(x, __VA_ARGS__)) +#define QEMU_GENERIC6(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC5(x, __VA_ARGS__)) +#define QEMU_GENERIC7(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC6(x, __VA_ARGS__)) +#define QEMU_GENERIC8(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC7(x, __VA_ARGS__)) +#define QEMU_GENERIC9(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC8(x, __VA_ARGS__)) +#define QEMU_GENERIC10(x, a0, ...) 
QEMU_GENERIC_IF(x, a0, QEMU_GENERIC9(x, __VA_ARGS__)) #endif /* COMPILER_H */ diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index ce2eb73670..8a5129741c 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -121,7 +121,7 @@ bool qemu_coroutine_entered(Coroutine *co); * Provides a mutex that can be used to synchronise coroutines */ struct CoWaitRecord; -typedef struct CoMutex { +struct CoMutex { /* Count of pending lockers; 0 for a free mutex, 1 for an * uncontended mutex. */ @@ -142,7 +142,7 @@ typedef struct CoMutex { unsigned handoff, sequence; Coroutine *holder; -} CoMutex; +}; /** * Initialises a CoMutex. This must be called before any other operation is used diff --git a/include/qemu/lockable.h b/include/qemu/lockable.h new file mode 100644 index 0000000000..b6ed6c89ec --- /dev/null +++ b/include/qemu/lockable.h @@ -0,0 +1,96 @@ +/* + * Polymorphic locking functions (aka poor man templates) + * + * Copyright Red Hat, Inc. 2017, 2018 + * + * Author: Paolo Bonzini + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef QEMU_LOCKABLE_H +#define QEMU_LOCKABLE_H + +#include "qemu/coroutine.h" +#include "qemu/thread.h" + +typedef void QemuLockUnlockFunc(void *); + +struct QemuLockable { + void *object; + QemuLockUnlockFunc *lock; + QemuLockUnlockFunc *unlock; +}; + +/* This function gives an error if an invalid, non-NULL pointer type is passed + * to QEMU_MAKE_LOCKABLE. For optimized builds, we can rely on dead-code elimination + * from the compiler, and give the errors already at link time. + */ +#ifdef __OPTIMIZE__ +void unknown_lock_type(void *); +#else +static inline void unknown_lock_type(void *unused) +{ + abort(); +} +#endif + +static inline __attribute__((__always_inline__)) QemuLockable * +qemu_make_lockable(void *x, QemuLockable *lockable) +{ + /* We cannot test this in a macro, otherwise we get compiler + * warnings like "the address of 'm' will always evaluate as 'true'". + */ + return x ? lockable : NULL; +} + +/* Auxiliary macros to simplify QEMU_MAKE_LOCABLE. */ +#define QEMU_LOCK_FUNC(x) ((QemuLockUnlockFunc *) \ + QEMU_GENERIC(x, \ + (QemuMutex *, qemu_mutex_lock), \ + (CoMutex *, qemu_co_mutex_lock), \ + (QemuSpin *, qemu_spin_lock), \ + unknown_lock_type)) + +#define QEMU_UNLOCK_FUNC(x) ((QemuLockUnlockFunc *) \ + QEMU_GENERIC(x, \ + (QemuMutex *, qemu_mutex_unlock), \ + (CoMutex *, qemu_co_mutex_unlock), \ + (QemuSpin *, qemu_spin_unlock), \ + unknown_lock_type)) + +/* In C, compound literals have the lifetime of an automatic variable. + * In C++ it would be different, but then C++ wouldn't need QemuLockable + * either... + */ +#define QEMU_MAKE_LOCKABLE_(x) qemu_make_lockable((x), &(QemuLockable) { \ + .object = (x), \ + .lock = QEMU_LOCK_FUNC(x), \ + .unlock = QEMU_UNLOCK_FUNC(x), \ + }) + +/* QEMU_MAKE_LOCKABLE - Make a polymorphic QemuLockable + * + * @x: a lock object (currently one of QemuMutex, CoMutex, QemuSpin). + * + * Returns a QemuLockable object that can be passed around + * to a function that can operate with locks of any kind. 
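+ *
+ * Usage sketch (illustrative only, with hypothetical names; must run
+ * in coroutine context when the lock is a CoMutex):
+ *
+ *     CoMutex m;
+ *     qemu_co_mutex_init(&m);
+ *     QemuLockable *l = QEMU_MAKE_LOCKABLE(&m);
+ *     qemu_lockable_lock(l);      dispatches to qemu_co_mutex_lock()
+ *     qemu_lockable_unlock(l);    dispatches to qemu_co_mutex_unlock()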
+ */ +#define QEMU_MAKE_LOCKABLE(x) \ + QEMU_GENERIC(x, \ + (QemuLockable *, (x)), \ + QEMU_MAKE_LOCKABLE_(x)) + +static inline void qemu_lockable_lock(QemuLockable *x) +{ + x->lock(x->object); +} + +static inline void qemu_lockable_unlock(QemuLockable *x) +{ + x->unlock(x->object); +} + +#endif diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 9af4e945aa..ef7bd16123 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -4,7 +4,6 @@ #include "qemu/processor.h" #include "qemu/atomic.h" -typedef struct QemuMutex QemuMutex; typedef struct QemuCond QemuCond; typedef struct QemuSemaphore QemuSemaphore; typedef struct QemuEvent QemuEvent; @@ -97,9 +96,9 @@ struct Notifier; void qemu_thread_atexit_add(struct Notifier *notifier); void qemu_thread_atexit_remove(struct Notifier *notifier); -typedef struct QemuSpin { +struct QemuSpin { int value; -} QemuSpin; +}; static inline void qemu_spin_init(QemuSpin *spin) { diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 9bd7a834ba..5923849cdd 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -19,6 +19,7 @@ typedef struct BusClass BusClass; typedef struct BusState BusState; typedef struct Chardev Chardev; typedef struct CompatProperty CompatProperty; +typedef struct CoMutex CoMutex; typedef struct CPUAddressSpace CPUAddressSpace; typedef struct CPUState CPUState; typedef struct DeviceListener DeviceListener; @@ -86,9 +87,12 @@ typedef struct QEMUBH QEMUBH; typedef struct QemuConsole QemuConsole; typedef struct QemuDmaBuf QemuDmaBuf; typedef struct QEMUFile QEMUFile; +typedef struct QemuLockable QemuLockable; +typedef struct QemuMutex QemuMutex; typedef struct QemuOpt QemuOpt; typedef struct QemuOpts QemuOpts; typedef struct QemuOptsList QemuOptsList; +typedef struct QemuSpin QemuSpin; typedef struct QEMUSGList QEMUSGList; typedef struct QEMUTimer QEMUTimer; typedef struct QEMUTimerListGroup QEMUTimerListGroup; -- cgit v1.2.3-55-g7522 From 1a957cf9c4637abe4b7d67a91312a2565306641e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Feb 2018 10:39:33 -0500 Subject: coroutine-lock: convert CoQueue to use QemuLockable There are cases in which a queued coroutine must be restarted from non-coroutine context (with qemu_co_enter_next). In this cases, qemu_co_enter_next also needs to be thread-safe, but it cannot use a CoMutex and so cannot qemu_co_queue_wait. Use QemuLockable so that the CoQueue can interchangeably use CoMutex or QemuMutex. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Message-Id: <20180203153935.8056-4-pbonzini@redhat.com> Reviewed-by: Fam Zheng Signed-off-by: Fam Zheng --- include/qemu/coroutine.h | 6 +++++- util/qemu-coroutine-lock.c | 12 +++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 8a5129741c..1e5f0957e6 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -183,7 +183,9 @@ void qemu_co_queue_init(CoQueue *queue); * caller of the coroutine. The mutex is unlocked during the wait and * locked again afterwards. */ -void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex); +#define qemu_co_queue_wait(queue, lock) \ + qemu_co_queue_wait_impl(queue, QEMU_MAKE_LOCKABLE(lock)) +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock); /** * Restarts the next coroutine in the CoQueue and removes it from the queue. 
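 *
 * Usage sketch for the qemu_co_queue_wait() macro introduced above
 * (hypothetical names; runs in coroutine context). The queue can now
 * be paired with a plain QemuMutex as well as a CoMutex:
 *
 *     qemu_mutex_lock(&mu);
 *     qemu_co_queue_wait(&queue, &mu);
 *     qemu_mutex_unlock(&mu);
 *
 * The mutex is dropped while the coroutine sleeps and reacquired
 * before qemu_co_queue_wait() returns.
 */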
@@ -271,4 +273,6 @@ void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns); */ void coroutine_fn yield_until_fd_readable(int fd); +#include "qemu/lockable.h" + #endif /* QEMU_COROUTINE_H */ diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 846ff9167f..2a66fc1467 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -40,13 +40,13 @@ void qemu_co_queue_init(CoQueue *queue) QSIMPLEQ_INIT(&queue->entries); } -void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock) { Coroutine *self = qemu_coroutine_self(); QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); - if (mutex) { - qemu_co_mutex_unlock(mutex); + if (lock) { + qemu_lockable_unlock(lock); } /* There is no race condition here. Other threads will call @@ -60,9 +60,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) /* TODO: OSv implements wait morphing here, where the wakeup * primitive automatically places the woken coroutine on the * mutex's queue. This avoids the thundering herd effect. + * This could be implemented for CoMutexes, but not really for + * other cases of QemuLockable. */ - if (mutex) { - qemu_co_mutex_lock(mutex); + if (lock) { + qemu_lockable_lock(lock); } } -- cgit v1.2.3-55-g7522 From 5261dd7b0106a88c9b323607136b1dfa1d7db689 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 3 Feb 2018 10:39:34 -0500 Subject: coroutine-lock: make qemu_co_enter_next thread-safe qemu_co_queue_next does not need to release and re-acquire the mutex, because the queued coroutine does not run immediately. However, this does not hold for qemu_co_enter_next. Now that qemu_co_queue_wait can synchronize (via QemuLockable) with code that is not running in coroutine context, it's important that code using qemu_co_enter_next can easily use a standardized locking idiom. First of all, qemu_co_enter_next must use aio_co_wake to restart the coroutine. Second, the function gains a second argument, a QemuLockable*, and the comments of qemu_co_queue_next and qemu_co_queue_restart_all are adjusted to clarify the difference. 
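As a minimal sketch of the resulting idiom (hypothetical names, not
code from this series), a timer callback could drain a CoQueue under a
QemuMutex like this:

    qemu_mutex_lock(&state->lock);
    while (qemu_co_enter_next(&state->queue, &state->lock)) {
        /* each coroutine ran with state->lock temporarily released */
    }
    qemu_mutex_unlock(&state->lock);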
Signed-off-by: Paolo Bonzini Message-Id: <20180203153935.8056-5-pbonzini@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Fam Zheng Signed-off-by: Fam Zheng --- fsdev/qemu-fsdev-throttle.c | 4 ++-- include/qemu/coroutine.h | 19 +++++++++++++------ util/qemu-coroutine-lock.c | 10 ++++++++-- 3 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fsdev/qemu-fsdev-throttle.c b/fsdev/qemu-fsdev-throttle.c index 49eebb5412..1dc07fbc12 100644 --- a/fsdev/qemu-fsdev-throttle.c +++ b/fsdev/qemu-fsdev-throttle.c @@ -20,13 +20,13 @@ static void fsdev_throttle_read_timer_cb(void *opaque) { FsThrottle *fst = opaque; - qemu_co_enter_next(&fst->throttled_reqs[false]); + qemu_co_enter_next(&fst->throttled_reqs[false], NULL); } static void fsdev_throttle_write_timer_cb(void *opaque) { FsThrottle *fst = opaque; - qemu_co_enter_next(&fst->throttled_reqs[true]); + qemu_co_enter_next(&fst->throttled_reqs[true], NULL); } void fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp) diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 1e5f0957e6..6f8a487041 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -188,21 +188,28 @@ void qemu_co_queue_init(CoQueue *queue); void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock); /** - * Restarts the next coroutine in the CoQueue and removes it from the queue. - * - * Returns true if a coroutine was restarted, false if the queue is empty. + * Removes the next coroutine from the CoQueue, and wake it up. + * Returns true if a coroutine was removed, false if the queue is empty. */ bool coroutine_fn qemu_co_queue_next(CoQueue *queue); /** - * Restarts all coroutines in the CoQueue and leaves the queue empty. + * Empties the CoQueue; all coroutines are woken up. */ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); /** - * Enter the next coroutine in the queue + * Removes the next coroutine from the CoQueue, and wake it up. Unlike + * qemu_co_queue_next, this function releases the lock during aio_co_wake + * because it is meant to be used outside coroutine context; in that case, the + * coroutine is entered immediately, before qemu_co_enter_next returns. + * + * If used in coroutine context, qemu_co_enter_next is equivalent to + * qemu_co_queue_next. */ -bool qemu_co_enter_next(CoQueue *queue); +#define qemu_co_enter_next(queue, lock) \ + qemu_co_enter_next_impl(queue, QEMU_MAKE_LOCKABLE(lock)) +bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock); /** * Checks if the CoQueue is empty. 
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 2a66fc1467..78fb79acf8 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -132,7 +132,7 @@ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue) qemu_co_queue_do_restart(queue, false); } -bool qemu_co_enter_next(CoQueue *queue) +bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock) { Coroutine *next; @@ -142,7 +142,13 @@ bool qemu_co_enter_next(CoQueue *queue) } QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next); - qemu_coroutine_enter(next); + if (lock) { + qemu_lockable_unlock(lock); + } + aio_co_wake(next); + if (lock) { + qemu_lockable_lock(lock); + } return true; } -- cgit v1.2.3-55-g7522 From 418026ca43bc2626db092d7558258f9594366f28 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 16 Jan 2018 14:08:54 +0800 Subject: util: Introduce vfio helpers This is a library to manage the host vfio interface, which could be used to implement userspace device driver code in QEMU such as NVMe or net controllers. Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Message-Id: <20180116060901.17413-3-famz@redhat.com> Signed-off-by: Fam Zheng --- include/qemu/vfio-helpers.h | 33 ++ util/Makefile.objs | 1 + util/trace-events | 11 + util/vfio-helpers.c | 727 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 772 insertions(+) create mode 100644 include/qemu/vfio-helpers.h create mode 100644 util/vfio-helpers.c (limited to 'include') diff --git a/include/qemu/vfio-helpers.h b/include/qemu/vfio-helpers.h new file mode 100644 index 0000000000..ce7e7b057f --- /dev/null +++ b/include/qemu/vfio-helpers.h @@ -0,0 +1,33 @@ +/* + * QEMU VFIO helpers + * + * Copyright 2016 - 2018 Red Hat, Inc. + * + * Authors: + * Fam Zheng + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef QEMU_VFIO_HELPERS_H +#define QEMU_VFIO_HELPERS_H +#include "qemu/typedefs.h" + +typedef struct QEMUVFIOState QEMUVFIOState; + +QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp); +void qemu_vfio_close(QEMUVFIOState *s); +int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, + bool temporary, uint64_t *iova_list); +int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s); +void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host); +void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, + uint64_t offset, uint64_t size, + Error **errp); +void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, + uint64_t offset, uint64_t size); +int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, + int irq_type, Error **errp); + +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index 2973b0a323..3fb611631f 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -46,3 +46,4 @@ util-obj-y += qht.o util-obj-y += range.o util-obj-y += stats64.o util-obj-y += systemd.o +util-obj-$(CONFIG_LINUX) += vfio-helpers.o diff --git a/util/trace-events b/util/trace-events index 515e6257fb..4822434c89 100644 --- a/util/trace-events +++ b/util/trace-events @@ -60,3 +60,14 @@ lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter" qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)" qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)" qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)" + +# util/vfio-helpers.c +qemu_vfio_dma_reset_temporary(void *s) "s %p" +qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" +qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64 +qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size %zu iova 0x%"PRIx64 +qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size %zu temporary %d iova %p" +qemu_vfio_dma_map_invalid(void *s, void *mapping_host, size_t mapping_size, void *host, size_t size) "s %p mapping %p %zu requested %p %zu" +qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c new file mode 100644 index 0000000000..f478b68400 --- /dev/null +++ b/util/vfio-helpers.c @@ -0,0 +1,727 @@ +/* + * VFIO utility + * + * Copyright 2016 - 2018 Red Hat, Inc. + * + * Authors: + * Fam Zheng + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include +#include +#include "qapi/error.h" +#include "exec/ramlist.h" +#include "exec/cpu-common.h" +#include "trace.h" +#include "qemu/queue.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/pci_regs.h" +#include "qemu/event_notifier.h" +#include "qemu/vfio-helpers.h" +#include "trace.h" + +#define QEMU_VFIO_DEBUG 0 + +#define QEMU_VFIO_IOVA_MIN 0x10000ULL +/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, + * we can use a runtime limit; alternatively it's also possible to do platform + * specific detection by reading sysfs entries. Until then, 39 is a safe bet. 
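+ * For reference, 1ULL << 39 gives a 512 GiB IOVA space.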
+ **/ +#define QEMU_VFIO_IOVA_MAX (1ULL << 39) + +typedef struct { + /* Page aligned addr. */ + void *host; + size_t size; + uint64_t iova; +} IOVAMapping; + +struct QEMUVFIOState { + QemuMutex lock; + + /* These fields are protected by BQL */ + int container; + int group; + int device; + RAMBlockNotifier ram_notifier; + struct vfio_region_info config_region_info, bar_region_info[6]; + + /* These fields are protected by @lock */ + /* VFIO's IO virtual address space is managed by splitting into a few + * sections: + * + * --------------- <= 0 + * |xxxxxxxxxxxxx| + * |-------------| <= QEMU_VFIO_IOVA_MIN + * | | + * | Fixed | + * | | + * |-------------| <= low_water_mark + * | | + * | Free | + * | | + * |-------------| <= high_water_mark + * | | + * | Temp | + * | | + * |-------------| <= QEMU_VFIO_IOVA_MAX + * |xxxxxxxxxxxxx| + * |xxxxxxxxxxxxx| + * --------------- + * + * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; + * + * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of + * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be + * reclaimed - low_water_mark never shrinks; + * + * - IOVAs in range [low_water_mark, high_water_mark) are free; + * + * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile + * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area + * is recycled. The caller should make sure I/O's depending on these + * mappings are completed before calling. + **/ + uint64_t low_water_mark; + uint64_t high_water_mark; + IOVAMapping *mappings; + int nr_mappings; +}; + +/** + * Find group file by PCI device address as specified @device, and return the + * path. The returned string is owned by caller and should be g_free'ed later. + */ +static char *sysfs_find_group_file(const char *device, Error **errp) +{ + char *sysfs_link; + char *sysfs_group; + char *p; + char *path = NULL; + + sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); + sysfs_group = g_malloc(PATH_MAX); + if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { + error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); + goto out; + } + p = strrchr(sysfs_group, '/'); + if (!p) { + error_setg(errp, "Failed to find iommu group number"); + goto out; + } + + path = g_strdup_printf("/dev/vfio/%s", p + 1); +out: + g_free(sysfs_link); + g_free(sysfs_group); + return path; +} + +static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) +{ + assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); +} + +static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) +{ + assert_bar_index_valid(s, index); + s->bar_region_info[index] = (struct vfio_region_info) { + .index = VFIO_PCI_BAR0_REGION_INDEX + index, + .argsz = sizeof(struct vfio_region_info), + }; + if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) { + error_setg_errno(errp, errno, "Failed to get BAR region info"); + return -errno; + } + + return 0; +} + +/** + * Map a PCI bar area. + */ +void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, + uint64_t offset, uint64_t size, + Error **errp) +{ + void *p; + assert_bar_index_valid(s, index); + p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), + PROT_READ | PROT_WRITE, MAP_SHARED, + s->device, s->bar_region_info[index].offset + offset); + if (p == MAP_FAILED) { + error_setg_errno(errp, errno, "Failed to map BAR region"); + p = NULL; + } + return p; +} + +/** + * Unmap a PCI bar area. 
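+ *
+ * Usage sketch, paired with qemu_vfio_pci_map_bar() (illustrative
+ * only; the size is made up):
+ *
+ *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 0x1000, errp);
+ *     if (regs) {
+ *         ... access BAR 0 registers through regs ...
+ *         qemu_vfio_pci_unmap_bar(s, 0, regs, 0, 0x1000);
+ *     }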
+ */ +void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, + uint64_t offset, uint64_t size) +{ + if (bar) { + munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); + } +} + +/** + * Initialize device IRQ with @irq_type and and register an event notifier. + */ +int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, + int irq_type, Error **errp) +{ + int r; + struct vfio_irq_set *irq_set; + size_t irq_set_size; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + + irq_info.index = irq_type; + if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { + error_setg_errno(errp, errno, "Failed to get device interrupt info"); + return -errno; + } + if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { + error_setg(errp, "Device interrupt doesn't support eventfd"); + return -EINVAL; + } + + irq_set_size = sizeof(*irq_set) + sizeof(int); + irq_set = g_malloc0(irq_set_size); + + /* Get to a known IRQ state */ + *irq_set = (struct vfio_irq_set) { + .argsz = irq_set_size, + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = irq_info.index, + .start = 0, + .count = 1, + }; + + *(int *)&irq_set->data = event_notifier_get_fd(e); + r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (r) { + error_setg_errno(errp, errno, "Failed to setup device interrupt"); + return -errno; + } + return 0; +} + +static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, + int size, int ofs) +{ + int ret; + + do { + ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); + } while (ret == -1 && errno == EINTR); + return ret == size ? 0 : -errno; +} + +static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) +{ + int ret; + + do { + ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); + } while (ret == -1 && errno == EINTR); + return ret == size ? 
0 : -errno; +} + +static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, + Error **errp) +{ + int ret; + int i; + uint16_t pci_cmd; + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + char *group_file = NULL; + + /* Create a new container */ + s->container = open("/dev/vfio/vfio", O_RDWR); + + if (s->container == -1) { + error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); + return -errno; + } + if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { + error_setg(errp, "Invalid VFIO version"); + ret = -EINVAL; + goto fail_container; + } + + if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + error_setg_errno(errp, errno, "VFIO IOMMU check failed"); + ret = -EINVAL; + goto fail_container; + } + + /* Open the group */ + group_file = sysfs_find_group_file(device, errp); + if (!group_file) { + ret = -EINVAL; + goto fail_container; + } + + s->group = open(group_file, O_RDWR); + if (s->group == -1) { + error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", + group_file); + g_free(group_file); + ret = -errno; + goto fail_container; + } + g_free(group_file); + + /* Test the group is viable and available */ + if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { + error_setg_errno(errp, errno, "Failed to get VFIO group status"); + ret = -errno; + goto fail; + } + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + error_setg(errp, "VFIO group is not viable"); + ret = -EINVAL; + goto fail; + } + + /* Add the group to the container */ + if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { + error_setg_errno(errp, errno, "Failed to add group to VFIO container"); + ret = -errno; + goto fail; + } + + /* Enable the IOMMU model we want */ + if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { + error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); + ret = -errno; + goto fail; + } + + /* Get additional IOMMU info */ + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { + error_setg_errno(errp, errno, "Failed to get IOMMU info"); + ret = -errno; + goto fail; + } + + s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); + + if (s->device < 0) { + error_setg_errno(errp, errno, "Failed to get device fd"); + ret = -errno; + goto fail; + } + + /* Test and setup the device */ + if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { + error_setg_errno(errp, errno, "Failed to get device info"); + ret = -errno; + goto fail; + } + + if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { + error_setg(errp, "Invalid device regions"); + ret = -EINVAL; + goto fail; + } + + s->config_region_info = (struct vfio_region_info) { + .index = VFIO_PCI_CONFIG_REGION_INDEX, + .argsz = sizeof(struct vfio_region_info), + }; + if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { + error_setg_errno(errp, errno, "Failed to get config region info"); + ret = -errno; + goto fail; + } + + for (i = 0; i < 6; i++) { + ret = qemu_vfio_pci_init_bar(s, i, errp); + if (ret) { + goto fail; + } + } + + /* Enable bus master */ + ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); + if (ret) { + goto fail; + } + pci_cmd |= PCI_COMMAND_MASTER; + ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); + if (ret) { + goto fail; + } + return 0; +fail: + close(s->group); 
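+    /* fall through: the container was opened before the group and
+     * must be released as well */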
+fail_container: + close(s->container); + return ret; +} + +static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, + void *host, size_t size) +{ + QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); + trace_qemu_vfio_ram_block_added(s, host, size); + qemu_vfio_dma_map(s, host, size, false, NULL); +} + +static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, + void *host, size_t size) +{ + QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); + if (host) { + trace_qemu_vfio_ram_block_removed(s, host, size); + qemu_vfio_dma_unmap(s, host); + } +} + +static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, + void *opaque) +{ + int ret; + QEMUVFIOState *s = opaque; + + if (!host_addr) { + return 0; + } + ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL); + if (ret) { + fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n", + host_addr, (uint64_t)length); + } + return 0; +} + +static void qemu_vfio_open_common(QEMUVFIOState *s) +{ + s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; + s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; + ram_block_notifier_add(&s->ram_notifier); + s->low_water_mark = QEMU_VFIO_IOVA_MIN; + s->high_water_mark = QEMU_VFIO_IOVA_MAX; + qemu_ram_foreach_block(qemu_vfio_init_ramblock, s); + qemu_mutex_init(&s->lock); +} + +/** + * Open a PCI device, e.g. "0000:00:01.0". + */ +QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) +{ + int r; + QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); + + r = qemu_vfio_init_pci(s, device, errp); + if (r) { + g_free(s); + return NULL; + } + qemu_vfio_open_common(s); + return s; +} + +static void qemu_vfio_dump_mapping(IOVAMapping *m) +{ + if (QEMU_VFIO_DEBUG) { + printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, + (uint64_t)m->size, (uint64_t)m->iova); + } +} + +static void qemu_vfio_dump_mappings(QEMUVFIOState *s) +{ + int i; + + if (QEMU_VFIO_DEBUG) { + printf("vfio mappings\n"); + for (i = 0; i < s->nr_mappings; ++i) { + qemu_vfio_dump_mapping(&s->mappings[i]); + } + } +} + +/** + * Find the mapping entry that contains [host, host + size) and set @index to + * the position. If no entry contains it, @index is the position _after_ which + * to insert the new mapping. IOW, it is the index of the largest element that + * is smaller than @host, or -1 if no entry is. + */ +static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, + int *index) +{ + IOVAMapping *p = s->mappings; + IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; + IOVAMapping *mid; + trace_qemu_vfio_find_mapping(s, host); + if (!p) { + *index = -1; + return NULL; + } + while (true) { + mid = p + (q - p) / 2; + if (mid == p) { + break; + } + if (mid->host > host) { + q = mid; + } else if (mid->host < host) { + p = mid; + } else { + break; + } + } + if (mid->host > host) { + mid--; + } else if (mid < &s->mappings[s->nr_mappings - 1] + && (mid + 1)->host <= host) { + mid++; + } + *index = mid - &s->mappings[0]; + if (mid >= &s->mappings[0] && + mid->host <= host && mid->host + mid->size > host) { + assert(mid < &s->mappings[s->nr_mappings]); + return mid; + } + /* At this point *index + 1 is the right position to insert the new + * mapping.*/ + return NULL; +} + +/** + * Allocate IOVA and and create a new mapping record and insert it in @s. 
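+ * Called with s->lock held.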
+ */ +static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, + void *host, size_t size, + int index, uint64_t iova) +{ + int shift; + IOVAMapping m = {.host = host, .size = size, .iova = iova}; + IOVAMapping *insert; + + assert(QEMU_IS_ALIGNED(size, getpagesize())); + assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize())); + assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize())); + trace_qemu_vfio_new_mapping(s, host, size, index, iova); + + assert(index >= 0); + s->nr_mappings++; + s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), + s->nr_mappings); + insert = &s->mappings[index]; + shift = s->nr_mappings - index - 1; + if (shift) { + memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); + } + *insert = m; + return insert; +} + +/* Do the DMA mapping with VFIO. */ +static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, + uint64_t iova) +{ + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .iova = iova, + .vaddr = (uintptr_t)host, + .size = size, + }; + trace_qemu_vfio_do_mapping(s, host, size, iova); + + if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { + error_report("VFIO_MAP_DMA: %d", -errno); + return -errno; + } + return 0; +} + +/** + * Undo the DMA mapping from @s with VFIO, and remove from mapping list. + */ +static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping, + Error **errp) +{ + int index; + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = mapping->iova, + .size = mapping->size, + }; + + index = mapping - s->mappings; + assert(mapping->size > 0); + assert(QEMU_IS_ALIGNED(mapping->size, getpagesize())); + assert(index >= 0 && index < s->nr_mappings); + if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno); + } + memmove(mapping, &s->mappings[index + 1], + sizeof(s->mappings[0]) * (s->nr_mappings - index - 1)); + s->nr_mappings--; + s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), + s->nr_mappings); +} + +/* Check if the mapping list is (ascending) ordered. */ +static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) +{ + int i; + if (QEMU_VFIO_DEBUG) { + for (i = 0; i < s->nr_mappings - 1; ++i) { + if (!(s->mappings[i].host < s->mappings[i + 1].host)) { + fprintf(stderr, "item %d not sorted!\n", i); + qemu_vfio_dump_mappings(s); + return false; + } + if (!(s->mappings[i].host + s->mappings[i].size <= + s->mappings[i + 1].host)) { + fprintf(stderr, "item %d overlap with next!\n", i); + qemu_vfio_dump_mappings(s); + return false; + } + } + } + return true; +} + +/* Map [host, host + size) area into a contiguous IOVA address space, and store + * the result in @iova if not NULL. The caller need to make sure the area is + * aligned to page size, and mustn't overlap with existing mapping areas (split + * mapping status within this area is not allowed). 
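+ *
+ * Calling sketch (illustrative only; buf/len are hypothetical and
+ * must be page-aligned):
+ *
+ *     uint64_t iova;
+ *     if (qemu_vfio_dma_map(s, buf, len, true, &iova) == 0) {
+ *         ... program the device to DMA to/from iova ...
+ *     }
+ *     qemu_vfio_dma_reset_temporary(s);   once the I/O has completed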
+ */ +int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, + bool temporary, uint64_t *iova) +{ + int ret = 0; + int index; + IOVAMapping *mapping; + uint64_t iova0; + + assert(QEMU_PTR_IS_ALIGNED(host, getpagesize())); + assert(QEMU_IS_ALIGNED(size, getpagesize())); + trace_qemu_vfio_dma_map(s, host, size, temporary, iova); + qemu_mutex_lock(&s->lock); + mapping = qemu_vfio_find_mapping(s, host, &index); + if (mapping) { + iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); + } else { + if (s->high_water_mark - s->low_water_mark + 1 < size) { + ret = -ENOMEM; + goto out; + } + if (!temporary) { + iova0 = s->low_water_mark; + mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); + if (!mapping) { + ret = -ENOMEM; + goto out; + } + assert(qemu_vfio_verify_mappings(s)); + ret = qemu_vfio_do_mapping(s, host, size, iova0); + if (ret) { + qemu_vfio_undo_mapping(s, mapping, NULL); + goto out; + } + s->low_water_mark += size; + qemu_vfio_dump_mappings(s); + } else { + iova0 = s->high_water_mark - size; + ret = qemu_vfio_do_mapping(s, host, size, iova0); + if (ret) { + goto out; + } + s->high_water_mark -= size; + } + } + if (iova) { + *iova = iova0; + } +out: + qemu_mutex_unlock(&s->lock); + return ret; +} + +/* Reset the high watermark and free all "temporary" mappings. */ +int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = s->high_water_mark, + .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, + }; + trace_qemu_vfio_dma_reset_temporary(s); + qemu_mutex_lock(&s->lock); + if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_report("VFIO_UNMAP_DMA: %d", -errno); + qemu_mutex_unlock(&s->lock); + return -errno; + } + s->high_water_mark = QEMU_VFIO_IOVA_MAX; + qemu_mutex_unlock(&s->lock); + return 0; +} + +/* Unmapping the whole area that was previously mapped with + * qemu_vfio_dma_map(). */ +void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) +{ + int index = 0; + IOVAMapping *m; + + if (!host) { + return; + } + + trace_qemu_vfio_dma_unmap(s, host); + qemu_mutex_lock(&s->lock); + m = qemu_vfio_find_mapping(s, host, &index); + if (!m) { + goto out; + } + qemu_vfio_undo_mapping(s, m, NULL); +out: + qemu_mutex_unlock(&s->lock); +} + +static void qemu_vfio_reset(QEMUVFIOState *s) +{ + ioctl(s->device, VFIO_DEVICE_RESET); +} + +/* Close and free the VFIO resources. */ +void qemu_vfio_close(QEMUVFIOState *s) +{ + int i; + + if (!s) { + return; + } + for (i = 0; i < s->nr_mappings; ++i) { + qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); + } + ram_block_notifier_remove(&s->ram_notifier); + qemu_vfio_reset(s); + close(s->device); + close(s->group); + close(s->container); +} -- cgit v1.2.3-55-g7522 From 23d0ba9319f182e258f480bafb7de9f79fb6e0e9 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 16 Jan 2018 14:08:56 +0800 Subject: block: Introduce buf register API Allow block driver to map and unmap a buffer for later I/O, as a performance hint. 
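A minimal usage sketch (hypothetical buffer and length, not code from
this series):

    void *buf = qemu_memalign(getpagesize(), len);
    blk_register_buf(blk, buf, len);    /* hint: hot DMA buffer */
    /* ... issue many reads/writes through buf ... */
    blk_unregister_buf(blk, buf);
    qemu_vfree(buf);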
Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Message-Id: <20180116060901.17413-5-famz@redhat.com> Signed-off-by: Fam Zheng --- block/block-backend.c | 10 ++++++++++ block/io.c | 24 ++++++++++++++++++++++++ include/block/block.h | 11 ++++++++++- include/block/block_int.h | 9 +++++++++ include/sysemu/block-backend.h | 3 +++ 5 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/block/block-backend.c b/block/block-backend.c index baef8e7abc..f66349c2c9 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2096,3 +2096,13 @@ static void blk_root_drained_end(BdrvChild *child) } } } + +void blk_register_buf(BlockBackend *blk, void *host, size_t size) +{ + bdrv_register_buf(blk_bs(blk), host, size); +} + +void blk_unregister_buf(BlockBackend *blk, void *host) +{ + bdrv_unregister_buf(blk_bs(blk), host); +} diff --git a/block/io.c b/block/io.c index 7ea402352e..89d0745e95 100644 --- a/block/io.c +++ b/block/io.c @@ -2825,3 +2825,27 @@ void bdrv_io_unplug(BlockDriverState *bs) bdrv_io_unplug(child->bs); } } + +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_register_buf) { + bs->drv->bdrv_register_buf(bs, host, size); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_register_buf(child->bs, host, size); + } +} + +void bdrv_unregister_buf(BlockDriverState *bs, void *host) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_unregister_buf) { + bs->drv->bdrv_unregister_buf(bs, host); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_unregister_buf(child->bs, host); + } +} diff --git a/include/block/block.h b/include/block/block.h index 9b12774ddf..2025d7ed19 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -631,5 +631,14 @@ void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, uint32_t granularity, Error **errp); - +/** + * + * bdrv_register_buf/bdrv_unregister_buf: + * + * Register/unregister a buffer for I/O. For example, VFIO drivers are + * interested to know the memory areas that would later be used for I/O, so + * that they can prepare IOMMU mapping etc., to get better performance. + */ +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); +void bdrv_unregister_buf(BlockDriverState *bs, void *host); #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 29cafa4236..99b9190627 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -446,6 +446,15 @@ struct BlockDriver { const char *name, Error **errp); + /** + * Register/unregister a buffer for I/O. For example, when the driver is + * interested to know the memory areas that will later be used in iovs, so + * that it can do IOMMU mapping with VFIO etc., in order to get better + * performance. In the case of VFIO drivers, this callback is used to do + * DMA mapping for hot buffers. 
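+ *
+ * A driver opting in would wire these up in its BlockDriver
+ * definition (hypothetical names):
+ *
+ *     .bdrv_register_buf   = mydrv_register_buf,
+ *     .bdrv_unregister_buf = mydrv_unregister_buf,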
+ */ + void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size); + void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host); QLIST_ENTRY(BlockDriver) list; }; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index c4e52a5fa3..92ab624fac 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -229,4 +229,7 @@ void blk_io_limits_enable(BlockBackend *blk, const char *group); void blk_io_limits_update_group(BlockBackend *blk, const char *group); void blk_set_force_allow_inactivate(BlockBackend *blk); +void blk_register_buf(BlockBackend *blk, void *host, size_t size); +void blk_unregister_buf(BlockBackend *blk, void *host); + #endif -- cgit v1.2.3-55-g7522 From a3d9a352d4899a2f3567ae923069ee4a0959c0f4 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 16 Jan 2018 14:08:59 +0800 Subject: block: Move NVMe constants to a separate header Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Message-Id: <20180116060901.17413-8-famz@redhat.com> Signed-off-by: Fam Zheng --- block/nvme.c | 7 +- hw/block/nvme.h | 698 +------------------------------------------------- include/block/nvme.h | 700 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 702 insertions(+), 703 deletions(-) create mode 100644 include/block/nvme.h (limited to 'include') diff --git a/block/nvme.c b/block/nvme.c index a487b4d381..e9d0e218fc 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -22,12 +22,7 @@ #include "block/block_int.h" #include "trace.h" -/* TODO: Move nvme spec definitions from hw/block/nvme.h into a separate file - * that doesn't depend on dma/pci headers. */ -#include "sysemu/dma.h" -#include "hw/pci/pci.h" -#include "hw/block/block.h" -#include "hw/block/nvme.h" +#include "block/nvme.h" #define NVME_SQ_ENTRY_BYTES 64 #define NVME_CQ_ENTRY_BYTES 16 diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 7b62dad072..8f3981121d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -1,703 +1,7 @@ #ifndef HW_NVME_H #define HW_NVME_H #include "qemu/cutils.h" - -typedef struct NvmeBar { - uint64_t cap; - uint32_t vs; - uint32_t intms; - uint32_t intmc; - uint32_t cc; - uint32_t rsvd1; - uint32_t csts; - uint32_t nssrc; - uint32_t aqa; - uint64_t asq; - uint64_t acq; - uint32_t cmbloc; - uint32_t cmbsz; -} NvmeBar; - -enum NvmeCapShift { - CAP_MQES_SHIFT = 0, - CAP_CQR_SHIFT = 16, - CAP_AMS_SHIFT = 17, - CAP_TO_SHIFT = 24, - CAP_DSTRD_SHIFT = 32, - CAP_NSSRS_SHIFT = 33, - CAP_CSS_SHIFT = 37, - CAP_MPSMIN_SHIFT = 48, - CAP_MPSMAX_SHIFT = 52, -}; - -enum NvmeCapMask { - CAP_MQES_MASK = 0xffff, - CAP_CQR_MASK = 0x1, - CAP_AMS_MASK = 0x3, - CAP_TO_MASK = 0xff, - CAP_DSTRD_MASK = 0xf, - CAP_NSSRS_MASK = 0x1, - CAP_CSS_MASK = 0xff, - CAP_MPSMIN_MASK = 0xf, - CAP_MPSMAX_MASK = 0xf, -}; - -#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) -#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK) -#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK) -#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK) -#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK) -#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK) -#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) -#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) -#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) - -#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ - << CAP_MQES_SHIFT) -#define 
NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \ - << CAP_CQR_SHIFT) -#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \ - << CAP_AMS_SHIFT) -#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \ - << CAP_TO_SHIFT) -#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \ - << CAP_DSTRD_SHIFT) -#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \ - << CAP_NSSRS_SHIFT) -#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \ - << CAP_CSS_SHIFT) -#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ - << CAP_MPSMIN_SHIFT) -#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ - << CAP_MPSMAX_SHIFT) - -enum NvmeCcShift { - CC_EN_SHIFT = 0, - CC_CSS_SHIFT = 4, - CC_MPS_SHIFT = 7, - CC_AMS_SHIFT = 11, - CC_SHN_SHIFT = 14, - CC_IOSQES_SHIFT = 16, - CC_IOCQES_SHIFT = 20, -}; - -enum NvmeCcMask { - CC_EN_MASK = 0x1, - CC_CSS_MASK = 0x7, - CC_MPS_MASK = 0xf, - CC_AMS_MASK = 0x7, - CC_SHN_MASK = 0x3, - CC_IOSQES_MASK = 0xf, - CC_IOCQES_MASK = 0xf, -}; - -#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) -#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK) -#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK) -#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK) -#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK) -#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) -#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) - -enum NvmeCstsShift { - CSTS_RDY_SHIFT = 0, - CSTS_CFS_SHIFT = 1, - CSTS_SHST_SHIFT = 2, - CSTS_NSSRO_SHIFT = 4, -}; - -enum NvmeCstsMask { - CSTS_RDY_MASK = 0x1, - CSTS_CFS_MASK = 0x1, - CSTS_SHST_MASK = 0x3, - CSTS_NSSRO_MASK = 0x1, -}; - -enum NvmeCsts { - NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT, - NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT, - NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT, - NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT, - NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT, - NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT, -}; - -#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK) -#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK) -#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK) -#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK) - -enum NvmeAqaShift { - AQA_ASQS_SHIFT = 0, - AQA_ACQS_SHIFT = 16, -}; - -enum NvmeAqaMask { - AQA_ASQS_MASK = 0xfff, - AQA_ACQS_MASK = 0xfff, -}; - -#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK) -#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) - -enum NvmeCmblocShift { - CMBLOC_BIR_SHIFT = 0, - CMBLOC_OFST_SHIFT = 12, -}; - -enum NvmeCmblocMask { - CMBLOC_BIR_MASK = 0x7, - CMBLOC_OFST_MASK = 0xfffff, -}; - -#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \ - CMBLOC_BIR_MASK) -#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \ - CMBLOC_OFST_MASK) - -#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ - (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT) -#define NVME_CMBLOC_SET_OFST(cmbloc, val) \ - (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT) - -enum NvmeCmbszShift { - CMBSZ_SQS_SHIFT = 0, - CMBSZ_CQS_SHIFT = 1, - CMBSZ_LISTS_SHIFT = 2, - CMBSZ_RDS_SHIFT = 3, - CMBSZ_WDS_SHIFT = 4, - CMBSZ_SZU_SHIFT = 8, - CMBSZ_SZ_SHIFT = 12, -}; - -enum NvmeCmbszMask { - CMBSZ_SQS_MASK = 0x1, - CMBSZ_CQS_MASK = 0x1, 
- CMBSZ_LISTS_MASK = 0x1, - CMBSZ_RDS_MASK = 0x1, - CMBSZ_WDS_MASK = 0x1, - CMBSZ_SZU_MASK = 0xf, - CMBSZ_SZ_MASK = 0xfffff, -}; - -#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK) -#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK) -#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK) -#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK) -#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK) -#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK) -#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK) - -#define NVME_CMBSZ_SET_SQS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT) -#define NVME_CMBSZ_SET_CQS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT) -#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT) -#define NVME_CMBSZ_SET_RDS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT) -#define NVME_CMBSZ_SET_WDS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT) -#define NVME_CMBSZ_SET_SZU(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT) -#define NVME_CMBSZ_SET_SZ(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT) - -#define NVME_CMBSZ_GETSIZE(cmbsz) \ - (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) - -typedef struct NvmeCmd { - uint8_t opcode; - uint8_t fuse; - uint16_t cid; - uint32_t nsid; - uint64_t res1; - uint64_t mptr; - uint64_t prp1; - uint64_t prp2; - uint32_t cdw10; - uint32_t cdw11; - uint32_t cdw12; - uint32_t cdw13; - uint32_t cdw14; - uint32_t cdw15; -} NvmeCmd; - -enum NvmeAdminCommands { - NVME_ADM_CMD_DELETE_SQ = 0x00, - NVME_ADM_CMD_CREATE_SQ = 0x01, - NVME_ADM_CMD_GET_LOG_PAGE = 0x02, - NVME_ADM_CMD_DELETE_CQ = 0x04, - NVME_ADM_CMD_CREATE_CQ = 0x05, - NVME_ADM_CMD_IDENTIFY = 0x06, - NVME_ADM_CMD_ABORT = 0x08, - NVME_ADM_CMD_SET_FEATURES = 0x09, - NVME_ADM_CMD_GET_FEATURES = 0x0a, - NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c, - NVME_ADM_CMD_ACTIVATE_FW = 0x10, - NVME_ADM_CMD_DOWNLOAD_FW = 0x11, - NVME_ADM_CMD_FORMAT_NVM = 0x80, - NVME_ADM_CMD_SECURITY_SEND = 0x81, - NVME_ADM_CMD_SECURITY_RECV = 0x82, -}; - -enum NvmeIoCommands { - NVME_CMD_FLUSH = 0x00, - NVME_CMD_WRITE = 0x01, - NVME_CMD_READ = 0x02, - NVME_CMD_WRITE_UNCOR = 0x04, - NVME_CMD_COMPARE = 0x05, - NVME_CMD_WRITE_ZEROS = 0x08, - NVME_CMD_DSM = 0x09, -}; - -typedef struct NvmeDeleteQ { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t rsvd1[9]; - uint16_t qid; - uint16_t rsvd10; - uint32_t rsvd11[5]; -} NvmeDeleteQ; - -typedef struct NvmeCreateCq { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t rsvd1[5]; - uint64_t prp1; - uint64_t rsvd8; - uint16_t cqid; - uint16_t qsize; - uint16_t cq_flags; - uint16_t irq_vector; - uint32_t rsvd12[4]; -} NvmeCreateCq; - -#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) -#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) - -typedef struct NvmeCreateSq { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t rsvd1[5]; - uint64_t prp1; - uint64_t rsvd8; - uint16_t sqid; - uint16_t qsize; - uint16_t sq_flags; - uint16_t cqid; - uint32_t rsvd12[4]; -} NvmeCreateSq; - -#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) -#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) - -enum NvmeQueueFlags { - NVME_Q_PC = 1, - 
-    NVME_Q_PRIO_URGENT = 0,
-    NVME_Q_PRIO_HIGH = 1,
-    NVME_Q_PRIO_NORMAL = 2,
-    NVME_Q_PRIO_LOW = 3,
-};
-
-typedef struct NvmeIdentify {
-    uint8_t opcode;
-    uint8_t flags;
-    uint16_t cid;
-    uint32_t nsid;
-    uint64_t rsvd2[2];
-    uint64_t prp1;
-    uint64_t prp2;
-    uint32_t cns;
-    uint32_t rsvd11[5];
-} NvmeIdentify;
-
-typedef struct NvmeRwCmd {
-    uint8_t opcode;
-    uint8_t flags;
-    uint16_t cid;
-    uint32_t nsid;
-    uint64_t rsvd2;
-    uint64_t mptr;
-    uint64_t prp1;
-    uint64_t prp2;
-    uint64_t slba;
-    uint16_t nlb;
-    uint16_t control;
-    uint32_t dsmgmt;
-    uint32_t reftag;
-    uint16_t apptag;
-    uint16_t appmask;
-} NvmeRwCmd;
-
-enum {
-    NVME_RW_LR = 1 << 15,
-    NVME_RW_FUA = 1 << 14,
-    NVME_RW_DSM_FREQ_UNSPEC = 0,
-    NVME_RW_DSM_FREQ_TYPICAL = 1,
-    NVME_RW_DSM_FREQ_RARE = 2,
-    NVME_RW_DSM_FREQ_READS = 3,
-    NVME_RW_DSM_FREQ_WRITES = 4,
-    NVME_RW_DSM_FREQ_RW = 5,
-    NVME_RW_DSM_FREQ_ONCE = 6,
-    NVME_RW_DSM_FREQ_PREFETCH = 7,
-    NVME_RW_DSM_FREQ_TEMP = 8,
-    NVME_RW_DSM_LATENCY_NONE = 0 << 4,
-    NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
-    NVME_RW_DSM_LATENCY_NORM = 2 << 4,
-    NVME_RW_DSM_LATENCY_LOW = 3 << 4,
-    NVME_RW_DSM_SEQ_REQ = 1 << 6,
-    NVME_RW_DSM_COMPRESSED = 1 << 7,
-    NVME_RW_PRINFO_PRACT = 1 << 13,
-    NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
-    NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
-    NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
-};
-
-typedef struct NvmeDsmCmd {
-    uint8_t opcode;
-    uint8_t flags;
-    uint16_t cid;
-    uint32_t nsid;
-    uint64_t rsvd2[2];
-    uint64_t prp1;
-    uint64_t prp2;
-    uint32_t nr;
-    uint32_t attributes;
-    uint32_t rsvd12[4];
-} NvmeDsmCmd;
-
-enum {
-    NVME_DSMGMT_IDR = 1 << 0,
-    NVME_DSMGMT_IDW = 1 << 1,
-    NVME_DSMGMT_AD = 1 << 2,
-};
-
-typedef struct NvmeDsmRange {
-    uint32_t cattr;
-    uint32_t nlb;
-    uint64_t slba;
-} NvmeDsmRange;
-
-enum NvmeAsyncEventRequest {
-    NVME_AER_TYPE_ERROR = 0,
-    NVME_AER_TYPE_SMART = 1,
-    NVME_AER_TYPE_IO_SPECIFIC = 6,
-    NVME_AER_TYPE_VENDOR_SPECIFIC = 7,
-    NVME_AER_INFO_ERR_INVALID_SQ = 0,
-    NVME_AER_INFO_ERR_INVALID_DB = 1,
-    NVME_AER_INFO_ERR_DIAG_FAIL = 2,
-    NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3,
-    NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4,
-    NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5,
-    NVME_AER_INFO_SMART_RELIABILITY = 0,
-    NVME_AER_INFO_SMART_TEMP_THRESH = 1,
-    NVME_AER_INFO_SMART_SPARE_THRESH = 2,
-};
-
-typedef struct NvmeAerResult {
-    uint8_t event_type;
-    uint8_t event_info;
-    uint8_t log_page;
-    uint8_t resv;
-} NvmeAerResult;
-
-typedef struct NvmeCqe {
-    uint32_t result;
-    uint32_t rsvd;
-    uint16_t sq_head;
-    uint16_t sq_id;
-    uint16_t cid;
-    uint16_t status;
-} NvmeCqe;
-
-enum NvmeStatusCodes {
-    NVME_SUCCESS = 0x0000,
-    NVME_INVALID_OPCODE = 0x0001,
-    NVME_INVALID_FIELD = 0x0002,
-    NVME_CID_CONFLICT = 0x0003,
-    NVME_DATA_TRAS_ERROR = 0x0004,
-    NVME_POWER_LOSS_ABORT = 0x0005,
-    NVME_INTERNAL_DEV_ERROR = 0x0006,
-    NVME_CMD_ABORT_REQ = 0x0007,
-    NVME_CMD_ABORT_SQ_DEL = 0x0008,
-    NVME_CMD_ABORT_FAILED_FUSE = 0x0009,
-    NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
-    NVME_INVALID_NSID = 0x000b,
-    NVME_CMD_SEQ_ERROR = 0x000c,
-    NVME_LBA_RANGE = 0x0080,
-    NVME_CAP_EXCEEDED = 0x0081,
-    NVME_NS_NOT_READY = 0x0082,
-    NVME_NS_RESV_CONFLICT = 0x0083,
-    NVME_INVALID_CQID = 0x0100,
-    NVME_INVALID_QID = 0x0101,
-    NVME_MAX_QSIZE_EXCEEDED = 0x0102,
-    NVME_ACL_EXCEEDED = 0x0103,
-    NVME_RESERVED = 0x0104,
-    NVME_AER_LIMIT_EXCEEDED = 0x0105,
-    NVME_INVALID_FW_SLOT = 0x0106,
-    NVME_INVALID_FW_IMAGE = 0x0107,
-    NVME_INVALID_IRQ_VECTOR = 0x0108,
-    NVME_INVALID_LOG_ID = 0x0109,
-    NVME_INVALID_FORMAT = 0x010a,
-    NVME_FW_REQ_RESET = 0x010b,
-    NVME_INVALID_QUEUE_DEL = 0x010c,
-    NVME_FID_NOT_SAVEABLE = 0x010d,
-    NVME_FID_NOT_NSID_SPEC = 0x010f,
-    NVME_FW_REQ_SUSYSTEM_RESET = 0x0110,
-    NVME_CONFLICTING_ATTRS = 0x0180,
-    NVME_INVALID_PROT_INFO = 0x0181,
-    NVME_WRITE_TO_RO = 0x0182,
-    NVME_WRITE_FAULT = 0x0280,
-    NVME_UNRECOVERED_READ = 0x0281,
-    NVME_E2E_GUARD_ERROR = 0x0282,
-    NVME_E2E_APP_ERROR = 0x0283,
-    NVME_E2E_REF_ERROR = 0x0284,
-    NVME_CMP_FAILURE = 0x0285,
-    NVME_ACCESS_DENIED = 0x0286,
-    NVME_MORE = 0x2000,
-    NVME_DNR = 0x4000,
-    NVME_NO_COMPLETE = 0xffff,
-};
-
-typedef struct NvmeFwSlotInfoLog {
-    uint8_t afi;
-    uint8_t reserved1[7];
-    uint8_t frs1[8];
-    uint8_t frs2[8];
-    uint8_t frs3[8];
-    uint8_t frs4[8];
-    uint8_t frs5[8];
-    uint8_t frs6[8];
-    uint8_t frs7[8];
-    uint8_t reserved2[448];
-} NvmeFwSlotInfoLog;
-
-typedef struct NvmeErrorLog {
-    uint64_t error_count;
-    uint16_t sqid;
-    uint16_t cid;
-    uint16_t status_field;
-    uint16_t param_error_location;
-    uint64_t lba;
-    uint32_t nsid;
-    uint8_t vs;
-    uint8_t resv[35];
-} NvmeErrorLog;
-
-typedef struct NvmeSmartLog {
-    uint8_t critical_warning;
-    uint8_t temperature[2];
-    uint8_t available_spare;
-    uint8_t available_spare_threshold;
-    uint8_t percentage_used;
-    uint8_t reserved1[26];
-    uint64_t data_units_read[2];
-    uint64_t data_units_written[2];
-    uint64_t host_read_commands[2];
-    uint64_t host_write_commands[2];
-    uint64_t controller_busy_time[2];
-    uint64_t power_cycles[2];
-    uint64_t power_on_hours[2];
-    uint64_t unsafe_shutdowns[2];
-    uint64_t media_errors[2];
-    uint64_t number_of_error_log_entries[2];
-    uint8_t reserved2[320];
-} NvmeSmartLog;
-
-enum NvmeSmartWarn {
-    NVME_SMART_SPARE = 1 << 0,
-    NVME_SMART_TEMPERATURE = 1 << 1,
-    NVME_SMART_RELIABILITY = 1 << 2,
-    NVME_SMART_MEDIA_READ_ONLY = 1 << 3,
-    NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4,
-};
-
-enum LogIdentifier {
-    NVME_LOG_ERROR_INFO = 0x01,
-    NVME_LOG_SMART_INFO = 0x02,
-    NVME_LOG_FW_SLOT_INFO = 0x03,
-};
-
-typedef struct NvmePSD {
-    uint16_t mp;
-    uint16_t reserved;
-    uint32_t enlat;
-    uint32_t exlat;
-    uint8_t rrt;
-    uint8_t rrl;
-    uint8_t rwt;
-    uint8_t rwl;
-    uint8_t resv[16];
-} NvmePSD;
-
-typedef struct NvmeIdCtrl {
-    uint16_t vid;
-    uint16_t ssvid;
-    uint8_t sn[20];
-    uint8_t mn[40];
-    uint8_t fr[8];
-    uint8_t rab;
-    uint8_t ieee[3];
-    uint8_t cmic;
-    uint8_t mdts;
-    uint8_t rsvd255[178];
-    uint16_t oacs;
-    uint8_t acl;
-    uint8_t aerl;
-    uint8_t frmw;
-    uint8_t lpa;
-    uint8_t elpe;
-    uint8_t npss;
-    uint8_t rsvd511[248];
-    uint8_t sqes;
-    uint8_t cqes;
-    uint16_t rsvd515;
-    uint32_t nn;
-    uint16_t oncs;
-    uint16_t fuses;
-    uint8_t fna;
-    uint8_t vwc;
-    uint16_t awun;
-    uint16_t awupf;
-    uint8_t rsvd703[174];
-    uint8_t rsvd2047[1344];
-    NvmePSD psd[32];
-    uint8_t vs[1024];
-} NvmeIdCtrl;
-
-enum NvmeIdCtrlOacs {
-    NVME_OACS_SECURITY = 1 << 0,
-    NVME_OACS_FORMAT = 1 << 1,
-    NVME_OACS_FW = 1 << 2,
-};
-
-enum NvmeIdCtrlOncs {
-    NVME_ONCS_COMPARE = 1 << 0,
-    NVME_ONCS_WRITE_UNCORR = 1 << 1,
-    NVME_ONCS_DSM = 1 << 2,
-    NVME_ONCS_WRITE_ZEROS = 1 << 3,
-    NVME_ONCS_FEATURES = 1 << 4,
-    NVME_ONCS_RESRVATIONS = 1 << 5,
-};
-
-#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
-#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf)
-#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf)
-#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf)
-
-typedef struct NvmeFeatureVal {
-    uint32_t arbitration;
-    uint32_t power_mgmt;
-    uint32_t temp_thresh;
-    uint32_t err_rec;
-    uint32_t volatile_wc;
-    uint32_t num_queues;
-    uint32_t int_coalescing;
-    uint32_t *int_vector_config;
-    uint32_t write_atomicity;
-    uint32_t async_config;
-    uint32_t sw_prog_marker;
-} NvmeFeatureVal;
-
-#define NVME_ARB_AB(arb) (arb & 0x7)
-#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff)
-#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff)
-#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff)
-
-#define NVME_INTC_THR(intc) (intc & 0xff)
-#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff)
-
-enum NvmeFeatureIds {
-    NVME_ARBITRATION = 0x1,
-    NVME_POWER_MANAGEMENT = 0x2,
-    NVME_LBA_RANGE_TYPE = 0x3,
-    NVME_TEMPERATURE_THRESHOLD = 0x4,
-    NVME_ERROR_RECOVERY = 0x5,
-    NVME_VOLATILE_WRITE_CACHE = 0x6,
-    NVME_NUMBER_OF_QUEUES = 0x7,
-    NVME_INTERRUPT_COALESCING = 0x8,
-    NVME_INTERRUPT_VECTOR_CONF = 0x9,
-    NVME_WRITE_ATOMICITY = 0xa,
-    NVME_ASYNCHRONOUS_EVENT_CONF = 0xb,
-    NVME_SOFTWARE_PROGRESS_MARKER = 0x80
-};
-
-typedef struct NvmeRangeType {
-    uint8_t type;
-    uint8_t attributes;
-    uint8_t rsvd2[14];
-    uint64_t slba;
-    uint64_t nlb;
-    uint8_t guid[16];
-    uint8_t rsvd48[16];
-} NvmeRangeType;
-
-typedef struct NvmeLBAF {
-    uint16_t ms;
-    uint8_t ds;
-    uint8_t rp;
-} NvmeLBAF;
-
-typedef struct NvmeIdNs {
-    uint64_t nsze;
-    uint64_t ncap;
-    uint64_t nuse;
-    uint8_t nsfeat;
-    uint8_t nlbaf;
-    uint8_t flbas;
-    uint8_t mc;
-    uint8_t dpc;
-    uint8_t dps;
-    uint8_t res30[98];
-    NvmeLBAF lbaf[16];
-    uint8_t res192[192];
-    uint8_t vs[3712];
-} NvmeIdNs;
-
-#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
-#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
-#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
-#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
-#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1))
-#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1)
-#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1)
-#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1)
-#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1)
-#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1))
-#define NVME_ID_NS_DPC_TYPE_MASK 0x7
-
-enum NvmeIdNsDps {
-    DPS_TYPE_NONE = 0,
-    DPS_TYPE_1 = 1,
-    DPS_TYPE_2 = 2,
-    DPS_TYPE_3 = 3,
-    DPS_TYPE_MASK = 0x7,
-    DPS_FIRST_EIGHT = 8,
-};
-
-static inline void _nvme_check_size(void)
-{
-    QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
-    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
-}
+#include "block/nvme.h"
 
 typedef struct NvmeAsyncEvent {
     QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry;
diff --git a/include/block/nvme.h b/include/block/nvme.h
new file mode 100644
index 0000000000..849a6f3fa3
--- /dev/null
+++ b/include/block/nvme.h
@@ -0,0 +1,700 @@
+#ifndef BLOCK_NVME_H
+#define BLOCK_NVME_H
+
+typedef struct NvmeBar {
+    uint64_t cap;
+    uint32_t vs;
+    uint32_t intms;
+    uint32_t intmc;
+    uint32_t cc;
+    uint32_t rsvd1;
+    uint32_t csts;
+    uint32_t nssrc;
+    uint32_t aqa;
+    uint64_t asq;
+    uint64_t acq;
+    uint32_t cmbloc;
+    uint32_t cmbsz;
+} NvmeBar;
+
+enum NvmeCapShift {
+    CAP_MQES_SHIFT = 0,
+    CAP_CQR_SHIFT = 16,
+    CAP_AMS_SHIFT = 17,
+    CAP_TO_SHIFT = 24,
+    CAP_DSTRD_SHIFT = 32,
+    CAP_NSSRS_SHIFT = 33,
+    CAP_CSS_SHIFT = 37,
+    CAP_MPSMIN_SHIFT = 48,
+    CAP_MPSMAX_SHIFT = 52,
+};
+
+enum NvmeCapMask {
+    CAP_MQES_MASK = 0xffff,
+    CAP_CQR_MASK = 0x1,
+    CAP_AMS_MASK = 0x3,
+    CAP_TO_MASK = 0xff,
+    CAP_DSTRD_MASK = 0xf,
+    CAP_NSSRS_MASK = 0x1,
+    CAP_CSS_MASK = 0xff,
+    CAP_MPSMIN_MASK = 0xf,
+    CAP_MPSMAX_MASK = 0xf,
+};
+
+#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK)
+#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK)
+#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK)
+#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK)
+#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK)
+#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK)
+#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK)
+#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
+#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
+
+#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
+                                            << CAP_MQES_SHIFT)
+#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \
+                                           << CAP_CQR_SHIFT)
+#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \
+                                           << CAP_AMS_SHIFT)
+#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \
+                                          << CAP_TO_SHIFT)
+#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \
+                                             << CAP_DSTRD_SHIFT)
+#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \
+                                             << CAP_NSSRS_SHIFT)
+#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \
+                                           << CAP_CSS_SHIFT)
+#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
+                                              << CAP_MPSMIN_SHIFT)
+#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
+                                              << CAP_MPSMAX_SHIFT)
+
+enum NvmeCcShift {
+    CC_EN_SHIFT = 0,
+    CC_CSS_SHIFT = 4,
+    CC_MPS_SHIFT = 7,
+    CC_AMS_SHIFT = 11,
+    CC_SHN_SHIFT = 14,
+    CC_IOSQES_SHIFT = 16,
+    CC_IOCQES_SHIFT = 20,
+};
+
+enum NvmeCcMask {
+    CC_EN_MASK = 0x1,
+    CC_CSS_MASK = 0x7,
+    CC_MPS_MASK = 0xf,
+    CC_AMS_MASK = 0x7,
+    CC_SHN_MASK = 0x3,
+    CC_IOSQES_MASK = 0xf,
+    CC_IOCQES_MASK = 0xf,
+};
+
+#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK)
+#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK)
+#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK)
+#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK)
+#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK)
+#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
+#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
+
+enum NvmeCstsShift {
+    CSTS_RDY_SHIFT = 0,
+    CSTS_CFS_SHIFT = 1,
+    CSTS_SHST_SHIFT = 2,
+    CSTS_NSSRO_SHIFT = 4,
+};
+
+enum NvmeCstsMask {
+    CSTS_RDY_MASK = 0x1,
+    CSTS_CFS_MASK = 0x1,
+    CSTS_SHST_MASK = 0x3,
+    CSTS_NSSRO_MASK = 0x1,
+};
+
+enum NvmeCsts {
+    NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT,
+    NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT,
+    NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT,
+    NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
+    NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
+    NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT,
+};
+
+#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK)
+#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK)
+#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK)
+#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
+
+enum NvmeAqaShift {
+    AQA_ASQS_SHIFT = 0,
+    AQA_ACQS_SHIFT = 16,
+};
+
+enum NvmeAqaMask {
+    AQA_ASQS_MASK = 0xfff,
+    AQA_ACQS_MASK = 0xfff,
+};
+
+#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
+enum NvmeCmblocShift {
+    CMBLOC_BIR_SHIFT = 0,
+    CMBLOC_OFST_SHIFT = 12,
+};
+
+enum NvmeCmblocMask {
+    CMBLOC_BIR_MASK = 0x7,
+    CMBLOC_OFST_MASK = 0xfffff,
+};
+
+#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \
+                                 CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \
+                                 CMBLOC_OFST_MASK)
+
+#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
+#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
+    (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
+
+enum NvmeCmbszShift {
+    CMBSZ_SQS_SHIFT = 0,
+    CMBSZ_CQS_SHIFT = 1,
+    CMBSZ_LISTS_SHIFT = 2,
+    CMBSZ_RDS_SHIFT = 3,
+    CMBSZ_WDS_SHIFT = 4,
+    CMBSZ_SZU_SHIFT = 8,
+    CMBSZ_SZ_SHIFT = 12,
+};
+
+enum NvmeCmbszMask {
+    CMBSZ_SQS_MASK = 0x1,
+    CMBSZ_CQS_MASK = 0x1,
+    CMBSZ_LISTS_MASK = 0x1,
+    CMBSZ_RDS_MASK = 0x1,
+    CMBSZ_WDS_MASK = 0x1,
+    CMBSZ_SZU_MASK = 0xf,
+    CMBSZ_SZ_MASK = 0xfffff,
+};
+
+#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK)
+#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK)
+#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK)
+#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK)
+#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK)
+#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK)
+#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK)
+
+#define NVME_CMBSZ_SET_SQS(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT)
+#define NVME_CMBSZ_SET_CQS(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT)
+#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT)
+#define NVME_CMBSZ_SET_RDS(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT)
+#define NVME_CMBSZ_SET_WDS(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT)
+#define NVME_CMBSZ_SET_SZU(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT)
+#define NVME_CMBSZ_SET_SZ(cmbsz, val) \
+    (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT)
+
+#define NVME_CMBSZ_GETSIZE(cmbsz) \
+    (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
+
+typedef struct NvmeCmd {
+    uint8_t opcode;
+    uint8_t fuse;
+    uint16_t cid;
+    uint32_t nsid;
+    uint64_t res1;
+    uint64_t mptr;
+    uint64_t prp1;
+    uint64_t prp2;
+    uint32_t cdw10;
+    uint32_t cdw11;
+    uint32_t cdw12;
+    uint32_t cdw13;
+    uint32_t cdw14;
+    uint32_t cdw15;
+} NvmeCmd;
+
+enum NvmeAdminCommands {
+    NVME_ADM_CMD_DELETE_SQ = 0x00,
+    NVME_ADM_CMD_CREATE_SQ = 0x01,
+    NVME_ADM_CMD_GET_LOG_PAGE = 0x02,
+    NVME_ADM_CMD_DELETE_CQ = 0x04,
+    NVME_ADM_CMD_CREATE_CQ = 0x05,
+    NVME_ADM_CMD_IDENTIFY = 0x06,
+    NVME_ADM_CMD_ABORT = 0x08,
+    NVME_ADM_CMD_SET_FEATURES = 0x09,
+    NVME_ADM_CMD_GET_FEATURES = 0x0a,
+    NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c,
+    NVME_ADM_CMD_ACTIVATE_FW = 0x10,
+    NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
+    NVME_ADM_CMD_FORMAT_NVM = 0x80,
+    NVME_ADM_CMD_SECURITY_SEND = 0x81,
+    NVME_ADM_CMD_SECURITY_RECV = 0x82,
+};
+
+enum NvmeIoCommands {
+    NVME_CMD_FLUSH = 0x00,
+    NVME_CMD_WRITE = 0x01,
+    NVME_CMD_READ = 0x02,
+    NVME_CMD_WRITE_UNCOR = 0x04,
+    NVME_CMD_COMPARE = 0x05,
+    NVME_CMD_WRITE_ZEROS = 0x08,
+    NVME_CMD_DSM = 0x09,
+};
+
+typedef struct NvmeDeleteQ {
+    uint8_t opcode;
+    uint8_t flags;
+    uint16_t cid;
+    uint32_t rsvd1[9];
+    uint16_t qid;
+    uint16_t rsvd10;
+    uint32_t rsvd11[5];
+} NvmeDeleteQ;
+
+typedef struct NvmeCreateCq {
+    uint8_t opcode;
+    uint8_t flags;
+    uint16_t cid;
+    uint32_t rsvd1[5];
+    uint64_t prp1;
+    uint64_t rsvd8;
+    uint16_t cqid;
+    uint16_t qsize;
+    uint16_t cq_flags;
+    uint16_t irq_vector;
+    uint32_t rsvd12[4];
+} NvmeCreateCq;
+
+#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
+#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
+
+typedef struct NvmeCreateSq {
+    uint8_t opcode;
+    uint8_t flags;
+    uint16_t cid;
+    uint32_t rsvd1[5];
+    uint64_t prp1;
+    uint64_t rsvd8;
+    uint16_t sqid;
+    uint16_t qsize;
+    uint16_t sq_flags;
+    uint16_t cqid;
+    uint32_t rsvd12[4];
+} NvmeCreateSq;
+
+#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
+#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3)
+
+enum NvmeQueueFlags {
+    NVME_Q_PC = 1,
+    NVME_Q_PRIO_URGENT = 0,
+    NVME_Q_PRIO_HIGH = 1,
+    NVME_Q_PRIO_NORMAL = 2,
+    NVME_Q_PRIO_LOW = 3,
+};
+
+typedef struct NvmeIdentify {
+    uint8_t opcode;
+    uint8_t flags;
+    uint16_t cid;
+    uint32_t nsid;
+    uint64_t rsvd2[2];
+    uint64_t prp1;
+    uint64_t prp2;
+    uint32_t cns;
+    uint32_t rsvd11[5];
+} NvmeIdentify;
+
+typedef struct NvmeRwCmd {
+    uint8_t opcode;
+    uint8_t flags;
+    uint16_t cid;
+    uint32_t nsid;
+    uint64_t rsvd2;
+    uint64_t mptr;
+    uint64_t prp1;
+    uint64_t prp2;
+    uint64_t slba;
+    uint16_t nlb;
+    uint16_t control;
+    uint32_t dsmgmt;
+    uint32_t reftag;
+    uint16_t apptag;
+    uint16_t appmask;
+} NvmeRwCmd;
+
+enum {
+    NVME_RW_LR = 1 << 15,
+    NVME_RW_FUA = 1 << 14,
+    NVME_RW_DSM_FREQ_UNSPEC = 0,
+    NVME_RW_DSM_FREQ_TYPICAL = 1,
+    NVME_RW_DSM_FREQ_RARE = 2,
+    NVME_RW_DSM_FREQ_READS = 3,
+    NVME_RW_DSM_FREQ_WRITES = 4,
+    NVME_RW_DSM_FREQ_RW = 5,
+    NVME_RW_DSM_FREQ_ONCE = 6,
+    NVME_RW_DSM_FREQ_PREFETCH = 7,
+    NVME_RW_DSM_FREQ_TEMP = 8,
+    NVME_RW_DSM_LATENCY_NONE = 0 << 4,
+    NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
+    NVME_RW_DSM_LATENCY_NORM = 2 << 4,
+    NVME_RW_DSM_LATENCY_LOW = 3 << 4,
+    NVME_RW_DSM_SEQ_REQ = 1 << 6,
+    NVME_RW_DSM_COMPRESSED = 1 << 7,
+    NVME_RW_PRINFO_PRACT = 1 << 13,
+    NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
+    NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
+    NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
+};
+
+typedef struct NvmeDsmCmd {
+    uint8_t opcode;
+    uint8_t flags;
+    uint16_t cid;
+    uint32_t nsid;
+    uint64_t rsvd2[2];
+    uint64_t prp1;
+    uint64_t prp2;
+    uint32_t nr;
+    uint32_t attributes;
+    uint32_t rsvd12[4];
+} NvmeDsmCmd;
+
+enum {
+    NVME_DSMGMT_IDR = 1 << 0,
+    NVME_DSMGMT_IDW = 1 << 1,
+    NVME_DSMGMT_AD = 1 << 2,
+};
+
+typedef struct NvmeDsmRange {
+    uint32_t cattr;
+    uint32_t nlb;
+    uint64_t slba;
+} NvmeDsmRange;
+
+enum NvmeAsyncEventRequest {
+    NVME_AER_TYPE_ERROR = 0,
+    NVME_AER_TYPE_SMART = 1,
+    NVME_AER_TYPE_IO_SPECIFIC = 6,
+    NVME_AER_TYPE_VENDOR_SPECIFIC = 7,
+    NVME_AER_INFO_ERR_INVALID_SQ = 0,
+    NVME_AER_INFO_ERR_INVALID_DB = 1,
+    NVME_AER_INFO_ERR_DIAG_FAIL = 2,
+    NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3,
+    NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4,
+    NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5,
+    NVME_AER_INFO_SMART_RELIABILITY = 0,
+    NVME_AER_INFO_SMART_TEMP_THRESH = 1,
+    NVME_AER_INFO_SMART_SPARE_THRESH = 2,
+};
+
+typedef struct NvmeAerResult {
+    uint8_t event_type;
+    uint8_t event_info;
+    uint8_t log_page;
+    uint8_t resv;
+} NvmeAerResult;
+
+typedef struct NvmeCqe {
+    uint32_t result;
+    uint32_t rsvd;
+    uint16_t sq_head;
+    uint16_t sq_id;
+    uint16_t cid;
+    uint16_t status;
+} NvmeCqe;
+
+enum NvmeStatusCodes {
+    NVME_SUCCESS = 0x0000,
+    NVME_INVALID_OPCODE = 0x0001,
+    NVME_INVALID_FIELD = 0x0002,
+    NVME_CID_CONFLICT = 0x0003,
+    NVME_DATA_TRAS_ERROR = 0x0004,
+    NVME_POWER_LOSS_ABORT = 0x0005,
+    NVME_INTERNAL_DEV_ERROR = 0x0006,
+    NVME_CMD_ABORT_REQ = 0x0007,
+    NVME_CMD_ABORT_SQ_DEL = 0x0008,
+    NVME_CMD_ABORT_FAILED_FUSE = 0x0009,
+    NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
+    NVME_INVALID_NSID = 0x000b,
+    NVME_CMD_SEQ_ERROR = 0x000c,
+    NVME_LBA_RANGE = 0x0080,
+    NVME_CAP_EXCEEDED = 0x0081,
+    NVME_NS_NOT_READY = 0x0082,
+    NVME_NS_RESV_CONFLICT = 0x0083,
+    NVME_INVALID_CQID = 0x0100,
+    NVME_INVALID_QID = 0x0101,
+    NVME_MAX_QSIZE_EXCEEDED = 0x0102,
+    NVME_ACL_EXCEEDED = 0x0103,
+    NVME_RESERVED = 0x0104,
+    NVME_AER_LIMIT_EXCEEDED = 0x0105,
+    NVME_INVALID_FW_SLOT = 0x0106,
+    NVME_INVALID_FW_IMAGE = 0x0107,
+    NVME_INVALID_IRQ_VECTOR = 0x0108,
+    NVME_INVALID_LOG_ID = 0x0109,
+    NVME_INVALID_FORMAT = 0x010a,
+    NVME_FW_REQ_RESET = 0x010b,
+    NVME_INVALID_QUEUE_DEL = 0x010c,
+    NVME_FID_NOT_SAVEABLE = 0x010d,
+    NVME_FID_NOT_NSID_SPEC = 0x010f,
+    NVME_FW_REQ_SUSYSTEM_RESET = 0x0110,
+    NVME_CONFLICTING_ATTRS = 0x0180,
+    NVME_INVALID_PROT_INFO = 0x0181,
+    NVME_WRITE_TO_RO = 0x0182,
+    NVME_WRITE_FAULT = 0x0280,
+    NVME_UNRECOVERED_READ = 0x0281,
+    NVME_E2E_GUARD_ERROR = 0x0282,
+    NVME_E2E_APP_ERROR = 0x0283,
+    NVME_E2E_REF_ERROR = 0x0284,
+    NVME_CMP_FAILURE = 0x0285,
+    NVME_ACCESS_DENIED = 0x0286,
+    NVME_MORE = 0x2000,
+    NVME_DNR = 0x4000,
+    NVME_NO_COMPLETE = 0xffff,
+};
+
+typedef struct NvmeFwSlotInfoLog {
+    uint8_t afi;
+    uint8_t reserved1[7];
+    uint8_t frs1[8];
+    uint8_t frs2[8];
+    uint8_t frs3[8];
+    uint8_t frs4[8];
+    uint8_t frs5[8];
+    uint8_t frs6[8];
+    uint8_t frs7[8];
+    uint8_t reserved2[448];
+} NvmeFwSlotInfoLog;
+
+typedef struct NvmeErrorLog {
+    uint64_t error_count;
+    uint16_t sqid;
+    uint16_t cid;
+    uint16_t status_field;
+    uint16_t param_error_location;
+    uint64_t lba;
+    uint32_t nsid;
+    uint8_t vs;
+    uint8_t resv[35];
+} NvmeErrorLog;
+
+typedef struct NvmeSmartLog {
+    uint8_t critical_warning;
+    uint8_t temperature[2];
+    uint8_t available_spare;
+    uint8_t available_spare_threshold;
+    uint8_t percentage_used;
+    uint8_t reserved1[26];
+    uint64_t data_units_read[2];
+    uint64_t data_units_written[2];
+    uint64_t host_read_commands[2];
+    uint64_t host_write_commands[2];
+    uint64_t controller_busy_time[2];
+    uint64_t power_cycles[2];
+    uint64_t power_on_hours[2];
+    uint64_t unsafe_shutdowns[2];
+    uint64_t media_errors[2];
+    uint64_t number_of_error_log_entries[2];
+    uint8_t reserved2[320];
+} NvmeSmartLog;
+
+enum NvmeSmartWarn {
+    NVME_SMART_SPARE = 1 << 0,
+    NVME_SMART_TEMPERATURE = 1 << 1,
+    NVME_SMART_RELIABILITY = 1 << 2,
+    NVME_SMART_MEDIA_READ_ONLY = 1 << 3,
+    NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4,
+};
+
+enum LogIdentifier {
+    NVME_LOG_ERROR_INFO = 0x01,
+    NVME_LOG_SMART_INFO = 0x02,
+    NVME_LOG_FW_SLOT_INFO = 0x03,
+};
+
+typedef struct NvmePSD {
+    uint16_t mp;
+    uint16_t reserved;
+    uint32_t enlat;
+    uint32_t exlat;
+    uint8_t rrt;
+    uint8_t rrl;
+    uint8_t rwt;
+    uint8_t rwl;
+    uint8_t resv[16];
+} NvmePSD;
+
+typedef struct NvmeIdCtrl {
+    uint16_t vid;
+    uint16_t ssvid;
+    uint8_t sn[20];
+    uint8_t mn[40];
+    uint8_t fr[8];
+    uint8_t rab;
+    uint8_t ieee[3];
+    uint8_t cmic;
+    uint8_t mdts;
+    uint8_t rsvd255[178];
+    uint16_t oacs;
+    uint8_t acl;
+    uint8_t aerl;
+    uint8_t frmw;
+    uint8_t lpa;
+    uint8_t elpe;
+    uint8_t npss;
+    uint8_t rsvd511[248];
+    uint8_t sqes;
+    uint8_t cqes;
+    uint16_t rsvd515;
+    uint32_t nn;
+    uint16_t oncs;
+    uint16_t fuses;
+    uint8_t fna;
+    uint8_t vwc;
+    uint16_t awun;
+    uint16_t awupf;
+    uint8_t rsvd703[174];
+    uint8_t rsvd2047[1344];
+    NvmePSD psd[32];
+    uint8_t vs[1024];
+} NvmeIdCtrl;
+
+enum NvmeIdCtrlOacs {
+    NVME_OACS_SECURITY = 1 << 0,
+    NVME_OACS_FORMAT = 1 << 1,
+    NVME_OACS_FW = 1 << 2,
+};
+
+enum NvmeIdCtrlOncs {
+    NVME_ONCS_COMPARE = 1 << 0,
+    NVME_ONCS_WRITE_UNCORR = 1 << 1,
+    NVME_ONCS_DSM = 1 << 2,
+    NVME_ONCS_WRITE_ZEROS = 1 << 3,
+    NVME_ONCS_FEATURES = 1 << 4,
+    NVME_ONCS_RESRVATIONS = 1 << 5,
+};
+
+#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
+#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf)
+#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf)
+#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf)
+
+typedef struct NvmeFeatureVal {
+    uint32_t arbitration;
+    uint32_t power_mgmt;
+    uint32_t temp_thresh;
+    uint32_t err_rec;
+    uint32_t volatile_wc;
+    uint32_t num_queues;
+    uint32_t int_coalescing;
+    uint32_t *int_vector_config;
+    uint32_t write_atomicity;
+    uint32_t async_config;
+    uint32_t sw_prog_marker;
+} NvmeFeatureVal;
+
+#define NVME_ARB_AB(arb) (arb & 0x7)
+#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff)
+#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff)
+#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff)
+
+#define NVME_INTC_THR(intc) (intc & 0xff)
+#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff)
+
+enum NvmeFeatureIds {
+    NVME_ARBITRATION = 0x1,
+    NVME_POWER_MANAGEMENT = 0x2,
+    NVME_LBA_RANGE_TYPE = 0x3,
+    NVME_TEMPERATURE_THRESHOLD = 0x4,
+    NVME_ERROR_RECOVERY = 0x5,
+    NVME_VOLATILE_WRITE_CACHE = 0x6,
+    NVME_NUMBER_OF_QUEUES = 0x7,
+    NVME_INTERRUPT_COALESCING = 0x8,
+    NVME_INTERRUPT_VECTOR_CONF = 0x9,
+    NVME_WRITE_ATOMICITY = 0xa,
+    NVME_ASYNCHRONOUS_EVENT_CONF = 0xb,
+    NVME_SOFTWARE_PROGRESS_MARKER = 0x80
+};
+
+typedef struct NvmeRangeType {
+    uint8_t type;
+    uint8_t attributes;
+    uint8_t rsvd2[14];
+    uint64_t slba;
+    uint64_t nlb;
+    uint8_t guid[16];
+    uint8_t rsvd48[16];
+} NvmeRangeType;
+
+typedef struct NvmeLBAF {
+    uint16_t ms;
+    uint8_t ds;
+    uint8_t rp;
+} NvmeLBAF;
+
+typedef struct NvmeIdNs {
+    uint64_t nsze;
+    uint64_t ncap;
+    uint64_t nuse;
+    uint8_t nsfeat;
+    uint8_t nlbaf;
+    uint8_t flbas;
+    uint8_t mc;
+    uint8_t dpc;
+    uint8_t dps;
+    uint8_t res30[98];
+    NvmeLBAF lbaf[16];
+    uint8_t res192[192];
+    uint8_t vs[3712];
+} NvmeIdNs;
+
+#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
+#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
+#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
+#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
+#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1))
+#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1)
+#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1))
+#define NVME_ID_NS_DPC_TYPE_MASK 0x7
+
+enum NvmeIdNsDps {
+    DPS_TYPE_NONE = 0,
+    DPS_TYPE_1 = 1,
+    DPS_TYPE_2 = 2,
+    DPS_TYPE_3 = 3,
+    DPS_TYPE_MASK = 0x7,
+    DPS_FIRST_EIGHT = 8,
+};
+
+static inline void _nvme_check_size(void)
+{
+    QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
+}
+#endif
--
cgit v1.2.3-55-g7522