249 files changed, 10441 insertions, 1456 deletions
diff --git a/.github/lockdown.yml b/.github/lockdown.yml index 9acc393f1c..07fc2f31ee 100644 --- a/.github/lockdown.yml +++ b/.github/lockdown.yml @@ -10,8 +10,8 @@ issues: comment: | Thank you for your interest in the QEMU project. - This repository is a read-only mirror of the project's master - repostories hosted on https://git.qemu.org/git/qemu.git. + This repository is a read-only mirror of the project's repostories hosted + at https://gitlab.com/qemu-project/qemu.git. The project does not process issues filed on GitHub. The project issues are tracked on Launchpad: @@ -24,8 +24,8 @@ pulls: comment: | Thank you for your interest in the QEMU project. - This repository is a read-only mirror of the project's master - repostories hosted on https://git.qemu.org/git/qemu.git. + This repository is a read-only mirror of the project's repostories hosted + on https://gitlab.com/qemu-project/qemu.git. The project does not process merge requests filed on GitHub. QEMU welcomes contributions of code (either fixing bugs or adding new diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7c0db64710..28a83afb91 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,7 +18,6 @@ include: image: $CI_REGISTRY_IMAGE/qemu/$IMAGE:latest before_script: - JOBS=$(expr $(nproc) + 1) - - sed -i s,git.qemu.org/git,gitlab.com/qemu-project, .gitmodules script: - mkdir build - cd build diff --git a/.gitmodules b/.gitmodules index 2bdeeacef8..08b1b48a09 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,66 +1,66 @@ [submodule "roms/seabios"] path = roms/seabios - url = https://git.qemu.org/git/seabios.git/ + url = https://gitlab.com/qemu-project/seabios.git/ [submodule "roms/SLOF"] path = roms/SLOF - url = https://git.qemu.org/git/SLOF.git + url = https://gitlab.com/qemu-project/SLOF.git [submodule "roms/ipxe"] path = roms/ipxe - url = https://git.qemu.org/git/ipxe.git + url = https://gitlab.com/qemu-project/ipxe.git [submodule "roms/openbios"] path = roms/openbios - url = https://git.qemu.org/git/openbios.git + url = https://gitlab.com/qemu-project/openbios.git [submodule "roms/qemu-palcode"] path = roms/qemu-palcode - url = https://git.qemu.org/git/qemu-palcode.git + url = https://gitlab.com/qemu-project/qemu-palcode.git [submodule "roms/sgabios"] path = roms/sgabios - url = https://git.qemu.org/git/sgabios.git + url = https://gitlab.com/qemu-project/sgabios.git [submodule "dtc"] path = dtc - url = https://git.qemu.org/git/dtc.git + url = https://gitlab.com/qemu-project/dtc.git [submodule "roms/u-boot"] path = roms/u-boot - url = https://git.qemu.org/git/u-boot.git + url = https://gitlab.com/qemu-project/u-boot.git [submodule "roms/skiboot"] path = roms/skiboot - url = https://git.qemu.org/git/skiboot.git + url = https://gitlab.com/qemu-project/skiboot.git [submodule "roms/QemuMacDrivers"] path = roms/QemuMacDrivers - url = https://git.qemu.org/git/QemuMacDrivers.git + url = https://gitlab.com/qemu-project/QemuMacDrivers.git [submodule "ui/keycodemapdb"] path = ui/keycodemapdb - url = https://git.qemu.org/git/keycodemapdb.git + url = https://gitlab.com/qemu-project/keycodemapdb.git [submodule "capstone"] path = capstone - url = https://git.qemu.org/git/capstone.git + url = https://gitlab.com/qemu-project/capstone.git [submodule "roms/seabios-hppa"] path = roms/seabios-hppa - url = https://git.qemu.org/git/seabios-hppa.git + url = https://gitlab.com/qemu-project/seabios-hppa.git [submodule "roms/u-boot-sam460ex"] path = roms/u-boot-sam460ex - url = https://git.qemu.org/git/u-boot-sam460ex.git + url = 
https://gitlab.com/qemu-project/u-boot-sam460ex.git [submodule "tests/fp/berkeley-testfloat-3"] path = tests/fp/berkeley-testfloat-3 - url = https://git.qemu.org/git/berkeley-testfloat-3.git + url = https://gitlab.com/qemu-project/berkeley-testfloat-3.git [submodule "tests/fp/berkeley-softfloat-3"] path = tests/fp/berkeley-softfloat-3 - url = https://git.qemu.org/git/berkeley-softfloat-3.git + url = https://gitlab.com/qemu-project/berkeley-softfloat-3.git [submodule "roms/edk2"] path = roms/edk2 - url = https://git.qemu.org/git/edk2.git + url = https://gitlab.com/qemu-project/edk2.git [submodule "slirp"] path = slirp - url = https://git.qemu.org/git/libslirp.git + url = https://gitlab.com/qemu-project/libslirp.git [submodule "roms/opensbi"] path = roms/opensbi - url = https://git.qemu.org/git/opensbi.git + url = https://gitlab.com/qemu-project/opensbi.git [submodule "roms/qboot"] path = roms/qboot - url = https://git.qemu.org/git/qboot.git + url = https://gitlab.com/qemu-project/qboot.git [submodule "meson"] path = meson - url = https://git.qemu.org/git/meson.git + url = https://gitlab.com/qemu-project/meson.git [submodule "roms/vbootrom"] path = roms/vbootrom - url = https://git.qemu.org/git/vbootrom.git + url = https://gitlab.com/qemu-project/vbootrom.git diff --git a/Kconfig.host b/Kconfig.host index a9a55a9c31..24255ef441 100644 --- a/Kconfig.host +++ b/Kconfig.host @@ -37,3 +37,7 @@ config VIRTFS config PVRDMA bool + +config MULTIPROCESS_ALLOWED + bool + imply MULTIPROCESS diff --git a/MAINTAINERS b/MAINTAINERS index 5f2ce9c1d0..de5fe1c65f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -281,6 +281,7 @@ F: target/ppc/ F: hw/ppc/ F: include/hw/ppc/ F: disas/ppc.c +F: tests/acceptance/machine_ppc.py RISC-V TCG CPUs M: Palmer Dabbelt <palmer@dabbelt.com> @@ -1120,6 +1121,7 @@ M: Edgar E. Iglesias <edgar.iglesias@gmail.com> S: Maintained F: hw/microblaze/petalogix_s3adsp1800_mmu.c F: include/hw/char/xilinx_uartlite.h +F: tests/acceptance/machine_microblaze.py petalogix_ml605 M: Edgar E. 
Iglesias <edgar.iglesias@gmail.com> @@ -2541,6 +2543,7 @@ F: qapi/qom.json F: qapi/qdev.json F: scripts/coccinelle/qom-parent-type.cocci F: softmmu/qdev-monitor.c +F: stubs/qdev.c F: qom/ F: tests/check-qom-interface.c F: tests/check-qom-proplist.c @@ -3199,6 +3202,30 @@ S: Maintained F: hw/semihosting/ F: include/hw/semihosting/ +Multi-process QEMU +M: Elena Ufimtseva <elena.ufimtseva@oracle.com> +M: Jagannathan Raman <jag.raman@oracle.com> +M: John G Johnson <john.g.johnson@oracle.com> +S: Maintained +F: docs/devel/multi-process.rst +F: docs/system/multi-process.rst +F: hw/pci-host/remote.c +F: include/hw/pci-host/remote.h +F: hw/remote/machine.c +F: include/hw/remote/machine.h +F: hw/remote/mpqemu-link.c +F: include/hw/remote/mpqemu-link.h +F: hw/remote/message.c +F: hw/remote/remote-obj.c +F: include/hw/remote/memory.h +F: hw/remote/memory.c +F: hw/remote/proxy.c +F: include/hw/remote/proxy.h +F: hw/remote/proxy-memory-listener.c +F: include/hw/remote/proxy-memory-listener.h +F: hw/remote/iohub.c +F: include/hw/remote/iohub.h + Build and test automation ------------------------- Build and test automation @@ -305,7 +305,7 @@ endif @echo 'Test targets:' $(call print-help,check,Run all tests (check-help for details)) $(call print-help,bench,Run all benchmarks) - $(call print-help,docker,Help about targets running tests inside containers) + $(call print-help,docker-help,Help about targets running tests inside containers) $(call print-help,vm-help,Help about targets running tests inside VM) @echo '' @echo 'Documentation targets:' diff --git a/README.rst b/README.rst index 58b9f2dc15..ce39d89077 100644 --- a/README.rst +++ b/README.rst @@ -60,7 +60,7 @@ The QEMU source code is maintained under the GIT version control system. .. code-block:: shell - git clone https://git.qemu.org/git/qemu.git + git clone https://gitlab.com/qemu-project/qemu.git When submitting patches, one common approach is to use 'git format-patch' and/or 'git send-email' to format & send the mail to the @@ -78,7 +78,7 @@ The QEMU website is also maintained under source control. .. 
code-block:: shell - git clone https://git.qemu.org/git/qemu-web.git + git clone https://gitlab.com/qemu-project/qemu-web.git * `<https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/>`_ diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 5164d838b9..47516913b7 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -123,10 +123,6 @@ struct KVMState KVMMemoryListener memory_listener; QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; - /* memory encryption */ - void *memcrypt_handle; - int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); - /* For "info mtree -f" to tell if an MR is registered in KVM */ int nr_as; struct KVMAs { @@ -225,26 +221,6 @@ int kvm_get_max_memslots(void) return s->nr_slots; } -bool kvm_memcrypt_enabled(void) -{ - if (kvm_state && kvm_state->memcrypt_handle) { - return true; - } - - return false; -} - -int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) -{ - if (kvm_state->memcrypt_handle && - kvm_state->memcrypt_encrypt_data) { - return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, - ptr, len); - } - - return 1; -} - /* Called with KVMMemoryListener.slots_lock held */ static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) { @@ -668,16 +644,19 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, d.dirty_bitmap = mem->dirty_bmap; d.slot = mem->slot | (kml->as_id << 16); - if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { - DPRINTF("ioctl failed %d\n", errno); - ret = -1; + ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d); + if (ret == -ENOENT) { + /* kernel does not have dirty bitmap in this slot */ + ret = 0; + } else if (ret < 0) { + error_report("ioctl KVM_GET_DIRTY_LOG failed: %d", errno); goto out; + } else { + subsection.offset_within_region += slot_offset; + subsection.size = int128_make64(slot_size); + kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); } - subsection.offset_within_region += slot_offset; - subsection.size = int128_make64(slot_size); - kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); - slot_offset += slot_size; start_addr += slot_size; size -= slot_size; @@ -774,8 +753,8 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start, d.num_pages = bmap_npages; d.slot = mem->slot | (as_id << 16); - if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) { - ret = -errno; + ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d); + if (ret < 0 && ret != -ENOENT) { error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, " "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d", __func__, d.slot, (uint64_t)d.first_page, @@ -2204,20 +2183,6 @@ static int kvm_init(MachineState *ms) kvm_state = s; - /* - * if memory encryption object is specified then initialize the memory - * encryption context. 
- */ - if (ms->memory_encryption) { - kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); - if (!kvm_state->memcrypt_handle) { - ret = -1; - goto err; - } - - kvm_state->memcrypt_encrypt_data = sev_encrypt_data; - } - ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; diff --git a/accel/kvm/sev-stub.c b/accel/kvm/sev-stub.c index 4f97452585..9587d1b2a3 100644 --- a/accel/kvm/sev-stub.c +++ b/accel/kvm/sev-stub.c @@ -15,12 +15,8 @@ #include "qemu-common.h" #include "sysemu/sev.h" -int sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len) +int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) { - abort(); -} - -void *sev_guest_init(const char *id) -{ - return NULL; + /* If we get here, cgs must be some non-SEV thing */ + return 0; } diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 680e099463..0f17acfac0 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -81,16 +81,6 @@ int kvm_on_sigbus(int code, void *addr) return 1; } -bool kvm_memcrypt_enabled(void) -{ - return false; -} - -int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) -{ - return 1; -} - #ifndef CONFIG_USER_ONLY int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) { diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index d9ef69121c..f2819eec7d 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -773,17 +773,30 @@ int cpu_exec(CPUState *cpu) /* prepare setjmp context for exception handling */ if (sigsetjmp(cpu->jmp_env, 0) != 0) { #if defined(__clang__) - /* Some compilers wrongly smash all local variables after - * siglongjmp. There were bug reports for gcc 4.5.0 and clang. + /* + * Some compilers wrongly smash all local variables after + * siglongjmp (the spec requires that only non-volatile locals + * which are changed between the sigsetjmp and siglongjmp are + * permitted to be trashed). There were bug reports for gcc + * 4.5.0 and clang. The bug is fixed in all versions of gcc + * that we support, but is still unfixed in clang: + * https://bugs.llvm.org/show_bug.cgi?id=21183 + * * Reload essential local variables here for those compilers. - * Newer versions of gcc would complain about this code (-Wclobbered). */ + * Newer versions of gcc would complain about this code (-Wclobbered), + * so we only perform the workaround for clang. + */ cpu = current_cpu; cc = CPU_GET_CLASS(cpu); -#else /* buggy compiler */ - /* Assert that the compiler does not smash local variables. */ +#else + /* + * Non-buggy compilers preserve these locals; assert that + * they have the correct value. + */ g_assert(cpu == current_cpu); g_assert(cc == CPU_GET_CLASS(cpu)); -#endif /* buggy compiler */ +#endif + #ifndef CONFIG_SOFTMMU tcg_debug_assert(!have_mmap_lock()); #endif diff --git a/accel/tcg/tcg-accel-ops-icount.c b/accel/tcg/tcg-accel-ops-icount.c index 87762b469c..13b8fbeb69 100644 --- a/accel/tcg/tcg-accel-ops-icount.c +++ b/accel/tcg/tcg-accel-ops-icount.c @@ -81,7 +81,13 @@ void icount_handle_deadline(void) int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL, QEMU_TIMER_ATTR_ALL); - if (deadline == 0) { + /* + * Instructions, interrupts, and exceptions are processed in cpu-exec. 
+ * Don't interrupt cpu thread, when these events are waiting + * (i.e., there is no checkpoint) + */ + if (deadline == 0 + && (replay_mode != REPLAY_MODE_PLAY || replay_has_checkpoint())) { icount_notify_aio_contexts(); } } diff --git a/backends/confidential-guest-support.c b/backends/confidential-guest-support.c new file mode 100644 index 0000000000..052fde8db0 --- /dev/null +++ b/backends/confidential-guest-support.c @@ -0,0 +1,33 @@ +/* + * QEMU Confidential Guest support + * + * Copyright Red Hat. + * + * Authors: + * David Gibson <david@gibson.dropbear.id.au> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "exec/confidential-guest-support.h" + +OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport, + confidential_guest_support, + CONFIDENTIAL_GUEST_SUPPORT, + OBJECT) + +static void confidential_guest_support_class_init(ObjectClass *oc, void *data) +{ +} + +static void confidential_guest_support_init(Object *obj) +{ +} + +static void confidential_guest_support_finalize(Object *obj) +{ +} diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c index e5626d4330..69b0ae30bb 100644 --- a/backends/hostmem-memfd.c +++ b/backends/hostmem-memfd.c @@ -55,7 +55,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) name = host_memory_backend_get_name(backend); memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, - backend->share, fd, errp); + backend->share, fd, 0, errp); g_free(name); } diff --git a/backends/hostmem.c b/backends/hostmem.c index be0c3b079f..c6c1ff5b99 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -494,6 +494,16 @@ host_memory_backend_class_init(ObjectClass *oc, void *data) host_memory_backend_get_share, host_memory_backend_set_share); object_class_property_set_description(oc, "share", "Mark the memory as private to QEMU or shared"); + /* + * Do not delete/rename option. This option must be considered stable + * (as if it didn't have the 'x-' prefix including deprecation period) as + * long as 4.0 and older machine types exists. + * Option will be used by upper layers to override (disable) canonical path + * for ramblock-id set by compat properties on old machine types ( <= 4.0), + * to keep migration working when backend is used for main RAM with + * -machine memory-backend= option (main RAM historically used prefix-less + * ramblock-id). 
+ */ object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id", host_memory_backend_get_use_canonical_path, host_memory_backend_set_use_canonical_path); diff --git a/backends/meson.build b/backends/meson.build index 484456ece7..d4221831fc 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -6,6 +6,7 @@ softmmu_ss.add([files( 'rng-builtin.c', 'rng-egd.c', 'rng.c', + 'confidential-guest-support.c', ), numa]) softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c')) diff --git a/backends/rng-builtin.c b/backends/rng-builtin.c index f38dff117d..f367eb665c 100644 --- a/backends/rng-builtin.c +++ b/backends/rng-builtin.c @@ -10,6 +10,7 @@ #include "qemu/main-loop.h" #include "qemu/guest-random.h" #include "qom/object.h" +#include "sysemu/replay.h" OBJECT_DECLARE_SIMPLE_TYPE(RngBuiltin, RNG_BUILTIN) @@ -37,7 +38,7 @@ static void rng_builtin_request_entropy(RngBackend *b, RngRequest *req) { RngBuiltin *s = RNG_BUILTIN(b); - qemu_bh_schedule(s->bh); + replay_bh_schedule_event(s->bh); } static void rng_builtin_init(Object *obj) diff --git a/block/backup.c b/block/backup.c index cc525d5544..94e6dcd72e 100644 --- a/block/backup.c +++ b/block/backup.c @@ -35,6 +35,7 @@ typedef struct BackupBlockJob { BlockJob common; BlockDriverState *backup_top; BlockDriverState *source_bs; + BlockDriverState *target_bs; BdrvDirtyBitmap *sync_bitmap; @@ -329,6 +330,13 @@ static void coroutine_fn backup_set_speed(BlockJob *job, int64_t speed) } } +static void backup_cancel(Job *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); + + bdrv_cancel_in_flight(s->target_bs); +} + static const BlockJobDriver backup_job_driver = { .job_driver = { .instance_size = sizeof(BackupBlockJob), @@ -340,6 +348,7 @@ static const BlockJobDriver backup_job_driver = { .abort = backup_abort, .clean = backup_clean, .pause = backup_pause, + .cancel = backup_cancel, }, .set_speed = backup_set_speed, }; @@ -528,6 +537,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, job->backup_top = backup_top; job->source_bs = bs; + job->target_bs = target; job->on_source_error = on_source_error; job->on_target_error = on_target_error; job->sync_mode = sync_mode; diff --git a/block/io.c b/block/io.c index b0435ed670..ca2dca3007 100644 --- a/block/io.c +++ b/block/io.c @@ -3460,3 +3460,14 @@ out: return ret; } + +void bdrv_cancel_in_flight(BlockDriverState *bs) +{ + if (!bs || !bs->drv) { + return; + } + + if (bs->drv->bdrv_cancel_in_flight) { + bs->drv->bdrv_cancel_in_flight(bs); + } +} diff --git a/block/mirror.c b/block/mirror.c index 8e1ad6eceb..9faffe4707 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1179,6 +1179,14 @@ static bool mirror_drained_poll(BlockJob *job) return !!s->in_flight; } +static void mirror_cancel(Job *job) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job); + BlockDriverState *target = blk_bs(s->target); + + bdrv_cancel_in_flight(target); +} + static const BlockJobDriver mirror_job_driver = { .job_driver = { .instance_size = sizeof(MirrorBlockJob), @@ -1190,6 +1198,7 @@ static const BlockJobDriver mirror_job_driver = { .abort = mirror_abort, .pause = mirror_pause, .complete = mirror_complete, + .cancel = mirror_cancel, }, .drained_poll = mirror_drained_poll, }; diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index afd75ab628..75d7fa9510 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -900,10 +900,11 @@ void hmp_info_snapshots(Monitor *mon, const QDict 
*qdict) ImageEntry *image_entry, *next_ie; SnapshotEntry *snapshot_entry; + Error *err = NULL; - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, &err); if (!bs) { - monitor_printf(mon, "No available block device supports snapshots\n"); + error_report_err(err); return; } aio_context = bdrv_get_aio_context(bs); @@ -953,7 +954,7 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict) total = 0; for (i = 0; i < nb_sns; i++) { SnapshotEntry *next_sn; - if (bdrv_all_find_snapshot(sn_tab[i].name, &bs1) == 0) { + if (bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL) == 1) { global_snapshots[total] = i; total++; QTAILQ_FOREACH(image_entry, &image_list, next) { diff --git a/block/nbd.c b/block/nbd.c index b3cbbeb4b0..c26dc5a54f 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -2458,6 +2458,18 @@ static const char *const nbd_strong_runtime_opts[] = { NULL }; +static void nbd_cancel_in_flight(BlockDriverState *bs) +{ + BDRVNBDState *s = (BDRVNBDState *)bs->opaque; + + reconnect_delay_timer_del(s); + + if (s->state == NBD_CLIENT_CONNECTING_WAIT) { + s->state = NBD_CLIENT_CONNECTING_NOWAIT; + qemu_co_queue_restart_all(&s->free_sema); + } +} + static BlockDriver bdrv_nbd = { .format_name = "nbd", .protocol_name = "nbd", @@ -2484,6 +2496,7 @@ static BlockDriver bdrv_nbd = { .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, .strong_runtime_opts = nbd_strong_runtime_opts, + .bdrv_cancel_in_flight = nbd_cancel_in_flight, }; static BlockDriver bdrv_nbd_tcp = { @@ -2512,6 +2525,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, .strong_runtime_opts = nbd_strong_runtime_opts, + .bdrv_cancel_in_flight = nbd_cancel_in_flight, }; static BlockDriver bdrv_nbd_unix = { @@ -2540,6 +2554,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, .strong_runtime_opts = nbd_strong_runtime_opts, + .bdrv_cancel_in_flight = nbd_cancel_in_flight, }; static void bdrv_nbd_init(void) diff --git a/block/raw-format.c b/block/raw-format.c index 42ec50802b..7717578ed6 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -575,6 +575,11 @@ static const char *const raw_strong_runtime_opts[] = { NULL }; +static void raw_cancel_in_flight(BlockDriverState *bs) +{ + bdrv_cancel_in_flight(bs->file->bs); +} + BlockDriver bdrv_raw = { .format_name = "raw", .instance_size = sizeof(BDRVRawState), @@ -608,6 +613,7 @@ BlockDriver bdrv_raw = { .bdrv_has_zero_init = &raw_has_zero_init, .strong_runtime_opts = raw_strong_runtime_opts, .mutable_opts = mutable_opts, + .bdrv_cancel_in_flight = raw_cancel_in_flight, }; static void bdrv_raw_init(void) diff --git a/block/snapshot.c b/block/snapshot.c index a2bf3a54eb..e8ae9a28c1 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -447,6 +447,41 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, return ret; } + +static int bdrv_all_get_snapshot_devices(bool has_devices, strList *devices, + GList **all_bdrvs, + Error **errp) +{ + g_autoptr(GList) bdrvs = NULL; + + if (has_devices) { + if (!devices) { + error_setg(errp, "At least one device is required for snapshot"); + return -1; + } + + while (devices) { + BlockDriverState *bs = bdrv_find_node(devices->value); + if (!bs) { + error_setg(errp, "No block device node '%s'", devices->value); + return -1; + } + bdrvs = g_list_append(bdrvs, bs); + devices = devices->next; + } + } else { + BlockDriverState *bs; + BdrvNextIterator 
it; + for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + bdrvs = g_list_append(bdrvs, bs); + } + } + + *all_bdrvs = g_steal_pointer(&bdrvs); + return 0; +} + + static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) { if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { @@ -462,44 +497,59 @@ static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs) * These functions will properly handle dataplane (take aio_context_acquire * when appropriate for appropriate block drivers) */ -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) +bool bdrv_all_can_snapshot(bool has_devices, strList *devices, + Error **errp) { - bool ok = true; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return false; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + bool ok = true; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { + if (devices || bdrv_all_snapshots_includes_bs(bs)) { ok = bdrv_can_snapshot(bs); } aio_context_release(ctx); if (!ok) { - bdrv_next_cleanup(&it); - goto fail; + error_setg(errp, "Device '%s' is writable but does not support " + "snapshots", bdrv_get_device_or_node_name(bs)); + return false; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ok; + return true; } -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_delete_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp) { - int ret = 0; - BlockDriverState *bs; - BdrvNextIterator it; - QEMUSnapshotInfo sn1, *snapshot = &sn1; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + QEMUSnapshotInfo sn1, *snapshot = &sn1; + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs) && + if ((devices || bdrv_all_snapshots_includes_bs(bs)) && bdrv_snapshot_find(bs, snapshot, name) >= 0) { ret = bdrv_snapshot_delete(bs, snapshot->id_str, @@ -507,118 +557,180 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, } aio_context_release(ctx); if (ret < 0) { - bdrv_next_cleanup(&it); - goto fail; + error_prepend(errp, "Could not delete snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ret; + return 0; } -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_goto_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp) { - int ret = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { + if (devices || bdrv_all_snapshots_includes_bs(bs)) { ret = 
bdrv_snapshot_goto(bs, name, errp); } aio_context_release(ctx); if (ret < 0) { - bdrv_next_cleanup(&it); - goto fail; + error_prepend(errp, "Could not load snapshot '%s' on '%s': ", + name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return ret; + return 0; } -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) +int bdrv_all_has_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp) { - QEMUSnapshotInfo sn; - int err = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } + + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + QEMUSnapshotInfo sn; + int ret = 0; aio_context_acquire(ctx); - if (bdrv_all_snapshots_includes_bs(bs)) { - err = bdrv_snapshot_find(bs, &sn, name); + if (devices || bdrv_all_snapshots_includes_bs(bs)) { + ret = bdrv_snapshot_find(bs, &sn, name); } aio_context_release(ctx); - if (err < 0) { - bdrv_next_cleanup(&it); - goto fail; + if (ret < 0) { + if (ret == -ENOENT) { + return 0; + } else { + error_setg_errno(errp, errno, + "Could not check snapshot '%s' on '%s'", + name, bdrv_get_device_or_node_name(bs)); + return -1; + } } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return err; + return 1; } int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *vm_state_bs, uint64_t vm_state_size, - BlockDriverState **first_bad_bs) + bool has_devices, strList *devices, + Error **errp) { - int err = 0; - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return -1; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); + int ret = 0; aio_context_acquire(ctx); if (bs == vm_state_bs) { sn->vm_state_size = vm_state_size; - err = bdrv_snapshot_create(bs, sn); - } else if (bdrv_all_snapshots_includes_bs(bs)) { + ret = bdrv_snapshot_create(bs, sn); + } else if (devices || bdrv_all_snapshots_includes_bs(bs)) { sn->vm_state_size = 0; - err = bdrv_snapshot_create(bs, sn); + ret = bdrv_snapshot_create(bs, sn); } aio_context_release(ctx); - if (err < 0) { - bdrv_next_cleanup(&it); - goto fail; + if (ret < 0) { + error_setg(errp, "Could not create snapshot '%s' on '%s'", + sn->name, bdrv_get_device_or_node_name(bs)); + return -1; } + + iterbdrvs = iterbdrvs->next; } -fail: - *first_bad_bs = bs; - return err; + return 0; } -BlockDriverState *bdrv_all_find_vmstate_bs(void) + +BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs, + bool has_devices, strList *devices, + Error **errp) { - BlockDriverState *bs; - BdrvNextIterator it; + g_autoptr(GList) bdrvs = NULL; + GList *iterbdrvs; + + if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) { + return NULL; + } - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { + iterbdrvs = bdrvs; + while (iterbdrvs) { + BlockDriverState *bs = iterbdrvs->data; AioContext *ctx = bdrv_get_aio_context(bs); - bool found; + bool found = false; aio_context_acquire(ctx); - found = bdrv_all_snapshots_includes_bs(bs) && bdrv_can_snapshot(bs); + 
found = (devices || bdrv_all_snapshots_includes_bs(bs)) && + bdrv_can_snapshot(bs); aio_context_release(ctx); - if (found) { - bdrv_next_cleanup(&it); - break; + if (vmstate_bs) { + if (g_str_equal(vmstate_bs, + bdrv_get_node_name(bs))) { + if (found) { + return bs; + } else { + error_setg(errp, + "vmstate block device '%s' does not support snapshots", + vmstate_bs); + return NULL; + } + } + } else if (found) { + return bs; } + + iterbdrvs = iterbdrvs->next; + } + + if (vmstate_bs) { + error_setg(errp, + "vmstate block device '%s' does not exist", vmstate_bs); + } else { + error_setg(errp, + "no block device can store vmstate for snapshot"); } - return bs; + return NULL; } diff --git a/blockdev-nbd.c b/blockdev-nbd.c index d8443d235b..b264620b98 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds, qio_net_listener_set_name(nbd_server->listener, "nbd-listener"); - if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) { + /* + * Because this server is persistent, a backlog of SOMAXCONN is + * better than trying to size it to max_connections. + */ + if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN, + errp) < 0) { goto error; } @@ -198,8 +198,8 @@ has() { } version_ge () { - local_ver1=`echo $1 | tr . ' '` - local_ver2=`echo $2 | tr . ' '` + local_ver1=$(expr "$1" : '\([0-9.]*\)' | tr . ' ') + local_ver2=$(echo "$2" | tr . ' ') while true; do set x $local_ver1 local_first=${2-0} @@ -463,6 +463,7 @@ skip_meson=no gettext="auto" fuse="auto" fuse_lseek="auto" +multiprocess="no" malloc_trim="auto" @@ -797,6 +798,7 @@ Linux) linux="yes" linux_user="yes" vhost_user=${default_feature:-yes} + multiprocess=${default_feature:-yes} ;; esac @@ -1556,6 +1558,10 @@ for opt do ;; --disable-fuse-lseek) fuse_lseek="disabled" ;; + --enable-multiprocess) multiprocess="yes" + ;; + --disable-multiprocess) multiprocess="no" + ;; *) echo "ERROR: unknown option $opt" echo "Try '$0 --help' for more information" @@ -1764,7 +1770,7 @@ Advanced options (experts only): --with-trace-file=NAME Full PATH,NAME of file to store traces Default:trace-<pid> --disable-slirp disable SLIRP userspace network connectivity - --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI) + --enable-tcg-interpreter enable TCI (TCG with bytecode interpreter, experimental and slow) --enable-malloc-trim enable libc malloc_trim() for memory optimization --oss-lib path to OSS library --cpu=CPU Build for host CPU [$cpu] @@ -1908,6 +1914,7 @@ disabled with --disable-FEATURE, default is enabled if available libdaxctl libdaxctl support fuse FUSE block device export fuse-lseek SEEK_HOLE/SEEK_DATA support for FUSE exports + multiprocess Multiprocess QEMU support NOTE: The object files are built at the place where configure is launched EOF @@ -6082,6 +6089,9 @@ fi if test "$have_mlockall" = "yes" ; then echo "HAVE_MLOCKALL=y" >> $config_host_mak fi +if test "$multiprocess" = "yes" ; then + echo "CONFIG_MULTIPROCESS_ALLOWED=y" >> $config_host_mak +fi if test "$fuzzing" = "yes" ; then # If LIB_FUZZING_ENGINE is set, assume we are running on OSS-Fuzz, and the # needed CFLAGS have already been provided @@ -6115,7 +6125,7 @@ fi if test -n "$gdb_bin"; then gdb_version=$($gdb_bin --version | head -n 1) - if version_ge ${gdb_version##* } 8.3.1; then + if version_ge ${gdb_version##* } 9.1; then echo "HAVE_GDB_BIN=$gdb_bin" >> $config_host_mak fi fi diff --git a/docs/amd-memory-encryption.txt b/docs/amd-memory-encryption.txt index 
80b8eb00e9..145896aec7 100644 --- a/docs/amd-memory-encryption.txt +++ b/docs/amd-memory-encryption.txt @@ -73,7 +73,7 @@ complete flow chart. To launch a SEV guest # ${QEMU} \ - -machine ...,memory-encryption=sev0 \ + -machine ...,confidential-guest-support=sev0 \ -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1 Debugging diff --git a/docs/confidential-guest-support.txt b/docs/confidential-guest-support.txt new file mode 100644 index 0000000000..71d07ba57a --- /dev/null +++ b/docs/confidential-guest-support.txt @@ -0,0 +1,49 @@ +Confidential Guest Support +========================== + +Traditionally, hypervisors such as QEMU have complete access to a +guest's memory and other state, meaning that a compromised hypervisor +can compromise any of its guests. A number of platforms have added +mechanisms in hardware and/or firmware which give guests at least some +protection from a compromised hypervisor. This is obviously +especially desirable for public cloud environments. + +These mechanisms have different names and different modes of +operation, but are often referred to as Secure Guests or Confidential +Guests. We use the term "Confidential Guest Support" to distinguish +this from other aspects of guest security (such as security against +attacks from other guests, or from network sources). + +Running a Confidential Guest +---------------------------- + +To run a confidential guest you need to add two command line parameters: + +1. Use "-object" to create a "confidential guest support" object. The + type and parameters will vary with the specific mechanism to be + used +2. Set the "confidential-guest-support" machine parameter to the ID of + the object from (1). + +Example (for AMD SEV):: + + qemu-system-x86_64 \ + <other parameters> \ + -machine ...,confidential-guest-support=sev0 \ + -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1 + +Supported mechanisms +-------------------- + +Currently supported confidential guest mechanisms are: + +AMD Secure Encrypted Virtualization (SEV) + docs/amd-memory-encryption.txt + +POWER Protected Execution Facility (PEF) + docs/papr-pef.txt + +s390x Protected Virtualization (PV) + docs/system/s390x/protvirt.rst + +Other mechanisms may be supported in future. diff --git a/docs/devel/build-system.rst b/docs/devel/build-system.rst index 31f4dced2a..69ce3087e3 100644 --- a/docs/devel/build-system.rst +++ b/docs/devel/build-system.rst @@ -100,7 +100,7 @@ In meson.build:: # Detect dependency sdl_image = dependency('SDL2_image', required: get_option('sdl_image'), method: 'pkg-config', - static: enable_static) + kwargs: static_kwargs) # Create config-host.h (if applicable) config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found()) diff --git a/docs/devel/fuzzing.rst b/docs/devel/fuzzing.rst index 6096242d99..97797c4f8c 100644 --- a/docs/devel/fuzzing.rst +++ b/docs/devel/fuzzing.rst @@ -119,7 +119,7 @@ Adding a new fuzzer Coverage over virtual devices can be improved by adding additional fuzzers. Fuzzers are kept in ``tests/qtest/fuzz/`` and should be added to -``tests/qtest/fuzz/Makefile.include`` +``tests/qtest/fuzz/meson.build`` Fuzzers can rely on both qtest and libqos to communicate with virtual devices. @@ -128,8 +128,7 @@ Fuzzers can rely on both qtest and libqos to communicate with virtual devices. 2. Write the fuzzing code using the libqtest/libqos API. See existing fuzzers for reference. -3. Register the fuzzer in ``tests/fuzz/Makefile.include`` by appending the - corresponding object to fuzz-obj-y +3. 
Add the fuzzer to ``tests/qtest/fuzz/meson.build``. Fuzzers can be more-or-less thought of as special qtest programs which can modify the qtest commands and/or qtest command arguments based on inputs @@ -181,6 +180,36 @@ To ensure that these env variables have been configured correctly, we can use:: The output should contain a complete list of matched MemoryRegions. +OSS-Fuzz +-------- +QEMU is continuously fuzzed on `OSS-Fuzz <https://github.com/google/oss-fuzz>`__. +By default, the OSS-Fuzz build will try to fuzz every fuzz-target. Since the +generic-fuzz target requires additional information provided in environment +variables, we pre-define some generic-fuzz configs in +``tests/qtest/fuzz/generic_fuzz_configs.h``. Each config must specify: + +- ``.name``: To identify the fuzzer config + +- ``.args`` OR ``.argfunc``: A string or pointer to a function returning a + string. These strings are used to specify the ``QEMU_FUZZ_ARGS`` + environment variable. ``argfunc`` is useful when the config relies on e.g. + a dynamically created temp directory, or a free tcp/udp port. + +- ``.objects``: A string that specifies the ``QEMU_FUZZ_OBJECTS`` environment + variable. + +To fuzz additional devices/device configurations on OSS-Fuzz, send patches for +either a new device-specific fuzzer or a new generic-fuzz config. + +Build details: + +- The Dockerfile that sets up the environment for building QEMU's + fuzzers on OSS-Fuzz can be found in the `OSS-Fuzz repository <https://github.com/google/oss-fuzz/blob/master/projects/qemu/Dockerfile>`__ + +- The script responsible for building the fuzzers can be found in the + QEMU source tree at ``scripts/oss-fuzz/build.sh`` + Implementation Details / Fuzzer Lifecycle ----------------------------------------- diff --git a/docs/devel/index.rst b/docs/devel/index.rst index 98a7016a9b..22854e334d 100644 --- a/docs/devel/index.rst +++ b/docs/devel/index.rst @@ -37,3 +37,4 @@ Contents: clocks qom block-coroutine-wrapper + multi-process diff --git a/docs/devel/multi-process.rst b/docs/devel/multi-process.rst new file mode 100644 index 0000000000..69699329d6 --- /dev/null +++ b/docs/devel/multi-process.rst @@ -0,0 +1,966 @@ +This is the design document for multi-process QEMU. It does not +necessarily reflect the status of the current implementation, which +may lack features or be considerably different from what is described +in this document. This document is still useful as a description of +the goals and general direction of this feature. + +Please refer to the following wiki for the latest details: +https://wiki.qemu.org/Features/MultiProcessQEMU + +Multi-process QEMU +================== + +QEMU is often used as the hypervisor for virtual machines running in the +Oracle cloud. Since one of the advantages of cloud computing is the +ability to run many VMs from different tenants in the same cloud +infrastructure, a guest that compromised its hypervisor could +potentially use the hypervisor's access privileges to access data it is +not authorized for. + +QEMU can be susceptible to security attacks because it is a large, +monolithic program that provides many features to the VMs it services. +Many of these features can be configured out of QEMU, but even a reduced +configuration QEMU has a large amount of code a guest can potentially +attack. Separating QEMU reduces the attack surface by helping to +limit each component in the system to accessing only the resources that +it needs to perform its job.
+ +QEMU services +------------- + +QEMU can be broadly described as providing three main services. One is a +VM control point, where VMs can be created, migrated, re-configured, and +destroyed. A second is to emulate the CPU instructions within the VM, +often accelerated by HW virtualization features such as Intel's VT +extensions. Finally, it provides IO services to the VM by emulating HW +IO devices, such as disk and network devices. + +A multi-process QEMU +~~~~~~~~~~~~~~~~~~~~ + +A multi-process QEMU involves separating QEMU services into separate +host processes. Each of these processes can be given only the privileges +it needs to provide its service, e.g., a disk service could be given +access only to the disk images it provides, and not be allowed to +access other files, or any network devices. An attacker who compromised +this service would not be able to use this exploit to access files or +devices beyond what the disk service was given access to. + +A QEMU control process would remain, but in multi-process mode, it would +have no direct interfaces to the VM. During VM execution, it would still +provide the user interface to hot-plug devices or live migrate the VM. + +A first step in creating a multi-process QEMU is to separate IO services +from the main QEMU program, which would continue to provide CPU +emulation; i.e., the control process would also be the CPU emulation +process. In a later phase, CPU emulation could be separated from the +control process. + +Separating IO services +---------------------- + +Separating IO services into individual host processes is a good place to +begin for a couple of reasons. One is that the sheer number of IO devices +QEMU can emulate provides a large surface of interfaces which could +potentially be exploited, and, indeed, have been a source of exploits in +the past. Another is that the modular nature of QEMU device emulation +code provides interface points where the QEMU functions that perform +device emulation can be separated from the QEMU functions that manage the +emulation of guest CPU instructions. The devices emulated in the separate +process are referred to as remote devices. + +QEMU device emulation +~~~~~~~~~~~~~~~~~~~~~ + +QEMU uses an object-oriented SW architecture for device emulation code. +Configured objects are all compiled into the QEMU binary, then objects +are instantiated by name when used by the guest VM. For example, the +code to emulate a device named "foo" is always present in QEMU, but its +instantiation code is only run when the device is included in the target +VM (e.g., via the QEMU command line as *-device foo*). + +The object model is hierarchical, so device emulation code names its +parent object (such as "pci-device" for a PCI device) and QEMU will +instantiate a parent object before calling the device's instantiation +code. + +Current separation models +~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to separate the device emulation code from the CPU emulation +code, the device object code must run in a different process. There are +a couple of existing QEMU features that can run emulation code +separately from the main QEMU process. These are examined below. + +vhost user model +^^^^^^^^^^^^^^^^ + +Virtio guest device drivers can be connected to vhost user applications +in order to perform their IO operations.
This model uses special virtio +device drivers in the guest and vhost user device objects in QEMU, but +once the QEMU vhost user code has configured the vhost user application, +mission-mode IO is performed by the application. The vhost user +application is a daemon process that can be contacted via a known UNIX +domain socket. + +vhost socket +'''''''''''' + +As mentioned above, one of the tasks of the vhost device object within +QEMU is to contact the vhost application and send it configuration +information about this device instance. As part of the configuration +process, the application can also be sent other file descriptors over +the socket, which then can be used by the vhost user application in +various ways, some of which are described below. + +vhost MMIO store acceleration +''''''''''''''''''''''''''''' + +VMs are often run using HW virtualization features via the KVM kernel +driver. This driver allows QEMU to accelerate the emulation of guest CPU +instructions by running the guest in a virtual HW mode. When the guest +executes instructions that cannot be executed by virtual HW mode, +execution returns to the KVM driver so it can inform QEMU to emulate the +instructions in SW. + +One of the events that can cause a return to QEMU is when a guest device +driver accesses an IO location. QEMU then dispatches the memory +operation to the corresponding QEMU device object. In the case of a +vhost user device, the memory operation would need to be sent over a +socket to the vhost application. This path is accelerated by the QEMU +virtio code by setting up an eventfd file descriptor through which the +vhost application can receive MMIO store notifications directly from the +KVM driver, instead of needing them to be sent to the QEMU process first. + +vhost interrupt acceleration +'''''''''''''''''''''''''''' + +Another optimization used by the vhost application is the ability to +directly inject interrupts into the VM via the KVM driver, again, +bypassing the need to send the interrupt back to the QEMU process first. +The QEMU virtio setup code configures the KVM driver with an eventfd +that triggers the device interrupt in the guest when the eventfd is +written. This irqfd file descriptor is then passed to the vhost user +application program. + +vhost access to guest memory +'''''''''''''''''''''''''''' + +The vhost application is also allowed to directly access guest memory, +instead of needing to send the data as messages to QEMU. This is also +done with file descriptors sent to the vhost user application by QEMU. +These descriptors can be passed to ``mmap()`` by the vhost application +to map the guest address space into the vhost application. + +IOMMUs introduce another level of complexity, since the address given to +the guest virtio device to DMA to or from is not a guest physical +address. This case is handled by having vhost code within QEMU register +as a listener for IOMMU mapping changes. The vhost application maintains +a cache of IOMMU translations: sending translation requests back to +QEMU on cache misses, and in turn receiving flush requests from QEMU +when mappings are purged. + +applicability to device separation +'''''''''''''''''''''''''''''''''' + +Much of the vhost model can be re-used by separated device emulation. In +particular, the ideas of using a socket between QEMU and the device +emulation application, using a file descriptor to inject interrupts into +the VM via KVM, and allowing the application to ``mmap()`` guest memory +should be re-used.
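
To make this reuse concrete, the sketch below shows, in plain C outside of QEMU, the three vhost-derived mechanisms an emulation process could build on, assuming the relevant file descriptors have already been received over the UNIX domain socket (e.g., via ``SCM_RIGHTS``); the helper names are illustrative, not an existing API::

   #include <stdint.h>
   #include <sys/mman.h>
   #include <unistd.h>

   /* 1. Map a region of guest RAM from a file descriptor sent by QEMU. */
   static void *map_guest_region(int mem_fd, uint64_t offset, size_t size)
   {
       return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   mem_fd, offset);
   }

   /* 2. Inject an interrupt by writing to an irqfd that QEMU has already
    *    wired into the KVM driver. */
   static void inject_interrupt(int irq_fd)
   {
       uint64_t val = 1;        /* eventfds are written as a uint64_t */
       (void)write(irq_fd, &val, sizeof(val));
   }

   /* 3. Block until the guest performs an MMIO store to a doorbell that
    *    KVM signals through an ioeventfd. */
   static void wait_for_doorbell(int io_fd)
   {
       uint64_t count;
       (void)read(io_fd, &count, sizeof(count));
   }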
+ +There are, however, some notable differences between how a vhost +application works and the needs of separated device emulation. The most +basic is that vhost uses custom virtio device drivers which always +trigger IO with MMIO stores. A separated device emulation model must +work with existing IO device models and guest device drivers. MMIO loads +break vhost store acceleration since they are synchronous - guest +progress cannot continue until the load has been emulated. By contrast, +stores are asynchronous; the guest can continue after the store event +has been sent to the vhost application. + +Another difference is that in the vhost user model, a single daemon can +support multiple QEMU instances. This is contrary to the security regime +desired, in which the emulation application should only be allowed to +access the files or devices the VM it's running on behalf of can access. + +qemu-io model +^^^^^^^^^^^^^ + +Qemu-io is a test harness used to test changes to the QEMU block backend +object code (e.g., the code that implements disk images for disk driver +emulation). Qemu-io is not a device emulation application per se, but it +does compile the QEMU block objects into a separate binary from the main +QEMU one. This could be useful for disk device emulation, since its +emulation applications will need to include the QEMU block objects. + +New separation model based on proxy objects +------------------------------------------- + +A different model based on proxy objects in the QEMU program +communicating with remote emulation programs could provide separation +while minimizing the changes needed to the device emulation code. The +rest of this section is a discussion of how a proxy object model would +work. + +Remote emulation processes +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The remote emulation process will run the QEMU object hierarchy without +modification. The device emulation objects will also be based on the +QEMU code, because for anything but the simplest device, it would not be +tractable to re-implement both the object model and the many device +backends that QEMU has. + +The processes will communicate with the QEMU process over UNIX domain +sockets. The processes can be executed either as standalone processes, +or be executed by QEMU. In both cases, the host backends the emulation +processes will provide are specified on their command lines, as they +would be for QEMU. For example: + +:: + + disk-proc -blockdev driver=file,node-name=file0,filename=disk-file0 \ + -blockdev driver=qcow2,node-name=drive0,file=file0 + +would indicate process *disk-proc* uses a qcow2 emulated disk named +*file0* as its backend. + +Emulation processes may emulate more than one guest controller. A common +configuration might be to put all controllers of the same device class +(e.g., disk, network, etc.) in a single process, so that all backends of +the same type can be managed by a single QMP monitor. + +communication with QEMU +^^^^^^^^^^^^^^^^^^^^^^^ + +The first argument to the remote emulation process will be a Unix domain +socket that connects with the Proxy object. This is a required argument. + +:: + + disk-proc <socket number> <backend list> + +remote process QMP monitor +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Remote emulation processes can be monitored via QMP, similar to QEMU +itself. The QMP monitor socket is specified the same as for a QEMU +process: + +:: + + disk-proc -qmp unix:/tmp/disk-mon,server + +can be monitored over the UNIX socket path */tmp/disk-mon*.
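
As an illustration of the bootstrap described above, a remote process entry point might look like the following sketch; the argument layout follows the *disk-proc* examples, while ``serve_requests()`` is a hypothetical stand-in for the real message loop::

   #include <errno.h>
   #include <fcntl.h>
   #include <stdio.h>
   #include <stdlib.h>
   #include <string.h>

   /* Hypothetical: parse backend options and serve messages on the fd. */
   void serve_requests(int com_fd, int nbackends, char **backends);

   int main(int argc, char *argv[])
   {
       if (argc < 2) {
           fprintf(stderr, "usage: %s <socket fd> <backend list>\n", argv[0]);
           return EXIT_FAILURE;
       }

       /* The first argument is an already-connected UNIX domain socket
        * inherited from, or passed by, the QEMU process. */
       int com_fd = atoi(argv[1]);
       if (fcntl(com_fd, F_GETFD) == -1) {
           fprintf(stderr, "fd %d: %s\n", com_fd, strerror(errno));
           return EXIT_FAILURE;
       }

       /* Remaining arguments configure host backends, as they would for
        * QEMU; the process then serves requests on the socket. */
       serve_requests(com_fd, argc - 2, argv + 2);
       return EXIT_SUCCESS;
   }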
+ +QEMU command line +~~~~~~~~~~~~~~~~~ + +Each remote device emulated in a remote process on the host is +represented as a *-device* of type *pci-proxy-dev*. A socket +sub-option to this option specifies the Unix socket that connects +to the remote process. An *id* sub-option is required, and it should +be the same id as used in the remote process. + +:: + + qemu-system-x86_64 ... -device pci-proxy-dev,id=lsi0,socket=3 + +can be used to add a device emulated in a remote process. + + +QEMU management of remote processes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +QEMU is not aware of the type of the remote PCI device. It is +a pass-through device as far as QEMU is concerned. + +communication with emulation process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +primary channel +''''''''''''''' + +The primary channel (referred to as com in the code) is used to bootstrap +the remote process. It is also used to pass on device-agnostic commands +like reset. + +per-device channels +''''''''''''''''''' + +Each remote device communicates with QEMU using a dedicated communication +channel. The proxy object sets up this channel using the primary +channel during its initialization. + +QEMU device proxy objects +~~~~~~~~~~~~~~~~~~~~~~~~~ + +QEMU has an object model based on sub-classes inherited from the +"object" super-class. The sub-classes that are of interest here are the +"device" and "bus" sub-classes whose child sub-classes make up the +device tree of a QEMU emulated system. + +The proxy object model will use device proxy objects to replace the +device emulation code within the QEMU process. These objects will live +in the same place in the object and bus hierarchies as the objects they +replace; i.e., the proxy object for an LSI SCSI controller will be a +sub-class of the "pci-device" class, and will have the same PCI bus +parent and the same SCSI bus child objects as the LSI controller object +it replaces. + +It is worth noting that the same proxy object is used to mediate with +all types of remote PCI devices. + +object initialization +^^^^^^^^^^^^^^^^^^^^^ + +The Proxy device objects are initialized in the exact same manner in +which any other QEMU device would be initialized. + +In addition, the Proxy objects perform the following two tasks: +- Parse the "socket" sub-option and connect to the remote process +using this channel +- Use the "id" sub-option to connect to the emulated device on the +separate process + +class\_init +''''''''''' + +The ``class_init()`` method of a proxy object will, in general, behave +similarly to the object it replaces, including setting any static +properties and methods needed by the proxy. + +instance\_init / realize +'''''''''''''''''''''''' + +The ``instance_init()`` and ``realize()`` functions would only need to +perform tasks related to being a proxy, such as registering its own +MMIO handlers, or creating a child bus that other proxy devices can be +attached to later. + +Other tasks will be device-specific. For example, PCI device objects +will initialize the PCI config space in order to make a valid PCI device +tree within the QEMU process. + +address space registration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Most devices are driven by guest device driver accesses to IO addresses +or ports. The QEMU device emulation code uses QEMU's memory region +function calls (such as ``memory_region_init_io()``) to add callback +functions that QEMU will invoke when the guest accesses the device's +areas of the IO address space. When a guest driver does access the +device, the VM will exit HW virtualization mode and return to QEMU, +which will then look up and execute the corresponding callback function. + +A proxy object would need to mirror the memory region calls the actual +device emulator would perform in its initialization code, but with its +own callbacks. When invoked by QEMU as a result of a guest IO operation, +they will forward the operation to the device emulation process.
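
A sketch of such mirrored registration follows, using the real ``memory_region_init_io()`` and ``pci_register_bar()`` APIs; the ``ProxyDevice`` layout and the ``forward_mmio_to_remote()`` helper are illustrative stand-ins for the actual code in ``hw/remote/proxy.c``::

   #include "qemu/osdep.h"
   #include "exec/memory.h"
   #include "hw/pci/pci.h"

   /* Illustrative proxy type; the real one lives in hw/remote/proxy.h. */
   typedef struct ProxyDevice {
       PCIDevice parent_obj;
       MemoryRegion bar[PCI_NUM_REGIONS];
       /* ... per-device communication channel state ... */
   } ProxyDevice;

   /* Hypothetical: send the access over the per-device channel and,
    * for loads, wait for the reply. */
   static uint64_t forward_mmio_to_remote(void *opaque, hwaddr addr,
                                          unsigned size, bool is_write,
                                          uint64_t val);

   static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
   {
       return forward_mmio_to_remote(opaque, addr, size, false, 0);
   }

   static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
                               unsigned size)
   {
       forward_mmio_to_remote(opaque, addr, size, true, val);
   }

   static const MemoryRegionOps proxy_bar_ops = {
       .read = proxy_bar_read,
       .write = proxy_bar_write,
       .endianness = DEVICE_NATIVE_ENDIAN,
   };

   /* Called from the proxy's realize(): claim a BAR whose callbacks
    * forward guest accesses instead of emulating the device locally. */
   static void proxy_register_bar(ProxyDevice *pdev, int bar_num,
                                  uint64_t size)
   {
       memory_region_init_io(&pdev->bar[bar_num], OBJECT(pdev),
                             &proxy_bar_ops, pdev, "proxy-bar", size);
       pci_register_bar(&pdev->parent_obj, bar_num,
                        PCI_BASE_ADDRESS_SPACE_MEMORY,
                        &pdev->bar[bar_num]);
   }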
PCI config space +^^^^^^^^^^^^^^^^ + +PCI devices also have a configuration space that can be accessed by the +guest driver. Guest accesses to this space are not handled by the device +emulation object, but by its PCI parent object. Much of this space is +read-only, but certain registers (especially BAR and MSI-related ones) +need to be propagated to the emulation process. + +PCI parent proxy +'''''''''''''''' + +One way to propagate guest PCI config accesses is to create a +"pci-device-proxy" class that can serve as the parent of a PCI device +proxy object. This class's parent would be "pci-device" and it would +override the PCI parent's ``config_read()`` and ``config_write()`` +methods with ones that forward these operations to the emulation +program. + +interrupt receipt +^^^^^^^^^^^^^^^^^ + +A proxy for a device that generates interrupts will need to create a +socket to receive interrupt indications from the emulation process. An +incoming interrupt indication would then be sent up to its bus parent to +be injected into the guest. For example, a PCI device object may use +``pci_set_irq()``. + +live migration +^^^^^^^^^^^^^^ + +The proxy will register to save and restore any *vmstate* it needs over +a live migration event. The device proxy does not need to manage the +remote device's *vmstate*; that will be handled by the remote process +proxy (see below). + +QEMU remote device operation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generic device operations, such as DMA, will be performed by the remote +process proxy by sending messages to the remote process. + +DMA operations +^^^^^^^^^^^^^^ + +DMA operations would be handled much as vhost applications handle them. +One of the initial messages sent to the emulation process is a guest +memory table. Each entry in this table consists of a file descriptor and +size that the emulation process can ``mmap()`` to directly access guest +memory, similar to ``vhost_user_set_mem_table()``. Note that guest memory +must be backed by file descriptors, such as when QEMU is given the +*-mem-path* command line option. + +IOMMU operations +^^^^^^^^^^^^^^^^ + +When the emulated system includes an IOMMU, the remote process proxy in +QEMU will need to create a socket for IOMMU requests from the emulation +process. It will handle those requests with an +``address_space_get_iotlb_entry()`` call. In order to handle IOMMU +unmaps, the remote process proxy will also register as a listener on the +device's DMA address space. When an IOMMU memory region is created +within the DMA address space, an IOMMU notifier for unmaps will be added +to the memory region that will forward unmaps to the emulation process +over the IOMMU socket. + +device hot-plug via QMP +^^^^^^^^^^^^^^^^^^^^^^^ + +A QMP "device\_add" command can add a device emulated by a remote +process. It will also take a "rid" option, just as the +*-device* command line option does. The remote process may either be one +started at QEMU startup, or be one added by the "add-process" QMP +command described above.
+
+live migration
+^^^^^^^^^^^^^^
+
+The remote process proxy will also register for live migration
+notifications with ``vmstate_register()``. When called to save state,
+the proxy will send the remote process a secondary socket file
+descriptor to save the remote process's device *vmstate* over. The
+incoming byte stream length and data will be saved as the proxy's
+*vmstate*. When the proxy is resumed on its new host, this *vmstate*
+will be extracted, and a secondary socket file descriptor will be sent
+to the new remote process through which it receives the *vmstate* in
+order to restore the devices there.
+
+device emulation in remote process
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The parts of QEMU that the emulation program will need include the
+object model; the memory emulation objects; the device emulation objects
+of the targeted device, and any dependent devices; and, the device's
+backends. It will also need code to set up the machine environment,
+handle requests from the QEMU process, and route machine-level requests
+(such as interrupts or IOMMU mappings) back to the QEMU process.
+
+initialization
+^^^^^^^^^^^^^^
+
+The process initialization sequence will follow the same order QEMU
+itself uses. It will first initialize the backend objects, then the
+device emulation objects. The JSON descriptions sent by the QEMU process
+will drive which objects need to be created.
+
+- address spaces
+
+Before the device objects are created, the initial address spaces and
+memory regions must be configured with ``memory_map_init()``. This
+creates a RAM memory region object (*system\_memory*) and an IO memory
+region object (*system\_io*).
+
+- RAM
+
+RAM memory region creation will follow how ``pc_memory_init()`` creates
+them, but must use ``memory_region_init_ram_from_fd()`` instead of
+``memory_region_allocate_system_memory()``. The file descriptors needed
+will be supplied by the guest memory table from above. Those RAM regions
+would then be added to the *system\_memory* memory region with
+``memory_region_add_subregion()``.
+
+- PCI
+
+IO initialization will be driven by the JSON descriptions sent from the
+QEMU process. For a PCI device, a PCI bus will need to be created with
+``pci_root_bus_new()``, and a PCI memory region will need to be created
+and added to the *system\_memory* memory region with
+``memory_region_add_subregion_overlap()``. The overlap version is
+required for architectures where PCI memory overlaps with RAM memory.
+
+MMIO handling
+^^^^^^^^^^^^^
+
+The device emulation objects will use ``memory_region_init_io()`` to
+install their MMIO handlers, and ``pci_register_bar()`` to associate
+those handlers with a PCI BAR, as they do within QEMU currently.
+
+In order to use ``address_space_rw()`` in the emulation process to
+handle MMIO requests from QEMU, the PCI physical addresses must be the
+same in the QEMU process and the device emulation process. In order to
+accomplish that, guest BAR programming must also be forwarded from QEMU
+to the emulation process.
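+
+As a sketch of the emulation-process side, handling one forwarded MMIO
+request could then look like the fragment below. ``address_space_rw()``
+is the QEMU call named above; the *MMIOReqMsg* layout and the reply
+path are hypothetical::
+
+  static void process_mmio_request(MMIOReqMsg *msg)
+  {
+      uint8_t buf[8];
+
+      if (msg->op == MMIO_WRITE) {
+          memcpy(buf, &msg->data, msg->size);
+      }
+
+      /* msg->addr is a guest physical address; it is valid here because
+       * guest BAR programming is forwarded to the emulation process */
+      address_space_rw(&address_space_memory, msg->addr,
+                       MEMTXATTRS_UNSPECIFIED, buf, msg->size,
+                       msg->op == MMIO_WRITE);
+
+      if (msg->op == MMIO_READ) {
+          memcpy(&msg->data, buf, msg->size);
+          /* reply to the QEMU proxy with the load data (not shown) */
+      }
+  }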
+
+interrupt injection
+^^^^^^^^^^^^^^^^^^^
+
+When device emulation wants to inject an interrupt into the VM, the
+request climbs the device's bus object hierarchy until the point where a
+bus object knows how to signal the interrupt to the guest. The details
+depend on the type of interrupt being raised.
+
+- PCI pin interrupts
+
+On x86 systems, there is an emulated IOAPIC object attached to the root
+PCI bus object, and the root PCI object forwards interrupt requests to
+it. The IOAPIC object, in turn, calls the KVM driver to inject the
+corresponding interrupt into the VM. The simplest way to handle this in
+an emulation process would be to set up the root PCI bus driver (via
+``pci_bus_irqs()``) to send an interrupt request back to the QEMU
+process, and have the device proxy object reflect it up the PCI tree
+there.
+
+- PCI MSI/X interrupts
+
+PCI MSI/X interrupts are implemented in HW as DMA writes to a
+CPU-specific PCI address. In QEMU on x86, a KVM APIC object receives
+these DMA writes, then calls into the KVM driver to inject the interrupt
+into the VM. A simple emulation process implementation would be to send
+the MSI DMA address from QEMU as a message at initialization, then
+install an address space handler at that address which forwards the MSI
+message back to QEMU.
+
+DMA operations
+^^^^^^^^^^^^^^
+
+When an emulation object wants to DMA into or out of guest memory, it
+first must use ``dma_memory_map()`` to convert the DMA address to a
+local virtual address. The emulation process memory region objects set
+up above will be used to translate the DMA address to a local virtual
+address the device emulation code can access.
+
+IOMMU
+^^^^^
+
+When an IOMMU is in use in QEMU, DMA translation uses IOMMU memory
+regions to translate the DMA address to a guest physical address before
+that physical address can be translated to a local virtual address. The
+emulation process will need similar functionality.
+
+- IOTLB cache
+
+The emulation process will maintain a cache of recent IOMMU translations
+(the IOTLB). When the translate() callback of an IOMMU memory region is
+invoked, the IOTLB cache will be searched for an entry that will map the
+DMA address to a guest PA. On a cache miss, a message will be sent back
+to QEMU requesting the corresponding translation entry, which will both
+be used to return a guest address and be added to the cache.
+
+- IOTLB purge
+
+The IOMMU emulation will also need to act on unmap requests from QEMU.
+These happen when the guest IOMMU driver purges an entry from the
+guest's translation table.
+
+live migration
+^^^^^^^^^^^^^^
+
+When a remote process receives a live migration indication from QEMU, it
+will set up a channel using the received file descriptor with
+``qio_channel_socket_new_fd()``. This channel will be used to create a
+*QEMUFile* that can be passed to ``qemu_save_device_state()`` to send
+the process's device state back to QEMU. This method will be reversed on
+restore: the channel will be passed to ``qemu_loadvm_state()`` to
+restore the device state.
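+
+A minimal sketch of the save side follows. It assumes the helpers named
+above plus ``qemu_fopen_channel_output()`` (the channel-to-*QEMUFile*
+wrapper from this period); error handling is omitted::
+
+  /* fd is the secondary socket descriptor received from QEMU */
+  static void remote_save_device_state(int fd)
+  {
+      QIOChannelSocket *sioc = qio_channel_socket_new_fd(fd, &error_fatal);
+      QEMUFile *f = qemu_fopen_channel_output(QIO_CHANNEL(sioc));
+
+      /* stream the process's device vmstate back to QEMU */
+      qemu_save_device_state(f);
+      qemu_fclose(f);
+  }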
+
+Accelerating device emulation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The messages that are required to be sent between QEMU and the emulation
+process can add considerable latency to IO operations. The optimizations
+described below attempt to ameliorate this effect by allowing the
+emulation process to communicate directly with the kernel KVM driver.
+The KVM file descriptors created would be passed to the emulation process
+via initialization messages, much like the guest memory table is done.
+
+MMIO acceleration
+^^^^^^^^^^^^^^^^^
+
+Vhost user applications can receive guest virtio driver stores directly
+from KVM. The issue with the eventfd mechanism used by vhost user is
+that it does not pass any data with the event indication, so it cannot
+handle guest loads or guest stores that carry store data. This concept
+could, however, be expanded to cover more cases.
+
+The expanded idea would require a new type of KVM device:
+*KVM\_DEV\_TYPE\_USER*. This device has two file descriptors: a master
+descriptor that QEMU can use for configuration, and a slave descriptor
+that the emulation process can use to receive MMIO notifications. QEMU
+would create both descriptors using the KVM driver, and pass the slave
+descriptor to the emulation process via an initialization message.
+
+data structures
+^^^^^^^^^^^^^^^
+
+- guest physical range
+
+The guest physical range structure describes the address range that a
+device will respond to. It includes the base and length of the range, as
+well as which bus the range resides on (e.g., on an x86 machine, it can
+specify whether the range refers to memory or IO addresses).
+
+A device can have multiple physical address ranges it responds to (e.g.,
+a PCI device can have multiple BARs), so the structure will also include
+an enumerated identifier to specify which of the device's ranges is
+being referred to.
+
++--------+----------------------------+
+| Name   | Description                |
++========+============================+
+| addr   | range base address         |
++--------+----------------------------+
+| len    | range length               |
++--------+----------------------------+
+| bus    | addr type (memory or IO)   |
++--------+----------------------------+
+| id     | range ID (e.g., PCI BAR)   |
++--------+----------------------------+
+
+- MMIO request structure
+
+This structure describes an MMIO operation. It includes which guest
+physical range the MMIO was within, the offset within that range, the
+MMIO type (e.g., load or store), and its length and data. It also
+includes a sequence number that can be used to reply to the MMIO, and
+the CPU that issued the MMIO.
+
++----------+------------------------+
+| Name     | Description            |
++==========+========================+
+| rid      | range MMIO is within   |
++----------+------------------------+
+| offset   | offset within *rid*    |
++----------+------------------------+
+| type     | e.g., load or store    |
++----------+------------------------+
+| len      | MMIO length            |
++----------+------------------------+
+| data     | store data             |
++----------+------------------------+
+| seq      | sequence ID            |
++----------+------------------------+
+
+- MMIO request queues
+
+MMIO request queues are FIFO arrays of MMIO request structures. There
+are two queues: the pending queue holds MMIOs that haven't been read by
+the emulation program, and the sent queue holds MMIOs that haven't been
+acknowledged. The main use of the second queue is to validate MMIO
+replies from the emulation program.
+
+- scoreboard
+
+Each CPU in the VM is emulated in QEMU by a separate thread, so multiple
+MMIOs may be waiting to be consumed by an emulation program and multiple
+threads may be waiting for MMIO replies. The scoreboard would contain a
+wait queue and sequence number for the per-CPU threads, allowing them to
+be individually woken when the MMIO reply is received from the emulation
+program. It also tracks the number of posted MMIO stores to the device
+that haven't been replied to, in order to satisfy the PCI constraint
+that a load to a device will not complete until all previous stores to
+that device have been completed.
+
+- device shadow memory
+
+Some MMIO loads do not have device side-effects. These MMIOs can be
+completed without sending an MMIO request to the emulation program if
+the emulation program shares a shadow image of the device's memory image
+with the KVM driver.
+
+The emulation program will ask the KVM driver to allocate memory for the
+shadow image, and will then use ``mmap()`` to directly access it. The
+emulation program can control KVM access to the shadow image by sending
+KVM an access map telling it which areas of the image have no
+side-effects (and can be completed immediately), and which require an
+MMIO request to the emulation program. The access map can also inform
+the KVM driver which size accesses are allowed to the image.
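+
+Rendered as C, the guest physical range and MMIO request structures
+tabulated above might look as follows (field ordering and widths are
+assumptions for illustration, not part of the proposal)::
+
+  /* describes one guest physical address range a device responds to */
+  struct kvm_user_pa_range {
+      __u64 addr;    /* range base address */
+      __u64 len;     /* range length */
+      __u32 bus;     /* addr type (memory or IO) */
+      __u32 id;      /* range ID (e.g., PCI BAR) */
+  };
+
+  /* describes one MMIO operation forwarded to the emulation program */
+  struct kvm_user_mmio_req {
+      __u32 rid;     /* range the MMIO is within */
+      __u64 offset;  /* offset within that range */
+      __u32 type;    /* load or store */
+      __u32 len;     /* MMIO length */
+      __u64 data;    /* store data */
+      __u64 seq;     /* sequence ID used to reply to the MMIO */
+  };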
+
+master descriptor
+^^^^^^^^^^^^^^^^^
+
+The master descriptor is used by QEMU to configure the new KVM device.
+The descriptor would be returned by the KVM driver when QEMU issues a
+*KVM\_CREATE\_DEVICE* ``ioctl()`` with a *KVM\_DEV\_TYPE\_USER* type.
+
+KVM\_DEV\_TYPE\_USER device ops
+'''''''''''''''''''''''''''''''''
+
+The *KVM\_DEV\_TYPE\_USER* operations vector will be registered by a
+``kvm_register_device_ops()`` call when the KVM system is initialized by
+``kvm_init()``. These device ops are called by the KVM driver when QEMU
+executes certain ``ioctl()`` operations on its KVM file descriptor. They
+include:
+
+- create
+
+This routine is called when QEMU issues a *KVM\_CREATE\_DEVICE*
+``ioctl()`` on its per-VM file descriptor. It will allocate and
+initialize a KVM user device specific data structure, and assign the
+*kvm\_device* private field to it.
+
+- ioctl
+
+This routine is invoked when QEMU issues an ``ioctl()`` on the master
+descriptor. The ``ioctl()`` commands supported are defined by the KVM
+device type. *KVM\_DEV\_TYPE\_USER* ones will need several commands:
+
+*KVM\_DEV\_USER\_SLAVE\_FD* creates the slave file descriptor that will
+be passed to the device emulation program. Only one slave can be created
+by each master descriptor. The file operations performed by this
+descriptor are described below.
+
+The *KVM\_DEV\_USER\_PA\_RANGE* command configures a guest physical
+address range that the slave descriptor will receive MMIO notifications
+for. The range is specified by a guest physical range structure
+argument. For buses that assign addresses to devices dynamically, this
+command can be executed while the guest is running, such as when a
+guest changes a device's PCI BAR registers.
+
+*KVM\_DEV\_USER\_PA\_RANGE* will use ``kvm_io_bus_register_dev()`` to
+register *kvm\_io\_device\_ops* callbacks to be invoked when the guest
+performs an MMIO operation within the range. When a range is changed,
+``kvm_io_bus_unregister_dev()`` is used to remove the previous
+instantiation.
+
+*KVM\_DEV\_USER\_TIMEOUT* will configure a timeout value that specifies
+how long KVM will wait for the emulation process to respond to an MMIO
+indication.
+
+- destroy
+
+This routine is called when the VM instance is destroyed. It will need
+to destroy the slave descriptor and free any memory allocated by the
+driver, as well as the *kvm\_device* structure itself.
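+
+To make the flow concrete, QEMU's use of the proposed master descriptor
+might look like the following fragment. None of the *KVM\_DEV\_USER\_\**
+commands exist today; they, the *KVM\_USER\_BUS\_MMIO* value, and the
+surrounding variables are part of this proposal or assumed for
+illustration. Only ``KVM_CREATE_DEVICE`` and ``struct
+kvm_create_device`` are the existing KVM interface::
+
+  struct kvm_create_device cd = { .type = KVM_DEV_TYPE_USER };
+
+  /* create the device; the master descriptor is returned in cd.fd */
+  ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
+
+  /* create the slave descriptor to hand to the emulation process */
+  int slave_fd = ioctl(cd.fd, KVM_DEV_USER_SLAVE_FD, 0);
+
+  /* route MMIO for the device's BAR 0 to the slave descriptor */
+  struct kvm_user_pa_range bar0 = {
+      .addr = bar0_gpa,
+      .len  = bar0_size,
+      .bus  = KVM_USER_BUS_MMIO,
+      .id   = 0,
+  };
+  ioctl(cd.fd, KVM_DEV_USER_PA_RANGE, &bar0);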
+
+slave descriptor
+^^^^^^^^^^^^^^^^
+
+The slave descriptor will have its own file operations vector, which
+responds to system calls on the descriptor performed by the device
+emulation program.
+
+- read
+
+A read returns any pending MMIO requests from the KVM driver as MMIO
+request structures. Multiple structures can be returned if there are
+multiple MMIO operations pending. The MMIO requests are moved from the
+pending queue to the sent queue, and if there are threads waiting for
+space in the pending queue to add new MMIO operations, they will be
+woken here.
+
+- write
+
+A write also consists of a set of MMIO requests. They are compared to
+the MMIO requests in the sent queue. Matches are removed from the sent
+queue, and any threads waiting for the reply are woken. If a store is
+removed, then the number of posted stores in the per-CPU scoreboard is
+decremented. When the number is zero, and a non-side-effect load was
+waiting for posted stores to complete, the load is continued.
+
+- ioctl
+
+There are several ``ioctl()``\ s that can be performed on the slave
+descriptor.
+
+A *KVM\_DEV\_USER\_SHADOW\_SIZE* ``ioctl()`` causes the KVM driver to
+allocate memory for the shadow image. This memory can later be
+``mmap()``\ ed by the emulation process to share the emulation's view of
+device memory with the KVM driver.
+
+A *KVM\_DEV\_USER\_SHADOW\_CTRL* ``ioctl()`` controls access to the
+shadow image. It will send the KVM driver a shadow control map, which
+specifies which areas of the image can complete guest loads without
+sending the load request to the emulation program. It will also specify
+the size of load operations that are allowed.
+
+- poll
+
+An emulation program will use the ``poll()`` call with a *POLLIN* flag
+to determine if there are MMIO requests waiting to be read. The poll
+will return when the pending MMIO request queue is not empty.
+
+- mmap
+
+This call allows the emulation program to directly access the shadow
+image allocated by the KVM driver. As device emulation updates device
+memory, changes with no side-effects will be reflected in the shadow,
+and the KVM driver can satisfy guest loads from the shadow image without
+needing to wait for the emulation program.
+
+kvm\_io\_device ops
+^^^^^^^^^^^^^^^^^^^
+
+Each KVM per-CPU thread can handle MMIO operations on behalf of the
+guest VM. KVM will use the MMIO's guest physical address to search for a
+matching *kvm\_io\_device* to see if the MMIO can be handled by the KVM
+driver instead of exiting back to QEMU. If a match is found, the
+corresponding callback will be invoked.
+
+- read
+
+This callback is invoked when the guest performs a load to the device.
+Loads with side-effects must be handled synchronously, with the KVM
+driver putting the QEMU thread to sleep waiting for the emulation
+process reply before restarting the guest. Loads that do not have
+side-effects may be optimized by satisfying them from the shadow image,
+if there are no outstanding stores to the device by this CPU. PCI memory
+ordering demands that a load cannot complete before all older stores to
+the same device have been completed.
+
+- write
+
+Stores can be handled asynchronously unless the pending MMIO request
+queue is full. In this case, the QEMU thread must sleep waiting for
+space in the queue. Stores will increment the number of posted stores in
+the per-CPU scoreboard, in order to implement the PCI ordering
+constraint above.
+
+interrupt acceleration
+^^^^^^^^^^^^^^^^^^^^^^
+
+This performance optimization would work much like a vhost user
+application does, where the QEMU process sets up *eventfds* that cause
+the device's corresponding interrupt to be triggered by the KVM driver.
+These irq file descriptors are sent to the emulation process at
+initialization, and are used when the emulation code raises a device
+interrupt.
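+
+Raising an interrupt from the emulation process then reduces to writing
+a count to the corresponding eventfd, which the KVM irqfd binding turns
+into an injected interrupt. A minimal sketch (the fall-back path is an
+assumption)::
+
+  #include <stdint.h>
+  #include <unistd.h>
+
+  static void remote_raise_irq(int irq_fd)
+  {
+      uint64_t val = 1;
+
+      /* a non-zero eventfd count triggers the bound irqfd in KVM */
+      if (write(irq_fd, &val, sizeof(val)) != sizeof(val)) {
+          /* signalling failed; fall back to a message to QEMU */
+      }
+  }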
+
+intx acceleration
+'''''''''''''''''
+
+Traditional PCI pin interrupts are level based, so, in addition to an
+irq file descriptor, a re-sampling file descriptor needs to be sent to
+the emulation program. This second file descriptor allows multiple
+devices sharing an irq to be notified when the interrupt has been
+acknowledged by the guest, so they can re-trigger the interrupt if their
+device has not de-asserted its interrupt.
+
+intx irq descriptor
+"""""""""""""""""""
+
+The irq descriptors are created by the proxy object using
+``event_notifier_init()`` to create the irq and re-sampling
+*eventfds*, and ``kvm_vm_ioctl(KVM_IRQFD)`` to bind them to an interrupt.
+The interrupt route can be found with
+``pci_device_route_intx_to_irq()``.
+
+intx routing changes
+""""""""""""""""""""
+
+Intx routing can be changed when the guest programs the APIC the device
+pin is connected to. The proxy object in QEMU will use
+``pci_device_set_intx_routing_notifier()`` to be informed of any guest
+changes to the route. This handler will broadly follow the VFIO
+interrupt logic to change the route: de-assigning the existing irq
+descriptor from its route, then assigning it the new route (see
+``vfio_intx_update()``).
+
+MSI/X acceleration
+''''''''''''''''''
+
+MSI/X interrupts are sent as DMA transactions to the host. The interrupt
+data contains a vector that is programmed by the guest. A device may have
+multiple MSI interrupts associated with it, so multiple irq descriptors
+may need to be sent to the emulation program.
+
+MSI/X irq descriptor
+""""""""""""""""""""
+
+This case will also follow the VFIO example. For each MSI/X interrupt,
+an *eventfd* is created, a virtual interrupt is allocated by
+``kvm_irqchip_add_msi_route()``, and the virtual interrupt is bound to
+the eventfd with ``kvm_irqchip_add_irqfd_notifier()``.
+
+MSI/X config space changes
+""""""""""""""""""""""""""
+
+The guest may dynamically update several MSI-related tables in the
+device's PCI config space. These include per-MSI interrupt enables and
+vector data. Additionally, MSI-X tables exist in device memory space, not
+config space. Much like the BAR case above, the proxy object must look
+at guest config space programming to keep the MSI interrupt state
+consistent between QEMU and the emulation program.
+
+--------------
+
+Disaggregated CPU emulation
+---------------------------
+
+After IO services have been disaggregated, a second phase would be to
+separate a process to handle CPU instruction emulation from the main
+QEMU control function. There are no object separation points for this
+code, so the first task would be to create one.
+
+Host access controls
+--------------------
+
+Separating QEMU relies on the host OS's access restriction mechanisms to
+enforce that the differing processes can only access the objects they
+are entitled to. There are a couple of types of mechanisms usually
+provided by general purpose OSs.
+
+Discretionary access control
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Discretionary access control allows each user to control who can access
+their files. In Linux, this type of control is usually too coarse for
+QEMU separation, since it only provides three separate access controls:
+one for the same user ID, the second for user IDs with the same group
+ID, and the third for all other user IDs. Each device instance would
+need a separate user ID to provide access control, which is likely to be
+unwieldy for dynamically created VMs.
+
+Mandatory access control
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Mandatory access control allows the OS to add an additional set of
+controls, enforced by the OS itself, on top of discretionary access
+control. It also adds other attributes to processes and files such as
+types, roles, and categories, and can establish rules for how processes
+and files can interact.
+
+Type enforcement
+^^^^^^^^^^^^^^^^
+
+Type enforcement assigns a *type* attribute to processes and files, and
+allows rules to be written on what operations a process with a given
+type can perform on a file with a given type. QEMU separation could take
+advantage of type enforcement by running the emulation processes with
+different types, both from the main QEMU process, and from the emulation
+processes of different classes of devices.
+
+For example, guest disk images and disk emulation processes could have
+types separate from the main QEMU process and non-disk emulation
+processes, and the type rules could prevent processes other than disk
+emulation ones from accessing guest disk images. Similarly, network
+emulation processes can have a type separate from the main QEMU process
+and non-network emulation processes, and only that type can access the
+host tun/tap device used to provide guest networking.
+
+Category enforcement
+^^^^^^^^^^^^^^^^^^^^
+
+Category enforcement assigns a set of numbers within a given range to
+the process or file. The process is granted access to the file if the
+process's set is a superset of the file's set. This enforcement can be
+used to separate multiple instances of devices in the same class.
+
+For example, if there are multiple disk devices provided to a guest,
+each device emulation process could be provisioned with a separate
+category. The different device emulation processes would not be able to
+access each other's backing disk images.
+
+Alternatively, categories could be used in lieu of the type enforcement
+scheme described above. In this scenario, different categories would be
+used to prevent device emulation processes in different classes from
+accessing resources assigned to other classes.
diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst
index 809af69725..209f9d8172 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -765,9 +765,6 @@ and hypothetical example follows:
 
     class MultipleMachines(Test):
-        """
-        :avocado: enable
-        """
 
         def test_multiple_machines(self):
             first_machine = self.get_vm()
             second_machine = self.get_vm()
diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.txt
index e9271eba5d..f15bf35bd1 100644
--- a/docs/interop/parallels.txt
+++ b/docs/interop/parallels.txt
@@ -208,7 +208,7 @@ of its data area are:
   28 - 31:    l1_size
               The number of entries in the L1 table of the bitmap.
 
-  variable:   l1 (64 * l1_size bytes)
+  variable:   l1_table (8 * l1_size bytes)
               L1 offset table (in bytes)
 
 A dirty bitmap is stored using a one-level structure for the mapping to host
diff --git a/docs/meson.build b/docs/meson.build
index bb14eaebd3..f84306ba7e 100644
--- a/docs/meson.build
+++ b/docs/meson.build
@@ -46,9 +46,11 @@ if build_docs
                    meson.source_root() / 'docs/sphinx/qmp_lexer.py',
                    qapi_gen_depends ]
 
+  have_ga = have_tools and config_host.has_key('CONFIG_GUEST_AGENT')
+
   man_pages = {
-    'qemu-ga.8': (have_tools ? 'man8' : ''),
-    'qemu-ga-ref.7': 'man7',
+    'qemu-ga.8': (have_ga ? 'man8' : ''),
+    'qemu-ga-ref.7': (have_ga ? 'man7' : ''),
     'qemu-qmp-ref.7': 'man7',
     'qemu-storage-daemon-qmp-ref.7': (have_tools ? 'man7' : ''),
     'qemu-img.1': (have_tools ? 'man1' : ''),
diff --git a/docs/papr-pef.txt b/docs/papr-pef.txt
new file mode 100644
index 0000000000..72550e9bf8
--- /dev/null
+++ b/docs/papr-pef.txt
@@ -0,0 +1,30 @@
+POWER (PAPR) Protected Execution Facility (PEF)
+===============================================
+
+Protected Execution Facility (PEF), also known as Secure Guest support,
+is a feature found on IBM POWER9 and POWER10 processors.
+
+If suitable firmware, including an Ultravisor, is installed, it adds
+an extra memory protection mode to the CPU. The Ultravisor manages a
+pool of secure memory which cannot be accessed by the hypervisor.
+
+When this feature is enabled in QEMU, a guest can use ultracalls to
+enter "secure mode". This transfers most of its memory to secure
+memory, where it cannot be eavesdropped by a compromised hypervisor.
+
+Launching
+---------
+
+To launch a guest which will be permitted to enter PEF secure mode:
+
+# ${QEMU} \
+    -object pef-guest,id=pef0 \
+    -machine confidential-guest-support=pef0 \
+    ...
+
+Live Migration
+--------------
+
+Live migration is not yet implemented for PEF guests. For
+consistency, we currently prevent migration if the PEF feature is
+enabled, whether or not the guest has actually entered secure mode.
diff --git a/docs/system/arm/versatile.rst b/docs/system/arm/versatile.rst
index 51221c30a4..2ae792bac3 100644
--- a/docs/system/arm/versatile.rst
+++ b/docs/system/arm/versatile.rst
@@ -27,3 +27,37 @@ The Arm Versatile baseboard is emulated with the following devices:
   devices.
 
 - PL181 MultiMedia Card Interface with SD card.
+
+Booting a Linux kernel
+----------------------
+
+Building a current Linux kernel with ``versatile_defconfig`` should be
+enough to get something running. Nowadays an out-of-tree build is
+recommended (and also useful if you build a lot of different targets).
+In the following example $BLD points to the build directory and $SRC
+points to the root of the Linux source tree. You can drop $SRC if you
+are running from there.
+
+.. code-block:: bash
+
+  $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- versatile_defconfig
+  $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf-
+
+You may want to enable some additional modules if you want to boot
+something from the SCSI interface::
+
+  CONFIG_PCI=y
+  CONFIG_PCI_VERSATILE=y
+  CONFIG_SCSI=y
+  CONFIG_SCSI_SYM53C8XX_2=y
+
+You can then boot with a command line like:
+
+.. code-block:: bash
+
+  $ qemu-system-arm -machine type=versatilepb \
+      -serial mon:stdio \
+      -drive if=scsi,driver=file,filename=debian-buster-armel-rootfs.ext4 \
+      -kernel zImage \
+      -dtb versatile-pb.dtb \
+      -append "console=ttyAMA0 ro root=/dev/sda"
diff --git a/docs/system/arm/vexpress.rst b/docs/system/arm/vexpress.rst
index 7f1bcbef07..3e3839e923 100644
--- a/docs/system/arm/vexpress.rst
+++ b/docs/system/arm/vexpress.rst
@@ -58,3 +58,31 @@ Other differences between the hardware and the QEMU model:
   ``vexpress-a15``, and have IRQs from 40 upwards. If a dtb is provided
   on the command line then QEMU will edit it to include suitable entries
   describing these transports for the guest.
+
+Booting a Linux kernel
+----------------------
+
+Building a current Linux kernel with ``multi_v7_defconfig`` should be
+enough to get something running. Nowadays an out-of-tree build is
+recommended (and also useful if you build a lot of different targets).
+In the following example $BLD points to the build directory and $SRC
+points to the root of the Linux source tree. You can drop $SRC if you
+are running from there.
+
+.. code-block:: bash
+
+  $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- multi_v7_defconfig
+  $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf-
+
+By default you will want to boot your rootfs off the sdcard interface.
+Your rootfs will need to be padded to the right size. With a suitable
+DTB you could also add devices to the virtio-mmio bus.
+
+.. code-block:: bash
+
+  $ qemu-system-arm -cpu cortex-a15 -smp 4 -m 4096 \
+      -machine type=vexpress-a15 -serial mon:stdio \
+      -drive if=sd,driver=file,filename=armel-rootfs.ext4 \
+      -kernel zImage \
+      -dtb vexpress-v2p-ca15-tc1.dtb \
+      -append "console=ttyAMA0 root=/dev/mmcblk0 ro"
diff --git a/docs/system/index.rst b/docs/system/index.rst
index d40f72c92b..625b494372 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -34,6 +34,7 @@ Contents:
    pr-manager
    targets
    security
+   multi-process
    deprecated
    removed-features
    build-platforms
diff --git a/docs/system/multi-process.rst b/docs/system/multi-process.rst
new file mode 100644
index 0000000000..46bb0cafc2
--- /dev/null
+++ b/docs/system/multi-process.rst
@@ -0,0 +1,64 @@
+Multi-process QEMU
+==================
+
+This document describes how to configure and use multi-process QEMU.
+For the design document, refer to docs/devel/qemu-multiprocess.
+
+1) Configuration
+----------------
+
+Multi-process support is enabled by default for targets that enable
+KVM.
+
+2) Usage
+--------
+
+Multi-process QEMU requires an orchestrator to launch.
+
+The following is a description of the command line used to launch
+mpqemu.
+
+* Orchestrator:
+
+  - The Orchestrator creates a unix socketpair
+
+  - It launches the remote process and passes one of the
+    sockets to it via command-line.
+
+  - It then launches QEMU and specifies the other socket as an option
+    to the Proxy device object
+
+* Remote Process:
+
+  - QEMU can enter remote process mode by using the "remote" machine
+    option.
+
+  - The orchestrator creates a "remote-object" with details about
+    the device and the file descriptor for the device
+
+  - The remaining options are no different from how one launches QEMU with
+    devices.
+
+  - An example command-line for the remote process is as follows:
+
+      /usr/bin/qemu-system-x86_64                                        \
+      -machine x-remote                                                  \
+      -device lsi53c895a,id=lsi0                                         \
+      -drive id=drive_image2,file=/build/ol7-nvme-test-1.qcow2           \
+      -device scsi-hd,id=drive2,drive=drive_image2,bus=lsi0.0,scsi-id=0  \
+      -object x-remote-object,id=robj1,devid=lsi1,fd=4,
+
+* QEMU:
+
+  - Since parts of the RAM are shared between QEMU & remote process, a
+    memory-backend-memfd is required to facilitate this, as follows:
+
+      -object memory-backend-memfd,id=mem,size=2G
+
+  - An "x-pci-proxy-dev" device is created for each of the PCI devices
+    emulated in the remote process. A "socket" sub-option specifies the
+    other end of the unix channel created by the orchestrator. The "id"
+    sub-option must be specified and should be the same as the "id"
+    specified for the remote PCI device
+
+  - An example command-line for QEMU is as follows:
+
+      -device x-pci-proxy-dev,id=lsi0,socket=3
diff --git a/docs/system/s390x/protvirt.rst b/docs/system/s390x/protvirt.rst
index 712974ad87..0f481043d9 100644
--- a/docs/system/s390x/protvirt.rst
+++ b/docs/system/s390x/protvirt.rst
@@ -22,15 +22,22 @@ If those requirements are met, the capability `KVM_CAP_S390_PROTECTED` will
 indicate that KVM can support PVMs on that LPAR.
-QEMU Settings -------------- +Running a Protected Virtual Machine +----------------------------------- -To indicate to the VM that it can transition into protected mode, the +To run a PVM you will need to select a CPU model which includes the `Unpack facility` (stfle bit 161 represented by the feature -`unpack`/`S390_FEAT_UNPACK`) needs to be part of the cpu model of -the VM. +`unpack`/`S390_FEAT_UNPACK`), and add these options to the command line:: + + -object s390-pv-guest,id=pv0 \ + -machine confidential-guest-support=pv0 + +Adding these options will: + +* Ensure the `unpack` facility is available +* Enable the IOMMU by default for all I/O devices +* Initialize the PV mechanism -All I/O devices need to use the IOMMU. Passthrough (vfio) devices are currently not supported. Host huge page backings are not supported. However guests can use huge diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst index fe41336dc5..ee862fa0bc 100644 --- a/docs/tools/qemu-nbd.rst +++ b/docs/tools/qemu-nbd.rst @@ -136,8 +136,8 @@ driver options if ``--image-opts`` is specified. .. option:: -e, --shared=NUM Allow up to *NUM* clients to share the device (default - ``1``). Safe for readers, but for now, consistency is not - guaranteed between multiple writers. + ``1``), 0 for unlimited. Safe for readers, but for now, + consistency is not guaranteed between multiple writers. .. option:: -t, --persistent @@ -2245,7 +2245,6 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx) { TaskState *ts; unsigned long offset, len, saved_auxv, auxv_len; - const char *mem; if (gdb_ctx->num_params < 2) { put_packet("E22"); @@ -2257,8 +2256,8 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx) ts = gdbserver_state.c_cpu->opaque; saved_auxv = ts->info->saved_auxv; auxv_len = ts->info->auxv_len; - mem = (const char *)(saved_auxv + offset); - if (offset > auxv_len) { + + if (offset >= auxv_len) { put_packet("E00"); return; } @@ -2269,12 +2268,20 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx) if (len < auxv_len - offset) { g_string_assign(gdbserver_state.str_buf, "m"); - memtox(gdbserver_state.str_buf, mem, len); } else { g_string_assign(gdbserver_state.str_buf, "l"); - memtox(gdbserver_state.str_buf, mem, auxv_len - offset); + len = auxv_len - offset; + } + + g_byte_array_set_size(gdbserver_state.mem_buf, len); + if (target_memory_rw_debug(gdbserver_state.g_cpu, saved_auxv + offset, + gdbserver_state.mem_buf->data, len, false)) { + put_packet("E14"); + return; } + memtox(gdbserver_state.str_buf, + (const char *)gdbserver_state.mem_buf->data, len); put_packet_binary(gdbserver_state.str_buf->str, gdbserver_state.str_buf->len, true); } diff --git a/hw/Kconfig b/hw/Kconfig index d4cec9e476..8ea26479c4 100644 --- a/hw/Kconfig +++ b/hw/Kconfig @@ -27,6 +27,7 @@ source pci-host/Kconfig source pcmcia/Kconfig source pci/Kconfig source rdma/Kconfig +source remote/Kconfig source rtc/Kconfig source scsi/Kconfig source sd/Kconfig diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c index 12e4a16d37..bf31ca351f 100644 --- a/hw/arm/aspeed_ast2600.c +++ b/hw/arm/aspeed_ast2600.c @@ -216,7 +216,7 @@ static void aspeed_soc_ast2600_init(Object *obj) /* * ASPEED ast2600 has 0xf as cluster ID * - * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0388e/CIHEBGFG.html + * https://developer.arm.com/documentation/ddi0388/e/the-system-control-coprocessors/summary-of-system-control-coprocessor-registers/multiprocessor-affinity-register */ static 
uint64_t aspeed_calc_affinity(int cpu) { diff --git a/hw/arm/musca.c b/hw/arm/musca.c index 945643c3cd..7a83f7dda7 100644 --- a/hw/arm/musca.c +++ b/hw/arm/musca.c @@ -15,8 +15,8 @@ * https://developer.arm.com/products/system-design/development-boards/iot-test-chips-and-boards/musca-a-test-chip-board * https://developer.arm.com/products/system-design/development-boards/iot-test-chips-and-boards/musca-b-test-chip-board * We model the A and B1 variants of this board, as described in the TRMs: - * http://infocenter.arm.com/help/topic/com.arm.doc.101107_0000_00_en/index.html - * http://infocenter.arm.com/help/topic/com.arm.doc.101312_0000_00_en/index.html + * https://developer.arm.com/documentation/101107/latest/ + * https://developer.arm.com/documentation/101312/latest/ */ #include "qemu/osdep.h" diff --git a/hw/arm/npcm7xx.c b/hw/arm/npcm7xx.c index 72040d4079..d1fe9bd1df 100644 --- a/hw/arm/npcm7xx.c +++ b/hw/arm/npcm7xx.c @@ -576,14 +576,6 @@ static void npcm7xx_realize(DeviceState *dev, Error **errp) create_unimplemented_device("npcm7xx.pcierc", 0xe1000000, 64 * KiB); create_unimplemented_device("npcm7xx.kcs", 0xf0007000, 4 * KiB); create_unimplemented_device("npcm7xx.gfxi", 0xf000e000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[0]", 0xf0010000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[1]", 0xf0011000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[2]", 0xf0012000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[3]", 0xf0013000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[4]", 0xf0014000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[5]", 0xf0015000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[6]", 0xf0016000, 4 * KiB); - create_unimplemented_device("npcm7xx.gpio[7]", 0xf0017000, 4 * KiB); create_unimplemented_device("npcm7xx.smbus[0]", 0xf0080000, 4 * KiB); create_unimplemented_device("npcm7xx.smbus[1]", 0xf0081000, 4 * KiB); create_unimplemented_device("npcm7xx.smbus[2]", 0xf0082000, 4 * KiB); diff --git a/hw/arm/xlnx-versal.c b/hw/arm/xlnx-versal.c index b0777166e8..628e77ef66 100644 --- a/hw/arm/xlnx-versal.c +++ b/hw/arm/xlnx-versal.c @@ -67,10 +67,10 @@ static void versal_create_apu_gic(Versal *s, qemu_irq *pic) gicbusdev = SYS_BUS_DEVICE(&s->fpd.apu.gic); gicdev = DEVICE(&s->fpd.apu.gic); qdev_prop_set_uint32(gicdev, "revision", 3); - qdev_prop_set_uint32(gicdev, "num-cpu", 2); + qdev_prop_set_uint32(gicdev, "num-cpu", nr_apu_cpus); qdev_prop_set_uint32(gicdev, "num-irq", XLNX_VERSAL_NR_IRQS + 32); qdev_prop_set_uint32(gicdev, "len-redist-region-count", 1); - qdev_prop_set_uint32(gicdev, "redist-region-count[0]", 2); + qdev_prop_set_uint32(gicdev, "redist-region-count[0]", nr_apu_cpus); qdev_prop_set_bit(gicdev, "has-security-extensions", true); sysbus_realize(SYS_BUS_DEVICE(&s->fpd.apu.gic), &error_fatal); diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 2670787d26..93ac6e107a 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -16,6 +16,7 @@ #include "qemu/units.h" #include "qemu/cutils.h" #include "qemu/log.h" +#include "qemu/error-report.h" #include "hw/block/block.h" #include "hw/pci/pci.h" #include "sysemu/sysemu.h" @@ -25,28 +26,47 @@ #include "hw/qdev-properties.h" #include "hw/qdev-core.h" +#include "trace.h" #include "nvme.h" #include "nvme-ns.h" -static void nvme_ns_init(NvmeNamespace *ns) +#define MIN_DISCARD_GRANULARITY (4 * KiB) + +static int nvme_ns_init(NvmeNamespace *ns, Error **errp) { + BlockDriverInfo bdi; NvmeIdNs *id_ns = &ns->id_ns; int lba_index = 
NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); + int npdg; - if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) { - ns->id_ns.dlfeat = 0x9; - } + ns->id_ns.dlfeat = 0x9; id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size); id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns)); + ns->csi = NVME_CSI_NVM; + /* no thin provisioning */ id_ns->ncap = id_ns->nsze; id_ns->nuse = id_ns->ncap; + + /* support DULBE and I/O optimization fields */ + id_ns->nsfeat |= (0x4 | 0x10); + + npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size; + + if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 && + bdi.cluster_size > ns->blkconf.discard_granularity) { + npdg = bdi.cluster_size / ns->blkconf.logical_block_size; + } + + id_ns->npda = id_ns->npdg = npdg - 1; + + return 0; } -static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) { bool read_only; @@ -59,19 +79,225 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } + if (ns->blkconf.discard_granularity == -1) { + ns->blkconf.discard_granularity = + MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY); + } + ns->size = blk_getlength(ns->blkconf.blk); if (ns->size < 0) { error_setg_errno(errp, -ns->size, "could not get blockdev size"); return -1; } - if (blk_enable_write_cache(ns->blkconf.blk)) { - n->features.vwc = 0x1; + return 0; +} + +static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) +{ + uint64_t zone_size, zone_cap; + uint32_t lbasz = ns->blkconf.logical_block_size; + + /* Make sure that the values of ZNS properties are sane */ + if (ns->params.zone_size_bs) { + zone_size = ns->params.zone_size_bs; + } else { + zone_size = NVME_DEFAULT_ZONE_SIZE; + } + if (ns->params.zone_cap_bs) { + zone_cap = ns->params.zone_cap_bs; + } else { + zone_cap = zone_size; + } + if (zone_cap > zone_size) { + error_setg(errp, "zone capacity %"PRIu64"B exceeds " + "zone size %"PRIu64"B", zone_cap, zone_size); + return -1; + } + if (zone_size < lbasz) { + error_setg(errp, "zone size %"PRIu64"B too small, " + "must be at least %"PRIu32"B", zone_size, lbasz); + return -1; + } + if (zone_cap < lbasz) { + error_setg(errp, "zone capacity %"PRIu64"B too small, " + "must be at least %"PRIu32"B", zone_cap, lbasz); + return -1; + } + + /* + * Save the main zone geometry values to avoid + * calculating them later again. 
+ */ + ns->zone_size = zone_size / lbasz; + ns->zone_capacity = zone_cap / lbasz; + ns->num_zones = ns->size / lbasz / ns->zone_size; + + /* Do a few more sanity checks of ZNS properties */ + if (!ns->num_zones) { + error_setg(errp, + "insufficient drive capacity, must be at least the size " + "of one zone (%"PRIu64"B)", zone_size); + return -1; + } + + if (ns->params.max_open_zones > ns->num_zones) { + error_setg(errp, + "max_open_zones value %u exceeds the number of zones %u", + ns->params.max_open_zones, ns->num_zones); + return -1; + } + if (ns->params.max_active_zones > ns->num_zones) { + error_setg(errp, + "max_active_zones value %u exceeds the number of zones %u", + ns->params.max_active_zones, ns->num_zones); + return -1; + } + + if (ns->params.zd_extension_size) { + if (ns->params.zd_extension_size & 0x3f) { + error_setg(errp, + "zone descriptor extension size must be a multiple of 64B"); + return -1; + } + if ((ns->params.zd_extension_size >> 6) > 0xff) { + error_setg(errp, "zone descriptor extension size is too large"); + return -1; + } } return 0; } +static void nvme_ns_zoned_init_state(NvmeNamespace *ns) +{ + uint64_t start = 0, zone_size = ns->zone_size; + uint64_t capacity = ns->num_zones * zone_size; + NvmeZone *zone; + int i; + + ns->zone_array = g_new0(NvmeZone, ns->num_zones); + if (ns->params.zd_extension_size) { + ns->zd_extensions = g_malloc0(ns->params.zd_extension_size * + ns->num_zones); + } + + QTAILQ_INIT(&ns->exp_open_zones); + QTAILQ_INIT(&ns->imp_open_zones); + QTAILQ_INIT(&ns->closed_zones); + QTAILQ_INIT(&ns->full_zones); + + zone = ns->zone_array; + for (i = 0; i < ns->num_zones; i++, zone++) { + if (start + zone_size > capacity) { + zone_size = capacity - start; + } + zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE; + nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); + zone->d.za = 0; + zone->d.zcap = ns->zone_capacity; + zone->d.zslba = start; + zone->d.wp = start; + zone->w_ptr = start; + start += zone_size; + } + + ns->zone_size_log2 = 0; + if (is_power_of_2(ns->zone_size)) { + ns->zone_size_log2 = 63 - clz64(ns->zone_size); + } +} + +static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index) +{ + NvmeIdNsZoned *id_ns_z; + + nvme_ns_zoned_init_state(ns); + + id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned)); + + /* MAR/MOR are zeroes-based, 0xffffffff means no limit */ + id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1); + id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1); + id_ns_z->zoc = 0; + id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00; + + id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size); + id_ns_z->lbafe[lba_index].zdes = + ns->params.zd_extension_size >> 6; /* Units of 64B */ + + ns->csi = NVME_CSI_ZONED; + ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); + ns->id_ns.ncap = ns->id_ns.nsze; + ns->id_ns.nuse = ns->id_ns.ncap; + + /* + * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated" + * status of logical blocks. Since the spec defines that logical blocks + * SHALL be deallocated when then zone is in the Empty or Offline states, + * we can only support DULBE if the zone size is a multiple of the + * calculated NPDG. 
+ */ + if (ns->zone_size % (ns->id_ns.npdg + 1)) { + warn_report("the zone size (%"PRIu64" blocks) is not a multiple of " + "the calculated deallocation granularity (%d blocks); " + "DULBE support disabled", + ns->zone_size, ns->id_ns.npdg + 1); + + ns->id_ns.nsfeat &= ~0x4; + } + + ns->id_ns_zoned = id_ns_z; +} + +static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) +{ + uint8_t state; + + zone->w_ptr = zone->d.wp; + state = nvme_get_zone_state(zone); + if (zone->d.wp != zone->d.zslba || + (zone->d.za & NVME_ZA_ZD_EXT_VALID)) { + if (state != NVME_ZONE_STATE_CLOSED) { + trace_pci_nvme_clear_ns_close(state, zone->d.zslba); + nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); + } + nvme_aor_inc_active(ns); + QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); + } else { + trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); + nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); + } +} + +/* + * Close all the zones that are currently open. + */ +static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) +{ + NvmeZone *zone, *next; + + QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { + QTAILQ_REMOVE(&ns->closed_zones, zone, entry); + nvme_aor_dec_active(ns); + nvme_clear_zone(ns, zone); + } + QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { + QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + nvme_aor_dec_open(ns); + nvme_aor_dec_active(ns); + nvme_clear_zone(ns, zone); + } + QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { + QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); + nvme_aor_dec_open(ns); + nvme_aor_dec_active(ns); + nvme_clear_zone(ns, zone); + } + + assert(ns->nr_open_zones == 0); +} + static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) { if (!ns->blkconf.blk) { @@ -82,20 +308,25 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) return 0; } -int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +int nvme_ns_setup(NvmeNamespace *ns, Error **errp) { if (nvme_ns_check_constraints(ns, errp)) { return -1; } - if (nvme_ns_init_blk(n, ns, errp)) { + if (nvme_ns_init_blk(ns, errp)) { return -1; } - nvme_ns_init(ns); - if (nvme_register_namespace(n, ns, errp)) { + if (nvme_ns_init(ns, errp)) { return -1; } + if (ns->params.zoned) { + if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) { + return -1; + } + nvme_ns_init_zoned(ns, 0); + } return 0; } @@ -105,9 +336,21 @@ void nvme_ns_drain(NvmeNamespace *ns) blk_drain(ns->blkconf.blk); } -void nvme_ns_flush(NvmeNamespace *ns) +void nvme_ns_shutdown(NvmeNamespace *ns) { blk_flush(ns->blkconf.blk); + if (ns->params.zoned) { + nvme_zoned_ns_shutdown(ns); + } +} + +void nvme_ns_cleanup(NvmeNamespace *ns) +{ + if (ns->params.zoned) { + g_free(ns->id_ns_zoned); + g_free(ns->zone_array); + g_free(ns->zd_extensions); + } } static void nvme_ns_realize(DeviceState *dev, Error **errp) @@ -115,18 +358,34 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) NvmeNamespace *ns = NVME_NS(dev); BusState *s = qdev_get_parent_bus(dev); NvmeCtrl *n = NVME(s->parent); - Error *local_err = NULL; - if (nvme_ns_setup(n, ns, &local_err)) { - error_propagate_prepend(errp, local_err, - "could not setup namespace: "); + if (nvme_ns_setup(ns, errp)) { return; } + + if (nvme_register_namespace(n, ns, errp)) { + return; + } + } static Property nvme_ns_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), + DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), + DEFINE_PROP_BOOL("zoned", NvmeNamespace, 
params.zoned, false), + DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs, + NVME_DEFAULT_ZONE_SIZE), + DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs, + 0), + DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace, + params.cross_zone_read, false), + DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace, + params.max_active_zones, 0), + DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace, + params.max_open_zones, 0), + DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, + params.zd_extension_size, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 83734f4606..293ac990e3 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -19,8 +19,23 @@ #define NVME_NS(obj) \ OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS) +typedef struct NvmeZone { + NvmeZoneDescr d; + uint64_t w_ptr; + QTAILQ_ENTRY(NvmeZone) entry; +} NvmeZone; + typedef struct NvmeNamespaceParams { uint32_t nsid; + QemuUUID uuid; + + bool zoned; + bool cross_zone_read; + uint64_t zone_size_bs; + uint64_t zone_cap_bs; + uint32_t max_active_zones; + uint32_t max_open_zones; + uint32_t zd_extension_size; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -29,8 +44,28 @@ typedef struct NvmeNamespace { int32_t bootindex; int64_t size; NvmeIdNs id_ns; + const uint32_t *iocs; + uint8_t csi; + + NvmeIdNsZoned *id_ns_zoned; + NvmeZone *zone_array; + QTAILQ_HEAD(, NvmeZone) exp_open_zones; + QTAILQ_HEAD(, NvmeZone) imp_open_zones; + QTAILQ_HEAD(, NvmeZone) closed_zones; + QTAILQ_HEAD(, NvmeZone) full_zones; + uint32_t num_zones; + uint64_t zone_size; + uint64_t zone_capacity; + uint32_t zone_size_log2; + uint8_t *zd_extensions; + int32_t nr_open_zones; + int32_t nr_active_zones; NvmeNamespaceParams params; + + struct { + uint32_t err_rec; + } features; } NvmeNamespace; static inline uint32_t nvme_nsid(NvmeNamespace *ns) @@ -67,8 +102,81 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) typedef struct NvmeCtrl NvmeCtrl; -int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); +static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone) +{ + return zone->d.zs >> 4; +} + +static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state) +{ + zone->d.zs = state << 4; +} + +static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone) +{ + return zone->d.zslba + ns->zone_size; +} + +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone) +{ + return zone->d.zslba + zone->d.zcap; +} + +static inline bool nvme_wp_is_valid(NvmeZone *zone) +{ + uint8_t st = nvme_get_zone_state(zone); + + return st != NVME_ZONE_STATE_FULL && + st != NVME_ZONE_STATE_READ_ONLY && + st != NVME_ZONE_STATE_OFFLINE; +} + +static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns, + uint32_t zone_idx) +{ + return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size]; +} + +static inline void nvme_aor_inc_open(NvmeNamespace *ns) +{ + assert(ns->nr_open_zones >= 0); + if (ns->params.max_open_zones) { + ns->nr_open_zones++; + assert(ns->nr_open_zones <= ns->params.max_open_zones); + } +} + +static inline void nvme_aor_dec_open(NvmeNamespace *ns) +{ + if (ns->params.max_open_zones) { + assert(ns->nr_open_zones > 0); + ns->nr_open_zones--; + } + assert(ns->nr_open_zones >= 0); +} + +static inline void nvme_aor_inc_active(NvmeNamespace *ns) +{ + assert(ns->nr_active_zones >= 0); + if (ns->params.max_active_zones) { + ns->nr_active_zones++; + assert(ns->nr_active_zones <= ns->params.max_active_zones); + } +} + 
+static inline void nvme_aor_dec_active(NvmeNamespace *ns) +{ + if (ns->params.max_active_zones) { + assert(ns->nr_active_zones > 0); + ns->nr_active_zones--; + assert(ns->nr_active_zones >= ns->nr_open_zones); + } + assert(ns->nr_active_zones >= 0); +} + +int nvme_ns_setup(NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); -void nvme_ns_flush(NvmeNamespace *ns); +void nvme_ns_shutdown(NvmeNamespace *ns); +void nvme_ns_cleanup(NvmeNamespace *ns); #endif /* NVME_NS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 27d2c72716..fb83636abd 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -9,7 +9,7 @@ */ /** - * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e + * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e * * https://nvmexpress.org/developers/nvme-specification/ */ @@ -22,33 +22,67 @@ * [pmrdev=<mem_backend_file_id>,] \ * max_ioqpairs=<N[optional]>, \ * aerl=<N[optional]>, aer_max_queued=<N[optional]>, \ - * mdts=<N[optional]> - * -device nvme-ns,drive=<drive_id>,bus=bus_name,nsid=<nsid> + * mdts=<N[optional]>,zoned.append_size_limit=<N[optional]> \ + * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\ + * zoned=<true|false[optional]> * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. + * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the + * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to + * always enable the CMBLOC and CMBSZ registers (v1.3 behavior). * - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when - * both provided. * Enabling pmr emulation can be achieved by pointing to memory-backend-file. * For example: * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \ * size=<size> .... -device nvme,...,pmrdev=<mem_id> * + * The PMR will use BAR 4/5 exclusively. + * * * nvme device parameters * ~~~~~~~~~~~~~~~~~~~~~~ * - `aerl` * The Asynchronous Event Request Limit (AERL). Indicates the maximum number - * of concurrently outstanding Asynchronous Event Request commands suppoert + * of concurrently outstanding Asynchronous Event Request commands support * by the controller. This is a 0's based value. * * - `aer_max_queued` * This is the maximum number of events that the device will enqueue for - * completion when there are no oustanding AERs. When the maximum number of + * completion when there are no outstanding AERs. When the maximum number of * enqueued events are reached, subsequent events will be dropped. * + * - `zoned.append_size_limit` + * The maximum I/O size in bytes that is allowed in Zone Append command. + * The default is 128KiB. Since internally this this value is maintained as + * ZASL = log2(<maximum append size> / <page size>), some values assigned + * to this property may be rounded down and result in a lower maximum ZA + * data size being in effect. By setting this property to 0, users can make + * ZASL to be equal to MDTS. This property only affects zoned namespaces. + * + * Setting `zoned` to true selects Zoned Command Set at the namespace. + * In this case, the following namespace properties are available to configure + * zoned operation: + * zoned.zone_size=<zone size in bytes, default: 128MiB> + * The number may be followed by K, M, G as in kilo-, mega- or giga-. 
+ * + * zoned.zone_capacity=<zone capacity in bytes, default: zone size> + * The value 0 (default) forces zone capacity to be the same as zone + * size. The value of this property may not exceed zone size. + * + * zoned.descr_ext_size=<zone descriptor extension size, default 0> + * This value needs to be specified in 64B units. If it is zero, + * namespace(s) will not support zone descriptor extensions. + * + * zoned.max_active=<Maximum Active Resources (zones), default: 0> + * The default value means there is no limit to the number of + * concurrently active zones. + * + * zoned.max_open=<Maximum Open Resources (zones), default: 0> + * The default value means there is no limit to the number of + * concurrently open zones. + * + * zoned.cross_read=<enable RAZB, default: false> + * Setting this property to true enables Read Across Zone Boundaries. */ #include "qemu/osdep.h" @@ -74,9 +108,9 @@ #define NVME_MAX_IOQPAIRS 0xffff #define NVME_DB_SIZE 4 -#define NVME_SPEC_VER 0x00010300 +#define NVME_SPEC_VER 0x00010400 #define NVME_CMB_BIR 2 -#define NVME_PMR_BIR 2 +#define NVME_PMR_BIR 4 #define NVME_TEMPERATURE 0x143 #define NVME_TEMPERATURE_WARNING 0x157 #define NVME_TEMPERATURE_CRITICAL 0x175 @@ -105,12 +139,49 @@ static const bool nvme_feature_support[NVME_FID_MAX] = { static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE, + [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS, [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE, [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE, [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE, [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE, }; +static const uint32_t nvme_cse_acs[256] = { + [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, +}; + +static const uint32_t nvme_cse_iocs_none[256]; + +static const uint32_t nvme_cse_iocs_nvm[256] = { + [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, +}; + +static const uint32_t nvme_cse_iocs_zoned[256] = { + [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, +}; + static void nvme_process_sq(void *opaque); static uint16_t nvme_cid(NvmeRequest *req) @@ -127,19 +198,104 @@ static uint16_t nvme_sqid(NvmeRequest *req) return le16_to_cpu(req->sq->sqid); } +static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, + NvmeZoneState state) +{ + if (QTAILQ_IN_USE(zone, entry)) { + 
switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_CLOSED: + QTAILQ_REMOVE(&ns->closed_zones, zone, entry); + break; + case NVME_ZONE_STATE_FULL: + QTAILQ_REMOVE(&ns->full_zones, zone, entry); + default: + ; + } + } + + nvme_set_zone_state(zone, state); + + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_CLOSED: + QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry); + break; + case NVME_ZONE_STATE_FULL: + QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry); + case NVME_ZONE_STATE_READ_ONLY: + break; + default: + zone->d.za = 0; + } +} + +/* + * Check if we can open a zone without exceeding open/active limits. + * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). + */ +static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) +{ + if (ns->params.max_active_zones != 0 && + ns->nr_active_zones + act > ns->params.max_active_zones) { + trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones); + return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; + } + if (ns->params.max_open_zones != 0 && + ns->nr_open_zones + opn > ns->params.max_open_zones) { + trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones); + return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; + } + + return NVME_SUCCESS; +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { - hwaddr low = n->ctrl_mem.addr; - hwaddr hi = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size); + hwaddr hi, lo; + + if (!n->cmb.cmse) { + return false; + } + + lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba; + hi = lo + int128_get64(n->cmb.mem.size); - return addr >= low && addr < hi; + return addr >= lo && addr < hi; } static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) { - assert(nvme_addr_is_cmb(n, addr)); + hwaddr base = n->params.legacy_cmb ? 
n->cmb.mem.addr : n->cmb.cba; + return &n->cmb.buf[addr - base]; +} + +static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) +{ + hwaddr hi; - return &n->cmbuf[addr - n->ctrl_mem.addr]; + if (!n->pmr.cmse) { + return false; + } + + hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size); + + return addr >= n->pmr.cba && addr < hi; +} + +static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) +{ + return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); } static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) @@ -154,6 +310,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) return 0; } + if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { + memcpy(buf, nvme_addr_to_pmr(n, addr), size); + return 0; + } + return pci_dma_read(&n->parent_obj, addr, buf, size); } @@ -241,6 +402,7 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) static void nvme_req_clear(NvmeRequest *req) { req->ns = NULL; + req->opaque = NULL; memset(&req->cqe, 0x0, sizeof(req->cqe)); req->status = NVME_SUCCESS; } @@ -274,9 +436,27 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, return NVME_SUCCESS; } +static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, + size_t len) +{ + if (!len) { + return NVME_SUCCESS; + } + + if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) { + return NVME_DATA_TRAS_ERROR; + } + + qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len); + + return NVME_SUCCESS; +} + static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, hwaddr addr, size_t len) { + bool cmb = false, pmr = false; + if (!len) { return NVME_SUCCESS; } @@ -284,6 +464,12 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, trace_pci_nvme_map_addr(addr, len); if (nvme_addr_is_cmb(n, addr)) { + cmb = true; + } else if (nvme_addr_is_pmr(n, addr)) { + pmr = true; + } + + if (cmb || pmr) { if (qsg && qsg->sg) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } @@ -294,7 +480,11 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, qemu_iovec_init(iov, 1); } - return nvme_map_addr_cmb(n, iov, addr, len); + if (cmb) { + return nvme_map_addr_cmb(n, iov, addr, len); + } else { + return nvme_map_addr_pmr(n, iov, addr, len); + } } if (iov && iov->iov) { @@ -319,7 +509,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trans_len = MIN(len, trans_len); int num_prps = (len >> n->page_bits) + 1; uint16_t status; - bool prp_list_in_cmb = false; int ret; QEMUSGList *qsg = &req->qsg; @@ -327,7 +516,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - if (nvme_addr_is_cmb(n, prp1)) { + if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) { qemu_iovec_init(iov, num_prps); } else { pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); @@ -345,10 +534,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint32_t nents, prp_trans; int i = 0; - if (nvme_addr_is_cmb(n, prp2)) { - prp_list_in_cmb = true; - } - nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); @@ -365,10 +550,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, return NVME_INVALID_PRP_OFFSET | NVME_DNR; } - if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) { - return 
NVME_INVALID_USE_OF_CMB | NVME_DNR; - } - i = 0; nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); @@ -502,7 +683,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, uint64_t nsgld; uint32_t seg_len; uint16_t status; - bool sgl_in_cmb = false; hwaddr addr; int ret; @@ -524,18 +704,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, goto out; } - /* - * If the segment is located in the CMB, the submission queue of the - * request must also reside there. - */ - if (nvme_addr_is_cmb(n, addr)) { - if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) { - return NVME_INVALID_USE_OF_CMB | NVME_DNR; - } - - sgl_in_cmb = true; - } - for (;;) { switch (NVME_SGL_TYPE(sgld->type)) { case NVME_SGL_DESCR_TYPE_SEGMENT: @@ -624,15 +792,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, if (status) { goto unmap; } - - /* - * If the next segment is in the CMB, make sure that the sgl was - * already located there. - */ - if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) { - status = NVME_INVALID_USE_OF_CMB | NVME_DNR; - goto unmap; - } } out: @@ -847,6 +1006,35 @@ static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type, nvme_process_aers(n); } +static void nvme_smart_event(NvmeCtrl *n, uint8_t event) +{ + uint8_t aer_info; + + /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */ + if (!(NVME_AEC_SMART(n->features.async_config) & event)) { + return; + } + + switch (event) { + case NVME_SMART_SPARE: + aer_info = NVME_AER_INFO_SMART_SPARE_THRESH; + break; + case NVME_SMART_TEMPERATURE: + aer_info = NVME_AER_INFO_SMART_TEMP_THRESH; + break; + case NVME_SMART_RELIABILITY: + case NVME_SMART_MEDIA_READ_ONLY: + case NVME_SMART_FAILED_VOLATILE_MEDIA: + case NVME_SMART_PMR_UNRELIABLE: + aer_info = NVME_AER_INFO_SMART_RELIABILITY; + break; + default: + return; + } + + nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO); +} + static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) { n->aer_mask &= ~(1 << event_type); @@ -866,8 +1054,8 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) return NVME_SUCCESS; } -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, - uint64_t slba, uint32_t nlb) +static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) { uint64_t nsze = le64_to_cpu(ns->id_ns.nsze); @@ -878,6 +1066,307 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, return NVME_SUCCESS; } +static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) +{ + BlockDriverState *bs = blk_bs(ns->blkconf.blk); + + int64_t pnum = 0, bytes = nvme_l2b(ns, nlb); + int64_t offset = nvme_l2b(ns, slba); + bool zeroed; + int ret; + + Error *local_err = NULL; + + /* + * `pnum` holds the number of bytes after offset that share the same + * allocation status as the byte at offset. If `pnum` is different from + * `bytes`, we should check the allocation status of the next range and + * continue this until all bytes have been checked.
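+ * For example (an illustrative scenario): for a 2 MiB range whose first
+ * 1 MiB is allocated and non-zero, the first call returns a `pnum` of
+ * 1 MiB, and the loop below issues a second call for the remaining
+ * 1 MiB.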
+ */ + do { + bytes -= pnum; + + ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL); + if (ret < 0) { + error_setg_errno(&local_err, -ret, "unable to get block status"); + error_report_err(local_err); + + return NVME_INTERNAL_DEV_ERROR; + } + + zeroed = !!(ret & BDRV_BLOCK_ZERO); + + trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed); + + if (zeroed) { + return NVME_DULB; + } + + offset += pnum; + } while (pnum != bytes); + + return NVME_SUCCESS; +} + +static void nvme_aio_err(NvmeRequest *req, int ret) +{ + uint16_t status = NVME_SUCCESS; + Error *local_err = NULL; + + switch (req->cmd.opcode) { + case NVME_CMD_READ: + status = NVME_UNRECOVERED_READ; + break; + case NVME_CMD_FLUSH: + case NVME_CMD_WRITE: + case NVME_CMD_WRITE_ZEROES: + case NVME_CMD_ZONE_APPEND: + status = NVME_WRITE_FAULT; + break; + default: + status = NVME_INTERNAL_DEV_ERROR; + break; + } + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status); + + error_setg_errno(&local_err, -ret, "aio failed"); + error_report_err(local_err); + + /* + * Set the command status code to the first encountered error but allow a + * subsequent Internal Device Error to trump it. + */ + if (req->status && status != NVME_INTERNAL_DEV_ERROR) { + return; + } + + req->status = status; +} + +static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba) +{ + return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 : + slba / ns->zone_size; +} + +static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) +{ + uint32_t zone_idx = nvme_zone_idx(ns, slba); + + assert(zone_idx < ns->num_zones); + return &ns->zone_array[zone_idx]; +} + +static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) +{ + uint64_t zslba = zone->d.zslba; + + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + return NVME_SUCCESS; + case NVME_ZONE_STATE_FULL: + trace_pci_nvme_err_zone_is_full(zslba); + return NVME_ZONE_FULL; + case NVME_ZONE_STATE_OFFLINE: + trace_pci_nvme_err_zone_is_offline(zslba); + return NVME_ZONE_OFFLINE; + case NVME_ZONE_STATE_READ_ONLY: + trace_pci_nvme_err_zone_is_read_only(zslba); + return NVME_ZONE_READ_ONLY; + default: + assert(false); + } + + return NVME_INTERNAL_DEV_ERROR; +} + +static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, + NvmeZone *zone, uint64_t slba, + uint32_t nlb) +{ + uint64_t zcap = nvme_zone_wr_boundary(zone); + uint16_t status; + + status = nvme_check_zone_state_for_write(zone); + if (status) { + return status; + } + + if (unlikely(slba != zone->w_ptr)) { + trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); + return NVME_ZONE_INVALID_WRITE; + } + + if (unlikely((slba + nlb) > zcap)) { + trace_pci_nvme_err_zone_boundary(slba, nlb, zcap); + return NVME_ZONE_BOUNDARY_ERROR; + } + + return NVME_SUCCESS; +} + +static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) +{ + uint16_t status; + + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_FULL: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_READ_ONLY: + status = NVME_SUCCESS; + break; + case NVME_ZONE_STATE_OFFLINE: + status = NVME_ZONE_OFFLINE; + break; + default: + assert(false); + } + + return status; +} + +static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) +{ + NvmeZone *zone = 
nvme_get_zone_by_slba(ns, slba); + uint64_t bndry = nvme_zone_rd_boundary(ns, zone); + uint64_t end = slba + nlb; + uint16_t status; + + status = nvme_check_zone_state_for_read(zone); + if (status) { + ; + } else if (unlikely(end > bndry)) { + if (!ns->params.cross_zone_read) { + status = NVME_ZONE_BOUNDARY_ERROR; + } else { + /* + * Read across zone boundary - check that all subsequent + * zones that are being read have an appropriate state. + */ + do { + zone++; + status = nvme_check_zone_state_for_read(zone); + if (status) { + break; + } + } while (end > nvme_zone_rd_boundary(ns, zone)); + } + } + + return status; +} + +static void nvme_auto_transition_zone(NvmeNamespace *ns) +{ + NvmeZone *zone; + + if (ns->params.max_open_zones && + ns->nr_open_zones == ns->params.max_open_zones) { + zone = QTAILQ_FIRST(&ns->imp_open_zones); + if (zone) { + /* + * Automatically close this implicitly open zone. + */ + QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + nvme_aor_dec_open(ns); + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + } + } +} + +static uint16_t nvme_auto_open_zone(NvmeNamespace *ns, NvmeZone *zone) +{ + uint16_t status = NVME_SUCCESS; + uint8_t zs = nvme_get_zone_state(zone); + + if (zs == NVME_ZONE_STATE_EMPTY) { + nvme_auto_transition_zone(ns); + status = nvme_aor_check(ns, 1, 1); + } else if (zs == NVME_ZONE_STATE_CLOSED) { + nvme_auto_transition_zone(ns); + status = nvme_aor_check(ns, 0, 1); + } + + return status; +} + +static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, + bool failed) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeZone *zone; + NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; + uint64_t slba; + uint32_t nlb; + + slba = le64_to_cpu(rw->slba); + nlb = le16_to_cpu(rw->nlb) + 1; + zone = nvme_get_zone_by_slba(ns, slba); + + zone->d.wp += nlb; + + if (failed) { + res->slba = 0; + } + + if (zone->d.wp == nvme_zone_wr_boundary(zone)) { + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ + case NVME_ZONE_STATE_EMPTY: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); + /* fall through */ + case NVME_ZONE_STATE_FULL: + break; + default: + assert(false); + } + } +} + +static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, + uint32_t nlb) +{ + uint8_t zs; + + zone->w_ptr += nlb; + + if (zone->w_ptr < nvme_zone_wr_boundary(zone)) { + zs = nvme_get_zone_state(zone); + switch (zs) { + case NVME_ZONE_STATE_EMPTY: + nvme_aor_inc_active(ns); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_inc_open(ns); + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); + } + } +} + +static inline bool nvme_is_write(NvmeRequest *req) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + + return rw->opcode == NVME_CMD_WRITE || + rw->opcode == NVME_CMD_ZONE_APPEND || + rw->opcode == NVME_CMD_WRITE_ZEROES; +} + static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; @@ -887,91 +1376,278 @@ static void nvme_rw_cb(void *opaque, int ret) BlockAcctCookie *acct = &req->acct; BlockAcctStats *stats = blk_get_stats(blk); - Error *local_err = NULL; - trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); + if (ns->params.zoned && nvme_is_write(req)) { + nvme_finalize_zoned_write(ns, req, ret != 0); + } + if (!ret) { block_acct_done(stats, acct); } else { - uint16_t status; - block_acct_failed(stats, acct); + 
nvme_aio_err(req, ret); + } - switch (req->cmd.opcode) { - case NVME_CMD_READ: - status = NVME_UNRECOVERED_READ; - break; - case NVME_CMD_FLUSH: - case NVME_CMD_WRITE: - case NVME_CMD_WRITE_ZEROES: - status = NVME_WRITE_FAULT; - break; + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static void nvme_aio_discard_cb(void *opaque, int ret) +{ + NvmeRequest *req = opaque; + uintptr_t *discards = (uintptr_t *)&req->opaque; + + trace_pci_nvme_aio_discard_cb(nvme_cid(req)); + + if (ret) { + nvme_aio_err(req, ret); + } + + (*discards)--; + + if (*discards) { + return; + } + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +struct nvme_zone_reset_ctx { + NvmeRequest *req; + NvmeZone *zone; +}; + +static void nvme_aio_zone_reset_cb(void *opaque, int ret) +{ + struct nvme_zone_reset_ctx *ctx = opaque; + NvmeRequest *req = ctx->req; + NvmeNamespace *ns = req->ns; + NvmeZone *zone = ctx->zone; + uintptr_t *resets = (uintptr_t *)&req->opaque; + + g_free(ctx); + + trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba); + + if (!ret) { + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ + case NVME_ZONE_STATE_FULL: + zone->w_ptr = zone->d.zslba; + zone->d.wp = zone->w_ptr; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); + /* fall through */ default: - status = NVME_INTERNAL_DEV_ERROR; break; } + } else { + nvme_aio_err(req, ret); + } - trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status); + (*resets)--; - error_setg_errno(&local_err, -ret, "aio failed"); - error_report_err(local_err); + if (*resets) { + return; + } + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +struct nvme_compare_ctx { + QEMUIOVector iov; + uint8_t *bounce; + size_t len; +}; + +static void nvme_compare_cb(void *opaque, int ret) +{ + NvmeRequest *req = opaque; + NvmeNamespace *ns = req->ns; + struct nvme_compare_ctx *ctx = req->opaque; + g_autofree uint8_t *buf = NULL; + uint16_t status; + + trace_pci_nvme_compare_cb(nvme_cid(req)); + + if (!ret) { + block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + } else { + block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); + nvme_aio_err(req, ret); + goto out; + } + + buf = g_malloc(ctx->len); + + status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE, + req); + if (status) { req->status = status; + goto out; + } + + if (memcmp(buf, ctx->bounce, ctx->len)) { + req->status = NVME_CMP_FAILURE; } +out: + qemu_iovec_destroy(&ctx->iov); + g_free(ctx->bounce); + g_free(ctx); + nvme_enqueue_req_completion(nvme_cq(req), req); } -static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) { - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_FLUSH); - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); - return NVME_NO_COMPLETE; + NvmeNamespace *ns = req->ns; + NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; + + uint32_t attr = le32_to_cpu(dsm->attributes); + uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; + + uint16_t status = NVME_SUCCESS; + + trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); + + if (attr & NVME_DSMGMT_AD) { + int64_t offset; + size_t len; + NvmeDsmRange range[nr]; + uintptr_t *discards = (uintptr_t *)&req->opaque; + + status = nvme_dma(n, (uint8_t *)range, sizeof(range), + DMA_DIRECTION_TO_DEVICE, req); 
+ if (status) { + return status; + } + + /* + * AIO callbacks may be called immediately, so initialize discards to 1 + * to make sure the callback does not complete the request before + * all discards have been issued. + */ + *discards = 1; + + for (int i = 0; i < nr; i++) { + uint64_t slba = le64_to_cpu(range[i].slba); + uint32_t nlb = le32_to_cpu(range[i].nlb); + + if (nvme_check_bounds(ns, slba, nlb)) { + trace_pci_nvme_err_invalid_lba_range(slba, nlb, + ns->id_ns.nsze); + continue; + } + + trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, + nlb); + + offset = nvme_l2b(ns, slba); + len = nvme_l2b(ns, nlb); + + while (len) { + size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); + + (*discards)++; + + blk_aio_pdiscard(ns->blkconf.blk, offset, bytes, + nvme_aio_discard_cb, req); + + offset += bytes; + len -= bytes; + } + } + + /* account for the 1-initialization */ + (*discards)--; + + if (*discards) { + status = NVME_NO_COMPLETE; + } else { + status = req->status; + } + } + + return status; } -static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; + BlockBackend *blk = ns->blkconf.blk; uint64_t slba = le64_to_cpu(rw->slba); - uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; - uint64_t offset = nvme_l2b(ns, slba); - uint32_t count = nvme_l2b(ns, nlb); + uint32_t nlb = le16_to_cpu(rw->nlb) + 1; + size_t len = nvme_l2b(ns, nlb); + int64_t offset = nvme_l2b(ns, slba); + uint8_t *bounce = NULL; + struct nvme_compare_ctx *ctx = NULL; uint16_t status; - trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb); + trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); + + status = nvme_check_mdts(n, len); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), len); + return status; + } - status = nvme_check_bounds(n, ns, slba, nlb); + status = nvme_check_bounds(ns, slba, nlb); if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); return status; } + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, nlb); + if (status) { + return status; + } + } + + bounce = g_malloc(len); + + ctx = g_new(struct nvme_compare_ctx, 1); + ctx->bounce = bounce; + ctx->len = len; + + req->opaque = ctx; + + qemu_iovec_init(&ctx->iov, 1); + qemu_iovec_add(&ctx->iov, bounce, len); + + block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ); + blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req); + + return NVME_NO_COMPLETE; +} + +static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) +{ block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_WRITE); - req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count, - BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); + BLOCK_ACCT_FLUSH); + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); return NVME_NO_COMPLETE; } -static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; - uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); - + uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t data_size = nvme_l2b(ns, nlb); - uint64_t data_offset = nvme_l2b(ns, slba); - enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
- BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; + uint64_t data_offset; BlockBackend *blk = ns->blkconf.blk; uint16_t status; - trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), - nvme_nsid(ns), nlb, data_size, slba); + trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba); status = nvme_check_mdts(n, data_size); if (status) { @@ -979,39 +1655,680 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - status = nvme_check_bounds(n, ns, slba, nlb); + status = nvme_check_bounds(ns, slba, nlb); if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); goto invalid; } + if (ns->params.zoned) { + status = nvme_check_zone_read(ns, slba, nlb); + if (status) { + trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); + goto invalid; + } + } + status = nvme_map_dptr(n, data_size, req); if (status) { goto invalid; } - block_acct_start(blk_get_stats(blk), &req->acct, data_size, acct); + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, nlb); + if (status) { + goto invalid; + } + } + + data_offset = nvme_l2b(ns, slba); + + block_acct_start(blk_get_stats(blk), &req->acct, data_size, + BLOCK_ACCT_READ); if (req->qsg.sg) { - if (acct == BLOCK_ACCT_WRITE) { + req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, + nvme_rw_cb, req); + } + return NVME_NO_COMPLETE; + +invalid: + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); + return status | NVME_DNR; +} + +static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, + bool wrz) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + uint64_t slba = le64_to_cpu(rw->slba); + uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; + uint64_t data_size = nvme_l2b(ns, nlb); + uint64_t data_offset; + NvmeZone *zone; + NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; + BlockBackend *blk = ns->blkconf.blk; + uint16_t status; + + trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), + nvme_nsid(ns), nlb, data_size, slba); + + if (!wrz) { + status = nvme_check_mdts(n, data_size); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), data_size); + goto invalid; + } + } + + status = nvme_check_bounds(ns, slba, nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); + goto invalid; + } + + if (ns->params.zoned) { + zone = nvme_get_zone_by_slba(ns, slba); + + if (append) { + if (unlikely(slba != zone->d.zslba)) { + trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); + status = NVME_INVALID_FIELD; + goto invalid; + } + + if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { + trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); + status = NVME_INVALID_FIELD; + goto invalid; + } + + slba = zone->w_ptr; + res->slba = cpu_to_le64(slba); + } + + status = nvme_check_zone_write(n, ns, zone, slba, nlb); + if (status) { + goto invalid; + } + + status = nvme_auto_open_zone(ns, zone); + if (status) { + goto invalid; + } + + nvme_advance_zone_wp(ns, zone, nlb); + } + + data_offset = nvme_l2b(ns, slba); + + if (!wrz) { + status = nvme_map_dptr(n, data_size, req); + if (status) { + goto invalid; + } + + block_acct_start(blk_get_stats(blk), &req->acct, data_size, + BLOCK_ACCT_WRITE); + if (req->qsg.sg) { req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req); } else { - req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, 
nvme_rw_cb, req); - } - } else { - if (acct == BLOCK_ACCT_WRITE) { req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); } + } else { + block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE); + req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, + BDRV_REQ_MAY_UNMAP, nvme_rw_cb, + req); } return NVME_NO_COMPLETE; invalid: - block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct); + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); + return status | NVME_DNR; +} + +static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, false, false); +} + +static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, false, true); +} + +static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, true, false); +} + +static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, + uint64_t *slba, uint32_t *zone_idx) +{ + uint32_t dw10 = le32_to_cpu(c->cdw10); + uint32_t dw11 = le32_to_cpu(c->cdw11); + + if (!ns->params.zoned) { + trace_pci_nvme_err_invalid_opc(c->opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + + *slba = ((uint64_t)dw11) << 32 | dw10; + if (unlikely(*slba >= ns->id_ns.nsze)) { + trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze); + *slba = 0; + return NVME_LBA_RANGE | NVME_DNR; + } + + *zone_idx = nvme_zone_idx(ns, *slba); + assert(*zone_idx < ns->num_zones); + + return NVME_SUCCESS; +} + +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState, + NvmeRequest *); + +enum NvmeZoneProcessingMask { + NVME_PROC_CURRENT_ZONE = 0, + NVME_PROC_OPENED_ZONES = 1 << 0, + NVME_PROC_CLOSED_ZONES = 1 << 1, + NVME_PROC_READ_ONLY_ZONES = 1 << 2, + NVME_PROC_FULL_ZONES = 1 << 3, +}; + +static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, + NvmeZoneState state, NvmeRequest *req) +{ + uint16_t status; + + switch (state) { + case NVME_ZONE_STATE_EMPTY: + status = nvme_aor_check(ns, 1, 0); + if (status) { + return status; + } + nvme_aor_inc_active(ns); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + status = nvme_aor_check(ns, 0, 1); + if (status) { + if (state == NVME_ZONE_STATE_EMPTY) { + nvme_aor_dec_active(ns); + } + return status; + } + nvme_aor_inc_open(ns); + /* fall through */ + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); + /* fall through */ + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, + NvmeZoneState state, NvmeRequest *req) +{ + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, + NvmeZoneState state, NvmeRequest *req) +{ + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ + case NVME_ZONE_STATE_EMPTY: + zone->w_ptr = nvme_zone_wr_boundary(zone); + 
zone->d.wp = zone->w_ptr; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); + /* fall through */ + case NVME_ZONE_STATE_FULL: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, + NvmeZoneState state, NvmeRequest *req) +{ + uintptr_t *resets = (uintptr_t *)&req->opaque; + struct nvme_zone_reset_ctx *ctx; + + switch (state) { + case NVME_ZONE_STATE_EMPTY: + return NVME_SUCCESS; + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_FULL: + break; + default: + return NVME_ZONE_INVAL_TRANSITION; + } + + /* + * The zone reset aio callback needs to know the zone that is being reset + * in order to transition the zone on completion. + */ + ctx = g_new(struct nvme_zone_reset_ctx, 1); + ctx->req = req; + ctx->zone = zone; + + (*resets)++; + + blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba), + nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, + nvme_aio_zone_reset_cb, ctx); + + return NVME_NO_COMPLETE; +} + +static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, + NvmeZoneState state, NvmeRequest *req) +{ + switch (state) { + case NVME_ZONE_STATE_READ_ONLY: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE); + /* fall through */ + case NVME_ZONE_STATE_OFFLINE: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) +{ + uint16_t status; + uint8_t state = nvme_get_zone_state(zone); + + if (state == NVME_ZONE_STATE_EMPTY) { + status = nvme_aor_check(ns, 1, 0); + if (status) { + return status; + } + nvme_aor_inc_active(ns); + zone->d.za |= NVME_ZA_ZD_EXT_VALID; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + return NVME_SUCCESS; + } + + return NVME_ZONE_INVAL_TRANSITION; +} + +static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneProcessingMask proc_mask, + op_handler_t op_hndlr, NvmeRequest *req) +{ + uint16_t status = NVME_SUCCESS; + NvmeZoneState zs = nvme_get_zone_state(zone); + bool proc_zone; + + switch (zs) { + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; + break; + case NVME_ZONE_STATE_CLOSED: + proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; + break; + case NVME_ZONE_STATE_READ_ONLY: + proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES; + break; + case NVME_ZONE_STATE_FULL: + proc_zone = proc_mask & NVME_PROC_FULL_ZONES; + break; + default: + proc_zone = false; + } + + if (proc_zone) { + status = op_hndlr(ns, zone, zs, req); + } + + return status; +} + +static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneProcessingMask proc_mask, + op_handler_t op_hndlr, NvmeRequest *req) +{ + NvmeZone *next; + uint16_t status = NVME_SUCCESS; + int i; + + if (!proc_mask) { + status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req); + } else { + if (proc_mask & NVME_PROC_CLOSED_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { + goto out; + } + } + } + if (proc_mask & NVME_PROC_OPENED_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { + goto out; + } + } + + 
QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { + goto out; + } + } + } + if (proc_mask & NVME_PROC_FULL_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { + goto out; + } + } + } + + if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { + for (i = 0; i < ns->num_zones; i++, zone++) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { + goto out; + } + } + } + } + +out: + return status; +} + +static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = (NvmeCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + NvmeZone *zone; + uintptr_t *resets; + uint8_t *zd_ext; + uint32_t dw13 = le32_to_cpu(cmd->cdw13); + uint64_t slba = 0; + uint32_t zone_idx = 0; + uint16_t status; + uint8_t action; + bool all; + enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; + + action = dw13 & 0xff; + all = dw13 & 0x100; + + req->status = NVME_SUCCESS; + + if (!all) { + status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); + if (status) { + return status; + } + } + + zone = &ns->zone_array[zone_idx]; + if (slba != zone->d.zslba) { + trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (action) { + + case NVME_ZONE_ACTION_OPEN: + if (all) { + proc_mask = NVME_PROC_CLOSED_ZONES; + } + trace_pci_nvme_open_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req); + break; + + case NVME_ZONE_ACTION_CLOSE: + if (all) { + proc_mask = NVME_PROC_OPENED_ZONES; + } + trace_pci_nvme_close_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req); + break; + + case NVME_ZONE_ACTION_FINISH: + if (all) { + proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; + } + trace_pci_nvme_finish_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req); + break; + + case NVME_ZONE_ACTION_RESET: + resets = (uintptr_t *)&req->opaque; + + if (all) { + proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | + NVME_PROC_FULL_ZONES; + } + trace_pci_nvme_reset_zone(slba, zone_idx, all); + + *resets = 1; + + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req); + + (*resets)--; + + return *resets ? 
NVME_NO_COMPLETE : req->status; + + case NVME_ZONE_ACTION_OFFLINE: + if (all) { + proc_mask = NVME_PROC_READ_ONLY_ZONES; + } + trace_pci_nvme_offline_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req); + break; + + case NVME_ZONE_ACTION_SET_ZD_EXT: + trace_pci_nvme_set_descriptor_extension(slba, zone_idx); + if (all || !ns->params.zd_extension_size) { + return NVME_INVALID_FIELD | NVME_DNR; + } + zd_ext = nvme_get_zd_extension(ns, zone_idx); + status = nvme_dma(n, zd_ext, ns->params.zd_extension_size, + DMA_DIRECTION_TO_DEVICE, req); + if (status) { + trace_pci_nvme_err_zd_extension_map_error(zone_idx); + return status; + } + + status = nvme_set_zd_ext(ns, zone); + if (status == NVME_SUCCESS) { + trace_pci_nvme_zd_extension_set(zone_idx); + return status; + } + break; + + default: + trace_pci_nvme_err_invalid_mgmt_action(action); + status = NVME_INVALID_FIELD; + } + + if (status == NVME_ZONE_INVAL_TRANSITION) { + trace_pci_nvme_err_invalid_zone_state_transition(action, slba, + zone->d.za); + } + if (status) { + status |= NVME_DNR; + } + + return status; +} + +static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) +{ + NvmeZoneState zs = nvme_get_zone_state(zl); + + switch (zafs) { + case NVME_ZONE_REPORT_ALL: + return true; + case NVME_ZONE_REPORT_EMPTY: + return zs == NVME_ZONE_STATE_EMPTY; + case NVME_ZONE_REPORT_IMPLICITLY_OPEN: + return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN; + case NVME_ZONE_REPORT_EXPLICITLY_OPEN: + return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN; + case NVME_ZONE_REPORT_CLOSED: + return zs == NVME_ZONE_STATE_CLOSED; + case NVME_ZONE_REPORT_FULL: + return zs == NVME_ZONE_STATE_FULL; + case NVME_ZONE_REPORT_READ_ONLY: + return zs == NVME_ZONE_STATE_READ_ONLY; + case NVME_ZONE_REPORT_OFFLINE: + return zs == NVME_ZONE_STATE_OFFLINE; + default: + return false; + } +} + +static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = (NvmeCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + /* cdw12 is zero-based number of dwords to return. 
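(a value of n thus requests n + 1 dwords).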
Convert to bytes */ + uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2; + uint32_t dw13 = le32_to_cpu(cmd->cdw13); + uint32_t zone_idx, zra, zrasf, partial; + uint64_t max_zones, nr_zones = 0; + uint16_t status; + uint64_t slba, capacity = nvme_ns_nlbas(ns); + NvmeZoneDescr *z; + NvmeZone *zone; + NvmeZoneReportHeader *header; + void *buf, *buf_p; + size_t zone_entry_sz; + + req->status = NVME_SUCCESS; + + status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); + if (status) { + return status; + } + + zra = dw13 & 0xff; + if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) { + return NVME_INVALID_FIELD | NVME_DNR; + } + if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + zrasf = (dw13 >> 8) & 0xff; + if (zrasf > NVME_ZONE_REPORT_OFFLINE) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (data_size < sizeof(NvmeZoneReportHeader)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + status = nvme_check_mdts(n, data_size); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), data_size); + return status; + } + + partial = (dw13 >> 16) & 0x01; + + zone_entry_sz = sizeof(NvmeZoneDescr); + if (zra == NVME_ZONE_REPORT_EXTENDED) { + zone_entry_sz += ns->params.zd_extension_size; + } + + max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; + buf = g_malloc0(data_size); + + zone = &ns->zone_array[zone_idx]; + for (; slba < capacity; slba += ns->zone_size) { + if (partial && nr_zones >= max_zones) { + break; + } + if (nvme_zone_matches_filter(zrasf, zone++)) { + nr_zones++; + } + } + header = (NvmeZoneReportHeader *)buf; + header->nr_zones = cpu_to_le64(nr_zones); + + buf_p = buf + sizeof(NvmeZoneReportHeader); + for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) { + zone = &ns->zone_array[zone_idx]; + if (nvme_zone_matches_filter(zrasf, zone)) { + z = (NvmeZoneDescr *)buf_p; + buf_p += sizeof(NvmeZoneDescr); + + z->zt = zone->d.zt; + z->zs = zone->d.zs; + z->zcap = cpu_to_le64(zone->d.zcap); + z->zslba = cpu_to_le64(zone->d.zslba); + z->za = zone->d.za; + + if (nvme_wp_is_valid(zone)) { + z->wp = cpu_to_le64(zone->d.wp); + } else { + z->wp = cpu_to_le64(~0ULL); + } + + if (zra == NVME_ZONE_REPORT_EXTENDED) { + if (zone->d.za & NVME_ZA_ZD_EXT_VALID) { + memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx), + ns->params.zd_extension_size); + } + buf_p += ns->params.zd_extension_size; + } + + max_zones--; + } + } + + status = nvme_dma(n, (uint8_t *)buf, data_size, + DMA_DIRECTION_FROM_DEVICE, req); + + g_free(buf); + return status; } @@ -1022,10 +2339,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); - if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) { - return NVME_INVALID_OPCODE | NVME_DNR; - } - if (!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } @@ -1035,18 +2348,35 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } + if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + trace_pci_nvme_err_invalid_opc(req->cmd.opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_CMD_FLUSH: return nvme_flush(n, req); case NVME_CMD_WRITE_ZEROES: return nvme_write_zeroes(n, req); + case NVME_CMD_ZONE_APPEND: + return nvme_zone_append(n, req); case NVME_CMD_WRITE: + return nvme_write(n, req); case NVME_CMD_READ: - return nvme_rw(n, req); + return 
nvme_read(n, req); + case NVME_CMD_COMPARE: + return nvme_compare(n, req); + case NVME_CMD_DSM: + return nvme_dsm(n, req); + case NVME_CMD_ZONE_MGMT_SEND: + return nvme_zone_mgmt_send(n, req); + case NVME_CMD_ZONE_MGMT_RECV: + return nvme_zone_mgmt_recv(n, req); default: - trace_pci_nvme_err_invalid_opc(req->cmd.opcode); - return NVME_INVALID_OPCODE | NVME_DNR; + assert(false); } + + return NVME_INVALID_OPCODE | NVME_DNR; } static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) @@ -1214,6 +2544,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, } trans_len = MIN(sizeof(smart) - off, buf_len); + smart.critical_warning = n->smart_critical_warning; smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, 1000)); @@ -1281,6 +2612,47 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, DMA_DIRECTION_FROM_DEVICE, req); } +static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + NvmeEffectsLog log = {}; + const uint32_t *src_iocs = NULL; + uint32_t trans_len; + + if (off >= sizeof(log)) { + trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log)); + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (NVME_CC_CSS(n->bar.cc)) { + case NVME_CC_CSS_NVM: + src_iocs = nvme_cse_iocs_nvm; + /* fall through */ + case NVME_CC_CSS_ADMIN_ONLY: + break; + case NVME_CC_CSS_CSI: + switch (csi) { + case NVME_CSI_NVM: + src_iocs = nvme_cse_iocs_nvm; + break; + case NVME_CSI_ZONED: + src_iocs = nvme_cse_iocs_zoned; + break; + } + } + + memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); + + if (src_iocs) { + memcpy(log.iocs, src_iocs, sizeof(log.iocs)); + } + + trans_len = MIN(sizeof(log) - off, buf_len); + + return nvme_dma(n, ((uint8_t *)&log) + off, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); +} + static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) { NvmeCmd *cmd = &req->cmd; @@ -1292,6 +2664,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) uint8_t lid = dw10 & 0xff; uint8_t lsp = (dw10 >> 8) & 0xf; uint8_t rae = (dw10 >> 15) & 0x1; + uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24; uint32_t numdl, numdu; uint64_t off, lpol, lpou; size_t len; @@ -1324,6 +2697,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) return nvme_smart_info(n, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, len, off, req); + case NVME_LOG_CMD_EFFECTS: + return nvme_cmd_effects(n, csi, len, off, req); default: trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); return NVME_INVALID_FIELD | NVME_DNR; @@ -1334,7 +2709,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) { n->cq[cq->cqid] = NULL; timer_free(cq->timer); - msix_vector_unuse(&n->parent_obj, cq->vector); + if (msix_enabled(&n->parent_obj)) { + msix_vector_unuse(&n->parent_obj, cq->vector); + } if (cq->cqid) { g_free(cq); } @@ -1368,8 +2745,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, { int ret; - ret = msix_vector_use(&n->parent_obj, vector); - assert(ret == 0); + if (msix_enabled(&n->parent_obj)) { + ret = msix_vector_use(&n->parent_obj, vector); + assert(ret == 0); + } cq->ctrl = n; cq->cqid = cqid; cq->size = size; @@ -1436,6 +2815,23 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) return NVME_SUCCESS; } +static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) +{ + uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; + + return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); +} + +static inline bool 
nvme_csi_has_nvm_support(NvmeNamespace *ns) +{ + switch (ns->csi) { + case NVME_CSI_NVM: + case NVME_CSI_ZONED: + return true; + } + return false; +} + static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_identify_ctrl(); @@ -1444,11 +2840,30 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) DMA_DIRECTION_FROM_DEVICE, req); } +static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + NvmeIdCtrlZoned id = {}; + + trace_pci_nvme_identify_ctrl_csi(c->csi); + + if (c->csi == NVME_CSI_NVM) { + return nvme_rpt_empty_id_struct(n, req); + } else if (c->csi == NVME_CSI_ZONED) { + if (n->params.zasl_bs) { + id.zasl = n->zasl; + } + return nvme_dma(n, (uint8_t *)&id, sizeof(id), + DMA_DIRECTION_FROM_DEVICE, req); + } + + return NVME_INVALID_FIELD | NVME_DNR; +} + static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - NvmeIdNs *id_ns, inactive = { 0 }; uint32_t nsid = le32_to_cpu(c->nsid); trace_pci_nvme_identify_ns(nsid); @@ -1459,23 +2874,53 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = nvme_ns(n, nsid); if (unlikely(!ns)) { - id_ns = &inactive; - } else { - id_ns = &ns->id_ns; + return nvme_rpt_empty_id_struct(n, req); } - return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs), - DMA_DIRECTION_FROM_DEVICE, req); + if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { + return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), + DMA_DIRECTION_FROM_DEVICE, req); + } + + return NVME_INVALID_CMD_SET | NVME_DNR; +} + +static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint32_t nsid = le32_to_cpu(c->nsid); + + trace_pci_nvme_identify_ns_csi(nsid, c->csi); + + if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { + return NVME_INVALID_NSID | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + return nvme_rpt_empty_id_struct(n, req); + } + + if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { + return nvme_rpt_empty_id_struct(n, req); + } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { + return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), + DMA_DIRECTION_FROM_DEVICE, req); + } + + return NVME_INVALID_FIELD | NVME_DNR; } static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) { + NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - static const int data_len = NVME_IDENTIFY_DATA_SIZE; uint32_t min_nsid = le32_to_cpu(c->nsid); - uint32_t *list; - uint16_t ret; - int j = 0; + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; + static const int data_len = sizeof(list); + uint32_t *list_ptr = (uint32_t *)list; + int i, j = 0; trace_pci_nvme_identify_nslist(min_nsid); @@ -1489,33 +2934,79 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } - list = g_malloc0(data_len); - for (int i = 1; i <= n->num_namespaces; i++) { - if (i <= min_nsid || !nvme_ns(n, i)) { + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { continue; } - list[j++] = cpu_to_le32(i); + if (ns->params.nsid <= min_nsid) { + continue; + } + list_ptr[j++] = cpu_to_le32(ns->params.nsid); if (j == data_len / sizeof(uint32_t)) { break; } } - ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE, - req); - g_free(list); - return ret; + + return nvme_dma(n, 
list, data_len, DMA_DIRECTION_FROM_DEVICE, req); +} + +static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint32_t min_nsid = le32_to_cpu(c->nsid); + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; + static const int data_len = sizeof(list); + uint32_t *list_ptr = (uint32_t *)list; + int i, j = 0; + + trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); + + /* + * Same as in nvme_identify_nslist(), 0xffffffff/0xfffffffe are invalid. + */ + if (min_nsid >= NVME_NSID_BROADCAST - 1) { + return NVME_INVALID_NSID | NVME_DNR; + } + + if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { + continue; + } + list_ptr[j++] = cpu_to_le32(ns->params.nsid); + if (j == data_len / sizeof(uint32_t)) { + break; + } + } + + return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) { + NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint8_t list[NVME_IDENTIFY_DATA_SIZE]; + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; struct data { struct { NvmeIdNsDescr hdr; - uint8_t v[16]; + uint8_t v[NVME_NIDL_UUID]; } uuid; + struct { + NvmeIdNsDescr hdr; + uint8_t v; + } csi; }; struct data *ns_descrs = (struct data *)list; @@ -1526,24 +3017,38 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } - if (unlikely(!nvme_ns(n, nsid))) { + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { return NVME_INVALID_FIELD | NVME_DNR; } - memset(list, 0x0, sizeof(list)); - /* * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data * structure, a Namespace UUID (nidt = 0x3) must be reported in the - * Namespace Identification Descriptor. Add a very basic Namespace UUID - * here. + * Namespace Identification Descriptor. Add the namespace UUID here. 
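+ *
+ * In addition to the UUID, a Command Set Identifier (CSI) descriptor is
+ * reported below, so that hosts can tell which command set (NVM or
+ * Zoned) the namespace implements.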
*/ ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; - ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; - stl_be_p(&ns_descrs->uuid.v, nsid); + ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID; + memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); - return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE, - DMA_DIRECTION_FROM_DEVICE, req); + ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI; + ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; + ns_descrs->csi.v = ns->csi; + + return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req); +} + +static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) +{ + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; + static const int data_len = sizeof(list); + + trace_pci_nvme_identify_cmd_set(); + + NVME_SET_CSI(*list, NVME_CSI_NVM); + NVME_SET_CSI(*list, NVME_CSI_ZONED); + + return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) @@ -1552,13 +3057,29 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) switch (le32_to_cpu(c->cns)) { case NVME_ID_CNS_NS: + /* fall through */ + case NVME_ID_CNS_NS_PRESENT: return nvme_identify_ns(n, req); + case NVME_ID_CNS_CS_NS: + /* fall through */ + case NVME_ID_CNS_CS_NS_PRESENT: + return nvme_identify_ns_csi(n, req); case NVME_ID_CNS_CTRL: return nvme_identify_ctrl(n, req); + case NVME_ID_CNS_CS_CTRL: + return nvme_identify_ctrl_csi(n, req); case NVME_ID_CNS_NS_ACTIVE_LIST: + /* fall through */ + case NVME_ID_CNS_NS_PRESENT_LIST: return nvme_identify_nslist(n, req); + case NVME_ID_CNS_CS_NS_ACTIVE_LIST: + /* fall through */ + case NVME_ID_CNS_CS_NS_PRESENT_LIST: + return nvme_identify_nslist_csi(n, req); case NVME_ID_CNS_NS_DESCR_LIST: return nvme_identify_ns_descr_list(n, req); + case NVME_ID_CNS_IO_COMMAND_SET: + return nvme_identify_cmd_set(n, req); default: trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); return NVME_INVALID_FIELD | NVME_DNR; @@ -1630,6 +3151,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) uint8_t fid = NVME_GETSETFEAT_FID(dw10); NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; + NvmeNamespace *ns; + int i; static const uint32_t nvme_feature_default[NVME_FID_MAX] = { [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, @@ -1692,8 +3215,31 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) } return NVME_INVALID_FIELD | NVME_DNR; + case NVME_ERROR_RECOVERY: + if (!nvme_nsid_valid(n, nsid)) { + return NVME_INVALID_NSID | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + result = ns->features.err_rec; + goto out; case NVME_VOLATILE_WRITE_CACHE: - result = n->features.vwc; + result = 0; + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + result = blk_enable_write_cache(ns->blkconf.blk); + if (result) { + break; + } + } trace_pci_nvme_getfeat_vwcache(result ? 
"enabled" : "disabled"); goto out; case NVME_ASYNCHRONOUS_EVENT_CONF: @@ -1734,7 +3280,9 @@ defaults: if (iv == n->admin_cq.vector) { result |= NVME_INTVC_NOCOALESCING; } - + break; + case NVME_COMMAND_SET_PROFILE: + result = 0; break; default: result = nvme_feature_default[fid]; @@ -1753,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) ret = nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), DMA_DIRECTION_TO_DEVICE, req); - if (ret != NVME_SUCCESS) { + if (ret) { return ret; } @@ -1764,7 +3312,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) { - NvmeNamespace *ns; + NvmeNamespace *ns = NULL; NvmeCmd *cmd = &req->cmd; uint32_t dw10 = le32_to_cpu(cmd->cdw10); @@ -1772,10 +3320,11 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) uint32_t nsid = le32_to_cpu(cmd->nsid); uint8_t fid = NVME_GETSETFEAT_FID(dw10); uint8_t save = NVME_SETFEAT_SAVE(dw10); + int i; trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); - if (save) { + if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) { return NVME_FID_NOT_SAVEABLE | NVME_DNR; } @@ -1823,19 +3372,36 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } - if (((n->temperature >= n->features.temp_thresh_hi) || - (n->temperature <= n->features.temp_thresh_low)) && - NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) { - nvme_enqueue_event(n, NVME_AER_TYPE_SMART, - NVME_AER_INFO_SMART_TEMP_THRESH, - NVME_LOG_SMART_INFO); + if ((n->temperature >= n->features.temp_thresh_hi) || + (n->temperature <= n->features.temp_thresh_low)) { + nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH); } break; - case NVME_VOLATILE_WRITE_CACHE: - n->features.vwc = dw11 & 0x1; + case NVME_ERROR_RECOVERY: + if (nsid == NVME_NSID_BROADCAST) { + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); - for (int i = 1; i <= n->num_namespaces; i++) { + if (!ns) { + continue; + } + + if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { + ns->features.err_rec = dw11; + } + } + + break; + } + + assert(ns); + if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { + ns->features.err_rec = dw11; + } + break; + case NVME_VOLATILE_WRITE_CACHE: + for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { continue; @@ -1875,6 +3441,12 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) break; case NVME_TIMESTAMP: return nvme_set_feature_timestamp(n, req); + case NVME_COMMAND_SET_PROFILE: + if (dw11 & 0x1ff) { + trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); + return NVME_CMD_SET_CMB_REJECTED | NVME_DNR; + } + break; default: return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; } @@ -1905,6 +3477,11 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, nvme_adm_opc_str(req->cmd.opcode)); + if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_ADM_CMD_DELETE_SQ: return nvme_del_sq(n, req); @@ -1927,9 +3504,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) case NVME_ADM_CMD_ASYNC_EV_REQ: return nvme_aer(n, req); default: - trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); - return NVME_INVALID_OPCODE | NVME_DNR; + assert(false); } + + return NVME_INVALID_OPCODE | NVME_DNR; } static void 
nvme_process_sq(void *opaque) @@ -1969,7 +3547,7 @@ static void nvme_process_sq(void *opaque) } } -static void nvme_clear_ctrl(NvmeCtrl *n) +static void nvme_ctrl_reset(NvmeCtrl *n) { NvmeNamespace *ns; int i; @@ -2004,16 +3582,54 @@ static void nvme_clear_ctrl(NvmeCtrl *n) n->outstanding_aers = 0; n->qs_created = false; + n->bar.cc = 0; +} + +static void nvme_ctrl_shutdown(NvmeCtrl *n) +{ + NvmeNamespace *ns; + int i; + + if (n->pmr.dev) { + memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); + } + for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { continue; } - nvme_ns_flush(ns); + nvme_ns_shutdown(ns); } +} - n->bar.cc = 0; +static void nvme_select_ns_iocs(NvmeCtrl *n) +{ + NvmeNamespace *ns; + int i; + + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + ns->iocs = nvme_cse_iocs_none; + switch (ns->csi) { + case NVME_CSI_NVM: + if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; + case NVME_CSI_ZONED: + if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { + ns->iocs = nvme_cse_iocs_zoned; + } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; + } + } } static int nvme_start_ctrl(NvmeCtrl *n) @@ -2110,13 +3726,41 @@ static int nvme_start_ctrl(NvmeCtrl *n) nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, NVME_AQA_ASQS(n->bar.aqa) + 1); + if (!n->params.zasl_bs) { + n->zasl = n->params.mdts; + } else { + if (n->params.zasl_bs < n->page_size) { + trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs, + n->page_size); + return -1; + } + n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); + } + nvme_set_timestamp(n, 0ULL); QTAILQ_INIT(&n->aer_queue); + nvme_select_ns_iocs(n); + return 0; } +static void nvme_cmb_enable_regs(NvmeCtrl *n) +{ + NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1); + NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1); + NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); + + NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); + NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ + NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb); +} + static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, unsigned size) { @@ -2180,12 +3824,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, } } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) { trace_pci_nvme_mmio_stopped(); - nvme_clear_ctrl(n); + nvme_ctrl_reset(n); n->bar.csts &= ~NVME_CSTS_READY; } if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) { trace_pci_nvme_mmio_shutdown_set(); - nvme_clear_ctrl(n); + nvme_ctrl_shutdown(n); n->bar.cc = data; n->bar.csts |= NVME_CSTS_SHST_COMPLETE; } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) { @@ -2218,19 +3862,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, trace_pci_nvme_mmio_aqattr(data & 0xffffffff); break; case 0x28: /* ASQ */ - n->bar.asq = data; + n->bar.asq = size == 8 ? data : + (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff); trace_pci_nvme_mmio_asqaddr(data); break; case 0x2c: /* ASQ hi */ - n->bar.asq |= data << 32; + n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32); trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq); break; case 0x30: /* ACQ */ trace_pci_nvme_mmio_acqaddr(data); - n->bar.acq = data; + n->bar.acq = size == 8 ? 
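/*
 * Illustrative sketch, not part of the patch: nvme_start_ctrl() above
 * encodes the zone append size limit as a power-of-two exponent in units
 * of the controller page size, via 31 - clz32(zasl_bs / page_size).
 * Standalone check with hypothetical values (a 128 KiB limit and 4 KiB
 * pages give ZASL = 5, i.e. 2^5 pages):
 */
#include <assert.h>
#include <stdint.h>

static uint8_t zasl_exponent(uint32_t zasl_bs, uint32_t page_size)
{
    return 31 - __builtin_clz(zasl_bs / page_size); /* clz32() equivalent */
}

int main(void)
{
    assert(zasl_exponent(128 * 1024, 4096) == 5);
    return 0;
}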
data : + (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff); break; case 0x34: /* ACQ hi */ - n->bar.acq |= data << 32; + n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32); trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq); break; case 0x38: /* CMBLOC */ @@ -2242,12 +3888,53 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly, "invalid write to read only CMBSZ, ignored"); return; + case 0x50: /* CMBMSC */ + if (!NVME_CAP_CMBS(n->bar.cap)) { + return; + } + + n->bar.cmbmsc = size == 8 ? data : + (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff); + n->cmb.cmse = false; + + if (NVME_CMBMSC_CRE(data)) { + nvme_cmb_enable_regs(n); + + if (NVME_CMBMSC_CMSE(data)) { + hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT; + if (cba + int128_get64(n->cmb.mem.size) < cba) { + NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1); + return; + } + + n->cmb.cba = cba; + n->cmb.cmse = true; + } + } else { + n->bar.cmbsz = 0; + n->bar.cmbloc = 0; + } + + return; + case 0x54: /* CMBMSC hi */ + n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32); + return; + case 0xE00: /* PMRCAP */ NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, "invalid write to PMRCAP register, ignored"); return; - case 0xE04: /* TODO PMRCTL */ - break; + case 0xE04: /* PMRCTL */ + n->bar.pmrctl = data; + if (NVME_PMRCTL_EN(data)) { + memory_region_set_enabled(&n->pmr.dev->mr, true); + n->bar.pmrsts = 0; + } else { + memory_region_set_enabled(&n->pmr.dev->mr, false); + NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); + n->pmr.cmse = false; + } + return; case 0xE08: /* PMRSTS */ NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, "invalid write to PMRSTS register, ignored"); @@ -2260,8 +3947,33 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly, "invalid write to PMRSWTP register, ignored"); return; - case 0xE14: /* TODO PMRMSC */ - break; + case 0xE14: /* PMRMSCL */ + if (!NVME_CAP_PMRS(n->bar.cap)) { + return; + } + + n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff); + n->pmr.cmse = false; + + if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) { + hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT; + if (cba + int128_get64(n->pmr.dev->mr.size) < cba) { + NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1); + return; + } + + n->pmr.cmse = true; + n->pmr.cba = cba; + } + + return; + case 0xE18: /* PMRMSCU */ + if (!NVME_CAP_PMRS(n->bar.cap)) { + return; + } + + n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32); + return; default: NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, "invalid MMIO write," @@ -2277,7 +3989,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) uint8_t *ptr = (uint8_t *)&n->bar; uint64_t val = 0; - trace_pci_nvme_mmio_read(addr); + trace_pci_nvme_mmio_read(addr, size); if (unlikely(addr & (sizeof(uint32_t) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, @@ -2299,7 +4011,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) */ if (addr == 0xE08 && (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) { - memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size); + memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); } memcpy(&val, ptr + addr, size); } else { @@ -2441,7 +4153,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, { NvmeCtrl *n = (NvmeCtrl *)opaque; - trace_pci_nvme_mmio_write(addr, data); + trace_pci_nvme_mmio_write(addr, data, size); if (addr < sizeof(n->bar)) { nvme_write_bar(n, 
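/*
 * Illustrative sketch, not part of the patch: the ASQ, ACQ, CMBMSC and
 * PMRMSC hunks above all follow one pattern for 64-bit registers, where
 * an 8-byte MMIO write replaces the whole register and a 4-byte write
 * replaces only the addressed half. A hypothetical distillation:
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t reg64_write(uint64_t old, uint64_t data, unsigned size,
                            bool high_half)
{
    if (size == 8) {
        return data;
    }
    return high_half ? (old & 0xffffffffULL) | (data << 32)
                     : (old & ~0xffffffffULL) | (data & 0xffffffff);
}
/*
 * The CMBMSC and PMRMSCL handlers additionally refuse a controller base
 * address whose window would wrap around the address space, which is what
 * the "cba + size < cba" checks above detect.
 */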
addr, data, size); @@ -2464,13 +4176,13 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; - stn_le_p(&n->cmbuf[addr], size, data); + stn_le_p(&n->cmb.buf[addr], size, data); } static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; - return ldn_le_p(&n->cmbuf[addr], size); + return ldn_le_p(&n->cmb.buf[addr], size); } static const MemoryRegionOps nvme_cmb_ops = { @@ -2518,19 +4230,26 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) return; } - if (!n->params.cmb_size_mb && n->pmrdev) { - if (host_memory_backend_is_mapped(n->pmrdev)) { + if (n->pmr.dev) { + if (host_memory_backend_is_mapped(n->pmr.dev)) { error_setg(errp, "can't use already busy memdev: %s", - object_get_canonical_path_component(OBJECT(n->pmrdev))); + object_get_canonical_path_component(OBJECT(n->pmr.dev))); return; } - if (!is_power_of_2(n->pmrdev->size)) { + if (!is_power_of_2(n->pmr.dev->size)) { error_setg(errp, "pmr backend size needs to be power of 2 in size"); return; } - host_memory_backend_set_mapped(n->pmrdev, true); + host_memory_backend_set_mapped(n->pmr.dev, true); + } + + if (n->params.zasl_bs) { + if (!is_power_of_2(n->params.zasl_bs)) { + error_setg(errp, "zone append size limit has to be a power of 2"); + return; + } } } @@ -2586,78 +4305,49 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) { - NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); - NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0); + uint64_t cmb_size = n->params.cmb_size_mb * MiB; - NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); - NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ - NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb); - - n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz)); - memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n, - "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz)); - pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc), + n->cmb.buf = g_malloc0(cmb_size); + memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n, + "nvme-cmb", cmb_size); + pci_register_bar(pci_dev, NVME_CMB_BIR, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 | - PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem); + PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem); + + NVME_CAP_SET_CMBS(n->bar.cap, 1); + + if (n->params.legacy_cmb) { + nvme_cmb_enable_regs(n); + n->cmb.cmse = true; + } } static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) { - /* Controller Capabilities register */ - NVME_CAP_SET_PMRS(n->bar.cap, 1); - - /* PMR Capabities register */ - n->bar.pmrcap = 0; - NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0); - NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0); + NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1); + NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1); NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR); - NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0); /* Turn on bit 1 support */ NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02); - NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0); - NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0); - - /* PMR Control register */ - n->bar.pmrctl = 0; - NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0); - - /* PMR Status register */ - n->bar.pmrsts = 0; - NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0); - NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0); - NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0); - 
NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0); - - /* PMR Elasticity Buffer Size register */ - n->bar.pmrebs = 0; - NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0); - NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0); - NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0); - - /* PMR Sustained Write Throughput register */ - n->bar.pmrswtp = 0; - NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0); - NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0); - - /* PMR Memory Space Control register */ - n->bar.pmrmsc = 0; - NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0); - NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0); + NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1); pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap), PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 | - PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr); + PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr); + + memory_region_set_enabled(&n->pmr.dev->mr, false); } -static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) +static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { uint8_t *pci_conf = pci_dev->config; + uint64_t bar_size, msix_table_size, msix_pba_size; + unsigned msix_table_offset, msix_pba_offset; + int ret; + + Error *err = NULL; pci_conf[PCI_INTERRUPT_PIN] = 1; pci_config_set_prog_interface(pci_conf, 0x2); @@ -2673,19 +4363,46 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); pcie_endpoint_cap_init(pci_dev, 0x80); + bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB); + msix_table_offset = bar_size; + msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize; + + bar_size += msix_table_size; + bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); + msix_pba_offset = bar_size; + msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8; + + bar_size += msix_pba_size; + bar_size = pow2ceil(bar_size); + + memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size); memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme", n->reg_size); + memory_region_add_subregion(&n->bar0, 0, &n->iomem); + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem); - if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) { - return; + PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); + ret = msix_init(pci_dev, n->params.msix_qsize, + &n->bar0, 0, msix_table_offset, + &n->bar0, 0, msix_pba_offset, 0, &err); + if (ret < 0) { + if (ret == -ENOTSUP) { + warn_report_err(err); + } else { + error_propagate(errp, err); + return ret; + } } if (n->params.cmb_size_mb) { nvme_init_cmb(n, pci_dev); - } else if (n->pmrdev) { + } + + if (n->pmr.dev) { nvme_init_pmr(n, pci_dev); } + + return 0; } static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) @@ -2706,6 +4423,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = cpu_to_le16(0); + id->cntrltype = 0x1; /* * Because the controller always completes the Abort command immediately, @@ -2721,7 +4439,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->acl = 3; id->aerl = n->params.aerl; id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; - id->lpa = NVME_LPA_NS_SMART | NVME_LPA_EXTENDED; + id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED; /* recommended default value (~70 C) */ id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); @@ -2731,9 +4449,10 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cqes = (0x4 << 4) | 0x4; id->nn = 
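/*
 * Illustrative sketch, not part of the patch: nvme_init_pci() above
 * carves BAR0 into the register file, the MSI-X table and the PBA, each
 * 4 KiB aligned, then rounds the BAR up to a power of two. Standalone
 * arithmetic check with hypothetical sizes (16 KiB of registers and 65
 * vectors; MSI-X table entries are 16 bytes each):
 */
#include <assert.h>
#include <stdint.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
    uint64_t table = ALIGN_UP(16384, 4096);         /* table at 16384 */
    uint64_t pba = ALIGN_UP(table + 65 * 16, 4096); /* 1040-byte table */
    uint64_t end = pba + ALIGN_UP(65, 64) / 8;      /* 16-byte PBA */
    uint64_t bar = 1;
    while (bar < end) {
        bar <<= 1;                                  /* pow2ceil */
    }
    assert(table == 16384 && pba == 20480 && bar == 32768);
    return 0;
}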
cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | - NVME_ONCS_FEATURES); + NVME_ONCS_FEATURES | NVME_ONCS_DSM | + NVME_ONCS_COMPARE); - id->vwc = 0x1; + id->vwc = (0x2 << 1) | 0x1; id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); @@ -2745,13 +4464,15 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); - n->bar.cap = 0; NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); + NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); + NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); + NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0); n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; @@ -2773,9 +4494,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) &pci_dev->qdev, n->parent_obj.qdev.id); nvme_init_state(n); - nvme_init_pci(n, pci_dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (nvme_init_pci(n, pci_dev, errp)) { return; } @@ -2786,7 +4505,11 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) ns = &n->namespace; ns->params.nsid = 1; - if (nvme_ns_setup(n, ns, errp)) { + if (nvme_ns_setup(ns, errp)) { + return; + } + + if (nvme_register_namespace(n, ns, errp)) { return; } } @@ -2795,25 +4518,37 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) static void nvme_exit(PCIDevice *pci_dev) { NvmeCtrl *n = NVME(pci_dev); + NvmeNamespace *ns; + int i; + + nvme_ctrl_reset(n); + + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + nvme_ns_cleanup(ns); + } - nvme_clear_ctrl(n); g_free(n->cq); g_free(n->sq); g_free(n->aer_reqs); if (n->params.cmb_size_mb) { - g_free(n->cmbuf); + g_free(n->cmb.buf); } - if (n->pmrdev) { - host_memory_backend_set_mapped(n->pmrdev, false); + if (n->pmr.dev) { + host_memory_backend_set_mapped(n->pmr.dev, false); } msix_uninit_exclusive_bar(pci_dev); } static Property nvme_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf), - DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND, + DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND, HostMemoryBackend *), DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0), @@ -2824,9 +4559,54 @@ static Property nvme_props[] = { DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), + DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), + DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs, + NVME_DEFAULT_MAX_ZA_SIZE), DEFINE_PROP_END_OF_LIST(), }; +static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + NvmeCtrl *n = NVME(obj); + uint8_t value = n->smart_critical_warning; + + visit_type_uint8(v, name, &value, errp); +} + +static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + NvmeCtrl *n = NVME(obj); + uint8_t value, old_value, cap = 0, index, event; + + if (!visit_type_uint8(v, name, &value, errp)) { + return; + } + + cap = NVME_SMART_SPARE | 
NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY + | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; + if (NVME_CAP_PMRS(n->bar.cap)) { + cap |= NVME_SMART_PMR_UNRELIABLE; + } + + if ((value & cap) != value) { + error_setg(errp, "unsupported smart critical warning bits: 0x%x", + value & ~cap); + return; + } + + old_value = n->smart_critical_warning; + n->smart_critical_warning = value; + + /* only inject new bits of smart critical warning */ + for (index = 0; index < NVME_SMART_WARN_MAX; index++) { + event = 1 << index; + if (value & ~old_value & event) + nvme_smart_event(n, event); + } +} + static const VMStateDescription nvme_vmstate = { .name = "nvme", .unmigratable = 1, @@ -2850,13 +4630,17 @@ static void nvme_class_init(ObjectClass *oc, void *data) static void nvme_instance_init(Object *obj) { - NvmeCtrl *s = NVME(obj); + NvmeCtrl *n = NVME(obj); - if (s->namespace.blkconf.blk) { - device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex, + if (n->namespace.blkconf.blk) { + device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex, "bootindex", "/namespace@1,0", DEVICE(obj)); } + + object_property_add(obj, "smart_critical_warning", "uint8", + nvme_get_smart_warning, + nvme_set_smart_warning, NULL, NULL); } static const TypeInfo nvme_info = { diff --git a/hw/block/nvme.h b/hw/block/nvme.h index e080a2318a..dee6092bd4 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -6,6 +6,9 @@ #define NVME_MAX_NAMESPACES 256 +#define NVME_DEFAULT_ZONE_SIZE (128 * MiB) +#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ @@ -16,6 +19,8 @@ typedef struct NvmeParams { uint32_t aer_max_queued; uint8_t mdts; bool use_intel_id; + uint32_t zasl_bs; + bool legacy_cmb; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -28,6 +33,7 @@ typedef struct NvmeRequest { struct NvmeNamespace *ns; BlockAIOCB *aiocb; uint16_t status; + void *opaque; NvmeCqe cqe; NvmeCmd cmd; BlockAcctCookie acct; @@ -59,7 +65,12 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH"; case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; + case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; + case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; + case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; + case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; + case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; default: return "NVME_NVM_CMD_UNKNOWN"; } } @@ -111,13 +122,12 @@ typedef struct NvmeFeatureVal { uint16_t temp_thresh_low; }; uint32_t async_config; - uint32_t vwc; } NvmeFeatureVal; typedef struct NvmeCtrl { PCIDevice parent_obj; + MemoryRegion bar0; MemoryRegion iomem; - MemoryRegion ctrl_mem; NvmeBar bar; NvmeParams params; NvmeBus bus; @@ -133,20 +143,33 @@ typedef struct NvmeCtrl { uint32_t num_namespaces; uint32_t max_q_ents; uint8_t outstanding_aers; - uint8_t *cmbuf; uint32_t irq_status; uint64_t host_timestamp; /* Timestamp sent by the host */ uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock time */ uint64_t starttime_ms; uint16_t temperature; + uint8_t smart_critical_warning; - HostMemoryBackend *pmrdev; + struct { + MemoryRegion mem; + uint8_t *buf; + bool cmse; + hwaddr cba; + } cmb; + + struct { + HostMemoryBackend *dev; + bool cmse; + hwaddr cba; + } pmr; uint8_t aer_mask; NvmeRequest **aer_reqs; QTAILQ_HEAD(, 
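/*
 * Illustrative sketch, not part of the patch: nvme_set_smart_warning()
 * above raises an asynchronous event only for warning bits that
 * transition from 0 to 1, which is what the value & ~old_value & event
 * mask isolates. Standalone check:
 */
#include <assert.h>
#include <stdint.h>

static uint8_t newly_raised_bits(uint8_t old_value, uint8_t value)
{
    return value & ~old_value;
}

int main(void)
{
    /* bit 1 was already set; only bit 4 is newly injected */
    assert(newly_raised_bits(0x02, 0x12) == 0x10);
    return 0;
}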
NvmeAsyncEvent) aer_queue; int aer_queued; + uint8_t zasl; + NvmeNamespace namespace; NvmeNamespace *namespaces[NVME_MAX_NAMESPACES]; NvmeSQueue **sq; diff --git a/hw/block/trace-events b/hw/block/trace-events index c1537e3ac0..d32475c398 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -40,16 +40,27 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" -pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" +pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" +pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" -pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" +pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" +pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" +pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" +pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8"" +pci_nvme_identify_cmd_set(void) "identify i/o command set" pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32"" pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t 
len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32"" @@ -69,8 +80,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d" pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8"" pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs" pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16"" -pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64"" -pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64"" +pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d" +pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d" pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16"" pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" new_tail %"PRIu16"" pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64"" @@ -85,6 +96,15 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded" pci_nvme_mmio_stopped(void) "cleared controller enable bit" pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" +pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32"" +pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_idx=%"PRIu32"" +pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state" +pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" # nvme traces for error conditions pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" @@ -102,6 +122,25 @@ pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64"" +pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64"" +pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64"" +pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64"" +pci_nvme_err_invalid_zone_state_transition(uint8_t 
action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32"" +pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" +pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64"" +pci_nvme_err_zone_is_full(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_is_read_only(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_is_offline(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%"PRIx64" nlb %"PRIu32" zcap 0x%"PRIx64"" +pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64"" +pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" +pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded" +pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded" +pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor extension for zone_idx=%"PRIu32"" +pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16"" @@ -134,7 +173,9 @@ pci_nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_ pci_nvme_err_startfail_css(uint8_t css) "nvme_start_ctrl failed because invalid command set selected:%u" pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero" pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero" +pci_nvme_err_startfail_zasl_too_small(uint32_t zasl, uint32_t pagesz) "nvme_start_ctrl failed because zone append size limit %"PRIu32" is too small, needs to be >= %"PRIu32"" pci_nvme_err_startfail(void) "setting controller enable bit failed" +pci_nvme_err_invalid_mgmt_action(uint8_t action) "action=0x%"PRIx8"" # Traces for undefined behavior pci_nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64"" @@ -158,6 +199,7 @@ pci_nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring" pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring" pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring" +pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field" # xen-block.c xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s d%up%u" diff --git a/hw/core/machine.c b/hw/core/machine.c index 5d6163ab70..970046f438 100644 --- 
a/hw/core/machine.c +++ b/hw/core/machine.c @@ -32,6 +32,9 @@ #include "hw/mem/nvdimm.h" #include "migration/global_state.h" #include "migration/vmstate.h" +#include "exec/confidential-guest-support.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-pci.h" GlobalProperty hw_compat_5_2[] = {}; const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); @@ -427,24 +430,37 @@ static char *machine_get_memory_encryption(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); - return g_strdup(ms->memory_encryption); + if (ms->cgs) { + return g_strdup(object_get_canonical_path_component(OBJECT(ms->cgs))); + } + + return NULL; } static void machine_set_memory_encryption(Object *obj, const char *value, Error **errp) { - MachineState *ms = MACHINE(obj); + Object *cgs = + object_resolve_path_component(object_get_objects_root(), value); + + if (!cgs) { + error_setg(errp, "No such memory encryption object '%s'", value); + return; + } - g_free(ms->memory_encryption); - ms->memory_encryption = g_strdup(value); + object_property_set_link(obj, "confidential-guest-support", cgs, errp); +} +static void machine_check_confidential_guest_support(const Object *obj, + const char *name, + Object *new_target, + Error **errp) +{ /* - * With memory encryption, the host can't see the real contents of RAM, - * so there's no point in it trying to merge areas. + * So far the only constraint is that the target has the + * TYPE_CONFIDENTIAL_GUEST_SUPPORT interface, and that's checked + * by the QOM core */ - if (value) { - machine_set_mem_merge(obj, false, errp); - } } static bool machine_get_nvdimm(Object *obj, Error **errp) @@ -844,6 +860,15 @@ static void machine_class_init(ObjectClass *oc, void *data) object_class_property_set_description(oc, "suppress-vmdesc", "Set on to disable self-describing migration"); + object_class_property_add_link(oc, "confidential-guest-support", + TYPE_CONFIDENTIAL_GUEST_SUPPORT, + offsetof(MachineState, cgs), + machine_check_confidential_guest_support, + OBJ_PROP_LINK_STRONG); + object_class_property_set_description(oc, "confidential-guest-support", + "Set confidential guest scheme to support"); + + /* For compatibility */ object_class_property_add_str(oc, "memory-encryption", machine_get_memory_encryption, machine_set_memory_encryption); object_class_property_set_description(oc, "memory-encryption", @@ -1166,6 +1191,26 @@ void machine_run_board_init(MachineState *machine) cc->deprecation_note); } + if (machine->cgs) { + /* + * With confidential guests, the host can't see the real + * contents of RAM, so there's no point in it trying to merge + * areas. + */ + machine_set_mem_merge(OBJECT(machine), false, &error_abort); + + /* + * Virtio devices can't count on directly accessing guest + * memory, so they need iommu_platform=on to use normal DMA + * mechanisms. That requires also disabling legacy virtio + * support for those virtio pci devices which allow it. 
+ */ + object_register_sugar_prop(TYPE_VIRTIO_PCI, "disable-legacy", + "on", true); + object_register_sugar_prop(TYPE_VIRTIO_DEVICE, "iommu_platform", + "on", false); + } + machine_class->init(machine); phase_advance(PHASE_MACHINE_INITIALIZED); } diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index 92e90ff013..11172214f1 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -38,6 +38,7 @@ #include "sysemu/sysemu.h" #include "hw/block/flash.h" #include "sysemu/kvm.h" +#include "sysemu/sev.h" #define FLASH_SECTOR_SIZE 4096 @@ -147,7 +148,7 @@ static void pc_system_flash_map(PCMachineState *pcms, PFlashCFI01 *system_flash; MemoryRegion *flash_mem; void *flash_ptr; - int ret, flash_size; + int flash_size; assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); @@ -191,16 +192,10 @@ static void pc_system_flash_map(PCMachineState *pcms, flash_mem = pflash_cfi01_get_memory(system_flash); pc_isa_bios_init(rom_memory, flash_mem, size); - /* Encrypt the pflash boot ROM */ - if (kvm_memcrypt_enabled()) { - flash_ptr = memory_region_get_ram_ptr(flash_mem); - flash_size = memory_region_size(flash_mem); - ret = kvm_memcrypt_encrypt_data(flash_ptr, flash_size); - if (ret) { - error_report("failed to encrypt pflash rom"); - exit(1); - } - } + /* Encrypt the pflash boot ROM, if necessary */ + flash_ptr = memory_region_get_ram_ptr(flash_mem); + flash_size = memory_region_size(flash_mem); + sev_encrypt_flash(flash_ptr, flash_size, &error_fatal); } } } diff --git a/hw/intc/pnv_xive.c b/hw/intc/pnv_xive.c index 5f69626b3a..ad43483612 100644 --- a/hw/intc/pnv_xive.c +++ b/hw/intc/pnv_xive.c @@ -24,6 +24,7 @@ #include "hw/ppc/xive_regs.h" #include "hw/qdev-properties.h" #include "hw/ppc/ppc.h" +#include "trace.h" #include <libfdt.h> @@ -1319,6 +1320,8 @@ static void pnv_xive_ic_hw_trigger(PnvXive *xive, hwaddr addr, uint64_t val) uint8_t blk; uint32_t idx; + trace_pnv_xive_ic_hw_trigger(addr, val); + if (val & XIVE_TRIGGER_END) { xive_error(xive, "IC: END trigger at @0x%"HWADDR_PRIx" data 0x%"PRIx64, addr, val); diff --git a/hw/intc/trace-events b/hw/intc/trace-events index 8ed397a0d5..45ddaf48df 100644 --- a/hw/intc/trace-events +++ b/hw/intc/trace-events @@ -236,3 +236,6 @@ xive_tctx_tm_write(uint64_t offset, unsigned int size, uint64_t value) "@0x0x%"P xive_tctx_tm_read(uint64_t offset, unsigned int size, uint64_t value) "@0x0x%"PRIx64" sz=%d val=0x%" PRIx64 xive_presenter_notify(uint8_t nvt_blk, uint32_t nvt_idx, uint8_t ring) "found NVT 0x%x/0x%x ring=0x%x" xive_end_source_read(uint8_t end_blk, uint32_t end_idx, uint64_t addr) "END 0x%x/0x%x @0x0x%"PRIx64 + +# pnv_xive.c +pnv_xive_ic_hw_trigger(uint64_t addr, uint64_t val) "@0x%"PRIx64" val=0x%"PRIx64 diff --git a/hw/intc/xive.c b/hw/intc/xive.c index fa8c3d8287..eeb4e62ba9 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -1294,7 +1294,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon) pq = xive_get_field32(END_W1_ESn, end->w1); - monitor_printf(mon, " %08x %c%c %c%c%c%c%c%c%c prio:%d nvt:%02x/%04x", + monitor_printf(mon, " %08x %c%c %c%c%c%c%c%c%c%c prio:%d nvt:%02x/%04x", end_idx, pq & XIVE_ESB_VAL_P ? 'P' : '-', pq & XIVE_ESB_VAL_Q ? 'Q' : '-', @@ -1305,6 +1305,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon) xive_end_is_escalate(end) ? 'e' : '-', xive_end_is_uncond_escalation(end) ? 'u' : '-', xive_end_is_silent_escalation(end) ? 's' : '-', + xive_end_is_firmware(end) ? 
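/*
 * Illustrative usage note, not part of the patch: the machine.c hunks
 * above turn -machine memory-encryption=<id> into sugar for the new
 * confidential-guest-support link, resolving <id> among user-created
 * objects. Hypothetical invocations, following the pattern documented in
 * hw/ppc/pef.c later in this patch:
 *
 *   $QEMU -object pef-guest,id=pef0 \
 *         -machine pseries,confidential-guest-support=pef0
 *
 *   $QEMU -object sev-guest,id=sev0 \
 *         -machine memory-encryption=sev0     (legacy spelling)
 */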
'f' : '-', priority, nvt_blk, nvt_idx); if (qaddr_base) { diff --git a/hw/meson.build b/hw/meson.build index 010de7219c..e615d72d4d 100644 --- a/hw/meson.build +++ b/hw/meson.build @@ -56,6 +56,7 @@ subdir('moxie') subdir('nios2') subdir('openrisc') subdir('ppc') +subdir('remote') subdir('riscv') subdir('rx') subdir('s390x') diff --git a/hw/misc/arm_integrator_debug.c b/hw/misc/arm_integrator_debug.c index ec0d4b90d3..9a19727829 100644 --- a/hw/misc/arm_integrator_debug.c +++ b/hw/misc/arm_integrator_debug.c @@ -6,7 +6,7 @@ * to this area. * * The real h/w is described at: - * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0159b/Babbfijf.html + * https://developer.arm.com/documentation/dui0159/b/peripherals-and-interfaces/debug-leds-and-dip-switch-interface * * Copyright (c) 2013 Alex Bennée <alex@bennee.com> * diff --git a/hw/misc/imx7_ccm.c b/hw/misc/imx7_ccm.c index 02fc1ae8d0..075159e497 100644 --- a/hw/misc/imx7_ccm.c +++ b/hw/misc/imx7_ccm.c @@ -131,8 +131,16 @@ static const struct MemoryRegionOps imx7_set_clr_tog_ops = { }, }; +static void imx7_digprog_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + qemu_log_mask(LOG_GUEST_ERROR, + "Guest write to read-only ANALOG_DIGPROG register\n"); +} + static const struct MemoryRegionOps imx7_digprog_ops = { .read = imx7_set_clr_tog_read, + .write = imx7_digprog_write, .endianness = DEVICE_NATIVE_ENDIAN, .impl = { .min_access_size = 4, diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 0505b52c98..603e992a7f 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -495,7 +495,8 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp) /* mmap the region and map into the BAR2 */ memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s), - "ivshmem.bar2", size, true, fd, &local_err); + "ivshmem.bar2", size, true, fd, 0, + &local_err); if (local_err) { error_propagate(errp, local_err); return; diff --git a/hw/misc/tz-ppc.c b/hw/misc/tz-ppc.c index 6431257b52..36495c68e7 100644 --- a/hw/misc/tz-ppc.c +++ b/hw/misc/tz-ppc.c @@ -196,7 +196,21 @@ static bool tz_ppc_dummy_accepts(void *opaque, hwaddr addr, g_assert_not_reached(); } +static uint64_t tz_ppc_dummy_read(void *opaque, hwaddr addr, unsigned size) +{ + g_assert_not_reached(); +} + +static void tz_ppc_dummy_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + g_assert_not_reached(); +} + static const MemoryRegionOps tz_ppc_dummy_ops = { + /* define r/w methods to avoid assert failure in memory_region_init_io */ + .read = tz_ppc_dummy_read, + .write = tz_ppc_dummy_write, .valid.accepts = tz_ppc_dummy_accepts, }; diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c index 121415abfe..fe055d3381 100644 --- a/hw/net/fsl_etsec/rings.c +++ b/hw/net/fsl_etsec/rings.c @@ -502,7 +502,7 @@ ssize_t etsec_rx_ring_write(eTSEC *etsec, const uint8_t *buf, size_t size) return -1; } - if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) { + if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) { /* CRC is not in the packet yet, so short frame is below 60 bytes */ RING_DEBUG("%s: Drop short frame\n", __func__); return -1; diff --git a/hw/nvram/nrf51_nvm.c b/hw/nvram/nrf51_nvm.c index f2283c1a8d..7b3460d52d 100644 --- a/hw/nvram/nrf51_nvm.c +++ b/hw/nvram/nrf51_nvm.c @@ -273,6 +273,15 @@ static const MemoryRegionOps io_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; +static uint64_t flash_read(void *opaque, hwaddr offset, unsigned size) +{ + /* + * This is a rom_device MemoryRegion which is always in + * romd_mode 
(we never put it in MMIO mode), so reads always + * go directly to RAM and never come here. + */ + g_assert_not_reached(); +} static void flash_write(void *opaque, hwaddr offset, uint64_t value, unsigned int size) @@ -300,6 +309,7 @@ static void flash_write(void *opaque, hwaddr offset, uint64_t value, static const MemoryRegionOps flash_ops = { + .read = flash_read, .write = flash_write, .valid.min_access_size = 4, .valid.max_access_size = 4, diff --git a/hw/pci-host/Kconfig b/hw/pci-host/Kconfig index eb03f0489d..8b8c763c28 100644 --- a/hw/pci-host/Kconfig +++ b/hw/pci-host/Kconfig @@ -65,3 +65,6 @@ config PCI_POWERNV select PCI_EXPRESS select MSI_NONBROKEN select PCIE_PORT + +config REMOTE_PCIHOST + bool diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c index f9fb97a3e3..bde3a343a2 100644 --- a/hw/pci-host/designware.c +++ b/hw/pci-host/designware.c @@ -21,6 +21,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu/module.h" +#include "qemu/log.h" #include "hw/pci/msi.h" #include "hw/pci/pci_bridge.h" #include "hw/pci/pci_host.h" @@ -63,6 +64,23 @@ designware_pcie_root_to_host(DesignwarePCIERoot *root) return DESIGNWARE_PCIE_HOST(bus->parent); } +static uint64_t designware_pcie_root_msi_read(void *opaque, hwaddr addr, + unsigned size) +{ + /* + * Attempts to read from the MSI address are undefined in + * the PCI specifications. For this hardware, the datasheet + * specifies that a read from the magic address is simply not + * intercepted by the MSI controller, and will go out to the + * AHB/AXI bus like any other PCI-device-initiated DMA read. + * This is not trivial to implement in QEMU, so since + * well-behaved guests won't ever ask a PCI device to DMA from + * this address we just log the missing functionality. + */ + qemu_log_mask(LOG_UNIMP, "%s not implemented\n", __func__); + return 0; +} + static void designware_pcie_root_msi_write(void *opaque, hwaddr addr, uint64_t val, unsigned len) { @@ -77,6 +95,7 @@ static void designware_pcie_root_msi_write(void *opaque, hwaddr addr, } static const MemoryRegionOps designware_pci_host_msi_ops = { + .read = designware_pcie_root_msi_read, .write = designware_pcie_root_msi_write, .endianness = DEVICE_LITTLE_ENDIAN, .valid = { diff --git a/hw/pci-host/meson.build b/hw/pci-host/meson.build index da9d1a9964..1847c69905 100644 --- a/hw/pci-host/meson.build +++ b/hw/pci-host/meson.build @@ -9,6 +9,7 @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c')) pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c')) pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c')) pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c')) +pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c')) # PPC devices pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c')) diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c index 6328e985f8..54f57c660a 100644 --- a/hw/pci-host/pnv_phb4.c +++ b/hw/pci-host/pnv_phb4.c @@ -22,6 +22,7 @@ #include "hw/irq.h" #include "hw/qdev-properties.h" #include "qom/object.h" +#include "trace.h" #define phb_error(phb, fmt, ...) 
\ qemu_log_mask(LOG_GUEST_ERROR, "phb4[%d:%d]: " fmt "\n", \ @@ -1257,6 +1258,8 @@ static void pnv_phb4_xive_notify(XiveNotifier *xf, uint32_t srcno) uint64_t data = XIVE_TRIGGER_PQ | offset | srcno; MemTxResult result; + trace_pnv_phb4_xive_notify(notif_port, data); + address_space_stq_be(&address_space_memory, notif_port, data, MEMTXATTRS_UNSPECIFIED, &result); if (result != MEMTX_OK) { diff --git a/hw/pci-host/prep.c b/hw/pci-host/prep.c index 0469db8c1d..0a9162fba9 100644 --- a/hw/pci-host/prep.c +++ b/hw/pci-host/prep.c @@ -27,6 +27,7 @@ #include "qemu-common.h" #include "qemu/datadir.h" #include "qemu/units.h" +#include "qemu/log.h" #include "qapi/error.h" #include "hw/pci/pci.h" #include "hw/pci/pci_bus.h" @@ -121,8 +122,15 @@ static uint64_t raven_intack_read(void *opaque, hwaddr addr, return pic_read_irq(isa_pic); } +static void raven_intack_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + qemu_log_mask(LOG_UNIMP, "%s not implemented\n", __func__); +} + static const MemoryRegionOps raven_intack_ops = { .read = raven_intack_read, + .write = raven_intack_write, .valid = { .max_access_size = 1, }, diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c new file mode 100644 index 0000000000..eee45444ef --- /dev/null +++ b/hw/pci-host/remote.c @@ -0,0 +1,75 @@ +/* + * Remote PCI host device + * + * Unlike PCI host devices that model physical hardware, the purpose + * of this PCI host is to host multi-process QEMU devices. + * + * Multi-process QEMU extends the PCI host of a QEMU machine into a + * remote process. Any PCI device attached to the remote process is + * visible in the QEMU guest. This allows existing QEMU device models + * to be reused in the remote process. + * + * This PCI host is purely a container for PCI devices. It's fake in the + * sense that the guest never sees this PCI host and has no way of + * accessing it. Its job is just to provide the environment that QEMU + * PCI device models need when running in a remote process. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
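/*
 * Illustrative sketch, not part of the patch: several hunks above
 * (imx7_ccm, tz-ppc, nrf51_nvm, designware, raven) add a stub callback
 * for the direction a region never meaningfully handles, since
 * memory_region_init_io() asserts that both handlers exist (see the
 * tz-ppc comment). Hypothetical shape of the pattern, with real_read
 * standing in for a device's actual handler:
 */
static void stub_write(void *opaque, hwaddr addr, uint64_t data,
                       unsigned size)
{
    qemu_log_mask(LOG_GUEST_ERROR, "write to read-only region\n");
}

static const MemoryRegionOps one_way_ops = {
    .read = real_read,
    .write = stub_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};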
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/pci/pci.h" +#include "hw/pci/pci_host.h" +#include "hw/pci/pcie_host.h" +#include "hw/qdev-properties.h" +#include "hw/pci-host/remote.h" +#include "exec/memory.h" + +static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge, + PCIBus *rootbus) +{ + return "0000:00"; +} + +static void remote_pcihost_realize(DeviceState *dev, Error **errp) +{ + PCIHostState *pci = PCI_HOST_BRIDGE(dev); + RemotePCIHost *s = REMOTE_PCIHOST(dev); + + pci->bus = pci_root_bus_new(DEVICE(s), "remote-pci", + s->mr_pci_mem, s->mr_sys_io, + 0, TYPE_PCIE_BUS); +} + +static void remote_pcihost_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass); + + hc->root_bus_path = remote_pcihost_root_bus_path; + dc->realize = remote_pcihost_realize; + + dc->user_creatable = false; + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + dc->fw_name = "pci"; +} + +static const TypeInfo remote_pcihost_info = { + .name = TYPE_REMOTE_PCIHOST, + .parent = TYPE_PCIE_HOST_BRIDGE, + .instance_size = sizeof(RemotePCIHost), + .class_init = remote_pcihost_class_init, +}; + +static void remote_pcihost_register(void) +{ + type_register_static(&remote_pcihost_info); +} + +type_init(remote_pcihost_register) diff --git a/hw/pci-host/trace-events b/hw/pci-host/trace-events index d19ca9aef6..7d8063ac42 100644 --- a/hw/pci-host/trace-events +++ b/hw/pci-host/trace-events @@ -20,3 +20,6 @@ unin_data_write(uint64_t addr, unsigned len, uint64_t val) "write addr 0x%"PRIx6 unin_data_read(uint64_t addr, unsigned len, uint64_t val) "read addr 0x%"PRIx64 " len %d val 0x%"PRIx64 unin_write(uint64_t addr, uint64_t value) "addr=0x%" PRIx64 " val=0x%"PRIx64 unin_read(uint64_t addr, uint64_t value) "addr=0x%" PRIx64 " val=0x%"PRIx64 + +# pnv_phb4.c +pnv_phb4_xive_notify(uint64_t notif_port, uint64_t data) "notif=@0x%"PRIx64" data=0x%"PRIx64 diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index c64b5d08bd..01517a6c6c 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -74,6 +74,8 @@ #define MPC8544_I2C_IRQ 43 #define RTC_REGS_OFFSET 0x68 +#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000) + struct boot_info { uint32_t dt_base; @@ -124,7 +126,7 @@ static void dt_serial_create(void *fdt, unsigned long long offset, qemu_fdt_setprop_string(fdt, ser, "compatible", "ns16550"); qemu_fdt_setprop_cells(fdt, ser, "reg", offset, 0x100); qemu_fdt_setprop_cell(fdt, ser, "cell-index", idx); - qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", 0); + qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", PLATFORM_CLK_FREQ_HZ); qemu_fdt_setprop_cells(fdt, ser, "interrupts", 42, 2); qemu_fdt_setprop_phandle(fdt, ser, "interrupt-parent", mpic); qemu_fdt_setprop_string(fdt, "/aliases", alias, ser); @@ -320,8 +322,8 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms, int fdt_size; void *fdt; uint8_t hypercall[16]; - uint32_t clock_freq = 400000000; - uint32_t tb_freq = 400000000; + uint32_t clock_freq = PLATFORM_CLK_FREQ_HZ; + uint32_t tb_freq = PLATFORM_CLK_FREQ_HZ; int i; char compatible_sb[] = "fsl,mpc8544-immr\0simple-bus"; char *soc; @@ -890,7 +892,7 @@ void ppce500_init(MachineState *machine) env->spr_cb[SPR_BOOKE_PIR].default_value = cs->cpu_index = i; env->mpic_iack = pmc->ccsrbar_base + MPC8544_MPIC_REGS_OFFSET + 0xa0; - ppc_booke_timers_init(cpu, 400000000, PPC_TIMER_E500); + ppc_booke_timers_init(cpu, PLATFORM_CLK_FREQ_HZ, PPC_TIMER_E500); /* Register reset handler */ if (!i) { diff --git 
a/hw/ppc/meson.build b/hw/ppc/meson.build index ffa2ec37fa..218631c883 100644 --- a/hw/ppc/meson.build +++ b/hw/ppc/meson.build @@ -27,6 +27,7 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files( 'spapr_nvdimm.c', 'spapr_rtas_ddw.c', 'spapr_numa.c', + 'pef.c', )) ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c')) ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files( diff --git a/hw/ppc/pef.c b/hw/ppc/pef.c new file mode 100644 index 0000000000..573be3ed79 --- /dev/null +++ b/hw/ppc/pef.c @@ -0,0 +1,140 @@ +/* + * PEF (Protected Execution Facility) for POWER support + * + * Copyright Red Hat. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "sysemu/kvm.h" +#include "migration/blocker.h" +#include "exec/confidential-guest-support.h" +#include "hw/ppc/pef.h" + +#define TYPE_PEF_GUEST "pef-guest" +OBJECT_DECLARE_SIMPLE_TYPE(PefGuest, PEF_GUEST) + +typedef struct PefGuest PefGuest; +typedef struct PefGuestClass PefGuestClass; + +struct PefGuestClass { + ConfidentialGuestSupportClass parent_class; +}; + +/** + * PefGuest: + * + * The PefGuest object is used for creating and managing a PEF + * guest. + * + * # $QEMU \ + * -object pef-guest,id=pef0 \ + * -machine ...,confidential-guest-support=pef0 + */ +struct PefGuest { + ConfidentialGuestSupport parent_obj; +}; + +static int kvmppc_svm_init(Error **errp) +{ +#ifdef CONFIG_KVM + static Error *pef_mig_blocker; + + if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_SECURE_GUEST)) { + error_setg(errp, + "KVM implementation does not support Secure VMs (is an ultravisor running?)"); + return -1; + } else { + int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_SECURE_GUEST, 0, 1); + + if (ret < 0) { + error_setg(errp, + "Error enabling PEF with KVM"); + return -1; + } + } + + /* add migration blocker */ + error_setg(&pef_mig_blocker, "PEF: Migration is not implemented"); + /* NB: This can fail if --only-migratable is used */ + migrate_add_blocker(pef_mig_blocker, &error_fatal); + + return 0; +#else + g_assert_not_reached(); +#endif +} + +/* + * Don't set error if KVM_PPC_SVM_OFF ioctl is invoked on kernels + * that don't support this ioctl. 
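/*
 * Illustrative sketch, not part of the patch: the #else branch above ends
 * in g_assert_not_reached() instead of reporting an error. That is safe
 * because the only caller, pef_kvm_init() below, bails out first when KVM
 * is not enabled, so without CONFIG_KVM the body is unreachable. Shape of
 * the pattern, with do_kvm_work() as a hypothetical stand-in:
 */
static int kvm_only_helper(Error **errp)
{
#ifdef CONFIG_KVM
    return do_kvm_work(errp);
#else
    g_assert_not_reached();
#endif
}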
+ */ +static int kvmppc_svm_off(Error **errp) +{ +#ifdef CONFIG_KVM + int rc; + + rc = kvm_vm_ioctl(KVM_STATE(current_accel()), KVM_PPC_SVM_OFF); + if (rc && rc != -ENOTTY) { + error_setg_errno(errp, -rc, "KVM_PPC_SVM_OFF ioctl failed"); + return rc; + } + return 0; +#else + g_assert_not_reached(); +#endif +} + +int pef_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!object_dynamic_cast(OBJECT(cgs), TYPE_PEF_GUEST)) { + return 0; + } + + if (!kvm_enabled()) { + error_setg(errp, "PEF requires KVM"); + return -1; + } + + return kvmppc_svm_init(errp); +} + +int pef_kvm_reset(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!object_dynamic_cast(OBJECT(cgs), TYPE_PEF_GUEST)) { + return 0; + } + + /* + * If we don't have KVM we should never have been able to + * initialize PEF, so we should never get this far + */ + assert(kvm_enabled()); + + return kvmppc_svm_off(errp); +} + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(PefGuest, + pef_guest, + PEF_GUEST, + CONFIDENTIAL_GUEST_SUPPORT, + { TYPE_USER_CREATABLE }, + { NULL }) + +static void pef_guest_class_init(ObjectClass *oc, void *data) +{ +} + +static void pef_guest_init(Object *obj) +{ +} + +static void pef_guest_finalize(Object *obj) +{ +} diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 14fc9758a9..77af846cdf 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -21,6 +21,7 @@ #include "qemu-common.h" #include "qemu/datadir.h" #include "qemu/units.h" +#include "qemu/cutils.h" #include "qapi/error.h" #include "sysemu/qtest.h" #include "sysemu/sysemu.h" @@ -65,9 +66,9 @@ #define FW_MAX_SIZE (16 * MiB) #define KERNEL_LOAD_ADDR 0x20000000 -#define KERNEL_MAX_SIZE (256 * MiB) -#define INITRD_LOAD_ADDR 0x60000000 -#define INITRD_MAX_SIZE (256 * MiB) +#define KERNEL_MAX_SIZE (128 * MiB) +#define INITRD_LOAD_ADDR 0x28000000 +#define INITRD_MAX_SIZE (128 * MiB) static const char *pnv_chip_core_typename(const PnvChip *o) { @@ -725,8 +726,11 @@ static void pnv_init(MachineState *machine) DeviceState *dev; /* allocate RAM */ - if (machine->ram_size < (1 * GiB)) { - warn_report("skiboot may not work with < 1GB of RAM"); + if (machine->ram_size < mc->default_ram_size) { + char *sz = size_to_str(mc->default_ram_size); + error_report("Invalid RAM size, should be bigger than %s", sz); + g_free(sz); + exit(EXIT_FAILURE); } memory_region_add_subregion(get_system_memory(), 0, machine->ram); @@ -872,6 +876,14 @@ static void pnv_init(MachineState *machine) } /* + * The PNOR is mapped on the LPC FW address space by the BMC. + * Since we can not reach the remote BMC machine with LPC memops, + * map it always for now. 
+ */ + memory_region_add_subregion(pnv->chips[0]->fw_mr, PNOR_SPI_OFFSET, + &pnv->pnor->mmio); + + /* * OpenPOWER systems use a IPMI SEL Event message to notify the * host to powerdown */ @@ -1150,6 +1162,7 @@ static void pnv_chip_power8_realize(DeviceState *dev, Error **errp) qdev_realize(DEVICE(&chip8->lpc), NULL, &error_fatal); pnv_xscom_add_subregion(chip, PNV_XSCOM_LPC_BASE, &chip8->lpc.xscom_regs); + chip->fw_mr = &chip8->lpc.isa_fw; chip->dt_isa_nodename = g_strdup_printf("/xscom@%" PRIx64 "/isa@%x", (uint64_t) PNV_XSCOM_BASE(chip), PNV_XSCOM_LPC_BASE); @@ -1479,6 +1492,7 @@ static void pnv_chip_power9_realize(DeviceState *dev, Error **errp) memory_region_add_subregion(get_system_memory(), PNV9_LPCM_BASE(chip), &chip9->lpc.xscom_regs); + chip->fw_mr = &chip9->lpc.isa_fw; chip->dt_isa_nodename = g_strdup_printf("/lpcm-opb@%" PRIx64 "/lpc@0", (uint64_t) PNV9_LPCM_BASE(chip)); @@ -1592,6 +1606,7 @@ static void pnv_chip_power10_realize(DeviceState *dev, Error **errp) memory_region_add_subregion(get_system_memory(), PNV10_LPCM_BASE(chip), &chip10->lpc.xscom_regs); + chip->fw_mr = &chip10->lpc.isa_fw; chip->dt_isa_nodename = g_strdup_printf("/lpcm-opb@%" PRIx64 "/lpc@0", (uint64_t) PNV10_LPCM_BASE(chip)); } @@ -1983,7 +1998,7 @@ static void pnv_machine_class_init(ObjectClass *oc, void *data) * RAM defaults to less than 2048 for 32-bit hosts, and large * enough to fit the maximum initrd size at it's load address */ - mc->default_ram_size = INITRD_LOAD_ADDR + INITRD_MAX_SIZE; + mc->default_ram_size = 1 * GiB; mc->default_ram_id = "pnv.ram"; ispc->print_info = pnv_pic_print_info; nc->nmi_monitor_handler = pnv_nmi; diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c index 67ebb16c4d..b9bf5735ea 100644 --- a/hw/ppc/pnv_bmc.c +++ b/hw/ppc/pnv_bmc.c @@ -51,6 +51,11 @@ typedef struct OemSel { #define SOFT_OFF 0x00 #define SOFT_REBOOT 0x01 +static bool pnv_bmc_is_simulator(IPMIBmc *bmc) +{ + return object_dynamic_cast(OBJECT(bmc), TYPE_IPMI_BMC_SIMULATOR); +} + static void pnv_gen_oem_sel(IPMIBmc *bmc, uint8_t reboot) { /* IPMI SEL Event are 16 bytes long */ @@ -79,6 +84,10 @@ void pnv_dt_bmc_sensors(IPMIBmc *bmc, void *fdt) const struct ipmi_sdr_compact *sdr; uint16_t nextrec; + if (!pnv_bmc_is_simulator(bmc)) { + return; + } + offset = fdt_add_subnode(fdt, 0, "bmc"); _FDT(offset); @@ -243,6 +252,10 @@ static const IPMINetfn hiomap_netfn = { void pnv_bmc_set_pnor(IPMIBmc *bmc, PnvPnor *pnor) { + if (!pnv_bmc_is_simulator(bmc)) { + return; + } + object_ref(OBJECT(pnor)); object_property_add_const_link(OBJECT(bmc), "pnor", OBJECT(pnor)); @@ -260,13 +273,8 @@ IPMIBmc *pnv_bmc_create(PnvPnor *pnor) Object *obj; obj = object_new(TYPE_IPMI_BMC_SIMULATOR); - object_ref(OBJECT(pnor)); - object_property_add_const_link(obj, "pnor", OBJECT(pnor)); qdev_realize(DEVICE(obj), NULL, &error_fatal); - - /* Install the HIOMAP protocol handlers to access the PNOR */ - ipmi_sim_register_netfn(IPMI_BMC_SIMULATOR(obj), IPMI_NETFN_OEM, - &hiomap_netfn); + pnv_bmc_set_pnor(IPMI_BMC(obj), pnor); return IPMI_BMC(obj); } @@ -291,7 +299,7 @@ static int bmc_find(Object *child, void *opaque) IPMIBmc *pnv_bmc_find(Error **errp) { - ForeachArgs args = { TYPE_IPMI_BMC_SIMULATOR, NULL }; + ForeachArgs args = { TYPE_IPMI_BMC, NULL }; int ret; ret = object_child_foreach_recursive(object_get_root(), bmc_find, &args); diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c index 5903590220..bcbca3db97 100644 --- a/hw/ppc/pnv_lpc.c +++ b/hw/ppc/pnv_lpc.c @@ -824,8 +824,6 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool use_cpld, Error 
**errp) ISABus *isa_bus; qemu_irq *irqs; qemu_irq_handler handler; - PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine()); - bool hostboot_mode = !!pnv->fw_load_addr; /* let isa_bus_new() create its own bridge on SysBus otherwise * devices specified on the command line won't find the bus and @@ -851,18 +849,5 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool use_cpld, Error **errp) isa_bus_irqs(isa_bus, irqs); - /* - * TODO: Map PNOR on the LPC FW address space on demand ? - */ - memory_region_add_subregion(&lpc->isa_fw, PNOR_SPI_OFFSET, - &pnv->pnor->mmio); - /* - * Start disabled. The HIOMAP protocol will activate the mapping - * with HIOMAP_C_CREATE_WRITE_WINDOW - */ - if (!hostboot_mode) { - memory_region_set_enabled(&pnv->pnor->mmio, false); - } - return isa_bus; } diff --git a/hw/ppc/prep_systemio.c b/hw/ppc/prep_systemio.c index 4e48ef245c..b2bd783248 100644 --- a/hw/ppc/prep_systemio.c +++ b/hw/ppc/prep_systemio.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/log.h" #include "hw/irq.h" #include "hw/isa/isa.h" #include "hw/qdev-properties.h" @@ -235,8 +236,15 @@ static uint64_t ppc_parity_error_readl(void *opaque, hwaddr addr, return val; } +static void ppc_parity_error_writel(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__); +} + static const MemoryRegionOps ppc_parity_error_ops = { .read = ppc_parity_error_readl, + .write = ppc_parity_error_writel, .valid = { .min_access_size = 4, .max_access_size = 4, diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 6c47466fc2..85fe65f894 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -83,6 +83,7 @@ #include "hw/ppc/spapr_tpm_proxy.h" #include "hw/ppc/spapr_nvdimm.h" #include "hw/ppc/spapr_numa.h" +#include "hw/ppc/pef.h" #include "monitor/monitor.h" @@ -295,15 +296,6 @@ static hwaddr spapr_node0_size(MachineState *machine) return machine->ram_size; } -bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr) -{ - MachineState *machine = MACHINE(spapr); - SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); - - return smc->pre_5_2_numa_associativity || - machine->numa_state->num_nodes <= 1; -} - static void add_str(GString *s, const gchar *s1) { g_string_append_len(s, s1, strlen(s1) + 1); @@ -790,7 +782,6 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr) CPUState *cs; int n_cpus; int cpus_offset; - char *nodename; int i; cpus_offset = fdt_add_subnode(fdt, 0, "cpus"); @@ -818,6 +809,7 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr) PowerPCCPU *cpu = POWERPC_CPU(cs); int index = spapr_get_vcpu_id(cpu); DeviceClass *dc = DEVICE_GET_CLASS(cs); + g_autofree char *nodename = NULL; int offset; if (!spapr_is_thread0_in_vcore(spapr, cpu)) { @@ -826,7 +818,6 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr) nodename = g_strdup_printf("%s@%x", dc->fw_name, index); offset = fdt_add_subnode(fdt, cpus_offset, nodename); - g_free(nodename); _FDT(offset); spapr_dt_cpu(cs, fdt, offset, spapr); } @@ -1574,7 +1565,7 @@ static void spapr_machine_reset(MachineState *machine) void *fdt; int rc; - kvmppc_svm_off(&error_fatal); + pef_kvm_reset(machine->cgs, &error_fatal); spapr_caps_apply(spapr); first_ppc_cpu = POWERPC_CPU(first_cpu); @@ -2658,6 +2649,11 @@ static void spapr_machine_init(MachineState *machine) char *filename; Error *resize_hpt_err = NULL; + /* + * if Secure VM (PEF) support is configured, then initialize it + */ + pef_kvm_init(machine->cgs, &error_fatal); + msi_nonbroken = true; 
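+ /* + * Usage sketch for the PEF init above (hypothetical invocation, not part + * of this patch; it mirrors the s390-pv-guest example later in this + * series): + * + * -object pef-guest,id=pef0 \ + * -machine pseries,confidential-guest-support=pef0 + */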
QLIST_INIT(&spapr->phbs); @@ -2774,16 +2770,7 @@ static void spapr_machine_init(MachineState *machine) } - /* - * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node. - * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is - * called from vPHB reset handler so we initialize the counter here. - * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM - * must be equally distant from any other node. - * The final value of spapr->gpu_numa_id is going to be written to - * max-associativity-domains in spapr_build_fdt(). - */ - spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes); + spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine); /* Init numa_assoc_array */ spapr_numa_associativity_init(spapr, machine); @@ -3049,6 +3036,7 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE); SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE); VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON); + PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE); if (d) { void *spapr = CAST(void, bus->parent, "spapr-vscsi"); @@ -3122,6 +3110,10 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn)); } + if (pcidev) { + return spapr_pci_fw_dev_name(pcidev); + } + return NULL; } @@ -3743,15 +3735,27 @@ int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, PowerPCCPU *cpu = POWERPC_CPU(cs); DeviceClass *dc = DEVICE_GET_CLASS(cs); int id = spapr_get_vcpu_id(cpu); - char *nodename; + g_autofree char *nodename = NULL; int offset; nodename = g_strdup_printf("%s@%x", dc->fw_name, id); offset = fdt_add_subnode(fdt, 0, nodename); - g_free(nodename); spapr_dt_cpu(cs, fdt, offset, spapr); + /* + * spapr_dt_cpu() does not fill the 'name' property in the + * CPU node. The function is called during the boot process, before + * and after CAS, and overwriting the 'name' property written + * by SLOF is not allowed. + * + * Write it manually after spapr_dt_cpu(). This makes the hotplugged + * CPUs more compatible with the coldplugged ones, which have + * the 'name' property. The Linux kernel also relies on this + * property to identify CPU nodes. + */ + _FDT((fdt_setprop_string(fdt, offset, "name", nodename))); + *fdt_start_offset = offset; return 0; } diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index b50796bbe3..779f18b994 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -19,6 +19,15 @@ /* Moved from hw/ppc/spapr_pci_nvlink2.c */ #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) +static bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr) +{ + MachineState *machine = MACHINE(spapr); + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); + + return smc->pre_5_2_numa_associativity || + machine->numa_state->num_nodes <= 1; +} + static bool spapr_numa_is_symmetrical(MachineState *ms) { int src, dst; @@ -38,6 +47,20 @@ static bool spapr_numa_is_symmetrical(MachineState *ms) } /* + * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node. + * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is + * called from the vPHB reset handler, so we initialize the counter here. + * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM + * must be equally distant from any other node. + * The final value of spapr->gpu_numa_id is going to be written to + * max-associativity-domains in spapr_build_fdt(). 
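+ * + * For example (illustrative): with no user-configured NUMA nodes the + * initial GPU NUMA id is MAX(1, 0) = 1; with 4 configured nodes (ids 0-3) + * it is MAX(1, 4) = 4, so the first GPU gets NUMA id 4.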
+ */ +unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine) +{ + return MAX(1, machine->numa_state->num_nodes); +} + +/* * This function will translate the user distances into * what the kernel understands as possible values: 10 * (local distance), 20, 40, 80 and 160, and return the equivalent @@ -288,6 +311,8 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) { MachineState *ms = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + uint32_t number_nvgpus_nodes = spapr->gpu_numa_id - + spapr_numa_initial_nvgpu_numa_id(ms); uint32_t refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x3), @@ -295,7 +320,7 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) cpu_to_be32(0x1), }; uint32_t nr_refpoints = ARRAY_SIZE(refpoints); - uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id; + uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes; uint32_t maxdomains[] = { cpu_to_be32(4), cpu_to_be32(maxdomain), diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 76d7c91e9c..f1c7479816 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -53,6 +53,7 @@ #include "sysemu/hostmem.h" #include "sysemu/numa.h" #include "hw/ppc/spapr_numa.h" +#include "qemu/log.h" /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ #define RTAS_QUERY_FN 0 @@ -739,6 +740,12 @@ static PCIINTxRoute spapr_route_intx_pin_to_irq(void *opaque, int pin) return route; } +static uint64_t spapr_msi_read(void *opaque, hwaddr addr, unsigned size) +{ + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__); + return 0; +} + /* * MSI/MSIX memory region implementation. * The handler handles both MSI and MSIX. @@ -756,8 +763,11 @@ static void spapr_msi_write(void *opaque, hwaddr addr, } static const MemoryRegionOps spapr_msi_ops = { - /* There is no .read as the read result is undefined by PCI spec */ - .read = NULL, + /* + * The .read result is undefined by the PCI spec. 
+ * Define a .read method to avoid an assert failure in memory_region_init_io(). + */ + .read = spapr_msi_read, .write = spapr_msi_write, .endianness = DEVICE_LITTLE_ENDIAN }; @@ -1334,15 +1344,29 @@ static int spapr_dt_pci_bus(SpaprPhbState *sphb, PCIBus *bus, return offset; } +char *spapr_pci_fw_dev_name(PCIDevice *dev) +{ + const gchar *basename; + int slot = PCI_SLOT(dev->devfn); + int func = PCI_FUNC(dev->devfn); + uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3); + + basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff, + ccode & 0xff); + + if (func != 0) { + return g_strdup_printf("%s@%x,%x", basename, slot, func); + } else { + return g_strdup_printf("%s@%x", basename, slot); + } +} + /* create OF node for pci device and required OF DT properties */ static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev, void *fdt, int parent_offset) { int offset; - const gchar *basename; - gchar *nodename; - int slot = PCI_SLOT(dev->devfn); - int func = PCI_FUNC(dev->devfn); + g_autofree gchar *nodename = spapr_pci_fw_dev_name(dev); PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); ResourceProps rp; SpaprDrc *drc = drc_from_dev(sphb, dev); @@ -1359,19 +1383,8 @@ static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev, uint32_t pci_status = pci_default_read_config(dev, PCI_STATUS, 2); gchar *loc_code; - basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff, - ccode & 0xff); - - if (func != 0) { - nodename = g_strdup_printf("%s@%x,%x", basename, slot, func); - } else { - nodename = g_strdup_printf("%s@%x", basename, slot); - } - _FDT(offset = fdt_add_subnode(fdt, parent_offset, nodename)); - g_free(nodename); - /* in accordance with PAPR+ v2.7 13.6.3, Table 181 */ _FDT(fdt_setprop_cell(fdt, offset, "vendor-id", vendor_id)); _FDT(fdt_setprop_cell(fdt, offset, "device-id", device_id)); @@ -2173,6 +2186,16 @@ static int spapr_pci_pre_save(void *opaque) return 0; } +static int spapr_pci_post_save(void *opaque) +{ + SpaprPhbState *sphb = opaque; + + g_free(sphb->msi_devs); + sphb->msi_devs = NULL; + sphb->msi_devs_num = 0; + return 0; +} + static int spapr_pci_post_load(void *opaque, int version_id) { SpaprPhbState *sphb = opaque; @@ -2205,6 +2228,7 @@ static const VMStateDescription vmstate_spapr_pci = { .version_id = 2, .minimum_version_id = 2, .pre_save = spapr_pci_pre_save, + .post_save = spapr_pci_post_save, .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, SpaprPhbState, NULL), diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig new file mode 100644 index 0000000000..08c16e235f --- /dev/null +++ b/hw/remote/Kconfig @@ -0,0 +1,4 @@ +config MULTIPROCESS + bool + depends on PCI && PCI_EXPRESS && KVM + select REMOTE_PCIHOST diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c new file mode 100644 index 0000000000..e4ff131a6b --- /dev/null +++ b/hw/remote/iohub.c @@ -0,0 +1,119 @@ +/* + * Remote IO Hub + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/pci/pci.h" +#include "hw/pci/pci_ids.h" +#include "hw/pci/pci_bus.h" +#include "qemu/thread.h" +#include "hw/boards.h" +#include "hw/remote/machine.h" +#include "hw/remote/iohub.h" +#include "qemu/main-loop.h" + +void remote_iohub_init(RemoteIOHubState *iohub) +{ + int pirq; + + memset(&iohub->irqfds, 0, sizeof(iohub->irqfds)); + memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds)); + + for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) { + qemu_mutex_init(&iohub->irq_level_lock[pirq]); + iohub->irq_level[pirq] = 0; + event_notifier_init_fd(&iohub->irqfds[pirq], -1); + event_notifier_init_fd(&iohub->resamplefds[pirq], -1); + } +} + +void remote_iohub_finalize(RemoteIOHubState *iohub) +{ + int pirq; + + for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) { + qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]), + NULL, NULL, NULL); + event_notifier_cleanup(&iohub->irqfds[pirq]); + event_notifier_cleanup(&iohub->resamplefds[pirq]); + qemu_mutex_destroy(&iohub->irq_level_lock[pirq]); + } +} + +int remote_iohub_map_irq(PCIDevice *pci_dev, int intx) +{ + return pci_dev->devfn; +} + +void remote_iohub_set_irq(void *opaque, int pirq, int level) +{ + RemoteIOHubState *iohub = opaque; + + assert(pirq >= 0); + assert(pirq < PCI_DEVFN_MAX); + + QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]); + + if (level) { + if (++iohub->irq_level[pirq] == 1) { + event_notifier_set(&iohub->irqfds[pirq]); + } + } else if (iohub->irq_level[pirq] > 0) { + iohub->irq_level[pirq]--; + } +} + +static void intr_resample_handler(void *opaque) +{ + ResampleToken *token = opaque; + RemoteIOHubState *iohub = token->iohub; + int pirq, s; + + pirq = token->pirq; + + s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]); + + assert(s >= 0); + + QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]); + + if (iohub->irq_level[pirq]) { + event_notifier_set(&iohub->irqfds[pirq]); + } +} + +void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg) +{ + RemoteMachineState *machine = REMOTE_MACHINE(current_machine); + RemoteIOHubState *iohub = &machine->iohub; + int pirq, intx; + + intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1; + + pirq = remote_iohub_map_irq(pci_dev, intx); + + if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) { + qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]), + NULL, NULL, NULL); + event_notifier_cleanup(&iohub->irqfds[pirq]); + event_notifier_cleanup(&iohub->resamplefds[pirq]); + memset(&iohub->token[pirq], 0, sizeof(ResampleToken)); + } + + event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]); + event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]); + + iohub->token[pirq].iohub = iohub; + iohub->token[pirq].pirq = pirq; + + qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL, + &iohub->token[pirq]); +} diff --git a/hw/remote/machine.c b/hw/remote/machine.c new file mode 100644 index 0000000000..c0ab4f528a --- /dev/null +++ b/hw/remote/machine.c @@ -0,0 +1,80 @@ +/* + * Machine for remote device + * + * This machine type is used by the remote device process in multi-process + * QEMU. QEMU device models depend on parent busses, interrupt controllers, + * memory regions, etc. The remote machine type offers this environment so + * that QEMU device models can be used as remote devices. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. 
+ * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/machine.h" +#include "exec/address-spaces.h" +#include "exec/memory.h" +#include "qapi/error.h" +#include "hw/pci/pci_host.h" +#include "hw/remote/iohub.h" + +static void remote_machine_init(MachineState *machine) +{ + MemoryRegion *system_memory, *system_io, *pci_memory; + RemoteMachineState *s = REMOTE_MACHINE(machine); + RemotePCIHost *rem_host; + PCIHostState *pci_host; + + system_memory = get_system_memory(); + system_io = get_system_io(); + + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + + rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST)); + + rem_host->mr_pci_mem = pci_memory; + rem_host->mr_sys_mem = system_memory; + rem_host->mr_sys_io = system_io; + + s->host = rem_host; + + object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host)); + memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1); + + qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal); + + pci_host = PCI_HOST_BRIDGE(rem_host); + + remote_iohub_init(&s->iohub); + + pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq, + &s->iohub, REMOTE_IOHUB_NB_PIRQS); +} + +static void remote_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + + mc->init = remote_machine_init; + mc->desc = "Experimental remote machine"; +} + +static const TypeInfo remote_machine = { + .name = TYPE_REMOTE_MACHINE, + .parent = TYPE_MACHINE, + .instance_size = sizeof(RemoteMachineState), + .class_init = remote_machine_class_init, +}; + +static void remote_machine_register_types(void) +{ + type_register_static(&remote_machine); +} + +type_init(remote_machine_register_types); diff --git a/hw/remote/memory.c b/hw/remote/memory.c new file mode 100644 index 0000000000..32085b1e05 --- /dev/null +++ b/hw/remote/memory.c @@ -0,0 +1,65 @@ +/* + * Memory manager for remote device + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/memory.h" +#include "exec/address-spaces.h" +#include "exec/ram_addr.h" +#include "qapi/error.h" + +static void remote_sysmem_reset(void) +{ + MemoryRegion *sysmem, *subregion, *next; + + sysmem = get_system_memory(); + + QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) { + if (subregion->ram) { + memory_region_del_subregion(sysmem, subregion); + object_unparent(OBJECT(subregion)); + } + } +} + +void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem; + MemoryRegion *sysmem, *subregion; + static unsigned int suffix; + int region; + + sysmem = get_system_memory(); + + remote_sysmem_reset(); + + for (region = 0; region < msg->num_fds; region++) { + g_autofree char *name; + subregion = g_new(MemoryRegion, 1); + name = g_strdup_printf("remote-mem-%u", suffix++); + memory_region_init_ram_from_fd(subregion, NULL, + name, sysmem_info->sizes[region], + true, msg->fds[region], + sysmem_info->offsets[region], + errp); + + if (*errp) { + g_free(subregion); + remote_sysmem_reset(); + return; + } + + memory_region_add_subregion(sysmem, sysmem_info->gpas[region], + subregion); + + } +} diff --git a/hw/remote/meson.build b/hw/remote/meson.build new file mode 100644 index 0000000000..e6a5574242 --- /dev/null +++ b/hw/remote/meson.build @@ -0,0 +1,13 @@ +remote_ss = ss.source_set() + +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c')) + +specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c')) +specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c')) + +softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss) diff --git a/hw/remote/message.c b/hw/remote/message.c new file mode 100644 index 0000000000..11d729845c --- /dev/null +++ b/hw/remote/message.c @@ -0,0 +1,230 @@ +/* + * Copyright © 2020, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/machine.h" +#include "io/channel.h" +#include "hw/remote/mpqemu-link.h" +#include "qapi/error.h" +#include "sysemu/runstate.h" +#include "hw/pci/pci.h" +#include "exec/memattrs.h" +#include "hw/remote/memory.h" +#include "hw/remote/iohub.h" +#include "sysemu/reset.h" + +static void process_config_write(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp); +static void process_config_read(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp); +static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp); +static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp); +static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev, + Error **errp); + +void coroutine_fn mpqemu_remote_msg_loop_co(void *data) +{ + g_autofree RemoteCommDev *com = (RemoteCommDev *)data; + PCIDevice *pci_dev = NULL; + Error *local_err = NULL; + + assert(com->ioc); + + pci_dev = com->dev; + for (; !local_err;) { + MPQemuMsg msg = {0}; + + if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) { + break; + } + + if (!mpqemu_msg_valid(&msg)) { + error_setg(&local_err, "Received invalid message from proxy " + "in remote process pid="FMT_pid"", + getpid()); + break; + } + + switch (msg.cmd) { + case MPQEMU_CMD_PCI_CFGWRITE: + process_config_write(com->ioc, pci_dev, &msg, &local_err); + break; + case MPQEMU_CMD_PCI_CFGREAD: + process_config_read(com->ioc, pci_dev, &msg, &local_err); + break; + case MPQEMU_CMD_BAR_WRITE: + process_bar_write(com->ioc, &msg, &local_err); + break; + case MPQEMU_CMD_BAR_READ: + process_bar_read(com->ioc, &msg, &local_err); + break; + case MPQEMU_CMD_SYNC_SYSMEM: + remote_sysmem_reconfig(&msg, &local_err); + break; + case MPQEMU_CMD_SET_IRQFD: + process_set_irqfd_msg(pci_dev, &msg); + break; + case MPQEMU_CMD_DEVICE_RESET: + process_device_reset_msg(com->ioc, pci_dev, &local_err); + break; + default: + error_setg(&local_err, + "Unknown command (%d) received for device %s" + " (pid="FMT_pid")", + msg.cmd, DEVICE(pci_dev)->id, getpid()); + } + } + + if (local_err) { + error_report_err(local_err); + qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR); + } else { + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + } +} + +static void process_config_write(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data; + MPQemuMsg ret = { 0 }; + + if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) { + error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".", + getpid()); + ret.data.u64 = UINT64_MAX; + } else { + pci_default_write_config(dev, conf->addr, conf->val, conf->len); + } + + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_config_read(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data; + MPQemuMsg ret = { 0 }; + + if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) { + error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".", + getpid()); + ret.data.u64 = UINT64_MAX; + } else { + ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len); + } + + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if 
(!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + BarAccessMsg *bar_access = &msg->data.bar_access; + AddressSpace *as = + bar_access->memory ? &address_space_memory : &address_space_io; + MPQemuMsg ret = { 0 }; + MemTxResult res; + uint64_t val; + + if (!is_power_of_2(bar_access->size) || + (bar_access->size > sizeof(uint64_t))) { + ret.data.u64 = UINT64_MAX; + goto fail; + } + + val = cpu_to_le64(bar_access->val); + + res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED, + (void *)&val, bar_access->size, true); + + if (res != MEMTX_OK) { + error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".", + bar_access->addr, getpid()); + ret.data.u64 = -1; + } + +fail: + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + BarAccessMsg *bar_access = &msg->data.bar_access; + MPQemuMsg ret = { 0 }; + AddressSpace *as; + MemTxResult res; + uint64_t val = 0; + + as = bar_access->memory ? &address_space_memory : &address_space_io; + + if (!is_power_of_2(bar_access->size) || + (bar_access->size > sizeof(uint64_t))) { + val = UINT64_MAX; + goto fail; + } + + res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED, + (void *)&val, bar_access->size, false); + + if (res != MEMTX_OK) { + error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".", + bar_access->addr, getpid()); + val = UINT64_MAX; + } + +fail: + ret.cmd = MPQEMU_CMD_RET; + ret.data.u64 = le64_to_cpu(val); + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev, + Error **errp) +{ + DeviceClass *dc = DEVICE_GET_CLASS(dev); + DeviceState *s = DEVICE(dev); + MPQemuMsg ret = { 0 }; + + if (dc->reset) { + dc->reset(s); + } + + ret.cmd = MPQEMU_CMD_RET; + + mpqemu_msg_send(&ret, ioc, errp); +} diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c new file mode 100644 index 0000000000..9ce31526e8 --- /dev/null +++ b/hw/remote/mpqemu-link.c @@ -0,0 +1,267 @@ +/* + * Communication channel between QEMU and remote device process + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "qemu/module.h" +#include "hw/remote/mpqemu-link.h" +#include "qapi/error.h" +#include "qemu/iov.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "io/channel.h" +#include "sysemu/iothread.h" +#include "trace.h" + +/* + * Send message over the ioc QIOChannel. + * This function is safe to call from: + * - main loop in co-routine context. Will block the main loop if not in + * co-routine context; + * - vCPU thread with no co-routine context and if the channel is not part + * of the main loop handling; + * - IOThread within co-routine context, outside of co-routine context + * will block IOThread; + * Returns true if no errors were encountered, false otherwise. 
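+ * + * Usage sketch (mirrors the setup_irqfd() code later in this series): + * + * MPQemuMsg msg = {0}; + * + * msg.cmd = MPQEMU_CMD_SET_IRQFD; + * msg.num_fds = 2; + * msg.fds[0] = event_notifier_get_fd(&dev->intr); + * msg.fds[1] = event_notifier_get_fd(&dev->resample); + * if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) { + * error_report_err(local_err); + * }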
+ */ +bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) +{ + ERRP_GUARD(); + bool iolock = qemu_mutex_iothread_locked(); + bool iothread = qemu_in_iothread(); + struct iovec send[2] = {}; + int *fds = NULL; + size_t nfds = 0; + bool ret = false; + + send[0].iov_base = msg; + send[0].iov_len = MPQEMU_MSG_HDR_SIZE; + + send[1].iov_base = (void *)&msg->data; + send[1].iov_len = msg->size; + + if (msg->num_fds) { + nfds = msg->num_fds; + fds = msg->fds; + } + + /* + * Don't use in an IOThread outside of co-routine context, as + * it will block the IOThread. + */ + assert(qemu_in_coroutine() || !iothread); + + /* + * Skip unlocking/locking iothread lock when the IOThread is running + * in co-routine context. Co-routine context is asserted above + * for the IOThread case. + * Also skip lock handling while in a co-routine in the main context. + */ + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_unlock_iothread(); + } + + if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send), + fds, nfds, errp)) { + ret = true; + } else { + trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds); + } + + if (iolock && !iothread && !qemu_in_coroutine()) { + /* See the comment above on why locking is skipped here. */ + qemu_mutex_lock_iothread(); + } + + return ret; +} + +/* + * Read a message from the ioc QIOChannel. + * This function is safe to call from: + * - the main loop in co-routine context. Will block the main loop if not in + * co-routine context; + * - a vCPU thread with no co-routine context, if the channel is not part + * of the main loop handling; + * - an IOThread within co-routine context; outside of co-routine context it + * will block the IOThread; + */ +static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds, + size_t *nfds, Error **errp) +{ + ERRP_GUARD(); + struct iovec iov = { .iov_base = buf, .iov_len = len }; + bool iolock = qemu_mutex_iothread_locked(); + bool iothread = qemu_in_iothread(); + int ret = -1; + + /* + * Don't use in an IOThread outside of co-routine context, as + * it will block the IOThread. + */ + assert(qemu_in_coroutine() || !iothread); + + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_unlock_iothread(); + } + + ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp); + + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_lock_iothread(); + } + + return (ret <= 0) ? 
ret : iov.iov_len; +} + +bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) +{ + ERRP_GUARD(); + g_autofree int *fds = NULL; + size_t nfds = 0; + ssize_t len; + bool ret = false; + + len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp); + if (len <= 0) { + goto fail; + } else if (len != MPQEMU_MSG_HDR_SIZE) { + error_setg(errp, "Message header corrupted"); + goto fail; + } + + if (msg->size > sizeof(msg->data)) { + error_setg(errp, "Invalid size for message"); + goto fail; + } + + if (!msg->size) { + goto copy_fds; + } + + len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp); + if (len <= 0) { + goto fail; + } + if (len != msg->size) { + error_setg(errp, "Unable to read full message"); + goto fail; + } + +copy_fds: + msg->num_fds = nfds; + if (nfds > G_N_ELEMENTS(msg->fds)) { + error_setg(errp, + "Overflow error: received %zu fds, more than max of %d fds", + nfds, REMOTE_MAX_FDS); + goto fail; + } + if (nfds) { + memcpy(msg->fds, fds, nfds * sizeof(int)); + } + + ret = true; + +fail: + if (*errp) { + trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds); + } + while (*errp && nfds) { + close(fds[nfds - 1]); + nfds--; + } + + return ret; +} + +/* + * Send msg and wait for a reply with command code MPQEMU_CMD_RET. + * Returns the u64 payload of the reply, or UINT64_MAX + * on error. + * Called from a vCPU thread in non-coroutine context. + * Used by the Proxy object to communicate to remote processes. + */ +uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev, + Error **errp) +{ + ERRP_GUARD(); + MPQemuMsg msg_reply = {0}; + uint64_t ret = UINT64_MAX; + + assert(!qemu_in_coroutine()); + + QEMU_LOCK_GUARD(&pdev->io_mutex); + if (!mpqemu_msg_send(msg, pdev->ioc, errp)) { + return ret; + } + + if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) { + return ret; + } + + if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) { + error_setg(errp, "ERROR: Invalid reply received for command %d", + msg->cmd); + return ret; + } + + return msg_reply.data.u64; +} + +bool mpqemu_msg_valid(MPQemuMsg *msg) +{ + if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) { + return false; + } + + /* Verify FDs. */ + if (msg->num_fds >= REMOTE_MAX_FDS) { + return false; + } + + if (msg->num_fds > 0) { + for (int i = 0; i < msg->num_fds; i++) { + if (fcntl(msg->fds[i], F_GETFL) == -1) { + return false; + } + } + } + + /* Verify message specific fields. */ + switch (msg->cmd) { + case MPQEMU_CMD_SYNC_SYSMEM: + if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) { + return false; + } + break; + case MPQEMU_CMD_PCI_CFGWRITE: + case MPQEMU_CMD_PCI_CFGREAD: + if (msg->size != sizeof(PciConfDataMsg)) { + return false; + } + break; + case MPQEMU_CMD_BAR_WRITE: + case MPQEMU_CMD_BAR_READ: + if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) { + return false; + } + break; + case MPQEMU_CMD_SET_IRQFD: + if (msg->size || (msg->num_fds != 2)) { + return false; + } + break; + default: + break; + } + + return true; +} diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c new file mode 100644 index 0000000000..af1fa6f5aa --- /dev/null +++ b/hw/remote/proxy-memory-listener.c @@ -0,0 +1,227 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "qemu/compiler.h" +#include "qemu/int128.h" +#include "qemu/range.h" +#include "exec/memory.h" +#include "exec/cpu-common.h" +#include "cpu.h" +#include "exec/ram_addr.h" +#include "exec/address-spaces.h" +#include "qapi/error.h" +#include "hw/remote/mpqemu-link.h" +#include "hw/remote/proxy-memory-listener.h" + +/* + * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and + * proxy_memory_listener_commit() defined below perform tasks similar to the + * functions defined in vhost-user.c. These functions are good candidates + * for refactoring. + * + */ + +static void proxy_memory_listener_reset(MemoryListener *listener) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + int mrs; + + for (mrs = 0; mrs < proxy_listener->n_mr_sections; mrs++) { + memory_region_unref(proxy_listener->mr_sections[mrs].mr); + } + + g_free(proxy_listener->mr_sections); + proxy_listener->mr_sections = NULL; + proxy_listener->n_mr_sections = 0; +} + +static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset) +{ + MemoryRegion *mr; + ram_addr_t off; + + /** + * Assumes that the host address is a valid address as it's + * coming from the MemoryListener system. In the case host + * address is not valid, the following call would return + * the default subregion of "system_memory" region, and + * not NULL. So it's not possible to check for NULL here. + */ + mr = memory_region_from_host((void *)(uintptr_t)host, &off); + + if (offset) { + *offset = off; + } + + return memory_region_get_fd(mr); +} + +static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size) +{ + if (((prev_host + size) != host)) { + return false; + } + + if (get_fd_from_hostaddr(host, NULL) != + get_fd_from_hostaddr(prev_host, NULL)) { + return false; + } + + return true; +} + +static bool try_merge(ProxyMemoryListener *proxy_listener, + MemoryRegionSection *section) +{ + uint64_t mrs_size, mrs_gpa, mrs_page; + MemoryRegionSection *prev_sec; + bool merged = false; + uintptr_t mrs_host; + RAMBlock *mrs_rb; + + if (!proxy_listener->n_mr_sections) { + return false; + } + + mrs_rb = section->mr->ram_block; + mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb); + mrs_size = int128_get64(section->size); + mrs_gpa = section->offset_within_address_space; + mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + + if (get_fd_from_hostaddr(mrs_host, NULL) < 0) { + return true; + } + + mrs_host = mrs_host & ~(mrs_page - 1); + mrs_gpa = mrs_gpa & ~(mrs_page - 1); + mrs_size = ROUND_UP(mrs_size, mrs_page); + + prev_sec = proxy_listener->mr_sections + + (proxy_listener->n_mr_sections - 1); + uint64_t prev_gpa_start = prev_sec->offset_within_address_space; + uint64_t prev_size = int128_get64(prev_sec->size); + uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); + uint64_t prev_host_start = + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + + prev_sec->offset_within_region; + uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); + + if (mrs_gpa <= (prev_gpa_end + 1)) { + g_assert(mrs_gpa > prev_gpa_start); + + if ((section->mr == prev_sec->mr) && + proxy_mrs_can_merge(mrs_host, prev_host_start, + (mrs_gpa - prev_gpa_start))) { + uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); + merged = true; + prev_sec->offset_within_address_space = + MIN(prev_gpa_start, mrs_gpa); + prev_sec->offset_within_region = + MIN(prev_host_start, mrs_host) - + 
(uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); + prev_sec->size = int128_make64(max_end - MIN(prev_host_start, + mrs_host)); + } + } + + return merged; +} + +static void proxy_memory_listener_region_addnop(MemoryListener *listener, + MemoryRegionSection *section) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + + if (!memory_region_is_ram(section->mr) || + memory_region_is_rom(section->mr)) { + return; + } + + if (try_merge(proxy_listener, section)) { + return; + } + + ++proxy_listener->n_mr_sections; + proxy_listener->mr_sections = g_renew(MemoryRegionSection, + proxy_listener->mr_sections, + proxy_listener->n_mr_sections); + proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section; + proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL; + memory_region_ref(section->mr); +} + +static void proxy_memory_listener_commit(MemoryListener *listener) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + MPQemuMsg msg; + MemoryRegionSection *section; + ram_addr_t offset; + uintptr_t host_addr; + int region; + Error *local_err = NULL; + + memset(&msg, 0, sizeof(MPQemuMsg)); + + msg.cmd = MPQEMU_CMD_SYNC_SYSMEM; + msg.num_fds = proxy_listener->n_mr_sections; + msg.size = sizeof(SyncSysmemMsg); + if (msg.num_fds > REMOTE_MAX_FDS) { + error_report("Number of fds is more than %d", REMOTE_MAX_FDS); + return; + } + + for (region = 0; region < proxy_listener->n_mr_sections; region++) { + section = &proxy_listener->mr_sections[region]; + msg.data.sync_sysmem.gpas[region] = + section->offset_within_address_space; + msg.data.sync_sysmem.sizes[region] = int128_get64(section->size); + host_addr = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + msg.fds[region] = get_fd_from_hostaddr(host_addr, &offset); + msg.data.sync_sysmem.offsets[region] = offset; + } + if (!mpqemu_msg_send(&msg, proxy_listener->ioc, &local_err)) { + error_report_err(local_err); + } +} + +void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener) +{ + memory_listener_unregister(&proxy_listener->listener); + + proxy_memory_listener_reset(&proxy_listener->listener); +} + +void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener, + QIOChannel *ioc) +{ + proxy_listener->n_mr_sections = 0; + proxy_listener->mr_sections = NULL; + + proxy_listener->ioc = ioc; + + proxy_listener->listener.begin = proxy_memory_listener_reset; + proxy_listener->listener.commit = proxy_memory_listener_commit; + proxy_listener->listener.region_add = proxy_memory_listener_region_addnop; + proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop; + proxy_listener->listener.priority = 10; + + memory_listener_register(&proxy_listener->listener, + &address_space_memory); +} diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c new file mode 100644 index 0000000000..4fa4be079d --- /dev/null +++ b/hw/remote/proxy.c @@ -0,0 +1,379 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/proxy.h" +#include "hw/pci/pci.h" +#include "qapi/error.h" +#include "io/channel-util.h" +#include "hw/qdev-properties.h" +#include "monitor/monitor.h" +#include "migration/blocker.h" +#include "qemu/sockets.h" +#include "hw/remote/mpqemu-link.h" +#include "qemu/error-report.h" +#include "hw/remote/proxy-memory-listener.h" +#include "qom/object.h" +#include "qemu/event_notifier.h" +#include "sysemu/kvm.h" +#include "util/event_notifier-posix.c" + +static void probe_pci_info(PCIDevice *dev, Error **errp); +static void proxy_device_reset(DeviceState *dev); + +static void proxy_intx_update(PCIDevice *pci_dev) +{ + PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev); + PCIINTxRoute route; + int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1; + + if (dev->virq != -1) { + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq); + dev->virq = -1; + } + + route = pci_device_route_intx_to_irq(pci_dev, pin); + + dev->virq = route.irq; + + if (dev->virq != -1) { + kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr, + &dev->resample, dev->virq); + } +} + +static void setup_irqfd(PCIProxyDev *dev) +{ + PCIDevice *pci_dev = PCI_DEVICE(dev); + MPQemuMsg msg; + Error *local_err = NULL; + + event_notifier_init(&dev->intr, 0); + event_notifier_init(&dev->resample, 0); + + memset(&msg, 0, sizeof(MPQemuMsg)); + msg.cmd = MPQEMU_CMD_SET_IRQFD; + msg.num_fds = 2; + msg.fds[0] = event_notifier_get_fd(&dev->intr); + msg.fds[1] = event_notifier_get_fd(&dev->resample); + msg.size = 0; + + if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) { + error_report_err(local_err); + } + + dev->virq = -1; + + proxy_intx_update(pci_dev); + + pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update); +} + +static void pci_proxy_dev_realize(PCIDevice *device, Error **errp) +{ + ERRP_GUARD(); + PCIProxyDev *dev = PCI_PROXY_DEV(device); + uint8_t *pci_conf = device->config; + int fd; + + if (!dev->fd) { + error_setg(errp, "fd parameter not specified for %s", + DEVICE(device)->id); + return; + } + + fd = monitor_fd_param(monitor_cur(), dev->fd, errp); + if (fd == -1) { + error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd); + return; + } + + if (!fd_is_socket(fd)) { + error_setg(errp, "proxy: fd %d is not a socket", fd); + close(fd); + return; + } + + dev->ioc = qio_channel_new_fd(fd, errp); + + error_setg(&dev->migration_blocker, "%s does not support migration", + TYPE_PCI_PROXY_DEV); + migrate_add_blocker(dev->migration_blocker, errp); + + qemu_mutex_init(&dev->io_mutex); + qio_channel_set_blocking(dev->ioc, true, NULL); + + pci_conf[PCI_LATENCY_TIMER] = 0xff; + pci_conf[PCI_INTERRUPT_PIN] = 0x01; + + proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc); + + setup_irqfd(dev); + + probe_pci_info(PCI_DEVICE(dev), errp); +} + +static void pci_proxy_dev_exit(PCIDevice *pdev) +{ + PCIProxyDev *dev = PCI_PROXY_DEV(pdev); + + if (dev->ioc) { + qio_channel_close(dev->ioc, NULL); + } + + migrate_del_blocker(dev->migration_blocker); + + error_free(dev->migration_blocker); + + proxy_memory_listener_deconfigure(&dev->proxy_listener); + + event_notifier_cleanup(&dev->intr); + event_notifier_cleanup(&dev->resample); +} + +static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val, + int len, unsigned int op) +{ + MPQemuMsg msg = { 0 }; + uint64_t ret = -EINVAL; + Error *local_err = NULL; + + msg.cmd = op; + msg.data.pci_conf_data.addr = addr; + msg.data.pci_conf_data.val = (op == 
MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0; + msg.data.pci_conf_data.len = len; + msg.size = sizeof(PciConfDataMsg); + + ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + + if (ret == UINT64_MAX) { + error_report("Failed to perform PCI config %s operation", + (op == MPQEMU_CMD_PCI_CFGREAD) ? "READ" : "WRITE"); + } + + if (op == MPQEMU_CMD_PCI_CFGREAD) { + *val = (uint32_t)ret; + } +} + +static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len) +{ + uint32_t val; + + config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGREAD); + + return val; +} + +static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val, + int len) +{ + /* + * Some of the functions access the copy of remote device's PCI config + * space which is cached in the proxy device. Therefore, maintain + * it updated. + */ + pci_default_write_config(d, addr, val, len); + + config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE); +} + +static Property proxy_properties[] = { + DEFINE_PROP_STRING("fd", PCIProxyDev, fd), + DEFINE_PROP_END_OF_LIST(), +}; + +static void pci_proxy_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = pci_proxy_dev_realize; + k->exit = pci_proxy_dev_exit; + k->config_read = pci_proxy_read_config; + k->config_write = pci_proxy_write_config; + + dc->reset = proxy_device_reset; + + device_class_set_props(dc, proxy_properties); +} + +static const TypeInfo pci_proxy_dev_type_info = { + .name = TYPE_PCI_PROXY_DEV, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIProxyDev), + .class_init = pci_proxy_dev_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void pci_proxy_dev_register_types(void) +{ + type_register_static(&pci_proxy_dev_type_info); +} + +type_init(pci_proxy_dev_register_types) + +static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr, + bool write, hwaddr addr, uint64_t *val, + unsigned size, bool memory) +{ + MPQemuMsg msg = { 0 }; + long ret = -EINVAL; + Error *local_err = NULL; + + msg.size = sizeof(BarAccessMsg); + msg.data.bar_access.addr = mr->addr + addr; + msg.data.bar_access.size = size; + msg.data.bar_access.memory = memory; + + if (write) { + msg.cmd = MPQEMU_CMD_BAR_WRITE; + msg.data.bar_access.val = *val; + } else { + msg.cmd = MPQEMU_CMD_BAR_READ; + } + + ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + + if (!write) { + *val = ret; + } +} + +static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + ProxyMemoryRegion *pmr = opaque; + + send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size, + pmr->memory); +} + +static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size) +{ + ProxyMemoryRegion *pmr = opaque; + uint64_t val; + + send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size, + pmr->memory); + + return val; +} + +const MemoryRegionOps proxy_mr_ops = { + .read = proxy_bar_read, + .write = proxy_bar_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static void probe_pci_info(PCIDevice *dev, Error **errp) +{ + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + uint32_t orig_val, new_val, base_class, val; + PCIProxyDev *pdev = PCI_PROXY_DEV(dev); + DeviceClass *dc = 
DEVICE_CLASS(pc); + uint8_t type; + int i, size; + + config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->vendor_id = (uint16_t)val; + + config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->device_id = (uint16_t)val; + + config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->class_id = (uint16_t)val; + + config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->subsystem_id = (uint16_t)val; + + base_class = pc->class_id >> 8; + switch (base_class) { + case PCI_BASE_CLASS_BRIDGE: + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + break; + case PCI_BASE_CLASS_STORAGE: + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + break; + case PCI_BASE_CLASS_NETWORK: + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); + break; + case PCI_BASE_CLASS_INPUT: + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + break; + case PCI_BASE_CLASS_DISPLAY: + set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories); + break; + case PCI_BASE_CLASS_PROCESSOR: + set_bit(DEVICE_CATEGORY_CPU, dc->categories); + break; + default: + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + break; + } + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4, + MPQEMU_CMD_PCI_CFGREAD); + new_val = 0xffffffff; + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4, + MPQEMU_CMD_PCI_CFGWRITE); + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4, + MPQEMU_CMD_PCI_CFGREAD); + size = (~(new_val & 0xFFFFFFF0)) + 1; + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4, + MPQEMU_CMD_PCI_CFGWRITE); + type = (new_val & 0x1) ? + PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY; + + if (size) { + g_autofree char *name; + pdev->region[i].dev = pdev; + pdev->region[i].present = true; + if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) { + pdev->region[i].memory = true; + } + name = g_strdup_printf("bar-region-%d", i); + memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev), + &proxy_mr_ops, &pdev->region[i], + name, size); + pci_register_bar(dev, i, type, &pdev->region[i].mr); + } + } +} + +static void proxy_device_reset(DeviceState *dev) +{ + PCIProxyDev *pdev = PCI_PROXY_DEV(dev); + MPQemuMsg msg = { 0 }; + Error *local_err = NULL; + + msg.cmd = MPQEMU_CMD_DEVICE_RESET; + msg.size = 0; + + mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + +} diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c new file mode 100644 index 0000000000..4f21254219 --- /dev/null +++ b/hw/remote/remote-obj.c @@ -0,0 +1,203 @@ +/* + * Copyright © 2020, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "qemu/error-report.h" +#include "qemu/notify.h" +#include "qom/object_interfaces.h" +#include "hw/qdev-core.h" +#include "io/channel.h" +#include "hw/qdev-core.h" +#include "hw/remote/machine.h" +#include "io/channel-util.h" +#include "qapi/error.h" +#include "sysemu/sysemu.h" +#include "hw/pci/pci.h" +#include "qemu/sockets.h" +#include "monitor/monitor.h" + +#define TYPE_REMOTE_OBJECT "x-remote-object" +OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT) + +struct RemoteObjectClass { + ObjectClass parent_class; + + unsigned int nr_devs; + unsigned int max_devs; +}; + +struct RemoteObject { + /* private */ + Object parent; + + Notifier machine_done; + + int32_t fd; + char *devid; + + QIOChannel *ioc; + + DeviceState *dev; + DeviceListener listener; +}; + +static void remote_object_set_fd(Object *obj, const char *str, Error **errp) +{ + RemoteObject *o = REMOTE_OBJECT(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + + if (!fd_is_socket(fd)) { + error_setg(errp, "File descriptor '%s' is not a socket", str); + close(fd); + return; + } + + o->fd = fd; +} + +static void remote_object_set_devid(Object *obj, const char *str, Error **errp) +{ + RemoteObject *o = REMOTE_OBJECT(obj); + + g_free(o->devid); + + o->devid = g_strdup(str); +} + +static void remote_object_unrealize_listener(DeviceListener *listener, + DeviceState *dev) +{ + RemoteObject *o = container_of(listener, RemoteObject, listener); + + if (o->dev == dev) { + object_unref(OBJECT(o)); + } +} + +static void remote_object_machine_done(Notifier *notifier, void *data) +{ + RemoteObject *o = container_of(notifier, RemoteObject, machine_done); + DeviceState *dev = NULL; + QIOChannel *ioc = NULL; + Coroutine *co = NULL; + RemoteCommDev *comdev = NULL; + Error *err = NULL; + + dev = qdev_find_recursive(sysbus_get_default(), o->devid); + if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + error_report("%s is not a PCI device", o->devid); + return; + } + + ioc = qio_channel_new_fd(o->fd, &err); + if (!ioc) { + error_report_err(err); + return; + } + qio_channel_set_blocking(ioc, false, NULL); + + o->dev = dev; + + o->listener.unrealize = remote_object_unrealize_listener; + device_listener_register(&o->listener); + + /* co-routine should free this. 
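(mpqemu_remote_msg_loop_co() declares the RemoteCommDev pointer g_autofree, so the struct is released when the co-routine exits.) 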
*/ + comdev = g_new0(RemoteCommDev, 1); + *comdev = (RemoteCommDev) { + .ioc = ioc, + .dev = PCI_DEVICE(dev), + }; + + co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev); + qemu_coroutine_enter(co); +} + +static void remote_object_init(Object *obj) +{ + RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj); + RemoteObject *o = REMOTE_OBJECT(obj); + + if (k->nr_devs >= k->max_devs) { + error_report("Reached maximum number of devices: %u", k->max_devs); + return; + } + + o->ioc = NULL; + o->fd = -1; + o->devid = NULL; + + k->nr_devs++; + + o->machine_done.notify = remote_object_machine_done; + qemu_add_machine_init_done_notifier(&o->machine_done); +} + +static void remote_object_finalize(Object *obj) +{ + RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj); + RemoteObject *o = REMOTE_OBJECT(obj); + + device_listener_unregister(&o->listener); + + if (o->ioc) { + qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + qio_channel_close(o->ioc, NULL); + } + + object_unref(OBJECT(o->ioc)); + + k->nr_devs--; + g_free(o->devid); +} + +static void remote_object_class_init(ObjectClass *klass, void *data) +{ + RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass); + + /* + * Limit number of supported devices to 1. This is done to avoid devices + * from one VM accessing the RAM of another VM. This is done until we + * start using separate address spaces for individual devices. + */ + k->max_devs = 1; + k->nr_devs = 0; + + object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd); + object_class_property_add_str(klass, "devid", NULL, + remote_object_set_devid); +} + +static const TypeInfo remote_object_info = { + .name = TYPE_REMOTE_OBJECT, + .parent = TYPE_OBJECT, + .instance_size = sizeof(RemoteObject), + .instance_init = remote_object_init, + .instance_finalize = remote_object_finalize, + .class_size = sizeof(RemoteObjectClass), + .class_init = remote_object_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&remote_object_info); +} + +type_init(register_types); diff --git a/hw/remote/trace-events b/hw/remote/trace-events new file mode 100644 index 0000000000..0b23974f90 --- /dev/null +++ b/hw/remote/trace-events @@ -0,0 +1,4 @@ +# multi-process trace events + +mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process" +mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive command %d size %d, %d file descriptors from remote process" diff --git a/hw/remote/trace.h b/hw/remote/trace.h new file mode 100644 index 0000000000..5d5e3ac720 --- /dev/null +++ b/hw/remote/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_remote.h" diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c index ab3a2482aa..93eccfc05d 100644 --- a/hw/s390x/pv.c +++ b/hw/s390x/pv.c @@ -14,8 +14,11 @@ #include <linux/kvm.h> #include "cpu.h" +#include "qapi/error.h" +#include "qemu/error-report.h" #include "sysemu/kvm.h" +#include "qom/object_interfaces.h" +#include "exec/confidential-guest-support.h" #include "hw/s390x/ipl.h" #include "hw/s390x/pv.h" @@ -111,3 +114,62 @@ void s390_pv_inject_reset_error(CPUState *cs) /* Report that we are unable to enter protected mode */ env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV; } + +#define TYPE_S390_PV_GUEST "s390-pv-guest" +OBJECT_DECLARE_SIMPLE_TYPE(S390PVGuest, S390_PV_GUEST) + +/** + * S390PVGuest: + * + * The S390PVGuest object is basically a dummy used to tell the + * confidential guest support system to use s390's PV 
mechanism. + * + * # $QEMU \ + * -object s390-pv-guest,id=pv0 \ + * -machine ...,confidential-guest-support=pv0 + */ +struct S390PVGuest { + ConfidentialGuestSupport parent_obj; +}; + +typedef struct S390PVGuestClass S390PVGuestClass; + +struct S390PVGuestClass { + ConfidentialGuestSupportClass parent_class; +}; + +int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!object_dynamic_cast(OBJECT(cgs), TYPE_S390_PV_GUEST)) { + return 0; + } + + if (!s390_has_feat(S390_FEAT_UNPACK)) { + error_setg(errp, + "CPU model does not support Protected Virtualization"); + return -1; + } + + cgs->ready = true; + + return 0; +} + +OBJECT_DEFINE_TYPE_WITH_INTERFACES(S390PVGuest, + s390_pv_guest, + S390_PV_GUEST, + CONFIDENTIAL_GUEST_SUPPORT, + { TYPE_USER_CREATABLE }, + { NULL }) + +static void s390_pv_guest_class_init(ObjectClass *oc, void *data) +{ +} + +static void s390_pv_guest_init(Object *obj) +{ +} + +static void s390_pv_guest_finalize(Object *obj) +{ +} diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index a2d9a79c84..2972b607f3 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -250,6 +250,9 @@ static void ccw_init(MachineState *machine) /* init CPUs (incl. CPU model) early so s390_has_feature() works */ s390_init_cpus(machine); + /* Need CPU model to be determined before we can set up PV */ + s390_pv_init(machine->cgs, &error_fatal); + s390_flic_init(); /* init the SIGP facility */ diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c index b995bab3a2..2c83a0ab1f 100644 --- a/hw/scsi/virtio-scsi-dataplane.c +++ b/hw/scsi/virtio-scsi-dataplane.c @@ -126,6 +126,7 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev) { int i; int rc; + int vq_init_count = 0; BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev); @@ -153,17 +154,22 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev) if (rc) { goto fail_vrings; } + + vq_init_count++; rc = virtio_scsi_vring_init(s, vs->event_vq, 1, virtio_scsi_data_plane_handle_event); if (rc) { goto fail_vrings; } + + vq_init_count++; for (i = 0; i < vs->conf.num_queues; i++) { rc = virtio_scsi_vring_init(s, vs->cmd_vqs[i], i + 2, virtio_scsi_data_plane_handle_cmd); if (rc) { goto fail_vrings; } + vq_init_count++; } s->dataplane_starting = false; @@ -174,7 +180,7 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev) fail_vrings: aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s); aio_context_release(s->ctx); - for (i = 0; i < vs->conf.num_queues + 2; i++) { + for (i = 0; i < vq_init_count; i++) { virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false); virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i); } diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c index 98e70b2d26..15caff0e41 100644 --- a/hw/timer/arm_timer.c +++ b/hw/timer/arm_timer.c @@ -185,10 +185,11 @@ static arm_timer_state *arm_timer_init(uint32_t freq) return s; } -/* ARM PrimeCell SP804 dual timer module. +/* + * ARM PrimeCell SP804 dual timer module. 
* Docs at - * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0271d/index.html -*/ + * https://developer.arm.com/documentation/ddi0271/latest/ + */ #define TYPE_SP804 "sp804" OBJECT_DECLARE_SIMPLE_TYPE(SP804State, SP804) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index fc8d63c850..c5c4c61d01 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -14,6 +14,7 @@ #include CONFIG_DEVICES #include "exec/memop.h" #include "qemu/units.h" +#include "qemu/log.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" @@ -264,8 +265,15 @@ static uint64_t vfio_ati_3c3_quirk_read(void *opaque, return data; } +static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__); +} + static const MemoryRegionOps vfio_ati_3c3_quirk = { .read = vfio_ati_3c3_quirk_read, + .write = vfio_ati_3c3_quirk_write, .endianness = DEVICE_LITTLE_ENDIAN, }; diff --git a/include/block/block.h b/include/block/block.h index 0a9f2c187c..2f2698074e 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -849,4 +849,7 @@ int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, BdrvChild *dst, int64_t dst_offset, int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); + +void bdrv_cancel_in_flight(BlockDriverState *bs); + #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 22a2789d35..88e4111939 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -353,6 +353,15 @@ struct BlockDriver { int64_t *map, BlockDriverState **file); /* + * This informs the driver that we are no longer interested in the result + * of in-flight requests, so it can stop wasting time on them where possible. + * + * One example usage is to avoid waiting for an NBD target node reconnect + * timeout during job-cancel. + */ + void (*bdrv_cancel_in_flight)(BlockDriverState *bs); + + /* + * Invalidate any cached meta-data.
*/ void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs, diff --git a/include/block/nvme.h b/include/block/nvme.h index 3e02d9ca98..07cfc92936 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -15,14 +15,19 @@ typedef struct QEMU_PACKED NvmeBar { uint64_t acq; uint32_t cmbloc; uint32_t cmbsz; - uint8_t padding[3520]; /* not used by QEMU */ + uint32_t bpinfo; + uint32_t bprsel; + uint64_t bpmbl; + uint64_t cmbmsc; + uint32_t cmbsts; + uint8_t rsvd92[3492]; uint32_t pmrcap; uint32_t pmrctl; uint32_t pmrsts; uint32_t pmrebs; uint32_t pmrswtp; uint64_t pmrmsc; - uint8_t reserved[484]; + uint8_t css[484]; } NvmeBar; enum NvmeCapShift { @@ -35,7 +40,8 @@ enum NvmeCapShift { CAP_CSS_SHIFT = 37, CAP_MPSMIN_SHIFT = 48, CAP_MPSMAX_SHIFT = 52, - CAP_PMR_SHIFT = 56, + CAP_PMRS_SHIFT = 56, + CAP_CMBS_SHIFT = 57, }; enum NvmeCapMask { @@ -48,7 +54,8 @@ enum NvmeCapMask { CAP_CSS_MASK = 0xff, CAP_MPSMIN_MASK = 0xf, CAP_MPSMAX_MASK = 0xf, - CAP_PMR_MASK = 0x1, + CAP_PMRS_MASK = 0x1, + CAP_CMBS_MASK = 0x1, }; #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) @@ -60,6 +67,8 @@ enum NvmeCapMask { #define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) +#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) +#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ << CAP_MQES_SHIFT) @@ -78,12 +87,15 @@ enum NvmeCapMask { #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ << CAP_MPSMIN_SHIFT) #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ - << CAP_MPSMAX_SHIFT) -#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\ - << CAP_PMR_SHIFT) + << CAP_MPSMAX_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \ + << CAP_PMRS_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \ + << CAP_CMBS_SHIFT) enum NvmeCapCss { NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI_SUPP = 1 << 6, NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, }; @@ -117,9 +129,25 @@ enum NvmeCcMask { enum NvmeCcCss { NVME_CC_CSS_NVM = 0x0, + NVME_CC_CSS_CSI = 0x6, NVME_CC_CSS_ADMIN_ONLY = 0x7, }; +#define NVME_SET_CC_EN(cc, val) \ + (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val) \ + (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val) \ + (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val) \ + (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val) \ + (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ + (cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT) +#define NVME_SET_CC_IOCQES(cc, val) \ + (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT) + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, @@ -162,25 +190,64 @@ enum NvmeAqaMask { #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) enum NvmeCmblocShift { - CMBLOC_BIR_SHIFT = 0, - CMBLOC_OFST_SHIFT = 12, + CMBLOC_BIR_SHIFT = 0, + CMBLOC_CQMMS_SHIFT = 3, + CMBLOC_CQPDS_SHIFT = 4, + CMBLOC_CDPMLS_SHIFT = 5, + CMBLOC_CDPCILS_SHIFT = 6, + CMBLOC_CDMMMS_SHIFT = 7, + CMBLOC_CQDA_SHIFT = 8, + CMBLOC_OFST_SHIFT = 12, }; 
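The CAP and CC accessor macros above all follow one shift-and-mask pattern: each field gets a _SHIFT/_MASK pair, a getter that shifts then masks, and a SET macro that ORs the masked value into place. A minimal sketch of how a controller model might compose and read back a CAP value with the macros shown here (the field values are illustrative only, not taken from any real device):

#include <assert.h>
#include <stdint.h>
#include "block/nvme.h"   /* the header being patched above */

static uint64_t build_cap(void)
{
    uint64_t cap = 0;

    NVME_CAP_SET_MQES(cap, 2047);  /* 2048-entry queues, zero-based value */
    NVME_CAP_SET_MPSMAX(cap, 4);   /* max page size 2^(12 + 4) = 64 KiB */
    NVME_CAP_SET_PMRS(cap, 1);     /* persistent memory region supported */
    NVME_CAP_SET_CMBS(cap, 1);     /* controller memory buffer supported */

    /* The matching getters recover each field unchanged. */
    assert(NVME_CAP_MQES(cap) == 2047);
    assert(NVME_CAP_PMRS(cap) && NVME_CAP_CMBS(cap));
    return cap;
}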
enum NvmeCmblocMask { - CMBLOC_BIR_MASK = 0x7, - CMBLOC_OFST_MASK = 0xfffff, + CMBLOC_BIR_MASK = 0x7, + CMBLOC_CQMMS_MASK = 0x1, + CMBLOC_CQPDS_MASK = 0x1, + CMBLOC_CDPMLS_MASK = 0x1, + CMBLOC_CDPCILS_MASK = 0x1, + CMBLOC_CDMMMS_MASK = 0x1, + CMBLOC_CQDA_MASK = 0x1, + CMBLOC_OFST_MASK = 0xfffff, }; -#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \ - CMBLOC_BIR_MASK) -#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \ - CMBLOC_OFST_MASK) - -#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ +#define NVME_CMBLOC_BIR(cmbloc) \ + ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK) +#define NVME_CMBLOC_CQMMS(cmbloc) \ + ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK) +#define NVME_CMBLOC_CQPDS(cmbloc) \ + ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK) +#define NVME_CMBLOC_CDPMLS(cmbloc) \ + ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK) +#define NVME_CMBLOC_CDPCILS(cmbloc) \ + ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK) +#define NVME_CMBLOC_CDMMMS(cmbloc) \ + ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK) +#define NVME_CMBLOC_CQDA(cmbloc) \ + ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK) +#define NVME_CMBLOC_OFST(cmbloc) \ + ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK) + +#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT) +#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT) +#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT) +#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT) +#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT) +#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT) +#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT) #define NVME_CMBLOC_SET_OFST(cmbloc, val) \ (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT) enum NvmeCmbszShift { CMBSZ_SQS_SHIFT = 0, CMBSZ_CQS_SHIFT = 1, @@ -227,6 +294,46 @@ enum NvmeCmbszMask { #define NVME_CMBSZ_GETSIZE(cmbsz) \ (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) +enum NvmeCmbmscShift { + CMBMSC_CRE_SHIFT = 0, + CMBMSC_CMSE_SHIFT = 1, + CMBMSC_CBA_SHIFT = 12, +}; + +enum NvmeCmbmscMask { + CMBMSC_CRE_MASK = 0x1, + CMBMSC_CMSE_MASK = 0x1, + CMBMSC_CBA_MASK = ((1ULL << 52) - 1), +}; + +#define NVME_CMBMSC_CRE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CRE_SHIFT) & CMBMSC_CRE_MASK) +#define NVME_CMBMSC_CMSE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK) +#define NVME_CMBMSC_CBA(cmbmsc) \ + ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK) + + +#define NVME_CMBMSC_SET_CRE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT) +#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT) +#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT) + +enum NvmeCmbstsShift { + CMBSTS_CBAI_SHIFT = 0, +}; +enum NvmeCmbstsMask { + CMBSTS_CBAI_MASK = 0x1, +}; + +#define NVME_CMBSTS_CBAI(cmbsts) \ + ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK) +
+#define NVME_CMBSTS_SET_CBAI(cmbsts, val) \ + (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT) + enum NvmePmrcapShift { PMRCAP_RDS_SHIFT = 3, PMRCAP_WDS_SHIFT = 4, @@ -472,6 +579,9 @@ enum NvmeIoCommands { NVME_CMD_COMPARE = 0x05, NVME_CMD_WRITE_ZEROES = 0x08, NVME_CMD_DSM = 0x09, + NVME_CMD_ZONE_MGMT_SEND = 0x79, + NVME_CMD_ZONE_MGMT_RECV = 0x7a, + NVME_CMD_ZONE_APPEND = 0x7d, }; typedef struct QEMU_PACKED NvmeDeleteQ { @@ -540,8 +650,13 @@ typedef struct QEMU_PACKED NvmeIdentify { uint64_t rsvd2[2]; uint64_t prp1; uint64_t prp2; - uint32_t cns; - uint32_t rsvd11[5]; + uint8_t cns; + uint8_t rsvd10; + uint16_t ctrlid; + uint16_t nvmsetid; + uint8_t rsvd11; + uint8_t csi; + uint32_t rsvd12[4]; } NvmeIdentify; typedef struct QEMU_PACKED NvmeRwCmd { @@ -632,9 +747,13 @@ typedef struct QEMU_PACKED NvmeAerResult { uint8_t resv; } NvmeAerResult; +typedef struct QEMU_PACKED NvmeZonedResult { + uint64_t slba; +} NvmeZonedResult; + typedef struct QEMU_PACKED NvmeCqe { uint32_t result; - uint32_t rsvd; + uint32_t dw1; uint16_t sq_head; uint16_t sq_id; uint16_t cid; @@ -662,6 +781,8 @@ enum NvmeStatusCodes { NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, + NVME_CMD_SET_CMB_REJECTED = 0x002b, + NVME_INVALID_CMD_SET = 0x002c, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -686,6 +807,14 @@ enum NvmeStatusCodes { NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO = 0x0182, + NVME_ZONE_BOUNDARY_ERROR = 0x01b8, + NVME_ZONE_FULL = 0x01b9, + NVME_ZONE_READ_ONLY = 0x01ba, + NVME_ZONE_OFFLINE = 0x01bb, + NVME_ZONE_INVALID_WRITE = 0x01bc, + NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd, + NVME_ZONE_TOO_MANY_OPEN = 0x01be, + NVME_ZONE_INVAL_TRANSITION = 0x01bf, NVME_WRITE_FAULT = 0x0280, NVME_UNRECOVERED_READ = 0x0281, NVME_E2E_GUARD_ERROR = 0x0282, @@ -693,6 +822,7 @@ enum NvmeStatusCodes { NVME_E2E_REF_ERROR = 0x0284, NVME_CMP_FAILURE = 0x0285, NVME_ACCESS_DENIED = 0x0286, + NVME_DULB = 0x0287, NVME_MORE = 0x2000, NVME_DNR = 0x4000, NVME_NO_COMPLETE = 0xffff, @@ -743,18 +873,37 @@ typedef struct QEMU_PACKED NvmeSmartLog { uint8_t reserved2[320]; } NvmeSmartLog; +#define NVME_SMART_WARN_MAX 6 enum NvmeSmartWarn { NVME_SMART_SPARE = 1 << 0, NVME_SMART_TEMPERATURE = 1 << 1, NVME_SMART_RELIABILITY = 1 << 2, NVME_SMART_MEDIA_READ_ONLY = 1 << 3, NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, + NVME_SMART_PMR_UNRELIABLE = 1 << 5, +}; + +typedef struct NvmeEffectsLog { + uint32_t acs[256]; + uint32_t iocs[256]; + uint8_t resv[2048]; +} NvmeEffectsLog; + +enum { + NVME_CMD_EFF_CSUPP = 1 << 0, + NVME_CMD_EFF_LBCC = 1 << 1, + NVME_CMD_EFF_NCC = 1 << 2, + NVME_CMD_EFF_NIC = 1 << 3, + NVME_CMD_EFF_CCC = 1 << 4, + NVME_CMD_EFF_CSE_MASK = 3 << 16, + NVME_CMD_EFF_UUID_SEL = 1 << 19, }; enum NvmeLogIdentifier { NVME_LOG_ERROR_INFO = 0x01, NVME_LOG_SMART_INFO = 0x02, NVME_LOG_FW_SLOT_INFO = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, }; typedef struct QEMU_PACKED NvmePSD { @@ -771,11 +920,19 @@ typedef struct QEMU_PACKED NvmePSD { #define NVME_IDENTIFY_DATA_SIZE 4096 -enum { - NVME_ID_CNS_NS = 0x0, - NVME_ID_CNS_CTRL = 0x1, - NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, - NVME_ID_CNS_NS_DESCR_LIST = 0x3, +enum NvmeIdCns { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESCR_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + 
NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a, + NVME_ID_CNS_CS_NS_PRESENT = 0x1b, + NVME_ID_CNS_IO_COMMAND_SET = 0x1c, }; typedef struct QEMU_PACKED NvmeIdCtrl { @@ -794,7 +951,8 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint32_t rtd3e; uint32_t oaes; uint32_t ctratt; - uint8_t rsvd100[12]; + uint8_t rsvd100[11]; + uint8_t cntrltype; uint8_t fguid[16]; uint8_t rsvd128[128]; uint16_t oacs; @@ -845,6 +1003,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t vs[1024]; } NvmeIdCtrl; +typedef struct NvmeIdCtrlZoned { + uint8_t zasl; + uint8_t rsvd1[4095]; +} NvmeIdCtrlZoned; + enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, @@ -867,6 +1030,7 @@ enum NvmeIdCtrlFrmw { enum NvmeIdCtrlLpa { NVME_LPA_NS_SMART = 1 << 0, + NVME_LPA_CSE = 1 << 1, NVME_LPA_EXTENDED = 1 << 2, }; @@ -909,6 +1073,9 @@ enum NvmeIdCtrlLpa { #define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1) #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1) +#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff) +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000) + enum NvmeFeatureIds { NVME_ARBITRATION = 0x1, NVME_POWER_MANAGEMENT = 0x2, @@ -922,6 +1089,7 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY = 0xa, NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, NVME_TIMESTAMP = 0xe, + NVME_COMMAND_SET_PROFILE = 0x19, NVME_SOFTWARE_PROGRESS_MARKER = 0x80, NVME_FID_MAX = 0x100, }; @@ -968,6 +1136,12 @@ typedef struct QEMU_PACKED NvmeLBAF { uint8_t rp; } NvmeLBAF; +typedef struct QEMU_PACKED NvmeLBAFE { + uint64_t zsze; + uint8_t zdes; + uint8_t rsvd9[7]; +} NvmeLBAFE; + #define NVME_NSID_BROADCAST 0xffffffff typedef struct QEMU_PACKED NvmeIdNs { @@ -992,7 +1166,12 @@ typedef struct QEMU_PACKED NvmeIdNs { uint16_t nabspf; uint16_t noiob; uint8_t nvmcap[16]; - uint8_t rsvd64[40]; + uint16_t npwg; + uint16_t npwa; + uint16_t npdg; + uint16_t npda; + uint16_t nows; + uint8_t rsvd74[30]; uint8_t nguid[16]; uint64_t eui64; NvmeLBAF lbaf[16]; @@ -1006,18 +1185,40 @@ typedef struct QEMU_PACKED NvmeIdNsDescr { uint8_t rsvd2[2]; } NvmeIdNsDescr; -enum { - NVME_NIDT_EUI64_LEN = 8, - NVME_NIDT_NGUID_LEN = 16, - NVME_NIDT_UUID_LEN = 16, +enum NvmeNsIdentifierLength { + NVME_NIDL_EUI64 = 8, + NVME_NIDL_NGUID = 16, + NVME_NIDL_UUID = 16, + NVME_NIDL_CSI = 1, }; enum NvmeNsIdentifierType { - NVME_NIDT_EUI64 = 0x1, - NVME_NIDT_NGUID = 0x2, - NVME_NIDT_UUID = 0x3, + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; +enum NvmeCsi { + NVME_CSI_NVM = 0x00, + NVME_CSI_ZONED = 0x02, +}; + +#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) + +typedef struct QEMU_PACKED NvmeIdNsZoned { + uint16_t zoc; + uint16_t ozcs; + uint32_t mar; + uint32_t mor; + uint32_t rrl; + uint32_t frl; + uint8_t rsvd20[2796]; + NvmeLBAFE lbafe[16]; + uint8_t rsvd3072[768]; + uint8_t vs[256]; +} NvmeIdNsZoned; + /*Deallocate Logical Block Features*/ #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08) @@ -1029,6 +1230,7 @@ enum NvmeNsIdentifierType { #define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1) #define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1) #define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) #define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) @@ -1049,10 +1251,76 @@ enum NvmeIdNsDps { DPS_FIRST_EIGHT = 8, }; +enum NvmeZoneAttr { + NVME_ZA_FINISHED_BY_CTLR = 1 << 0, + NVME_ZA_FINISH_RECOMMENDED = 1 << 1, + NVME_ZA_RESET_RECOMMENDED = 1 << 2, + 
NVME_ZA_ZD_EXT_VALID = 1 << 7, +}; + +typedef struct QEMU_PACKED NvmeZoneReportHeader { + uint64_t nr_zones; + uint8_t rsvd[56]; +} NvmeZoneReportHeader; + +enum NvmeZoneReceiveAction { + NVME_ZONE_REPORT = 0, + NVME_ZONE_REPORT_EXTENDED = 1, +}; + +enum NvmeZoneReportType { + NVME_ZONE_REPORT_ALL = 0, + NVME_ZONE_REPORT_EMPTY = 1, + NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2, + NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3, + NVME_ZONE_REPORT_CLOSED = 4, + NVME_ZONE_REPORT_FULL = 5, + NVME_ZONE_REPORT_READ_ONLY = 6, + NVME_ZONE_REPORT_OFFLINE = 7, +}; + +enum NvmeZoneType { + NVME_ZONE_TYPE_RESERVED = 0x00, + NVME_ZONE_TYPE_SEQ_WRITE = 0x02, +}; + +enum NvmeZoneSendAction { + NVME_ZONE_ACTION_RSD = 0x00, + NVME_ZONE_ACTION_CLOSE = 0x01, + NVME_ZONE_ACTION_FINISH = 0x02, + NVME_ZONE_ACTION_OPEN = 0x03, + NVME_ZONE_ACTION_RESET = 0x04, + NVME_ZONE_ACTION_OFFLINE = 0x05, + NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, +}; + +typedef struct QEMU_PACKED NvmeZoneDescr { + uint8_t zt; + uint8_t zs; + uint8_t za; + uint8_t rsvd3[5]; + uint64_t zcap; + uint64_t zslba; + uint64_t wp; + uint8_t rsvd32[32]; +} NvmeZoneDescr; + +typedef enum NvmeZoneState { + NVME_ZONE_STATE_RESERVED = 0x00, + NVME_ZONE_STATE_EMPTY = 0x01, + NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02, + NVME_ZONE_STATE_EXPLICITLY_OPEN = 0x03, + NVME_ZONE_STATE_CLOSED = 0x04, + NVME_ZONE_STATE_READ_ONLY = 0x0D, + NVME_ZONE_STATE_FULL = 0x0E, + NVME_ZONE_STATE_OFFLINE = 0x0F, +} NvmeZoneState; + static inline void _nvme_check_size(void) { QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); @@ -1066,9 +1334,15 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64); } #endif diff --git a/include/block/snapshot.h b/include/block/snapshot.h index b0fe42993d..940345692f 100644 --- a/include/block/snapshot.h +++ b/include/block/snapshot.h @@ -25,7 +25,7 @@ #ifndef SNAPSHOT_H #define SNAPSHOT_H - +#include "qapi/qapi-builtin-types.h" #define SNAPSHOT_OPT_BASE "snapshot." 
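The zone descriptor and zone state definitions above mirror the Zoned Namespace command set: in a zone descriptor the zs byte carries the machine-readable state in its upper nibble, which is why the NvmeZoneState values fit in four bits. A hedged sketch of a consumer deciding whether a zone can still accept writes (the helper is illustrative, not part of the patch):

#include <stdbool.h>
#include "block/nvme.h"

static bool zone_is_writable(const NvmeZoneDescr *zd)
{
    /* The zone state lives in bits 7:4 of the zs byte. */
    NvmeZoneState st = (NvmeZoneState)(zd->zs >> 4);

    switch (st) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return true;
    default: /* read-only, full, offline or reserved */
        return false;
    }
}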
#define SNAPSHOT_OPT_ID "snapshot.id" @@ -77,17 +77,26 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, * These functions will properly handle dataplane (take aio_context_acquire * when appropriate for appropriate block drivers */ -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs); -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bsd_bs, +bool bdrv_all_can_snapshot(bool has_devices, strList *devices, + Error **errp); +int bdrv_all_delete_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp); -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs, +int bdrv_all_goto_snapshot(const char *name, + bool has_devices, strList *devices, Error **errp); -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs); +int bdrv_all_has_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp); int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, BlockDriverState *vm_state_bs, uint64_t vm_state_size, - BlockDriverState **first_bad_bs); + bool has_devices, + strList *devices, + Error **errp); -BlockDriverState *bdrv_all_find_vmstate_bs(void); +BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs, + bool has_devices, strList *devices, + Error **errp); #endif diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h new file mode 100644 index 0000000000..ba2dd4b5df --- /dev/null +++ b/include/exec/confidential-guest-support.h @@ -0,0 +1,62 @@ +/* + * QEMU Confidential Guest support + * This interface describes the common pieces between various + * schemes for protecting guest memory or other state against a + * compromised hypervisor. This includes memory encryption (AMD's + * SEV and Intel's MKTME) or special protection modes (PEF on POWER, + * or PV on s390x). + * + * Copyright Red Hat. + * + * Authors: + * David Gibson <david@gibson.dropbear.id.au> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + * + */ +#ifndef QEMU_CONFIDENTIAL_GUEST_SUPPORT_H +#define QEMU_CONFIDENTIAL_GUEST_SUPPORT_H + +#ifndef CONFIG_USER_ONLY + +#include "qom/object.h" + +#define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support" +OBJECT_DECLARE_SIMPLE_TYPE(ConfidentialGuestSupport, CONFIDENTIAL_GUEST_SUPPORT) + +struct ConfidentialGuestSupport { + Object parent; + + /* + * ready: flag set by CGS initialization code once it's ready to + * start executing instructions in a potentially-secure + * guest + * + * The definition here is a bit fuzzy, because this is essentially + * part of a self-sanity-check, rather than a strict mechanism. + * + * It's not feasible to have a single point in the common machine + * init path to configure confidential guest support, because + * different mechanisms have different interdependencies requiring + * initialization in different places, often in arch or machine + * type specific code. It's also usually not possible to check + * for invalid configurations until that initialization code. + * That means it would be very easy to have a bug allowing CGS + * init to be bypassed entirely in certain configurations. + * + * Silently ignoring a requested security feature would be bad, so + * to avoid that we check late in init that this 'ready' flag is + * set if CGS was requested. If the CGS init hasn't happened, and + * so 'ready' is not set, we'll abort. 
+ */ + bool ready; +}; + +typedef struct ConfidentialGuestSupportClass { + ObjectClass parent; +} ConfidentialGuestSupportClass; + +#endif /* !CONFIG_USER_ONLY */ + +#endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */ diff --git a/include/exec/memory.h b/include/exec/memory.h index c6ce74fb79..c6fb714e49 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -45,13 +45,11 @@ DECLARE_OBJ_CHECKERS(IOMMUMemoryRegion, IOMMUMemoryRegionClass, #ifdef CONFIG_FUZZ void fuzz_dma_read_cb(size_t addr, size_t len, - MemoryRegion *mr, - bool is_write); + MemoryRegion *mr); #else static inline void fuzz_dma_read_cb(size_t addr, size_t len, - MemoryRegion *mr, - bool is_write) + MemoryRegion *mr) { /* Do Nothing */ } @@ -149,6 +147,14 @@ typedef struct IOMMUTLBEvent { /* RAM is a persistent kind memory */ #define RAM_PMEM (1 << 5) + +/* + * UFFDIO_WRITEPROTECT is used on this RAMBlock to + * support 'write-tracking' migration type. + * Implies ram_state->ram_wt_enabled. + */ +#define RAM_UF_WRITEPROTECT (1 << 6) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, @@ -992,6 +998,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, * @size: size of the region. * @share: %true if memory must be mmaped with the MAP_SHARED flag * @fd: the fd to mmap. + * @offset: offset within the file referenced by fd * @errp: pointer to Error*, to store an error if it happens. * * Note that this function does not do anything to cause the data in the @@ -1003,6 +1010,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr, uint64_t size, bool share, int fd, + ram_addr_t offset, Error **errp); #endif @@ -2506,7 +2514,7 @@ address_space_read_cached(MemoryRegionCache *cache, hwaddr addr, void *buf, hwaddr len) { assert(addr < cache->len && len <= cache->len - addr); - fuzz_dma_read_cb(cache->xlat + addr, len, cache->mrs.mr, false); + fuzz_dma_read_cb(cache->xlat + addr, len, cache->mrs.mr); if (likely(cache->ptr)) { memcpy(buf, cache->ptr + addr, len); return MEMTX_OK; diff --git a/include/exec/memory_ldst_cached.h.inc b/include/exec/memory_ldst_cached.h.inc index 01efad62de..7bc8790d34 100644 --- a/include/exec/memory_ldst_cached.h.inc +++ b/include/exec/memory_ldst_cached.h.inc @@ -28,7 +28,7 @@ static inline uint32_t ADDRESS_SPACE_LD_CACHED(l)(MemoryRegionCache *cache, hwaddr addr, MemTxAttrs attrs, MemTxResult *result) { assert(addr < cache->len && 4 <= cache->len - addr); - fuzz_dma_read_cb(cache->xlat + addr, 4, cache->mrs.mr, false); + fuzz_dma_read_cb(cache->xlat + addr, 4, cache->mrs.mr); if (likely(cache->ptr)) { return LD_P(l)(cache->ptr + addr); } else { @@ -40,7 +40,7 @@ static inline uint64_t ADDRESS_SPACE_LD_CACHED(q)(MemoryRegionCache *cache, hwaddr addr, MemTxAttrs attrs, MemTxResult *result) { assert(addr < cache->len && 8 <= cache->len - addr); - fuzz_dma_read_cb(cache->xlat + addr, 8, cache->mrs.mr, false); + fuzz_dma_read_cb(cache->xlat + addr, 8, cache->mrs.mr); if (likely(cache->ptr)) { return LD_P(q)(cache->ptr + addr); } else { @@ -52,7 +52,7 @@ static inline uint32_t ADDRESS_SPACE_LD_CACHED(uw)(MemoryRegionCache *cache, hwaddr addr, MemTxAttrs attrs, MemTxResult *result) { assert(addr < cache->len && 2 <= cache->len - addr); - fuzz_dma_read_cb(cache->xlat + addr, 2, cache->mrs.mr, false); + fuzz_dma_read_cb(cache->xlat + addr, 2, cache->mrs.mr); if (likely(cache->ptr)) { return LD_P(uw)(cache->ptr + addr); } else { diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 40b16609ab..3cb9791df3 100644 
--- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -121,8 +121,8 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, uint32_t ram_flags, const char *mem_path, bool readonly, Error **errp); RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, - uint32_t ram_flags, int fd, bool readonly, - Error **errp); + uint32_t ram_flags, int fd, off_t offset, + bool readonly, Error **errp); RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr, Error **errp); diff --git a/include/hw/boards.h b/include/hw/boards.h index 85af4faf76..a46dfe5d1a 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -270,7 +270,7 @@ struct MachineState { bool iommu; bool suppress_vmdesc; bool enable_graphics; - char *memory_encryption; + ConfidentialGuestSupport *cgs; char *ram_memdev_id; /* * convenience alias to ram_memdev_id backend memory region diff --git a/include/hw/dma/pl080.h b/include/hw/dma/pl080.h index 1883f04270..3c9659e438 100644 --- a/include/hw/dma/pl080.h +++ b/include/hw/dma/pl080.h @@ -10,11 +10,12 @@ * (at your option) any later version. */ -/* This is a model of the Arm PrimeCell PL080/PL081 DMA controller: +/* + * This is a model of the Arm PrimeCell PL080/PL081 DMA controller: * The PL080 TRM is: - * http://infocenter.arm.com/help/topic/com.arm.doc.ddi0196g/DDI0196.pdf + * https://developer.arm.com/documentation/ddi0196/latest * and the PL081 TRM is: - * http://infocenter.arm.com/help/topic/com.arm.doc.ddi0218e/DDI0218.pdf + * https://developer.arm.com/documentation/ddi0218/latest * * QEMU interface: * + sysbus IRQ 0: DMACINTR combined interrupt line diff --git a/include/hw/misc/arm_integrator_debug.h b/include/hw/misc/arm_integrator_debug.h index 0077dacb44..798b082164 100644 --- a/include/hw/misc/arm_integrator_debug.h +++ b/include/hw/misc/arm_integrator_debug.h @@ -3,7 +3,7 @@ * * Browse the data sheet: * - * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0159b/Babbfijf.html + * https://developer.arm.com/documentation/dui0159/b/peripherals-and-interfaces/debug-leds-and-dip-switch-interface * * Copyright (c) 2013 Alex Bennée <alex@bennee.com> * diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h new file mode 100644 index 0000000000..3dcf6aa51d --- /dev/null +++ b/include/hw/pci-host/remote.h @@ -0,0 +1,30 @@ +/* + * PCI Host for remote device + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef REMOTE_PCIHOST_H +#define REMOTE_PCIHOST_H + +#include "exec/memory.h" +#include "hw/pci/pcie_host.h" + +#define TYPE_REMOTE_PCIHOST "remote-pcihost" +OBJECT_DECLARE_SIMPLE_TYPE(RemotePCIHost, REMOTE_PCIHOST) + +struct RemotePCIHost { + /*< private >*/ + PCIExpressHost parent_obj; + /*< public >*/ + + MemoryRegion *mr_pci_mem; + MemoryRegion *mr_sys_io; + MemoryRegion *mr_sys_mem; +}; + +#endif diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index bd014823a9..5b03a7b0eb 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -210,4 +210,6 @@ static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb) return sphb->ddw_enabled ? 
SPAPR_PCI_DMA_MAX_WINDOWS : 1; } +char *spapr_pci_fw_dev_name(PCIDevice *dev); + #endif /* PCI_HOST_SPAPR_H */ diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h index 11f8ab7149..bd0c17dc78 100644 --- a/include/hw/pci/pci_ids.h +++ b/include/hw/pci/pci_ids.h @@ -192,6 +192,9 @@ #define PCI_DEVICE_ID_SUN_SIMBA 0x5000 #define PCI_DEVICE_ID_SUN_SABRE 0xa000 +#define PCI_VENDOR_ID_ORACLE 0x108e +#define PCI_DEVICE_ID_REMOTE_IOHUB 0xb000 + #define PCI_VENDOR_ID_CMD 0x1095 #define PCI_DEVICE_ID_CMD_646 0x0646 diff --git a/include/hw/ppc/pef.h b/include/hw/ppc/pef.h new file mode 100644 index 0000000000..707dbe524c --- /dev/null +++ b/include/hw/ppc/pef.h @@ -0,0 +1,17 @@ +/* + * PEF (Protected Execution Facility) for POWER support + * + * Copyright Red Hat. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef HW_PPC_PEF_H +#define HW_PPC_PEF_H + +int pef_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +int pef_kvm_reset(ConfidentialGuestSupport *cgs, Error **errp); + +#endif /* HW_PPC_PEF_H */ diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h index ee7eda3e01..d69cee17b2 100644 --- a/include/hw/ppc/pnv.h +++ b/include/hw/ppc/pnv.h @@ -58,6 +58,7 @@ struct PnvChip { MemoryRegion xscom; AddressSpace xscom_as; + MemoryRegion *fw_mr; gchar *dt_isa_nodename; }; diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index c27c7ce515..ccbeeca1de 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -851,7 +851,6 @@ int spapr_max_server_number(SpaprMachineState *spapr); void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex, uint64_t pte0, uint64_t pte1); void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered); -bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr); /* DRC callbacks. */ void spapr_core_release(DeviceState *dev); diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h index b3fd950634..6f9f02d3de 100644 --- a/include/hw/ppc/spapr_numa.h +++ b/include/hw/ppc/spapr_numa.h @@ -31,5 +31,6 @@ int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, int offset, PowerPCCPU *cpu); int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, int offset); +unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine); #endif /* HW_SPAPR_NUMA_H */ diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h index 7879692825..b7fde2354e 100644 --- a/include/hw/ppc/xive_regs.h +++ b/include/hw/ppc/xive_regs.h @@ -236,6 +236,8 @@ typedef struct XiveEND { (be32_to_cpu((end)->w0) & END_W0_UNCOND_ESCALATE) #define xive_end_is_silent_escalation(end) \ (be32_to_cpu((end)->w0) & END_W0_SILENT_ESCALATE) +#define xive_end_is_firmware(end) \ + (be32_to_cpu((end)->w0) & END_W0_FIRMWARE) static inline uint64_t xive_end_qaddr(XiveEND *end) { diff --git a/include/hw/remote/iohub.h b/include/hw/remote/iohub.h new file mode 100644 index 0000000000..0bf98e0d78 --- /dev/null +++ b/include/hw/remote/iohub.h @@ -0,0 +1,42 @@ +/* + * IO Hub for remote device + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#ifndef REMOTE_IOHUB_H +#define REMOTE_IOHUB_H + +#include "hw/pci/pci.h" +#include "qemu/event_notifier.h" +#include "qemu/thread-posix.h" +#include "hw/remote/mpqemu-link.h" + +#define REMOTE_IOHUB_NB_PIRQS PCI_DEVFN_MAX + +typedef struct ResampleToken { + void *iohub; + int pirq; +} ResampleToken; + +typedef struct RemoteIOHubState { + PCIDevice d; + EventNotifier irqfds[REMOTE_IOHUB_NB_PIRQS]; + EventNotifier resamplefds[REMOTE_IOHUB_NB_PIRQS]; + unsigned int irq_level[REMOTE_IOHUB_NB_PIRQS]; + ResampleToken token[REMOTE_IOHUB_NB_PIRQS]; + QemuMutex irq_level_lock[REMOTE_IOHUB_NB_PIRQS]; +} RemoteIOHubState; + +int remote_iohub_map_irq(PCIDevice *pci_dev, int intx); +void remote_iohub_set_irq(void *opaque, int pirq, int level); +void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg); + +void remote_iohub_init(RemoteIOHubState *iohub); +void remote_iohub_finalize(RemoteIOHubState *iohub); + +#endif diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h new file mode 100644 index 0000000000..2a2a33c4b2 --- /dev/null +++ b/include/hw/remote/machine.h @@ -0,0 +1,38 @@ +/* + * Remote machine configuration + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef REMOTE_MACHINE_H +#define REMOTE_MACHINE_H + +#include "qom/object.h" +#include "hw/boards.h" +#include "hw/pci-host/remote.h" +#include "io/channel.h" +#include "hw/remote/iohub.h" + +struct RemoteMachineState { + MachineState parent_obj; + + RemotePCIHost *host; + RemoteIOHubState iohub; +}; + +/* Used to pass the device and ioc to the co-routine. */ +typedef struct RemoteCommDev { + PCIDevice *dev; + QIOChannel *ioc; +} RemoteCommDev; + +#define TYPE_REMOTE_MACHINE "x-remote-machine" +OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE) + +void coroutine_fn mpqemu_remote_msg_loop_co(void *data); + +#endif diff --git a/include/hw/remote/memory.h b/include/hw/remote/memory.h new file mode 100644 index 0000000000..bc2e30945f --- /dev/null +++ b/include/hw/remote/memory.h @@ -0,0 +1,19 @@ +/* + * Memory manager for remote device + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef REMOTE_MEMORY_H +#define REMOTE_MEMORY_H + +#include "exec/hwaddr.h" +#include "hw/remote/mpqemu-link.h" + +void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp); + +#endif diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h new file mode 100644 index 0000000000..4ec0915885 --- /dev/null +++ b/include/hw/remote/mpqemu-link.h @@ -0,0 +1,99 @@ +/* + * Communication channel between QEMU and remote device process + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef MPQEMU_LINK_H +#define MPQEMU_LINK_H + +#include "qom/object.h" +#include "qemu/thread.h" +#include "io/channel.h" +#include "exec/hwaddr.h" +#include "io/channel-socket.h" +#include "hw/remote/proxy.h" + +#define REMOTE_MAX_FDS 8 + +#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data.u64) + +/** + * MPQemuCmd: + * + * Enum type specifying the command to be executed on the remote + * device.
+ * + * This uses a private protocol between QEMU and the remote process. The + * vfio-user protocol is expected to supersede this in the future. + * + */ +typedef enum { + MPQEMU_CMD_SYNC_SYSMEM, + MPQEMU_CMD_RET, + MPQEMU_CMD_PCI_CFGWRITE, + MPQEMU_CMD_PCI_CFGREAD, + MPQEMU_CMD_BAR_WRITE, + MPQEMU_CMD_BAR_READ, + MPQEMU_CMD_SET_IRQFD, + MPQEMU_CMD_DEVICE_RESET, + MPQEMU_CMD_MAX, +} MPQemuCmd; + +typedef struct { + hwaddr gpas[REMOTE_MAX_FDS]; + uint64_t sizes[REMOTE_MAX_FDS]; + off_t offsets[REMOTE_MAX_FDS]; +} SyncSysmemMsg; + +typedef struct { + uint32_t addr; + uint32_t val; + int len; +} PciConfDataMsg; + +typedef struct { + hwaddr addr; + uint64_t val; + unsigned size; + bool memory; +} BarAccessMsg; + +/** + * MPQemuMsg: + * @cmd: The remote command + * @size: Size of the data to be shared + * @data: Structured data + * @fds: File descriptors to be shared with remote device + * @num_fds: Number of file descriptors in @fds + * + * MPQemuMsg is the format of the message sent to the remote device from QEMU. + * + */ + +typedef struct { + int cmd; + size_t size; + + union { + uint64_t u64; + PciConfDataMsg pci_conf_data; + SyncSysmemMsg sync_sysmem; + BarAccessMsg bar_access; + } data; + + int fds[REMOTE_MAX_FDS]; + int num_fds; +} MPQemuMsg; + +bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp); +bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp); + +uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev, + Error **errp); +bool mpqemu_msg_valid(MPQemuMsg *msg); + +#endif diff --git a/include/hw/remote/proxy-memory-listener.h b/include/hw/remote/proxy-memory-listener.h new file mode 100644 index 0000000000..c4f3efb928 --- /dev/null +++ b/include/hw/remote/proxy-memory-listener.h @@ -0,0 +1,28 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef PROXY_MEMORY_LISTENER_H +#define PROXY_MEMORY_LISTENER_H + +#include "exec/memory.h" +#include "io/channel.h" + +typedef struct ProxyMemoryListener { + MemoryListener listener; + + int n_mr_sections; + MemoryRegionSection *mr_sections; + + QIOChannel *ioc; +} ProxyMemoryListener; + +void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener, + QIOChannel *ioc); +void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener); + +#endif diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h new file mode 100644 index 0000000000..741def71f1 --- /dev/null +++ b/include/hw/remote/proxy.h @@ -0,0 +1,48 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef PROXY_H +#define PROXY_H + +#include "hw/pci/pci.h" +#include "io/channel.h" +#include "hw/remote/proxy-memory-listener.h" +#include "qemu/event_notifier.h" + +#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev" +OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV) + +typedef struct ProxyMemoryRegion { + PCIProxyDev *dev; + MemoryRegion mr; + bool memory; + bool present; + uint8_t type; +} ProxyMemoryRegion; + +struct PCIProxyDev { + PCIDevice parent_dev; + char *fd; + + /* + * Mutex used to protect the QIOChannel fd from + * concurrent access by the vCPUs, since the proxy + * blocks while awaiting replies from the + * remote process.
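Since MPQemuMsg is a fixed-layout struct, issuing a command amounts to filling in cmd, size and the matching data union member, then handing the message to the helpers declared above. A hedged sketch of how the proxy side might read a BAR register over this protocol (error handling trimmed; dev is assumed to be an already-connected PCIProxyDev):

#include "qemu/osdep.h"
#include "hw/remote/mpqemu-link.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

static uint64_t proxy_bar_read(PCIProxyDev *dev, hwaddr addr, unsigned size)
{
    MPQemuMsg msg = {
        .cmd = MPQEMU_CMD_BAR_READ,
        .size = sizeof(BarAccessMsg),
        .data.bar_access = {
            .addr = addr,
            .size = size,
            .memory = true,   /* MMIO rather than port I/O */
        },
    };
    Error *local_err = NULL;
    uint64_t val;

    /* Blocks until the remote process answers with MPQEMU_CMD_RET. */
    val = mpqemu_msg_send_and_await_reply(&msg, dev, &local_err);
    if (local_err) {
        error_report_err(local_err);
    }
    return val;
}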
+ */ + QemuMutex io_mutex; + QIOChannel *ioc; + Error *migration_blocker; + ProxyMemoryListener proxy_listener; + int virq; + EventNotifier intr; + EventNotifier resample; + ProxyMemoryRegion region[PCI_NUM_REGIONS]; +}; + +#endif /* PROXY_H */ diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h index aee758bc2d..1f1f545bfc 100644 --- a/include/hw/s390x/pv.h +++ b/include/hw/s390x/pv.h @@ -12,6 +12,9 @@ #ifndef HW_S390_PV_H #define HW_S390_PV_H +#include "qapi/error.h" +#include "sysemu/kvm.h" + #ifdef CONFIG_KVM #include "cpu.h" #include "hw/s390x/s390-virtio-ccw.h" @@ -55,4 +58,18 @@ static inline void s390_pv_unshare(void) {} static inline void s390_pv_inject_reset_error(CPUState *cs) {}; #endif /* CONFIG_KVM */ +int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +static inline int s390_pv_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + if (!cgs) { + return 0; + } + if (kvm_enabled()) { + return s390_pv_kvm_init(cgs, errp); + } + + error_setg(errp, "Protected Virtualization requires KVM"); + return -1; +} + #endif /* HW_S390_PV_H */ diff --git a/include/hw/ssi/pl022.h b/include/hw/ssi/pl022.h index 545b52689c..25d58db5f3 100644 --- a/include/hw/ssi/pl022.h +++ b/include/hw/ssi/pl022.h @@ -9,9 +9,10 @@ * (at your option) any later version. */ -/* This is a model of the Arm PrimeCell PL022 synchronous serial port. +/* + * This is a model of the Arm PrimeCell PL022 synchronous serial port. * The PL022 TRM is: - * http://infocenter.arm.com/help/topic/com.arm.doc.ddi0194h/DDI0194H_ssp_pl022_trm.pdf + * https://developer.arm.com/documentation/ddi0194/latest * * QEMU interface: * + sysbus IRQ: SSPINTR combined interrupt line diff --git a/include/io/channel.h b/include/io/channel.h index ab9ea77959..88988979f8 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -777,4 +777,82 @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc, IOHandler *io_write, void *opaque); +/** + * qio_channel_readv_full_all_eof: + * @ioc: the channel object + * @iov: the array of memory regions to read data to + * @niov: the length of the @iov array + * @fds: an array of file handles to read + * @nfds: number of file handles in @fds + * @errp: pointer to a NULL-initialized error object + * + * + * Performs the same function as qio_channel_readv_all_eof. + * Additionally, attempts to read file descriptors shared + * over the channel. The function will wait for all + * requested data to be read, yielding from the current + * coroutine if required. Here, data refers to both the file + * descriptors and the iovs. + * + * Returns: 1 if all bytes were read, 0 if end-of-file + * occurs without data, or -1 on error + */ + +int qio_channel_readv_full_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int **fds, size_t *nfds, + Error **errp); + +/** + * qio_channel_readv_full_all: + * @ioc: the channel object + * @iov: the array of memory regions to read data to + * @niov: the length of the @iov array + * @fds: an array of file handles to read + * @nfds: number of file handles in @fds + * @errp: pointer to a NULL-initialized error object + * + * + * Performs the same function as qio_channel_readv_all. + * Additionally, attempts to read file descriptors shared + * over the channel. The function will wait for all + * requested data to be read, yielding from the current + * coroutine if required. Here, data refers to both the file + * descriptors and the iovs.
+ * + * Returns: 0 if all bytes were read, or -1 on error + */ + +int qio_channel_readv_full_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int **fds, size_t *nfds, + Error **errp); + +/** + * qio_channel_writev_full_all: + * @ioc: the channel object + * @iov: the array of memory regions to write data from + * @niov: the length of the @iov array + * @fds: an array of file handles to send + * @nfds: number of file handles in @fds + * @errp: pointer to a NULL-initialized error object + * + * + * Behaves like qio_channel_writev_full but will attempt + * to send all data passed (file handles and memory regions). + * The function will wait for all requested data + * to be written, yielding from the current coroutine + * if required. + * + * Returns: 0 if all bytes were written, or -1 on error + */ + +int qio_channel_writev_full_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int *fds, size_t nfds, + Error **errp); + #endif /* QIO_CHANNEL_H */ diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h index c85b6ec75b..e72083b117 100644 --- a/include/migration/snapshot.h +++ b/include/migration/snapshot.h @@ -15,7 +15,50 @@ #ifndef QEMU_MIGRATION_SNAPSHOT_H #define QEMU_MIGRATION_SNAPSHOT_H -int save_snapshot(const char *name, Error **errp); -int load_snapshot(const char *name, Error **errp); +#include "qapi/qapi-builtin-types.h" + +/** + * save_snapshot: Save an internal snapshot. + * @name: name of internal snapshot + * @overwrite: replace existing snapshot with @name + * @vmstate: blockdev node name to store VM state in + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool save_snapshot(const char *name, bool overwrite, + const char *vmstate, + bool has_devices, strList *devices, + Error **errp); + +/** + * load_snapshot: Load an internal snapshot. + * @name: name of internal snapshot + * @vmstate: blockdev node name to load VM state from + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false. + */ +bool load_snapshot(const char *name, + const char *vmstate, + bool has_devices, strList *devices, + Error **errp); + +/** + * delete_snapshot: Delete an internal snapshot. + * @name: name of internal snapshot + * @has_devices: whether to use explicit device list + * @devices: explicit device list to snapshot + * @errp: pointer to error object + * On success, return %true. + * On failure, store an error through @errp and return %false.
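All three snapshot entry points now share one calling convention: pass has_devices=false and devices=NULL to operate on every eligible block device, or supply an explicit strList to restrict the set. A minimal sketch of a save/load round trip using the defaults (the snapshot tag is illustrative):

#include "qemu/osdep.h"
#include "migration/snapshot.h"

static bool checkpoint_roundtrip(Error **errp)
{
    /*
     * Overwrite any existing snapshot with this tag and let the core
     * choose the vmstate node and the device list.
     */
    if (!save_snapshot("checkpoint-0", true, NULL, false, NULL, errp)) {
        return false;
    }
    return load_snapshot("checkpoint-0", NULL, false, NULL, errp);
}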
+ */ +bool delete_snapshot(const char *name, + bool has_devices, strList *devices, + Error **errp); #endif diff --git a/include/qemu/event_notifier.h b/include/qemu/event_notifier.h index 3380b662f3..b79add035d 100644 --- a/include/qemu/event_notifier.h +++ b/include/qemu/event_notifier.h @@ -24,6 +24,7 @@ struct EventNotifier { #else int rfd; int wfd; + bool initialized; #endif }; diff --git a/include/qemu/fifo8.h b/include/qemu/fifo8.h index 489c354291..28bf2cee57 100644 --- a/include/qemu/fifo8.h +++ b/include/qemu/fifo8.h @@ -148,12 +148,16 @@ uint32_t fifo8_num_used(Fifo8 *fifo); extern const VMStateDescription vmstate_fifo8; -#define VMSTATE_FIFO8(_field, _state) { \ - .name = (stringify(_field)), \ - .size = sizeof(Fifo8), \ - .vmsd = &vmstate_fifo8, \ - .flags = VMS_STRUCT, \ - .offset = vmstate_offset_value(_state, _field, Fifo8), \ +#define VMSTATE_FIFO8_TEST(_field, _state, _test) { \ + .name = (stringify(_field)), \ + .field_exists = (_test), \ + .size = sizeof(Fifo8), \ + .vmsd = &vmstate_fifo8, \ + .flags = VMS_STRUCT, \ + .offset = vmstate_offset_value(_state, _field, Fifo8), \ } +#define VMSTATE_FIFO8(_field, _state) \ + VMSTATE_FIFO8_TEST(_field, _state, NULL) + #endif /* QEMU_FIFO8_H */ diff --git a/include/qemu/job.h b/include/qemu/job.h index 32aabb1c60..efc6fa7544 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -251,6 +251,11 @@ struct JobDriver { */ void (*clean)(Job *job); + /** + * If the callback is not NULL, it will be invoked in job_cancel_async + */ + void (*cancel)(Job *job); + /** Called when the job is freed */ void (*free)(Job *job); diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h index 8b7a5c70f3..456ff87df1 100644 --- a/include/qemu/mmap-alloc.h +++ b/include/qemu/mmap-alloc.h @@ -17,6 +17,7 @@ size_t qemu_mempath_getpagesize(const char *mem_path); * @readonly: true for a read-only mapping, false for read/write. * @shared: map has RAM_SHARED flag. * @is_pmem: map has RAM_PMEM flag. + * @map_offset: map starts at offset of map_offset from the start of fd * * Return: * On success, return a pointer to the mapped area. @@ -27,7 +28,8 @@ void *qemu_ram_mmap(int fd, size_t align, bool readonly, bool shared, - bool is_pmem); + bool is_pmem, + off_t map_offset); void qemu_ram_munmap(int fd, void *ptr, size_t size); diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 68deb74ef6..dc39b05c30 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -37,6 +37,7 @@ typedef struct Chardev Chardev; typedef struct Clock Clock; typedef struct CompatProperty CompatProperty; typedef struct CoMutex CoMutex; +typedef struct ConfidentialGuestSupport ConfidentialGuestSupport; typedef struct CPUAddressSpace CPUAddressSpace; typedef struct CPUState CPUState; typedef struct DeviceListener DeviceListener; diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h new file mode 100644 index 0000000000..6b74f92792 --- /dev/null +++ b/include/qemu/userfaultfd.h @@ -0,0 +1,35 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
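VMSTATE_FIFO8_TEST above follows the usual VMSTATE_*_TEST convention: the third argument is a field_exists predicate, so a FIFO is migrated only when the device configuration says it is present. A hedged sketch with an invented device (only the macro usage is the point):

#include "qemu/osdep.h"
#include "qemu/fifo8.h"
#include "migration/vmstate.h"

typedef struct DemoState {
    uint32_t revision;
    Fifo8 rx_fifo;
} DemoState;

static bool demo_rx_fifo_needed(void *opaque, int version_id)
{
    DemoState *s = opaque;
    return s->revision >= 2;   /* the FIFO only exists on rev2 hardware */
}

static const VMStateDescription vmstate_demo = {
    .name = "demo",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(revision, DemoState),
        VMSTATE_FIFO8_TEST(rx_fifo, DemoState, demo_rx_fifo_needed),
        VMSTATE_END_OF_LIST()
    }
};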
+ */ + +#ifndef USERFAULTFD_H +#define USERFAULTFD_H + +#include "qemu/osdep.h" +#include "exec/hwaddr.h" +#include <linux/userfaultfd.h> + +int uffd_query_features(uint64_t *features); +int uffd_create_fd(uint64_t features, bool non_blocking); +void uffd_close_fd(int uffd_fd); +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls); +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length); +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake); +int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, + uint64_t length, bool dont_wake); +int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake); +int uffd_wakeup(int uffd_fd, void *addr, uint64_t length); +int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count); +bool uffd_poll_events(int uffd_fd, int tmo); + +#endif /* USERFAULTFD_H */ diff --git a/include/qom/object.h b/include/qom/object.h index d378f13a11..6721cd312e 100644 --- a/include/qom/object.h +++ b/include/qom/object.h @@ -638,7 +638,8 @@ bool object_apply_global_props(Object *obj, const GPtrArray *props, Error **errp); void object_set_machine_compat_props(GPtrArray *compat_props); void object_set_accelerator_compat_props(GPtrArray *compat_props); -void object_register_sugar_prop(const char *driver, const char *prop, const char *value); +void object_register_sugar_prop(const char *driver, const char *prop, + const char *value, bool optional); void object_apply_compat_props(Object *obj); /** diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h index 0c5284dbbc..f177142f16 100644 --- a/include/sysemu/iothread.h +++ b/include/sysemu/iothread.h @@ -57,4 +57,10 @@ IOThread *iothread_create(const char *id, Error **errp); void iothread_stop(IOThread *iothread); void iothread_destroy(IOThread *iothread); +/* + * Returns true if executing within an IOThread context, + * false otherwise.
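The uffd_* wrappers above are thin helpers around the Linux userfaultfd ioctls and back the new write-tracking migration mode. A hedged sketch of arming write-protection on a RAM range (error paths abbreviated; the UFFD constants come from <linux/userfaultfd.h>):

#include "qemu/userfaultfd.h"

static int wp_arm_range(void *addr, uint64_t length)
{
    uint64_t ioctls = 0;
    int uffd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);

    if (uffd < 0) {
        return -1;
    }
    /* Register the range for write-protect faults... */
    if (uffd_register_memory(uffd, addr, length,
                             UFFDIO_REGISTER_MODE_WP, &ioctls)) {
        uffd_close_fd(uffd);
        return -1;
    }
    /* ...then switch protection on; faults arrive via uffd_read_events(). */
    if (uffd_change_protection(uffd, addr, length, true, false)) {
        uffd_close_fd(uffd);
        return -1;
    }
    return uffd;
}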
+ */ +bool qemu_in_iothread(void); + #endif /* IOTHREAD_H */ diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 739682f3c3..c5546bdecc 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -233,22 +233,6 @@ int kvm_has_intx_set_mask(void); */ bool kvm_arm_supports_user_irq(void); -/** - * kvm_memcrypt_enabled - return boolean indicating whether memory encryption - * is enabled - * Returns: 1 memory encryption is enabled - * 0 memory encryption is disabled - */ -bool kvm_memcrypt_enabled(void); - -/** - * kvm_memcrypt_encrypt_data: encrypt the memory range - * - * Return: 1 failed to encrypt the range - * 0 succesfully encrypted memory region - */ -int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len); - #ifdef NEED_CPU_H #include "cpu.h" diff --git a/include/sysemu/sev.h b/include/sysemu/sev.h index 7ab6e3e31d..5c5a13c6ca 100644 --- a/include/sysemu/sev.h +++ b/include/sysemu/sev.h @@ -16,8 +16,8 @@ #include "sysemu/kvm.h" -void *sev_guest_init(const char *id); -int sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len); +int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); #endif diff --git a/io/channel.c b/io/channel.c index 93d449dee2..e8b019dc36 100644 --- a/io/channel.c +++ b/io/channel.c @@ -92,19 +92,47 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, size_t niov, Error **errp) { + return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp); +} + +int qio_channel_readv_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + Error **errp) +{ + return qio_channel_readv_full_all(ioc, iov, niov, NULL, NULL, errp); +} + +int qio_channel_readv_full_all_eof(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int **fds, size_t *nfds, + Error **errp) +{ int ret = -1; struct iovec *local_iov = g_new(struct iovec, niov); struct iovec *local_iov_head = local_iov; unsigned int nlocal_iov = niov; + int **local_fds = fds; + size_t *local_nfds = nfds; bool partial = false; + if (nfds) { + *nfds = 0; + } + + if (fds) { + *fds = NULL; + } + nlocal_iov = iov_copy(local_iov, nlocal_iov, iov, niov, 0, iov_size(iov, niov)); - while (nlocal_iov > 0) { + while ((nlocal_iov > 0) || local_fds) { ssize_t len; - len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp); + len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds, + local_nfds, errp); if (len == QIO_CHANNEL_ERR_BLOCK) { if (qemu_in_coroutine()) { qio_channel_yield(ioc, G_IO_IN); @@ -112,20 +140,50 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, qio_channel_wait(ioc, G_IO_IN); } continue; - } else if (len < 0) { - goto cleanup; - } else if (len == 0) { - if (partial) { - error_setg(errp, - "Unexpected end-of-file before all bytes were read"); - } else { + } + + if (len == 0) { + if (local_nfds && *local_nfds) { + /* + * Got some FDs, but no data yet. 
This isn't an EOF + * scenario (yet), so carry on to try to read data + * on next loop iteration + */ + goto next_iter; + } else if (!partial) { + /* No fds and no data - EOF before any data read */ ret = 0; + goto cleanup; + } else { + len = -1; + error_setg(errp, + "Unexpected end-of-file before all data were read"); + /* Fallthrough into len < 0 handling */ + } + } + + if (len < 0) { + /* Close any FDs we previously received */ + if (nfds && fds) { + size_t i; + for (i = 0; i < (*nfds); i++) { + close((*fds)[i]); + } + g_free(*fds); + *fds = NULL; + *nfds = 0; } goto cleanup; } + if (nlocal_iov) { + iov_discard_front(&local_iov, &nlocal_iov, len); + } + +next_iter: partial = true; - iov_discard_front(&local_iov, &nlocal_iov, len); + local_fds = NULL; + local_nfds = NULL; } ret = 1; @@ -135,20 +193,22 @@ int qio_channel_readv_all_eof(QIOChannel *ioc, return ret; } -int qio_channel_readv_all(QIOChannel *ioc, - const struct iovec *iov, - size_t niov, - Error **errp) +int qio_channel_readv_full_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int **fds, size_t *nfds, + Error **errp) { - int ret = qio_channel_readv_all_eof(ioc, iov, niov, errp); + int ret = qio_channel_readv_full_all_eof(ioc, iov, niov, fds, nfds, errp); if (ret == 0) { - ret = -1; - error_setg(errp, - "Unexpected end-of-file before all bytes were read"); - } else if (ret == 1) { - ret = 0; + error_setg(errp, "Unexpected end-of-file before all data were read"); + return -1; + } + if (ret == 1) { + return 0; } + return ret; } @@ -157,6 +217,15 @@ int qio_channel_writev_all(QIOChannel *ioc, size_t niov, Error **errp) { + return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp); +} + +int qio_channel_writev_full_all(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + int *fds, size_t nfds, + Error **errp) +{ int ret = -1; struct iovec *local_iov = g_new(struct iovec, niov); struct iovec *local_iov_head = local_iov; @@ -168,7 +237,8 @@ int qio_channel_writev_all(QIOChannel *ioc, while (nlocal_iov > 0) { ssize_t len; - len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp); + len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds, + errp); if (len == QIO_CHANNEL_ERR_BLOCK) { if (qemu_in_coroutine()) { qio_channel_yield(ioc, G_IO_OUT); @@ -182,6 +252,9 @@ int qio_channel_writev_all(QIOChannel *ioc, } iov_discard_front(&local_iov, &nlocal_iov, len); + + fds = NULL; + nfds = 0; } ret = 0; diff --git a/iothread.c b/iothread.c index b9f2751382..7f086387be 100644 --- a/iothread.c +++ b/iothread.c @@ -369,3 +369,9 @@ IOThread *iothread_by_id(const char *id) { return IOTHREAD(object_resolve_path_type(id, TYPE_IOTHREAD, NULL)); } + +bool qemu_in_iothread(void) +{ + return qemu_get_current_aio_context() == qemu_get_aio_context() ? + false : true; +} @@ -715,6 +715,9 @@ static int job_finalize_single(Job *job) static void job_cancel_async(Job *job, bool force) { + if (job->driver->cancel) { + job->driver->cancel(job); + } if (job->user_paused) { /* Do not call job_enter here, the caller will handle it. 
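The new JobDriver.cancel hook invoked here pairs naturally with the bdrv_cancel_in_flight() callback added to the block layer earlier in this series: on cancel, a job can tell its target node to stop waiting on in-flight requests (for instance an NBD reconnect timeout) so that job_cancel_async() returns promptly. A hedged sketch of a block-job driver wiring the two together (the job struct is illustrative):

#include "qemu/osdep.h"
#include "block/block.h"
#include "qemu/job.h"

typedef struct DemoJob {
    Job common;
    BlockDriverState *target_bs;   /* illustrative field */
} DemoJob;

static void demo_job_cancel(Job *job)
{
    DemoJob *s = container_of(job, DemoJob, common);

    /* Abandon in-flight requests on the target instead of waiting. */
    bdrv_cancel_in_flight(s->target_bs);
}

static const JobDriver demo_job_driver = {
    .instance_size = sizeof(DemoJob),
    .cancel        = demo_job_cancel,
    /* .run, .free and friends omitted from this sketch */
};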
diff --git a/job.c b/job.c --- a/job.c +++ b/job.c @@ -715,6 +715,9 @@ static int job_finalize_single(Job *job) static void job_cancel_async(Job *job, bool force) { + if (job->driver->cancel) { + job->driver->cancel(job); + } if (job->user_paused) { /* Do not call job_enter here, the caller will handle it. */ if (job->driver->user_resume) { diff --git a/memory_ldst.c.inc b/memory_ldst.c.inc index 2fed2de18e..b56e961967 100644 --- a/memory_ldst.c.inc +++ b/memory_ldst.c.inc @@ -42,7 +42,7 @@ static inline uint32_t glue(address_space_ldl_internal, SUFFIX)(ARG1_DECL, MO_32 | devend_memop(endian), attrs); } else { /* RAM case */ - fuzz_dma_read_cb(addr, 4, mr, false); + fuzz_dma_read_cb(addr, 4, mr); ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: @@ -111,7 +111,7 @@ static inline uint64_t glue(address_space_ldq_internal, SUFFIX)(ARG1_DECL, MO_64 | devend_memop(endian), attrs); } else { /* RAM case */ - fuzz_dma_read_cb(addr, 8, mr, false); + fuzz_dma_read_cb(addr, 8, mr); ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: @@ -177,7 +177,7 @@ uint32_t glue(address_space_ldub, SUFFIX)(ARG1_DECL, r = memory_region_dispatch_read(mr, addr1, &val, MO_8, attrs); } else { /* RAM case */ - fuzz_dma_read_cb(addr, 1, mr, false); + fuzz_dma_read_cb(addr, 1, mr); ptr = qemu_map_ram_ptr(mr->ram_block, addr1); val = ldub_p(ptr); r = MEMTX_OK; @@ -215,7 +215,7 @@ static inline uint32_t glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL, MO_16 | devend_memop(endian), attrs); } else { /* RAM case */ - fuzz_dma_read_cb(addr, 2, mr, false); + fuzz_dma_read_cb(addr, 2, mr); ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: diff --git a/meson.build b/meson.build index 2d8b433ff0..a923f249d8 100644 --- a/meson.build +++ b/meson.build @@ -18,6 +18,9 @@ config_host = keyval.load(meson.current_build_dir() / 'config-host.mak') enable_modules = 'CONFIG_MODULES' in config_host enable_static = 'CONFIG_STATIC' in config_host +# Allow both shared and static libraries unless --enable-static +static_kwargs = enable_static ? {'static': true} : {} + # Temporary directory used for files created while # configure runs. Since it is in the build directory # we can safely blow away any previous version of it @@ -224,10 +227,17 @@ tcg_arch = config_host['ARCH'] if not get_option('tcg').disabled() if cpu not in supported_cpus if get_option('tcg_interpreter') - warning('Unsupported CPU @0@, will use TCG with TCI (experimental)'.format(cpu)) + warning('Unsupported CPU @0@, will use TCG with TCI (experimental and slow)'.format(cpu)) else error('Unsupported CPU @0@, try --enable-tcg-interpreter'.format(cpu)) endif + elif get_option('tcg_interpreter') + warning('Use of the TCG interpreter is not recommended on this host') + warning('architecture. 
There is a native TCG execution backend available') + warning('which provides substantially better performance and reliability.') + warning('It is strongly recommended to remove the --enable-tcg-interpreter') + warning('configuration option on this architecture to use the native') + warning('backend.') endif if get_option('tcg_interpreter') tcg_arch = 'tci' @@ -311,14 +321,14 @@ endif pixman = not_found if have_system or have_tools pixman = dependency('pixman-1', required: have_system, version:'>=0.21.8', - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) endif pam = not_found if 'CONFIG_AUTH_PAM' in config_host pam = cc.find_library('pam') endif libaio = cc.find_library('aio', required: false) -zlib = dependency('zlib', required: true, static: enable_static) +zlib = dependency('zlib', required: true, kwargs: static_kwargs) linux_io_uring = not_found if 'CONFIG_LINUX_IO_URING' in config_host linux_io_uring = declare_dependency(compile_args: config_host['LINUX_IO_URING_CFLAGS'].split(), @@ -333,7 +343,7 @@ libnfs = not_found if not get_option('libnfs').auto() or have_block libnfs = dependency('libnfs', version: '>=1.9.3', required: get_option('libnfs'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) endif libattr_test = ''' @@ -354,7 +364,7 @@ if not get_option('attr').disabled() else libattr = cc.find_library('attr', has_headers: ['attr/xattr.h'], required: get_option('attr'), - static: enable_static) + kwargs: static_kwargs) if libattr.found() and not \ cc.links(libattr_test, dependencies: libattr, args: '-DCONFIG_LIBATTR') libattr = not_found @@ -381,14 +391,14 @@ seccomp = not_found if not get_option('seccomp').auto() or have_system or have_tools seccomp = dependency('libseccomp', version: '>=2.3.0', required: get_option('seccomp'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) endif libcap_ng = not_found if not get_option('cap_ng').auto() or have_system or have_tools libcap_ng = cc.find_library('cap-ng', has_headers: ['cap-ng.h'], required: get_option('cap_ng'), - static: enable_static) + kwargs: static_kwargs) endif if libcap_ng.found() and not cc.links(''' #include <cap-ng.h> @@ -409,7 +419,7 @@ if get_option('xkbcommon').auto() and not have_system and not have_tools xkbcommon = not_found else xkbcommon = dependency('xkbcommon', required: get_option('xkbcommon'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) endif vde = not_found if config_host.has_key('CONFIG_VDE') @@ -445,13 +455,13 @@ libiscsi = not_found if not get_option('libiscsi').auto() or have_block libiscsi = dependency('libiscsi', version: '>=1.9.0', required: get_option('libiscsi'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) endif zstd = not_found if not get_option('zstd').auto() or have_block zstd = dependency('libzstd', version: '>=1.4.0', required: get_option('zstd'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) endif gbm = not_found if 'CONFIG_GBM' in config_host @@ -468,14 +478,14 @@ if not get_option('curl').auto() or have_block curl = dependency('libcurl', version: '>=7.29.0', method: 'pkg-config', required: get_option('curl'), - static: enable_static) + kwargs: static_kwargs) endif libudev = not_found if targetos == 'linux' and (have_system or have_tools) libudev = dependency('libudev', method: 'pkg-config', required: 
get_option('libudev'), - static: enable_static) + kwargs: static_kwargs) endif mpathlibs = [libudev] @@ -511,17 +521,17 @@ if targetos == 'linux' and have_tools and not get_option('mpath').disabled() }''' libmpathpersist = cc.find_library('mpathpersist', required: get_option('mpath'), - static: enable_static) + kwargs: static_kwargs) if libmpathpersist.found() mpathlibs += libmpathpersist if enable_static mpathlibs += cc.find_library('devmapper', required: get_option('mpath'), - static: enable_static) + kwargs: static_kwargs) endif mpathlibs += cc.find_library('multipath', required: get_option('mpath'), - static: enable_static) + kwargs: static_kwargs) foreach lib: mpathlibs if not lib.found() mpathlibs = [] @@ -571,7 +581,7 @@ if have_system and not get_option('curses').disabled() curses = dependency(curses_dep, required: false, method: 'pkg-config', - static: enable_static) + kwargs: static_kwargs) endif endforeach msg = get_option('curses').enabled() ? 'curses library not found' : '' @@ -596,7 +606,7 @@ if have_system and not get_option('curses').disabled() foreach curses_libname : curses_libname_list libcurses = cc.find_library(curses_libname, required: false, - static: enable_static) + kwargs: static_kwargs) if libcurses.found() if cc.links(curses_test, args: curses_compile_args, dependencies: libcurses) curses = declare_dependency(compile_args: curses_compile_args, @@ -647,7 +657,7 @@ brlapi = not_found if not get_option('brlapi').auto() or have_system brlapi = cc.find_library('brlapi', has_headers: ['brlapi.h'], required: get_option('brlapi'), - static: enable_static) + kwargs: static_kwargs) if brlapi.found() and not cc.links(''' #include <brlapi.h> #include <stddef.h> @@ -663,7 +673,7 @@ endif sdl = not_found if not get_option('sdl').auto() or (have_system and not cocoa.found()) - sdl = dependency('sdl2', required: get_option('sdl'), static: enable_static) + sdl = dependency('sdl2', required: get_option('sdl'), kwargs: static_kwargs) sdl_image = not_found endif if sdl.found() @@ -671,7 +681,7 @@ if sdl.found() sdl = declare_dependency(compile_args: '-Wno-undef', dependencies: sdl) sdl_image = dependency('SDL2_image', required: get_option('sdl_image'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) else if get_option('sdl_image').enabled() error('sdl-image required, but SDL was @0@'.format( @@ -683,19 +693,25 @@ endif rbd = not_found if not get_option('rbd').auto() or have_block librados = cc.find_library('rados', required: get_option('rbd'), - static: enable_static) + kwargs: static_kwargs) librbd = cc.find_library('rbd', has_headers: ['rbd/librbd.h'], required: get_option('rbd'), - static: enable_static) - if librados.found() and librbd.found() and cc.links(''' - #include <stdio.h> - #include <rbd/librbd.h> - int main(void) { - rados_t cluster; - rados_create(&cluster, NULL); - return 0; - }''', dependencies: [librbd, librados]) - rbd = declare_dependency(dependencies: [librbd, librados]) + kwargs: static_kwargs) + if librados.found() and librbd.found() + if cc.links(''' + #include <stdio.h> + #include <rbd/librbd.h> + int main(void) { + rados_t cluster; + rados_create(&cluster, NULL); + return 0; + }''', dependencies: [librbd, librados]) + rbd = declare_dependency(dependencies: [librbd, librados]) + elif get_option('rbd').enabled() + error('could not link librados') + else + warning('could not link librados, disabling') + endif endif endif @@ -705,7 +721,7 @@ glusterfs_iocb_has_stat = false if not get_option('glusterfs').auto() or 
have_block glusterfs = dependency('glusterfs-api', version: '>=3', required: get_option('glusterfs'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) if glusterfs.found() glusterfs_ftruncate_has_stat = cc.links(''' #include <glusterfs/api/glfs.h> @@ -744,7 +760,7 @@ libbzip2 = not_found if not get_option('bzip2').auto() or have_block libbzip2 = cc.find_library('bz2', has_headers: ['bzlib.h'], required: get_option('bzip2'), - static: enable_static) + kwargs: static_kwargs) if libbzip2.found() and not cc.links(''' #include <bzlib.h> int main(void) { BZ2_bzlibVersion(); return 0; }''', dependencies: libbzip2) @@ -761,7 +777,7 @@ liblzfse = not_found if not get_option('lzfse').auto() or have_block liblzfse = cc.find_library('lzfse', has_headers: ['lzfse.h'], required: get_option('lzfse'), - static: enable_static) + kwargs: static_kwargs) endif if liblzfse.found() and not cc.links(''' #include <lzfse.h> @@ -798,12 +814,12 @@ if not get_option('gtk').auto() or (have_system and not cocoa.found()) gtk = dependency('gtk+-3.0', version: '>=3.22.0', method: 'pkg-config', required: get_option('gtk'), - static: enable_static) + kwargs: static_kwargs) if gtk.found() gtkx11 = dependency('gtk+-x11-3.0', version: '>=3.22.0', method: 'pkg-config', required: false, - static: enable_static) + kwargs: static_kwargs) gtk = declare_dependency(dependencies: [gtk, gtkx11]) endif endif @@ -816,7 +832,7 @@ endif x11 = not_found if gtkx11.found() or 'lm32-softmmu' in target_dirs x11 = dependency('x11', method: 'pkg-config', required: gtkx11.found(), - static: enable_static) + kwargs: static_kwargs) endif vnc = not_found png = not_found @@ -825,12 +841,12 @@ sasl = not_found if get_option('vnc').enabled() vnc = declare_dependency() # dummy dependency png = dependency('libpng', required: get_option('vnc_png'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) jpeg = dependency('libjpeg', required: get_option('vnc_jpeg'), - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) sasl = cc.find_library('sasl2', has_headers: ['sasl/sasl.h'], required: get_option('vnc_sasl'), - static: enable_static) + kwargs: static_kwargs) if sasl.found() sasl = declare_dependency(dependencies: sasl, compile_args: '-DSTRUCT_IOVEC_DEFINED') @@ -841,7 +857,7 @@ snappy = not_found if not get_option('snappy').auto() or have_system snappy = cc.find_library('snappy', has_headers: ['snappy-c.h'], required: get_option('snappy'), - static: enable_static) + kwargs: static_kwargs) endif if snappy.found() and not cc.links(''' #include <snappy-c.h> @@ -858,7 +874,7 @@ lzo = not_found if not get_option('lzo').auto() or have_system lzo = cc.find_library('lzo2', has_headers: ['lzo/lzo1x.h'], required: get_option('lzo'), - static: enable_static) + kwargs: static_kwargs) endif if lzo.found() and not cc.links(''' #include <lzo/lzo1x.h> @@ -893,7 +909,7 @@ u2f = not_found if have_system u2f = dependency('u2f-emu', required: get_option('u2f'), method: 'pkg-config', - static: enable_static) + kwargs: static_kwargs) endif usbredir = not_found if 'CONFIG_USB_REDIR' in config_host @@ -920,7 +936,7 @@ if 'CONFIG_TASN1' in config_host link_args: config_host['TASN1_LIBS'].split()) endif keyutils = dependency('libkeyutils', required: false, - method: 'pkg-config', static: enable_static) + method: 'pkg-config', kwargs: static_kwargs) has_gettid = cc.has_function('gettid') @@ -979,7 +995,7 @@ endif fuse = dependency('fuse3', required: 
get_option('fuse'), version: '>=3.1', method: 'pkg-config', - static: enable_static) + kwargs: static_kwargs) fuse_lseek = not_found if not get_option('fuse_lseek').disabled() @@ -1210,7 +1226,8 @@ host_kconfig = \ ('CONFIG_VHOST_KERNEL' in config_host ? ['CONFIG_VHOST_KERNEL=y'] : []) + \ (have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \ ('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \ - ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + \ + ('CONFIG_MULTIPROCESS_ALLOWED' in config_host ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ] @@ -1367,7 +1384,7 @@ capstone_opt = get_option('capstone') if capstone_opt in ['enabled', 'auto', 'system'] have_internal = fs.exists(meson.current_source_dir() / 'capstone/Makefile') capstone = dependency('capstone', version: '>=4.0', - static: enable_static, method: 'pkg-config', + kwargs: static_kwargs, method: 'pkg-config', required: capstone_opt == 'system' or capstone_opt == 'enabled' and not have_internal) if capstone.found() @@ -1477,7 +1494,7 @@ if have_system slirp_opt = get_option('slirp') if slirp_opt in ['enabled', 'auto', 'system'] have_internal = fs.exists(meson.current_source_dir() / 'slirp/meson.build') - slirp = dependency('slirp', static: enable_static, + slirp = dependency('slirp', kwargs: static_kwargs, method: 'pkg-config', required: slirp_opt == 'system' or slirp_opt == 'enabled' and not have_internal) @@ -1556,7 +1573,7 @@ fdt_opt = get_option('fdt') if have_system if fdt_opt in ['enabled', 'auto', 'system'] have_internal = fs.exists(meson.current_source_dir() / 'dtc/libfdt/Makefile.libfdt') - fdt = cc.find_library('fdt', static: enable_static, + fdt = cc.find_library('fdt', kwargs: static_kwargs, required: fdt_opt == 'system' or fdt_opt == 'enabled' and not have_internal) if fdt.found() and cc.links(''' @@ -1725,10 +1742,11 @@ target_softmmu_arch = {} # TODO: add each directory to the subdirs from its own meson.build, once # we have those trace_events_subdirs = [ - 'accel/kvm', - 'accel/tcg', 'crypto', + 'qapi', + 'qom', 'monitor', + 'util', ] if have_user trace_events_subdirs += [ 'linux-user' ] @@ -1744,6 +1762,7 @@ if have_block endif if have_system trace_events_subdirs += [ + 'accel/kvm', 'audio', 'backends', 'backends/tpm', @@ -1799,23 +1818,24 @@ if have_system 'net', 'softmmu', 'ui', + 'hw/remote', + ] +endif +if have_system or have_user + trace_events_subdirs += [ + 'accel/tcg', + 'hw/core', + 'target/arm', + 'target/hppa', + 'target/i386', + 'target/i386/kvm', + 'target/mips', + 'target/ppc', + 'target/riscv', + 'target/s390x', + 'target/sparc', ] endif -trace_events_subdirs += [ - 'hw/core', - 'qapi', - 'qom', - 'target/arm', - 'target/hppa', - 'target/i386', - 'target/i386/kvm', - 'target/mips', - 'target/ppc', - 'target/riscv', - 'target/s390x', - 'target/sparc', - 'util', -] vhost_user = not_found if 'CONFIG_VHOST_USER' in config_host @@ -1849,41 +1869,45 @@ libqemuutil = static_library('qemuutil', qemuutil = declare_dependency(link_with: libqemuutil, sources: genh + version_res) -decodetree = generator(find_program('scripts/decodetree.py'), - output: 'decode-@BASENAME@.c.inc', - arguments: ['@INPUT@', '@EXTRA_ARGS@', '-o', '@OUTPUT@']) +if have_system or have_user + decodetree = generator(find_program('scripts/decodetree.py'), + output: 'decode-@BASENAME@.c.inc', + arguments: ['@INPUT@', '@EXTRA_ARGS@', '-o', '@OUTPUT@']) + subdir('libdecnumber') + subdir('target') +endif 
subdir('audio') subdir('io') subdir('chardev') subdir('fsdev') -subdir('libdecnumber') -subdir('target') subdir('dump') -block_ss.add(files( - 'block.c', - 'blockjob.c', - 'job.c', - 'qemu-io-cmds.c', -)) -block_ss.add(when: 'CONFIG_REPLICATION', if_true: files('replication.c')) - -subdir('nbd') -subdir('scsi') -subdir('block') - -blockdev_ss.add(files( - 'blockdev.c', - 'blockdev-nbd.c', - 'iothread.c', - 'job-qmp.c', -), gnutls) - -# os-posix.c contains POSIX-specific functions used by qemu-storage-daemon, -# os-win32.c does not -blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c')) -softmmu_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')]) +if have_block + block_ss.add(files( + 'block.c', + 'blockjob.c', + 'job.c', + 'qemu-io-cmds.c', + )) + block_ss.add(when: 'CONFIG_REPLICATION', if_true: files('replication.c')) + + subdir('nbd') + subdir('scsi') + subdir('block') + + blockdev_ss.add(files( + 'blockdev.c', + 'blockdev-nbd.c', + 'iothread.c', + 'job-qmp.c', + ), gnutls) + + # os-posix.c contains POSIX-specific functions used by qemu-storage-daemon, + # os-win32.c does not + blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c')) + softmmu_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')]) +endif common_ss.add(files('cpus-common.c')) @@ -2500,8 +2524,12 @@ if have_system endif summary_info += {'TCG support': config_all.has_key('CONFIG_TCG')} if config_all.has_key('CONFIG_TCG') + if get_option('tcg_interpreter') + summary_info += {'TCG backend': 'TCI (TCG with bytecode interpreter, experimental and slow)'} + else + summary_info += {'TCG backend': 'native (@0@)'.format(cpu)} + endif summary_info += {'TCG debug enabled': config_host.has_key('CONFIG_DEBUG_TCG')} - summary_info += {'TCG interpreter': tcg_arch == 'tci'} endif summary_info += {'target list': ' '.join(target_dirs)} if have_system @@ -2626,6 +2654,7 @@ summary_info += {'libpmem support': config_host.has_key('CONFIG_LIBPMEM')} summary_info += {'libdaxctl support': config_host.has_key('CONFIG_LIBDAXCTL')} summary_info += {'libudev': libudev.found()} summary_info += {'FUSE lseek': fuse_lseek.found()} +summary_info += {'Multiprocess QEMU': config_host.has_key('CONFIG_MULTIPROCESS_ALLOWED')} summary(summary_info, bool_yn: true, section: 'Dependencies') if not supported_cpus.contains(cpu) diff --git a/meson_options.txt b/meson_options.txt index 95f1079829..675a9c500a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -40,7 +40,7 @@ option('xen_pci_passthrough', type: 'feature', value: 'auto', option('tcg', type: 'feature', value: 'auto', description: 'TCG support') option('tcg_interpreter', type: 'boolean', value: false, - description: 'TCG bytecode interpreter (TCI)') + description: 'TCG with bytecode interpreter (experimental and slow)') option('cfi', type: 'boolean', value: 'false', description: 'Control-Flow Integrity (CFI)') option('cfi_debug', type: 'boolean', value: 'false', diff --git a/migration/migration.c b/migration/migration.c index 1986cb8573..a5ddf43559 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -58,6 +58,7 @@ #include "qemu/queue.h" #include "multifd.h" #include "qemu/yank.h" +#include "sysemu/cpus.h" #ifdef CONFIG_VFIO #include "hw/vfio/vfio-common.h" @@ -134,6 +135,38 @@ enum mig_rp_message_type { MIG_RP_MSG_MAX }; +/* Migration capabilities set */ +struct MigrateCapsSet { + int size; /* Capability set size */ + MigrationCapability caps[]; /* Variadic array of capabilities */ +}; +typedef struct MigrateCapsSet MigrateCapsSet; + +/* Define 
and initialize MigrateCapsSet */ +#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) \ + MigrateCapsSet _name = { \ + .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \ + .caps = { __VA_ARGS__ } \ + } + +/* Background-snapshot compatibility check list */ +static const +INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, + MIGRATION_CAPABILITY_POSTCOPY_RAM, + MIGRATION_CAPABILITY_DIRTY_BITMAPS, + MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME, + MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE, + MIGRATION_CAPABILITY_RETURN_PATH, + MIGRATION_CAPABILITY_MULTIFD, + MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER, + MIGRATION_CAPABILITY_AUTO_CONVERGE, + MIGRATION_CAPABILITY_RELEASE_RAM, + MIGRATION_CAPABILITY_RDMA_PIN_ALL, + MIGRATION_CAPABILITY_COMPRESS, + MIGRATION_CAPABILITY_XBZRLE, + MIGRATION_CAPABILITY_X_COLO, + MIGRATION_CAPABILITY_VALIDATE_UUID); +
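The sizeof-over-compound-literal trick is what lets the macro compute .size from its variadic arguments at compile time. A hypothetical two-capability set (demo_caps is illustrative only) would expand roughly like this:

    static const INITIALIZE_MIGRATE_CAPS_SET(demo_caps,
        MIGRATION_CAPABILITY_XBZRLE,
        MIGRATION_CAPABILITY_COMPRESS);

    /* ...which is equivalent to:
     * static const MigrateCapsSet demo_caps = {
     *     .size = sizeof((int []) { MIGRATION_CAPABILITY_XBZRLE,
     *                               MIGRATION_CAPABILITY_COMPRESS }) / sizeof(int),
     *     .caps = { MIGRATION_CAPABILITY_XBZRLE, MIGRATION_CAPABILITY_COMPRESS }
     * };
     * with .size evaluating to 2. */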
/* When we add fault tolerance, we could have several migrations at once. For now we don't need to add dynamic creation of migration */ @@ -141,6 +174,8 @@ enum mig_rp_message_type { static MigrationState *current_migration; static MigrationIncomingState *current_incoming; +static GSList *migration_blockers; + static bool migration_object_check(MigrationState *ms, Error **errp); static int migration_maybe_pause(MigrationState *s, int *current_active_state, @@ -1041,6 +1076,27 @@ static void fill_source_migration_info(MigrationInfo *info) { MigrationState *s = migrate_get_current(); + info->blocked = migration_is_blocked(NULL); + info->has_blocked_reasons = info->blocked; + info->blocked_reasons = NULL; + if (info->blocked) { + GSList *cur_blocker = migration_blockers; + + /* + * There are two types of reasons a migration might be blocked; + * a) devices marked in VMState as non-migratable, and + * b) Explicit migration blockers + * We need to add both of them here. + */ + qemu_savevm_non_migratable_list(&info->blocked_reasons); + + while (cur_blocker) { + QAPI_LIST_PREPEND(info->blocked_reasons, + g_strdup(error_get_pretty(cur_blocker->data))); + cur_blocker = g_slist_next(cur_blocker); + } + } + switch (s->state) { case MIGRATION_STATUS_NONE: /* no migration has happened ever */ @@ -1089,6 +1145,31 @@ static void fill_source_migration_info(MigrationInfo *info) info->status = s->state; } +typedef enum WriteTrackingSupport { + WT_SUPPORT_UNKNOWN = 0, + WT_SUPPORT_ABSENT, + WT_SUPPORT_AVAILABLE, + WT_SUPPORT_COMPATIBLE +} WriteTrackingSupport; + +static +WriteTrackingSupport migrate_query_write_tracking(void) +{ + /* Check if kernel supports required UFFD features */ + if (!ram_write_tracking_available()) { + return WT_SUPPORT_ABSENT; + } + /* + * Check if current memory configuration is + * compatible with required UFFD features. + */ + if (!ram_write_tracking_compatible()) { + return WT_SUPPORT_AVAILABLE; + } + + return WT_SUPPORT_COMPATIBLE; +} + /** * @migration_caps_check - check capability validity * @@ -1150,6 +1231,39 @@ static bool migrate_caps_check(bool *cap_list, } } + if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) { + WriteTrackingSupport wt_support; + int idx; + /* + * Check if 'background-snapshot' capability is supported by + * host kernel and compatible with guest memory configuration. + */ + wt_support = migrate_query_write_tracking(); + if (wt_support < WT_SUPPORT_AVAILABLE) { + error_setg(errp, "Background-snapshot is not supported by host kernel"); + return false; + } + if (wt_support < WT_SUPPORT_COMPATIBLE) { + error_setg(errp, "Background-snapshot is not compatible " + "with guest memory configuration"); + return false; + } + + /* + * Check if there are any migration capabilities + * incompatible with 'background-snapshot'. + */ + for (idx = 0; idx < check_caps_background_snapshot.size; idx++) { + int incomp_cap = check_caps_background_snapshot.caps[idx]; + if (cap_list[incomp_cap]) { + error_setg(errp, + "Background-snapshot is not compatible with %s", + MigrationCapability_str(incomp_cap)); + return false; + } + } + } + return true; } @@ -1226,21 +1340,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) if (params->has_compress_level && (params->compress_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "is invalid, it should be in the range of 0 to 9"); + "a value between 0 and 9"); return false; } if (params->has_compress_threads && (params->compress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_threads", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } if (params->has_decompress_threads && (params->decompress_threads < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "decompress_threads", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } @@ -1293,21 +1407,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) if (params->has_multifd_channels && (params->multifd_channels < 1)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_channels", - "is invalid, it should be in the range of 1 to 255"); + "a value between 1 and 255"); return false; } if (params->has_multifd_zlib_level && (params->multifd_zlib_level > 9)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level", - "is invalid, it should be in the range of 0 to 9"); + "a value between 0 and 9"); return false; } if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", - "is invalid, it should be in the range of 0 to 20"); + "a value between 0 and 20"); return false; } @@ -1316,8 +1430,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) !is_power_of_2(params->xbzrle_cache_size))) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "xbzrle_cache_size", - "is invalid, it should be bigger than target page size" - " and a power of 2"); + "a power of two no less than the target page size"); return false; } @@ -1334,21 +1447,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) params->announce_initial > 100000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_initial", - "is invalid, it must be less than 100000 ms"); + "a value between 0 and 100000"); return false; } if (params->has_announce_max && params->announce_max > 100000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_max", - "is invalid, it must be less than 100000 ms"); + "a value between 0 and 100000"); return false; } if (params->has_announce_rounds && params->announce_rounds > 1000) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_rounds", - "is invalid, it must be in the range of 0 to 1000"); + "a value between 0 and 1000"); return false; }
if (params->has_announce_step && @@ -1356,7 +1469,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp) params->announce_step > 10000)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "announce_step", - "is invalid, it must be in the range of 1 to 10000 ms"); + "a value between 0 and 10000"); return false; } @@ -1909,6 +2022,7 @@ void migrate_init(MigrationState *s) * locks. */ s->cleanup_bh = 0; + s->vm_start_bh = 0; s->to_dst_file = NULL; s->state = MIGRATION_STATUS_NONE; s->rp_state.from_dst_file = NULL; @@ -1934,8 +2048,6 @@ void migrate_init(MigrationState *s) s->threshold_size = 0; } -static GSList *migration_blockers; - int migrate_add_blocker(Error *reason, Error **errp) { if (only_migratable) { @@ -2216,7 +2328,7 @@ void qmp_migrate_set_cache_size(int64_t value, Error **errp) qmp_migrate_set_parameters(&p, errp); } -int64_t qmp_query_migrate_cache_size(Error **errp) +uint64_t qmp_query_migrate_cache_size(Error **errp) { return migrate_xbzrle_cache_size(); } @@ -2446,7 +2558,7 @@ int migrate_use_xbzrle(void) return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; } -int64_t migrate_xbzrle_cache_size(void) +uint64_t migrate_xbzrle_cache_size(void) { MigrationState *s; @@ -2491,6 +2603,15 @@ bool migrate_use_block_incremental(void) return s->parameters.block_incremental; } +bool migrate_background_snapshot(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]; +} + /* migration thread support */ /* * Something bad happened to the RP stream, mark an error @@ -3117,6 +3238,50 @@ fail: MIGRATION_STATUS_FAILED); } +/** + * bg_migration_completion: Used by bg_migration_thread after all the + * RAM has been saved. The caller 'breaks' the loop when this returns. + * + * @s: Current migration state + */ +static void bg_migration_completion(MigrationState *s) +{ + int current_active_state = s->state; + + /* + * Stop tracking RAM writes - un-protect memory, un-register UFFD + * memory ranges, flush kernel wait queues and wake up threads + * waiting for write fault to be resolved. + */ + ram_write_tracking_stop(); + + if (s->state == MIGRATION_STATUS_ACTIVE) { + /* + * By this moment we have RAM content saved into the migration stream. + * The next step is to flush the non-RAM content (device state) + * right after the ram content. The device state has been stored into + * the temporary buffer before RAM saving started. 
+ */ + qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage); + qemu_fflush(s->to_dst_file); + } else if (s->state == MIGRATION_STATUS_CANCELLING) { + goto fail; + } + + if (qemu_file_get_error(s->to_dst_file)) { + trace_migration_completion_file_err(); + goto fail; + } + + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_COMPLETED); + return; + +fail: + migrate_set_state(&s->state, current_active_state, + MIGRATION_STATUS_FAILED); +} + bool migrate_colo_enabled(void) { MigrationState *s = migrate_get_current(); @@ -3457,6 +3622,47 @@ static void migration_iteration_finish(MigrationState *s) qemu_mutex_unlock_iothread(); } +static void bg_migration_iteration_finish(MigrationState *s) +{ + qemu_mutex_lock_iothread(); + switch (s->state) { + case MIGRATION_STATUS_COMPLETED: + migration_calculate_complete(s); + break; + + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_FAILED: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_CANCELLING: + break; + + default: + /* Should not reach here, but if so, forgive the VM. */ + error_report("%s: Unknown ending state %d", __func__, s->state); + break; + } + + migrate_fd_cleanup_schedule(s); + qemu_mutex_unlock_iothread(); +} + +/* + * Return true if we should continue to the next iteration directly, false + * otherwise. + */ +static MigIterateState bg_migration_iteration_run(MigrationState *s) +{ + int res; + + res = qemu_savevm_state_iterate(s->to_dst_file, false); + if (res > 0) { + bg_migration_completion(s); + return MIG_ITERATE_BREAK; + } + + return MIG_ITERATE_RESUME; +} + void migration_make_urgent_request(void) { qemu_sem_post(&migrate_get_current()->rate_limit_sem); @@ -3604,6 +3810,165 @@ static void *migration_thread(void *opaque) return NULL; } +static void bg_migration_vm_start_bh(void *opaque) +{ + MigrationState *s = opaque; + + qemu_bh_delete(s->vm_start_bh); + s->vm_start_bh = NULL; + + vm_start(); + s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start; +} + +/** + * Background snapshot thread, based on live migration code. + * This is an alternative implementation of the live migration mechanism, + * introduced specifically to support background snapshots. + * + * It takes advantage of the userfaultfd write protection mechanism introduced + * in the v5.7 kernel. Compared to the existing dirty page logging migration, + * it produces much less stream traffic, resulting in smaller snapshot images, + * simply because no duplicate pages can get into the stream. + * + * Another key point is that the generated vmstate stream reflects the machine + * state 'frozen' at the beginning of snapshot creation, whereas with the dirty + * page logging mechanism the saved snapshot effectively reflects the state of + * the VM at the end of the process. + */ +static void *bg_migration_thread(void *opaque) +{ + MigrationState *s = opaque; + int64_t setup_start; + MigThrError thr_error; + QEMUFile *fb; + bool early_fail = true; + + rcu_register_thread(); + object_ref(OBJECT(s)); + + qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX); + + setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); + /* + * We want to save the vmstate for the moment when migration has been + * initiated, but we also want to save the RAM content while the VM is + * running. The RAM content should appear first in the vmstate. So, we + * first stash the non-RAM part of the vmstate to the temporary buffer, + * then write the RAM part of the vmstate to the migration stream with + * vCPUs running and, finally, write the stashed non-RAM part of the + * vmstate from the buffer to the migration stream. + */
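The stashing step relies on QIOChannelBuffer accumulating in memory everything written through its QEMUFile. A minimal sketch of that behaviour, assuming only the buffer-channel API used below (the be32 value is a stand-in for real device state):

    QIOChannelBuffer *bioc = qio_channel_buffer_new(128 * 1024);
    QEMUFile *f = qemu_fopen_channel_output(QIO_CHANNEL(bioc));

    qemu_put_be32(f, 0xfeedcafe);   /* stand-in for device state */
    qemu_fflush(f);

    /* bioc->data now holds bioc->usage bytes, ready to be appended to the
     * real migration stream with qemu_put_buffer(), as done above. */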
+ s->bioc = qio_channel_buffer_new(128 * 1024); + qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer"); + fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc)); + object_unref(OBJECT(s->bioc)); + + update_iteration_initial_status(s); + + qemu_savevm_state_header(s->to_dst_file); + qemu_savevm_state_setup(s->to_dst_file); + + if (qemu_savevm_state_guest_unplug_pending()) { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_WAIT_UNPLUG); + + while (s->state == MIGRATION_STATUS_WAIT_UNPLUG && + qemu_savevm_state_guest_unplug_pending()) { + qemu_sem_timedwait(&s->wait_unplug_sem, 250); + } + + migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, + MIGRATION_STATUS_ACTIVE); + } else { + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_ACTIVE); + } + s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + + trace_migration_thread_setup_complete(); + s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + + qemu_mutex_lock_iothread(); + + /* + * If the VM is currently in a suspended state, we need to wake it up so + * that vm_stop_force_state() can make a valid runstate transition. + */ + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL); + s->vm_was_running = runstate_is_running(); + + if (global_state_store()) { + goto fail; + } + /* Forcibly stop VM before saving state of vCPUs and devices */ + if (vm_stop_force_state(RUN_STATE_PAUSED)) { + goto fail; + } + /* + * Put vCPUs in sync with shadow context structures, then + * save their state to channel-buffer along with devices. + */ + cpu_synchronize_all_states(); + if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) { + goto fail; + } + /* Now initialize UFFD context and start tracking RAM writes */ + if (ram_write_tracking_start()) { + goto fail; + } + early_fail = false; + + /* + * Start VM from BH handler to avoid write-fault lock here. + * UFFD-WP protection for the whole RAM is already enabled so + * calling VM state change notifiers from vm_start() would initiate + * writes to virtio VQs memory which is in a write-protected region. + */ + s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s); + qemu_bh_schedule(s->vm_start_bh); + + qemu_mutex_unlock_iothread(); + + while (migration_is_active(s)) { + MigIterateState iter_state = bg_migration_iteration_run(s); + if (iter_state == MIG_ITERATE_SKIP) { + continue; + } else if (iter_state == MIG_ITERATE_BREAK) { + break; + } + + /* + * Try to detect any kind of failures, and see whether we + * should stop the migration now. 
+ */ + thr_error = migration_detect_error(s); + if (thr_error == MIG_THR_ERR_FATAL) { + /* Stop migration */ + break; + } + + migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME)); + } + + trace_migration_thread_after_loop(); + +fail: + if (early_fail) { + migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + } + + bg_migration_iteration_finish(s); + + qemu_fclose(fb); + object_unref(OBJECT(s)); + rcu_unregister_thread(); + + return NULL; +} + void migrate_fd_connect(MigrationState *s, Error *error_in) { Error *local_err = NULL; @@ -3667,8 +4032,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) migrate_fd_cleanup(s); return; } - qemu_thread_create(&s->thread, "live_migration", migration_thread, s, - QEMU_THREAD_JOINABLE); + + if (migrate_background_snapshot()) { + qemu_thread_create(&s->thread, "bg_snapshot", + bg_migration_thread, s, QEMU_THREAD_JOINABLE); + } else { + qemu_thread_create(&s->thread, "live_migration", + migration_thread, s, QEMU_THREAD_JOINABLE); + } s->migration_thread_running = true; } @@ -3784,6 +4155,8 @@ static Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK), DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH), DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD), + DEFINE_PROP_MIG_CAP("x-background-snapshot", + MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT), DEFINE_PROP_END_OF_LIST(), }; diff --git a/migration/migration.h b/migration/migration.h index d096b77f74..db6708326b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -20,6 +20,7 @@ #include "qemu/thread.h" #include "qemu/coroutine_int.h" #include "io/channel.h" +#include "io/channel-buffer.h" #include "net/announce.h" #include "qom/object.h" @@ -147,8 +148,10 @@ struct MigrationState { /*< public >*/ QemuThread thread; + QEMUBH *vm_start_bh; QEMUBH *cleanup_bh; QEMUFile *to_dst_file; + QIOChannelBuffer *bioc; /* * Protects to_dst_file pointer. 
We need to make sure we won't * yield or hang during the critical section, since this lock will @@ -324,7 +327,7 @@ int migrate_multifd_zlib_level(void); int migrate_multifd_zstd_level(void); int migrate_use_xbzrle(void); -int64_t migrate_xbzrle_cache_size(void); +uint64_t migrate_xbzrle_cache_size(void); bool migrate_colo_enabled(void); bool migrate_use_block(void); @@ -341,6 +344,7 @@ int migrate_compress_wait_thread(void); int migrate_decompress_threads(void); bool migrate_use_events(void); bool migrate_postcopy_blocktime(void); +bool migrate_background_snapshot(void); /* Sending on the return path - generic and then for each message type */ void migrate_send_rp_shut(MigrationIncomingState *mis, diff --git a/migration/page_cache.c b/migration/page_cache.c index 098b436223..6d4f7a9bbc 100644 --- a/migration/page_cache.c +++ b/migration/page_cache.c @@ -38,7 +38,7 @@ struct PageCache { size_t num_items; }; -PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) +PageCache *cache_init(uint64_t new_size, size_t page_size, Error **errp) { int64_t i; size_t num_pages = new_size / page_size; @@ -60,8 +60,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) /* We prefer not to abort if there is no memory */ cache = g_try_malloc(sizeof(*cache)); if (!cache) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "Failed to allocate cache"); + error_setg(errp, "Failed to allocate cache"); return NULL; } cache->page_size = page_size; @@ -74,8 +73,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp) cache->page_cache = g_try_malloc((cache->max_num_items) * sizeof(*cache->page_cache)); if (!cache->page_cache) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "Failed to allocate page cache"); + error_setg(errp, "Failed to allocate page cache"); g_free(cache); return NULL; } diff --git a/migration/page_cache.h b/migration/page_cache.h index 0cb94498a0..8733b4df6e 100644 --- a/migration/page_cache.h +++ b/migration/page_cache.h @@ -28,7 +28,7 @@ typedef struct PageCache PageCache; * @page_size: cache page size * @errp: set *errp if the check failed, with reason */ -PageCache *cache_init(int64_t cache_size, size_t page_size, Error **errp); +PageCache *cache_init(uint64_t cache_size, size_t page_size, Error **errp); /** * cache_fini: free all cache resources * @cache pointer to the PageCache struct diff --git a/migration/qemu-file.c b/migration/qemu-file.c index be21518c57..d6e03dbc0e 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -595,7 +595,7 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size) { if (size < IO_BUF_SIZE) { size_t res; - uint8_t *src; + uint8_t *src = NULL; res = qemu_peek_buffer(f, &src, size, 0); diff --git a/migration/ram.c b/migration/ram.c index 7811cde643..72143da0ac 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -56,6 +56,11 @@ #include "savevm.h" #include "qemu/iov.h" #include "multifd.h" +#include "sysemu/runstate.h" + +#if defined(__linux__) +#include "qemu/userfaultfd.h" +#endif /* defined(__linux__) */ /***********************************************************/ /* ram save/restore */ @@ -126,7 +131,7 @@ static void XBZRLE_cache_unlock(void) * @new_size: new cache size * @errp: set *errp if the check failed, with reason */ -int xbzrle_cache_resize(int64_t new_size, Error **errp) +int xbzrle_cache_resize(uint64_t new_size, Error **errp) { PageCache *new_cache; int64_t ret = 0; @@ -298,6 +303,8 @@ struct RAMSrcPageRequest { struct 
RAMState { /* QEMUFile used for this migration */ QEMUFile *f; + /* UFFD file descriptor, used in 'write-tracking' migration */ + int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; /* Last block from where we have sent data */ @@ -1434,6 +1441,269 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) return block; } +#if defined(__linux__) +/** + * poll_fault_page: try to get next UFFD write fault page and, if pending fault + * is found, return RAM block pointer and page offset + * + * Returns pointer to the RAMBlock containing faulting page, + * NULL if no write faults are pending + * + * @rs: current RAM state + * @offset: page offset from the beginning of the block + */ +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + struct uffd_msg uffd_msg; + void *page_address; + RAMBlock *bs; + int res; + + if (!migrate_background_snapshot()) { + return NULL; + } + + res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); + if (res <= 0) { + return NULL; + } + + page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; + bs = qemu_ram_block_from_host(page_address, false, offset); + assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0); + return bs; +} + +/** + * ram_save_release_protection: release UFFD write protection after + * a range of pages has been saved + * + * @rs: current RAM state + * @pss: page-search-status structure + * @start_page: index of the first page in the range relative to pss->block + * + * Returns 0 on success, negative value in case of an error +*/ +static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, + unsigned long start_page) +{ + int res = 0; + + /* Check if page is from UFFD-managed region. */ + if (pss->block->flags & RAM_UF_WRITEPROTECT) { + void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); + uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS; + + /* Flush async buffers before un-protect. */ + qemu_fflush(rs->f); + /* Un-protect memory range. 
*/ + res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, + false, false); + } + + return res; +} + +/* ram_write_tracking_available: check if kernel supports required UFFD features + * + * Returns true if it does, false otherwise + */ +bool ram_write_tracking_available(void) +{ + uint64_t uffd_features; + int res; + + res = uffd_query_features(&uffd_features); + return (res == 0 && + (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); +} + +/* ram_write_tracking_compatible: check if guest configuration is + * compatible with 'write-tracking' + * + * Returns true if compatible, false otherwise + */ +bool ram_write_tracking_compatible(void) +{ + const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); + int uffd_fd; + RAMBlock *bs; + bool ret = false; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); + if (uffd_fd < 0) { + return false; + } + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + uint64_t uffd_ioctls; + + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + /* Try to register block memory via UFFD-IO to track writes */ + if (uffd_register_memory(uffd_fd, bs->host, bs->max_length, + UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { + goto out; + } + if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { + goto out; + } + } + ret = true; + +out: + uffd_close_fd(uffd_fd); + return ret; +} + +/* + * ram_write_tracking_start: start UFFD-WP memory tracking + * + * Returns 0 for success or negative value in case of error + */ +int ram_write_tracking_start(void) +{ + int uffd_fd; + RAMState *rs = ram_state; + RAMBlock *bs; + + /* Open UFFD file descriptor */ + uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); + if (uffd_fd < 0) { + return uffd_fd; + } + rs->uffdio_fd = uffd_fd; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + /* Nothing to do with read-only and MMIO-writable regions */ + if (bs->mr->readonly || bs->mr->rom_device) { + continue; + } + + /* Register block memory with UFFD to track writes */ + if (uffd_register_memory(rs->uffdio_fd, bs->host, + bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { + goto fail; + } + /* Apply UFFD write protection to the block memory range */ + if (uffd_change_protection(rs->uffdio_fd, bs->host, + bs->max_length, true, false)) { + goto fail; + } + bs->flags |= RAM_UF_WRITEPROTECT; + memory_region_ref(bs->mr); + + trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size, + bs->host, bs->max_length); + } + + return 0; + +fail: + error_report("ram_write_tracking_start() failed: restoring initial memory state"); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* + * If write-protecting some memory block failed, remove protection + * and unregister all RAM blocks that were registered successfully. + */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + uffd_close_fd(uffd_fd); + rs->uffdio_fd = -1; + return -1; +}
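The protect/unprotect calls above are symmetrical. A hedged round-trip sketch using only the uffd helpers as they are called in this patch (uffd_fd, host_addr and size are placeholders for an already-registered range):

    /* Write-protect a registered range; guest writes then fault and are
     * queued on uffd_fd until the protection is lifted again. */
    if (uffd_change_protection(uffd_fd, host_addr, size, true, false) == 0) {
        /* ... stream the pages out while writers wait ... */
        uffd_change_protection(uffd_fd, host_addr, size, false, false);
    }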
+/** + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection + */ +void ram_write_tracking_stop(void) +{ + RAMState *rs = ram_state; + RAMBlock *bs; + + RCU_READ_LOCK_GUARD(); + + RAMBLOCK_FOREACH_NOT_IGNORED(bs) { + if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) { + continue; + } + /* Remove protection and unregister all affected RAM blocks */ + uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false); + uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length); + + trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size, + bs->host, bs->max_length); + + /* Cleanup flags and remove reference */ + bs->flags &= ~RAM_UF_WRITEPROTECT; + memory_region_unref(bs->mr); + } + + /* Finally close UFFD file descriptor */ + uffd_close_fd(rs->uffdio_fd); + rs->uffdio_fd = -1; +} + +#else +/* No target OS support, stubs just fail or ignore */ + +static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) +{ + (void) rs; + (void) offset; + + return NULL; +} + +static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, + unsigned long start_page) +{ + (void) rs; + (void) pss; + (void) start_page; + + return 0; +} + +bool ram_write_tracking_available(void) +{ + return false; +} + +bool ram_write_tracking_compatible(void) +{ + assert(0); + return false; +} + +int ram_write_tracking_start(void) +{ + assert(0); + return -1; +} + +void ram_write_tracking_stop(void) +{ + assert(0); +} +#endif /* defined(__linux__) */ + /** * get_queued_page: unqueue a page from the postcopy requests * @@ -1473,6 +1743,14 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) } while (block && !dirty); + if (!block) { + /* + * Poll write faults too if background snapshot is enabled; that's + * when vCPUs may be blocked by write-protected pages. + */ + block = poll_fault_page(rs, &offset); + } + if (block) { /* * As soon as we start servicing pages out of order, then we have @@ -1715,6 +1993,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, int tmppages, pages = 0; size_t pagesize_bits = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; + unsigned long start_page = pss->page; + int res; if (ramblock_is_ignored(pss->block)) { error_report("block %s should not be migrated !", pss->block->idstr); @@ -1740,10 +2020,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, } while ((pss->page & (pagesize_bits - 1)) && offset_in_ramblock(pss->block, ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); - /* The offset we leave with is the last one we looked at */ pss->page--; - return pages; + + res = ram_save_release_protection(rs, pss, start_page); + return (res < 0 ? 
res : pages); } /** @@ -1880,10 +2161,13 @@ static void ram_save_cleanup(void *opaque) RAMState **rsp = opaque; RAMBlock *block; - /* caller have hold iothread lock or is in a bh, so there is - * no writing race against the migration bitmap - */ - memory_global_dirty_log_stop(); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + /* the caller must hold the iothread lock or be in a BH, so there is + * no writing race against the migration bitmap + */ + memory_global_dirty_log_stop(); + } RAMBLOCK_FOREACH_NOT_IGNORED(block) { g_free(block->clear_bmap); @@ -2343,8 +2627,11 @@ static void ram_init_bitmaps(RAMState *rs) WITH_RCU_READ_LOCK_GUARD() { ram_list_init_bitmaps(); - memory_global_dirty_log_start(); - migration_bitmap_sync_precopy(rs); + /* We don't use dirty log with background snapshots */ + if (!migrate_background_snapshot()) { + memory_global_dirty_log_start(); + migration_bitmap_sync_precopy(rs); + } } qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_iothread(); @@ -3521,7 +3808,7 @@ static int ram_load_precopy(QEMUFile *f) } } /* For postcopy we need to check hugepage sizes match */ - if (postcopy_advised && + if (postcopy_advised && migrate_postcopy_ram() && block->page_size != qemu_host_page_size) { uint64_t remote_page_size = qemu_get_be64(f); if (remote_page_size != block->page_size) { diff --git a/migration/ram.h b/migration/ram.h index 011e85414e..6378bb3ebc 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -47,7 +47,7 @@ bool ramblock_is_ignored(RAMBlock *block); INTERNAL_RAMBLOCK_FOREACH(block) \ if (!qemu_ram_is_migratable(block)) {} else -int xbzrle_cache_resize(int64_t new_size, Error **errp); +int xbzrle_cache_resize(uint64_t new_size, Error **errp); uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_total(void); @@ -79,4 +79,10 @@ void colo_flush_ram_cache(void); void colo_release_ram_cache(void); void colo_incoming_start_dirty_log(void); +/* Background snapshot */ +bool ram_write_tracking_available(void); +bool ram_write_tracking_compatible(void); +int ram_write_tracking_start(void); +void ram_write_tracking_stop(void); + #endif diff --git a/migration/savevm.c b/migration/savevm.c index 4f3b69ecfc..52e2d72e4b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -43,6 +43,8 @@ #include "qapi/error.h" #include "qapi/qapi-commands-migration.h" #include "qapi/qmp/json-writer.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-builtin-visit.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "sysemu/cpus.h" @@ -315,6 +317,16 @@ static int configuration_pre_save(void *opaque) return 0; } +static int configuration_post_save(void *opaque) +{ + SaveState *state = opaque; + + g_free(state->capabilities); + state->capabilities = NULL; + state->caps_count = 0; + return 0; +} + static int configuration_pre_load(void *opaque) { SaveState *state = opaque; @@ -365,24 +377,36 @@ static int configuration_post_load(void *opaque, int version_id) { SaveState *state = opaque; const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + int ret = 0; if (strncmp(state->name, current_name, state->len) != 0) { error_report("Machine type received is '%.*s' and local is '%s'", (int) state->len, state->name, current_name); - return -EINVAL; + ret = -EINVAL; + goto out; } if (state->target_page_bits != qemu_target_page_bits()) { error_report("Received TARGET_PAGE_BITS is %d but local is %d", state->target_page_bits, qemu_target_page_bits()); - return -EINVAL; + ret = -EINVAL; + goto out; } if 
(!configuration_validate_capabilities(state)) { - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; +out: + g_free((void *)state->name); + state->name = NULL; + state->len = 0; + g_free(state->capabilities); + state->capabilities = NULL; + state->caps_count = 0; + + return ret; } static int get_capability(QEMUFile *f, void *pv, size_t size, @@ -516,6 +540,7 @@ static const VMStateDescription vmstate_configuration = { .pre_load = configuration_pre_load, .post_load = configuration_post_load, .pre_save = configuration_pre_save, + .post_save = configuration_post_save, .fields = (VMStateField[]) { VMSTATE_UINT32(len, SaveState), VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len), @@ -1131,6 +1156,19 @@ bool qemu_savevm_state_blocked(Error **errp) return false; } +void qemu_savevm_non_migratable_list(strList **reasons) +{ + SaveStateEntry *se; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (se->vmsd && se->vmsd->unmigratable) { + QAPI_LIST_PREPEND(*reasons, + g_strdup_printf("non-migratable device: %s", + se->idstr)); + } + } +} + void qemu_savevm_state_header(QEMUFile *f) { trace_savevm_state_header(); @@ -1355,7 +1393,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy) return 0; } -static int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, bool in_postcopy, bool inactivate_disks) @@ -2729,9 +2766,10 @@ int qemu_load_device_state(QEMUFile *f) return 0; } -int save_snapshot(const char *name, Error **errp) +bool save_snapshot(const char *name, bool overwrite, const char *vmstate, + bool has_devices, strList *devices, Error **errp) { - BlockDriverState *bs, *bs1; + BlockDriverState *bs; QEMUSnapshotInfo sn1, *sn = &sn1; int ret = -1, ret2; QEMUFile *f; @@ -2742,35 +2780,43 @@ int save_snapshot(const char *name, Error **errp) AioContext *aio_context; if (migration_is_blocked(errp)) { - return ret; + return false; } if (!replay_can_snapshot()) { error_setg(errp, "Record/replay does not allow making snapshot " "right now. 
Try once more later."); - return ret; + return false; } - if (!bdrv_all_can_snapshot(&bs)) { - error_setg(errp, "Device '%s' is writable but does not support " - "snapshots", bdrv_get_device_or_node_name(bs)); - return ret; + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; } /* Delete old snapshots of the same name */ if (name) { - ret = bdrv_all_delete_snapshot(name, &bs1, errp); - if (ret < 0) { - error_prepend(errp, "Error while deleting snapshot on device " - "'%s': ", bdrv_get_device_or_node_name(bs1)); - return ret; + if (overwrite) { + if (bdrv_all_delete_snapshot(name, has_devices, + devices, errp) < 0) { + return false; + } + } else { + ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp); + if (ret2 < 0) { + return false; + } + if (ret2 == 1) { + error_setg(errp, + "Snapshot '%s' already exists in one or more devices", + name); + return false; + } } } - bs = bdrv_all_find_vmstate_bs(); + bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp); if (bs == NULL) { - error_setg(errp, "No block device can accept snapshots"); - return ret; + return false; } aio_context = bdrv_get_aio_context(bs); @@ -2779,7 +2825,7 @@ int save_snapshot(const char *name, Error **errp) ret = global_state_store(); if (ret) { error_setg(errp, "Error saving global state"); - return ret; + return false; } vm_stop(RUN_STATE_SAVE_VM); @@ -2833,11 +2879,10 @@ int save_snapshot(const char *name, Error **errp) aio_context_release(aio_context); aio_context = NULL; - ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); + ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, + has_devices, devices, errp); if (ret < 0) { - error_setg(errp, "Error while creating snapshot on '%s'", - bdrv_get_device_or_node_name(bs)); - bdrv_all_delete_snapshot(sn->name, &bs, NULL); + bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL); goto the_end; } @@ -2853,7 +2898,7 @@ int save_snapshot(const char *name, Error **errp) if (saved_vm_running) { vm_start(); } - return ret; + return ret == 0; } void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live, @@ -2938,33 +2983,32 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp) migration_incoming_state_destroy(); } -int load_snapshot(const char *name, Error **errp) +bool load_snapshot(const char *name, const char *vmstate, + bool has_devices, strList *devices, Error **errp) { - BlockDriverState *bs, *bs_vm_state; + BlockDriverState *bs_vm_state; QEMUSnapshotInfo sn; QEMUFile *f; int ret; AioContext *aio_context; MigrationIncomingState *mis = migration_incoming_get_current(); - if (!bdrv_all_can_snapshot(&bs)) { - error_setg(errp, - "Device '%s' is writable but does not support snapshots", - bdrv_get_device_or_node_name(bs)); - return -ENOTSUP; + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; } - ret = bdrv_all_find_snapshot(name, &bs); + ret = bdrv_all_has_snapshot(name, has_devices, devices, errp); if (ret < 0) { - error_setg(errp, - "Device '%s' does not have the requested snapshot '%s'", - bdrv_get_device_or_node_name(bs), name); - return ret; + return false; + } + if (ret == 0) { + error_setg(errp, "Snapshot '%s' does not exist in one or more devices", + name); + return false; } - bs_vm_state = bdrv_all_find_vmstate_bs(); + bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp); if (!bs_vm_state) { - error_setg(errp, "No block device supports snapshots"); - return -ENOTSUP; + return false; } aio_context = 
bdrv_get_aio_context(bs_vm_state); @@ -2973,11 +3017,11 @@ int load_snapshot(const char *name, Error **errp) ret = bdrv_snapshot_find(bs_vm_state, &sn, name); aio_context_release(aio_context); if (ret < 0) { - return ret; + return false; } else if (sn.vm_state_size == 0) { error_setg(errp, "This is a disk-only snapshot. Revert to it " " offline using qemu-img"); - return -EINVAL; + return false; } /* @@ -2989,10 +3033,8 @@ int load_snapshot(const char *name, Error **errp) /* Flush all IO requests so they don't interfere with the new state. */ bdrv_drain_all_begin(); - ret = bdrv_all_goto_snapshot(name, &bs, errp); + ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp); if (ret < 0) { - error_prepend(errp, "Could not load snapshot '%s' on '%s': ", - name, bdrv_get_device_or_node_name(bs)); goto err_drain; } @@ -3000,7 +3042,6 @@ int load_snapshot(const char *name, Error **errp) f = qemu_fopen_bdrv(bs_vm_state, 0); if (!f) { error_setg(errp, "Could not open VM state file"); - ret = -EINVAL; goto err_drain; } @@ -3020,14 +3061,28 @@ int load_snapshot(const char *name, Error **errp) if (ret < 0) { error_setg(errp, "Error %d while loading VM state", ret); - return ret; + return false; } - return 0; + return true; err_drain: bdrv_drain_all_end(); - return ret; + return false; +} + +bool delete_snapshot(const char *name, bool has_devices, + strList *devices, Error **errp) +{ + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { + return false; + } + + if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) { + return false; + } + + return true; } void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) @@ -3057,3 +3112,187 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd) return !(vmsd && vmsd->unmigratable); } + +typedef struct SnapshotJob { + Job common; + char *tag; + char *vmstate; + strList *devices; + Coroutine *co; + Error **errp; + bool ret; +} SnapshotJob; + +static void qmp_snapshot_job_free(SnapshotJob *s) +{ + g_free(s->tag); + g_free(s->vmstate); + qapi_free_strList(s->devices); +} + + +static void snapshot_load_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + int orig_vm_running; + + job_progress_set_remaining(&s->common, 1); + + orig_vm_running = runstate_is_running(); + vm_stop(RUN_STATE_RESTORE_VM); + + s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp); + if (s->ret && orig_vm_running) { + vm_start(); + } + + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static void snapshot_save_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + + job_progress_set_remaining(&s->common, 1); + s->ret = save_snapshot(s->tag, false, s->vmstate, + true, s->devices, s->errp); + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static void snapshot_delete_job_bh(void *opaque) +{ + Job *job = opaque; + SnapshotJob *s = container_of(job, SnapshotJob, common); + + job_progress_set_remaining(&s->common, 1); + s->ret = delete_snapshot(s->tag, true, s->devices, s->errp); + job_progress_update(&s->common, 1); + + qmp_snapshot_job_free(s); + aio_co_wake(s->co); +} + +static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_save_job_bh, job); + 
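+    /* yield until snapshot_save_job_bh() wakes us via aio_co_wake() */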
qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + +static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_load_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + +static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp) +{ + SnapshotJob *s = container_of(job, SnapshotJob, common); + s->errp = errp; + s->co = qemu_coroutine_self(); + aio_bh_schedule_oneshot(qemu_get_aio_context(), + snapshot_delete_job_bh, job); + qemu_coroutine_yield(); + return s->ret ? 0 : -1; +} + + +static const JobDriver snapshot_load_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_LOAD, + .run = snapshot_load_job_run, +}; + +static const JobDriver snapshot_save_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_SAVE, + .run = snapshot_save_job_run, +}; + +static const JobDriver snapshot_delete_job_driver = { + .instance_size = sizeof(SnapshotJob), + .job_type = JOB_TYPE_SNAPSHOT_DELETE, + .run = snapshot_delete_job_run, +}; + + +void qmp_snapshot_save(const char *job_id, + const char *tag, + const char *vmstate, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_save_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->vmstate = g_strdup(vmstate); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} + +void qmp_snapshot_load(const char *job_id, + const char *tag, + const char *vmstate, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_load_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->vmstate = g_strdup(vmstate); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} + +void qmp_snapshot_delete(const char *job_id, + const char *tag, + strList *devices, + Error **errp) +{ + SnapshotJob *s; + + s = job_create(job_id, &snapshot_delete_job_driver, NULL, + qemu_get_aio_context(), JOB_MANUAL_DISMISS, + NULL, NULL, errp); + if (!s) { + return; + } + + s->tag = g_strdup(tag); + s->devices = QAPI_CLONE(strList, devices); + + job_start(&s->common); +} diff --git a/migration/savevm.h b/migration/savevm.h index ba64a7e271..6461342cb4 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -30,6 +30,7 @@ #define QEMU_VM_SECTION_FOOTER 0x7e bool qemu_savevm_state_blocked(Error **errp); +void qemu_savevm_non_migratable_list(strList **reasons); void qemu_savevm_state_setup(QEMUFile *f); bool qemu_savevm_state_guest_unplug_pending(void); int qemu_savevm_state_resume_prepare(MigrationState *s); @@ -64,5 +65,7 @@ int qemu_loadvm_state(QEMUFile *f); void qemu_loadvm_state_cleanup(void); int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); int qemu_load_device_state(QEMUFile *f); +int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f, + bool in_postcopy, bool inactivate_disks); #endif diff --git a/migration/trace-events b/migration/trace-events index 75de5004ac..668c562fed 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: 
%" PRIu64 " milliseconds, %d iterations" ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64 +ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" +ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %d" diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index a48bc1e904..3c88a4faef 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -224,6 +224,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) migration_global_dump(mon); + if (info->blocked) { + strList *reasons = info->blocked_reasons; + monitor_printf(mon, "Outgoing migration blocked:\n"); + while (reasons) { + monitor_printf(mon, " %s\n", reasons->value); + reasons = reasons->next; + } + } + if (info->has_status) { monitor_printf(mon, "Migration status: %s", MigrationStatus_str(info->status)); @@ -1130,7 +1139,7 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict) vm_stop(RUN_STATE_RESTORE_VM); - if (load_snapshot(name, &err) == 0 && saved_vm_running) { + if (!load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) { vm_start(); } hmp_handle_error(mon, err); @@ -1140,21 +1149,17 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) { Error *err = NULL; - save_snapshot(qdict_get_try_str(qdict, "name"), &err); + save_snapshot(qdict_get_try_str(qdict, "name"), + true, NULL, false, NULL, &err); hmp_handle_error(mon, err); } void hmp_delvm(Monitor *mon, const QDict *qdict) { - BlockDriverState *bs; Error *err = NULL; const char *name = qdict_get_str(qdict, "name"); - if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { - error_prepend(&err, - "deleting snapshot on device '%s': ", - bdrv_get_device_name(bs)); - } + delete_snapshot(name, false, NULL, &err); hmp_handle_error(mon, err); } @@ -1294,11 +1299,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) switch (val) { case MIGRATION_PARAMETER_COMPRESS_LEVEL: p->has_compress_level = true; - visit_type_int(v, param, &p->compress_level, &err); + visit_type_uint8(v, param, &p->compress_level, &err); break; case MIGRATION_PARAMETER_COMPRESS_THREADS: p->has_compress_threads = true; - visit_type_int(v, param, &p->compress_threads, &err); + visit_type_uint8(v, param, &p->compress_threads, &err); break; case MIGRATION_PARAMETER_COMPRESS_WAIT_THREAD: p->has_compress_wait_thread = true; @@ -1306,19 +1311,19 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_DECOMPRESS_THREADS: p->has_decompress_threads = true; - visit_type_int(v, param, &p->decompress_threads, &err); + visit_type_uint8(v, param, &p->decompress_threads, &err); break; case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD: p->has_throttle_trigger_threshold = true; - visit_type_int(v, param, &p->throttle_trigger_threshold, &err); + visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL: p->has_cpu_throttle_initial = true; - visit_type_int(v, param, &p->cpu_throttle_initial, &err); + visit_type_uint8(v, param, &p->cpu_throttle_initial, &err); break; case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT: p->has_cpu_throttle_increment = true; - visit_type_int(v, param, &p->cpu_throttle_increment, &err); + visit_type_uint8(v, param, &p->cpu_throttle_increment, &err); break; case 
MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW: p->has_cpu_throttle_tailslow = true; @@ -1326,7 +1331,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MAX_CPU_THROTTLE: p->has_max_cpu_throttle = true; - visit_type_int(v, param, &p->max_cpu_throttle, &err); + visit_type_uint8(v, param, &p->max_cpu_throttle, &err); break; case MIGRATION_PARAMETER_TLS_CREDS: p->has_tls_creds = true; @@ -1362,11 +1367,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_DOWNTIME_LIMIT: p->has_downtime_limit = true; - visit_type_int(v, param, &p->downtime_limit, &err); + visit_type_size(v, param, &p->downtime_limit, &err); break; case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY: p->has_x_checkpoint_delay = true; - visit_type_int(v, param, &p->x_checkpoint_delay, &err); + visit_type_uint32(v, param, &p->x_checkpoint_delay, &err); break; case MIGRATION_PARAMETER_BLOCK_INCREMENTAL: p->has_block_incremental = true; @@ -1374,7 +1379,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MULTIFD_CHANNELS: p->has_multifd_channels = true; - visit_type_int(v, param, &p->multifd_channels, &err); + visit_type_uint8(v, param, &p->multifd_channels, &err); break; case MIGRATION_PARAMETER_MULTIFD_COMPRESSION: p->has_multifd_compression = true; @@ -1383,11 +1388,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) break; case MIGRATION_PARAMETER_MULTIFD_ZLIB_LEVEL: p->has_multifd_zlib_level = true; - visit_type_int(v, param, &p->multifd_zlib_level, &err); + visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; - visit_type_int(v, param, &p->multifd_zstd_level, &err); + visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; diff --git a/pc-bios/README b/pc-bios/README index 33f9754ad3..db7129ef64 100644 --- a/pc-bios/README +++ b/pc-bios/README @@ -20,7 +20,7 @@ legacy x86 software to communicate with an attached serial console as if a video card were attached. The master sources reside in a subversion repository at http://sgabios.googlecode.com/svn/trunk. A git mirror is - available at https://git.qemu.org/git/sgabios.git. + available at https://gitlab.com/qemu-project/sgabios.git. - The PXE roms come from the iPXE project. Built with BANNER_TIME 0. Sources available at http://ipxe.org. Vendor:Device ID -> ROM mapping: @@ -37,7 +37,7 @@ - The u-boot binary for e500 comes from the upstream denx u-boot project where it was compiled using the qemu-ppce500 target. 
- A git mirror is available at: https://git.qemu.org/git/u-boot.git + A git mirror is available at: https://gitlab.com/qemu-project/u-boot.git The hash used to compile the current version is: 2072e72 - Skiboot (https://github.com/open-power/skiboot/) is an OPAL diff --git a/pc-bios/descriptors/meson.build b/pc-bios/descriptors/meson.build index ac6ec66b00..29efa16d99 100644 --- a/pc-bios/descriptors/meson.build +++ b/pc-bios/descriptors/meson.build @@ -9,7 +9,7 @@ if install_edk2_blobs ] configure_file(input: files(f), output: f, - configuration: {'DATADIR': qemu_datadir}, + configuration: {'DATADIR': get_option('prefix') / qemu_datadir}, install: get_option('install_blobs'), install_dir: qemu_datadir / 'firmware') endforeach diff --git a/pc-bios/meson.build b/pc-bios/meson.build index af95c5d1f1..f2b32598af 100644 --- a/pc-bios/meson.build +++ b/pc-bios/meson.build @@ -12,6 +12,7 @@ if install_edk2_blobs foreach f : fds custom_target(f, + build_by_default: have_system, output: f, input: '@0@.bz2'.format(f), capture: true, diff --git a/qapi/job.json b/qapi/job.json index 280c2f76f1..1a6ef03451 100644 --- a/qapi/job.json +++ b/qapi/job.json @@ -22,10 +22,17 @@ # # @amend: image options amend job type, see "x-blockdev-amend" (since 5.1) # +# @snapshot-load: snapshot load job type, see "snapshot-load" (since 6.0) +# +# @snapshot-save: snapshot save job type, see "snapshot-save" (since 6.0) +# +# @snapshot-delete: snapshot delete job type, see "snapshot-delete" (since 6.0) +# # Since: 1.7 ## { 'enum': 'JobType', - 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend'] } + 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend', + 'snapshot-load', 'snapshot-save', 'snapshot-delete'] } ## # @JobStatus: diff --git a/qapi/meson.build b/qapi/meson.build index ab68e7900e..0652569bc4 100644 --- a/qapi/meson.build +++ b/qapi/meson.build @@ -4,18 +4,20 @@ util_ss.add(files( 'qapi-dealloc-visitor.c', 'qapi-util.c', 'qapi-visit-core.c', - 'qmp-dispatch.c', - 'qmp-event.c', - 'qmp-registry.c', 'qobject-input-visitor.c', 'qobject-output-visitor.c', 'string-input-visitor.c', 'string-output-visitor.c', )) +if have_system or have_tools + util_ss.add(files( + 'qmp-dispatch.c', + 'qmp-event.c', + 'qmp-registry.c', + )) +endif qapi_all_modules = [ - 'acpi', - 'audio', 'authz', 'block', 'block-core', @@ -35,20 +37,30 @@ qapi_all_modules = [ 'misc-target', 'net', 'pragma', - 'qdev', - 'pci', 'qom', - 'rdma', 'replay', - 'rocker', 'run-state', 'sockets', - 'tpm', 'trace', 'transaction', - 'ui', 'yank', ] +if have_system + qapi_all_modules += [ + 'acpi', + 'audio', + 'qdev', + 'pci', + 'rdma', + 'rocker', + 'tpm', + ] +endif +if have_system or have_tools + qapi_all_modules += [ + 'ui', + ] +endif qapi_storage_daemon_modules = [ 'block-core', diff --git a/qapi/migration.json b/qapi/migration.json index d1d9632c2a..ce14d78071 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -78,7 +78,7 @@ # Since: 1.2 ## { 'struct': 'XBZRLECacheStats', - 'data': {'cache-size': 'int', 'bytes': 'int', 'pages': 'int', + 'data': {'cache-size': 'size', 'bytes': 'int', 'pages': 'int', 'cache-miss': 'int', 'cache-miss-rate': 'number', 'encoding-rate': 'number', 'overflow': 'int' } } @@ -224,6 +224,10 @@ # only returned if VFIO device is present, migration is supported by all # VFIO devices and status is 'active' or 'completed' (since 5.2) # +# @blocked: True if outgoing migration is blocked (since 6.0) +# +# @blocked-reasons: A list of reasons an outgoing migration is blocked (since 6.0) +# # Since: 0.14 
##
{ 'struct': 'MigrationInfo',
@@ -237,6 +241,8 @@
            '*setup-time': 'int',
            '*cpu-throttle-percentage': 'int',
            '*error-desc': 'str',
+           'blocked': 'bool',
+           '*blocked-reasons': ['str'],
            '*postcopy-blocktime' : 'uint32',
            '*postcopy-vcpu-blocktime': ['uint32'],
            '*compression': 'CompressionStats',
@@ -442,6 +448,11 @@
 # @validate-uuid: Send the UUID of the source to allow the destination
 #                 to ensure it is the same. (since 4.2)
 #
+# @background-snapshot: If enabled, the migration stream will be a snapshot
+#                       of the VM exactly at the point when the migration
+#                       procedure starts. The VM RAM is saved while the VM
+#                       is running. (since 6.0)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
@@ -449,7 +460,7 @@
            'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram',
            'block', 'return-path', 'pause-before-switchover', 'multifd',
            'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
-           'x-ignore-shared', 'validate-uuid' ] }
+           'x-ignore-shared', 'validate-uuid', 'background-snapshot'] }
 
 ##
 # @MigrationCapabilityStatus:
@@ -885,28 +896,28 @@
            '*announce-max': 'size',
            '*announce-rounds': 'size',
            '*announce-step': 'size',
-           '*compress-level': 'int',
-           '*compress-threads': 'int',
+           '*compress-level': 'uint8',
+           '*compress-threads': 'uint8',
            '*compress-wait-thread': 'bool',
-           '*decompress-threads': 'int',
-           '*throttle-trigger-threshold': 'int',
-           '*cpu-throttle-initial': 'int',
-           '*cpu-throttle-increment': 'int',
+           '*decompress-threads': 'uint8',
+           '*throttle-trigger-threshold': 'uint8',
+           '*cpu-throttle-initial': 'uint8',
+           '*cpu-throttle-increment': 'uint8',
            '*cpu-throttle-tailslow': 'bool',
            '*tls-creds': 'StrOrNull',
            '*tls-hostname': 'StrOrNull',
            '*tls-authz': 'StrOrNull',
-           '*max-bandwidth': 'int',
-           '*downtime-limit': 'int',
-           '*x-checkpoint-delay': 'int',
+           '*max-bandwidth': 'size',
+           '*downtime-limit': 'uint64',
+           '*x-checkpoint-delay': 'uint32',
            '*block-incremental': 'bool',
-           '*multifd-channels': 'int',
+           '*multifd-channels': 'uint8',
            '*xbzrle-cache-size': 'size',
            '*max-postcopy-bandwidth': 'size',
-           '*max-cpu-throttle': 'int',
+           '*max-cpu-throttle': 'uint8',
            '*multifd-compression': 'MultiFDCompression',
-           '*multifd-zlib-level': 'int',
-           '*multifd-zstd-level': 'int',
+           '*multifd-zlib-level': 'uint8',
+           '*multifd-zstd-level': 'uint8',
            '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } }
 
 ##
@@ -1093,7 +1104,7 @@
            '*max-bandwidth': 'size',
            '*downtime-limit': 'uint64',
            '*x-checkpoint-delay': 'uint32',
-           '*block-incremental': 'bool' ,
+           '*block-incremental': 'bool',
            '*multifd-channels': 'uint8',
            '*xbzrle-cache-size': 'size',
            '*max-postcopy-bandwidth': 'size',
@@ -1465,7 +1476,7 @@
 # <- { "return": 67108864 }
 #
 ##
-{ 'command': 'query-migrate-cache-size', 'returns': 'int',
+{ 'command': 'query-migrate-cache-size', 'returns': 'size',
   'features': [ 'deprecated' ] }
 
 ##
@@ -1843,3 +1854,176 @@
 # Since: 5.2
 ##
 { 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
+
+##
+# @snapshot-save:
+#
+# Save a VM snapshot
+#
+# @job-id: identifier for the newly created job
+# @tag: name of the snapshot to create
+# @vmstate: block device node name to save vmstate to
+# @devices: list of block device node names to save a snapshot to
+#
+# Applications should not assume that the snapshot save is complete
+# when this command returns. The job commands / events must be used
+# to determine completion and to fetch details of any errors that arise.
+#
+# Note that execution of the guest CPUs may be stopped during the
+# time it takes to save the snapshot.
+# A future version of QEMU may ensure CPUs are executing continuously.
+#
+# It is strongly recommended that @devices contain all writable
+# block device nodes if a consistent snapshot is required.
+#
+# If @tag already exists, an error will be reported.
+#
+# Returns: nothing
+#
+# Example:
+#
+# -> { "execute": "snapshot-save",
+#      "data": {
+#         "job-id": "snapsave0",
+#         "tag": "my-snap",
+#         "vmstate": "disk0",
+#         "devices": ["disk0", "disk1"]
+#      }
+#    }
+# <- { "return": { } }
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "created", "id": "snapsave0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "running", "id": "snapsave0"}}
+# <- {"event": "STOP"}
+# <- {"event": "RESUME"}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "waiting", "id": "snapsave0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "pending", "id": "snapsave0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "concluded", "id": "snapsave0"}}
+# -> {"execute": "query-jobs"}
+# <- {"return": [{"current-progress": 1,
+#                 "status": "concluded",
+#                 "total-progress": 1,
+#                 "type": "snapshot-save",
+#                 "id": "snapsave0"}]}
+#
+# Since: 6.0
+##
+{ 'command': 'snapshot-save',
+  'data': { 'job-id': 'str',
+            'tag': 'str',
+            'vmstate': 'str',
+            'devices': ['str'] } }
+
+##
+# @snapshot-load:
+#
+# Load a VM snapshot
+#
+# @job-id: identifier for the newly created job
+# @tag: name of the snapshot to load.
+# @vmstate: block device node name to load vmstate from
+# @devices: list of block device node names to load a snapshot from
+#
+# Applications should not assume that the snapshot load is complete
+# when this command returns. The job commands / events must be used
+# to determine completion and to fetch details of any errors that arise.
+#
+# Note that execution of the guest CPUs will be stopped during the
+# time it takes to load the snapshot.
+#
+# It is strongly recommended that @devices contain all writable
+# block device nodes that may have changed since the original
+# @snapshot-save command was executed.
+#
+# Returns: nothing
+#
+# Example:
+#
+# -> { "execute": "snapshot-load",
+#      "data": {
+#         "job-id": "snapload0",
+#         "tag": "my-snap",
+#         "vmstate": "disk0",
+#         "devices": ["disk0", "disk1"]
+#      }
+#    }
+# <- { "return": { } }
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "created", "id": "snapload0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "running", "id": "snapload0"}}
+# <- {"event": "STOP"}
+# <- {"event": "RESUME"}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "waiting", "id": "snapload0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "pending", "id": "snapload0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "concluded", "id": "snapload0"}}
+# -> {"execute": "query-jobs"}
+# <- {"return": [{"current-progress": 1,
+#                 "status": "concluded",
+#                 "total-progress": 1,
+#                 "type": "snapshot-load",
+#                 "id": "snapload0"}]}
+#
+# Since: 6.0
+##
+{ 'command': 'snapshot-load',
+  'data': { 'job-id': 'str',
+            'tag': 'str',
+            'vmstate': 'str',
+            'devices': ['str'] } }
+
+##
+# @snapshot-delete:
+#
+# Delete a VM snapshot
+#
+# @job-id: identifier for the newly created job
+# @tag: name of the snapshot to delete.
+# @devices: list of block device node names to delete a snapshot from
+#
+# Applications should not assume that the snapshot delete is complete
+# when this command returns. The job commands / events must be used
+# to determine completion and to fetch details of any errors that arise.
+#
+# Returns: nothing
+#
+# Example:
+#
+# -> { "execute": "snapshot-delete",
+#      "data": {
+#         "job-id": "snapdelete0",
+#         "tag": "my-snap",
+#         "devices": ["disk0", "disk1"]
+#      }
+#    }
+# <- { "return": { } }
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "created", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "running", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "waiting", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "pending", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+#     "data": {"status": "concluded", "id": "snapdelete0"}}
+# -> {"execute": "query-jobs"}
+# <- {"return": [{"current-progress": 1,
+#                 "status": "concluded",
+#                 "total-progress": 1,
+#                 "type": "snapshot-delete",
+#                 "id": "snapdelete0"}]}
+#
+# Since: 6.0
+##
+{ 'command': 'snapshot-delete',
+  'data': { 'job-id': 'str',
+            'tag': 'str',
+            'devices': ['str'] } }
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 608c63e82a..b1b9430a8f 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -328,7 +328,7 @@ static void *nbd_client_thread(void *arg)
 
 static int nbd_can_accept(void)
 {
-    return state == RUNNING && nb_fds < shared;
+    return state == RUNNING && (shared == 0 || nb_fds < shared);
 }
 
 static void nbd_update_server_watch(void);
@@ -707,7 +707,7 @@ int main(int argc, char **argv)
             break;
         case 'e':
             if (qemu_strtoi(optarg, NULL, 0, &shared) < 0 ||
-                shared < 1) {
+                shared < 0) {
                 error_report("Invalid shared device number '%s'", optarg);
                 exit(EXIT_FAILURE);
             }
@@ -964,8 +964,16 @@ int main(int argc, char **argv)
 
     server = qio_net_listener_new();
     if (socket_activation == 0) {
+        int backlog;
+
+        if (persistent || shared == 0) {
+            backlog = SOMAXCONN;
+        } else {
+            backlog = MIN(shared, SOMAXCONN);
+        }
         saddr = nbd_build_socket_address(sockpath, bindto, port);
-        if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) {
+        if (qio_net_listener_open_sync(server, saddr, backlog,
+                                       &local_err) < 0) {
             object_unref(OBJECT(server));
             error_report_err(local_err);
             exit(EXIT_FAILURE);
diff --git a/qemu-options.hx b/qemu-options.hx
index c09c4646e2..6c34c7050f 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -35,7 +35,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
     "                suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
     "                nvdimm=on|off controls NVDIMM support (default=off)\n"
     "                memory-encryption=@var{} memory encryption object to use (default=none)\n"
-    "                hmat=on|off controls ACPI HMAT support (default=off)\n",
+    "                hmat=on|off controls ACPI HMAT support (default=off)\n"
+    "                memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n",
     QEMU_ARCH_ALL)
 SRST
 ``-machine [type=]name[,prop=value[,...]]``
@@ -96,6 +97,29 @@ SRST
     ``hmat=on|off``
         Enables or disables ACPI Heterogeneous Memory Attribute Table
         (HMAT) support. The default is off.
+
+    ``memory-backend='id'``
+        An alternative to the legacy ``-mem-path`` and ``mem-prealloc`` options.
+        Allows a memory backend to be used as main RAM.
+
+        For example:
+        ::
+            -object memory-backend-file,id=pc.ram,size=512M,mem-path=/hugetlbfs,prealloc=on,share=on
+            -machine memory-backend=pc.ram
+            -m 512M
+
+        Migration compatibility note:
+        a) as the backend id, use the value of 'default-ram-id' advertised by
+        the machine type (available via the ``query-machines`` QMP command)
+        if migration to/from an old QEMU (<5.0) is expected.
+        b) for machine types 4.0 and older, the user shall use the
+        ``x-use-canonical-path-for-ramblock-id=off`` backend option
+        if migration to/from an old QEMU (<5.0) is expected.
+
+        For example:
+        ::
+            -object memory-backend-ram,id=pc.ram,size=512M,x-use-canonical-path-for-ramblock-id=off
+            -machine memory-backend=pc.ram
+            -m 512M
 ERST
 
 HXCOMM Deprecated by -machine
diff --git a/qom/object.c b/qom/object.c
index 2fa0119647..491823db4a 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -442,7 +442,8 @@ static GPtrArray *object_compat_props[3];
  * other than "-global". These are generally used for syntactic
  * sugar and legacy command line options.
  */
-void object_register_sugar_prop(const char *driver, const char *prop, const char *value)
+void object_register_sugar_prop(const char *driver, const char *prop,
+                                const char *value, bool optional)
 {
     GlobalProperty *g;
     if (!object_compat_props[2]) {
@@ -452,6 +453,7 @@ void object_register_sugar_prop(const char *driver, const char *prop, const char
     g->driver = g_strdup(driver);
     g->property = g_strdup(prop);
     g->value = g_strdup(value);
+    g->optional = optional;
     g_ptr_array_add(object_compat_props[2], g);
 }
diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c
index 5ec574724a..1cde50e9f3 100644
--- a/replay/replay-debugging.c
+++ b/replay/replay-debugging.c
@@ -143,12 +143,13 @@ static char *replay_find_nearest_snapshot(int64_t icount,
     QEMUSnapshotInfo *sn_tab;
     QEMUSnapshotInfo *nearest = NULL;
     char *ret = NULL;
+    int rv;
     int nb_sns, i;
     AioContext *aio_context;
 
     *snapshot_icount = -1;
 
-    bs = bdrv_all_find_vmstate_bs();
+    bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, NULL);
     if (!bs) {
         goto fail;
     }
@@ -159,7 +160,10 @@ static char *replay_find_nearest_snapshot(int64_t icount,
     aio_context_release(aio_context);
 
     for (i = 0; i < nb_sns; i++) {
-        if (bdrv_all_find_snapshot(sn_tab[i].name, &bs) == 0) {
+        rv = bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL);
+        if (rv < 0) {
+            goto fail;
+        }
+        if (rv == 1) {
             if (sn_tab[i].icount != -1ULL
                 && sn_tab[i].icount <= icount
                 && (!nearest || nearest->icount < sn_tab[i].icount)) {
@@ -192,7 +196,7 @@ static void replay_seek(int64_t icount, QEMUTimerCB callback, Error **errp)
         if (icount < replay_get_current_icount()
             || replay_get_current_icount() < snapshot_icount) {
             vm_stop(RUN_STATE_RESTORE_VM);
-            load_snapshot(snapshot, errp);
+            load_snapshot(snapshot, NULL, false, NULL, errp);
         }
         g_free(snapshot);
     }
@@ -323,7 +327,7 @@ void replay_gdb_attached(void)
      */
     if (replay_mode == REPLAY_MODE_PLAY
         && !replay_snapshot) {
-        if (save_snapshot("start_debugging", NULL) != 0) {
+        if (!save_snapshot("start_debugging", true, NULL, false, NULL, NULL)) {
            /* Can't create the snapshot. Continue conventional debugging.
*/ } } diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c index e26fa4c892..e8767a1937 100644 --- a/replay/replay-snapshot.c +++ b/replay/replay-snapshot.c @@ -77,13 +77,14 @@ void replay_vmstate_init(void) if (replay_snapshot) { if (replay_mode == REPLAY_MODE_RECORD) { - if (save_snapshot(replay_snapshot, &err) != 0) { + if (!save_snapshot(replay_snapshot, + true, NULL, false, NULL, &err)) { error_report_err(err); error_report("Could not create snapshot for icount record"); exit(1); } } else if (replay_mode == REPLAY_MODE_PLAY) { - if (load_snapshot(replay_snapshot, &err) != 0) { + if (!load_snapshot(replay_snapshot, NULL, false, NULL, &err)) { error_report_err(err); error_report("Could not load snapshot for icount replay"); exit(1); diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 271f5ff42a..e5499b94b4 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -1377,7 +1377,7 @@ sub vcs_exists { warn("$P: No supported VCS found. Add --nogit to options?\n"); warn("Using a git repository produces better results.\n"); warn("Try latest git repository using:\n"); - warn("git clone https://git.qemu.org/git/qemu.git\n"); + warn("git clone https://gitlab.com/qemu-project/qemu.git\n"); $printed_novcs = 1; } return 0; diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py index 25ee6887cf..cbbcba100d 100644 --- a/scripts/mtest2make.py +++ b/scripts/mtest2make.py @@ -110,6 +110,7 @@ def emit_suite(name, suite, prefix): print('ifneq ($(filter %s %s, $(MAKECMDGOALS)),)' % (target, prefix)) print('.tests += $(.test.$(SPEED).%s)' % (target, )) print('endif') + print('all-%s-targets += %s' % (prefix, target)) targets = {t['id']: [os.path.relpath(f) for f in t['filename']] for t in introspect['targets']} diff --git a/scripts/oss-fuzz/minimize_qtest_trace.py b/scripts/oss-fuzz/minimize_qtest_trace.py index 4cba96dee2..20825768c2 100755 --- a/scripts/oss-fuzz/minimize_qtest_trace.py +++ b/scripts/oss-fuzz/minimize_qtest_trace.py @@ -261,7 +261,7 @@ def clear_bits(newtrace, outpath): data_try = hex(int("".join(data_bin_list), 2)) # It seems qtest only accepts padded hex-values. 
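        # hex() does not zero-pad, so insert a "0" after the "0x" prefix when the digit count is odd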
if len(data_try) % 2 == 1: - data_try = data_try[:2] + "0" + data_try[2:-1] + data_try = data_try[:2] + "0" + data_try[2:] newtrace[i] = "{prefix} {data_try}\n".format( prefix=prefix, diff --git a/scripts/qapi/commands.py b/scripts/qapi/commands.py index 50978090b4..54af519f44 100644 --- a/scripts/qapi/commands.py +++ b/scripts/qapi/commands.py @@ -23,7 +23,6 @@ from typing import ( from .common import c_name, mcgen from .gen import ( QAPIGenC, - QAPIGenCCode, QAPISchemaModularCVisitor, build_params, ifcontext, @@ -126,6 +125,9 @@ def gen_marshal(name: str, boxed: bool, ret_type: Optional[QAPISchemaType]) -> str: have_args = boxed or (arg_type and not arg_type.is_empty()) + if have_args: + assert arg_type is not None + arg_type_c_name = arg_type.c_name() ret = mcgen(''' @@ -147,7 +149,7 @@ def gen_marshal(name: str, ret += mcgen(''' %(c_name)s arg = {0}; ''', - c_name=arg_type.c_name()) + c_name=arg_type_c_name) ret += mcgen(''' @@ -163,7 +165,7 @@ def gen_marshal(name: str, ok = visit_check_struct(v, errp); } ''', - c_arg_type=arg_type.c_name()) + c_arg_type=arg_type_c_name) else: ret += mcgen(''' ok = visit_check_struct(v, errp); @@ -193,7 +195,7 @@ out: ret += mcgen(''' visit_type_%(c_arg_type)s_members(v, &arg, NULL); ''', - c_arg_type=arg_type.c_name()) + c_arg_type=arg_type_c_name) ret += mcgen(''' visit_end_struct(v, NULL); @@ -234,28 +236,11 @@ def gen_register_command(name: str, return ret -def gen_registry(registry: str, prefix: str) -> str: - ret = mcgen(''' - -void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds) -{ - QTAILQ_INIT(cmds); - -''', - c_prefix=c_name(prefix, protect=False)) - ret += registry - ret += mcgen(''' -} -''') - return ret - - class QAPISchemaGenCommandVisitor(QAPISchemaModularCVisitor): def __init__(self, prefix: str): super().__init__( prefix, 'qapi-commands', ' * Schema-defined QAPI/QMP commands', None, __doc__) - self._regy = QAPIGenCCode(None) self._visited_ret_types: Dict[QAPIGenC, Set[QAPISchemaType]] = {} def _begin_user_module(self, name: str) -> None: @@ -282,25 +267,36 @@ class QAPISchemaGenCommandVisitor(QAPISchemaModularCVisitor): ''', types=types)) - def visit_end(self) -> None: - self._add_system_module('init', ' * QAPI Commands initialization') + def visit_begin(self, schema: QAPISchema) -> None: + self._add_module('./init', ' * QAPI Commands initialization') self._genh.add(mcgen(''' #include "qapi/qmp/dispatch.h" void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds); ''', c_prefix=c_name(self._prefix, protect=False))) - self._genc.preamble_add(mcgen(''' + self._genc.add(mcgen(''' #include "qemu/osdep.h" #include "%(prefix)sqapi-commands.h" #include "%(prefix)sqapi-init-commands.h" + +void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds) +{ + QTAILQ_INIT(cmds); + ''', - prefix=self._prefix)) - self._genc.add(gen_registry(self._regy.get_content(), self._prefix)) + prefix=self._prefix, + c_prefix=c_name(self._prefix, protect=False))) + + def visit_end(self) -> None: + with self._temp_module('./init'): + self._genc.add(mcgen(''' +} +''')) def visit_command(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], arg_type: Optional[QAPISchemaObjectType], @@ -321,15 +317,17 @@ void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds); if ret_type and ret_type not in self._visited_ret_types[self._genc]: self._visited_ret_types[self._genc].add(ret_type) with ifcontext(ret_type.ifcond, - self._genh, self._genc, self._regy): + self._genh, self._genc): 
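                # each return type's output marshaller is emitted at most once per module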
self._genc.add(gen_marshal_output(ret_type)) - with ifcontext(ifcond, self._genh, self._genc, self._regy): + with ifcontext(ifcond, self._genh, self._genc): self._genh.add(gen_command_decl(name, arg_type, boxed, ret_type)) self._genh.add(gen_marshal_decl(name)) self._genc.add(gen_marshal(name, arg_type, boxed, ret_type)) - self._regy.add(gen_register_command(name, success_response, - allow_oob, allow_preconfig, - coroutine)) + with self._temp_module('./init'): + with ifcontext(ifcond, self._genh, self._genc): + self._genc.add(gen_register_command(name, success_response, + allow_oob, allow_preconfig, + coroutine)) def gen_commands(schema: QAPISchema, diff --git a/scripts/qapi/events.py b/scripts/qapi/events.py index 599f3d1f56..8c57deb2b8 100644 --- a/scripts/qapi/events.py +++ b/scripts/qapi/events.py @@ -12,7 +12,7 @@ This work is licensed under the terms of the GNU GPL, version 2. See the COPYING file in the top-level directory. """ -from typing import List +from typing import List, Optional from .common import c_enum_const, c_name, mcgen from .gen import QAPISchemaModularCVisitor, build_params, ifcontext @@ -27,7 +27,7 @@ from .types import gen_enum, gen_enum_lookup def build_event_send_proto(name: str, - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool) -> str: return 'void qapi_event_send_%(c_name)s(%(param)s)' % { 'c_name': c_name(name.lower()), @@ -35,7 +35,7 @@ def build_event_send_proto(name: str, def gen_event_send_decl(name: str, - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool) -> str: return mcgen(''' @@ -78,7 +78,7 @@ def gen_param_var(typ: QAPISchemaObjectType) -> str: def gen_event_send(name: str, - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool, event_enum_name: str, event_emit: str) -> str: @@ -99,6 +99,7 @@ def gen_event_send(name: str, proto=build_event_send_proto(name, arg_type, boxed)) if have_args: + assert arg_type is not None ret += mcgen(''' QObject *obj; Visitor *v; @@ -114,6 +115,7 @@ def gen_event_send(name: str, name=name) if have_args: + assert arg_type is not None ret += mcgen(''' v = qobject_output_visitor_new(&obj); ''') @@ -189,7 +191,7 @@ class QAPISchemaGenEventVisitor(QAPISchemaModularCVisitor): types=types)) def visit_end(self) -> None: - self._add_system_module('emit', ' * QAPI Events emission') + self._add_module('./emit', ' * QAPI Events emission') self._genc.preamble_add(mcgen(''' #include "qemu/osdep.h" #include "%(prefix)sqapi-emit-events.h" @@ -211,10 +213,10 @@ void %(event_emit)s(%(event_enum)s event, QDict *qdict); def visit_event(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], - arg_type: QAPISchemaObjectType, + arg_type: Optional[QAPISchemaObjectType], boxed: bool) -> None: with ifcontext(ifcond, self._genh, self._genc): self._genh.add(gen_event_send_decl(name, arg_type, boxed)) diff --git a/scripts/qapi/gen.py b/scripts/qapi/gen.py index b40f18eee3..63549cc8d4 100644 --- a/scripts/qapi/gen.py +++ b/scripts/qapi/gen.py @@ -31,12 +31,16 @@ from .common import ( guardstart, mcgen, ) -from .schema import QAPISchemaObjectType, QAPISchemaVisitor +from .schema import ( + QAPISchemaModule, + QAPISchemaObjectType, + QAPISchemaVisitor, +) from .source import QAPISourceInfo class QAPIGen: - def __init__(self, fname: Optional[str]): + def __init__(self, fname: str): self.fname = fname self._preamble = '' self._body = '' @@ -121,7 +125,7 @@ def 
build_params(arg_type: Optional[QAPISchemaObjectType], class QAPIGenCCode(QAPIGen): - def __init__(self, fname: Optional[str]): + def __init__(self, fname: str): super().__init__(fname) self._start_if: Optional[Tuple[List[str], str, str]] = None @@ -130,15 +134,12 @@ class QAPIGenCCode(QAPIGen): self._start_if = (ifcond, self._body, self._preamble) def end_if(self) -> None: - assert self._start_if - self._wrap_ifcond() - self._start_if = None - - def _wrap_ifcond(self) -> None: + assert self._start_if is not None self._body = _wrap_ifcond(self._start_if[0], self._start_if[1], self._body) self._preamble = _wrap_ifcond(self._start_if[0], self._start_if[2], self._preamble) + self._start_if = None def get_content(self) -> str: assert self._start_if is None @@ -243,85 +244,88 @@ class QAPISchemaModularCVisitor(QAPISchemaVisitor): self._user_blurb = user_blurb self._builtin_blurb = builtin_blurb self._pydoc = pydoc - self._genc: Optional[QAPIGenC] = None - self._genh: Optional[QAPIGenH] = None - self._module: Dict[Optional[str], Tuple[QAPIGenC, QAPIGenH]] = {} + self._current_module: Optional[str] = None + self._module: Dict[str, Tuple[QAPIGenC, QAPIGenH]] = {} self._main_module: Optional[str] = None - @staticmethod - def _is_user_module(name: Optional[str]) -> bool: - return bool(name and not name.startswith('./')) + @property + def _genc(self) -> QAPIGenC: + assert self._current_module is not None + return self._module[self._current_module][0] - @staticmethod - def _is_builtin_module(name: Optional[str]) -> bool: - return not name + @property + def _genh(self) -> QAPIGenH: + assert self._current_module is not None + return self._module[self._current_module][1] - def _module_dirname(self, name: Optional[str]) -> str: - if self._is_user_module(name): + @staticmethod + def _module_dirname(name: str) -> str: + if QAPISchemaModule.is_user_module(name): return os.path.dirname(name) return '' - def _module_basename(self, what: str, name: Optional[str]) -> str: - ret = '' if self._is_builtin_module(name) else self._prefix - if self._is_user_module(name): + def _module_basename(self, what: str, name: str) -> str: + ret = '' if QAPISchemaModule.is_builtin_module(name) else self._prefix + if QAPISchemaModule.is_user_module(name): basename = os.path.basename(name) ret += what if name != self._main_module: ret += '-' + os.path.splitext(basename)[0] else: - name = name[2:] if name else 'builtin' - ret += re.sub(r'-', '-' + name + '-', what) + assert QAPISchemaModule.is_system_module(name) + ret += re.sub(r'-', '-' + name[2:] + '-', what) return ret - def _module_filename(self, what: str, name: Optional[str]) -> str: + def _module_filename(self, what: str, name: str) -> str: return os.path.join(self._module_dirname(name), self._module_basename(what, name)) - def _add_module(self, name: Optional[str], blurb: str) -> None: + def _add_module(self, name: str, blurb: str) -> None: + if QAPISchemaModule.is_user_module(name): + if self._main_module is None: + self._main_module = name basename = self._module_filename(self._what, name) genc = QAPIGenC(basename + '.c', blurb, self._pydoc) genh = QAPIGenH(basename + '.h', blurb, self._pydoc) self._module[name] = (genc, genh) - self._genc, self._genh = self._module[name] - - def _add_user_module(self, name: str, blurb: str) -> None: - assert self._is_user_module(name) - if self._main_module is None: - self._main_module = name - self._add_module(name, blurb) + self._current_module = name - def _add_system_module(self, name: Optional[str], blurb: str) -> None: - 
self._add_module(name and './' + name, blurb) + @contextmanager + def _temp_module(self, name: str) -> Iterator[None]: + old_module = self._current_module + self._current_module = name + yield + self._current_module = old_module def write(self, output_dir: str, opt_builtins: bool = False) -> None: for name in self._module: - if self._is_builtin_module(name) and not opt_builtins: + if QAPISchemaModule.is_builtin_module(name) and not opt_builtins: continue (genc, genh) = self._module[name] genc.write(output_dir) genh.write(output_dir) - def _begin_system_module(self, name: None) -> None: + def _begin_builtin_module(self) -> None: pass def _begin_user_module(self, name: str) -> None: pass - def visit_module(self, name: Optional[str]) -> None: - if name is None: + def visit_module(self, name: str) -> None: + if QAPISchemaModule.is_builtin_module(name): if self._builtin_blurb: - self._add_system_module(None, self._builtin_blurb) - self._begin_system_module(name) + self._add_module(name, self._builtin_blurb) + self._begin_builtin_module() else: # The built-in module has not been created. No code may # be generated. - self._genc = None - self._genh = None + self._current_module = None else: - self._add_user_module(name, self._user_blurb) + assert QAPISchemaModule.is_user_module(name) + self._add_module(name, self._user_blurb) self._begin_user_module(name) - def visit_include(self, name: str, info: QAPISourceInfo) -> None: + def visit_include(self, name: str, info: Optional[QAPISourceInfo]) -> None: relname = os.path.relpath(self._module_filename(self._what, name), os.path.dirname(self._genh.fname)) self._genh.preamble_add(mcgen(''' diff --git a/scripts/qapi/main.py b/scripts/qapi/main.py index 42517210b8..703e7ed1ed 100644 --- a/scripts/qapi/main.py +++ b/scripts/qapi/main.py @@ -23,6 +23,8 @@ from .visit import gen_visit def invalid_prefix_char(prefix: str) -> Optional[str]: match = re.match(r'([A-Za-z_.-][A-Za-z0-9_.-]*)?', prefix) + # match cannot be None, but mypy cannot infer that. + assert match is not None if match.end() != len(prefix): return prefix[match.end()] return None diff --git a/scripts/qapi/mypy.ini b/scripts/qapi/mypy.ini index 74fc6c8215..04bd5db527 100644 --- a/scripts/qapi/mypy.ini +++ b/scripts/qapi/mypy.ini @@ -1,6 +1,5 @@ [mypy] strict = True -strict_optional = False disallow_untyped_calls = False python_version = 3.6 diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py index 720449feee..353e8020a2 100644 --- a/scripts/qapi/schema.py +++ b/scripts/qapi/schema.py @@ -68,7 +68,8 @@ class QAPISchemaEntity: def _set_module(self, schema, info): assert self._checked - self._module = schema.module_by_fname(info and info.fname) + fname = info.fname if info else QAPISchemaModule.BUILTIN_MODULE_NAME + self._module = schema.module_by_fname(fname) self._module.add_entity(self) def set_module(self, schema): @@ -137,10 +138,40 @@ class QAPISchemaVisitor: class QAPISchemaModule: + + BUILTIN_MODULE_NAME = './builtin' + def __init__(self, name): self.name = name self._entity_list = [] + @staticmethod + def is_system_module(name: str) -> bool: + """ + System modules are internally defined modules. + + Their names start with the "./" prefix. + """ + return name.startswith('./') + + @classmethod + def is_user_module(cls, name: str) -> bool: + """ + User modules are those defined by the user in qapi JSON files. + + They do not start with the "./" prefix. 
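+        Their names are file paths relative to the main schema file.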
+ """ + return not cls.is_system_module(name) + + @classmethod + def is_builtin_module(cls, name: str) -> bool: + """ + The built-in module is a single System module for the built-in types. + + It is always "./builtin". + """ + return name == cls.BUILTIN_MODULE_NAME + def add_entity(self, ent): self._entity_list.append(ent) @@ -825,7 +856,7 @@ class QAPISchema: self._entity_dict = {} self._module_dict = OrderedDict() self._schema_dir = os.path.dirname(fname) - self._make_module(None) # built-ins + self._make_module(QAPISchemaModule.BUILTIN_MODULE_NAME) self._make_module(fname) self._predefining = True self._def_predefineds() @@ -870,9 +901,9 @@ class QAPISchema: info, "%s uses unknown type '%s'" % (what, name)) return typ - def _module_name(self, fname): - if fname is None: - return None + def _module_name(self, fname: str) -> str: + if QAPISchemaModule.is_system_module(fname): + return fname return os.path.relpath(fname, self._schema_dir) def _make_module(self, fname): @@ -883,7 +914,6 @@ class QAPISchema: def module_by_fname(self, fname): name = self._module_name(fname) - assert name in self._module_dict return self._module_dict[name] def _def_include(self, expr, info, doc): diff --git a/scripts/qapi/types.py b/scripts/qapi/types.py index 2b4916cdaa..2bdd626847 100644 --- a/scripts/qapi/types.py +++ b/scripts/qapi/types.py @@ -271,7 +271,7 @@ class QAPISchemaGenTypeVisitor(QAPISchemaModularCVisitor): prefix, 'qapi-types', ' * Schema-defined QAPI types', ' * Built-in QAPI types', __doc__) - def _begin_system_module(self, name: None) -> None: + def _begin_builtin_module(self) -> None: self._genc.preamble_add(mcgen(''' #include "qemu/osdep.h" #include "qapi/dealloc-visitor.h" @@ -350,7 +350,7 @@ class QAPISchemaGenTypeVisitor(QAPISchemaModularCVisitor): def visit_alternate_type(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], variants: QAPISchemaVariants) -> None: diff --git a/scripts/qapi/visit.py b/scripts/qapi/visit.py index 339f152152..22e62df901 100644 --- a/scripts/qapi/visit.py +++ b/scripts/qapi/visit.py @@ -305,7 +305,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor): prefix, 'qapi-visit', ' * Schema-defined QAPI visitors', ' * Built-in QAPI visitors', __doc__) - def _begin_system_module(self, name: None) -> None: + def _begin_builtin_module(self) -> None: self._genc.preamble_add(mcgen(''' #include "qemu/osdep.h" #include "qapi/error.h" @@ -336,7 +336,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor): def visit_enum_type(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], members: List[QAPISchemaEnumMember], @@ -378,7 +378,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor): def visit_alternate_type(self, name: str, - info: QAPISourceInfo, + info: Optional[QAPISourceInfo], ifcond: List[str], features: List[QAPISchemaFeature], variants: QAPISchemaVariants) -> None: diff --git a/scripts/userfaultfd-wrlat.py b/scripts/userfaultfd-wrlat.py new file mode 100755 index 0000000000..0684be4e04 --- /dev/null +++ b/scripts/userfaultfd-wrlat.py @@ -0,0 +1,122 @@ +#!/usr/bin/python3 +# +# userfaultfd-wrlat Summarize userfaultfd write fault latencies. +# Events are continuously accumulated for the +# run, while latency distribution histogram is +# dumped each 'interval' seconds. +# +# For Linux, uses BCC, eBPF. 
+#
+# USAGE: userfaultfd-wrlat [interval [count]]
+#
+# Copyright Virtuozzo GmbH, 2020
+#
+# Authors:
+#  Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+
+def usage():
+    print("USAGE: %s [interval [count]]" % argv[0])
+    exit()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/mm.h>
+
+BPF_HASH(ev_start, u32, u64);
+BPF_HISTOGRAM(ev_delta_hist, u64);
+
+/* Trace UFFD page fault start event. */
+static void do_event_start()
+{
+    /* Using "(u32)" to drop group ID which is the upper 32 bits */
+    u32 tid = (u32) bpf_get_current_pid_tgid();
+    u64 ts = bpf_ktime_get_ns();
+
+    ev_start.update(&tid, &ts);
+}
+
+/* Trace UFFD page fault end event. */
+static void do_event_end()
+{
+    /* Using "(u32)" to drop group ID which is the upper 32 bits */
+    u32 tid = (u32) bpf_get_current_pid_tgid();
+    u64 ts = bpf_ktime_get_ns();
+    u64 *tsp;
+
+    tsp = ev_start.lookup(&tid);
+    if (tsp) {
+        u64 delta = ts - (*tsp);
+        /* Transform time delta to milliseconds */
+        ev_delta_hist.increment(bpf_log2l(delta / 1000000));
+        ev_start.delete(&tid);
+    }
+}
+
+/* KPROBE for handle_userfault(). */
+int probe_handle_userfault(struct pt_regs *ctx, struct vm_fault *vmf,
+        unsigned long reason)
+{
+    /* Trace only UFFD write faults. */
+    if (reason & VM_UFFD_WP) {
+        do_event_start();
+    }
+    return 0;
+}
+
+/* KRETPROBE for handle_userfault(). */
+int retprobe_handle_userfault(struct pt_regs *ctx)
+{
+    do_event_end();
+    return 0;
+}
+"""
+
+# arguments
+interval = 10
+count = -1
+if len(argv) > 1:
+    try:
+        interval = int(argv[1])
+        if interval == 0:
+            raise
+        if len(argv) > 2:
+            count = int(argv[2])
+    except:    # also catches -h, --help
+        usage()
+
+# load BPF program
+b = BPF(text=bpf_text)
+# attach KPROBEs
+b.attach_kprobe(event="handle_userfault", fn_name="probe_handle_userfault")
+b.attach_kretprobe(event="handle_userfault", fn_name="retprobe_handle_userfault")
+
+# header
+print("Tracing UFFD-WP write fault latency... Hit Ctrl-C to end.")
+
+# output
+loop = 0
+do_exit = 0
+while (1):
+    if count > 0:
+        loop += 1
+        if loop > count:
+            exit()
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        do_exit = 1
+
+    print()
+    b["ev_delta_hist"].print_log2_hist("msecs")
+    if do_exit:
+        exit()
diff --git a/softmmu/cpu-throttle.c b/softmmu/cpu-throttle.c
index 2ec4b8e0bc..8c2144ab95 100644
--- a/softmmu/cpu-throttle.c
+++ b/softmmu/cpu-throttle.c
@@ -90,14 +90,21 @@ static void cpu_throttle_timer_tick(void *opaque)
 
 void cpu_throttle_set(int new_throttle_pct)
 {
+    /*
+     * boolean to store whether throttle is already active or not,
+     * before modifying throttle_percentage
+     */
+    bool throttle_active = cpu_throttle_active();
+
     /* Ensure throttle percentage is within valid range */
     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 
     qatomic_set(&throttle_percentage, new_throttle_pct);
 
-    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
-                                       CPU_THROTTLE_TIMESLICE_NS);
+    if (!throttle_active) {
+        cpu_throttle_timer_tick(NULL);
+    }
 }
 
 void cpu_throttle_stop(void)
diff --git a/softmmu/memory.c b/softmmu/memory.c
index c0c814fbb9..874a8fccde 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -1440,7 +1440,7 @@ MemTxResult memory_region_dispatch_read(MemoryRegion *mr,
     unsigned size = memop_size(op);
     MemTxResult r;
 
-    fuzz_dma_read_cb(addr, size, mr, false);
+    fuzz_dma_read_cb(addr, size, mr);
     if (!memory_region_access_valid(mr, addr, size, false, attrs)) {
         *pval = unassigned_mem_read(mr, addr, size);
         return MEMTX_DECODE_ERROR;
@@ -1612,6 +1612,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
                                     uint64_t size,
                                     bool share,
                                     int fd,
+                                    ram_addr_t offset,
                                     Error **errp)
 {
     Error *err = NULL;
@@ -1621,7 +1622,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
     mr->destructor = memory_region_destructor_ram;
     mr->ram_block = qemu_ram_alloc_from_fd(size, mr, share ?
RAM_SHARED : 0, - fd, false, &err); + fd, offset, false, &err); if (err) { mr->size = int128_zero(); object_unparent(OBJECT(mr)); @@ -3285,8 +3286,7 @@ void memory_region_init_rom_device(MemoryRegion *mr, #ifdef CONFIG_FUZZ void __attribute__((weak)) fuzz_dma_read_cb(size_t addr, size_t len, - MemoryRegion *mr, - bool is_write) + MemoryRegion *mr) { } #endif diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 243c3097d3..19e0aa9836 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -1543,6 +1543,7 @@ static void *file_ram_alloc(RAMBlock *block, int fd, bool readonly, bool truncate, + off_t offset, Error **errp) { void *area; @@ -1593,7 +1594,8 @@ static void *file_ram_alloc(RAMBlock *block, } area = qemu_ram_mmap(fd, memory, block->mr->align, readonly, - block->flags & RAM_SHARED, block->flags & RAM_PMEM); + block->flags & RAM_SHARED, block->flags & RAM_PMEM, + offset); if (area == MAP_FAILED) { error_setg_errno(errp, errno, "unable to map backing store for guest RAM"); @@ -2024,8 +2026,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared) #ifdef CONFIG_POSIX RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, - uint32_t ram_flags, int fd, bool readonly, - Error **errp) + uint32_t ram_flags, int fd, off_t offset, + bool readonly, Error **errp) { RAMBlock *new_block; Error *local_err = NULL; @@ -2079,7 +2081,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, new_block->max_length = size; new_block->flags = ram_flags; new_block->host = file_ram_alloc(new_block, size, fd, readonly, - !file_size, errp); + !file_size, offset, errp); if (!new_block->host) { g_free(new_block); return NULL; @@ -2110,7 +2112,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, return NULL; } - block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, readonly, errp); + block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, readonly, errp); if (!block) { if (created) { unlink(mem_path); @@ -2839,7 +2841,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, stn_he_p(buf, l, val); } else { /* RAM case */ - fuzz_dma_read_cb(addr, len, mr, false); + fuzz_dma_read_cb(addr, len, mr); ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); memcpy(buf, ram_ptr, l); } @@ -3200,7 +3202,7 @@ void *address_space_map(AddressSpace *as, memory_region_ref(mr); *plen = flatview_extend_translation(fv, addr, len, mr, xlat, l, is_write, attrs); - fuzz_dma_read_cb(addr, *plen, mr, is_write); + fuzz_dma_read_cb(addr, *plen, mr); ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); return ptr; diff --git a/softmmu/rtc.c b/softmmu/rtc.c index e1e15ef613..5632684fc9 100644 --- a/softmmu/rtc.c +++ b/softmmu/rtc.c @@ -179,7 +179,8 @@ void configure_rtc(QemuOpts *opts) if (!strcmp(value, "slew")) { object_register_sugar_prop("mc146818rtc", "lost_tick_policy", - "slew"); + "slew", + false); } else if (!strcmp(value, "none")) { /* discard is default */ } else { diff --git a/softmmu/vl.c b/softmmu/vl.c index 2bf94ece9c..b219ce1f35 100644 --- a/softmmu/vl.c +++ b/softmmu/vl.c @@ -101,6 +101,7 @@ #include "qemu/plugin.h" #include "qemu/queue.h" #include "sysemu/arch_init.h" +#include "exec/confidential-guest-support.h" #include "ui/qemu-spice.h" #include "qapi/string-input-visitor.h" @@ -1663,16 +1664,20 @@ static int machine_set_property(void *opaque, return 0; } if (g_str_equal(qom_name, "igd-passthru")) { - object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), qom_name, value); + 
object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), qom_name, value, + false); return 0; } if (g_str_equal(qom_name, "kvm-shadow-mem")) { - object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value); + object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value, + false); return 0; } if (g_str_equal(qom_name, "kernel-irqchip")) { - object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value); - object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), qom_name, value); + object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value, + false); + object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), qom_name, value, + false); return 0; } @@ -2298,9 +2303,10 @@ static void qemu_process_sugar_options(void) val = g_strdup_printf("%d", (uint32_t) qemu_opt_get_number(qemu_find_opts_singleton("smp-opts"), "cpus", 1)); - object_register_sugar_prop("memory-backend", "prealloc-threads", val); + object_register_sugar_prop("memory-backend", "prealloc-threads", val, + false); g_free(val); - object_register_sugar_prop("memory-backend", "prealloc", "on"); + object_register_sugar_prop("memory-backend", "prealloc", "on", false); } if (watchdog) { @@ -2493,6 +2499,8 @@ static void qemu_create_cli_devices(void) static void qemu_machine_creation_done(void) { + MachineState *machine = MACHINE(qdev_get_machine()); + /* Did we create any drives that we failed to create a device for? */ drive_check_orphaned(); @@ -2512,6 +2520,13 @@ static void qemu_machine_creation_done(void) qdev_machine_creation_done(); + if (machine->cgs) { + /* + * Verify that Confidential Guest Support has actually been initialized + */ + assert(machine->cgs->ready); + } + if (foreach_device_config(DEV_GDB, gdbserver_start) < 0) { exit(1); } @@ -2530,7 +2545,7 @@ void qmp_x_exit_preconfig(Error **errp) if (loadvm) { Error *local_err = NULL; - if (load_snapshot(loadvm, &local_err) < 0) { + if (!load_snapshot(loadvm, NULL, false, NULL, &local_err)) { error_report_err(local_err); autostart = 0; exit(1); diff --git a/stubs/meson.build b/stubs/meson.build index 1a656cd070..a054d5877f 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -53,4 +53,6 @@ endif if have_system stub_ss.add(files('semihost.c')) stub_ss.add(files('xen-hw-stub.c')) +else + stub_ss.add(files('qdev.c')) endif diff --git a/stubs/qdev.c b/stubs/qdev.c new file mode 100644 index 0000000000..92e6143134 --- /dev/null +++ b/stubs/qdev.c @@ -0,0 +1,23 @@ +/* + * QOM stubs + * + * Copyright (c) 2021 Red Hat, Inc. + * + * Author: + * Philippe Mathieu-Daudé <philmd@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/qapi-events-qdev.h" + +void qapi_event_send_device_deleted(bool has_device, + const char *device, + const char *path) +{ + /* Nothing to do. 
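
The vl.c hunk above adds a cheap sanity check: if the machine was configured with a ConfidentialGuestSupport object, machine-creation-done asserts that some accelerator actually initialized it; the ready flag is set by the mechanism-specific init, as the SEV changes further down do. A simplified model of that handshake, with invented minimal types in place of the QOM objects:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    typedef struct ConfidentialGuestSupport {
        bool ready;                      /* set once platform setup ran */
    } ConfidentialGuestSupport;

    typedef struct MachineState {
        ConfidentialGuestSupport *cgs;   /* NULL unless configured */
    } MachineState;

    static int cgs_kvm_init(ConfidentialGuestSupport *cgs)
    {
        if (!cgs) {
            return 0;                    /* nothing configured: a no-op */
        }
        /* ... mechanism-specific setup (e.g. SEV) goes here ... */
        cgs->ready = true;
        return 0;
    }

    static void machine_creation_done(MachineState *ms)
    {
        if (ms->cgs) {
            /* catch a configured but never-initialized mechanism early */
            assert(ms->cgs->ready);
        }
    }
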
*/ +} diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 8ddb2556f8..5cf6c056c5 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -2202,6 +2202,10 @@ static void arm_max_initfn(Object *obj) t = FIELD_DP32(t, ID_MMFR4, CNP, 1); /* TTCNP */ t = FIELD_DP32(t, ID_MMFR4, XNX, 1); /* TTS2UXN */ cpu->isar.id_mmfr4 = t; + + t = cpu->isar.id_pfr0; + t = FIELD_DP32(t, ID_PFR0, DIT, 1); + cpu->isar.id_pfr0 = t; } #endif } diff --git a/target/arm/cpu.h b/target/arm/cpu.h index d080239863..f240275407 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -1243,6 +1243,7 @@ void pmu_init(ARMCPU *cpu); #define CPSR_IT_2_7 (0xfc00U) #define CPSR_GE (0xfU << 16) #define CPSR_IL (1U << 20) +#define CPSR_DIT (1U << 21) #define CPSR_PAN (1U << 22) #define CPSR_J (1U << 24) #define CPSR_IT_0_1 (3U << 25) @@ -1310,6 +1311,7 @@ void pmu_init(ARMCPU *cpu); #define PSTATE_SS (1U << 21) #define PSTATE_PAN (1U << 22) #define PSTATE_UAO (1U << 23) +#define PSTATE_DIT (1U << 24) #define PSTATE_TCO (1U << 25) #define PSTATE_V (1U << 28) #define PSTATE_C (1U << 29) @@ -3876,6 +3878,11 @@ static inline bool isar_feature_aa32_tts2uxn(const ARMISARegisters *id) return FIELD_EX32(id->id_mmfr4, ID_MMFR4, XNX) != 0; } +static inline bool isar_feature_aa32_dit(const ARMISARegisters *id) +{ + return FIELD_EX32(id->id_pfr0, ID_PFR0, DIT) != 0; +} + /* * 64-bit feature tests via id registers. */ @@ -4033,6 +4040,11 @@ static inline bool isar_feature_aa64_aa32(const ARMISARegisters *id) return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, EL0) >= 2; } +static inline bool isar_feature_aa64_aa32_el1(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, EL1) >= 2; +} + static inline bool isar_feature_aa64_sve(const ARMISARegisters *id) { return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, SVE) != 0; @@ -4120,6 +4132,11 @@ static inline bool isar_feature_aa64_tts2uxn(const ARMISARegisters *id) return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, XNX) != 0; } +static inline bool isar_feature_aa64_dit(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, DIT) != 0; +} + /* * Feature tests for "does this exist in either 32-bit or 64-bit?" */ diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 10c5118176..c255f1bcc3 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -669,6 +669,7 @@ static void aarch64_max_initfn(Object *obj) t = FIELD_DP64(t, ID_AA64PFR0, FP, 1); t = FIELD_DP64(t, ID_AA64PFR0, ADVSIMD, 1); t = FIELD_DP64(t, ID_AA64PFR0, SEL2, 1); + t = FIELD_DP64(t, ID_AA64PFR0, DIT, 1); cpu->isar.id_aa64pfr0 = t; t = cpu->isar.id_aa64pfr1; @@ -718,6 +719,10 @@ static void aarch64_max_initfn(Object *obj) u = FIELD_DP32(u, ID_ISAR6, SPECRES, 1); cpu->isar.id_isar6 = u; + u = cpu->isar.id_pfr0; + u = FIELD_DP32(u, ID_PFR0, DIT, 1); + cpu->isar.id_pfr0 = u; + u = cpu->isar.id_mmfr3; u = FIELD_DP32(u, ID_MMFR3, PAN, 2); /* ATS1E1 */ cpu->isar.id_mmfr3 = u; diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c index c426c23d2c..ae611d73c2 100644 --- a/target/arm/helper-a64.c +++ b/target/arm/helper-a64.c @@ -945,11 +945,31 @@ static int el_from_spsr(uint32_t spsr) } } +static void cpsr_write_from_spsr_elx(CPUARMState *env, + uint32_t val) +{ + uint32_t mask; + + /* Save SPSR_ELx.SS into PSTATE. 
*/ + env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS); + val &= ~PSTATE_SS; + + /* Move DIT to the correct location for CPSR */ + if (val & PSTATE_DIT) { + val &= ~PSTATE_DIT; + val |= CPSR_DIT; + } + + mask = aarch32_cpsr_valid_mask(env->features, + &env_archcpu(env)->isar); + cpsr_write(env, val, mask, CPSRWriteRaw); +} + void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) { int cur_el = arm_current_el(env); unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el); - uint32_t mask, spsr = env->banked_spsr[spsr_idx]; + uint32_t spsr = env->banked_spsr[spsr_idx]; int new_el; bool return_to_aa64 = (spsr & PSTATE_nRW) == 0; @@ -998,10 +1018,9 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc) * will sort the register banks out for us, and we've already * caught all the bad-mode cases in el_from_spsr(). */ - mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar); - cpsr_write(env, spsr, mask, CPSRWriteRaw); + cpsr_write_from_spsr_elx(env, spsr); if (!arm_singlestep_active(env)) { - env->uncached_cpsr &= ~PSTATE_SS; + env->pstate &= ~PSTATE_SS; } aarch64_sync_64_to_32(env); diff --git a/target/arm/helper.c b/target/arm/helper.c index 1a64bd748c..0e1a3b9421 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -38,6 +38,7 @@ #endif #define ARM_CPU_FREQ 1000000000 /* FIXME: 1 GHz, should be configurable */ +#define PMCR_NUM_COUNTERS 4 /* QEMU IMPDEF choice */ #ifndef CONFIG_USER_ONLY @@ -2024,7 +2025,10 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) ARMCPU *cpu = env_archcpu(env); if (ri->state == ARM_CP_STATE_AA64) { - value |= SCR_FW | SCR_AW; /* these two bits are RES1. */ + if (arm_feature(env, ARM_FEATURE_AARCH64) && + !cpu_isar_feature(aa64_aa32_el1, cpu)) { + value |= SCR_FW | SCR_AW; /* these two bits are RES1. */ + } valid_mask &= ~SCR_NET; if (cpu_isar_feature(aa64_lor, cpu)) { @@ -2063,6 +2067,15 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) raw_write(env, ri, value); } +static void scr_reset(CPUARMState *env, const ARMCPRegInfo *ri) +{ + /* + * scr_write will set the RES1 bits on an AArch64-only CPU. + * The reset value will be 0x30 on an AArch64-only CPU and 0 otherwise.
+ */ + scr_write(env, ri, 0); +} + static CPAccessResult access_aa64_tid2(CPUARMState *env, const ARMCPRegInfo *ri, bool isread) @@ -4419,6 +4432,24 @@ static const ARMCPRegInfo uao_reginfo = { .readfn = aa64_uao_read, .writefn = aa64_uao_write }; +static uint64_t aa64_dit_read(CPUARMState *env, const ARMCPRegInfo *ri) +{ + return env->pstate & PSTATE_DIT; +} + +static void aa64_dit_write(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ + env->pstate = (env->pstate & ~PSTATE_DIT) | (value & PSTATE_DIT); +} + +static const ARMCPRegInfo dit_reginfo = { + .name = "DIT", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 3, .crn = 4, .crm = 2, .opc2 = 5, + .type = ARM_CP_NO_RAW, .access = PL0_RW, + .readfn = aa64_dit_read, .writefn = aa64_dit_write +}; + static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env, const ARMCPRegInfo *ri, bool isread) @@ -5705,13 +5736,11 @@ static const ARMCPRegInfo el2_cp_reginfo[] = { .writefn = gt_hyp_ctl_write, .raw_writefn = raw_write }, #endif /* The only field of MDCR_EL2 that has a defined architectural reset value - * is MDCR_EL2.HPMN which should reset to the value of PMCR_EL0.N; but we - * don't implement any PMU event counters, so using zero as a reset - * value for MDCR_EL2 is okay + * is MDCR_EL2.HPMN which should reset to the value of PMCR_EL0.N. */ { .name = "MDCR_EL2", .state = ARM_CP_STATE_BOTH, .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 1, .opc2 = 1, - .access = PL2_RW, .resetvalue = 0, + .access = PL2_RW, .resetvalue = PMCR_NUM_COUNTERS, .fieldoffset = offsetof(CPUARMState, cp15.mdcr_el2), }, { .name = "HPFAR", .state = ARM_CP_STATE_AA32, .cp = 15, .opc1 = 4, .crn = 6, .crm = 0, .opc2 = 4, @@ -5785,7 +5814,7 @@ static const ARMCPRegInfo el3_cp_reginfo[] = { { .name = "SCR_EL3", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 1, .opc2 = 0, .access = PL3_RW, .fieldoffset = offsetof(CPUARMState, cp15.scr_el3), - .resetvalue = 0, .writefn = scr_write }, + .resetfn = scr_reset, .writefn = scr_write }, { .name = "SCR", .type = ARM_CP_ALIAS | ARM_CP_NEWEL, .cp = 15, .opc1 = 0, .crn = 1, .crm = 1, .opc2 = 0, .access = PL1_RW, .accessfn = access_trap_aa32s_el1, @@ -6642,7 +6671,7 @@ static void define_pmu_regs(ARMCPU *cpu) * field as main ID register, and we implement four counters in * addition to the cycle count register. */ - unsigned int i, pmcrn = 4; + unsigned int i, pmcrn = PMCR_NUM_COUNTERS; ARMCPRegInfo pmcr = { .name = "PMCR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 0, .access = PL0_RW, @@ -8212,6 +8241,10 @@ void register_cp_regs_for_features(ARMCPU *cpu) define_one_arm_cp_reg(cpu, &uao_reginfo); } + if (cpu_isar_feature(aa64_dit, cpu)) { + define_one_arm_cp_reg(cpu, &dit_reginfo); + } + if (arm_feature(env, ARM_FEATURE_EL2) && cpu_isar_feature(aa64_vh, cpu)) { define_arm_cp_regs(cpu, vhe_reginfo); } @@ -9411,7 +9444,7 @@ static void take_aarch32_exception(CPUARMState *env, int new_mode, * For exceptions taken to AArch32 we must clear the SS bit in both * PSTATE and in the old-state value we save to SPSR_<mode>, so zero it now. */ - env->uncached_cpsr &= ~PSTATE_SS; + env->pstate &= ~PSTATE_SS; env->spsr = cpsr_read(env); /* Clear IT bits. 
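
dit_reginfo above follows a common pattern for PSTATE-backed system registers: the register is declared ARM_CP_NO_RAW, so it has no storage of its own, and MRS/MSR traffic is routed through read/write callbacks that expose exactly one bit of env->pstate. A stripped-down sketch of that accessor pair, using a simplified CPU state type rather than QEMU's CPUARMState:

    #include <stdint.h>

    #define PSTATE_DIT (1u << 24)

    typedef struct CPUStateSketch {
        uint32_t pstate;
    } CPUStateSketch;

    /* MRS-style read: only the DIT bit is architecturally visible */
    static uint64_t dit_read(CPUStateSketch *env)
    {
        return env->pstate & PSTATE_DIT;
    }

    /* MSR-style write: replace the one bit, leave the rest of PSTATE alone */
    static void dit_write(CPUStateSketch *env, uint64_t value)
    {
        env->pstate = (env->pstate & ~PSTATE_DIT)
                      | (uint32_t)(value & PSTATE_DIT);
    }

Because DIT has no effect on QEMU's emulation (all emulated instructions are effectively data-independent in timing), the register is purely architectural state, which is why the access is PL0_RW with no side effects.
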
*/ env->condexec_bits = 0; @@ -9767,6 +9800,21 @@ static int aarch64_regnum(CPUARMState *env, int aarch32_reg) } } +static uint32_t cpsr_read_for_spsr_elx(CPUARMState *env) +{ + uint32_t ret = cpsr_read(env); + + /* Move DIT to the correct location for SPSR_ELx */ + if (ret & CPSR_DIT) { + ret &= ~CPSR_DIT; + ret |= PSTATE_DIT; + } + /* Merge PSTATE.SS into SPSR_ELx */ + ret |= env->pstate & PSTATE_SS; + + return ret; +} + /* Handle exception entry to a target EL which is using AArch64 */ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) { @@ -9889,7 +9937,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) aarch64_save_sp(env, arm_current_el(env)); env->elr_el[new_el] = env->pc; } else { - old_mode = cpsr_read(env); + old_mode = cpsr_read_for_spsr_elx(env); env->elr_el[new_el] = env->regs[15]; aarch64_sync_32_to_64(env); @@ -13183,7 +13231,6 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, target_ulong *cs_base, uint32_t *pflags) { uint32_t flags = env->hflags; - uint32_t pstate_for_ss; *cs_base = 0; assert_hflags_rebuild_correctly(env); @@ -13193,7 +13240,6 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, if (cpu_isar_feature(aa64_bti, env_archcpu(env))) { flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype); } - pstate_for_ss = env->pstate; } else { *pc = env->regs[15]; @@ -13241,7 +13287,6 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, flags = FIELD_DP32(flags, TBFLAG_AM32, THUMB, env->thumb); flags = FIELD_DP32(flags, TBFLAG_AM32, CONDEXEC, env->condexec_bits); - pstate_for_ss = env->uncached_cpsr; } /* @@ -13254,7 +13299,7 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, * SS_ACTIVE is set in hflags; PSTATE_SS is computed every TB. */ if (FIELD_EX32(flags, TBFLAG_ANY, SS_ACTIVE) && - (pstate_for_ss & PSTATE_SS)) { + (env->pstate & PSTATE_SS)) { flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1); } diff --git a/target/arm/internals.h b/target/arm/internals.h index 448982dd2f..b251fe4450 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1228,6 +1228,9 @@ static inline uint32_t aarch32_cpsr_valid_mask(uint64_t features, if (isar_feature_aa32_pan(id)) { valid |= CPSR_PAN; } + if (isar_feature_aa32_dit(id)) { + valid |= CPSR_DIT; + } return valid; } @@ -1246,6 +1249,9 @@ static inline uint32_t aarch64_pstate_valid_mask(const ARMISARegisters *id) if (isar_feature_aa64_uao(id)) { valid |= PSTATE_UAO; } + if (isar_feature_aa64_dit(id)) { + valid |= PSTATE_DIT; + } if (isar_feature_aa64_mte(id)) { valid |= PSTATE_TCO; } diff --git a/target/arm/machine.c b/target/arm/machine.c index 581852bc53..6ad1d306b1 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -810,7 +810,7 @@ const VMStateDescription vmstate_arm_cpu = { VMSTATE_UINT64(env.exclusive_addr, ARMCPU), VMSTATE_UINT64(env.exclusive_val, ARMCPU), VMSTATE_UINT64(env.exclusive_high, ARMCPU), - VMSTATE_UINT64(env.features, ARMCPU), + VMSTATE_UNUSED(sizeof(uint64_t)), VMSTATE_UINT32(env.exception.syndrome, ARMCPU), VMSTATE_UINT32(env.exception.fsr, ARMCPU), VMSTATE_UINT64(env.exception.vaddress, ARMCPU), diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c index 5e0f123043..65cb37d088 100644 --- a/target/arm/op_helper.c +++ b/target/arm/op_helper.c @@ -389,14 +389,7 @@ void HELPER(exception_bkpt_insn)(CPUARMState *env, uint32_t syndrome) uint32_t HELPER(cpsr_read)(CPUARMState *env) { - /* - * We store the ARMv8 PSTATE.SS bit in env->uncached_cpsr. 
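
cpsr_read_for_spsr_elx() here and its counterpart cpsr_write_from_spsr_elx() earlier in the patch exist because DIT sits at bit 21 in the AArch32 CPSR but at bit 24 in the AArch64 SPSR_ELx/PSTATE layout, so the bit must be relocated (and PSTATE.SS merged or extracted) whenever one format is converted into the other. The bit shuffle in isolation, as a small sketch reusing the CPSR_DIT/PSTATE_DIT values defined above:

    #include <stdint.h>

    #define CPSR_DIT   (1u << 21)   /* AArch32 CPSR layout */
    #define PSTATE_DIT (1u << 24)   /* AArch64 SPSR_ELx/PSTATE layout */

    static uint32_t spsr_elx_from_cpsr(uint32_t cpsr)
    {
        uint32_t spsr = cpsr & ~CPSR_DIT;

        if (cpsr & CPSR_DIT) {
            spsr |= PSTATE_DIT;     /* move DIT up to bit 24 */
        }
        return spsr;
    }

    static uint32_t cpsr_from_spsr_elx(uint32_t spsr)
    {
        uint32_t cpsr = spsr & ~PSTATE_DIT;

        if (spsr & PSTATE_DIT) {
            cpsr |= CPSR_DIT;       /* move DIT back down to bit 21 */
        }
        return cpsr;
    }

This is also why the old trick of parking PSTATE.SS in env->uncached_cpsr had to go: bit 21 is now a real architectural bit (DIT) in the AArch32 view, so SS lives in env->pstate on both paths.
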
- * This is convenient for populating SPSR_ELx, but must be - * hidden from aarch32 mode, where it is not visible. - * - * TODO: ARMv8.4-DIT -- need to move SS somewhere else. - */ - return cpsr_read(env) & ~(CPSR_EXEC | PSTATE_SS); + return cpsr_read(env) & ~CPSR_EXEC; } void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask) diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index ffc060e5d7..1c4b8d02f3 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -1700,6 +1700,18 @@ static void handle_msr_i(DisasContext *s, uint32_t insn, tcg_temp_free_i32(t1); break; + case 0x1a: /* DIT */ + if (!dc_isar_feature(aa64_dit, s)) { + goto do_unallocated; + } + if (crm & 1) { + set_pstate_bits(PSTATE_DIT); + } else { + clear_pstate_bits(PSTATE_DIT); + } + /* There's no need to rebuild hflags because DIT is a nop */ + break; + case 0x1e: /* DAIFSet */ t1 = tcg_const_i32(crm); gen_helper_msr_i_daifset(cpu_env, t1); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ae89024d36..9c3d2d60b7 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -667,7 +667,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_7_0_EBX_RDSEED */ #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_PKU | \ /* CPUID_7_0_ECX_OSPKE is dynamic */ \ - CPUID_7_0_ECX_LA57) + CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS) #define TCG_7_0_EDX_FEATURES 0 #define TCG_7_1_EAX_FEATURES 0 #define TCG_APM_FEATURES 0 @@ -926,11 +926,11 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "npt", "lbrv", "svm-lock", "nrip-save", "tsc-scale", "vmcb-clean", "flushbyasid", "decodeassists", NULL, NULL, "pause-filter", NULL, - "pfthreshold", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + "pfthreshold", "avic", NULL, "v-vmsave-vmload", + "vgif", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "svme-addr-chk", NULL, NULL, NULL, }, .cpuid = { .eax = 0x8000000A, .reg = R_EDX, }, .tcg_features = TCG_SVM_FEATURES, @@ -964,7 +964,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "la57", NULL, NULL, NULL, NULL, NULL, "rdpid", NULL, NULL, "cldemote", NULL, "movdiri", - "movdir64b", NULL, NULL, NULL, + "movdir64b", NULL, NULL, "pks", }, .cpuid = { .eax = 7, @@ -1215,7 +1215,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "vmx-exit-save-efer", "vmx-exit-load-efer", "vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs", NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL, - NULL, NULL, NULL, NULL, + NULL, "vmx-exit-load-pkrs", NULL, NULL, }, .msr = { .index = MSR_IA32_VMX_TRUE_EXIT_CTLS, @@ -1230,7 +1230,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { NULL, "vmx-entry-ia32e-mode", NULL, NULL, NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", "vmx-entry-load-efer", "vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL, - NULL, NULL, NULL, NULL, + NULL, NULL, "vmx-entry-load-pkrs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }, @@ -5073,6 +5073,11 @@ static uint64_t x86_cpu_get_supported_feature_word(FeatureWord w, } else { return ~0; } +#ifndef TARGET_X86_64 + if (w == FEAT_8000_0001_EDX) { + r &= ~CPUID_EXT2_LM; + } +#endif if (migratable_only) { r &= x86_cpu_get_migratable_flags(w); } diff --git a/target/i386/cpu.h b/target/i386/cpu.h index d23a5b340a..8d599bb5b8 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -247,6 +247,7 @@ typedef enum X86Seg { #define CR4_SMEP_MASK (1U << 20) #define CR4_SMAP_MASK (1U << 21) #define CR4_PKE_MASK (1U << 22) 
+#define CR4_PKS_MASK (1U << 24) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) @@ -357,6 +358,7 @@ typedef enum X86Seg { #define MSR_IA32_TSX_CTRL 0x122 #define MSR_IA32_TSCDEADLINE 0x6e0 +#define MSR_IA32_PKRS 0x6e1 #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) @@ -670,16 +672,20 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_EXT3_PERFCORE (1U << 23) #define CPUID_EXT3_PERFNB (1U << 24) -#define CPUID_SVM_NPT (1U << 0) -#define CPUID_SVM_LBRV (1U << 1) -#define CPUID_SVM_SVMLOCK (1U << 2) -#define CPUID_SVM_NRIPSAVE (1U << 3) -#define CPUID_SVM_TSCSCALE (1U << 4) -#define CPUID_SVM_VMCBCLEAN (1U << 5) -#define CPUID_SVM_FLUSHASID (1U << 6) -#define CPUID_SVM_DECODEASSIST (1U << 7) -#define CPUID_SVM_PAUSEFILTER (1U << 10) -#define CPUID_SVM_PFTHRESHOLD (1U << 12) +#define CPUID_SVM_NPT (1U << 0) +#define CPUID_SVM_LBRV (1U << 1) +#define CPUID_SVM_SVMLOCK (1U << 2) +#define CPUID_SVM_NRIPSAVE (1U << 3) +#define CPUID_SVM_TSCSCALE (1U << 4) +#define CPUID_SVM_VMCBCLEAN (1U << 5) +#define CPUID_SVM_FLUSHASID (1U << 6) +#define CPUID_SVM_DECODEASSIST (1U << 7) +#define CPUID_SVM_PAUSEFILTER (1U << 10) +#define CPUID_SVM_PFTHRESHOLD (1U << 12) +#define CPUID_SVM_AVIC (1U << 13) +#define CPUID_SVM_V_VMSAVE_VMLOAD (1U << 15) +#define CPUID_SVM_VGIF (1U << 16) +#define CPUID_SVM_SVME_ADDR_CHK (1U << 28) /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ #define CPUID_7_0_EBX_FSGSBASE (1U << 0) @@ -768,6 +774,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_ECX_MOVDIRI (1U << 27) /* Move 64 Bytes as Direct Store Instruction */ #define CPUID_7_0_ECX_MOVDIR64B (1U << 28) +/* Protection Keys for Supervisor-mode Pages */ +#define CPUID_7_0_ECX_PKS (1U << 31) /* AVX512 Neural Network Instructions */ #define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2) @@ -965,6 +973,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; #define VMX_VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VMX_VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VMX_VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 +#define VMX_VM_EXIT_LOAD_IA32_PKRS 0x20000000 #define VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VMX_VM_ENTRY_IA32E_MODE 0x00000200 @@ -976,6 +985,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; #define VMX_VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VMX_VM_ENTRY_PT_CONCEAL_PIP 0x00020000 #define VMX_VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 +#define VMX_VM_ENTRY_LOAD_IA32_PKRS 0x00400000 /* Supported Hyper-V Enlightenments */ #define HYPERV_FEAT_RELAXED 0 @@ -1483,6 +1493,7 @@ typedef struct CPUX86State { uint64_t msr_smi_count; uint32_t pkru; + uint32_t pkrs; uint32_t tsx_ctrl; uint64_t spec_ctrl; diff --git a/target/i386/helper.c b/target/i386/helper.c index 6bb0c53182..618ad1c409 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -194,6 +194,9 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4) if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKU)) { new_cr4 &= ~CR4_PKE_MASK; } + if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) { + new_cr4 &= ~CR4_PKS_MASK; + } env->cr[4] = new_cr4; env->hflags = hflags; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 6dc1ee052d..e97f841757 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -42,6 +42,7 @@ #include "hw/i386/intel_iommu.h" #include "hw/i386/x86-iommu.h" #include "hw/i386/e820_memory_layout.h" +#include "sysemu/sev.h" #include "hw/pci/pci.h" #include "hw/pci/msi.h" @@ -112,6 +113,7 @@ static bool has_msr_vmx_vmfunc; static bool 
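
The target/i386/helper.c hunk above extends QEMU's usual CR4 laundering: bits whose CPUID features are not exposed to the guest are silently dropped before the new CR4 value is installed, and CR4.PKS now joins CR4.PKE in that set. In isolation the check looks like this sketch (the real cpu_x86_update_cr4() also recomputes hflags and handles several more masks):

    #include <stdbool.h>
    #include <stdint.h>

    #define CR4_PKE_MASK (1u << 22)   /* user-mode protection keys */
    #define CR4_PKS_MASK (1u << 24)   /* supervisor protection keys */

    static uint32_t sanitize_cr4(uint32_t new_cr4, bool has_pku, bool has_pks)
    {
        if (!has_pku) {
            new_cr4 &= ~CR4_PKE_MASK;   /* feature absent: bit stays 0 */
        }
        if (!has_pks) {
            new_cr4 &= ~CR4_PKS_MASK;
        }
        return new_cr4;
    }
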
has_msr_ucode_rev; static bool has_msr_vmx_procbased_ctls2; static bool has_msr_perf_capabs; +static bool has_msr_pkrs; static uint32_t has_architectural_pmu_version; static uint32_t num_architectural_pmu_gp_counters; @@ -2086,6 +2088,9 @@ static int kvm_get_supported_msrs(KVMState *s) case MSR_IA32_VMX_PROCBASED_CTLS2: has_msr_vmx_procbased_ctls2 = true; break; + case MSR_IA32_PKRS: + has_msr_pkrs = true; + break; } } } @@ -2135,6 +2140,25 @@ int kvm_arch_init(MachineState *ms, KVMState *s) uint64_t shadow_mem; int ret; struct utsname utsname; + Error *local_err = NULL; + + /* + * Initialize SEV context, if required + * + * If no memory encryption is requested (ms->cgs == NULL) this is + * a no-op. + * + * It's also a no-op if a non-SEV confidential guest support + * mechanism is selected. SEV is the only mechanism available to + * select on x86 at present, so this doesn't arise, but if new + * mechanisms are supported in future (e.g. TDX), they'll need + * their own initialization either here or elsewhere. + */ + ret = sev_kvm_init(ms->cgs, &local_err); + if (ret < 0) { + error_report_err(local_err); + return ret; + } if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) { error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM"); @@ -2794,6 +2818,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level) if (has_msr_smi_count) { kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count); } + if (has_msr_pkrs) { + kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs); + } if (has_msr_bndcfgs) { kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs); } @@ -3185,6 +3212,9 @@ static int kvm_get_msrs(X86CPU *cpu) if (has_msr_feature_control) { kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0); } + if (has_msr_pkrs) { + kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0); + } if (has_msr_bndcfgs) { kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0); } @@ -3455,6 +3485,9 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_IA32_UMWAIT_CONTROL: env->umwait = msrs[i].data; break; + case MSR_IA32_PKRS: + env->pkrs = msrs[i].data; + break; default: if (msrs[i].index >= MSR_MC0_CTL && msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) { diff --git a/target/i386/machine.c b/target/i386/machine.c index 1614e8c2f8..3768a753af 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -980,7 +980,6 @@ static const VMStateDescription vmstate_umwait = { } }; -#ifdef TARGET_X86_64 static bool pkru_needed(void *opaque) { X86CPU *cpu = opaque; @@ -999,7 +998,25 @@ static const VMStateDescription vmstate_pkru = { VMSTATE_END_OF_LIST() } }; -#endif + +static bool pkrs_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return env->pkrs != 0; +} + +static const VMStateDescription vmstate_pkrs = { + .name = "cpu/pkrs", + .version_id = 1, + .minimum_version_id = 1, + .needed = pkrs_needed, + .fields = (VMStateField[]){ + VMSTATE_UINT32(env.pkrs, X86CPU), + VMSTATE_END_OF_LIST() + } +}; static bool tsc_khz_needed(void *opaque) { @@ -1480,9 +1497,8 @@ VMStateDescription vmstate_x86_cpu = { &vmstate_umwait, &vmstate_tsc_khz, &vmstate_msr_smi_count, -#ifdef TARGET_X86_64 &vmstate_pkru, -#endif + &vmstate_pkrs, &vmstate_spec_ctrl, &vmstate_mcg_ext_ctl, &vmstate_msr_intel_pt, diff --git a/target/i386/sev-stub.c b/target/i386/sev-stub.c index c1fecc2101..1ac1fd5b94 100644 --- a/target/i386/sev-stub.c +++ b/target/i386/sev-stub.c @@ -54,3 +54,8 @@ int sev_inject_launch_secret(const char *hdr, const char *secret, { return 1; } + +int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) +{ + return 
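
MSR_IA32_PKRS is wired into KVM state sync with the same probe-then-sync pattern as the other optional MSRs above: kvm_get_supported_msrs() scans the index list the kernel reports, latches a has_msr_pkrs flag, and the put/get paths only include the MSR when the flag is set, so older kernels are never asked about it. A schematic sketch, with a hypothetical buffer helper standing in for kvm_msr_entry_add():

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MSR_IA32_PKRS 0x6e1

    static bool has_msr_pkrs;

    /* called for each MSR index the kernel reports as supported */
    static void note_supported_msr(uint32_t index)
    {
        if (index == MSR_IA32_PKRS) {
            has_msr_pkrs = true;
        }
    }

    /* hypothetical stand-in for kvm_msr_entry_add() */
    static void msr_buf_add(uint32_t index, uint64_t value)
    {
        printf("sync MSR 0x%" PRIx32 " = 0x%" PRIx64 "\n", index, value);
    }

    static void put_msrs(uint64_t pkrs)
    {
        if (has_msr_pkrs) {
            msr_buf_add(MSR_IA32_PKRS, pkrs);  /* skipped on old kernels */
        }
    }
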
0; +} diff --git a/target/i386/sev.c b/target/i386/sev.c index 1546606811..11c9a3cc21 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -31,6 +31,7 @@ #include "qom/object.h" #include "exec/address-spaces.h" #include "monitor/monitor.h" +#include "exec/confidential-guest-support.h" #define TYPE_SEV_GUEST "sev-guest" OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) @@ -47,7 +48,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) * -machine ...,memory-encryption=sev0 */ struct SevGuestState { - Object parent_obj; + ConfidentialGuestSupport parent_obj; /* configuration parameters */ char *sev_device; @@ -322,7 +323,7 @@ sev_guest_instance_init(Object *obj) /* sev guest info */ static const TypeInfo sev_guest_info = { - .parent = TYPE_OBJECT, + .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, .name = TYPE_SEV_GUEST, .instance_size = sizeof(SevGuestState), .instance_finalize = sev_guest_finalize, @@ -334,26 +335,6 @@ static const TypeInfo sev_guest_info = { } }; -static SevGuestState * -lookup_sev_guest_info(const char *id) -{ - Object *obj; - SevGuestState *info; - - obj = object_resolve_path_component(object_get_objects_root(), id); - if (!obj) { - return NULL; - } - - info = (SevGuestState *) - object_dynamic_cast(obj, TYPE_SEV_GUEST); - if (!info) { - return NULL; - } - - return info; -} - bool sev_enabled(void) { @@ -681,27 +662,24 @@ sev_vm_state_change(void *opaque, int running, RunState state) } } -void * -sev_guest_init(const char *id) +int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) { - SevGuestState *sev; + SevGuestState *sev + = (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST); char *devname; int ret, fw_error; uint32_t ebx; uint32_t host_cbitpos; struct sev_user_data_status status = {}; + if (!sev) { + return 0; + } + ret = ram_block_discard_disable(true); if (ret) { error_report("%s: cannot disable RAM discard", __func__); - return NULL; - } - - sev = lookup_sev_guest_info(id); - if (!sev) { - error_report("%s: '%s' is not a valid '%s' object", - __func__, id, TYPE_SEV_GUEST); - goto err; + return -1; } sev_guest = sev; @@ -711,14 +689,14 @@ sev_guest_init(const char *id) host_cbitpos = ebx & 0x3f; if (host_cbitpos != sev->cbitpos) { - error_report("%s: cbitpos check failed, host '%d' requested '%d'", - __func__, host_cbitpos, sev->cbitpos); + error_setg(errp, "%s: cbitpos check failed, host '%d' requested '%d'", + __func__, host_cbitpos, sev->cbitpos); goto err; } if (sev->reduced_phys_bits < 1) { - error_report("%s: reduced_phys_bits check failed, it should be >=1," - " requested '%d'", __func__, sev->reduced_phys_bits); + error_setg(errp, "%s: reduced_phys_bits check failed, it should be >=1," + " requested '%d'", __func__, sev->reduced_phys_bits); goto err; } @@ -727,20 +705,19 @@ sev_guest_init(const char *id) devname = object_property_get_str(OBJECT(sev), "sev-device", NULL); sev->sev_fd = open(devname, O_RDWR); if (sev->sev_fd < 0) { - error_report("%s: Failed to open %s '%s'", __func__, - devname, strerror(errno)); - } - g_free(devname); - if (sev->sev_fd < 0) { + error_setg(errp, "%s: Failed to open %s '%s'", __func__, + devname, strerror(errno)); + g_free(devname); goto err; } + g_free(devname); ret = sev_platform_ioctl(sev->sev_fd, SEV_PLATFORM_STATUS, &status, &fw_error); if (ret) { - error_report("%s: failed to get platform status ret=%d " - "fw_error='%d: %s'", __func__, ret, fw_error, - fw_error_to_str(fw_error)); + error_setg(errp, "%s: failed to get platform status ret=%d " + "fw_error='%d: %s'", __func__, ret, fw_error, 
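
The sev.c rewrite above is largely a mechanical conversion from error_report() plus a NULL return to the Error **errp convention: each failure path describes itself through the out-parameter and returns a negative value, and the caller (kvm_arch_init() here) decides whether to print and abort. A toy model of the convention, with an invented minimal Error type rather than QEMU's qapi/error.h:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Error {
        char msg[256];
    } Error;

    /* invented stand-in for error_setg() */
    static void set_error(Error **errp, const char *msg)
    {
        if (errp && !*errp) {
            *errp = calloc(1, sizeof(**errp));
            if (*errp) {
                snprintf((*errp)->msg, sizeof((*errp)->msg), "%s", msg);
            }
        }
    }

    static int sev_init_sketch(int cbitpos, int host_cbitpos, Error **errp)
    {
        if (cbitpos != host_cbitpos) {
            set_error(errp, "cbitpos check failed");
            return -1;              /* caller reports and exits */
        }
        return 0;
    }

    /* caller side, mirroring what kvm_arch_init() does with local_err */
    static int init_platform(void)
    {
        Error *local_err = NULL;
        int ret = sev_init_sketch(51, 47, &local_err);

        if (ret < 0 && local_err) {
            fprintf(stderr, "%s\n", local_err->msg);
            free(local_err);
        }
        return ret;
    }
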
+ fw_error_to_str(fw_error)); goto err; } sev->build_id = status.build; @@ -750,14 +727,14 @@ sev_guest_init(const char *id) trace_kvm_sev_init(); ret = sev_ioctl(sev->sev_fd, KVM_SEV_INIT, NULL, &fw_error); if (ret) { - error_report("%s: failed to initialize ret=%d fw_error=%d '%s'", - __func__, ret, fw_error, fw_error_to_str(fw_error)); + error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'", + __func__, ret, fw_error, fw_error_to_str(fw_error)); goto err; } ret = sev_launch_start(sev); if (ret) { - error_report("%s: failed to create encryption context", __func__); + error_setg(errp, "%s: failed to create encryption context", __func__); goto err; } @@ -765,23 +742,29 @@ sev_guest_init(const char *id) qemu_add_machine_init_done_notifier(&sev_machine_done_notify); qemu_add_vm_change_state_handler(sev_vm_state_change, sev); - return sev; + cgs->ready = true; + + return 0; err: sev_guest = NULL; ram_block_discard_disable(false); - return NULL; + return -1; } int -sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len) +sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) { - SevGuestState *sev = handle; - - assert(sev); + if (!sev_guest) { + return 0; + } /* if SEV is in update state then encrypt the data else do nothing */ - if (sev_check_state(sev, SEV_STATE_LAUNCH_UPDATE)) { - return sev_launch_update_data(sev, ptr, len); + if (sev_check_state(sev_guest, SEV_STATE_LAUNCH_UPDATE)) { + int ret = sev_launch_update_data(sev_guest, ptr, len); + if (ret < 0) { + error_setg(errp, "failed to encrypt pflash rom"); + return ret; + } } return 0; diff --git a/target/i386/tcg/excp_helper.c b/target/i386/tcg/excp_helper.c index a0f44431fe..b7d6259e4a 100644 --- a/target/i386/tcg/excp_helper.c +++ b/target/i386/tcg/excp_helper.c @@ -361,6 +361,7 @@ static int handle_mmu_fault(CPUState *cs, vaddr addr, int size, uint64_t rsvd_mask = PG_HI_RSVD_MASK; uint32_t page_offset; target_ulong vaddr; + uint32_t pkr; is_user = mmu_idx == MMU_USER_IDX; #if defined(DEBUG_MMU) @@ -588,21 +589,28 @@ do_check_protect_pse36: !((env->cr[4] & CR4_SMEP_MASK) && (ptep & PG_USER_MASK)))) { prot |= PAGE_EXEC; } - if ((env->cr[4] & CR4_PKE_MASK) && (env->hflags & HF_LMA_MASK) && - (ptep & PG_USER_MASK) && env->pkru) { + + if (!(env->hflags & HF_LMA_MASK)) { + pkr = 0; + } else if (ptep & PG_USER_MASK) { + pkr = env->cr[4] & CR4_PKE_MASK ? env->pkru : 0; + } else { + pkr = env->cr[4] & CR4_PKS_MASK ? 
env->pkrs : 0; + } + if (pkr) { uint32_t pk = (pte & PG_PKRU_MASK) >> PG_PKRU_BIT; - uint32_t pkru_ad = (env->pkru >> pk * 2) & 1; - uint32_t pkru_wd = (env->pkru >> pk * 2) & 2; - uint32_t pkru_prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; - - if (pkru_ad) { - pkru_prot &= ~(PAGE_READ | PAGE_WRITE); - } else if (pkru_wd && (is_user || env->cr[0] & CR0_WP_MASK)) { - pkru_prot &= ~PAGE_WRITE; + uint32_t pkr_ad = (pkr >> pk * 2) & 1; + uint32_t pkr_wd = (pkr >> pk * 2) & 2; + uint32_t pkr_prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; + + if (pkr_ad) { + pkr_prot &= ~(PAGE_READ | PAGE_WRITE); + } else if (pkr_wd && (is_user || env->cr[0] & CR0_WP_MASK)) { + pkr_prot &= ~PAGE_WRITE; } - prot &= pkru_prot; - if ((pkru_prot & (1 << is_write1)) == 0) { + prot &= pkr_prot; + if ((pkr_prot & (1 << is_write1)) == 0) { assert(is_write1 != 2); error_code |= PG_ERROR_PK_MASK; goto do_fault_protect; diff --git a/target/i386/tcg/misc_helper.c b/target/i386/tcg/misc_helper.c index 0bd6c95749..f02e4fd400 100644 --- a/target/i386/tcg/misc_helper.c +++ b/target/i386/tcg/misc_helper.c @@ -244,6 +244,7 @@ void helper_rdmsr(CPUX86State *env) void helper_wrmsr(CPUX86State *env) { uint64_t val; + CPUState *cs = env_cpu(env); cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1, GETPC()); @@ -296,6 +297,13 @@ void helper_wrmsr(CPUX86State *env) case MSR_PAT: env->pat = val; break; + case MSR_IA32_PKRS: + if (val & 0xFFFFFFFF00000000ull) { + goto error; + } + env->pkrs = val; + tlb_flush(cs); + break; case MSR_VM_HSAVE_PA: env->vm_hsave = val; break; @@ -399,6 +407,9 @@ void helper_wrmsr(CPUX86State *env) /* XXX: exception? */ break; } + return; +error: + raise_exception_err_ra(env, EXCP0D_GPF, 0, GETPC()); } void helper_rdmsr(CPUX86State *env) @@ -430,6 +441,9 @@ void helper_rdmsr(CPUX86State *env) case MSR_PAT: val = env->pat; break; + case MSR_IA32_PKRS: + val = env->pkrs; + break; case MSR_VM_HSAVE_PA: val = env->vm_hsave; break; diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 6a4c31f933..af1faf9342 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3075,7 +3075,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } if (is_xmm && !(s->flags & HF_OSFXSR_MASK) - && ((b != 0x38 && b != 0x3a) || (s->prefix & PREFIX_DATA))) { + && (b != 0x38 && b != 0x3a)) { goto unknown_op; } if (b == 0x0e) { diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 2609e4082e..e73416da68 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -1919,6 +1919,7 @@ typedef PowerPCCPU ArchCPU; #define SPR_750FX_HID2 (0x3F8) #define SPR_Exxx_L1FINV0 (0x3F8) #define SPR_L2CR (0x3F9) +#define SPR_Exxx_L2CSR0 (0x3F9) #define SPR_L3CR (0x3FA) #define SPR_750_TDCH (0x3FA) #define SPR_IABR2 (0x3FA) @@ -1974,6 +1975,11 @@ typedef PowerPCCPU ArchCPU; #define L1CSR1_ICFI 0x00000002 /* Instruction Cache Flash Invalidate */ #define L1CSR1_ICE 0x00000001 /* Instruction Cache Enable */ +/* E500 L2CSR0 */ +#define E500_L2CSR0_L2FI (1 << 21) /* L2 cache flash invalidate */ +#define E500_L2CSR0_L2FL (1 << 11) /* L2 cache flush */ +#define E500_L2CSR0_L2LFC (1 << 10) /* L2 cache lock flash clear */ + /* HID0 bits */ #define HID0_DEEPNAP (1 << 24) /* pre-2.06 */ #define HID0_DOZE (1 << 23) /* pre-2.06 */ @@ -2205,9 +2211,6 @@ enum { * may be needed for precise access rights control and precise exceptions. 
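
The excp_helper.c hunk above generalizes the old PKRU-only logic into a single pkr value: in long mode, user pages consult PKRU when CR4.PKE is set, supervisor pages consult the new PKRS when CR4.PKS is set, and the two bits for the page's 4-bit protection key then restrict the permissions. The per-key computation, extracted as a sketch:

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_READ  0x1
    #define PAGE_WRITE 0x2
    #define PAGE_EXEC  0x4

    static int pkr_protection(uint32_t pkr, unsigned key,
                              bool is_user, bool cr0_wp)
    {
        uint32_t ad = (pkr >> (key * 2)) & 1;   /* access-disable bit */
        uint32_t wd = (pkr >> (key * 2)) & 2;   /* write-disable bit */
        int prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;

        if (ad) {
            prot &= ~(PAGE_READ | PAGE_WRITE);
        } else if (wd && (is_user || cr0_wp)) {
            prot &= ~PAGE_WRITE;
        }
        /* protection keys never restrict instruction fetches */
        return prot;
    }

The matching WRMSR path in misc_helper.c raises #GP(0) when any of the reserved high 32 bits of MSR_IA32_PKRS are set, and flushes the TLB on a successful write, since the softmmu TLB caches the outcome of this permission check.
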
*/ enum { - /* 1 bit to define user level / supervisor access */ - ACCESS_USER = 0x00, - ACCESS_SUPER = 0x01, /* Type of instruction that generated the access */ ACCESS_CODE = 0x10, /* Code fetch access */ ACCESS_INT = 0x20, /* Integer load/store access */ diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index daf690a678..0c5056dd5b 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -2929,21 +2929,3 @@ void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset) kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &tb_offset); } } - -/* - * Don't set error if KVM_PPC_SVM_OFF ioctl is invoked on kernels - * that don't support this ioctl. - */ -void kvmppc_svm_off(Error **errp) -{ - int rc; - - if (!kvm_enabled()) { - return; - } - - rc = kvm_vm_ioctl(KVM_STATE(current_accel()), KVM_PPC_SVM_OFF); - if (rc && rc != -ENOTTY) { - error_setg_errno(errp, -rc, "KVM_PPC_SVM_OFF ioctl failed"); - } -} diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h index 73ce2bc951..989f61ace0 100644 --- a/target/ppc/kvm_ppc.h +++ b/target/ppc/kvm_ppc.h @@ -39,7 +39,6 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu); target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu, bool radix, bool gtse, uint64_t proc_tbl); -void kvmppc_svm_off(Error **errp); #ifndef CONFIG_USER_ONLY bool kvmppc_spapr_use_multitce(void); int kvmppc_spapr_enable_inkernel_multitce(void); @@ -216,11 +215,6 @@ static inline target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu, return 0; } -static inline void kvmppc_svm_off(Error **errp) -{ - return; -} - static inline void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu, unsigned int online) { diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc index 9867d0a6e4..3ec45cbc19 100644 --- a/target/ppc/translate_init.c.inc +++ b/target/ppc/translate_init.c.inc @@ -1735,6 +1735,16 @@ static void spr_write_e500_l1csr1(DisasContext *ctx, int sprn, int gprn) tcg_temp_free(t0); } +static void spr_write_e500_l2csr0(DisasContext *ctx, int sprn, int gprn) +{ + TCGv t0 = tcg_temp_new(); + + tcg_gen_andi_tl(t0, cpu_gpr[gprn], + ~(E500_L2CSR0_L2FI | E500_L2CSR0_L2FL | E500_L2CSR0_L2LFC)); + gen_store_spr(sprn, t0); + tcg_temp_free(t0); +} + static void spr_write_booke206_mmucsr0(DisasContext *ctx, int sprn, int gprn) { gen_helper_booke206_tlbflush(cpu_env, cpu_gpr[gprn]); @@ -5029,6 +5039,12 @@ static void init_proc_e500(CPUPPCState *env, int version) SPR_NOACCESS, SPR_NOACCESS, &spr_read_generic, &spr_write_e500_l1csr1, 0x00000000); + if (version != fsl_e500v1 && version != fsl_e500v2) { + spr_register(env, SPR_Exxx_L2CSR0, "L2CSR0", + SPR_NOACCESS, SPR_NOACCESS, + &spr_read_generic, &spr_write_e500_l2csr0, + 0x00000000); + } spr_register(env, SPR_BOOKE_MCSRR0, "MCSRR0", SPR_NOACCESS, SPR_NOACCESS, &spr_read_generic, &spr_write_generic, diff --git a/tests/Makefile.include b/tests/Makefile.include index ceaf3f0d6e..d34254fb29 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -12,7 +12,7 @@ check-help: @echo " $(MAKE) check-speed Run qobject speed tests" @echo " $(MAKE) check-qapi-schema Run QAPI schema tests" @echo " $(MAKE) check-block Run block tests" -ifeq ($(CONFIG_TCG),y) +ifneq ($(filter $(all-check-targets), check-softfloat),) @echo " $(MAKE) check-tcg Run TCG tests" @echo " $(MAKE) check-softfloat Run FPU emulation tests" endif @@ -40,11 +40,13 @@ SYSEMU_TARGET_LIST := $(subst -softmmu.mak,,$(notdir \ SPEED = quick -# Per guest TCG tests +# Build up our target list from the filtered list of ninja targets +TARGETS=$(patsubst libqemu-%.fa, %, $(filter 
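
spr_write_e500_l2csr0() above handles the self-clearing command bits of L2CSR0: L2FI, L2FL and L2LFC request cache maintenance operations that real hardware completes and then clears, and since QEMU's emulated caches need no actual work, the write handler can simply mask the bits out before storing the SPR so they always read back as already done. The masking in isolation, reusing the bit definitions added to cpu.h above:

    #include <stdint.h>

    #define E500_L2CSR0_L2FI  (1u << 21)   /* L2 cache flash invalidate */
    #define E500_L2CSR0_L2FL  (1u << 11)   /* L2 cache flush */
    #define E500_L2CSR0_L2LFC (1u << 10)   /* L2 cache lock flash clear */

    static uint32_t l2csr0_store_value(uint32_t gpr)
    {
        /* the requested operations complete instantly under emulation */
        return gpr & ~(E500_L2CSR0_L2FI | E500_L2CSR0_L2FL |
                       E500_L2CSR0_L2LFC);
    }
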
libqemu-%.fa, $(ninja-targets))) -BUILD_TCG_TARGET_RULES=$(patsubst %,build-tcg-tests-%, $(TARGET_DIRS)) -CLEAN_TCG_TARGET_RULES=$(patsubst %,clean-tcg-tests-%, $(TARGET_DIRS)) -RUN_TCG_TARGET_RULES=$(patsubst %,run-tcg-tests-%, $(TARGET_DIRS)) +# Per guest TCG tests +BUILD_TCG_TARGET_RULES=$(patsubst %,build-tcg-tests-%, $(TARGETS)) +CLEAN_TCG_TARGET_RULES=$(patsubst %,clean-tcg-tests-%, $(TARGETS)) +RUN_TCG_TARGET_RULES=$(patsubst %,run-tcg-tests-%, $(TARGETS)) # Probe for the Docker Builds needed for each build $(foreach PROBE_TARGET,$(TARGET_DIRS), \ diff --git a/tests/acceptance/boot_linux.py b/tests/acceptance/boot_linux.py index 1da4a53d6a..bcd923bb4a 100644 --- a/tests/acceptance/boot_linux.py +++ b/tests/acceptance/boot_linux.py @@ -57,7 +57,7 @@ class BootLinuxBase(Test): self.cancel('Failed to download/prepare boot image') return boot.path - def download_cloudinit(self, ssh_pubkey=None): + def prepare_cloudinit(self, ssh_pubkey=None): self.log.info('Preparing cloudinit image') try: cloudinit_iso = os.path.join(self.workdir, 'cloudinit.iso') @@ -70,7 +70,7 @@ class BootLinuxBase(Test): phone_home_port=self.phone_home_port, authorized_key=ssh_pubkey) except Exception: - self.cancel('Failed to prepared cloudinit image') + self.cancel('Failed to prepare the cloudinit image') return cloudinit_iso class BootLinux(BootLinuxBase): @@ -85,15 +85,15 @@ class BootLinux(BootLinuxBase): super(BootLinux, self).setUp() self.vm.add_args('-smp', '2') self.vm.add_args('-m', '1024') - self.prepare_boot() - self.prepare_cloudinit(ssh_pubkey) + self.set_up_boot() + self.set_up_cloudinit(ssh_pubkey) - def prepare_boot(self): + def set_up_boot(self): path = self.download_boot() self.vm.add_args('-drive', 'file=%s' % path) - def prepare_cloudinit(self, ssh_pubkey=None): - cloudinit_iso = self.download_cloudinit(ssh_pubkey) + def set_up_cloudinit(self, ssh_pubkey=None): + cloudinit_iso = self.prepare_cloudinit(ssh_pubkey) self.vm.add_args('-drive', 'file=%s,format=raw' % cloudinit_iso) def launch_and_wait(self): diff --git a/tests/acceptance/boot_linux_console.py b/tests/acceptance/boot_linux_console.py index fb41bb7144..eb01286799 100644 --- a/tests/acceptance/boot_linux_console.py +++ b/tests/acceptance/boot_linux_console.py @@ -802,27 +802,7 @@ class BootLinuxConsole(LinuxKernelTest): # Wait for VM to shut down gracefully self.vm.wait() - @skipUnless(os.getenv('ARMBIAN_ARTIFACTS_CACHED'), - 'Test artifacts fetched from unreliable dl.armbian.com') - @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited') - @skipUnless(P7ZIP_AVAILABLE, '7z not installed') - def test_arm_orangepi_bionic(self): - """ - :avocado: tags=arch:arm - :avocado: tags=machine:orangepi-pc - :avocado: tags=device:sd - """ - - # This test download a 196MB compressed image and expand it to 1GB - image_url = ('https://dl.armbian.com/orangepipc/archive/' - 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.7z') - image_hash = '196a8ffb72b0123d92cea4a070894813d305c71e' - image_path_7z = self.fetch_asset(image_url, asset_hash=image_hash) - image_name = 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.img' - image_path = os.path.join(self.workdir, image_name) - process.run("7z e -o%s %s" % (self.workdir, image_path_7z)) - image_pow2ceil_expand(image_path) - + def do_test_arm_orangepi_uboot_armbian(self, image_path): self.vm.set_console() self.vm.add_args('-drive', 'file=' + image_path + ',if=sd,format=raw', '-nic', 'user', @@ -848,6 +828,54 @@ class BootLinuxConsole(LinuxKernelTest): 'to <orangepipc>') 
self.wait_for_console_pattern('Starting Load Kernel Modules...') + @skipUnless(os.getenv('ARMBIAN_ARTIFACTS_CACHED'), + 'Test artifacts fetched from unreliable apt.armbian.com') + @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited') + @skipUnless(P7ZIP_AVAILABLE, '7z not installed') + def test_arm_orangepi_bionic_19_11(self): + """ + :avocado: tags=arch:arm + :avocado: tags=machine:orangepi-pc + :avocado: tags=device:sd + """ + + # This test downloads a 196MB compressed image and expands it to 1GB + image_url = ('https://dl.armbian.com/orangepipc/archive/' + 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.7z') + image_hash = '196a8ffb72b0123d92cea4a070894813d305c71e' + image_path_7z = self.fetch_asset(image_url, asset_hash=image_hash) + image_name = 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.img' + image_path = os.path.join(self.workdir, image_name) + process.run("7z e -o%s %s" % (self.workdir, image_path_7z)) + image_pow2ceil_expand(image_path) + + self.do_test_arm_orangepi_uboot_armbian(image_path) + + @skipUnless(os.getenv('ARMBIAN_ARTIFACTS_CACHED'), + 'Test artifacts fetched from unreliable apt.armbian.com') + @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited') + def test_arm_orangepi_bionic_20_08(self): + """ + :avocado: tags=arch:arm + :avocado: tags=machine:orangepi-pc + :avocado: tags=device:sd + """ + + # This test downloads a 275 MiB compressed image and expands it + # to 1036 MiB, but the underlying filesystem is 1552 MiB... + # As we expand it to 2 GiB we are safe. + + image_url = ('https://dl.armbian.com/orangepipc/archive/' + 'Armbian_20.08.1_Orangepipc_bionic_current_5.8.5.img.xz') + image_hash = ('b4d6775f5673486329e45a0586bf06b6' + 'dbe792199fd182ac6b9c7bb6c7d3e6dd') + image_path_xz = self.fetch_asset(image_url, asset_hash=image_hash, + algorithm='sha256') + image_path = archive.extract(image_path_xz, self.workdir) + image_pow2ceil_expand(image_path) + + self.do_test_arm_orangepi_uboot_armbian(image_path) + @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited') def test_arm_orangepi_uboot_netbsd9(self): """ @@ -976,25 +1004,6 @@ class BootLinuxConsole(LinuxKernelTest): console_pattern = 'Kernel command line: %s' % kernel_command_line self.wait_for_console_pattern(console_pattern) - def test_ppc64_pseries(self): - """ - :avocado: tags=arch:ppc64 - :avocado: tags=machine:pseries - """ - kernel_url = ('https://archives.fedoraproject.org/pub/archive' - '/fedora-secondary/releases/29/Everything/ppc64le/os' - '/ppc/ppc64/vmlinuz') - kernel_hash = '3fe04abfc852b66653b8c3c897a59a689270bc77' - kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash) - - self.vm.set_console() - kernel_command_line = self.KERNEL_COMMON_COMMAND_LINE + 'console=hvc0' - self.vm.add_args('-kernel', kernel_path, - '-append', kernel_command_line) - self.vm.launch() - console_pattern = 'Kernel command line: %s' % kernel_command_line - self.wait_for_console_pattern(console_pattern) - def test_m68k_q800(self): """ :avocado: tags=arch:m68k @@ -1047,15 +1056,6 @@ class BootLinuxConsole(LinuxKernelTest): tar_hash = 'ac688fd00561a2b6ce1359f9ff6aa2b98c9a570c' self.do_test_advcal_2018('07', tar_hash, 'sanity-clause.elf') - @skip("Test currently broken") # Console stuck as of 5.2-rc1 - def test_microblaze_s3adsp1800(self): - """ - :avocado: tags=arch:microblaze - :avocado: tags=machine:petalogix-s3adsp1800 - """ - tar_hash = '08bf3e3bfb6b6c7ce1e54ab65d54e189f2caf13f' - self.do_test_advcal_2018('17', tar_hash, 'ballerina.bin') - def
test_or1k_sim(self): """ :avocado: tags=arch:or1k diff --git a/tests/acceptance/linux_ssh_mips_malta.py b/tests/acceptance/linux_ssh_mips_malta.py index 25c5c5f741..6dbd02d49d 100644 --- a/tests/acceptance/linux_ssh_mips_malta.py +++ b/tests/acceptance/linux_ssh_mips_malta.py @@ -91,7 +91,7 @@ class LinuxSSH(Test): except: time.sleep(4) pass - self.fail("sshd timeout") + self.fail("ssh connection timeout") def ssh_disconnect_vm(self): self.ssh_session.quit() diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py index 2baba5fdc2..09e2745cc5 100644 --- a/tests/acceptance/machine_m68k_nextcube.py +++ b/tests/acceptance/machine_m68k_nextcube.py @@ -1,19 +1,17 @@ # Functional test that boots a VM and run OCR on the framebuffer # -# Copyright (c) Philippe Mathieu-Daudé <f4bug@amsat.org> +# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org> # # This work is licensed under the terms of the GNU GPL, version 2 or # later. See the COPYING file in the top-level directory. import os -import re import time -import logging from avocado_qemu import Test from avocado import skipUnless -from avocado.utils import process -from avocado.utils.path import find_command, CmdNotFoundError + +from tesseract_utils import tesseract_available, tesseract_ocr PIL_AVAILABLE = True try: @@ -22,25 +20,6 @@ except ImportError: PIL_AVAILABLE = False -def tesseract_available(expected_version): - try: - find_command('tesseract') - except CmdNotFoundError: - return False - res = process.run('tesseract --version') - try: - version = res.stdout_text.split()[1] - except IndexError: - version = res.stderr_text.split()[1] - return int(version.split('.')[0]) == expected_version - - match = re.match(r'tesseract\s(\d)', res) - if match is None: - return False - # now this is guaranteed to be a digit - return int(match.groups()[0]) == expected_version - - class NextCubeMachine(Test): """ :avocado: tags=arch:m68k @@ -80,12 +59,8 @@ class NextCubeMachine(Test): def test_bootrom_framebuffer_ocr_with_tesseract_v3(self): screenshot_path = os.path.join(self.workdir, "dump.ppm") self.check_bootrom_framebuffer(screenshot_path) - - console_logger = logging.getLogger('console') - text = process.run("tesseract %s stdout" % screenshot_path).stdout_text - for line in text.split('\n'): - if len(line): - console_logger.debug(line) + lines = tesseract_ocr(screenshot_path, tesseract_version=3) + text = '\n'.join(lines) self.assertIn('Backplane', text) self.assertIn('Ethernet address', text) @@ -96,13 +71,8 @@ class NextCubeMachine(Test): def test_bootrom_framebuffer_ocr_with_tesseract_v4(self): screenshot_path = os.path.join(self.workdir, "dump.ppm") self.check_bootrom_framebuffer(screenshot_path) - - console_logger = logging.getLogger('console') - proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path) - text = proc.stdout_text - for line in text.split('\n'): - if len(line): - console_logger.debug(line) + lines = tesseract_ocr(screenshot_path, tesseract_version=4) + text = '\n'.join(lines) self.assertIn('Testing the FPU, SCC', text) self.assertIn('System test failed. Error code', text) self.assertIn('Boot command', text) diff --git a/tests/acceptance/machine_microblaze.py b/tests/acceptance/machine_microblaze.py new file mode 100644 index 0000000000..7f6d18495d --- /dev/null +++ b/tests/acceptance/machine_microblaze.py @@ -0,0 +1,35 @@ +# Functional test that boots a microblaze Linux kernel and checks the console +# +# Copyright (c) 2018, 2021 Red Hat, Inc. 
+# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. + +from avocado_qemu import Test +from avocado_qemu import wait_for_console_pattern +from avocado.utils import archive + +class MicroblazeMachine(Test): + + timeout = 90 + + def test_microblaze_s3adsp1800(self): + """ + :avocado: tags=arch:microblaze + :avocado: tags=machine:petalogix-s3adsp1800 + """ + + tar_url = ('https://www.qemu-advent-calendar.org' + '/2018/download/day17.tar.xz') + tar_hash = '08bf3e3bfb6b6c7ce1e54ab65d54e189f2caf13f' + file_path = self.fetch_asset(tar_url, asset_hash=tar_hash) + archive.extract(file_path, self.workdir) + self.vm.set_console() + self.vm.add_args('-kernel', self.workdir + '/day17/ballerina.bin') + self.vm.launch() + wait_for_console_pattern(self, 'This architecture does not have ' + 'kernel memory protection') + # Note: + # The kernel sometimes gets stuck after the "This architecture ..." + # message, that's why we don't test for a later string here. This + # needs some investigation by a microblaze wizard one day... diff --git a/tests/acceptance/machine_ppc.py b/tests/acceptance/machine_ppc.py new file mode 100644 index 0000000000..a836e2496f --- /dev/null +++ b/tests/acceptance/machine_ppc.py @@ -0,0 +1,69 @@ +# Test that Linux kernel boots on ppc machines and check the console +# +# Copyright (c) 2018, 2020 Red Hat, Inc. +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. + +from avocado.utils import archive +from avocado_qemu import Test +from avocado_qemu import wait_for_console_pattern + +class PpcMachine(Test): + + timeout = 90 + KERNEL_COMMON_COMMAND_LINE = 'printk.time=0 ' + panic_message = 'Kernel panic - not syncing' + + def test_ppc64_pseries(self): + """ + :avocado: tags=arch:ppc64 + :avocado: tags=machine:pseries + """ + kernel_url = ('https://archives.fedoraproject.org/pub/archive' + '/fedora-secondary/releases/29/Everything/ppc64le/os' + '/ppc/ppc64/vmlinuz') + kernel_hash = '3fe04abfc852b66653b8c3c897a59a689270bc77' + kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash) + + self.vm.set_console() + kernel_command_line = self.KERNEL_COMMON_COMMAND_LINE + 'console=hvc0' + self.vm.add_args('-kernel', kernel_path, + '-append', kernel_command_line) + self.vm.launch() + console_pattern = 'Kernel command line: %s' % kernel_command_line + wait_for_console_pattern(self, console_pattern, self.panic_message) + + def test_ppc_mpc8544ds(self): + """ + :avocado: tags=arch:ppc + :avocado: tags=machine:mpc8544ds + """ + tar_url = ('https://www.qemu-advent-calendar.org' + '/2020/download/day17.tar.gz') + tar_hash = '7a5239542a7c4257aa4d3b7f6ddf08fb6775c494' + file_path = self.fetch_asset(tar_url, asset_hash=tar_hash) + archive.extract(file_path, self.workdir) + self.vm.set_console() + self.vm.add_args('-kernel', self.workdir + '/creek/creek.bin') + self.vm.launch() + wait_for_console_pattern(self, 'QEMU advent calendar 2020', + self.panic_message) + + def test_ppc_virtex_ml507(self): + """ + :avocado: tags=arch:ppc + :avocado: tags=machine:virtex-ml507 + """ + tar_url = ('https://www.qemu-advent-calendar.org' + '/2020/download/hippo.tar.gz') + tar_hash = '306b95bfe7d147f125aa176a877e266db8ef914a' + file_path = self.fetch_asset(tar_url, asset_hash=tar_hash) + archive.extract(file_path, self.workdir) + self.vm.set_console() + self.vm.add_args('-kernel', self.workdir + '/hippo/hippo.linux', + '-dtb', self.workdir + 
'/hippo/virtex440-ml507.dtb', + '-m', '512') + self.vm.launch() + wait_for_console_pattern(self, 'QEMU advent calendar 2020', + self.panic_message) diff --git a/tests/acceptance/replay_kernel.py b/tests/acceptance/replay_kernel.py index 772633b01d..c1cb862468 100644 --- a/tests/acceptance/replay_kernel.py +++ b/tests/acceptance/replay_kernel.py @@ -31,7 +31,7 @@ class ReplayKernelBase(LinuxKernelTest): terminates. """ - timeout = 90 + timeout = 120 KERNEL_COMMON_COMMAND_LINE = 'printk.time=1 panic=-1 ' def run_vm(self, kernel_path, kernel_command_line, console_pattern, diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py new file mode 100644 index 0000000000..72cd9ab798 --- /dev/null +++ b/tests/acceptance/tesseract_utils.py @@ -0,0 +1,39 @@ +# ... +# +# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org> +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. + +import logging + +from avocado.utils import process +from avocado.utils.path import find_command, CmdNotFoundError + +def tesseract_available(expected_version): + try: + find_command('tesseract') + except CmdNotFoundError: + return False + res = process.run('tesseract --version') + try: + version = res.stdout_text.split()[1] + except IndexError: + version = res.stderr_text.split()[1] + return int(version.split('.')[0]) == expected_version + + +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3): + console_logger = logging.getLogger('tesseract') + console_logger.debug(image_path) + if tesseract_version == 4: + tesseract_args += ' --oem 1' + proc = process.run("tesseract {} {} stdout".format(tesseract_args, + image_path)) + lines = [] + for line in proc.stdout_text.split('\n'): + sline = line.strip() + if len(sline): + console_logger.debug(sline) + lines += [sline] + return lines diff --git a/tests/acceptance/virtiofs_submounts.py b/tests/acceptance/virtiofs_submounts.py index 361e5990b6..949ca87a83 100644 --- a/tests/acceptance/virtiofs_submounts.py +++ b/tests/acceptance/virtiofs_submounts.py @@ -83,20 +83,21 @@ class VirtiofsSubmountsTest(BootLinux): command_line='info usernet') for line in res.split('\r\n'): match = \ - re.search(r'TCP.HOST_FORWARD.*127\.0\.0\.1\s*(\d+)\s+10\.', + re.search(r'TCP.HOST_FORWARD.*127\.0\.0\.1\s+(\d+)\s+10\.', line) if match is not None: - port = match[1] + port = int(match[1]) break self.assertIsNotNone(port) - self.log.debug('sshd listening on port: ' + port) + self.assertGreater(port, 0) + self.log.debug('sshd listening on port: %d', port) return port def ssh_connect(self, username, keyfile): self.ssh_logger = logging.getLogger('ssh') port = self.get_portfwd() - self.ssh_session = ssh.Session('127.0.0.1', port=int(port), + self.ssh_session = ssh.Session('127.0.0.1', port=port, user=username, key=keyfile) for i in range(10): try: @@ -105,7 +106,7 @@ class VirtiofsSubmountsTest(BootLinux): except: time.sleep(4) pass - self.fail('sshd timeout') + self.fail('ssh connection timeout') def ssh_command(self, command): self.ssh_logger.info(command) @@ -136,8 +137,7 @@ class VirtiofsSubmountsTest(BootLinux): return (stdout, stderr, ret) def set_up_shared_dir(self): - atwd = os.getenv('AVOCADO_TEST_WORKDIR') - self.shared_dir = os.path.join(atwd, 'virtiofs-shared') + self.shared_dir =
os.path.join(self.workdir, 'virtiofs-shared') os.mkdir(self.shared_dir) @@ -234,10 +234,9 @@ class VirtiofsSubmountsTest(BootLinux): self.seed = self.params.get('seed') - atwd = os.getenv('AVOCADO_TEST_WORKDIR') - self.ssh_key = os.path.join(atwd, 'id_ed25519') + self.ssh_key = os.path.join(self.workdir, 'id_ed25519') - self.run(('ssh-keygen', '-t', 'ed25519', '-f', self.ssh_key)) + self.run(('ssh-keygen', '-N', '', '-t', 'ed25519', '-f', self.ssh_key)) pubkey = open(self.ssh_key + '.pub').read() @@ -249,7 +248,7 @@ class VirtiofsSubmountsTest(BootLinux): # Allow us to connect to SSH self.vm.add_args('-netdev', 'user,id=vnet,hostfwd=:127.0.0.1:0-:22', - '-device', 'e1000,netdev=vnet') + '-device', 'virtio-net,netdev=vnet') if not kvm_available(self.arch, self.qemu_bin): self.cancel(KVM_NOT_AVAILABLE) diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index 0779dab5b9..93b29ad823 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -1,6 +1,6 @@ # Makefile for Docker tests -.PHONY: docker docker-test docker-clean docker-image docker-qemu-src +.PHONY: docker docker-help docker-test docker-clean docker-image docker-qemu-src NULL := SPACE := $(NULL) # @@ -11,7 +11,7 @@ HOST_ARCH = $(if $(ARCH),$(ARCH),$(shell uname -m)) DOCKER_SUFFIX := .docker DOCKER_FILES_DIR := $(SRC_PATH)/tests/docker/dockerfiles # we don't run tests on intermediate images (used as base by another image) -DOCKER_PARTIAL_IMAGES := debian10 debian11 debian-bootstrap +DOCKER_PARTIAL_IMAGES := debian10 debian11 debian-bootstrap empty DOCKER_IMAGES := $(sort $(notdir $(basename $(wildcard $(DOCKER_FILES_DIR)/*.docker)))) DOCKER_TARGETS := $(patsubst %,docker-image-%,$(DOCKER_IMAGES)) # Use a global constant ccache directory to speed up repetitive builds @@ -92,6 +92,24 @@ docker-binfmt-image-debian-%: $(DOCKER_FILES_DIR)/debian-bootstrap.docker { echo "You will need to build $(EXECUTABLE)"; exit 1;},\ "CHECK", "debian-$* exists")) +# These are test targets +USER_TCG_TARGETS=$(patsubst %-linux-user,qemu-%,$(filter %-linux-user,$(TARGET_DIRS))) +EXEC_COPY_TESTS=$(patsubst %,docker-exec-copy-test-%, $(USER_TCG_TARGETS)) + +$(EXEC_COPY_TESTS): docker-exec-copy-test-%: $(DOCKER_FILES_DIR)/empty.docker + $(call quiet-command, \ + $(DOCKER_SCRIPT) build -t qemu/exec-copy-test-$* -f $< \ + $(if $V,,--quiet) --no-cache \ + --include-executable=$* \ + --skip-binfmt, \ + "TEST","copy $* to container") + $(call quiet-command, \ + $(DOCKER_SCRIPT) run qemu/exec-copy-test-$* \ + /$* -version > tests/docker-exec-copy-test-$*.out, \ + "TEST","check $* works in container") + +docker-exec-copy-test: $(EXEC_COPY_TESTS) + endif # Enforce dependencies for composite images @@ -209,7 +227,7 @@ endif @echo ' before running the command.' @echo ' NETWORK=1 Enable virtual network interface with default backend.' @echo ' NETWORK=$$BACKEND Enable virtual network interface with $$BACKEND.' - @echo ' NOUSER Define to disable adding current user to containers passwd.' + @echo ' NOUSER=1 Define to disable adding current user to containers passwd.' @echo ' NOCACHE=1 Ignore cache when build images.' @echo ' EXECUTABLE=<path> Include executable in image.' @echo ' EXTRA_FILES="<path> [... <path>]"' @@ -218,6 +236,8 @@ endif @echo ' Specify which container engine to run.' @echo ' REGISTRY=url Cache builds from registry (default:$(DOCKER_REGISTRY))' +docker-help: docker + # This rule if for directly running against an arbitrary docker target. # It is called by the expanded docker targets (e.g. 
make # docker-test-foo@bar) which will do additional verification. diff --git a/tests/docker/docker.py b/tests/docker/docker.py index 884dfeb29c..d28df4c140 100755 --- a/tests/docker/docker.py +++ b/tests/docker/docker.py @@ -93,7 +93,7 @@ def _guess_engine_command(): commands_txt) -def _copy_with_mkdir(src, root_dir, sub_path='.'): +def _copy_with_mkdir(src, root_dir, sub_path='.', name=None): """Copy src into root_dir, creating sub_path as needed.""" dest_dir = os.path.normpath("%s/%s" % (root_dir, sub_path)) try: @@ -102,8 +102,13 @@ def _copy_with_mkdir(src, root_dir, sub_path='.'): # we can safely ignore already created directories pass - dest_file = "%s/%s" % (dest_dir, os.path.basename(src)) - copy(src, dest_file) + dest_file = "%s/%s" % (dest_dir, name if name else os.path.basename(src)) + + try: + copy(src, dest_file) + except FileNotFoundError: + print("Couldn't copy %s to %s" % (src, dest_file)) + pass def _get_so_libs(executable): @@ -120,7 +125,7 @@ def _get_so_libs(executable): search = ldd_re.search(line) if search: try: - libs.append(s.group(1)) + libs.append(search.group(1)) except IndexError: pass except subprocess.CalledProcessError: @@ -150,8 +155,9 @@ def _copy_binary_with_libs(src, bin_dest, dest_dir): if libs: for l in libs: so_path = os.path.dirname(l) + name = os.path.basename(l) real_l = os.path.realpath(l) - _copy_with_mkdir(real_l, dest_dir, so_path) + _copy_with_mkdir(real_l, dest_dir, so_path, name) def _check_binfmt_misc(executable): @@ -432,6 +438,9 @@ class BuildCommand(SubCommand): help="""Specify a binary that will be copied to the container together with all its dependent libraries""") + parser.add_argument("--skip-binfmt", + action="store_true", + help="""Skip binfmt entry check (used for testing)""") parser.add_argument("--extra-files", nargs='*', help="""Specify files that will be copied in the Docker image, fulfilling the ADD directive from the @@ -460,7 +469,9 @@ class BuildCommand(SubCommand): docker_dir = tempfile.mkdtemp(prefix="docker_build") # Validate binfmt_misc will work - if args.include_executable: + if args.skip_binfmt: + qpath = args.include_executable + elif args.include_executable: qpath, enabled = _check_binfmt_misc(args.include_executable) if not enabled: return 1 diff --git a/tests/docker/dockerfiles/empty.docker b/tests/docker/dockerfiles/empty.docker new file mode 100644 index 0000000000..9ba980f1a8 --- /dev/null +++ b/tests/docker/dockerfiles/empty.docker @@ -0,0 +1,8 @@ +# +# Empty Dockerfile +# + +FROM scratch + +# Add everything from the context into the container +ADD . 
/ diff --git a/tests/meson.build b/tests/meson.build index 29ebaba48d..7d7da6a636 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -86,7 +86,6 @@ tests = { 'test-qobject-input-visitor': [testqapi], 'test-string-input-visitor': [testqapi], 'test-string-output-visitor': [testqapi], - 'test-qmp-event': [testqapi], 'test-opts-visitor': [testqapi], 'test-visitor-serialization': [testqapi], 'test-bitmap': [], @@ -117,6 +116,12 @@ tests = { 'test-qapi-util': [], } +if have_system or have_tools + tests += { + 'test-qmp-event': [testqapi], + } +endif + test_deps = { 'test-qht-par': qht_bench, } @@ -276,7 +281,9 @@ test('decodetree', sh, workdir: meson.current_source_dir() / 'decode', suite: 'decodetree') -subdir('fp') +if 'CONFIG_TCG' in config_all + subdir('fp') +endif if not get_option('tcg').disabled() if 'CONFIG_PLUGIN' in config_host diff --git a/tests/qapi-schema/comments.out b/tests/qapi-schema/comments.out index 273f0f54e1..ce4f6a4f0f 100644 --- a/tests/qapi-schema/comments.out +++ b/tests/qapi-schema/comments.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/doc-good.out b/tests/qapi-schema/doc-good.out index 419284dae2..715b0bbc1a 100644 --- a/tests/qapi-schema/doc-good.out +++ b/tests/qapi-schema/doc-good.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/empty.out b/tests/qapi-schema/empty.out index 69666c39ad..3feb3f69d3 100644 --- a/tests/qapi-schema/empty.out +++ b/tests/qapi-schema/empty.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/event-case.out b/tests/qapi-schema/event-case.out index 42ae519656..9ae44052ac 100644 --- a/tests/qapi-schema/event-case.out +++ b/tests/qapi-schema/event-case.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/include-repetition.out b/tests/qapi-schema/include-repetition.out index 0b654ddebb..16dbd9b819 100644 --- a/tests/qapi-schema/include-repetition.out +++ b/tests/qapi-schema/include-repetition.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/include-simple.out b/tests/qapi-schema/include-simple.out index 061f81e509..48e923bfbc 100644 --- a/tests/qapi-schema/include-simple.out +++ b/tests/qapi-schema/include-simple.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/indented-expr.out b/tests/qapi-schema/indented-expr.out index 04356775cd..6a30ded3fa 100644 --- a/tests/qapi-schema/indented-expr.out +++ b/tests/qapi-schema/indented-expr.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qapi-schema/qapi-schema-test.out b/tests/qapi-schema/qapi-schema-test.out index 8868ca0dca..3b1387d9f1 100644 --- a/tests/qapi-schema/qapi-schema-test.out +++ b/tests/qapi-schema/qapi-schema-test.out @@ -1,4 +1,4 @@ -module None +module ./builtin object q_empty enum QType prefix QTYPE diff --git a/tests/qemu-iotests/210.out b/tests/qemu-iotests/210.out index dc1a3c9786..2e9fc596eb 100644 --- a/tests/qemu-iotests/210.out +++ b/tests/qemu-iotests/210.out @@ -182,7 +182,7 @@ Job failed: The requested file size is too large === Resize image with invalid sizes === {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775296}} -{"error": {"class": 
"GenericError", "desc": "Required too big image size, it must be not greater than 9223372035781033984"}} +{"error": {"class": "GenericError", "desc": "offset(9223372036854775296) exceeds maximum(9223372035781033984)"}} {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775808}} {"error": {"class": "GenericError", "desc": "Invalid parameter type for 'size', expected: integer"}} {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 18446744073709551104}} diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264 index e725cefd47..4f96825a22 100755 --- a/tests/qemu-iotests/264 +++ b/tests/qemu-iotests/264 @@ -20,60 +20,102 @@ # import time +import os import iotests -from iotests import qemu_img_create, file_path, qemu_nbd_popen, log - -iotests.script_initialize( - supported_fmts=['qcow2'], -) +from iotests import qemu_img_create, file_path, qemu_nbd_popen disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock') nbd_uri = 'nbd+unix:///?socket=' + nbd_sock -size = 5 * 1024 * 1024 wait_limit = 3.0 wait_step = 0.2 -qemu_img_create('-f', iotests.imgfmt, disk_a, str(size)) -qemu_img_create('-f', iotests.imgfmt, disk_b, str(size)) - -with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b): - vm = iotests.VM().add_drive(disk_a) - vm.launch() - vm.hmp_qemu_io('drive0', 'write 0 {}'.format(size)) - - vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles], - **{'node_name': 'backup0', - 'driver': 'raw', - 'file': {'driver': 'nbd', - 'server': {'type': 'unix', 'path': nbd_sock}, - 'reconnect-delay': 10}}) - vm.qmp_log('blockdev-backup', device='drive0', sync='full', - target='backup0', speed=(1 * 1024 * 1024)) - - # Wait for some progress - t = 0.0 - while t < wait_limit: - jobs = vm.qmp('query-block-jobs')['return'] - if jobs and jobs[0]['offset'] > 0: - break - time.sleep(wait_step) - t += wait_step - - if jobs and jobs[0]['offset'] > 0: - log('Backup job is started') - -jobs = vm.qmp('query-block-jobs')['return'] -if jobs and jobs[0]['offset'] < jobs[0]['len']: - log('Backup job is still in progress') - -vm.qmp_log('block-job-set-speed', device='drive0', speed=0) - -# Emulate server down time for 1 second -time.sleep(1) - -with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b): - e = vm.event_wait('BLOCK_JOB_COMPLETED') - log('Backup completed: {}'.format(e['data']['offset'])) - vm.qmp_log('blockdev-del', node_name='backup0') - vm.shutdown() + +class TestNbdReconnect(iotests.QMPTestCase): + def init_vm(self, disk_size): + qemu_img_create('-f', iotests.imgfmt, disk_a, str(disk_size)) + qemu_img_create('-f', iotests.imgfmt, disk_b, str(disk_size)) + self.vm = iotests.VM().add_drive(disk_a) + self.vm.launch() + self.vm.hmp_qemu_io('drive0', 'write 0 {}'.format(disk_size)) + + def tearDown(self): + self.vm.shutdown() + os.remove(disk_a) + os.remove(disk_b) + + def start_job(self, job): + """Stat job with nbd target and kill the server""" + assert job in ('blockdev-backup', 'blockdev-mirror') + with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b): + result = self.vm.qmp('blockdev-add', + **{'node_name': 'backup0', + 'driver': 'raw', + 'file': {'driver': 'nbd', + 'server': {'type': 'unix', + 'path': nbd_sock}, + 'reconnect-delay': 10}}) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp(job, device='drive0', + sync='full', target='backup0', + speed=(1 * 1024 * 1024)) + self.assert_qmp(result, 'return', {}) + + # Wait for some progress + t = 0.0 + while t < wait_limit: + jobs = 
self.vm.qmp('query-block-jobs')['return'] + if jobs and jobs[0]['offset'] > 0: + break + time.sleep(wait_step) + t += wait_step + + self.assertTrue(jobs and jobs[0]['offset'] > 0) # job started + + jobs = self.vm.qmp('query-block-jobs')['return'] + # Check that job is still in progress + self.assertTrue(jobs) + self.assertTrue(jobs[0]['offset'] < jobs[0]['len']) + + result = self.vm.qmp('block-job-set-speed', device='drive0', speed=0) + self.assert_qmp(result, 'return', {}) + + # Emulate server down time for 1 second + time.sleep(1) + + def test_backup(self): + size = 5 * 1024 * 1024 + self.init_vm(size) + self.start_job('blockdev-backup') + + with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b): + e = self.vm.event_wait('BLOCK_JOB_COMPLETED') + self.assertEqual(e['data']['offset'], size) + result = self.vm.qmp('blockdev-del', node_name='backup0') + self.assert_qmp(result, 'return', {}) + + def cancel_job(self): + result = self.vm.qmp('block-job-cancel', device='drive0') + self.assert_qmp(result, 'return', {}) + + start_t = time.time() + self.vm.event_wait('BLOCK_JOB_CANCELLED') + delta_t = time.time() - start_t + self.assertTrue(delta_t < 2.0) + + def test_mirror_cancel(self): + # Mirror speed limit doesn't work well enough; it seems that mirror + # will run many parallel requests anyway. MAX_IN_FLIGHT is 16 and + # MAX_IO_BYTES is 1M in mirror.c, so let's use a 20M disk. + self.init_vm(20 * 1024 * 1024) + self.start_job('blockdev-mirror') + self.cancel_job() + + def test_backup_cancel(self): + self.init_vm(5 * 1024 * 1024) + self.start_job('blockdev-backup') + self.cancel_job() + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2']) diff --git a/tests/qemu-iotests/264.out b/tests/qemu-iotests/264.out index c45b1e81ef..8d7e996700 100644 --- a/tests/qemu-iotests/264.out +++ b/tests/qemu-iotests/264.out @@ -1,15 +1,5 @@ -Start NBD server -{"execute": "blockdev-add", "arguments": {"driver": "raw", "file": {"driver": "nbd", "reconnect-delay": 10, "server": {"path": "TEST_DIR/PID-nbd-sock", "type": "unix"}}, "node-name": "backup0"}} -{"return": {}} -{"execute": "blockdev-backup", "arguments": {"device": "drive0", "speed": 1048576, "sync": "full", "target": "backup0"}} -{"return": {}} -Backup job is started -Kill NBD server -Backup job is still in progress -{"execute": "block-job-set-speed", "arguments": {"device": "drive0", "speed": 0}} -{"return": {}} -Start NBD server -Backup completed: 5242880 -{"execute": "blockdev-del", "arguments": {"node-name": "backup0"}} -{"return": {}} -Kill NBD server +... 
+---------------------------------------------------------------------- +Ran 3 tests + +OK diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out index 27471ffae8..7176e376e1 100644 --- a/tests/qemu-iotests/267.out +++ b/tests/qemu-iotests/267.out @@ -6,11 +6,11 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 Testing: QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 -Error: No block device can accept snapshots +Error: no block device can store vmstate for snapshot (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 -Error: No block device supports snapshots +Error: no block device can store vmstate for snapshot (qemu) quit @@ -22,7 +22,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'none0' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'none0' is writable but does not support snapshots (qemu) quit @@ -58,7 +58,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'virtio0' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'virtio0' is writable but does not support snapshots (qemu) quit @@ -83,7 +83,7 @@ QEMU X.Y.Z monitor - type 'help' for more information (qemu) savevm snap0 Error: Device 'file' is writable but does not support snapshots (qemu) info snapshots -No available block device supports snapshots +no block device can store vmstate for snapshot (qemu) loadvm snap0 Error: Device 'file' is writable but does not support snapshots (qemu) quit diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index ef105dfc39..0fc52d20d7 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -53,6 +53,15 @@ _in_fd=4 # If $mismatch_only is set, only non-matching responses will # be echoed. # +# If $capture_events is non-empty, then any QMP event names it lists +# will not be echoed out, but instead collected in the $QEMU_EVENTS +# variable. The _wait_event function can later be used to receive +# the cached events. +# +# If $only_capture_events is set to anything but an empty string, +# then an error will be raised if a QMP message is seen which is +# not an event listed in $capture_events. +# # If $success_or_failure is set, the meaning of the arguments is # changed as follows: # $2: A string to search for in the response; if found, this indicates @@ -78,6 +87,31 @@ _timed_wait_for() QEMU_STATUS[$h]=0 while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]} do + if [ -n "$capture_events" ]; then + capture=0 + local evname + for evname in $capture_events + do + case ${resp} in + *\"event\":\ \"${evname}\"* ) capture=1 ;; + esac + done + if [ $capture = 1 ]; + then + ev=$(echo "${resp}" | tr -d '\r' | tr % .) 
+ QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}" + if [ -n "$only_capture_events" ]; then + return + else + continue + fi + fi + fi + if [ -n "$only_capture_events" ]; then + echo "Only expected $capture_events but got ${resp}" + exit 1 + fi + if [ -z "${silent}" ] && [ -z "${mismatch_only}" ]; then echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp @@ -172,12 +206,82 @@ _send_qemu_cmd() let count--; done if [ ${QEMU_STATUS[$h]} -ne 0 ] && [ -z "${qemu_error_no_exit}" ]; then - echo "Timeout waiting for ${1} on handle ${h}" + echo "Timeout waiting for command ${1} response on handle ${h}" exit 1 #Timeout means the test failed fi } +# Check event cache for a named QMP event + +# Input parameters: +# $1: Name of the QMP event to check for +# +# Checks whether the named QMP event was previously captured +# into $QEMU_EVENTS. When matched, the QMP event will be echoed +# and the $matched variable set to 1. +# +# _wait_event is more suitable for test usage in most cases +_check_cached_events() +{ + local evname=${1} + + local match="\"event\": \"$evname\"" + + matched=0 + if [ -n "$QEMU_EVENTS" ]; then + CURRENT_QEMU_EVENTS=$QEMU_EVENTS + QEMU_EVENTS= + old_IFS=$IFS + IFS="%" + for ev in $CURRENT_QEMU_EVENTS + do + grep -q "$match" < <(echo "${ev}") + if [ $? -eq 0 ] && [ $matched = 0 ]; then + echo "${ev}" | _filter_testdir | _filter_qemu \ + | _filter_qemu_io | _filter_qmp | _filter_hmp + matched=1 + else + QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}" + fi + done + IFS=$old_IFS + fi +} + +# Wait for a named QMP event +# +# Input parameters: +# $1: QEMU handle to use +# $2: Name of the QMP event to wait for +# +# Checks whether the named QMP event was previously captured +# into $QEMU_EVENTS. If none are present, then waits for the +# event to arrive on the QMP channel. When matched, the QMP +# event will be echoed +_wait_event() +{ + local h=${1} + local evname=${2} + + while true + do + _check_cached_events $evname + + if [ $matched = 1 ]; + then + return + fi + + only_capture_events=1 qemu_error_no_exit=1 _timed_wait_for ${h} + + if [ ${QEMU_STATUS[$h]} -ne 0 ] ; then + echo "Timeout waiting for event ${evname} on handle ${h}" + exit 1 #Timeout means the test failed + fi + done +} + # Launch a QEMU process. # # Input parameters: diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 297acf9b6a..77c37e8312 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -109,8 +109,14 @@ peek_file_raw() dd if="$1" bs=1 skip="$2" count="$3" status=none } - -if ! . ./common.config +config=common.config +test -f $config || config=../common.config +if ! test -f $config +then + echo "$0: failed to find common.config" + exit 1 +fi +if ! . 
$config then echo "$0: failed to source common.config" exit 1 diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 00be68eca3..4e758308f2 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -296,7 +296,9 @@ def qemu_nbd_list_log(*args: str) -> str: @contextmanager def qemu_nbd_popen(*args): '''Context manager running qemu-nbd within the context''' - pid_file = file_path("pid") + pid_file = file_path("qemu_nbd_popen-nbd-pid-file") + + assert not os.path.exists(pid_file) cmd = list(qemu_nbd_args) cmd.extend(('--persistent', '--pid-file', pid_file)) @@ -314,6 +316,8 @@ def qemu_nbd_popen(*args): time.sleep(0.01) yield finally: + if os.path.exists(pid_file): + os.remove(pid_file) log('Kill NBD server') p.kill() p.wait() diff --git a/tests/qtest/fuzz/fuzz.c b/tests/qtest/fuzz/fuzz.c index 238866a037..496d11a231 100644 --- a/tests/qtest/fuzz/fuzz.c +++ b/tests/qtest/fuzz/fuzz.c @@ -159,6 +159,8 @@ int LLVMFuzzerInitialize(int *argc, char ***argv, char ***envp) char *target_name; const char *bindir; char *datadir; + GString *cmd_line; + gchar *pretty_cmd_line; bool serialize = false; /* Initialize qgraph and modules */ @@ -217,7 +219,7 @@ int LLVMFuzzerInitialize(int *argc, char ***argv, char ***envp) } /* Run QEMU's softmmu main with the fuzz-target dependent arguments */ - GString *cmd_line = fuzz_target->get_init_cmdline(fuzz_target); + cmd_line = fuzz_target->get_init_cmdline(fuzz_target); g_string_append_printf(cmd_line, " %s -qtest /dev/null ", getenv("QTEST_LOG") ? "" : "-qtest-log none"); @@ -226,6 +228,13 @@ int LLVMFuzzerInitialize(int *argc, char ***argv, char ***envp) wordexp(cmd_line->str, &result, 0); g_string_free(cmd_line, true); + if (getenv("QTEST_LOG")) { + pretty_cmd_line = g_strjoinv(" ", result.we_wordv + 1); + printf("Starting %s with Arguments: %s\n", + result.we_wordv[0], pretty_cmd_line); + g_free(pretty_cmd_line); + } + qemu_init(result.we_wordc, result.we_wordv, NULL); /* re-enable the rcu atfork, which was previously disabled in qemu_init */ diff --git a/tests/qtest/fuzz/generic_fuzz.c b/tests/qtest/fuzz/generic_fuzz.c index be76d47d2d..ee8c17a04c 100644 --- a/tests/qtest/fuzz/generic_fuzz.c +++ b/tests/qtest/fuzz/generic_fuzz.c @@ -175,7 +175,7 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) * generic_fuzz(), avoiding potential race-conditions, which we don't have * a good way for reproducing right now. */ -void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr, bool is_write) +void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr) { /* Are we in the generic-fuzzer or are we using another fuzz-target? */ if (!qts_global) { @@ -187,14 +187,11 @@ void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr, bool is_write) * - We have no DMA patterns defined * - The length of the DMA read request is zero * - The DMA read is hitting an MR other than the machine's main RAM - * - The DMA request is not a read (what happens for a address_space_map - * with is_write=True? Can the device use the same pointer to do reads?) 
* - The DMA request hits past the bounds of our RAM */ if (dma_patterns->len == 0 || len == 0 || mr != current_machine->ram - || is_write || addr > current_machine->ram_size) { return; } @@ -213,12 +210,12 @@ void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr, bool is_write) double_fetch = true; if (addr < region.addr && avoid_double_fetches) { - fuzz_dma_read_cb(addr, region.addr - addr, mr, is_write); + fuzz_dma_read_cb(addr, region.addr - addr, mr); } if (addr + len > region.addr + region.size && avoid_double_fetches) { fuzz_dma_read_cb(region.addr + region.size, - addr + len - (region.addr + region.size), mr, is_write); + addr + len - (region.addr + region.size), mr); } return; } @@ -936,12 +933,20 @@ static GString *generic_fuzz_cmdline(FuzzTarget *t) static GString *generic_fuzz_predefined_config_cmdline(FuzzTarget *t) { + gchar *args; const generic_fuzz_config *config; g_assert(t->opaque); config = t->opaque; setenv("QEMU_AVOID_DOUBLE_FETCH", "1", 1); - setenv("QEMU_FUZZ_ARGS", config->args, 1); + if (config->argfunc) { + args = config->argfunc(); + setenv("QEMU_FUZZ_ARGS", args, 1); + g_free(args); + } else { + g_assert_nonnull(config->args); + setenv("QEMU_FUZZ_ARGS", config->args, 1); + } setenv("QEMU_FUZZ_OBJECTS", config->objects, 1); return generic_fuzz_cmdline(t); } diff --git a/tests/qtest/fuzz/generic_fuzz_configs.h b/tests/qtest/fuzz/generic_fuzz_configs.h index 7fed035345..5d599765c4 100644 --- a/tests/qtest/fuzz/generic_fuzz_configs.h +++ b/tests/qtest/fuzz/generic_fuzz_configs.h @@ -16,8 +16,19 @@ typedef struct generic_fuzz_config { const char *name, *args, *objects; + gchar* (*argfunc)(void); /* Result must be freeable by g_free() */ } generic_fuzz_config; +static inline gchar *generic_fuzzer_virtio_9p_args(void){ + char tmpdir[] = "/tmp/qemu-fuzz.XXXXXX"; + g_assert_nonnull(mkdtemp(tmpdir)); + + return g_strdup_printf("-machine q35 -nodefaults " + "-device virtio-9p,fsdev=hshare,mount_tag=hshare " + "-fsdev local,id=hshare,path=%s,security_model=mapped-xattr," + "writeout=immediate,fmode=0600,dmode=0700", tmpdir); +} + const generic_fuzz_config predefined_configs[] = { { .name = "virtio-net-pci-slirp", @@ -60,6 +71,16 @@ const generic_fuzz_config predefined_configs[] = { .args = "-machine q35 -nodefaults -device virtio-mouse", .objects = "virtio*", },{ + .name = "virtio-9p", + .argfunc = generic_fuzzer_virtio_9p_args, + .objects = "virtio*", + },{ + .name = "virtio-9p-synth", + .args = "-machine q35 -nodefaults " + "-device virtio-9p,fsdev=hshare,mount_tag=hshare " + "-fsdev synth,id=hshare", + .objects = "virtio*", + },{ .name = "e1000", .args = "-M q35 -nodefaults " "-device e1000,netdev=net0 -netdev user,id=net0", @@ -85,10 +106,28 @@ const generic_fuzz_config predefined_configs[] = { .objects = "intel-hda", },{ .name = "ide-hd", + .args = "-machine pc -nodefaults " + "-drive file=null-co://,if=none,format=raw,id=disk0 " + "-device ide-hd,drive=disk0", + .objects = "*ide*", + },{ + .name = "ide-atapi", + .args = "-machine pc -nodefaults " + "-drive file=null-co://,if=none,format=raw,id=disk0 " + "-device ide-cd,drive=disk0", + .objects = "*ide*", + },{ + .name = "ahci-hd", .args = "-machine q35 -nodefaults " "-drive file=null-co://,if=none,format=raw,id=disk0 " "-device ide-hd,drive=disk0", - .objects = "ahci*", + .objects = "*ahci*", + },{ + .name = "ahci-atapi", + .args = "-machine q35 -nodefaults " + "-drive file=null-co://,if=none,format=raw,id=disk0 " + "-device ide-cd,drive=disk0", + .objects = "*ahci*", },{ .name = "floppy", .args = "-machine 
pc -nodefaults -device floppy,id=floppy0 " diff --git a/tests/tcg/Makefile.qemu b/tests/tcg/Makefile.qemu index c096c611a2..a56564660c 100644 --- a/tests/tcg/Makefile.qemu +++ b/tests/tcg/Makefile.qemu @@ -90,11 +90,11 @@ run-guest-tests: guest-tests else guest-tests: - $(call quiet-command, /bin/true, "BUILD", \ + $(call quiet-command, true, "BUILD", \ "$(TARGET) guest-tests SKIPPED") run-guest-tests: - $(call quiet-command, /bin/true, "RUN", \ + $(call quiet-command, true, "RUN", \ "tests for $(TARGET) SKIPPED") endif diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target index 1dd0f64d23..abbdb2e126 100644 --- a/tests/tcg/multiarch/Makefile.target +++ b/tests/tcg/multiarch/Makefile.target @@ -63,8 +63,11 @@ run-gdbstub-qxfer-auxv-read: sha1 --bin $< --test $(MULTIARCH_SRC)/gdbstub/test-qxfer-auxv-read.py, \ "basic gdbstub qXfer:auxv:read support") -EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read +else +run-gdbstub-%: + $(call skip-test, "gdbstub test $*", "need working gdb") endif +EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read # Update TESTS diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c index 00d93204f9..5b2110e861 100644 --- a/util/event_notifier-posix.c +++ b/util/event_notifier-posix.c @@ -29,6 +29,7 @@ void event_notifier_init_fd(EventNotifier *e, int fd) { e->rfd = fd; e->wfd = fd; + e->initialized = true; } #endif @@ -68,6 +69,7 @@ int event_notifier_init(EventNotifier *e, int active) if (active) { event_notifier_set(e); } + e->initialized = true; return 0; fail: @@ -78,12 +80,18 @@ fail: void event_notifier_cleanup(EventNotifier *e) { + if (!e->initialized) { + return; + } + if (e->rfd != e->wfd) { close(e->rfd); } + e->rfd = -1; close(e->wfd); e->wfd = -1; + e->initialized = false; } int event_notifier_get_fd(const EventNotifier *e) @@ -96,6 +104,10 @@ int event_notifier_set(EventNotifier *e) static const uint64_t value = 1; ssize_t ret; + if (!e->initialized) { + return -1; + } + do { ret = write(e->wfd, &value, sizeof(value)); } while (ret < 0 && errno == EINTR); @@ -113,6 +125,10 @@ int event_notifier_test_and_clear(EventNotifier *e) ssize_t len; char buffer[512]; + if (!e->initialized) { + return 0; + } + /* Drain the notify pipe. For eventfd, only 8 bytes will be read. 
*/ value = 0; do { diff --git a/util/fifo8.c b/util/fifo8.c index a5dd789ce5..d4d1c135e0 100644 --- a/util/fifo8.c +++ b/util/fifo8.c @@ -31,9 +31,7 @@ void fifo8_destroy(Fifo8 *fifo) void fifo8_push(Fifo8 *fifo, uint8_t data) { - if (fifo->num == fifo->capacity) { - abort(); - } + assert(fifo->num < fifo->capacity); fifo->data[(fifo->head + fifo->num) % fifo->capacity] = data; fifo->num++; } @@ -42,9 +40,7 @@ void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num) { uint32_t start, avail; - if (fifo->num + num > fifo->capacity) { - abort(); - } + assert(fifo->num + num <= fifo->capacity); start = (fifo->head + fifo->num) % fifo->capacity; @@ -63,9 +59,7 @@ uint8_t fifo8_pop(Fifo8 *fifo) { uint8_t ret; - if (fifo->num == 0) { - abort(); - } + assert(fifo->num > 0); ret = fifo->data[fifo->head++]; fifo->head %= fifo->capacity; fifo->num--; @@ -76,9 +70,7 @@ const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *num) { uint8_t *ret; - if (max == 0 || max > fifo->num) { - abort(); - } + assert(max > 0 && max <= fifo->num); *num = MIN(fifo->capacity - fifo->head, max); ret = &fifo->data[fifo->head]; fifo->head += *num; diff --git a/util/meson.build b/util/meson.build index 3eccdbe596..984fba965f 100644 --- a/util/meson.build +++ b/util/meson.build @@ -52,6 +52,7 @@ if have_system util_ss.add(files('crc-ccitt.c')) util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio]) util_ss.add(files('yank.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c')) endif if have_block diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 890fda6a35..e6fa8b598b 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -87,7 +87,8 @@ void *qemu_ram_mmap(int fd, size_t align, bool readonly, bool shared, - bool is_pmem) + bool is_pmem, + off_t map_offset) { int prot; int flags; @@ -150,7 +151,8 @@ void *qemu_ram_mmap(int fd, prot = PROT_READ | (readonly ? 0 : PROT_WRITE); - ptr = mmap(guardptr + offset, size, prot, flags | map_sync_flags, fd, 0); + ptr = mmap(guardptr + offset, size, prot, + flags | map_sync_flags, fd, map_offset); if (ptr == MAP_FAILED && map_sync_flags) { if (errno == ENOTSUP) { @@ -174,7 +176,7 @@ void *qemu_ram_mmap(int fd, * if map failed with MAP_SHARED_VALIDATE | MAP_SYNC, * we will remove these flags to handle compatibility. 
*/ - ptr = mmap(guardptr + offset, size, prot, flags, fd, 0); + ptr = mmap(guardptr + offset, size, prot, flags, fd, map_offset); } if (ptr == MAP_FAILED) { diff --git a/util/oslib-posix.c b/util/oslib-posix.c index bf57d3b030..36820fec16 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -230,7 +230,7 @@ void *qemu_memalign(size_t alignment, size_t size) void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared) { size_t align = QEMU_VMALLOC_ALIGN; - void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false); + void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false, 0); if (ptr == MAP_FAILED) { return NULL; diff --git a/util/trace-events b/util/trace-events index 61e0d4bcdf..bac0924899 100644 --- a/util/trace-events +++ b/util/trace-events @@ -91,3 +91,12 @@ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uin qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" + +# userfaultfd.c +uffd_query_features_nosys(int err) "errno: %i" +uffd_query_features_api_failed(int err) "errno: %i" +uffd_create_fd_nosys(int err) "errno: %i" +uffd_create_fd_api_failed(int err) "errno: %i" +uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 " ioctl_supp: 0x%" PRIx64 +uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i" +uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i" diff --git a/util/userfaultfd.c b/util/userfaultfd.c new file mode 100644 index 0000000000..f1cd6af2b1 --- /dev/null +++ b/util/userfaultfd.c @@ -0,0 +1,345 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" +#include "qemu/error-report.h" +#include "qemu/userfaultfd.h" +#include "trace.h" +#include <poll.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> + +/** + * uffd_query_features: query UFFD features + * + * Returns: 0 on success, negative value in case of an error + * + * @features: parameter to receive 'uffdio_api.features' + */ +int uffd_query_features(uint64_t *features) +{ + int uffd_fd; + struct uffdio_api api_struct = { 0 }; + int ret = -1; + + uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (uffd_fd < 0) { + trace_uffd_query_features_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = 0; + + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_query_features_api_failed(errno); + goto out; + } + *features = api_struct.features; + ret = 0; + +out: + close(uffd_fd); + return ret; +} + +/** + * uffd_create_fd: create UFFD file descriptor + * + * Returns non-negative file descriptor or negative value in case of an error + * + * @features: UFFD features to request + * @non_blocking: create UFFD file descriptor for non-blocking operation + */ +int uffd_create_fd(uint64_t features, bool non_blocking) +{ + int uffd_fd; + int flags; + struct uffdio_api api_struct = { 0 }; + uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); + + flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); + uffd_fd = syscall(__NR_userfaultfd, flags); + if (uffd_fd < 0) { + trace_uffd_create_fd_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = features; + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_create_fd_api_failed(errno); + goto fail; + } + if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { + trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); + goto fail; + } + + return uffd_fd; + +fail: + close(uffd_fd); + return -1; +} + +/** + * uffd_close_fd: close UFFD file descriptor + * + * @uffd_fd: UFFD file descriptor + */ +void uffd_close_fd(int uffd_fd) +{ + assert(uffd_fd >= 0); + close(uffd_fd); +} + +/** + * uffd_register_memory: register memory range via UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) 
+ * @ioctls: optional pointer to receive supported IOCTL mask + */ +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls) +{ + struct uffdio_register uffd_register; + + uffd_register.range.start = (uintptr_t) addr; + uffd_register.range.len = length; + uffd_register.mode = mode; + + if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { + trace_uffd_register_memory_failed(addr, length, mode, errno); + return -1; + } + if (ioctls) { + *ioctls = uffd_register.ioctls; + } + + return 0; +} + +/** + * uffd_unregister_memory: un-register memory range with UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + */ +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { + trace_uffd_unregister_memory_failed(addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO + * + * Returns 0 on success, negative value in case of error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @wp: write-protect/unprotect + * @dont_wake: do not wake threads waiting on wr-protected page + */ +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake) +{ + struct uffdio_writeprotect uffd_writeprotect; + + uffd_writeprotect.range.start = (uintptr_t) addr; + uffd_writeprotect.range.len = length; + if (!wp && dont_wake) { + /* DONTWAKE is meaningful only on protection release */ + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + } else { + uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); + } + + if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_writeprotect.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_copy_page: copy range of pages to destination via UFFD-IO + * + * Copy range of source pages to the destination to resolve + * a missing page fault somewhere in the destination range. + * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @dst_addr: destination base address + * @src_addr: source base address + * @length: length of the range to copy + * @dont_wake: do not wake threads waiting on missing page + */ +int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, + uint64_t length, bool dont_wake) +{ + struct uffdio_copy uffd_copy; + + uffd_copy.dst = (uintptr_t) dst_addr; + uffd_copy.src = (uintptr_t) src_addr; + uffd_copy.len = length; + uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0; + + if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) { + error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64 + " mode=%" PRIx64 " errno=%i", dst_addr, src_addr, + length, (uint64_t) uffd_copy.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_zero_page: fill range of pages with zeroes via UFFD-IO + * + * Fill a range of pages with zeroes to resolve a missing page fault within the range. 
+ * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address + * @length: length of the range to fill with zeroes + * @dont_wake: do not wake threads waiting on missing page + */ +int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake) +{ + struct uffdio_zeropage uffd_zeropage; + + uffd_zeropage.range.start = (uintptr_t) addr; + uffd_zeropage.range.len = length; + uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0; + + if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) { + error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_zeropage.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution + * + * Wake up threads waiting on any page/pages from the designated range. + * The main use case is when, during some period, page faults are resolved + * via UFFD-IO IOCTLs with the MODE_DONTWAKE flag set; after that, all waits + * for the whole memory range are satisfied in a single call to uffd_wakeup(). + * + * Returns 0 on success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address + * @length: length of the range + */ +int uffd_wakeup(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) { + error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i", + addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_read_events: read pending UFFD events + * + * Returns number of fetched messages, 0 if none are available, or + * negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @msgs: pointer to message buffer + * @count: number of messages that can fit in the buffer + */ +int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) +{ + ssize_t res; + do { + res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg)); + } while (res < 0 && errno == EINTR); + + if ((res < 0 && errno == EAGAIN)) { + return 0; + } + if (res < 0) { + error_report("uffd_read_events() failed: errno=%i", errno); + return -1; + } + + return (int) (res / sizeof(struct uffd_msg)); } + +/** + * uffd_poll_events: poll UFFD file descriptor for read + * + * Returns true if events are available for read, false otherwise + * + * @uffd_fd: UFFD file descriptor + * @tmo: timeout value + */ +bool uffd_poll_events(int uffd_fd, int tmo) +{ + int res; + struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; + + do { + res = poll(&poll_fd, 1, tmo); + } while (res < 0 && errno == EINTR); + + if (res == 0) { + return false; + } + if (res < 0) { + error_report("uffd_poll_events() failed: errno=%i", errno); + return false; + } + + return (poll_fd.revents & POLLIN) != 0; +}
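For reviewers unfamiliar with the new helpers, here is a minimal usage sketch (not part of the patch) of how the uffd_* API above is meant to compose for write-protect tracking. It assumes the declarations this series adds in "qemu/userfaultfd.h", a kernel exposing UFFD_FEATURE_PAGEFAULT_FLAG_WP, and an illustrative 4 KiB page size; the function name and buffer layout are hypothetical:

/* Hypothetical sketch: query features, create the fd, register and
 * write-protect a range, then resolve one write fault by un-protecting
 * the faulting page. Error handling is kept deliberately minimal. */
#include "qemu/osdep.h"
#include "qemu/userfaultfd.h"
#include <sys/mman.h>

static int uffd_wp_example(void)
{
    const size_t page = 4096;   /* illustrative; use the host page size */
    const size_t len = 16 * page;
    uint64_t features = 0;
    struct uffd_msg msg;
    void *buf;
    int uffd;

    /* Bail out early if the kernel cannot write-protect via UFFD */
    if (uffd_query_features(&features) < 0 ||
        !(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
        return -1;
    }
    uffd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd < 0) {
        return -1;
    }

    buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        uffd_close_fd(uffd);
        return -1;
    }

    /* Register the range, then arm write protection on all of it */
    if (uffd_register_memory(uffd, buf, len, UFFDIO_REGISTER_MODE_WP, NULL) ||
        uffd_change_protection(uffd, buf, len, true, false)) {
        goto out;
    }

    /* Poll up to 1s for a write fault; resolve it by dropping protection
     * on the faulting page, which wakes the blocked writer. */
    if (uffd_poll_events(uffd, 1000) &&
        uffd_read_events(uffd, &msg, 1) == 1 &&
        msg.event == UFFD_EVENT_PAGEFAULT) {
        void *fault = (void *)(uintptr_t)(msg.arg.pagefault.address &
                                          ~(uint64_t)(page - 1));
        uffd_change_protection(uffd, fault, page, false, false);
    }

out:
    uffd_unregister_memory(uffd, buf, len);
    munmap(buf, len);
    uffd_close_fd(uffd);
    return 0;
}

The dont_wake parameters let a caller batch fault resolution: resolve many pages with dont_wake=true, then issue a single uffd_wakeup() over the whole range, which is the pattern the doc comments above describe.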