diff options
143 files changed, 2718 insertions, 1074 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index bf1fc5b21e..50435b8d2f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -271,8 +271,9 @@ M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org S: Maintained F: target/ppc/ -F: hw/ppc/ -F: include/hw/ppc/ +F: hw/ppc/ppc.c +F: hw/ppc/ppc_booke.c +F: include/hw/ppc/ppc.h F: disas/ppc.c RISC-V TCG CPUs @@ -1235,24 +1236,18 @@ F: hw/openrisc/openrisc_sim.c PowerPC Machines ---------------- 405 -M: David Gibson <david@gibson.dropbear.id.au> -M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org -S: Odd Fixes +S: Orphan F: hw/ppc/ppc405_boards.c Bamboo -M: David Gibson <david@gibson.dropbear.id.au> -M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org -S: Odd Fixes +S: Orphan F: hw/ppc/ppc440_bamboo.c e500 -M: David Gibson <david@gibson.dropbear.id.au> -M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org -S: Odd Fixes +S: Orphan F: hw/ppc/e500* F: hw/gpio/mpc8xxx.c F: hw/i2c/mpc_i2c.c @@ -1261,20 +1256,18 @@ F: hw/pci-host/ppce500.c F: include/hw/ppc/ppc_e500.h F: include/hw/pci-host/ppce500.h F: pc-bios/u-boot.e500 +F: hw/intc/openpic_kvm.h +F: include/hw/ppc/openpic_kvm.h mpc8544ds -M: David Gibson <david@gibson.dropbear.id.au> -M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org -S: Odd Fixes +S: Orphan F: hw/ppc/mpc8544ds.c F: hw/ppc/mpc8544_guts.c F: tests/acceptance/ppc_mpc8544ds.py New World (mac99) M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> -R: David Gibson <david@gibson.dropbear.id.au> -R: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org S: Odd Fixes F: hw/ppc/mac_newworld.c @@ -1293,8 +1286,6 @@ F: pc-bios/qemu_vga.ndrv Old World (g3beige) M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> -R: David Gibson <david@gibson.dropbear.id.au> -R: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org S: Odd Fixes F: hw/ppc/mac_oldworld.c @@ -1308,8 +1299,6 @@ F: pc-bios/qemu_vga.ndrv PReP M: Hervé Poussineau <hpoussin@reactos.org> -R: David Gibson <david@gibson.dropbear.id.au> -R: Greg Kurz <groug@kaod.org> L: 
qemu-ppc@nongnu.org S: Maintained F: hw/ppc/prep.c @@ -1328,7 +1317,7 @@ sPAPR M: David Gibson <david@gibson.dropbear.id.au> M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org -S: Supported +S: Maintained F: hw/*/spapr* F: include/hw/*/spapr* F: hw/*/xics* @@ -1344,8 +1333,6 @@ F: tests/acceptance/ppc_pseries.py PowerNV (Non-Virtualized) M: Cédric Le Goater <clg@kaod.org> -M: David Gibson <david@gibson.dropbear.id.au> -M: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org S: Maintained F: hw/ppc/pnv* @@ -1366,8 +1353,6 @@ F: tests/acceptance/ppc_virtex_ml507.py sam460ex M: BALATON Zoltan <balaton@eik.bme.hu> -R: David Gibson <david@gibson.dropbear.id.au> -R: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org S: Maintained F: hw/ppc/sam460ex.c @@ -1381,7 +1366,6 @@ F: roms/u-boot-sam460ex pegasos2 M: BALATON Zoltan <balaton@eik.bme.hu> -R: David Gibson <david@gibson.dropbear.id.au> L: qemu-ppc@nongnu.org S: Maintained F: hw/ppc/pegasos2.c @@ -1785,9 +1769,8 @@ F: include/hw/acpi/ghes.h F: docs/specs/acpi_hest_ghes.rst ppc4xx -M: David Gibson <david@gibson.dropbear.id.au> L: qemu-ppc@nongnu.org -S: Odd Fixes +S: Orphan F: hw/ppc/ppc4*.c F: hw/i2c/ppc4xx_i2c.c F: include/hw/ppc/ppc4xx.h @@ -2242,8 +2225,6 @@ T: git https://github.com/philmd/qemu.git fw_cfg-next XIVE M: Cédric Le Goater <clg@kaod.org> -R: David Gibson <david@gibson.dropbear.id.au> -R: Greg Kurz <groug@kaod.org> L: qemu-ppc@nongnu.org S: Supported F: hw/*/*xive* @@ -2279,6 +2260,12 @@ F: net/can/* F: hw/net/can/* F: include/net/can_*.h +OpenPIC interrupt controller +M: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> +S: Odd Fixes +F: hw/intc/openpic.c +F: include/hw/ppc/openpic.h + Subsystems ---------- Overall Audio backends diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c index 93976f4ece..6cbd2c3f97 100644 --- a/accel/hvf/hvf-accel-ops.c +++ b/accel/hvf/hvf-accel-ops.c @@ -295,6 +295,7 @@ static void hvf_region_del(MemoryListener *listener, } static MemoryListener 
hvf_memory_listener = { + .name = "hvf", .priority = 10, .region_add = hvf_region_add, .region_del = hvf_region_del, diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index cace5ffe64..db8d83b137 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1129,6 +1129,7 @@ static void kvm_coalesce_pio_del(MemoryListener *listener, } static MemoryListener kvm_coalesced_pio_listener = { + .name = "kvm-coalesced-pio", .coalesced_io_add = kvm_coalesce_pio_add, .coalesced_io_del = kvm_coalesce_pio_del, }; @@ -1633,7 +1634,7 @@ static void kvm_io_ioeventfd_del(MemoryListener *listener, } void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, - AddressSpace *as, int as_id) + AddressSpace *as, int as_id, const char *name) { int i; @@ -1649,6 +1650,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, kml->listener.log_start = kvm_log_start; kml->listener.log_stop = kvm_log_stop; kml->listener.priority = 10; + kml->listener.name = name; if (s->kvm_dirty_ring_size) { kml->listener.log_sync_global = kvm_log_sync_global; @@ -1669,6 +1671,7 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, } static MemoryListener kvm_io_listener = { + .name = "kvm-io", .eventfd_add = kvm_io_ioeventfd_add, .eventfd_del = kvm_io_ioeventfd_del, .priority = 10, @@ -2579,7 +2582,7 @@ static int kvm_init(MachineState *ms) s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; kvm_memory_listener_register(s, &s->memory_listener, - &address_space_memory, 0); + &address_space_memory, 0, "kvm-memory"); if (kvm_eventfds_allowed) { memory_listener_register(&kvm_io_listener, &address_space_io); diff --git a/backends/hostmem-epc.c b/backends/hostmem-epc.c new file mode 100644 index 0000000000..b47f98b6a3 --- /dev/null +++ b/backends/hostmem-epc.c @@ -0,0 +1,82 @@ +/* + * QEMU host SGX EPC memory backend + * + * Copyright (C) 2019 Intel Corporation + * + * Authors: + * Sean Christopherson 
<sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include <sys/ioctl.h> + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qom/object_interfaces.h" +#include "qapi/error.h" +#include "sysemu/hostmem.h" +#include "hw/i386/hostmem-epc.h" + +static void +sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ + uint32_t ram_flags; + char *name; + int fd; + + if (!backend->size) { + error_setg(errp, "can't create backend with size 0"); + return; + } + + fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd < 0) { + error_setg_errno(errp, errno, + "failed to open /dev/sgx_vepc to alloc SGX EPC"); + return; + } + + name = object_get_canonical_path(OBJECT(backend)); + ram_flags = (backend->share ? RAM_SHARED : 0) | RAM_PROTECTED; + memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), + name, backend->size, ram_flags, + fd, 0, errp); + g_free(name); +} + +static void sgx_epc_backend_instance_init(Object *obj) +{ + HostMemoryBackend *m = MEMORY_BACKEND(obj); + + m->share = true; + m->merge = false; + m->dump = false; +} + +static void sgx_epc_backend_class_init(ObjectClass *oc, void *data) +{ + HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc); + + bc->alloc = sgx_epc_backend_memory_alloc; +} + +static const TypeInfo sgx_epc_backed_info = { + .name = TYPE_MEMORY_BACKEND_EPC, + .parent = TYPE_MEMORY_BACKEND, + .instance_init = sgx_epc_backend_instance_init, + .class_init = sgx_epc_backend_class_init, + .instance_size = sizeof(HostMemoryBackendEpc), +}; + +static void register_types(void) +{ + int fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd >= 0) { + close(fd); + + type_register_static(&sgx_epc_backed_info); + } +} + +type_init(register_types); diff --git a/backends/meson.build b/backends/meson.build index d4221831fc..6e68945528 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -16,5 
+16,6 @@ softmmu_ss.add(when: ['CONFIG_VHOST_USER', 'CONFIG_VIRTIO'], if_true: files('vho softmmu_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-vhost.c')) softmmu_ss.add(when: ['CONFIG_VIRTIO_CRYPTO', 'CONFIG_VHOST_CRYPTO'], if_true: files('cryptodev-vhost-user.c')) softmmu_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus-vmstate.c'), gio]) +softmmu_ss.add(when: 'CONFIG_SGX', if_true: files('hostmem-epc.c')) subdir('tpm') diff --git a/block/blkdebug.c b/block/blkdebug.c index 8b67554bec..bbf2948703 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -631,8 +631,8 @@ static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } static int coroutine_fn -blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +blkdebug_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int err; @@ -652,8 +652,8 @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } static int coroutine_fn -blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +blkdebug_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int err; @@ -684,7 +684,7 @@ static int blkdebug_co_flush(BlockDriverState *bs) } static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { uint32_t align = MAX(bs->bl.request_alignment, @@ -717,7 +717,7 @@ static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs, } static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { uint32_t align = bs->bl.pdiscard_alignment; int err; diff --git a/block/blklogwrites.c b/block/blklogwrites.c index b7579370a3..f7a251e91f 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c 
@@ -301,8 +301,8 @@ static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp) } static int coroutine_fn -blk_log_writes_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +blk_log_writes_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } @@ -460,16 +460,16 @@ blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr) } static int coroutine_fn -blk_log_writes_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +blk_log_writes_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { return blk_log_writes_co_log(bs, offset, bytes, qiov, flags, blk_log_writes_co_do_file_pwritev, 0, false); } static int coroutine_fn -blk_log_writes_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, - BdrvRequestFlags flags) +blk_log_writes_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int64_t bytes, BdrvRequestFlags flags) { return blk_log_writes_co_log(bs, offset, bytes, NULL, flags, blk_log_writes_co_do_file_pwrite_zeroes, 0, @@ -484,9 +484,9 @@ static int coroutine_fn blk_log_writes_co_flush_to_disk(BlockDriverState *bs) } static int coroutine_fn -blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) +blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { - return blk_log_writes_co_log(bs, offset, count, NULL, 0, + return blk_log_writes_co_log(bs, offset, bytes, NULL, 0, blk_log_writes_co_do_file_pdiscard, LOG_DISCARD_FLAG, false); } diff --git a/block/blkreplay.c b/block/blkreplay.c index 4a247752fd..dcbe780ddb 100644 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -72,7 +72,7 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs, } static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs, - 
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); @@ -83,7 +83,7 @@ static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs, } static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); @@ -94,7 +94,7 @@ static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, } static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); @@ -105,7 +105,7 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, } static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pdiscard(bs->file, offset, bytes); diff --git a/block/blkverify.c b/block/blkverify.c index 188d7632fa..d1facf5ba9 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -221,8 +221,8 @@ blkverify_co_prwv(BlockDriverState *bs, BlkverifyRequest *r, uint64_t offset, } static int coroutine_fn -blkverify_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BlkverifyRequest r; QEMUIOVector raw_qiov; @@ -250,8 +250,8 @@ blkverify_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } static int coroutine_fn 
-blkverify_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +blkverify_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BlkverifyRequest r; return blkverify_co_prwv(bs, &r, offset, bytes, qiov, qiov, flags, true); diff --git a/block/bochs.c b/block/bochs.c index 2f010ab40a..4d68658087 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -238,8 +238,8 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) } static int coroutine_fn -bochs_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +bochs_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVBochsState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; diff --git a/block/cloop.c b/block/cloop.c index c99192a57f..b8c6d0eccd 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -245,8 +245,8 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num) } static int coroutine_fn -cloop_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +cloop_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVCloopState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; diff --git a/block/commit.c b/block/commit.c index 42792b4556..10cc5ff451 100644 --- a/block/commit.c +++ b/block/commit.c @@ -207,7 +207,7 @@ static const BlockJobDriver commit_job_driver = { }; static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags); } diff --git a/block/copy-before-write.c b/block/copy-before-write.c index 2a5e57deca..c30a5ff8de 100644 --- 
a/block/copy-before-write.c +++ b/block/copy-before-write.c @@ -40,8 +40,8 @@ typedef struct BDRVCopyBeforeWriteState { } BDRVCopyBeforeWriteState; static coroutine_fn int cbw_co_preadv( - BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } @@ -64,7 +64,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs, } static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { int ret = cbw_do_copy_before_write(bs, offset, bytes, 0); if (ret < 0) { @@ -75,7 +75,7 @@ static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs, } static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); if (ret < 0) { @@ -86,9 +86,10 @@ static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs, } static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, + int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { int ret = cbw_do_copy_before_write(bs, offset, bytes, flags); if (ret < 0) { diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c428682272..1fc7fb3333 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -128,10 +128,10 @@ static int64_t cor_getlength(BlockDriverState *bs) static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, - int flags) + BdrvRequestFlags flags) { int64_t n; int local_flags; @@ -181,10 +181,11 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState 
*bs, static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, + int64_t offset, + int64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset, int flags) + size_t qiov_offset, + BdrvRequestFlags flags) { return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, flags); @@ -192,7 +193,7 @@ static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); @@ -200,15 +201,15 @@ static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { return bdrv_co_pdiscard(bs->file, offset, bytes); } static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, + int64_t offset, + int64_t bytes, QEMUIOVector *qiov) { return bdrv_co_pwritev(bs->file, offset, bytes, qiov, diff --git a/block/crypto.c b/block/crypto.c index 1d30fde38e..c8ba4681e2 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -397,8 +397,8 @@ static int block_crypto_reopen_prepare(BDRVReopenState *state, #define BLOCK_CRYPTO_MAX_IO_SIZE (1024 * 1024) static coroutine_fn int -block_crypto_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BlockCrypto *crypto = bs->opaque; uint64_t cur_bytes; /* number of bytes in current iteration */ @@ -460,8 +460,8 @@ block_crypto_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, static coroutine_fn int -block_crypto_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) 
+block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BlockCrypto *crypto = bs->opaque; uint64_t cur_bytes; /* number of bytes in current iteration */ diff --git a/block/curl.c b/block/curl.c index 50e741a0d7..4a8ae2b269 100644 --- a/block/curl.c +++ b/block/curl.c @@ -896,7 +896,8 @@ out: } static int coroutine_fn curl_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { CURLAIOCB acb = { .co = qemu_coroutine_self(), diff --git a/block/dmg.c b/block/dmg.c index ef35a505f2..447901fbb8 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -689,8 +689,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) } static int coroutine_fn -dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +dmg_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVDMGState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; diff --git a/block/file-posix.c b/block/file-posix.c index d81e15efa4..c62e42743d 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2077,16 +2077,16 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); } -static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ); } -static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t 
bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { assert(flags == 0); return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE); @@ -2942,7 +2942,8 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) } static coroutine_fn int -raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev) +raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, + bool blkdev) { BDRVRawState *s = bs->opaque; RawPosixAIOData acb; @@ -2966,13 +2967,13 @@ raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev) } static coroutine_fn int -raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { return raw_do_pdiscard(bs, offset, bytes, false); } static int coroutine_fn -raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, +raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, BdrvRequestFlags flags, bool blkdev) { BDRVRawState *s = bs->opaque; @@ -3040,7 +3041,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, static int coroutine_fn raw_co_pwrite_zeroes( BlockDriverState *bs, int64_t offset, - int bytes, BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { return raw_do_pwrite_zeroes(bs, offset, bytes, flags, false); } @@ -3203,8 +3204,8 @@ static void raw_abort_perm_update(BlockDriverState *bs) } static int coroutine_fn raw_co_copy_range_from( - BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, + BlockDriverState *bs, BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, @@ -3213,10 +3214,10 @@ static int coroutine_fn raw_co_copy_range_from( static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, BdrvChild *src, - 
uint64_t src_offset, + int64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { @@ -3591,7 +3592,7 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) #endif /* linux */ static coroutine_fn int -hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { BDRVRawState *s = bs->opaque; int ret; @@ -3605,7 +3606,7 @@ hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) } static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { int rc; diff --git a/block/file-win32.c b/block/file-win32.c index b97c58d642..ec9d64d0e4 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -440,8 +440,8 @@ fail: } static BlockAIOCB *raw_aio_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -455,8 +455,8 @@ static BlockAIOCB *raw_aio_preadv(BlockDriverState *bs, } static BlockAIOCB *raw_aio_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; diff --git a/block/filter-compress.c b/block/filter-compress.c index 5136371bf8..d5be538619 100644 --- a/block/filter-compress.c +++ b/block/filter-compress.c @@ -63,10 +63,10 @@ static int64_t compress_getlength(BlockDriverState *bs) static int coroutine_fn compress_co_preadv_part(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, 
- int flags) + BdrvRequestFlags flags) { return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, flags); @@ -74,10 +74,11 @@ static int coroutine_fn compress_co_preadv_part(BlockDriverState *bs, static int coroutine_fn compress_co_pwritev_part(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, + int64_t offset, + int64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset, int flags) + size_t qiov_offset, + BdrvRequestFlags flags) { return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, flags | BDRV_REQ_WRITE_COMPRESSED); @@ -85,7 +86,7 @@ static int coroutine_fn compress_co_pwritev_part(BlockDriverState *bs, static int coroutine_fn compress_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); @@ -93,7 +94,7 @@ static int coroutine_fn compress_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn compress_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { return bdrv_co_pdiscard(bs->file, offset, bytes); } diff --git a/block/gluster.c b/block/gluster.c index d51938e447..398976bc66 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -891,6 +891,7 @@ out: static void qemu_gluster_refresh_limits(BlockDriverState *bs, Error **errp) { bs->bl.max_transfer = GLUSTER_MAX_TRANSFER; + bs->bl.max_pdiscard = SIZE_MAX; } static int qemu_gluster_reopen_prepare(BDRVReopenState *state, @@ -1003,19 +1004,19 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state) #ifdef CONFIG_GLUSTERFS_ZEROFILL static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int size, + int64_t bytes, BdrvRequestFlags flags) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - acb.size = size; + acb.size = bytes; acb.ret = 0; acb.coroutine = qemu_coroutine_self(); acb.aio_context = bdrv_get_aio_context(bs); - ret = 
glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb); + ret = glfs_zerofill_async(s->fd, offset, bytes, gluster_finish_aiocb, &acb); if (ret < 0) { return -errno; } @@ -1297,18 +1298,20 @@ error: #ifdef CONFIG_GLUSTERFS_DISCARD static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs, - int64_t offset, int size) + int64_t offset, int64_t bytes) { int ret; GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; + assert(bytes <= SIZE_MAX); /* rely on max_pdiscard */ + acb.size = 0; acb.ret = 0; acb.coroutine = qemu_coroutine_self(); acb.aio_context = bdrv_get_aio_context(bs); - ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb); + ret = glfs_discard_async(s->fd, offset, bytes, gluster_finish_aiocb, &acb); if (ret < 0) { return -errno; } diff --git a/block/io.c b/block/io.c index 99ee182ca4..18d345a87a 100644 --- a/block/io.c +++ b/block/io.c @@ -956,9 +956,9 @@ bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, return waited; } -static int bdrv_check_qiov_request(int64_t offset, int64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, - Error **errp) +int bdrv_check_qiov_request(int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + Error **errp) { /* * Check generic offset/bytes correctness @@ -1230,7 +1230,8 @@ out: static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset, int flags) + size_t qiov_offset, + BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; int64_t sector_num; @@ -1868,7 +1869,8 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int head = 0; int tail = 0; - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); + int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, + INT64_MAX); int alignment = MAX(bs->bl.pwrite_zeroes_alignment, bs->bl.request_alignment); int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, 
MAX_BOUNCE_BUFFER); @@ -2073,7 +2075,8 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes, */ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, BdrvTrackedRequest *req, int64_t offset, int64_t bytes, - int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags) + int64_t align, QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags) { BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; @@ -2246,7 +2249,11 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, return -ENOMEDIUM; } - ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); + if (flags & BDRV_REQ_ZERO_WRITE) { + ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL); + } else { + ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset); + } if (ret < 0) { return ret; } @@ -2810,7 +2817,12 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BlockDriver *drv = bs->drv; BlockDriverState *child_bs = bdrv_primary_bs(bs); - int ret = -ENOTSUP; + int ret; + + ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); + if (ret < 0) { + return ret; + } if (!drv) { return -ENOMEDIUM; @@ -2822,6 +2834,8 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) ret = drv->bdrv_load_vmstate(bs, qiov, pos); } else if (child_bs) { ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); + } else { + ret = -ENOTSUP; } bdrv_dec_in_flight(bs); @@ -2834,7 +2848,12 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BlockDriver *drv = bs->drv; BlockDriverState *child_bs = bdrv_primary_bs(bs); - int ret = -ENOTSUP; + int ret; + + ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); + if (ret < 0) { + return ret; + } if (!drv) { return -ENOMEDIUM; @@ -2846,6 +2865,8 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) ret = drv->bdrv_save_vmstate(bs, qiov, pos); } else if (child_bs) { ret = 
bdrv_co_writev_vmstate(child_bs, qiov, pos); + } else { + ret = -ENOTSUP; } bdrv_dec_in_flight(bs); @@ -3035,7 +3056,8 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) { BdrvTrackedRequest req; - int max_pdiscard, ret; + int ret; + int64_t max_pdiscard; int head, tail, align; BlockDriverState *bs = child->bs; @@ -3082,7 +3104,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, goto out; } - max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX), + max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX), align); assert(max_pdiscard >= bs->bl.request_alignment); diff --git a/block/iscsi.c b/block/iscsi.c index 852384086b..57aa07a40d 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -427,14 +427,14 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; } -static bool is_byte_request_lun_aligned(int64_t offset, int count, +static bool is_byte_request_lun_aligned(int64_t offset, int64_t bytes, IscsiLun *iscsilun) { - if (offset % iscsilun->block_size || count % iscsilun->block_size) { + if (offset % iscsilun->block_size || bytes % iscsilun->block_size) { error_report("iSCSI misaligned request: " "iscsilun->block_size %u, offset %" PRIi64 - ", count %d", - iscsilun->block_size, offset, count); + ", bytes %" PRIi64, + iscsilun->block_size, offset, bytes); return false; } return true; @@ -1138,7 +1138,8 @@ iscsi_getlength(BlockDriverState *bs) } static int -coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, + int64_t bytes) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; @@ -1154,6 +1155,12 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) return 0; } + /* + * We don't want to overflow list.num which is uint32_t. + * We rely on our max_pdiscard. 
+ */ + assert(bytes / iscsilun->block_size <= UINT32_MAX); + list.lba = offset / iscsilun->block_size; list.num = bytes / iscsilun->block_size; @@ -1202,12 +1209,12 @@ out_unlock: static int coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int bytes, BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; uint64_t lba; - uint32_t nb_blocks; + uint64_t nb_blocks; bool use_16_for_ws = iscsilun->use_16_for_rw; int r = 0; @@ -1247,11 +1254,21 @@ coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (use_16_for_ws) { + /* + * iscsi_writesame16_task num_blocks argument is uint32_t. We rely here + * on our max_pwrite_zeroes limit. + */ + assert(nb_blocks <= UINT32_MAX); iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, iscsilun->zeroblock, iscsilun->block_size, nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), 0, 0, iscsi_co_generic_cb, &iTask); } else { + /* + * iscsi_writesame10_task num_blocks argument is uint16_t. We rely here + * on our max_pwrite_zeroes limit. 
+ */ + assert(nb_blocks <= UINT16_MAX); iTask.task = iscsi_writesame10_task(iscsilun->iscsi, iscsilun->lun, lba, iscsilun->zeroblock, iscsilun->block_size, nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), @@ -2061,20 +2078,19 @@ static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) } if (iscsilun->lbp.lbpu) { - if (iscsilun->bl.max_unmap < 0xffffffff / block_size) { - bs->bl.max_pdiscard = - iscsilun->bl.max_unmap * iscsilun->block_size; - } + bs->bl.max_pdiscard = + MIN_NON_ZERO(iscsilun->bl.max_unmap * iscsilun->block_size, + (uint64_t)UINT32_MAX * iscsilun->block_size); bs->bl.pdiscard_alignment = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; } else { bs->bl.pdiscard_alignment = iscsilun->block_size; } - if (iscsilun->bl.max_ws_len < 0xffffffff / block_size) { - bs->bl.max_pwrite_zeroes = - iscsilun->bl.max_ws_len * iscsilun->block_size; - } + bs->bl.max_pwrite_zeroes = + MIN_NON_ZERO(iscsilun->bl.max_ws_len * iscsilun->block_size, + max_xfer_len * iscsilun->block_size); + if (iscsilun->lbp.lbpws) { bs->bl.pwrite_zeroes_alignment = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; @@ -2169,10 +2185,10 @@ static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs, static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs, BdrvChild *src, - uint64_t src_offset, + int64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { @@ -2310,10 +2326,10 @@ static void iscsi_xcopy_data(struct iscsi_data *data, static int coroutine_fn iscsi_co_copy_range_to(BlockDriverState *bs, BdrvChild *src, - uint64_t src_offset, + int64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { diff --git a/block/mirror.c b/block/mirror.c index 85b781bc21..c962e8b471 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ 
-1402,7 +1402,7 @@ static void coroutine_fn active_write_settle(MirrorOp *op) } static int coroutine_fn bdrv_mirror_top_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags); } @@ -1456,7 +1456,7 @@ out: } static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { MirrorBDSOpaque *s = bs->opaque; QEMUIOVector bounce_qiov; @@ -1501,14 +1501,14 @@ static int coroutine_fn bdrv_mirror_top_flush(BlockDriverState *bs) } static int coroutine_fn bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL, flags); } static int coroutine_fn bdrv_mirror_top_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes, NULL, 0); diff --git a/block/nbd.c b/block/nbd.c index f6ff1c4fb4..5ef462db1b 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -57,7 +57,8 @@ typedef struct { Coroutine *coroutine; uint64_t offset; /* original offset of the request */ - bool receiving; /* waiting for connection_co? 
*/ + bool receiving; /* sleeping in the yield in nbd_receive_replies */ + bool reply_possible; /* reply header not yet received */ } NBDClientRequest; typedef enum NBDClientState { @@ -73,14 +74,10 @@ typedef struct BDRVNBDState { CoMutex send_mutex; CoQueue free_sema; - Coroutine *connection_co; - Coroutine *teardown_co; - QemuCoSleep reconnect_sleep; - bool drained; - bool wait_drained_end; + + CoMutex receive_mutex; int in_flight; NBDClientState state; - bool wait_in_flight; QEMUTimer *reconnect_delay_timer; @@ -127,33 +124,44 @@ static bool nbd_client_connected(BDRVNBDState *s) return qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTED; } -static void nbd_channel_error(BDRVNBDState *s, int ret) +static bool nbd_recv_coroutine_wake_one(NBDClientRequest *req) { - if (ret == -EIO) { - if (nbd_client_connected(s)) { - s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT : - NBD_CLIENT_CONNECTING_NOWAIT; - } - } else { - if (nbd_client_connected(s)) { - qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); - } - s->state = NBD_CLIENT_QUIT; + if (req->receiving) { + req->receiving = false; + aio_co_wake(req->coroutine); + return true; } + + return false; } -static void nbd_recv_coroutines_wake_all(BDRVNBDState *s) +static void nbd_recv_coroutines_wake(BDRVNBDState *s, bool all) { int i; for (i = 0; i < MAX_NBD_REQUESTS; i++) { - NBDClientRequest *req = &s->requests[i]; + if (nbd_recv_coroutine_wake_one(&s->requests[i]) && !all) { + return; + } + } +} + +static void nbd_channel_error(BDRVNBDState *s, int ret) +{ + if (nbd_client_connected(s)) { + qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } - if (req->coroutine && req->receiving) { - req->receiving = false; - aio_co_wake(req->coroutine); + if (ret == -EIO) { + if (nbd_client_connected(s)) { + s->state = s->reconnect_delay ? 
NBD_CLIENT_CONNECTING_WAIT : + NBD_CLIENT_CONNECTING_NOWAIT; } + } else { + s->state = NBD_CLIENT_QUIT; } + + nbd_recv_coroutines_wake(s, true); } static void reconnect_delay_timer_del(BDRVNBDState *s) @@ -170,6 +178,7 @@ static void reconnect_delay_timer_cb(void *opaque) if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) { s->state = NBD_CLIENT_CONNECTING_NOWAIT; + nbd_co_establish_connection_cancel(s->conn); while (qemu_co_enter_next(&s->free_sema, NULL)) { /* Resume all queued requests */ } @@ -192,113 +201,21 @@ static void reconnect_delay_timer_init(BDRVNBDState *s, uint64_t expire_time_ns) timer_mod(s->reconnect_delay_timer, expire_time_ns); } -static void nbd_client_detach_aio_context(BlockDriverState *bs) -{ - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - /* Timer is deleted in nbd_client_co_drain_begin() */ - assert(!s->reconnect_delay_timer); - /* - * If reconnect is in progress we may have no ->ioc. It will be - * re-instantiated in the proper aio context once the connection is - * reestablished. - */ - if (s->ioc) { - qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc)); - } -} - -static void nbd_client_attach_aio_context_bh(void *opaque) -{ - BlockDriverState *bs = opaque; - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - if (s->connection_co) { - /* - * The node is still drained, so we know the coroutine has yielded in - * nbd_read_eof(), the only place where bs->in_flight can reach 0, or - * it is entered for the first time. Both places are safe for entering - * the coroutine. 
- */ - qemu_aio_coroutine_enter(bs->aio_context, s->connection_co); - } - bdrv_dec_in_flight(bs); -} - -static void nbd_client_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - /* - * s->connection_co is either yielded from nbd_receive_reply or from - * nbd_co_reconnect_loop() - */ - if (nbd_client_connected(s)) { - qio_channel_attach_aio_context(QIO_CHANNEL(s->ioc), new_context); - } - - bdrv_inc_in_flight(bs); - - /* - * Need to wait here for the BH to run because the BH must run while the - * node is still drained. - */ - aio_wait_bh_oneshot(new_context, nbd_client_attach_aio_context_bh, bs); -} - -static void coroutine_fn nbd_client_co_drain_begin(BlockDriverState *bs) -{ - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - s->drained = true; - qemu_co_sleep_wake(&s->reconnect_sleep); - - nbd_co_establish_connection_cancel(s->conn); - - reconnect_delay_timer_del(s); - - if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) { - s->state = NBD_CLIENT_CONNECTING_NOWAIT; - qemu_co_queue_restart_all(&s->free_sema); - } -} - -static void coroutine_fn nbd_client_co_drain_end(BlockDriverState *bs) -{ - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - s->drained = false; - if (s->wait_drained_end) { - s->wait_drained_end = false; - aio_co_wake(s->connection_co); - } -} - - static void nbd_teardown_connection(BlockDriverState *bs) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; + assert(!s->in_flight); + if (s->ioc) { - /* finish any pending coroutines */ qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name), + nbd_yank, s->bs); + object_unref(OBJECT(s->ioc)); + s->ioc = NULL; } s->state = NBD_CLIENT_QUIT; - if (s->connection_co) { - qemu_co_sleep_wake(&s->reconnect_sleep); - nbd_co_establish_connection_cancel(s->conn); - } - if (qemu_in_coroutine()) { - s->teardown_co = qemu_coroutine_self(); - /* 
connection_co resumes us when it terminates */ - qemu_coroutine_yield(); - s->teardown_co = NULL; - } else { - BDRV_POLL_WHILE(bs, s->connection_co); - } - assert(!s->connection_co); } static bool nbd_client_connecting(BDRVNBDState *s) @@ -363,10 +280,11 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs, { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; int ret; + bool blocking = nbd_client_connecting_wait(s); assert(!s->ioc); - s->ioc = nbd_co_establish_connection(s->conn, &s->info, true, errp); + s->ioc = nbd_co_establish_connection(s->conn, &s->info, blocking, errp); if (!s->ioc) { return -ECONNREFUSED; } @@ -402,29 +320,22 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs, return 0; } +/* called under s->send_mutex */ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) { - if (!nbd_client_connecting(s)) { - return; - } - - /* Wait for completion of all in-flight requests */ - - qemu_co_mutex_lock(&s->send_mutex); - - while (s->in_flight > 0) { - qemu_co_mutex_unlock(&s->send_mutex); - nbd_recv_coroutines_wake_all(s); - s->wait_in_flight = true; - qemu_coroutine_yield(); - s->wait_in_flight = false; - qemu_co_mutex_lock(&s->send_mutex); - } + assert(nbd_client_connecting(s)); + assert(s->in_flight == 0); - qemu_co_mutex_unlock(&s->send_mutex); - - if (!nbd_client_connecting(s)) { - return; + if (nbd_client_connecting_wait(s) && s->reconnect_delay && + !s->reconnect_delay_timer) + { + /* + * It's first reconnect attempt after switching to + * NBD_CLIENT_CONNECTING_WAIT + */ + reconnect_delay_timer_init(s, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + + s->reconnect_delay * NANOSECONDS_PER_SECOND); } /* @@ -444,135 +355,73 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) nbd_co_do_establish_connection(s->bs, NULL); } -static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s) +static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle) { - uint64_t timeout = 1 * 
NANOSECONDS_PER_SECOND; - uint64_t max_timeout = 16 * NANOSECONDS_PER_SECOND; + int ret; + uint64_t ind = HANDLE_TO_INDEX(s, handle), ind2; + QEMU_LOCK_GUARD(&s->receive_mutex); - if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) { - reconnect_delay_timer_init(s, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + - s->reconnect_delay * NANOSECONDS_PER_SECOND); - } - - nbd_reconnect_attempt(s); - - while (nbd_client_connecting(s)) { - if (s->drained) { - bdrv_dec_in_flight(s->bs); - s->wait_drained_end = true; - while (s->drained) { - /* - * We may be entered once from nbd_client_attach_aio_context_bh - * and then from nbd_client_co_drain_end. So here is a loop. - */ - qemu_coroutine_yield(); - } - bdrv_inc_in_flight(s->bs); - } else { - qemu_co_sleep_ns_wakeable(&s->reconnect_sleep, - QEMU_CLOCK_REALTIME, timeout); - if (s->drained) { - continue; - } - if (timeout < max_timeout) { - timeout *= 2; - } + while (true) { + if (s->reply.handle == handle) { + /* We are done */ + return 0; } - nbd_reconnect_attempt(s); - } - - reconnect_delay_timer_del(s); -} + if (!nbd_client_connected(s)) { + return -EIO; + } -static coroutine_fn void nbd_connection_entry(void *opaque) -{ - BDRVNBDState *s = opaque; - uint64_t i; - int ret = 0; - Error *local_err = NULL; + if (s->reply.handle != 0) { + /* + * Some other request is being handled now. It should already be + * woken by whoever set s->reply.handle (or never wait in this + * yield). So, we should not wake it here. + */ + ind2 = HANDLE_TO_INDEX(s, s->reply.handle); + assert(!s->requests[ind2].receiving); - while (qatomic_load_acquire(&s->state) != NBD_CLIENT_QUIT) { - /* - * The NBD client can only really be considered idle when it has - * yielded from qio_channel_readv_all_eof(), waiting for data. This is - * the point where the additional scheduled coroutine entry happens - * after nbd_client_attach_aio_context(). 
- * - * Therefore we keep an additional in_flight reference all the time and - * only drop it temporarily here. - */ + s->requests[ind].receiving = true; + qemu_co_mutex_unlock(&s->receive_mutex); - if (nbd_client_connecting(s)) { - nbd_co_reconnect_loop(s); - } + qemu_coroutine_yield(); + /* + * We may be woken for 3 reasons: + * 1. From this function, executing in parallel coroutine, when our + * handle is received. + * 2. From nbd_channel_error(), when connection is lost. + * 3. From nbd_co_receive_one_chunk(), when previous request is + * finished and s->reply.handle set to 0. + * Anyway, it's OK to lock the mutex and go to the next iteration. + */ - if (!nbd_client_connected(s)) { + qemu_co_mutex_lock(&s->receive_mutex); + assert(!s->requests[ind].receiving); continue; } + /* We are under mutex and handle is 0. We have to do the dirty work. */ assert(s->reply.handle == 0); - ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, &local_err); - - if (local_err) { - trace_nbd_read_reply_entry_fail(ret, error_get_pretty(local_err)); - error_free(local_err); - local_err = NULL; - } + ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, NULL); if (ret <= 0) { - nbd_channel_error(s, ret ? ret : -EIO); - continue; + ret = ret ? ret : -EIO; + nbd_channel_error(s, ret); + return ret; } - - /* - * There's no need for a mutex on the receive side, because the - * handler acts as a synchronization point and ensures that only - * one coroutine is called until the reply finishes. - */ - i = HANDLE_TO_INDEX(s, s->reply.handle); - if (i >= MAX_NBD_REQUESTS || - !s->requests[i].coroutine || - !s->requests[i].receiving || - (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply)) - { + if (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply) { nbd_channel_error(s, -EINVAL); - continue; + return -EINVAL; } - - /* - * We're woken up again by the request itself. Note that there - * is no race between yielding and reentering connection_co. 
This - * is because: - * - * - if the request runs on the same AioContext, it is only - * entered after we yield - * - * - if the request runs on a different AioContext, reentering - * connection_co happens through a bottom half, which can only - * run after we yield. - */ - s->requests[i].receiving = false; - aio_co_wake(s->requests[i].coroutine); - qemu_coroutine_yield(); - } - - qemu_co_queue_restart_all(&s->free_sema); - nbd_recv_coroutines_wake_all(s); - bdrv_dec_in_flight(s->bs); - - s->connection_co = NULL; - if (s->ioc) { - qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc)); - yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name), - nbd_yank, s->bs); - object_unref(OBJECT(s->ioc)); - s->ioc = NULL; - } - - if (s->teardown_co) { - aio_co_wake(s->teardown_co); + if (s->reply.handle == handle) { + /* We are done */ + return 0; + } + ind2 = HANDLE_TO_INDEX(s, s->reply.handle); + if (ind2 >= MAX_NBD_REQUESTS || !s->requests[ind2].reply_possible) { + nbd_channel_error(s, -EINVAL); + return -EINVAL; + } + nbd_recv_coroutine_wake_one(&s->requests[ind2]); } - aio_wait_kick(); } static int nbd_co_send_request(BlockDriverState *bs, @@ -583,10 +432,17 @@ static int nbd_co_send_request(BlockDriverState *bs, int rc, i = -1; qemu_co_mutex_lock(&s->send_mutex); - while (s->in_flight == MAX_NBD_REQUESTS || nbd_client_connecting_wait(s)) { + + while (s->in_flight == MAX_NBD_REQUESTS || + (!nbd_client_connected(s) && s->in_flight > 0)) + { qemu_co_queue_wait(&s->free_sema, &s->send_mutex); } + if (nbd_client_connecting(s)) { + nbd_reconnect_attempt(s); + } + if (!nbd_client_connected(s)) { rc = -EIO; goto err; @@ -606,6 +462,7 @@ static int nbd_co_send_request(BlockDriverState *bs, s->requests[i].coroutine = qemu_coroutine_self(); s->requests[i].offset = request->from; s->requests[i].receiving = false; + s->requests[i].reply_possible = true; request->handle = INDEX_TO_HANDLE(s, i); @@ -633,10 +490,6 @@ err: if (i != -1) { s->requests[i].coroutine = NULL; 
s->in_flight--; - } - if (s->in_flight == 0 && s->wait_in_flight) { - aio_co_wake(s->connection_co); - } else { qemu_co_queue_next(&s->free_sema); } } @@ -935,10 +788,7 @@ static coroutine_fn int nbd_co_do_receive_one_chunk( } *request_ret = 0; - /* Wait until we're woken up by nbd_connection_entry. */ - s->requests[i].receiving = true; - qemu_coroutine_yield(); - assert(!s->requests[i].receiving); + nbd_receive_replies(s, handle); if (!nbd_client_connected(s)) { error_setg(errp, "Connection closed"); return -EIO; @@ -1031,14 +881,7 @@ static coroutine_fn int nbd_co_receive_one_chunk( } s->reply.handle = 0; - if (s->connection_co && !s->wait_in_flight) { - /* - * We must check s->wait_in_flight, because we may entered by - * nbd_recv_coroutines_wake_all(), in this case we should not - * wake connection_co here, it will woken by last request. - */ - aio_co_wake(s->connection_co); - } + nbd_recv_coroutines_wake(s, false); return ret; } @@ -1149,11 +992,7 @@ break_loop: qemu_co_mutex_lock(&s->send_mutex); s->in_flight--; - if (s->in_flight == 0 && s->wait_in_flight) { - aio_co_wake(s->connection_co); - } else { - qemu_co_queue_next(&s->free_sema); - } + qemu_co_queue_next(&s->free_sema); qemu_co_mutex_unlock(&s->send_mutex); return false; @@ -1322,8 +1161,9 @@ static int nbd_co_request(BlockDriverState *bs, NBDRequest *request, return ret ? ret : request_ret; } -static int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, int flags) +static int nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { int ret, request_ret; Error *local_err = NULL; @@ -1380,8 +1220,9 @@ static int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, return ret ? 
ret : request_ret; } -static int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, int flags) +static int nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; NBDRequest request = { @@ -1405,15 +1246,17 @@ static int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, } static int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int bytes, BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; NBDRequest request = { .type = NBD_CMD_WRITE_ZEROES, .from = offset, - .len = bytes, + .len = bytes, /* .len is uint32_t actually */ }; + assert(bytes <= UINT32_MAX); /* rely on max_pwrite_zeroes */ + assert(!(s->info.flags & NBD_FLAG_READ_ONLY)); if (!(s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) { return -ENOTSUP; @@ -1453,15 +1296,17 @@ static int nbd_client_co_flush(BlockDriverState *bs) } static int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, - int bytes) + int64_t bytes) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; NBDRequest request = { .type = NBD_CMD_TRIM, .from = offset, - .len = bytes, + .len = bytes, /* len is uint32_t */ }; + assert(bytes <= UINT32_MAX); /* rely on max_pdiscard */ + assert(!(s->info.flags & NBD_FLAG_READ_ONLY)); if (!(s->info.flags & NBD_FLAG_SEND_TRIM) || !bytes) { return 0; @@ -1969,6 +1814,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, s->bs = bs; qemu_co_mutex_init(&s->send_mutex); qemu_co_queue_init(&s->free_sema); + qemu_co_mutex_init(&s->receive_mutex); if (!yank_register_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name), errp)) { return -EEXIST; @@ -1983,14 +1829,13 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, s->x_dirty_bitmap, s->tlscreds); /* TODO: Configurable retry-until-timeout behaviour. 
*/ + s->state = NBD_CLIENT_CONNECTING_WAIT; ret = nbd_do_establish_connection(bs, errp); if (ret < 0) { goto fail; } - s->connection_co = qemu_coroutine_create(nbd_connection_entry, s); - bdrv_inc_in_flight(bs); - aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co); + nbd_client_connection_enable_retry(s->conn); return 0; @@ -2144,6 +1989,8 @@ static void nbd_cancel_in_flight(BlockDriverState *bs) s->state = NBD_CLIENT_CONNECTING_NOWAIT; qemu_co_queue_restart_all(&s->free_sema); } + + nbd_co_establish_connection_cancel(s->conn); } static BlockDriver bdrv_nbd = { @@ -2164,10 +2011,6 @@ static BlockDriver bdrv_nbd = { .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_co_truncate = nbd_co_truncate, .bdrv_getlength = nbd_getlength, - .bdrv_detach_aio_context = nbd_client_detach_aio_context, - .bdrv_attach_aio_context = nbd_client_attach_aio_context, - .bdrv_co_drain_begin = nbd_client_co_drain_begin, - .bdrv_co_drain_end = nbd_client_co_drain_end, .bdrv_refresh_filename = nbd_refresh_filename, .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, @@ -2193,10 +2036,6 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_co_truncate = nbd_co_truncate, .bdrv_getlength = nbd_getlength, - .bdrv_detach_aio_context = nbd_client_detach_aio_context, - .bdrv_attach_aio_context = nbd_client_attach_aio_context, - .bdrv_co_drain_begin = nbd_client_co_drain_begin, - .bdrv_co_drain_end = nbd_client_co_drain_end, .bdrv_refresh_filename = nbd_refresh_filename, .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, @@ -2222,10 +2061,6 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_refresh_limits = nbd_refresh_limits, .bdrv_co_truncate = nbd_co_truncate, .bdrv_getlength = nbd_getlength, - .bdrv_detach_aio_context = nbd_client_detach_aio_context, - .bdrv_attach_aio_context = nbd_client_attach_aio_context, - .bdrv_co_drain_begin = nbd_client_co_drain_begin, - .bdrv_co_drain_end = 
nbd_client_co_drain_end, .bdrv_refresh_filename = nbd_refresh_filename, .bdrv_co_block_status = nbd_client_co_block_status, .bdrv_dirname = nbd_dirname, diff --git a/block/nfs.c b/block/nfs.c index 9aeaefb364..577aea1d22 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -262,9 +262,9 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, nfs_co_generic_bh_cb, task); } -static int coroutine_fn nfs_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *iov, - int flags) +static int coroutine_fn nfs_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *iov, + BdrvRequestFlags flags) { NFSClient *client = bs->opaque; NFSRPC task; @@ -296,9 +296,9 @@ static int coroutine_fn nfs_co_preadv(BlockDriverState *bs, uint64_t offset, return 0; } -static int coroutine_fn nfs_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *iov, - int flags) +static int coroutine_fn nfs_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *iov, + BdrvRequestFlags flags) { NFSClient *client = bs->opaque; NFSRPC task; diff --git a/block/null.c b/block/null.c index cc9b1d4ea7..75f7d0db40 100644 --- a/block/null.c +++ b/block/null.c @@ -116,8 +116,9 @@ static coroutine_fn int null_co_common(BlockDriverState *bs) } static coroutine_fn int null_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVNullState *s = bs->opaque; @@ -129,8 +130,9 @@ static coroutine_fn int null_co_preadv(BlockDriverState *bs, } static coroutine_fn int null_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { return null_co_common(bs); } @@ -187,8 +189,8 @@ static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, } static BlockAIOCB 
*null_aio_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { @@ -202,8 +204,8 @@ static BlockAIOCB *null_aio_preadv(BlockDriverState *bs, } static BlockAIOCB *null_aio_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { diff --git a/block/nvme.c b/block/nvme.c index abfe305baf..1cc7b62bb4 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -1251,15 +1251,17 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } static coroutine_fn int nvme_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { return nvme_co_prw(bs, offset, bytes, qiov, false, flags); } static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { return nvme_co_prw(bs, offset, bytes, qiov, true, flags); } @@ -1294,19 +1296,29 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs) static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int bytes, + int64_t bytes, BdrvRequestFlags flags) { BDRVNVMeState *s = bs->opaque; NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; - - uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; + uint32_t cdw12; if (!s->supports_write_zeroes) { return -ENOTSUP; } + if (bytes == 0) { + return 0; + } + + cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; + /* + * We should not lose information. pwrite_zeroes_alignment and + * max_pwrite_zeroes guarantees it. 
+ */ + assert(((cdw12 + 1) << s->blkshift) == bytes); + NvmeCmd cmd = { .opcode = NVME_CMD_WRITE_ZEROES, .nsid = cpu_to_le32(s->nsid), @@ -1348,7 +1360,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, int64_t offset, - int bytes) + int64_t bytes) { BDRVNVMeState *s = bs->opaque; NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; @@ -1375,6 +1387,14 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, assert(s->queue_count > 1); + /* + * Filling the @buf requires @offset and @bytes to satisfy restrictions + * defined in nvme_refresh_limits(). + */ + assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift)); + assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift)); + assert((bytes >> s->blkshift) <= UINT32_MAX); + buf = qemu_try_memalign(s->page_size, s->page_size); if (!buf) { return -ENOMEM; @@ -1470,6 +1490,18 @@ static void nvme_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.opt_mem_alignment = s->page_size; bs->bl.request_alignment = s->page_size; bs->bl.max_transfer = s->max_transfer; + + /* + * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get + * at most 0xFFFF + */ + bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16); + bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment, + 1UL << s->blkshift); + + bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift; + bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment, + 1UL << s->blkshift); } static void nvme_detach_aio_context(BlockDriverState *bs) diff --git a/block/preallocate.c b/block/preallocate.c index b619206304..1d4233f730 100644 --- a/block/preallocate.c +++ b/block/preallocate.c @@ -227,15 +227,15 @@ static void preallocate_reopen_abort(BDRVReopenState *state) } static coroutine_fn int preallocate_co_preadv_part( - BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, int flags) + BlockDriverState *bs, int64_t offset, int64_t 
bytes, + QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags) { return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, flags); } static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { return bdrv_co_pdiscard(bs->file, offset, bytes); } @@ -337,7 +337,7 @@ static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset, } static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { bool want_merge_zero = !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); @@ -349,11 +349,11 @@ static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, } static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, + int64_t offset, + int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, - int flags) + BdrvRequestFlags flags) { handle_write(bs, offset, bytes, false); diff --git a/block/qcow.c b/block/qcow.c index f8919a44d1..c39940f33e 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -617,9 +617,9 @@ static void qcow_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.request_alignment = BDRV_SECTOR_SIZE; } -static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVQcowState *s = bs->opaque; int offset_in_cluster; @@ -714,9 +714,9 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, uint64_t offset, return ret; } -static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, 
QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVQcowState *s = bs->opaque; int offset_in_cluster; @@ -1047,8 +1047,8 @@ static int qcow_make_empty(BlockDriverState *bs) /* XXX: put compressed sectors first, then all the cluster aligned tables to avoid losing bytes in alignment */ static coroutine_fn int -qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov) +qcow_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov) { BDRVQcowState *s = bs->opaque; z_stream strm; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4ebb49a087..5727f92dcb 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -505,7 +505,19 @@ static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, return -ENOMEDIUM; } - /* Call .bdrv_co_readv() directly instead of using the public block-layer + /* + * We never deal with requests that don't satisfy + * bdrv_check_qiov_request(), and aligning requests to clusters never + * breaks this condition. So, do some assertions before calling + * bs->drv->bdrv_co_preadv_part() which has int64_t arguments. + */ + assert(src_cluster_offset <= INT64_MAX); + assert(src_cluster_offset + offset_in_cluster <= INT64_MAX); + assert(qiov->size <= INT64_MAX); + bdrv_check_qiov_request(src_cluster_offset + offset_in_cluster, qiov->size, + qiov, 0, &error_abort); + /* + * Call .bdrv_co_readv() directly instead of using the public block-layer * interface. This avoids double I/O throttling and request tracking, * which can lead to deadlock when block layer copy-on-read is enabled. 
*/ diff --git a/block/qcow2.c b/block/qcow2.c index 02f9f3e636..d509016756 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2310,9 +2310,10 @@ static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task) } static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset, int flags) + size_t qiov_offset, + BdrvRequestFlags flags) { BDRVQcow2State *s = bs->opaque; int ret = 0; @@ -2596,8 +2597,8 @@ static coroutine_fn int qcow2_co_pwritev_task_entry(AioTask *task) } static coroutine_fn int qcow2_co_pwritev_part( - BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, int flags) + BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags) { BDRVQcow2State *s = bs->opaque; int offset_in_cluster; @@ -3940,7 +3941,7 @@ static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes) } static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags) + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { int ret; BDRVQcow2State *s = bs->opaque; @@ -3995,7 +3996,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, } static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { int ret; BDRVQcow2State *s = bs->opaque; @@ -4025,9 +4026,9 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, static int coroutine_fn qcow2_co_copy_range_from(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags read_flags, + BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { BDRVQcow2State *s = bs->opaque; @@ -4108,9 +4109,9 @@ out: 
static int coroutine_fn qcow2_co_copy_range_to(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags read_flags, + BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { BDRVQcow2State *s = bs->opaque; @@ -4630,7 +4631,7 @@ static coroutine_fn int qcow2_co_pwritev_compressed_task_entry(AioTask *task) */ static coroutine_fn int qcow2_co_pwritev_compressed_part(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset) { BDRVQcow2State *s = bs->opaque; @@ -5227,24 +5228,55 @@ static int qcow2_has_zero_init(BlockDriverState *bs) } } +/* + * Check the request to vmstate. On success return + * qcow2_vm_state_offset(bs) + @pos + */ +static int64_t qcow2_check_vmstate_request(BlockDriverState *bs, + QEMUIOVector *qiov, int64_t pos) +{ + BDRVQcow2State *s = bs->opaque; + int64_t vmstate_offset = qcow2_vm_state_offset(s); + int ret; + + /* Incoming requests must be OK */ + bdrv_check_qiov_request(pos, qiov->size, qiov, 0, &error_abort); + + if (INT64_MAX - pos < vmstate_offset) { + return -EIO; + } + + pos += vmstate_offset; + ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL); + if (ret < 0) { + return ret; + } + + return pos; +} + static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - BDRVQcow2State *s = bs->opaque; + int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos); + if (offset < 0) { + return offset; + } BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); - return bs->drv->bdrv_co_pwritev_part(bs, qcow2_vm_state_offset(s) + pos, - qiov->size, qiov, 0, 0); + return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0); } static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - BDRVQcow2State *s = bs->opaque; + int64_t offset = 
qcow2_check_vmstate_request(bs, qiov, pos); + if (offset < 0) { + return offset; + } BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); - return bs->drv->bdrv_co_preadv_part(bs, qcow2_vm_state_offset(s) + pos, - qiov->size, qiov, 0, 0); + return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0); } /* diff --git a/block/qed.c b/block/qed.c index f45c640513..558d3646c4 100644 --- a/block/qed.c +++ b/block/qed.c @@ -582,6 +582,7 @@ static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) BDRVQEDState *s = bs->opaque; bs->bl.pwrite_zeroes_alignment = s->header.cluster_size; + bs->bl.max_pwrite_zeroes = QEMU_ALIGN_DOWN(INT_MAX, s->header.cluster_size); } /* We have nothing to do for QED reopen, stubs just return @@ -1397,7 +1398,7 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs, static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int bytes, + int64_t bytes, BdrvRequestFlags flags) { BDRVQEDState *s = bs->opaque; @@ -1408,6 +1409,12 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, */ QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes); + /* + * QED is not prepared for 63bit write-zero requests, so rely on + * max_pwrite_zeroes. 
+ */ + assert(bytes <= INT_MAX); + /* Fall back if the request is not aligned */ if (qed_offset_into_cluster(s, offset) || qed_offset_into_cluster(s, bytes)) { diff --git a/block/quorum.c b/block/quorum.c index f2c0805000..c28dda7baa 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -663,8 +663,8 @@ static int read_fifo_child(QuorumAIOCB *acb) return ret; } -static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, int flags) +static int quorum_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVQuorumState *s = bs->opaque; QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags); @@ -714,8 +714,9 @@ static void write_quorum_entry(void *opaque) } } -static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, int flags) +static int quorum_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVQuorumState *s = bs->opaque; QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags); @@ -745,7 +746,7 @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, } static int quorum_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int bytes, BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { return quorum_co_pwritev(bs, offset, bytes, NULL, diff --git a/block/raw-format.c b/block/raw-format.c index c26f493688..bda757fd19 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -181,8 +181,8 @@ static void raw_reopen_abort(BDRVReopenState *state) } /* Check and adjust the offset, against 'offset' and 'size' options. 
*/ -static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset, - uint64_t bytes, bool is_write) +static inline int raw_adjust_offset(BlockDriverState *bs, int64_t *offset, + int64_t bytes, bool is_write) { BDRVRawState *s = bs->opaque; @@ -201,9 +201,9 @@ static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset, return 0; } -static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { int ret; @@ -216,9 +216,9 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { void *buf = NULL; BlockDriver *drv; @@ -289,12 +289,12 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, } static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { int ret; - ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); + ret = raw_adjust_offset(bs, &offset, bytes, true); if (ret) { return ret; } @@ -302,11 +302,11 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, } static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { int ret; - ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); + ret = raw_adjust_offset(bs, &offset, bytes, true); if (ret) { return ret; } @@ -532,10 +532,10 @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) static int coroutine_fn 
raw_co_copy_range_from(BlockDriverState *bs, BdrvChild *src, - uint64_t src_offset, + int64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { @@ -551,10 +551,10 @@ static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, BdrvChild *src, - uint64_t src_offset, + int64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { diff --git a/block/rbd.c b/block/rbd.c index dcf82b15b8..701fbf2b0c 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -1164,17 +1164,17 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs, } static int -coroutine_fn qemu_rbd_co_preadv(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +coroutine_fn qemu_rbd_co_preadv(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_READ); } static int -coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov, - int flags) +coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, int64_t offset, + int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVRBDState *s = bs->opaque; /* @@ -1197,17 +1197,17 @@ static int coroutine_fn qemu_rbd_co_flush(BlockDriverState *bs) } static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs, - int64_t offset, int count) + int64_t offset, int64_t bytes) { - return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD); + return qemu_rbd_start_co(bs, offset, bytes, NULL, 0, RBD_AIO_DISCARD); } #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES static int coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int count, 
BdrvRequestFlags flags) + int64_t bytes, BdrvRequestFlags flags) { - return qemu_rbd_start_co(bs, offset, count, NULL, flags, + return qemu_rbd_start_co(bs, offset, bytes, NULL, flags, RBD_AIO_WRITE_ZEROES); } #endif diff --git a/block/throttle.c b/block/throttle.c index b685166ad4..6e8d52fa24 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -112,8 +112,9 @@ static int64_t throttle_getlength(BlockDriverState *bs) } static int coroutine_fn throttle_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { ThrottleGroupMember *tgm = bs->opaque; @@ -123,8 +124,9 @@ static int coroutine_fn throttle_co_preadv(BlockDriverState *bs, } static int coroutine_fn throttle_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { ThrottleGroupMember *tgm = bs->opaque; throttle_group_co_io_limits_intercept(tgm, bytes, true); @@ -133,7 +135,7 @@ static int coroutine_fn throttle_co_pwritev(BlockDriverState *bs, } static int coroutine_fn throttle_co_pwrite_zeroes(BlockDriverState *bs, - int64_t offset, int bytes, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) { ThrottleGroupMember *tgm = bs->opaque; @@ -143,7 +145,7 @@ static int coroutine_fn throttle_co_pwrite_zeroes(BlockDriverState *bs, } static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { ThrottleGroupMember *tgm = bs->opaque; throttle_group_co_io_limits_intercept(tgm, bytes, true); @@ -152,8 +154,8 @@ static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs, } static int coroutine_fn throttle_co_pwritev_compressed(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, + int64_t offset, + int64_t bytes, QEMUIOVector *qiov) { return throttle_co_pwritev(bs, offset, bytes, 
qiov, diff --git a/block/trace-events b/block/trace-events index f4f1267c8c..f2d0a9b62a 100644 --- a/block/trace-events +++ b/block/trace-events @@ -75,13 +75,13 @@ luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p l # qcow2.c qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu" -qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" +qcow2_writev_start_req(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64 qcow2_writev_done_req(void *co, int ret) "co %p ret %d" qcow2_writev_start_part(void *co) "co %p" qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d" qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64 -qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d" -qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d" +qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64 +qcow2_pwrite_zeroes(void *co, int64_t offset, int64_t bytes) "co %p offset 0x%" PRIx64 " bytes %" PRId64 qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d" # qcow2-cluster.c @@ -152,8 +152,8 @@ nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p off nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x" nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d" nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, 
int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d" -nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64"" -nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d" +nvme_dsm(void *s, int64_t offset, int64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64"" +nvme_dsm_done(void *s, int64_t offset, int64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d" nvme_dma_map_flush(void *s) "s %p" nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u" nvme_create_queue_pair(unsigned q_index, void *q, size_t size, void *aio_context, int fd) "index %u q %p size %zu aioctx %p fd %d" diff --git a/block/vdi.c b/block/vdi.c index 548f8a057b..bdc58d726e 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -544,8 +544,8 @@ static int coroutine_fn vdi_co_block_status(BlockDriverState *bs, } static int coroutine_fn -vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vdi_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVVdiState *s = bs->opaque; QEMUIOVector local_qiov; @@ -600,8 +600,8 @@ vdi_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } static int coroutine_fn -vdi_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vdi_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVVdiState *s = bs->opaque; QEMUIOVector local_qiov; diff --git a/block/vmdk.c b/block/vmdk.c index 4499f136bd..fb4cc9da90 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1888,8 +1888,8 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, } static int coroutine_fn -vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vmdk_co_preadv(BlockDriverState *bs, int64_t offset, 
int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVVmdkState *s = bs->opaque; int ret; @@ -2068,8 +2068,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, } static int coroutine_fn -vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vmdk_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; @@ -2080,8 +2080,8 @@ vmdk_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } static int coroutine_fn -vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov) +vmdk_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov) { if (bytes == 0) { /* The caller will write bytes 0 to signal EOF. @@ -2109,7 +2109,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset, static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, - int bytes, + int64_t bytes, BdrvRequestFlags flags) { int ret; diff --git a/block/vpc.c b/block/vpc.c index 17a705b482..1b4c7333af 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -608,8 +608,8 @@ static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) } static int coroutine_fn -vpc_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vpc_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVVPCState *s = bs->opaque; int ret; @@ -658,8 +658,8 @@ fail: } static int coroutine_fn -vpc_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vpc_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { BDRVVPCState *s = bs->opaque; int64_t image_offset; diff --git a/block/vvfat.c b/block/vvfat.c index 34bf1e3a86..05e78e3c27 100644 --- 
a/block/vvfat.c +++ b/block/vvfat.c @@ -1522,8 +1522,8 @@ static int vvfat_read(BlockDriverState *bs, int64_t sector_num, } static int coroutine_fn -vvfat_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vvfat_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int ret; BDRVVVFATState *s = bs->opaque; @@ -3061,8 +3061,8 @@ DLOG(checkpoint()); } static int coroutine_fn -vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +vvfat_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags) { int ret; BDRVVVFATState *s = bs->opaque; diff --git a/configs/devices/i386-softmmu/default.mak b/configs/devices/i386-softmmu/default.mak index 84d1a2487c..598c6646df 100644 --- a/configs/devices/i386-softmmu/default.mak +++ b/configs/devices/i386-softmmu/default.mak @@ -22,6 +22,7 @@ #CONFIG_TPM_CRB=n #CONFIG_TPM_TIS_ISA=n #CONFIG_VTD=n +#CONFIG_SGX=n # Boards: # diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 3c2be84d80..2f7db9a98d 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -238,6 +238,16 @@ The ``I7200`` guest CPU relies on the nanoMIPS ISA, which is deprecated (the ISA has never been upstreamed to a compiler toolchain). Therefore this CPU is also deprecated. + +QEMU API (QAPI) events +---------------------- + +``MEM_UNPLUG_ERROR`` (since 6.2) +'''''''''''''''''''''''''''''''''''''''''''''''''''''''' + +Use the more generic event ``DEVICE_UNPLUG_GUEST_ERROR`` instead. 
+ + System emulator machines ------------------------ diff --git a/docs/system/i386/sgx.rst b/docs/system/i386/sgx.rst new file mode 100644 index 0000000000..f103ae2a2f --- /dev/null +++ b/docs/system/i386/sgx.rst @@ -0,0 +1,165 @@ +Software Guard eXtensions (SGX) +=============================== + +Overview +-------- + +Intel Software Guard eXtensions (SGX) is a set of instructions and mechanisms +for memory accesses in order to provide security accesses for sensitive +applications and data. SGX allows an application to use its particular +address space as an *enclave*, which is a protected area that provides confidentiality +and integrity even in the presence of privileged malware. Accesses to the +enclave memory area from any software not resident in the enclave are prevented, +including those from privileged software. + +Virtual SGX +----------- + +SGX feature is exposed to guest via SGX CPUID. Looking at SGX CPUID, we can +report the same CPUID info to guest as on host for most of SGX CPUID. With +reporting the same CPUID guest is able to use full capacity of SGX, and KVM +doesn't need to emulate that info. + +The guest's EPC base and size are determined by Qemu, and KVM needs Qemu to +notify such info to it before it can initialize SGX for guest. + +Virtual EPC +~~~~~~~~~~~ + +By default, Qemu does not assign EPC to a VM, i.e. fully enabling SGX in a VM +requires explicit allocation of EPC to the VM. Similar to other specialized +memory types, e.g. hugetlbfs, EPC is exposed as a memory backend. + +SGX EPC is enumerated through CPUID, i.e. EPC "devices" need to be realized +prior to realizing the vCPUs themselves, which occurs long before generic +devices are parsed and realized. This limitation means that EPC does not +require -maxmem as EPC is not treated as {cold,hot}plugged memory. + +Qemu does not artificially restrict the number of EPC sections exposed to a +guest, e.g. Qemu will happily allow you to create 64 1M EPC sections.
Be aware +that some kernels may not recognize all EPC sections, e.g. the Linux SGX driver +is hardwired to support only 8 EPC sections. + +The following Qemu snippet creates two EPC sections, with 64M pre-allocated +to the VM and an additional 28M mapped but not allocated:: + + -object memory-backend-epc,id=mem1,size=64M,prealloc=on \ + -object memory-backend-epc,id=mem2,size=28M \ + -M sgx-epc.0.memdev=mem1,sgx-epc.1.memdev=mem2 + +Note: + +The size and location of the virtual EPC are far less restricted compared +to physical EPC. Because physical EPC is protected via range registers, +the size of the physical EPC must be a power of two (though software sees +a subset of the full EPC, e.g. 92M or 128M) and the EPC must be naturally +aligned. KVM SGX's virtual EPC is purely a software construct and only +requires the size and location to be page aligned. Qemu enforces the EPC +size is a multiple of 4k and will ensure the base of the EPC is 4k aligned. +To simplify the implementation, EPC is always located above 4g in the guest +physical address space. + +Migration +~~~~~~~~~ + +Qemu/KVM doesn't prevent live migrating SGX VMs, although from hardware's +perspective, SGX doesn't support live migration, since both EPC and the SGX +key hierarchy are bound to the physical platform. However live migration +can be supported in the sense if guest software stack can support recreating +enclaves when it suffers sudden loss of EPC; and if guest enclaves can detect +SGX keys being changed, and handle gracefully. For instance, when ERESUME fails +with #PF.SGX, guest software can gracefully detect it and recreate enclaves; +and when enclave fails to unseal sensitive information from outside, it can +detect such error and sensitive information can be provisioned to it again. + +CPUID +~~~~~ + +Due to its myriad dependencies, SGX is currently not listed as supported +in any of Qemu's built-in CPU configurations.
To expose SGX (and SGX Launch +Control) to a guest, you must either use `-cpu host` to pass-through the +host CPU model, or explicitly enable SGX when using a built-in CPU model, +e.g. via `-cpu <model>,+sgx` or `-cpu <model>,+sgx,+sgxlc`. + +All SGX sub-features enumerated through CPUID, e.g. SGX2, MISCSELECT, +ATTRIBUTES, etc... can be restricted via CPUID flags. Be aware that enforcing +restriction of MISCSELECT, ATTRIBUTES and XFRM requires intercepting ECREATE, +i.e. may marginally reduce SGX performance in the guest. All SGX sub-features +controlled via -cpu are prefixed with "sgx", e.g.:: + + $ qemu-system-x86_64 -cpu help | xargs printf "%s\n" | grep sgx + sgx + sgx-debug + sgx-encls-c + sgx-enclv + sgx-exinfo + sgx-kss + sgx-mode64 + sgx-provisionkey + sgx-tokenkey + sgx1 + sgx2 + sgxlc + +The following Qemu snippet passes through the host CPU but restricts access to +the provision and EINIT token keys:: + + -cpu host,-sgx-provisionkey,-sgx-tokenkey + +SGX sub-features cannot be emulated, i.e. sub-features that are not present +in hardware cannot be forced on via '-cpu'. + +Virtualize SGX Launch Control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Qemu SGX support for Launch Control (LC) is passive, in the sense that it +does not actively change the LC configuration. Qemu SGX provides the user +the ability to set/clear the CPUID flag (and by extension the associated +IA32_FEATURE_CONTROL MSR bit in fw_cfg) and saves/restores the LE Hash MSRs +when getting/putting guest state, but Qemu does not add new controls to +directly modify the LC configuration. Similar to hardware behavior, locking +the LC configuration to a non-Intel value is left to guest firmware. Unlike +host bios setting for SGX launch control(LC), there is no special bios setting +for SGX guest by our design. If host is in locked mode, we can still allow +creating VM with SGX. 
+ +Feature Control +~~~~~~~~~~~~~~~ + +Qemu SGX updates the `etc/msr_feature_control` fw_cfg entry to set the SGX +(bit 18) and SGX LC (bit 17) flags based on their respective CPUID support, +i.e. existing guest firmware will automatically set SGX and SGX LC accordingly, +assuming said firmware supports fw_cfg.msr_feature_control. + +Launching a guest +----------------- + +To launch a SGX guest: + +.. parsed-literal:: + + |qemu_system_x86| \\ + -cpu host,+sgx-provisionkey \\ + -object memory-backend-epc,id=mem1,size=64M,prealloc=on \\ + -object memory-backend-epc,id=mem2,size=28M \\ + -M sgx-epc.0.memdev=mem1,sgx-epc.1.memdev=mem2 + +Utilizing SGX in the guest requires a kernel/OS with SGX support. +The support can be determined in guest by:: + + $ grep sgx /proc/cpuinfo + +and SGX epc info by:: + + $ dmesg | grep sgx + [ 1.242142] sgx: EPC section 0x180000000-0x181bfffff + [ 1.242319] sgx: EPC section 0x181c00000-0x1837fffff + +References +---------- + +- `SGX Homepage <https://software.intel.com/sgx>`__ + +- `SGX SDK <https://github.com/intel/linux-sgx.git>`__ + +- SGX specification: Intel SDM Volume 3 diff --git a/docs/system/ppc/powernv.rst b/docs/system/ppc/powernv.rst index 4c4cdea527..86186b7d2c 100644 --- a/docs/system/ppc/powernv.rst +++ b/docs/system/ppc/powernv.rst @@ -53,8 +53,7 @@ initramfs ``skiroot``. Source code can be found on GitHub: https://github.com/open-power. -Prebuilt images of ``skiboot`` and ``skiboot`` are made available on the `OpenPOWER <https://openpower.xyz/job/openpower/job/openpower-op-build/>`__ site. To boot a POWER9 machine, use the `witherspoon <https://openpower.xyz/job/openpower/job/openpower-op-build/label=slave,target=witherspoon/lastSuccessfulBuild/>`__ images. For POWER8, use -the `palmetto <https://openpower.xyz/job/openpower/job/openpower-op-build/label=slave,target=palmetto/lastSuccessfulBuild/>`__ images. 
+Prebuilt images of ``skiboot`` and ``skiroot`` are made available on the `OpenPOWER <https://github.com/open-power/op-build/releases/>`__ site. QEMU includes a prebuilt image of ``skiboot`` which is updated when a more recent version is required by the models. diff --git a/docs/system/target-i386.rst b/docs/system/target-i386.rst index c9720a8cd1..6a86d63863 100644 --- a/docs/system/target-i386.rst +++ b/docs/system/target-i386.rst @@ -26,6 +26,7 @@ Architectural features :maxdepth: 1 i386/cpu + i386/sgx .. _pcsys_005freq: diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst index e39a9f4b1a..56e54cd441 100644 --- a/docs/tools/qemu-nbd.rst +++ b/docs/tools/qemu-nbd.rst @@ -99,8 +99,10 @@ driver options if ``--image-opts`` is specified. .. option:: --cache=CACHE - The cache mode to be used with the file. See the documentation of - the emulator's ``-drive cache=...`` option for allowed values. + The cache mode to be used with the file. Valid values are: + ``none``, ``writeback`` (the default), ``writethrough``, + ``directsync`` and ``unsafe``. See the documentation of + the emulator's ``-drive cache=...`` option for more info. .. option:: -n, --nocache diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx index 27206ac049..4c966e8a6b 100644 --- a/hmp-commands-info.hx +++ b/hmp-commands-info.hx @@ -877,3 +877,18 @@ SRST ``info dirty_rate`` Display the vcpu dirty rate information. ERST + +#if defined(TARGET_I386) + { + .name = "sgx", + .args_type = "", + .params = "", + .help = "show intel SGX information", + .cmd = hmp_info_sgx, + }, +#endif + +SRST + ``info sgx`` + Show intel SGX information. 
+ERST diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index af37889423..d0fffcf787 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -8,6 +8,7 @@ #include "qapi/error.h" #include "qapi/qapi-events-acpi.h" #include "qapi/qapi-events-machine.h" +#include "qapi/qapi-events-qdev.h" #define MEMORY_SLOTS_NUMBER "MDNR" #define MEMORY_HOTPLUG_IO_REGION "HPMR" @@ -178,8 +179,16 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, hotplug_handler_unplug(hotplug_ctrl, dev, &local_err); if (local_err) { trace_mhp_acpi_pc_dimm_delete_failed(mem_st->selector); - qapi_event_send_mem_unplug_error(dev->id, + + /* + * Send both MEM_UNPLUG_ERROR and DEVICE_UNPLUG_GUEST_ERROR + * while the deprecation of MEM_UNPLUG_ERROR is + * pending. + */ + qapi_event_send_mem_unplug_error(dev->id ? : "", error_get_pretty(local_err)); + qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id, + dev->canonical_path); error_free(local_err); break; } diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index ddedcef0b2..962d2c981b 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -6,6 +6,10 @@ config SEV select X86_FW_OVMF depends on KVM +config SGX + bool + depends on KVM + config PC bool imply APPLESMC @@ -21,6 +25,7 @@ config PC imply PVPANIC_ISA imply QXL imply SEV + imply SGX imply SGA imply TEST_DEVICES imply TPM_CRB diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index dfaa47cdc2..f4d6ae3d02 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1841,6 +1841,28 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, } #endif + if (pcms->sgx_epc.size != 0) { + uint64_t epc_base = pcms->sgx_epc.base; + uint64_t epc_size = pcms->sgx_epc.size; + + dev = aml_device("EPC"); + aml_append(dev, aml_name_decl("_HID", aml_eisaid("INT0E0C"))); + aml_append(dev, aml_name_decl("_STR", + aml_unicode("Enclave Page Cache 1.0"))); + crs = aml_resource_template(); + aml_append(crs, + aml_qword_memory(AML_POS_DECODE, 
AML_MIN_FIXED, + AML_MAX_FIXED, AML_NON_CACHEABLE, + AML_READ_WRITE, 0, epc_base, + epc_base + epc_size - 1, 0, epc_size)); + aml_append(dev, aml_name_decl("_CRS", crs)); + + method = aml_method("_STA", 0, AML_NOTSERIALIZED); + aml_append(method, aml_return(aml_int(0x0f))); + aml_append(dev, method); + + aml_append(sb_scope, dev); + } aml_append(dsdt, sb_scope); /* copy AML table into ACPI tables blob and patch header there */ diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c index 4e68d5dea4..a283785a8d 100644 --- a/hw/i386/fw_cfg.c +++ b/hw/i386/fw_cfg.c @@ -159,7 +159,7 @@ void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg) { X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu); CPUX86State *env = &cpu->env; - uint32_t unused, ecx, edx; + uint32_t unused, ebx, ecx, edx; uint64_t feature_control_bits = 0; uint64_t *val; @@ -174,6 +174,16 @@ void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg) feature_control_bits |= FEATURE_CONTROL_LMCE; } + if (env->cpuid_level >= 7) { + cpu_x86_cpuid(env, 0x7, 0, &unused, &ebx, &ecx, &unused); + if (ebx & CPUID_7_0_EBX_SGX) { + feature_control_bits |= FEATURE_CONTROL_SGX; + } + if (ecx & CPUID_7_0_ECX_SGX_LC) { + feature_control_bits |= FEATURE_CONTROL_SGX_LC; + } + } + if (!feature_control_bits) { return; } diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c index fa68669e8a..191a26fa57 100644 --- a/hw/i386/kvm/i8254.c +++ b/hw/i386/kvm/i8254.c @@ -59,11 +59,6 @@ struct KVMPITClass { DeviceRealize parent_realize; }; -static int64_t abs64(int64_t v) -{ - return v < 0 ? 
-v : v; -} - static void kvm_pit_update_clock_offset(KVMPITState *s) { int64_t offset, clock_offset; @@ -81,7 +76,7 @@ static void kvm_pit_update_clock_offset(KVMPITState *s) clock_gettime(CLOCK_MONOTONIC, &ts); offset -= ts.tv_nsec; offset -= (int64_t)ts.tv_sec * 1000000000; - if (abs64(offset) < abs64(clock_offset)) { + if (uabs64(offset) < uabs64(clock_offset)) { clock_offset = offset; } } diff --git a/hw/i386/meson.build b/hw/i386/meson.build index 80dad29f2b..c502965219 100644 --- a/hw/i386/meson.build +++ b/hw/i386/meson.build @@ -16,6 +16,8 @@ i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c')) i386_ss.add(when: 'CONFIG_VMMOUSE', if_true: files('vmmouse.c')) i386_ss.add(when: 'CONFIG_VMPORT', if_true: files('vmport.c')) i386_ss.add(when: 'CONFIG_VTD', if_true: files('intel_iommu.c')) +i386_ss.add(when: 'CONFIG_SGX', if_true: files('sgx-epc.c','sgx.c'), + if_false: files('sgx-stub.c')) i386_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-common.c')) i386_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device_x86.c')) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 557d49c9f8..df457eceba 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -889,6 +889,10 @@ void pc_memory_init(PCMachineState *pcms, e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); } + if (pcms->sgx_epc.size != 0) { + e820_add_entry(pcms->sgx_epc.base, pcms->sgx_epc.size, E820_RESERVED); + } + if (!pcmc->has_reserved_memory && (machine->ram_slots || (machine->maxram_size > machine->ram_size))) { @@ -919,8 +923,15 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } + if (pcms->sgx_epc.size != 0) { + machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); + } else { + machine->device_memory->base = + 0x100000000ULL + x86ms->above_4g_mem_size; + } + machine->device_memory->base = - ROUND_UP(0x100000000ULL + x86ms->above_4g_mem_size, 1 * GiB); + ROUND_UP(machine->device_memory->base, 1 * GiB); if (pcmc->enforce_aligned_dimm) { /* 
size device region assuming 1G page max alignment per slot */ @@ -1005,6 +1016,8 @@ uint64_t pc_pci_hole64_start(void) if (!pcmc->broken_reserved_end) { hole64_start += memory_region_size(&ms->device_memory->mr); } + } else if (pcms->sgx_epc.size != 0) { + hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; } diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index c5da7739ce..6cc834aff6 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -153,6 +153,7 @@ static void pc_init1(MachineState *machine, } } + pc_machine_init_sgx_epc(pcms); x86_cpus_init(x86ms, pcmc->default_cpu_version); if (pcmc->kvmclock_enabled) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 46cd542d17..5481d5c965 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -177,6 +177,7 @@ static void pc_q35_init(MachineState *machine) x86ms->below_4g_mem_size = machine->ram_size; } + pc_machine_init_sgx_epc(pcms); x86_cpus_init(x86ms, pcmc->default_cpu_version); kvmclock_create(pcmc->kvmclock_create_always); diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c new file mode 100644 index 0000000000..55e2217eae --- /dev/null +++ b/hw/i386/sgx-epc.c @@ -0,0 +1,184 @@ +/* + * SGX EPC device + * + * Copyright (C) 2019 Intel Corporation + * + * Authors: + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/mem/memory-device.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qapi/visitor.h" +#include "target/i386/cpu.h" +#include "exec/address-spaces.h" + +static Property sgx_epc_properties[] = { + DEFINE_PROP_UINT64(SGX_EPC_ADDR_PROP, SGXEPCDevice, addr, 0), + DEFINE_PROP_LINK(SGX_EPC_MEMDEV_PROP, SGXEPCDevice, hostmem, + TYPE_MEMORY_BACKEND_EPC, HostMemoryBackendEpc *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void sgx_epc_get_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Error *local_err = NULL; + uint64_t value; + + value = memory_device_get_region_size(MEMORY_DEVICE(obj), &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + visit_type_uint64(v, name, &value, errp); +} + +static void sgx_epc_init(Object *obj) +{ + object_property_add(obj, SGX_EPC_SIZE_PROP, "uint64", sgx_epc_get_size, + NULL, NULL, NULL); +} + +static void sgx_epc_realize(DeviceState *dev, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + X86MachineState *x86ms = X86_MACHINE(pcms); + MemoryDeviceState *md = MEMORY_DEVICE(dev); + SGXEPCState *sgx_epc = &pcms->sgx_epc; + SGXEPCDevice *epc = SGX_EPC(dev); + HostMemoryBackend *hostmem; + const char *path; + + if (x86ms->boot_cpus != 0) { + error_setg(errp, "'" TYPE_SGX_EPC "' can't be created after vCPUs," + "e.g. 
via -device"); + return; + } + + if (!epc->hostmem) { + error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property is not set"); + return; + } + hostmem = MEMORY_BACKEND(epc->hostmem); + if (host_memory_backend_is_mapped(hostmem)) { + path = object_get_canonical_path_component(OBJECT(hostmem)); + error_setg(errp, "can't use already busy memdev: %s", path); + return; + } + + epc->addr = sgx_epc->base + sgx_epc->size; + + memory_region_add_subregion(&sgx_epc->mr, epc->addr - sgx_epc->base, + host_memory_backend_get_memory(hostmem)); + + host_memory_backend_set_mapped(hostmem, true); + + sgx_epc->sections = g_renew(SGXEPCDevice *, sgx_epc->sections, + sgx_epc->nr_sections + 1); + sgx_epc->sections[sgx_epc->nr_sections++] = epc; + + sgx_epc->size += memory_device_get_region_size(md, errp); +} + +static void sgx_epc_unrealize(DeviceState *dev) +{ + SGXEPCDevice *epc = SGX_EPC(dev); + HostMemoryBackend *hostmem = MEMORY_BACKEND(epc->hostmem); + + host_memory_backend_set_mapped(hostmem, false); +} + +static uint64_t sgx_epc_md_get_addr(const MemoryDeviceState *md) +{ + const SGXEPCDevice *epc = SGX_EPC(md); + + return epc->addr; +} + +static void sgx_epc_md_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), SGX_EPC_ADDR_PROP, addr, errp); +} + +static uint64_t sgx_epc_md_get_plugged_size(const MemoryDeviceState *md, + Error **errp) +{ + return 0; +} + +static MemoryRegion *sgx_epc_md_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + SGXEPCDevice *epc = SGX_EPC(md); + HostMemoryBackend *hostmem; + + if (!epc->hostmem) { + error_setg(errp, "'" SGX_EPC_MEMDEV_PROP "' property must be set"); + return NULL; + } + + hostmem = MEMORY_BACKEND(epc->hostmem); + return host_memory_backend_get_memory(hostmem); +} + +static void sgx_epc_md_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + SgxEPCDeviceInfo *se = g_new0(SgxEPCDeviceInfo, 1); + SGXEPCDevice *epc = SGX_EPC(md); + + se->memaddr = 
epc->addr; + se->size = object_property_get_uint(OBJECT(epc), SGX_EPC_SIZE_PROP, + NULL); + se->memdev = object_get_canonical_path(OBJECT(epc->hostmem)); + + info->u.sgx_epc.data = se; + info->type = MEMORY_DEVICE_INFO_KIND_SGX_EPC; +} + +static void sgx_epc_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(oc); + + dc->hotpluggable = false; + dc->realize = sgx_epc_realize; + dc->unrealize = sgx_epc_unrealize; + dc->desc = "SGX EPC section"; + device_class_set_props(dc, sgx_epc_properties); + + mdc->get_addr = sgx_epc_md_get_addr; + mdc->set_addr = sgx_epc_md_set_addr; + mdc->get_plugged_size = sgx_epc_md_get_plugged_size; + mdc->get_memory_region = sgx_epc_md_get_memory_region; + mdc->fill_device_info = sgx_epc_md_fill_device_info; +} + +static TypeInfo sgx_epc_info = { + .name = TYPE_SGX_EPC, + .parent = TYPE_DEVICE, + .instance_size = sizeof(SGXEPCDevice), + .instance_init = sgx_epc_init, + .class_init = sgx_epc_class_init, + .class_size = sizeof(DeviceClass), + .interfaces = (InterfaceInfo[]) { + { TYPE_MEMORY_DEVICE }, + { } + }, +}; + +static void sgx_epc_register_types(void) +{ + type_register_static(&sgx_epc_info); +} + +type_init(sgx_epc_register_types) diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c new file mode 100644 index 0000000000..3be9f5ca32 --- /dev/null +++ b/hw/i386/sgx-stub.c @@ -0,0 +1,26 @@ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/i386/sgx.h" + +SGXInfo *sgx_get_info(Error **errp) +{ + error_setg(errp, "SGX support is not compiled in"); + return NULL; +} + +SGXInfo *sgx_get_capabilities(Error **errp) +{ + error_setg(errp, "SGX support is not compiled in"); + return NULL; +} + +void pc_machine_init_sgx_epc(PCMachineState *pcms) +{ + memset(&pcms->sgx_epc, 0, sizeof(SGXEPCState)); +} + +int sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) +{ + g_assert_not_reached(); +} diff --git 
a/hw/i386/sgx.c b/hw/i386/sgx.c new file mode 100644 index 0000000000..e481e9358f --- /dev/null +++ b/hw/i386/sgx.c @@ -0,0 +1,170 @@ +/* + * SGX common code + * + * Copyright (C) 2021 Intel Corporation + * + * Authors: + * Yang Zhong<yang.zhong@intel.com> + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "hw/i386/pc.h" +#include "hw/i386/sgx-epc.h" +#include "hw/mem/memory-device.h" +#include "monitor/qdev.h" +#include "qapi/error.h" +#include "exec/address-spaces.h" +#include "hw/i386/sgx.h" +#include "sysemu/hw_accel.h" + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_CPUID_EPC_INVALID 0x0 + +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +#define SGX_CPUID_EPC_MASK 0xF + +static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high) +{ + return (low & MAKE_64BIT_MASK(12, 20)) + + ((high & MAKE_64BIT_MASK(0, 20)) << 32); +} + +static uint64_t sgx_calc_host_epc_section_size(void) +{ + uint32_t i, type; + uint32_t eax, ebx, ecx, edx; + uint64_t size = 0; + + for (i = 0; i < SGX_MAX_EPC_SECTIONS; i++) { + host_cpuid(0x12, i + 2, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) { + break; + } + + if (type != SGX_CPUID_EPC_SECTION) { + break; + } + + size += sgx_calc_section_metric(ecx, edx); + } + + return size; +} + +SGXInfo *sgx_get_capabilities(Error **errp) +{ + SGXInfo *info = NULL; + uint32_t eax, ebx, ecx, edx; + + int fd = qemu_open_old("/dev/sgx_vepc", O_RDWR); + if (fd < 0) { + error_setg(errp, "SGX is not enabled in KVM"); + return NULL; + } + + info = g_new0(SGXInfo, 1); + host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); + + info->sgx = ebx & (1U << 2) ? true : false; + info->flc = ecx & (1U << 30) ? true : false; + + host_cpuid(0x12, 0, &eax, &ebx, &ecx, &edx); + info->sgx1 = eax & (1U << 0) ? 
true : false; + info->sgx2 = eax & (1U << 1) ? true : false; + + info->section_size = sgx_calc_host_epc_section_size(); + + close(fd); + + return info; +} + +SGXInfo *sgx_get_info(Error **errp) +{ + SGXInfo *info = NULL; + X86MachineState *x86ms; + PCMachineState *pcms = + (PCMachineState *)object_dynamic_cast(qdev_get_machine(), + TYPE_PC_MACHINE); + if (!pcms) { + error_setg(errp, "SGX is only supported on PC machines"); + return NULL; + } + + x86ms = X86_MACHINE(pcms); + if (!x86ms->sgx_epc_list) { + error_setg(errp, "No EPC regions defined, SGX not available"); + return NULL; + } + + SGXEPCState *sgx_epc = &pcms->sgx_epc; + info = g_new0(SGXInfo, 1); + + info->sgx = true; + info->sgx1 = true; + info->sgx2 = true; + info->flc = true; + info->section_size = sgx_epc->size; + + return info; +} + +int sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) +{ + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + SGXEPCDevice *epc; + + if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { + return 1; + } + + epc = pcms->sgx_epc.sections[section_nr]; + + *addr = epc->addr; + *size = memory_device_get_region_size(MEMORY_DEVICE(epc), &error_fatal); + + return 0; +} + +void pc_machine_init_sgx_epc(PCMachineState *pcms) +{ + SGXEPCState *sgx_epc = &pcms->sgx_epc; + X86MachineState *x86ms = X86_MACHINE(pcms); + SgxEPCList *list = NULL; + Object *obj; + + memset(sgx_epc, 0, sizeof(SGXEPCState)); + if (!x86ms->sgx_epc_list) { + return; + } + + sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + + memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); + memory_region_add_subregion(get_system_memory(), sgx_epc->base, + &sgx_epc->mr); + + for (list = x86ms->sgx_epc_list; list; list = list->next) { + obj = object_new("sgx-epc"); + + /* set the memdev link with memory backend */ + object_property_parse(obj, SGX_EPC_MEMDEV_PROP, list->value->memdev, + &error_fatal); + object_property_set_bool(obj, "realized", true, 
&error_fatal); + object_unref(obj); + } + + if ((sgx_epc->base + sgx_epc->size) < sgx_epc->base) { + error_report("Size of all 'sgx-epc' =0x%"PRIu64" causes EPC to wrap", + sgx_epc->size); + exit(EXIT_FAILURE); + } + + memory_region_set_size(&sgx_epc->mr, sgx_epc->size); +} diff --git a/hw/i386/x86.c b/hw/i386/x86.c index 00448ed55a..41ef9a84a9 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -30,6 +30,8 @@ #include "qapi/error.h" #include "qapi/qmp/qerror.h" #include "qapi/qapi-visit-common.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-visit-machine.h" #include "qapi/visitor.h" #include "sysemu/qtest.h" #include "sysemu/whpx.h" @@ -1263,6 +1265,27 @@ static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v, visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp); } +static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + SgxEPCList *list = x86ms->sgx_epc_list; + + visit_type_SgxEPCList(v, name, &list, errp); +} + +static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + SgxEPCList *list; + + list = x86ms->sgx_epc_list; + visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp); + + qapi_free_SgxEPCList(list); +} + static void x86_machine_initfn(Object *obj) { X86MachineState *x86ms = X86_MACHINE(obj); @@ -1322,6 +1345,12 @@ static void x86_machine_class_init(ObjectClass *oc, void *data) x86_machine_set_bus_lock_ratelimit, NULL, NULL); object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "Set the ratelimit for the bus locks acquired in VMs"); + + object_class_property_add(oc, "sgx-epc", "SgxEPC", + machine_get_sgx_epc, machine_set_sgx_epc, + NULL, NULL); + object_class_property_set_description(oc, "sgx-epc", + "SGX EPC device"); } static const TypeInfo x86_machine_info = { diff --git a/hw/i386/xen/xen-hvm.c 
b/hw/i386/xen/xen-hvm.c index 9b432773f0..e3d3d5cf89 100644 --- a/hw/i386/xen/xen-hvm.c +++ b/hw/i386/xen/xen-hvm.c @@ -721,6 +721,7 @@ static void xen_log_global_stop(MemoryListener *listener) } static MemoryListener xen_memory_listener = { + .name = "xen-memory", .region_add = xen_region_add, .region_del = xen_region_del, .log_start = xen_log_start, @@ -732,6 +733,7 @@ static MemoryListener xen_memory_listener = { }; static MemoryListener xen_io_listener = { + .name = "xen-io", .region_add = xen_io_add, .region_del = xen_io_del, .priority = 10, diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index 9b4c17854d..49504e740f 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -25,12 +25,8 @@ /* * * Based on OpenPic implementations: - * - Intel GW80314 I/O companion chip developer's manual * - Motorola MPC8245 & MPC8540 user manuals. - * - Motorola MCP750 (aka Raven) programmer manual. - * - Motorola Harrier programmer manuel - * - * Serial interrupts, as implemented in Raven chipset are not supported yet. + * - Motorola Harrier programmer manual * */ @@ -51,7 +47,7 @@ #include "qemu/timer.h" #include "qemu/error-report.h" -//#define DEBUG_OPENPIC +/* #define DEBUG_OPENPIC */ #ifdef DEBUG_OPENPIC static const int debug_openpic = 1; @@ -122,7 +118,8 @@ static FslMpicInfo fsl_mpic_42 = { #define ILR_INTTGT_CINT 0x01 /* critical */ #define ILR_INTTGT_MCP 0x02 /* machine check */ -/* The currently supported INTTGT values happen to be the same as QEMU's +/* + * The currently supported INTTGT values happen to be the same as QEMU's * openpic output codes, but don't depend on this. The output codes * could change (unlikely, but...) or support could be added for * more INTTGT values. @@ -181,10 +178,11 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, uint32_t val, int idx); static void openpic_reset(DeviceState *d); -/* Convert between openpic clock ticks and nanosecs. 
In the hardware the clock - frequency is driven by board inputs to the PIC which the PIC would then - divide by 4 or 8. For now hard code to 25MZ. -*/ +/* + * Convert between openpic clock ticks and nanosecs. In the hardware the clock + * frequency is driven by board inputs to the PIC which the PIC would then + * divide by 4 or 8. For now hard code to 25MZ. + */ #define OPENPIC_TIMER_FREQ_MHZ 25 #define OPENPIC_TIMER_NS_PER_TICK (1000 / OPENPIC_TIMER_FREQ_MHZ) static inline uint64_t ns_to_ticks(uint64_t ns) @@ -257,7 +255,8 @@ static void IRQ_local_pipe(OpenPICState *opp, int n_CPU, int n_IRQ, __func__, src->output, n_IRQ, active, was_active, dst->outputs_active[src->output]); - /* On Freescale MPIC, critical interrupts ignore priority, + /* + * On Freescale MPIC, critical interrupts ignore priority, * IACK, EOI, etc. Before MPIC v4.1 they also ignore * masking. */ @@ -280,7 +279,8 @@ static void IRQ_local_pipe(OpenPICState *opp, int n_CPU, int n_IRQ, priority = IVPR_PRIORITY(src->ivpr); - /* Even if the interrupt doesn't have enough priority, + /* + * Even if the interrupt doesn't have enough priority, * it is still raised, in case ctpr is lowered later. */ if (active) { @@ -412,7 +412,8 @@ static void openpic_set_irq(void *opaque, int n_IRQ, int level) } if (src->output != OPENPIC_OUTPUT_INT) { - /* Edge-triggered interrupts shouldn't be used + /* + * Edge-triggered interrupts shouldn't be used * with non-INT delivery, but just in case, * try to make it do something sane rather than * cause an interrupt storm. This is close to @@ -505,7 +506,8 @@ static inline void write_IRQreg_ivpr(OpenPICState *opp, int n_IRQ, uint32_t val) { uint32_t mask; - /* NOTE when implementing newer FSL MPIC models: starting with v4.0, + /* + * NOTE when implementing newer FSL MPIC models: starting with v4.0, * the polarity bit is read-only on internal interrupts. 
*/ mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | @@ -515,7 +517,8 @@ static inline void write_IRQreg_ivpr(OpenPICState *opp, int n_IRQ, uint32_t val) opp->src[n_IRQ].ivpr = (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); - /* For FSL internal interrupts, The sense bit is reserved and zero, + /* + * For FSL internal interrupts, The sense bit is reserved and zero, * and the interrupt is always level-triggered. Timers and IPIs * have no sense or polarity bits, and are edge-triggered. */ @@ -699,16 +702,20 @@ static void qemu_timer_cb(void *opaque) openpic_set_irq(opp, n_IRQ, 0); } -/* If enabled is true, arranges for an interrupt to be raised val clocks into - the future, if enabled is false cancels the timer. */ +/* + * If enabled is true, arranges for an interrupt to be raised val clocks into + * the future, if enabled is false cancels the timer. + */ static void openpic_tmr_set_tmr(OpenPICTimer *tmr, uint32_t val, bool enabled) { uint64_t ns = ticks_to_ns(val & ~TCCR_TOG); - /* A count of zero causes a timer to be set to expire immediately. This - effectively stops the simulation since the timer is constantly expiring - which prevents guest code execution, so we don't honor that - configuration. On real hardware, this situation would generate an - interrupt on every clock cycle if the interrupt was unmasked. */ + /* + * A count of zero causes a timer to be set to expire immediately. This + * effectively stops the simulation since the timer is constantly expiring + * which prevents guest code execution, so we don't honor that + * configuration. On real hardware, this situation would generate an + * interrupt on every clock cycle if the interrupt was unmasked. 
+ */ if ((ns == 0) || !enabled) { tmr->qemu_timer_active = false; tmr->tccr = tmr->tccr & TCCR_TOG; @@ -721,8 +728,10 @@ static void openpic_tmr_set_tmr(OpenPICTimer *tmr, uint32_t val, bool enabled) } } -/* Returns the currrent tccr value, i.e., timer value (in clocks) with - appropriate TOG. */ +/* + * Returns the currrent tccr value, i.e., timer value (in clocks) with + * appropriate TOG. + */ static uint64_t openpic_tmr_get_timer(OpenPICTimer *tmr) { uint64_t retval; @@ -1276,6 +1285,15 @@ static void openpic_reset(DeviceState *d) break; } + /* Mask all IPI interrupts for Freescale OpenPIC */ + if ((opp->model == OPENPIC_MODEL_FSL_MPIC_20) || + (opp->model == OPENPIC_MODEL_FSL_MPIC_42)) { + if (i >= opp->irq_ipi0 && i < opp->irq_tim0) { + write_IRQreg_idr(opp, i, 0); + continue; + } + } + write_IRQreg_idr(opp, i, opp->idr_reset); } /* Initialise IRQ destinations */ @@ -1304,7 +1322,7 @@ static void openpic_reset(DeviceState *d) typedef struct MemReg { const char *name; MemoryRegionOps const *ops; - hwaddr start_addr; + hwaddr start_addr; ram_addr_t size; } MemReg; @@ -1555,28 +1573,6 @@ static void openpic_realize(DeviceState *dev, Error **errp) break; - case OPENPIC_MODEL_RAVEN: - opp->nb_irqs = RAVEN_MAX_EXT; - opp->vid = VID_REVISION_1_3; - opp->vir = VIR_GENERIC; - opp->vector_mask = 0xFF; - opp->tfrr_reset = 4160000; - opp->ivpr_reset = IVPR_MASK_MASK | IVPR_MODE_MASK; - opp->idr_reset = 0; - opp->max_irq = RAVEN_MAX_IRQ; - opp->irq_ipi0 = RAVEN_IPI_IRQ; - opp->irq_tim0 = RAVEN_TMR_IRQ; - opp->brr1 = -1; - opp->mpic_mode_mask = GCR_MODE_MIXED; - - if (opp->nb_cpus != 1) { - error_setg(errp, "Only UP supported today"); - return; - } - - map_list(opp, list_le, &list_count); - break; - case OPENPIC_MODEL_KEYLARGO: opp->nb_irqs = KEYLARGO_MAX_EXT; opp->vid = VID_REVISION_1_2; diff --git a/hw/intc/openpic_kvm.c b/hw/intc/openpic_kvm.c index 21da680389..557dd0c2bf 100644 --- a/hw/intc/openpic_kvm.c +++ b/hw/intc/openpic_kvm.c @@ -234,6 +234,7 @@ static void 
kvm_openpic_realize(DeviceState *dev, Error **errp) opp->mem_listener.region_add = kvm_openpic_region_add; opp->mem_listener.region_del = kvm_openpic_region_del; + opp->mem_listener.name = "openpic-kvm"; memory_listener_register(&opp->mem_listener, &address_space_memory); /* indicate pic capabilities */ diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 3e534b9685..6d4909d0a8 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -236,6 +236,8 @@ int kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) SpaprXive *xive = SPAPR_XIVE(xsrc->xive); uint64_t state = 0; + trace_kvm_xive_source_reset(srcno); + assert(xive->fd != -1); if (xive_source_irq_is_lsi(xsrc, srcno)) { @@ -311,8 +313,6 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset, return xive_esb_rw(xsrc, srcno, offset, data, 1); } - trace_kvm_xive_source_reset(srcno); - /* * Special Load EOI handling for LSI sources. Q bit is never set * and the interrupt should be re-triggered if the level is still diff --git a/hw/intc/xive.c b/hw/intc/xive.c index b817ee8e37..6c82326ec7 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -28,17 +28,6 @@ */ /* - * Convert a priority number to an Interrupt Pending Buffer (IPB) - * register, which indicates a pending interrupt at the priority - * corresponding to the bit number - */ -static uint8_t priority_to_ipb(uint8_t priority) -{ - return priority > XIVE_PRIORITY_MAX ? - 0 : 1 << (XIVE_PRIORITY_MAX - priority); -} - -/* * Convert an Interrupt Pending Buffer (IPB) register to a Pending * Interrupt Priority Register (PIPR), which contains the priority of * the most favored pending notification. 
@@ -89,7 +78,7 @@ static uint64_t xive_tctx_accept(XiveTCTX *tctx, uint8_t ring) regs[TM_CPPR] = cppr; /* Reset the pending buffer bit */ - regs[TM_IPB] &= ~priority_to_ipb(cppr); + regs[TM_IPB] &= ~xive_priority_to_ipb(cppr); regs[TM_PIPR] = ipb_to_pipr(regs[TM_IPB]); /* Drop Exception bit */ @@ -152,11 +141,6 @@ void xive_tctx_ipb_update(XiveTCTX *tctx, uint8_t ring, uint8_t ipb) xive_tctx_notify(tctx, ring); } -static inline uint32_t xive_tctx_word2(uint8_t *ring) -{ - return *((uint32_t *) &ring[TM_WORD2]); -} - /* * XIVE Thread Interrupt Management Area (TIMA) */ @@ -353,7 +337,7 @@ static void xive_tm_set_os_cppr(XivePresenter *xptr, XiveTCTX *tctx, static void xive_tm_set_os_pending(XivePresenter *xptr, XiveTCTX *tctx, hwaddr offset, uint64_t value, unsigned size) { - xive_tctx_ipb_update(tctx, TM_QW1_OS, priority_to_ipb(value & 0xff)); + xive_tctx_ipb_update(tctx, TM_QW1_OS, xive_priority_to_ipb(value & 0xff)); } static void xive_os_cam_decode(uint32_t cam, uint8_t *nvt_blk, @@ -1535,7 +1519,8 @@ bool xive_presenter_notify(XiveFabric *xfb, uint8_t format, /* handle CPU exception delivery */ if (count) { trace_xive_presenter_notify(nvt_blk, nvt_idx, match.ring); - xive_tctx_ipb_update(match.tctx, match.ring, priority_to_ipb(priority)); + xive_tctx_ipb_update(match.tctx, match.ring, + xive_priority_to_ipb(priority)); } return !!count; @@ -1682,7 +1667,8 @@ static void xive_router_end_notify(XiveRouter *xrtr, uint8_t end_blk, * use. The presenter will resend the interrupt when the vCPU * is dispatched again on a HW thread. 
*/ - ipb = xive_get_field32(NVT_W4_IPB, nvt.w4) | priority_to_ipb(priority); + ipb = xive_get_field32(NVT_W4_IPB, nvt.w4) | + xive_priority_to_ipb(priority); nvt.w4 = xive_set_field32(NVT_W4_IPB, nvt.w4, ipb); xive_router_write_nvt(xrtr, nvt_blk, nvt_idx, &nvt, 4); diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 2f5358b70c..71e45515f1 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -723,6 +723,8 @@ static uint64_t pnv_chip_get_ram_size(PnvMachineState *pnv, int chip_id) return QEMU_ALIGN_DOWN(ram_per_chip, 1 * MiB); } + assert(pnv->num_chips > 1); + ram_per_chip = (machine->ram_size - 1 * GiB) / (pnv->num_chips - 1); return chip_id == 0 ? 1 * GiB : QEMU_ALIGN_DOWN(ram_per_chip, 1 * MiB); } @@ -838,8 +840,7 @@ static void pnv_init(MachineState *machine) for (i = 0; i < pnv->num_chips; i++) { char chip_name[32]; Object *chip = OBJECT(qdev_new(chip_typename)); - int chip_id = i; - uint64_t chip_ram_size = pnv_chip_get_ram_size(pnv, chip_id); + uint64_t chip_ram_size = pnv_chip_get_ram_size(pnv, i); pnv->chips[i] = PNV_CHIP(chip); @@ -850,9 +851,9 @@ static void pnv_init(MachineState *machine) &error_fatal); chip_ram_start += chip_ram_size; - snprintf(chip_name, sizeof(chip_name), "chip[%d]", chip_id); + snprintf(chip_name, sizeof(chip_name), "chip[%d]", i); object_property_add_child(OBJECT(pnv), chip_name, chip); - object_property_set_int(chip, "chip-id", chip_id, &error_fatal); + object_property_set_int(chip, "chip-id", i, &error_fatal); object_property_set_int(chip, "nr-cores", machine->smp.cores, &error_fatal); object_property_set_int(chip, "nr-threads", machine->smp.threads, @@ -1369,10 +1370,10 @@ static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error **errp) sizeof(*eq), TYPE_PNV_QUAD, &error_fatal, NULL); - object_property_set_int(OBJECT(eq), "id", core_id, &error_fatal); + object_property_set_int(OBJECT(eq), "quad-id", core_id, &error_fatal); qdev_realize(DEVICE(eq), NULL, &error_fatal); - pnv_xscom_add_subregion(chip, PNV9_XSCOM_EQ_BASE(eq->id), + 
pnv_xscom_add_subregion(chip, PNV9_XSCOM_EQ_BASE(eq->quad_id), &eq->xscom_regs); } } diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c index 4de8414df2..19e8eb885f 100644 --- a/hw/ppc/pnv_core.c +++ b/hw/ppc/pnv_core.c @@ -407,13 +407,13 @@ static void pnv_quad_realize(DeviceState *dev, Error **errp) PnvQuad *eq = PNV_QUAD(dev); char name[32]; - snprintf(name, sizeof(name), "xscom-quad.%d", eq->id); + snprintf(name, sizeof(name), "xscom-quad.%d", eq->quad_id); pnv_xscom_region_init(&eq->xscom_regs, OBJECT(dev), &pnv_quad_xscom_ops, eq, name, PNV9_XSCOM_EQ_SIZE); } static Property pnv_quad_properties[] = { - DEFINE_PROP_UINT32("id", PnvQuad, id, 0), + DEFINE_PROP_UINT32("quad-id", PnvQuad, quad_id, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/ppc/pnv_xscom.c b/hw/ppc/pnv_xscom.c index faa488e311..9ce018dbc2 100644 --- a/hw/ppc/pnv_xscom.c +++ b/hw/ppc/pnv_xscom.c @@ -284,6 +284,10 @@ int pnv_dt_xscom(PnvChip *chip, void *fdt, int root_offset, _FDT(xscom_offset); g_free(name); _FDT((fdt_setprop_cell(fdt, xscom_offset, "ibm,chip-id", chip->chip_id))); + /* + * On P10, the xscom bus id has been deprecated and the chip id is + * calculated from the "Primary topology table index". See skiboot. + */ _FDT((fdt_setprop_cell(fdt, xscom_offset, "ibm,primary-topology-index", chip->chip_id))); _FDT((fdt_setprop_cell(fdt, xscom_offset, "#address-cells", 1))); diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index 7375bf4fa9..f5d012f860 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -37,22 +37,6 @@ #include "migration/vmstate.h" #include "trace.h" -//#define PPC_DEBUG_IRQ -//#define PPC_DEBUG_TB - -#ifdef PPC_DEBUG_IRQ -# define LOG_IRQ(...) qemu_log_mask(CPU_LOG_INT, ## __VA_ARGS__) -#else -# define LOG_IRQ(...) do { } while (0) -#endif - - -#ifdef PPC_DEBUG_TB -# define LOG_TB(...) qemu_log(__VA_ARGS__) -#else -# define LOG_TB(...) 
do { } while (0) -#endif - static void cpu_ppc_tb_stop (CPUPPCState *env); static void cpu_ppc_tb_start (CPUPPCState *env); @@ -86,9 +70,8 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level) } - LOG_IRQ("%s: %p n_IRQ %d level %d => pending %08" PRIx32 - "req %08x\n", __func__, env, n_IRQ, level, - env->pending_interrupts, CPU(cpu)->interrupt_request); + trace_ppc_irq_set_exit(env, n_IRQ, level, env->pending_interrupts, + CPU(cpu)->interrupt_request); if (locked) { qemu_mutex_unlock_iothread(); @@ -102,8 +85,8 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { @@ -112,8 +95,7 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) switch (pin) { case PPC6xx_INPUT_TBEN: /* Level sensitive - active high */ - LOG_IRQ("%s: %s the time base\n", - __func__, level ? 
"start" : "stop"); + trace_ppc_irq_set_state("time base", level); if (level) { cpu_ppc_tb_start(env); } else { @@ -122,14 +104,12 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) break; case PPC6xx_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPC6xx_INPUT_SMI: /* Level sensitive - active high */ - LOG_IRQ("%s: set the SMI IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("SMI IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_SMI, level); break; case PPC6xx_INPUT_MCP: @@ -138,8 +118,7 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) * 603/604/740/750: check HID0[EMCP] */ if (cur_level == 1 && level == 0) { - LOG_IRQ("%s: raise machine check state\n", - __func__); + trace_ppc_irq_set_state("machine check", 1); ppc_set_irq(cpu, PPC_INTERRUPT_MCK, 1); } break; @@ -148,26 +127,23 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) /* XXX: TODO: relay the signal to CKSTP_OUT pin */ /* XXX: Note that the only way to restart the CPU is to reset it */ if (level) { - LOG_IRQ("%s: stop the CPU\n", __func__); + trace_ppc_irq_cpu("stop"); cs->halted = 1; } break; case PPC6xx_INPUT_HRESET: /* Level sensitive - active low */ if (level) { - LOG_IRQ("%s: reset the CPU\n", __func__); + trace_ppc_irq_reset("CPU"); cpu_interrupt(cs, CPU_INTERRUPT_RESET); } break; case PPC6xx_INPUT_SRESET: - LOG_IRQ("%s: set the RESET IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("RESET IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_RESET, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -192,8 +168,8 @@ static void ppc970_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - 
LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { @@ -202,14 +178,12 @@ static void ppc970_set_irq(void *opaque, int pin, int level) switch (pin) { case PPC970_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPC970_INPUT_THINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the SMI IRQ state to %d\n", __func__, - level); + trace_ppc_irq_set_state("SMI IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_THERM, level); break; case PPC970_INPUT_MCP: @@ -218,8 +192,7 @@ static void ppc970_set_irq(void *opaque, int pin, int level) * 603/604/740/750: check HID0[EMCP] */ if (cur_level == 1 && level == 0) { - LOG_IRQ("%s: raise machine check state\n", - __func__); + trace_ppc_irq_set_state("machine check", 1); ppc_set_irq(cpu, PPC_INTERRUPT_MCK, 1); } break; @@ -227,10 +200,10 @@ static void ppc970_set_irq(void *opaque, int pin, int level) /* Level sensitive - active low */ /* XXX: TODO: relay the signal to CKSTP_OUT pin */ if (level) { - LOG_IRQ("%s: stop the CPU\n", __func__); + trace_ppc_irq_cpu("stop"); cs->halted = 1; } else { - LOG_IRQ("%s: restart the CPU\n", __func__); + trace_ppc_irq_cpu("restart"); cs->halted = 0; qemu_cpu_kick(cs); } @@ -242,19 +215,15 @@ static void ppc970_set_irq(void *opaque, int pin, int level) } break; case PPC970_INPUT_SRESET: - LOG_IRQ("%s: set the RESET IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("RESET IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_RESET, level); break; case PPC970_INPUT_TBEN: - LOG_IRQ("%s: set the TBEN state to %d\n", __func__, - level); + trace_ppc_irq_set_state("TBEN IRQ", level); /* XXX: TODO */ break; 
default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -276,20 +245,16 @@ static void power7_set_irq(void *opaque, int pin, int level) { PowerPCCPU *cpu = opaque; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - &cpu->env, pin, level); + trace_ppc_irq_set(&cpu->env, pin, level); switch (pin) { case POWER7_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } } @@ -306,25 +271,21 @@ static void power9_set_irq(void *opaque, int pin, int level) { PowerPCCPU *cpu = opaque; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - &cpu->env, pin, level); + trace_ppc_irq_set(&cpu->env, pin, level); switch (pin) { case POWER9_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case POWER9_INPUT_HINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("HV external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_HVIRT, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); + g_assert_not_reached(); return; } } @@ -401,8 +362,8 @@ static void ppc40x_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level 
== 0) || (cur_level == 0 && level != 0)) { @@ -411,57 +372,51 @@ static void ppc40x_set_irq(void *opaque, int pin, int level) switch (pin) { case PPC40x_INPUT_RESET_SYS: if (level) { - LOG_IRQ("%s: reset the PowerPC system\n", - __func__); + trace_ppc_irq_reset("system"); ppc40x_system_reset(cpu); } break; case PPC40x_INPUT_RESET_CHIP: if (level) { - LOG_IRQ("%s: reset the PowerPC chip\n", __func__); + trace_ppc_irq_reset("chip"); ppc40x_chip_reset(cpu); } break; case PPC40x_INPUT_RESET_CORE: /* XXX: TODO: update DBSR[MRR] */ if (level) { - LOG_IRQ("%s: reset the PowerPC core\n", __func__); + trace_ppc_irq_reset("core"); ppc40x_core_reset(cpu); } break; case PPC40x_INPUT_CINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the critical IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("critical IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_CEXT, level); break; case PPC40x_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPC40x_INPUT_HALT: /* Level sensitive - active low */ if (level) { - LOG_IRQ("%s: stop the CPU\n", __func__); + trace_ppc_irq_cpu("stop"); cs->halted = 1; } else { - LOG_IRQ("%s: restart the CPU\n", __func__); + trace_ppc_irq_cpu("restart"); cs->halted = 0; qemu_cpu_kick(cs); } break; case PPC40x_INPUT_DEBUG: /* Level sensitive - active high */ - LOG_IRQ("%s: set the debug pin state to %d\n", - __func__, level); + trace_ppc_irq_set_state("debug pin", level); ppc_set_irq(cpu, PPC_INTERRUPT_DEBUG, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -485,47 +440,41 @@ static void ppce500_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", 
__func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { switch (pin) { case PPCE500_INPUT_MCK: if (level) { - LOG_IRQ("%s: reset the PowerPC system\n", - __func__); + trace_ppc_irq_reset("system"); qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); } break; case PPCE500_INPUT_RESET_CORE: if (level) { - LOG_IRQ("%s: reset the PowerPC core\n", __func__); + trace_ppc_irq_reset("core"); ppc_set_irq(cpu, PPC_INTERRUPT_MCK, level); } break; case PPCE500_INPUT_CINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the critical IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("critical IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_CEXT, level); break; case PPCE500_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the core IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("core IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPCE500_INPUT_DEBUG: /* Level sensitive - active high */ - LOG_IRQ("%s: set the debug pin state to %d\n", - __func__, level); + trace_ppc_irq_set_state("debug pin", level); ppc_set_irq(cpu, PPC_INTERRUPT_DEBUG, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -576,7 +525,7 @@ uint64_t cpu_ppc_load_tbl (CPUPPCState *env) } tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb; } @@ -587,7 +536,7 @@ static inline uint32_t _cpu_ppc_load_tbu(CPUPPCState *env) uint64_t tb; tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb >> 32; } 
@@ -607,8 +556,7 @@ static inline void cpu_ppc_store_tb(ppc_tb_t *tb_env, uint64_t vmclk, *tb_offsetp = value - muldiv64(vmclk, tb_env->tb_freq, NANOSECONDS_PER_SECOND); - LOG_TB("%s: tb %016" PRIx64 " offset %08" PRIx64 "\n", - __func__, value, *tb_offsetp); + trace_ppc_tb_store(value, *tb_offsetp); } void cpu_ppc_store_tbl (CPUPPCState *env, uint32_t value) @@ -644,7 +592,7 @@ uint64_t cpu_ppc_load_atbl (CPUPPCState *env) uint64_t tb; tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb; } @@ -655,7 +603,7 @@ uint32_t cpu_ppc_load_atbu (CPUPPCState *env) uint64_t tb; tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb >> 32; } @@ -774,7 +722,7 @@ static inline int64_t _cpu_ppc_load_decr(CPUPPCState *env, uint64_t next) } else { decr = -muldiv64(-diff, tb_env->decr_freq, NANOSECONDS_PER_SECOND); } - LOG_TB("%s: %016" PRIx64 "\n", __func__, decr); + trace_ppc_decr_load(decr); return decr; } @@ -833,7 +781,7 @@ uint64_t cpu_ppc_load_purr (CPUPPCState *env) static inline void cpu_ppc_decr_excp(PowerPCCPU *cpu) { /* Raise it */ - LOG_TB("raise decrementer exception\n"); + trace_ppc_decr_excp("raise"); ppc_set_irq(cpu, PPC_INTERRUPT_DECR, 1); } @@ -847,7 +795,7 @@ static inline void cpu_ppc_hdecr_excp(PowerPCCPU *cpu) CPUPPCState *env = &cpu->env; /* Raise it */ - LOG_TB("raise hv decrementer exception\n"); + trace_ppc_decr_excp("raise HV"); /* The architecture specifies that we don't deliver HDEC * interrupts in a PM state. 
Not only they don't cause a @@ -873,17 +821,14 @@ static void __cpu_ppc_store_decr(PowerPCCPU *cpu, uint64_t *nextp, CPUPPCState *env = &cpu->env; ppc_tb_t *tb_env = env->tb_env; uint64_t now, next; - bool negative; + int64_t signed_value; + int64_t signed_decr; /* Truncate value to decr_width and sign extend for simplicity */ - value &= ((1ULL << nr_bits) - 1); - negative = !!(value & (1ULL << (nr_bits - 1))); - if (negative) { - value |= (0xFFFFFFFFULL << nr_bits); - } + signed_value = sextract64(value, 0, nr_bits); + signed_decr = sextract64(decr, 0, nr_bits); - LOG_TB("%s: " TARGET_FMT_lx " => " TARGET_FMT_lx "\n", __func__, - decr, value); + trace_ppc_decr_store(nr_bits, decr, value); if (kvm_enabled()) { /* KVM handles decrementer exceptions, we don't need our own timer */ @@ -903,16 +848,16 @@ static void __cpu_ppc_store_decr(PowerPCCPU *cpu, uint64_t *nextp, * On MSB edge based DEC implementations the MSB going from 0 -> 1 triggers * an edge interrupt, so raise it here too. */ - if ((value < 3) || - ((tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL) && negative) || - ((tb_env->flags & PPC_DECR_UNDERFLOW_TRIGGERED) && negative - && !(decr & (1ULL << (nr_bits - 1))))) { + if ((signed_value < 3) || + ((tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL) && signed_value < 0) || + ((tb_env->flags & PPC_DECR_UNDERFLOW_TRIGGERED) && signed_value < 0 + && signed_decr >= 0)) { (*raise_excp)(cpu); return; } /* On MSB level based systems a 0 for the MSB stops interrupt delivery */ - if (!negative && (tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL)) { + if (signed_value >= 0 && (tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL)) { (*lower_excp)(cpu); } @@ -1211,9 +1156,8 @@ static void cpu_4xx_fit_cb (void *opaque) if ((env->spr[SPR_40x_TCR] >> 23) & 0x1) { ppc_set_irq(cpu, PPC_INTERRUPT_FIT, 1); } - LOG_TB("%s: ir %d TCR " TARGET_FMT_lx " TSR " TARGET_FMT_lx "\n", __func__, - (int)((env->spr[SPR_40x_TCR] >> 23) & 0x1), - env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); + 
trace_ppc4xx_fit((int)((env->spr[SPR_40x_TCR] >> 23) & 0x1), + env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); } /* Programmable interval timer */ @@ -1227,11 +1171,10 @@ static void start_stop_pit (CPUPPCState *env, ppc_tb_t *tb_env, int is_excp) !((env->spr[SPR_40x_TCR] >> 26) & 0x1) || (is_excp && !((env->spr[SPR_40x_TCR] >> 22) & 0x1))) { /* Stop PIT */ - LOG_TB("%s: stop PIT\n", __func__); + trace_ppc4xx_pit_stop(); timer_del(tb_env->decr_timer); } else { - LOG_TB("%s: start PIT %016" PRIx64 "\n", - __func__, ppc40x_timer->pit_reload); + trace_ppc4xx_pit_start(ppc40x_timer->pit_reload); now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); next = now + muldiv64(ppc40x_timer->pit_reload, NANOSECONDS_PER_SECOND, tb_env->decr_freq); @@ -1260,9 +1203,7 @@ static void cpu_4xx_pit_cb (void *opaque) ppc_set_irq(cpu, ppc40x_timer->decr_excp, 1); } start_stop_pit(env, tb_env, 1); - LOG_TB("%s: ar %d ir %d TCR " TARGET_FMT_lx " TSR " TARGET_FMT_lx " " - "%016" PRIx64 "\n", __func__, - (int)((env->spr[SPR_40x_TCR] >> 22) & 0x1), + trace_ppc4xx_pit((int)((env->spr[SPR_40x_TCR] >> 22) & 0x1), (int)((env->spr[SPR_40x_TCR] >> 26) & 0x1), env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR], ppc40x_timer->pit_reload); @@ -1302,8 +1243,7 @@ static void cpu_4xx_wdt_cb (void *opaque) next = now + muldiv64(next, NANOSECONDS_PER_SECOND, tb_env->decr_freq); if (next == now) next++; - LOG_TB("%s: TCR " TARGET_FMT_lx " TSR " TARGET_FMT_lx "\n", __func__, - env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); + trace_ppc4xx_wdt(env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); switch ((env->spr[SPR_40x_TSR] >> 30) & 0x3) { case 0x0: case 0x1: @@ -1346,7 +1286,7 @@ void store_40x_pit (CPUPPCState *env, target_ulong val) tb_env = env->tb_env; ppc40x_timer = tb_env->opaque; - LOG_TB("%s val" TARGET_FMT_lx "\n", __func__, val); + trace_ppc40x_store_pit(val); ppc40x_timer->pit_reload = val; start_stop_pit(env, tb_env, 0); } @@ -1361,8 +1301,7 @@ static void ppc_40x_set_tb_clk (void *opaque, uint32_t freq) 
CPUPPCState *env = opaque; ppc_tb_t *tb_env = env->tb_env; - LOG_TB("%s set new frequency to %" PRIu32 "\n", __func__, - freq); + trace_ppc40x_set_tb_clk(freq); tb_env->tb_freq = freq; tb_env->decr_freq = freq; /* XXX: we should also update all timers */ @@ -1381,7 +1320,7 @@ clk_setup_cb ppc_40x_timers_init (CPUPPCState *env, uint32_t freq, tb_env->tb_freq = freq; tb_env->decr_freq = freq; tb_env->opaque = ppc40x_timer; - LOG_TB("%s freq %" PRIu32 "\n", __func__, freq); + trace_ppc40x_timers_init(freq); if (ppc40x_timer != NULL) { /* We use decr timer for PIT */ tb_env->decr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_4xx_pit_cb, env); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d39fd4e644..b7bee5f4ff 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -29,6 +29,7 @@ #include "qemu/datadir.h" #include "qapi/error.h" #include "qapi/qapi-events-machine.h" +#include "qapi/qapi-events-qdev.h" #include "qapi/visitor.h" #include "sysemu/sysemu.h" #include "sysemu/hostmem.h" @@ -2752,6 +2753,11 @@ static void spapr_machine_init(MachineState *machine) spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY); + /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */ + if (!smc->pre_6_2_numa_affinity) { + spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY); + } + /* advertise support for dedicated HP event source to guests */ if (spapr->use_hotplug_event_source) { spapr_ovec_set(spapr->ov5, OV5_HP_EVT); @@ -2773,39 +2779,6 @@ static void spapr_machine_init(MachineState *machine) /* init CPUs */ spapr_init_cpus(spapr); - /* - * check we don't have a memory-less/cpu-less NUMA node - * Firmware relies on the existing memory/cpu topology to provide the - * NUMA topology to the kernel. - * And the linux kernel needs to know the NUMA topology at start - * to be able to hotplug CPUs later. 
- */ - if (machine->numa_state->num_nodes) { - for (i = 0; i < machine->numa_state->num_nodes; ++i) { - /* check for memory-less node */ - if (machine->numa_state->nodes[i].node_mem == 0) { - CPUState *cs; - int found = 0; - /* check for cpu-less node */ - CPU_FOREACH(cs) { - PowerPCCPU *cpu = POWERPC_CPU(cs); - if (cpu->node_id == i) { - found = 1; - break; - } - } - /* memory-less and cpu-less node */ - if (!found) { - error_report( - "Memory-less/cpu-less nodes are not supported (node %d)", - i); - exit(1); - } - } - } - - } - spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine); /* Init numa_assoc_array */ @@ -3686,11 +3659,18 @@ void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev) /* * Tell QAPI that something happened and the memory - * hotunplug wasn't successful. + * hotunplug wasn't successful. Keep sending + * MEM_UNPLUG_ERROR even while sending + * DEVICE_UNPLUG_GUEST_ERROR until the deprecation of + * MEM_UNPLUG_ERROR is due. */ qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest " "for device %s", dev->id); - qapi_event_send_mem_unplug_error(dev->id, qapi_error); + + qapi_event_send_mem_unplug_error(dev->id ? : "", qapi_error); + + qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id, + dev->canonical_path); } /* Callback to be called during DRC release. 
*/ @@ -4700,8 +4680,11 @@ DEFINE_SPAPR_MACHINE(6_2, "6.2", true); */ static void spapr_machine_6_1_class_options(MachineClass *mc) { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + spapr_machine_6_2_class_options(mc); compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len); + smc->pre_6_2_numa_affinity = true; } DEFINE_SPAPR_MACHINE(6_1, "6.1", false); diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 4f316a6f9d..58e7341cb7 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -382,6 +382,7 @@ static const TypeInfo spapr_cpu_core_type_infos[] = { DEFINE_SPAPR_CPU_CORE_TYPE("power9_v1.0"), DEFINE_SPAPR_CPU_CORE_TYPE("power9_v2.0"), DEFINE_SPAPR_CPU_CORE_TYPE("power10_v1.0"), + DEFINE_SPAPR_CPU_CORE_TYPE("power10_v2.0"), #ifdef CONFIG_KVM DEFINE_SPAPR_CPU_CORE_TYPE("host"), #endif diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index a2f2634601..f8ac0a10df 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -17,6 +17,8 @@ #include "hw/ppc/spapr_drc.h" #include "qom/object.h" #include "migration/vmstate.h" +#include "qapi/error.h" +#include "qapi/qapi-events-qdev.h" #include "qapi/visitor.h" #include "qemu/error-report.h" #include "hw/ppc/spapr.h" /* for RTAS return codes */ @@ -167,13 +169,15 @@ static uint32_t drc_unisolate_logical(SpaprDrc *drc) } drc->unplug_requested = false; - error_report("Device hotunplug rejected by the guest " - "for device %s", drc->dev->id); - /* - * TODO: send a QAPI DEVICE_UNPLUG_ERROR event when - * it is implemented. 
- */ + if (drc->dev->id) { + error_report("Device hotunplug rejected by the guest " + "for device %s", drc->dev->id); + } + + qapi_event_send_device_unplug_guest_error(!!drc->dev->id, + drc->dev->id, + drc->dev->canonical_path); } return RTAS_OUT_SUCCESS; /* Nothing to do */ diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 0e9a5b2e40..222c1b6bbd 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -17,6 +17,7 @@ #include "kvm_ppc.h" #include "hw/ppc/fdt.h" #include "hw/ppc/spapr_ovec.h" +#include "hw/ppc/spapr_numa.h" #include "mmu-book3s-v3.h" #include "hw/mem/memory-device.h" @@ -1198,6 +1199,12 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, spapr_ovec_cleanup(ov1_guest); /* + * Check for NUMA affinity conditions now that we know which NUMA + * affinity the guest will use. + */ + spapr_numa_associativity_check(spapr); + + /* * Ensure the guest asks for an interrupt mode we support; * otherwise terminate the boot. */ diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 779f18b994..5822938448 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -19,13 +19,51 @@ /* Moved from hw/ppc/spapr_pci_nvlink2.c */ #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) -static bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr) +/* + * Retrieves max_dist_ref_points of the current NUMA affinity. + */ +static int get_max_dist_ref_points(SpaprMachineState *spapr) { - MachineState *machine = MACHINE(spapr); - SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return FORM2_DIST_REF_POINTS; + } + + return FORM1_DIST_REF_POINTS; +} + +/* + * Retrieves numa_assoc_size of the current NUMA affinity. 
+ */ +static int get_numa_assoc_size(SpaprMachineState *spapr) +{ + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return FORM2_NUMA_ASSOC_SIZE; + } + + return FORM1_NUMA_ASSOC_SIZE; +} + +/* + * Retrieves vcpu_assoc_size of the current NUMA affinity. + * + * vcpu_assoc_size is the size of ibm,associativity array + * for CPUs, which has an extra element (vcpu_id) in the end. + */ +static int get_vcpu_assoc_size(SpaprMachineState *spapr) +{ + return get_numa_assoc_size(spapr) + 1; +} - return smc->pre_5_2_numa_associativity || - machine->numa_state->num_nodes <= 1; +/* + * Retrieves the ibm,associativity array of NUMA node 'node_id' + * for the current NUMA affinity. + */ +static const uint32_t *get_associativity(SpaprMachineState *spapr, int node_id) +{ + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return spapr->FORM2_assoc_array[node_id]; + } + return spapr->FORM1_assoc_array[node_id]; } static bool spapr_numa_is_symmetrical(MachineState *ms) @@ -92,12 +130,23 @@ static uint8_t spapr_numa_get_numa_level(uint8_t distance) return 0; } -static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) +static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) { MachineState *ms = MACHINE(spapr); NodeInfo *numa_info = ms->numa_state->nodes; int nb_numa_nodes = ms->numa_state->num_nodes; - int src, dst, i; + int src, dst, i, j; + + /* + * Fill all associativity domains of non-zero NUMA nodes with + * node_id. This is required because the default value (0) is + * considered a match with associativity domains of node 0. 
+ */ + for (i = 1; i < nb_numa_nodes; i++) { + for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { + spapr->FORM1_assoc_array[i][j] = cpu_to_be32(i); + } + } for (src = 0; src < nb_numa_nodes; src++) { for (dst = src; dst < nb_numa_nodes; dst++) { @@ -132,7 +181,7 @@ static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) * * The Linux kernel will assume that the distance between src and * dst, in this case of no match, is 10 (local distance) doubled - * for each NUMA it didn't match. We have MAX_DISTANCE_REF_POINTS + * for each NUMA it didn't match. We have FORM1_DIST_REF_POINTS * levels (4), so this gives us 10*2*2*2*2 = 160. * * This logic can be seen in the Linux kernel source code, as of @@ -147,25 +196,69 @@ static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) * and going up to 0x1. */ for (i = n_level; i > 0; i--) { - assoc_src = spapr->numa_assoc_array[src][i]; - spapr->numa_assoc_array[dst][i] = assoc_src; + assoc_src = spapr->FORM1_assoc_array[src][i]; + spapr->FORM1_assoc_array[dst][i] = assoc_src; } } } } -void spapr_numa_associativity_init(SpaprMachineState *spapr, - MachineState *machine) +static void spapr_numa_FORM1_affinity_check(MachineState *machine) +{ + int i; + + /* + * Check we don't have a memory-less/cpu-less NUMA node + * Firmware relies on the existing memory/cpu topology to provide the + * NUMA topology to the kernel. + * And the linux kernel needs to know the NUMA topology at start + * to be able to hotplug CPUs later. 
+ */ + if (machine->numa_state->num_nodes) { + for (i = 0; i < machine->numa_state->num_nodes; ++i) { + /* check for memory-less node */ + if (machine->numa_state->nodes[i].node_mem == 0) { + CPUState *cs; + int found = 0; + /* check for cpu-less node */ + CPU_FOREACH(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + if (cpu->node_id == i) { + found = 1; + break; + } + } + /* memory-less and cpu-less node */ + if (!found) { + error_report( +"Memory-less/cpu-less nodes are not supported with FORM1 NUMA (node %d)", i); + exit(EXIT_FAILURE); + } + } + } + } + + if (!spapr_numa_is_symmetrical(machine)) { + error_report( +"Asymmetrical NUMA topologies aren't supported in the pSeries machine using FORM1 NUMA"); + exit(EXIT_FAILURE); + } +} + +/* + * Set NUMA machine state data based on FORM1 affinity semantics. + */ +static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, + MachineState *machine) { SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; int i, j, max_nodes_with_gpus; - bool using_legacy_numa = spapr_machine_using_legacy_numa(spapr); /* * For all associativity arrays: first position is the size, - * position MAX_DISTANCE_REF_POINTS is always the numa_id, + * position FORM1_DIST_REF_POINTS is always the numa_id, * represented by the index 'i'. * * This will break on sparse NUMA setups, when/if QEMU starts @@ -173,19 +266,8 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, * 'i' will be a valid node_id set by the user. */ for (i = 0; i < nb_numa_nodes; i++) { - spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); - spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); - - /* - * Fill all associativity domains of non-zero NUMA nodes with - * node_id. This is required because the default value (0) is - * considered a match with associativity domains of node 0. 
- */ - if (!using_legacy_numa && i != 0) { - for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { - spapr->numa_assoc_array[i][j] = cpu_to_be32(i); - } - } + spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); + spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); } /* @@ -199,47 +281,95 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM; for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) { - spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); - for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ? SPAPR_GPU_NUMA_ID : cpu_to_be32(i); - spapr->numa_assoc_array[i][j] = gpu_assoc; + spapr->FORM1_assoc_array[i][j] = gpu_assoc; } - spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); } /* - * Legacy NUMA guests (pseries-5.1 and older, or guests with only - * 1 NUMA node) will not benefit from anything we're going to do - * after this point. + * Guests pseries-5.1 and older uses zeroed associativity domains, + * i.e. no domain definition based on NUMA distance input. + * + * Same thing with guests that have only one NUMA node. 
*/ - if (using_legacy_numa) { + if (smc->pre_5_2_numa_associativity || + machine->numa_state->num_nodes <= 1) { return; } - if (!spapr_numa_is_symmetrical(machine)) { - error_report("Asymmetrical NUMA topologies aren't supported " - "in the pSeries machine"); - exit(EXIT_FAILURE); + spapr_numa_define_FORM1_domains(spapr); +} + +/* + * Init NUMA FORM2 machine state data + */ +static void spapr_numa_FORM2_affinity_init(SpaprMachineState *spapr) +{ + int i; + + /* + * For all resources but CPUs, FORM2 associativity arrays will + * be a size 2 array with the following format: + * + * ibm,associativity = {1, numa_id} + * + * CPUs will write an additional 'vcpu_id' on top of the arrays + * being initialized here. 'numa_id' is represented by the + * index 'i' of the loop. + * + * Given that this initialization is also valid for GPU associativity + * arrays, handle everything in one single step by populating the + * arrays up to NUMA_NODES_MAX_NUM. + */ + for (i = 0; i < NUMA_NODES_MAX_NUM; i++) { + spapr->FORM2_assoc_array[i][0] = cpu_to_be32(1); + spapr->FORM2_assoc_array[i][1] = cpu_to_be32(i); } +} - spapr_numa_define_associativity_domains(spapr); +void spapr_numa_associativity_init(SpaprMachineState *spapr, + MachineState *machine) +{ + spapr_numa_FORM1_affinity_init(spapr, machine); + spapr_numa_FORM2_affinity_init(spapr); +} + +void spapr_numa_associativity_check(SpaprMachineState *spapr) +{ + /* + * FORM2 does not have any restrictions we need to handle + * at CAS time, for now. 
+ */ + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return; + } + + spapr_numa_FORM1_affinity_check(MACHINE(spapr)); } void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, int offset, int nodeid) { + const uint32_t *associativity = get_associativity(spapr, nodeid); + _FDT((fdt_setprop(fdt, offset, "ibm,associativity", - spapr->numa_assoc_array[nodeid], - sizeof(spapr->numa_assoc_array[nodeid])))); + associativity, + get_numa_assoc_size(spapr) * sizeof(uint32_t)))); } static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, PowerPCCPU *cpu) { - uint32_t *vcpu_assoc = g_new(uint32_t, VCPU_ASSOC_SIZE); + const uint32_t *associativity = get_associativity(spapr, cpu->node_id); + int max_distance_ref_points = get_max_dist_ref_points(spapr); + int vcpu_assoc_size = get_vcpu_assoc_size(spapr); + uint32_t *vcpu_assoc = g_new(uint32_t, vcpu_assoc_size); int index = spapr_get_vcpu_id(cpu); /* @@ -248,10 +378,10 @@ static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, * 0, put cpu_id last, then copy the remaining associativity * domains. 
*/ - vcpu_assoc[0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS + 1); - vcpu_assoc[VCPU_ASSOC_SIZE - 1] = cpu_to_be32(index); - memcpy(vcpu_assoc + 1, spapr->numa_assoc_array[cpu->node_id] + 1, - (VCPU_ASSOC_SIZE - 2) * sizeof(uint32_t)); + vcpu_assoc[0] = cpu_to_be32(max_distance_ref_points + 1); + vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index); + memcpy(vcpu_assoc + 1, associativity + 1, + (vcpu_assoc_size - 2) * sizeof(uint32_t)); return vcpu_assoc; } @@ -260,12 +390,13 @@ int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, int offset, PowerPCCPU *cpu) { g_autofree uint32_t *vcpu_assoc = NULL; + int vcpu_assoc_size = get_vcpu_assoc_size(spapr); vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu); /* Advertise NUMA via ibm,associativity */ return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc, - VCPU_ASSOC_SIZE * sizeof(uint32_t)); + vcpu_assoc_size * sizeof(uint32_t)); } @@ -273,27 +404,28 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, int offset) { MachineState *machine = MACHINE(spapr); + int max_distance_ref_points = get_max_dist_ref_points(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1; uint32_t *int_buf, *cur_index, buf_len; int ret, i; /* ibm,associativity-lookup-arrays */ - buf_len = (nr_nodes * MAX_DISTANCE_REF_POINTS + 2) * sizeof(uint32_t); + buf_len = (nr_nodes * max_distance_ref_points + 2) * sizeof(uint32_t); cur_index = int_buf = g_malloc0(buf_len); int_buf[0] = cpu_to_be32(nr_nodes); /* Number of entries per associativity list */ - int_buf[1] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + int_buf[1] = cpu_to_be32(max_distance_ref_points); cur_index += 2; for (i = 0; i < nr_nodes; i++) { /* - * For the lookup-array we use the ibm,associativity array, - * from numa_assoc_array. without the first element (size). + * For the lookup-array we use the ibm,associativity array of the + * current NUMA affinity, without the first element (size). 
*/ - uint32_t *associativity = spapr->numa_assoc_array[i]; + const uint32_t *associativity = get_associativity(spapr, i); memcpy(cur_index, ++associativity, - sizeof(uint32_t) * MAX_DISTANCE_REF_POINTS); - cur_index += MAX_DISTANCE_REF_POINTS; + sizeof(uint32_t) * max_distance_ref_points); + cur_index += max_distance_ref_points; } ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf, (cur_index - int_buf) * sizeof(uint32_t)); @@ -302,12 +434,8 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, return ret; } -/* - * Helper that writes ibm,associativity-reference-points and - * max-associativity-domains in the RTAS pointed by @rtas - * in the DT @fdt. - */ -void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) +static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr, + void *fdt, int rtas) { MachineState *ms = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); @@ -329,7 +457,8 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) cpu_to_be32(maxdomain) }; - if (spapr_machine_using_legacy_numa(spapr)) { + if (smc->pre_5_2_numa_associativity || + ms->numa_state->num_nodes <= 1) { uint32_t legacy_refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4), @@ -365,6 +494,125 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) maxdomains, sizeof(maxdomains))); } +static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, + void *fdt, int rtas) +{ + MachineState *ms = MACHINE(spapr); + NodeInfo *numa_info = ms->numa_state->nodes; + int nb_numa_nodes = ms->numa_state->num_nodes; + int distance_table_entries = nb_numa_nodes * nb_numa_nodes; + g_autofree uint32_t *lookup_index_table = NULL; + g_autofree uint8_t *distance_table = NULL; + int src, dst, i, distance_table_size; + + /* + * ibm,numa-lookup-index-table: array with length and a + * list of NUMA ids present in the guest. 
+ */ + lookup_index_table = g_new0(uint32_t, nb_numa_nodes + 1); + lookup_index_table[0] = cpu_to_be32(nb_numa_nodes); + + for (i = 0; i < nb_numa_nodes; i++) { + lookup_index_table[i + 1] = cpu_to_be32(i); + } + + _FDT(fdt_setprop(fdt, rtas, "ibm,numa-lookup-index-table", + lookup_index_table, + (nb_numa_nodes + 1) * sizeof(uint32_t))); + + /* + * ibm,numa-distance-table: contains all node distances. First + * element is the size of the table as uint32, followed up + * by all the uint8 distances from the first NUMA node, then all + * distances from the second NUMA node and so on. + * + * ibm,numa-lookup-index-table is used by guest to navigate this + * array because NUMA ids can be sparse (node 0 is the first, + * node 8 is the second ...). + */ + distance_table_size = distance_table_entries * sizeof(uint8_t) + + sizeof(uint32_t); + distance_table = g_new0(uint8_t, distance_table_size); + stl_be_p(distance_table, distance_table_entries); + + /* Skip the uint32_t array length at the start */ + i = sizeof(uint32_t); + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = 0; dst < nb_numa_nodes; dst++) { + /* + * We need to be explicit with the local distance + * value to cover the case where the user didn't added any + * NUMA nodes, but QEMU adds the default NUMA node without + * adding the numa_info to retrieve distance info from. + */ + if (src == dst) { + distance_table[i++] = NUMA_DISTANCE_MIN; + continue; + } + + distance_table[i++] = numa_info[src].distance[dst]; + } + } + + _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table", + distance_table, distance_table_size)); +} + +/* + * This helper could be compressed in a single function with + * FORM1 logic since we're setting the same DT values, with the + * difference being a call to spapr_numa_FORM2_write_rtas_tables() + * in the end. The separation was made to avoid clogging FORM1 code + * which already has to deal with compat modes from previous + * QEMU machine types. 
+ */ +static void spapr_numa_FORM2_write_rtas_dt(SpaprMachineState *spapr, + void *fdt, int rtas) +{ + MachineState *ms = MACHINE(spapr); + uint32_t number_nvgpus_nodes = spapr->gpu_numa_id - + spapr_numa_initial_nvgpu_numa_id(ms); + + /* + * In FORM2, ibm,associativity-reference-points will point to + * the element in the ibm,associativity array that contains the + * primary domain index (for FORM2, the first element). + * + * This value (in our case, the numa-id) is then used as an index + * to retrieve all other attributes of the node (distance, + * bandwidth, latency) via ibm,numa-lookup-index-table and other + * ibm,numa-*-table properties. + */ + uint32_t refpoints[] = { cpu_to_be32(1) }; + + uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes; + uint32_t maxdomains[] = { cpu_to_be32(1), cpu_to_be32(maxdomain) }; + + _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", + refpoints, sizeof(refpoints))); + + _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", + maxdomains, sizeof(maxdomains))); + + spapr_numa_FORM2_write_rtas_tables(spapr, fdt, rtas); +} + +/* + * Helper that writes ibm,associativity-reference-points and + * max-associativity-domains in the RTAS pointed by @rtas + * in the DT @fdt. 
+ */ +void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) +{ + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + spapr_numa_FORM2_write_rtas_dt(spapr, fdt, rtas); + return; + } + + spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas); +} + static target_ulong h_home_node_associativity(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, @@ -375,6 +623,7 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, target_ulong procno = args[1]; PowerPCCPU *tcpu; int idx, assoc_idx; + int vcpu_assoc_size = get_vcpu_assoc_size(spapr); /* only support procno from H_REGISTER_VPA */ if (flags != 0x1) { @@ -393,7 +642,7 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, * 12 associativity domains for vcpus. Assert and bail if that's * not the case. */ - G_STATIC_ASSERT((VCPU_ASSOC_SIZE - 1) <= 12); + g_assert((vcpu_assoc_size - 1) <= 12); vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu); /* assoc_idx starts at 1 to skip associativity size */ @@ -414,9 +663,9 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, * macro. The ternary will fill the remaining registers with -1 * after we went through vcpu_assoc[]. */ - a = assoc_idx < VCPU_ASSOC_SIZE ? + a = assoc_idx < vcpu_assoc_size ? be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; - b = assoc_idx < VCPU_ASSOC_SIZE ? + b = assoc_idx < vcpu_assoc_size ? 
be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; args[idx] = ASSOCIATIVITY(a, b); diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index da6e74b80d..3bf43fa340 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -97,7 +97,27 @@ vof_claimed(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx6 # ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" - +ppc_tb_load(uint64_t tb) "tb 0x%016" PRIx64 +ppc_tb_store(uint64_t tb, uint64_t offset) "tb 0x%016" PRIx64 " offset 0x%08" PRIx64 + +ppc_decr_load(uint64_t tb) "decr 0x%016" PRIx64 +ppc_decr_excp(const char *action) "%s decrementer" +ppc_decr_store(uint32_t nr_bits, uint64_t decr, uint64_t value) "%d-bit 0x%016" PRIx64 " => 0x%016" PRIx64 + +ppc4xx_fit(uint32_t ir, uint64_t tcr, uint64_t tsr) "ir %d TCR 0x%" PRIx64 " TSR 0x%" PRIx64 +ppc4xx_pit_stop(void) "" +ppc4xx_pit_start(uint64_t reload) "PIT 0x%016" PRIx64 +ppc4xx_pit(uint32_t ar, uint32_t ir, uint64_t tcr, uint64_t tsr, uint64_t reload) "ar %d ir %d TCR 0x%" PRIx64 " TSR 0x%" PRIx64 " PIT 0x%016" PRIx64 +ppc4xx_wdt(uint64_t tcr, uint64_t tsr) "TCR 0x%" PRIx64 " TSR 0x%" PRIx64 +ppc40x_store_pit(uint64_t value) "val 0x%" PRIx64 +ppc40x_set_tb_clk(uint32_t value) "new frequency %" PRIu32 +ppc40x_timers_init(uint32_t value) "frequency %" PRIu32 + +ppc_irq_set(void *env, uint32_t pin, uint32_t level) "env [%p] pin %d level %d" +ppc_irq_set_exit(void *env, uint32_t n_IRQ, uint32_t level, uint32_t pending, uint32_t request) "env [%p] n_IRQ %d level %d => pending 0x%08" PRIx32 " req 0x%08" PRIx32 +ppc_irq_set_state(const char *name, uint32_t level) "\"%s\" level %d" +ppc_irq_reset(const char *name) "%s" +ppc_irq_cpu(const char *action) "%s" # prep_systemio.c prep_systemio_read(uint32_t addr, uint32_t val) "read addr=0x%x val=0x%x" diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c index 901dbf1357..882c9b4854 
100644 --- a/hw/remote/proxy-memory-listener.c +++ b/hw/remote/proxy-memory-listener.c @@ -219,6 +219,7 @@ void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener, proxy_listener->listener.region_add = proxy_memory_listener_region_addnop; proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop; proxy_listener->listener.priority = 10; + proxy_listener->listener.name = "proxy"; memory_listener_register(&proxy_listener->listener, &address_space_memory); diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 8728d4d5c2..a784b219e6 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -562,6 +562,7 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) { return (!memory_region_is_ram(section->mr) && !memory_region_is_iommu(section->mr)) || + memory_region_is_protected(section->mr) || /* * Sizing an enabled 64-bit BAR can cause spurious mappings to * addresses in the upper part of the 64-bit address space. These @@ -1434,6 +1435,7 @@ static void vfio_listener_log_sync(MemoryListener *listener, } static const MemoryListener vfio_memory_listener = { + .name = "vfio", .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, .log_global_start = vfio_listener_log_global_start, diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index ea3f70bd2f..04c6e67f8f 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -136,6 +136,7 @@ static void vfio_prereg_listener_region_del(MemoryListener *listener, } const MemoryListener vfio_prereg_listener = { + .name = "vfio-pre-reg", .region_add = vfio_prereg_listener_region_add, .region_del = vfio_prereg_listener_region_del, }; diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 7633ea66d1..47d7a5a23d 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -246,6 +246,7 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, * depends on the addnop(). 
*/ static const MemoryListener vhost_vdpa_memory_listener = { + .name = "vhost-vdpa", .commit = vhost_vdpa_listener_commit, .region_add = vhost_vdpa_listener_region_add, .region_del = vhost_vdpa_listener_region_del, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index b4b29413e6..437347ad01 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1366,6 +1366,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, hdev->features = features; hdev->memory_listener = (MemoryListener) { + .name = "vhost", .begin = vhost_begin, .commit = vhost_commit, .region_add = vhost_region_addnop, @@ -1381,6 +1382,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, }; hdev->iommu_listener = (MemoryListener) { + .name = "vhost-iommu", .region_add = vhost_iommu_region_add, .region_del = vhost_iommu_region_del, }; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 3a1f6c520c..240759ff0b 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3670,6 +3670,7 @@ static void virtio_device_realize(DeviceState *dev, Error **errp) } vdev->listener.commit = virtio_memory_listener_commit; + vdev->listener.name = "virtio"; memory_listener_register(&vdev->listener, vdev->dma_as); } diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c index 232482d65f..ca0a98187e 100644 --- a/hw/xen/xen_pt.c +++ b/hw/xen/xen_pt.c @@ -689,12 +689,14 @@ static void xen_pt_io_region_del(MemoryListener *l, MemoryRegionSection *sec) } static const MemoryListener xen_pt_memory_listener = { + .name = "xen-pt-mem", .region_add = xen_pt_region_add, .region_del = xen_pt_region_del, .priority = 10, }; static const MemoryListener xen_pt_io_listener = { + .name = "xen-pt-io", .region_add = xen_pt_io_region_add, .region_del = xen_pt_io_region_del, .priority = 10, diff --git a/include/block/block_int.h b/include/block/block_int.h index 5451f89b8d..ffe86068d4 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -94,6 +94,9 @@ typedef struct BdrvTrackedRequest { struct 
BdrvTrackedRequest *waiting_for; } BdrvTrackedRequest; +int bdrv_check_qiov_request(int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, + Error **errp); int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp); struct BlockDriver { @@ -232,11 +235,11 @@ struct BlockDriver { /* aio */ BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags, - BlockCompletionFunc *cb, void *opaque); + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags, - BlockCompletionFunc *cb, void *opaque); + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs, @@ -262,10 +265,11 @@ struct BlockDriver { * The buffer in @qiov may point directly to guest memory. */ int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, int flags); + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags); /** @@ -284,10 +288,11 @@ struct BlockDriver { * The buffer in @qiov may point directly to guest memory. 
*/ int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); + int64_t offset, int64_t bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, size_t qiov_offset, int flags); + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags); /* * Efficiently zero a region of the disk image. Typically an image format @@ -296,9 +301,9 @@ struct BlockDriver { * will be called instead. */ int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, - int64_t offset, int bytes, BdrvRequestFlags flags); + int64_t offset, int64_t bytes, BdrvRequestFlags flags); int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, - int64_t offset, int bytes); + int64_t offset, int64_t bytes); /* Map [offset, offset + nbytes) range onto a child of @bs to copy from, * and invoke bdrv_co_copy_range_from(child, ...), or invoke @@ -309,10 +314,10 @@ struct BlockDriver { */ int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, BdrvChild *src, - uint64_t offset, + int64_t offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); @@ -326,10 +331,10 @@ struct BlockDriver { */ int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, BdrvChild *src, - uint64_t src_offset, + int64_t src_offset, BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, + int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags); @@ -434,10 +439,9 @@ struct BlockDriver { Error **errp); int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov); + int64_t offset, int64_t bytes, QEMUIOVector *qiov); int coroutine_fn (*bdrv_co_pwritev_compressed_part)(BlockDriverState *bs, - 
uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, - size_t qiov_offset); + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); int (*bdrv_snapshot_create)(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); @@ -670,11 +674,12 @@ typedef struct BlockLimits { * otherwise. */ uint32_t request_alignment; - /* Maximum number of bytes that can be discarded at once (since it - * is signed, it must be < 2G, if set). Must be multiple of - * pdiscard_alignment, but need not be power of 2. May be 0 if no - * inherent 32-bit limit */ - int32_t max_pdiscard; + /* + * Maximum number of bytes that can be discarded at once. Must be multiple + * of pdiscard_alignment, but need not be power of 2. May be 0 if no + * inherent 64-bit limit. + */ + int64_t max_pdiscard; /* Optimal alignment for discard requests in bytes. A power of 2 * is best but not mandatory. Must be a multiple of @@ -682,10 +687,11 @@ typedef struct BlockLimits { * that is set. May be 0 if bl.request_alignment is good enough */ uint32_t pdiscard_alignment; - /* Maximum number of bytes that can zeroized at once (since it is - * signed, it must be < 2G, if set). Must be multiple of - * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */ - int32_t max_pwrite_zeroes; + /* + * Maximum number of bytes that can be zeroized at once. Must be multiple of + * pwrite_zeroes_alignment. 0 means no limit. + */ + int64_t max_pwrite_zeroes; /* Optimal alignment for write zeroes requests in bytes. A power * of 2 is best but not mandatory. Must be a multiple of diff --git a/include/exec/memory.h b/include/exec/memory.h index c3d417d317..a185b6dcb8 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -190,6 +190,9 @@ typedef struct IOMMUTLBEvent { */ #define RAM_NORESERVE (1 << 7) +/* RAM that isn't accessible through normal means.
*/ +#define RAM_PROTECTED (1 << 8) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, @@ -979,6 +982,14 @@ struct MemoryListener { */ unsigned priority; + /** + * @name: + * + * Name of the listener. It can be used in contexts where we'd like to + * identify one memory listener with the rest. + */ + const char *name; + /* private: */ AddressSpace *address_space; QTAILQ_ENTRY(MemoryListener) link; @@ -1267,7 +1278,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, * @name: the name of the region. * @size: size of the region. * @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM, - * RAM_NORESERVE. + * RAM_NORESERVE, RAM_PROTECTED. * @fd: the fd to mmap. * @offset: offset within the file referenced by fd * @errp: pointer to Error*, to store an error if it happens. @@ -1569,6 +1580,16 @@ static inline bool memory_region_is_romd(MemoryRegion *mr) } /** + * memory_region_is_protected: check whether a memory region is protected + * + * Returns %true if a memory region is protected RAM and cannot be accessed + * via standard mechanisms, e.g. DMA. + * + * @mr: the memory region being queried + */ +bool memory_region_is_protected(MemoryRegion *mr); + +/** * memory_region_get_iommu: check whether a memory region is an iommu * * Returns pointer to IOMMUMemoryRegion if a memory region is an iommu, diff --git a/include/hw/i386/hostmem-epc.h b/include/hw/i386/hostmem-epc.h new file mode 100644 index 0000000000..846c726085 --- /dev/null +++ b/include/hw/i386/hostmem-epc.h @@ -0,0 +1,28 @@ +/* + * SGX EPC backend + * + * Copyright (C) 2019 Intel Corporation + * + * Authors: + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#ifndef QEMU_HOSTMEM_EPC_H +#define QEMU_HOSTMEM_EPC_H + +#include "sysemu/hostmem.h" + +#define TYPE_MEMORY_BACKEND_EPC "memory-backend-epc" + +#define MEMORY_BACKEND_EPC(obj) \ + OBJECT_CHECK(HostMemoryBackendEpc, (obj), TYPE_MEMORY_BACKEND_EPC) + +typedef struct HostMemoryBackendEpc HostMemoryBackendEpc; + +struct HostMemoryBackendEpc { + HostMemoryBackend parent_obj; +}; + +#endif diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 4d2e35a152..5748d7c55f 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -12,6 +12,7 @@ #include "hw/acpi/acpi_dev_interface.h" #include "hw/hotplug.h" #include "qom/object.h" +#include "hw/i386/sgx-epc.h" #define HPET_INTCAP "hpet-intcap" @@ -49,6 +50,8 @@ typedef struct PCMachineState { /* ACPI Memory hotplug IO base address */ hwaddr memhp_io_base; + + SGXEPCState sgx_epc; } PCMachineState; #define PC_MACHINE_ACPI_DEVICE_PROP "acpi-device" @@ -192,6 +195,9 @@ void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size); void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, const CPUArchIdList *apic_ids, GArray *entry); +/* sgx.c */ +void pc_machine_init_sgx_epc(PCMachineState *pcms); + extern GlobalProperty pc_compat_6_1[]; extern const size_t pc_compat_6_1_len; diff --git a/include/hw/i386/sgx-epc.h b/include/hw/i386/sgx-epc.h new file mode 100644 index 0000000000..65a68ca753 --- /dev/null +++ b/include/hw/i386/sgx-epc.h @@ -0,0 +1,67 @@ +/* + * SGX EPC device + * + * Copyright (C) 2019 Intel Corporation + * + * Authors: + * Sean Christopherson <sean.j.christopherson@intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#ifndef QEMU_SGX_EPC_H +#define QEMU_SGX_EPC_H + +#include "hw/i386/hostmem-epc.h" + +#define TYPE_SGX_EPC "sgx-epc" +#define SGX_EPC(obj) \ + OBJECT_CHECK(SGXEPCDevice, (obj), TYPE_SGX_EPC) +#define SGX_EPC_CLASS(oc) \ + OBJECT_CLASS_CHECK(SGXEPCDeviceClass, (oc), TYPE_SGX_EPC) +#define SGX_EPC_GET_CLASS(obj) \ + OBJECT_GET_CLASS(SGXEPCDeviceClass, (obj), TYPE_SGX_EPC) + +#define SGX_EPC_ADDR_PROP "addr" +#define SGX_EPC_SIZE_PROP "size" +#define SGX_EPC_MEMDEV_PROP "memdev" + +/** + * SGXEPCDevice: + * @addr: starting guest physical address, where @SGXEPCDevice is mapped. + * Default value: 0, means that address is auto-allocated. + * @hostmem: host memory backend providing memory for @SGXEPCDevice + */ +typedef struct SGXEPCDevice { + /* private */ + DeviceState parent_obj; + + /* public */ + uint64_t addr; + HostMemoryBackendEpc *hostmem; +} SGXEPCDevice; + +/* + * @base: address in guest physical address space where EPC regions start + * @mr: address space container for memory devices + */ +typedef struct SGXEPCState { + uint64_t base; + uint64_t size; + + MemoryRegion mr; + + struct SGXEPCDevice **sections; + int nr_sections; +} SGXEPCState; + +int sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size); + +static inline uint64_t sgx_epc_above_4g_end(SGXEPCState *sgx_epc) +{ + assert(sgx_epc != NULL && sgx_epc->base >= 0x100000000ULL); + + return sgx_epc->base + sgx_epc->size; +} + +#endif diff --git a/include/hw/i386/sgx.h b/include/hw/i386/sgx.h new file mode 100644 index 0000000000..16fc25725c --- /dev/null +++ b/include/hw/i386/sgx.h @@ -0,0 +1,12 @@ +#ifndef QEMU_SGX_H +#define QEMU_SGX_H + +#include "qom/object.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-misc-target.h" + +SGXInfo *sgx_get_info(Error **errp); +SGXInfo *sgx_get_capabilities(Error **errp); + +#endif diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index 6e9244a82c..23267a3674 100644 --- a/include/hw/i386/x86.h +++ 
b/include/hw/i386/x86.h @@ -62,6 +62,7 @@ struct X86MachineState { unsigned pci_irq_mask; unsigned apic_id_limit; uint16_t boot_cpus; + SgxEPCList *sgx_epc_list; OnOffAuto smm; OnOffAuto acpi; diff --git a/include/hw/ppc/openpic.h b/include/hw/ppc/openpic.h index 74ff44bff0..ebdaf8a493 100644 --- a/include/hw/ppc/openpic.h +++ b/include/hw/ppc/openpic.h @@ -21,7 +21,6 @@ enum { typedef struct IrqLines { qemu_irq irq[OPENPIC_OUTPUT_NB]; } IrqLines; -#define OPENPIC_MODEL_RAVEN 0 #define OPENPIC_MODEL_FSL_MPIC_20 1 #define OPENPIC_MODEL_FSL_MPIC_42 2 #define OPENPIC_MODEL_KEYLARGO 3 @@ -32,13 +31,6 @@ typedef struct IrqLines { qemu_irq irq[OPENPIC_OUTPUT_NB]; } IrqLines; #define OPENPIC_MAX_IRQ (OPENPIC_MAX_SRC + OPENPIC_MAX_IPI + \ OPENPIC_MAX_TMR) -/* Raven */ -#define RAVEN_MAX_CPU 2 -#define RAVEN_MAX_EXT 48 -#define RAVEN_MAX_IRQ 64 -#define RAVEN_MAX_TMR OPENPIC_MAX_TMR -#define RAVEN_MAX_IPI OPENPIC_MAX_IPI - /* KeyLargo */ #define KEYLARGO_MAX_CPU 4 #define KEYLARGO_MAX_EXT 64 @@ -49,14 +41,6 @@ typedef struct IrqLines { qemu_irq irq[OPENPIC_OUTPUT_NB]; } IrqLines; /* Timers don't exist but this makes the code happy... */ #define KEYLARGO_TMR_IRQ (KEYLARGO_IPI_IRQ + KEYLARGO_MAX_IPI) -/* Interrupt definitions */ -#define RAVEN_FE_IRQ (RAVEN_MAX_EXT) /* Internal functional IRQ */ -#define RAVEN_ERR_IRQ (RAVEN_MAX_EXT + 1) /* Error IRQ */ -#define RAVEN_TMR_IRQ (RAVEN_MAX_EXT + 2) /* First timer IRQ */ -#define RAVEN_IPI_IRQ (RAVEN_TMR_IRQ + RAVEN_MAX_TMR) /* First IPI IRQ */ -/* First doorbell IRQ */ -#define RAVEN_DBL_IRQ (RAVEN_IPI_IRQ + (RAVEN_MAX_CPU * RAVEN_MAX_IPI)) - typedef struct FslMpicInfo { int max_ext; } FslMpicInfo; @@ -67,7 +51,8 @@ typedef enum IRQType { IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ } IRQType; -/* Round up to the nearest 64 IRQs so that the queue length +/* + * Round up to the nearest 64 IRQs so that the queue length * won't change when moving between 32 and 64 bit hosts. 
*/ #define IRQQUEUE_SIZE_BITS ((OPENPIC_MAX_IRQ + 63) & ~63) @@ -117,8 +102,10 @@ typedef struct OpenPICTimer { bool qemu_timer_active; /* Is the qemu_timer is running? */ struct QEMUTimer *qemu_timer; struct OpenPICState *opp; /* Device timer is part of. */ - /* The QEMU_CLOCK_VIRTUAL time (in ns) corresponding to the last - current_count written or read, only defined if qemu_timer_active. */ + /* + * The QEMU_CLOCK_VIRTUAL time (in ns) corresponding to the last + * current_count written or read, only defined if qemu_timer_active. + */ uint64_t origin_time; } OpenPICTimer; diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h index 6ecee98a76..c22eab2e1f 100644 --- a/include/hw/ppc/pnv_core.h +++ b/include/hw/ppc/pnv_core.h @@ -67,7 +67,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(PnvQuad, PNV_QUAD) struct PnvQuad { DeviceState parent_obj; - uint32_t id; + uint32_t quad_id; MemoryRegion xscom_regs; }; #endif /* PPC_PNV_CORE_H */ diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 637652ad16..ee7504b976 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -100,23 +100,30 @@ typedef enum { #define FDT_MAX_SIZE 0x200000 +/* Max number of GPUs per system */ +#define NVGPU_MAX_NUM 6 + +/* Max number of NUMA nodes */ +#define NUMA_NODES_MAX_NUM (MAX_NODES + NVGPU_MAX_NUM) + /* - * NUMA related macros. MAX_DISTANCE_REF_POINTS was taken - * from Linux kernel arch/powerpc/mm/numa.h. It represents the - * amount of associativity domains for non-CPU resources. + * NUMA FORM1 macros. FORM1_DIST_REF_POINTS was taken from + * MAX_DISTANCE_REF_POINTS in arch/powerpc/mm/numa.h from Linux + * kernel source. It represents the amount of associativity domains + * for non-CPU resources. * - * NUMA_ASSOC_SIZE is the base array size of an ibm,associativity + * FORM1_NUMA_ASSOC_SIZE is the base array size of an ibm,associativity * array for any non-CPU resource. 
- * - * VCPU_ASSOC_SIZE represents the size of ibm,associativity array - * for CPUs, which has an extra element (vcpu_id) in the end. */ -#define MAX_DISTANCE_REF_POINTS 4 -#define NUMA_ASSOC_SIZE (MAX_DISTANCE_REF_POINTS + 1) -#define VCPU_ASSOC_SIZE (NUMA_ASSOC_SIZE + 1) +#define FORM1_DIST_REF_POINTS 4 +#define FORM1_NUMA_ASSOC_SIZE (FORM1_DIST_REF_POINTS + 1) -/* Max number of these GPUsper a physical box */ -#define NVGPU_MAX_NUM 6 +/* + * FORM2 NUMA affinity has a single associativity domain, giving + * us an assoc size of 2. + */ +#define FORM2_DIST_REF_POINTS 1 +#define FORM2_NUMA_ASSOC_SIZE (FORM2_DIST_REF_POINTS + 1) typedef struct SpaprCapabilities SpaprCapabilities; struct SpaprCapabilities { @@ -145,6 +152,7 @@ struct SpaprMachineClass { hwaddr rma_limit; /* clamp the RMA to this size */ bool pre_5_1_assoc_refpoints; bool pre_5_2_numa_associativity; + bool pre_6_2_numa_affinity; bool (*phb_placement)(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio, @@ -249,7 +257,8 @@ struct SpaprMachineState { unsigned gpu_numa_id; SpaprTpmProxy *tpm_proxy; - uint32_t numa_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE]; + uint32_t FORM1_assoc_array[NUMA_NODES_MAX_NUM][FORM1_NUMA_ASSOC_SIZE]; + uint32_t FORM2_assoc_array[NUMA_NODES_MAX_NUM][FORM2_NUMA_ASSOC_SIZE]; Error *fwnmi_migration_blocker; }; diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h index 6f9f02d3de..7cb3367400 100644 --- a/include/hw/ppc/spapr_numa.h +++ b/include/hw/ppc/spapr_numa.h @@ -24,6 +24,7 @@ */ void spapr_numa_associativity_init(SpaprMachineState *spapr, MachineState *machine); +void spapr_numa_associativity_check(SpaprMachineState *spapr); void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas); void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, int offset, int nodeid); diff --git a/include/hw/ppc/spapr_ovec.h b/include/hw/ppc/spapr_ovec.h index 48b716a060..c3e8b98e7e 100644 ---
a/include/hw/ppc/spapr_ovec.h +++ b/include/hw/ppc/spapr_ovec.h @@ -49,6 +49,7 @@ typedef struct SpaprOptionVector SpaprOptionVector; /* option vector 5 */ #define OV5_DRCONF_MEMORY OV_BIT(2, 2) #define OV5_FORM1_AFFINITY OV_BIT(5, 0) +#define OV5_FORM2_AFFINITY OV_BIT(5, 2) #define OV5_HP_EVT OV_BIT(6, 5) #define OV5_HPT_RESIZE OV_BIT(6, 7) #define OV5_DRMEM_V2 OV_BIT(22, 0) diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index db76411654..252c58a1d6 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -335,6 +335,11 @@ struct XiveTCTX { XivePresenter *xptr; }; +static inline uint32_t xive_tctx_word2(uint8_t *ring) +{ + return *((uint32_t *) &ring[TM_WORD2]); +} + /* * XIVE Router */ @@ -459,6 +464,17 @@ struct XiveENDSource { #define XIVE_PRIORITY_MAX 7 /* + * Convert a priority number to an Interrupt Pending Buffer (IPB) + * register, which indicates a pending interrupt at the priority + * corresponding to the bit number + */ +static inline uint8_t xive_priority_to_ipb(uint8_t priority) +{ + return priority > XIVE_PRIORITY_MAX ? 
+ 0 : 1 << (XIVE_PRIORITY_MAX - priority); +} + +/* * XIVE Thread Interrupt Management Aera (TIMA) * * This region gives access to the registers of the thread interrupt diff --git a/include/monitor/hmp-target.h b/include/monitor/hmp-target.h index 60fc92722a..dc53add7ee 100644 --- a/include/monitor/hmp-target.h +++ b/include/monitor/hmp-target.h @@ -49,5 +49,6 @@ void hmp_info_tlb(Monitor *mon, const QDict *qdict); void hmp_mce(Monitor *mon, const QDict *qdict); void hmp_info_local_apic(Monitor *mon, const QDict *qdict); void hmp_info_io_apic(Monitor *mon, const QDict *qdict); +void hmp_info_sgx(Monitor *mon, const QDict *qdict); #endif /* MONITOR_HMP_TARGET_H */ diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h index 711b221704..ca9f3f021b 100644 --- a/include/qemu/host-utils.h +++ b/include/qemu/host-utils.h @@ -70,7 +70,7 @@ static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor) if (divisor == 0) { return 1; } else { - __int128_t dividend = ((__int128_t)*phigh << 64) | *plow; + __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow; __int128_t result = dividend / divisor; *plow = result; *phigh = dividend % divisor; @@ -358,6 +358,14 @@ static inline uint64_t revbit64(uint64_t x) } /** + * Return the absolute value of a 64-bit integer as an unsigned 64-bit value + */ +static inline uint64_t uabs64(int64_t v) +{ + return v < 0 ? 
-v : v; +} + +/** * sadd32_overflow - addition with overflow indication * @x, @y: addends * @ret: Output for sum diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index c788452cd9..1f5487d9b7 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -37,7 +37,7 @@ typedef struct KVMMemoryListener { } KVMMemoryListener; void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, - AddressSpace *as, int as_id); + AddressSpace *as, int as_id, const char *name); void kvm_set_max_memslot_size(hwaddr max_slot_size); diff --git a/io/dns-resolver.c b/io/dns-resolver.c index a5946a93bf..53b0e8407a 100644 --- a/io/dns-resolver.c +++ b/io/dns-resolver.c @@ -122,7 +122,7 @@ static int qio_dns_resolver_lookup_sync_inet(QIODNSResolver *resolver, .ipv4 = iaddr->ipv4, .has_ipv6 = iaddr->has_ipv6, .ipv6 = iaddr->ipv6, -#ifdef IPPROTO_MPTCP +#ifdef HAVE_IPPROTO_MPTCP .has_mptcp = iaddr->has_mptcp, .mptcp = iaddr->mptcp, #endif diff --git a/meson.build b/meson.build index 7bdbbbdf02..60f4f45165 100644 --- a/meson.build +++ b/meson.build @@ -106,14 +106,14 @@ if targetos != 'darwin' endif edk2_targets = [ 'arm-softmmu', 'aarch64-softmmu', 'i386-softmmu', 'x86_64-softmmu' ] -install_edk2_blobs = false -if get_option('install_blobs') - foreach target : target_dirs - install_edk2_blobs = install_edk2_blobs or target in edk2_targets - endforeach -endif - -bzip2 = find_program('bzip2', required: install_edk2_blobs) +unpack_edk2_blobs = false +foreach target : edk2_targets + if target in target_dirs + bzip2 = find_program('bzip2', required: get_option('install_blobs')) + unpack_edk2_blobs = bzip2.found() + break + endif +endforeach ################## # Compiler flags # @@ -1374,6 +1374,8 @@ config_host_data.set('HAVE_OPTRESET', cc.has_header_symbol('getopt.h', 'optreset')) config_host_data.set('HAVE_UTMPX', cc.has_header_symbol('utmpx.h', 'struct utmpx')) +config_host_data.set('HAVE_IPPROTO_MPTCP', + cc.has_header_symbol('netinet/in.h', 
'IPPROTO_MPTCP')) # has_member config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', diff --git a/meson_options.txt b/meson_options.txt index a9a9b8f4c6..2c89e79e8b 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -120,7 +120,7 @@ option('usb_redir', type : 'feature', value : 'auto', description: 'libusbredir support') option('virglrenderer', type : 'feature', value : 'auto', description: 'virgl rendering support') -option('vnc', type : 'feature', value : 'enabled', +option('vnc', type : 'feature', value : 'auto', description: 'VNC server') option('vnc_jpeg', type : 'feature', value : 'auto', description: 'JPEG lossy compression for VNC server') diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index b5e71d9e6f..bcaa41350e 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -1823,6 +1823,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) VirtioMEMDeviceInfo *vmi; MemoryDeviceInfo *value; PCDIMMDeviceInfo *di; + SgxEPCDeviceInfo *se; for (info = info_list; info; info = info->next) { value = info->value; @@ -1870,6 +1871,15 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) vmi->block_size); monitor_printf(mon, " memdev: %s\n", vmi->memdev); break; + case MEMORY_DEVICE_INFO_KIND_SGX_EPC: + se = value->u.sgx_epc.data; + monitor_printf(mon, "Memory device [%s]: \"%s\"\n", + MemoryDeviceInfoKind_str(value->type), + se->id ? 
se->id : ""); + monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", se->memaddr); + monitor_printf(mon, " size: %" PRIu64 "\n", se->size); + monitor_printf(mon, " memdev: %s\n", se->memdev); + break; default: g_assert_not_reached(); } diff --git a/nbd/client-connection.c b/nbd/client-connection.c index 7123b1e189..695f855754 100644 --- a/nbd/client-connection.c +++ b/nbd/client-connection.c @@ -318,6 +318,7 @@ nbd_co_establish_connection(NBDClientConnection *conn, NBDExportInfo *info, } if (!blocking) { + error_setg(errp, "No connection at the moment"); return NULL; } diff --git a/nbd/client.c b/nbd/client.c index 0c2db4bcba..30d5383cb1 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -1434,9 +1434,7 @@ nbd_read_eof(BlockDriverState *bs, QIOChannel *ioc, void *buffer, size_t size, len = qio_channel_readv(ioc, &iov, 1, errp); if (len == QIO_CHANNEL_ERR_BLOCK) { - bdrv_dec_in_flight(bs); qio_channel_yield(ioc, G_IO_IN); - bdrv_inc_in_flight(bs); continue; } else if (len < 0) { return -EIO; diff --git a/nbd/server.c b/nbd/server.c index 3927f7789d..6d03e8a4b4 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -980,7 +980,7 @@ static int nbd_negotiate_meta_queries(NBDClient *client, size_t i; size_t count = 0; - if (!client->structured_reply) { + if (client->opt == NBD_OPT_SET_META_CONTEXT && !client->structured_reply) { return nbd_opt_invalid(client, errp, "request option '%s' when structured reply " "is not negotiated", diff --git a/pc-bios/descriptors/meson.build b/pc-bios/descriptors/meson.build index 29efa16d99..66f85d01c4 100644 --- a/pc-bios/descriptors/meson.build +++ b/pc-bios/descriptors/meson.build @@ -1,4 +1,4 @@ -if install_edk2_blobs +if unpack_edk2_blobs and get_option('install_blobs') foreach f: [ '50-edk2-i386-secure.json', '50-edk2-x86_64-secure.json', @@ -10,7 +10,7 @@ if install_edk2_blobs configure_file(input: files(f), output: f, configuration: {'DATADIR': get_option('prefix') / qemu_datadir}, - install: get_option('install_blobs'), + install: true, 
install_dir: qemu_datadir / 'firmware') endforeach endif diff --git a/pc-bios/meson.build b/pc-bios/meson.build index f2b32598af..a44c9bc127 100644 --- a/pc-bios/meson.build +++ b/pc-bios/meson.build @@ -1,4 +1,5 @@ -if install_edk2_blobs +roms = [] +if unpack_edk2_blobs fds = [ 'edk2-aarch64-code.fd', 'edk2-arm-code.fd', @@ -11,7 +12,7 @@ if install_edk2_blobs ] foreach f : fds - custom_target(f, + roms += custom_target(f, build_by_default: have_system, output: f, input: '@0@.bz2'.format(f), diff --git a/qapi/machine.json b/qapi/machine.json index 32d47f4e35..f1c4983b64 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -1195,12 +1195,35 @@ } ## +# @SgxEPCDeviceInfo: +# +# Sgx EPC state information +# +# @id: device's ID +# +# @memaddr: physical address in memory, where device is mapped +# +# @size: size of memory that the device provides +# +# @memdev: memory backend linked with device +# +# Since: 6.2 +## +{ 'struct': 'SgxEPCDeviceInfo', + 'data': { '*id': 'str', + 'memaddr': 'size', + 'size': 'size', + 'memdev': 'str' + } +} + +## # @MemoryDeviceInfoKind: # # Since: 2.1 ## { 'enum': 'MemoryDeviceInfoKind', - 'data': [ 'dimm', 'nvdimm', 'virtio-pmem', 'virtio-mem' ] } + 'data': [ 'dimm', 'nvdimm', 'virtio-pmem', 'virtio-mem', 'sgx-epc' ] } ## # @PCDIMMDeviceInfoWrapper: @@ -1227,12 +1250,20 @@ 'data': { 'data': 'VirtioMEMDeviceInfo' } } ## +# @SgxEPCDeviceInfoWrapper: +# +# Since: 6.2 +## +{ 'struct': 'SgxEPCDeviceInfoWrapper', + 'data': { 'data': 'SgxEPCDeviceInfo' } } + +## # @MemoryDeviceInfo: # # Union containing information about a memory device # # nvdimm is included since 2.12. virtio-pmem is included since 4.1. -# virtio-mem is included since 5.1. +# virtio-mem is included since 5.1. sgx-epc is included since 6.2. 
# # Since: 2.1 ## @@ -1242,11 +1273,37 @@ 'data': { 'dimm': 'PCDIMMDeviceInfoWrapper', 'nvdimm': 'PCDIMMDeviceInfoWrapper', 'virtio-pmem': 'VirtioPMEMDeviceInfoWrapper', - 'virtio-mem': 'VirtioMEMDeviceInfoWrapper' + 'virtio-mem': 'VirtioMEMDeviceInfoWrapper', + 'sgx-epc': 'SgxEPCDeviceInfoWrapper' } } ## +# @SgxEPC: +# +# Sgx EPC cmdline information +# +# @memdev: memory backend linked with device +# +# Since: 6.2 +## +{ 'struct': 'SgxEPC', + 'data': { 'memdev': 'str' } } + +## +# @SgxEPCProperties: +# +# SGX properties of machine types. +# +# @sgx-epc: list of ids of memory-backend-epc objects. +# +# Since: 6.2 +## +{ 'struct': 'SgxEPCProperties', + 'data': { 'sgx-epc': ['SgxEPC'] } +} + +## # @query-memory-devices: # # Lists available memory devices and their state @@ -1305,6 +1362,10 @@ # # @msg: Informative message # +# Features: +# @deprecated: This event is deprecated. Use @DEVICE_UNPLUG_GUEST_ERROR +# instead. +# # Since: 2.4 # # Example: @@ -1317,7 +1378,8 @@ # ## { 'event': 'MEM_UNPLUG_ERROR', - 'data': { 'device': 'str', 'msg': 'str' } } + 'data': { 'device': 'str', 'msg': 'str' }, + 'features': ['deprecated'] } ## # @SMPConfiguration: diff --git a/qapi/misc-target.json b/qapi/misc-target.json index 3b05ad3dbf..594fbd1577 100644 --- a/qapi/misc-target.json +++ b/qapi/misc-target.json @@ -333,3 +333,64 @@ { 'command': 'query-sev-attestation-report', 'data': { 'mnonce': 'str' }, 'returns': 'SevAttestationReport', 'if': 'TARGET_I386' } + +## +# @SGXInfo: +# +# Information about Intel Software Guard Extensions (SGX) support +# +# @sgx: true if SGX is supported +# +# @sgx1: true if SGX1 is supported +# +# @sgx2: true if SGX2 is supported +# +# @flc: true if FLC is supported +# +# @section-size: The EPC section size for guest +# +# Since: 6.2 +## +{ 'struct': 'SGXInfo', + 'data': { 'sgx': 'bool', + 'sgx1': 'bool', + 'sgx2': 'bool', + 'flc': 'bool', + 'section-size': 'uint64'}, + 'if': 'TARGET_I386' } + +## +# @query-sgx: +# +# Returns information about SGX +# +# 
Returns: @SGXInfo +# +# Since: 6.2 +# +# Example: +# +# -> { "execute": "query-sgx" } +# <- { "return": { "sgx": true, "sgx1" : true, "sgx2" : true, +# "flc": true, "section-size" : 0 } } +# +## +{ 'command': 'query-sgx', 'returns': 'SGXInfo', 'if': 'TARGET_I386' } + +## +# @query-sgx-capabilities: +# +# Returns information from host SGX capabilities +# +# Returns: @SGXInfo +# +# Since: 6.2 +# +# Example: +# +# -> { "execute": "query-sgx-capabilities" } +# <- { "return": { "sgx": true, "sgx1" : true, "sgx2" : true, +# "flc": true, "section-size" : 0 } } +# +## +{ 'command': 'query-sgx-capabilities', 'returns': 'SGXInfo', 'if': 'TARGET_I386' } diff --git a/qapi/qdev.json b/qapi/qdev.json index b83178220b..d75e68908b 100644 --- a/qapi/qdev.json +++ b/qapi/qdev.json @@ -84,7 +84,9 @@ # This command merely requests that the guest begin the hot removal # process. Completion of the device removal process is signaled with a # DEVICE_DELETED event. Guest reset will automatically complete removal -# for all devices. +# for all devices. If a guest-side error in the hot removal process is +# detected, the device will not be removed and a DEVICE_UNPLUG_GUEST_ERROR +# event is sent. Some errors cannot be detected. # # Since: 0.14 # @@ -108,9 +110,9 @@ # At this point, it's safe to reuse the specified device ID. Device removal can # be initiated by the guest or by HMP/QMP commands. # -# @device: device name +# @device: the device's ID if it has one # -# @path: device path +# @path: the device's QOM path # # Since: 1.5 # @@ -124,3 +126,26 @@ ## { 'event': 'DEVICE_DELETED', 'data': { '*device': 'str', 'path': 'str' } } + +## +# @DEVICE_UNPLUG_GUEST_ERROR: +# +# Emitted when a device hot unplug fails due to a guest reported error. 
+# +# @device: the device's ID if it has one +# +# @path: the device's QOM path +# +# Since: 6.2 +# +# Example: +# +# <- { "event": "DEVICE_UNPLUG_GUEST_ERROR", +# "data": { "device": "core1", +# "path": "/machine/peripheral/core1" }, +# "timestamp": { "seconds": 1615570772, +# "microseconds": 202844 } } +# +## +{ 'event': 'DEVICE_UNPLUG_GUEST_ERROR', + 'data': { '*device': 'str', 'path': 'str' } } diff --git a/qapi/qom.json b/qapi/qom.json index a25616bc7a..0222bb4506 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -648,6 +648,23 @@ '*seal': 'bool' } } ## +# @MemoryBackendEpcProperties: +# +# Properties for memory-backend-epc objects. +# +# The @share boolean option is true by default with epc +# +# The @merge boolean option is false by default with epc +# +# The @dump boolean option is false by default with epc +# +# Since: 6.2 +## +{ 'struct': 'MemoryBackendEpcProperties', + 'base': 'MemoryBackendProperties', + 'data': {} } + +## # @PrManagerHelperProperties: # # Properties for pr-manager-helper objects. 
@@ -797,6 +814,7 @@ { 'name': 'memory-backend-memfd', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram', + 'memory-backend-epc', 'pef-guest', 'pr-manager-helper', 'qtest', @@ -855,6 +873,7 @@ 'memory-backend-memfd': { 'type': 'MemoryBackendMemfdProperties', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram': 'MemoryBackendProperties', + 'memory-backend-epc': 'MemoryBackendEpcProperties', 'pr-manager-helper': 'PrManagerHelperProperties', 'qtest': 'QtestProperties', 'rng-builtin': 'RngProperties', diff --git a/qapi/sockets.json b/qapi/sockets.json index ef4b16d6f2..5773d9fcc4 100644 --- a/qapi/sockets.json +++ b/qapi/sockets.json @@ -69,7 +69,7 @@ '*ipv4': 'bool', '*ipv6': 'bool', '*keep-alive': 'bool', - '*mptcp': { 'type': 'bool', 'if': 'IPPROTO_MPTCP' } } } + '*mptcp': { 'type': 'bool', 'if': 'HAVE_IPPROTO_MPTCP' } } } ## # @UnixSocketAddress: diff --git a/qemu-nbd.c b/qemu-nbd.c index 65ebec598f..9d895ba24b 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -135,7 +135,9 @@ static void usage(const char *name) " 'snapshot.id=[ID],snapshot.name=[NAME]', or\n" " '[ID_OR_NAME]'\n" " -n, --nocache disable host cache\n" -" --cache=MODE set cache mode (none, writeback, ...)\n" +" --cache=MODE set cache mode used to access the disk image, the\n" +" valid options are: 'none', 'writeback' (default),\n" +" 'writethrough', 'directsync' and 'unsafe'\n" " --aio=MODE set AIO mode (native, io_uring or threads)\n" " --discard=MODE set discard mode (ignore, unmap)\n" " --detect-zeroes=MODE set detect-zeroes mode (off, on, unmap)\n" @@ -552,7 +554,7 @@ int main(int argc, char **argv) bool alloc_depth = false; const char *tlscredsid = NULL; bool imageOpts = false; - bool writethrough = true; + bool writethrough = false; /* Client will flush as needed. 
*/ bool fork_process = false; bool list = false; int old_stderr = -1; diff --git a/qemu-options.hx b/qemu-options.hx index 8f603cc7e6..ceca52818a 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -126,8 +126,14 @@ SRST -m 512M ERST -HXCOMM Deprecated by -machine -DEF("M", HAS_ARG, QEMU_OPTION_M, "", QEMU_ARCH_ALL) +DEF("M", HAS_ARG, QEMU_OPTION_M, + " sgx-epc.0.memdev=memid\n", + QEMU_ARCH_ALL) + +SRST +``sgx-epc.0.memdev=@var{memid}`` + Define an SGX EPC section. +ERST DEF("cpu", HAS_ARG, QEMU_OPTION_cpu, "-cpu cpu select CPU ('-cpu help' for list)\n", QEMU_ARCH_ALL) diff --git a/softmmu/memory.c b/softmmu/memory.c index bfedaf9c4d..db182e5d3d 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -1811,6 +1811,11 @@ bool memory_region_is_ram_device(MemoryRegion *mr) return mr->ram_device; } +bool memory_region_is_protected(MemoryRegion *mr) +{ + return mr->ram && (mr->ram_block->flags & RAM_PROTECTED); +} + uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) { uint8_t mask = mr->dirty_log_mask; @@ -2149,6 +2154,7 @@ static void memory_region_sync_dirty_bitmap(MemoryRegion *mr) } } flatview_unref(view); + trace_memory_region_sync_dirty(mr ? mr->name : "(all)", listener->name, 0); } else if (listener->log_sync_global) { /* * No matter whether MR is specified, what we can do here @@ -2156,6 +2162,7 @@ static void memory_region_sync_dirty_bitmap(MemoryRegion *mr) * sync in a finer granularity. */ listener->log_sync_global(listener); + trace_memory_region_sync_dirty(mr ? 
mr->name : "(all)", listener->name, 1); } } } diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 23e77cb771..f67ad29981 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -756,6 +756,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (tcg_enabled()) { newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync; newas->tcg_as_listener.commit = tcg_commit; + newas->tcg_as_listener.name = "tcg"; memory_listener_register(&newas->tcg_as_listener, as); } } @@ -2055,7 +2056,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, int64_t file_size, file_align; /* Just support these ram flags by now. */ - assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE)) == 0); + assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE | + RAM_PROTECTED)) == 0); if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); diff --git a/softmmu/trace-events b/softmmu/trace-events index 7b278590a0..bf1469990e 100644 --- a/softmmu/trace-events +++ b/softmmu/trace-events @@ -15,6 +15,7 @@ memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t va memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u" memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" +memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '%s' listener '%s' synced (global=%d)" flatview_new(void *view, void *root) "%p (root %p)" flatview_destroy(void *view, void *root) "%p (root %p)" flatview_destroy_rcu(void *view, void *root) "%p (root %p)" diff --git a/stubs/qdev.c b/stubs/qdev.c index 92e6143134..187659f707 100644 --- a/stubs/qdev.c 
+++ b/stubs/qdev.c @@ -21,3 +21,10 @@ void qapi_event_send_device_deleted(bool has_device, { /* Nothing to do. */ } + +void qapi_event_send_device_unplug_guest_error(bool has_device, + const char *device, + const char *path) +{ + /* Nothing to do. */ +} diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 94b970bbf9..bbf1ce7ba3 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -335,6 +335,7 @@ static void kvm_arm_devlistener_del(MemoryListener *listener, } static MemoryListener devlistener = { + .name = "kvm-arm", .region_add = kvm_arm_devlistener_add, .region_del = kvm_arm_devlistener_del, }; diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6b029f1bdf..cacec605bf 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -36,6 +36,7 @@ #ifndef CONFIG_USER_ONLY #include "exec/address-spaces.h" #include "hw/boards.h" +#include "hw/i386/sgx-epc.h" #endif #include "disas/capstone.h" @@ -654,6 +655,9 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, /* missing: CPUID_XSAVE_XSAVEC, CPUID_XSAVE_XSAVES */ #define TCG_14_0_ECX_FEATURES 0 +#define TCG_SGX_12_0_EAX_FEATURES 0 +#define TCG_SGX_12_0_EBX_FEATURES 0 +#define TCG_SGX_12_1_EAX_FEATURES 0 FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_1_EDX] = { @@ -795,7 +799,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_7_0_EBX] = { .type = CPUID_FEATURE_WORD, .feat_names = { - "fsgsbase", "tsc-adjust", NULL, "bmi1", + "fsgsbase", "tsc-adjust", "sgx", "bmi1", "hle", "avx2", NULL, "smep", "bmi2", "erms", "invpcid", "rtm", NULL, NULL, "mpx", NULL, @@ -821,7 +825,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { "la57", NULL, NULL, NULL, NULL, NULL, "rdpid", NULL, "bus-lock-detect", "cldemote", NULL, "movdiri", - "movdir64b", NULL, NULL, "pks", + "movdir64b", NULL, "sgxlc", "pks", }, .cpuid = { .eax = 7, @@ -1182,6 +1186,65 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .tcg_features = TCG_14_0_ECX_FEATURES, }, + [FEAT_SGX_12_0_EAX] = { + .type = 
CPUID_FEATURE_WORD, + .feat_names = { + "sgx1", "sgx2", NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, + .cpuid = { + .eax = 0x12, + .needs_ecx = true, .ecx = 0, + .reg = R_EAX, + }, + .tcg_features = TCG_SGX_12_0_EAX_FEATURES, + }, + + [FEAT_SGX_12_0_EBX] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + "sgx-exinfo" , NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, + .cpuid = { + .eax = 0x12, + .needs_ecx = true, .ecx = 0, + .reg = R_EBX, + }, + .tcg_features = TCG_SGX_12_0_EBX_FEATURES, + }, + + [FEAT_SGX_12_1_EAX] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + NULL, "sgx-debug", "sgx-mode64", NULL, + "sgx-provisionkey", "sgx-tokenkey", NULL, "sgx-kss", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, + .cpuid = { + .eax = 0x12, + .needs_ecx = true, .ecx = 1, + .reg = R_EAX, + }, + .tcg_features = TCG_SGX_12_1_EAX_FEATURES, + }, }; typedef struct FeatureMask { @@ -5272,6 +5335,25 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, *ecx |= CPUID_7_0_ECX_OSPKE; } *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */ + + /* + * SGX cannot be emulated in software. If hardware does not + * support enabling SGX and/or SGX flexible launch control, + * then we need to update the VM's CPUID values accordingly. 
+ */ + if ((*ebx & CPUID_7_0_EBX_SGX) && + (!kvm_enabled() || + !(kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7, 0, R_EBX) & + CPUID_7_0_EBX_SGX))) { + *ebx &= ~CPUID_7_0_EBX_SGX; + } + + if ((*ecx & CPUID_7_0_ECX_SGX_LC) && + (!(*ebx & CPUID_7_0_EBX_SGX) || !kvm_enabled() || + !(kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7, 0, R_ECX) & + CPUID_7_0_ECX_SGX_LC))) { + *ecx &= ~CPUID_7_0_ECX_SGX_LC; + } } else if (count == 1) { *eax = env->features[FEAT_7_1_EAX]; *ebx = 0; @@ -5407,6 +5489,66 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } break; } + case 0x12: +#ifndef CONFIG_USER_ONLY + if (!kvm_enabled() || + !(env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_SGX)) { + *eax = *ebx = *ecx = *edx = 0; + break; + } + + /* + * SGX sub-leafs CPUID.0x12.{0x2..N} enumerate EPC sections. Retrieve + * the EPC properties, e.g. confidentiality and integrity, from the + * host's first EPC section, i.e. assume there is one EPC section or + * that all EPC sections have the same security properties. + */ + if (count > 1) { + uint64_t epc_addr, epc_size; + + if (sgx_epc_get_section(count - 2, &epc_addr, &epc_size)) { + *eax = *ebx = *ecx = *edx = 0; + break; + } + host_cpuid(index, 2, eax, ebx, ecx, edx); + *eax = (uint32_t)(epc_addr & 0xfffff000) | 0x1; + *ebx = (uint32_t)(epc_addr >> 32); + *ecx = (uint32_t)(epc_size & 0xfffff000) | (*ecx & 0xf); + *edx = (uint32_t)(epc_size >> 32); + break; + } + + /* + * SGX sub-leafs CPUID.0x12.{0x0,0x1} are heavily dependent on hardware + * and KVM, i.e. QEMU cannot emulate features to override what KVM + * supports. Features can be further restricted by userspace, but not + * made more permissive. 
+ */ + *eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x12, count, R_EAX); + *ebx = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x12, count, R_EBX); + *ecx = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x12, count, R_ECX); + *edx = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x12, count, R_EDX); + + if (count == 0) { + *eax &= env->features[FEAT_SGX_12_0_EAX]; + *ebx &= env->features[FEAT_SGX_12_0_EBX]; + } else { + *eax &= env->features[FEAT_SGX_12_1_EAX]; + *ebx &= 0; /* ebx reserve */ + *ecx &= env->features[FEAT_XSAVE_COMP_LO]; + *edx &= env->features[FEAT_XSAVE_COMP_HI]; + + /* FP and SSE are always allowed regardless of XSAVE/XCR0. */ + *ecx |= XSTATE_FP_MASK | XSTATE_SSE_MASK; + + /* Access to PROVISIONKEY requires additional credentials. */ + if ((*eax & (1U << 4)) && + !kvm_enable_sgx_provisioning(cs->kvm_state)) { + *eax &= ~(1U << 4); + } + } +#endif + break; case 0x14: { /* Intel Processor Trace Enumeration */ *eax = 0; @@ -5638,6 +5780,17 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, } } +static void x86_cpu_set_sgxlepubkeyhash(CPUX86State *env) +{ +#ifndef CONFIG_USER_ONLY + /* Those default values are defined in Skylake HW */ + env->msr_ia32_sgxlepubkeyhash[0] = 0xa6053e051270b7acULL; + env->msr_ia32_sgxlepubkeyhash[1] = 0x6cfbe8ba8b3b413dULL; + env->msr_ia32_sgxlepubkeyhash[2] = 0xc4916d99f2b3735dULL; + env->msr_ia32_sgxlepubkeyhash[3] = 0xd4f8c05909f9bb3bULL; +#endif +} + static void x86_cpu_reset(DeviceState *dev) { CPUState *s = CPU(dev); @@ -5770,6 +5923,8 @@ static void x86_cpu_reset(DeviceState *dev) if (kvm_enabled()) { kvm_arch_reset_vcpu(cpu); } + + x86_cpu_set_sgxlepubkeyhash(env); #endif } @@ -5999,6 +6154,11 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) if (sev_enabled()) { x86_cpu_adjust_level(cpu, &env->cpuid_min_xlevel, 0x8000001F); } + + /* SGX requires CPUID[0x12] for EPC enumeration */ + if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_SGX) { + x86_cpu_adjust_level(cpu, 
&env->cpuid_min_level, 0x12); + } } /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */ @@ -6152,6 +6312,8 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) & CPUID_EXT2_AMD_ALIASES); } + x86_cpu_set_sgxlepubkeyhash(env); + /* * note: the call to the framework needs to happen after feature expansion, * but before the checks/modifications to ucode_rev, mwait, phys_bits. @@ -6839,7 +7001,6 @@ static const TypeInfo x86_cpu_type_info = { .class_init = x86_cpu_common_class_init, }; - /* "base" CPU model, used by query-cpu-model-expansion */ static void x86_cpu_base_class_init(ObjectClass *oc, void *data) { diff --git a/target/i386/cpu.h b/target/i386/cpu.h index c2954c71ea..29552dc2a7 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -389,9 +389,17 @@ typedef enum X86Seg { #define MSR_IA32_PKRS 0x6e1 #define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1ULL << 1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) +#define FEATURE_CONTROL_SGX_LC (1ULL << 17) +#define FEATURE_CONTROL_SGX (1ULL << 18) #define FEATURE_CONTROL_LMCE (1<<20) +#define MSR_IA32_SGXLEPUBKEYHASH0 0x8c +#define MSR_IA32_SGXLEPUBKEYHASH1 0x8d +#define MSR_IA32_SGXLEPUBKEYHASH2 0x8e +#define MSR_IA32_SGXLEPUBKEYHASH3 0x8f + #define MSR_P6_PERFCTR0 0xc1 #define MSR_IA32_SMBASE 0x9e @@ -570,6 +578,9 @@ typedef enum FeatureWord { FEAT_VMX_BASIC, FEAT_VMX_VMFUNC, FEAT_14_0_ECX, + FEAT_SGX_12_0_EAX, /* CPUID[EAX=0x12,ECX=0].EAX (SGX) */ + FEAT_SGX_12_0_EBX, /* CPUID[EAX=0x12,ECX=0].EBX (SGX MISCSELECT[31:0]) */ + FEAT_SGX_12_1_EAX, /* CPUID[EAX=0x12,ECX=1].EAX (SGX ATTRIBUTES[31:0]) */ FEATURE_WORDS, } FeatureWord; @@ -718,6 +729,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; /* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ #define CPUID_7_0_EBX_FSGSBASE (1U << 0) +/* Support SGX */ +#define CPUID_7_0_EBX_SGX (1U << 2) /* 1st Group of Advanced Bit Manipulation Extensions */ #define CPUID_7_0_EBX_BMI1 (1U << 3) 
/* Hardware Lock Elision */ @@ -805,6 +818,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_ECX_MOVDIRI (1U << 27) /* Move 64 Bytes as Direct Store Instruction */ #define CPUID_7_0_ECX_MOVDIR64B (1U << 28) +/* Support SGX Launch Control */ +#define CPUID_7_0_ECX_SGX_LC (1U << 30) /* Protection Keys for Supervisor-mode Pages */ #define CPUID_7_0_ECX_PKS (1U << 31) @@ -1501,6 +1516,7 @@ typedef struct CPUX86State { uint64_t mcg_status; uint64_t msr_ia32_misc_enable; uint64_t msr_ia32_feature_control; + uint64_t msr_ia32_sgxlepubkeyhash[4]; uint64_t msr_fixed_ctr_ctrl; uint64_t msr_global_ctrl; diff --git a/target/i386/hax/hax-mem.c b/target/i386/hax/hax-mem.c index 8d44edbffd..a226d174d8 100644 --- a/target/i386/hax/hax-mem.c +++ b/target/i386/hax/hax-mem.c @@ -285,6 +285,7 @@ static void hax_log_sync(MemoryListener *listener, } static MemoryListener hax_memory_listener = { + .name = "hax", .begin = hax_transaction_begin, .commit = hax_transaction_commit, .region_add = hax_region_add, diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 500d2e0e68..7f1b060e6d 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1703,6 +1703,25 @@ int kvm_arch_init_vcpu(CPUState *cs) } break; case 0x7: + case 0x12: + for (j = 0; ; j++) { + c->function = i; + c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + c->index = j; + cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx); + + if (j > 1 && (c->eax & 0xf) != 1) { + break; + } + + if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { + fprintf(stderr, "cpuid_data is full, no space for " + "cpuid(eax:0x12,ecx:0x%x)\n", j); + abort(); + } + c = &cpuid_data.entries[cpuid_i++]; + } + break; case 0x14: { uint32_t times; @@ -1877,6 +1896,11 @@ int kvm_arch_init_vcpu(CPUState *cs) !!(c->ecx & CPUID_EXT_SMX); } + c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0); + if (c && (c->ebx & CPUID_7_0_EBX_SGX)) { + has_msr_feature_control = true; + } + if (env->mcg_cap & MCG_LMCE_P) { has_msr_mcg_ext_ctl = 
has_msr_feature_control = true; } @@ -2224,7 +2248,7 @@ static void register_smram_listener(Notifier *n, void *unused) address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM"); kvm_memory_listener_register(kvm_state, &smram_listener, - &smram_address_space, 1); + &smram_address_space, 1, "kvm-smram"); } int kvm_arch_init(MachineState *ms, KVMState *s) @@ -3107,6 +3131,17 @@ static int kvm_put_msrs(X86CPU *cpu, int level) } } + if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) { + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, + env->msr_ia32_sgxlepubkeyhash[0]); + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, + env->msr_ia32_sgxlepubkeyhash[1]); + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, + env->msr_ia32_sgxlepubkeyhash[2]); + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, + env->msr_ia32_sgxlepubkeyhash[3]); + } + /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see * kvm_put_msr_feature_control. */ } @@ -3446,6 +3481,13 @@ static int kvm_get_msrs(X86CPU *cpu) } } + if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) { + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0); + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0); + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0); + kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0); + } + ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf); if (ret < 0) { return ret; @@ -3735,6 +3777,10 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... 
MSR_IA32_SGXLEPUBKEYHASH3: + env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] = + msrs[i].data; + break; } } @@ -4617,6 +4663,35 @@ void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) } } +static bool has_sgx_provisioning; + +static bool __kvm_enable_sgx_provisioning(KVMState *s) +{ + int fd, ret; + + if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) { + return false; + } + + fd = qemu_open_old("/dev/sgx_provision", O_RDONLY); + if (fd < 0) { + return false; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd); + if (ret) { + error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret)); + exit(1); + } + close(fd); + return true; +} + +bool kvm_enable_sgx_provisioning(KVMState *s) +{ + return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning); +} + static bool host_supports_vmx(void) { uint32_t ecx, unused; diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h index 54667b35f0..a978509d50 100644 --- a/target/i386/kvm/kvm_i386.h +++ b/target/i386/kvm/kvm_i386.h @@ -51,4 +51,6 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp); uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address); +bool kvm_enable_sgx_provisioning(KVMState *s); + #endif diff --git a/target/i386/machine.c b/target/i386/machine.c index b0943118d1..4367931623 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -1415,6 +1415,25 @@ static const VMStateDescription vmstate_msr_tsx_ctrl = { } }; +static bool intel_sgx_msrs_needed(void *opaque) +{ + X86CPU *cpu = opaque; + CPUX86State *env = &cpu->env; + + return !!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC); +} + +static const VMStateDescription vmstate_msr_intel_sgx = { + .name = "cpu/intel_sgx", + .version_id = 1, + .minimum_version_id = 1, + .needed = intel_sgx_msrs_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT64_ARRAY(env.msr_ia32_sgxlepubkeyhash, X86CPU, 4), + VMSTATE_END_OF_LIST() + } +}; + const VMStateDescription 
vmstate_x86_cpu = { .name = "cpu", .version_id = 12, @@ -1551,6 +1570,7 @@ const VMStateDescription vmstate_x86_cpu = { &vmstate_nested_state, #endif &vmstate_msr_tsx_ctrl, + &vmstate_msr_intel_sgx, NULL } }; diff --git a/target/i386/monitor.c b/target/i386/monitor.c index 119211f0b0..196c1c9e77 100644 --- a/target/i386/monitor.c +++ b/target/i386/monitor.c @@ -35,6 +35,7 @@ #include "qapi/qapi-commands-misc-target.h" #include "qapi/qapi-commands-misc.h" #include "hw/i386/pc.h" +#include "hw/i386/sgx.h" /* Perform linear address sign extension */ static hwaddr addr_canonical(CPUArchState *env, hwaddr addr) @@ -763,3 +764,34 @@ qmp_query_sev_attestation_report(const char *mnonce, Error **errp) { return sev_get_attestation_report(mnonce, errp); } + +SGXInfo *qmp_query_sgx(Error **errp) +{ + return sgx_get_info(errp); +} + +void hmp_info_sgx(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + g_autoptr(SGXInfo) info = qmp_query_sgx(&err); + + if (err) { + error_report_err(err); + return; + } + monitor_printf(mon, "SGX support: %s\n", + info->sgx ? "enabled" : "disabled"); + monitor_printf(mon, "SGX1 support: %s\n", + info->sgx1 ? "enabled" : "disabled"); + monitor_printf(mon, "SGX2 support: %s\n", + info->sgx2 ? "enabled" : "disabled"); + monitor_printf(mon, "FLC support: %s\n", + info->flc ? 
"enabled" : "disabled"); + monitor_printf(mon, "size: %" PRIu64 "\n", + info->section_size); +} + +SGXInfo *qmp_query_sgx_capabilities(Error **errp) +{ + return sgx_get_capabilities(errp); +} diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c index a488b00e90..14c996f968 100644 --- a/target/i386/nvmm/nvmm-all.c +++ b/target/i386/nvmm/nvmm-all.c @@ -1123,6 +1123,7 @@ nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section) } static MemoryListener nvmm_memory_listener = { + .name = "nvmm", .begin = nvmm_transaction_begin, .commit = nvmm_transaction_commit, .region_add = nvmm_region_add, diff --git a/target/i386/sev.c b/target/i386/sev.c index 0b2c8f594a..fa7210473a 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -565,7 +565,7 @@ static int sev_read_file_base64(const char *filename, guchar **data, gsize *len) { gsize sz; - gchar *base64; + g_autofree gchar *base64 = NULL; GError *error = NULL; if (!g_file_get_contents(filename, &base64, &sz, &error)) { diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c index 3e925b9da7..ef896da0a2 100644 --- a/target/i386/whpx/whpx-all.c +++ b/target/i386/whpx/whpx-all.c @@ -1598,6 +1598,7 @@ static void whpx_log_sync(MemoryListener *listener, } static MemoryListener whpx_memory_listener = { + .name = "whpx", .begin = whpx_transaction_begin, .commit = whpx_transaction_commit, .region_add = whpx_region_add, diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 01d3773bc7..baa4e7c34d 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -600,6 +600,7 @@ enum { HFLAGS_64 = 2, /* computed from MSR_CE and MSR_SF */ HFLAGS_GTSE = 3, /* computed from SPR_LPCR[GTSE] */ HFLAGS_DR = 4, /* MSR_DR */ + HFLAGS_HR = 5, /* computed from SPR_LPCR[HR] */ HFLAGS_SPE = 6, /* from MSR_SPE if cpu has SPE; avoid overlap w/ MSR_VR */ HFLAGS_TM = 8, /* computed from MSR_TM */ HFLAGS_BE = 9, /* MSR_BE -- from elsewhere on embedded ppc */ diff --git a/target/ppc/excp_helper.c 
b/target/ppc/excp_helper.c index d7e32ee107..b7d1767920 100644 --- a/target/ppc/excp_helper.c +++ b/target/ppc/excp_helper.c @@ -23,20 +23,14 @@ #include "internal.h" #include "helper_regs.h" +#include "trace.h" + #ifdef CONFIG_TCG #include "exec/helper-proto.h" #include "exec/cpu_ldst.h" #endif -/* #define DEBUG_OP */ /* #define DEBUG_SOFTWARE_TLB */ -/* #define DEBUG_EXCEPTIONS */ - -#ifdef DEBUG_EXCEPTIONS -# define LOG_EXCP(...) qemu_log(__VA_ARGS__) -#else -# define LOG_EXCP(...) do { } while (0) -#endif /*****************************************************************************/ /* Exception processing */ @@ -414,12 +408,10 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp) } break; case POWERPC_EXCP_DSI: /* Data storage exception */ - LOG_EXCP("DSI exception: DSISR=" TARGET_FMT_lx" DAR=" TARGET_FMT_lx - "\n", env->spr[SPR_DSISR], env->spr[SPR_DAR]); + trace_ppc_excp_dsi(env->spr[SPR_DSISR], env->spr[SPR_DAR]); break; case POWERPC_EXCP_ISI: /* Instruction storage exception */ - LOG_EXCP("ISI exception: msr=" TARGET_FMT_lx ", nip=" TARGET_FMT_lx - "\n", msr, env->nip); + trace_ppc_excp_isi(msr, env->nip); msr |= env->error_code; break; case POWERPC_EXCP_EXTERNAL: /* External input */ @@ -474,7 +466,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp) switch (env->error_code & ~0xF) { case POWERPC_EXCP_FP: if ((msr_fe0 == 0 && msr_fe1 == 0) || msr_fp == 0) { - LOG_EXCP("Ignore floating point exception\n"); + trace_ppc_excp_fp_ignore(); cs->exception_index = POWERPC_EXCP_NONE; env->error_code = 0; return; @@ -489,7 +481,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp) env->spr[SPR_BOOKE_ESR] = ESR_FP; break; case POWERPC_EXCP_INVAL: - LOG_EXCP("Invalid instruction at " TARGET_FMT_lx "\n", env->nip); + trace_ppc_excp_inval(env->nip); msr |= 0x00080000; env->spr[SPR_BOOKE_ESR] = ESR_PIL; break; @@ -547,10 +539,10 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp) break; case POWERPC_EXCP_FIT: /* Fixed-interval timer interrupt */ /* FIT on 4xx */ - LOG_EXCP("FIT exception\n"); + trace_ppc_excp_print("FIT"); break; case POWERPC_EXCP_WDT: /* Watchdog timer interrupt */ - LOG_EXCP("WDT exception\n"); + trace_ppc_excp_print("WDT"); switch (excp_model) { case POWERPC_EXCP_BOOKE: srr0 = SPR_BOOKE_CSRR0; @@ -657,7 +649,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int excp_model, int excp) #endif break; case POWERPC_EXCP_PIT: /* Programmable interval timer interrupt */ - LOG_EXCP("PIT exception\n"); + trace_ppc_excp_print("PIT"); break; case POWERPC_EXCP_IO: /* IO error exception */ /* XXX: TODO */ @@ -1115,14 +1107,6 @@ bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request) #endif /* !CONFIG_USER_ONLY */ -#if defined(DEBUG_OP) -static void cpu_dump_rfi(target_ulong RA, target_ulong msr) -{ - qemu_log("Return from exception at " TARGET_FMT_lx " with flags " - TARGET_FMT_lx "\n", RA, msr); -} -#endif - /*****************************************************************************/ /* Exceptions processing helpers */ @@ -1221,9 +1205,7 @@ static inline void do_rfi(CPUPPCState *env, target_ulong nip, target_ulong msr) /* XXX: beware: this is false if VLE is supported */ env->nip = nip & ~((target_ulong)0x00000003); hreg_store_msr(env, msr, 1); -#if defined(DEBUG_OP) - cpu_dump_rfi(env->nip, env->msr); -#endif + trace_ppc_excp_rfi(env->nip, env->msr); /* * No need to raise an exception here, as rfi is always the last * insn of a TB diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c index 405450d863..1bfb480ecf 100644 --- a/target/ppc/helper_regs.c +++ b/target/ppc/helper_regs.c @@ -106,6 +106,9 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState *env) if (env->spr[SPR_LPCR] & LPCR_GTSE) { hflags |= 1 << HFLAGS_GTSE; } + if (env->spr[SPR_LPCR] & LPCR_HR) { + hflags |= 1 << HFLAGS_HR; + } #ifndef CONFIG_USER_ONLY if (!env->has_hv_mode || (msr & (1ull << MSR_HV))) { diff --git 
a/target/ppc/int_helper.c b/target/ppc/int_helper.c index c2d3248d1e..f5dac3aa87 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -2480,10 +2480,26 @@ uint32_t helper_bcdctz(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps) return cr; } +/** + * Compare 2 128-bit unsigned integers, passed in as unsigned 64-bit pairs + * + * Returns: + * > 0 if ahi|alo > bhi|blo, + * 0 if ahi|alo == bhi|blo, + * < 0 if ahi|alo < bhi|blo + */ +static inline int ucmp128(uint64_t alo, uint64_t ahi, + uint64_t blo, uint64_t bhi) +{ + return (ahi == bhi) ? + (alo > blo ? 1 : (alo == blo ? 0 : -1)) : + (ahi > bhi ? 1 : -1); +} + uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps) { int i; - int cr = 0; + int cr; uint64_t lo_value; uint64_t hi_value; ppc_avr_t ret = { .u64 = { 0, 0 } }; @@ -2492,28 +2508,47 @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps) lo_value = -b->VsrSD(1); hi_value = ~b->VsrD(0) + !lo_value; bcd_put_digit(&ret, 0xD, 0); + + cr = CRF_LT; } else { lo_value = b->VsrD(1); hi_value = b->VsrD(0); bcd_put_digit(&ret, bcd_preferred_sgn(0, ps), 0); - } - if (divu128(&lo_value, &hi_value, 1000000000000000ULL) || - lo_value > 9999999999999999ULL) { - cr = CRF_SO; + if (hi_value == 0 && lo_value == 0) { + cr = CRF_EQ; + } else { + cr = CRF_GT; + } } - for (i = 1; i < 16; hi_value /= 10, i++) { - bcd_put_digit(&ret, hi_value % 10, i); - } + /* + * Check src limits: abs(src) <= 10^31 - 1 + * + * 10^31 - 1 = 0x0000007e37be2022 c0914b267fffffff + */ + if (ucmp128(lo_value, hi_value, + 0xc0914b267fffffffULL, 0x7e37be2022ULL) > 0) { + cr |= CRF_SO; - for (; i < 32; lo_value /= 10, i++) { - bcd_put_digit(&ret, lo_value % 10, i); - } + /* + * According to the ISA, if src wouldn't fit in the destination + * register, the result is undefined. + * In that case, we leave r unchanged. 
+ */ + } else { + divu128(&lo_value, &hi_value, 1000000000000000ULL); - cr |= bcd_cmp_zero(&ret); + for (i = 1; i < 16; hi_value /= 10, i++) { + bcd_put_digit(&ret, hi_value % 10, i); + } - *r = ret; + for (; i < 32; lo_value /= 10, i++) { + bcd_put_digit(&ret, lo_value % 10, i); + } + + *r = ret; + } return cr; } diff --git a/target/ppc/trace-events b/target/ppc/trace-events index c88cfccf8d..53b107f56e 100644 --- a/target/ppc/trace-events +++ b/target/ppc/trace-events @@ -28,3 +28,11 @@ kvm_handle_epr(void) "handle epr" kvm_handle_watchdog_expiry(void) "handle watchdog expiry" kvm_handle_debug_exception(void) "handle debug exception" kvm_handle_nmi_exception(void) "handle NMI exception" + +# excp_helper.c +ppc_excp_rfi(uint64_t nip, uint64_t msr) "Return from exception at 0x%" PRIx64 " with flags 0x%016" PRIx64 +ppc_excp_dsi(uint64_t dsisr, uint64_t dar) "DSI exception: DSISR=0x%" PRIx64 " DAR=0x%" PRIx64 +ppc_excp_isi(uint64_t msr, uint64_t nip) "ISI exception: msr=0x%016" PRIx64 " nip=0x%" PRIx64 +ppc_excp_fp_ignore(void) "Ignore floating point exception" +ppc_excp_inval(uint64_t nip) "Invalid instruction at 0x%" PRIx64 +ppc_excp_print(const char *excp) "%s exception" diff --git a/target/ppc/translate.c b/target/ppc/translate.c index 5d8b06bd80..b985e9e55b 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -175,6 +175,7 @@ struct DisasContext { bool spe_enabled; bool tm_enabled; bool gtse; + bool hr; ppc_spr_t *spr_cb; /* Needed to check rights for mfspr/mtspr */ int singlestep_enabled; uint32_t flags; @@ -5516,7 +5517,15 @@ static void gen_tlbiel(DisasContext *ctx) #if defined(CONFIG_USER_ONLY) GEN_PRIV; #else - CHK_SV; + bool psr = (ctx->opcode >> 17) & 0x1; + + if (ctx->pr || (!ctx->hv && !psr && ctx->hr)) { + /* + * tlbiel is privileged except when PSR=0 and HR=1, making it + * hypervisor privileged. 
+ */ + GEN_PRIV; + } gen_helper_tlbie(cpu_env, cpu_gpr[rB(ctx->opcode)]); #endif /* defined(CONFIG_USER_ONLY) */ @@ -5528,12 +5537,20 @@ static void gen_tlbie(DisasContext *ctx) #if defined(CONFIG_USER_ONLY) GEN_PRIV; #else + bool psr = (ctx->opcode >> 17) & 0x1; TCGv_i32 t1; - if (ctx->gtse) { - CHK_SV; /* If gtse is set then tlbie is supervisor privileged */ - } else { - CHK_HV; /* Else hypervisor privileged */ + if (ctx->pr) { + /* tlbie is privileged... */ + GEN_PRIV; + } else if (!ctx->hv) { + if (!ctx->gtse || (!psr && ctx->hr)) { + /* + * ... except when GTSE=0 or when PSR=0 and HR=1, making it + * hypervisor privileged. + */ + GEN_PRIV; + } } if (NARROW_MODE(ctx)) { @@ -8539,6 +8556,7 @@ static void ppc_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs) ctx->vsx_enabled = (hflags >> HFLAGS_VSX) & 1; ctx->tm_enabled = (hflags >> HFLAGS_TM) & 1; ctx->gtse = (hflags >> HFLAGS_GTSE) & 1; + ctx->hr = (hflags >> HFLAGS_HR) & 1; ctx->singlestep_enabled = 0; if ((hflags >> HFLAGS_SE) & 1) { diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build index 757bb8499a..19444d4752 100644 --- a/tests/qtest/meson.build +++ b/tests/qtest/meson.build @@ -68,12 +68,12 @@ qtests_i386 = \ (config_all_devices.has_key('CONFIG_RTL8139_PCI') ? ['rtl8139-test'] : []) + \ (config_all_devices.has_key('CONFIG_E1000E_PCI_EXPRESS') ? ['fuzz-e1000e-test'] : []) + \ (config_all_devices.has_key('CONFIG_ESP_PCI') ? ['am53c974-test'] : []) + \ + (unpack_edk2_blobs ? ['bios-tables-test'] : []) + \ qtests_pci + \ ['fdc-test', 'ide-test', 'hd-geo-test', 'boot-order-test', - 'bios-tables-test', 'rtc-test', 'i440fx-test', 'fw_cfg-test', @@ -180,7 +180,7 @@ qtests_arm = \ # TODO: once aarch64 TCG is fixed on ARM 32 bit host, make bios-tables-test unconditional qtests_aarch64 = \ - (cpu != 'arm' ? ['bios-tables-test'] : []) + \ + (cpu != 'arm' and unpack_edk2_blobs ? ['bios-tables-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? 
['tpm-tis-device-test'] : []) + \ (config_all_devices.has_key('CONFIG_TPM_TIS_SYSBUS') ? ['tpm-tis-device-swtpm-test'] : []) + \ ['arm-cpu-features', @@ -269,7 +269,7 @@ foreach dir : target_dirs qtest_emulator = emulators['qemu-system-' + target_base] target_qtests = get_variable('qtests_' + target_base, []) + qtests_generic - test_deps = [] + test_deps = roms qtest_env = environment() if have_tools qtest_env.set('QTEST_QEMU_IMG', './qemu-img') diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c index c98b78d033..1af2f74c28 100644 --- a/tests/qtest/qmp-cmd-test.c +++ b/tests/qtest/qmp-cmd-test.c @@ -100,6 +100,8 @@ static bool query_is_ignored(const char *cmd) /* Success depends on Host or Hypervisor SEV support */ "query-sev", "query-sev-capabilities", + "query-sgx", + "query-sgx-capabilities", NULL }; int i; diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c index ce071b5fc5..2d3c17e566 100644 --- a/tests/unit/test-bdrv-drain.c +++ b/tests/unit/test-bdrv-drain.c @@ -65,8 +65,9 @@ static void co_reenter_bh(void *opaque) } static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVTestState *s = bs->opaque; @@ -1106,8 +1107,9 @@ static void bdrv_test_top_close(BlockDriverState *bs) } static int coroutine_fn bdrv_test_top_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { BDRVTestTopState *tts = bs->opaque; return bdrv_co_preadv(tts->wait_child, offset, bytes, qiov, flags); @@ -1855,10 +1857,10 @@ static void bdrv_replace_test_close(BlockDriverState *bs) * Set .has_read to true and return success. 
*/ static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs, - uint64_t offset, - uint64_t bytes, + int64_t offset, + int64_t bytes, QEMUIOVector *qiov, - int flags) + BdrvRequestFlags flags) { BDRVReplaceTestState *s = bs->opaque; diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c index c39e70b2f5..aea660aeed 100644 --- a/tests/unit/test-block-iothread.c +++ b/tests/unit/test-block-iothread.c @@ -31,15 +31,24 @@ #include "qemu/main-loop.h" #include "iothread.h" -static int coroutine_fn bdrv_test_co_prwv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn bdrv_test_co_preadv(BlockDriverState *bs, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + return 0; +} + +static int coroutine_fn bdrv_test_co_pwritev(BlockDriverState *bs, + int64_t offset, int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { return 0; } static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs, - int64_t offset, int bytes) + int64_t offset, int64_t bytes) { return 0; } @@ -66,8 +75,8 @@ static BlockDriver bdrv_test = { .format_name = "test", .instance_size = 1, - .bdrv_co_preadv = bdrv_test_co_prwv, - .bdrv_co_pwritev = bdrv_test_co_prwv, + .bdrv_co_preadv = bdrv_test_co_preadv, + .bdrv_co_pwritev = bdrv_test_co_pwritev, .bdrv_co_pdiscard = bdrv_test_co_pdiscard, .bdrv_co_truncate = bdrv_test_co_truncate, .bdrv_co_block_status = bdrv_test_co_block_status, diff --git a/util/host-utils.c b/util/host-utils.c index 7b9322071d..a789a11b46 100644 --- a/util/host-utils.c +++ b/util/host-utils.c @@ -102,7 +102,7 @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor) *plow = dlo / divisor; *phigh = dlo % divisor; return 0; - } else if (dhi > divisor) { + } else if (dhi >= divisor) { return 1; } else { diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index 72216ef980..0585e7a629 100644 --- a/util/qemu-sockets.c +++ 
b/util/qemu-sockets.c @@ -278,7 +278,7 @@ static int inet_listen_saddr(InetSocketAddress *saddr, /* create socket + bind/listen */ for (e = res; e != NULL; e = e->ai_next) { -#ifdef IPPROTO_MPTCP +#ifdef HAVE_IPPROTO_MPTCP if (saddr->has_mptcp && saddr->mptcp) { e->ai_protocol = IPPROTO_MPTCP; } @@ -462,7 +462,7 @@ int inet_connect_saddr(InetSocketAddress *saddr, Error **errp) error_free(local_err); local_err = NULL; -#ifdef IPPROTO_MPTCP +#ifdef HAVE_IPPROTO_MPTCP if (saddr->has_mptcp && saddr->mptcp) { e->ai_protocol = IPPROTO_MPTCP; } @@ -699,7 +699,7 @@ int inet_parse(InetSocketAddress *addr, const char *str, Error **errp) } addr->has_keep_alive = true; } -#ifdef IPPROTO_MPTCP +#ifdef HAVE_IPPROTO_MPTCP begin = strstr(optstr, ",mptcp"); if (begin) { if (inet_parse_flag("mptcp", begin + strlen(",mptcp"), |