-rw-r--r--  .github/lockdown.yml | 8
-rw-r--r--  .gitlab-ci.yml | 1
-rw-r--r--  .gitmodules | 44
-rw-r--r--  Kconfig.host | 4
-rw-r--r--  MAINTAINERS | 27
-rw-r--r--  Makefile | 2
-rw-r--r--  README.rst | 4
-rw-r--r--  accel/kvm/kvm-all.c | 59
-rw-r--r--  accel/kvm/sev-stub.c | 10
-rw-r--r--  accel/stubs/kvm-stub.c | 10
-rw-r--r--  accel/tcg/cpu-exec.c | 25
-rw-r--r--  accel/tcg/tcg-accel-ops-icount.c | 8
-rw-r--r--  backends/confidential-guest-support.c | 33
-rw-r--r--  backends/hostmem-memfd.c | 2
-rw-r--r--  backends/hostmem.c | 10
-rw-r--r--  backends/meson.build | 1
-rw-r--r--  backends/rng-builtin.c | 3
-rw-r--r--  block/backup.c | 10
-rw-r--r--  block/io.c | 11
-rw-r--r--  block/mirror.c | 9
-rw-r--r--  block/monitor/block-hmp-cmds.c | 7
-rw-r--r--  block/nbd.c | 15
-rw-r--r--  block/raw-format.c | 6
-rw-r--r--  block/snapshot.c | 256
-rw-r--r--  blockdev-nbd.c | 7
-rwxr-xr-x  configure | 18
-rw-r--r--  docs/amd-memory-encryption.txt | 2
-rw-r--r--  docs/confidential-guest-support.txt | 49
-rw-r--r--  docs/devel/build-system.rst | 2
-rw-r--r--  docs/devel/fuzzing.rst | 35
-rw-r--r--  docs/devel/index.rst | 1
-rw-r--r--  docs/devel/multi-process.rst | 966
-rw-r--r--  docs/devel/testing.rst | 3
-rw-r--r--  docs/interop/parallels.txt | 2
-rw-r--r--  docs/meson.build | 6
-rw-r--r--  docs/papr-pef.txt | 30
-rw-r--r--  docs/system/arm/versatile.rst | 34
-rw-r--r--  docs/system/arm/vexpress.rst | 28
-rw-r--r--  docs/system/index.rst | 1
-rw-r--r--  docs/system/multi-process.rst | 64
-rw-r--r--  docs/system/s390x/protvirt.rst | 19
-rw-r--r--  docs/tools/qemu-nbd.rst | 4
-rw-r--r--  gdbstub.c | 17
-rw-r--r--  hw/Kconfig | 1
-rw-r--r--  hw/arm/aspeed_ast2600.c | 2
-rw-r--r--  hw/arm/musca.c | 4
-rw-r--r--  hw/arm/npcm7xx.c | 8
-rw-r--r--  hw/arm/xlnx-versal.c | 4
-rw-r--r--  hw/block/nvme-ns.c | 291
-rw-r--r--  hw/block/nvme-ns.h | 112
-rw-r--r--  hw/block/nvme.c | 2292
-rw-r--r--  hw/block/nvme.h | 31
-rw-r--r--  hw/block/trace-events | 50
-rw-r--r--  hw/core/machine.c | 63
-rw-r--r--  hw/i386/pc_sysfw.c | 17
-rw-r--r--  hw/intc/pnv_xive.c | 3
-rw-r--r--  hw/intc/trace-events | 3
-rw-r--r--  hw/intc/xive.c | 3
-rw-r--r--  hw/meson.build | 1
-rw-r--r--  hw/misc/arm_integrator_debug.c | 2
-rw-r--r--  hw/misc/imx7_ccm.c | 8
-rw-r--r--  hw/misc/ivshmem.c | 3
-rw-r--r--  hw/misc/tz-ppc.c | 14
-rw-r--r--  hw/net/fsl_etsec/rings.c | 2
-rw-r--r--  hw/nvram/nrf51_nvm.c | 10
-rw-r--r--  hw/pci-host/Kconfig | 3
-rw-r--r--  hw/pci-host/designware.c | 19
-rw-r--r--  hw/pci-host/meson.build | 1
-rw-r--r--  hw/pci-host/pnv_phb4.c | 3
-rw-r--r--  hw/pci-host/prep.c | 8
-rw-r--r--  hw/pci-host/remote.c | 75
-rw-r--r--  hw/pci-host/trace-events | 3
-rw-r--r--  hw/ppc/e500.c | 10
-rw-r--r--  hw/ppc/meson.build | 1
-rw-r--r--  hw/ppc/pef.c | 140
-rw-r--r--  hw/ppc/pnv.c | 27
-rw-r--r--  hw/ppc/pnv_bmc.c | 22
-rw-r--r--  hw/ppc/pnv_lpc.c | 15
-rw-r--r--  hw/ppc/prep_systemio.c | 8
-rw-r--r--  hw/ppc/spapr.c | 52
-rw-r--r--  hw/ppc/spapr_numa.c | 27
-rw-r--r--  hw/ppc/spapr_pci.c | 58
-rw-r--r--  hw/remote/Kconfig | 4
-rw-r--r--  hw/remote/iohub.c | 119
-rw-r--r--  hw/remote/machine.c | 80
-rw-r--r--  hw/remote/memory.c | 65
-rw-r--r--  hw/remote/meson.build | 13
-rw-r--r--  hw/remote/message.c | 230
-rw-r--r--  hw/remote/mpqemu-link.c | 267
-rw-r--r--  hw/remote/proxy-memory-listener.c | 227
-rw-r--r--  hw/remote/proxy.c | 379
-rw-r--r--  hw/remote/remote-obj.c | 203
-rw-r--r--  hw/remote/trace-events | 4
-rw-r--r--  hw/remote/trace.h | 1
-rw-r--r--  hw/s390x/pv.c | 62
-rw-r--r--  hw/s390x/s390-virtio-ccw.c | 3
-rw-r--r--  hw/scsi/virtio-scsi-dataplane.c | 8
-rw-r--r--  hw/timer/arm_timer.c | 7
-rw-r--r--  hw/vfio/pci-quirks.c | 8
-rw-r--r--  include/block/block.h | 3
-rw-r--r--  include/block/block_int.h | 9
-rw-r--r--  include/block/nvme.h | 342
-rw-r--r--  include/block/snapshot.h | 23
-rw-r--r--  include/exec/confidential-guest-support.h | 62
-rw-r--r--  include/exec/memory.h | 18
-rw-r--r--  include/exec/memory_ldst_cached.h.inc | 6
-rw-r--r--  include/exec/ram_addr.h | 4
-rw-r--r--  include/hw/boards.h | 2
-rw-r--r--  include/hw/dma/pl080.h | 7
-rw-r--r--  include/hw/misc/arm_integrator_debug.h | 2
-rw-r--r--  include/hw/pci-host/remote.h | 30
-rw-r--r--  include/hw/pci-host/spapr.h | 2
-rw-r--r--  include/hw/pci/pci_ids.h | 3
-rw-r--r--  include/hw/ppc/pef.h | 17
-rw-r--r--  include/hw/ppc/pnv.h | 1
-rw-r--r--  include/hw/ppc/spapr.h | 1
-rw-r--r--  include/hw/ppc/spapr_numa.h | 1
-rw-r--r--  include/hw/ppc/xive_regs.h | 2
-rw-r--r--  include/hw/remote/iohub.h | 42
-rw-r--r--  include/hw/remote/machine.h | 38
-rw-r--r--  include/hw/remote/memory.h | 19
-rw-r--r--  include/hw/remote/mpqemu-link.h | 99
-rw-r--r--  include/hw/remote/proxy-memory-listener.h | 28
-rw-r--r--  include/hw/remote/proxy.h | 48
-rw-r--r--  include/hw/s390x/pv.h | 17
-rw-r--r--  include/hw/ssi/pl022.h | 5
-rw-r--r--  include/io/channel.h | 78
-rw-r--r--  include/migration/snapshot.h | 47
-rw-r--r--  include/qemu/event_notifier.h | 1
-rw-r--r--  include/qemu/fifo8.h | 16
-rw-r--r--  include/qemu/job.h | 5
-rw-r--r--  include/qemu/mmap-alloc.h | 4
-rw-r--r--  include/qemu/typedefs.h | 1
-rw-r--r--  include/qemu/userfaultfd.h | 35
-rw-r--r--  include/qom/object.h | 3
-rw-r--r--  include/sysemu/iothread.h | 6
-rw-r--r--  include/sysemu/kvm.h | 16
-rw-r--r--  include/sysemu/sev.h | 4
-rw-r--r--  io/channel.c | 115
-rw-r--r--  iothread.c | 6
-rw-r--r--  job.c | 3
-rw-r--r--  memory_ldst.c.inc | 8
-rw-r--r--  meson.build | 219
-rw-r--r--  meson_options.txt | 2
-rw-r--r--  migration/migration.c | 409
-rw-r--r--  migration/migration.h | 6
-rw-r--r--  migration/page_cache.c | 8
-rw-r--r--  migration/page_cache.h | 2
-rw-r--r--  migration/qemu-file.c | 2
-rw-r--r--  migration/ram.c | 307
-rw-r--r--  migration/ram.h | 8
-rw-r--r--  migration/savevm.c | 341
-rw-r--r--  migration/savevm.h | 3
-rw-r--r--  migration/trace-events | 2
-rw-r--r--  monitor/hmp-cmds.c | 45
-rw-r--r--  pc-bios/README | 4
-rw-r--r--  pc-bios/descriptors/meson.build | 2
-rw-r--r--  pc-bios/meson.build | 1
-rw-r--r--  qapi/job.json | 9
-rw-r--r--  qapi/meson.build | 34
-rw-r--r--  qapi/migration.json | 218
-rw-r--r--  qemu-nbd.c | 14
-rw-r--r--  qemu-options.hx | 26
-rw-r--r--  qom/object.c | 4
-rw-r--r--  replay/replay-debugging.c | 12
-rw-r--r--  replay/replay-snapshot.c | 5
-rwxr-xr-x  scripts/get_maintainer.pl | 2
-rw-r--r--  scripts/mtest2make.py | 1
-rwxr-xr-x  scripts/oss-fuzz/minimize_qtest_trace.py | 2
-rw-r--r--  scripts/qapi/commands.py | 62
-rw-r--r--  scripts/qapi/events.py | 16
-rw-r--r--  scripts/qapi/gen.py | 94
-rw-r--r--  scripts/qapi/main.py | 2
-rw-r--r--  scripts/qapi/mypy.ini | 1
-rw-r--r--  scripts/qapi/schema.py | 42
-rw-r--r--  scripts/qapi/types.py | 4
-rw-r--r--  scripts/qapi/visit.py | 6
-rwxr-xr-x  scripts/userfaultfd-wrlat.py | 122
-rw-r--r--  softmmu/cpu-throttle.c | 11
-rw-r--r--  softmmu/memory.c | 8
-rw-r--r--  softmmu/physmem.c | 16
-rw-r--r--  softmmu/rtc.c | 3
-rw-r--r--  softmmu/vl.c | 29
-rw-r--r--  stubs/meson.build | 2
-rw-r--r--  stubs/qdev.c | 23
-rw-r--r--  target/arm/cpu.c | 4
-rw-r--r--  target/arm/cpu.h | 17
-rw-r--r--  target/arm/cpu64.c | 5
-rw-r--r--  target/arm/helper-a64.c | 27
-rw-r--r--  target/arm/helper.c | 71
-rw-r--r--  target/arm/internals.h | 6
-rw-r--r--  target/arm/machine.c | 2
-rw-r--r--  target/arm/op_helper.c | 9
-rw-r--r--  target/arm/translate-a64.c | 12
-rw-r--r--  target/i386/cpu.c | 19
-rw-r--r--  target/i386/cpu.h | 31
-rw-r--r--  target/i386/helper.c | 3
-rw-r--r--  target/i386/kvm/kvm.c | 33
-rw-r--r--  target/i386/machine.c | 24
-rw-r--r--  target/i386/sev-stub.c | 5
-rw-r--r--  target/i386/sev.c | 95
-rw-r--r--  target/i386/tcg/excp_helper.c | 32
-rw-r--r--  target/i386/tcg/misc_helper.c | 14
-rw-r--r--  target/i386/tcg/translate.c | 2
-rw-r--r--  target/ppc/cpu.h | 9
-rw-r--r--  target/ppc/kvm.c | 18
-rw-r--r--  target/ppc/kvm_ppc.h | 6
-rw-r--r--  target/ppc/translate_init.c.inc | 16
-rw-r--r--  tests/Makefile.include | 12
-rw-r--r--  tests/acceptance/boot_linux.py | 14
-rw-r--r--  tests/acceptance/boot_linux_console.py | 98
-rw-r--r--  tests/acceptance/linux_ssh_mips_malta.py | 2
-rw-r--r--  tests/acceptance/machine_m68k_nextcube.py | 44
-rw-r--r--  tests/acceptance/machine_microblaze.py | 35
-rw-r--r--  tests/acceptance/machine_ppc.py | 69
-rw-r--r--  tests/acceptance/replay_kernel.py | 2
-rw-r--r--  tests/acceptance/tesseract_utils.py | 46
-rw-r--r--  tests/acceptance/virtiofs_submounts.py | 21
-rw-r--r--  tests/docker/Makefile.include | 26
-rwxr-xr-x  tests/docker/docker.py | 23
-rw-r--r--  tests/docker/dockerfiles/empty.docker | 8
-rw-r--r--  tests/meson.build | 11
-rw-r--r--  tests/qapi-schema/comments.out | 2
-rw-r--r--  tests/qapi-schema/doc-good.out | 2
-rw-r--r--  tests/qapi-schema/empty.out | 2
-rw-r--r--  tests/qapi-schema/event-case.out | 2
-rw-r--r--  tests/qapi-schema/include-repetition.out | 2
-rw-r--r--  tests/qapi-schema/include-simple.out | 2
-rw-r--r--  tests/qapi-schema/indented-expr.out | 2
-rw-r--r--  tests/qapi-schema/qapi-schema-test.out | 2
-rw-r--r--  tests/qemu-iotests/210.out | 2
-rwxr-xr-x  tests/qemu-iotests/264 | 140
-rw-r--r--  tests/qemu-iotests/264.out | 20
-rw-r--r--  tests/qemu-iotests/267.out | 12
-rw-r--r--  tests/qemu-iotests/common.qemu | 106
-rw-r--r--  tests/qemu-iotests/common.rc | 10
-rw-r--r--  tests/qemu-iotests/iotests.py | 6
-rw-r--r--  tests/qtest/fuzz/fuzz.c | 11
-rw-r--r--  tests/qtest/fuzz/generic_fuzz.c | 19
-rw-r--r--  tests/qtest/fuzz/generic_fuzz_configs.h | 41
-rw-r--r--  tests/tcg/Makefile.qemu | 4
-rw-r--r--  tests/tcg/multiarch/Makefile.target | 5
-rw-r--r--  util/event_notifier-posix.c | 16
-rw-r--r--  util/fifo8.c | 16
-rw-r--r--  util/meson.build | 1
-rw-r--r--  util/mmap-alloc.c | 8
-rw-r--r--  util/oslib-posix.c | 2
-rw-r--r--  util/trace-events | 9
-rw-r--r--  util/userfaultfd.c | 345
249 files changed, 10441 insertions, 1456 deletions
diff --git a/.github/lockdown.yml b/.github/lockdown.yml
index 9acc393f1c..07fc2f31ee 100644
--- a/.github/lockdown.yml
+++ b/.github/lockdown.yml
@@ -10,8 +10,8 @@ issues:
comment: |
Thank you for your interest in the QEMU project.
- This repository is a read-only mirror of the project's master
- repostories hosted on https://git.qemu.org/git/qemu.git.
+ This repository is a read-only mirror of the project's repositories hosted
+ at https://gitlab.com/qemu-project/qemu.git.
The project does not process issues filed on GitHub.
The project issues are tracked on Launchpad:
@@ -24,8 +24,8 @@ pulls:
comment: |
Thank you for your interest in the QEMU project.
- This repository is a read-only mirror of the project's master
- repostories hosted on https://git.qemu.org/git/qemu.git.
+ This repository is a read-only mirror of the project's repositories hosted
+ on https://gitlab.com/qemu-project/qemu.git.
The project does not process merge requests filed on GitHub.
QEMU welcomes contributions of code (either fixing bugs or adding new
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7c0db64710..28a83afb91 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -18,7 +18,6 @@ include:
image: $CI_REGISTRY_IMAGE/qemu/$IMAGE:latest
before_script:
- JOBS=$(expr $(nproc) + 1)
- - sed -i s,git.qemu.org/git,gitlab.com/qemu-project, .gitmodules
script:
- mkdir build
- cd build
diff --git a/.gitmodules b/.gitmodules
index 2bdeeacef8..08b1b48a09 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,66 +1,66 @@
[submodule "roms/seabios"]
path = roms/seabios
- url = https://git.qemu.org/git/seabios.git/
+ url = https://gitlab.com/qemu-project/seabios.git/
[submodule "roms/SLOF"]
path = roms/SLOF
- url = https://git.qemu.org/git/SLOF.git
+ url = https://gitlab.com/qemu-project/SLOF.git
[submodule "roms/ipxe"]
path = roms/ipxe
- url = https://git.qemu.org/git/ipxe.git
+ url = https://gitlab.com/qemu-project/ipxe.git
[submodule "roms/openbios"]
path = roms/openbios
- url = https://git.qemu.org/git/openbios.git
+ url = https://gitlab.com/qemu-project/openbios.git
[submodule "roms/qemu-palcode"]
path = roms/qemu-palcode
- url = https://git.qemu.org/git/qemu-palcode.git
+ url = https://gitlab.com/qemu-project/qemu-palcode.git
[submodule "roms/sgabios"]
path = roms/sgabios
- url = https://git.qemu.org/git/sgabios.git
+ url = https://gitlab.com/qemu-project/sgabios.git
[submodule "dtc"]
path = dtc
- url = https://git.qemu.org/git/dtc.git
+ url = https://gitlab.com/qemu-project/dtc.git
[submodule "roms/u-boot"]
path = roms/u-boot
- url = https://git.qemu.org/git/u-boot.git
+ url = https://gitlab.com/qemu-project/u-boot.git
[submodule "roms/skiboot"]
path = roms/skiboot
- url = https://git.qemu.org/git/skiboot.git
+ url = https://gitlab.com/qemu-project/skiboot.git
[submodule "roms/QemuMacDrivers"]
path = roms/QemuMacDrivers
- url = https://git.qemu.org/git/QemuMacDrivers.git
+ url = https://gitlab.com/qemu-project/QemuMacDrivers.git
[submodule "ui/keycodemapdb"]
path = ui/keycodemapdb
- url = https://git.qemu.org/git/keycodemapdb.git
+ url = https://gitlab.com/qemu-project/keycodemapdb.git
[submodule "capstone"]
path = capstone
- url = https://git.qemu.org/git/capstone.git
+ url = https://gitlab.com/qemu-project/capstone.git
[submodule "roms/seabios-hppa"]
path = roms/seabios-hppa
- url = https://git.qemu.org/git/seabios-hppa.git
+ url = https://gitlab.com/qemu-project/seabios-hppa.git
[submodule "roms/u-boot-sam460ex"]
path = roms/u-boot-sam460ex
- url = https://git.qemu.org/git/u-boot-sam460ex.git
+ url = https://gitlab.com/qemu-project/u-boot-sam460ex.git
[submodule "tests/fp/berkeley-testfloat-3"]
path = tests/fp/berkeley-testfloat-3
- url = https://git.qemu.org/git/berkeley-testfloat-3.git
+ url = https://gitlab.com/qemu-project/berkeley-testfloat-3.git
[submodule "tests/fp/berkeley-softfloat-3"]
path = tests/fp/berkeley-softfloat-3
- url = https://git.qemu.org/git/berkeley-softfloat-3.git
+ url = https://gitlab.com/qemu-project/berkeley-softfloat-3.git
[submodule "roms/edk2"]
path = roms/edk2
- url = https://git.qemu.org/git/edk2.git
+ url = https://gitlab.com/qemu-project/edk2.git
[submodule "slirp"]
path = slirp
- url = https://git.qemu.org/git/libslirp.git
+ url = https://gitlab.com/qemu-project/libslirp.git
[submodule "roms/opensbi"]
path = roms/opensbi
- url = https://git.qemu.org/git/opensbi.git
+ url = https://gitlab.com/qemu-project/opensbi.git
[submodule "roms/qboot"]
path = roms/qboot
- url = https://git.qemu.org/git/qboot.git
+ url = https://gitlab.com/qemu-project/qboot.git
[submodule "meson"]
path = meson
- url = https://git.qemu.org/git/meson.git
+ url = https://gitlab.com/qemu-project/meson.git
[submodule "roms/vbootrom"]
path = roms/vbootrom
- url = https://git.qemu.org/git/vbootrom.git
+ url = https://gitlab.com/qemu-project/vbootrom.git
diff --git a/Kconfig.host b/Kconfig.host
index a9a55a9c31..24255ef441 100644
--- a/Kconfig.host
+++ b/Kconfig.host
@@ -37,3 +37,7 @@ config VIRTFS
config PVRDMA
bool
+
+config MULTIPROCESS_ALLOWED
+ bool
+ imply MULTIPROCESS
diff --git a/MAINTAINERS b/MAINTAINERS
index 5f2ce9c1d0..de5fe1c65f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -281,6 +281,7 @@ F: target/ppc/
F: hw/ppc/
F: include/hw/ppc/
F: disas/ppc.c
+F: tests/acceptance/machine_ppc.py
RISC-V TCG CPUs
M: Palmer Dabbelt <palmer@dabbelt.com>
@@ -1120,6 +1121,7 @@ M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
S: Maintained
F: hw/microblaze/petalogix_s3adsp1800_mmu.c
F: include/hw/char/xilinx_uartlite.h
+F: tests/acceptance/machine_microblaze.py
petalogix_ml605
M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
@@ -2541,6 +2543,7 @@ F: qapi/qom.json
F: qapi/qdev.json
F: scripts/coccinelle/qom-parent-type.cocci
F: softmmu/qdev-monitor.c
+F: stubs/qdev.c
F: qom/
F: tests/check-qom-interface.c
F: tests/check-qom-proplist.c
@@ -3199,6 +3202,30 @@ S: Maintained
F: hw/semihosting/
F: include/hw/semihosting/
+Multi-process QEMU
+M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
+M: Jagannathan Raman <jag.raman@oracle.com>
+M: John G Johnson <john.g.johnson@oracle.com>
+S: Maintained
+F: docs/devel/multi-process.rst
+F: docs/system/multi-process.rst
+F: hw/pci-host/remote.c
+F: include/hw/pci-host/remote.h
+F: hw/remote/machine.c
+F: include/hw/remote/machine.h
+F: hw/remote/mpqemu-link.c
+F: include/hw/remote/mpqemu-link.h
+F: hw/remote/message.c
+F: hw/remote/remote-obj.c
+F: include/hw/remote/memory.h
+F: hw/remote/memory.c
+F: hw/remote/proxy.c
+F: include/hw/remote/proxy.h
+F: hw/remote/proxy-memory-listener.c
+F: include/hw/remote/proxy-memory-listener.h
+F: hw/remote/iohub.c
+F: include/hw/remote/iohub.h
+
Build and test automation
-------------------------
Build and test automation
diff --git a/Makefile b/Makefile
index b0dff73904..d7fb6b270e 100644
--- a/Makefile
+++ b/Makefile
@@ -305,7 +305,7 @@ endif
@echo 'Test targets:'
$(call print-help,check,Run all tests (check-help for details))
$(call print-help,bench,Run all benchmarks)
- $(call print-help,docker,Help about targets running tests inside containers)
+ $(call print-help,docker-help,Help about targets running tests inside containers)
$(call print-help,vm-help,Help about targets running tests inside VM)
@echo ''
@echo 'Documentation targets:'
diff --git a/README.rst b/README.rst
index 58b9f2dc15..ce39d89077 100644
--- a/README.rst
+++ b/README.rst
@@ -60,7 +60,7 @@ The QEMU source code is maintained under the GIT version control system.
.. code-block:: shell
- git clone https://git.qemu.org/git/qemu.git
+ git clone https://gitlab.com/qemu-project/qemu.git
When submitting patches, one common approach is to use 'git
format-patch' and/or 'git send-email' to format & send the mail to the
@@ -78,7 +78,7 @@ The QEMU website is also maintained under source control.
.. code-block:: shell
- git clone https://git.qemu.org/git/qemu-web.git
+ git clone https://gitlab.com/qemu-project/qemu-web.git
* `<https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/>`_
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 5164d838b9..47516913b7 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -123,10 +123,6 @@ struct KVMState
KVMMemoryListener memory_listener;
QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
- /* memory encryption */
- void *memcrypt_handle;
- int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
-
/* For "info mtree -f" to tell if an MR is registered in KVM */
int nr_as;
struct KVMAs {
@@ -225,26 +221,6 @@ int kvm_get_max_memslots(void)
return s->nr_slots;
}
-bool kvm_memcrypt_enabled(void)
-{
- if (kvm_state && kvm_state->memcrypt_handle) {
- return true;
- }
-
- return false;
-}
-
-int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
-{
- if (kvm_state->memcrypt_handle &&
- kvm_state->memcrypt_encrypt_data) {
- return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle,
- ptr, len);
- }
-
- return 1;
-}
-
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
@@ -668,16 +644,19 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
d.dirty_bitmap = mem->dirty_bmap;
d.slot = mem->slot | (kml->as_id << 16);
- if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
- DPRINTF("ioctl failed %d\n", errno);
- ret = -1;
+ ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
+ if (ret == -ENOENT) {
+ /* kernel does not have dirty bitmap in this slot */
+ ret = 0;
+ } else if (ret < 0) {
+ error_report("ioctl KVM_GET_DIRTY_LOG failed: %d", errno);
goto out;
+ } else {
+ subsection.offset_within_region += slot_offset;
+ subsection.size = int128_make64(slot_size);
+ kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);
}
- subsection.offset_within_region += slot_offset;
- subsection.size = int128_make64(slot_size);
- kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);
-
slot_offset += slot_size;
start_addr += slot_size;
size -= slot_size;
@@ -774,8 +753,8 @@ static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
d.num_pages = bmap_npages;
d.slot = mem->slot | (as_id << 16);
- if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) {
- ret = -errno;
+ ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
+ if (ret < 0 && ret != -ENOENT) {
error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
"start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
__func__, d.slot, (uint64_t)d.first_page,
@@ -2204,20 +2183,6 @@ static int kvm_init(MachineState *ms)
kvm_state = s;
- /*
- * if memory encryption object is specified then initialize the memory
- * encryption context.
- */
- if (ms->memory_encryption) {
- kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption);
- if (!kvm_state->memcrypt_handle) {
- ret = -1;
- goto err;
- }
-
- kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
- }
-
ret = kvm_arch_init(ms, s);
if (ret < 0) {
goto err;
diff --git a/accel/kvm/sev-stub.c b/accel/kvm/sev-stub.c
index 4f97452585..9587d1b2a3 100644
--- a/accel/kvm/sev-stub.c
+++ b/accel/kvm/sev-stub.c
@@ -15,12 +15,8 @@
#include "qemu-common.h"
#include "sysemu/sev.h"
-int sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len)
+int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
- abort();
-}
-
-void *sev_guest_init(const char *id)
-{
- return NULL;
+ /* If we get here, cgs must be some non-SEV thing */
+ return 0;
}
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 680e099463..0f17acfac0 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -81,16 +81,6 @@ int kvm_on_sigbus(int code, void *addr)
return 1;
}
-bool kvm_memcrypt_enabled(void)
-{
- return false;
-}
-
-int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
-{
- return 1;
-}
-
#ifndef CONFIG_USER_ONLY
int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index d9ef69121c..f2819eec7d 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -773,17 +773,30 @@ int cpu_exec(CPUState *cpu)
/* prepare setjmp context for exception handling */
if (sigsetjmp(cpu->jmp_env, 0) != 0) {
#if defined(__clang__)
- /* Some compilers wrongly smash all local variables after
- * siglongjmp. There were bug reports for gcc 4.5.0 and clang.
+ /*
+ * Some compilers wrongly smash all local variables after
+ * siglongjmp (the spec requires that only non-volatile locals
+ * which are changed between the sigsetjmp and siglongjmp are
+ * permitted to be trashed). There were bug reports for gcc
+ * 4.5.0 and clang. The bug is fixed in all versions of gcc
+ * that we support, but is still unfixed in clang:
+ * https://bugs.llvm.org/show_bug.cgi?id=21183
+ *
* Reload essential local variables here for those compilers.
- * Newer versions of gcc would complain about this code (-Wclobbered). */
+ * Newer versions of gcc would complain about this code (-Wclobbered),
+ * so we only perform the workaround for clang.
+ */
cpu = current_cpu;
cc = CPU_GET_CLASS(cpu);
-#else /* buggy compiler */
- /* Assert that the compiler does not smash local variables. */
+#else
+ /*
+ * Non-buggy compilers preserve these locals; assert that
+ * they have the correct value.
+ */
g_assert(cpu == current_cpu);
g_assert(cc == CPU_GET_CLASS(cpu));
-#endif /* buggy compiler */
+#endif
+
#ifndef CONFIG_SOFTMMU
tcg_debug_assert(!have_mmap_lock());
#endif
diff --git a/accel/tcg/tcg-accel-ops-icount.c b/accel/tcg/tcg-accel-ops-icount.c
index 87762b469c..13b8fbeb69 100644
--- a/accel/tcg/tcg-accel-ops-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -81,7 +81,13 @@ void icount_handle_deadline(void)
int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
QEMU_TIMER_ATTR_ALL);
- if (deadline == 0) {
+ /*
+ * Instructions, interrupts, and exceptions are processed in cpu-exec.
+ * Don't interrupt cpu thread, when these events are waiting
+ * (i.e., there is no checkpoint)
+ */
+ if (deadline == 0
+ && (replay_mode != REPLAY_MODE_PLAY || replay_has_checkpoint())) {
icount_notify_aio_contexts();
}
}
diff --git a/backends/confidential-guest-support.c b/backends/confidential-guest-support.c
new file mode 100644
index 0000000000..052fde8db0
--- /dev/null
+++ b/backends/confidential-guest-support.c
@@ -0,0 +1,33 @@
+/*
+ * QEMU Confidential Guest support
+ *
+ * Copyright Red Hat.
+ *
+ * Authors:
+ * David Gibson <david@gibson.dropbear.id.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "exec/confidential-guest-support.h"
+
+OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport,
+ confidential_guest_support,
+ CONFIDENTIAL_GUEST_SUPPORT,
+ OBJECT)
+
+static void confidential_guest_support_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void confidential_guest_support_init(Object *obj)
+{
+}
+
+static void confidential_guest_support_finalize(Object *obj)
+{
+}
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index e5626d4330..69b0ae30bb 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -55,7 +55,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
name = host_memory_backend_get_name(backend);
memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
name, backend->size,
- backend->share, fd, errp);
+ backend->share, fd, 0, errp);
g_free(name);
}
diff --git a/backends/hostmem.c b/backends/hostmem.c
index be0c3b079f..c6c1ff5b99 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -494,6 +494,16 @@ host_memory_backend_class_init(ObjectClass *oc, void *data)
host_memory_backend_get_share, host_memory_backend_set_share);
object_class_property_set_description(oc, "share",
"Mark the memory as private to QEMU or shared");
+ /*
+ * Do not delete/rename option. This option must be considered stable
+ * (as if it didn't have the 'x-' prefix including deprecation period) as
+     * long as 4.0 and older machine types exist.
+ * Option will be used by upper layers to override (disable) canonical path
+ * for ramblock-id set by compat properties on old machine types ( <= 4.0),
+ * to keep migration working when backend is used for main RAM with
+ * -machine memory-backend= option (main RAM historically used prefix-less
+ * ramblock-id).
+ */
object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id",
host_memory_backend_get_use_canonical_path,
host_memory_backend_set_use_canonical_path);
diff --git a/backends/meson.build b/backends/meson.build
index 484456ece7..d4221831fc 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -6,6 +6,7 @@ softmmu_ss.add([files(
'rng-builtin.c',
'rng-egd.c',
'rng.c',
+ 'confidential-guest-support.c',
), numa])
softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files('rng-random.c'))
diff --git a/backends/rng-builtin.c b/backends/rng-builtin.c
index f38dff117d..f367eb665c 100644
--- a/backends/rng-builtin.c
+++ b/backends/rng-builtin.c
@@ -10,6 +10,7 @@
#include "qemu/main-loop.h"
#include "qemu/guest-random.h"
#include "qom/object.h"
+#include "sysemu/replay.h"
OBJECT_DECLARE_SIMPLE_TYPE(RngBuiltin, RNG_BUILTIN)
@@ -37,7 +38,7 @@ static void rng_builtin_request_entropy(RngBackend *b, RngRequest *req)
{
RngBuiltin *s = RNG_BUILTIN(b);
- qemu_bh_schedule(s->bh);
+ replay_bh_schedule_event(s->bh);
}
static void rng_builtin_init(Object *obj)
diff --git a/block/backup.c b/block/backup.c
index cc525d5544..94e6dcd72e 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -35,6 +35,7 @@ typedef struct BackupBlockJob {
BlockJob common;
BlockDriverState *backup_top;
BlockDriverState *source_bs;
+ BlockDriverState *target_bs;
BdrvDirtyBitmap *sync_bitmap;
@@ -329,6 +330,13 @@ static void coroutine_fn backup_set_speed(BlockJob *job, int64_t speed)
}
}
+static void backup_cancel(Job *job)
+{
+ BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
+
+ bdrv_cancel_in_flight(s->target_bs);
+}
+
static const BlockJobDriver backup_job_driver = {
.job_driver = {
.instance_size = sizeof(BackupBlockJob),
@@ -340,6 +348,7 @@ static const BlockJobDriver backup_job_driver = {
.abort = backup_abort,
.clean = backup_clean,
.pause = backup_pause,
+ .cancel = backup_cancel,
},
.set_speed = backup_set_speed,
};
@@ -528,6 +537,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
job->backup_top = backup_top;
job->source_bs = bs;
+ job->target_bs = target;
job->on_source_error = on_source_error;
job->on_target_error = on_target_error;
job->sync_mode = sync_mode;
diff --git a/block/io.c b/block/io.c
index b0435ed670..ca2dca3007 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3460,3 +3460,14 @@ out:
return ret;
}
+
+void bdrv_cancel_in_flight(BlockDriverState *bs)
+{
+ if (!bs || !bs->drv) {
+ return;
+ }
+
+ if (bs->drv->bdrv_cancel_in_flight) {
+ bs->drv->bdrv_cancel_in_flight(bs);
+ }
+}
diff --git a/block/mirror.c b/block/mirror.c
index 8e1ad6eceb..9faffe4707 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1179,6 +1179,14 @@ static bool mirror_drained_poll(BlockJob *job)
return !!s->in_flight;
}
+static void mirror_cancel(Job *job)
+{
+ MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
+ BlockDriverState *target = blk_bs(s->target);
+
+ bdrv_cancel_in_flight(target);
+}
+
static const BlockJobDriver mirror_job_driver = {
.job_driver = {
.instance_size = sizeof(MirrorBlockJob),
@@ -1190,6 +1198,7 @@ static const BlockJobDriver mirror_job_driver = {
.abort = mirror_abort,
.pause = mirror_pause,
.complete = mirror_complete,
+ .cancel = mirror_cancel,
},
.drained_poll = mirror_drained_poll,
};
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index afd75ab628..75d7fa9510 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -900,10 +900,11 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
ImageEntry *image_entry, *next_ie;
SnapshotEntry *snapshot_entry;
+ Error *err = NULL;
- bs = bdrv_all_find_vmstate_bs();
+ bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, &err);
if (!bs) {
- monitor_printf(mon, "No available block device supports snapshots\n");
+ error_report_err(err);
return;
}
aio_context = bdrv_get_aio_context(bs);
@@ -953,7 +954,7 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
total = 0;
for (i = 0; i < nb_sns; i++) {
SnapshotEntry *next_sn;
- if (bdrv_all_find_snapshot(sn_tab[i].name, &bs1) == 0) {
+ if (bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL) == 1) {
global_snapshots[total] = i;
total++;
QTAILQ_FOREACH(image_entry, &image_list, next) {
diff --git a/block/nbd.c b/block/nbd.c
index b3cbbeb4b0..c26dc5a54f 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -2458,6 +2458,18 @@ static const char *const nbd_strong_runtime_opts[] = {
NULL
};
+static void nbd_cancel_in_flight(BlockDriverState *bs)
+{
+ BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
+
+ reconnect_delay_timer_del(s);
+
+ if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
+ s->state = NBD_CLIENT_CONNECTING_NOWAIT;
+ qemu_co_queue_restart_all(&s->free_sema);
+ }
+}
+
static BlockDriver bdrv_nbd = {
.format_name = "nbd",
.protocol_name = "nbd",
@@ -2484,6 +2496,7 @@ static BlockDriver bdrv_nbd = {
.bdrv_co_block_status = nbd_client_co_block_status,
.bdrv_dirname = nbd_dirname,
.strong_runtime_opts = nbd_strong_runtime_opts,
+ .bdrv_cancel_in_flight = nbd_cancel_in_flight,
};
static BlockDriver bdrv_nbd_tcp = {
@@ -2512,6 +2525,7 @@ static BlockDriver bdrv_nbd_tcp = {
.bdrv_co_block_status = nbd_client_co_block_status,
.bdrv_dirname = nbd_dirname,
.strong_runtime_opts = nbd_strong_runtime_opts,
+ .bdrv_cancel_in_flight = nbd_cancel_in_flight,
};
static BlockDriver bdrv_nbd_unix = {
@@ -2540,6 +2554,7 @@ static BlockDriver bdrv_nbd_unix = {
.bdrv_co_block_status = nbd_client_co_block_status,
.bdrv_dirname = nbd_dirname,
.strong_runtime_opts = nbd_strong_runtime_opts,
+ .bdrv_cancel_in_flight = nbd_cancel_in_flight,
};
static void bdrv_nbd_init(void)
diff --git a/block/raw-format.c b/block/raw-format.c
index 42ec50802b..7717578ed6 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -575,6 +575,11 @@ static const char *const raw_strong_runtime_opts[] = {
NULL
};
+static void raw_cancel_in_flight(BlockDriverState *bs)
+{
+ bdrv_cancel_in_flight(bs->file->bs);
+}
+
BlockDriver bdrv_raw = {
.format_name = "raw",
.instance_size = sizeof(BDRVRawState),
@@ -608,6 +613,7 @@ BlockDriver bdrv_raw = {
.bdrv_has_zero_init = &raw_has_zero_init,
.strong_runtime_opts = raw_strong_runtime_opts,
.mutable_opts = mutable_opts,
+ .bdrv_cancel_in_flight = raw_cancel_in_flight,
};
static void bdrv_raw_init(void)
diff --git a/block/snapshot.c b/block/snapshot.c
index a2bf3a54eb..e8ae9a28c1 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -447,6 +447,41 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
return ret;
}
+
+static int bdrv_all_get_snapshot_devices(bool has_devices, strList *devices,
+ GList **all_bdrvs,
+ Error **errp)
+{
+ g_autoptr(GList) bdrvs = NULL;
+
+ if (has_devices) {
+ if (!devices) {
+ error_setg(errp, "At least one device is required for snapshot");
+ return -1;
+ }
+
+ while (devices) {
+ BlockDriverState *bs = bdrv_find_node(devices->value);
+ if (!bs) {
+ error_setg(errp, "No block device node '%s'", devices->value);
+ return -1;
+ }
+ bdrvs = g_list_append(bdrvs, bs);
+ devices = devices->next;
+ }
+ } else {
+ BlockDriverState *bs;
+ BdrvNextIterator it;
+ for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ bdrvs = g_list_append(bdrvs, bs);
+ }
+ }
+
+ *all_bdrvs = g_steal_pointer(&bdrvs);
+ return 0;
+}
+
+
static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs)
{
if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
@@ -462,44 +497,59 @@ static bool bdrv_all_snapshots_includes_bs(BlockDriverState *bs)
* These functions will properly handle dataplane (take aio_context_acquire
* when appropriate for appropriate block drivers) */
-bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs)
+bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
+ Error **errp)
{
- bool ok = true;
- BlockDriverState *bs;
- BdrvNextIterator it;
+ g_autoptr(GList) bdrvs = NULL;
+ GList *iterbdrvs;
+
+ if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
+ return false;
+ }
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ iterbdrvs = bdrvs;
+ while (iterbdrvs) {
+ BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
+ bool ok = true;
aio_context_acquire(ctx);
- if (bdrv_all_snapshots_includes_bs(bs)) {
+ if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ok = bdrv_can_snapshot(bs);
}
aio_context_release(ctx);
if (!ok) {
- bdrv_next_cleanup(&it);
- goto fail;
+ error_setg(errp, "Device '%s' is writable but does not support "
+ "snapshots", bdrv_get_device_or_node_name(bs));
+ return false;
}
+
+ iterbdrvs = iterbdrvs->next;
}
-fail:
- *first_bad_bs = bs;
- return ok;
+ return true;
}
-int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
+int bdrv_all_delete_snapshot(const char *name,
+ bool has_devices, strList *devices,
Error **errp)
{
- int ret = 0;
- BlockDriverState *bs;
- BdrvNextIterator it;
- QEMUSnapshotInfo sn1, *snapshot = &sn1;
+ g_autoptr(GList) bdrvs = NULL;
+ GList *iterbdrvs;
+
+ if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
+ return -1;
+ }
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ iterbdrvs = bdrvs;
+ while (iterbdrvs) {
+ BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
+ QEMUSnapshotInfo sn1, *snapshot = &sn1;
+ int ret = 0;
aio_context_acquire(ctx);
- if (bdrv_all_snapshots_includes_bs(bs) &&
+ if ((devices || bdrv_all_snapshots_includes_bs(bs)) &&
bdrv_snapshot_find(bs, snapshot, name) >= 0)
{
ret = bdrv_snapshot_delete(bs, snapshot->id_str,
@@ -507,118 +557,180 @@ int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs,
}
aio_context_release(ctx);
if (ret < 0) {
- bdrv_next_cleanup(&it);
- goto fail;
+ error_prepend(errp, "Could not delete snapshot '%s' on '%s': ",
+ name, bdrv_get_device_or_node_name(bs));
+ return -1;
}
+
+ iterbdrvs = iterbdrvs->next;
}
-fail:
- *first_bad_bs = bs;
- return ret;
+ return 0;
}
-int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs,
+int bdrv_all_goto_snapshot(const char *name,
+ bool has_devices, strList *devices,
Error **errp)
{
- int ret = 0;
- BlockDriverState *bs;
- BdrvNextIterator it;
+ g_autoptr(GList) bdrvs = NULL;
+ GList *iterbdrvs;
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
+ return -1;
+ }
+
+ iterbdrvs = bdrvs;
+ while (iterbdrvs) {
+ BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
+ int ret = 0;
aio_context_acquire(ctx);
- if (bdrv_all_snapshots_includes_bs(bs)) {
+ if (devices || bdrv_all_snapshots_includes_bs(bs)) {
ret = bdrv_snapshot_goto(bs, name, errp);
}
aio_context_release(ctx);
if (ret < 0) {
- bdrv_next_cleanup(&it);
- goto fail;
+ error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
+ name, bdrv_get_device_or_node_name(bs));
+ return -1;
}
+
+ iterbdrvs = iterbdrvs->next;
}
-fail:
- *first_bad_bs = bs;
- return ret;
+ return 0;
}
-int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs)
+int bdrv_all_has_snapshot(const char *name,
+ bool has_devices, strList *devices,
+ Error **errp)
{
- QEMUSnapshotInfo sn;
- int err = 0;
- BlockDriverState *bs;
- BdrvNextIterator it;
+ g_autoptr(GList) bdrvs = NULL;
+ GList *iterbdrvs;
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
+ return -1;
+ }
+
+ iterbdrvs = bdrvs;
+ while (iterbdrvs) {
+ BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
+ QEMUSnapshotInfo sn;
+ int ret = 0;
aio_context_acquire(ctx);
- if (bdrv_all_snapshots_includes_bs(bs)) {
- err = bdrv_snapshot_find(bs, &sn, name);
+ if (devices || bdrv_all_snapshots_includes_bs(bs)) {
+ ret = bdrv_snapshot_find(bs, &sn, name);
}
aio_context_release(ctx);
- if (err < 0) {
- bdrv_next_cleanup(&it);
- goto fail;
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ return 0;
+ } else {
+ error_setg_errno(errp, errno,
+ "Could not check snapshot '%s' on '%s'",
+ name, bdrv_get_device_or_node_name(bs));
+ return -1;
+ }
}
+
+ iterbdrvs = iterbdrvs->next;
}
-fail:
- *first_bad_bs = bs;
- return err;
+ return 1;
}
int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
BlockDriverState *vm_state_bs,
uint64_t vm_state_size,
- BlockDriverState **first_bad_bs)
+ bool has_devices, strList *devices,
+ Error **errp)
{
- int err = 0;
- BlockDriverState *bs;
- BdrvNextIterator it;
+ g_autoptr(GList) bdrvs = NULL;
+ GList *iterbdrvs;
+
+ if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
+ return -1;
+ }
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ iterbdrvs = bdrvs;
+ while (iterbdrvs) {
+ BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
+ int ret = 0;
aio_context_acquire(ctx);
if (bs == vm_state_bs) {
sn->vm_state_size = vm_state_size;
- err = bdrv_snapshot_create(bs, sn);
- } else if (bdrv_all_snapshots_includes_bs(bs)) {
+ ret = bdrv_snapshot_create(bs, sn);
+ } else if (devices || bdrv_all_snapshots_includes_bs(bs)) {
sn->vm_state_size = 0;
- err = bdrv_snapshot_create(bs, sn);
+ ret = bdrv_snapshot_create(bs, sn);
}
aio_context_release(ctx);
- if (err < 0) {
- bdrv_next_cleanup(&it);
- goto fail;
+ if (ret < 0) {
+ error_setg(errp, "Could not create snapshot '%s' on '%s'",
+ sn->name, bdrv_get_device_or_node_name(bs));
+ return -1;
}
+
+ iterbdrvs = iterbdrvs->next;
}
-fail:
- *first_bad_bs = bs;
- return err;
+ return 0;
}
-BlockDriverState *bdrv_all_find_vmstate_bs(void)
+
+BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
+ bool has_devices, strList *devices,
+ Error **errp)
{
- BlockDriverState *bs;
- BdrvNextIterator it;
+ g_autoptr(GList) bdrvs = NULL;
+ GList *iterbdrvs;
+
+ if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
+ return NULL;
+ }
- for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ iterbdrvs = bdrvs;
+ while (iterbdrvs) {
+ BlockDriverState *bs = iterbdrvs->data;
AioContext *ctx = bdrv_get_aio_context(bs);
- bool found;
+ bool found = false;
aio_context_acquire(ctx);
- found = bdrv_all_snapshots_includes_bs(bs) && bdrv_can_snapshot(bs);
+ found = (devices || bdrv_all_snapshots_includes_bs(bs)) &&
+ bdrv_can_snapshot(bs);
aio_context_release(ctx);
- if (found) {
- bdrv_next_cleanup(&it);
- break;
+ if (vmstate_bs) {
+ if (g_str_equal(vmstate_bs,
+ bdrv_get_node_name(bs))) {
+ if (found) {
+ return bs;
+ } else {
+ error_setg(errp,
+ "vmstate block device '%s' does not support snapshots",
+ vmstate_bs);
+ return NULL;
+ }
+ }
+ } else if (found) {
+ return bs;
}
+
+ iterbdrvs = iterbdrvs->next;
+ }
+
+ if (vmstate_bs) {
+ error_setg(errp,
+ "vmstate block device '%s' does not exist", vmstate_bs);
+ } else {
+ error_setg(errp,
+ "no block device can store vmstate for snapshot");
}
- return bs;
+ return NULL;
}
diff --git a/blockdev-nbd.c b/blockdev-nbd.c
index d8443d235b..b264620b98 100644
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds,
qio_net_listener_set_name(nbd_server->listener,
"nbd-listener");
- if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) {
+ /*
+ * Because this server is persistent, a backlog of SOMAXCONN is
+ * better than trying to size it to max_connections.
+ */
+ if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN,
+ errp) < 0) {
goto error;
}
diff --git a/configure b/configure
index a34f91171d..a79b3746d4 100755
--- a/configure
+++ b/configure
@@ -198,8 +198,8 @@ has() {
}
version_ge () {
- local_ver1=`echo $1 | tr . ' '`
- local_ver2=`echo $2 | tr . ' '`
+ local_ver1=$(expr "$1" : '\([0-9.]*\)' | tr . ' ')
+ local_ver2=$(echo "$2" | tr . ' ')
while true; do
set x $local_ver1
local_first=${2-0}
@@ -463,6 +463,7 @@ skip_meson=no
gettext="auto"
fuse="auto"
fuse_lseek="auto"
+multiprocess="no"
malloc_trim="auto"
@@ -797,6 +798,7 @@ Linux)
linux="yes"
linux_user="yes"
vhost_user=${default_feature:-yes}
+ multiprocess=${default_feature:-yes}
;;
esac
@@ -1556,6 +1558,10 @@ for opt do
;;
--disable-fuse-lseek) fuse_lseek="disabled"
;;
+ --enable-multiprocess) multiprocess="yes"
+ ;;
+ --disable-multiprocess) multiprocess="no"
+ ;;
*)
echo "ERROR: unknown option $opt"
echo "Try '$0 --help' for more information"
@@ -1764,7 +1770,7 @@ Advanced options (experts only):
--with-trace-file=NAME Full PATH,NAME of file to store traces
Default:trace-<pid>
--disable-slirp disable SLIRP userspace network connectivity
- --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)
+ --enable-tcg-interpreter enable TCI (TCG with bytecode interpreter, experimental and slow)
--enable-malloc-trim enable libc malloc_trim() for memory optimization
--oss-lib path to OSS library
--cpu=CPU Build for host CPU [$cpu]
@@ -1908,6 +1914,7 @@ disabled with --disable-FEATURE, default is enabled if available
libdaxctl libdaxctl support
fuse FUSE block device export
fuse-lseek SEEK_HOLE/SEEK_DATA support for FUSE exports
+ multiprocess Multiprocess QEMU support
NOTE: The object files are built at the place where configure is launched
EOF
@@ -6082,6 +6089,9 @@ fi
if test "$have_mlockall" = "yes" ; then
echo "HAVE_MLOCKALL=y" >> $config_host_mak
fi
+if test "$multiprocess" = "yes" ; then
+ echo "CONFIG_MULTIPROCESS_ALLOWED=y" >> $config_host_mak
+fi
if test "$fuzzing" = "yes" ; then
# If LIB_FUZZING_ENGINE is set, assume we are running on OSS-Fuzz, and the
# needed CFLAGS have already been provided
@@ -6115,7 +6125,7 @@ fi
if test -n "$gdb_bin"; then
gdb_version=$($gdb_bin --version | head -n 1)
- if version_ge ${gdb_version##* } 8.3.1; then
+ if version_ge ${gdb_version##* } 9.1; then
echo "HAVE_GDB_BIN=$gdb_bin" >> $config_host_mak
fi
fi
diff --git a/docs/amd-memory-encryption.txt b/docs/amd-memory-encryption.txt
index 80b8eb00e9..145896aec7 100644
--- a/docs/amd-memory-encryption.txt
+++ b/docs/amd-memory-encryption.txt
@@ -73,7 +73,7 @@ complete flow chart.
To launch a SEV guest
# ${QEMU} \
- -machine ...,memory-encryption=sev0 \
+ -machine ...,confidential-guest-support=sev0 \
-object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1
Debugging
diff --git a/docs/confidential-guest-support.txt b/docs/confidential-guest-support.txt
new file mode 100644
index 0000000000..71d07ba57a
--- /dev/null
+++ b/docs/confidential-guest-support.txt
@@ -0,0 +1,49 @@
+Confidential Guest Support
+==========================
+
+Traditionally, hypervisors such as QEMU have complete access to a
+guest's memory and other state, meaning that a compromised hypervisor
+can compromise any of its guests. A number of platforms have added
+mechanisms in hardware and/or firmware which give guests at least some
+protection from a compromised hypervisor. This is obviously
+especially desirable for public cloud environments.
+
+These mechanisms have different names and different modes of
+operation, but are often referred to as Secure Guests or Confidential
+Guests. We use the term "Confidential Guest Support" to distinguish
+this from other aspects of guest security (such as security against
+attacks from other guests, or from network sources).
+
+Running a Confidential Guest
+----------------------------
+
+To run a confidential guest you need to add two command line parameters:
+
+1. Use "-object" to create a "confidential guest support" object. The
+ type and parameters will vary with the specific mechanism to be
+   used.
+2. Set the "confidential-guest-support" machine parameter to the ID of
+ the object from (1).
+
+Example (for AMD SEV)::
+
+ qemu-system-x86_64 \
+ <other parameters> \
+ -machine ...,confidential-guest-support=sev0 \
+ -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1
+
+Supported mechanisms
+--------------------
+
+Currently supported confidential guest mechanisms are:
+
+AMD Secure Encrypted Virtualization (SEV)
+ docs/amd-memory-encryption.txt
+
+POWER Protected Execution Facility (PEF)
+ docs/papr-pef.txt
+
+s390x Protected Virtualization (PV)
+ docs/system/s390x/protvirt.rst
+
+Other mechanisms may be supported in future.
diff --git a/docs/devel/build-system.rst b/docs/devel/build-system.rst
index 31f4dced2a..69ce3087e3 100644
--- a/docs/devel/build-system.rst
+++ b/docs/devel/build-system.rst
@@ -100,7 +100,7 @@ In meson.build::
# Detect dependency
sdl_image = dependency('SDL2_image', required: get_option('sdl_image'),
method: 'pkg-config',
- static: enable_static)
+ kwargs: static_kwargs)
# Create config-host.h (if applicable)
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
diff --git a/docs/devel/fuzzing.rst b/docs/devel/fuzzing.rst
index 6096242d99..97797c4f8c 100644
--- a/docs/devel/fuzzing.rst
+++ b/docs/devel/fuzzing.rst
@@ -119,7 +119,7 @@ Adding a new fuzzer
Coverage over virtual devices can be improved by adding additional fuzzers.
Fuzzers are kept in ``tests/qtest/fuzz/`` and should be added to
-``tests/qtest/fuzz/Makefile.include``
+``tests/qtest/fuzz/meson.build``
Fuzzers can rely on both qtest and libqos to communicate with virtual devices.
@@ -128,8 +128,7 @@ Fuzzers can rely on both qtest and libqos to communicate with virtual devices.
2. Write the fuzzing code using the libqtest/libqos API. See existing fuzzers
for reference.
-3. Register the fuzzer in ``tests/fuzz/Makefile.include`` by appending the
- corresponding object to fuzz-obj-y
+3. Add the fuzzer to ``tests/qtest/fuzz/meson.build``.
Fuzzers can be more-or-less thought of as special qtest programs which can
modify the qtest commands and/or qtest command arguments based on inputs
@@ -181,6 +180,36 @@ To ensure that these env variables have been configured correctly, we can use::
The output should contain a complete list of matched MemoryRegions.
+OSS-Fuzz
+--------
+QEMU is continuously fuzzed on `OSS-Fuzz <https://github.com/google/oss-fuzz>`_.
+By default, the OSS-Fuzz build will try to fuzz every fuzz-target. Since the
+generic-fuzz target requires additional information provided in environment
+variables, we pre-define some generic-fuzz configs in
+``tests/qtest/fuzz/generic_fuzz_configs.h``. Each config must specify:
+
+- ``.name``: To identify the fuzzer config
+
+- ``.args`` OR ``.argfunc``: A string or pointer to a function returning a
+ string. These strings are used to specify the ``QEMU_FUZZ_ARGS``
+ environment variable. ``argfunc`` is useful when the config relies on e.g.
+ a dynamically created temp directory, or a free tcp/udp port.
+
+- ``.objects``: A string that specifies the ``QEMU_FUZZ_OBJECTS`` environment
+ variable.
+
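+As an illustration of these fields, a hypothetical config entry (the device
+name and arguments below are invented for this example, not taken from the
+real list) might look like::
+
+    {
+        .name = "my-device-fuzz",
+        .args = "-machine q35 -nodefaults -device my-device",
+        .objects = "my-device*",
+    },
+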
+To fuzz additional devices/device configuration on OSS-Fuzz, send patches for
+either a new device-specific fuzzer or a new generic-fuzz config.
+
+Build details:
+
+- The Dockerfile that sets up the environment for building QEMU's
+  fuzzers on OSS-Fuzz can be found in the `OSS-Fuzz repository
+  <https://github.com/google/oss-fuzz/blob/master/projects/qemu/Dockerfile>`_
+
+- The script responsible for building the fuzzers can be found in the
+ QEMU source tree at ``scripts/oss-fuzz/build.sh``
+
Implementation Details / Fuzzer Lifecycle
-----------------------------------------
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
index 98a7016a9b..22854e334d 100644
--- a/docs/devel/index.rst
+++ b/docs/devel/index.rst
@@ -37,3 +37,4 @@ Contents:
clocks
qom
block-coroutine-wrapper
+ multi-process
diff --git a/docs/devel/multi-process.rst b/docs/devel/multi-process.rst
new file mode 100644
index 0000000000..69699329d6
--- /dev/null
+++ b/docs/devel/multi-process.rst
@@ -0,0 +1,966 @@
+This is the design document for multi-process QEMU. It does not
+necessarily reflect the status of the current implementation, which
+may lack features or be considerably different from what is described
+in this document. This document is still useful as a description of
+the goals and general direction of this feature.
+
+Please refer to the following wiki for the latest details:
+https://wiki.qemu.org/Features/MultiProcessQEMU
+
+Multi-process QEMU
+===================
+
+QEMU is often used as the hypervisor for virtual machines running in the
+Oracle cloud. Since one of the advantages of cloud computing is the
+ability to run many VMs from different tenants in the same cloud
+infrastructure, a guest that compromised its hypervisor could
+potentially use the hypervisor's access privileges to access data it is
+not authorized for.
+
+QEMU can be susceptible to security attacks because it is a large,
+monolithic program that provides many features to the VMs it services.
+Many of these features can be configured out of QEMU, but even a reduced
+configuration QEMU has a large amount of code a guest can potentially
+attack. Separating QEMU into multiple processes reduces the attack
+surface by helping to limit each component in the system to accessing
+only the resources it needs to perform its job.
+
+QEMU services
+-------------
+
+QEMU can be broadly described as providing three main services. One is a
+VM control point, where VMs can be created, migrated, re-configured, and
+destroyed. A second is to emulate the CPU instructions within the VM,
+often accelerated by HW virtualization features such as Intel's VT
+extensions. Finally, it provides IO services to the VM by emulating HW
+IO devices, such as disk and network devices.
+
+A multi-process QEMU
+~~~~~~~~~~~~~~~~~~~~
+
+A multi-process QEMU involves separating QEMU services into separate
+host processes. Each of these processes can be given only the privileges
+it needs to provide its service, e.g., a disk service could be given
+access only to the disk images it provides, and not be allowed to
+access other files, or any network devices. An attacker who compromised
+this service would not be able to use this exploit to access files or
+devices beyond what the disk service was given access to.
+
+A QEMU control process would remain, but in multi-process mode, it would
+have no direct interfaces to the VM. During VM execution, it would still
+provide the user interface to hot-plug devices or live migrate the VM.
+
+A first step in creating a multi-process QEMU is to separate IO services
+from the main QEMU program, which would continue to provide CPU
+emulation; i.e., the control process would also be the CPU emulation
+process. In a later phase, CPU emulation could be separated from the
+control process.
+
+Separating IO services
+----------------------
+
+Separating IO services into individual host processes is a good place to
+begin for a couple of reasons. One is that the sheer number of IO devices
+QEMU can emulate provides a large surface of interfaces which could
+potentially be exploited, and, indeed, have been a source of exploits in
+the past. Another is that the modular nature of QEMU device emulation
+code provides interface points where the QEMU functions that perform
+device emulation
+can be separated from the QEMU functions that manage the emulation of
+guest CPU instructions. The devices emulated in the separate process are
+referred to as remote devices.
+
+QEMU device emulation
+~~~~~~~~~~~~~~~~~~~~~
+
+QEMU uses an object oriented SW architecture for device emulation code.
+Configured objects are all compiled into the QEMU binary, then objects
+are instantiated by name when used by the guest VM. For example, the
+code to emulate a device named "foo" is always present in QEMU, but its
+instantiation code is only run when the device is included in the target
+VM (e.g., via the QEMU command line as *-device foo*).
+
+The object model is hierarchical, so device emulation code names its
+parent object (such as "pci-device" for a PCI device) and QEMU will
+instantiate a parent object before calling the device's instantiation
+code.
+
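+As a minimal sketch, a device following this pattern registers itself with
+the object model roughly as below (``FooState`` and ``foo_class_init`` are
+hypothetical names used only for illustration)::
+
+    #include "qemu/osdep.h"
+    #include "hw/pci/pci.h"
+
+    typedef struct FooState {
+        PCIDevice parent_obj;   /* parent object state comes first */
+    } FooState;
+
+    static void foo_class_init(ObjectClass *klass, void *data)
+    {
+        /* set up realize/reset callbacks, properties, etc. here */
+    }
+
+    static const TypeInfo foo_info = {
+        .name          = "foo",
+        .parent        = TYPE_PCI_DEVICE,   /* named parent object */
+        .instance_size = sizeof(FooState),
+        .class_init    = foo_class_init,
+    };
+
+    static void foo_register_types(void)
+    {
+        type_register_static(&foo_info);
+    }
+
+    type_init(foo_register_types)
+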
+Current separation models
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to separate the device emulation code from the CPU emulation
+code, the device object code must run in a different process. There are
+a couple of existing QEMU features that can run emulation code
+separately from the main QEMU process. These are examined below.
+
+vhost user model
+^^^^^^^^^^^^^^^^
+
+Virtio guest device drivers can be connected to vhost user applications
+in order to perform their IO operations. This model uses special virtio
+device drivers in the guest and vhost user device objects in QEMU, but
+once the QEMU vhost user code has configured the vhost user application,
+mission-mode IO is performed by the application. The vhost user
+application is a daemon process that can be contacted via a known UNIX
+domain socket.
+
+vhost socket
+''''''''''''
+
+As mentioned above, one of the tasks of the vhost device object within
+QEMU is to contact the vhost application and send it configuration
+information about this device instance. As part of the configuration
+process, the application can also be sent other file descriptors over
+the socket, which then can be used by the vhost user application in
+various ways, some of which are described below.
+
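+As a sketch of the underlying mechanism (error handling omitted), a file
+descriptor is passed over a UNIX domain socket as ``SCM_RIGHTS`` ancillary
+data::
+
+    #include <string.h>
+    #include <sys/socket.h>
+
+    /* Send one file descriptor on a connected UNIX domain socket. */
+    static int send_fd(int sock, int fd)
+    {
+        char byte = 0;
+        struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
+        union {
+            struct cmsghdr align;
+            char buf[CMSG_SPACE(sizeof(int))];
+        } u;
+        struct msghdr msg = {
+            .msg_iov = &iov, .msg_iovlen = 1,
+            .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
+        };
+        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
+
+        return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
+    }
+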
+vhost MMIO store acceleration
+'''''''''''''''''''''''''''''
+
+VMs are often run using HW virtualization features via the KVM kernel
+driver. This driver allows QEMU to accelerate the emulation of guest CPU
+instructions by running the guest in a virtual HW mode. When the guest
+executes instructions that cannot be executed by virtual HW mode,
+execution returns to the KVM driver so it can inform QEMU to emulate the
+instructions in SW.
+
+One of the events that can cause a return to QEMU is when a guest device
+driver accesses an IO location. QEMU then dispatches the memory
+operation to the corresponding QEMU device object. In the case of a
+vhost user device, the memory operation would need to be sent over a
+socket to the vhost application. This path is accelerated by the QEMU
+virtio code by setting up an eventfd file descriptor through which the
+vhost application can directly receive MMIO store notifications from the
+KVM driver, instead of needing them to be sent to the QEMU process first.
+
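+A sketch of what such a registration looks like at the KVM API level
+(illustrative only; QEMU's real code wraps this in its memory API)::
+
+    #include <stdint.h>
+    #include <sys/eventfd.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    /* Ask KVM to signal an eventfd on guest stores to an MMIO address,
+     * so the store never round-trips through QEMU user space. */
+    static int register_mmio_eventfd(int vm_fd, uint64_t addr, uint32_t len)
+    {
+        int efd = eventfd(0, EFD_CLOEXEC);
+        struct kvm_ioeventfd ioevent = {
+            .addr = addr,   /* guest-physical address to watch */
+            .len  = len,    /* access size that triggers the event */
+            .fd   = efd,
+        };
+
+        if (efd < 0 || ioctl(vm_fd, KVM_IOEVENTFD, &ioevent) < 0) {
+            return -1;
+        }
+        return efd;         /* handed to the vhost user application */
+    }
+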
+vhost interrupt acceleration
+''''''''''''''''''''''''''''
+
+Another optimization used by the vhost application is the ability to
+directly inject interrupts into the VM via the KVM driver, again,
+bypassing the need to send the interrupt back to the QEMU process first.
+The QEMU virtio setup code configures the KVM driver with an eventfd
+that triggers the device interrupt in the guest when the eventfd is
+written. This irqfd file descriptor is then passed to the vhost user
+application program.
+
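+A sketch of the irqfd setup at the KVM API level (illustrative only)::
+
+    #include <stdint.h>
+    #include <sys/eventfd.h>
+    #include <sys/ioctl.h>
+    #include <linux/kvm.h>
+
+    /* Wire an eventfd to a guest interrupt line; writing the eventfd
+     * then injects the interrupt without going through QEMU. */
+    static int register_irqfd(int vm_fd, uint32_t gsi)
+    {
+        int efd = eventfd(0, EFD_CLOEXEC);
+        struct kvm_irqfd irqfd = {
+            .fd  = efd,
+            .gsi = gsi,     /* guest interrupt to trigger */
+        };
+
+        if (efd < 0 || ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0) {
+            return -1;
+        }
+        return efd;         /* passed to the vhost user application */
+    }
+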
+vhost access to guest memory
+''''''''''''''''''''''''''''
+
+The vhost application is also allowed to directly access guest memory,
+instead of needing to send the data as messages to QEMU. This is also
+done with file descriptors sent to the vhost user application by QEMU.
+These descriptors can be passed to ``mmap()`` by the vhost application
+to map the guest address space into the vhost application.
+
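+A sketch of the mapping step (the size and offset would come from the
+accompanying vhost message; error handling omitted)::
+
+    #include <stddef.h>
+    #include <sys/mman.h>
+    #include <sys/types.h>
+
+    /* Map a guest RAM region from a descriptor received over the socket. */
+    static void *map_guest_ram(int ram_fd, size_t size, off_t offset)
+    {
+        void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       ram_fd, offset);
+        return p == MAP_FAILED ? NULL : p;
+    }
+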
+IOMMUs introduce another level of complexity, since the address given to
+the guest virtio device to DMA to or from is not a guest physical
+address. This case is handled by having vhost code within QEMU register
+as a listener for IOMMU mapping changes. The vhost application maintains
+a cache of IOMMU translations: sending translation requests back to
+QEMU on cache misses, and in turn receiving flush requests from QEMU
+when mappings are purged.
+
+applicability to device separation
+''''''''''''''''''''''''''''''''''
+
+Much of the vhost model can be re-used by separated device emulation. In
+particular, the ideas of using a socket between QEMU and the device
+emulation application, using a file descriptor to inject interrupts into
+the VM via KVM, and allowing the application to ``mmap()`` the guest
+should be re-used.
+
+There are, however, some notable differences between how a vhost
+application works and the needs of separated device emulation. The most
+basic is that vhost uses custom virtio device drivers which always
+trigger IO with MMIO stores. A separated device emulation model must
+work with existing IO device models and guest device drivers. MMIO loads
+break vhost store acceleration since they are synchronous - guest
+progress cannot continue until the load has been emulated. By contrast,
+stores are asynchronous: the guest can continue after the store event
+has been sent to the vhost application.
+
+Another difference is that in the vhost user model, a single daemon can
+support multiple QEMU instances. This is contrary to the desired security
+regime, in which the emulation application should only be allowed to
+access the files or devices that the VM it is running on behalf of can
+access.
+
+qemu-io model
+^^^^^^^^^^^^^
+
+Qemu-io is a test harness used to test changes to the QEMU block backend
+object code (e.g., the code that implements disk images for disk driver
+emulation). Qemu-io is not a device emulation application per se, but it
+does compile the QEMU block objects into a separate binary from the main
+QEMU one. This could be useful for disk device emulation, since its
+emulation applications will need to include the QEMU block objects.
+
+New separation model based on proxy objects
+-------------------------------------------
+
+A different model based on proxy objects in the QEMU program
+communicating with remote emulation programs could provide separation
+while minimizing the changes needed to the device emulation code. The
+rest of this section is a discussion of how a proxy object model would
+work.
+
+Remote emulation processes
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The remote emulation process will run the QEMU object hierarchy without
+modification. The device emulation objects will also be based on the
+QEMU code, because for anything but the simplest device, it would not be
+tractable to re-implement both the object model and the many device
+backends that QEMU has.
+
+The processes will communicate with the QEMU process over UNIX domain
+sockets. The processes can be executed either as standalone processes,
+or be executed by QEMU. In both cases, the host backends the emulation
+processes will provide are specified on their command lines, as they
+would be for QEMU. For example:
+
+::
+
+ disk-proc -blockdev driver=file,node-name=file0,filename=disk-file0 \
+ -blockdev driver=qcow2,node-name=drive0,file=file0
+
+would indicate process *disk-proc* uses a qcow2 emulated disk named
+*file0* as its backend.
+
+Emulation processes may emulate more than one guest controller. A common
+configuration might be to put all controllers of the same device class
+(e.g., disk, network, etc.) in a single process, so that all backends of
+the same type can be managed by a single QMP monitor.
+
+communication with QEMU
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The first argument to the remote emulation process will be the number of
+a Unix domain socket that connects with the Proxy object. This is a
+required argument.
+
+::
+
+ disk-proc <socket number> <backend list>
+
+remote process QMP monitor
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Remote emulation processes can be monitored via QMP, similar to QEMU
+itself. The QMP monitor socket is specified in the same way as for a QEMU
+process:
+
+::
+
+ disk-proc -qmp unix:/tmp/disk-mon,server
+
+can be monitored over the UNIX socket path */tmp/disk-mon*.
+
+QEMU command line
+~~~~~~~~~~~~~~~~~
+
+Each remote device emulated in a remote process on the host is
+represented as a *-device* of type *pci-proxy-dev*. A socket
+sub-option to this option specifies the Unix socket that connects
+to the remote process. An *id* sub-option is required, and it should
+be the same id as used in the remote process.
+
+::
+
+ qemu-system-x86_64 ... -device pci-proxy-dev,id=lsi0,socket=3
+
+can be used to add a device emulated in a remote process.
+
+
+QEMU management of remote processes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+QEMU is not aware of the type of the remote PCI device. It is a
+pass-through device as far as QEMU is concerned.
+
+communication with emulation process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+primary channel
+'''''''''''''''
+
+The primary channel (referred to as com in the code) is used to bootstrap
+the remote process. It is also used to pass on device-agnostic commands
+like reset.
+
+per-device channels
+'''''''''''''''''''
+
+Each remote device communicates with QEMU using a dedicated communication
+channel. The proxy object sets up this channel using the primary
+channel during its initialization.
+
+QEMU device proxy objects
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+QEMU has an object model based on sub-classes inherited from the
+"object" super-class. The sub-classes that are of interest here are the
+"device" and "bus" sub-classes whose child sub-classes make up the
+device tree of a QEMU emulated system.
+
+The proxy object model will use device proxy objects to replace the
+device emulation code within the QEMU process. These objects will live
+in the same place in the object and bus hierarchies as the objects they
+replace; i.e., the proxy object for an LSI SCSI controller will be a
+sub-class of the "pci-device" class, and will have the same PCI bus
+parent and the same SCSI bus child objects as the LSI controller object
+it replaces.
+
+It is worth noting that the same proxy object is used to mediate with
+all types of remote PCI devices.
+
+object initialization
+^^^^^^^^^^^^^^^^^^^^^
+
+The Proxy device objects are initialized in the exact same manner in
+which any other QEMU device would be initialized.
+
+In addition, the Proxy objects perform the following two tasks:
+
+- Parse the "socket" sub-option and connect to the remote process
+  using this channel
+- Use the "id" sub-option to connect to the emulated device on the
+  separate process
+
+class\_init
+'''''''''''
+
+The ``class_init()`` method of a proxy object will, in general, behave
+similarly to the object it replaces, including setting any static
+properties and methods needed by the proxy.
+
+instance\_init / realize
+''''''''''''''''''''''''
+
+The ``instance_init()`` and ``realize()`` functions would only need to
+perform tasks related to being a proxy, such as registering its own
+MMIO handlers, or creating a child bus that other proxy devices can be
+attached to later.
+
+Other tasks will be device-specific. For example, PCI device objects
+will initialize the PCI config space in order to make a valid PCI device
+tree within the QEMU process.
+
+address space registration
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Most devices are driven by guest device driver accesses to IO addresses
+or ports. The QEMU device emulation code uses QEMU's memory region
+function calls (such as ``memory_region_init_io()``) to add callback
+functions that QEMU will invoke when the guest accesses the device's
+areas of the IO address space. When a guest driver does access the
+device, the VM will exit HW virtualization mode and return to QEMU,
+which will then lookup and execute the corresponding callback function.
+
+A proxy object would need to mirror the memory region calls the actual
+device emulator would perform in its initialization code, but with its
+own callbacks. When invoked by QEMU as a result of a guest IO operation,
+they will forward the operation to the device emulation process.
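+
+A hypothetical sketch of such a proxy callback pair, where ``ProxyDev``
+and ``send_mmio_request()`` are assumed helpers rather than existing
+QEMU code:
+
+::
+
+   static uint64_t proxy_mmio_read(void *opaque, hwaddr addr, unsigned size)
+   {
+       ProxyDev *dev = opaque;
+
+       /* Forward the load and wait for the remote process's reply. */
+       return send_mmio_request(dev, addr, 0, size, false);
+   }
+
+   static void proxy_mmio_write(void *opaque, hwaddr addr,
+                                uint64_t val, unsigned size)
+   {
+       ProxyDev *dev = opaque;
+
+       /* Stores can be posted; no need to wait for completion. */
+       send_mmio_request(dev, addr, val, size, true);
+   }
+
+   static const MemoryRegionOps proxy_mmio_ops = {
+       .read = proxy_mmio_read,
+       .write = proxy_mmio_write,
+       .endianness = DEVICE_NATIVE_ENDIAN,
+   };
+
+   /* Called from the proxy's realize() in place of the real device's
+    * MMIO setup. */
+   static void proxy_init_mmio(ProxyDev *dev, uint64_t bar_size)
+   {
+       memory_region_init_io(&dev->mmio, OBJECT(dev), &proxy_mmio_ops,
+                             dev, "proxy-mmio", bar_size);
+   }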
+
+PCI config space
+^^^^^^^^^^^^^^^^
+
+PCI devices also have a configuration space that can be accessed by the
+guest driver. Guest accesses to this space are not handled by the device
+emulation object, but by its PCI parent object. Much of this space is
+read-only, but certain registers (especially BAR and MSI-related ones)
+need to be propagated to the emulation process.
+
+PCI parent proxy
+''''''''''''''''
+
+One way to propagate guest PCI config accesses is to create a
+"pci-device-proxy" class that can serve as the parent of a PCI device
+proxy object. This class's parent would be "pci-device" and it would
+override the PCI parent's ``config_read()`` and ``config_write()``
+methods with ones that forward these operations to the emulation
+program.
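+
+A hypothetical sketch of the override, with ``PROXY_DEV()`` and the
+``forward_config_*()`` helpers assumed rather than existing QEMU code:
+
+::
+
+   static uint32_t proxy_config_read(PCIDevice *d, uint32_t addr, int len)
+   {
+       /* Ask the remote process for the current register value. */
+       return forward_config_read(PROXY_DEV(d), addr, len);
+   }
+
+   static void proxy_config_write(PCIDevice *d, uint32_t addr,
+                                  uint32_t val, int len)
+   {
+       /* Propagate the guest's write to the remote process. */
+       forward_config_write(PROXY_DEV(d), addr, val, len);
+   }
+
+   static void proxy_class_init(ObjectClass *klass, void *data)
+   {
+       PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+       k->config_read = proxy_config_read;
+       k->config_write = proxy_config_write;
+   }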
+
+interrupt receipt
+^^^^^^^^^^^^^^^^^
+
+A proxy for a device that generates interrupts will need to create a
+socket to receive interrupt indications from the emulation process. An
+incoming interrupt indication would then be sent up to its bus parent to
+be injected into the guest. For example, a PCI device object may use
+``pci_set_irq()``.
+
+live migration
+^^^^^^^^^^^^^^
+
+The proxy will register to save and restore any *vmstate* it needs over
+a live migration event. The device proxy does not need to manage the
+remote device's *vmstate*; that will be handled by the remote process
+proxy (see below).
+
+QEMU remote device operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Generic device operations, such as DMA, will be performed by the remote
+process proxy by sending messages to the remote process.
+
+DMA operations
+^^^^^^^^^^^^^^
+
+DMA operations would be handled much like vhost applications do. One of
+the initial messages sent to the emulation process is a guest memory
+table. Each entry in this table consists of a file descriptor and size
+that the emulation process can ``mmap()`` to directly access guest
+memory, similar to ``vhost_user_set_mem_table()``. Note that guest memory
+must be backed by file descriptors, such as when QEMU is given the
+*-mem-path* command line option.
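+
+A hypothetical layout for one entry of such a table (the descriptor
+itself travels as ``SCM_RIGHTS`` ancillary data alongside the message):
+
+::
+
+   #include <stdint.h>
+
+   typedef struct {
+       uint64_t gpa;    /* guest physical base address of the region */
+       uint64_t size;   /* region size in bytes */
+       uint64_t offset; /* offset of the region within the memfd */
+   } GuestMemEntry;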
+
+IOMMU operations
+^^^^^^^^^^^^^^^^
+
+When the emulated system includes an IOMMU, the remote process proxy in
+QEMU will need to create a socket for IOMMU requests from the emulation
+process. It will handle those requests with an
+``address_space_get_iotlb_entry()`` call. In order to handle IOMMU
+unmaps, the remote process proxy will also register as a listener on the
+device's DMA address space. When an IOMMU memory region is created
+within the DMA address space, an IOMMU notifier for unmaps will be added
+to the memory region that will forward unmaps to the emulation process
+over the IOMMU socket.
+
+device hot-plug via QMP
+^^^^^^^^^^^^^^^^^^^^^^^
+
+An QMP "device\_add" command can add a device emulated by a remote
+process. It will also have "rid" option to the command, just as the
+*-device* command line option does. The remote process may either be one
+started at QEMU startup, or be one added by the "add-process" QMP
+command described above. In either case, the remote process proxy will
+forward the new device's JSON description to the corresponding emulation
+process.
+
+live migration
+^^^^^^^^^^^^^^
+
+The remote process proxy will also register for live migration
+notifications with ``vmstate_register()``. When called to save state,
+the proxy will send the remote process a secondary socket file
+descriptor to save the remote process's device *vmstate* over. The
+incoming byte stream length and data will be saved as the proxy's
+*vmstate*. When the proxy is resumed on its new host, this *vmstate*
+will be extracted, and a secondary socket file descriptor will be sent
+to the new remote process through which it receives the *vmstate* in
+order to restore the devices there.
+
+device emulation in remote process
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The parts of QEMU that the emulation program will need include the
+object model; the memory emulation objects; the device emulation objects
+of the targeted device, and any dependent devices; and, the device's
+backends. It will also need code to setup the machine environment,
+handle requests from the QEMU process, and route machine-level requests
+(such as interrupts or IOMMU mappings) back to the QEMU process.
+
+initialization
+^^^^^^^^^^^^^^
+
+The process initialization sequence will follow the same order as
+QEMU's. It will first initialize the backend objects, then the
+device emulation objects. The JSON descriptions sent by the QEMU process
+will drive which objects need to be created.
+
+- address spaces
+
+Before the device objects are created, the initial address spaces and
+memory regions must be configured with ``memory_map_init()``. This
+creates a RAM memory region object (*system\_memory*) and an IO memory
+region object (*system\_io*).
+
+- RAM
+
+RAM memory region creation will follow how ``pc_memory_init()`` creates
+them, but must use ``memory_region_init_ram_from_fd()`` instead of
+``memory_region_allocate_system_memory()``. The file descriptors needed
+will be supplied by the guest memory table from above. Those RAM regions
+would then be added to the *system\_memory* memory region with
+``memory_region_add_subregion()``.
+
+- PCI
+
+IO initialization will be driven by the JSON descriptions sent from the
+QEMU process. For a PCI device, a PCI bus will need to be created with
+``pci_root_bus_new()``, and a PCI memory region will need to be created
+and added to the *system\_memory* memory region with
+``memory_region_add_subregion_overlap()``. The overlap version is
+required for architectures where PCI memory overlaps with RAM memory.
+
+MMIO handling
+^^^^^^^^^^^^^
+
+The device emulation objects will use ``memory_region_init_io()`` to
+install their MMIO handlers, and ``pci_register_bar()`` to associate
+those handlers with a PCI BAR, as they do within QEMU currently.
+
+In order to use ``address_space_rw()`` in the emulation process to
+handle MMIO requests from QEMU, the PCI physical addresses must be the
+same in the QEMU process and the device emulation process. In order to
+accomplish that, guest BAR programming must also be forwarded from QEMU
+to the emulation process.
+
+interrupt injection
+^^^^^^^^^^^^^^^^^^^
+
+When device emulation wants to inject an interrupt into the VM, the
+request climbs the device's bus object hierarchy until the point where a
+bus object knows how to signal the interrupt to the guest. The details
+depend on the type of interrupt being raised.
+
+- PCI pin interrupts
+
+On x86 systems, there is an emulated IOAPIC object attached to the root
+PCI bus object, and the root PCI object forwards interrupt requests to
+it. The IOAPIC object, in turn, calls the KVM driver to inject the
+corresponding interrupt into the VM. The simplest way to handle this in
+an emulation process would be to setup the root PCI bus driver (via
+``pci_bus_irqs()``) to send an interrupt request back to the QEMU
+process, and have the device proxy object reflect it up the PCI tree
+there.
+
+- PCI MSI/X interrupts
+
+PCI MSI/X interrupts are implemented in HW as DMA writes to a
+CPU-specific PCI address. In QEMU on x86, a KVM APIC object receives
+these DMA writes, then calls into the KVM driver to inject the interrupt
+into the VM. A simple emulation process implementation would be to send
+the MSI DMA address from QEMU as a message at initialization, then
+install an address space handler at that address which forwards the MSI
+message back to QEMU.
+
+DMA operations
+^^^^^^^^^^^^^^
+
+When an emulation object wants to DMA into or out of guest memory, it
+first must use ``dma_memory_map()`` to convert the DMA address to a local
+virtual address. The emulation process memory region objects set up above
+will be used to translate the DMA address to a local virtual address the
+device emulation code can access.
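+
+A sketch of a device-to-guest transfer using this interface (the exact
+signatures vary slightly between QEMU versions):
+
+::
+
+   static void dev_dma_write(AddressSpace *dma_as, dma_addr_t dma_addr,
+                             const void *buf, dma_addr_t size)
+   {
+       dma_addr_t len = size;
+       void *ptr = dma_memory_map(dma_as, dma_addr, &len,
+                                  DMA_DIRECTION_FROM_DEVICE);
+
+       if (ptr) {
+           memcpy(ptr, buf, len);  /* copy device data into guest memory */
+           dma_memory_unmap(dma_as, ptr, len,
+                            DMA_DIRECTION_FROM_DEVICE, len);
+       }
+   }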
+
+IOMMU
+^^^^^
+
+When an IOMMU is in use in QEMU, DMA translation uses IOMMU memory
+regions to translate the DMA address to a guest physical address before
+that physical address can be translated to a local virtual address. The
+emulation process will need similar functionality.
+
+- IOTLB cache
+
+The emulation process will maintain a cache of recent IOMMU translations
+(the IOTLB). When the ``translate()`` callback of an IOMMU memory region
+is invoked, the IOTLB cache will be searched for an entry that will map
+the DMA address to a guest PA. On a cache miss, a message will be sent
+back to QEMU requesting the corresponding translation entry, which will
+both be used to return a guest address and be added to the cache.
+
+- IOTLB purge
+
+The IOMMU emulation will also need to act on unmap requests from QEMU.
+These happen when the guest IOMMU driver purges an entry from the
+guest's translation table.
+
+live migration
+^^^^^^^^^^^^^^
+
+When a remote process receives a live migration indication from QEMU, it
+will set up a channel using the received file descriptor with
+``qio_channel_socket_new_fd()``. This channel will be used to create a
+*QEMUfile* that can be passed to ``qemu_save_device_state()`` to send
+the process's device state back to QEMU. This method will be reversed on
+restore - the channel will be passed to ``qemu_loadvm_state()`` to
+restore the device state.
+
+Accelerating device emulation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The messages that are required to be sent between QEMU and the emulation
+process can add considerable latency to IO operations. The optimizations
+described below attempt to ameliorate this effect by allowing the
+emulation process to communicate directly with the kernel KVM driver.
+The KVM file descriptors created would be passed to the emulation process
+via initialization messages, much like the guest memory table is.
+
+MMIO acceleration
+^^^^^^^^^^^^^^^^^
+
+Vhost user applications can receive guest virtio driver stores directly
+from KVM. The issue with the eventfd mechanism used by vhost user is
+that it does not pass any data with the event indication, so it cannot
+handle guest loads or guest stores that carry store data. This concept
+could, however, be expanded to cover more cases.
+
+The expanded idea would require a new type of KVM device:
+*KVM\_DEV\_TYPE\_USER*. This device has two file descriptors: a master
+descriptor that QEMU can use for configuration, and a slave descriptor
+that the emulation process can use to receive MMIO notifications. QEMU
+would create both descriptors using the KVM driver, and pass the slave
+descriptor to the emulation process via an initialization message.
+
+data structures
+^^^^^^^^^^^^^^^
+
+- guest physical range
+
+The guest physical range structure describes the address range that a
+device will respond to. It includes the base and length of the range, as
+well as which bus the range resides on (e.g., on an x86 machine, it can
+specify whether the range refers to memory or IO addresses).
+
+A device can have multiple physical address ranges it responds to (e.g.,
+a PCI device can have multiple BARs), so the structure will also include
+an enumerated identifier to specify which of the device's ranges is
+being referred to.
+
++--------+----------------------------+
+| Name | Description |
++========+============================+
+| addr | range base address |
++--------+----------------------------+
+| len | range length |
++--------+----------------------------+
+| bus | addr type (memory or IO) |
++--------+----------------------------+
+| id | range ID (e.g., PCI BAR) |
++--------+----------------------------+
+
+- MMIO request structure
+
+This structure describes an MMIO operation. It includes which guest
+physical range the MMIO was within, the offset within that range, the
+MMIO type (e.g., load or store), and its length and data. It also
+includes a sequence number that can be used to reply to the MMIO, and
+the CPU that issued the MMIO.
+
++----------+------------------------+
+| Name | Description |
++==========+========================+
+| rid | range MMIO is within |
++----------+------------------------+
+| offset | offset within *rid* |
++----------+------------------------+
+| type | e.g., load or store |
++----------+------------------------+
+| len | MMIO length |
++----------+------------------------+
+| data | store data |
++----------+------------------------+
+| seq | sequence ID |
++----------+------------------------+
+
+- MMIO request queues
+
+MMIO request queues are FIFO arrays of MMIO request structures. There
+are two queues: the pending queue is for MMIOs that haven't been read by
+the emulation program, and the sent queue is for MMIOs that haven't been
+acknowledged. The main use of the second queue is to validate MMIO
+replies from the emulation program.
+
+- scoreboard
+
+Each CPU in the VM is emulated in QEMU by a separate thread, so multiple
+MMIOs may be waiting to be consumed by an emulation program and multiple
+threads may be waiting for MMIO replies. The scoreboard would contain a
+wait queue and sequence number for the per-CPU threads, allowing them to
+be individually woken when the MMIO reply is received from the emulation
+program. It also tracks the number of posted MMIO stores to the device
+that haven't been replied to, in order to satisfy the PCI constraint
+that a load to a device will not complete until all previous stores to
+that device have been completed.
+
+- device shadow memory
+
+Some MMIO loads do not have device side-effects. These MMIOs can be
+completed without sending an MMIO request to the emulation program if the
+emulation program shares a shadow image of the device's memory image
+with the KVM driver.
+
+The emulation program will ask the KVM driver to allocate memory for the
+shadow image, and will then use ``mmap()`` to directly access it. The
+emulation program can control KVM access to the shadow image by sending
+KVM an access map telling it which areas of the image have no
+side-effects (and can be completed immediately), and which require an
+MMIO request to the emulation program. The access map can also inform
+the KVM driver which access sizes are allowed to the image.
+
+master descriptor
+^^^^^^^^^^^^^^^^^
+
+The master descriptor is used by QEMU to configure the new KVM device.
+The descriptor would be returned by the KVM driver when QEMU issues a
+*KVM\_CREATE\_DEVICE* ``ioctl()`` with a *KVM\_DEV\_TYPE\_USER* type.
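+
+A sketch of the creation step; ``KVM_CREATE_DEVICE`` is an existing KVM
+ioctl, while ``KVM_DEV_TYPE_USER`` is the proposed (not yet existing)
+device type:
+
+::
+
+   static int create_user_device(int vm_fd)
+   {
+       struct kvm_create_device cd = {
+           .type = KVM_DEV_TYPE_USER,  /* proposed device type */
+       };
+
+       ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);  /* error handling omitted */
+       return cd.fd;  /* the master descriptor */
+   }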
+
+KVM\_DEV\_TYPE\_USER device ops
+'''''''''''''''''''''''''''''''
+
+The *KVM\_DEV\_TYPE\_USER* operations vector will be registered by a
+``kvm_register_device_ops()`` call when the KVM system is initialized by
+``kvm_init()``. These device ops are called by the KVM driver when QEMU
+executes certain ``ioctl()`` operations on its KVM file descriptor. They
+include:
+
+- create
+
+This routine is called when QEMU issues a *KVM\_CREATE\_DEVICE*
+``ioctl()`` on its per-VM file descriptor. It will allocate and
+initialize a KVM user device specific data structure, and assign the
+*kvm\_device* private field to it.
+
+- ioctl
+
+This routine is invoked when QEMU issues an ``ioctl()`` on the master
+descriptor. The ``ioctl()`` commands supported are defined by the KVM
+device type. *KVM\_DEV\_TYPE\_USER* ones will need several commands:
+
+*KVM\_DEV\_USER\_SLAVE\_FD* creates the slave file descriptor that will
+be passed to the device emulation program. Only one slave can be created
+by each master descriptor. The file operations performed by this
+descriptor are described below.
+
+The *KVM\_DEV\_USER\_PA\_RANGE* command configures a guest physical
+address range that the slave descriptor will receive MMIO notifications
+for. The range is specified by a guest physical range structure
+argument. For buses that assign addresses to devices dynamically, this
+command can be executed while the guest is running, such as the case
+when a guest changes a device's PCI BAR registers.
+
+*KVM\_DEV\_USER\_PA\_RANGE* will use ``kvm_io_bus_register_dev()`` to
+register *kvm\_io\_device\_ops* callbacks to be invoked when the guest
+performs an MMIO operation within the range. When a range is changed,
+``kvm_io_bus_unregister_dev()`` is used to remove the previous
+instantiation.
+
+*KVM\_DEV\_USER\_TIMEOUT* will configure a timeout value that specifies
+how long KVM will wait for the emulation process to respond to an MMIO
+indication.
+
+- destroy
+
+This routine is called when the VM instance is destroyed. It will need
+to destroy the slave descriptor and free any memory allocated by the
+driver, as well as the *kvm\_device* structure itself.
+
+slave descriptor
+^^^^^^^^^^^^^^^^
+
+The slave descriptor will have its own file operations vector, which
+responds to system calls on the descriptor performed by the device
+emulation program.
+
+- read
+
+A read returns any pending MMIO requests from the KVM driver as MMIO
+request structures. Multiple structures can be returned if there are
+multiple MMIO operations pending. The MMIO requests are moved from the
+pending queue to the sent queue, and if there are threads waiting for
+space in the pending queue to add new MMIO operations, they will be woken
+here.
+
+- write
+
+A write also consists of a set of MMIO requests. They are compared to
+the MMIO requests in the sent queue. Matches are removed from the sent
+queue, and any threads waiting for the reply are woken. If a store is
+removed, then the number of posted stores in the per-CPU scoreboard is
+decremented. When the number is zero, and a non-side-effect load was
+waiting for posted stores to complete, the load is continued.
+
+- ioctl
+
+There are several ioctl()s that can be performed on the slave
+descriptor.
+
+A *KVM\_DEV\_USER\_SHADOW\_SIZE* ``ioctl()`` causes the KVM driver to
+allocate memory for the shadow image. This memory can later be
+``mmap()``\ ed by the emulation process to share the emulation's view of
+device memory with the KVM driver.
+
+A *KVM\_DEV\_USER\_SHADOW\_CTRL* ``ioctl()`` controls access to the
+shadow image. It will send the KVM driver a shadow control map, which
+specifies which areas of the image can complete guest loads without
+sending the load request to the emulation program. It will also specify
+the size of load operations that are allowed.
+
+- poll
+
+An emulation program will use the ``poll()`` call with a *POLLIN* flag
+to determine if there are MMIO requests waiting to be read. It will
+return when the pending MMIO request queue is not empty.
+
+- mmap
+
+This call allows the emulation program to directly access the shadow
+image allocated by the KVM driver. As device emulation updates device
+memory, changes with no side-effects will be reflected in the shadow,
+and the KVM driver can satisfy guest loads from the shadow image without
+needing to wait for the emulation program.
+
+kvm\_io\_device ops
+^^^^^^^^^^^^^^^^^^^
+
+Each KVM per-CPU thread can handle MMIO operations on behalf of the guest
+VM. KVM will use the MMIO's guest physical address to search for a
+matching *kvm\_io\_device* to see if the MMIO can be handled by the KVM
+driver instead of exiting back to QEMU. If a match is found, the
+corresponding callback will be invoked.
+
+- read
+
+This callback is invoked when the guest performs a load to the device.
+Loads with side-effects must be handled synchronously, with the KVM
+driver putting the QEMU thread to sleep waiting for the emulation
+process reply before re-starting the guest. Loads that do not have
+side-effects may be optimized by satisfying them from the shadow image,
+if there are no outstanding stores to the device by this CPU. PCI memory
+ordering demands that a load cannot complete before all older stores to
+the same device have been completed.
+
+- write
+
+Stores can be handled asynchronously unless the pending MMIO request
+queue is full. In this case, the QEMU thread must sleep waiting for
+space in the queue. Stores will increment the number of posted stores in
+the per-CPU scoreboard, in order to implement the PCI ordering
+constraint above.
+
+interrupt acceleration
+^^^^^^^^^^^^^^^^^^^^^^
+
+This performance optimization would work much like a vhost user
+application does, where the QEMU process sets up *eventfds* that cause
+the device's corresponding interrupt to be triggered by the KVM driver.
+These irq file descriptors are sent to the emulation process at
+initialization, and are used when the emulation code raises a device
+interrupt.
+
+intx acceleration
+'''''''''''''''''
+
+Traditional PCI pin interrupts are level based, so, in addition to an
+irq file descriptor, a re-sampling file descriptor needs to be sent to
+the emulation program. This second file descriptor allows multiple
+devices sharing an irq to be notified when the interrupt has been
+acknowledged by the guest, so they can re-trigger the interrupt if their
+device has not de-asserted its interrupt.
+
+intx irq descriptor
+"""""""""""""""""""
+
+The irq descriptors are created by the proxy object using
+``event_notifier_init()`` to create the irq and re-sampling
+*eventfds*, and ``kvm_vm_ioctl(KVM_IRQFD)`` to bind them to an interrupt.
+The interrupt route can be found with
+``pci_device_route_intx_to_irq()``.
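+
+A sketch of those steps in proxy code, assuming QEMU's existing helpers
+(``pdev`` is the proxy's ``PCIDevice``; error handling omitted):
+
+::
+
+   static void proxy_enable_intx(PCIDevice *pdev)
+   {
+       EventNotifier *intr = g_new0(EventNotifier, 1);
+       EventNotifier *resample = g_new0(EventNotifier, 1);
+       PCIINTxRoute route;
+       struct kvm_irqfd irqfd;
+
+       event_notifier_init(intr, 0);
+       event_notifier_init(resample, 0);
+       route = pci_device_route_intx_to_irq(pdev, pci_intx(pdev));
+
+       irqfd = (struct kvm_irqfd) {
+           .fd         = event_notifier_get_fd(intr),
+           .resamplefd = event_notifier_get_fd(resample),
+           .flags      = KVM_IRQFD_FLAG_RESAMPLE,
+           .gsi        = route.irq,
+       };
+       kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
+       /* both eventfds are then sent to the emulation program */
+   }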
+
+intx routing changes
+""""""""""""""""""""
+
+Intx routing can be changed when the guest programs the APIC the device
+pin is connected to. The proxy object in QEMU will use
+``pci_device_set_intx_routing_notifier()`` to be informed of any guest
+changes to the route. This handler will broadly follow the VFIO
+interrupt logic to change the route: de-assigning the existing irq
+descriptor from its route, then assigning it the new route (see
+``vfio_intx_update()``).
+
+MSI/X acceleration
+''''''''''''''''''
+
+MSI/X interrupts are sent as DMA transactions to the host. The interrupt
+data contains a vector that is programmed by the guest. A device may have
+multiple MSI interrupts associated with it, so multiple irq descriptors
+may need to be sent to the emulation program.
+
+MSI/X irq descriptor
+""""""""""""""""""""
+
+This case will also follow the VFIO example. For each MSI/X interrupt,
+an *eventfd* is created, a virtual interrupt is allocated by
+``kvm_irqchip_add_msi_route()``, and the virtual interrupt is bound to
+the eventfd with ``kvm_irqchip_add_irqfd_notifier()``.
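+
+A sketch of the per-vector setup in proxy code, following the VFIO
+pattern with QEMU's existing helpers (error handling omitted):
+
+::
+
+   static void proxy_enable_msix_vector(PCIDevice *pdev, int vector,
+                                        EventNotifier *notifier)
+   {
+       int virq;
+
+       event_notifier_init(notifier, 0);
+       virq = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
+       kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL, virq);
+       /* event_notifier_get_fd(notifier) is then sent to the emulation
+        * program for this vector */
+   }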
+
+MSI/X config space changes
+""""""""""""""""""""""""""
+
+The guest may dynamically update several MSI-related tables in the
+device's PCI config space. These include per-MSI interrupt enables and
+vector data. Additionally, MSI-X tables exist in device memory space, not
+config space. Much like the BAR case above, the proxy object must look
+at guest config space programming to keep the MSI interrupt state
+consistent between QEMU and the emulation program.
+
+--------------
+
+Disaggregated CPU emulation
+---------------------------
+
+After IO services have been disaggregated, a second phase would be to
+separate a process to handle CPU instruction emulation from the main
+QEMU control function. There are no object separation points for this
+code, so the first task would be to create one.
+
+Host access controls
+--------------------
+
+Separating QEMU relies on the host OS's access restriction mechanisms to
+enforce that the differing processes can only access the objects they
+are entitled to. There are a couple of types of mechanisms usually
+provided by general-purpose OSs.
+
+Discretionary access control
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Discretionary access control allows each user to control who can access
+their files. In Linux, this type of control is usually too coarse for
+QEMU separation, since it only provides three separate access controls:
+one for the same user ID, the second for user IDs with the same group
+ID, and the third for all other user IDs. Each device instance would
+need a separate user ID to provide access control, which is likely to be
+unwieldy for dynamically created VMs.
+
+Mandatory access control
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Mandatory access control allows the OS to add an additional set of
+controls on top of the discretionary ones. It also
+adds other attributes to processes and files such as types, roles, and
+categories, and can establish rules for how processes and files can
+interact.
+
+Type enforcement
+^^^^^^^^^^^^^^^^
+
+Type enforcement assigns a *type* attribute to processes and files, and
+allows rules to be written on what operations a process with a given
+type can perform on a file with a given type. QEMU separation could take
+advantage of type enforcement by running the emulation processes with
+different types, both from the main QEMU process, and from the emulation
+processes of different classes of devices.
+
+For example, guest disk images and disk emulation processes could have
+types separate from the main QEMU process and non-disk emulation
+processes, and the type rules could prevent processes other than disk
+emulation ones from accessing guest disk images. Similarly, network
+emulation processes can have a type separate from the main QEMU process
+and non-network emulation processes, and only that type can access the
+host tun/tap device used to provide guest networking.
+
+Category enforcement
+^^^^^^^^^^^^^^^^^^^^
+
+Category enforcement assigns a set of numbers within a given range to
+the process or file. The process is granted access to the file if the
+process's set is a superset of the file's set. This enforcement can be
+used to separate multiple instances of devices in the same class.
+
+For example, if there are multiple disk devices provided to a guest,
+each device emulation process could be provisioned with a separate
+category. The different device emulation processes would not be able to
+access each other's backing disk images.
+
+Alternatively, categories could be used in lieu of the type enforcement
+scheme described above. In this scenario, different categories would be
+used to prevent device emulation processes in different classes from
+accessing resources assigned to other classes.
diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst
index 809af69725..209f9d8172 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -765,9 +765,6 @@ and hypothetical example follows:
class MultipleMachines(Test):
- """
- :avocado: enable
- """
def test_multiple_machines(self):
first_machine = self.get_vm()
second_machine = self.get_vm()
diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.txt
index e9271eba5d..f15bf35bd1 100644
--- a/docs/interop/parallels.txt
+++ b/docs/interop/parallels.txt
@@ -208,7 +208,7 @@ of its data area are:
28 - 31: l1_size
The number of entries in the L1 table of the bitmap.
- variable: l1 (64 * l1_size bytes)
+ variable: l1_table (8 * l1_size bytes)
L1 offset table (in bytes)
A dirty bitmap is stored using a one-level structure for the mapping to host
diff --git a/docs/meson.build b/docs/meson.build
index bb14eaebd3..f84306ba7e 100644
--- a/docs/meson.build
+++ b/docs/meson.build
@@ -46,9 +46,11 @@ if build_docs
meson.source_root() / 'docs/sphinx/qmp_lexer.py',
qapi_gen_depends ]
+ have_ga = have_tools and config_host.has_key('CONFIG_GUEST_AGENT')
+
man_pages = {
- 'qemu-ga.8': (have_tools ? 'man8' : ''),
- 'qemu-ga-ref.7': 'man7',
+ 'qemu-ga.8': (have_ga ? 'man8' : ''),
+ 'qemu-ga-ref.7': (have_ga ? 'man7' : ''),
'qemu-qmp-ref.7': 'man7',
'qemu-storage-daemon-qmp-ref.7': (have_tools ? 'man7' : ''),
'qemu-img.1': (have_tools ? 'man1' : ''),
diff --git a/docs/papr-pef.txt b/docs/papr-pef.txt
new file mode 100644
index 0000000000..72550e9bf8
--- /dev/null
+++ b/docs/papr-pef.txt
@@ -0,0 +1,30 @@
+POWER (PAPR) Protected Execution Facility (PEF)
+===============================================
+
+Protected Execution Facility (PEF), also known as Secure Guest support,
+is a feature found on IBM POWER9 and POWER10 processors.
+
+If a suitable firmware including an Ultravisor is installed, it adds
+an extra memory protection mode to the CPU. The ultravisor manages a
+pool of secure memory which cannot be accessed by the hypervisor.
+
+When this feature is enabled in QEMU, a guest can use ultracalls to
+enter "secure mode". This transfers most of its memory to secure
+memory, where it cannot be eavesdropped by a compromised hypervisor.
+
+Launching
+---------
+
+To launch a guest which will be permitted to enter PEF secure mode:
+
+# ${QEMU} \
+ -object pef-guest,id=pef0 \
+ -machine confidential-guest-support=pef0 \
+ ...
+
+Live Migration
+----------------
+
+Live migration is not yet implemented for PEF guests. For
+consistency, we currently prevent migration if the PEF feature is
+enabled, whether or not the guest has actually entered secure mode.
diff --git a/docs/system/arm/versatile.rst b/docs/system/arm/versatile.rst
index 51221c30a4..2ae792bac3 100644
--- a/docs/system/arm/versatile.rst
+++ b/docs/system/arm/versatile.rst
@@ -27,3 +27,37 @@ The Arm Versatile baseboard is emulated with the following devices:
devices.
- PL181 MultiMedia Card Interface with SD card.
+
+Booting a Linux kernel
+----------------------
+
+Building a current Linux kernel with ``versatile_defconfig`` should be
+enough to get something running. Nowadays an out-of-tree build is
+recommended (and also useful if you build a lot of different targets).
+In the following example $BLD points to the build directory and $SRC
+points to the root of the Linux source tree. You can drop $SRC if you
+are running from there.
+
+.. code-block:: bash
+
+ $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- versatile_defconfig
+ $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf-
+
+You may want to enable some additional modules if you want to boot
+something from the SCSI interface::
+
+ CONFIG_PCI=y
+ CONFIG_PCI_VERSATILE=y
+ CONFIG_SCSI=y
+ CONFIG_SCSI_SYM53C8XX_2=y
+
+You can then boot with a command line like:
+
+.. code-block:: bash
+
+ $ qemu-system-arm -machine type=versatilepb \
+ -serial mon:stdio \
+ -drive if=scsi,driver=file,filename=debian-buster-armel-rootfs.ext4 \
+ -kernel zImage \
+ -dtb versatile-pb.dtb \
+ -append "console=ttyAMA0 ro root=/dev/sda"
diff --git a/docs/system/arm/vexpress.rst b/docs/system/arm/vexpress.rst
index 7f1bcbef07..3e3839e923 100644
--- a/docs/system/arm/vexpress.rst
+++ b/docs/system/arm/vexpress.rst
@@ -58,3 +58,31 @@ Other differences between the hardware and the QEMU model:
``vexpress-a15``, and have IRQs from 40 upwards. If a dtb is
provided on the command line then QEMU will edit it to include
suitable entries describing these transports for the guest.
+
+Booting a Linux kernel
+----------------------
+
+Building a current Linux kernel with ``multi_v7_defconfig`` should be
+enough to get something running. Nowadays an out-of-tree build is
+recommended (and also useful if you build a lot of different targets).
+In the following example $BLD points to the build directory and $SRC
+points to the root of the Linux source tree. You can drop $SRC if you
+are running from there.
+
+.. code-block:: bash
+
+ $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf- multi_v7_defconfig
+ $ make O=$BLD -C $SRC ARCH=arm CROSS_COMPILE=arm-linux-gnueabihf-
+
+By default you will want to boot your rootfs off the sdcard interface.
+Your rootfs will need to be padded to the right size. With a suitable
+DTB you could also add devices to the virtio-mmio bus.
+
+.. code-block:: bash
+
+ $ qemu-system-arm -cpu cortex-a15 -smp 4 -m 4096 \
+ -machine type=vexpress-a15 -serial mon:stdio \
+ -drive if=sd,driver=file,filename=armel-rootfs.ext4 \
+ -kernel zImage \
+ -dtb vexpress-v2p-ca15-tc1.dtb \
+ -append "console=ttyAMA0 root=/dev/mmcblk0 ro"
diff --git a/docs/system/index.rst b/docs/system/index.rst
index d40f72c92b..625b494372 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -34,6 +34,7 @@ Contents:
pr-manager
targets
security
+ multi-process
deprecated
removed-features
build-platforms
diff --git a/docs/system/multi-process.rst b/docs/system/multi-process.rst
new file mode 100644
index 0000000000..46bb0cafc2
--- /dev/null
+++ b/docs/system/multi-process.rst
@@ -0,0 +1,64 @@
+Multi-process QEMU
+==================
+
+This document describes how to configure and use multi-process QEMU.
+For the design document, refer to docs/devel/qemu-multiprocess.
+
+1) Configuration
+----------------
+
+Multi-process is enabled by default for targets that enable KVM.
+
+
+2) Usage
+--------
+
+Multi-process QEMU requires an orchestrator to launch.
+
+The following is a description of the command-line used to launch mpqemu.
+
+* Orchestrator:
+
+ - The Orchestrator creates a unix socketpair
+
+ - It launches the remote process and passes one of the
+ sockets to it via command-line.
+
+ - It then launches QEMU and specifies the other socket as an option
+ to the Proxy device object
+
+* Remote Process:
+
+ - QEMU can enter remote process mode by using the "x-remote" machine
+ option.
+
+ - The orchestrator creates a "remote-object" with details about
+ the device and the file descriptor for the device
+
+ - The remaining options are no different from how one launches QEMU with
+ devices.
+
+ - Example command-line for the remote process is as follows:
+
+ /usr/bin/qemu-system-x86_64 \
+ -machine x-remote \
+ -device lsi53c895a,id=lsi0 \
+ -drive id=drive_image2,file=/build/ol7-nvme-test-1.qcow2 \
+ -device scsi-hd,id=drive2,drive=drive_image2,bus=lsi0.0,scsi-id=0 \
+ -object x-remote-object,id=robj1,devid=lsi0,fd=4
+
+* QEMU:
+
+ - Since parts of the RAM are shared between QEMU & remote process, a
+ memory-backend-memfd is required to facilitate this, as follows:
+
+ -object memory-backend-memfd,id=mem,size=2G
+
+ - A "x-pci-proxy-dev" device is created for each of the PCI devices emulated
+ in the remote process. A "socket" sub-option specifies the other end of
+ unix channel created by orchestrator. The "id" sub-option must be specified
+ and should be the same as the "id" specified for the remote PCI device
+
+ - Example command-line for QEMU is as follows:
+
+ -device x-pci-proxy-dev,id=lsi0,socket=3
diff --git a/docs/system/s390x/protvirt.rst b/docs/system/s390x/protvirt.rst
index 712974ad87..0f481043d9 100644
--- a/docs/system/s390x/protvirt.rst
+++ b/docs/system/s390x/protvirt.rst
@@ -22,15 +22,22 @@ If those requirements are met, the capability `KVM_CAP_S390_PROTECTED`
will indicate that KVM can support PVMs on that LPAR.
-QEMU Settings
--------------
+Running a Protected Virtual Machine
+-----------------------------------
-To indicate to the VM that it can transition into protected mode, the
+To run a PVM you will need to select a CPU model which includes the
`Unpack facility` (stfle bit 161 represented by the feature
-`unpack`/`S390_FEAT_UNPACK`) needs to be part of the cpu model of
-the VM.
+`unpack`/`S390_FEAT_UNPACK`), and add these options to the command line::
+
+ -object s390-pv-guest,id=pv0 \
+ -machine confidential-guest-support=pv0
+
+Adding these options will:
+
+* Ensure the `unpack` facility is available
+* Enable the IOMMU by default for all I/O devices
+* Initialize the PV mechanism
-All I/O devices need to use the IOMMU.
Passthrough (vfio) devices are currently not supported.
Host huge page backings are not supported. However guests can use huge
diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst
index fe41336dc5..ee862fa0bc 100644
--- a/docs/tools/qemu-nbd.rst
+++ b/docs/tools/qemu-nbd.rst
@@ -136,8 +136,8 @@ driver options if ``--image-opts`` is specified.
.. option:: -e, --shared=NUM
Allow up to *NUM* clients to share the device (default
- ``1``). Safe for readers, but for now, consistency is not
- guaranteed between multiple writers.
+ ``1``), 0 for unlimited. Safe for readers, but for now,
+ consistency is not guaranteed between multiple writers.
.. option:: -t, --persistent
diff --git a/gdbstub.c b/gdbstub.c
index c7ca7e9f88..759bb00bcf 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2245,7 +2245,6 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx)
{
TaskState *ts;
unsigned long offset, len, saved_auxv, auxv_len;
- const char *mem;
if (gdb_ctx->num_params < 2) {
put_packet("E22");
@@ -2257,8 +2256,8 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx)
ts = gdbserver_state.c_cpu->opaque;
saved_auxv = ts->info->saved_auxv;
auxv_len = ts->info->auxv_len;
- mem = (const char *)(saved_auxv + offset);
- if (offset > auxv_len) {
+
+ if (offset >= auxv_len) {
put_packet("E00");
return;
}
@@ -2269,12 +2268,20 @@ static void handle_query_xfer_auxv(GdbCmdContext *gdb_ctx, void *user_ctx)
if (len < auxv_len - offset) {
g_string_assign(gdbserver_state.str_buf, "m");
- memtox(gdbserver_state.str_buf, mem, len);
} else {
g_string_assign(gdbserver_state.str_buf, "l");
- memtox(gdbserver_state.str_buf, mem, auxv_len - offset);
+ len = auxv_len - offset;
+ }
+
+ g_byte_array_set_size(gdbserver_state.mem_buf, len);
+ if (target_memory_rw_debug(gdbserver_state.g_cpu, saved_auxv + offset,
+ gdbserver_state.mem_buf->data, len, false)) {
+ put_packet("E14");
+ return;
}
+ memtox(gdbserver_state.str_buf,
+ (const char *)gdbserver_state.mem_buf->data, len);
put_packet_binary(gdbserver_state.str_buf->str,
gdbserver_state.str_buf->len, true);
}
diff --git a/hw/Kconfig b/hw/Kconfig
index d4cec9e476..8ea26479c4 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -27,6 +27,7 @@ source pci-host/Kconfig
source pcmcia/Kconfig
source pci/Kconfig
source rdma/Kconfig
+source remote/Kconfig
source rtc/Kconfig
source scsi/Kconfig
source sd/Kconfig
diff --git a/hw/arm/aspeed_ast2600.c b/hw/arm/aspeed_ast2600.c
index 12e4a16d37..bf31ca351f 100644
--- a/hw/arm/aspeed_ast2600.c
+++ b/hw/arm/aspeed_ast2600.c
@@ -216,7 +216,7 @@ static void aspeed_soc_ast2600_init(Object *obj)
/*
* ASPEED ast2600 has 0xf as cluster ID
*
- * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0388e/CIHEBGFG.html
+ * https://developer.arm.com/documentation/ddi0388/e/the-system-control-coprocessors/summary-of-system-control-coprocessor-registers/multiprocessor-affinity-register
*/
static uint64_t aspeed_calc_affinity(int cpu)
{
diff --git a/hw/arm/musca.c b/hw/arm/musca.c
index 945643c3cd..7a83f7dda7 100644
--- a/hw/arm/musca.c
+++ b/hw/arm/musca.c
@@ -15,8 +15,8 @@
* https://developer.arm.com/products/system-design/development-boards/iot-test-chips-and-boards/musca-a-test-chip-board
* https://developer.arm.com/products/system-design/development-boards/iot-test-chips-and-boards/musca-b-test-chip-board
* We model the A and B1 variants of this board, as described in the TRMs:
- * http://infocenter.arm.com/help/topic/com.arm.doc.101107_0000_00_en/index.html
- * http://infocenter.arm.com/help/topic/com.arm.doc.101312_0000_00_en/index.html
+ * https://developer.arm.com/documentation/101107/latest/
+ * https://developer.arm.com/documentation/101312/latest/
*/
#include "qemu/osdep.h"
diff --git a/hw/arm/npcm7xx.c b/hw/arm/npcm7xx.c
index 72040d4079..d1fe9bd1df 100644
--- a/hw/arm/npcm7xx.c
+++ b/hw/arm/npcm7xx.c
@@ -576,14 +576,6 @@ static void npcm7xx_realize(DeviceState *dev, Error **errp)
create_unimplemented_device("npcm7xx.pcierc", 0xe1000000, 64 * KiB);
create_unimplemented_device("npcm7xx.kcs", 0xf0007000, 4 * KiB);
create_unimplemented_device("npcm7xx.gfxi", 0xf000e000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[0]", 0xf0010000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[1]", 0xf0011000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[2]", 0xf0012000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[3]", 0xf0013000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[4]", 0xf0014000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[5]", 0xf0015000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[6]", 0xf0016000, 4 * KiB);
- create_unimplemented_device("npcm7xx.gpio[7]", 0xf0017000, 4 * KiB);
create_unimplemented_device("npcm7xx.smbus[0]", 0xf0080000, 4 * KiB);
create_unimplemented_device("npcm7xx.smbus[1]", 0xf0081000, 4 * KiB);
create_unimplemented_device("npcm7xx.smbus[2]", 0xf0082000, 4 * KiB);
diff --git a/hw/arm/xlnx-versal.c b/hw/arm/xlnx-versal.c
index b0777166e8..628e77ef66 100644
--- a/hw/arm/xlnx-versal.c
+++ b/hw/arm/xlnx-versal.c
@@ -67,10 +67,10 @@ static void versal_create_apu_gic(Versal *s, qemu_irq *pic)
gicbusdev = SYS_BUS_DEVICE(&s->fpd.apu.gic);
gicdev = DEVICE(&s->fpd.apu.gic);
qdev_prop_set_uint32(gicdev, "revision", 3);
- qdev_prop_set_uint32(gicdev, "num-cpu", 2);
+ qdev_prop_set_uint32(gicdev, "num-cpu", nr_apu_cpus);
qdev_prop_set_uint32(gicdev, "num-irq", XLNX_VERSAL_NR_IRQS + 32);
qdev_prop_set_uint32(gicdev, "len-redist-region-count", 1);
- qdev_prop_set_uint32(gicdev, "redist-region-count[0]", 2);
+ qdev_prop_set_uint32(gicdev, "redist-region-count[0]", nr_apu_cpus);
qdev_prop_set_bit(gicdev, "has-security-extensions", true);
sysbus_realize(SYS_BUS_DEVICE(&s->fpd.apu.gic), &error_fatal);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2670787d26..93ac6e107a 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -16,6 +16,7 @@
#include "qemu/units.h"
#include "qemu/cutils.h"
#include "qemu/log.h"
+#include "qemu/error-report.h"
#include "hw/block/block.h"
#include "hw/pci/pci.h"
#include "sysemu/sysemu.h"
@@ -25,28 +26,47 @@
#include "hw/qdev-properties.h"
#include "hw/qdev-core.h"
+#include "trace.h"
#include "nvme.h"
#include "nvme-ns.h"
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{
+ BlockDriverInfo bdi;
NvmeIdNs *id_ns = &ns->id_ns;
int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+ int npdg;
- if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
- ns->id_ns.dlfeat = 0x9;
- }
+ ns->id_ns.dlfeat = 0x9;
id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
+ ns->csi = NVME_CSI_NVM;
+
/* no thin provisioning */
id_ns->ncap = id_ns->nsze;
id_ns->nuse = id_ns->ncap;
+
+ /* support DULBE and I/O optimization fields */
+ id_ns->nsfeat |= (0x4 | 0x10);
+
+ npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+ if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
+ bdi.cluster_size > ns->blkconf.discard_granularity) {
+ npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+ }
+
+ id_ns->npda = id_ns->npdg = npdg - 1;
+
+ return 0;
}
-static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
{
bool read_only;
@@ -59,19 +79,225 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1;
}
+ if (ns->blkconf.discard_granularity == -1) {
+ ns->blkconf.discard_granularity =
+ MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+ }
+
ns->size = blk_getlength(ns->blkconf.blk);
if (ns->size < 0) {
error_setg_errno(errp, -ns->size, "could not get blockdev size");
return -1;
}
- if (blk_enable_write_cache(ns->blkconf.blk)) {
- n->features.vwc = 0x1;
+ return 0;
+}
+
+static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
+{
+ uint64_t zone_size, zone_cap;
+ uint32_t lbasz = ns->blkconf.logical_block_size;
+
+ /* Make sure that the values of ZNS properties are sane */
+ if (ns->params.zone_size_bs) {
+ zone_size = ns->params.zone_size_bs;
+ } else {
+ zone_size = NVME_DEFAULT_ZONE_SIZE;
+ }
+ if (ns->params.zone_cap_bs) {
+ zone_cap = ns->params.zone_cap_bs;
+ } else {
+ zone_cap = zone_size;
+ }
+ if (zone_cap > zone_size) {
+ error_setg(errp, "zone capacity %"PRIu64"B exceeds "
+ "zone size %"PRIu64"B", zone_cap, zone_size);
+ return -1;
+ }
+ if (zone_size < lbasz) {
+ error_setg(errp, "zone size %"PRIu64"B too small, "
+ "must be at least %"PRIu32"B", zone_size, lbasz);
+ return -1;
+ }
+ if (zone_cap < lbasz) {
+ error_setg(errp, "zone capacity %"PRIu64"B too small, "
+ "must be at least %"PRIu32"B", zone_cap, lbasz);
+ return -1;
+ }
+
+ /*
+ * Save the main zone geometry values to avoid
+ * calculating them later again.
+ */
+ ns->zone_size = zone_size / lbasz;
+ ns->zone_capacity = zone_cap / lbasz;
+ ns->num_zones = ns->size / lbasz / ns->zone_size;
+
+ /* Do a few more sanity checks of ZNS properties */
+ if (!ns->num_zones) {
+ error_setg(errp,
+ "insufficient drive capacity, must be at least the size "
+ "of one zone (%"PRIu64"B)", zone_size);
+ return -1;
+ }
+
+ if (ns->params.max_open_zones > ns->num_zones) {
+ error_setg(errp,
+ "max_open_zones value %u exceeds the number of zones %u",
+ ns->params.max_open_zones, ns->num_zones);
+ return -1;
+ }
+ if (ns->params.max_active_zones > ns->num_zones) {
+ error_setg(errp,
+ "max_active_zones value %u exceeds the number of zones %u",
+ ns->params.max_active_zones, ns->num_zones);
+ return -1;
+ }
+
+ if (ns->params.zd_extension_size) {
+ if (ns->params.zd_extension_size & 0x3f) {
+ error_setg(errp,
+ "zone descriptor extension size must be a multiple of 64B");
+ return -1;
+ }
+ if ((ns->params.zd_extension_size >> 6) > 0xff) {
+ error_setg(errp, "zone descriptor extension size is too large");
+ return -1;
+ }
}
return 0;
}
+static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
+{
+ uint64_t start = 0, zone_size = ns->zone_size;
+ uint64_t capacity = ns->num_zones * zone_size;
+ NvmeZone *zone;
+ int i;
+
+ ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+ if (ns->params.zd_extension_size) {
+ ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+ ns->num_zones);
+ }
+
+ QTAILQ_INIT(&ns->exp_open_zones);
+ QTAILQ_INIT(&ns->imp_open_zones);
+ QTAILQ_INIT(&ns->closed_zones);
+ QTAILQ_INIT(&ns->full_zones);
+
+ zone = ns->zone_array;
+ for (i = 0; i < ns->num_zones; i++, zone++) {
+ if (start + zone_size > capacity) {
+ zone_size = capacity - start;
+ }
+ zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
+ nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+ zone->d.za = 0;
+ zone->d.zcap = ns->zone_capacity;
+ zone->d.zslba = start;
+ zone->d.wp = start;
+ zone->w_ptr = start;
+ start += zone_size;
+ }
+
+ ns->zone_size_log2 = 0;
+ if (is_power_of_2(ns->zone_size)) {
+ ns->zone_size_log2 = 63 - clz64(ns->zone_size);
+ }
+}
+
+static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
+{
+ NvmeIdNsZoned *id_ns_z;
+
+ nvme_ns_zoned_init_state(ns);
+
+ id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
+
+ /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
+ id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+ id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
+ id_ns_z->zoc = 0;
+ id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
+
+ id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
+ id_ns_z->lbafe[lba_index].zdes =
+ ns->params.zd_extension_size >> 6; /* Units of 64B */
+
+ ns->csi = NVME_CSI_ZONED;
+ ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
+ ns->id_ns.ncap = ns->id_ns.nsze;
+ ns->id_ns.nuse = ns->id_ns.ncap;
+
+ /*
+ * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
+ * status of logical blocks. Since the spec defines that logical blocks
+ * SHALL be deallocated when then zone is in the Empty or Offline states,
+ * we can only support DULBE if the zone size is a multiple of the
+ * calculated NPDG.
+ */
+ if (ns->zone_size % (ns->id_ns.npdg + 1)) {
+ warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
+ "the calculated deallocation granularity (%d blocks); "
+ "DULBE support disabled",
+ ns->zone_size, ns->id_ns.npdg + 1);
+
+ ns->id_ns.nsfeat &= ~0x4;
+ }
+
+ ns->id_ns_zoned = id_ns_z;
+}
+
+static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+ uint8_t state;
+
+ zone->w_ptr = zone->d.wp;
+ state = nvme_get_zone_state(zone);
+ if (zone->d.wp != zone->d.zslba ||
+ (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
+ if (state != NVME_ZONE_STATE_CLOSED) {
+ trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
+ nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
+ }
+ nvme_aor_inc_active(ns);
+ QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
+ } else {
+ trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
+ nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+ }
+}
+
+/*
+ * Close all the zones that are currently open.
+ */
+static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
+{
+ NvmeZone *zone, *next;
+
+ QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
+ QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+ nvme_aor_dec_active(ns);
+ nvme_clear_zone(ns, zone);
+ }
+ QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
+ QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+ nvme_aor_dec_open(ns);
+ nvme_aor_dec_active(ns);
+ nvme_clear_zone(ns, zone);
+ }
+ QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
+ QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
+ nvme_aor_dec_open(ns);
+ nvme_aor_dec_active(ns);
+ nvme_clear_zone(ns, zone);
+ }
+
+ assert(ns->nr_open_zones == 0);
+}
+
static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
{
if (!ns->blkconf.blk) {
@@ -82,20 +308,25 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
return 0;
}
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
{
if (nvme_ns_check_constraints(ns, errp)) {
return -1;
}
- if (nvme_ns_init_blk(n, ns, errp)) {
+ if (nvme_ns_init_blk(ns, errp)) {
return -1;
}
- nvme_ns_init(ns);
- if (nvme_register_namespace(n, ns, errp)) {
+ if (nvme_ns_init(ns, errp)) {
return -1;
}
+ if (ns->params.zoned) {
+ if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
+ return -1;
+ }
+ nvme_ns_init_zoned(ns, 0);
+ }
return 0;
}
@@ -105,9 +336,21 @@ void nvme_ns_drain(NvmeNamespace *ns)
blk_drain(ns->blkconf.blk);
}
-void nvme_ns_flush(NvmeNamespace *ns)
+void nvme_ns_shutdown(NvmeNamespace *ns)
{
blk_flush(ns->blkconf.blk);
+ if (ns->params.zoned) {
+ nvme_zoned_ns_shutdown(ns);
+ }
+}
+
+void nvme_ns_cleanup(NvmeNamespace *ns)
+{
+ if (ns->params.zoned) {
+ g_free(ns->id_ns_zoned);
+ g_free(ns->zone_array);
+ g_free(ns->zd_extensions);
+ }
}
static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -115,18 +358,34 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
NvmeNamespace *ns = NVME_NS(dev);
BusState *s = qdev_get_parent_bus(dev);
NvmeCtrl *n = NVME(s->parent);
- Error *local_err = NULL;
- if (nvme_ns_setup(n, ns, &local_err)) {
- error_propagate_prepend(errp, local_err,
- "could not setup namespace: ");
+ if (nvme_ns_setup(ns, errp)) {
return;
}
+
+ if (nvme_register_namespace(n, ns, errp)) {
+ return;
+ }
+
}
static Property nvme_ns_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+ DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
+ DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
+ DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
+ NVME_DEFAULT_ZONE_SIZE),
+ DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
+ 0),
+ DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
+ params.cross_zone_read, false),
+ DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
+ params.max_active_zones, 0),
+ DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
+ params.max_open_zones, 0),
+ DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+ params.zd_extension_size, 0),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606..293ac990e3 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,8 +19,23 @@
#define NVME_NS(obj) \
OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
+typedef struct NvmeZone {
+ NvmeZoneDescr d;
+ uint64_t w_ptr;
+ QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
typedef struct NvmeNamespaceParams {
uint32_t nsid;
+ QemuUUID uuid;
+
+ bool zoned;
+ bool cross_zone_read;
+ uint64_t zone_size_bs;
+ uint64_t zone_cap_bs;
+ uint32_t max_active_zones;
+ uint32_t max_open_zones;
+ uint32_t zd_extension_size;
} NvmeNamespaceParams;
typedef struct NvmeNamespace {
@@ -29,8 +44,28 @@ typedef struct NvmeNamespace {
int32_t bootindex;
int64_t size;
NvmeIdNs id_ns;
+ const uint32_t *iocs;
+ uint8_t csi;
+
+ NvmeIdNsZoned *id_ns_zoned;
+ NvmeZone *zone_array;
+ QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+ QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+ QTAILQ_HEAD(, NvmeZone) closed_zones;
+ QTAILQ_HEAD(, NvmeZone) full_zones;
+ uint32_t num_zones;
+ uint64_t zone_size;
+ uint64_t zone_capacity;
+ uint32_t zone_size_log2;
+ uint8_t *zd_extensions;
+ int32_t nr_open_zones;
+ int32_t nr_active_zones;
NvmeNamespaceParams params;
+
+ struct {
+ uint32_t err_rec;
+ } features;
} NvmeNamespace;
static inline uint32_t nvme_nsid(NvmeNamespace *ns)
@@ -67,8 +102,81 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
typedef struct NvmeCtrl NvmeCtrl;
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
+static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
+{
+ return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state)
+{
+ zone->d.zs = state << 4;
+}
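
These accessors rely on the ZS byte layout from the Zoned Namespace spec, where the zone state occupies the upper nibble. Assuming the spec's encoding (Empty = 0x1, Full = 0xE, and so on), a zone in the Empty state stores d.zs == 0x10, and nvme_get_zone_state() recovers 0x1 by shifting right by 4.
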
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+ return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+ return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+ uint8_t st = nvme_get_zone_state(zone);
+
+ return st != NVME_ZONE_STATE_FULL &&
+ st != NVME_ZONE_STATE_READ_ONLY &&
+ st != NVME_ZONE_STATE_OFFLINE;
+}
+
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+ return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+ assert(ns->nr_open_zones >= 0);
+ if (ns->params.max_open_zones) {
+ ns->nr_open_zones++;
+ assert(ns->nr_open_zones <= ns->params.max_open_zones);
+ }
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+ if (ns->params.max_open_zones) {
+ assert(ns->nr_open_zones > 0);
+ ns->nr_open_zones--;
+ }
+ assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+ assert(ns->nr_active_zones >= 0);
+ if (ns->params.max_active_zones) {
+ ns->nr_active_zones++;
+ assert(ns->nr_active_zones <= ns->params.max_active_zones);
+ }
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+ if (ns->params.max_active_zones) {
+ assert(ns->nr_active_zones > 0);
+ ns->nr_active_zones--;
+ assert(ns->nr_active_zones >= ns->nr_open_zones);
+ }
+ assert(ns->nr_active_zones >= 0);
+}
+
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
-void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
#endif /* NVME_NS_H */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 27d2c72716..fb83636abd 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
*/
/**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
*
* https://nvmexpress.org/developers/nvme-specification/
*/
@@ -22,33 +22,67 @@
* [pmrdev=<mem_backend_file_id>,] \
* max_ioqpairs=<N[optional]>, \
* aerl=<N[optional]>, aer_max_queued=<N[optional]>, \
- * mdts=<N[optional]>
- * -device nvme-ns,drive=<drive_id>,bus=bus_name,nsid=<nsid>
+ * mdts=<N[optional]>,zoned.append_size_limit=<N[optional]> \
+ * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
+ * zoned=<true|false[optional]>
*
* Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
+ * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
+ * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
*
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
* Enabling pmr emulation can be achieved by pointing to memory-backend-file.
* For example:
* -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
* size=<size> .... -device nvme,...,pmrdev=<mem_id>
*
+ * The PMR will use BAR 4/5 exclusively.
+ *
*
* nvme device parameters
* ~~~~~~~~~~~~~~~~~~~~~~
* - `aerl`
* The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- * of concurrently outstanding Asynchronous Event Request commands suppoert
+ * of concurrently outstanding Asynchronous Event Request commands supported
* by the controller. This is a 0's based value.
*
* - `aer_max_queued`
* This is the maximum number of events that the device will enqueue for
- * completion when there are no oustanding AERs. When the maximum number of
+ * completion when there are no outstanding AERs. When the maximum number of
* enqueued events are reached, subsequent events will be dropped.
*
+ * - `zoned.append_size_limit`
+ * The maximum I/O size in bytes that is allowed in a Zone Append command.
+ * The default is 128KiB. Since internally this value is maintained as
+ * ZASL = log2(<maximum append size> / <page size>), some values assigned
+ * to this property may be rounded down and result in a lower maximum ZA
+ * data size being in effect. By setting this property to 0, users can make
+ * ZASL equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects the Zoned Command Set for the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ * zoned.zone_size=<zone size in bytes, default: 128MiB>
+ * The number may be followed by K, M or G, as in kilo-, mega- or gigabytes.
+ *
+ * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
+ * zoned.descr_ext_size=<zone descriptor extension size, default: 0>
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
+ * zoned.max_active=<Maximum Active Resources (zones), default: 0>
+ * The default value means there is no limit to the number of
+ * concurrently active zones.
+ *
+ * zoned.max_open=<Maximum Open Resources (zones), default: 0>
+ * The default value means there is no limit to the number of
+ * concurrently open zones.
+ *
+ * zoned.cross_read=<enable RAZB, default: false>
+ * Setting this property to true enables Read Across Zone Boundaries.
*/
#include "qemu/osdep.h"
@@ -74,9 +108,9 @@
#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
-#define NVME_SPEC_VER 0x00010300
+#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
-#define NVME_PMR_BIR 2
+#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
@@ -105,12 +139,49 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
[NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
+ [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
[NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
[NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
[NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
[NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
};
+static const uint32_t nvme_cse_acs[256] = {
+ [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+ [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+ [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_zoned[256] = {
+ [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+ [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
+ [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+ [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
+};
+
static void nvme_process_sq(void *opaque);
static uint16_t nvme_cid(NvmeRequest *req)
@@ -127,19 +198,104 @@ static uint16_t nvme_sqid(NvmeRequest *req)
return le16_to_cpu(req->sq->sqid);
}
+static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
+ NvmeZoneState state)
+{
+ if (QTAILQ_IN_USE(zone, entry)) {
+ switch (nvme_get_zone_state(zone)) {
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
+ break;
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+ break;
+ case NVME_ZONE_STATE_CLOSED:
+ QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+ break;
+ case NVME_ZONE_STATE_FULL:
+ QTAILQ_REMOVE(&ns->full_zones, zone, entry);
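+        /* fall through */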
+ default:
+ ;
+ }
+ }
+
+ nvme_set_zone_state(zone, state);
+
+ switch (state) {
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
+ break;
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
+ break;
+ case NVME_ZONE_STATE_CLOSED:
+ QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
+ break;
+ case NVME_ZONE_STATE_FULL:
+ QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
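+        /* fall through */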
+ case NVME_ZONE_STATE_READ_ONLY:
+ break;
+ default:
+ zone->d.za = 0;
+ }
+}
+
+/*
+ * Check if we can open a zone without exceeding open/active limits.
+ * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
+ */
+static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
+{
+ if (ns->params.max_active_zones != 0 &&
+ ns->nr_active_zones + act > ns->params.max_active_zones) {
+ trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
+ return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
+ }
+ if (ns->params.max_open_zones != 0 &&
+ ns->nr_open_zones + opn > ns->params.max_open_zones) {
+ trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
+ return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
+ }
+
+ return NVME_SUCCESS;
+}
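
A worked example of this check (numbers illustrative): with zoned.max_active=4 and zoned.max_open=2, and a namespace holding three active zones of which two are open, a write to an Empty zone performs nvme_aor_check(ns, 1, 1). The active side passes (3 + 1 <= 4) but the open side fails (2 + 1 > 2), so the write path must first auto-close an implicitly open zone (see nvme_auto_transition_zone() below) before the check can succeed.
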
+
static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
- hwaddr low = n->ctrl_mem.addr;
- hwaddr hi = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+ hwaddr hi, lo;
+
+ if (!n->cmb.cmse) {
+ return false;
+ }
+
+ lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
+ hi = lo + int128_get64(n->cmb.mem.size);
- return addr >= low && addr < hi;
+ return addr >= lo && addr < hi;
}
static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
- assert(nvme_addr_is_cmb(n, addr));
+ hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
+ return &n->cmb.buf[addr - base];
+}
+
+static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
+{
+ hwaddr hi;
- return &n->cmbuf[addr - n->ctrl_mem.addr];
+ if (!n->pmr.cmse) {
+ return false;
+ }
+
+ hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
+
+ return addr >= n->pmr.cba && addr < hi;
+}
+
+static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
+{
+ return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}
static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
@@ -154,6 +310,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
return 0;
}
+ if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
+ memcpy(buf, nvme_addr_to_pmr(n, addr), size);
+ return 0;
+ }
+
return pci_dma_read(&n->parent_obj, addr, buf, size);
}
@@ -241,6 +402,7 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
static void nvme_req_clear(NvmeRequest *req)
{
req->ns = NULL;
+ req->opaque = NULL;
memset(&req->cqe, 0x0, sizeof(req->cqe));
req->status = NVME_SUCCESS;
}
@@ -274,9 +436,27 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
return NVME_SUCCESS;
}
+static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
+ size_t len)
+{
+ if (!len) {
+ return NVME_SUCCESS;
+ }
+
+ if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
+ return NVME_DATA_TRAS_ERROR;
+ }
+
+ qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
+
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
hwaddr addr, size_t len)
{
+ bool cmb = false, pmr = false;
+
if (!len) {
return NVME_SUCCESS;
}
@@ -284,6 +464,12 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
trace_pci_nvme_map_addr(addr, len);
if (nvme_addr_is_cmb(n, addr)) {
+ cmb = true;
+ } else if (nvme_addr_is_pmr(n, addr)) {
+ pmr = true;
+ }
+
+ if (cmb || pmr) {
if (qsg && qsg->sg) {
return NVME_INVALID_USE_OF_CMB | NVME_DNR;
}
@@ -294,7 +480,11 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
qemu_iovec_init(iov, 1);
}
- return nvme_map_addr_cmb(n, iov, addr, len);
+ if (cmb) {
+ return nvme_map_addr_cmb(n, iov, addr, len);
+ } else {
+ return nvme_map_addr_pmr(n, iov, addr, len);
+ }
}
if (iov && iov->iov) {
@@ -319,7 +509,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
trans_len = MIN(len, trans_len);
int num_prps = (len >> n->page_bits) + 1;
uint16_t status;
- bool prp_list_in_cmb = false;
int ret;
QEMUSGList *qsg = &req->qsg;
@@ -327,7 +516,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
- if (nvme_addr_is_cmb(n, prp1)) {
+ if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) {
qemu_iovec_init(iov, num_prps);
} else {
pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
@@ -345,10 +534,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
uint32_t nents, prp_trans;
int i = 0;
- if (nvme_addr_is_cmb(n, prp2)) {
- prp_list_in_cmb = true;
- }
-
nents = (len + n->page_size - 1) >> n->page_bits;
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
@@ -365,10 +550,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
return NVME_INVALID_PRP_OFFSET | NVME_DNR;
}
- if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) {
- return NVME_INVALID_USE_OF_CMB | NVME_DNR;
- }
-
i = 0;
nents = (len + n->page_size - 1) >> n->page_bits;
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
@@ -502,7 +683,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
uint64_t nsgld;
uint32_t seg_len;
uint16_t status;
- bool sgl_in_cmb = false;
hwaddr addr;
int ret;
@@ -524,18 +704,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
goto out;
}
- /*
- * If the segment is located in the CMB, the submission queue of the
- * request must also reside there.
- */
- if (nvme_addr_is_cmb(n, addr)) {
- if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
- return NVME_INVALID_USE_OF_CMB | NVME_DNR;
- }
-
- sgl_in_cmb = true;
- }
-
for (;;) {
switch (NVME_SGL_TYPE(sgld->type)) {
case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -624,15 +792,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
if (status) {
goto unmap;
}
-
- /*
- * If the next segment is in the CMB, make sure that the sgl was
- * already located there.
- */
- if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
- status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
- goto unmap;
- }
}
out:
@@ -847,6 +1006,35 @@ static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
nvme_process_aers(n);
}
+static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
+{
+ uint8_t aer_info;
+
+ /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
+ if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
+ return;
+ }
+
+ switch (event) {
+ case NVME_SMART_SPARE:
+ aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
+ break;
+ case NVME_SMART_TEMPERATURE:
+ aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
+ break;
+ case NVME_SMART_RELIABILITY:
+ case NVME_SMART_MEDIA_READ_ONLY:
+ case NVME_SMART_FAILED_VOLATILE_MEDIA:
+ case NVME_SMART_PMR_UNRELIABLE:
+ aer_info = NVME_AER_INFO_SMART_RELIABILITY;
+ break;
+ default:
+ return;
+ }
+
+ nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
+}
+
static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
n->aer_mask &= ~(1 << event_type);
@@ -866,8 +1054,8 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
return NVME_SUCCESS;
}
-static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
- uint64_t slba, uint32_t nlb)
+static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
+ uint32_t nlb)
{
uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
@@ -878,6 +1066,307 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
return NVME_SUCCESS;
}
+static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
+ uint32_t nlb)
+{
+ BlockDriverState *bs = blk_bs(ns->blkconf.blk);
+
+ int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
+ int64_t offset = nvme_l2b(ns, slba);
+ bool zeroed;
+ int ret;
+
+ Error *local_err = NULL;
+
+ /*
+ * `pnum` holds the number of bytes after offset that share the same
+ * allocation status as the byte at offset. If `pnum` is different from
+ * `bytes`, we should check the allocation status of the next range and
+ * continue this until all bytes have been checked.
+ */
+ do {
+ bytes -= pnum;
+
+ ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
+ if (ret < 0) {
+ error_setg_errno(&local_err, -ret, "unable to get block status");
+ error_report_err(local_err);
+
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
+ zeroed = !!(ret & BDRV_BLOCK_ZERO);
+
+ trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
+
+ if (zeroed) {
+ return NVME_DULB;
+ }
+
+ offset += pnum;
+ } while (pnum != bytes);
+
+ return NVME_SUCCESS;
+}
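
A worked trace of the loop above (values illustrative): for offset=0 and bytes=16384, suppose the first bdrv_block_status() call returns pnum=8192 without BDRV_BLOCK_ZERO (allocated data); the loop advances and rechecks at offset=8192 with bytes=8192. If that second call reports BDRV_BLOCK_ZERO, part of the range is deallocated and the function returns NVME_DULB; if it again returns allocated data covering the remainder, pnum == bytes terminates the loop with NVME_SUCCESS.
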
+
+static void nvme_aio_err(NvmeRequest *req, int ret)
+{
+ uint16_t status = NVME_SUCCESS;
+ Error *local_err = NULL;
+
+ switch (req->cmd.opcode) {
+ case NVME_CMD_READ:
+ status = NVME_UNRECOVERED_READ;
+ break;
+ case NVME_CMD_FLUSH:
+ case NVME_CMD_WRITE:
+ case NVME_CMD_WRITE_ZEROES:
+ case NVME_CMD_ZONE_APPEND:
+ status = NVME_WRITE_FAULT;
+ break;
+ default:
+ status = NVME_INTERNAL_DEV_ERROR;
+ break;
+ }
+
+ trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status);
+
+ error_setg_errno(&local_err, -ret, "aio failed");
+ error_report_err(local_err);
+
+ /*
+ * Set the command status code to the first encountered error but allow a
+ * subsequent Internal Device Error to trump it.
+ */
+ if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
+ return;
+ }
+
+ req->status = status;
+}
+
+static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
+{
+ return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
+ slba / ns->zone_size;
+}
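
For instance, with 512-byte logical blocks and the default 128 MiB zone size, ns->zone_size is 262144 (2^18) blocks, so zone_size_log2 is 18 and slba 1000000 lands in zone 1000000 >> 18 = 3. A zone size that is not a power of two leaves zone_size_log2 unset and takes the division path instead.
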
+
+static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
+{
+ uint32_t zone_idx = nvme_zone_idx(ns, slba);
+
+ assert(zone_idx < ns->num_zones);
+ return &ns->zone_array[zone_idx];
+}
+
+static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
+{
+ uint64_t zslba = zone->d.zslba;
+
+ switch (nvme_get_zone_state(zone)) {
+ case NVME_ZONE_STATE_EMPTY:
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ case NVME_ZONE_STATE_CLOSED:
+ return NVME_SUCCESS;
+ case NVME_ZONE_STATE_FULL:
+ trace_pci_nvme_err_zone_is_full(zslba);
+ return NVME_ZONE_FULL;
+ case NVME_ZONE_STATE_OFFLINE:
+ trace_pci_nvme_err_zone_is_offline(zslba);
+ return NVME_ZONE_OFFLINE;
+ case NVME_ZONE_STATE_READ_ONLY:
+ trace_pci_nvme_err_zone_is_read_only(zslba);
+ return NVME_ZONE_READ_ONLY;
+ default:
+ assert(false);
+ }
+
+ return NVME_INTERNAL_DEV_ERROR;
+}
+
+static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
+ NvmeZone *zone, uint64_t slba,
+ uint32_t nlb)
+{
+ uint64_t zcap = nvme_zone_wr_boundary(zone);
+ uint16_t status;
+
+ status = nvme_check_zone_state_for_write(zone);
+ if (status) {
+ return status;
+ }
+
+ if (unlikely(slba != zone->w_ptr)) {
+ trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
+ return NVME_ZONE_INVALID_WRITE;
+ }
+
+ if (unlikely((slba + nlb) > zcap)) {
+ trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
+ return NVME_ZONE_BOUNDARY_ERROR;
+ }
+
+ return NVME_SUCCESS;
+}
+
+static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
+{
+ uint16_t status;
+
+ switch (nvme_get_zone_state(zone)) {
+ case NVME_ZONE_STATE_EMPTY:
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ case NVME_ZONE_STATE_FULL:
+ case NVME_ZONE_STATE_CLOSED:
+ case NVME_ZONE_STATE_READ_ONLY:
+ status = NVME_SUCCESS;
+ break;
+ case NVME_ZONE_STATE_OFFLINE:
+ status = NVME_ZONE_OFFLINE;
+ break;
+ default:
+ assert(false);
+ }
+
+ return status;
+}
+
+static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
+ uint32_t nlb)
+{
+ NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
+ uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
+ uint64_t end = slba + nlb;
+ uint16_t status;
+
+ status = nvme_check_zone_state_for_read(zone);
+ if (status) {
+ ;
+ } else if (unlikely(end > bndry)) {
+ if (!ns->params.cross_zone_read) {
+ status = NVME_ZONE_BOUNDARY_ERROR;
+ } else {
+ /*
+ * Read across zone boundary - check that all subsequent
+ * zones that are being read have an appropriate state.
+ */
+ do {
+ zone++;
+ status = nvme_check_zone_state_for_read(zone);
+ if (status) {
+ break;
+ }
+ } while (end > nvme_zone_rd_boundary(ns, zone));
+ }
+ }
+
+ return status;
+}
+
+static void nvme_auto_transition_zone(NvmeNamespace *ns)
+{
+ NvmeZone *zone;
+
+ if (ns->params.max_open_zones &&
+ ns->nr_open_zones == ns->params.max_open_zones) {
+ zone = QTAILQ_FIRST(&ns->imp_open_zones);
+ if (zone) {
+ /*
+ * Automatically close this implicitly open zone.
+ */
+ QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+ nvme_aor_dec_open(ns);
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+ }
+ }
+}
+
+static uint16_t nvme_auto_open_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+ uint16_t status = NVME_SUCCESS;
+ uint8_t zs = nvme_get_zone_state(zone);
+
+ if (zs == NVME_ZONE_STATE_EMPTY) {
+ nvme_auto_transition_zone(ns);
+ status = nvme_aor_check(ns, 1, 1);
+ } else if (zs == NVME_ZONE_STATE_CLOSED) {
+ nvme_auto_transition_zone(ns);
+ status = nvme_aor_check(ns, 0, 1);
+ }
+
+ return status;
+}
+
+static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
+ bool failed)
+{
+ NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+ NvmeZone *zone;
+ NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
+ uint64_t slba;
+ uint32_t nlb;
+
+ slba = le64_to_cpu(rw->slba);
+ nlb = le16_to_cpu(rw->nlb) + 1;
+ zone = nvme_get_zone_by_slba(ns, slba);
+
+ zone->d.wp += nlb;
+
+ if (failed) {
+ res->slba = 0;
+ }
+
+ if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
+ switch (nvme_get_zone_state(zone)) {
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ nvme_aor_dec_open(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_CLOSED:
+ nvme_aor_dec_active(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_EMPTY:
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
+ /* fall through */
+ case NVME_ZONE_STATE_FULL:
+ break;
+ default:
+ assert(false);
+ }
+ }
+}
+
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+ uint32_t nlb)
+{
+ uint8_t zs;
+
+ zone->w_ptr += nlb;
+
+ if (zone->w_ptr < nvme_zone_wr_boundary(zone)) {
+ zs = nvme_get_zone_state(zone);
+ switch (zs) {
+ case NVME_ZONE_STATE_EMPTY:
+ nvme_aor_inc_active(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_CLOSED:
+ nvme_aor_inc_open(ns);
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
+ }
+ }
+}
+
+static inline bool nvme_is_write(NvmeRequest *req)
+{
+ NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+
+ return rw->opcode == NVME_CMD_WRITE ||
+ rw->opcode == NVME_CMD_ZONE_APPEND ||
+ rw->opcode == NVME_CMD_WRITE_ZEROES;
+}
+
static void nvme_rw_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
@@ -887,91 +1376,278 @@ static void nvme_rw_cb(void *opaque, int ret)
BlockAcctCookie *acct = &req->acct;
BlockAcctStats *stats = blk_get_stats(blk);
- Error *local_err = NULL;
-
trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
+ if (ns->params.zoned && nvme_is_write(req)) {
+ nvme_finalize_zoned_write(ns, req, ret != 0);
+ }
+
if (!ret) {
block_acct_done(stats, acct);
} else {
- uint16_t status;
-
block_acct_failed(stats, acct);
+ nvme_aio_err(req, ret);
+ }
- switch (req->cmd.opcode) {
- case NVME_CMD_READ:
- status = NVME_UNRECOVERED_READ;
- break;
- case NVME_CMD_FLUSH:
- case NVME_CMD_WRITE:
- case NVME_CMD_WRITE_ZEROES:
- status = NVME_WRITE_FAULT;
- break;
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+ NvmeRequest *req = opaque;
+ uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+ trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+ if (ret) {
+ nvme_aio_err(req, ret);
+ }
+
+ (*discards)--;
+
+ if (*discards) {
+ return;
+ }
+
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+struct nvme_zone_reset_ctx {
+ NvmeRequest *req;
+ NvmeZone *zone;
+};
+
+static void nvme_aio_zone_reset_cb(void *opaque, int ret)
+{
+ struct nvme_zone_reset_ctx *ctx = opaque;
+ NvmeRequest *req = ctx->req;
+ NvmeNamespace *ns = req->ns;
+ NvmeZone *zone = ctx->zone;
+ uintptr_t *resets = (uintptr_t *)&req->opaque;
+
+ g_free(ctx);
+
+ trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
+
+ if (!ret) {
+ switch (nvme_get_zone_state(zone)) {
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ nvme_aor_dec_open(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_CLOSED:
+ nvme_aor_dec_active(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_FULL:
+ zone->w_ptr = zone->d.zslba;
+ zone->d.wp = zone->w_ptr;
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+ /* fall through */
default:
- status = NVME_INTERNAL_DEV_ERROR;
break;
}
+ } else {
+ nvme_aio_err(req, ret);
+ }
- trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status);
+ (*resets)--;
- error_setg_errno(&local_err, -ret, "aio failed");
- error_report_err(local_err);
+ if (*resets) {
+ return;
+ }
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+struct nvme_compare_ctx {
+ QEMUIOVector iov;
+ uint8_t *bounce;
+ size_t len;
+};
+
+static void nvme_compare_cb(void *opaque, int ret)
+{
+ NvmeRequest *req = opaque;
+ NvmeNamespace *ns = req->ns;
+ struct nvme_compare_ctx *ctx = req->opaque;
+ g_autofree uint8_t *buf = NULL;
+ uint16_t status;
+
+ trace_pci_nvme_compare_cb(nvme_cid(req));
+
+ if (!ret) {
+ block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+ } else {
+ block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
+ nvme_aio_err(req, ret);
+ goto out;
+ }
+
+ buf = g_malloc(ctx->len);
+
+ status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE,
+ req);
+ if (status) {
req->status = status;
+ goto out;
+ }
+
+ if (memcmp(buf, ctx->bounce, ctx->len)) {
+ req->status = NVME_CMP_FAILURE;
}
+out:
+ qemu_iovec_destroy(&ctx->iov);
+ g_free(ctx->bounce);
+ g_free(ctx);
+
nvme_enqueue_req_completion(nvme_cq(req), req);
}
-static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{
- block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
- BLOCK_ACCT_FLUSH);
- req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
- return NVME_NO_COMPLETE;
+ NvmeNamespace *ns = req->ns;
+ NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+
+ uint32_t attr = le32_to_cpu(dsm->attributes);
+ uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+ uint16_t status = NVME_SUCCESS;
+
+ trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+ if (attr & NVME_DSMGMT_AD) {
+ int64_t offset;
+ size_t len;
+ NvmeDsmRange range[nr];
+ uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+ status = nvme_dma(n, (uint8_t *)range, sizeof(range),
+ DMA_DIRECTION_TO_DEVICE, req);
+ if (status) {
+ return status;
+ }
+
+ /*
+ * AIO callbacks may be called immediately, so initialize discards to 1
+ * to make sure the callback does not complete the request before
+ * all discards have been issued.
+ */
+ *discards = 1;
+
+ for (int i = 0; i < nr; i++) {
+ uint64_t slba = le64_to_cpu(range[i].slba);
+ uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+ if (nvme_check_bounds(ns, slba, nlb)) {
+ trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+ ns->id_ns.nsze);
+ continue;
+ }
+
+ trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+ nlb);
+
+ offset = nvme_l2b(ns, slba);
+ len = nvme_l2b(ns, nlb);
+
+ while (len) {
+ size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+ (*discards)++;
+
+ blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+ nvme_aio_discard_cb, req);
+
+ offset += bytes;
+ len -= bytes;
+ }
+ }
+
+ /* account for the 1-initialization */
+ (*discards)--;
+
+ if (*discards) {
+ status = NVME_NO_COMPLETE;
+ } else {
+ status = req->status;
+ }
+ }
+
+ return status;
}
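
The discards counter implements a small completion-refcount idiom that the zone reset path below reuses with resets. A self-contained sketch of the idiom (names are illustrative, not this device's API):

    /* Illustrative sketch: "initialize to 1" AIO completion counting. */
    typedef struct Req {
        unsigned inflight;
    } Req;

    static void aio_cb(Req *req)
    {
        if (--req->inflight) {
            return;              /* other AIOs still outstanding */
        }
        /* last completion reached zero: finish the request here */
    }

    static void submit_all(Req *req, unsigned nr)
    {
        req->inflight = 1;       /* guard so early callbacks cannot hit zero */
        for (unsigned i = 0; i < nr; i++) {
            req->inflight++;
            /* issue_aio(aio_cb, req); the callback may run immediately */
        }
        if (--req->inflight == 0) {
            /* every AIO already completed synchronously; finish here */
        }
    }
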
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
{
NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
NvmeNamespace *ns = req->ns;
+ BlockBackend *blk = ns->blkconf.blk;
uint64_t slba = le64_to_cpu(rw->slba);
- uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
- uint64_t offset = nvme_l2b(ns, slba);
- uint32_t count = nvme_l2b(ns, nlb);
+ uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+ size_t len = nvme_l2b(ns, nlb);
+ int64_t offset = nvme_l2b(ns, slba);
+ uint8_t *bounce = NULL;
+ struct nvme_compare_ctx *ctx = NULL;
uint16_t status;
- trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
+ trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
+
+ status = nvme_check_mdts(n, len);
+ if (status) {
+ trace_pci_nvme_err_mdts(nvme_cid(req), len);
+ return status;
+ }
- status = nvme_check_bounds(n, ns, slba, nlb);
+ status = nvme_check_bounds(ns, slba, nlb);
if (status) {
trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
return status;
}
+ if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+ status = nvme_check_dulbe(ns, slba, nlb);
+ if (status) {
+ return status;
+ }
+ }
+
+ bounce = g_malloc(len);
+
+ ctx = g_new(struct nvme_compare_ctx, 1);
+ ctx->bounce = bounce;
+ ctx->len = len;
+
+ req->opaque = ctx;
+
+ qemu_iovec_init(&ctx->iov, 1);
+ qemu_iovec_add(&ctx->iov, bounce, len);
+
+ block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ);
+ blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req);
+
+ return NVME_NO_COMPLETE;
+}
+
+static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
+{
block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
- BLOCK_ACCT_WRITE);
- req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
- BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
+ BLOCK_ACCT_FLUSH);
+ req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
return NVME_NO_COMPLETE;
}
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
{
NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
NvmeNamespace *ns = req->ns;
- uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
uint64_t slba = le64_to_cpu(rw->slba);
-
+ uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
uint64_t data_size = nvme_l2b(ns, nlb);
- uint64_t data_offset = nvme_l2b(ns, slba);
- enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
- BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+ uint64_t data_offset;
BlockBackend *blk = ns->blkconf.blk;
uint16_t status;
- trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
- nvme_nsid(ns), nlb, data_size, slba);
+ trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
status = nvme_check_mdts(n, data_size);
if (status) {
@@ -979,39 +1655,680 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
goto invalid;
}
- status = nvme_check_bounds(n, ns, slba, nlb);
+ status = nvme_check_bounds(ns, slba, nlb);
if (status) {
trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
goto invalid;
}
+ if (ns->params.zoned) {
+ status = nvme_check_zone_read(ns, slba, nlb);
+ if (status) {
+ trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
+ goto invalid;
+ }
+ }
+
status = nvme_map_dptr(n, data_size, req);
if (status) {
goto invalid;
}
- block_acct_start(blk_get_stats(blk), &req->acct, data_size, acct);
+ if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+ status = nvme_check_dulbe(ns, slba, nlb);
+ if (status) {
+ goto invalid;
+ }
+ }
+
+ data_offset = nvme_l2b(ns, slba);
+
+ block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_READ);
if (req->qsg.sg) {
- if (acct == BLOCK_ACCT_WRITE) {
+ req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
+ BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+ } else {
+ req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0,
+ nvme_rw_cb, req);
+ }
+ return NVME_NO_COMPLETE;
+
+invalid:
+ block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+ return status | NVME_DNR;
+}
+
+static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
+ bool wrz)
+{
+ NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+ NvmeNamespace *ns = req->ns;
+ uint64_t slba = le64_to_cpu(rw->slba);
+ uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+ uint64_t data_size = nvme_l2b(ns, nlb);
+ uint64_t data_offset;
+ NvmeZone *zone;
+ NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
+ BlockBackend *blk = ns->blkconf.blk;
+ uint16_t status;
+
+ trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+ nvme_nsid(ns), nlb, data_size, slba);
+
+ if (!wrz) {
+ status = nvme_check_mdts(n, data_size);
+ if (status) {
+ trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+ goto invalid;
+ }
+ }
+
+ status = nvme_check_bounds(ns, slba, nlb);
+ if (status) {
+ trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+ goto invalid;
+ }
+
+ if (ns->params.zoned) {
+ zone = nvme_get_zone_by_slba(ns, slba);
+
+ if (append) {
+ if (unlikely(slba != zone->d.zslba)) {
+ trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+ status = NVME_INVALID_FIELD;
+ goto invalid;
+ }
+
+ if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+ trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+ status = NVME_INVALID_FIELD;
+ goto invalid;
+ }
+
+ slba = zone->w_ptr;
+ res->slba = cpu_to_le64(slba);
+ }
+
+ status = nvme_check_zone_write(n, ns, zone, slba, nlb);
+ if (status) {
+ goto invalid;
+ }
+
+ status = nvme_auto_open_zone(ns, zone);
+ if (status) {
+ goto invalid;
+ }
+
+ nvme_advance_zone_wp(ns, zone, nlb);
+ }
+
+ data_offset = nvme_l2b(ns, slba);
+
+ if (!wrz) {
+ status = nvme_map_dptr(n, data_size, req);
+ if (status) {
+ goto invalid;
+ }
+
+ block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_WRITE);
+ if (req->qsg.sg) {
req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
BDRV_SECTOR_SIZE, nvme_rw_cb, req);
} else {
- req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
- BDRV_SECTOR_SIZE, nvme_rw_cb, req);
- }
- } else {
- if (acct == BLOCK_ACCT_WRITE) {
req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
nvme_rw_cb, req);
- } else {
- req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0,
- nvme_rw_cb, req);
}
+ } else {
+ block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE);
+ req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+ BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+ req);
}
return NVME_NO_COMPLETE;
invalid:
- block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct);
+ block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
+ return status | NVME_DNR;
+}
+
+static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+{
+ return nvme_do_write(n, req, false, false);
+}
+
+static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
+{
+ return nvme_do_write(n, req, false, true);
+}
+
+static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
+{
+ return nvme_do_write(n, req, true, false);
+}
+
+static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
+ uint64_t *slba, uint32_t *zone_idx)
+{
+ uint32_t dw10 = le32_to_cpu(c->cdw10);
+ uint32_t dw11 = le32_to_cpu(c->cdw11);
+
+ if (!ns->params.zoned) {
+ trace_pci_nvme_err_invalid_opc(c->opcode);
+ return NVME_INVALID_OPCODE | NVME_DNR;
+ }
+
+ *slba = ((uint64_t)dw11) << 32 | dw10;
+ if (unlikely(*slba >= ns->id_ns.nsze)) {
+ trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
+ *slba = 0;
+ return NVME_LBA_RANGE | NVME_DNR;
+ }
+
+ *zone_idx = nvme_zone_idx(ns, *slba);
+ assert(*zone_idx < ns->num_zones);
+
+ return NVME_SUCCESS;
+}
+
+typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
+ NvmeRequest *);
+
+enum NvmeZoneProcessingMask {
+ NVME_PROC_CURRENT_ZONE = 0,
+ NVME_PROC_OPENED_ZONES = 1 << 0,
+ NVME_PROC_CLOSED_ZONES = 1 << 1,
+ NVME_PROC_READ_ONLY_ZONES = 1 << 2,
+ NVME_PROC_FULL_ZONES = 1 << 3,
+};
+
+static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
+ NvmeZoneState state, NvmeRequest *req)
+{
+ uint16_t status;
+
+ switch (state) {
+ case NVME_ZONE_STATE_EMPTY:
+ status = nvme_aor_check(ns, 1, 0);
+ if (status) {
+ return status;
+ }
+ nvme_aor_inc_active(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_CLOSED:
+ status = nvme_aor_check(ns, 0, 1);
+ if (status) {
+ if (state == NVME_ZONE_STATE_EMPTY) {
+ nvme_aor_dec_active(ns);
+ }
+ return status;
+ }
+ nvme_aor_inc_open(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
+ /* fall through */
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ return NVME_SUCCESS;
+ default:
+ return NVME_ZONE_INVAL_TRANSITION;
+ }
+}
+
+static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
+ NvmeZoneState state, NvmeRequest *req)
+{
+ switch (state) {
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ nvme_aor_dec_open(ns);
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+ /* fall through */
+ case NVME_ZONE_STATE_CLOSED:
+ return NVME_SUCCESS;
+ default:
+ return NVME_ZONE_INVAL_TRANSITION;
+ }
+}
+
+static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
+ NvmeZoneState state, NvmeRequest *req)
+{
+ switch (state) {
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ nvme_aor_dec_open(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_CLOSED:
+ nvme_aor_dec_active(ns);
+ /* fall through */
+ case NVME_ZONE_STATE_EMPTY:
+ zone->w_ptr = nvme_zone_wr_boundary(zone);
+ zone->d.wp = zone->w_ptr;
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
+ /* fall through */
+ case NVME_ZONE_STATE_FULL:
+ return NVME_SUCCESS;
+ default:
+ return NVME_ZONE_INVAL_TRANSITION;
+ }
+}
+
+static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
+ NvmeZoneState state, NvmeRequest *req)
+{
+ uintptr_t *resets = (uintptr_t *)&req->opaque;
+ struct nvme_zone_reset_ctx *ctx;
+
+ switch (state) {
+ case NVME_ZONE_STATE_EMPTY:
+ return NVME_SUCCESS;
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ case NVME_ZONE_STATE_CLOSED:
+ case NVME_ZONE_STATE_FULL:
+ break;
+ default:
+ return NVME_ZONE_INVAL_TRANSITION;
+ }
+
+ /*
+ * The zone reset aio callback needs to know the zone that is being reset
+ * in order to transition the zone on completion.
+ */
+ ctx = g_new(struct nvme_zone_reset_ctx, 1);
+ ctx->req = req;
+ ctx->zone = zone;
+
+ (*resets)++;
+
+ blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
+ nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
+ nvme_aio_zone_reset_cb, ctx);
+
+ return NVME_NO_COMPLETE;
+}
+
+static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
+ NvmeZoneState state, NvmeRequest *req)
+{
+ switch (state) {
+ case NVME_ZONE_STATE_READ_ONLY:
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
+ /* fall through */
+ case NVME_ZONE_STATE_OFFLINE:
+ return NVME_SUCCESS;
+ default:
+ return NVME_ZONE_INVAL_TRANSITION;
+ }
+}
+
+static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
+{
+ uint16_t status;
+ uint8_t state = nvme_get_zone_state(zone);
+
+ if (state == NVME_ZONE_STATE_EMPTY) {
+ status = nvme_aor_check(ns, 1, 0);
+ if (status) {
+ return status;
+ }
+ nvme_aor_inc_active(ns);
+ zone->d.za |= NVME_ZA_ZD_EXT_VALID;
+ nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+ return NVME_SUCCESS;
+ }
+
+ return NVME_ZONE_INVAL_TRANSITION;
+}
+
+static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
+ enum NvmeZoneProcessingMask proc_mask,
+ op_handler_t op_hndlr, NvmeRequest *req)
+{
+ uint16_t status = NVME_SUCCESS;
+ NvmeZoneState zs = nvme_get_zone_state(zone);
+ bool proc_zone;
+
+ switch (zs) {
+ case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+ case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+ proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
+ break;
+ case NVME_ZONE_STATE_CLOSED:
+ proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
+ break;
+ case NVME_ZONE_STATE_READ_ONLY:
+ proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
+ break;
+ case NVME_ZONE_STATE_FULL:
+ proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
+ break;
+ default:
+ proc_zone = false;
+ }
+
+ if (proc_zone) {
+ status = op_hndlr(ns, zone, zs, req);
+ }
+
+ return status;
+}
+
+static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
+ enum NvmeZoneProcessingMask proc_mask,
+ op_handler_t op_hndlr, NvmeRequest *req)
+{
+ NvmeZone *next;
+ uint16_t status = NVME_SUCCESS;
+ int i;
+
+ if (!proc_mask) {
+ status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
+ } else {
+ if (proc_mask & NVME_PROC_CLOSED_ZONES) {
+ QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
+ status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+ req);
+ if (status && status != NVME_NO_COMPLETE) {
+ goto out;
+ }
+ }
+ }
+ if (proc_mask & NVME_PROC_OPENED_ZONES) {
+ QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
+ status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+ req);
+ if (status && status != NVME_NO_COMPLETE) {
+ goto out;
+ }
+ }
+
+ QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
+ status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+ req);
+ if (status && status != NVME_NO_COMPLETE) {
+ goto out;
+ }
+ }
+ }
+ if (proc_mask & NVME_PROC_FULL_ZONES) {
+ QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
+ status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+ req);
+ if (status && status != NVME_NO_COMPLETE) {
+ goto out;
+ }
+ }
+ }
+
+ if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
+ for (i = 0; i < ns->num_zones; i++, zone++) {
+ status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
+ req);
+ if (status && status != NVME_NO_COMPLETE) {
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ return status;
+}
+
+static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+ NvmeNamespace *ns = req->ns;
+ NvmeZone *zone;
+ uintptr_t *resets;
+ uint8_t *zd_ext;
+ uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+ uint64_t slba = 0;
+ uint32_t zone_idx = 0;
+ uint16_t status;
+ uint8_t action;
+ bool all;
+ enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
+
+ action = dw13 & 0xff;
+ all = dw13 & 0x100;
+
+ req->status = NVME_SUCCESS;
+
+ if (!all) {
+ status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+ if (status) {
+ return status;
+ }
+ }
+
+ zone = &ns->zone_array[zone_idx];
+ if (slba != zone->d.zslba) {
+ trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ switch (action) {
+
+ case NVME_ZONE_ACTION_OPEN:
+ if (all) {
+ proc_mask = NVME_PROC_CLOSED_ZONES;
+ }
+ trace_pci_nvme_open_zone(slba, zone_idx, all);
+ status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
+ break;
+
+ case NVME_ZONE_ACTION_CLOSE:
+ if (all) {
+ proc_mask = NVME_PROC_OPENED_ZONES;
+ }
+ trace_pci_nvme_close_zone(slba, zone_idx, all);
+ status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
+ break;
+
+ case NVME_ZONE_ACTION_FINISH:
+ if (all) {
+ proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
+ }
+ trace_pci_nvme_finish_zone(slba, zone_idx, all);
+ status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
+ break;
+
+ case NVME_ZONE_ACTION_RESET:
+ resets = (uintptr_t *)&req->opaque;
+
+ if (all) {
+ proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
+ NVME_PROC_FULL_ZONES;
+ }
+ trace_pci_nvme_reset_zone(slba, zone_idx, all);
+
+ *resets = 1;
+
+ status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
+
+ (*resets)--;
+
+ return *resets ? NVME_NO_COMPLETE : req->status;
+
+ case NVME_ZONE_ACTION_OFFLINE:
+ if (all) {
+ proc_mask = NVME_PROC_READ_ONLY_ZONES;
+ }
+ trace_pci_nvme_offline_zone(slba, zone_idx, all);
+ status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
+ break;
+
+ case NVME_ZONE_ACTION_SET_ZD_EXT:
+ trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
+ if (all || !ns->params.zd_extension_size) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+ zd_ext = nvme_get_zd_extension(ns, zone_idx);
+ status = nvme_dma(n, zd_ext, ns->params.zd_extension_size,
+ DMA_DIRECTION_TO_DEVICE, req);
+ if (status) {
+ trace_pci_nvme_err_zd_extension_map_error(zone_idx);
+ return status;
+ }
+
+ status = nvme_set_zd_ext(ns, zone);
+ if (status == NVME_SUCCESS) {
+ trace_pci_nvme_zd_extension_set(zone_idx);
+ return status;
+ }
+ break;
+
+ default:
+ trace_pci_nvme_err_invalid_mgmt_action(action);
+ status = NVME_INVALID_FIELD;
+ }
+
+ if (status == NVME_ZONE_INVAL_TRANSITION) {
+ trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
+ zone->d.za);
+ }
+ if (status) {
+ status |= NVME_DNR;
+ }
+
+ return status;
+}
+
+static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
+{
+ NvmeZoneState zs = nvme_get_zone_state(zl);
+
+ switch (zafs) {
+ case NVME_ZONE_REPORT_ALL:
+ return true;
+ case NVME_ZONE_REPORT_EMPTY:
+ return zs == NVME_ZONE_STATE_EMPTY;
+ case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
+ return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
+ case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
+ return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
+ case NVME_ZONE_REPORT_CLOSED:
+ return zs == NVME_ZONE_STATE_CLOSED;
+ case NVME_ZONE_REPORT_FULL:
+ return zs == NVME_ZONE_STATE_FULL;
+ case NVME_ZONE_REPORT_READ_ONLY:
+ return zs == NVME_ZONE_STATE_READ_ONLY;
+ case NVME_ZONE_REPORT_OFFLINE:
+ return zs == NVME_ZONE_STATE_OFFLINE;
+ default:
+ return false;
+ }
+}
+
+static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+ NvmeNamespace *ns = req->ns;
+ /* cdw12 is the zero-based number of dwords to return; convert to bytes */
+ uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
+ uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+ uint32_t zone_idx, zra, zrasf, partial;
+ uint64_t max_zones, nr_zones = 0;
+ uint16_t status;
+ uint64_t slba, capacity = nvme_ns_nlbas(ns);
+ NvmeZoneDescr *z;
+ NvmeZone *zone;
+ NvmeZoneReportHeader *header;
+ void *buf, *buf_p;
+ size_t zone_entry_sz;
+
+ req->status = NVME_SUCCESS;
+
+ status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+ if (status) {
+ return status;
+ }
+
+ zra = dw13 & 0xff;
+ if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+ if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ zrasf = (dw13 >> 8) & 0xff;
+ if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ if (data_size < sizeof(NvmeZoneReportHeader)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ status = nvme_check_mdts(n, data_size);
+ if (status) {
+ trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+ return status;
+ }
+
+ partial = (dw13 >> 16) & 0x01;
+
+ zone_entry_sz = sizeof(NvmeZoneDescr);
+ if (zra == NVME_ZONE_REPORT_EXTENDED) {
+ zone_entry_sz += ns->params.zd_extension_size;
+ }
+
+ max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
+ buf = g_malloc0(data_size);
+
+ zone = &ns->zone_array[zone_idx];
+ for (; slba < capacity; slba += ns->zone_size) {
+ if (partial && nr_zones >= max_zones) {
+ break;
+ }
+ if (nvme_zone_matches_filter(zrasf, zone++)) {
+ nr_zones++;
+ }
+ }
+ header = (NvmeZoneReportHeader *)buf;
+ header->nr_zones = cpu_to_le64(nr_zones);
+
+ buf_p = buf + sizeof(NvmeZoneReportHeader);
+ for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
+ zone = &ns->zone_array[zone_idx];
+ if (nvme_zone_matches_filter(zrasf, zone)) {
+ z = (NvmeZoneDescr *)buf_p;
+ buf_p += sizeof(NvmeZoneDescr);
+
+ z->zt = zone->d.zt;
+ z->zs = zone->d.zs;
+ z->zcap = cpu_to_le64(zone->d.zcap);
+ z->zslba = cpu_to_le64(zone->d.zslba);
+ z->za = zone->d.za;
+
+ if (nvme_wp_is_valid(zone)) {
+ z->wp = cpu_to_le64(zone->d.wp);
+ } else {
+ z->wp = cpu_to_le64(~0ULL);
+ }
+
+ if (zra == NVME_ZONE_REPORT_EXTENDED) {
+ if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
+ memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
+ ns->params.zd_extension_size);
+ }
+ buf_p += ns->params.zd_extension_size;
+ }
+
+ max_zones--;
+ }
+ }
+
+ status = nvme_dma(n, (uint8_t *)buf, data_size,
+ DMA_DIRECTION_FROM_DEVICE, req);
+
+ g_free(buf);
+
return status;
}
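
Buffer sizing arithmetic for the report (illustrative, assuming the 64-byte report header and 64-byte zone descriptor sizes from the ZNS spec): a host setting cdw12 = 1023 asks for (1023 + 1) << 2 = 4096 bytes, which holds (4096 - 64) / 64 = 63 plain descriptors, or (4096 - 64) / (64 + zd_extension_size) entries for an extended report, e.g. 31 with a 64-byte extension.
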
@@ -1022,10 +2339,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
- if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
- return NVME_INVALID_OPCODE | NVME_DNR;
- }
-
if (!nvme_nsid_valid(n, nsid)) {
return NVME_INVALID_NSID | NVME_DNR;
}
@@ -1035,18 +2348,35 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_FIELD | NVME_DNR;
}
+ if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+ trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+ return NVME_INVALID_OPCODE | NVME_DNR;
+ }
+
switch (req->cmd.opcode) {
case NVME_CMD_FLUSH:
return nvme_flush(n, req);
case NVME_CMD_WRITE_ZEROES:
return nvme_write_zeroes(n, req);
+ case NVME_CMD_ZONE_APPEND:
+ return nvme_zone_append(n, req);
case NVME_CMD_WRITE:
+ return nvme_write(n, req);
case NVME_CMD_READ:
- return nvme_rw(n, req);
+ return nvme_read(n, req);
+ case NVME_CMD_COMPARE:
+ return nvme_compare(n, req);
+ case NVME_CMD_DSM:
+ return nvme_dsm(n, req);
+ case NVME_CMD_ZONE_MGMT_SEND:
+ return nvme_zone_mgmt_send(n, req);
+ case NVME_CMD_ZONE_MGMT_RECV:
+ return nvme_zone_mgmt_recv(n, req);
default:
- trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
- return NVME_INVALID_OPCODE | NVME_DNR;
+ assert(false);
}
+
+ return NVME_INVALID_OPCODE | NVME_DNR;
}
static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
@@ -1214,6 +2544,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
}
trans_len = MIN(sizeof(smart) - off, buf_len);
+ smart.critical_warning = n->smart_critical_warning;
smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
1000));
@@ -1281,6 +2612,47 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
DMA_DIRECTION_FROM_DEVICE, req);
}
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+ NvmeEffectsLog log = {};
+ const uint32_t *src_iocs = NULL;
+ uint32_t trans_len;
+
+ if (off >= sizeof(log)) {
+ trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ switch (NVME_CC_CSS(n->bar.cc)) {
+ case NVME_CC_CSS_NVM:
+ src_iocs = nvme_cse_iocs_nvm;
+ /* fall through */
+ case NVME_CC_CSS_ADMIN_ONLY:
+ break;
+ case NVME_CC_CSS_CSI:
+ switch (csi) {
+ case NVME_CSI_NVM:
+ src_iocs = nvme_cse_iocs_nvm;
+ break;
+ case NVME_CSI_ZONED:
+ src_iocs = nvme_cse_iocs_zoned;
+ break;
+ }
+ }
+
+ memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
+
+ if (src_iocs) {
+ memcpy(log.iocs, src_iocs, sizeof(log.iocs));
+ }
+
+ trans_len = MIN(sizeof(log) - off, buf_len);
+
+ return nvme_dma(n, ((uint8_t *)&log) + off, trans_len,
+ DMA_DIRECTION_FROM_DEVICE, req);
+}
+
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
{
NvmeCmd *cmd = &req->cmd;
@@ -1292,6 +2664,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
uint8_t lid = dw10 & 0xff;
uint8_t lsp = (dw10 >> 8) & 0xf;
uint8_t rae = (dw10 >> 15) & 0x1;
+ uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
uint32_t numdl, numdu;
uint64_t off, lpol, lpou;
size_t len;
@@ -1324,6 +2697,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
return nvme_smart_info(n, rae, len, off, req);
case NVME_LOG_FW_SLOT_INFO:
return nvme_fw_log_info(n, len, off, req);
+ case NVME_LOG_CMD_EFFECTS:
+ return nvme_cmd_effects(n, csi, len, off, req);
default:
trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
return NVME_INVALID_FIELD | NVME_DNR;
@@ -1334,7 +2709,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
n->cq[cq->cqid] = NULL;
timer_free(cq->timer);
- msix_vector_unuse(&n->parent_obj, cq->vector);
+ if (msix_enabled(&n->parent_obj)) {
+ msix_vector_unuse(&n->parent_obj, cq->vector);
+ }
if (cq->cqid) {
g_free(cq);
}
@@ -1368,8 +2745,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
{
int ret;
- ret = msix_vector_use(&n->parent_obj, vector);
- assert(ret == 0);
+ if (msix_enabled(&n->parent_obj)) {
+ ret = msix_vector_use(&n->parent_obj, vector);
+ assert(ret == 0);
+ }
cq->ctrl = n;
cq->cqid = cqid;
cq->size = size;
@@ -1436,6 +2815,23 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
return NVME_SUCCESS;
}
+static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
+{
+ uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
+
+ return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
+}
+
+static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
+{
+ switch (ns->csi) {
+ case NVME_CSI_NVM:
+ case NVME_CSI_ZONED:
+ return true;
+ }
+ return false;
+}
+
static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
trace_pci_nvme_identify_ctrl();
@@ -1444,11 +2840,30 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
DMA_DIRECTION_FROM_DEVICE, req);
}
+static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+ NvmeIdCtrlZoned id = {};
+
+ trace_pci_nvme_identify_ctrl_csi(c->csi);
+
+ if (c->csi == NVME_CSI_NVM) {
+ return nvme_rpt_empty_id_struct(n, req);
+ } else if (c->csi == NVME_CSI_ZONED) {
+ if (n->params.zasl_bs) {
+ id.zasl = n->zasl;
+ }
+ return nvme_dma(n, (uint8_t *)&id, sizeof(id),
+ DMA_DIRECTION_FROM_DEVICE, req);
+ }
+
+ return NVME_INVALID_FIELD | NVME_DNR;
+}
+
static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
{
NvmeNamespace *ns;
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
- NvmeIdNs *id_ns, inactive = { 0 };
uint32_t nsid = le32_to_cpu(c->nsid);
trace_pci_nvme_identify_ns(nsid);
@@ -1459,23 +2874,53 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
ns = nvme_ns(n, nsid);
if (unlikely(!ns)) {
- id_ns = &inactive;
- } else {
- id_ns = &ns->id_ns;
+ return nvme_rpt_empty_id_struct(n, req);
}
- return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs),
- DMA_DIRECTION_FROM_DEVICE, req);
+ if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
+ return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
+ DMA_DIRECTION_FROM_DEVICE, req);
+ }
+
+ return NVME_INVALID_CMD_SET | NVME_DNR;
+}
+
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns;
+ NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+ uint32_t nsid = le32_to_cpu(c->nsid);
+
+ trace_pci_nvme_identify_ns_csi(nsid, c->csi);
+
+ if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ ns = nvme_ns(n, nsid);
+ if (unlikely(!ns)) {
+ return nvme_rpt_empty_id_struct(n, req);
+ }
+
+ if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
+ return nvme_rpt_empty_id_struct(n, req);
+ } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
+ return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
+ DMA_DIRECTION_FROM_DEVICE, req);
+ }
+
+ return NVME_INVALID_FIELD | NVME_DNR;
}
static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
{
+ NvmeNamespace *ns;
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
- static const int data_len = NVME_IDENTIFY_DATA_SIZE;
uint32_t min_nsid = le32_to_cpu(c->nsid);
- uint32_t *list;
- uint16_t ret;
- int j = 0;
+ uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+ static const int data_len = sizeof(list);
+ uint32_t *list_ptr = (uint32_t *)list;
+ int i, j = 0;
trace_pci_nvme_identify_nslist(min_nsid);
@@ -1489,33 +2934,79 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_NSID | NVME_DNR;
}
- list = g_malloc0(data_len);
- for (int i = 1; i <= n->num_namespaces; i++) {
- if (i <= min_nsid || !nvme_ns(n, i)) {
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+ if (!ns) {
continue;
}
- list[j++] = cpu_to_le32(i);
+ if (ns->params.nsid <= min_nsid) {
+ continue;
+ }
+ list_ptr[j++] = cpu_to_le32(ns->params.nsid);
if (j == data_len / sizeof(uint32_t)) {
break;
}
}
- ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
- req);
- g_free(list);
- return ret;
+
+ return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
+}
+
+static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns;
+ NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+ uint32_t min_nsid = le32_to_cpu(c->nsid);
+ uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+ static const int data_len = sizeof(list);
+ uint32_t *list_ptr = (uint32_t *)list;
+ int i, j = 0;
+
+ trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
+
+ /*
+ * Same as in nvme_identify_nslist(), 0xffffffff/0xfffffffe are invalid.
+ */
+ if (min_nsid >= NVME_NSID_BROADCAST - 1) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+ if (!ns) {
+ continue;
+ }
+ if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
+ continue;
+ }
+ list_ptr[j++] = cpu_to_le32(ns->params.nsid);
+ if (j == data_len / sizeof(uint32_t)) {
+ break;
+ }
+ }
+
+ return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
}
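Both namespace-list handlers now walk the controller's namespace slots directly: unattached slots and NSIDs at or below the requested floor are skipped (the CSI variant additionally filters on the command set), and the walk stops once the 4 KiB identify buffer is full of 32-bit IDs. A self-contained sketch of that loop (the slot table is made up):

    #include <stdint.h>
    #include <stdio.h>

    #define DATA_SIZE 4096
    #define NUM_NS    8

    /* 0 marks an unattached slot; nonzero values are namespace IDs */
    static const uint32_t ns_slots[NUM_NS + 1] = {0, 1, 0, 3, 4, 0, 6, 7, 8};

    static int build_nslist(uint32_t min_nsid, uint32_t *list, int max_entries)
    {
        int i, j = 0;

        for (i = 1; i <= NUM_NS; i++) {
            uint32_t nsid = ns_slots[i];
            if (!nsid || nsid <= min_nsid) {
                continue;            /* unattached, or at/below the floor */
            }
            list[j++] = nsid;        /* QEMU stores this little-endian */
            if (j == max_entries) {
                break;
            }
        }
        return j;
    }

    int main(void)
    {
        uint32_t list[DATA_SIZE / sizeof(uint32_t)] = {0};
        int n = build_nslist(3, list, DATA_SIZE / sizeof(uint32_t));

        for (int i = 0; i < n; i++) {
            printf("nsid %u\n", list[i]);    /* prints 4, 6, 7, 8 */
        }
        return 0;
    }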
static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
{
+ NvmeNamespace *ns;
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
uint32_t nsid = le32_to_cpu(c->nsid);
- uint8_t list[NVME_IDENTIFY_DATA_SIZE];
+ uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
struct data {
struct {
NvmeIdNsDescr hdr;
- uint8_t v[16];
+ uint8_t v[NVME_NIDL_UUID];
} uuid;
+ struct {
+ NvmeIdNsDescr hdr;
+ uint8_t v;
+ } csi;
};
struct data *ns_descrs = (struct data *)list;
@@ -1526,24 +3017,38 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_NSID | NVME_DNR;
}
- if (unlikely(!nvme_ns(n, nsid))) {
+ ns = nvme_ns(n, nsid);
+ if (unlikely(!ns)) {
return NVME_INVALID_FIELD | NVME_DNR;
}
- memset(list, 0x0, sizeof(list));
-
/*
* Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
* structure, a Namespace UUID (nidt = 0x3) must be reported in the
- * Namespace Identification Descriptor. Add a very basic Namespace UUID
- * here.
+ * Namespace Identification Descriptor. Add the namespace UUID here.
*/
ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
- ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
- stl_be_p(&ns_descrs->uuid.v, nsid);
+ ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
+ memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
- return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
- DMA_DIRECTION_FROM_DEVICE, req);
+ ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
+ ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
+ ns_descrs->csi.v = ns->csi;
+
+ return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req);
+}
+
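The descriptor list returned above is a packed sequence of (NIDT, NIDL, value) entries; the anonymous-struct overlay is just a convenient way of writing the UUID and CSI descriptors at the front of the zeroed 4 KiB buffer. A sketch of the same layout built by hand (NIDT codes as used by the patch; payload bytes invented):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NIDT_UUID 0x3
    #define NIDL_UUID 16
    #define NIDT_CSI  0x4
    #define NIDL_CSI  1

    struct ns_descr {
        uint8_t nidt;      /* identifier type   */
        uint8_t nidl;      /* identifier length */
        uint8_t rsvd[2];
        /* nidl bytes of value follow */
    };

    int main(void)
    {
        uint8_t list[4096] = {0};
        uint8_t uuid[NIDL_UUID] = {0xde, 0xad, 0xbe, 0xef};
        uint8_t *p = list;

        struct ns_descr d = { .nidt = NIDT_UUID, .nidl = NIDL_UUID };
        memcpy(p, &d, sizeof(d));
        memcpy(p + sizeof(d), uuid, NIDL_UUID);
        p += sizeof(d) + NIDL_UUID;

        d = (struct ns_descr){ .nidt = NIDT_CSI, .nidl = NIDL_CSI };
        memcpy(p, &d, sizeof(d));
        p[sizeof(d)] = 0x02;   /* CSI value: zoned */

        printf("used %zu of %zu bytes\n",
               (size_t)(p + sizeof(d) + NIDL_CSI - list), sizeof(list));
        return 0;
    }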
+static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
+{
+ uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+ static const int data_len = sizeof(list);
+
+ trace_pci_nvme_identify_cmd_set();
+
+ NVME_SET_CSI(*list, NVME_CSI_NVM);
+ NVME_SET_CSI(*list, NVME_CSI_ZONED);
+
+ return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
@@ -1552,13 +3057,29 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
switch (le32_to_cpu(c->cns)) {
case NVME_ID_CNS_NS:
+ /* fall through */
+ case NVME_ID_CNS_NS_PRESENT:
return nvme_identify_ns(n, req);
+ case NVME_ID_CNS_CS_NS:
+ /* fall through */
+ case NVME_ID_CNS_CS_NS_PRESENT:
+ return nvme_identify_ns_csi(n, req);
case NVME_ID_CNS_CTRL:
return nvme_identify_ctrl(n, req);
+ case NVME_ID_CNS_CS_CTRL:
+ return nvme_identify_ctrl_csi(n, req);
case NVME_ID_CNS_NS_ACTIVE_LIST:
+ /* fall through */
+ case NVME_ID_CNS_NS_PRESENT_LIST:
return nvme_identify_nslist(n, req);
+ case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
+ /* fall through */
+ case NVME_ID_CNS_CS_NS_PRESENT_LIST:
+ return nvme_identify_nslist_csi(n, req);
case NVME_ID_CNS_NS_DESCR_LIST:
return nvme_identify_ns_descr_list(n, req);
+ case NVME_ID_CNS_IO_COMMAND_SET:
+ return nvme_identify_cmd_set(n, req);
default:
trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
return NVME_INVALID_FIELD | NVME_DNR;
@@ -1630,6 +3151,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
uint8_t fid = NVME_GETSETFEAT_FID(dw10);
NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
uint16_t iv;
+ NvmeNamespace *ns;
+ int i;
static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
[NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -1692,8 +3215,31 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
}
return NVME_INVALID_FIELD | NVME_DNR;
+ case NVME_ERROR_RECOVERY:
+ if (!nvme_nsid_valid(n, nsid)) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
+ ns = nvme_ns(n, nsid);
+ if (unlikely(!ns)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ result = ns->features.err_rec;
+ goto out;
case NVME_VOLATILE_WRITE_CACHE:
- result = n->features.vwc;
+ result = 0;
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+ if (!ns) {
+ continue;
+ }
+
+ result = blk_enable_write_cache(ns->blkconf.blk);
+ if (result) {
+ break;
+ }
+ }
trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
goto out;
case NVME_ASYNCHRONOUS_EVENT_CONF:
@@ -1734,7 +3280,9 @@ defaults:
if (iv == n->admin_cq.vector) {
result |= NVME_INTVC_NOCOALESCING;
}
-
+ break;
+ case NVME_COMMAND_SET_PROFILE:
+ result = 0;
break;
default:
result = nvme_feature_default[fid];
@@ -1753,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
DMA_DIRECTION_TO_DEVICE, req);
- if (ret != NVME_SUCCESS) {
+ if (ret) {
return ret;
}
@@ -1764,7 +3312,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
- NvmeNamespace *ns;
+ NvmeNamespace *ns = NULL;
NvmeCmd *cmd = &req->cmd;
uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -1772,10 +3320,11 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
uint32_t nsid = le32_to_cpu(cmd->nsid);
uint8_t fid = NVME_GETSETFEAT_FID(dw10);
uint8_t save = NVME_SETFEAT_SAVE(dw10);
+ int i;
trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
- if (save) {
+ if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
return NVME_FID_NOT_SAVEABLE | NVME_DNR;
}
@@ -1823,19 +3372,36 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_FIELD | NVME_DNR;
}
- if (((n->temperature >= n->features.temp_thresh_hi) ||
- (n->temperature <= n->features.temp_thresh_low)) &&
- NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
- nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
- NVME_AER_INFO_SMART_TEMP_THRESH,
- NVME_LOG_SMART_INFO);
+ if ((n->temperature >= n->features.temp_thresh_hi) ||
+ (n->temperature <= n->features.temp_thresh_low)) {
+ nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
}
break;
- case NVME_VOLATILE_WRITE_CACHE:
- n->features.vwc = dw11 & 0x1;
+ case NVME_ERROR_RECOVERY:
+ if (nsid == NVME_NSID_BROADCAST) {
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
- for (int i = 1; i <= n->num_namespaces; i++) {
+ if (!ns) {
+ continue;
+ }
+
+ if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+ ns->features.err_rec = dw11;
+ }
+ }
+
+ break;
+ }
+
+ assert(ns);
+ if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+ ns->features.err_rec = dw11;
+ }
+ break;
+ case NVME_VOLATILE_WRITE_CACHE:
+ for (i = 1; i <= n->num_namespaces; i++) {
ns = nvme_ns(n, i);
if (!ns) {
continue;
@@ -1875,6 +3441,12 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
break;
case NVME_TIMESTAMP:
return nvme_set_feature_timestamp(n, req);
+ case NVME_COMMAND_SET_PROFILE:
+ if (dw11 & 0x1ff) {
+ trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
+ return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
+ }
+ break;
default:
return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
}
@@ -1905,6 +3477,11 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
nvme_adm_opc_str(req->cmd.opcode));
+ if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+ trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
+ return NVME_INVALID_OPCODE | NVME_DNR;
+ }
+
switch (req->cmd.opcode) {
case NVME_ADM_CMD_DELETE_SQ:
return nvme_del_sq(n, req);
@@ -1927,9 +3504,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
case NVME_ADM_CMD_ASYNC_EV_REQ:
return nvme_aer(n, req);
default:
- trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
- return NVME_INVALID_OPCODE | NVME_DNR;
+ assert(false);
}
+
+ return NVME_INVALID_OPCODE | NVME_DNR;
}
static void nvme_process_sq(void *opaque)
@@ -1969,7 +3547,7 @@ static void nvme_process_sq(void *opaque)
}
}
-static void nvme_clear_ctrl(NvmeCtrl *n)
+static void nvme_ctrl_reset(NvmeCtrl *n)
{
NvmeNamespace *ns;
int i;
@@ -2004,16 +3582,54 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
n->outstanding_aers = 0;
n->qs_created = false;
+ n->bar.cc = 0;
+}
+
+static void nvme_ctrl_shutdown(NvmeCtrl *n)
+{
+ NvmeNamespace *ns;
+ int i;
+
+ if (n->pmr.dev) {
+ memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
+ }
+
for (i = 1; i <= n->num_namespaces; i++) {
ns = nvme_ns(n, i);
if (!ns) {
continue;
}
- nvme_ns_flush(ns);
+ nvme_ns_shutdown(ns);
}
+}
- n->bar.cc = 0;
+static void nvme_select_ns_iocs(NvmeCtrl *n)
+{
+ NvmeNamespace *ns;
+ int i;
+
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+ if (!ns) {
+ continue;
+ }
+ ns->iocs = nvme_cse_iocs_none;
+ switch (ns->csi) {
+ case NVME_CSI_NVM:
+ if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
+ ns->iocs = nvme_cse_iocs_nvm;
+ }
+ break;
+ case NVME_CSI_ZONED:
+ if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
+ ns->iocs = nvme_cse_iocs_zoned;
+ } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
+ ns->iocs = nvme_cse_iocs_nvm;
+ }
+ break;
+ }
+ }
}
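nvme_select_ns_iocs() recomputes each namespace's effective I/O command set whenever the controller is enabled: NVM namespaces are usable under any CC.CSS setting except admin-only, while zoned namespaces require CC.CSS to select all supported command sets and degrade to the plain NVM set under CC.CSS=NVM. The decision table as a standalone sketch (enum names are illustrative):

    #include <stdio.h>

    enum css  { CSS_NVM, CSS_CSI, CSS_ADMIN_ONLY };
    enum csi  { CSI_NVM, CSI_ZONED };
    enum iocs { IOCS_NONE, IOCS_NVM, IOCS_ZONED };

    static enum iocs select_iocs(enum css css, enum csi csi)
    {
        switch (csi) {
        case CSI_NVM:
            return css == CSS_ADMIN_ONLY ? IOCS_NONE : IOCS_NVM;
        case CSI_ZONED:
            if (css == CSS_CSI) {
                return IOCS_ZONED;
            }
            return css == CSS_NVM ? IOCS_NVM : IOCS_NONE;
        }
        return IOCS_NONE;
    }

    int main(void)
    {
        static const char *name[] = {"none", "nvm", "zoned"};

        printf("css=csi,   zoned ns -> %s\n", name[select_iocs(CSS_CSI, CSI_ZONED)]);
        printf("css=nvm,   zoned ns -> %s\n", name[select_iocs(CSS_NVM, CSI_ZONED)]);
        printf("css=admin, nvm ns   -> %s\n", name[select_iocs(CSS_ADMIN_ONLY, CSI_NVM)]);
        return 0;
    }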
static int nvme_start_ctrl(NvmeCtrl *n)
@@ -2110,13 +3726,41 @@ static int nvme_start_ctrl(NvmeCtrl *n)
nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
NVME_AQA_ASQS(n->bar.aqa) + 1);
+ if (!n->params.zasl_bs) {
+ n->zasl = n->params.mdts;
+ } else {
+ if (n->params.zasl_bs < n->page_size) {
+ trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs,
+ n->page_size);
+ return -1;
+ }
+ n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size);
+ }
+
nvme_set_timestamp(n, 0ULL);
QTAILQ_INIT(&n->aer_queue);
+ nvme_select_ns_iocs(n);
+
return 0;
}
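As with MDTS, the ZASL value reported to the host is a power-of-two exponent in units of the minimum memory page size, hence the `31 - clz32(zasl_bs / n->page_size)` conversion above. A quick check of the arithmetic for the 128 KiB default and 4 KiB pages (clz32() approximated with the GCC builtin):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned log2_u32(uint32_t v)
    {
        return 31 - __builtin_clz(v);   /* stands in for QEMU's 31 - clz32(v) */
    }

    int main(void)
    {
        uint32_t page_size = 4096;
        uint32_t zasl_bs = 128 * 1024;          /* NVME_DEFAULT_MAX_ZA_SIZE */
        unsigned zasl = log2_u32(zasl_bs / page_size);

        /* 128 KiB / 4 KiB = 32 pages -> zasl = 5, since 2^5 * 4 KiB = 128 KiB */
        printf("zasl = %u (max append = %u bytes)\n",
               zasl, (1u << zasl) * page_size);
        return 0;
    }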
+static void nvme_cmb_enable_regs(NvmeCtrl *n)
+{
+ NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
+ NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
+ NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
+
+ NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+ NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+ NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
+ NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
+ NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
+ NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
+ NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
+}
+
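nvme_cmb_enable_regs() programs SZU=2, i.e. a 1 MiB size unit (each SZU step scales the 4 KiB base granularity by 16), so the size a host decodes from CMBSZ is SZ shifted left by 12 + 4*SZU bits. Decoding sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* decode a CMBSZ value: size = SZ << (12 + 4 * SZU) bytes */
    static uint64_t cmbsz_bytes(uint32_t szu, uint32_t sz)
    {
        return (uint64_t)sz << (12 + 4 * szu);
    }

    int main(void)
    {
        /* the patch programs SZU=2 (MiB units) and SZ=cmb_size_mb */
        printf("cmb_size_mb=64 -> %llu bytes\n",
               (unsigned long long)cmbsz_bytes(2, 64));
        return 0;
    }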
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
unsigned size)
{
@@ -2180,12 +3824,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
}
} else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
trace_pci_nvme_mmio_stopped();
- nvme_clear_ctrl(n);
+ nvme_ctrl_reset(n);
n->bar.csts &= ~NVME_CSTS_READY;
}
if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
trace_pci_nvme_mmio_shutdown_set();
- nvme_clear_ctrl(n);
+ nvme_ctrl_shutdown(n);
n->bar.cc = data;
n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
} else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
@@ -2218,19 +3862,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
break;
case 0x28: /* ASQ */
- n->bar.asq = data;
+ n->bar.asq = size == 8 ? data :
+ (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
trace_pci_nvme_mmio_asqaddr(data);
break;
case 0x2c: /* ASQ hi */
- n->bar.asq |= data << 32;
+ n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
break;
case 0x30: /* ACQ */
trace_pci_nvme_mmio_acqaddr(data);
- n->bar.acq = data;
+ n->bar.acq = size == 8 ? data :
+ (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
break;
case 0x34: /* ACQ hi */
- n->bar.acq |= data << 32;
+ n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
break;
case 0x38: /* CMBLOC */
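The ASQ/ACQ hunk above also fixes a latent bug: the old code ORed the high dword in (`n->bar.asq |= data << 32`), so rewriting the register could leave stale bits set. The new code masks and merges each 32-bit half and additionally accepts a single 8-byte write. The merge pattern in isolation:

    #include <stdint.h>
    #include <stdio.h>

    /* write one half of a 64-bit register, preserving the other half */
    static void write_lo(uint64_t *reg, uint32_t data)
    {
        *reg = (*reg & ~0xffffffffULL) | data;
    }

    static void write_hi(uint64_t *reg, uint32_t data)
    {
        *reg = (*reg & 0xffffffffULL) | ((uint64_t)data << 32);
    }

    int main(void)
    {
        uint64_t asq = 0;

        write_lo(&asq, 0xcafe1000);
        write_hi(&asq, 0x1);
        write_hi(&asq, 0x2);           /* must replace, not OR, the old value */
        printf("asq = 0x%016llx\n", (unsigned long long)asq);
        return 0;
    }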
@@ -2242,12 +3888,53 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
return;
+ case 0x50: /* CMBMSC */
+ if (!NVME_CAP_CMBS(n->bar.cap)) {
+ return;
+ }
+
+ n->bar.cmbmsc = size == 8 ? data :
+ (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff);
+ n->cmb.cmse = false;
+
+ if (NVME_CMBMSC_CRE(data)) {
+ nvme_cmb_enable_regs(n);
+
+ if (NVME_CMBMSC_CMSE(data)) {
+ hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
+ if (cba + int128_get64(n->cmb.mem.size) < cba) {
+ NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
+ return;
+ }
+
+ n->cmb.cba = cba;
+ n->cmb.cmse = true;
+ }
+ } else {
+ n->bar.cmbsz = 0;
+ n->bar.cmbloc = 0;
+ }
+
+ return;
+ case 0x54: /* CMBMSC hi */
+ n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
+ return;
+
case 0xE00: /* PMRCAP */
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
"invalid write to PMRCAP register, ignored");
return;
- case 0xE04: /* TODO PMRCTL */
- break;
+ case 0xE04: /* PMRCTL */
+ n->bar.pmrctl = data;
+ if (NVME_PMRCTL_EN(data)) {
+ memory_region_set_enabled(&n->pmr.dev->mr, true);
+ n->bar.pmrsts = 0;
+ } else {
+ memory_region_set_enabled(&n->pmr.dev->mr, false);
+ NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
+ n->pmr.cmse = false;
+ }
+ return;
case 0xE08: /* PMRSTS */
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
"invalid write to PMRSTS register, ignored");
@@ -2260,8 +3947,33 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
"invalid write to PMRSWTP register, ignored");
return;
- case 0xE14: /* TODO PMRMSC */
- break;
+ case 0xE14: /* PMRMSCL */
+ if (!NVME_CAP_PMRS(n->bar.cap)) {
+ return;
+ }
+
+ n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff);
+ n->pmr.cmse = false;
+
+ if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
+ hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
+ if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
+ NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
+ return;
+ }
+
+ n->pmr.cmse = true;
+ n->pmr.cba = cba;
+ }
+
+ return;
+ case 0xE18: /* PMRMSCU */
+ if (!NVME_CAP_PMRS(n->bar.cap)) {
+ return;
+ }
+
+ n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
+ return;
default:
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -2277,7 +3989,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
uint8_t *ptr = (uint8_t *)&n->bar;
uint64_t val = 0;
- trace_pci_nvme_mmio_read(addr);
+ trace_pci_nvme_mmio_read(addr, size);
if (unlikely(addr & (sizeof(uint32_t) - 1))) {
NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
@@ -2299,7 +4011,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
*/
if (addr == 0xE08 &&
(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
- memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
+ memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
}
memcpy(&val, ptr + addr, size);
} else {
@@ -2441,7 +4153,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
{
NvmeCtrl *n = (NvmeCtrl *)opaque;
- trace_pci_nvme_mmio_write(addr, data);
+ trace_pci_nvme_mmio_write(addr, data, size);
if (addr < sizeof(n->bar)) {
nvme_write_bar(n, addr, data, size);
@@ -2464,13 +4176,13 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
unsigned size)
{
NvmeCtrl *n = (NvmeCtrl *)opaque;
- stn_le_p(&n->cmbuf[addr], size, data);
+ stn_le_p(&n->cmb.buf[addr], size, data);
}
static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
{
NvmeCtrl *n = (NvmeCtrl *)opaque;
- return ldn_le_p(&n->cmbuf[addr], size);
+ return ldn_le_p(&n->cmb.buf[addr], size);
}
static const MemoryRegionOps nvme_cmb_ops = {
@@ -2518,19 +4230,26 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
return;
}
- if (!n->params.cmb_size_mb && n->pmrdev) {
- if (host_memory_backend_is_mapped(n->pmrdev)) {
+ if (n->pmr.dev) {
+ if (host_memory_backend_is_mapped(n->pmr.dev)) {
error_setg(errp, "can't use already busy memdev: %s",
- object_get_canonical_path_component(OBJECT(n->pmrdev)));
+ object_get_canonical_path_component(OBJECT(n->pmr.dev)));
return;
}
- if (!is_power_of_2(n->pmrdev->size)) {
+ if (!is_power_of_2(n->pmr.dev->size)) {
error_setg(errp, "pmr backend size needs to be power of 2 in size");
return;
}
- host_memory_backend_set_mapped(n->pmrdev, true);
+ host_memory_backend_set_mapped(n->pmr.dev, true);
+ }
+
+ if (n->params.zasl_bs) {
+ if (!is_power_of_2(n->params.zasl_bs)) {
+ error_setg(errp, "zone append size limit has to be a power of 2");
+ return;
+ }
}
}
@@ -2586,78 +4305,49 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
{
- NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
- NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
+ uint64_t cmb_size = n->params.cmb_size_mb * MiB;
- NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
- NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
- NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
- NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
- NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
- NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
- NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
-
- n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
- memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
- "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
- pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
+ n->cmb.buf = g_malloc0(cmb_size);
+ memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
+ "nvme-cmb", cmb_size);
+ pci_register_bar(pci_dev, NVME_CMB_BIR,
PCI_BASE_ADDRESS_SPACE_MEMORY |
PCI_BASE_ADDRESS_MEM_TYPE_64 |
- PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
+ PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
+
+ NVME_CAP_SET_CMBS(n->bar.cap, 1);
+
+ if (n->params.legacy_cmb) {
+ nvme_cmb_enable_regs(n);
+ n->cmb.cmse = true;
+ }
}
static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
{
- /* Controller Capabilities register */
- NVME_CAP_SET_PMRS(n->bar.cap, 1);
-
- /* PMR Capabities register */
- n->bar.pmrcap = 0;
- NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
- NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
+ NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
+ NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
- NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
/* Turn on bit 1 support */
NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
- NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
- NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
-
- /* PMR Control register */
- n->bar.pmrctl = 0;
- NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
-
- /* PMR Status register */
- n->bar.pmrsts = 0;
- NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
- NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
- NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
- NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
-
- /* PMR Elasticity Buffer Size register */
- n->bar.pmrebs = 0;
- NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
- NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
- NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
-
- /* PMR Sustained Write Throughput register */
- n->bar.pmrswtp = 0;
- NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
- NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
-
- /* PMR Memory Space Control register */
- n->bar.pmrmsc = 0;
- NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
- NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
+ NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
PCI_BASE_ADDRESS_SPACE_MEMORY |
PCI_BASE_ADDRESS_MEM_TYPE_64 |
- PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
+ PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
+
+ memory_region_set_enabled(&n->pmr.dev->mr, false);
}
-static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
+static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
uint8_t *pci_conf = pci_dev->config;
+ uint64_t bar_size, msix_table_size, msix_pba_size;
+ unsigned msix_table_offset, msix_pba_offset;
+ int ret;
+
+ Error *err = NULL;
pci_conf[PCI_INTERRUPT_PIN] = 1;
pci_config_set_prog_interface(pci_conf, 0x2);
@@ -2673,19 +4363,46 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
pcie_endpoint_cap_init(pci_dev, 0x80);
+ bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
+ msix_table_offset = bar_size;
+ msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
+
+ bar_size += msix_table_size;
+ bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+ msix_pba_offset = bar_size;
+ msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
+
+ bar_size += msix_pba_size;
+ bar_size = pow2ceil(bar_size);
+
+ memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
n->reg_size);
+ memory_region_add_subregion(&n->bar0, 0, &n->iomem);
+
pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
- PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
- if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) {
- return;
+ PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+ ret = msix_init(pci_dev, n->params.msix_qsize,
+ &n->bar0, 0, msix_table_offset,
+ &n->bar0, 0, msix_pba_offset, 0, &err);
+ if (ret < 0) {
+ if (ret == -ENOTSUP) {
+ warn_report_err(err);
+ } else {
+ error_propagate(errp, err);
+ return ret;
+ }
}
if (n->params.cmb_size_mb) {
nvme_init_cmb(n, pci_dev);
- } else if (n->pmrdev) {
+ }
+
+ if (n->pmr.dev) {
nvme_init_pmr(n, pci_dev);
}
+
+ return 0;
}
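Since the MSI-X table and PBA now live inside BAR0 rather than an exclusive BAR, the BAR must be sized for the register file plus both structures, each 4 KiB aligned, then rounded up to a power of two as PCI requires. A worked version of the layout computation (the reg_size of 16 KiB and msix_qsize of 65 are assumed values):

    #include <stdint.h>
    #include <stdio.h>

    #define KIB 1024
    #define PCI_MSIX_ENTRY_SIZE 16

    static uint64_t align_up(uint64_t v, uint64_t a) { return (v + a - 1) & ~(a - 1); }

    static uint64_t pow2ceil(uint64_t v)
    {
        uint64_t r = 1;
        while (r < v) {
            r <<= 1;
        }
        return r;
    }

    int main(void)
    {
        uint64_t reg_size = 16 * KIB;      /* assumed; depends on queue count */
        unsigned msix_qsize = 65;

        uint64_t bar = align_up(reg_size, 4 * KIB);
        uint64_t table_off = bar;
        bar += PCI_MSIX_ENTRY_SIZE * msix_qsize;
        bar = align_up(bar, 4 * KIB);
        uint64_t pba_off = bar;
        bar += align_up(msix_qsize, 64) / 8;   /* one pending bit per vector */
        bar = pow2ceil(bar);

        /* table @ 0x4000, pba @ 0x5000, bar0 = 32 KiB */
        printf("table @ 0x%llx, pba @ 0x%llx, bar0 = %llu KiB\n",
               (unsigned long long)table_off, (unsigned long long)pba_off,
               (unsigned long long)(bar / KIB));
        return 0;
    }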
static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -2706,6 +4423,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->mdts = n->params.mdts;
id->ver = cpu_to_le32(NVME_SPEC_VER);
id->oacs = cpu_to_le16(0);
+ id->cntrltype = 0x1;
/*
* Because the controller always completes the Abort command immediately,
@@ -2721,7 +4439,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->acl = 3;
id->aerl = n->params.aerl;
id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
- id->lpa = NVME_LPA_NS_SMART | NVME_LPA_EXTENDED;
+ id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
/* recommended default value (~70 C) */
id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
@@ -2731,9 +4449,10 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->cqes = (0x4 << 4) | 0x4;
id->nn = cpu_to_le32(n->num_namespaces);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
- NVME_ONCS_FEATURES);
+ NVME_ONCS_FEATURES | NVME_ONCS_DSM |
+ NVME_ONCS_COMPARE);
- id->vwc = 0x1;
+ id->vwc = (0x2 << 1) | 0x1;
id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
NVME_CTRL_SGLS_BITBUCKET);
@@ -2745,13 +4464,15 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->psd[0].enlat = cpu_to_le32(0x10);
id->psd[0].exlat = cpu_to_le32(0x4);
- n->bar.cap = 0;
NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
NVME_CAP_SET_CQR(n->bar.cap, 1);
NVME_CAP_SET_TO(n->bar.cap, 0xf);
NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
+ NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+ NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
+ NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
n->bar.vs = NVME_SPEC_VER;
n->bar.intmc = n->bar.intms = 0;
@@ -2773,9 +4494,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
&pci_dev->qdev, n->parent_obj.qdev.id);
nvme_init_state(n);
- nvme_init_pci(n, pci_dev, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (nvme_init_pci(n, pci_dev, errp)) {
return;
}
@@ -2786,7 +4505,11 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
ns = &n->namespace;
ns->params.nsid = 1;
- if (nvme_ns_setup(n, ns, errp)) {
+ if (nvme_ns_setup(ns, errp)) {
+ return;
+ }
+
+ if (nvme_register_namespace(n, ns, errp)) {
return;
}
}
@@ -2795,25 +4518,37 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
static void nvme_exit(PCIDevice *pci_dev)
{
NvmeCtrl *n = NVME(pci_dev);
+ NvmeNamespace *ns;
+ int i;
+
+ nvme_ctrl_reset(n);
+
+ for (i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+ if (!ns) {
+ continue;
+ }
+
+ nvme_ns_cleanup(ns);
+ }
- nvme_clear_ctrl(n);
g_free(n->cq);
g_free(n->sq);
g_free(n->aer_reqs);
if (n->params.cmb_size_mb) {
- g_free(n->cmbuf);
+ g_free(n->cmb.buf);
}
- if (n->pmrdev) {
- host_memory_backend_set_mapped(n->pmrdev, false);
+ if (n->pmr.dev) {
+ host_memory_backend_set_mapped(n->pmr.dev, false);
}
msix_uninit_exclusive_bar(pci_dev);
}
static Property nvme_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
- DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
+ DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
HostMemoryBackend *),
DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
@@ -2824,9 +4559,54 @@ static Property nvme_props[] = {
DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
+ DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
+ DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs,
+ NVME_DEFAULT_MAX_ZA_SIZE),
DEFINE_PROP_END_OF_LIST(),
};
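With the new properties, a zoned setup might be invoked as below; this is an illustrative command line only (the per-namespace zoned parameters live in nvme-ns.c, outside this hunk), with made-up ids and image name:

    qemu-system-x86_64 ... \
        -drive id=nvm0,file=zoned.img,format=raw,if=none \
        -device nvme,id=nvme0,serial=deadbeef,zoned.append_size_limit=128K \
        -device nvme-ns,drive=nvm0,bus=nvme0,zoned=true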
+static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ NvmeCtrl *n = NVME(obj);
+ uint8_t value = n->smart_critical_warning;
+
+ visit_type_uint8(v, name, &value, errp);
+}
+
+static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ NvmeCtrl *n = NVME(obj);
+ uint8_t value, old_value, cap = 0, index, event;
+
+ if (!visit_type_uint8(v, name, &value, errp)) {
+ return;
+ }
+
+ cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
+ | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
+ if (NVME_CAP_PMRS(n->bar.cap)) {
+ cap |= NVME_SMART_PMR_UNRELIABLE;
+ }
+
+ if ((value & cap) != value) {
+ error_setg(errp, "unsupported smart critical warning bits: 0x%x",
+ value & ~cap);
+ return;
+ }
+
+ old_value = n->smart_critical_warning;
+ n->smart_critical_warning = value;
+
+ /* only inject new bits of smart critical warning */
+ for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
+ event = 1 << index;
+        if (value & ~old_value & event) {
+            nvme_smart_event(n, event);
+        }
+ }
+}
+
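The setter raises an AER only for bits that transition from 0 to 1, so re-asserting an already-set warning is quiet and clearing a bit never fires an event. The masking logic, extracted into a sketch:

    #include <stdint.h>
    #include <stdio.h>

    static void inject(uint8_t old, uint8_t val)
    {
        for (unsigned i = 0; i < 8; i++) {
            uint8_t event = 1u << i;
            if (val & ~old & event) {
                /* stands in for nvme_smart_event(n, event) */
                printf("raise smart event 0x%02x\n", event);
            }
        }
    }

    int main(void)
    {
        inject(0x00, 0x12);   /* raises 0x02 and 0x10 */
        inject(0x12, 0x12);   /* raises nothing: no new bits */
        inject(0x12, 0x02);   /* clearing 0x10 is silent */
        return 0;
    }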
static const VMStateDescription nvme_vmstate = {
.name = "nvme",
.unmigratable = 1,
@@ -2850,13 +4630,17 @@ static void nvme_class_init(ObjectClass *oc, void *data)
static void nvme_instance_init(Object *obj)
{
- NvmeCtrl *s = NVME(obj);
+ NvmeCtrl *n = NVME(obj);
- if (s->namespace.blkconf.blk) {
- device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex,
+ if (n->namespace.blkconf.blk) {
+ device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
"bootindex", "/namespace@1,0",
DEVICE(obj));
}
+
+ object_property_add(obj, "smart_critical_warning", "uint8",
+ nvme_get_smart_warning,
+ nvme_set_smart_warning, NULL, NULL);
}
static const TypeInfo nvme_info = {
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a..dee6092bd4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
#define NVME_MAX_NAMESPACES 256
+#define NVME_DEFAULT_ZONE_SIZE (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
typedef struct NvmeParams {
char *serial;
uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,8 @@ typedef struct NvmeParams {
uint32_t aer_max_queued;
uint8_t mdts;
bool use_intel_id;
+ uint32_t zasl_bs;
+ bool legacy_cmb;
} NvmeParams;
typedef struct NvmeAsyncEvent {
@@ -28,6 +33,7 @@ typedef struct NvmeRequest {
struct NvmeNamespace *ns;
BlockAIOCB *aiocb;
uint16_t status;
+ void *opaque;
NvmeCqe cqe;
NvmeCmd cmd;
BlockAcctCookie acct;
@@ -59,7 +65,12 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH";
case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE";
case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
+ case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE";
case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+ case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM";
+ case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND";
+ case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV";
+ case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND";
default: return "NVME_NVM_CMD_UNKNOWN";
}
}
@@ -111,13 +122,12 @@ typedef struct NvmeFeatureVal {
uint16_t temp_thresh_low;
};
uint32_t async_config;
- uint32_t vwc;
} NvmeFeatureVal;
typedef struct NvmeCtrl {
PCIDevice parent_obj;
+ MemoryRegion bar0;
MemoryRegion iomem;
- MemoryRegion ctrl_mem;
NvmeBar bar;
NvmeParams params;
NvmeBus bus;
@@ -133,20 +143,33 @@ typedef struct NvmeCtrl {
uint32_t num_namespaces;
uint32_t max_q_ents;
uint8_t outstanding_aers;
- uint8_t *cmbuf;
uint32_t irq_status;
uint64_t host_timestamp; /* Timestamp sent by the host */
uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock time */
uint64_t starttime_ms;
uint16_t temperature;
+ uint8_t smart_critical_warning;
- HostMemoryBackend *pmrdev;
+ struct {
+ MemoryRegion mem;
+ uint8_t *buf;
+ bool cmse;
+ hwaddr cba;
+ } cmb;
+
+ struct {
+ HostMemoryBackend *dev;
+ bool cmse;
+ hwaddr cba;
+ } pmr;
uint8_t aer_mask;
NvmeRequest **aer_reqs;
QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
int aer_queued;
+ uint8_t zasl;
+
NvmeNamespace namespace;
NvmeNamespace *namespaces[NVME_MAX_NAMESPACES];
NvmeSQueue **sq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index c1537e3ac0..d32475c398 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -40,16 +40,27 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2,
pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64""
pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
-pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
+pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
+pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
+pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
+pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
+pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
+pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""
pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
pci_nvme_identify_ctrl(void) "identify controller"
+pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8""
pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8""
+pci_nvme_identify_cmd_set(void) "identify i/o command set"
pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32""
@@ -69,8 +80,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
-pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
-pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
+pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
+pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d"
pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" new_tail %"PRIu16""
pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
@@ -85,6 +96,15 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded"
pci_nvme_mmio_stopped(void) "cleared controller enable bit"
pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32""
+pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_idx=%"PRIu32""
+pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state"
+pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state"
# nvme traces for error conditions
pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
@@ -102,6 +122,25 @@ pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR
pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "got %"PRIu64", must be <= %"PRIu64""
+pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
+pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
+pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx8", got slba=%"PRIu64", zslba=%"PRIu64""
+pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx8""
+pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
+pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64""
+pci_nvme_err_zone_is_full(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_is_read_only(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_is_offline(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%"PRIx64" nlb %"PRIu32" zcap 0x%"PRIx64""
+pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64""
+pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
+pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
+pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
+pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded"
+pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded"
+pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor extension for zone_idx=%"PRIu32""
+pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32""
pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16""
pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16""
pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16""
@@ -134,7 +173,9 @@ pci_nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_
pci_nvme_err_startfail_css(uint8_t css) "nvme_start_ctrl failed because invalid command set selected:%u"
pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero"
pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
+pci_nvme_err_startfail_zasl_too_small(uint32_t zasl, uint32_t pagesz) "nvme_start_ctrl failed because zone append size limit %"PRIu32" is too small, needs to be >= %"PRIu32""
pci_nvme_err_startfail(void) "setting controller enable bit failed"
+pci_nvme_err_invalid_mgmt_action(uint8_t action) "action=0x%"PRIx8""
# Traces for undefined behavior
pci_nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
@@ -158,6 +199,7 @@ pci_nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for
pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring"
pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring"
pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring"
+pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field"
# xen-block.c
xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s d%up%u"
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 5d6163ab70..970046f438 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -32,6 +32,9 @@
#include "hw/mem/nvdimm.h"
#include "migration/global_state.h"
#include "migration/vmstate.h"
+#include "exec/confidential-guest-support.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-pci.h"
GlobalProperty hw_compat_5_2[] = {};
const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2);
@@ -427,24 +430,37 @@ static char *machine_get_memory_encryption(Object *obj, Error **errp)
{
MachineState *ms = MACHINE(obj);
- return g_strdup(ms->memory_encryption);
+ if (ms->cgs) {
+ return g_strdup(object_get_canonical_path_component(OBJECT(ms->cgs)));
+ }
+
+ return NULL;
}
static void machine_set_memory_encryption(Object *obj, const char *value,
Error **errp)
{
- MachineState *ms = MACHINE(obj);
+ Object *cgs =
+ object_resolve_path_component(object_get_objects_root(), value);
+
+ if (!cgs) {
+ error_setg(errp, "No such memory encryption object '%s'", value);
+ return;
+ }
- g_free(ms->memory_encryption);
- ms->memory_encryption = g_strdup(value);
+ object_property_set_link(obj, "confidential-guest-support", cgs, errp);
+}
+static void machine_check_confidential_guest_support(const Object *obj,
+ const char *name,
+ Object *new_target,
+ Error **errp)
+{
/*
- * With memory encryption, the host can't see the real contents of RAM,
- * so there's no point in it trying to merge areas.
+ * So far the only constraint is that the target has the
+ * TYPE_CONFIDENTIAL_GUEST_SUPPORT interface, and that's checked
+ * by the QOM core
*/
- if (value) {
- machine_set_mem_merge(obj, false, errp);
- }
}
static bool machine_get_nvdimm(Object *obj, Error **errp)
@@ -844,6 +860,15 @@ static void machine_class_init(ObjectClass *oc, void *data)
object_class_property_set_description(oc, "suppress-vmdesc",
"Set on to disable self-describing migration");
+ object_class_property_add_link(oc, "confidential-guest-support",
+ TYPE_CONFIDENTIAL_GUEST_SUPPORT,
+ offsetof(MachineState, cgs),
+ machine_check_confidential_guest_support,
+ OBJ_PROP_LINK_STRONG);
+ object_class_property_set_description(oc, "confidential-guest-support",
+ "Set confidential guest scheme to support");
+
+ /* For compatibility */
object_class_property_add_str(oc, "memory-encryption",
machine_get_memory_encryption, machine_set_memory_encryption);
object_class_property_set_description(oc, "memory-encryption",
@@ -1166,6 +1191,26 @@ void machine_run_board_init(MachineState *machine)
cc->deprecation_note);
}
+ if (machine->cgs) {
+ /*
+ * With confidential guests, the host can't see the real
+ * contents of RAM, so there's no point in it trying to merge
+ * areas.
+ */
+ machine_set_mem_merge(OBJECT(machine), false, &error_abort);
+
+ /*
+ * Virtio devices can't count on directly accessing guest
+ * memory, so they need iommu_platform=on to use normal DMA
+ * mechanisms. That requires also disabling legacy virtio
+ * support for those virtio pci devices which allow it.
+ */
+ object_register_sugar_prop(TYPE_VIRTIO_PCI, "disable-legacy",
+ "on", true);
+ object_register_sugar_prop(TYPE_VIRTIO_DEVICE, "iommu_platform",
+ "on", false);
+ }
+
machine_class->init(machine);
phase_advance(PHASE_MACHINE_INITIALIZED);
}
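On the machine side, the legacy memory-encryption string is now sugar for the confidential-guest-support link property, so (taking SEV as the example this series documents) the following two spellings should be equivalent; values are illustrative:

    qemu-system-x86_64 -machine q35,confidential-guest-support=sev0 \
        -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1

    qemu-system-x86_64 -machine q35,memory-encryption=sev0 \
        -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1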
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 92e90ff013..11172214f1 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -38,6 +38,7 @@
#include "sysemu/sysemu.h"
#include "hw/block/flash.h"
#include "sysemu/kvm.h"
+#include "sysemu/sev.h"
#define FLASH_SECTOR_SIZE 4096
@@ -147,7 +148,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
PFlashCFI01 *system_flash;
MemoryRegion *flash_mem;
void *flash_ptr;
- int ret, flash_size;
+ int flash_size;
assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled);
@@ -191,16 +192,10 @@ static void pc_system_flash_map(PCMachineState *pcms,
flash_mem = pflash_cfi01_get_memory(system_flash);
pc_isa_bios_init(rom_memory, flash_mem, size);
- /* Encrypt the pflash boot ROM */
- if (kvm_memcrypt_enabled()) {
- flash_ptr = memory_region_get_ram_ptr(flash_mem);
- flash_size = memory_region_size(flash_mem);
- ret = kvm_memcrypt_encrypt_data(flash_ptr, flash_size);
- if (ret) {
- error_report("failed to encrypt pflash rom");
- exit(1);
- }
- }
+ /* Encrypt the pflash boot ROM, if necessary */
+ flash_ptr = memory_region_get_ram_ptr(flash_mem);
+ flash_size = memory_region_size(flash_mem);
+ sev_encrypt_flash(flash_ptr, flash_size, &error_fatal);
}
}
}
diff --git a/hw/intc/pnv_xive.c b/hw/intc/pnv_xive.c
index 5f69626b3a..ad43483612 100644
--- a/hw/intc/pnv_xive.c
+++ b/hw/intc/pnv_xive.c
@@ -24,6 +24,7 @@
#include "hw/ppc/xive_regs.h"
#include "hw/qdev-properties.h"
#include "hw/ppc/ppc.h"
+#include "trace.h"
#include <libfdt.h>
@@ -1319,6 +1320,8 @@ static void pnv_xive_ic_hw_trigger(PnvXive *xive, hwaddr addr, uint64_t val)
uint8_t blk;
uint32_t idx;
+ trace_pnv_xive_ic_hw_trigger(addr, val);
+
if (val & XIVE_TRIGGER_END) {
xive_error(xive, "IC: END trigger at @0x%"HWADDR_PRIx" data 0x%"PRIx64,
addr, val);
diff --git a/hw/intc/trace-events b/hw/intc/trace-events
index 8ed397a0d5..45ddaf48df 100644
--- a/hw/intc/trace-events
+++ b/hw/intc/trace-events
@@ -236,3 +236,6 @@ xive_tctx_tm_write(uint64_t offset, unsigned int size, uint64_t value) "@0x0x%"P
xive_tctx_tm_read(uint64_t offset, unsigned int size, uint64_t value) "@0x0x%"PRIx64" sz=%d val=0x%" PRIx64
xive_presenter_notify(uint8_t nvt_blk, uint32_t nvt_idx, uint8_t ring) "found NVT 0x%x/0x%x ring=0x%x"
xive_end_source_read(uint8_t end_blk, uint32_t end_idx, uint64_t addr) "END 0x%x/0x%x @0x0x%"PRIx64
+
+# pnv_xive.c
+pnv_xive_ic_hw_trigger(uint64_t addr, uint64_t val) "@0x%"PRIx64" val=0x%"PRIx64
diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index fa8c3d8287..eeb4e62ba9 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -1294,7 +1294,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon)
pq = xive_get_field32(END_W1_ESn, end->w1);
- monitor_printf(mon, " %08x %c%c %c%c%c%c%c%c%c prio:%d nvt:%02x/%04x",
+ monitor_printf(mon, " %08x %c%c %c%c%c%c%c%c%c%c prio:%d nvt:%02x/%04x",
end_idx,
pq & XIVE_ESB_VAL_P ? 'P' : '-',
pq & XIVE_ESB_VAL_Q ? 'Q' : '-',
@@ -1305,6 +1305,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon)
xive_end_is_escalate(end) ? 'e' : '-',
xive_end_is_uncond_escalation(end) ? 'u' : '-',
xive_end_is_silent_escalation(end) ? 's' : '-',
+ xive_end_is_firmware(end) ? 'f' : '-',
priority, nvt_blk, nvt_idx);
if (qaddr_base) {
diff --git a/hw/meson.build b/hw/meson.build
index 010de7219c..e615d72d4d 100644
--- a/hw/meson.build
+++ b/hw/meson.build
@@ -56,6 +56,7 @@ subdir('moxie')
subdir('nios2')
subdir('openrisc')
subdir('ppc')
+subdir('remote')
subdir('riscv')
subdir('rx')
subdir('s390x')
diff --git a/hw/misc/arm_integrator_debug.c b/hw/misc/arm_integrator_debug.c
index ec0d4b90d3..9a19727829 100644
--- a/hw/misc/arm_integrator_debug.c
+++ b/hw/misc/arm_integrator_debug.c
@@ -6,7 +6,7 @@
* to this area.
*
* The real h/w is described at:
- * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0159b/Babbfijf.html
+ * https://developer.arm.com/documentation/dui0159/b/peripherals-and-interfaces/debug-leds-and-dip-switch-interface
*
* Copyright (c) 2013 Alex Bennée <alex@bennee.com>
*
diff --git a/hw/misc/imx7_ccm.c b/hw/misc/imx7_ccm.c
index 02fc1ae8d0..075159e497 100644
--- a/hw/misc/imx7_ccm.c
+++ b/hw/misc/imx7_ccm.c
@@ -131,8 +131,16 @@ static const struct MemoryRegionOps imx7_set_clr_tog_ops = {
},
};
+static void imx7_digprog_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Guest write to read-only ANALOG_DIGPROG register\n");
+}
+
static const struct MemoryRegionOps imx7_digprog_ops = {
.read = imx7_set_clr_tog_read,
+ .write = imx7_digprog_write,
.endianness = DEVICE_NATIVE_ENDIAN,
.impl = {
.min_access_size = 4,
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index 0505b52c98..603e992a7f 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -495,7 +495,8 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
/* mmap the region and map into the BAR2 */
memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
- "ivshmem.bar2", size, true, fd, &local_err);
+ "ivshmem.bar2", size, true, fd, 0,
+ &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
diff --git a/hw/misc/tz-ppc.c b/hw/misc/tz-ppc.c
index 6431257b52..36495c68e7 100644
--- a/hw/misc/tz-ppc.c
+++ b/hw/misc/tz-ppc.c
@@ -196,7 +196,21 @@ static bool tz_ppc_dummy_accepts(void *opaque, hwaddr addr,
g_assert_not_reached();
}
+static uint64_t tz_ppc_dummy_read(void *opaque, hwaddr addr, unsigned size)
+{
+ g_assert_not_reached();
+}
+
+static void tz_ppc_dummy_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ g_assert_not_reached();
+}
+
static const MemoryRegionOps tz_ppc_dummy_ops = {
+ /* define r/w methods to avoid assert failure in memory_region_init_io */
+ .read = tz_ppc_dummy_read,
+ .write = tz_ppc_dummy_write,
.valid.accepts = tz_ppc_dummy_accepts,
};
diff --git a/hw/net/fsl_etsec/rings.c b/hw/net/fsl_etsec/rings.c
index 121415abfe..fe055d3381 100644
--- a/hw/net/fsl_etsec/rings.c
+++ b/hw/net/fsl_etsec/rings.c
@@ -502,7 +502,7 @@ ssize_t etsec_rx_ring_write(eTSEC *etsec, const uint8_t *buf, size_t size)
return -1;
}
- if ((etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
+ if (!(etsec->regs[RCTRL].value & RCTRL_RSF) && (size < 60)) {
/* CRC is not in the packet yet, so short frame is below 60 bytes */
RING_DEBUG("%s: Drop short frame\n", __func__);
return -1;
diff --git a/hw/nvram/nrf51_nvm.c b/hw/nvram/nrf51_nvm.c
index f2283c1a8d..7b3460d52d 100644
--- a/hw/nvram/nrf51_nvm.c
+++ b/hw/nvram/nrf51_nvm.c
@@ -273,6 +273,15 @@ static const MemoryRegionOps io_ops = {
.endianness = DEVICE_LITTLE_ENDIAN,
};
+static uint64_t flash_read(void *opaque, hwaddr offset, unsigned size)
+{
+ /*
+ * This is a rom_device MemoryRegion which is always in
+ * romd_mode (we never put it in MMIO mode), so reads always
+ * go directly to RAM and never come here.
+ */
+ g_assert_not_reached();
+}
static void flash_write(void *opaque, hwaddr offset, uint64_t value,
unsigned int size)
@@ -300,6 +309,7 @@ static void flash_write(void *opaque, hwaddr offset, uint64_t value,
static const MemoryRegionOps flash_ops = {
+ .read = flash_read,
.write = flash_write,
.valid.min_access_size = 4,
.valid.max_access_size = 4,
diff --git a/hw/pci-host/Kconfig b/hw/pci-host/Kconfig
index eb03f0489d..8b8c763c28 100644
--- a/hw/pci-host/Kconfig
+++ b/hw/pci-host/Kconfig
@@ -65,3 +65,6 @@ config PCI_POWERNV
select PCI_EXPRESS
select MSI_NONBROKEN
select PCIE_PORT
+
+config REMOTE_PCIHOST
+ bool
diff --git a/hw/pci-host/designware.c b/hw/pci-host/designware.c
index f9fb97a3e3..bde3a343a2 100644
--- a/hw/pci-host/designware.c
+++ b/hw/pci-host/designware.c
@@ -21,6 +21,7 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/module.h"
+#include "qemu/log.h"
#include "hw/pci/msi.h"
#include "hw/pci/pci_bridge.h"
#include "hw/pci/pci_host.h"
@@ -63,6 +64,23 @@ designware_pcie_root_to_host(DesignwarePCIERoot *root)
return DESIGNWARE_PCIE_HOST(bus->parent);
}
+static uint64_t designware_pcie_root_msi_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ /*
+ * Attempts to read from the MSI address are undefined in
+ * the PCI specifications. For this hardware, the datasheet
+ * specifies that a read from the magic address is simply not
+ * intercepted by the MSI controller, and will go out to the
+ * AHB/AXI bus like any other PCI-device-initiated DMA read.
+ * This is not trivial to implement in QEMU, so since
+ * well-behaved guests won't ever ask a PCI device to DMA from
+ * this address we just log the missing functionality.
+ */
+ qemu_log_mask(LOG_UNIMP, "%s not implemented\n", __func__);
+ return 0;
+}
+
static void designware_pcie_root_msi_write(void *opaque, hwaddr addr,
uint64_t val, unsigned len)
{
@@ -77,6 +95,7 @@ static void designware_pcie_root_msi_write(void *opaque, hwaddr addr,
}
static const MemoryRegionOps designware_pci_host_msi_ops = {
+ .read = designware_pcie_root_msi_read,
.write = designware_pcie_root_msi_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.valid = {
diff --git a/hw/pci-host/meson.build b/hw/pci-host/meson.build
index da9d1a9964..1847c69905 100644
--- a/hw/pci-host/meson.build
+++ b/hw/pci-host/meson.build
@@ -9,6 +9,7 @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c'))
pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c'))
pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c'))
pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c'))
+pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c'))
# PPC devices
pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c'))
diff --git a/hw/pci-host/pnv_phb4.c b/hw/pci-host/pnv_phb4.c
index 6328e985f8..54f57c660a 100644
--- a/hw/pci-host/pnv_phb4.c
+++ b/hw/pci-host/pnv_phb4.c
@@ -22,6 +22,7 @@
#include "hw/irq.h"
#include "hw/qdev-properties.h"
#include "qom/object.h"
+#include "trace.h"
#define phb_error(phb, fmt, ...) \
qemu_log_mask(LOG_GUEST_ERROR, "phb4[%d:%d]: " fmt "\n", \
@@ -1257,6 +1258,8 @@ static void pnv_phb4_xive_notify(XiveNotifier *xf, uint32_t srcno)
uint64_t data = XIVE_TRIGGER_PQ | offset | srcno;
MemTxResult result;
+ trace_pnv_phb4_xive_notify(notif_port, data);
+
address_space_stq_be(&address_space_memory, notif_port, data,
MEMTXATTRS_UNSPECIFIED, &result);
if (result != MEMTX_OK) {
diff --git a/hw/pci-host/prep.c b/hw/pci-host/prep.c
index 0469db8c1d..0a9162fba9 100644
--- a/hw/pci-host/prep.c
+++ b/hw/pci-host/prep.c
@@ -27,6 +27,7 @@
#include "qemu-common.h"
#include "qemu/datadir.h"
#include "qemu/units.h"
+#include "qemu/log.h"
#include "qapi/error.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_bus.h"
@@ -121,8 +122,15 @@ static uint64_t raven_intack_read(void *opaque, hwaddr addr,
return pic_read_irq(isa_pic);
}
+static void raven_intack_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ qemu_log_mask(LOG_UNIMP, "%s not implemented\n", __func__);
+}
+
static const MemoryRegionOps raven_intack_ops = {
.read = raven_intack_read,
+ .write = raven_intack_write,
.valid = {
.max_access_size = 1,
},
diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c
new file mode 100644
index 0000000000..eee45444ef
--- /dev/null
+++ b/hw/pci-host/remote.c
@@ -0,0 +1,75 @@
+/*
+ * Remote PCI host device
+ *
+ * Unlike PCI host devices that model physical hardware, the purpose
+ * of this PCI host is to host multi-process QEMU devices.
+ *
+ * Multi-process QEMU extends the PCI host of a QEMU machine into a
+ * remote process. Any PCI device attached to the remote process is
+ * visible in the QEMU guest. This allows existing QEMU device models
+ * to be reused in the remote process.
+ *
+ * This PCI host is purely a container for PCI devices. It's fake in the
+ * sense that the guest never sees this PCI host and has no way of
+ * accessing it. Its job is just to provide the environment that QEMU
+ * PCI device models need when running in a remote process.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_host.h"
+#include "hw/qdev-properties.h"
+#include "hw/pci-host/remote.h"
+#include "exec/memory.h"
+
+static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge,
+ PCIBus *rootbus)
+{
+ return "0000:00";
+}
+
+static void remote_pcihost_realize(DeviceState *dev, Error **errp)
+{
+ PCIHostState *pci = PCI_HOST_BRIDGE(dev);
+ RemotePCIHost *s = REMOTE_PCIHOST(dev);
+
+ pci->bus = pci_root_bus_new(DEVICE(s), "remote-pci",
+ s->mr_pci_mem, s->mr_sys_io,
+ 0, TYPE_PCIE_BUS);
+}
+
+static void remote_pcihost_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
+
+ hc->root_bus_path = remote_pcihost_root_bus_path;
+ dc->realize = remote_pcihost_realize;
+
+ dc->user_creatable = false;
+ set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+ dc->fw_name = "pci";
+}
+
+static const TypeInfo remote_pcihost_info = {
+ .name = TYPE_REMOTE_PCIHOST,
+ .parent = TYPE_PCIE_HOST_BRIDGE,
+ .instance_size = sizeof(RemotePCIHost),
+ .class_init = remote_pcihost_class_init,
+};
+
+static void remote_pcihost_register(void)
+{
+ type_register_static(&remote_pcihost_info);
+}
+
+type_init(remote_pcihost_register)
diff --git a/hw/pci-host/trace-events b/hw/pci-host/trace-events
index d19ca9aef6..7d8063ac42 100644
--- a/hw/pci-host/trace-events
+++ b/hw/pci-host/trace-events
@@ -20,3 +20,6 @@ unin_data_write(uint64_t addr, unsigned len, uint64_t val) "write addr 0x%"PRIx6
unin_data_read(uint64_t addr, unsigned len, uint64_t val) "read addr 0x%"PRIx64 " len %d val 0x%"PRIx64
unin_write(uint64_t addr, uint64_t value) "addr=0x%" PRIx64 " val=0x%"PRIx64
unin_read(uint64_t addr, uint64_t value) "addr=0x%" PRIx64 " val=0x%"PRIx64
+
+# pnv_phb4.c
+pnv_phb4_xive_notify(uint64_t notif_port, uint64_t data) "notif=@0x%"PRIx64" data=0x%"PRIx64
diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index c64b5d08bd..01517a6c6c 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -74,6 +74,8 @@
#define MPC8544_I2C_IRQ 43
#define RTC_REGS_OFFSET 0x68
+#define PLATFORM_CLK_FREQ_HZ (400 * 1000 * 1000)
+
struct boot_info
{
uint32_t dt_base;
@@ -124,7 +126,7 @@ static void dt_serial_create(void *fdt, unsigned long long offset,
qemu_fdt_setprop_string(fdt, ser, "compatible", "ns16550");
qemu_fdt_setprop_cells(fdt, ser, "reg", offset, 0x100);
qemu_fdt_setprop_cell(fdt, ser, "cell-index", idx);
- qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", 0);
+ qemu_fdt_setprop_cell(fdt, ser, "clock-frequency", PLATFORM_CLK_FREQ_HZ);
qemu_fdt_setprop_cells(fdt, ser, "interrupts", 42, 2);
qemu_fdt_setprop_phandle(fdt, ser, "interrupt-parent", mpic);
qemu_fdt_setprop_string(fdt, "/aliases", alias, ser);
@@ -320,8 +322,8 @@ static int ppce500_load_device_tree(PPCE500MachineState *pms,
int fdt_size;
void *fdt;
uint8_t hypercall[16];
- uint32_t clock_freq = 400000000;
- uint32_t tb_freq = 400000000;
+ uint32_t clock_freq = PLATFORM_CLK_FREQ_HZ;
+ uint32_t tb_freq = PLATFORM_CLK_FREQ_HZ;
int i;
char compatible_sb[] = "fsl,mpc8544-immr\0simple-bus";
char *soc;
@@ -890,7 +892,7 @@ void ppce500_init(MachineState *machine)
env->spr_cb[SPR_BOOKE_PIR].default_value = cs->cpu_index = i;
env->mpic_iack = pmc->ccsrbar_base + MPC8544_MPIC_REGS_OFFSET + 0xa0;
- ppc_booke_timers_init(cpu, 400000000, PPC_TIMER_E500);
+ ppc_booke_timers_init(cpu, PLATFORM_CLK_FREQ_HZ, PPC_TIMER_E500);
/* Register reset handler */
if (!i) {
diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build
index ffa2ec37fa..218631c883 100644
--- a/hw/ppc/meson.build
+++ b/hw/ppc/meson.build
@@ -27,6 +27,7 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files(
'spapr_nvdimm.c',
'spapr_rtas_ddw.c',
'spapr_numa.c',
+ 'pef.c',
))
ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c'))
ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files(
diff --git a/hw/ppc/pef.c b/hw/ppc/pef.c
new file mode 100644
index 0000000000..573be3ed79
--- /dev/null
+++ b/hw/ppc/pef.c
@@ -0,0 +1,140 @@
+/*
+ * PEF (Protected Execution Facility) for POWER support
+ *
+ * Copyright Red Hat.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/kvm.h"
+#include "migration/blocker.h"
+#include "exec/confidential-guest-support.h"
+#include "hw/ppc/pef.h"
+
+#define TYPE_PEF_GUEST "pef-guest"
+OBJECT_DECLARE_SIMPLE_TYPE(PefGuest, PEF_GUEST)
+
+typedef struct PefGuest PefGuest;
+typedef struct PefGuestClass PefGuestClass;
+
+struct PefGuestClass {
+ ConfidentialGuestSupportClass parent_class;
+};
+
+/**
+ * PefGuest:
+ *
+ * The PefGuest object is used for creating and managing a PEF
+ * guest.
+ *
+ * # $QEMU \
+ * -object pef-guest,id=pef0 \
+ * -machine ...,confidential-guest-support=pef0
+ */
+struct PefGuest {
+ ConfidentialGuestSupport parent_obj;
+};
+
+static int kvmppc_svm_init(Error **errp)
+{
+#ifdef CONFIG_KVM
+ static Error *pef_mig_blocker;
+
+ if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_SECURE_GUEST)) {
+ error_setg(errp,
+ "KVM implementation does not support Secure VMs (is an ultravisor running?)");
+ return -1;
+ } else {
+ int ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_SECURE_GUEST, 0, 1);
+
+ if (ret < 0) {
+ error_setg(errp,
+ "Error enabling PEF with KVM");
+ return -1;
+ }
+ }
+
+ /* add migration blocker */
+ error_setg(&pef_mig_blocker, "PEF: Migration is not implemented");
+ /* NB: This can fail if --only-migratable is used */
+ migrate_add_blocker(pef_mig_blocker, &error_fatal);
+
+ return 0;
+#else
+ g_assert_not_reached();
+#endif
+}
+
+/*
+ * Don't set an error if the KVM_PPC_SVM_OFF ioctl is invoked on kernels
+ * that don't support it.
+ */
+static int kvmppc_svm_off(Error **errp)
+{
+#ifdef CONFIG_KVM
+ int rc;
+
+ rc = kvm_vm_ioctl(KVM_STATE(current_accel()), KVM_PPC_SVM_OFF);
+ if (rc && rc != -ENOTTY) {
+ error_setg_errno(errp, -rc, "KVM_PPC_SVM_OFF ioctl failed");
+ return rc;
+ }
+ return 0;
+#else
+ g_assert_not_reached();
+#endif
+}
+
+int pef_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
+{
+ if (!object_dynamic_cast(OBJECT(cgs), TYPE_PEF_GUEST)) {
+ return 0;
+ }
+
+ if (!kvm_enabled()) {
+ error_setg(errp, "PEF requires KVM");
+ return -1;
+ }
+
+ return kvmppc_svm_init(errp);
+}
+
+int pef_kvm_reset(ConfidentialGuestSupport *cgs, Error **errp)
+{
+ if (!object_dynamic_cast(OBJECT(cgs), TYPE_PEF_GUEST)) {
+ return 0;
+ }
+
+ /*
+ * If we don't have KVM we should never have been able to
+ * initialize PEF, so we should never get this far
+ */
+ assert(kvm_enabled());
+
+ return kvmppc_svm_off(errp);
+}
+
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(PefGuest,
+ pef_guest,
+ PEF_GUEST,
+ CONFIDENTIAL_GUEST_SUPPORT,
+ { TYPE_USER_CREATABLE },
+ { NULL })
+
+static void pef_guest_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void pef_guest_init(Object *obj)
+{
+}
+
+static void pef_guest_finalize(Object *obj)
+{
+}
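
Putting the PefGuest doc comment above into a concrete invocation (pseries/KVM per the checks in this file; all other options elided):

    $QEMU -machine pseries,accel=kvm,confidential-guest-support=pef0 \
          -object pef-guest,id=pef0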
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index 14fc9758a9..77af846cdf 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -21,6 +21,7 @@
#include "qemu-common.h"
#include "qemu/datadir.h"
#include "qemu/units.h"
+#include "qemu/cutils.h"
#include "qapi/error.h"
#include "sysemu/qtest.h"
#include "sysemu/sysemu.h"
@@ -65,9 +66,9 @@
#define FW_MAX_SIZE (16 * MiB)
#define KERNEL_LOAD_ADDR 0x20000000
-#define KERNEL_MAX_SIZE (256 * MiB)
-#define INITRD_LOAD_ADDR 0x60000000
-#define INITRD_MAX_SIZE (256 * MiB)
+#define KERNEL_MAX_SIZE (128 * MiB)
+#define INITRD_LOAD_ADDR 0x28000000
+#define INITRD_MAX_SIZE (128 * MiB)
static const char *pnv_chip_core_typename(const PnvChip *o)
{
@@ -725,8 +726,11 @@ static void pnv_init(MachineState *machine)
DeviceState *dev;
/* allocate RAM */
- if (machine->ram_size < (1 * GiB)) {
- warn_report("skiboot may not work with < 1GB of RAM");
+ if (machine->ram_size < mc->default_ram_size) {
+ char *sz = size_to_str(mc->default_ram_size);
+ error_report("Invalid RAM size, should be bigger than %s", sz);
+ g_free(sz);
+ exit(EXIT_FAILURE);
}
memory_region_add_subregion(get_system_memory(), 0, machine->ram);
@@ -872,6 +876,14 @@ static void pnv_init(MachineState *machine)
}
/*
+ * The PNOR is mapped into the LPC FW address space by the BMC.
+ * Since we cannot reach the remote BMC machine with LPC memops,
+ * always map it for now.
+ */
+ memory_region_add_subregion(pnv->chips[0]->fw_mr, PNOR_SPI_OFFSET,
+ &pnv->pnor->mmio);
+
+ /*
* OpenPOWER systems use a IPMI SEL Event message to notify the
* host to powerdown
*/
@@ -1150,6 +1162,7 @@ static void pnv_chip_power8_realize(DeviceState *dev, Error **errp)
qdev_realize(DEVICE(&chip8->lpc), NULL, &error_fatal);
pnv_xscom_add_subregion(chip, PNV_XSCOM_LPC_BASE, &chip8->lpc.xscom_regs);
+ chip->fw_mr = &chip8->lpc.isa_fw;
chip->dt_isa_nodename = g_strdup_printf("/xscom@%" PRIx64 "/isa@%x",
(uint64_t) PNV_XSCOM_BASE(chip),
PNV_XSCOM_LPC_BASE);
@@ -1479,6 +1492,7 @@ static void pnv_chip_power9_realize(DeviceState *dev, Error **errp)
memory_region_add_subregion(get_system_memory(), PNV9_LPCM_BASE(chip),
&chip9->lpc.xscom_regs);
+ chip->fw_mr = &chip9->lpc.isa_fw;
chip->dt_isa_nodename = g_strdup_printf("/lpcm-opb@%" PRIx64 "/lpc@0",
(uint64_t) PNV9_LPCM_BASE(chip));
@@ -1592,6 +1606,7 @@ static void pnv_chip_power10_realize(DeviceState *dev, Error **errp)
memory_region_add_subregion(get_system_memory(), PNV10_LPCM_BASE(chip),
&chip10->lpc.xscom_regs);
+ chip->fw_mr = &chip10->lpc.isa_fw;
chip->dt_isa_nodename = g_strdup_printf("/lpcm-opb@%" PRIx64 "/lpc@0",
(uint64_t) PNV10_LPCM_BASE(chip));
}
@@ -1983,7 +1998,7 @@ static void pnv_machine_class_init(ObjectClass *oc, void *data)
* RAM defaults to less than 2048 for 32-bit hosts, and large
* enough to fit the maximum initrd size at its load address
*/
- mc->default_ram_size = INITRD_LOAD_ADDR + INITRD_MAX_SIZE;
+ mc->default_ram_size = 1 * GiB;
mc->default_ram_id = "pnv.ram";
ispc->print_info = pnv_pic_print_info;
nc->nmi_monitor_handler = pnv_nmi;
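
A quick check of the new layout: the kernel window now spans 0x20000000-0x28000000 (128 MiB), so INITRD_LOAD_ADDR = 0x28000000 begins exactly where the kernel window ends, and the initrd window ends at 0x30000000 (768 MiB), comfortably inside the 1 GiB default/minimum RAM size that pnv_init() now enforces.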
diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c
index 67ebb16c4d..b9bf5735ea 100644
--- a/hw/ppc/pnv_bmc.c
+++ b/hw/ppc/pnv_bmc.c
@@ -51,6 +51,11 @@ typedef struct OemSel {
#define SOFT_OFF 0x00
#define SOFT_REBOOT 0x01
+static bool pnv_bmc_is_simulator(IPMIBmc *bmc)
+{
+ return object_dynamic_cast(OBJECT(bmc), TYPE_IPMI_BMC_SIMULATOR);
+}
+
static void pnv_gen_oem_sel(IPMIBmc *bmc, uint8_t reboot)
{
/* IPMI SEL Event are 16 bytes long */
@@ -79,6 +84,10 @@ void pnv_dt_bmc_sensors(IPMIBmc *bmc, void *fdt)
const struct ipmi_sdr_compact *sdr;
uint16_t nextrec;
+ if (!pnv_bmc_is_simulator(bmc)) {
+ return;
+ }
+
offset = fdt_add_subnode(fdt, 0, "bmc");
_FDT(offset);
@@ -243,6 +252,10 @@ static const IPMINetfn hiomap_netfn = {
void pnv_bmc_set_pnor(IPMIBmc *bmc, PnvPnor *pnor)
{
+ if (!pnv_bmc_is_simulator(bmc)) {
+ return;
+ }
+
object_ref(OBJECT(pnor));
object_property_add_const_link(OBJECT(bmc), "pnor", OBJECT(pnor));
@@ -260,13 +273,8 @@ IPMIBmc *pnv_bmc_create(PnvPnor *pnor)
Object *obj;
obj = object_new(TYPE_IPMI_BMC_SIMULATOR);
- object_ref(OBJECT(pnor));
- object_property_add_const_link(obj, "pnor", OBJECT(pnor));
qdev_realize(DEVICE(obj), NULL, &error_fatal);
-
- /* Install the HIOMAP protocol handlers to access the PNOR */
- ipmi_sim_register_netfn(IPMI_BMC_SIMULATOR(obj), IPMI_NETFN_OEM,
- &hiomap_netfn);
+ pnv_bmc_set_pnor(IPMI_BMC(obj), pnor);
return IPMI_BMC(obj);
}
@@ -291,7 +299,7 @@ static int bmc_find(Object *child, void *opaque)
IPMIBmc *pnv_bmc_find(Error **errp)
{
- ForeachArgs args = { TYPE_IPMI_BMC_SIMULATOR, NULL };
+ ForeachArgs args = { TYPE_IPMI_BMC, NULL };
int ret;
ret = object_child_foreach_recursive(object_get_root(), bmc_find, &args);
diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c
index 5903590220..bcbca3db97 100644
--- a/hw/ppc/pnv_lpc.c
+++ b/hw/ppc/pnv_lpc.c
@@ -824,8 +824,6 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool use_cpld, Error **errp)
ISABus *isa_bus;
qemu_irq *irqs;
qemu_irq_handler handler;
- PnvMachineState *pnv = PNV_MACHINE(qdev_get_machine());
- bool hostboot_mode = !!pnv->fw_load_addr;
/* let isa_bus_new() create its own bridge on SysBus otherwise
* devices specified on the command line won't find the bus and
@@ -851,18 +849,5 @@ ISABus *pnv_lpc_isa_create(PnvLpcController *lpc, bool use_cpld, Error **errp)
isa_bus_irqs(isa_bus, irqs);
- /*
- * TODO: Map PNOR on the LPC FW address space on demand ?
- */
- memory_region_add_subregion(&lpc->isa_fw, PNOR_SPI_OFFSET,
- &pnv->pnor->mmio);
- /*
- * Start disabled. The HIOMAP protocol will activate the mapping
- * with HIOMAP_C_CREATE_WRITE_WINDOW
- */
- if (!hostboot_mode) {
- memory_region_set_enabled(&pnv->pnor->mmio, false);
- }
-
return isa_bus;
}
diff --git a/hw/ppc/prep_systemio.c b/hw/ppc/prep_systemio.c
index 4e48ef245c..b2bd783248 100644
--- a/hw/ppc/prep_systemio.c
+++ b/hw/ppc/prep_systemio.c
@@ -23,6 +23,7 @@
*/
#include "qemu/osdep.h"
+#include "qemu/log.h"
#include "hw/irq.h"
#include "hw/isa/isa.h"
#include "hw/qdev-properties.h"
@@ -235,8 +236,15 @@ static uint64_t ppc_parity_error_readl(void *opaque, hwaddr addr,
return val;
}
+static void ppc_parity_error_writel(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
+}
+
static const MemoryRegionOps ppc_parity_error_ops = {
.read = ppc_parity_error_readl,
+ .write = ppc_parity_error_writel,
.valid = {
.min_access_size = 4,
.max_access_size = 4,
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 6c47466fc2..85fe65f894 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -83,6 +83,7 @@
#include "hw/ppc/spapr_tpm_proxy.h"
#include "hw/ppc/spapr_nvdimm.h"
#include "hw/ppc/spapr_numa.h"
+#include "hw/ppc/pef.h"
#include "monitor/monitor.h"
@@ -295,15 +296,6 @@ static hwaddr spapr_node0_size(MachineState *machine)
return machine->ram_size;
}
-bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr)
-{
- MachineState *machine = MACHINE(spapr);
- SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
-
- return smc->pre_5_2_numa_associativity ||
- machine->numa_state->num_nodes <= 1;
-}
-
static void add_str(GString *s, const gchar *s1)
{
g_string_append_len(s, s1, strlen(s1) + 1);
@@ -790,7 +782,6 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
CPUState *cs;
int n_cpus;
int cpus_offset;
- char *nodename;
int i;
cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
@@ -818,6 +809,7 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
PowerPCCPU *cpu = POWERPC_CPU(cs);
int index = spapr_get_vcpu_id(cpu);
DeviceClass *dc = DEVICE_GET_CLASS(cs);
+ g_autofree char *nodename = NULL;
int offset;
if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
@@ -826,7 +818,6 @@ static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
offset = fdt_add_subnode(fdt, cpus_offset, nodename);
- g_free(nodename);
_FDT(offset);
spapr_dt_cpu(cs, fdt, offset, spapr);
}
@@ -1574,7 +1565,7 @@ static void spapr_machine_reset(MachineState *machine)
void *fdt;
int rc;
- kvmppc_svm_off(&error_fatal);
+ pef_kvm_reset(machine->cgs, &error_fatal);
spapr_caps_apply(spapr);
first_ppc_cpu = POWERPC_CPU(first_cpu);
@@ -2658,6 +2649,11 @@ static void spapr_machine_init(MachineState *machine)
char *filename;
Error *resize_hpt_err = NULL;
+ /*
+ * If Secure VM (PEF) support is configured, initialize it.
+ */
+ pef_kvm_init(machine->cgs, &error_fatal);
+
msi_nonbroken = true;
QLIST_INIT(&spapr->phbs);
@@ -2774,16 +2770,7 @@ static void spapr_machine_init(MachineState *machine)
}
- /*
- * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
- * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
- * called from vPHB reset handler so we initialize the counter here.
- * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
- * must be equally distant from any other node.
- * The final value of spapr->gpu_numa_id is going to be written to
- * max-associativity-domains in spapr_build_fdt().
- */
- spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes);
+ spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
/* Init numa_assoc_array */
spapr_numa_associativity_init(spapr, machine);
@@ -3049,6 +3036,7 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
SCSIDevice *d = CAST(SCSIDevice, dev, TYPE_SCSI_DEVICE);
SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
+ PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
if (d) {
void *spapr = CAST(void, bus->parent, "spapr-vscsi");
@@ -3122,6 +3110,10 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
}
+ if (pcidev) {
+ return spapr_pci_fw_dev_name(pcidev);
+ }
+
return NULL;
}
@@ -3743,15 +3735,27 @@ int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
PowerPCCPU *cpu = POWERPC_CPU(cs);
DeviceClass *dc = DEVICE_GET_CLASS(cs);
int id = spapr_get_vcpu_id(cpu);
- char *nodename;
+ g_autofree char *nodename = NULL;
int offset;
nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
offset = fdt_add_subnode(fdt, 0, nodename);
- g_free(nodename);
spapr_dt_cpu(cs, fdt, offset, spapr);
+ /*
+ * spapr_dt_cpu() does not fill the 'name' property in the
+ * CPU node. The function is called during the boot process, before
+ * and after CAS, and overwriting the 'name' property written
+ * by SLOF is not allowed.
+ *
+ * Write it manually after spapr_dt_cpu(). This makes hotplugged
+ * CPUs consistent with coldplugged ones, which do have the
+ * 'name' property. The Linux kernel also relies on this
+ * property to identify CPU nodes.
+ */
+ _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
+
*fdt_start_offset = offset;
return 0;
}
diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index b50796bbe3..779f18b994 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -19,6 +19,15 @@
/* Moved from hw/ppc/spapr_pci_nvlink2.c */
#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1))
+static bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr)
+{
+ MachineState *machine = MACHINE(spapr);
+ SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
+
+ return smc->pre_5_2_numa_associativity ||
+ machine->numa_state->num_nodes <= 1;
+}
+
static bool spapr_numa_is_symmetrical(MachineState *ms)
{
int src, dst;
@@ -38,6 +47,20 @@ static bool spapr_numa_is_symmetrical(MachineState *ms)
}
/*
+ * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
+ * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
+ * called from vPHB reset handler so we initialize the counter here.
+ * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
+ * must be equally distant from any other node.
+ * The final value of spapr->gpu_numa_id is going to be written to
+ * max-associativity-domains in spapr_build_fdt().
+ */
+unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine)
+{
+ return MAX(1, machine->numa_state->num_nodes);
+}
+
+/*
* This function will translate the user distances into
* what the kernel understand as possible values: 10
* (local distance), 20, 40, 80 and 160, and return the equivalent
@@ -288,6 +311,8 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
{
MachineState *ms = MACHINE(spapr);
SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+ uint32_t number_nvgpus_nodes = spapr->gpu_numa_id -
+ spapr_numa_initial_nvgpu_numa_id(ms);
uint32_t refpoints[] = {
cpu_to_be32(0x4),
cpu_to_be32(0x3),
@@ -295,7 +320,7 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
cpu_to_be32(0x1),
};
uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
- uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id;
+ uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes;
uint32_t maxdomains[] = {
cpu_to_be32(4),
cpu_to_be32(maxdomain),
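
A worked example of the maxdomain fix: with 2 user NUMA nodes and 3 NVGPUs, spapr_numa_initial_nvgpu_numa_id() returns 2, the vPHB reset handler bumps gpu_numa_id once per GPU to 5, so number_nvgpus_nodes = 5 - 2 = 3 and maxdomain = 2 + 3 = 5. The old expression num_nodes + gpu_numa_id would have yielded 7, overstating the domain count by the initial id.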
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 76d7c91e9c..f1c7479816 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -53,6 +53,7 @@
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "hw/ppc/spapr_numa.h"
+#include "qemu/log.h"
/* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
#define RTAS_QUERY_FN 0
@@ -739,6 +740,12 @@ static PCIINTxRoute spapr_route_intx_pin_to_irq(void *opaque, int pin)
return route;
}
+static uint64_t spapr_msi_read(void *opaque, hwaddr addr, unsigned size)
+{
+ qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
+ return 0;
+}
+
/*
* MSI/MSIX memory region implementation.
* The handler handles both MSI and MSIX.
@@ -756,8 +763,11 @@ static void spapr_msi_write(void *opaque, hwaddr addr,
}
static const MemoryRegionOps spapr_msi_ops = {
- /* There is no .read as the read result is undefined by PCI spec */
- .read = NULL,
+ /*
+ * The .read result is undefined by the PCI spec, but a .read
+ * method is required to avoid an assert failure in memory_region_init_io().
+ */
+ .read = spapr_msi_read,
.write = spapr_msi_write,
.endianness = DEVICE_LITTLE_ENDIAN
};
@@ -1334,15 +1344,29 @@ static int spapr_dt_pci_bus(SpaprPhbState *sphb, PCIBus *bus,
return offset;
}
+char *spapr_pci_fw_dev_name(PCIDevice *dev)
+{
+ const gchar *basename;
+ int slot = PCI_SLOT(dev->devfn);
+ int func = PCI_FUNC(dev->devfn);
+ uint32_t ccode = pci_default_read_config(dev, PCI_CLASS_PROG, 3);
+
+ basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff,
+ ccode & 0xff);
+
+ if (func != 0) {
+ return g_strdup_printf("%s@%x,%x", basename, slot, func);
+ } else {
+ return g_strdup_printf("%s@%x", basename, slot);
+ }
+}
+
/* create OF node for pci device and required OF DT properties */
static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev,
void *fdt, int parent_offset)
{
int offset;
- const gchar *basename;
- gchar *nodename;
- int slot = PCI_SLOT(dev->devfn);
- int func = PCI_FUNC(dev->devfn);
+ g_autofree gchar *nodename = spapr_pci_fw_dev_name(dev);
PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
ResourceProps rp;
SpaprDrc *drc = drc_from_dev(sphb, dev);
@@ -1359,19 +1383,8 @@ static int spapr_dt_pci_device(SpaprPhbState *sphb, PCIDevice *dev,
uint32_t pci_status = pci_default_read_config(dev, PCI_STATUS, 2);
gchar *loc_code;
- basename = dt_name_from_class((ccode >> 16) & 0xff, (ccode >> 8) & 0xff,
- ccode & 0xff);
-
- if (func != 0) {
- nodename = g_strdup_printf("%s@%x,%x", basename, slot, func);
- } else {
- nodename = g_strdup_printf("%s@%x", basename, slot);
- }
-
_FDT(offset = fdt_add_subnode(fdt, parent_offset, nodename));
- g_free(nodename);
-
/* in accordance with PAPR+ v2.7 13.6.3, Table 181 */
_FDT(fdt_setprop_cell(fdt, offset, "vendor-id", vendor_id));
_FDT(fdt_setprop_cell(fdt, offset, "device-id", device_id));
@@ -2173,6 +2186,16 @@ static int spapr_pci_pre_save(void *opaque)
return 0;
}
+static int spapr_pci_post_save(void *opaque)
+{
+ SpaprPhbState *sphb = opaque;
+
+ g_free(sphb->msi_devs);
+ sphb->msi_devs = NULL;
+ sphb->msi_devs_num = 0;
+ return 0;
+}
+
static int spapr_pci_post_load(void *opaque, int version_id)
{
SpaprPhbState *sphb = opaque;
@@ -2205,6 +2228,7 @@ static const VMStateDescription vmstate_spapr_pci = {
.version_id = 2,
.minimum_version_id = 2,
.pre_save = spapr_pci_pre_save,
+ .post_save = spapr_pci_post_save,
.post_load = spapr_pci_post_load,
.fields = (VMStateField[]) {
VMSTATE_UINT64_EQUAL(buid, SpaprPhbState, NULL),
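
For illustration, spapr_pci_fw_dev_name() on a network-class device in slot 2 would return "ethernet@2" for function 0 and "ethernet@2,1" for function 1 (assuming dt_name_from_class() maps the network class code to "ethernet", as the PAPR naming convention suggests).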
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
new file mode 100644
index 0000000000..08c16e235f
--- /dev/null
+++ b/hw/remote/Kconfig
@@ -0,0 +1,4 @@
+config MULTIPROCESS
+ bool
+ depends on PCI && PCI_EXPRESS && KVM
+ select REMOTE_PCIHOST
diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c
new file mode 100644
index 0000000000..e4ff131a6b
--- /dev/null
+++ b/hw/remote/iohub.c
@@ -0,0 +1,119 @@
+/*
+ * Remote IO Hub
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_ids.h"
+#include "hw/pci/pci_bus.h"
+#include "qemu/thread.h"
+#include "hw/boards.h"
+#include "hw/remote/machine.h"
+#include "hw/remote/iohub.h"
+#include "qemu/main-loop.h"
+
+void remote_iohub_init(RemoteIOHubState *iohub)
+{
+ int pirq;
+
+ memset(&iohub->irqfds, 0, sizeof(iohub->irqfds));
+ memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds));
+
+ for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) {
+ qemu_mutex_init(&iohub->irq_level_lock[pirq]);
+ iohub->irq_level[pirq] = 0;
+ event_notifier_init_fd(&iohub->irqfds[pirq], -1);
+ event_notifier_init_fd(&iohub->resamplefds[pirq], -1);
+ }
+}
+
+void remote_iohub_finalize(RemoteIOHubState *iohub)
+{
+ int pirq;
+
+ for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) {
+ qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]),
+ NULL, NULL, NULL);
+ event_notifier_cleanup(&iohub->irqfds[pirq]);
+ event_notifier_cleanup(&iohub->resamplefds[pirq]);
+ qemu_mutex_destroy(&iohub->irq_level_lock[pirq]);
+ }
+}
+
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx)
+{
+ return pci_dev->devfn;
+}
+
+void remote_iohub_set_irq(void *opaque, int pirq, int level)
+{
+ RemoteIOHubState *iohub = opaque;
+
+ assert(pirq >= 0);
+ assert(pirq < PCI_DEVFN_MAX);
+
+ QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+ if (level) {
+ if (++iohub->irq_level[pirq] == 1) {
+ event_notifier_set(&iohub->irqfds[pirq]);
+ }
+ } else if (iohub->irq_level[pirq] > 0) {
+ iohub->irq_level[pirq]--;
+ }
+}
+
+static void intr_resample_handler(void *opaque)
+{
+ ResampleToken *token = opaque;
+ RemoteIOHubState *iohub = token->iohub;
+ int pirq, s;
+
+ pirq = token->pirq;
+
+ s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]);
+
+ assert(s >= 0);
+
+ QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+ if (iohub->irq_level[pirq]) {
+ event_notifier_set(&iohub->irqfds[pirq]);
+ }
+}
+
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg)
+{
+ RemoteMachineState *machine = REMOTE_MACHINE(current_machine);
+ RemoteIOHubState *iohub = &machine->iohub;
+ int pirq, intx;
+
+ intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+ pirq = remote_iohub_map_irq(pci_dev, intx);
+
+ if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) {
+ qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]),
+ NULL, NULL, NULL);
+ event_notifier_cleanup(&iohub->irqfds[pirq]);
+ event_notifier_cleanup(&iohub->resamplefds[pirq]);
+ memset(&iohub->token[pirq], 0, sizeof(ResampleToken));
+ }
+
+ event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]);
+ event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]);
+
+ iohub->token[pirq].iohub = iohub;
+ iohub->token[pirq].pirq = pirq;
+
+ qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL,
+ &iohub->token[pirq]);
+}
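
On the proxy side (hw/remote/proxy.c, later in this series), the two fds delivered with MPQEMU_CMD_SET_IRQFD are handed to KVM as an irqfd/resamplefd pair; a condensed sketch of that wiring:

    /* Condensed from proxy.c below: intr is the eventfd the iohub
     * sets when irq_level goes 0->1; KVM signals resample on guest
     * EOI, and intr_resample_handler() above re-raises intr if the
     * level is still asserted. */
    event_notifier_init(&dev->intr, 0);
    event_notifier_init(&dev->resample, 0);
    kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr,
                                       &dev->resample, dev->virq);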
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
new file mode 100644
index 0000000000..c0ab4f528a
--- /dev/null
+++ b/hw/remote/machine.c
@@ -0,0 +1,80 @@
+/*
+ * Machine for remote device
+ *
+ * This machine type is used by the remote device process in multi-process
+ * QEMU. QEMU device models depend on parent busses, interrupt controllers,
+ * memory regions, etc. The remote machine type offers this environment so
+ * that QEMU device models can be used as remote devices.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/machine.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "qapi/error.h"
+#include "hw/pci/pci_host.h"
+#include "hw/remote/iohub.h"
+
+static void remote_machine_init(MachineState *machine)
+{
+ MemoryRegion *system_memory, *system_io, *pci_memory;
+ RemoteMachineState *s = REMOTE_MACHINE(machine);
+ RemotePCIHost *rem_host;
+ PCIHostState *pci_host;
+
+ system_memory = get_system_memory();
+ system_io = get_system_io();
+
+ pci_memory = g_new(MemoryRegion, 1);
+ memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
+
+ rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST));
+
+ rem_host->mr_pci_mem = pci_memory;
+ rem_host->mr_sys_mem = system_memory;
+ rem_host->mr_sys_io = system_io;
+
+ s->host = rem_host;
+
+ object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host));
+ memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1);
+
+ qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);
+
+ pci_host = PCI_HOST_BRIDGE(rem_host);
+
+ remote_iohub_init(&s->iohub);
+
+ pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
+ &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+}
+
+static void remote_machine_class_init(ObjectClass *oc, void *data)
+{
+ MachineClass *mc = MACHINE_CLASS(oc);
+
+ mc->init = remote_machine_init;
+ mc->desc = "Experimental remote machine";
+}
+
+static const TypeInfo remote_machine = {
+ .name = TYPE_REMOTE_MACHINE,
+ .parent = TYPE_MACHINE,
+ .instance_size = sizeof(RemoteMachineState),
+ .class_init = remote_machine_class_init,
+};
+
+static void remote_machine_register_types(void)
+{
+ type_register_static(&remote_machine);
+}
+
+type_init(remote_machine_register_types);
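
In the device process, PCI device models then attach to this machine's bus like on any other PCI host; a minimal sketch, using an illustrative device model (lsi53c895a is just an example):

    /* Sketch: plugging a stock QEMU PCI device into the remote
     * machine's root bus (pci_host->bus from remote_machine_init). */
    PCIDevice *pci_dev = pci_new(-1, "lsi53c895a");
    pci_realize_and_unref(pci_dev, pci_host->bus, &error_fatal);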
diff --git a/hw/remote/memory.c b/hw/remote/memory.c
new file mode 100644
index 0000000000..32085b1e05
--- /dev/null
+++ b/hw/remote/memory.c
@@ -0,0 +1,65 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/memory.h"
+#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
+#include "qapi/error.h"
+
+static void remote_sysmem_reset(void)
+{
+ MemoryRegion *sysmem, *subregion, *next;
+
+ sysmem = get_system_memory();
+
+ QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) {
+ if (subregion->ram) {
+ memory_region_del_subregion(sysmem, subregion);
+ object_unparent(OBJECT(subregion));
+ }
+ }
+}
+
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
+{
+ ERRP_GUARD();
+ SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem;
+ MemoryRegion *sysmem, *subregion;
+ static unsigned int suffix;
+ int region;
+
+ sysmem = get_system_memory();
+
+ remote_sysmem_reset();
+
+ for (region = 0; region < msg->num_fds; region++) {
+ g_autofree char *name;
+ subregion = g_new(MemoryRegion, 1);
+ name = g_strdup_printf("remote-mem-%u", suffix++);
+ memory_region_init_ram_from_fd(subregion, NULL,
+ name, sysmem_info->sizes[region],
+ true, msg->fds[region],
+ sysmem_info->offsets[region],
+ errp);
+
+ if (*errp) {
+ g_free(subregion);
+ remote_sysmem_reset();
+ return;
+ }
+
+ memory_region_add_subregion(sysmem, sysmem_info->gpas[region],
+ subregion);
+
+ }
+}
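
remote_sysmem_reconfig() assumes SyncSysmemMsg carries one entry per fd in parallel arrays; a hedged sketch of that shape (the authoritative definition lives in hw/remote/mpqemu-link.h, which is not part of this excerpt):

    /* Assumed layout, for illustration only -- see mpqemu-link.h. */
    typedef struct {
        hwaddr gpas[REMOTE_MAX_FDS];    /* guest-physical base of region */
        uint64_t sizes[REMOTE_MAX_FDS]; /* length of each region */
        off_t offsets[REMOTE_MAX_FDS];  /* offset into the mmap'd fd */
    } SyncSysmemMsg;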
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
new file mode 100644
index 0000000000..e6a5574242
--- /dev/null
+++ b/hw/remote/meson.build
@@ -0,0 +1,13 @@
+remote_ss = ss.source_set()
+
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
+
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
+
+softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
diff --git a/hw/remote/message.c b/hw/remote/message.c
new file mode 100644
index 0000000000..11d729845c
--- /dev/null
+++ b/hw/remote/message.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/machine.h"
+#include "io/channel.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qapi/error.h"
+#include "sysemu/runstate.h"
+#include "hw/pci/pci.h"
+#include "exec/memattrs.h"
+#include "hw/remote/memory.h"
+#include "hw/remote/iohub.h"
+#include "sysemu/reset.h"
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp);
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp);
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+ Error **errp);
+
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
+{
+ g_autofree RemoteCommDev *com = (RemoteCommDev *)data;
+ PCIDevice *pci_dev = NULL;
+ Error *local_err = NULL;
+
+ assert(com->ioc);
+
+ pci_dev = com->dev;
+ for (; !local_err;) {
+ MPQemuMsg msg = {0};
+
+ if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) {
+ break;
+ }
+
+ if (!mpqemu_msg_valid(&msg)) {
+ error_setg(&local_err, "Received invalid message from proxy"
+ " in remote process pid="FMT_pid"",
+ getpid());
+ break;
+ }
+
+ switch (msg.cmd) {
+ case MPQEMU_CMD_PCI_CFGWRITE:
+ process_config_write(com->ioc, pci_dev, &msg, &local_err);
+ break;
+ case MPQEMU_CMD_PCI_CFGREAD:
+ process_config_read(com->ioc, pci_dev, &msg, &local_err);
+ break;
+ case MPQEMU_CMD_BAR_WRITE:
+ process_bar_write(com->ioc, &msg, &local_err);
+ break;
+ case MPQEMU_CMD_BAR_READ:
+ process_bar_read(com->ioc, &msg, &local_err);
+ break;
+ case MPQEMU_CMD_SYNC_SYSMEM:
+ remote_sysmem_reconfig(&msg, &local_err);
+ break;
+ case MPQEMU_CMD_SET_IRQFD:
+ process_set_irqfd_msg(pci_dev, &msg);
+ break;
+ case MPQEMU_CMD_DEVICE_RESET:
+ process_device_reset_msg(com->ioc, pci_dev, &local_err);
+ break;
+ default:
+ error_setg(&local_err,
+ "Unknown command (%d) received for device %s"
+ " (pid="FMT_pid")",
+ msg.cmd, DEVICE(pci_dev)->id, getpid());
+ }
+ }
+
+ if (local_err) {
+ error_report_err(local_err);
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
+ } else {
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+ }
+}
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp)
+{
+ ERRP_GUARD();
+ PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+ MPQemuMsg ret = { 0 };
+
+ if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+ error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".",
+ getpid());
+ ret.data.u64 = UINT64_MAX;
+ } else {
+ pci_default_write_config(dev, conf->addr, conf->val, conf->len);
+ }
+
+ ret.cmd = MPQEMU_CMD_RET;
+ ret.size = sizeof(ret.data.u64);
+
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
+ getpid());
+ }
+}
+
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp)
+{
+ ERRP_GUARD();
+ PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+ MPQemuMsg ret = { 0 };
+
+ if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+ error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".",
+ getpid());
+ ret.data.u64 = UINT64_MAX;
+ } else {
+ ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len);
+ }
+
+ ret.cmd = MPQEMU_CMD_RET;
+ ret.size = sizeof(ret.data.u64);
+
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
+ getpid());
+ }
+}
+
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
+{
+ ERRP_GUARD();
+ BarAccessMsg *bar_access = &msg->data.bar_access;
+ AddressSpace *as =
+ bar_access->memory ? &address_space_memory : &address_space_io;
+ MPQemuMsg ret = { 0 };
+ MemTxResult res;
+ uint64_t val;
+
+ if (!is_power_of_2(bar_access->size) ||
+ (bar_access->size > sizeof(uint64_t))) {
+ ret.data.u64 = UINT64_MAX;
+ goto fail;
+ }
+
+ val = cpu_to_le64(bar_access->val);
+
+ res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
+ (void *)&val, bar_access->size, true);
+
+ if (res != MEMTX_OK) {
+ error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".",
+ bar_access->addr, getpid());
+ ret.data.u64 = UINT64_MAX;
+ }
+
+fail:
+ ret.cmd = MPQEMU_CMD_RET;
+ ret.size = sizeof(ret.data.u64);
+
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
+ getpid());
+ }
+}
+
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
+{
+ ERRP_GUARD();
+ BarAccessMsg *bar_access = &msg->data.bar_access;
+ MPQemuMsg ret = { 0 };
+ AddressSpace *as;
+ MemTxResult res;
+ uint64_t val = 0;
+
+ as = bar_access->memory ? &address_space_memory : &address_space_io;
+
+ if (!is_power_of_2(bar_access->size) ||
+ (bar_access->size > sizeof(uint64_t))) {
+ val = UINT64_MAX;
+ goto fail;
+ }
+
+ res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
+ (void *)&val, bar_access->size, false);
+
+ if (res != MEMTX_OK) {
+ error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".",
+ bar_access->addr, getpid());
+ val = UINT64_MAX;
+ }
+
+fail:
+ ret.cmd = MPQEMU_CMD_RET;
+ ret.data.u64 = le64_to_cpu(val);
+ ret.size = sizeof(ret.data.u64);
+
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
+ getpid());
+ }
+}
+
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+ Error **errp)
+{
+ DeviceClass *dc = DEVICE_GET_CLASS(dev);
+ DeviceState *s = DEVICE(dev);
+ MPQemuMsg ret = { 0 };
+
+ if (dc->reset) {
+ dc->reset(s);
+ }
+
+ ret.cmd = MPQEMU_CMD_RET;
+
+ mpqemu_msg_send(&ret, ioc, errp);
+}
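
Every request handled above is answered with a single MPQEMU_CMD_RET whose data.u64 carries the result; a condensed sketch of the matching proxy-side round trip (using mpqemu_msg_send_and_await_reply() from mpqemu-link.c below):

    /* Condensed from the proxy code later in this series: one PCI
     * config read request and its u64 reply (UINT64_MAX on error). */
    MPQemuMsg msg = { .cmd = MPQEMU_CMD_PCI_CFGREAD,
                      .size = sizeof(PciConfDataMsg) };
    msg.data.pci_conf_data.addr = addr;
    msg.data.pci_conf_data.len = len;
    uint64_t val = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);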
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
new file mode 100644
index 0000000000..9ce31526e8
--- /dev/null
+++ b/hw/remote/mpqemu-link.c
@@ -0,0 +1,267 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/module.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qapi/error.h"
+#include "qemu/iov.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "io/channel.h"
+#include "sysemu/iothread.h"
+#include "trace.h"
+
+/*
+ * Send a message over the ioc QIOChannel.
+ * This function is safe to call from:
+ * - the main loop in coroutine context; outside of coroutine context it
+ *   will block the main loop;
+ * - a vCPU thread, with no coroutine context, provided the channel is not
+ *   part of the main loop handling;
+ * - an IOThread in coroutine context; outside of coroutine context it
+ *   will block the IOThread.
+ * Returns true if no errors were encountered, false otherwise.
+ */
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
+{
+ ERRP_GUARD();
+ bool iolock = qemu_mutex_iothread_locked();
+ bool iothread = qemu_in_iothread();
+ struct iovec send[2] = {};
+ int *fds = NULL;
+ size_t nfds = 0;
+ bool ret = false;
+
+ send[0].iov_base = msg;
+ send[0].iov_len = MPQEMU_MSG_HDR_SIZE;
+
+ send[1].iov_base = (void *)&msg->data;
+ send[1].iov_len = msg->size;
+
+ if (msg->num_fds) {
+ nfds = msg->num_fds;
+ fds = msg->fds;
+ }
+
+ /*
+ * Don't use this in an IOThread outside of coroutine context, as
+ * it will block the IOThread.
+ */
+ assert(qemu_in_coroutine() || !iothread);
+
+ /*
+ * Skip unlocking/locking iothread lock when the IOThread is running
+ * in co-routine context. Co-routine context is asserted above
+ * for IOThread case.
+ * Also skip lock handling while in a co-routine in the main context.
+ */
+ if (iolock && !iothread && !qemu_in_coroutine()) {
+ qemu_mutex_unlock_iothread();
+ }
+
+ if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send),
+ fds, nfds, errp)) {
+ ret = true;
+ } else {
+ trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds);
+ }
+
+ if (iolock && !iothread && !qemu_in_coroutine()) {
+ /* See above comment why skip locking here. */
+ qemu_mutex_lock_iothread();
+ }
+
+ return ret;
+}
+
+/*
+ * Read a message from the ioc QIOChannel.
+ * This function is safe to call from:
+ * - the main loop in coroutine context; outside of coroutine context it
+ *   will block the main loop;
+ * - a vCPU thread, with no coroutine context, provided the channel is not
+ *   part of the main loop handling;
+ * - an IOThread in coroutine context; outside of coroutine context it
+ *   will block the IOThread.
+ */
+static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds,
+ size_t *nfds, Error **errp)
+{
+ ERRP_GUARD();
+ struct iovec iov = { .iov_base = buf, .iov_len = len };
+ bool iolock = qemu_mutex_iothread_locked();
+ bool iothread = qemu_in_iothread();
+ int ret = -1;
+
+ /*
+ * Don't use this in an IOThread outside of coroutine context, as
+ * it will block the IOThread.
+ */
+ assert(qemu_in_coroutine() || !iothread);
+
+ if (iolock && !iothread && !qemu_in_coroutine()) {
+ qemu_mutex_unlock_iothread();
+ }
+
+ ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp);
+
+ if (iolock && !iothread && !qemu_in_coroutine()) {
+ qemu_mutex_lock_iothread();
+ }
+
+ return (ret <= 0) ? ret : iov.iov_len;
+}
+
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
+{
+ ERRP_GUARD();
+ g_autofree int *fds = NULL;
+ size_t nfds = 0;
+ ssize_t len;
+ bool ret = false;
+
+ len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp);
+ if (len <= 0) {
+ goto fail;
+ } else if (len != MPQEMU_MSG_HDR_SIZE) {
+ error_setg(errp, "Message header corrupted");
+ goto fail;
+ }
+
+ if (msg->size > sizeof(msg->data)) {
+ error_setg(errp, "Invalid size for message");
+ goto fail;
+ }
+
+ if (!msg->size) {
+ goto copy_fds;
+ }
+
+ len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp);
+ if (len <= 0) {
+ goto fail;
+ }
+ if (len != msg->size) {
+ error_setg(errp, "Unable to read full message");
+ goto fail;
+ }
+
+copy_fds:
+ msg->num_fds = nfds;
+ if (nfds > G_N_ELEMENTS(msg->fds)) {
+ error_setg(errp,
+ "Overflow error: received %zu fds, more than max of %d fds",
+ nfds, REMOTE_MAX_FDS);
+ goto fail;
+ }
+ if (nfds) {
+ memcpy(msg->fds, fds, nfds * sizeof(int));
+ }
+
+ ret = true;
+
+fail:
+ if (*errp) {
+ trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds);
+ }
+ while (*errp && nfds) {
+ close(fds[nfds - 1]);
+ nfds--;
+ }
+
+ return ret;
+}
+
+/*
+ * Send msg and wait for a reply with command code MPQEMU_CMD_RET.
+ * Returns the u64 payload of the reply, or UINT64_MAX on error.
+ * Called from a vCPU thread in non-coroutine context.
+ * Used by the proxy object to communicate with remote processes.
+ */
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
+ Error **errp)
+{
+ ERRP_GUARD();
+ MPQemuMsg msg_reply = {0};
+ uint64_t ret = UINT64_MAX;
+
+ assert(!qemu_in_coroutine());
+
+ QEMU_LOCK_GUARD(&pdev->io_mutex);
+ if (!mpqemu_msg_send(msg, pdev->ioc, errp)) {
+ return ret;
+ }
+
+ if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) {
+ return ret;
+ }
+
+ if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) {
+ error_setg(errp, "ERROR: Invalid reply received for command %d",
+ msg->cmd);
+ return ret;
+ }
+
+ return msg_reply.data.u64;
+}
+
+bool mpqemu_msg_valid(MPQemuMsg *msg)
+{
+ if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) {
+ return false;
+ }
+
+ /* Verify FDs. */
+ if (msg->num_fds >= REMOTE_MAX_FDS) {
+ return false;
+ }
+
+ if (msg->num_fds > 0) {
+ for (int i = 0; i < msg->num_fds; i++) {
+ if (fcntl(msg->fds[i], F_GETFL) == -1) {
+ return false;
+ }
+ }
+ }
+
+ /* Verify message specific fields. */
+ switch (msg->cmd) {
+ case MPQEMU_CMD_SYNC_SYSMEM:
+ if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) {
+ return false;
+ }
+ break;
+ case MPQEMU_CMD_PCI_CFGWRITE:
+ case MPQEMU_CMD_PCI_CFGREAD:
+ if (msg->size != sizeof(PciConfDataMsg)) {
+ return false;
+ }
+ break;
+ case MPQEMU_CMD_BAR_WRITE:
+ case MPQEMU_CMD_BAR_READ:
+ if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) {
+ return false;
+ }
+ break;
+ case MPQEMU_CMD_SET_IRQFD:
+ if (msg->size || (msg->num_fds != 2)) {
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return true;
+}
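
mpqemu_msg_send() and mpqemu_read() split each message into a fixed header of MPQEMU_MSG_HDR_SIZE bytes followed by msg->size bytes of payload, with fds riding along as SCM_RIGHTS ancillary data; an assumed sketch of the MPQemuMsg layout consistent with the code above (the authoritative definition is in hw/remote/mpqemu-link.h):

    /* Assumed layout, inferred from the send/recv paths above. */
    typedef struct {
        int cmd;                  /* MPQemuCmd request code */
        size_t size;              /* payload bytes following the header */
        /* header ends here: MPQEMU_MSG_HDR_SIZE */
        union {
            uint64_t u64;
            PciConfDataMsg pci_conf_data;
            SyncSysmemMsg sync_sysmem;
            BarAccessMsg bar_access;
        } data;
        int fds[REMOTE_MAX_FDS];  /* sent as SCM_RIGHTS, not on the wire */
        int num_fds;
    } MPQemuMsg;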
diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c
new file mode 100644
index 0000000000..af1fa6f5aa
--- /dev/null
+++ b/hw/remote/proxy-memory-listener.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/compiler.h"
+#include "qemu/int128.h"
+#include "qemu/range.h"
+#include "exec/memory.h"
+#include "exec/cpu-common.h"
+#include "cpu.h"
+#include "exec/ram_addr.h"
+#include "exec/address-spaces.h"
+#include "qapi/error.h"
+#include "hw/remote/mpqemu-link.h"
+#include "hw/remote/proxy-memory-listener.h"
+
+/*
+ * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and
+ * proxy_memory_listener_commit() defined below perform tasks similar to the
+ * functions defined in vhost-user.c. These functions are good candidates
+ * for refactoring.
+ *
+ */
+
+static void proxy_memory_listener_reset(MemoryListener *listener)
+{
+ ProxyMemoryListener *proxy_listener = container_of(listener,
+ ProxyMemoryListener,
+ listener);
+ int mrs;
+
+ for (mrs = 0; mrs < proxy_listener->n_mr_sections; mrs++) {
+ memory_region_unref(proxy_listener->mr_sections[mrs].mr);
+ }
+
+ g_free(proxy_listener->mr_sections);
+ proxy_listener->mr_sections = NULL;
+ proxy_listener->n_mr_sections = 0;
+}
+
+static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
+{
+ MemoryRegion *mr;
+ ram_addr_t off;
+
+ /**
+ * Assumes the host address is valid, since it comes from the
+ * MemoryListener system. If the host address were not valid, the
+ * following call would return the default subregion of the
+ * "system_memory" region rather than NULL, so a NULL check is not
+ * possible here.
+ */
+ mr = memory_region_from_host((void *)(uintptr_t)host, &off);
+
+ if (offset) {
+ *offset = off;
+ }
+
+ return memory_region_get_fd(mr);
+}
+
+static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
+{
+ if (((prev_host + size) != host)) {
+ return false;
+ }
+
+ if (get_fd_from_hostaddr(host, NULL) !=
+ get_fd_from_hostaddr(prev_host, NULL)) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool try_merge(ProxyMemoryListener *proxy_listener,
+ MemoryRegionSection *section)
+{
+ uint64_t mrs_size, mrs_gpa, mrs_page;
+ MemoryRegionSection *prev_sec;
+ bool merged = false;
+ uintptr_t mrs_host;
+ RAMBlock *mrs_rb;
+
+ if (!proxy_listener->n_mr_sections) {
+ return false;
+ }
+
+ mrs_rb = section->mr->ram_block;
+ mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb);
+ mrs_size = int128_get64(section->size);
+ mrs_gpa = section->offset_within_address_space;
+ mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region;
+
+ if (get_fd_from_hostaddr(mrs_host, NULL) < 0) {
+ return true;
+ }
+
+ mrs_host = mrs_host & ~(mrs_page - 1);
+ mrs_gpa = mrs_gpa & ~(mrs_page - 1);
+ mrs_size = ROUND_UP(mrs_size, mrs_page);
+
+ prev_sec = proxy_listener->mr_sections +
+ (proxy_listener->n_mr_sections - 1);
+ uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
+ uint64_t prev_size = int128_get64(prev_sec->size);
+ uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
+ uint64_t prev_host_start =
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
+ prev_sec->offset_within_region;
+ uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
+
+ if (mrs_gpa <= (prev_gpa_end + 1)) {
+ g_assert(mrs_gpa > prev_gpa_start);
+
+ if ((section->mr == prev_sec->mr) &&
+ proxy_mrs_can_merge(mrs_host, prev_host_start,
+ (mrs_gpa - prev_gpa_start))) {
+ uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
+ merged = true;
+ prev_sec->offset_within_address_space =
+ MIN(prev_gpa_start, mrs_gpa);
+ prev_sec->offset_within_region =
+ MIN(prev_host_start, mrs_host) -
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
+ prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
+ mrs_host));
+ }
+ }
+
+ return merged;
+}
+
+static void proxy_memory_listener_region_addnop(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ ProxyMemoryListener *proxy_listener = container_of(listener,
+ ProxyMemoryListener,
+ listener);
+
+ if (!memory_region_is_ram(section->mr) ||
+ memory_region_is_rom(section->mr)) {
+ return;
+ }
+
+ if (try_merge(proxy_listener, section)) {
+ return;
+ }
+
+ ++proxy_listener->n_mr_sections;
+ proxy_listener->mr_sections = g_renew(MemoryRegionSection,
+ proxy_listener->mr_sections,
+ proxy_listener->n_mr_sections);
+ proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section;
+ proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL;
+ memory_region_ref(section->mr);
+}
+
+static void proxy_memory_listener_commit(MemoryListener *listener)
+{
+ ProxyMemoryListener *proxy_listener = container_of(listener,
+ ProxyMemoryListener,
+ listener);
+ MPQemuMsg msg;
+ MemoryRegionSection *section;
+ ram_addr_t offset;
+ uintptr_t host_addr;
+ int region;
+ Error *local_err = NULL;
+
+ memset(&msg, 0, sizeof(MPQemuMsg));
+
+ msg.cmd = MPQEMU_CMD_SYNC_SYSMEM;
+ msg.num_fds = proxy_listener->n_mr_sections;
+ msg.size = sizeof(SyncSysmemMsg);
+ if (msg.num_fds > REMOTE_MAX_FDS) {
+ error_report("Number of fds is more than %d", REMOTE_MAX_FDS);
+ return;
+ }
+
+ for (region = 0; region < proxy_listener->n_mr_sections; region++) {
+ section = &proxy_listener->mr_sections[region];
+ msg.data.sync_sysmem.gpas[region] =
+ section->offset_within_address_space;
+ msg.data.sync_sysmem.sizes[region] = int128_get64(section->size);
+ host_addr = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region;
+ msg.fds[region] = get_fd_from_hostaddr(host_addr, &offset);
+ msg.data.sync_sysmem.offsets[region] = offset;
+ }
+ if (!mpqemu_msg_send(&msg, proxy_listener->ioc, &local_err)) {
+ error_report_err(local_err);
+ }
+}
+
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener)
+{
+ memory_listener_unregister(&proxy_listener->listener);
+
+ proxy_memory_listener_reset(&proxy_listener->listener);
+}
+
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
+ QIOChannel *ioc)
+{
+ proxy_listener->n_mr_sections = 0;
+ proxy_listener->mr_sections = NULL;
+
+ proxy_listener->ioc = ioc;
+
+ proxy_listener->listener.begin = proxy_memory_listener_reset;
+ proxy_listener->listener.commit = proxy_memory_listener_commit;
+ proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
+ proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
+ proxy_listener->listener.priority = 10;
+
+ memory_listener_register(&proxy_listener->listener,
+ &address_space_memory);
+}
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
new file mode 100644
index 0000000000..4fa4be079d
--- /dev/null
+++ b/hw/remote/proxy.c
@@ -0,0 +1,379 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/proxy.h"
+#include "hw/pci/pci.h"
+#include "qapi/error.h"
+#include "io/channel-util.h"
+#include "hw/qdev-properties.h"
+#include "monitor/monitor.h"
+#include "migration/blocker.h"
+#include "qemu/sockets.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qemu/error-report.h"
+#include "hw/remote/proxy-memory-listener.h"
+#include "qom/object.h"
+#include "qemu/event_notifier.h"
+#include "sysemu/kvm.h"
+#include "util/event_notifier-posix.c"
+
+static void probe_pci_info(PCIDevice *dev, Error **errp);
+static void proxy_device_reset(DeviceState *dev);
+
+static void proxy_intx_update(PCIDevice *pci_dev)
+{
+ PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
+ PCIINTxRoute route;
+ int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+ if (dev->virq != -1) {
+ kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq);
+ dev->virq = -1;
+ }
+
+ route = pci_device_route_intx_to_irq(pci_dev, pin);
+
+ dev->virq = route.irq;
+
+ if (dev->virq != -1) {
+ kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr,
+ &dev->resample, dev->virq);
+ }
+}
+
+static void setup_irqfd(PCIProxyDev *dev)
+{
+ PCIDevice *pci_dev = PCI_DEVICE(dev);
+ MPQemuMsg msg;
+ Error *local_err = NULL;
+
+ event_notifier_init(&dev->intr, 0);
+ event_notifier_init(&dev->resample, 0);
+
+ memset(&msg, 0, sizeof(MPQemuMsg));
+ msg.cmd = MPQEMU_CMD_SET_IRQFD;
+ msg.num_fds = 2;
+ msg.fds[0] = event_notifier_get_fd(&dev->intr);
+ msg.fds[1] = event_notifier_get_fd(&dev->resample);
+ msg.size = 0;
+
+ if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) {
+ error_report_err(local_err);
+ }
+
+ dev->virq = -1;
+
+ proxy_intx_update(pci_dev);
+
+ pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
+}
+
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
+{
+ ERRP_GUARD();
+ PCIProxyDev *dev = PCI_PROXY_DEV(device);
+ uint8_t *pci_conf = device->config;
+ int fd;
+
+ if (!dev->fd) {
+ error_setg(errp, "fd parameter not specified for %s",
+ DEVICE(device)->id);
+ return;
+ }
+
+ fd = monitor_fd_param(monitor_cur(), dev->fd, errp);
+ if (fd == -1) {
+ error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd);
+ return;
+ }
+
+ if (!fd_is_socket(fd)) {
+ error_setg(errp, "proxy: fd %d is not a socket", fd);
+ close(fd);
+ return;
+ }
+
+ dev->ioc = qio_channel_new_fd(fd, errp);
+ if (!dev->ioc) {
+ close(fd);
+ return;
+ }
+
+ error_setg(&dev->migration_blocker, "%s does not support migration",
+ TYPE_PCI_PROXY_DEV);
+ migrate_add_blocker(dev->migration_blocker, errp);
+
+ qemu_mutex_init(&dev->io_mutex);
+ qio_channel_set_blocking(dev->ioc, true, NULL);
+
+ pci_conf[PCI_LATENCY_TIMER] = 0xff;
+ pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
+ proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
+
+ setup_irqfd(dev);
+
+ probe_pci_info(PCI_DEVICE(dev), errp);
+}
+
+static void pci_proxy_dev_exit(PCIDevice *pdev)
+{
+ PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
+
+ if (dev->ioc) {
+ qio_channel_close(dev->ioc, NULL);
+ }
+
+ migrate_del_blocker(dev->migration_blocker);
+
+ error_free(dev->migration_blocker);
+
+ proxy_memory_listener_deconfigure(&dev->proxy_listener);
+
+ event_notifier_cleanup(&dev->intr);
+ event_notifier_cleanup(&dev->resample);
+}
+
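+/*
+ * Forward a PCI config space access to the remote process. For reads,
+ * the value is returned through *val; a reply of UINT64_MAX indicates
+ * that the remote end failed to perform the access.
+ */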
+static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
+ int len, unsigned int op)
+{
+ MPQemuMsg msg = { 0 };
+ uint64_t ret = -EINVAL;
+ Error *local_err = NULL;
+
+ msg.cmd = op;
+ msg.data.pci_conf_data.addr = addr;
+ msg.data.pci_conf_data.val = (op == MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0;
+ msg.data.pci_conf_data.len = len;
+ msg.size = sizeof(PciConfDataMsg);
+
+ ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ }
+
+ if (ret == UINT64_MAX) {
+ error_report("Failed to perform PCI config %s operation",
+ (op == MPQEMU_CMD_PCI_CFGREAD) ? "READ" : "WRITE");
+ }
+
+ if (op == MPQEMU_CMD_PCI_CFGREAD) {
+ *val = (uint32_t)ret;
+ }
+}
+
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
+{
+ uint32_t val;
+
+ config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGREAD);
+
+ return val;
+}
+
+static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
+ int len)
+{
+ /*
+ * Some of the functions access the copy of remote device's PCI config
+ * space which is cached in the proxy device. Therefore, maintain
+ * it updated.
+ */
+ pci_default_write_config(d, addr, val, len);
+
+ config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE);
+}
+
+static Property proxy_properties[] = {
+ DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+ k->realize = pci_proxy_dev_realize;
+ k->exit = pci_proxy_dev_exit;
+ k->config_read = pci_proxy_read_config;
+ k->config_write = pci_proxy_write_config;
+
+ dc->reset = proxy_device_reset;
+
+ device_class_set_props(dc, proxy_properties);
+}
+
+static const TypeInfo pci_proxy_dev_type_info = {
+ .name = TYPE_PCI_PROXY_DEV,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(PCIProxyDev),
+ .class_init = pci_proxy_dev_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
+};
+
+static void pci_proxy_dev_register_types(void)
+{
+ type_register_static(&pci_proxy_dev_type_info);
+}
+
+type_init(pci_proxy_dev_register_types)
+
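+/*
+ * Translate a BAR access into a BAR_READ/BAR_WRITE message. The remote
+ * end expects absolute addresses, so the offset into the region is added
+ * to the MemoryRegion's base address.
+ */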
+static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr,
+ bool write, hwaddr addr, uint64_t *val,
+ unsigned size, bool memory)
+{
+ MPQemuMsg msg = { 0 };
+ long ret = -EINVAL;
+ Error *local_err = NULL;
+
+ msg.size = sizeof(BarAccessMsg);
+ msg.data.bar_access.addr = mr->addr + addr;
+ msg.data.bar_access.size = size;
+ msg.data.bar_access.memory = memory;
+
+ if (write) {
+ msg.cmd = MPQEMU_CMD_BAR_WRITE;
+ msg.data.bar_access.val = *val;
+ } else {
+ msg.cmd = MPQEMU_CMD_BAR_READ;
+ }
+
+ ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ }
+
+ if (!write) {
+ *val = ret;
+ }
+}
+
+static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned size)
+{
+ ProxyMemoryRegion *pmr = opaque;
+
+ send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size,
+ pmr->memory);
+}
+
+static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
+{
+ ProxyMemoryRegion *pmr = opaque;
+ uint64_t val;
+
+ send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size,
+ pmr->memory);
+
+ return val;
+}
+
+const MemoryRegionOps proxy_mr_ops = {
+ .read = proxy_bar_read,
+ .write = proxy_bar_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+};
+
+static void probe_pci_info(PCIDevice *dev, Error **errp)
+{
+ PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+ uint32_t orig_val, new_val, base_class, val;
+ PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+ DeviceClass *dc = DEVICE_CLASS(pc);
+ uint8_t type;
+ int i, size;
+
+ config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+ pc->vendor_id = (uint16_t)val;
+
+ config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+ pc->device_id = (uint16_t)val;
+
+ config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+ pc->class_id = (uint16_t)val;
+
+ config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+ pc->subsystem_id = (uint16_t)val;
+
+ base_class = pc->class_id >> 8;
+ switch (base_class) {
+ case PCI_BASE_CLASS_BRIDGE:
+ set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+ break;
+ case PCI_BASE_CLASS_STORAGE:
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ break;
+ case PCI_BASE_CLASS_NETWORK:
+ set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+ break;
+ case PCI_BASE_CLASS_INPUT:
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ break;
+ case PCI_BASE_CLASS_DISPLAY:
+ set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+ break;
+ case PCI_BASE_CLASS_PROCESSOR:
+ set_bit(DEVICE_CATEGORY_CPU, dc->categories);
+ break;
+ default:
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ break;
+ }
+
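+ /*
+ * Probe the size of each BAR with the standard PCI algorithm: save
+ * the original value, write all 1s, read back the size mask, and
+ * restore the original value. Bit 0 of the probed value selects
+ * between I/O and memory space.
+ */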
+ for (i = 0; i < PCI_NUM_REGIONS; i++) {
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
+ MPQEMU_CMD_PCI_CFGREAD);
+ new_val = 0xffffffff;
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
+ MPQEMU_CMD_PCI_CFGWRITE);
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
+ MPQEMU_CMD_PCI_CFGREAD);
+ size = (~(new_val & 0xFFFFFFF0)) + 1;
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
+ MPQEMU_CMD_PCI_CFGWRITE);
+ type = (new_val & 0x1) ?
+ PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+ if (size) {
+ g_autofree char *name;
+ pdev->region[i].dev = pdev;
+ pdev->region[i].present = true;
+ if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+ pdev->region[i].memory = true;
+ }
+ name = g_strdup_printf("bar-region-%d", i);
+ memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+ &proxy_mr_ops, &pdev->region[i],
+ name, size);
+ pci_register_bar(dev, i, type, &pdev->region[i].mr);
+ }
+ }
+}
+
+static void proxy_device_reset(DeviceState *dev)
+{
+ PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+ MPQemuMsg msg = { 0 };
+ Error *local_err = NULL;
+
+ msg.cmd = MPQEMU_CMD_DEVICE_RESET;
+ msg.size = 0;
+
+ mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ }
+}
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
new file mode 100644
index 0000000000..4f21254219
--- /dev/null
+++ b/hw/remote/remote-obj.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/error-report.h"
+#include "qemu/notify.h"
+#include "qom/object_interfaces.h"
+#include "hw/qdev-core.h"
+#include "io/channel.h"
+#include "hw/qdev-core.h"
+#include "hw/remote/machine.h"
+#include "io/channel-util.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+#include "hw/pci/pci.h"
+#include "qemu/sockets.h"
+#include "monitor/monitor.h"
+
+#define TYPE_REMOTE_OBJECT "x-remote-object"
+OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
+
+struct RemoteObjectClass {
+ ObjectClass parent_class;
+
+ unsigned int nr_devs;
+ unsigned int max_devs;
+};
+
+struct RemoteObject {
+ /* private */
+ Object parent;
+
+ Notifier machine_done;
+
+ int32_t fd;
+ char *devid;
+
+ QIOChannel *ioc;
+
+ DeviceState *dev;
+ DeviceListener listener;
+};
+
+static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
+{
+ RemoteObject *o = REMOTE_OBJECT(obj);
+ int fd = -1;
+
+ fd = monitor_fd_param(monitor_cur(), str, errp);
+ if (fd == -1) {
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
+ return;
+ }
+
+ if (!fd_is_socket(fd)) {
+ error_setg(errp, "File descriptor '%s' is not a socket", str);
+ close(fd);
+ return;
+ }
+
+ o->fd = fd;
+}
+
+static void remote_object_set_devid(Object *obj, const char *str, Error **errp)
+{
+ RemoteObject *o = REMOTE_OBJECT(obj);
+
+ g_free(o->devid);
+
+ o->devid = g_strdup(str);
+}
+
+static void remote_object_unrealize_listener(DeviceListener *listener,
+ DeviceState *dev)
+{
+ RemoteObject *o = container_of(listener, RemoteObject, listener);
+
+ if (o->dev == dev) {
+ object_unref(OBJECT(o));
+ }
+}
+
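+/*
+ * Once machine init is done, attach the communication channel to the
+ * device named by 'devid' and kick off the message loop co-routine that
+ * serves requests from the proxy.
+ */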
+static void remote_object_machine_done(Notifier *notifier, void *data)
+{
+ RemoteObject *o = container_of(notifier, RemoteObject, machine_done);
+ DeviceState *dev = NULL;
+ QIOChannel *ioc = NULL;
+ Coroutine *co = NULL;
+ RemoteCommDev *comdev = NULL;
+ Error *err = NULL;
+
+ dev = qdev_find_recursive(sysbus_get_default(), o->devid);
+ if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ error_report("%s is not a PCI device", o->devid);
+ return;
+ }
+
+ ioc = qio_channel_new_fd(o->fd, &err);
+ if (!ioc) {
+ error_report_err(err);
+ return;
+ }
+ qio_channel_set_blocking(ioc, false, NULL);
+
+ o->dev = dev;
+
+ o->listener.unrealize = remote_object_unrealize_listener;
+ device_listener_register(&o->listener);
+
+ /* The co-routine is responsible for freeing this. */
+ comdev = g_new0(RemoteCommDev, 1);
+ *comdev = (RemoteCommDev) {
+ .ioc = ioc,
+ .dev = PCI_DEVICE(dev),
+ };
+
+ co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev);
+ qemu_coroutine_enter(co);
+}
+
+static void remote_object_init(Object *obj)
+{
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+ RemoteObject *o = REMOTE_OBJECT(obj);
+
+ if (k->nr_devs >= k->max_devs) {
+ error_report("Reached maximum number of devices: %u", k->max_devs);
+ return;
+ }
+
+ o->ioc = NULL;
+ o->fd = -1;
+ o->devid = NULL;
+
+ k->nr_devs++;
+
+ o->machine_done.notify = remote_object_machine_done;
+ qemu_add_machine_init_done_notifier(&o->machine_done);
+}
+
+static void remote_object_finalize(Object *obj)
+{
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+ RemoteObject *o = REMOTE_OBJECT(obj);
+
+ device_listener_unregister(&o->listener);
+
+ if (o->ioc) {
+ qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ qio_channel_close(o->ioc, NULL);
+ }
+
+ object_unref(OBJECT(o->ioc));
+
+ k->nr_devs--;
+ g_free(o->devid);
+}
+
+static void remote_object_class_init(ObjectClass *klass, void *data)
+{
+ RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass);
+
+ /*
+ * Limit the number of supported devices to 1, so that devices from one
+ * VM cannot access the RAM of another VM. This restriction stays in
+ * place until we start using separate address spaces for individual
+ * devices.
+ */
+ k->max_devs = 1;
+ k->nr_devs = 0;
+
+ object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd);
+ object_class_property_add_str(klass, "devid", NULL,
+ remote_object_set_devid);
+}
+
+static const TypeInfo remote_object_info = {
+ .name = TYPE_REMOTE_OBJECT,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(RemoteObject),
+ .instance_init = remote_object_init,
+ .instance_finalize = remote_object_finalize,
+ .class_size = sizeof(RemoteObjectClass),
+ .class_init = remote_object_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&remote_object_info);
+}
+
+type_init(register_types);
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
new file mode 100644
index 0000000000..0b23974f90
--- /dev/null
+++ b/hw/remote/trace-events
@@ -0,0 +1,4 @@
+# multi-process trace events
+
+mpqemu_send_io_error(int cmd, int size, int nfds) "failed to send command %d size %d, %d file descriptors to remote process"
+mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive command %d size %d, %d file descriptors from remote process"
diff --git a/hw/remote/trace.h b/hw/remote/trace.h
new file mode 100644
index 0000000000..5d5e3ac720
--- /dev/null
+++ b/hw/remote/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_remote.h"
diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index ab3a2482aa..93eccfc05d 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -14,8 +14,11 @@
#include <linux/kvm.h>
#include "cpu.h"
+#include "qapi/error.h"
#include "qemu/error-report.h"
#include "sysemu/kvm.h"
+#include "qom/object_interfaces.h"
+#include "exec/confidential-guest-support.h"
#include "hw/s390x/ipl.h"
#include "hw/s390x/pv.h"
@@ -111,3 +114,62 @@ void s390_pv_inject_reset_error(CPUState *cs)
/* Report that we are unable to enter protected mode */
env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV;
}
+
+#define TYPE_S390_PV_GUEST "s390-pv-guest"
+OBJECT_DECLARE_SIMPLE_TYPE(S390PVGuest, S390_PV_GUEST)
+
+/**
+ * S390PVGuest:
+ *
+ * The S390PVGuest object is basically a dummy used to tell the
+ * confidential guest support system to use s390's PV mechanism.
+ *
+ * # $QEMU \
+ * -object s390-pv-guest,id=pv0 \
+ * -machine ...,confidential-guest-support=pv0
+ */
+struct S390PVGuest {
+ ConfidentialGuestSupport parent_obj;
+};
+
+typedef struct S390PVGuestClass S390PVGuestClass;
+
+struct S390PVGuestClass {
+ ConfidentialGuestSupportClass parent_class;
+};
+
+int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
+{
+ if (!object_dynamic_cast(OBJECT(cgs), TYPE_S390_PV_GUEST)) {
+ return 0;
+ }
+
+ if (!s390_has_feat(S390_FEAT_UNPACK)) {
+ error_setg(errp,
+ "CPU model does not support Protected Virtualization");
+ return -1;
+ }
+
+ cgs->ready = true;
+
+ return 0;
+}
+
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(S390PVGuest,
+ s390_pv_guest,
+ S390_PV_GUEST,
+ CONFIDENTIAL_GUEST_SUPPORT,
+ { TYPE_USER_CREATABLE },
+ { NULL })
+
+static void s390_pv_guest_class_init(ObjectClass *oc, void *data)
+{
+}
+
+static void s390_pv_guest_init(Object *obj)
+{
+}
+
+static void s390_pv_guest_finalize(Object *obj)
+{
+}
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index a2d9a79c84..2972b607f3 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -250,6 +250,9 @@ static void ccw_init(MachineState *machine)
/* init CPUs (incl. CPU model) early so s390_has_feature() works */
s390_init_cpus(machine);
+ /* Need CPU model to be determined before we can set up PV */
+ s390_pv_init(machine->cgs, &error_fatal);
+
s390_flic_init();
/* init the SIGP facility */
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index b995bab3a2..2c83a0ab1f 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -126,6 +126,7 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
{
int i;
int rc;
+ int vq_init_count = 0;
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
@@ -153,17 +154,22 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
if (rc) {
goto fail_vrings;
}
+
+ vq_init_count++;
rc = virtio_scsi_vring_init(s, vs->event_vq, 1,
virtio_scsi_data_plane_handle_event);
if (rc) {
goto fail_vrings;
}
+
+ vq_init_count++;
for (i = 0; i < vs->conf.num_queues; i++) {
rc = virtio_scsi_vring_init(s, vs->cmd_vqs[i], i + 2,
virtio_scsi_data_plane_handle_cmd);
if (rc) {
goto fail_vrings;
}
+ vq_init_count++;
}
s->dataplane_starting = false;
@@ -174,7 +180,7 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
fail_vrings:
aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
aio_context_release(s->ctx);
- for (i = 0; i < vs->conf.num_queues + 2; i++) {
+ for (i = 0; i < vq_init_count; i++) {
virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
}
diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index 98e70b2d26..15caff0e41 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -185,10 +185,11 @@ static arm_timer_state *arm_timer_init(uint32_t freq)
return s;
}
-/* ARM PrimeCell SP804 dual timer module.
+/*
+ * ARM PrimeCell SP804 dual timer module.
* Docs at
- * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0271d/index.html
-*/
+ * https://developer.arm.com/documentation/ddi0271/latest/
+ */
#define TYPE_SP804 "sp804"
OBJECT_DECLARE_SIMPLE_TYPE(SP804State, SP804)
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index fc8d63c850..c5c4c61d01 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -14,6 +14,7 @@
#include CONFIG_DEVICES
#include "exec/memop.h"
#include "qemu/units.h"
+#include "qemu/log.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
@@ -264,8 +265,15 @@ static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
return data;
}
+static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
+}
+
static const MemoryRegionOps vfio_ati_3c3_quirk = {
.read = vfio_ati_3c3_quirk_read,
+ .write = vfio_ati_3c3_quirk_write,
.endianness = DEVICE_LITTLE_ENDIAN,
};
diff --git a/include/block/block.h b/include/block/block.h
index 0a9f2c187c..2f2698074e 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -849,4 +849,7 @@ int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
BdrvChild *dst, int64_t dst_offset,
int64_t bytes, BdrvRequestFlags read_flags,
BdrvRequestFlags write_flags);
+
+void bdrv_cancel_in_flight(BlockDriverState *bs);
+
#endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 22a2789d35..88e4111939 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -353,6 +353,15 @@ struct BlockDriver {
int64_t *map, BlockDriverState **file);
/*
+ * This informs the driver that we are no longer interested in the result
+ * of in-flight requests, so don't waste the time if possible.
+ *
+ * One example usage is to avoid waiting for an nbd target node reconnect
+ * timeout during job-cancel.
+ */
+ void (*bdrv_cancel_in_flight)(BlockDriverState *bs);
+
+ /*
* Invalidate any cached meta-data.
*/
void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs,
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3e02d9ca98..07cfc92936 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -15,14 +15,19 @@ typedef struct QEMU_PACKED NvmeBar {
uint64_t acq;
uint32_t cmbloc;
uint32_t cmbsz;
- uint8_t padding[3520]; /* not used by QEMU */
+ uint32_t bpinfo;
+ uint32_t bprsel;
+ uint64_t bpmbl;
+ uint64_t cmbmsc;
+ uint32_t cmbsts;
+ uint8_t rsvd92[3492];
uint32_t pmrcap;
uint32_t pmrctl;
uint32_t pmrsts;
uint32_t pmrebs;
uint32_t pmrswtp;
uint64_t pmrmsc;
- uint8_t reserved[484];
+ uint8_t css[484];
} NvmeBar;
enum NvmeCapShift {
@@ -35,7 +40,8 @@ enum NvmeCapShift {
CAP_CSS_SHIFT = 37,
CAP_MPSMIN_SHIFT = 48,
CAP_MPSMAX_SHIFT = 52,
- CAP_PMR_SHIFT = 56,
+ CAP_PMRS_SHIFT = 56,
+ CAP_CMBS_SHIFT = 57,
};
enum NvmeCapMask {
@@ -48,7 +54,8 @@ enum NvmeCapMask {
CAP_CSS_MASK = 0xff,
CAP_MPSMIN_MASK = 0xf,
CAP_MPSMAX_MASK = 0xf,
- CAP_PMR_MASK = 0x1,
+ CAP_PMRS_MASK = 0x1,
+ CAP_CMBS_MASK = 0x1,
};
#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK)
@@ -60,6 +67,8 @@ enum NvmeCapMask {
#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK)
#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
+#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK)
+#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK)
#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
<< CAP_MQES_SHIFT)
@@ -78,12 +87,15 @@ enum NvmeCapMask {
#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
<< CAP_MPSMIN_SHIFT)
#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
- << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
- << CAP_PMR_SHIFT)
+ << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \
+ << CAP_PMRS_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \
+ << CAP_CMBS_SHIFT)
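+
+/*
+ * For example, a controller exposing both a CMB and a PMR would set the
+ * corresponding CAP bits at init time (a sketch; 'n' is assumed to be
+ * the controller state with an embedded NvmeBar):
+ *
+ *     NVME_CAP_SET_CMBS(n->bar.cap, 1);
+ *     NVME_CAP_SET_PMRS(n->bar.cap, 1);
+ */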
enum NvmeCapCss {
NVME_CAP_CSS_NVM = 1 << 0,
+ NVME_CAP_CSS_CSI_SUPP = 1 << 6,
NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
};
@@ -117,9 +129,25 @@ enum NvmeCcMask {
enum NvmeCcCss {
NVME_CC_CSS_NVM = 0x0,
+ NVME_CC_CSS_CSI = 0x6,
NVME_CC_CSS_ADMIN_ONLY = 0x7,
};
+#define NVME_SET_CC_EN(cc, val) \
+ (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val) \
+ (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val) \
+ (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val) \
+ (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val) \
+ (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+ (cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+ (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
enum NvmeCstsShift {
CSTS_RDY_SHIFT = 0,
CSTS_CFS_SHIFT = 1,
@@ -162,25 +190,64 @@ enum NvmeAqaMask {
#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
enum NvmeCmblocShift {
- CMBLOC_BIR_SHIFT = 0,
- CMBLOC_OFST_SHIFT = 12,
+ CMBLOC_BIR_SHIFT = 0,
+ CMBLOC_CQMMS_SHIFT = 3,
+ CMBLOC_CQPDS_SHIFT = 4,
+ CMBLOC_CDPMLS_SHIFT = 5,
+ CMBLOC_CDPCILS_SHIFT = 6,
+ CMBLOC_CDMMMS_SHIFT = 7,
+ CMBLOC_CQDA_SHIFT = 8,
+ CMBLOC_OFST_SHIFT = 12,
};
enum NvmeCmblocMask {
- CMBLOC_BIR_MASK = 0x7,
- CMBLOC_OFST_MASK = 0xfffff,
+ CMBLOC_BIR_MASK = 0x7,
+ CMBLOC_CQMMS_MASK = 0x1,
+ CMBLOC_CQPDS_MASK = 0x1,
+ CMBLOC_CDPMLS_MASK = 0x1,
+ CMBLOC_CDPCILS_MASK = 0x1,
+ CMBLOC_CDMMMS_MASK = 0x1,
+ CMBLOC_CQDA_MASK = 0x1,
+ CMBLOC_OFST_MASK = 0xfffff,
};
-#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \
- CMBLOC_BIR_MASK)
-#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \
- CMBLOC_OFST_MASK)
-
-#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
+#define NVME_CMBLOC_BIR(cmbloc) \
+ ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_CQMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK)
+#define NVME_CMBLOC_CQPDS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK)
+#define NVME_CMBLOC_CDPMLS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK)
+#define NVME_CMBLOC_CDPCILS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK)
+#define NVME_CMBLOC_CDMMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK)
+#define NVME_CMBLOC_CQDA(cmbloc) \
+ ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK)
+#define NVME_CMBLOC_OFST(cmbloc) \
+ ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK)
+
+#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
(cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
+#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT)
+#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT)
+#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT)
+#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT)
#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
(cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
+
enum NvmeCmbszShift {
CMBSZ_SQS_SHIFT = 0,
CMBSZ_CQS_SHIFT = 1,
@@ -227,6 +294,46 @@ enum NvmeCmbszMask {
#define NVME_CMBSZ_GETSIZE(cmbsz) \
(NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
+enum NvmeCmbmscShift {
+ CMBMSC_CRE_SHIFT = 0,
+ CMBMSC_CMSE_SHIFT = 1,
+ CMBMSC_CBA_SHIFT = 12,
+};
+
+enum NvmeCmbmscMask {
+ CMBMSC_CRE_MASK = 0x1,
+ CMBMSC_CMSE_MASK = 0x1,
+ CMBMSC_CBA_MASK = ((1ULL << 52) - 1),
+};
+
+#define NVME_CMBMSC_CRE(cmbmsc) \
+ ((cmbmsc >> CMBMSC_CRE_SHIFT) & CMBMSC_CRE_MASK)
+#define NVME_CMBMSC_CMSE(cmbmsc) \
+ ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK)
+#define NVME_CMBMSC_CBA(cmbmsc) \
+ ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK)
+
+
+#define NVME_CMBMSC_SET_CRE(cmbmsc, val) \
+ (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT)
+#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \
+ (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT)
+#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \
+ (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT)
+
+enum NvmeCmbstsShift {
+ CMBSTS_CBAI_SHIFT = 0,
+};
+enum NvmeCmbstsMask {
+ CMBSTS_CBAI_MASK = 0x1,
+};
+
+#define NVME_CMBSTS_CBAI(cmbsts) \
+ ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK)
+
+#define NVME_CMBSTS_SET_CBAI(cmbsts, val) \
+ (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT)
+
enum NvmePmrcapShift {
PMRCAP_RDS_SHIFT = 3,
PMRCAP_WDS_SHIFT = 4,
@@ -472,6 +579,9 @@ enum NvmeIoCommands {
NVME_CMD_COMPARE = 0x05,
NVME_CMD_WRITE_ZEROES = 0x08,
NVME_CMD_DSM = 0x09,
+ NVME_CMD_ZONE_MGMT_SEND = 0x79,
+ NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+ NVME_CMD_ZONE_APPEND = 0x7d,
};
typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -540,8 +650,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
uint64_t rsvd2[2];
uint64_t prp1;
uint64_t prp2;
- uint32_t cns;
- uint32_t rsvd11[5];
+ uint8_t cns;
+ uint8_t rsvd10;
+ uint16_t ctrlid;
+ uint16_t nvmsetid;
+ uint8_t rsvd11;
+ uint8_t csi;
+ uint32_t rsvd12[4];
} NvmeIdentify;
typedef struct QEMU_PACKED NvmeRwCmd {
@@ -632,9 +747,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
uint8_t resv;
} NvmeAerResult;
+typedef struct QEMU_PACKED NvmeZonedResult {
+ uint64_t slba;
+} NvmeZonedResult;
+
typedef struct QEMU_PACKED NvmeCqe {
uint32_t result;
- uint32_t rsvd;
+ uint32_t dw1;
uint16_t sq_head;
uint16_t sq_id;
uint16_t cid;
@@ -662,6 +781,8 @@ enum NvmeStatusCodes {
NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
NVME_INVALID_USE_OF_CMB = 0x0012,
NVME_INVALID_PRP_OFFSET = 0x0013,
+ NVME_CMD_SET_CMB_REJECTED = 0x002b,
+ NVME_INVALID_CMD_SET = 0x002c,
NVME_LBA_RANGE = 0x0080,
NVME_CAP_EXCEEDED = 0x0081,
NVME_NS_NOT_READY = 0x0082,
@@ -686,6 +807,14 @@ enum NvmeStatusCodes {
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
+ NVME_ZONE_BOUNDARY_ERROR = 0x01b8,
+ NVME_ZONE_FULL = 0x01b9,
+ NVME_ZONE_READ_ONLY = 0x01ba,
+ NVME_ZONE_OFFLINE = 0x01bb,
+ NVME_ZONE_INVALID_WRITE = 0x01bc,
+ NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd,
+ NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+ NVME_ZONE_INVAL_TRANSITION = 0x01bf,
NVME_WRITE_FAULT = 0x0280,
NVME_UNRECOVERED_READ = 0x0281,
NVME_E2E_GUARD_ERROR = 0x0282,
@@ -693,6 +822,7 @@ enum NvmeStatusCodes {
NVME_E2E_REF_ERROR = 0x0284,
NVME_CMP_FAILURE = 0x0285,
NVME_ACCESS_DENIED = 0x0286,
+ NVME_DULB = 0x0287,
NVME_MORE = 0x2000,
NVME_DNR = 0x4000,
NVME_NO_COMPLETE = 0xffff,
@@ -743,18 +873,37 @@ typedef struct QEMU_PACKED NvmeSmartLog {
uint8_t reserved2[320];
} NvmeSmartLog;
+#define NVME_SMART_WARN_MAX 6
enum NvmeSmartWarn {
NVME_SMART_SPARE = 1 << 0,
NVME_SMART_TEMPERATURE = 1 << 1,
NVME_SMART_RELIABILITY = 1 << 2,
NVME_SMART_MEDIA_READ_ONLY = 1 << 3,
NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4,
+ NVME_SMART_PMR_UNRELIABLE = 1 << 5,
+};
+
+typedef struct NvmeEffectsLog {
+ uint32_t acs[256];
+ uint32_t iocs[256];
+ uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+ NVME_CMD_EFF_CSUPP = 1 << 0,
+ NVME_CMD_EFF_LBCC = 1 << 1,
+ NVME_CMD_EFF_NCC = 1 << 2,
+ NVME_CMD_EFF_NIC = 1 << 3,
+ NVME_CMD_EFF_CCC = 1 << 4,
+ NVME_CMD_EFF_CSE_MASK = 3 << 16,
+ NVME_CMD_EFF_UUID_SEL = 1 << 19,
};
enum NvmeLogIdentifier {
NVME_LOG_ERROR_INFO = 0x01,
NVME_LOG_SMART_INFO = 0x02,
NVME_LOG_FW_SLOT_INFO = 0x03,
+ NVME_LOG_CMD_EFFECTS = 0x05,
};
typedef struct QEMU_PACKED NvmePSD {
@@ -771,11 +920,19 @@ typedef struct QEMU_PACKED NvmePSD {
#define NVME_IDENTIFY_DATA_SIZE 4096
-enum {
- NVME_ID_CNS_NS = 0x0,
- NVME_ID_CNS_CTRL = 0x1,
- NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
- NVME_ID_CNS_NS_DESCR_LIST = 0x3,
+enum NvmeIdCns {
+ NVME_ID_CNS_NS = 0x00,
+ NVME_ID_CNS_CTRL = 0x01,
+ NVME_ID_CNS_NS_ACTIVE_LIST = 0x02,
+ NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+ NVME_ID_CNS_CS_NS = 0x05,
+ NVME_ID_CNS_CS_CTRL = 0x06,
+ NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+ NVME_ID_CNS_NS_PRESENT_LIST = 0x10,
+ NVME_ID_CNS_NS_PRESENT = 0x11,
+ NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a,
+ NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
+ NVME_ID_CNS_IO_COMMAND_SET = 0x1c,
};
typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -794,7 +951,8 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
uint32_t rtd3e;
uint32_t oaes;
uint32_t ctratt;
- uint8_t rsvd100[12];
+ uint8_t rsvd100[11];
+ uint8_t cntrltype;
uint8_t fguid[16];
uint8_t rsvd128[128];
uint16_t oacs;
@@ -845,6 +1003,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
uint8_t vs[1024];
} NvmeIdCtrl;
+typedef struct NvmeIdCtrlZoned {
+ uint8_t zasl;
+ uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
enum NvmeIdCtrlOacs {
NVME_OACS_SECURITY = 1 << 0,
NVME_OACS_FORMAT = 1 << 1,
@@ -867,6 +1030,7 @@ enum NvmeIdCtrlFrmw {
enum NvmeIdCtrlLpa {
NVME_LPA_NS_SMART = 1 << 0,
+ NVME_LPA_CSE = 1 << 1,
NVME_LPA_EXTENDED = 1 << 2,
};
@@ -909,6 +1073,9 @@ enum NvmeIdCtrlLpa {
#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1)
#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
+#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff)
+#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
+
enum NvmeFeatureIds {
NVME_ARBITRATION = 0x1,
NVME_POWER_MANAGEMENT = 0x2,
@@ -922,6 +1089,7 @@ enum NvmeFeatureIds {
NVME_WRITE_ATOMICITY = 0xa,
NVME_ASYNCHRONOUS_EVENT_CONF = 0xb,
NVME_TIMESTAMP = 0xe,
+ NVME_COMMAND_SET_PROFILE = 0x19,
NVME_SOFTWARE_PROGRESS_MARKER = 0x80,
NVME_FID_MAX = 0x100,
};
@@ -968,6 +1136,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
uint8_t rp;
} NvmeLBAF;
+typedef struct QEMU_PACKED NvmeLBAFE {
+ uint64_t zsze;
+ uint8_t zdes;
+ uint8_t rsvd9[7];
+} NvmeLBAFE;
+
#define NVME_NSID_BROADCAST 0xffffffff
typedef struct QEMU_PACKED NvmeIdNs {
@@ -992,7 +1166,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
uint16_t nabspf;
uint16_t noiob;
uint8_t nvmcap[16];
- uint8_t rsvd64[40];
+ uint16_t npwg;
+ uint16_t npwa;
+ uint16_t npdg;
+ uint16_t npda;
+ uint16_t nows;
+ uint8_t rsvd74[30];
uint8_t nguid[16];
uint64_t eui64;
NvmeLBAF lbaf[16];
@@ -1006,18 +1185,40 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
uint8_t rsvd2[2];
} NvmeIdNsDescr;
-enum {
- NVME_NIDT_EUI64_LEN = 8,
- NVME_NIDT_NGUID_LEN = 16,
- NVME_NIDT_UUID_LEN = 16,
+enum NvmeNsIdentifierLength {
+ NVME_NIDL_EUI64 = 8,
+ NVME_NIDL_NGUID = 16,
+ NVME_NIDL_UUID = 16,
+ NVME_NIDL_CSI = 1,
};
enum NvmeNsIdentifierType {
- NVME_NIDT_EUI64 = 0x1,
- NVME_NIDT_NGUID = 0x2,
- NVME_NIDT_UUID = 0x3,
+ NVME_NIDT_EUI64 = 0x01,
+ NVME_NIDT_NGUID = 0x02,
+ NVME_NIDT_UUID = 0x03,
+ NVME_NIDT_CSI = 0x04,
};
+enum NvmeCsi {
+ NVME_CSI_NVM = 0x00,
+ NVME_CSI_ZONED = 0x02,
+};
+
+#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
+
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+ uint16_t zoc;
+ uint16_t ozcs;
+ uint32_t mar;
+ uint32_t mor;
+ uint32_t rrl;
+ uint32_t frl;
+ uint8_t rsvd20[2796];
+ NvmeLBAFE lbafe[16];
+ uint8_t rsvd3072[768];
+ uint8_t vs[256];
+} NvmeIdNsZoned;
+
/*Deallocate Logical Block Features*/
#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10)
#define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08)
@@ -1029,6 +1230,7 @@ enum NvmeNsIdentifierType {
#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
@@ -1049,10 +1251,76 @@ enum NvmeIdNsDps {
DPS_FIRST_EIGHT = 8,
};
+enum NvmeZoneAttr {
+ NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+ NVME_ZA_FINISH_RECOMMENDED = 1 << 1,
+ NVME_ZA_RESET_RECOMMENDED = 1 << 2,
+ NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+ uint64_t nr_zones;
+ uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+ NVME_ZONE_REPORT = 0,
+ NVME_ZONE_REPORT_EXTENDED = 1,
+};
+
+enum NvmeZoneReportType {
+ NVME_ZONE_REPORT_ALL = 0,
+ NVME_ZONE_REPORT_EMPTY = 1,
+ NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+ NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+ NVME_ZONE_REPORT_CLOSED = 4,
+ NVME_ZONE_REPORT_FULL = 5,
+ NVME_ZONE_REPORT_READ_ONLY = 6,
+ NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+ NVME_ZONE_TYPE_RESERVED = 0x00,
+ NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+ NVME_ZONE_ACTION_RSD = 0x00,
+ NVME_ZONE_ACTION_CLOSE = 0x01,
+ NVME_ZONE_ACTION_FINISH = 0x02,
+ NVME_ZONE_ACTION_OPEN = 0x03,
+ NVME_ZONE_ACTION_RESET = 0x04,
+ NVME_ZONE_ACTION_OFFLINE = 0x05,
+ NVME_ZONE_ACTION_SET_ZD_EXT = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+ uint8_t zt;
+ uint8_t zs;
+ uint8_t za;
+ uint8_t rsvd3[5];
+ uint64_t zcap;
+ uint64_t zslba;
+ uint64_t wp;
+ uint8_t rsvd32[32];
+} NvmeZoneDescr;
+
+typedef enum NvmeZoneState {
+ NVME_ZONE_STATE_RESERVED = 0x00,
+ NVME_ZONE_STATE_EMPTY = 0x01,
+ NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02,
+ NVME_ZONE_STATE_EXPLICITLY_OPEN = 0x03,
+ NVME_ZONE_STATE_CLOSED = 0x04,
+ NVME_ZONE_STATE_READ_ONLY = 0x0D,
+ NVME_ZONE_STATE_FULL = 0x0E,
+ NVME_ZONE_STATE_OFFLINE = 0x0F,
+} NvmeZoneState;
+
static inline void _nvme_check_size(void)
{
QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8);
QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
@@ -1066,9 +1334,15 @@ static inline void _nvme_check_size(void)
QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096);
QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
}
#endif
diff --git a/include/block/snapshot.h b/include/block/snapshot.h
index b0fe42993d..940345692f 100644
--- a/include/block/snapshot.h
+++ b/include/block/snapshot.h
@@ -25,7 +25,7 @@
#ifndef SNAPSHOT_H
#define SNAPSHOT_H
-
+#include "qapi/qapi-builtin-types.h"
#define SNAPSHOT_OPT_BASE "snapshot."
#define SNAPSHOT_OPT_ID "snapshot.id"
@@ -77,17 +77,26 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
* These functions will properly handle dataplane (take aio_context_acquire
* when appropriate for appropriate block drivers */
-bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs);
-int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bsd_bs,
+bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
+ Error **errp);
+int bdrv_all_delete_snapshot(const char *name,
+ bool has_devices, strList *devices,
Error **errp);
-int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs,
+int bdrv_all_goto_snapshot(const char *name,
+ bool has_devices, strList *devices,
Error **errp);
-int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs);
+int bdrv_all_has_snapshot(const char *name,
+ bool has_devices, strList *devices,
+ Error **errp);
int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
BlockDriverState *vm_state_bs,
uint64_t vm_state_size,
- BlockDriverState **first_bad_bs);
+ bool has_devices,
+ strList *devices,
+ Error **errp);
-BlockDriverState *bdrv_all_find_vmstate_bs(void);
+BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
+ bool has_devices, strList *devices,
+ Error **errp);
#endif
diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h
new file mode 100644
index 0000000000..ba2dd4b5df
--- /dev/null
+++ b/include/exec/confidential-guest-support.h
@@ -0,0 +1,62 @@
+/*
+ * QEMU Confidential Guest support
+ * This interface describes the common pieces between various
+ * schemes for protecting guest memory or other state against a
+ * compromised hypervisor. This includes memory encryption (AMD's
+ * SEV and Intel's MKTME) or special protection modes (PEF on POWER,
+ * or PV on s390x).
+ *
+ * Copyright Red Hat.
+ *
+ * Authors:
+ * David Gibson <david@gibson.dropbear.id.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_CONFIDENTIAL_GUEST_SUPPORT_H
+#define QEMU_CONFIDENTIAL_GUEST_SUPPORT_H
+
+#ifndef CONFIG_USER_ONLY
+
+#include "qom/object.h"
+
+#define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support"
+OBJECT_DECLARE_SIMPLE_TYPE(ConfidentialGuestSupport, CONFIDENTIAL_GUEST_SUPPORT)
+
+struct ConfidentialGuestSupport {
+ Object parent;
+
+ /*
+ * ready: flag set by CGS initialization code once it's ready to
+ * start executing instructions in a potentially-secure
+ * guest
+ *
+ * The definition here is a bit fuzzy, because this is essentially
+ * part of a self-sanity-check, rather than a strict mechanism.
+ *
+ * It's not feasible to have a single point in the common machine
+ * init path to configure confidential guest support, because
+ * different mechanisms have different interdependencies requiring
+ * initialization in different places, often in arch or machine
+ * type specific code. It's also usually not possible to check
+ * for invalid configurations until that initialization code.
+ * That means it would be very easy to have a bug allowing CGS
+ * init to be bypassed entirely in certain configurations.
+ *
+ * Silently ignoring a requested security feature would be bad, so
+ * to avoid that we check late in init that this 'ready' flag is
+ * set if CGS was requested. If the CGS init hasn't happened, and
+ * so 'ready' is not set, we'll abort.
+ */
+ bool ready;
+};
+
+typedef struct ConfidentialGuestSupportClass {
+ ObjectClass parent;
+} ConfidentialGuestSupportClass;
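+
+/*
+ * A confidential guest backend is created with -object and wired to the
+ * machine via its 'confidential-guest-support' property, e.g. for the
+ * s390x PV object added elsewhere in this series:
+ *
+ *     $QEMU -object s390-pv-guest,id=pv0 \
+ *           -machine ...,confidential-guest-support=pv0
+ */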
+
+#endif /* !CONFIG_USER_ONLY */
+
+#endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */
diff --git a/include/exec/memory.h b/include/exec/memory.h
index c6ce74fb79..c6fb714e49 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -45,13 +45,11 @@ DECLARE_OBJ_CHECKERS(IOMMUMemoryRegion, IOMMUMemoryRegionClass,
#ifdef CONFIG_FUZZ
void fuzz_dma_read_cb(size_t addr,
size_t len,
- MemoryRegion *mr,
- bool is_write);
+ MemoryRegion *mr);
#else
static inline void fuzz_dma_read_cb(size_t addr,
size_t len,
- MemoryRegion *mr,
- bool is_write)
+ MemoryRegion *mr)
{
/* Do Nothing */
}
@@ -149,6 +147,14 @@ typedef struct IOMMUTLBEvent {
/* RAM is a persistent kind memory */
#define RAM_PMEM (1 << 5)
+
+/*
+ * UFFDIO_WRITEPROTECT is used on this RAMBlock to
+ * support 'write-tracking' migration type.
+ * Implies ram_state->ram_wt_enabled.
+ */
+#define RAM_UF_WRITEPROTECT (1 << 6)
+
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags,
hwaddr start, hwaddr end,
@@ -992,6 +998,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
* @size: size of the region.
* @share: %true if memory must be mmaped with the MAP_SHARED flag
* @fd: the fd to mmap.
+ * @offset: offset within the file referenced by fd
* @errp: pointer to Error*, to store an error if it happens.
*
* Note that this function does not do anything to cause the data in the
@@ -1003,6 +1010,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
uint64_t size,
bool share,
int fd,
+ ram_addr_t offset,
Error **errp);
#endif
@@ -2506,7 +2514,7 @@ address_space_read_cached(MemoryRegionCache *cache, hwaddr addr,
void *buf, hwaddr len)
{
assert(addr < cache->len && len <= cache->len - addr);
- fuzz_dma_read_cb(cache->xlat + addr, len, cache->mrs.mr, false);
+ fuzz_dma_read_cb(cache->xlat + addr, len, cache->mrs.mr);
if (likely(cache->ptr)) {
memcpy(buf, cache->ptr + addr, len);
return MEMTX_OK;
diff --git a/include/exec/memory_ldst_cached.h.inc b/include/exec/memory_ldst_cached.h.inc
index 01efad62de..7bc8790d34 100644
--- a/include/exec/memory_ldst_cached.h.inc
+++ b/include/exec/memory_ldst_cached.h.inc
@@ -28,7 +28,7 @@ static inline uint32_t ADDRESS_SPACE_LD_CACHED(l)(MemoryRegionCache *cache,
hwaddr addr, MemTxAttrs attrs, MemTxResult *result)
{
assert(addr < cache->len && 4 <= cache->len - addr);
- fuzz_dma_read_cb(cache->xlat + addr, 4, cache->mrs.mr, false);
+ fuzz_dma_read_cb(cache->xlat + addr, 4, cache->mrs.mr);
if (likely(cache->ptr)) {
return LD_P(l)(cache->ptr + addr);
} else {
@@ -40,7 +40,7 @@ static inline uint64_t ADDRESS_SPACE_LD_CACHED(q)(MemoryRegionCache *cache,
hwaddr addr, MemTxAttrs attrs, MemTxResult *result)
{
assert(addr < cache->len && 8 <= cache->len - addr);
- fuzz_dma_read_cb(cache->xlat + addr, 8, cache->mrs.mr, false);
+ fuzz_dma_read_cb(cache->xlat + addr, 8, cache->mrs.mr);
if (likely(cache->ptr)) {
return LD_P(q)(cache->ptr + addr);
} else {
@@ -52,7 +52,7 @@ static inline uint32_t ADDRESS_SPACE_LD_CACHED(uw)(MemoryRegionCache *cache,
hwaddr addr, MemTxAttrs attrs, MemTxResult *result)
{
assert(addr < cache->len && 2 <= cache->len - addr);
- fuzz_dma_read_cb(cache->xlat + addr, 2, cache->mrs.mr, false);
+ fuzz_dma_read_cb(cache->xlat + addr, 2, cache->mrs.mr);
if (likely(cache->ptr)) {
return LD_P(uw)(cache->ptr + addr);
} else {
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 40b16609ab..3cb9791df3 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -121,8 +121,8 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
uint32_t ram_flags, const char *mem_path,
bool readonly, Error **errp);
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
- uint32_t ram_flags, int fd, bool readonly,
- Error **errp);
+ uint32_t ram_flags, int fd, off_t offset,
+ bool readonly, Error **errp);
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
MemoryRegion *mr, Error **errp);
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 85af4faf76..a46dfe5d1a 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -270,7 +270,7 @@ struct MachineState {
bool iommu;
bool suppress_vmdesc;
bool enable_graphics;
- char *memory_encryption;
+ ConfidentialGuestSupport *cgs;
char *ram_memdev_id;
/*
* convenience alias to ram_memdev_id backend memory region
diff --git a/include/hw/dma/pl080.h b/include/hw/dma/pl080.h
index 1883f04270..3c9659e438 100644
--- a/include/hw/dma/pl080.h
+++ b/include/hw/dma/pl080.h
@@ -10,11 +10,12 @@
* (at your option) any later version.
*/
-/* This is a model of the Arm PrimeCell PL080/PL081 DMA controller:
+/*
+ * This is a model of the Arm PrimeCell PL080/PL081 DMA controller:
* The PL080 TRM is:
- * http://infocenter.arm.com/help/topic/com.arm.doc.ddi0196g/DDI0196.pdf
+ * https://developer.arm.com/documentation/ddi0196/latest
* and the PL081 TRM is:
- * http://infocenter.arm.com/help/topic/com.arm.doc.ddi0218e/DDI0218.pdf
+ * https://developer.arm.com/documentation/ddi0218/latest
*
* QEMU interface:
* + sysbus IRQ 0: DMACINTR combined interrupt line
diff --git a/include/hw/misc/arm_integrator_debug.h b/include/hw/misc/arm_integrator_debug.h
index 0077dacb44..798b082164 100644
--- a/include/hw/misc/arm_integrator_debug.h
+++ b/include/hw/misc/arm_integrator_debug.h
@@ -3,7 +3,7 @@
*
* Browse the data sheet:
*
- * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0159b/Babbfijf.html
+ * https://developer.arm.com/documentation/dui0159/b/peripherals-and-interfaces/debug-leds-and-dip-switch-interface
*
* Copyright (c) 2013 Alex Bennée <alex@bennee.com>
*
diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h
new file mode 100644
index 0000000000..3dcf6aa51d
--- /dev/null
+++ b/include/hw/pci-host/remote.h
@@ -0,0 +1,30 @@
+/*
+ * PCI Host for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_PCIHOST_H
+#define REMOTE_PCIHOST_H
+
+#include "exec/memory.h"
+#include "hw/pci/pcie_host.h"
+
+#define TYPE_REMOTE_PCIHOST "remote-pcihost"
+OBJECT_DECLARE_SIMPLE_TYPE(RemotePCIHost, REMOTE_PCIHOST)
+
+struct RemotePCIHost {
+ /*< private >*/
+ PCIExpressHost parent_obj;
+ /*< public >*/
+
+ MemoryRegion *mr_pci_mem;
+ MemoryRegion *mr_sys_io;
+ MemoryRegion *mr_sys_mem;
+};
+
+#endif
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
index bd014823a9..5b03a7b0eb 100644
--- a/include/hw/pci-host/spapr.h
+++ b/include/hw/pci-host/spapr.h
@@ -210,4 +210,6 @@ static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb)
return sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;
}
+char *spapr_pci_fw_dev_name(PCIDevice *dev);
+
#endif /* PCI_HOST_SPAPR_H */
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 11f8ab7149..bd0c17dc78 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -192,6 +192,9 @@
#define PCI_DEVICE_ID_SUN_SIMBA 0x5000
#define PCI_DEVICE_ID_SUN_SABRE 0xa000
+#define PCI_VENDOR_ID_ORACLE 0x108e
+#define PCI_DEVICE_ID_REMOTE_IOHUB 0xb000
+
#define PCI_VENDOR_ID_CMD 0x1095
#define PCI_DEVICE_ID_CMD_646 0x0646
diff --git a/include/hw/ppc/pef.h b/include/hw/ppc/pef.h
new file mode 100644
index 0000000000..707dbe524c
--- /dev/null
+++ b/include/hw/ppc/pef.h
@@ -0,0 +1,17 @@
+/*
+ * PEF (Protected Execution Facility) for POWER support
+ *
+ * Copyright Red Hat.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HW_PPC_PEF_H
+#define HW_PPC_PEF_H
+
+int pef_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);
+int pef_kvm_reset(ConfidentialGuestSupport *cgs, Error **errp);
+
+#endif /* HW_PPC_PEF_H */
diff --git a/include/hw/ppc/pnv.h b/include/hw/ppc/pnv.h
index ee7eda3e01..d69cee17b2 100644
--- a/include/hw/ppc/pnv.h
+++ b/include/hw/ppc/pnv.h
@@ -58,6 +58,7 @@ struct PnvChip {
MemoryRegion xscom;
AddressSpace xscom_as;
+ MemoryRegion *fw_mr;
gchar *dt_isa_nodename;
};
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index c27c7ce515..ccbeeca1de 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -851,7 +851,6 @@ int spapr_max_server_number(SpaprMachineState *spapr);
void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
uint64_t pte0, uint64_t pte1);
void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
-bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr);
/* DRC callbacks. */
void spapr_core_release(DeviceState *dev);
diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h
index b3fd950634..6f9f02d3de 100644
--- a/include/hw/ppc/spapr_numa.h
+++ b/include/hw/ppc/spapr_numa.h
@@ -31,5 +31,6 @@ int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt,
int offset, PowerPCCPU *cpu);
int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt,
int offset);
+unsigned int spapr_numa_initial_nvgpu_numa_id(MachineState *machine);
#endif /* HW_SPAPR_NUMA_H */
diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
index 7879692825..b7fde2354e 100644
--- a/include/hw/ppc/xive_regs.h
+++ b/include/hw/ppc/xive_regs.h
@@ -236,6 +236,8 @@ typedef struct XiveEND {
(be32_to_cpu((end)->w0) & END_W0_UNCOND_ESCALATE)
#define xive_end_is_silent_escalation(end) \
(be32_to_cpu((end)->w0) & END_W0_SILENT_ESCALATE)
+#define xive_end_is_firmware(end) \
+ (be32_to_cpu((end)->w0) & END_W0_FIRMWARE)
static inline uint64_t xive_end_qaddr(XiveEND *end)
{
diff --git a/include/hw/remote/iohub.h b/include/hw/remote/iohub.h
new file mode 100644
index 0000000000..0bf98e0d78
--- /dev/null
+++ b/include/hw/remote/iohub.h
@@ -0,0 +1,42 @@
+/*
+ * IO Hub for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_IOHUB_H
+#define REMOTE_IOHUB_H
+
+#include "hw/pci/pci.h"
+#include "qemu/event_notifier.h"
+#include "qemu/thread-posix.h"
+#include "hw/remote/mpqemu-link.h"
+
+#define REMOTE_IOHUB_NB_PIRQS PCI_DEVFN_MAX
+
+typedef struct ResampleToken {
+ void *iohub;
+ int pirq;
+} ResampleToken;
+
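+/*
+ * Per-PIRQ eventfd state: each 'irqfds' entry is signalled to inject a
+ * guest interrupt for that PIRQ, and the matching 'resamplefds' entry is
+ * notified on EOI so that level-triggered interrupts can be re-asserted.
+ */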
+typedef struct RemoteIOHubState {
+ PCIDevice d;
+ EventNotifier irqfds[REMOTE_IOHUB_NB_PIRQS];
+ EventNotifier resamplefds[REMOTE_IOHUB_NB_PIRQS];
+ unsigned int irq_level[REMOTE_IOHUB_NB_PIRQS];
+ ResampleToken token[REMOTE_IOHUB_NB_PIRQS];
+ QemuMutex irq_level_lock[REMOTE_IOHUB_NB_PIRQS];
+} RemoteIOHubState;
+
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx);
+void remote_iohub_set_irq(void *opaque, int pirq, int level);
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg);
+
+void remote_iohub_init(RemoteIOHubState *iohub);
+void remote_iohub_finalize(RemoteIOHubState *iohub);
+
+#endif
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
new file mode 100644
index 0000000000..2a2a33c4b2
--- /dev/null
+++ b/include/hw/remote/machine.h
@@ -0,0 +1,38 @@
+/*
+ * Remote machine configuration
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_MACHINE_H
+#define REMOTE_MACHINE_H
+
+#include "qom/object.h"
+#include "hw/boards.h"
+#include "hw/pci-host/remote.h"
+#include "io/channel.h"
+#include "hw/remote/iohub.h"
+
+struct RemoteMachineState {
+ MachineState parent_obj;
+
+ RemotePCIHost *host;
+ RemoteIOHubState iohub;
+};
+
+/* Used to pass the device and ioc to the co-routine. */
+typedef struct RemoteCommDev {
+ PCIDevice *dev;
+ QIOChannel *ioc;
+} RemoteCommDev;
+
+#define TYPE_REMOTE_MACHINE "x-remote-machine"
+OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE)
+
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data);
+
+#endif
diff --git a/include/hw/remote/memory.h b/include/hw/remote/memory.h
new file mode 100644
index 0000000000..bc2e30945f
--- /dev/null
+++ b/include/hw/remote/memory.h
@@ -0,0 +1,19 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_MEMORY_H
+#define REMOTE_MEMORY_H
+
+#include "exec/hwaddr.h"
+#include "hw/remote/mpqemu-link.h"
+
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp);
+
+#endif
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
new file mode 100644
index 0000000000..4ec0915885
--- /dev/null
+++ b/include/hw/remote/mpqemu-link.h
@@ -0,0 +1,99 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef MPQEMU_LINK_H
+#define MPQEMU_LINK_H
+
+#include "qom/object.h"
+#include "qemu/thread.h"
+#include "io/channel.h"
+#include "exec/hwaddr.h"
+#include "io/channel-socket.h"
+#include "hw/remote/proxy.h"
+
+#define REMOTE_MAX_FDS 8
+
+#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data.u64)
+
+/**
+ * MPQemuCmd:
+ *
+ * MPQemuCmd enum type to specify the command to be executed on the remote
+ * device.
+ *
+ * This uses a private protocol between QEMU and the remote process. The
+ * vfio-user protocol is expected to supersede it in the future.
+ *
+ */
+typedef enum {
+ MPQEMU_CMD_SYNC_SYSMEM,
+ MPQEMU_CMD_RET,
+ MPQEMU_CMD_PCI_CFGWRITE,
+ MPQEMU_CMD_PCI_CFGREAD,
+ MPQEMU_CMD_BAR_WRITE,
+ MPQEMU_CMD_BAR_READ,
+ MPQEMU_CMD_SET_IRQFD,
+ MPQEMU_CMD_DEVICE_RESET,
+ MPQEMU_CMD_MAX,
+} MPQemuCmd;
+
+typedef struct {
+ hwaddr gpas[REMOTE_MAX_FDS];
+ uint64_t sizes[REMOTE_MAX_FDS];
+ off_t offsets[REMOTE_MAX_FDS];
+} SyncSysmemMsg;
+
+typedef struct {
+ uint32_t addr;
+ uint32_t val;
+ int len;
+} PciConfDataMsg;
+
+typedef struct {
+ hwaddr addr;
+ uint64_t val;
+ unsigned size;
+ bool memory;
+} BarAccessMsg;
+
+/**
+ * MPQemuMsg:
+ * @cmd: The remote command
+ * @size: Size of the data to be shared
+ * @data: Structured data
+ * @fds: File descriptors to be shared with remote device
+ * @num_fds: Number of file descriptors in @fds
+ *
+ * MPQemuMsg is the format of the message sent to the remote device from QEMU.
+ */
+typedef struct {
+ int cmd;
+ size_t size;
+
+ union {
+ uint64_t u64;
+ PciConfDataMsg pci_conf_data;
+ SyncSysmemMsg sync_sysmem;
+ BarAccessMsg bar_access;
+ } data;
+
+ int fds[REMOTE_MAX_FDS];
+ int num_fds;
+} MPQemuMsg;
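+
+/*
+ * A minimal sketch of issuing a command over the channel, mirroring
+ * proxy_device_reset() in hw/remote/proxy.c (pdev and local_err as in
+ * that function):
+ *
+ *     MPQemuMsg msg = { 0 };
+ *
+ *     msg.cmd = MPQEMU_CMD_DEVICE_RESET;
+ *     msg.size = 0;
+ *
+ *     mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+ */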
+
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
+
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
+ Error **errp);
+bool mpqemu_msg_valid(MPQemuMsg *msg);
+
+#endif
diff --git a/include/hw/remote/proxy-memory-listener.h b/include/hw/remote/proxy-memory-listener.h
new file mode 100644
index 0000000000..c4f3efb928
--- /dev/null
+++ b/include/hw/remote/proxy-memory-listener.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PROXY_MEMORY_LISTENER_H
+#define PROXY_MEMORY_LISTENER_H
+
+#include "exec/memory.h"
+#include "io/channel.h"
+
+typedef struct ProxyMemoryListener {
+ MemoryListener listener;
+
+ int n_mr_sections;
+ MemoryRegionSection *mr_sections;
+
+ QIOChannel *ioc;
+} ProxyMemoryListener;
+
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
+ QIOChannel *ioc);
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener);
+
+#endif
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
new file mode 100644
index 0000000000..741def71f1
--- /dev/null
+++ b/include/hw/remote/proxy.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PROXY_H
+#define PROXY_H
+
+#include "hw/pci/pci.h"
+#include "io/channel.h"
+#include "hw/remote/proxy-memory-listener.h"
+#include "qemu/event_notifier.h"
+
+#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
+OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
+
+typedef struct ProxyMemoryRegion {
+ PCIProxyDev *dev;
+ MemoryRegion mr;
+ bool memory;
+ bool present;
+ uint8_t type;
+} ProxyMemoryRegion;
+
+struct PCIProxyDev {
+ PCIDevice parent_dev;
+ char *fd;
+
+ /*
+ * Mutex used to protect the QIOChannel fd from concurrent access by
+ * the VCPUs, since the proxy blocks while awaiting replies from the
+ * remote process.
+ */
+ QemuMutex io_mutex;
+ QIOChannel *ioc;
+ Error *migration_blocker;
+ ProxyMemoryListener proxy_listener;
+ int virq;
+ EventNotifier intr;
+ EventNotifier resample;
+ ProxyMemoryRegion region[PCI_NUM_REGIONS];
+};
+
+#endif /* PROXY_H */
diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
index aee758bc2d..1f1f545bfc 100644
--- a/include/hw/s390x/pv.h
+++ b/include/hw/s390x/pv.h
@@ -12,6 +12,9 @@
#ifndef HW_S390_PV_H
#define HW_S390_PV_H
+#include "qapi/error.h"
+#include "sysemu/kvm.h"
+
#ifdef CONFIG_KVM
#include "cpu.h"
#include "hw/s390x/s390-virtio-ccw.h"
@@ -55,4 +58,18 @@ static inline void s390_pv_unshare(void) {}
static inline void s390_pv_inject_reset_error(CPUState *cs) {};
#endif /* CONFIG_KVM */
+int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);
+static inline int s390_pv_init(ConfidentialGuestSupport *cgs, Error **errp)
+{
+ if (!cgs) {
+ return 0;
+ }
+ if (kvm_enabled()) {
+ return s390_pv_kvm_init(cgs, errp);
+ }
+
+ error_setg(errp, "Protected Virtualization requires KVM");
+ return -1;
+}
+
#endif /* HW_S390_PV_H */
diff --git a/include/hw/ssi/pl022.h b/include/hw/ssi/pl022.h
index 545b52689c..25d58db5f3 100644
--- a/include/hw/ssi/pl022.h
+++ b/include/hw/ssi/pl022.h
@@ -9,9 +9,10 @@
* (at your option) any later version.
*/
-/* This is a model of the Arm PrimeCell PL022 synchronous serial port.
+/*
+ * This is a model of the Arm PrimeCell PL022 synchronous serial port.
* The PL022 TRM is:
- * http://infocenter.arm.com/help/topic/com.arm.doc.ddi0194h/DDI0194H_ssp_pl022_trm.pdf
+ * https://developer.arm.com/documentation/ddi0194/latest
*
* QEMU interface:
* + sysbus IRQ: SSPINTR combined interrupt line
diff --git a/include/io/channel.h b/include/io/channel.h
index ab9ea77959..88988979f8 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -777,4 +777,82 @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
IOHandler *io_write,
void *opaque);
+/**
+ * qio_channel_readv_full_all_eof:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data to
+ * @niov: the length of the @iov array
+ * @fds: an array of file handles to read
+ * @nfds: number of file handles in @fds
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Performs the same function as qio_channel_readv_all_eof.
+ * Additionally, it attempts to read file descriptors shared
+ * over the channel. The function will wait for all
+ * requested data to be read, yielding from the current
+ * coroutine if required. Here, "data" refers to both the
+ * file descriptors and the iovs.
+ *
+ * Returns: 1 if all bytes were read, 0 if end-of-file
+ * occurs without data, or -1 on error
+ */
+
+int qio_channel_readv_full_all_eof(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int **fds, size_t *nfds,
+ Error **errp);
+
+/**
+ * qio_channel_readv_full_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data to
+ * @niov: the length of the @iov array
+ * @fds: an array of file handles to read
+ * @nfds: number of file handles in @fds
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Performs the same function as qio_channel_readv_all.
+ * Additionally, it attempts to read file descriptors shared
+ * over the channel. The function will wait for all
+ * requested data to be read, yielding from the current
+ * coroutine if required. Here, "data" refers to both the
+ * file descriptors and the iovs.
+ *
+ * Returns: 0 if all bytes were read, or -1 on error
+ */
+
+int qio_channel_readv_full_all(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int **fds, size_t *nfds,
+ Error **errp);
+
+/**
+ * qio_channel_writev_full_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to write data from
+ * @niov: the length of the @iov array
+ * @fds: an array of file handles to send
+ * @nfds: number of file handles in @fds
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Behaves like qio_channel_writev_full but will attempt
+ * to send all data passed (file handles and memory regions).
+ * The function will wait for all requested data
+ * to be written, yielding from the current coroutine
+ * if required.
+ *
+ * Returns: 0 if all bytes were written, or -1 on error
+ */
+
+int qio_channel_writev_full_all(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int *fds, size_t nfds,
+ Error **errp);
+
#endif /* QIO_CHANNEL_H */
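A minimal sketch of how the new fd-passing variants pair up over a channel
that supports SCM_RIGHTS (e.g. a UNIX socket channel); the helper names here
are illustrative, not part of this patch.

    /* Send one file descriptor alongside a payload buffer. */
    static int send_fd_with_payload(QIOChannel *ioc, int fd,
                                    void *buf, size_t len, Error **errp)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = len };

        return qio_channel_writev_full_all(ioc, &iov, 1, &fd, 1, errp);
    }

    /* Receive the payload plus any file descriptors shared by the peer. */
    static int recv_fd_with_payload(QIOChannel *ioc, int **fds, size_t *nfds,
                                    void *buf, size_t len, Error **errp)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = len };

        return qio_channel_readv_full_all(ioc, &iov, 1, fds, nfds, errp);
    }

On success the receiver owns the array returned in *fds and must close each
descriptor and g_free() the array; the error path in io/channel.c below
already performs this cleanup when a partial read fails.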
diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h
index c85b6ec75b..e72083b117 100644
--- a/include/migration/snapshot.h
+++ b/include/migration/snapshot.h
@@ -15,7 +15,50 @@
#ifndef QEMU_MIGRATION_SNAPSHOT_H
#define QEMU_MIGRATION_SNAPSHOT_H
-int save_snapshot(const char *name, Error **errp);
-int load_snapshot(const char *name, Error **errp);
+#include "qapi/qapi-builtin-types.h"
+
+/**
+ * save_snapshot: Save an internal snapshot.
+ * @name: name of internal snapshot
+ * @overwrite: replace an existing snapshot with the same @name
+ * @vmstate: blockdev node name to store VM state in
+ * @has_devices: whether to use explicit device list
+ * @devices: explicit device list to snapshot
+ * @errp: pointer to error object
+ * On success, return %true.
+ * On failure, store an error through @errp and return %false.
+ */
+bool save_snapshot(const char *name, bool overwrite,
+ const char *vmstate,
+ bool has_devices, strList *devices,
+ Error **errp);
+
+/**
+ * load_snapshot: Load an internal snapshot.
+ * @name: name of internal snapshot
+ * @vmstate: blockdev node name to load VM state from
+ * @has_devices: whether to use explicit device list
+ * @devices: explicit device list to snapshot
+ * @errp: pointer to error object
+ * On success, return %true.
+ * On failure, store an error through @errp and return %false.
+ */
+bool load_snapshot(const char *name,
+ const char *vmstate,
+ bool has_devices, strList *devices,
+ Error **errp);
+
+/**
+ * delete_snapshot: Delete a snapshot.
+ * @name: path to snapshot
+ * @has_devices: whether to use explicit device list
+ * @devices: explicit device list to snapshot
+ * @errp: pointer to error object
+ * On success, return %true.
+ * On failure, store an error through @errp and return %false.
+ */
+bool delete_snapshot(const char *name,
+ bool has_devices, strList *devices,
+ Error **errp);
#endif
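A sketch of a caller using the extended snapshot API (not part of this
patch); passing NULL for @vmstate and false/NULL for the device list keeps
the historical "default vmstate block, all devices" behaviour.

    static void snapshot_roundtrip_sketch(Error **errp)
    {
        /* Take (or replace) an internal snapshot named "checkpoint1". */
        if (!save_snapshot("checkpoint1", true, NULL, false, NULL, errp)) {
            return;
        }
        /* Roll the VM back to it... */
        if (!load_snapshot("checkpoint1", NULL, false, NULL, errp)) {
            return;
        }
        /* ...and drop it once it is no longer needed. */
        delete_snapshot("checkpoint1", false, NULL, errp);
    }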
diff --git a/include/qemu/event_notifier.h b/include/qemu/event_notifier.h
index 3380b662f3..b79add035d 100644
--- a/include/qemu/event_notifier.h
+++ b/include/qemu/event_notifier.h
@@ -24,6 +24,7 @@ struct EventNotifier {
#else
int rfd;
int wfd;
+ bool initialized;
#endif
};
diff --git a/include/qemu/fifo8.h b/include/qemu/fifo8.h
index 489c354291..28bf2cee57 100644
--- a/include/qemu/fifo8.h
+++ b/include/qemu/fifo8.h
@@ -148,12 +148,16 @@ uint32_t fifo8_num_used(Fifo8 *fifo);
extern const VMStateDescription vmstate_fifo8;
-#define VMSTATE_FIFO8(_field, _state) { \
- .name = (stringify(_field)), \
- .size = sizeof(Fifo8), \
- .vmsd = &vmstate_fifo8, \
- .flags = VMS_STRUCT, \
- .offset = vmstate_offset_value(_state, _field, Fifo8), \
+#define VMSTATE_FIFO8_TEST(_field, _state, _test) { \
+ .name = (stringify(_field)), \
+ .field_exists = (_test), \
+ .size = sizeof(Fifo8), \
+ .vmsd = &vmstate_fifo8, \
+ .flags = VMS_STRUCT, \
+ .offset = vmstate_offset_value(_state, _field, Fifo8), \
}
+#define VMSTATE_FIFO8(_field, _state) \
+ VMSTATE_FIFO8_TEST(_field, _state, NULL)
+
#endif /* QEMU_FIFO8_H */
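The new _TEST variant lets a device make its FIFO conditionally present in
the migration stream through the standard field_exists hook. A sketch with a
hypothetical device state follows; only VMSTATE_FIFO8_TEST itself comes from
this patch.

    typedef struct ExampleState {
        bool fifo_enabled;
        Fifo8 rx_fifo;
    } ExampleState;

    static bool example_fifo_needed(void *opaque, int version_id)
    {
        ExampleState *s = opaque;

        return s->fifo_enabled;
    }

    static const VMStateDescription vmstate_example = {
        .name = "example-device",
        .version_id = 1,
        .minimum_version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_FIFO8_TEST(rx_fifo, ExampleState, example_fifo_needed),
            VMSTATE_END_OF_LIST()
        }
    };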
diff --git a/include/qemu/job.h b/include/qemu/job.h
index 32aabb1c60..efc6fa7544 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -251,6 +251,11 @@ struct JobDriver {
*/
void (*clean)(Job *job);
+ /**
+ * If the callback is not NULL, it will be invoked in job_cancel_async().
+ */
+ void (*cancel)(Job *job);
+
/** Called when the job is freed */
void (*free)(Job *job);
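A sketch of a job driver wiring up the new hook so that job_cancel_async()
can interrupt in-flight work; the ExampleJob type and the abort helper are
hypothetical, not part of this patch.

    typedef struct ExampleJob {
        Job common;
        /* driver-specific state ... */
    } ExampleJob;

    static void example_job_cancel(Job *job)
    {
        ExampleJob *s = container_of(job, ExampleJob, common);

        /* Kick blocked I/O so the job coroutine notices the cancellation. */
        example_abort_inflight_requests(s);    /* hypothetical helper */
    }

    static const JobDriver example_job_driver = {
        .instance_size = sizeof(ExampleJob),
        .cancel        = example_job_cancel,
    };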
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index 8b7a5c70f3..456ff87df1 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -17,6 +17,7 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
* @readonly: true for a read-only mapping, false for read/write.
* @shared: map has RAM_SHARED flag.
* @is_pmem: map has RAM_PMEM flag.
+ * @map_offset: map starts at offset @map_offset from the start of @fd
*
* Return:
* On success, return a pointer to the mapped area.
@@ -27,7 +28,8 @@ void *qemu_ram_mmap(int fd,
size_t align,
bool readonly,
bool shared,
- bool is_pmem);
+ bool is_pmem,
+ off_t map_offset);
void qemu_ram_munmap(int fd, void *ptr, size_t size);
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 68deb74ef6..dc39b05c30 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -37,6 +37,7 @@ typedef struct Chardev Chardev;
typedef struct Clock Clock;
typedef struct CompatProperty CompatProperty;
typedef struct CoMutex CoMutex;
+typedef struct ConfidentialGuestSupport ConfidentialGuestSupport;
typedef struct CPUAddressSpace CPUAddressSpace;
typedef struct CPUState CPUState;
typedef struct DeviceListener DeviceListener;
diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
new file mode 100644
index 0000000000..6b74f92792
--- /dev/null
+++ b/include/qemu/userfaultfd.h
@@ -0,0 +1,35 @@
+/*
+ * Linux UFFD-WP support
+ *
+ * Copyright Virtuozzo GmbH, 2020
+ *
+ * Authors:
+ * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef USERFAULTFD_H
+#define USERFAULTFD_H
+
+#include "qemu/osdep.h"
+#include "exec/hwaddr.h"
+#include <linux/userfaultfd.h>
+
+int uffd_query_features(uint64_t *features);
+int uffd_create_fd(uint64_t features, bool non_blocking);
+void uffd_close_fd(int uffd_fd);
+int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
+ uint64_t mode, uint64_t *ioctls);
+int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length);
+int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
+ bool wp, bool dont_wake);
+int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
+ uint64_t length, bool dont_wake);
+int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake);
+int uffd_wakeup(int uffd_fd, void *addr, uint64_t length);
+int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count);
+bool uffd_poll_events(int uffd_fd, int tmo);
+
+#endif /* USERFAULTFD_H */
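A sketch of the intended write-protect workflow using only the helpers
declared above; UFFD_FEATURE_PAGEFAULT_FLAG_WP and UFFDIO_REGISTER_MODE_WP
come from linux/userfaultfd.h, and error handling is abbreviated.

    static int wp_roundtrip_sketch(void *addr, uint64_t length)
    {
        uint64_t features;
        int uffd;

        if (uffd_query_features(&features) < 0 ||
            !(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
            return -1;
        }
        uffd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
        if (uffd < 0) {
            return -1;
        }
        /* Register the range, then arm write protection on it. */
        if (uffd_register_memory(uffd, addr, length,
                                 UFFDIO_REGISTER_MODE_WP, NULL) < 0 ||
            uffd_change_protection(uffd, addr, length, true, false) < 0) {
            uffd_close_fd(uffd);
            return -1;
        }
        /* ... save pages while faults arrive via uffd_read_events() ... */
        uffd_change_protection(uffd, addr, length, false, false);
        uffd_unregister_memory(uffd, addr, length);
        uffd_close_fd(uffd);
        return 0;
    }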
diff --git a/include/qom/object.h b/include/qom/object.h
index d378f13a11..6721cd312e 100644
--- a/include/qom/object.h
+++ b/include/qom/object.h
@@ -638,7 +638,8 @@ bool object_apply_global_props(Object *obj, const GPtrArray *props,
Error **errp);
void object_set_machine_compat_props(GPtrArray *compat_props);
void object_set_accelerator_compat_props(GPtrArray *compat_props);
-void object_register_sugar_prop(const char *driver, const char *prop, const char *value);
+void object_register_sugar_prop(const char *driver, const char *prop,
+ const char *value, bool optional);
void object_apply_compat_props(Object *obj);
/**
diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
index 0c5284dbbc..f177142f16 100644
--- a/include/sysemu/iothread.h
+++ b/include/sysemu/iothread.h
@@ -57,4 +57,10 @@ IOThread *iothread_create(const char *id, Error **errp);
void iothread_stop(IOThread *iothread);
void iothread_destroy(IOThread *iothread);
+/*
+ * Returns true if executing within an IOThread context,
+ * false otherwise.
+ */
+bool qemu_in_iothread(void);
+
#endif /* IOTHREAD_H */
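A trivial usage sketch (not part of this patch): code that must only run in
the main loop context can now assert it.

    static void main_loop_only_work_sketch(void)
    {
        /* This path is not safe to run from an IOThread's AioContext. */
        assert(!qemu_in_iothread());
        /* ... */
    }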
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 739682f3c3..c5546bdecc 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -233,22 +233,6 @@ int kvm_has_intx_set_mask(void);
*/
bool kvm_arm_supports_user_irq(void);
-/**
- * kvm_memcrypt_enabled - return boolean indicating whether memory encryption
- * is enabled
- * Returns: 1 memory encryption is enabled
- * 0 memory encryption is disabled
- */
-bool kvm_memcrypt_enabled(void);
-
-/**
- * kvm_memcrypt_encrypt_data: encrypt the memory range
- *
- * Return: 1 failed to encrypt the range
- * 0 succesfully encrypted memory region
- */
-int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len);
-
#ifdef NEED_CPU_H
#include "cpu.h"
diff --git a/include/sysemu/sev.h b/include/sysemu/sev.h
index 7ab6e3e31d..5c5a13c6ca 100644
--- a/include/sysemu/sev.h
+++ b/include/sysemu/sev.h
@@ -16,8 +16,8 @@
#include "sysemu/kvm.h"
-void *sev_guest_init(const char *id);
-int sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len);
+int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);
+int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp);
int sev_inject_launch_secret(const char *hdr, const char *secret,
uint64_t gpa, Error **errp);
#endif
diff --git a/io/channel.c b/io/channel.c
index 93d449dee2..e8b019dc36 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -92,19 +92,47 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
size_t niov,
Error **errp)
{
+ return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
+}
+
+int qio_channel_readv_all(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ Error **errp)
+{
+ return qio_channel_readv_full_all(ioc, iov, niov, NULL, NULL, errp);
+}
+
+int qio_channel_readv_full_all_eof(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int **fds, size_t *nfds,
+ Error **errp)
+{
int ret = -1;
struct iovec *local_iov = g_new(struct iovec, niov);
struct iovec *local_iov_head = local_iov;
unsigned int nlocal_iov = niov;
+ int **local_fds = fds;
+ size_t *local_nfds = nfds;
bool partial = false;
+ if (nfds) {
+ *nfds = 0;
+ }
+
+ if (fds) {
+ *fds = NULL;
+ }
+
nlocal_iov = iov_copy(local_iov, nlocal_iov,
iov, niov,
0, iov_size(iov, niov));
- while (nlocal_iov > 0) {
+ while ((nlocal_iov > 0) || local_fds) {
ssize_t len;
- len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp);
+ len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
+ local_nfds, errp);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(ioc, G_IO_IN);
@@ -112,20 +140,50 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
qio_channel_wait(ioc, G_IO_IN);
}
continue;
- } else if (len < 0) {
- goto cleanup;
- } else if (len == 0) {
- if (partial) {
- error_setg(errp,
- "Unexpected end-of-file before all bytes were read");
- } else {
+ }
+
+ if (len == 0) {
+ if (local_nfds && *local_nfds) {
+ /*
+ * Got some FDs, but no data yet. This isn't an EOF
+ * scenario (yet), so carry on to try to read data
+ * on next loop iteration
+ */
+ goto next_iter;
+ } else if (!partial) {
+ /* No fds and no data - EOF before any data read */
ret = 0;
+ goto cleanup;
+ } else {
+ len = -1;
+ error_setg(errp,
+ "Unexpected end-of-file before all data were read");
+ /* Fallthrough into len < 0 handling */
+ }
+ }
+
+ if (len < 0) {
+ /* Close any FDs we previously received */
+ if (nfds && fds) {
+ size_t i;
+ for (i = 0; i < (*nfds); i++) {
+ close((*fds)[i]);
+ }
+ g_free(*fds);
+ *fds = NULL;
+ *nfds = 0;
}
goto cleanup;
}
+ if (nlocal_iov) {
+ iov_discard_front(&local_iov, &nlocal_iov, len);
+ }
+
+next_iter:
partial = true;
- iov_discard_front(&local_iov, &nlocal_iov, len);
+ local_fds = NULL;
+ local_nfds = NULL;
}
ret = 1;
@@ -135,20 +193,22 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
return ret;
}
-int qio_channel_readv_all(QIOChannel *ioc,
- const struct iovec *iov,
- size_t niov,
- Error **errp)
+int qio_channel_readv_full_all(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int **fds, size_t *nfds,
+ Error **errp)
{
- int ret = qio_channel_readv_all_eof(ioc, iov, niov, errp);
+ int ret = qio_channel_readv_full_all_eof(ioc, iov, niov, fds, nfds, errp);
if (ret == 0) {
- ret = -1;
- error_setg(errp,
- "Unexpected end-of-file before all bytes were read");
- } else if (ret == 1) {
- ret = 0;
+ error_setg(errp, "Unexpected end-of-file before all data were read");
+ return -1;
+ }
+ if (ret == 1) {
+ return 0;
}
+
return ret;
}
@@ -157,6 +217,15 @@ int qio_channel_writev_all(QIOChannel *ioc,
size_t niov,
Error **errp)
{
+ return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp);
+}
+
+int qio_channel_writev_full_all(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int *fds, size_t nfds,
+ Error **errp)
+{
int ret = -1;
struct iovec *local_iov = g_new(struct iovec, niov);
struct iovec *local_iov_head = local_iov;
@@ -168,7 +237,8 @@ int qio_channel_writev_all(QIOChannel *ioc,
while (nlocal_iov > 0) {
ssize_t len;
- len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp);
+ len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds,
+ errp);
if (len == QIO_CHANNEL_ERR_BLOCK) {
if (qemu_in_coroutine()) {
qio_channel_yield(ioc, G_IO_OUT);
@@ -182,6 +252,9 @@ int qio_channel_writev_all(QIOChannel *ioc,
}
iov_discard_front(&local_iov, &nlocal_iov, len);
+
+ fds = NULL;
+ nfds = 0;
}
ret = 0;
diff --git a/iothread.c b/iothread.c
index b9f2751382..7f086387be 100644
--- a/iothread.c
+++ b/iothread.c
@@ -369,3 +369,9 @@ IOThread *iothread_by_id(const char *id)
{
return IOTHREAD(object_resolve_path_type(id, TYPE_IOTHREAD, NULL));
}
+
+bool qemu_in_iothread(void)
+{
+ return qemu_get_current_aio_context() != qemu_get_aio_context();
+}
diff --git a/job.c b/job.c
index 3aaaebafe2..289edee143 100644
--- a/job.c
+++ b/job.c
@@ -715,6 +715,9 @@ static int job_finalize_single(Job *job)
static void job_cancel_async(Job *job, bool force)
{
+ if (job->driver->cancel) {
+ job->driver->cancel(job);
+ }
if (job->user_paused) {
/* Do not call job_enter here, the caller will handle it. */
if (job->driver->user_resume) {
diff --git a/memory_ldst.c.inc b/memory_ldst.c.inc
index 2fed2de18e..b56e961967 100644
--- a/memory_ldst.c.inc
+++ b/memory_ldst.c.inc
@@ -42,7 +42,7 @@ static inline uint32_t glue(address_space_ldl_internal, SUFFIX)(ARG1_DECL,
MO_32 | devend_memop(endian), attrs);
} else {
/* RAM case */
- fuzz_dma_read_cb(addr, 4, mr, false);
+ fuzz_dma_read_cb(addr, 4, mr);
ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
switch (endian) {
case DEVICE_LITTLE_ENDIAN:
@@ -111,7 +111,7 @@ static inline uint64_t glue(address_space_ldq_internal, SUFFIX)(ARG1_DECL,
MO_64 | devend_memop(endian), attrs);
} else {
/* RAM case */
- fuzz_dma_read_cb(addr, 8, mr, false);
+ fuzz_dma_read_cb(addr, 8, mr);
ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
switch (endian) {
case DEVICE_LITTLE_ENDIAN:
@@ -177,7 +177,7 @@ uint32_t glue(address_space_ldub, SUFFIX)(ARG1_DECL,
r = memory_region_dispatch_read(mr, addr1, &val, MO_8, attrs);
} else {
/* RAM case */
- fuzz_dma_read_cb(addr, 1, mr, false);
+ fuzz_dma_read_cb(addr, 1, mr);
ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
val = ldub_p(ptr);
r = MEMTX_OK;
@@ -215,7 +215,7 @@ static inline uint32_t glue(address_space_lduw_internal, SUFFIX)(ARG1_DECL,
MO_16 | devend_memop(endian), attrs);
} else {
/* RAM case */
- fuzz_dma_read_cb(addr, 2, mr, false);
+ fuzz_dma_read_cb(addr, 2, mr);
ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
switch (endian) {
case DEVICE_LITTLE_ENDIAN:
diff --git a/meson.build b/meson.build
index 2d8b433ff0..a923f249d8 100644
--- a/meson.build
+++ b/meson.build
@@ -18,6 +18,9 @@ config_host = keyval.load(meson.current_build_dir() / 'config-host.mak')
enable_modules = 'CONFIG_MODULES' in config_host
enable_static = 'CONFIG_STATIC' in config_host
+# Allow both shared and static libraries unless --enable-static
+static_kwargs = enable_static ? {'static': true} : {}
+
# Temporary directory used for files created while
# configure runs. Since it is in the build directory
# we can safely blow away any previous version of it
@@ -224,10 +227,17 @@ tcg_arch = config_host['ARCH']
if not get_option('tcg').disabled()
if cpu not in supported_cpus
if get_option('tcg_interpreter')
- warning('Unsupported CPU @0@, will use TCG with TCI (experimental)'.format(cpu))
+ warning('Unsupported CPU @0@, will use TCG with TCI (experimental and slow)'.format(cpu))
else
error('Unsupported CPU @0@, try --enable-tcg-interpreter'.format(cpu))
endif
+ elif get_option('tcg_interpreter')
+ warning('Use of the TCG interpreter is not recommended on this host')
+ warning('architecture. There is a native TCG execution backend available')
+ warning('which provides substantially better performance and reliability.')
+ warning('It is strongly recommended to remove the --enable-tcg-interpreter')
+ warning('configuration option on this architecture to use the native')
+ warning('backend.')
endif
if get_option('tcg_interpreter')
tcg_arch = 'tci'
@@ -311,14 +321,14 @@ endif
pixman = not_found
if have_system or have_tools
pixman = dependency('pixman-1', required: have_system, version:'>=0.21.8',
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
endif
pam = not_found
if 'CONFIG_AUTH_PAM' in config_host
pam = cc.find_library('pam')
endif
libaio = cc.find_library('aio', required: false)
-zlib = dependency('zlib', required: true, static: enable_static)
+zlib = dependency('zlib', required: true, kwargs: static_kwargs)
linux_io_uring = not_found
if 'CONFIG_LINUX_IO_URING' in config_host
linux_io_uring = declare_dependency(compile_args: config_host['LINUX_IO_URING_CFLAGS'].split(),
@@ -333,7 +343,7 @@ libnfs = not_found
if not get_option('libnfs').auto() or have_block
libnfs = dependency('libnfs', version: '>=1.9.3',
required: get_option('libnfs'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
endif
libattr_test = '''
@@ -354,7 +364,7 @@ if not get_option('attr').disabled()
else
libattr = cc.find_library('attr', has_headers: ['attr/xattr.h'],
required: get_option('attr'),
- static: enable_static)
+ kwargs: static_kwargs)
if libattr.found() and not \
cc.links(libattr_test, dependencies: libattr, args: '-DCONFIG_LIBATTR')
libattr = not_found
@@ -381,14 +391,14 @@ seccomp = not_found
if not get_option('seccomp').auto() or have_system or have_tools
seccomp = dependency('libseccomp', version: '>=2.3.0',
required: get_option('seccomp'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
endif
libcap_ng = not_found
if not get_option('cap_ng').auto() or have_system or have_tools
libcap_ng = cc.find_library('cap-ng', has_headers: ['cap-ng.h'],
required: get_option('cap_ng'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
if libcap_ng.found() and not cc.links('''
#include <cap-ng.h>
@@ -409,7 +419,7 @@ if get_option('xkbcommon').auto() and not have_system and not have_tools
xkbcommon = not_found
else
xkbcommon = dependency('xkbcommon', required: get_option('xkbcommon'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
endif
vde = not_found
if config_host.has_key('CONFIG_VDE')
@@ -445,13 +455,13 @@ libiscsi = not_found
if not get_option('libiscsi').auto() or have_block
libiscsi = dependency('libiscsi', version: '>=1.9.0',
required: get_option('libiscsi'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
endif
zstd = not_found
if not get_option('zstd').auto() or have_block
zstd = dependency('libzstd', version: '>=1.4.0',
required: get_option('zstd'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
endif
gbm = not_found
if 'CONFIG_GBM' in config_host
@@ -468,14 +478,14 @@ if not get_option('curl').auto() or have_block
curl = dependency('libcurl', version: '>=7.29.0',
method: 'pkg-config',
required: get_option('curl'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
libudev = not_found
if targetos == 'linux' and (have_system or have_tools)
libudev = dependency('libudev',
method: 'pkg-config',
required: get_option('libudev'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
mpathlibs = [libudev]
@@ -511,17 +521,17 @@ if targetos == 'linux' and have_tools and not get_option('mpath').disabled()
}'''
libmpathpersist = cc.find_library('mpathpersist',
required: get_option('mpath'),
- static: enable_static)
+ kwargs: static_kwargs)
if libmpathpersist.found()
mpathlibs += libmpathpersist
if enable_static
mpathlibs += cc.find_library('devmapper',
required: get_option('mpath'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
mpathlibs += cc.find_library('multipath',
required: get_option('mpath'),
- static: enable_static)
+ kwargs: static_kwargs)
foreach lib: mpathlibs
if not lib.found()
mpathlibs = []
@@ -571,7 +581,7 @@ if have_system and not get_option('curses').disabled()
curses = dependency(curses_dep,
required: false,
method: 'pkg-config',
- static: enable_static)
+ kwargs: static_kwargs)
endif
endforeach
msg = get_option('curses').enabled() ? 'curses library not found' : ''
@@ -596,7 +606,7 @@ if have_system and not get_option('curses').disabled()
foreach curses_libname : curses_libname_list
libcurses = cc.find_library(curses_libname,
required: false,
- static: enable_static)
+ kwargs: static_kwargs)
if libcurses.found()
if cc.links(curses_test, args: curses_compile_args, dependencies: libcurses)
curses = declare_dependency(compile_args: curses_compile_args,
@@ -647,7 +657,7 @@ brlapi = not_found
if not get_option('brlapi').auto() or have_system
brlapi = cc.find_library('brlapi', has_headers: ['brlapi.h'],
required: get_option('brlapi'),
- static: enable_static)
+ kwargs: static_kwargs)
if brlapi.found() and not cc.links('''
#include <brlapi.h>
#include <stddef.h>
@@ -663,7 +673,7 @@ endif
sdl = not_found
if not get_option('sdl').auto() or (have_system and not cocoa.found())
- sdl = dependency('sdl2', required: get_option('sdl'), static: enable_static)
+ sdl = dependency('sdl2', required: get_option('sdl'), kwargs: static_kwargs)
sdl_image = not_found
endif
if sdl.found()
@@ -671,7 +681,7 @@ if sdl.found()
sdl = declare_dependency(compile_args: '-Wno-undef',
dependencies: sdl)
sdl_image = dependency('SDL2_image', required: get_option('sdl_image'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
else
if get_option('sdl_image').enabled()
error('sdl-image required, but SDL was @0@'.format(
@@ -683,19 +693,25 @@ endif
rbd = not_found
if not get_option('rbd').auto() or have_block
librados = cc.find_library('rados', required: get_option('rbd'),
- static: enable_static)
+ kwargs: static_kwargs)
librbd = cc.find_library('rbd', has_headers: ['rbd/librbd.h'],
required: get_option('rbd'),
- static: enable_static)
- if librados.found() and librbd.found() and cc.links('''
- #include <stdio.h>
- #include <rbd/librbd.h>
- int main(void) {
- rados_t cluster;
- rados_create(&cluster, NULL);
- return 0;
- }''', dependencies: [librbd, librados])
- rbd = declare_dependency(dependencies: [librbd, librados])
+ kwargs: static_kwargs)
+ if librados.found() and librbd.found()
+ if cc.links('''
+ #include <stdio.h>
+ #include <rbd/librbd.h>
+ int main(void) {
+ rados_t cluster;
+ rados_create(&cluster, NULL);
+ return 0;
+ }''', dependencies: [librbd, librados])
+ rbd = declare_dependency(dependencies: [librbd, librados])
+ elif get_option('rbd').enabled()
+ error('could not link librados')
+ else
+ warning('could not link librados, disabling')
+ endif
endif
endif
@@ -705,7 +721,7 @@ glusterfs_iocb_has_stat = false
if not get_option('glusterfs').auto() or have_block
glusterfs = dependency('glusterfs-api', version: '>=3',
required: get_option('glusterfs'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
if glusterfs.found()
glusterfs_ftruncate_has_stat = cc.links('''
#include <glusterfs/api/glfs.h>
@@ -744,7 +760,7 @@ libbzip2 = not_found
if not get_option('bzip2').auto() or have_block
libbzip2 = cc.find_library('bz2', has_headers: ['bzlib.h'],
required: get_option('bzip2'),
- static: enable_static)
+ kwargs: static_kwargs)
if libbzip2.found() and not cc.links('''
#include <bzlib.h>
int main(void) { BZ2_bzlibVersion(); return 0; }''', dependencies: libbzip2)
@@ -761,7 +777,7 @@ liblzfse = not_found
if not get_option('lzfse').auto() or have_block
liblzfse = cc.find_library('lzfse', has_headers: ['lzfse.h'],
required: get_option('lzfse'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
if liblzfse.found() and not cc.links('''
#include <lzfse.h>
@@ -798,12 +814,12 @@ if not get_option('gtk').auto() or (have_system and not cocoa.found())
gtk = dependency('gtk+-3.0', version: '>=3.22.0',
method: 'pkg-config',
required: get_option('gtk'),
- static: enable_static)
+ kwargs: static_kwargs)
if gtk.found()
gtkx11 = dependency('gtk+-x11-3.0', version: '>=3.22.0',
method: 'pkg-config',
required: false,
- static: enable_static)
+ kwargs: static_kwargs)
gtk = declare_dependency(dependencies: [gtk, gtkx11])
endif
endif
@@ -816,7 +832,7 @@ endif
x11 = not_found
if gtkx11.found() or 'lm32-softmmu' in target_dirs
x11 = dependency('x11', method: 'pkg-config', required: gtkx11.found(),
- static: enable_static)
+ kwargs: static_kwargs)
endif
vnc = not_found
png = not_found
@@ -825,12 +841,12 @@ sasl = not_found
if get_option('vnc').enabled()
vnc = declare_dependency() # dummy dependency
png = dependency('libpng', required: get_option('vnc_png'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
jpeg = dependency('libjpeg', required: get_option('vnc_jpeg'),
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
sasl = cc.find_library('sasl2', has_headers: ['sasl/sasl.h'],
required: get_option('vnc_sasl'),
- static: enable_static)
+ kwargs: static_kwargs)
if sasl.found()
sasl = declare_dependency(dependencies: sasl,
compile_args: '-DSTRUCT_IOVEC_DEFINED')
@@ -841,7 +857,7 @@ snappy = not_found
if not get_option('snappy').auto() or have_system
snappy = cc.find_library('snappy', has_headers: ['snappy-c.h'],
required: get_option('snappy'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
if snappy.found() and not cc.links('''
#include <snappy-c.h>
@@ -858,7 +874,7 @@ lzo = not_found
if not get_option('lzo').auto() or have_system
lzo = cc.find_library('lzo2', has_headers: ['lzo/lzo1x.h'],
required: get_option('lzo'),
- static: enable_static)
+ kwargs: static_kwargs)
endif
if lzo.found() and not cc.links('''
#include <lzo/lzo1x.h>
@@ -893,7 +909,7 @@ u2f = not_found
if have_system
u2f = dependency('u2f-emu', required: get_option('u2f'),
method: 'pkg-config',
- static: enable_static)
+ kwargs: static_kwargs)
endif
usbredir = not_found
if 'CONFIG_USB_REDIR' in config_host
@@ -920,7 +936,7 @@ if 'CONFIG_TASN1' in config_host
link_args: config_host['TASN1_LIBS'].split())
endif
keyutils = dependency('libkeyutils', required: false,
- method: 'pkg-config', static: enable_static)
+ method: 'pkg-config', kwargs: static_kwargs)
has_gettid = cc.has_function('gettid')
@@ -979,7 +995,7 @@ endif
fuse = dependency('fuse3', required: get_option('fuse'),
version: '>=3.1', method: 'pkg-config',
- static: enable_static)
+ kwargs: static_kwargs)
fuse_lseek = not_found
if not get_option('fuse_lseek').disabled()
@@ -1210,7 +1226,8 @@ host_kconfig = \
('CONFIG_VHOST_KERNEL' in config_host ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
(have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
- ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : [])
+ ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + \
+ ('CONFIG_MULTIPROCESS_ALLOWED' in config_host ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : [])
ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
@@ -1367,7 +1384,7 @@ capstone_opt = get_option('capstone')
if capstone_opt in ['enabled', 'auto', 'system']
have_internal = fs.exists(meson.current_source_dir() / 'capstone/Makefile')
capstone = dependency('capstone', version: '>=4.0',
- static: enable_static, method: 'pkg-config',
+ kwargs: static_kwargs, method: 'pkg-config',
required: capstone_opt == 'system' or
capstone_opt == 'enabled' and not have_internal)
if capstone.found()
@@ -1477,7 +1494,7 @@ if have_system
slirp_opt = get_option('slirp')
if slirp_opt in ['enabled', 'auto', 'system']
have_internal = fs.exists(meson.current_source_dir() / 'slirp/meson.build')
- slirp = dependency('slirp', static: enable_static,
+ slirp = dependency('slirp', kwargs: static_kwargs,
method: 'pkg-config',
required: slirp_opt == 'system' or
slirp_opt == 'enabled' and not have_internal)
@@ -1556,7 +1573,7 @@ fdt_opt = get_option('fdt')
if have_system
if fdt_opt in ['enabled', 'auto', 'system']
have_internal = fs.exists(meson.current_source_dir() / 'dtc/libfdt/Makefile.libfdt')
- fdt = cc.find_library('fdt', static: enable_static,
+ fdt = cc.find_library('fdt', kwargs: static_kwargs,
required: fdt_opt == 'system' or
fdt_opt == 'enabled' and not have_internal)
if fdt.found() and cc.links('''
@@ -1725,10 +1742,11 @@ target_softmmu_arch = {}
# TODO: add each directory to the subdirs from its own meson.build, once
# we have those
trace_events_subdirs = [
- 'accel/kvm',
- 'accel/tcg',
'crypto',
+ 'qapi',
+ 'qom',
'monitor',
+ 'util',
]
if have_user
trace_events_subdirs += [ 'linux-user' ]
@@ -1744,6 +1762,7 @@ if have_block
endif
if have_system
trace_events_subdirs += [
+ 'accel/kvm',
'audio',
'backends',
'backends/tpm',
@@ -1799,23 +1818,24 @@ if have_system
'net',
'softmmu',
'ui',
+ 'hw/remote',
+ ]
+endif
+if have_system or have_user
+ trace_events_subdirs += [
+ 'accel/tcg',
+ 'hw/core',
+ 'target/arm',
+ 'target/hppa',
+ 'target/i386',
+ 'target/i386/kvm',
+ 'target/mips',
+ 'target/ppc',
+ 'target/riscv',
+ 'target/s390x',
+ 'target/sparc',
]
endif
-trace_events_subdirs += [
- 'hw/core',
- 'qapi',
- 'qom',
- 'target/arm',
- 'target/hppa',
- 'target/i386',
- 'target/i386/kvm',
- 'target/mips',
- 'target/ppc',
- 'target/riscv',
- 'target/s390x',
- 'target/sparc',
- 'util',
-]
vhost_user = not_found
if 'CONFIG_VHOST_USER' in config_host
@@ -1849,41 +1869,45 @@ libqemuutil = static_library('qemuutil',
qemuutil = declare_dependency(link_with: libqemuutil,
sources: genh + version_res)
-decodetree = generator(find_program('scripts/decodetree.py'),
- output: 'decode-@BASENAME@.c.inc',
- arguments: ['@INPUT@', '@EXTRA_ARGS@', '-o', '@OUTPUT@'])
+if have_system or have_user
+ decodetree = generator(find_program('scripts/decodetree.py'),
+ output: 'decode-@BASENAME@.c.inc',
+ arguments: ['@INPUT@', '@EXTRA_ARGS@', '-o', '@OUTPUT@'])
+ subdir('libdecnumber')
+ subdir('target')
+endif
subdir('audio')
subdir('io')
subdir('chardev')
subdir('fsdev')
-subdir('libdecnumber')
-subdir('target')
subdir('dump')
-block_ss.add(files(
- 'block.c',
- 'blockjob.c',
- 'job.c',
- 'qemu-io-cmds.c',
-))
-block_ss.add(when: 'CONFIG_REPLICATION', if_true: files('replication.c'))
-
-subdir('nbd')
-subdir('scsi')
-subdir('block')
-
-blockdev_ss.add(files(
- 'blockdev.c',
- 'blockdev-nbd.c',
- 'iothread.c',
- 'job-qmp.c',
-), gnutls)
-
-# os-posix.c contains POSIX-specific functions used by qemu-storage-daemon,
-# os-win32.c does not
-blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c'))
-softmmu_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')])
+if have_block
+ block_ss.add(files(
+ 'block.c',
+ 'blockjob.c',
+ 'job.c',
+ 'qemu-io-cmds.c',
+ ))
+ block_ss.add(when: 'CONFIG_REPLICATION', if_true: files('replication.c'))
+
+ subdir('nbd')
+ subdir('scsi')
+ subdir('block')
+
+ blockdev_ss.add(files(
+ 'blockdev.c',
+ 'blockdev-nbd.c',
+ 'iothread.c',
+ 'job-qmp.c',
+ ), gnutls)
+
+ # os-posix.c contains POSIX-specific functions used by qemu-storage-daemon,
+ # os-win32.c does not
+ blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c'))
+ softmmu_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')])
+endif
common_ss.add(files('cpus-common.c'))
@@ -2500,8 +2524,12 @@ if have_system
endif
summary_info += {'TCG support': config_all.has_key('CONFIG_TCG')}
if config_all.has_key('CONFIG_TCG')
+ if get_option('tcg_interpreter')
+ summary_info += {'TCG backend': 'TCI (TCG with bytecode interpreter, experimental and slow)'}
+ else
+ summary_info += {'TCG backend': 'native (@0@)'.format(cpu)}
+ endif
summary_info += {'TCG debug enabled': config_host.has_key('CONFIG_DEBUG_TCG')}
- summary_info += {'TCG interpreter': tcg_arch == 'tci'}
endif
summary_info += {'target list': ' '.join(target_dirs)}
if have_system
@@ -2626,6 +2654,7 @@ summary_info += {'libpmem support': config_host.has_key('CONFIG_LIBPMEM')}
summary_info += {'libdaxctl support': config_host.has_key('CONFIG_LIBDAXCTL')}
summary_info += {'libudev': libudev.found()}
summary_info += {'FUSE lseek': fuse_lseek.found()}
+summary_info += {'Multiprocess QEMU': config_host.has_key('CONFIG_MULTIPROCESS_ALLOWED')}
summary(summary_info, bool_yn: true, section: 'Dependencies')
if not supported_cpus.contains(cpu)
diff --git a/meson_options.txt b/meson_options.txt
index 95f1079829..675a9c500a 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -40,7 +40,7 @@ option('xen_pci_passthrough', type: 'feature', value: 'auto',
option('tcg', type: 'feature', value: 'auto',
description: 'TCG support')
option('tcg_interpreter', type: 'boolean', value: false,
- description: 'TCG bytecode interpreter (TCI)')
+ description: 'TCG with bytecode interpreter (experimental and slow)')
option('cfi', type: 'boolean', value: 'false',
description: 'Control-Flow Integrity (CFI)')
option('cfi_debug', type: 'boolean', value: 'false',
diff --git a/migration/migration.c b/migration/migration.c
index 1986cb8573..a5ddf43559 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -58,6 +58,7 @@
#include "qemu/queue.h"
#include "multifd.h"
#include "qemu/yank.h"
+#include "sysemu/cpus.h"
#ifdef CONFIG_VFIO
#include "hw/vfio/vfio-common.h"
@@ -134,6 +135,38 @@ enum mig_rp_message_type {
MIG_RP_MSG_MAX
};
+/* Migration capabilities set */
+struct MigrateCapsSet {
+ int size; /* Capability set size */
+ MigrationCapability caps[]; /* Flexible array of capabilities */
+};
+typedef struct MigrateCapsSet MigrateCapsSet;
+
+/* Define and initialize MigrateCapsSet */
+#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...) \
+ MigrateCapsSet _name = { \
+ .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
+ .caps = { __VA_ARGS__ } \
+ }
+
+/* Background-snapshot compatibility check list */
+static const
+INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
+ MIGRATION_CAPABILITY_POSTCOPY_RAM,
+ MIGRATION_CAPABILITY_DIRTY_BITMAPS,
+ MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
+ MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
+ MIGRATION_CAPABILITY_RETURN_PATH,
+ MIGRATION_CAPABILITY_MULTIFD,
+ MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
+ MIGRATION_CAPABILITY_AUTO_CONVERGE,
+ MIGRATION_CAPABILITY_RELEASE_RAM,
+ MIGRATION_CAPABILITY_RDMA_PIN_ALL,
+ MIGRATION_CAPABILITY_COMPRESS,
+ MIGRATION_CAPABILITY_XBZRLE,
+ MIGRATION_CAPABILITY_X_COLO,
+ MIGRATION_CAPABILITY_VALIDATE_UUID);
+
/* When we add fault tolerance, we could have several
migrations at once. For now we don't need to add
dynamic creation of migration */
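For clarity (not part of this patch): the sizeof trick in
INITIALIZE_MIGRATE_CAPS_SET counts its arguments with a compound-literal int
array, so the declaration above expands roughly to:

    static const MigrateCapsSet check_caps_background_snapshot = {
        .size = 14,     /* 14 capabilities are listed */
        .caps = {
            MIGRATION_CAPABILITY_POSTCOPY_RAM,
            /* ... the remaining 13 capabilities ... */
        }
    };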
@@ -141,6 +174,8 @@ enum mig_rp_message_type {
static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;
+static GSList *migration_blockers;
+
static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
int *current_active_state,
@@ -1041,6 +1076,27 @@ static void fill_source_migration_info(MigrationInfo *info)
{
MigrationState *s = migrate_get_current();
+ info->blocked = migration_is_blocked(NULL);
+ info->has_blocked_reasons = info->blocked;
+ info->blocked_reasons = NULL;
+ if (info->blocked) {
+ GSList *cur_blocker = migration_blockers;
+
+ /*
+ * There are two types of reasons a migration might be blocked:
+ * a) devices marked in VMState as non-migratable, and
+ * b) explicit migration blockers.
+ * We need to add both of them here.
+ */
+ qemu_savevm_non_migratable_list(&info->blocked_reasons);
+
+ while (cur_blocker) {
+ QAPI_LIST_PREPEND(info->blocked_reasons,
+ g_strdup(error_get_pretty(cur_blocker->data)));
+ cur_blocker = g_slist_next(cur_blocker);
+ }
+ }
+
switch (s->state) {
case MIGRATION_STATUS_NONE:
/* no migration has happened ever */
@@ -1089,6 +1145,31 @@ static void fill_source_migration_info(MigrationInfo *info)
info->status = s->state;
}
+typedef enum WriteTrackingSupport {
+ WT_SUPPORT_UNKNOWN = 0,
+ WT_SUPPORT_ABSENT,
+ WT_SUPPORT_AVAILABLE,
+ WT_SUPPORT_COMPATIBLE
+} WriteTrackingSupport;
+
+static
+WriteTrackingSupport migrate_query_write_tracking(void)
+{
+ /* Check if kernel supports required UFFD features */
+ if (!ram_write_tracking_available()) {
+ return WT_SUPPORT_ABSENT;
+ }
+ /*
+ * Check if current memory configuration is
+ * compatible with required UFFD features.
+ */
+ if (!ram_write_tracking_compatible()) {
+ return WT_SUPPORT_AVAILABLE;
+ }
+
+ return WT_SUPPORT_COMPATIBLE;
+}
+
/**
* @migration_caps_check - check capability validity
*
@@ -1150,6 +1231,39 @@ static bool migrate_caps_check(bool *cap_list,
}
}
+ if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
+ WriteTrackingSupport wt_support;
+ int idx;
+ /*
+ * Check if 'background-snapshot' capability is supported by
+ * host kernel and compatible with guest memory configuration.
+ */
+ wt_support = migrate_query_write_tracking();
+ if (wt_support < WT_SUPPORT_AVAILABLE) {
+ error_setg(errp, "Background-snapshot is not supported by host kernel");
+ return false;
+ }
+ if (wt_support < WT_SUPPORT_COMPATIBLE) {
+ error_setg(errp, "Background-snapshot is not compatible "
+ "with guest memory configuration");
+ return false;
+ }
+
+ /*
+ * Check if there are any migration capabilities
+ * incompatible with 'background-snapshot'.
+ */
+ for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
+ int incomp_cap = check_caps_background_snapshot.caps[idx];
+ if (cap_list[incomp_cap]) {
+ error_setg(errp,
+ "Background-snapshot is not compatible with %s",
+ MigrationCapability_str(incomp_cap));
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -1226,21 +1340,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
if (params->has_compress_level &&
(params->compress_level > 9)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
- "is invalid, it should be in the range of 0 to 9");
+ "a value between 0 and 9");
return false;
}
if (params->has_compress_threads && (params->compress_threads < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"compress_threads",
- "is invalid, it should be in the range of 1 to 255");
+ "a value between 1 and 255");
return false;
}
if (params->has_decompress_threads && (params->decompress_threads < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"decompress_threads",
- "is invalid, it should be in the range of 1 to 255");
+ "a value between 1 and 255");
return false;
}
@@ -1293,21 +1407,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
if (params->has_multifd_channels && (params->multifd_channels < 1)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"multifd_channels",
- "is invalid, it should be in the range of 1 to 255");
+ "a value between 1 and 255");
return false;
}
if (params->has_multifd_zlib_level &&
(params->multifd_zlib_level > 9)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
- "is invalid, it should be in the range of 0 to 9");
+ "a value between 0 and 9");
return false;
}
if (params->has_multifd_zstd_level &&
(params->multifd_zstd_level > 20)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
- "is invalid, it should be in the range of 0 to 20");
+ "a value between 0 and 20");
return false;
}
@@ -1316,8 +1430,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
!is_power_of_2(params->xbzrle_cache_size))) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"xbzrle_cache_size",
- "is invalid, it should be bigger than target page size"
- " and a power of 2");
+ "a power of two no less than the target page size");
return false;
}
@@ -1334,21 +1447,21 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
params->announce_initial > 100000) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_initial",
- "is invalid, it must be less than 100000 ms");
+ "a value between 0 and 100000");
return false;
}
if (params->has_announce_max &&
params->announce_max > 100000) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_max",
- "is invalid, it must be less than 100000 ms");
+ "a value between 0 and 100000");
return false;
}
if (params->has_announce_rounds &&
params->announce_rounds > 1000) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_rounds",
- "is invalid, it must be in the range of 0 to 1000");
+ "a value between 0 and 1000");
return false;
}
if (params->has_announce_step &&
@@ -1356,7 +1469,7 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
params->announce_step > 10000)) {
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"announce_step",
- "is invalid, it must be in the range of 1 to 10000 ms");
+ "a value between 0 and 10000");
return false;
}
@@ -1909,6 +2022,7 @@ void migrate_init(MigrationState *s)
* locks.
*/
s->cleanup_bh = 0;
+ s->vm_start_bh = 0;
s->to_dst_file = NULL;
s->state = MIGRATION_STATUS_NONE;
s->rp_state.from_dst_file = NULL;
@@ -1934,8 +2048,6 @@ void migrate_init(MigrationState *s)
s->threshold_size = 0;
}
-static GSList *migration_blockers;
-
int migrate_add_blocker(Error *reason, Error **errp)
{
if (only_migratable) {
@@ -2216,7 +2328,7 @@ void qmp_migrate_set_cache_size(int64_t value, Error **errp)
qmp_migrate_set_parameters(&p, errp);
}
-int64_t qmp_query_migrate_cache_size(Error **errp)
+uint64_t qmp_query_migrate_cache_size(Error **errp)
{
return migrate_xbzrle_cache_size();
}
@@ -2446,7 +2558,7 @@ int migrate_use_xbzrle(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
}
-int64_t migrate_xbzrle_cache_size(void)
+uint64_t migrate_xbzrle_cache_size(void)
{
MigrationState *s;
@@ -2491,6 +2603,15 @@ bool migrate_use_block_incremental(void)
return s->parameters.block_incremental;
}
+bool migrate_background_snapshot(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
+}
+
/* migration thread support */
/*
* Something bad happened to the RP stream, mark an error
@@ -3117,6 +3238,50 @@ fail:
MIGRATION_STATUS_FAILED);
}
+/**
+ * bg_migration_completion: Used by bg_migration_thread after all the
+ * RAM has been saved. The caller 'breaks' the loop when this returns.
+ *
+ * @s: Current migration state
+ */
+static void bg_migration_completion(MigrationState *s)
+{
+ int current_active_state = s->state;
+
+ /*
+ * Stop tracking RAM writes - un-protect memory, un-register UFFD
+ * memory ranges, flush kernel wait queues and wake up threads
+ * waiting for write fault to be resolved.
+ */
+ ram_write_tracking_stop();
+
+ if (s->state == MIGRATION_STATUS_ACTIVE) {
+ /*
+ * By this moment we have RAM content saved into the migration stream.
+ * The next step is to flush the non-RAM content (device state)
+ * right after the ram content. The device state has been stored into
+ * the temporary buffer before RAM saving started.
+ */
+ qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
+ qemu_fflush(s->to_dst_file);
+ } else if (s->state == MIGRATION_STATUS_CANCELLING) {
+ goto fail;
+ }
+
+ if (qemu_file_get_error(s->to_dst_file)) {
+ trace_migration_completion_file_err();
+ goto fail;
+ }
+
+ migrate_set_state(&s->state, current_active_state,
+ MIGRATION_STATUS_COMPLETED);
+ return;
+
+fail:
+ migrate_set_state(&s->state, current_active_state,
+ MIGRATION_STATUS_FAILED);
+}
+
bool migrate_colo_enabled(void)
{
MigrationState *s = migrate_get_current();
@@ -3457,6 +3622,47 @@ static void migration_iteration_finish(MigrationState *s)
qemu_mutex_unlock_iothread();
}
+static void bg_migration_iteration_finish(MigrationState *s)
+{
+ qemu_mutex_lock_iothread();
+ switch (s->state) {
+ case MIGRATION_STATUS_COMPLETED:
+ migration_calculate_complete(s);
+ break;
+
+ case MIGRATION_STATUS_ACTIVE:
+ case MIGRATION_STATUS_FAILED:
+ case MIGRATION_STATUS_CANCELLED:
+ case MIGRATION_STATUS_CANCELLING:
+ break;
+
+ default:
+ /* Should not reach here, but if so, forgive the VM. */
+ error_report("%s: Unknown ending state %d", __func__, s->state);
+ break;
+ }
+
+ migrate_fd_cleanup_schedule(s);
+ qemu_mutex_unlock_iothread();
+}
+
+/*
+ * Return MIG_ITERATE_RESUME to continue with the next iteration,
+ * MIG_ITERATE_BREAK once the snapshot has been fully written.
+ */
+static MigIterateState bg_migration_iteration_run(MigrationState *s)
+{
+ int res;
+
+ res = qemu_savevm_state_iterate(s->to_dst_file, false);
+ if (res > 0) {
+ bg_migration_completion(s);
+ return MIG_ITERATE_BREAK;
+ }
+
+ return MIG_ITERATE_RESUME;
+}
+
void migration_make_urgent_request(void)
{
qemu_sem_post(&migrate_get_current()->rate_limit_sem);
@@ -3604,6 +3810,165 @@ static void *migration_thread(void *opaque)
return NULL;
}
+static void bg_migration_vm_start_bh(void *opaque)
+{
+ MigrationState *s = opaque;
+
+ qemu_bh_delete(s->vm_start_bh);
+ s->vm_start_bh = NULL;
+
+ vm_start();
+ s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
+}
+
+/**
+ * Background snapshot thread, based on live migration code.
+ * This is an alternative implementation of live migration mechanism
+ * introduced specifically to support background snapshots.
+ *
+ * It takes advantage of the userfaultfd write protection mechanism
+ * introduced in the v5.7 kernel. Compared to the existing dirty page
+ * logging migration, much less stream traffic is produced, resulting in
+ * smaller snapshot images, simply because no page duplicates can get
+ * into the stream.
+ *
+ * Another key point is that the generated vmstate stream reflects the
+ * machine state 'frozen' at the beginning of snapshot creation, whereas
+ * with the dirty page logging mechanism the saved snapshot effectively
+ * reflects the state of the VM at the end of the process.
+ */
+static void *bg_migration_thread(void *opaque)
+{
+ MigrationState *s = opaque;
+ int64_t setup_start;
+ MigThrError thr_error;
+ QEMUFile *fb;
+ bool early_fail = true;
+
+ rcu_register_thread();
+ object_ref(OBJECT(s));
+
+ qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+
+ setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+ /*
+ * We want to save vmstate for the moment when migration has been
+ * initiated but also we want to save RAM content while VM is running.
+ * The RAM content should appear first in the vmstate. So, we first
+ * stash the non-RAM part of the vmstate to the temporary buffer,
+ * then write RAM part of the vmstate to the migration stream
+ * with vCPUs running and, finally, write stashed non-RAM part of
+ * the vmstate from the buffer to the migration stream.
+ */
+ s->bioc = qio_channel_buffer_new(128 * 1024);
+ qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
+ fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
+ object_unref(OBJECT(s->bioc));
+
+ update_iteration_initial_status(s);
+
+ qemu_savevm_state_header(s->to_dst_file);
+ qemu_savevm_state_setup(s->to_dst_file);
+
+ if (qemu_savevm_state_guest_unplug_pending()) {
+ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_WAIT_UNPLUG);
+
+ while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
+ qemu_savevm_state_guest_unplug_pending()) {
+ qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+ }
+
+ migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
+ MIGRATION_STATUS_ACTIVE);
+ } else {
+ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_ACTIVE);
+ }
+ s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+
+ trace_migration_thread_setup_complete();
+ s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ qemu_mutex_lock_iothread();
+
+ /*
+ * If the VM is currently in a suspended state, then, to make a valid
+ * runstate transition in vm_stop_force_state(), we need to wake it up.
+ */
+ qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
+ s->vm_was_running = runstate_is_running();
+
+ if (global_state_store()) {
+ goto fail;
+ }
+ /* Forcibly stop VM before saving state of vCPUs and devices */
+ if (vm_stop_force_state(RUN_STATE_PAUSED)) {
+ goto fail;
+ }
+ /*
+ * Put vCPUs in sync with shadow context structures, then
+ * save their state to channel-buffer along with devices.
+ */
+ cpu_synchronize_all_states();
+ if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
+ goto fail;
+ }
+ /* Now initialize UFFD context and start tracking RAM writes */
+ if (ram_write_tracking_start()) {
+ goto fail;
+ }
+ early_fail = false;
+
+ /*
+ * Start the VM from a BH handler to avoid a write-fault deadlock here.
+ * UFFD-WP protection for the whole of RAM is already enabled, so calling
+ * VM state change notifiers from vm_start() would initiate writes to
+ * virtio VQ memory, which lies in a write-protected region.
+ */
+ s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
+ qemu_bh_schedule(s->vm_start_bh);
+
+ qemu_mutex_unlock_iothread();
+
+ while (migration_is_active(s)) {
+ MigIterateState iter_state = bg_migration_iteration_run(s);
+ if (iter_state == MIG_ITERATE_SKIP) {
+ continue;
+ } else if (iter_state == MIG_ITERATE_BREAK) {
+ break;
+ }
+
+ /*
+ * Try to detect any kind of failures, and see whether we
+ * should stop the migration now.
+ */
+ thr_error = migration_detect_error(s);
+ if (thr_error == MIG_THR_ERR_FATAL) {
+ /* Stop migration */
+ break;
+ }
+
+ migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
+ }
+
+ trace_migration_thread_after_loop();
+
+fail:
+ if (early_fail) {
+ migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+ qemu_mutex_unlock_iothread();
+ }
+
+ bg_migration_iteration_finish(s);
+
+ qemu_fclose(fb);
+ object_unref(OBJECT(s));
+ rcu_unregister_thread();
+
+ return NULL;
+}
+
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
Error *local_err = NULL;
@@ -3667,8 +4032,14 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
migrate_fd_cleanup(s);
return;
}
- qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
- QEMU_THREAD_JOINABLE);
+
+ if (migrate_background_snapshot()) {
+ qemu_thread_create(&s->thread, "bg_snapshot",
+ bg_migration_thread, s, QEMU_THREAD_JOINABLE);
+ } else {
+ qemu_thread_create(&s->thread, "live_migration",
+ migration_thread, s, QEMU_THREAD_JOINABLE);
+ }
s->migration_thread_running = true;
}
@@ -3784,6 +4155,8 @@ static Property migration_properties[] = {
DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
+ DEFINE_PROP_MIG_CAP("x-background-snapshot",
+ MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/migration/migration.h b/migration/migration.h
index d096b77f74..db6708326b 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -20,6 +20,7 @@
#include "qemu/thread.h"
#include "qemu/coroutine_int.h"
#include "io/channel.h"
+#include "io/channel-buffer.h"
#include "net/announce.h"
#include "qom/object.h"
@@ -147,8 +148,10 @@ struct MigrationState {
/*< public >*/
QemuThread thread;
+ QEMUBH *vm_start_bh;
QEMUBH *cleanup_bh;
QEMUFile *to_dst_file;
+ QIOChannelBuffer *bioc;
/*
* Protects to_dst_file pointer. We need to make sure we won't
* yield or hang during the critical section, since this lock will
@@ -324,7 +327,7 @@ int migrate_multifd_zlib_level(void);
int migrate_multifd_zstd_level(void);
int migrate_use_xbzrle(void);
-int64_t migrate_xbzrle_cache_size(void);
+uint64_t migrate_xbzrle_cache_size(void);
bool migrate_colo_enabled(void);
bool migrate_use_block(void);
@@ -341,6 +344,7 @@ int migrate_compress_wait_thread(void);
int migrate_decompress_threads(void);
bool migrate_use_events(void);
bool migrate_postcopy_blocktime(void);
+bool migrate_background_snapshot(void);
/* Sending on the return path - generic and then for each message type */
void migrate_send_rp_shut(MigrationIncomingState *mis,
diff --git a/migration/page_cache.c b/migration/page_cache.c
index 098b436223..6d4f7a9bbc 100644
--- a/migration/page_cache.c
+++ b/migration/page_cache.c
@@ -38,7 +38,7 @@ struct PageCache {
size_t num_items;
};
-PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp)
+PageCache *cache_init(uint64_t new_size, size_t page_size, Error **errp)
{
int64_t i;
size_t num_pages = new_size / page_size;
@@ -60,8 +60,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp)
/* We prefer not to abort if there is no memory */
cache = g_try_malloc(sizeof(*cache));
if (!cache) {
- error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
- "Failed to allocate cache");
+ error_setg(errp, "Failed to allocate cache");
return NULL;
}
cache->page_size = page_size;
@@ -74,8 +73,7 @@ PageCache *cache_init(int64_t new_size, size_t page_size, Error **errp)
cache->page_cache = g_try_malloc((cache->max_num_items) *
sizeof(*cache->page_cache));
if (!cache->page_cache) {
- error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
- "Failed to allocate page cache");
+ error_setg(errp, "Failed to allocate page cache");
g_free(cache);
return NULL;
}
diff --git a/migration/page_cache.h b/migration/page_cache.h
index 0cb94498a0..8733b4df6e 100644
--- a/migration/page_cache.h
+++ b/migration/page_cache.h
@@ -28,7 +28,7 @@ typedef struct PageCache PageCache;
* @page_size: cache page size
* @errp: set *errp if the check failed, with reason
*/
-PageCache *cache_init(int64_t cache_size, size_t page_size, Error **errp);
+PageCache *cache_init(uint64_t cache_size, size_t page_size, Error **errp);
/**
* cache_fini: free all cache resources
* @cache pointer to the PageCache struct
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index be21518c57..d6e03dbc0e 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -595,7 +595,7 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
{
if (size < IO_BUF_SIZE) {
size_t res;
- uint8_t *src;
+ uint8_t *src = NULL;
res = qemu_peek_buffer(f, &src, size, 0);
diff --git a/migration/ram.c b/migration/ram.c
index 7811cde643..72143da0ac 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -56,6 +56,11 @@
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
+#include "sysemu/runstate.h"
+
+#if defined(__linux__)
+#include "qemu/userfaultfd.h"
+#endif /* defined(__linux__) */
/***********************************************************/
/* ram save/restore */
@@ -126,7 +131,7 @@ static void XBZRLE_cache_unlock(void)
* @new_size: new cache size
* @errp: set *errp if the check failed, with reason
*/
-int xbzrle_cache_resize(int64_t new_size, Error **errp)
+int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
PageCache *new_cache;
int64_t ret = 0;
@@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
struct RAMState {
/* QEMUFile used for this migration */
QEMUFile *f;
+ /* UFFD file descriptor, used in 'write-tracking' migration */
+ int uffdio_fd;
/* Last block that we have visited searching for dirty pages */
RAMBlock *last_seen_block;
/* Last block from where we have sent data */
@@ -1434,6 +1441,269 @@ static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
return block;
}
+#if defined(__linux__)
+/**
+ * poll_fault_page: try to get the next UFFD write-fault page and, if a
+ * pending fault is found, return the RAM block pointer and page offset
+ *
+ * Returns a pointer to the RAMBlock containing the faulting page,
+ * NULL if no write faults are pending
+ *
+ * @rs: current RAM state
+ * @offset: page offset from the beginning of the block
+ */
+static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
+{
+ struct uffd_msg uffd_msg;
+ void *page_address;
+ RAMBlock *bs;
+ int res;
+
+ if (!migrate_background_snapshot()) {
+ return NULL;
+ }
+
+ res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
+ if (res <= 0) {
+ return NULL;
+ }
+
+ page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
+ bs = qemu_ram_block_from_host(page_address, false, offset);
+ assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
+ return bs;
+}
+
+/**
+ * ram_save_release_protection: release UFFD write protection after
+ * a range of pages has been saved
+ *
+ * @rs: current RAM state
+ * @pss: page-search-status structure
+ * @start_page: index of the first page in the range relative to pss->block
+ *
+ * Returns 0 on success, negative value in case of an error
+ */
+static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
+ unsigned long start_page)
+{
+ int res = 0;
+
+ /* Check if page is from UFFD-managed region. */
+ if (pss->block->flags & RAM_UF_WRITEPROTECT) {
+ void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
+ uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
+
+ /* Flush async buffers before un-protect. */
+ qemu_fflush(rs->f);
+ /* Un-protect memory range. */
+ res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
+ false, false);
+ }
+
+ return res;
+}
+
+/* ram_write_tracking_available: check if the kernel supports required UFFD features
+ *
+ * Returns true if supported, false otherwise
+ */
+bool ram_write_tracking_available(void)
+{
+ uint64_t uffd_features;
+ int res;
+
+ res = uffd_query_features(&uffd_features);
+ return (res == 0 &&
+ (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
+}
+
+/* ram_write_tracking_compatible: check if guest configuration is
+ * compatible with 'write-tracking'
+ *
+ * Returns true if compatible, false otherwise
+ */
+bool ram_write_tracking_compatible(void)
+{
+ const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
+ int uffd_fd;
+ RAMBlock *bs;
+ bool ret = false;
+
+ /* Open UFFD file descriptor */
+ uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
+ if (uffd_fd < 0) {
+ return false;
+ }
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ uint64_t uffd_ioctls;
+
+ /* Nothing to do for read-only and MMIO-writable regions */
+ if (bs->mr->readonly || bs->mr->rom_device) {
+ continue;
+ }
+ /* Try to register block memory via UFFD-IO to track writes */
+ if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
+ UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
+ goto out;
+ }
+ if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
+ goto out;
+ }
+ }
+ ret = true;
+
+out:
+ uffd_close_fd(uffd_fd);
+ return ret;
+}
+
+/*
+ * ram_write_tracking_start: start UFFD-WP memory tracking
+ *
+ * Returns 0 for success or negative value in case of error
+ */
+int ram_write_tracking_start(void)
+{
+ int uffd_fd;
+ RAMState *rs = ram_state;
+ RAMBlock *bs;
+
+ /* Open UFFD file descriptor */
+ uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
+ if (uffd_fd < 0) {
+ return uffd_fd;
+ }
+ rs->uffdio_fd = uffd_fd;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ /* Nothing to do for read-only and MMIO-writable regions */
+ if (bs->mr->readonly || bs->mr->rom_device) {
+ continue;
+ }
+
+ /* Register block memory with UFFD to track writes */
+ if (uffd_register_memory(rs->uffdio_fd, bs->host,
+ bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
+ goto fail;
+ }
+ /* Apply UFFD write protection to the block memory range */
+ if (uffd_change_protection(rs->uffdio_fd, bs->host,
+ bs->max_length, true, false)) {
+ goto fail;
+ }
+ bs->flags |= RAM_UF_WRITEPROTECT;
+ memory_region_ref(bs->mr);
+
+ trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
+ bs->host, bs->max_length);
+ }
+
+ return 0;
+
+fail:
+ error_report("ram_write_tracking_start() failed: restoring initial memory state");
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
+ continue;
+ }
+ /*
+ * If write-protecting some memory block failed, remove protection
+ * from and unregister all RAM blocks that succeeded
+ */
+ uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
+ uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
+ /* Cleanup flags and remove reference */
+ bs->flags &= ~RAM_UF_WRITEPROTECT;
+ memory_region_unref(bs->mr);
+ }
+
+ uffd_close_fd(uffd_fd);
+ rs->uffdio_fd = -1;
+ return -1;
+}
+
+/**
+ * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
+ */
+void ram_write_tracking_stop(void)
+{
+ RAMState *rs = ram_state;
+ RAMBlock *bs;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+ if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
+ continue;
+ }
+ /* Remove protection and unregister all affected RAM blocks */
+ uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
+ uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
+
+ trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
+ bs->host, bs->max_length);
+
+ /* Cleanup flags and remove reference */
+ bs->flags &= ~RAM_UF_WRITEPROTECT;
+ memory_region_unref(bs->mr);
+ }
+
+ /* Finally close UFFD file descriptor */
+ uffd_close_fd(rs->uffdio_fd);
+ rs->uffdio_fd = -1;
+}
+
+#else
+/* No target OS support, stubs just fail or ignore */
+
+static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
+{
+ (void) rs;
+ (void) offset;
+
+ return NULL;
+}
+
+static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
+ unsigned long start_page)
+{
+ (void) rs;
+ (void) pss;
+ (void) start_page;
+
+ return 0;
+}
+
+bool ram_write_tracking_available(void)
+{
+ return false;
+}
+
+bool ram_write_tracking_compatible(void)
+{
+ assert(0);
+ return false;
+}
+
+int ram_write_tracking_start(void)
+{
+ assert(0);
+ return -1;
+}
+
+void ram_write_tracking_stop(void)
+{
+ assert(0);
+}
+#endif /* defined(__linux__) */
+
/**
* get_queued_page: unqueue a page from the postcopy requests
*
@@ -1473,6 +1743,14 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
} while (block && !dirty);
+ if (!block) {
+ /*
+ * Poll write faults too if background snapshot is enabled; that's
+ * when vCPUs may be blocked by write-protected pages.
+ */
+ block = poll_fault_page(rs, &offset);
+ }
+
if (block) {
/*
* As soon as we start servicing pages out of order, then we have
@@ -1715,6 +1993,8 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
int tmppages, pages = 0;
size_t pagesize_bits =
qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
+ unsigned long start_page = pss->page;
+ int res;
if (ramblock_is_ignored(pss->block)) {
error_report("block %s should not be migrated !", pss->block->idstr);
@@ -1740,10 +2020,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
} while ((pss->page & (pagesize_bits - 1)) &&
offset_in_ramblock(pss->block,
((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
-
/* The offset we leave with is the last one we looked at */
pss->page--;
- return pages;
+
+ res = ram_save_release_protection(rs, pss, start_page);
+ return (res < 0 ? res : pages);
}
/**
@@ -1880,10 +2161,13 @@ static void ram_save_cleanup(void *opaque)
RAMState **rsp = opaque;
RAMBlock *block;
- /* caller have hold iothread lock or is in a bh, so there is
- * no writing race against the migration bitmap
- */
- memory_global_dirty_log_stop();
+ /* We don't use dirty log with background snapshots */
+ if (!migrate_background_snapshot()) {
+ /* The caller holds the iothread lock or is in a BH, so there is
+ * no write race against the migration bitmap
+ */
+ memory_global_dirty_log_stop();
+ }
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
g_free(block->clear_bmap);
@@ -2343,8 +2627,11 @@ static void ram_init_bitmaps(RAMState *rs)
WITH_RCU_READ_LOCK_GUARD() {
ram_list_init_bitmaps();
- memory_global_dirty_log_start();
- migration_bitmap_sync_precopy(rs);
+ /* We don't use dirty log with background snapshots */
+ if (!migrate_background_snapshot()) {
+ memory_global_dirty_log_start();
+ migration_bitmap_sync_precopy(rs);
+ }
}
qemu_mutex_unlock_ramlist();
qemu_mutex_unlock_iothread();
@@ -3521,7 +3808,7 @@ static int ram_load_precopy(QEMUFile *f)
}
}
/* For postcopy we need to check hugepage sizes match */
- if (postcopy_advised &&
+ if (postcopy_advised && migrate_postcopy_ram() &&
block->page_size != qemu_host_page_size) {
uint64_t remote_page_size = qemu_get_be64(f);
if (remote_page_size != block->page_size) {
diff --git a/migration/ram.h b/migration/ram.h
index 011e85414e..6378bb3ebc 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -47,7 +47,7 @@ bool ramblock_is_ignored(RAMBlock *block);
INTERNAL_RAMBLOCK_FOREACH(block) \
if (!qemu_ram_is_migratable(block)) {} else
-int xbzrle_cache_resize(int64_t new_size, Error **errp);
+int xbzrle_cache_resize(uint64_t new_size, Error **errp);
uint64_t ram_bytes_remaining(void);
uint64_t ram_bytes_total(void);
@@ -79,4 +79,10 @@ void colo_flush_ram_cache(void);
void colo_release_ram_cache(void);
void colo_incoming_start_dirty_log(void);
+/* Background snapshot */
+bool ram_write_tracking_available(void);
+bool ram_write_tracking_compatible(void);
+int ram_write_tracking_start(void);
+void ram_write_tracking_stop(void);
+
#endif
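The write-tracking API declared above is a thin wrapper around Linux's userfaultfd write-protect mode. The following standalone sketch of that kernel interface is illustrative only, not QEMU code; it assumes a kernel exposing UFFD_FEATURE_PAGEFAULT_FLAG_WP and permission to create userfaultfd descriptors (root, or vm.unprivileged_userfaultfd=1):

```c
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    memset(buf, 0, page);                    /* make the page resident */

    /* O_NONBLOCK mirrors the polling style of poll_fault_page() */
    int fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    ioctl(fd, UFFDIO_API, &api);

    /* Register the range for write-protect tracking... */
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)buf, .len = page },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    ioctl(fd, UFFDIO_REGISTER, &reg);

    /* ...and arm write protection on it */
    struct uffdio_writeprotect wp = {
        .range = { .start = (unsigned long)buf, .len = page },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    ioctl(fd, UFFDIO_WRITEPROTECT, &wp);

    /*
     * A writer thread touching buf would now block in the kernel.  The
     * tracker reads the fault description, saves the page content, and
     * only then clears the protection so the writer can resume.
     */
    struct uffd_msg msg;
    if (read(fd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg) &&
        msg.event == UFFD_EVENT_PAGEFAULT) {
        printf("write fault at 0x%llx\n",
               (unsigned long long)msg.arg.pagefault.address);
        wp.mode = 0;                         /* 0 = remove protection */
        ioctl(fd, UFFDIO_WRITEPROTECT, &wp);
    }

    close(fd);
    munmap(buf, page);
    return 0;
}
```

This is the same register / protect / save-and-unprotect cycle that ram_write_tracking_start(), poll_fault_page() and ram_save_release_protection() drive over every RAM block.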
diff --git a/migration/savevm.c b/migration/savevm.c
index 4f3b69ecfc..52e2d72e4b 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -43,6 +43,8 @@
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/json-writer.h"
+#include "qapi/clone-visitor.h"
+#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/cpus.h"
@@ -315,6 +317,16 @@ static int configuration_pre_save(void *opaque)
return 0;
}
+static int configuration_post_save(void *opaque)
+{
+ SaveState *state = opaque;
+
+ g_free(state->capabilities);
+ state->capabilities = NULL;
+ state->caps_count = 0;
+ return 0;
+}
+
static int configuration_pre_load(void *opaque)
{
SaveState *state = opaque;
@@ -365,24 +377,36 @@ static int configuration_post_load(void *opaque, int version_id)
{
SaveState *state = opaque;
const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
+ int ret = 0;
if (strncmp(state->name, current_name, state->len) != 0) {
error_report("Machine type received is '%.*s' and local is '%s'",
(int) state->len, state->name, current_name);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (state->target_page_bits != qemu_target_page_bits()) {
error_report("Received TARGET_PAGE_BITS is %d but local is %d",
state->target_page_bits, qemu_target_page_bits());
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (!configuration_validate_capabilities(state)) {
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
- return 0;
+out:
+ g_free((void *)state->name);
+ state->name = NULL;
+ state->len = 0;
+ g_free(state->capabilities);
+ state->capabilities = NULL;
+ state->caps_count = 0;
+
+ return ret;
}
static int get_capability(QEMUFile *f, void *pv, size_t size,
@@ -516,6 +540,7 @@ static const VMStateDescription vmstate_configuration = {
.pre_load = configuration_pre_load,
.post_load = configuration_post_load,
.pre_save = configuration_pre_save,
+ .post_save = configuration_post_save,
.fields = (VMStateField[]) {
VMSTATE_UINT32(len, SaveState),
VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
@@ -1131,6 +1156,19 @@ bool qemu_savevm_state_blocked(Error **errp)
return false;
}
+void qemu_savevm_non_migratable_list(strList **reasons)
+{
+ SaveStateEntry *se;
+
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (se->vmsd && se->vmsd->unmigratable) {
+ QAPI_LIST_PREPEND(*reasons,
+ g_strdup_printf("non-migratable device: %s",
+ se->idstr));
+ }
+ }
+}
+
void qemu_savevm_state_header(QEMUFile *f)
{
trace_savevm_state_header();
@@ -1355,7 +1393,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
return 0;
}
-static
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy,
bool inactivate_disks)
@@ -2729,9 +2766,10 @@ int qemu_load_device_state(QEMUFile *f)
return 0;
}
-int save_snapshot(const char *name, Error **errp)
+bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
+ bool has_devices, strList *devices, Error **errp)
{
- BlockDriverState *bs, *bs1;
+ BlockDriverState *bs;
QEMUSnapshotInfo sn1, *sn = &sn1;
int ret = -1, ret2;
QEMUFile *f;
@@ -2742,35 +2780,43 @@ int save_snapshot(const char *name, Error **errp)
AioContext *aio_context;
if (migration_is_blocked(errp)) {
- return ret;
+ return false;
}
if (!replay_can_snapshot()) {
error_setg(errp, "Record/replay does not allow making snapshot "
"right now. Try once more later.");
- return ret;
+ return false;
}
- if (!bdrv_all_can_snapshot(&bs)) {
- error_setg(errp, "Device '%s' is writable but does not support "
- "snapshots", bdrv_get_device_or_node_name(bs));
- return ret;
+ if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
+ return false;
}
/* Delete old snapshots of the same name */
if (name) {
- ret = bdrv_all_delete_snapshot(name, &bs1, errp);
- if (ret < 0) {
- error_prepend(errp, "Error while deleting snapshot on device "
- "'%s': ", bdrv_get_device_or_node_name(bs1));
- return ret;
+ if (overwrite) {
+ if (bdrv_all_delete_snapshot(name, has_devices,
+ devices, errp) < 0) {
+ return false;
+ }
+ } else {
+ ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
+ if (ret2 < 0) {
+ return false;
+ }
+ if (ret2 == 1) {
+ error_setg(errp,
+ "Snapshot '%s' already exists in one or more devices",
+ name);
+ return false;
+ }
}
}
- bs = bdrv_all_find_vmstate_bs();
+ bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
if (bs == NULL) {
- error_setg(errp, "No block device can accept snapshots");
- return ret;
+ return false;
}
aio_context = bdrv_get_aio_context(bs);
@@ -2779,7 +2825,7 @@ int save_snapshot(const char *name, Error **errp)
ret = global_state_store();
if (ret) {
error_setg(errp, "Error saving global state");
- return ret;
+ return false;
}
vm_stop(RUN_STATE_SAVE_VM);
@@ -2833,11 +2879,10 @@ int save_snapshot(const char *name, Error **errp)
aio_context_release(aio_context);
aio_context = NULL;
- ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
+ ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
+ has_devices, devices, errp);
if (ret < 0) {
- error_setg(errp, "Error while creating snapshot on '%s'",
- bdrv_get_device_or_node_name(bs));
- bdrv_all_delete_snapshot(sn->name, &bs, NULL);
+ bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
goto the_end;
}
@@ -2853,7 +2898,7 @@ int save_snapshot(const char *name, Error **errp)
if (saved_vm_running) {
vm_start();
}
- return ret;
+ return ret == 0;
}
void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
@@ -2938,33 +2983,32 @@ void qmp_xen_load_devices_state(const char *filename, Error **errp)
migration_incoming_state_destroy();
}
-int load_snapshot(const char *name, Error **errp)
+bool load_snapshot(const char *name, const char *vmstate,
+ bool has_devices, strList *devices, Error **errp)
{
- BlockDriverState *bs, *bs_vm_state;
+ BlockDriverState *bs_vm_state;
QEMUSnapshotInfo sn;
QEMUFile *f;
int ret;
AioContext *aio_context;
MigrationIncomingState *mis = migration_incoming_get_current();
- if (!bdrv_all_can_snapshot(&bs)) {
- error_setg(errp,
- "Device '%s' is writable but does not support snapshots",
- bdrv_get_device_or_node_name(bs));
- return -ENOTSUP;
+ if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
+ return false;
}
- ret = bdrv_all_find_snapshot(name, &bs);
+ ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
if (ret < 0) {
- error_setg(errp,
- "Device '%s' does not have the requested snapshot '%s'",
- bdrv_get_device_or_node_name(bs), name);
- return ret;
+ return false;
+ }
+ if (ret == 0) {
+ error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
+ name);
+ return false;
}
- bs_vm_state = bdrv_all_find_vmstate_bs();
+ bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
if (!bs_vm_state) {
- error_setg(errp, "No block device supports snapshots");
- return -ENOTSUP;
+ return false;
}
aio_context = bdrv_get_aio_context(bs_vm_state);
@@ -2973,11 +3017,11 @@ int load_snapshot(const char *name, Error **errp)
ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
aio_context_release(aio_context);
if (ret < 0) {
- return ret;
+ return false;
} else if (sn.vm_state_size == 0) {
error_setg(errp, "This is a disk-only snapshot. Revert to it "
" offline using qemu-img");
- return -EINVAL;
+ return false;
}
/*
@@ -2989,10 +3033,8 @@ int load_snapshot(const char *name, Error **errp)
/* Flush all IO requests so they don't interfere with the new state. */
bdrv_drain_all_begin();
- ret = bdrv_all_goto_snapshot(name, &bs, errp);
+ ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
if (ret < 0) {
- error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
- name, bdrv_get_device_or_node_name(bs));
goto err_drain;
}
@@ -3000,7 +3042,6 @@ int load_snapshot(const char *name, Error **errp)
f = qemu_fopen_bdrv(bs_vm_state, 0);
if (!f) {
error_setg(errp, "Could not open VM state file");
- ret = -EINVAL;
goto err_drain;
}
@@ -3020,14 +3061,28 @@ int load_snapshot(const char *name, Error **errp)
if (ret < 0) {
error_setg(errp, "Error %d while loading VM state", ret);
- return ret;
+ return false;
}
- return 0;
+ return true;
err_drain:
bdrv_drain_all_end();
- return ret;
+ return false;
+}
+
+bool delete_snapshot(const char *name, bool has_devices,
+ strList *devices, Error **errp)
+{
+ if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
+ return false;
+ }
+
+ if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
+ return false;
+ }
+
+ return true;
}
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
@@ -3057,3 +3112,187 @@ bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
return !(vmsd && vmsd->unmigratable);
}
+
+typedef struct SnapshotJob {
+ Job common;
+ char *tag;
+ char *vmstate;
+ strList *devices;
+ Coroutine *co;
+ Error **errp;
+ bool ret;
+} SnapshotJob;
+
+static void qmp_snapshot_job_free(SnapshotJob *s)
+{
+ g_free(s->tag);
+ g_free(s->vmstate);
+ qapi_free_strList(s->devices);
+}
+
+
+static void snapshot_load_job_bh(void *opaque)
+{
+ Job *job = opaque;
+ SnapshotJob *s = container_of(job, SnapshotJob, common);
+ int orig_vm_running;
+
+ job_progress_set_remaining(&s->common, 1);
+
+ orig_vm_running = runstate_is_running();
+ vm_stop(RUN_STATE_RESTORE_VM);
+
+ s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
+ if (s->ret && orig_vm_running) {
+ vm_start();
+ }
+
+ job_progress_update(&s->common, 1);
+
+ qmp_snapshot_job_free(s);
+ aio_co_wake(s->co);
+}
+
+static void snapshot_save_job_bh(void *opaque)
+{
+ Job *job = opaque;
+ SnapshotJob *s = container_of(job, SnapshotJob, common);
+
+ job_progress_set_remaining(&s->common, 1);
+ s->ret = save_snapshot(s->tag, false, s->vmstate,
+ true, s->devices, s->errp);
+ job_progress_update(&s->common, 1);
+
+ qmp_snapshot_job_free(s);
+ aio_co_wake(s->co);
+}
+
+static void snapshot_delete_job_bh(void *opaque)
+{
+ Job *job = opaque;
+ SnapshotJob *s = container_of(job, SnapshotJob, common);
+
+ job_progress_set_remaining(&s->common, 1);
+ s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
+ job_progress_update(&s->common, 1);
+
+ qmp_snapshot_job_free(s);
+ aio_co_wake(s->co);
+}
+
+static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
+{
+ SnapshotJob *s = container_of(job, SnapshotJob, common);
+ s->errp = errp;
+ s->co = qemu_coroutine_self();
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ snapshot_save_job_bh, job);
+ qemu_coroutine_yield();
+ return s->ret ? 0 : -1;
+}
+
+static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
+{
+ SnapshotJob *s = container_of(job, SnapshotJob, common);
+ s->errp = errp;
+ s->co = qemu_coroutine_self();
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ snapshot_load_job_bh, job);
+ qemu_coroutine_yield();
+ return s->ret ? 0 : -1;
+}
+
+static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
+{
+ SnapshotJob *s = container_of(job, SnapshotJob, common);
+ s->errp = errp;
+ s->co = qemu_coroutine_self();
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ snapshot_delete_job_bh, job);
+ qemu_coroutine_yield();
+ return s->ret ? 0 : -1;
+}
+
+
+static const JobDriver snapshot_load_job_driver = {
+ .instance_size = sizeof(SnapshotJob),
+ .job_type = JOB_TYPE_SNAPSHOT_LOAD,
+ .run = snapshot_load_job_run,
+};
+
+static const JobDriver snapshot_save_job_driver = {
+ .instance_size = sizeof(SnapshotJob),
+ .job_type = JOB_TYPE_SNAPSHOT_SAVE,
+ .run = snapshot_save_job_run,
+};
+
+static const JobDriver snapshot_delete_job_driver = {
+ .instance_size = sizeof(SnapshotJob),
+ .job_type = JOB_TYPE_SNAPSHOT_DELETE,
+ .run = snapshot_delete_job_run,
+};
+
+
+void qmp_snapshot_save(const char *job_id,
+ const char *tag,
+ const char *vmstate,
+ strList *devices,
+ Error **errp)
+{
+ SnapshotJob *s;
+
+ s = job_create(job_id, &snapshot_save_job_driver, NULL,
+ qemu_get_aio_context(), JOB_MANUAL_DISMISS,
+ NULL, NULL, errp);
+ if (!s) {
+ return;
+ }
+
+ s->tag = g_strdup(tag);
+ s->vmstate = g_strdup(vmstate);
+ s->devices = QAPI_CLONE(strList, devices);
+
+ job_start(&s->common);
+}
+
+void qmp_snapshot_load(const char *job_id,
+ const char *tag,
+ const char *vmstate,
+ strList *devices,
+ Error **errp)
+{
+ SnapshotJob *s;
+
+ s = job_create(job_id, &snapshot_load_job_driver, NULL,
+ qemu_get_aio_context(), JOB_MANUAL_DISMISS,
+ NULL, NULL, errp);
+ if (!s) {
+ return;
+ }
+
+ s->tag = g_strdup(tag);
+ s->vmstate = g_strdup(vmstate);
+ s->devices = QAPI_CLONE(strList, devices);
+
+ job_start(&s->common);
+}
+
+void qmp_snapshot_delete(const char *job_id,
+ const char *tag,
+ strList *devices,
+ Error **errp)
+{
+ SnapshotJob *s;
+
+ s = job_create(job_id, &snapshot_delete_job_driver, NULL,
+ qemu_get_aio_context(), JOB_MANUAL_DISMISS,
+ NULL, NULL, errp);
+ if (!s) {
+ return;
+ }
+
+ s->tag = g_strdup(tag);
+ s->devices = QAPI_CLONE(strList, devices);
+
+ job_start(&s->common);
+}
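One structural note on the three jobs just added: a Job's .run callback executes in coroutine context, while save_snapshot(), load_snapshot() and delete_snapshot() are long-running synchronous calls that must run in the main loop. Each .run therefore schedules a one-shot bottom half, yields its coroutine, and is woken by the BH once the work completes. A condensed sketch of the pattern follows (MyJob and do_main_loop_work() are placeholders, and this compiles only against QEMU's internal headers):

```c
typedef struct MyJob {
    Job common;
    Coroutine *co;      /* coroutine to wake once the BH has run */
    bool ret;
} MyJob;

static void my_job_bh(void *opaque)
{
    Job *job = opaque;
    MyJob *s = container_of(job, MyJob, common);

    s->ret = do_main_loop_work();   /* placeholder for the real work */
    aio_co_wake(s->co);             /* resume the yielded coroutine */
}

static int coroutine_fn my_job_run(Job *job, Error **errp)
{
    MyJob *s = container_of(job, MyJob, common);

    s->co = qemu_coroutine_self();
    /* Defer the heavy lifting into the main loop context... */
    aio_bh_schedule_oneshot(qemu_get_aio_context(), my_job_bh, job);
    /* ...and sleep here until my_job_bh() wakes us */
    qemu_coroutine_yield();
    return s->ret ? 0 : -1;
}
```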
diff --git a/migration/savevm.h b/migration/savevm.h
index ba64a7e271..6461342cb4 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -30,6 +30,7 @@
#define QEMU_VM_SECTION_FOOTER 0x7e
bool qemu_savevm_state_blocked(Error **errp);
+void qemu_savevm_non_migratable_list(strList **reasons);
void qemu_savevm_state_setup(QEMUFile *f);
bool qemu_savevm_state_guest_unplug_pending(void);
int qemu_savevm_state_resume_prepare(MigrationState *s);
@@ -64,5 +65,7 @@ int qemu_loadvm_state(QEMUFile *f);
void qemu_loadvm_state_cleanup(void);
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
int qemu_load_device_state(QEMUFile *f);
+int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
+ bool in_postcopy, bool inactivate_disks);
#endif
diff --git a/migration/trace-events b/migration/trace-events
index 75de5004ac..668c562fed 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -111,6 +111,8 @@ save_xbzrle_page_skipping(void) ""
save_xbzrle_page_overflow(void) ""
ram_save_iterate_big_wait(uint64_t milliseconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
+ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
+ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
# multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %d"
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index a48bc1e904..3c88a4faef 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -224,6 +224,15 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
migration_global_dump(mon);
+ if (info->blocked) {
+ strList *reasons = info->blocked_reasons;
+ monitor_printf(mon, "Outgoing migration blocked:\n");
+ while (reasons) {
+ monitor_printf(mon, " %s\n", reasons->value);
+ reasons = reasons->next;
+ }
+ }
+
if (info->has_status) {
monitor_printf(mon, "Migration status: %s",
MigrationStatus_str(info->status));
@@ -1130,7 +1139,7 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict)
vm_stop(RUN_STATE_RESTORE_VM);
- if (load_snapshot(name, &err) == 0 && saved_vm_running) {
+ if (!load_snapshot(name, NULL, false, NULL, &err) && saved_vm_running) {
vm_start();
}
hmp_handle_error(mon, err);
@@ -1140,21 +1149,17 @@ void hmp_savevm(Monitor *mon, const QDict *qdict)
{
Error *err = NULL;
- save_snapshot(qdict_get_try_str(qdict, "name"), &err);
+ save_snapshot(qdict_get_try_str(qdict, "name"),
+ true, NULL, false, NULL, &err);
hmp_handle_error(mon, err);
}
void hmp_delvm(Monitor *mon, const QDict *qdict)
{
- BlockDriverState *bs;
Error *err = NULL;
const char *name = qdict_get_str(qdict, "name");
- if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) {
- error_prepend(&err,
- "deleting snapshot on device '%s': ",
- bdrv_get_device_name(bs));
- }
+ delete_snapshot(name, false, NULL, &err);
hmp_handle_error(mon, err);
}
@@ -1294,11 +1299,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
switch (val) {
case MIGRATION_PARAMETER_COMPRESS_LEVEL:
p->has_compress_level = true;
- visit_type_int(v, param, &p->compress_level, &err);
+ visit_type_uint8(v, param, &p->compress_level, &err);
break;
case MIGRATION_PARAMETER_COMPRESS_THREADS:
p->has_compress_threads = true;
- visit_type_int(v, param, &p->compress_threads, &err);
+ visit_type_uint8(v, param, &p->compress_threads, &err);
break;
case MIGRATION_PARAMETER_COMPRESS_WAIT_THREAD:
p->has_compress_wait_thread = true;
@@ -1306,19 +1311,19 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break;
case MIGRATION_PARAMETER_DECOMPRESS_THREADS:
p->has_decompress_threads = true;
- visit_type_int(v, param, &p->decompress_threads, &err);
+ visit_type_uint8(v, param, &p->decompress_threads, &err);
break;
case MIGRATION_PARAMETER_THROTTLE_TRIGGER_THRESHOLD:
p->has_throttle_trigger_threshold = true;
- visit_type_int(v, param, &p->throttle_trigger_threshold, &err);
+ visit_type_uint8(v, param, &p->throttle_trigger_threshold, &err);
break;
case MIGRATION_PARAMETER_CPU_THROTTLE_INITIAL:
p->has_cpu_throttle_initial = true;
- visit_type_int(v, param, &p->cpu_throttle_initial, &err);
+ visit_type_uint8(v, param, &p->cpu_throttle_initial, &err);
break;
case MIGRATION_PARAMETER_CPU_THROTTLE_INCREMENT:
p->has_cpu_throttle_increment = true;
- visit_type_int(v, param, &p->cpu_throttle_increment, &err);
+ visit_type_uint8(v, param, &p->cpu_throttle_increment, &err);
break;
case MIGRATION_PARAMETER_CPU_THROTTLE_TAILSLOW:
p->has_cpu_throttle_tailslow = true;
@@ -1326,7 +1331,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break;
case MIGRATION_PARAMETER_MAX_CPU_THROTTLE:
p->has_max_cpu_throttle = true;
- visit_type_int(v, param, &p->max_cpu_throttle, &err);
+ visit_type_uint8(v, param, &p->max_cpu_throttle, &err);
break;
case MIGRATION_PARAMETER_TLS_CREDS:
p->has_tls_creds = true;
@@ -1362,11 +1367,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break;
case MIGRATION_PARAMETER_DOWNTIME_LIMIT:
p->has_downtime_limit = true;
- visit_type_int(v, param, &p->downtime_limit, &err);
+ visit_type_size(v, param, &p->downtime_limit, &err);
break;
case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY:
p->has_x_checkpoint_delay = true;
- visit_type_int(v, param, &p->x_checkpoint_delay, &err);
+ visit_type_uint32(v, param, &p->x_checkpoint_delay, &err);
break;
case MIGRATION_PARAMETER_BLOCK_INCREMENTAL:
p->has_block_incremental = true;
@@ -1374,7 +1379,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break;
case MIGRATION_PARAMETER_MULTIFD_CHANNELS:
p->has_multifd_channels = true;
- visit_type_int(v, param, &p->multifd_channels, &err);
+ visit_type_uint8(v, param, &p->multifd_channels, &err);
break;
case MIGRATION_PARAMETER_MULTIFD_COMPRESSION:
p->has_multifd_compression = true;
@@ -1383,11 +1388,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
break;
case MIGRATION_PARAMETER_MULTIFD_ZLIB_LEVEL:
p->has_multifd_zlib_level = true;
- visit_type_int(v, param, &p->multifd_zlib_level, &err);
+ visit_type_uint8(v, param, &p->multifd_zlib_level, &err);
break;
case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL:
p->has_multifd_zstd_level = true;
- visit_type_int(v, param, &p->multifd_zstd_level, &err);
+ visit_type_uint8(v, param, &p->multifd_zstd_level, &err);
break;
case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE:
p->has_xbzrle_cache_size = true;
diff --git a/pc-bios/README b/pc-bios/README
index 33f9754ad3..db7129ef64 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -20,7 +20,7 @@
legacy x86 software to communicate with an attached serial console as
if a video card were attached. The master sources reside in a subversion
repository at http://sgabios.googlecode.com/svn/trunk. A git mirror is
- available at https://git.qemu.org/git/sgabios.git.
+ available at https://gitlab.com/qemu-project/sgabios.git.
- The PXE roms come from the iPXE project. Built with BANNER_TIME 0.
Sources available at http://ipxe.org. Vendor:Device ID -> ROM mapping:
@@ -37,7 +37,7 @@
- The u-boot binary for e500 comes from the upstream denx u-boot project where
it was compiled using the qemu-ppce500 target.
- A git mirror is available at: https://git.qemu.org/git/u-boot.git
+ A git mirror is available at: https://gitlab.com/qemu-project/u-boot.git
The hash used to compile the current version is: 2072e72
- Skiboot (https://github.com/open-power/skiboot/) is an OPAL
diff --git a/pc-bios/descriptors/meson.build b/pc-bios/descriptors/meson.build
index ac6ec66b00..29efa16d99 100644
--- a/pc-bios/descriptors/meson.build
+++ b/pc-bios/descriptors/meson.build
@@ -9,7 +9,7 @@ if install_edk2_blobs
]
configure_file(input: files(f),
output: f,
- configuration: {'DATADIR': qemu_datadir},
+ configuration: {'DATADIR': get_option('prefix') / qemu_datadir},
install: get_option('install_blobs'),
install_dir: qemu_datadir / 'firmware')
endforeach
diff --git a/pc-bios/meson.build b/pc-bios/meson.build
index af95c5d1f1..f2b32598af 100644
--- a/pc-bios/meson.build
+++ b/pc-bios/meson.build
@@ -12,6 +12,7 @@ if install_edk2_blobs
foreach f : fds
custom_target(f,
+ build_by_default: have_system,
output: f,
input: '@0@.bz2'.format(f),
capture: true,
diff --git a/qapi/job.json b/qapi/job.json
index 280c2f76f1..1a6ef03451 100644
--- a/qapi/job.json
+++ b/qapi/job.json
@@ -22,10 +22,17 @@
#
# @amend: image options amend job type, see "x-blockdev-amend" (since 5.1)
#
+# @snapshot-load: snapshot load job type, see "snapshot-load" (since 6.0)
+#
+# @snapshot-save: snapshot save job type, see "snapshot-save" (since 6.0)
+#
+# @snapshot-delete: snapshot delete job type, see "snapshot-delete" (since 6.0)
+#
# Since: 1.7
##
{ 'enum': 'JobType',
- 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend'] }
+ 'data': ['commit', 'stream', 'mirror', 'backup', 'create', 'amend',
+ 'snapshot-load', 'snapshot-save', 'snapshot-delete'] }
##
# @JobStatus:
diff --git a/qapi/meson.build b/qapi/meson.build
index ab68e7900e..0652569bc4 100644
--- a/qapi/meson.build
+++ b/qapi/meson.build
@@ -4,18 +4,20 @@ util_ss.add(files(
'qapi-dealloc-visitor.c',
'qapi-util.c',
'qapi-visit-core.c',
- 'qmp-dispatch.c',
- 'qmp-event.c',
- 'qmp-registry.c',
'qobject-input-visitor.c',
'qobject-output-visitor.c',
'string-input-visitor.c',
'string-output-visitor.c',
))
+if have_system or have_tools
+ util_ss.add(files(
+ 'qmp-dispatch.c',
+ 'qmp-event.c',
+ 'qmp-registry.c',
+ ))
+endif
qapi_all_modules = [
- 'acpi',
- 'audio',
'authz',
'block',
'block-core',
@@ -35,20 +37,30 @@ qapi_all_modules = [
'misc-target',
'net',
'pragma',
- 'qdev',
- 'pci',
'qom',
- 'rdma',
'replay',
- 'rocker',
'run-state',
'sockets',
- 'tpm',
'trace',
'transaction',
- 'ui',
'yank',
]
+if have_system
+ qapi_all_modules += [
+ 'acpi',
+ 'audio',
+ 'qdev',
+ 'pci',
+ 'rdma',
+ 'rocker',
+ 'tpm',
+ ]
+endif
+if have_system or have_tools
+ qapi_all_modules += [
+ 'ui',
+ ]
+endif
qapi_storage_daemon_modules = [
'block-core',
diff --git a/qapi/migration.json b/qapi/migration.json
index d1d9632c2a..ce14d78071 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -78,7 +78,7 @@
# Since: 1.2
##
{ 'struct': 'XBZRLECacheStats',
- 'data': {'cache-size': 'int', 'bytes': 'int', 'pages': 'int',
+ 'data': {'cache-size': 'size', 'bytes': 'int', 'pages': 'int',
'cache-miss': 'int', 'cache-miss-rate': 'number',
'encoding-rate': 'number', 'overflow': 'int' } }
@@ -224,6 +224,10 @@
# only returned if VFIO device is present, migration is supported by all
# VFIO devices and status is 'active' or 'completed' (since 5.2)
#
+# @blocked: True if outgoing migration is blocked (since 6.0)
+#
+# @blocked-reasons: A list of reasons an outgoing migration is blocked (since 6.0)
+#
# Since: 0.14
##
{ 'struct': 'MigrationInfo',
@@ -237,6 +241,8 @@
'*setup-time': 'int',
'*cpu-throttle-percentage': 'int',
'*error-desc': 'str',
+ 'blocked': 'bool',
+ '*blocked-reasons': ['str'],
'*postcopy-blocktime' : 'uint32',
'*postcopy-vcpu-blocktime': ['uint32'],
'*compression': 'CompressionStats',
@@ -442,6 +448,11 @@
# @validate-uuid: Send the UUID of the source to allow the destination
# to ensure it is the same. (since 4.2)
#
+# @background-snapshot: If enabled, the migration stream will be a snapshot
+# of the VM exactly at the point when the migration
+# procedure starts. The VM RAM is saved while the VM
+# keeps running. (since 6.0)
+#
# Since: 1.2
##
{ 'enum': 'MigrationCapability',
@@ -449,7 +460,7 @@
'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram',
'block', 'return-path', 'pause-before-switchover', 'multifd',
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
- 'x-ignore-shared', 'validate-uuid' ] }
+ 'x-ignore-shared', 'validate-uuid', 'background-snapshot'] }
##
# @MigrationCapabilityStatus:
@@ -885,28 +896,28 @@
'*announce-max': 'size',
'*announce-rounds': 'size',
'*announce-step': 'size',
- '*compress-level': 'int',
- '*compress-threads': 'int',
+ '*compress-level': 'uint8',
+ '*compress-threads': 'uint8',
'*compress-wait-thread': 'bool',
- '*decompress-threads': 'int',
- '*throttle-trigger-threshold': 'int',
- '*cpu-throttle-initial': 'int',
- '*cpu-throttle-increment': 'int',
+ '*decompress-threads': 'uint8',
+ '*throttle-trigger-threshold': 'uint8',
+ '*cpu-throttle-initial': 'uint8',
+ '*cpu-throttle-increment': 'uint8',
'*cpu-throttle-tailslow': 'bool',
'*tls-creds': 'StrOrNull',
'*tls-hostname': 'StrOrNull',
'*tls-authz': 'StrOrNull',
- '*max-bandwidth': 'int',
- '*downtime-limit': 'int',
- '*x-checkpoint-delay': 'int',
+ '*max-bandwidth': 'size',
+ '*downtime-limit': 'uint64',
+ '*x-checkpoint-delay': 'uint32',
'*block-incremental': 'bool',
- '*multifd-channels': 'int',
+ '*multifd-channels': 'uint8',
'*xbzrle-cache-size': 'size',
'*max-postcopy-bandwidth': 'size',
- '*max-cpu-throttle': 'int',
+ '*max-cpu-throttle': 'uint8',
'*multifd-compression': 'MultiFDCompression',
- '*multifd-zlib-level': 'int',
- '*multifd-zstd-level': 'int',
+ '*multifd-zlib-level': 'uint8',
+ '*multifd-zstd-level': 'uint8',
'*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ] } }
##
@@ -1093,7 +1104,7 @@
'*max-bandwidth': 'size',
'*downtime-limit': 'uint64',
'*x-checkpoint-delay': 'uint32',
- '*block-incremental': 'bool' ,
+ '*block-incremental': 'bool',
'*multifd-channels': 'uint8',
'*xbzrle-cache-size': 'size',
'*max-postcopy-bandwidth': 'size',
@@ -1465,7 +1476,7 @@
# <- { "return": 67108864 }
#
##
-{ 'command': 'query-migrate-cache-size', 'returns': 'int',
+{ 'command': 'query-migrate-cache-size', 'returns': 'size',
'features': [ 'deprecated' ] }
##
@@ -1843,3 +1854,176 @@
# Since: 5.2
##
{ 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
+
+##
+# @snapshot-save:
+#
+# Save a VM snapshot
+#
+# @job-id: identifier for the newly created job
+# @tag: name of the snapshot to create
+# @vmstate: block device node name to save vmstate to
+# @devices: list of block device node names to save a snapshot to
+#
+# Applications should not assume that the snapshot save is complete
+# when this command returns. The job commands / events must be used
+# to determine completion and to fetch details of any errors that arise.
+#
+# Note that execution of the guest CPUs may be stopped during the
+# time it takes to save the snapshot. A future version of QEMU
+# may ensure CPUs are executing continuously.
+#
+# It is strongly recommended that @devices contain all writable
+# block device nodes if a consistent snapshot is required.
+#
+# If @tag already exists, an error will be reported
+#
+# Returns: nothing
+#
+# Example:
+#
+# -> { "execute": "snapshot-save",
+# "arguments": {
+# "job-id": "snapsave0",
+# "tag": "my-snap",
+# "vmstate": "disk0",
+# "devices": ["disk0", "disk1"]
+# }
+# }
+# <- { "return": { } }
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "created", "id": "snapsave0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "running", "id": "snapsave0"}}
+# <- {"event": "STOP"}
+# <- {"event": "RESUME"}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "waiting", "id": "snapsave0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "pending", "id": "snapsave0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "concluded", "id": "snapsave0"}}
+# -> {"execute": "query-jobs"}
+# <- {"return": [{"current-progress": 1,
+# "status": "concluded",
+# "total-progress": 1,
+# "type": "snapshot-save",
+# "id": "snapsave0"}]}
+#
+# Since: 6.0
+##
+{ 'command': 'snapshot-save',
+ 'data': { 'job-id': 'str',
+ 'tag': 'str',
+ 'vmstate': 'str',
+ 'devices': ['str'] } }
+
+##
+# @snapshot-load:
+#
+# Load a VM snapshot
+#
+# @job-id: identifier for the newly created job
+# @tag: name of the snapshot to load.
+# @vmstate: block device node name to load vmstate from
+# @devices: list of block device node names to load a snapshot from
+#
+# Applications should not assume that the snapshot load is complete
+# when this command returns. The job commands / events must be used
+# to determine completion and to fetch details of any errors that arise.
+#
+# Note that execution of the guest CPUs will be stopped during the
+# time it takes to load the snapshot.
+#
+# It is strongly recommended that @devices contain all writable
+# block device nodes that may have changed since the original
+# @snapshot-save command execution.
+#
+# Returns: nothing
+#
+# Example:
+#
+# -> { "execute": "snapshot-load",
+# "arguments": {
+# "job-id": "snapload0",
+# "tag": "my-snap",
+# "vmstate": "disk0",
+# "devices": ["disk0", "disk1"]
+# }
+# }
+# <- { "return": { } }
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "created", "id": "snapload0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "running", "id": "snapload0"}}
+# <- {"event": "STOP"}
+# <- {"event": "RESUME"}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "waiting", "id": "snapload0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "pending", "id": "snapload0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "concluded", "id": "snapload0"}}
+# -> {"execute": "query-jobs"}
+# <- {"return": [{"current-progress": 1,
+# "status": "concluded",
+# "total-progress": 1,
+# "type": "snapshot-load",
+# "id": "snapload0"}]}
+#
+# Since: 6.0
+##
+{ 'command': 'snapshot-load',
+ 'data': { 'job-id': 'str',
+ 'tag': 'str',
+ 'vmstate': 'str',
+ 'devices': ['str'] } }
+
+##
+# @snapshot-delete:
+#
+# Delete a VM snapshot
+#
+# @job-id: identifier for the newly created job
+# @tag: name of the snapshot to delete.
+# @devices: list of block device node names to delete a snapshot from
+#
+# Applications should not assume that the snapshot delete is complete
+# when this command returns. The job commands / events must be used
+# to determine completion and to fetch details of any errors that arise.
+#
+# Returns: nothing
+#
+# Example:
+#
+# -> { "execute": "snapshot-delete",
+# "arguments": {
+# "job-id": "snapdelete0",
+# "tag": "my-snap",
+# "devices": ["disk0", "disk1"]
+# }
+# }
+# <- { "return": { } }
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "created", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "running", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "waiting", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "pending", "id": "snapdelete0"}}
+# <- {"event": "JOB_STATUS_CHANGE",
+# "data": {"status": "concluded", "id": "snapdelete0"}}
+# -> {"execute": "query-jobs"}
+# <- {"return": [{"current-progress": 1,
+# "status": "concluded",
+# "total-progress": 1,
+# "type": "snapshot-delete",
+# "id": "snapdelete0"}]}
+#
+# Since: 6.0
+##
+{ 'command': 'snapshot-delete',
+ 'data': { 'job-id': 'str',
+ 'tag': 'str',
+ 'devices': ['str'] } }
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 608c63e82a..b1b9430a8f 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -328,7 +328,7 @@ static void *nbd_client_thread(void *arg)
static int nbd_can_accept(void)
{
- return state == RUNNING && nb_fds < shared;
+ return state == RUNNING && (shared == 0 || nb_fds < shared);
}
static void nbd_update_server_watch(void);
@@ -707,7 +707,7 @@ int main(int argc, char **argv)
break;
case 'e':
if (qemu_strtoi(optarg, NULL, 0, &shared) < 0 ||
- shared < 1) {
+ shared < 0) {
error_report("Invalid shared device number '%s'", optarg);
exit(EXIT_FAILURE);
}
@@ -964,8 +964,16 @@ int main(int argc, char **argv)
server = qio_net_listener_new();
if (socket_activation == 0) {
+ int backlog;
+
+ if (persistent || shared == 0) {
+ backlog = SOMAXCONN;
+ } else {
+ backlog = MIN(shared, SOMAXCONN);
+ }
saddr = nbd_build_socket_address(sockpath, bindto, port);
- if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) {
+ if (qio_net_listener_open_sync(server, saddr, backlog,
+ &local_err) < 0) {
object_unref(OBJECT(server));
error_report_err(local_err);
exit(EXIT_FAILURE);
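As a usage example of the new semantics (disk.img is an arbitrary placeholder image): `qemu-nbd --shared=5 disk.img` now listens with a backlog of 5 rather than the previous hard-coded 1, while `qemu-nbd --shared=0 disk.img`, which used to be rejected, serves an unlimited number of clients and, like persistent servers whose expected client count is unbounded, uses a SOMAXCONN backlog.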
diff --git a/qemu-options.hx b/qemu-options.hx
index c09c4646e2..6c34c7050f 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -35,7 +35,8 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
" suppress-vmdesc=on|off disables self-describing migration (default=off)\n"
" nvdimm=on|off controls NVDIMM support (default=off)\n"
" memory-encryption=@var{} memory encryption object to use (default=none)\n"
- " hmat=on|off controls ACPI HMAT support (default=off)\n",
+ " hmat=on|off controls ACPI HMAT support (default=off)\n"
+ " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n",
QEMU_ARCH_ALL)
SRST
``-machine [type=]name[,prop=value[,...]]``
@@ -96,6 +97,29 @@ SRST
``hmat=on|off``
Enables or disables ACPI Heterogeneous Memory Attribute Table
(HMAT) support. The default is off.
+
+ ``memory-backend='id'``
+ An alternative to the legacy ``-mem-path`` and ``mem-prealloc`` options.
+ Allows a memory backend to be used as main RAM.
+
+ For example:
+ ::
+ -object memory-backend-file,id=pc.ram,size=512M,mem-path=/hugetlbfs,prealloc=on,share=on
+ -machine memory-backend=pc.ram
+ -m 512M
+
+ Migration compatibility note:
+ a) as the backend ID, use the value of 'default-ram-id' advertised by the
+ machine type (available via the ``query-machines`` QMP command) if
+ migration to/from an old QEMU (<5.0) is expected.
+ b) for machine types 4.0 and older, use the
+ ``x-use-canonical-path-for-ramblock-id=off`` backend option if
+ migration to/from an old QEMU (<5.0) is expected.
+ For example:
+ ::
+ -object memory-backend-ram,id=pc.ram,size=512M,x-use-canonical-path-for-ramblock-id=off
+ -machine memory-backend=pc.ram
+ -m 512M
ERST
HXCOMM Deprecated by -machine
diff --git a/qom/object.c b/qom/object.c
index 2fa0119647..491823db4a 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -442,7 +442,8 @@ static GPtrArray *object_compat_props[3];
* other than "-global". These are generally used for syntactic
* sugar and legacy command line options.
*/
-void object_register_sugar_prop(const char *driver, const char *prop, const char *value)
+void object_register_sugar_prop(const char *driver, const char *prop,
+ const char *value, bool optional)
{
GlobalProperty *g;
if (!object_compat_props[2]) {
@@ -452,6 +453,7 @@ void object_register_sugar_prop(const char *driver, const char *prop, const char
g->driver = g_strdup(driver);
g->property = g_strdup(prop);
g->value = g_strdup(value);
+ g->optional = optional;
g_ptr_array_add(object_compat_props[2], g);
}
diff --git a/replay/replay-debugging.c b/replay/replay-debugging.c
index 5ec574724a..1cde50e9f3 100644
--- a/replay/replay-debugging.c
+++ b/replay/replay-debugging.c
@@ -143,12 +143,13 @@ static char *replay_find_nearest_snapshot(int64_t icount,
QEMUSnapshotInfo *sn_tab;
QEMUSnapshotInfo *nearest = NULL;
char *ret = NULL;
+ int rv;
int nb_sns, i;
AioContext *aio_context;
*snapshot_icount = -1;
- bs = bdrv_all_find_vmstate_bs();
+ bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, NULL);
if (!bs) {
goto fail;
}
@@ -159,7 +160,11 @@ static char *replay_find_nearest_snapshot(int64_t icount,
aio_context_release(aio_context);
for (i = 0; i < nb_sns; i++) {
- if (bdrv_all_find_snapshot(sn_tab[i].name, &bs) == 0) {
+ rv = bdrv_all_has_snapshot(sn_tab[i].name, false, NULL, NULL);
+ if (rv < 0)
+ goto fail;
+ if (rv == 1) {
if (sn_tab[i].icount != -1ULL
&& sn_tab[i].icount <= icount
&& (!nearest || nearest->icount < sn_tab[i].icount)) {
@@ -192,7 +196,7 @@ static void replay_seek(int64_t icount, QEMUTimerCB callback, Error **errp)
if (icount < replay_get_current_icount()
|| replay_get_current_icount() < snapshot_icount) {
vm_stop(RUN_STATE_RESTORE_VM);
- load_snapshot(snapshot, errp);
+ load_snapshot(snapshot, NULL, false, NULL, errp);
}
g_free(snapshot);
}
@@ -323,7 +327,7 @@ void replay_gdb_attached(void)
*/
if (replay_mode == REPLAY_MODE_PLAY
&& !replay_snapshot) {
- if (save_snapshot("start_debugging", NULL) != 0) {
+ if (!save_snapshot("start_debugging", true, NULL, false, NULL, NULL)) {
/* Can't create the snapshot. Continue conventional debugging. */
}
}
diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c
index e26fa4c892..e8767a1937 100644
--- a/replay/replay-snapshot.c
+++ b/replay/replay-snapshot.c
@@ -77,13 +77,14 @@ void replay_vmstate_init(void)
if (replay_snapshot) {
if (replay_mode == REPLAY_MODE_RECORD) {
- if (save_snapshot(replay_snapshot, &err) != 0) {
+ if (!save_snapshot(replay_snapshot,
+ true, NULL, false, NULL, &err)) {
error_report_err(err);
error_report("Could not create snapshot for icount record");
exit(1);
}
} else if (replay_mode == REPLAY_MODE_PLAY) {
- if (load_snapshot(replay_snapshot, &err) != 0) {
+ if (!load_snapshot(replay_snapshot, NULL, false, NULL, &err)) {
error_report_err(err);
error_report("Could not load snapshot for icount replay");
exit(1);
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 271f5ff42a..e5499b94b4 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -1377,7 +1377,7 @@ sub vcs_exists {
warn("$P: No supported VCS found. Add --nogit to options?\n");
warn("Using a git repository produces better results.\n");
warn("Try latest git repository using:\n");
- warn("git clone https://git.qemu.org/git/qemu.git\n");
+ warn("git clone https://gitlab.com/qemu-project/qemu.git\n");
$printed_novcs = 1;
}
return 0;
diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py
index 25ee6887cf..cbbcba100d 100644
--- a/scripts/mtest2make.py
+++ b/scripts/mtest2make.py
@@ -110,6 +110,7 @@ def emit_suite(name, suite, prefix):
print('ifneq ($(filter %s %s, $(MAKECMDGOALS)),)' % (target, prefix))
print('.tests += $(.test.$(SPEED).%s)' % (target, ))
print('endif')
+ print('all-%s-targets += %s' % (prefix, target))
targets = {t['id']: [os.path.relpath(f) for f in t['filename']]
for t in introspect['targets']}
diff --git a/scripts/oss-fuzz/minimize_qtest_trace.py b/scripts/oss-fuzz/minimize_qtest_trace.py
index 4cba96dee2..20825768c2 100755
--- a/scripts/oss-fuzz/minimize_qtest_trace.py
+++ b/scripts/oss-fuzz/minimize_qtest_trace.py
@@ -261,7 +261,7 @@ def clear_bits(newtrace, outpath):
data_try = hex(int("".join(data_bin_list), 2))
# It seems qtest only accepts padded hex-values.
if len(data_try) % 2 == 1:
- data_try = data_try[:2] + "0" + data_try[2:-1]
+ data_try = data_try[:2] + "0" + data_try[2:]
newtrace[i] = "{prefix} {data_try}\n".format(
prefix=prefix,
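For example, hex(int('100100011', 2)) is the odd-length string '0x123': the old slice '0x' + '0' + data_try[2:-1] produced '0x012', silently dropping the last nibble, whereas the corrected '0x' + '0' + data_try[2:] yields the intended zero-padded '0x0123'.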
diff --git a/scripts/qapi/commands.py b/scripts/qapi/commands.py
index 50978090b4..54af519f44 100644
--- a/scripts/qapi/commands.py
+++ b/scripts/qapi/commands.py
@@ -23,7 +23,6 @@ from typing import (
from .common import c_name, mcgen
from .gen import (
QAPIGenC,
- QAPIGenCCode,
QAPISchemaModularCVisitor,
build_params,
ifcontext,
@@ -126,6 +125,9 @@ def gen_marshal(name: str,
boxed: bool,
ret_type: Optional[QAPISchemaType]) -> str:
have_args = boxed or (arg_type and not arg_type.is_empty())
+ if have_args:
+ assert arg_type is not None
+ arg_type_c_name = arg_type.c_name()
ret = mcgen('''
@@ -147,7 +149,7 @@ def gen_marshal(name: str,
ret += mcgen('''
%(c_name)s arg = {0};
''',
- c_name=arg_type.c_name())
+ c_name=arg_type_c_name)
ret += mcgen('''
@@ -163,7 +165,7 @@ def gen_marshal(name: str,
ok = visit_check_struct(v, errp);
}
''',
- c_arg_type=arg_type.c_name())
+ c_arg_type=arg_type_c_name)
else:
ret += mcgen('''
ok = visit_check_struct(v, errp);
@@ -193,7 +195,7 @@ out:
ret += mcgen('''
visit_type_%(c_arg_type)s_members(v, &arg, NULL);
''',
- c_arg_type=arg_type.c_name())
+ c_arg_type=arg_type_c_name)
ret += mcgen('''
visit_end_struct(v, NULL);
@@ -234,28 +236,11 @@ def gen_register_command(name: str,
return ret
-def gen_registry(registry: str, prefix: str) -> str:
- ret = mcgen('''
-
-void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds)
-{
- QTAILQ_INIT(cmds);
-
-''',
- c_prefix=c_name(prefix, protect=False))
- ret += registry
- ret += mcgen('''
-}
-''')
- return ret
-
-
class QAPISchemaGenCommandVisitor(QAPISchemaModularCVisitor):
def __init__(self, prefix: str):
super().__init__(
prefix, 'qapi-commands',
' * Schema-defined QAPI/QMP commands', None, __doc__)
- self._regy = QAPIGenCCode(None)
self._visited_ret_types: Dict[QAPIGenC, Set[QAPISchemaType]] = {}
def _begin_user_module(self, name: str) -> None:
@@ -282,25 +267,36 @@ class QAPISchemaGenCommandVisitor(QAPISchemaModularCVisitor):
''',
types=types))
- def visit_end(self) -> None:
- self._add_system_module('init', ' * QAPI Commands initialization')
+ def visit_begin(self, schema: QAPISchema) -> None:
+ self._add_module('./init', ' * QAPI Commands initialization')
self._genh.add(mcgen('''
#include "qapi/qmp/dispatch.h"
void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds);
''',
c_prefix=c_name(self._prefix, protect=False)))
- self._genc.preamble_add(mcgen('''
+ self._genc.add(mcgen('''
#include "qemu/osdep.h"
#include "%(prefix)sqapi-commands.h"
#include "%(prefix)sqapi-init-commands.h"
+
+void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds)
+{
+ QTAILQ_INIT(cmds);
+
''',
- prefix=self._prefix))
- self._genc.add(gen_registry(self._regy.get_content(), self._prefix))
+ prefix=self._prefix,
+ c_prefix=c_name(self._prefix, protect=False)))
+
+ def visit_end(self) -> None:
+ with self._temp_module('./init'):
+ self._genc.add(mcgen('''
+}
+'''))
def visit_command(self,
name: str,
- info: QAPISourceInfo,
+ info: Optional[QAPISourceInfo],
ifcond: List[str],
features: List[QAPISchemaFeature],
arg_type: Optional[QAPISchemaObjectType],
@@ -321,15 +317,17 @@ void %(c_prefix)sqmp_init_marshal(QmpCommandList *cmds);
if ret_type and ret_type not in self._visited_ret_types[self._genc]:
self._visited_ret_types[self._genc].add(ret_type)
with ifcontext(ret_type.ifcond,
- self._genh, self._genc, self._regy):
+ self._genh, self._genc):
self._genc.add(gen_marshal_output(ret_type))
- with ifcontext(ifcond, self._genh, self._genc, self._regy):
+ with ifcontext(ifcond, self._genh, self._genc):
self._genh.add(gen_command_decl(name, arg_type, boxed, ret_type))
self._genh.add(gen_marshal_decl(name))
self._genc.add(gen_marshal(name, arg_type, boxed, ret_type))
- self._regy.add(gen_register_command(name, success_response,
- allow_oob, allow_preconfig,
- coroutine))
+ with self._temp_module('./init'):
+ with ifcontext(ifcond, self._genh, self._genc):
+ self._genc.add(gen_register_command(name, success_response,
+ allow_oob, allow_preconfig,
+ coroutine))
def gen_commands(schema: QAPISchema,
diff --git a/scripts/qapi/events.py b/scripts/qapi/events.py
index 599f3d1f56..8c57deb2b8 100644
--- a/scripts/qapi/events.py
+++ b/scripts/qapi/events.py
@@ -12,7 +12,7 @@ This work is licensed under the terms of the GNU GPL, version 2.
See the COPYING file in the top-level directory.
"""
-from typing import List
+from typing import List, Optional
from .common import c_enum_const, c_name, mcgen
from .gen import QAPISchemaModularCVisitor, build_params, ifcontext
@@ -27,7 +27,7 @@ from .types import gen_enum, gen_enum_lookup
def build_event_send_proto(name: str,
- arg_type: QAPISchemaObjectType,
+ arg_type: Optional[QAPISchemaObjectType],
boxed: bool) -> str:
return 'void qapi_event_send_%(c_name)s(%(param)s)' % {
'c_name': c_name(name.lower()),
@@ -35,7 +35,7 @@ def build_event_send_proto(name: str,
def gen_event_send_decl(name: str,
- arg_type: QAPISchemaObjectType,
+ arg_type: Optional[QAPISchemaObjectType],
boxed: bool) -> str:
return mcgen('''
@@ -78,7 +78,7 @@ def gen_param_var(typ: QAPISchemaObjectType) -> str:
def gen_event_send(name: str,
- arg_type: QAPISchemaObjectType,
+ arg_type: Optional[QAPISchemaObjectType],
boxed: bool,
event_enum_name: str,
event_emit: str) -> str:
@@ -99,6 +99,7 @@ def gen_event_send(name: str,
proto=build_event_send_proto(name, arg_type, boxed))
if have_args:
+ assert arg_type is not None
ret += mcgen('''
QObject *obj;
Visitor *v;
@@ -114,6 +115,7 @@ def gen_event_send(name: str,
name=name)
if have_args:
+ assert arg_type is not None
ret += mcgen('''
v = qobject_output_visitor_new(&obj);
''')
@@ -189,7 +191,7 @@ class QAPISchemaGenEventVisitor(QAPISchemaModularCVisitor):
types=types))
def visit_end(self) -> None:
- self._add_system_module('emit', ' * QAPI Events emission')
+ self._add_module('./emit', ' * QAPI Events emission')
self._genc.preamble_add(mcgen('''
#include "qemu/osdep.h"
#include "%(prefix)sqapi-emit-events.h"
@@ -211,10 +213,10 @@ void %(event_emit)s(%(event_enum)s event, QDict *qdict);
def visit_event(self,
name: str,
- info: QAPISourceInfo,
+ info: Optional[QAPISourceInfo],
ifcond: List[str],
features: List[QAPISchemaFeature],
- arg_type: QAPISchemaObjectType,
+ arg_type: Optional[QAPISchemaObjectType],
boxed: bool) -> None:
with ifcontext(ifcond, self._genh, self._genc):
self._genh.add(gen_event_send_decl(name, arg_type, boxed))
diff --git a/scripts/qapi/gen.py b/scripts/qapi/gen.py
index b40f18eee3..63549cc8d4 100644
--- a/scripts/qapi/gen.py
+++ b/scripts/qapi/gen.py
@@ -31,12 +31,16 @@ from .common import (
guardstart,
mcgen,
)
-from .schema import QAPISchemaObjectType, QAPISchemaVisitor
+from .schema import (
+ QAPISchemaModule,
+ QAPISchemaObjectType,
+ QAPISchemaVisitor,
+)
from .source import QAPISourceInfo
class QAPIGen:
- def __init__(self, fname: Optional[str]):
+ def __init__(self, fname: str):
self.fname = fname
self._preamble = ''
self._body = ''
@@ -121,7 +125,7 @@ def build_params(arg_type: Optional[QAPISchemaObjectType],
class QAPIGenCCode(QAPIGen):
- def __init__(self, fname: Optional[str]):
+ def __init__(self, fname: str):
super().__init__(fname)
self._start_if: Optional[Tuple[List[str], str, str]] = None
@@ -130,15 +134,12 @@ class QAPIGenCCode(QAPIGen):
self._start_if = (ifcond, self._body, self._preamble)
def end_if(self) -> None:
- assert self._start_if
- self._wrap_ifcond()
- self._start_if = None
-
- def _wrap_ifcond(self) -> None:
+ assert self._start_if is not None
self._body = _wrap_ifcond(self._start_if[0],
self._start_if[1], self._body)
self._preamble = _wrap_ifcond(self._start_if[0],
self._start_if[2], self._preamble)
+ self._start_if = None
def get_content(self) -> str:
assert self._start_if is None
@@ -243,85 +244,88 @@ class QAPISchemaModularCVisitor(QAPISchemaVisitor):
self._user_blurb = user_blurb
self._builtin_blurb = builtin_blurb
self._pydoc = pydoc
- self._genc: Optional[QAPIGenC] = None
- self._genh: Optional[QAPIGenH] = None
- self._module: Dict[Optional[str], Tuple[QAPIGenC, QAPIGenH]] = {}
+ self._current_module: Optional[str] = None
+ self._module: Dict[str, Tuple[QAPIGenC, QAPIGenH]] = {}
self._main_module: Optional[str] = None
- @staticmethod
- def _is_user_module(name: Optional[str]) -> bool:
- return bool(name and not name.startswith('./'))
+ @property
+ def _genc(self) -> QAPIGenC:
+ assert self._current_module is not None
+ return self._module[self._current_module][0]
- @staticmethod
- def _is_builtin_module(name: Optional[str]) -> bool:
- return not name
+ @property
+ def _genh(self) -> QAPIGenH:
+ assert self._current_module is not None
+ return self._module[self._current_module][1]
- def _module_dirname(self, name: Optional[str]) -> str:
- if self._is_user_module(name):
+ @staticmethod
+ def _module_dirname(name: str) -> str:
+ if QAPISchemaModule.is_user_module(name):
return os.path.dirname(name)
return ''
- def _module_basename(self, what: str, name: Optional[str]) -> str:
- ret = '' if self._is_builtin_module(name) else self._prefix
- if self._is_user_module(name):
+ def _module_basename(self, what: str, name: str) -> str:
+ ret = '' if QAPISchemaModule.is_builtin_module(name) else self._prefix
+ if QAPISchemaModule.is_user_module(name):
basename = os.path.basename(name)
ret += what
if name != self._main_module:
ret += '-' + os.path.splitext(basename)[0]
else:
- name = name[2:] if name else 'builtin'
- ret += re.sub(r'-', '-' + name + '-', what)
+ assert QAPISchemaModule.is_system_module(name)
+ ret += re.sub(r'-', '-' + name[2:] + '-', what)
return ret
- def _module_filename(self, what: str, name: Optional[str]) -> str:
+ def _module_filename(self, what: str, name: str) -> str:
return os.path.join(self._module_dirname(name),
self._module_basename(what, name))
- def _add_module(self, name: Optional[str], blurb: str) -> None:
+ def _add_module(self, name: str, blurb: str) -> None:
+ if QAPISchemaModule.is_user_module(name):
+ if self._main_module is None:
+ self._main_module = name
basename = self._module_filename(self._what, name)
genc = QAPIGenC(basename + '.c', blurb, self._pydoc)
genh = QAPIGenH(basename + '.h', blurb, self._pydoc)
self._module[name] = (genc, genh)
- self._genc, self._genh = self._module[name]
-
- def _add_user_module(self, name: str, blurb: str) -> None:
- assert self._is_user_module(name)
- if self._main_module is None:
- self._main_module = name
- self._add_module(name, blurb)
+ self._current_module = name
- def _add_system_module(self, name: Optional[str], blurb: str) -> None:
- self._add_module(name and './' + name, blurb)
+ @contextmanager
+ def _temp_module(self, name: str) -> Iterator[None]:
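+ # Temporarily redirect generated code to another module; the previous
+ # module is restored when the block exits.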
+ old_module = self._current_module
+ self._current_module = name
+ yield
+ self._current_module = old_module
def write(self, output_dir: str, opt_builtins: bool = False) -> None:
for name in self._module:
- if self._is_builtin_module(name) and not opt_builtins:
+ if QAPISchemaModule.is_builtin_module(name) and not opt_builtins:
continue
(genc, genh) = self._module[name]
genc.write(output_dir)
genh.write(output_dir)
- def _begin_system_module(self, name: None) -> None:
+ def _begin_builtin_module(self) -> None:
pass
def _begin_user_module(self, name: str) -> None:
pass
- def visit_module(self, name: Optional[str]) -> None:
- if name is None:
+ def visit_module(self, name: str) -> None:
+ if QAPISchemaModule.is_builtin_module(name):
if self._builtin_blurb:
- self._add_system_module(None, self._builtin_blurb)
- self._begin_system_module(name)
+ self._add_module(name, self._builtin_blurb)
+ self._begin_builtin_module()
else:
# The built-in module has not been created. No code may
# be generated.
- self._genc = None
- self._genh = None
+ self._current_module = None
else:
- self._add_user_module(name, self._user_blurb)
+ assert QAPISchemaModule.is_user_module(name)
+ self._add_module(name, self._user_blurb)
self._begin_user_module(name)
- def visit_include(self, name: str, info: QAPISourceInfo) -> None:
+ def visit_include(self, name: str, info: Optional[QAPISourceInfo]) -> None:
relname = os.path.relpath(self._module_filename(self._what, name),
os.path.dirname(self._genh.fname))
self._genh.preamble_add(mcgen('''
diff --git a/scripts/qapi/main.py b/scripts/qapi/main.py
index 42517210b8..703e7ed1ed 100644
--- a/scripts/qapi/main.py
+++ b/scripts/qapi/main.py
@@ -23,6 +23,8 @@ from .visit import gen_visit
def invalid_prefix_char(prefix: str) -> Optional[str]:
match = re.match(r'([A-Za-z_.-][A-Za-z0-9_.-]*)?', prefix)
+ # match cannot be None, but mypy cannot infer that.
+ assert match is not None
if match.end() != len(prefix):
return prefix[match.end()]
return None
diff --git a/scripts/qapi/mypy.ini b/scripts/qapi/mypy.ini
index 74fc6c8215..04bd5db527 100644
--- a/scripts/qapi/mypy.ini
+++ b/scripts/qapi/mypy.ini
@@ -1,6 +1,5 @@
[mypy]
strict = True
-strict_optional = False
disallow_untyped_calls = False
python_version = 3.6
diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py
index 720449feee..353e8020a2 100644
--- a/scripts/qapi/schema.py
+++ b/scripts/qapi/schema.py
@@ -68,7 +68,8 @@ class QAPISchemaEntity:
def _set_module(self, schema, info):
assert self._checked
- self._module = schema.module_by_fname(info and info.fname)
+ fname = info.fname if info else QAPISchemaModule.BUILTIN_MODULE_NAME
+ self._module = schema.module_by_fname(fname)
self._module.add_entity(self)
def set_module(self, schema):
@@ -137,10 +138,40 @@ class QAPISchemaVisitor:
class QAPISchemaModule:
+
+ BUILTIN_MODULE_NAME = './builtin'
+
def __init__(self, name):
self.name = name
self._entity_list = []
+ @staticmethod
+ def is_system_module(name: str) -> bool:
+ """
+ System modules are internally defined modules.
+
+ Their names start with the "./" prefix.
+ """
+ return name.startswith('./')
+
+ @classmethod
+ def is_user_module(cls, name: str) -> bool:
+ """
+ User modules are those defined by the user in qapi JSON files.
+
+ They do not start with the "./" prefix.
+ """
+ return not cls.is_system_module(name)
+
+ @classmethod
+ def is_builtin_module(cls, name: str) -> bool:
+ """
+ The built-in module is a single System module for the built-in types.
+
+ It is always "./builtin".
+ """
+ return name == cls.BUILTIN_MODULE_NAME
+
def add_entity(self, ent):
self._entity_list.append(ent)
@@ -825,7 +856,7 @@ class QAPISchema:
self._entity_dict = {}
self._module_dict = OrderedDict()
self._schema_dir = os.path.dirname(fname)
- self._make_module(None) # built-ins
+ self._make_module(QAPISchemaModule.BUILTIN_MODULE_NAME)
self._make_module(fname)
self._predefining = True
self._def_predefineds()
@@ -870,9 +901,9 @@ class QAPISchema:
info, "%s uses unknown type '%s'" % (what, name))
return typ
- def _module_name(self, fname):
- if fname is None:
- return None
+ def _module_name(self, fname: str) -> str:
+ if QAPISchemaModule.is_system_module(fname):
+ return fname
return os.path.relpath(fname, self._schema_dir)
def _make_module(self, fname):
@@ -883,7 +914,6 @@ class QAPISchema:
def module_by_fname(self, fname):
name = self._module_name(fname)
- assert name in self._module_dict
return self._module_dict[name]
def _def_include(self, expr, info, doc):
diff --git a/scripts/qapi/types.py b/scripts/qapi/types.py
index 2b4916cdaa..2bdd626847 100644
--- a/scripts/qapi/types.py
+++ b/scripts/qapi/types.py
@@ -271,7 +271,7 @@ class QAPISchemaGenTypeVisitor(QAPISchemaModularCVisitor):
prefix, 'qapi-types', ' * Schema-defined QAPI types',
' * Built-in QAPI types', __doc__)
- def _begin_system_module(self, name: None) -> None:
+ def _begin_builtin_module(self) -> None:
self._genc.preamble_add(mcgen('''
#include "qemu/osdep.h"
#include "qapi/dealloc-visitor.h"
@@ -350,7 +350,7 @@ class QAPISchemaGenTypeVisitor(QAPISchemaModularCVisitor):
def visit_alternate_type(self,
name: str,
- info: QAPISourceInfo,
+ info: Optional[QAPISourceInfo],
ifcond: List[str],
features: List[QAPISchemaFeature],
variants: QAPISchemaVariants) -> None:
diff --git a/scripts/qapi/visit.py b/scripts/qapi/visit.py
index 339f152152..22e62df901 100644
--- a/scripts/qapi/visit.py
+++ b/scripts/qapi/visit.py
@@ -305,7 +305,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor):
prefix, 'qapi-visit', ' * Schema-defined QAPI visitors',
' * Built-in QAPI visitors', __doc__)
- def _begin_system_module(self, name: None) -> None:
+ def _begin_builtin_module(self) -> None:
self._genc.preamble_add(mcgen('''
#include "qemu/osdep.h"
#include "qapi/error.h"
@@ -336,7 +336,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor):
def visit_enum_type(self,
name: str,
- info: QAPISourceInfo,
+ info: Optional[QAPISourceInfo],
ifcond: List[str],
features: List[QAPISchemaFeature],
members: List[QAPISchemaEnumMember],
@@ -378,7 +378,7 @@ class QAPISchemaGenVisitVisitor(QAPISchemaModularCVisitor):
def visit_alternate_type(self,
name: str,
- info: QAPISourceInfo,
+ info: Optional[QAPISourceInfo],
ifcond: List[str],
features: List[QAPISchemaFeature],
variants: QAPISchemaVariants) -> None:
diff --git a/scripts/userfaultfd-wrlat.py b/scripts/userfaultfd-wrlat.py
new file mode 100755
index 0000000000..0684be4e04
--- /dev/null
+++ b/scripts/userfaultfd-wrlat.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python3
+#
+# userfaultfd-wrlat Summarize userfaultfd write fault latencies.
+# Events are continuously accumulated for the
+# run, while the latency distribution histogram
+# is dumped every 'interval' seconds.
+#
+# For Linux, uses BCC, eBPF.
+#
+# USAGE: userfaultfd-wrlat [interval [count]]
+#
+# Copyright Virtuozzo GmbH, 2020
+#
+# Authors:
+# Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+
+def usage():
+ print("USAGE: %s [interval [count]]" % argv[0])
+ exit()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/mm.h>
+
+BPF_HASH(ev_start, u32, u64);
+BPF_HISTOGRAM(ev_delta_hist, u64);
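+/* ev_start: thread id -> fault entry timestamp;
+ * ev_delta_hist: log2 buckets of fault latency in msecs */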
+
+/* Trace UFFD page fault start event. */
+static void do_event_start()
+{
+ /* Cast to u32 to drop the group ID held in the upper 32 bits */
+ u32 tid = (u32) bpf_get_current_pid_tgid();
+ u64 ts = bpf_ktime_get_ns();
+
+ ev_start.update(&tid, &ts);
+}
+
+/* Trace UFFD page fault end event. */
+static void do_event_end()
+{
+ /* Cast to u32 to drop the group ID held in the upper 32 bits */
+ u32 tid = (u32) bpf_get_current_pid_tgid();
+ u64 ts = bpf_ktime_get_ns();
+ u64 *tsp;
+
+ tsp = ev_start.lookup(&tid);
+ if (tsp) {
+ u64 delta = ts - (*tsp);
+ /* Convert the nanosecond delta to milliseconds */
+ ev_delta_hist.increment(bpf_log2l(delta / 1000000));
+ ev_start.delete(&tid);
+ }
+}
+
+/* KPROBE for handle_userfault(). */
+int probe_handle_userfault(struct pt_regs *ctx, struct vm_fault *vmf,
+ unsigned long reason)
+{
+ /* Trace only UFFD write faults. */
+ if (reason & VM_UFFD_WP) {
+ do_event_start();
+ }
+ return 0;
+}
+
+/* KRETPROBE for handle_userfault(). */
+int retprobe_handle_userfault(struct pt_regs *ctx)
+{
+ do_event_end();
+ return 0;
+}
+"""
+
+# arguments
+interval = 10
+count = -1
+if len(argv) > 1:
+ try:
+ interval = int(argv[1])
+ if interval == 0:
+ raise ValueError("interval must be non-zero")
+ if len(argv) > 2:
+ count = int(argv[2])
+ except Exception: # also catches -h, --help
+ usage()
+
+# load BPF program
+b = BPF(text=bpf_text)
+# attach kprobe and kretprobe to handle_userfault()
+b.attach_kprobe(event="handle_userfault", fn_name="probe_handle_userfault")
+b.attach_kretprobe(event="handle_userfault", fn_name="retprobe_handle_userfault")
+
+# header
+print("Tracing UFFD-WP write fault latency... Hit Ctrl-C to end.")
+
+# output
+loop = 0
+do_exit = 0
+while True:
+ if count > 0:
+ loop += 1
+ if loop > count:
+ exit()
+ try:
+ sleep(interval)
+ except KeyboardInterrupt:
+ do_exit = 1
+
+ print()
+ b["ev_delta_hist"].print_log2_hist("msecs")
+ if do_exit:
+ exit()
diff --git a/softmmu/cpu-throttle.c b/softmmu/cpu-throttle.c
index 2ec4b8e0bc..8c2144ab95 100644
--- a/softmmu/cpu-throttle.c
+++ b/softmmu/cpu-throttle.c
@@ -90,14 +90,21 @@ static void cpu_throttle_timer_tick(void *opaque)
void cpu_throttle_set(int new_throttle_pct)
{
+ /*
+ * Remember whether the throttle was already active before
+ * throttle_percentage is modified below.
+ */
+ bool throttle_active = cpu_throttle_active();
+
/* Ensure throttle percentage is within valid range */
new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
qatomic_set(&throttle_percentage, new_throttle_pct);
- timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
- CPU_THROTTLE_TIMESLICE_NS);
+ if (!throttle_active) {
+ cpu_throttle_timer_tick(NULL);
+ }
}
void cpu_throttle_stop(void)
diff --git a/softmmu/memory.c b/softmmu/memory.c
index c0c814fbb9..874a8fccde 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -1440,7 +1440,7 @@ MemTxResult memory_region_dispatch_read(MemoryRegion *mr,
unsigned size = memop_size(op);
MemTxResult r;
- fuzz_dma_read_cb(addr, size, mr, false);
+ fuzz_dma_read_cb(addr, size, mr);
if (!memory_region_access_valid(mr, addr, size, false, attrs)) {
*pval = unassigned_mem_read(mr, addr, size);
return MEMTX_DECODE_ERROR;
@@ -1612,6 +1612,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
uint64_t size,
bool share,
int fd,
+ ram_addr_t offset,
Error **errp)
{
Error *err = NULL;
@@ -1621,7 +1622,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
mr->destructor = memory_region_destructor_ram;
mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
share ? RAM_SHARED : 0,
- fd, false, &err);
+ fd, offset, false, &err);
if (err) {
mr->size = int128_zero();
object_unparent(OBJECT(mr));
@@ -3285,8 +3286,7 @@ void memory_region_init_rom_device(MemoryRegion *mr,
#ifdef CONFIG_FUZZ
void __attribute__((weak)) fuzz_dma_read_cb(size_t addr,
size_t len,
- MemoryRegion *mr,
- bool is_write)
+ MemoryRegion *mr)
{
}
#endif
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 243c3097d3..19e0aa9836 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -1543,6 +1543,7 @@ static void *file_ram_alloc(RAMBlock *block,
int fd,
bool readonly,
bool truncate,
+ off_t offset,
Error **errp)
{
void *area;
@@ -1593,7 +1594,8 @@ static void *file_ram_alloc(RAMBlock *block,
}
area = qemu_ram_mmap(fd, memory, block->mr->align, readonly,
- block->flags & RAM_SHARED, block->flags & RAM_PMEM);
+ block->flags & RAM_SHARED, block->flags & RAM_PMEM,
+ offset);
if (area == MAP_FAILED) {
error_setg_errno(errp, errno,
"unable to map backing store for guest RAM");
@@ -2024,8 +2026,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
#ifdef CONFIG_POSIX
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
- uint32_t ram_flags, int fd, bool readonly,
- Error **errp)
+ uint32_t ram_flags, int fd, off_t offset,
+ bool readonly, Error **errp)
{
RAMBlock *new_block;
Error *local_err = NULL;
@@ -2079,7 +2081,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
new_block->max_length = size;
new_block->flags = ram_flags;
new_block->host = file_ram_alloc(new_block, size, fd, readonly,
- !file_size, errp);
+ !file_size, offset, errp);
if (!new_block->host) {
g_free(new_block);
return NULL;
@@ -2110,7 +2112,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
return NULL;
}
- block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, readonly, errp);
+ block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, readonly, errp);
if (!block) {
if (created) {
unlink(mem_path);
@@ -2839,7 +2841,7 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
stn_he_p(buf, l, val);
} else {
/* RAM case */
- fuzz_dma_read_cb(addr, len, mr, false);
+ fuzz_dma_read_cb(addr, len, mr);
ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
memcpy(buf, ram_ptr, l);
}
@@ -3200,7 +3202,7 @@ void *address_space_map(AddressSpace *as,
memory_region_ref(mr);
*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
l, is_write, attrs);
- fuzz_dma_read_cb(addr, *plen, mr, is_write);
+ fuzz_dma_read_cb(addr, *plen, mr);
ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
return ptr;
diff --git a/softmmu/rtc.c b/softmmu/rtc.c
index e1e15ef613..5632684fc9 100644
--- a/softmmu/rtc.c
+++ b/softmmu/rtc.c
@@ -179,7 +179,8 @@ void configure_rtc(QemuOpts *opts)
if (!strcmp(value, "slew")) {
object_register_sugar_prop("mc146818rtc",
"lost_tick_policy",
- "slew");
+ "slew",
+ false);
} else if (!strcmp(value, "none")) {
/* discard is default */
} else {
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 2bf94ece9c..b219ce1f35 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -101,6 +101,7 @@
#include "qemu/plugin.h"
#include "qemu/queue.h"
#include "sysemu/arch_init.h"
+#include "exec/confidential-guest-support.h"
#include "ui/qemu-spice.h"
#include "qapi/string-input-visitor.h"
@@ -1663,16 +1664,20 @@ static int machine_set_property(void *opaque,
return 0;
}
if (g_str_equal(qom_name, "igd-passthru")) {
- object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), qom_name, value);
+ object_register_sugar_prop(ACCEL_CLASS_NAME("xen"), qom_name, value,
+ false);
return 0;
}
if (g_str_equal(qom_name, "kvm-shadow-mem")) {
- object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value);
+ object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value,
+ false);
return 0;
}
if (g_str_equal(qom_name, "kernel-irqchip")) {
- object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value);
- object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), qom_name, value);
+ object_register_sugar_prop(ACCEL_CLASS_NAME("kvm"), qom_name, value,
+ false);
+ object_register_sugar_prop(ACCEL_CLASS_NAME("whpx"), qom_name, value,
+ false);
return 0;
}
@@ -2298,9 +2303,10 @@ static void qemu_process_sugar_options(void)
val = g_strdup_printf("%d",
(uint32_t) qemu_opt_get_number(qemu_find_opts_singleton("smp-opts"), "cpus", 1));
- object_register_sugar_prop("memory-backend", "prealloc-threads", val);
+ object_register_sugar_prop("memory-backend", "prealloc-threads", val,
+ false);
g_free(val);
- object_register_sugar_prop("memory-backend", "prealloc", "on");
+ object_register_sugar_prop("memory-backend", "prealloc", "on", false);
}
if (watchdog) {
@@ -2493,6 +2499,8 @@ static void qemu_create_cli_devices(void)
static void qemu_machine_creation_done(void)
{
+ MachineState *machine = MACHINE(qdev_get_machine());
+
/* Did we create any drives that we failed to create a device for? */
drive_check_orphaned();
@@ -2512,6 +2520,13 @@ static void qemu_machine_creation_done(void)
qdev_machine_creation_done();
+ if (machine->cgs) {
+ /*
+ * Verify that Confidential Guest Support has actually been initialized
+ */
+ assert(machine->cgs->ready);
+ }
+
if (foreach_device_config(DEV_GDB, gdbserver_start) < 0) {
exit(1);
}
@@ -2530,7 +2545,7 @@ void qmp_x_exit_preconfig(Error **errp)
if (loadvm) {
Error *local_err = NULL;
- if (load_snapshot(loadvm, &local_err) < 0) {
+ if (!load_snapshot(loadvm, NULL, false, NULL, &local_err)) {
error_report_err(local_err);
autostart = 0;
exit(1);
diff --git a/stubs/meson.build b/stubs/meson.build
index 1a656cd070..a054d5877f 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -53,4 +53,6 @@ endif
if have_system
stub_ss.add(files('semihost.c'))
stub_ss.add(files('xen-hw-stub.c'))
+else
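+ # qdev QAPI event stubs for builds without system emulation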
+ stub_ss.add(files('qdev.c'))
endif
diff --git a/stubs/qdev.c b/stubs/qdev.c
new file mode 100644
index 0000000000..92e6143134
--- /dev/null
+++ b/stubs/qdev.c
@@ -0,0 +1,23 @@
+/*
+ * QOM stubs
+ *
+ * Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Author:
+ * Philippe Mathieu-Daudé <philmd@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/qapi-events-qdev.h"
+
+void qapi_event_send_device_deleted(bool has_device,
+ const char *device,
+ const char *path)
+{
+ /* Nothing to do. */
+}
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 8ddb2556f8..5cf6c056c5 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2202,6 +2202,10 @@ static void arm_max_initfn(Object *obj)
t = FIELD_DP32(t, ID_MMFR4, CNP, 1); /* TTCNP */
t = FIELD_DP32(t, ID_MMFR4, XNX, 1); /* TTS2UXN */
cpu->isar.id_mmfr4 = t;
+
+ t = cpu->isar.id_pfr0;
+ t = FIELD_DP32(t, ID_PFR0, DIT, 1);
+ cpu->isar.id_pfr0 = t;
}
#endif
}
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index d080239863..f240275407 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -1243,6 +1243,7 @@ void pmu_init(ARMCPU *cpu);
#define CPSR_IT_2_7 (0xfc00U)
#define CPSR_GE (0xfU << 16)
#define CPSR_IL (1U << 20)
+#define CPSR_DIT (1U << 21)
#define CPSR_PAN (1U << 22)
#define CPSR_J (1U << 24)
#define CPSR_IT_0_1 (3U << 25)
@@ -1310,6 +1311,7 @@ void pmu_init(ARMCPU *cpu);
#define PSTATE_SS (1U << 21)
#define PSTATE_PAN (1U << 22)
#define PSTATE_UAO (1U << 23)
+#define PSTATE_DIT (1U << 24)
#define PSTATE_TCO (1U << 25)
#define PSTATE_V (1U << 28)
#define PSTATE_C (1U << 29)
@@ -3876,6 +3878,11 @@ static inline bool isar_feature_aa32_tts2uxn(const ARMISARegisters *id)
return FIELD_EX32(id->id_mmfr4, ID_MMFR4, XNX) != 0;
}
+static inline bool isar_feature_aa32_dit(const ARMISARegisters *id)
+{
+ return FIELD_EX32(id->id_pfr0, ID_PFR0, DIT) != 0;
+}
+
/*
* 64-bit feature tests via id registers.
*/
@@ -4033,6 +4040,11 @@ static inline bool isar_feature_aa64_aa32(const ARMISARegisters *id)
return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, EL0) >= 2;
}
+static inline bool isar_feature_aa64_aa32_el1(const ARMISARegisters *id)
+{
+ return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, EL1) >= 2;
+}
+
static inline bool isar_feature_aa64_sve(const ARMISARegisters *id)
{
return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, SVE) != 0;
@@ -4120,6 +4132,11 @@ static inline bool isar_feature_aa64_tts2uxn(const ARMISARegisters *id)
return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, XNX) != 0;
}
+static inline bool isar_feature_aa64_dit(const ARMISARegisters *id)
+{
+ return FIELD_EX64(id->id_aa64pfr0, ID_AA64PFR0, DIT) != 0;
+}
+
/*
* Feature tests for "does this exist in either 32-bit or 64-bit?"
*/
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 10c5118176..c255f1bcc3 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -669,6 +669,7 @@ static void aarch64_max_initfn(Object *obj)
t = FIELD_DP64(t, ID_AA64PFR0, FP, 1);
t = FIELD_DP64(t, ID_AA64PFR0, ADVSIMD, 1);
t = FIELD_DP64(t, ID_AA64PFR0, SEL2, 1);
+ t = FIELD_DP64(t, ID_AA64PFR0, DIT, 1);
cpu->isar.id_aa64pfr0 = t;
t = cpu->isar.id_aa64pfr1;
@@ -718,6 +719,10 @@ static void aarch64_max_initfn(Object *obj)
u = FIELD_DP32(u, ID_ISAR6, SPECRES, 1);
cpu->isar.id_isar6 = u;
+ u = cpu->isar.id_pfr0;
+ u = FIELD_DP32(u, ID_PFR0, DIT, 1);
+ cpu->isar.id_pfr0 = u;
+
u = cpu->isar.id_mmfr3;
u = FIELD_DP32(u, ID_MMFR3, PAN, 2); /* ATS1E1 */
cpu->isar.id_mmfr3 = u;
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index c426c23d2c..ae611d73c2 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -945,11 +945,31 @@ static int el_from_spsr(uint32_t spsr)
}
}
+static void cpsr_write_from_spsr_elx(CPUARMState *env,
+ uint32_t val)
+{
+ uint32_t mask;
+
+ /* Save SPSR_ELx.SS into PSTATE. */
+ env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
+ val &= ~PSTATE_SS;
+
+ /* Move DIT to the correct location for CPSR */
+ if (val & PSTATE_DIT) {
+ val &= ~PSTATE_DIT;
+ val |= CPSR_DIT;
+ }
+
+ mask = aarch32_cpsr_valid_mask(env->features,
+ &env_archcpu(env)->isar);
+ cpsr_write(env, val, mask, CPSRWriteRaw);
+}
+
void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
{
int cur_el = arm_current_el(env);
unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
- uint32_t mask, spsr = env->banked_spsr[spsr_idx];
+ uint32_t spsr = env->banked_spsr[spsr_idx];
int new_el;
bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;
@@ -998,10 +1018,9 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
* will sort the register banks out for us, and we've already
* caught all the bad-mode cases in el_from_spsr().
*/
- mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar);
- cpsr_write(env, spsr, mask, CPSRWriteRaw);
+ cpsr_write_from_spsr_elx(env, spsr);
if (!arm_singlestep_active(env)) {
- env->uncached_cpsr &= ~PSTATE_SS;
+ env->pstate &= ~PSTATE_SS;
}
aarch64_sync_64_to_32(env);
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 1a64bd748c..0e1a3b9421 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -38,6 +38,7 @@
#endif
#define ARM_CPU_FREQ 1000000000 /* FIXME: 1 GHz, should be configurable */
+#define PMCR_NUM_COUNTERS 4 /* QEMU IMPDEF choice */
#ifndef CONFIG_USER_ONLY
@@ -2024,7 +2025,10 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
ARMCPU *cpu = env_archcpu(env);
if (ri->state == ARM_CP_STATE_AA64) {
- value |= SCR_FW | SCR_AW; /* these two bits are RES1. */
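+ /* FW and AW are RES1 only when EL1 cannot be AArch32. */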
+ if (arm_feature(env, ARM_FEATURE_AARCH64) &&
+ !cpu_isar_feature(aa64_aa32_el1, cpu)) {
+ value |= SCR_FW | SCR_AW; /* these two bits are RES1. */
+ }
valid_mask &= ~SCR_NET;
if (cpu_isar_feature(aa64_lor, cpu)) {
@@ -2063,6 +2067,15 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value)
raw_write(env, ri, value);
}
+static void scr_reset(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+ /*
+ * scr_write will set the RES1 bits on an AArch64-only CPU.
+ * The reset value will be 0x30 on an AArch64-only CPU and 0 otherwise.
+ */
+ scr_write(env, ri, 0);
+}
+
static CPAccessResult access_aa64_tid2(CPUARMState *env,
const ARMCPRegInfo *ri,
bool isread)
@@ -4419,6 +4432,24 @@ static const ARMCPRegInfo uao_reginfo = {
.readfn = aa64_uao_read, .writefn = aa64_uao_write
};
+static uint64_t aa64_dit_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+ return env->pstate & PSTATE_DIT;
+}
+
+static void aa64_dit_write(CPUARMState *env, const ARMCPRegInfo *ri,
+ uint64_t value)
+{
+ env->pstate = (env->pstate & ~PSTATE_DIT) | (value & PSTATE_DIT);
+}
+
+static const ARMCPRegInfo dit_reginfo = {
+ .name = "DIT", .state = ARM_CP_STATE_AA64,
+ .opc0 = 3, .opc1 = 3, .crn = 4, .crm = 2, .opc2 = 5,
+ .type = ARM_CP_NO_RAW, .access = PL0_RW,
+ .readfn = aa64_dit_read, .writefn = aa64_dit_write
+};
+
static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
const ARMCPRegInfo *ri,
bool isread)
@@ -5705,13 +5736,11 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
.writefn = gt_hyp_ctl_write, .raw_writefn = raw_write },
#endif
/* The only field of MDCR_EL2 that has a defined architectural reset value
- * is MDCR_EL2.HPMN which should reset to the value of PMCR_EL0.N; but we
- * don't implement any PMU event counters, so using zero as a reset
- * value for MDCR_EL2 is okay
+ * is MDCR_EL2.HPMN which should reset to the value of PMCR_EL0.N.
*/
{ .name = "MDCR_EL2", .state = ARM_CP_STATE_BOTH,
.opc0 = 3, .opc1 = 4, .crn = 1, .crm = 1, .opc2 = 1,
- .access = PL2_RW, .resetvalue = 0,
+ .access = PL2_RW, .resetvalue = PMCR_NUM_COUNTERS,
.fieldoffset = offsetof(CPUARMState, cp15.mdcr_el2), },
{ .name = "HPFAR", .state = ARM_CP_STATE_AA32,
.cp = 15, .opc1 = 4, .crn = 6, .crm = 0, .opc2 = 4,
@@ -5785,7 +5814,7 @@ static const ARMCPRegInfo el3_cp_reginfo[] = {
{ .name = "SCR_EL3", .state = ARM_CP_STATE_AA64,
.opc0 = 3, .opc1 = 6, .crn = 1, .crm = 1, .opc2 = 0,
.access = PL3_RW, .fieldoffset = offsetof(CPUARMState, cp15.scr_el3),
- .resetvalue = 0, .writefn = scr_write },
+ .resetfn = scr_reset, .writefn = scr_write },
{ .name = "SCR", .type = ARM_CP_ALIAS | ARM_CP_NEWEL,
.cp = 15, .opc1 = 0, .crn = 1, .crm = 1, .opc2 = 0,
.access = PL1_RW, .accessfn = access_trap_aa32s_el1,
@@ -6642,7 +6671,7 @@ static void define_pmu_regs(ARMCPU *cpu)
* field as main ID register, and we implement four counters in
* addition to the cycle count register.
*/
- unsigned int i, pmcrn = 4;
+ unsigned int i, pmcrn = PMCR_NUM_COUNTERS;
ARMCPRegInfo pmcr = {
.name = "PMCR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 0,
.access = PL0_RW,
@@ -8212,6 +8241,10 @@ void register_cp_regs_for_features(ARMCPU *cpu)
define_one_arm_cp_reg(cpu, &uao_reginfo);
}
+ if (cpu_isar_feature(aa64_dit, cpu)) {
+ define_one_arm_cp_reg(cpu, &dit_reginfo);
+ }
+
if (arm_feature(env, ARM_FEATURE_EL2) && cpu_isar_feature(aa64_vh, cpu)) {
define_arm_cp_regs(cpu, vhe_reginfo);
}
@@ -9411,7 +9444,7 @@ static void take_aarch32_exception(CPUARMState *env, int new_mode,
* For exceptions taken to AArch32 we must clear the SS bit in both
* PSTATE and in the old-state value we save to SPSR_<mode>, so zero it now.
*/
- env->uncached_cpsr &= ~PSTATE_SS;
+ env->pstate &= ~PSTATE_SS;
env->spsr = cpsr_read(env);
/* Clear IT bits. */
env->condexec_bits = 0;
@@ -9767,6 +9800,21 @@ static int aarch64_regnum(CPUARMState *env, int aarch32_reg)
}
}
+static uint32_t cpsr_read_for_spsr_elx(CPUARMState *env)
+{
+ uint32_t ret = cpsr_read(env);
+
+ /* Move DIT to the correct location for SPSR_ELx */
+ if (ret & CPSR_DIT) {
+ ret &= ~CPSR_DIT;
+ ret |= PSTATE_DIT;
+ }
+ /* Merge PSTATE.SS into SPSR_ELx */
+ ret |= env->pstate & PSTATE_SS;
+
+ return ret;
+}
+
/* Handle exception entry to a target EL which is using AArch64 */
static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
{
@@ -9889,7 +9937,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
aarch64_save_sp(env, arm_current_el(env));
env->elr_el[new_el] = env->pc;
} else {
- old_mode = cpsr_read(env);
+ old_mode = cpsr_read_for_spsr_elx(env);
env->elr_el[new_el] = env->regs[15];
aarch64_sync_32_to_64(env);
@@ -13183,7 +13231,6 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
target_ulong *cs_base, uint32_t *pflags)
{
uint32_t flags = env->hflags;
- uint32_t pstate_for_ss;
*cs_base = 0;
assert_hflags_rebuild_correctly(env);
@@ -13193,7 +13240,6 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
if (cpu_isar_feature(aa64_bti, env_archcpu(env))) {
flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
}
- pstate_for_ss = env->pstate;
} else {
*pc = env->regs[15];
@@ -13241,7 +13287,6 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
flags = FIELD_DP32(flags, TBFLAG_AM32, THUMB, env->thumb);
flags = FIELD_DP32(flags, TBFLAG_AM32, CONDEXEC, env->condexec_bits);
- pstate_for_ss = env->uncached_cpsr;
}
/*
@@ -13254,7 +13299,7 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
* SS_ACTIVE is set in hflags; PSTATE_SS is computed every TB.
*/
if (FIELD_EX32(flags, TBFLAG_ANY, SS_ACTIVE) &&
- (pstate_for_ss & PSTATE_SS)) {
+ (env->pstate & PSTATE_SS)) {
flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
}
diff --git a/target/arm/internals.h b/target/arm/internals.h
index 448982dd2f..b251fe4450 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1228,6 +1228,9 @@ static inline uint32_t aarch32_cpsr_valid_mask(uint64_t features,
if (isar_feature_aa32_pan(id)) {
valid |= CPSR_PAN;
}
+ if (isar_feature_aa32_dit(id)) {
+ valid |= CPSR_DIT;
+ }
return valid;
}
@@ -1246,6 +1249,9 @@ static inline uint32_t aarch64_pstate_valid_mask(const ARMISARegisters *id)
if (isar_feature_aa64_uao(id)) {
valid |= PSTATE_UAO;
}
+ if (isar_feature_aa64_dit(id)) {
+ valid |= PSTATE_DIT;
+ }
if (isar_feature_aa64_mte(id)) {
valid |= PSTATE_TCO;
}
diff --git a/target/arm/machine.c b/target/arm/machine.c
index 581852bc53..6ad1d306b1 100644
--- a/target/arm/machine.c
+++ b/target/arm/machine.c
@@ -810,7 +810,7 @@ const VMStateDescription vmstate_arm_cpu = {
VMSTATE_UINT64(env.exclusive_addr, ARMCPU),
VMSTATE_UINT64(env.exclusive_val, ARMCPU),
VMSTATE_UINT64(env.exclusive_high, ARMCPU),
- VMSTATE_UINT64(env.features, ARMCPU),
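+ /* env.features is no longer migrated; keep its slot for
+ * migration-stream compatibility. */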
+ VMSTATE_UNUSED(sizeof(uint64_t)),
VMSTATE_UINT32(env.exception.syndrome, ARMCPU),
VMSTATE_UINT32(env.exception.fsr, ARMCPU),
VMSTATE_UINT64(env.exception.vaddress, ARMCPU),
diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c
index 5e0f123043..65cb37d088 100644
--- a/target/arm/op_helper.c
+++ b/target/arm/op_helper.c
@@ -389,14 +389,7 @@ void HELPER(exception_bkpt_insn)(CPUARMState *env, uint32_t syndrome)
uint32_t HELPER(cpsr_read)(CPUARMState *env)
{
- /*
- * We store the ARMv8 PSTATE.SS bit in env->uncached_cpsr.
- * This is convenient for populating SPSR_ELx, but must be
- * hidden from aarch32 mode, where it is not visible.
- *
- * TODO: ARMv8.4-DIT -- need to move SS somewhere else.
- */
- return cpsr_read(env) & ~(CPSR_EXEC | PSTATE_SS);
+ return cpsr_read(env) & ~CPSR_EXEC;
}
void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index ffc060e5d7..1c4b8d02f3 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -1700,6 +1700,18 @@ static void handle_msr_i(DisasContext *s, uint32_t insn,
tcg_temp_free_i32(t1);
break;
+ case 0x1a: /* DIT */
+ if (!dc_isar_feature(aa64_dit, s)) {
+ goto do_unallocated;
+ }
+ if (crm & 1) {
+ set_pstate_bits(PSTATE_DIT);
+ } else {
+ clear_pstate_bits(PSTATE_DIT);
+ }
+ /* There's no need to rebuild hflags because DIT is a nop */
+ break;
+
case 0x1e: /* DAIFSet */
t1 = tcg_const_i32(crm);
gen_helper_msr_i_daifset(cpu_env, t1);
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index ae89024d36..9c3d2d60b7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -667,7 +667,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
CPUID_7_0_EBX_RDSEED */
#define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_PKU | \
/* CPUID_7_0_ECX_OSPKE is dynamic */ \
- CPUID_7_0_ECX_LA57)
+ CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS)
#define TCG_7_0_EDX_FEATURES 0
#define TCG_7_1_EAX_FEATURES 0
#define TCG_APM_FEATURES 0
@@ -926,11 +926,11 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
"npt", "lbrv", "svm-lock", "nrip-save",
"tsc-scale", "vmcb-clean", "flushbyasid", "decodeassists",
NULL, NULL, "pause-filter", NULL,
- "pfthreshold", NULL, NULL, NULL,
- NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, NULL,
+ "pfthreshold", "avic", NULL, "v-vmsave-vmload",
+ "vgif", NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
+ "svme-addr-chk", NULL, NULL, NULL,
},
.cpuid = { .eax = 0x8000000A, .reg = R_EDX, },
.tcg_features = TCG_SVM_FEATURES,
@@ -964,7 +964,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
"la57", NULL, NULL, NULL,
NULL, NULL, "rdpid", NULL,
NULL, "cldemote", NULL, "movdiri",
- "movdir64b", NULL, NULL, NULL,
+ "movdir64b", NULL, NULL, "pks",
},
.cpuid = {
.eax = 7,
@@ -1215,7 +1215,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
"vmx-exit-save-efer", "vmx-exit-load-efer",
"vmx-exit-save-preemption-timer", "vmx-exit-clear-bndcfgs",
NULL, "vmx-exit-clear-rtit-ctl", NULL, NULL,
- NULL, NULL, NULL, NULL,
+ NULL, "vmx-exit-load-pkrs", NULL, NULL,
},
.msr = {
.index = MSR_IA32_VMX_TRUE_EXIT_CTLS,
@@ -1230,7 +1230,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
NULL, "vmx-entry-ia32e-mode", NULL, NULL,
NULL, "vmx-entry-load-perf-global-ctrl", "vmx-entry-load-pat", "vmx-entry-load-efer",
"vmx-entry-load-bndcfgs", NULL, "vmx-entry-load-rtit-ctl", NULL,
- NULL, NULL, NULL, NULL,
+ NULL, NULL, "vmx-entry-load-pkrs", NULL,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
},
@@ -5073,6 +5073,11 @@ static uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
} else {
return ~0;
}
+#ifndef TARGET_X86_64
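+ /* A 32-bit build cannot run 64-bit guests; never advertise long mode. */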
+ if (w == FEAT_8000_0001_EDX) {
+ r &= ~CPUID_EXT2_LM;
+ }
+#endif
if (migratable_only) {
r &= x86_cpu_get_migratable_flags(w);
}
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index d23a5b340a..8d599bb5b8 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -247,6 +247,7 @@ typedef enum X86Seg {
#define CR4_SMEP_MASK (1U << 20)
#define CR4_SMAP_MASK (1U << 21)
#define CR4_PKE_MASK (1U << 22)
+#define CR4_PKS_MASK (1U << 24)
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
@@ -357,6 +358,7 @@ typedef enum X86Seg {
#define MSR_IA32_TSX_CTRL 0x122
#define MSR_IA32_TSCDEADLINE 0x6e0
+#define MSR_IA32_PKRS 0x6e1
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
@@ -670,16 +672,20 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
#define CPUID_EXT3_PERFCORE (1U << 23)
#define CPUID_EXT3_PERFNB (1U << 24)
-#define CPUID_SVM_NPT (1U << 0)
-#define CPUID_SVM_LBRV (1U << 1)
-#define CPUID_SVM_SVMLOCK (1U << 2)
-#define CPUID_SVM_NRIPSAVE (1U << 3)
-#define CPUID_SVM_TSCSCALE (1U << 4)
-#define CPUID_SVM_VMCBCLEAN (1U << 5)
-#define CPUID_SVM_FLUSHASID (1U << 6)
-#define CPUID_SVM_DECODEASSIST (1U << 7)
-#define CPUID_SVM_PAUSEFILTER (1U << 10)
-#define CPUID_SVM_PFTHRESHOLD (1U << 12)
+#define CPUID_SVM_NPT (1U << 0)
+#define CPUID_SVM_LBRV (1U << 1)
+#define CPUID_SVM_SVMLOCK (1U << 2)
+#define CPUID_SVM_NRIPSAVE (1U << 3)
+#define CPUID_SVM_TSCSCALE (1U << 4)
+#define CPUID_SVM_VMCBCLEAN (1U << 5)
+#define CPUID_SVM_FLUSHASID (1U << 6)
+#define CPUID_SVM_DECODEASSIST (1U << 7)
+#define CPUID_SVM_PAUSEFILTER (1U << 10)
+#define CPUID_SVM_PFTHRESHOLD (1U << 12)
+#define CPUID_SVM_AVIC (1U << 13)
+#define CPUID_SVM_V_VMSAVE_VMLOAD (1U << 15)
+#define CPUID_SVM_VGIF (1U << 16)
+#define CPUID_SVM_SVME_ADDR_CHK (1U << 28)
/* Support RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */
#define CPUID_7_0_EBX_FSGSBASE (1U << 0)
@@ -768,6 +774,8 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
#define CPUID_7_0_ECX_MOVDIRI (1U << 27)
/* Move 64 Bytes as Direct Store Instruction */
#define CPUID_7_0_ECX_MOVDIR64B (1U << 28)
+/* Protection Keys for Supervisor-mode Pages */
+#define CPUID_7_0_ECX_PKS (1U << 31)
/* AVX512 Neural Network Instructions */
#define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2)
@@ -965,6 +973,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
#define VMX_VM_EXIT_CLEAR_BNDCFGS 0x00800000
#define VMX_VM_EXIT_PT_CONCEAL_PIP 0x01000000
#define VMX_VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000
+#define VMX_VM_EXIT_LOAD_IA32_PKRS 0x20000000
#define VMX_VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
#define VMX_VM_ENTRY_IA32E_MODE 0x00000200
@@ -976,6 +985,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
#define VMX_VM_ENTRY_LOAD_BNDCFGS 0x00010000
#define VMX_VM_ENTRY_PT_CONCEAL_PIP 0x00020000
#define VMX_VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000
+#define VMX_VM_ENTRY_LOAD_IA32_PKRS 0x00400000
/* Supported Hyper-V Enlightenments */
#define HYPERV_FEAT_RELAXED 0
@@ -1483,6 +1493,7 @@ typedef struct CPUX86State {
uint64_t msr_smi_count;
uint32_t pkru;
+ uint32_t pkrs;
uint32_t tsx_ctrl;
uint64_t spec_ctrl;
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 6bb0c53182..618ad1c409 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -194,6 +194,9 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKU)) {
new_cr4 &= ~CR4_PKE_MASK;
}
+ if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) {
+ new_cr4 &= ~CR4_PKS_MASK;
+ }
env->cr[4] = new_cr4;
env->hflags = hflags;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 6dc1ee052d..e97f841757 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -42,6 +42,7 @@
#include "hw/i386/intel_iommu.h"
#include "hw/i386/x86-iommu.h"
#include "hw/i386/e820_memory_layout.h"
+#include "sysemu/sev.h"
#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
@@ -112,6 +113,7 @@ static bool has_msr_vmx_vmfunc;
static bool has_msr_ucode_rev;
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;
+static bool has_msr_pkrs;
static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
@@ -2086,6 +2088,9 @@ static int kvm_get_supported_msrs(KVMState *s)
case MSR_IA32_VMX_PROCBASED_CTLS2:
has_msr_vmx_procbased_ctls2 = true;
break;
+ case MSR_IA32_PKRS:
+ has_msr_pkrs = true;
+ break;
}
}
}
@@ -2135,6 +2140,25 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
uint64_t shadow_mem;
int ret;
struct utsname utsname;
+ Error *local_err = NULL;
+
+ /*
+ * Initialize SEV context, if required
+ *
+ * If no memory encryption is requested (ms->cgs == NULL) this is
+ * a no-op.
+ *
+ * It's also a no-op if a non-SEV confidential guest support
+ * mechanism is selected. SEV is the only mechanism available to
+ * select on x86 at present, so this doesn't arise, but if new
+ * mechanisms are supported in future (e.g. TDX), they'll need
+ * their own initialization either here or elsewhere.
+ */
+ ret = sev_kvm_init(ms->cgs, &local_err);
+ if (ret < 0) {
+ error_report_err(local_err);
+ return ret;
+ }
if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
@@ -2794,6 +2818,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
if (has_msr_smi_count) {
kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
}
+ if (has_msr_pkrs) {
+ kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
+ }
if (has_msr_bndcfgs) {
kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
}
@@ -3185,6 +3212,9 @@ static int kvm_get_msrs(X86CPU *cpu)
if (has_msr_feature_control) {
kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
}
+ if (has_msr_pkrs) {
+ kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
+ }
if (has_msr_bndcfgs) {
kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
}
@@ -3455,6 +3485,9 @@ static int kvm_get_msrs(X86CPU *cpu)
case MSR_IA32_UMWAIT_CONTROL:
env->umwait = msrs[i].data;
break;
+ case MSR_IA32_PKRS:
+ env->pkrs = msrs[i].data;
+ break;
default:
if (msrs[i].index >= MSR_MC0_CTL &&
msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 1614e8c2f8..3768a753af 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -980,7 +980,6 @@ static const VMStateDescription vmstate_umwait = {
}
};
-#ifdef TARGET_X86_64
static bool pkru_needed(void *opaque)
{
X86CPU *cpu = opaque;
@@ -999,7 +998,25 @@ static const VMStateDescription vmstate_pkru = {
VMSTATE_END_OF_LIST()
}
};
-#endif
+
+static bool pkrs_needed(void *opaque)
+{
+ X86CPU *cpu = opaque;
+ CPUX86State *env = &cpu->env;
+
+ return env->pkrs != 0;
+}
+
+static const VMStateDescription vmstate_pkrs = {
+ .name = "cpu/pkrs",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = pkrs_needed,
+ .fields = (VMStateField[]){
+ VMSTATE_UINT32(env.pkrs, X86CPU),
+ VMSTATE_END_OF_LIST()
+ }
+};
static bool tsc_khz_needed(void *opaque)
{
@@ -1480,9 +1497,8 @@ VMStateDescription vmstate_x86_cpu = {
&vmstate_umwait,
&vmstate_tsc_khz,
&vmstate_msr_smi_count,
-#ifdef TARGET_X86_64
&vmstate_pkru,
-#endif
+ &vmstate_pkrs,
&vmstate_spec_ctrl,
&vmstate_mcg_ext_ctl,
&vmstate_msr_intel_pt,
diff --git a/target/i386/sev-stub.c b/target/i386/sev-stub.c
index c1fecc2101..1ac1fd5b94 100644
--- a/target/i386/sev-stub.c
+++ b/target/i386/sev-stub.c
@@ -54,3 +54,8 @@ int sev_inject_launch_secret(const char *hdr, const char *secret,
{
return 1;
}
+
+int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
+{
+ return 0;
+}
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 1546606811..11c9a3cc21 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -31,6 +31,7 @@
#include "qom/object.h"
#include "exec/address-spaces.h"
#include "monitor/monitor.h"
+#include "exec/confidential-guest-support.h"
#define TYPE_SEV_GUEST "sev-guest"
OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST)
@@ -47,7 +48,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST)
* -machine ...,memory-encryption=sev0
*/
struct SevGuestState {
- Object parent_obj;
+ ConfidentialGuestSupport parent_obj;
/* configuration parameters */
char *sev_device;
@@ -322,7 +323,7 @@ sev_guest_instance_init(Object *obj)
/* sev guest info */
static const TypeInfo sev_guest_info = {
- .parent = TYPE_OBJECT,
+ .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT,
.name = TYPE_SEV_GUEST,
.instance_size = sizeof(SevGuestState),
.instance_finalize = sev_guest_finalize,
@@ -334,26 +335,6 @@ static const TypeInfo sev_guest_info = {
}
};
-static SevGuestState *
-lookup_sev_guest_info(const char *id)
-{
- Object *obj;
- SevGuestState *info;
-
- obj = object_resolve_path_component(object_get_objects_root(), id);
- if (!obj) {
- return NULL;
- }
-
- info = (SevGuestState *)
- object_dynamic_cast(obj, TYPE_SEV_GUEST);
- if (!info) {
- return NULL;
- }
-
- return info;
-}
-
bool
sev_enabled(void)
{
@@ -681,27 +662,24 @@ sev_vm_state_change(void *opaque, int running, RunState state)
}
}
-void *
-sev_guest_init(const char *id)
+int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
- SevGuestState *sev;
+ SevGuestState *sev
+ = (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST);
char *devname;
int ret, fw_error;
uint32_t ebx;
uint32_t host_cbitpos;
struct sev_user_data_status status = {};
+ if (!sev) {
+ return 0;
+ }
+
ret = ram_block_discard_disable(true);
if (ret) {
error_report("%s: cannot disable RAM discard", __func__);
- return NULL;
- }
-
- sev = lookup_sev_guest_info(id);
- if (!sev) {
- error_report("%s: '%s' is not a valid '%s' object",
- __func__, id, TYPE_SEV_GUEST);
- goto err;
+ return -1;
}
sev_guest = sev;
@@ -711,14 +689,14 @@ sev_guest_init(const char *id)
host_cbitpos = ebx & 0x3f;
if (host_cbitpos != sev->cbitpos) {
- error_report("%s: cbitpos check failed, host '%d' requested '%d'",
- __func__, host_cbitpos, sev->cbitpos);
+ error_setg(errp, "%s: cbitpos check failed, host '%d' requested '%d'",
+ __func__, host_cbitpos, sev->cbitpos);
goto err;
}
if (sev->reduced_phys_bits < 1) {
- error_report("%s: reduced_phys_bits check failed, it should be >=1,"
- " requested '%d'", __func__, sev->reduced_phys_bits);
+ error_setg(errp, "%s: reduced_phys_bits check failed, it should be >=1,"
+ " requested '%d'", __func__, sev->reduced_phys_bits);
goto err;
}
@@ -727,20 +705,19 @@ sev_guest_init(const char *id)
devname = object_property_get_str(OBJECT(sev), "sev-device", NULL);
sev->sev_fd = open(devname, O_RDWR);
if (sev->sev_fd < 0) {
- error_report("%s: Failed to open %s '%s'", __func__,
- devname, strerror(errno));
- }
- g_free(devname);
- if (sev->sev_fd < 0) {
+ error_setg(errp, "%s: Failed to open %s '%s'", __func__,
+ devname, strerror(errno));
+ g_free(devname);
goto err;
}
+ g_free(devname);
ret = sev_platform_ioctl(sev->sev_fd, SEV_PLATFORM_STATUS, &status,
&fw_error);
if (ret) {
- error_report("%s: failed to get platform status ret=%d "
- "fw_error='%d: %s'", __func__, ret, fw_error,
- fw_error_to_str(fw_error));
+ error_setg(errp, "%s: failed to get platform status ret=%d "
+ "fw_error='%d: %s'", __func__, ret, fw_error,
+ fw_error_to_str(fw_error));
goto err;
}
sev->build_id = status.build;
@@ -750,14 +727,14 @@ sev_guest_init(const char *id)
trace_kvm_sev_init();
ret = sev_ioctl(sev->sev_fd, KVM_SEV_INIT, NULL, &fw_error);
if (ret) {
- error_report("%s: failed to initialize ret=%d fw_error=%d '%s'",
- __func__, ret, fw_error, fw_error_to_str(fw_error));
+ error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'",
+ __func__, ret, fw_error, fw_error_to_str(fw_error));
goto err;
}
ret = sev_launch_start(sev);
if (ret) {
- error_report("%s: failed to create encryption context", __func__);
+ error_setg(errp, "%s: failed to create encryption context", __func__);
goto err;
}
@@ -765,23 +742,29 @@ sev_guest_init(const char *id)
qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
qemu_add_vm_change_state_handler(sev_vm_state_change, sev);
- return sev;
+ cgs->ready = true;
+
+ return 0;
err:
sev_guest = NULL;
ram_block_discard_disable(false);
- return NULL;
+ return -1;
}
int
-sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len)
+sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp)
{
- SevGuestState *sev = handle;
-
- assert(sev);
+ if (!sev_guest) {
+ return 0;
+ }
/* if SEV is in update state then encrypt the data else do nothing */
- if (sev_check_state(sev, SEV_STATE_LAUNCH_UPDATE)) {
- return sev_launch_update_data(sev, ptr, len);
+ if (sev_check_state(sev_guest, SEV_STATE_LAUNCH_UPDATE)) {
+ int ret = sev_launch_update_data(sev_guest, ptr, len);
+ if (ret < 0) {
+ error_setg(errp, "failed to encrypt pflash rom");
+ return ret;
+ }
}
return 0;
diff --git a/target/i386/tcg/excp_helper.c b/target/i386/tcg/excp_helper.c
index a0f44431fe..b7d6259e4a 100644
--- a/target/i386/tcg/excp_helper.c
+++ b/target/i386/tcg/excp_helper.c
@@ -361,6 +361,7 @@ static int handle_mmu_fault(CPUState *cs, vaddr addr, int size,
uint64_t rsvd_mask = PG_HI_RSVD_MASK;
uint32_t page_offset;
target_ulong vaddr;
+ uint32_t pkr;
is_user = mmu_idx == MMU_USER_IDX;
#if defined(DEBUG_MMU)
@@ -588,21 +589,28 @@ do_check_protect_pse36:
!((env->cr[4] & CR4_SMEP_MASK) && (ptep & PG_USER_MASK)))) {
prot |= PAGE_EXEC;
}
- if ((env->cr[4] & CR4_PKE_MASK) && (env->hflags & HF_LMA_MASK) &&
- (ptep & PG_USER_MASK) && env->pkru) {
+
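+ /*
+ * Pick the protection-key register for this page: PKRU guards
+ * user pages when CR4.PKE is set, PKRS guards supervisor pages
+ * when CR4.PKS is set. Protection keys apply only in long mode.
+ */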
+ if (!(env->hflags & HF_LMA_MASK)) {
+ pkr = 0;
+ } else if (ptep & PG_USER_MASK) {
+ pkr = env->cr[4] & CR4_PKE_MASK ? env->pkru : 0;
+ } else {
+ pkr = env->cr[4] & CR4_PKS_MASK ? env->pkrs : 0;
+ }
+ if (pkr) {
uint32_t pk = (pte & PG_PKRU_MASK) >> PG_PKRU_BIT;
- uint32_t pkru_ad = (env->pkru >> pk * 2) & 1;
- uint32_t pkru_wd = (env->pkru >> pk * 2) & 2;
- uint32_t pkru_prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
-
- if (pkru_ad) {
- pkru_prot &= ~(PAGE_READ | PAGE_WRITE);
- } else if (pkru_wd && (is_user || env->cr[0] & CR0_WP_MASK)) {
- pkru_prot &= ~PAGE_WRITE;
+ uint32_t pkr_ad = (pkr >> pk * 2) & 1;
+ uint32_t pkr_wd = (pkr >> pk * 2) & 2;
+ uint32_t pkr_prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+
+ if (pkr_ad) {
+ pkr_prot &= ~(PAGE_READ | PAGE_WRITE);
+ } else if (pkr_wd && (is_user || env->cr[0] & CR0_WP_MASK)) {
+ pkr_prot &= ~PAGE_WRITE;
}
- prot &= pkru_prot;
- if ((pkru_prot & (1 << is_write1)) == 0) {
+ prot &= pkr_prot;
+ if ((pkr_prot & (1 << is_write1)) == 0) {
assert(is_write1 != 2);
error_code |= PG_ERROR_PK_MASK;
goto do_fault_protect;
diff --git a/target/i386/tcg/misc_helper.c b/target/i386/tcg/misc_helper.c
index 0bd6c95749..f02e4fd400 100644
--- a/target/i386/tcg/misc_helper.c
+++ b/target/i386/tcg/misc_helper.c
@@ -244,6 +244,7 @@ void helper_rdmsr(CPUX86State *env)
void helper_wrmsr(CPUX86State *env)
{
uint64_t val;
+ CPUState *cs = env_cpu(env);
cpu_svm_check_intercept_param(env, SVM_EXIT_MSR, 1, GETPC());
@@ -296,6 +297,13 @@ void helper_wrmsr(CPUX86State *env)
case MSR_PAT:
env->pat = val;
break;
+ case MSR_IA32_PKRS:
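+ /* Bits 63:32 of IA32_PKRS are reserved; writing them raises #GP */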
+ if (val & 0xFFFFFFFF00000000ull) {
+ goto error;
+ }
+ env->pkrs = val;
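+ /* PKRS changes effective page protections, so flush cached translations */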
+ tlb_flush(cs);
+ break;
case MSR_VM_HSAVE_PA:
env->vm_hsave = val;
break;
@@ -399,6 +407,9 @@ void helper_wrmsr(CPUX86State *env)
/* XXX: exception? */
break;
}
+ return;
+error:
+ raise_exception_err_ra(env, EXCP0D_GPF, 0, GETPC());
}
void helper_rdmsr(CPUX86State *env)
@@ -430,6 +441,9 @@ void helper_rdmsr(CPUX86State *env)
case MSR_PAT:
val = env->pat;
break;
+ case MSR_IA32_PKRS:
+ val = env->pkrs;
+ break;
case MSR_VM_HSAVE_PA:
val = env->vm_hsave;
break;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 6a4c31f933..af1faf9342 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3075,7 +3075,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
}
if (is_xmm
&& !(s->flags & HF_OSFXSR_MASK)
- && ((b != 0x38 && b != 0x3a) || (s->prefix & PREFIX_DATA))) {
+ && (b != 0x38 && b != 0x3a)) {
goto unknown_op;
}
if (b == 0x0e) {
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 2609e4082e..e73416da68 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1919,6 +1919,7 @@ typedef PowerPCCPU ArchCPU;
#define SPR_750FX_HID2 (0x3F8)
#define SPR_Exxx_L1FINV0 (0x3F8)
#define SPR_L2CR (0x3F9)
+#define SPR_Exxx_L2CSR0 (0x3F9)
#define SPR_L3CR (0x3FA)
#define SPR_750_TDCH (0x3FA)
#define SPR_IABR2 (0x3FA)
@@ -1974,6 +1975,11 @@ typedef PowerPCCPU ArchCPU;
#define L1CSR1_ICFI 0x00000002 /* Instruction Cache Flash Invalidate */
#define L1CSR1_ICE 0x00000001 /* Instruction Cache Enable */
+/* E500 L2CSR0 */
+#define E500_L2CSR0_L2FI (1 << 21) /* L2 cache flash invalidate */
+#define E500_L2CSR0_L2FL (1 << 11) /* L2 cache flush */
+#define E500_L2CSR0_L2LFC (1 << 10) /* L2 cache lock flash clear */
+
/* HID0 bits */
#define HID0_DEEPNAP (1 << 24) /* pre-2.06 */
#define HID0_DOZE (1 << 23) /* pre-2.06 */
@@ -2205,9 +2211,6 @@ enum {
* may be needed for precise access rights control and precise exceptions.
*/
enum {
- /* 1 bit to define user level / supervisor access */
- ACCESS_USER = 0x00,
- ACCESS_SUPER = 0x01,
/* Type of instruction that generated the access */
ACCESS_CODE = 0x10, /* Code fetch access */
ACCESS_INT = 0x20, /* Integer load/store access */
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index daf690a678..0c5056dd5b 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -2929,21 +2929,3 @@ void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t tb_offset)
kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &tb_offset);
}
}
-
-/*
- * Don't set error if KVM_PPC_SVM_OFF ioctl is invoked on kernels
- * that don't support this ioctl.
- */
-void kvmppc_svm_off(Error **errp)
-{
- int rc;
-
- if (!kvm_enabled()) {
- return;
- }
-
- rc = kvm_vm_ioctl(KVM_STATE(current_accel()), KVM_PPC_SVM_OFF);
- if (rc && rc != -ENOTTY) {
- error_setg_errno(errp, -rc, "KVM_PPC_SVM_OFF ioctl failed");
- }
-}
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index 73ce2bc951..989f61ace0 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -39,7 +39,6 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
bool radix, bool gtse,
uint64_t proc_tbl);
-void kvmppc_svm_off(Error **errp);
#ifndef CONFIG_USER_ONLY
bool kvmppc_spapr_use_multitce(void);
int kvmppc_spapr_enable_inkernel_multitce(void);
@@ -216,11 +215,6 @@ static inline target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
return 0;
}
-static inline void kvmppc_svm_off(Error **errp)
-{
- return;
-}
-
static inline void kvmppc_set_reg_ppc_online(PowerPCCPU *cpu,
unsigned int online)
{
diff --git a/target/ppc/translate_init.c.inc b/target/ppc/translate_init.c.inc
index 9867d0a6e4..3ec45cbc19 100644
--- a/target/ppc/translate_init.c.inc
+++ b/target/ppc/translate_init.c.inc
@@ -1735,6 +1735,16 @@ static void spr_write_e500_l1csr1(DisasContext *ctx, int sprn, int gprn)
tcg_temp_free(t0);
}
+static void spr_write_e500_l2csr0(DisasContext *ctx, int sprn, int gprn)
+{
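+ /*
+ * The L2FI, L2FL and L2LFC bits trigger cache operations that
+ * complete immediately with no L2 cache modelled, so they must
+ * read back as zero: mask them out of the stored value.
+ */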
+ TCGv t0 = tcg_temp_new();
+
+ tcg_gen_andi_tl(t0, cpu_gpr[gprn],
+ ~(E500_L2CSR0_L2FI | E500_L2CSR0_L2FL | E500_L2CSR0_L2LFC));
+ gen_store_spr(sprn, t0);
+ tcg_temp_free(t0);
+}
+
static void spr_write_booke206_mmucsr0(DisasContext *ctx, int sprn, int gprn)
{
gen_helper_booke206_tlbflush(cpu_env, cpu_gpr[gprn]);
@@ -5029,6 +5039,12 @@ static void init_proc_e500(CPUPPCState *env, int version)
SPR_NOACCESS, SPR_NOACCESS,
&spr_read_generic, &spr_write_e500_l1csr1,
0x00000000);
+ if (version != fsl_e500v1 && version != fsl_e500v2) {
+ spr_register(env, SPR_Exxx_L2CSR0, "L2CSR0",
+ SPR_NOACCESS, SPR_NOACCESS,
+ &spr_read_generic, &spr_write_e500_l2csr0,
+ 0x00000000);
+ }
spr_register(env, SPR_BOOKE_MCSRR0, "MCSRR0",
SPR_NOACCESS, SPR_NOACCESS,
&spr_read_generic, &spr_write_generic,
diff --git a/tests/Makefile.include b/tests/Makefile.include
index ceaf3f0d6e..d34254fb29 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -12,7 +12,7 @@ check-help:
@echo " $(MAKE) check-speed Run qobject speed tests"
@echo " $(MAKE) check-qapi-schema Run QAPI schema tests"
@echo " $(MAKE) check-block Run block tests"
-ifeq ($(CONFIG_TCG),y)
+ifneq ($(filter $(all-check-targets), check-softfloat),)
@echo " $(MAKE) check-tcg Run TCG tests"
@echo " $(MAKE) check-softfloat Run FPU emulation tests"
endif
@@ -40,11 +40,13 @@ SYSEMU_TARGET_LIST := $(subst -softmmu.mak,,$(notdir \
SPEED = quick
-# Per guest TCG tests
+# Build up our target list from the filtered list of ninja targets
+TARGETS=$(patsubst libqemu-%.fa, %, $(filter libqemu-%.fa, $(ninja-targets)))
-BUILD_TCG_TARGET_RULES=$(patsubst %,build-tcg-tests-%, $(TARGET_DIRS))
-CLEAN_TCG_TARGET_RULES=$(patsubst %,clean-tcg-tests-%, $(TARGET_DIRS))
-RUN_TCG_TARGET_RULES=$(patsubst %,run-tcg-tests-%, $(TARGET_DIRS))
+# Per guest TCG tests
+BUILD_TCG_TARGET_RULES=$(patsubst %,build-tcg-tests-%, $(TARGETS))
+CLEAN_TCG_TARGET_RULES=$(patsubst %,clean-tcg-tests-%, $(TARGETS))
+RUN_TCG_TARGET_RULES=$(patsubst %,run-tcg-tests-%, $(TARGETS))
# Probe for the Docker Builds needed for each build
$(foreach PROBE_TARGET,$(TARGET_DIRS), \
diff --git a/tests/acceptance/boot_linux.py b/tests/acceptance/boot_linux.py
index 1da4a53d6a..bcd923bb4a 100644
--- a/tests/acceptance/boot_linux.py
+++ b/tests/acceptance/boot_linux.py
@@ -57,7 +57,7 @@ class BootLinuxBase(Test):
self.cancel('Failed to download/prepare boot image')
return boot.path
- def download_cloudinit(self, ssh_pubkey=None):
+ def prepare_cloudinit(self, ssh_pubkey=None):
self.log.info('Preparing cloudinit image')
try:
cloudinit_iso = os.path.join(self.workdir, 'cloudinit.iso')
@@ -70,7 +70,7 @@ class BootLinuxBase(Test):
phone_home_port=self.phone_home_port,
authorized_key=ssh_pubkey)
except Exception:
- self.cancel('Failed to prepared cloudinit image')
+ self.cancel('Failed to prepare the cloudinit image')
return cloudinit_iso
class BootLinux(BootLinuxBase):
@@ -85,15 +85,15 @@ class BootLinux(BootLinuxBase):
super(BootLinux, self).setUp()
self.vm.add_args('-smp', '2')
self.vm.add_args('-m', '1024')
- self.prepare_boot()
- self.prepare_cloudinit(ssh_pubkey)
+ self.set_up_boot()
+ self.set_up_cloudinit(ssh_pubkey)
- def prepare_boot(self):
+ def set_up_boot(self):
path = self.download_boot()
self.vm.add_args('-drive', 'file=%s' % path)
- def prepare_cloudinit(self, ssh_pubkey=None):
- cloudinit_iso = self.download_cloudinit(ssh_pubkey)
+ def set_up_cloudinit(self, ssh_pubkey=None):
+ cloudinit_iso = self.prepare_cloudinit(ssh_pubkey)
self.vm.add_args('-drive', 'file=%s,format=raw' % cloudinit_iso)
def launch_and_wait(self):
diff --git a/tests/acceptance/boot_linux_console.py b/tests/acceptance/boot_linux_console.py
index fb41bb7144..eb01286799 100644
--- a/tests/acceptance/boot_linux_console.py
+++ b/tests/acceptance/boot_linux_console.py
@@ -802,27 +802,7 @@ class BootLinuxConsole(LinuxKernelTest):
# Wait for VM to shut down gracefully
self.vm.wait()
- @skipUnless(os.getenv('ARMBIAN_ARTIFACTS_CACHED'),
- 'Test artifacts fetched from unreliable dl.armbian.com')
- @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited')
- @skipUnless(P7ZIP_AVAILABLE, '7z not installed')
- def test_arm_orangepi_bionic(self):
- """
- :avocado: tags=arch:arm
- :avocado: tags=machine:orangepi-pc
- :avocado: tags=device:sd
- """
-
- # This test download a 196MB compressed image and expand it to 1GB
- image_url = ('https://dl.armbian.com/orangepipc/archive/'
- 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.7z')
- image_hash = '196a8ffb72b0123d92cea4a070894813d305c71e'
- image_path_7z = self.fetch_asset(image_url, asset_hash=image_hash)
- image_name = 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.img'
- image_path = os.path.join(self.workdir, image_name)
- process.run("7z e -o%s %s" % (self.workdir, image_path_7z))
- image_pow2ceil_expand(image_path)
-
+ def do_test_arm_orangepi_uboot_armbian(self, image_path):
self.vm.set_console()
self.vm.add_args('-drive', 'file=' + image_path + ',if=sd,format=raw',
'-nic', 'user',
@@ -848,6 +828,54 @@ class BootLinuxConsole(LinuxKernelTest):
'to <orangepipc>')
self.wait_for_console_pattern('Starting Load Kernel Modules...')
+ @skipUnless(os.getenv('ARMBIAN_ARTIFACTS_CACHED'),
+ 'Test artifacts fetched from unreliable apt.armbian.com')
+ @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited')
+ @skipUnless(P7ZIP_AVAILABLE, '7z not installed')
+ def test_arm_orangepi_bionic_19_11(self):
+ """
+ :avocado: tags=arch:arm
+ :avocado: tags=machine:orangepi-pc
+ :avocado: tags=device:sd
+ """
+
+ # This test downloads a 196MB compressed image and expands it to 1GB
+ image_url = ('https://dl.armbian.com/orangepipc/archive/'
+ 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.7z')
+ image_hash = '196a8ffb72b0123d92cea4a070894813d305c71e'
+ image_path_7z = self.fetch_asset(image_url, asset_hash=image_hash)
+ image_name = 'Armbian_19.11.3_Orangepipc_bionic_current_5.3.9.img'
+ image_path = os.path.join(self.workdir, image_name)
+ process.run("7z e -o%s %s" % (self.workdir, image_path_7z))
+ image_pow2ceil_expand(image_path)
+
+ self.do_test_arm_orangepi_uboot_armbian(image_path)
+
+ @skipUnless(os.getenv('ARMBIAN_ARTIFACTS_CACHED'),
+ 'Test artifacts fetched from unreliable apt.armbian.com')
+ @skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited')
+ def test_arm_orangepi_bionic_20_08(self):
+ """
+ :avocado: tags=arch:arm
+ :avocado: tags=machine:orangepi-pc
+ :avocado: tags=device:sd
+ """
+
+ # This test downloads a 275 MiB compressed image and expands it
+ # to 1036 MiB, but the underlying filesystem is 1552 MiB...
+ # As we expand it to 2 GiB we are safe.
+
+ image_url = ('https://dl.armbian.com/orangepipc/archive/'
+ 'Armbian_20.08.1_Orangepipc_bionic_current_5.8.5.img.xz')
+ image_hash = ('b4d6775f5673486329e45a0586bf06b6'
+ 'dbe792199fd182ac6b9c7bb6c7d3e6dd')
+ image_path_xz = self.fetch_asset(image_url, asset_hash=image_hash,
+ algorithm='sha256')
+ image_path = archive.extract(image_path_xz, self.workdir)
+ image_pow2ceil_expand(image_path)
+
+ self.do_test_arm_orangepi_uboot_armbian(image_path)
+
@skipUnless(os.getenv('AVOCADO_ALLOW_LARGE_STORAGE'), 'storage limited')
def test_arm_orangepi_uboot_netbsd9(self):
"""
@@ -976,25 +1004,6 @@ class BootLinuxConsole(LinuxKernelTest):
console_pattern = 'Kernel command line: %s' % kernel_command_line
self.wait_for_console_pattern(console_pattern)
- def test_ppc64_pseries(self):
- """
- :avocado: tags=arch:ppc64
- :avocado: tags=machine:pseries
- """
- kernel_url = ('https://archives.fedoraproject.org/pub/archive'
- '/fedora-secondary/releases/29/Everything/ppc64le/os'
- '/ppc/ppc64/vmlinuz')
- kernel_hash = '3fe04abfc852b66653b8c3c897a59a689270bc77'
- kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
-
- self.vm.set_console()
- kernel_command_line = self.KERNEL_COMMON_COMMAND_LINE + 'console=hvc0'
- self.vm.add_args('-kernel', kernel_path,
- '-append', kernel_command_line)
- self.vm.launch()
- console_pattern = 'Kernel command line: %s' % kernel_command_line
- self.wait_for_console_pattern(console_pattern)
-
def test_m68k_q800(self):
"""
:avocado: tags=arch:m68k
@@ -1047,15 +1056,6 @@ class BootLinuxConsole(LinuxKernelTest):
tar_hash = 'ac688fd00561a2b6ce1359f9ff6aa2b98c9a570c'
self.do_test_advcal_2018('07', tar_hash, 'sanity-clause.elf')
- @skip("Test currently broken") # Console stuck as of 5.2-rc1
- def test_microblaze_s3adsp1800(self):
- """
- :avocado: tags=arch:microblaze
- :avocado: tags=machine:petalogix-s3adsp1800
- """
- tar_hash = '08bf3e3bfb6b6c7ce1e54ab65d54e189f2caf13f'
- self.do_test_advcal_2018('17', tar_hash, 'ballerina.bin')
-
def test_or1k_sim(self):
"""
:avocado: tags=arch:or1k
diff --git a/tests/acceptance/linux_ssh_mips_malta.py b/tests/acceptance/linux_ssh_mips_malta.py
index 25c5c5f741..6dbd02d49d 100644
--- a/tests/acceptance/linux_ssh_mips_malta.py
+++ b/tests/acceptance/linux_ssh_mips_malta.py
@@ -91,7 +91,7 @@ class LinuxSSH(Test):
except:
time.sleep(4)
pass
- self.fail("sshd timeout")
+ self.fail("ssh connection timeout")
def ssh_disconnect_vm(self):
self.ssh_session.quit()
diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
index 2baba5fdc2..09e2745cc5 100644
--- a/tests/acceptance/machine_m68k_nextcube.py
+++ b/tests/acceptance/machine_m68k_nextcube.py
@@ -1,19 +1,17 @@
# Functional test that boots a VM and runs OCR on the framebuffer
#
-# Copyright (c) Philippe Mathieu-Daudé <f4bug@amsat.org>
+# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
#
# This work is licensed under the terms of the GNU GPL, version 2 or
# later. See the COPYING file in the top-level directory.
import os
-import re
import time
-import logging
from avocado_qemu import Test
from avocado import skipUnless
-from avocado.utils import process
-from avocado.utils.path import find_command, CmdNotFoundError
+
+from tesseract_utils import tesseract_available, tesseract_ocr
PIL_AVAILABLE = True
try:
@@ -22,25 +20,6 @@ except ImportError:
PIL_AVAILABLE = False
-def tesseract_available(expected_version):
- try:
- find_command('tesseract')
- except CmdNotFoundError:
- return False
- res = process.run('tesseract --version')
- try:
- version = res.stdout_text.split()[1]
- except IndexError:
- version = res.stderr_text.split()[1]
- return int(version.split('.')[0]) == expected_version
-
- match = re.match(r'tesseract\s(\d)', res)
- if match is None:
- return False
- # now this is guaranteed to be a digit
- return int(match.groups()[0]) == expected_version
-
-
class NextCubeMachine(Test):
"""
:avocado: tags=arch:m68k
@@ -80,12 +59,8 @@ class NextCubeMachine(Test):
def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
screenshot_path = os.path.join(self.workdir, "dump.ppm")
self.check_bootrom_framebuffer(screenshot_path)
-
- console_logger = logging.getLogger('console')
- text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
- for line in text.split('\n'):
- if len(line):
- console_logger.debug(line)
+ lines = tesseract_ocr(screenshot_path, tesseract_version=3)
+ text = '\n'.join(lines)
self.assertIn('Backplane', text)
self.assertIn('Ethernet address', text)
@@ -96,13 +71,8 @@ class NextCubeMachine(Test):
def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
screenshot_path = os.path.join(self.workdir, "dump.ppm")
self.check_bootrom_framebuffer(screenshot_path)
-
- console_logger = logging.getLogger('console')
- proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
- text = proc.stdout_text
- for line in text.split('\n'):
- if len(line):
- console_logger.debug(line)
+ lines = tesseract_ocr(screenshot_path, tesseract_version=4)
+ text = '\n'.join(lines)
self.assertIn('Testing the FPU, SCC', text)
self.assertIn('System test failed. Error code', text)
self.assertIn('Boot command', text)
diff --git a/tests/acceptance/machine_microblaze.py b/tests/acceptance/machine_microblaze.py
new file mode 100644
index 0000000000..7f6d18495d
--- /dev/null
+++ b/tests/acceptance/machine_microblaze.py
@@ -0,0 +1,35 @@
+# Functional test that boots a microblaze Linux kernel and checks the console
+#
+# Copyright (c) 2018, 2021 Red Hat, Inc.
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+from avocado_qemu import Test
+from avocado_qemu import wait_for_console_pattern
+from avocado.utils import archive
+
+class MicroblazeMachine(Test):
+
+ timeout = 90
+
+ def test_microblaze_s3adsp1800(self):
+ """
+ :avocado: tags=arch:microblaze
+ :avocado: tags=machine:petalogix-s3adsp1800
+ """
+
+ tar_url = ('https://www.qemu-advent-calendar.org'
+ '/2018/download/day17.tar.xz')
+ tar_hash = '08bf3e3bfb6b6c7ce1e54ab65d54e189f2caf13f'
+ file_path = self.fetch_asset(tar_url, asset_hash=tar_hash)
+ archive.extract(file_path, self.workdir)
+ self.vm.set_console()
+ self.vm.add_args('-kernel', self.workdir + '/day17/ballerina.bin')
+ self.vm.launch()
+ wait_for_console_pattern(self, 'This architecture does not have '
+ 'kernel memory protection')
+ # Note:
+ # The kernel sometimes gets stuck after the "This architecture ..."
+ # message; that's why we don't test for a later string here. This
+ # needs some investigation by a microblaze wizard one day...
diff --git a/tests/acceptance/machine_ppc.py b/tests/acceptance/machine_ppc.py
new file mode 100644
index 0000000000..a836e2496f
--- /dev/null
+++ b/tests/acceptance/machine_ppc.py
@@ -0,0 +1,69 @@
+# Test that the Linux kernel boots on ppc machines and check the console
+#
+# Copyright (c) 2018, 2020 Red Hat, Inc.
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+from avocado.utils import archive
+from avocado_qemu import Test
+from avocado_qemu import wait_for_console_pattern
+
+class PpcMachine(Test):
+
+ timeout = 90
+ KERNEL_COMMON_COMMAND_LINE = 'printk.time=0 '
+ panic_message = 'Kernel panic - not syncing'
+
+ def test_ppc64_pseries(self):
+ """
+ :avocado: tags=arch:ppc64
+ :avocado: tags=machine:pseries
+ """
+ kernel_url = ('https://archives.fedoraproject.org/pub/archive'
+ '/fedora-secondary/releases/29/Everything/ppc64le/os'
+ '/ppc/ppc64/vmlinuz')
+ kernel_hash = '3fe04abfc852b66653b8c3c897a59a689270bc77'
+ kernel_path = self.fetch_asset(kernel_url, asset_hash=kernel_hash)
+
+ self.vm.set_console()
+ kernel_command_line = self.KERNEL_COMMON_COMMAND_LINE + 'console=hvc0'
+ self.vm.add_args('-kernel', kernel_path,
+ '-append', kernel_command_line)
+ self.vm.launch()
+ console_pattern = 'Kernel command line: %s' % kernel_command_line
+ wait_for_console_pattern(self, console_pattern, self.panic_message)
+
+ def test_ppc_mpc8544ds(self):
+ """
+ :avocado: tags=arch:ppc
+ :avocado: tags=machine:mpc8544ds
+ """
+ tar_url = ('https://www.qemu-advent-calendar.org'
+ '/2020/download/day17.tar.gz')
+ tar_hash = '7a5239542a7c4257aa4d3b7f6ddf08fb6775c494'
+ file_path = self.fetch_asset(tar_url, asset_hash=tar_hash)
+ archive.extract(file_path, self.workdir)
+ self.vm.set_console()
+ self.vm.add_args('-kernel', self.workdir + '/creek/creek.bin')
+ self.vm.launch()
+ wait_for_console_pattern(self, 'QEMU advent calendar 2020',
+ self.panic_message)
+
+ def test_ppc_virtex_ml507(self):
+ """
+ :avocado: tags=arch:ppc
+ :avocado: tags=machine:virtex-ml507
+ """
+ tar_url = ('https://www.qemu-advent-calendar.org'
+ '/2020/download/hippo.tar.gz')
+ tar_hash = '306b95bfe7d147f125aa176a877e266db8ef914a'
+ file_path = self.fetch_asset(tar_url, asset_hash=tar_hash)
+ archive.extract(file_path, self.workdir)
+ self.vm.set_console()
+ self.vm.add_args('-kernel', self.workdir + '/hippo/hippo.linux',
+ '-dtb', self.workdir + '/hippo/virtex440-ml507.dtb',
+ '-m', '512')
+ self.vm.launch()
+ wait_for_console_pattern(self, 'QEMU advent calendar 2020',
+ self.panic_message)
diff --git a/tests/acceptance/replay_kernel.py b/tests/acceptance/replay_kernel.py
index 772633b01d..c1cb862468 100644
--- a/tests/acceptance/replay_kernel.py
+++ b/tests/acceptance/replay_kernel.py
@@ -31,7 +31,7 @@ class ReplayKernelBase(LinuxKernelTest):
terminates.
"""
- timeout = 90
+ timeout = 120
KERNEL_COMMON_COMMAND_LINE = 'printk.time=1 panic=-1 '
def run_vm(self, kernel_path, kernel_command_line, console_pattern,
diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
new file mode 100644
index 0000000000..72cd9ab798
--- /dev/null
+++ b/tests/acceptance/tesseract_utils.py
@@ -0,0 +1,46 @@
+# ...
+#
+# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+import logging
+
+from avocado.utils import process
+from avocado.utils.path import find_command, CmdNotFoundError
+
+def tesseract_available(expected_version):
+ try:
+ find_command('tesseract')
+ except CmdNotFoundError:
+ return False
+ res = process.run('tesseract --version')
+ try:
+ version = res.stdout_text.split()[1]
+ except IndexError:
+ version = res.stderr_text.split()[1]
+ return int(version.split('.')[0]) == expected_version
+
+
+def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
+ console_logger = logging.getLogger('tesseract')
+ console_logger.debug(image_path)
+ if tesseract_version == 4:
+ tesseract_args += ' --oem 1'
+ proc = process.run("tesseract {} {} stdout".format(tesseract_args,
+ image_path))
+ lines = []
+ for line in proc.stdout_text.split('\n'):
+ sline = line.strip()
+ if len(sline):
+ console_logger.debug(sline)
+ lines += [sline]
+ return lines
diff --git a/tests/acceptance/virtiofs_submounts.py b/tests/acceptance/virtiofs_submounts.py
index 361e5990b6..949ca87a83 100644
--- a/tests/acceptance/virtiofs_submounts.py
+++ b/tests/acceptance/virtiofs_submounts.py
@@ -83,20 +83,21 @@ class VirtiofsSubmountsTest(BootLinux):
command_line='info usernet')
for line in res.split('\r\n'):
match = \
- re.search(r'TCP.HOST_FORWARD.*127\.0\.0\.1\s*(\d+)\s+10\.',
+ re.search(r'TCP.HOST_FORWARD.*127\.0\.0\.1\s+(\d+)\s+10\.',
line)
if match is not None:
- port = match[1]
+ port = int(match[1])
break
self.assertIsNotNone(port)
- self.log.debug('sshd listening on port: ' + port)
+ self.assertGreater(port, 0)
+ self.log.debug('sshd listening on port: %d', port)
return port
def ssh_connect(self, username, keyfile):
self.ssh_logger = logging.getLogger('ssh')
port = self.get_portfwd()
- self.ssh_session = ssh.Session('127.0.0.1', port=int(port),
+ self.ssh_session = ssh.Session('127.0.0.1', port=port,
user=username, key=keyfile)
for i in range(10):
try:
@@ -105,7 +106,7 @@ class VirtiofsSubmountsTest(BootLinux):
except:
time.sleep(4)
pass
- self.fail('sshd timeout')
+ self.fail('ssh connection timeout')
def ssh_command(self, command):
self.ssh_logger.info(command)
@@ -136,8 +137,7 @@ class VirtiofsSubmountsTest(BootLinux):
return (stdout, stderr, ret)
def set_up_shared_dir(self):
- atwd = os.getenv('AVOCADO_TEST_WORKDIR')
- self.shared_dir = os.path.join(atwd, 'virtiofs-shared')
+ self.shared_dir = os.path.join(self.workdir, 'virtiofs-shared')
os.mkdir(self.shared_dir)
@@ -234,10 +234,9 @@ class VirtiofsSubmountsTest(BootLinux):
self.seed = self.params.get('seed')
- atwd = os.getenv('AVOCADO_TEST_WORKDIR')
- self.ssh_key = os.path.join(atwd, 'id_ed25519')
+ self.ssh_key = os.path.join(self.workdir, 'id_ed25519')
- self.run(('ssh-keygen', '-t', 'ed25519', '-f', self.ssh_key))
+ self.run(('ssh-keygen', '-N', '', '-t', 'ed25519', '-f', self.ssh_key))
pubkey = open(self.ssh_key + '.pub').read()
@@ -249,7 +248,7 @@ class VirtiofsSubmountsTest(BootLinux):
# Allow us to connect to SSH
self.vm.add_args('-netdev', 'user,id=vnet,hostfwd=:127.0.0.1:0-:22',
- '-device', 'e1000,netdev=vnet')
+ '-device', 'virtio-net,netdev=vnet')
if not kvm_available(self.arch, self.qemu_bin):
self.cancel(KVM_NOT_AVAILABLE)
diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include
index 0779dab5b9..93b29ad823 100644
--- a/tests/docker/Makefile.include
+++ b/tests/docker/Makefile.include
@@ -1,6 +1,6 @@
# Makefile for Docker tests
-.PHONY: docker docker-test docker-clean docker-image docker-qemu-src
+.PHONY: docker docker-help docker-test docker-clean docker-image docker-qemu-src
NULL :=
SPACE := $(NULL) #
@@ -11,7 +11,7 @@ HOST_ARCH = $(if $(ARCH),$(ARCH),$(shell uname -m))
DOCKER_SUFFIX := .docker
DOCKER_FILES_DIR := $(SRC_PATH)/tests/docker/dockerfiles
# we don't run tests on intermediate images (used as base by another image)
-DOCKER_PARTIAL_IMAGES := debian10 debian11 debian-bootstrap
+DOCKER_PARTIAL_IMAGES := debian10 debian11 debian-bootstrap empty
DOCKER_IMAGES := $(sort $(notdir $(basename $(wildcard $(DOCKER_FILES_DIR)/*.docker))))
DOCKER_TARGETS := $(patsubst %,docker-image-%,$(DOCKER_IMAGES))
# Use a global constant ccache directory to speed up repetitive builds
@@ -92,6 +92,24 @@ docker-binfmt-image-debian-%: $(DOCKER_FILES_DIR)/debian-bootstrap.docker
{ echo "You will need to build $(EXECUTABLE)"; exit 1;},\
"CHECK", "debian-$* exists"))
+# These are test targets
+USER_TCG_TARGETS=$(patsubst %-linux-user,qemu-%,$(filter %-linux-user,$(TARGET_DIRS)))
+EXEC_COPY_TESTS=$(patsubst %,docker-exec-copy-test-%, $(USER_TCG_TARGETS))
+
+$(EXEC_COPY_TESTS): docker-exec-copy-test-%: $(DOCKER_FILES_DIR)/empty.docker
+ $(call quiet-command, \
+ $(DOCKER_SCRIPT) build -t qemu/exec-copy-test-$* -f $< \
+ $(if $V,,--quiet) --no-cache \
+ --include-executable=$* \
+ --skip-binfmt, \
+ "TEST","copy $* to container")
+ $(call quiet-command, \
+ $(DOCKER_SCRIPT) run qemu/exec-copy-test-$* \
+ /$* -version > tests/docker-exec-copy-test-$*.out, \
+ "TEST","check $* works in container")
+
+docker-exec-copy-test: $(EXEC_COPY_TESTS)
+
endif
# Enforce dependencies for composite images
@@ -209,7 +227,7 @@ endif
@echo ' before running the command.'
@echo ' NETWORK=1 Enable virtual network interface with default backend.'
@echo ' NETWORK=$$BACKEND Enable virtual network interface with $$BACKEND.'
- @echo ' NOUSER Define to disable adding current user to containers passwd.'
+ @echo ' NOUSER=1 Define to disable adding current user to containers passwd.'
@echo ' NOCACHE=1 Ignore cache when build images.'
@echo ' EXECUTABLE=<path> Include executable in image.'
@echo ' EXTRA_FILES="<path> [... <path>]"'
@@ -218,6 +236,8 @@ endif
@echo ' Specify which container engine to run.'
@echo ' REGISTRY=url Cache builds from registry (default:$(DOCKER_REGISTRY))'
+docker-help: docker
+
# This rule if for directly running against an arbitrary docker target.
# It is called by the expanded docker targets (e.g. make
# docker-test-foo@bar) which will do additional verification.
diff --git a/tests/docker/docker.py b/tests/docker/docker.py
index 884dfeb29c..d28df4c140 100755
--- a/tests/docker/docker.py
+++ b/tests/docker/docker.py
@@ -93,7 +93,7 @@ def _guess_engine_command():
commands_txt)
-def _copy_with_mkdir(src, root_dir, sub_path='.'):
+def _copy_with_mkdir(src, root_dir, sub_path='.', name=None):
"""Copy src into root_dir, creating sub_path as needed."""
dest_dir = os.path.normpath("%s/%s" % (root_dir, sub_path))
try:
@@ -102,8 +102,13 @@ def _copy_with_mkdir(src, root_dir, sub_path='.'):
# we can safely ignore already created directories
pass
- dest_file = "%s/%s" % (dest_dir, os.path.basename(src))
- copy(src, dest_file)
+ dest_file = "%s/%s" % (dest_dir, name if name else os.path.basename(src))
+
+ try:
+ copy(src, dest_file)
+ except FileNotFoundError:
+ print("Couldn't copy %s to %s" % (src, dest_file))
def _get_so_libs(executable):
@@ -120,7 +125,7 @@ def _get_so_libs(executable):
search = ldd_re.search(line)
if search:
try:
- libs.append(s.group(1))
+ libs.append(search.group(1))
except IndexError:
pass
except subprocess.CalledProcessError:
@@ -150,8 +155,9 @@ def _copy_binary_with_libs(src, bin_dest, dest_dir):
if libs:
for l in libs:
so_path = os.path.dirname(l)
+ name = os.path.basename(l)
real_l = os.path.realpath(l)
- _copy_with_mkdir(real_l, dest_dir, so_path)
+ _copy_with_mkdir(real_l, dest_dir, so_path, name)
def _check_binfmt_misc(executable):
@@ -432,6 +438,9 @@ class BuildCommand(SubCommand):
help="""Specify a binary that will be copied to the
container together with all its dependent
libraries""")
+ parser.add_argument("--skip-binfmt",
+ action="store_true",
+ help="""Skip binfmt entry check (used for testing)""")
parser.add_argument("--extra-files", nargs='*',
help="""Specify files that will be copied in the
Docker image, fulfilling the ADD directive from the
@@ -460,7 +469,9 @@ class BuildCommand(SubCommand):
docker_dir = tempfile.mkdtemp(prefix="docker_build")
# Validate binfmt_misc will work
- if args.include_executable:
+ if args.skip_binfmt:
+ qpath = args.include_executable
+ elif args.include_executable:
qpath, enabled = _check_binfmt_misc(args.include_executable)
if not enabled:
return 1
diff --git a/tests/docker/dockerfiles/empty.docker b/tests/docker/dockerfiles/empty.docker
new file mode 100644
index 0000000000..9ba980f1a8
--- /dev/null
+++ b/tests/docker/dockerfiles/empty.docker
@@ -0,0 +1,8 @@
+#
+# Empty Dockerfile
+#
+
+FROM scratch
+
+# Add everything from the context into the container
+ADD . /
diff --git a/tests/meson.build b/tests/meson.build
index 29ebaba48d..7d7da6a636 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -86,7 +86,6 @@ tests = {
'test-qobject-input-visitor': [testqapi],
'test-string-input-visitor': [testqapi],
'test-string-output-visitor': [testqapi],
- 'test-qmp-event': [testqapi],
'test-opts-visitor': [testqapi],
'test-visitor-serialization': [testqapi],
'test-bitmap': [],
@@ -117,6 +116,12 @@ tests = {
'test-qapi-util': [],
}
+if have_system or have_tools
+ tests += {
+ 'test-qmp-event': [testqapi],
+ }
+endif
+
test_deps = {
'test-qht-par': qht_bench,
}
@@ -276,7 +281,9 @@ test('decodetree', sh,
workdir: meson.current_source_dir() / 'decode',
suite: 'decodetree')
-subdir('fp')
+if 'CONFIG_TCG' in config_all
+ subdir('fp')
+endif
if not get_option('tcg').disabled()
if 'CONFIG_PLUGIN' in config_host
diff --git a/tests/qapi-schema/comments.out b/tests/qapi-schema/comments.out
index 273f0f54e1..ce4f6a4f0f 100644
--- a/tests/qapi-schema/comments.out
+++ b/tests/qapi-schema/comments.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/doc-good.out b/tests/qapi-schema/doc-good.out
index 419284dae2..715b0bbc1a 100644
--- a/tests/qapi-schema/doc-good.out
+++ b/tests/qapi-schema/doc-good.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/empty.out b/tests/qapi-schema/empty.out
index 69666c39ad..3feb3f69d3 100644
--- a/tests/qapi-schema/empty.out
+++ b/tests/qapi-schema/empty.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/event-case.out b/tests/qapi-schema/event-case.out
index 42ae519656..9ae44052ac 100644
--- a/tests/qapi-schema/event-case.out
+++ b/tests/qapi-schema/event-case.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/include-repetition.out b/tests/qapi-schema/include-repetition.out
index 0b654ddebb..16dbd9b819 100644
--- a/tests/qapi-schema/include-repetition.out
+++ b/tests/qapi-schema/include-repetition.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/include-simple.out b/tests/qapi-schema/include-simple.out
index 061f81e509..48e923bfbc 100644
--- a/tests/qapi-schema/include-simple.out
+++ b/tests/qapi-schema/include-simple.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/indented-expr.out b/tests/qapi-schema/indented-expr.out
index 04356775cd..6a30ded3fa 100644
--- a/tests/qapi-schema/indented-expr.out
+++ b/tests/qapi-schema/indented-expr.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qapi-schema/qapi-schema-test.out b/tests/qapi-schema/qapi-schema-test.out
index 8868ca0dca..3b1387d9f1 100644
--- a/tests/qapi-schema/qapi-schema-test.out
+++ b/tests/qapi-schema/qapi-schema-test.out
@@ -1,4 +1,4 @@
-module None
+module ./builtin
object q_empty
enum QType
prefix QTYPE
diff --git a/tests/qemu-iotests/210.out b/tests/qemu-iotests/210.out
index dc1a3c9786..2e9fc596eb 100644
--- a/tests/qemu-iotests/210.out
+++ b/tests/qemu-iotests/210.out
@@ -182,7 +182,7 @@ Job failed: The requested file size is too large
=== Resize image with invalid sizes ===
{"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775296}}
-{"error": {"class": "GenericError", "desc": "Required too big image size, it must be not greater than 9223372035781033984"}}
+{"error": {"class": "GenericError", "desc": "offset(9223372036854775296) exceeds maximum(9223372035781033984)"}}
{"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775808}}
{"error": {"class": "GenericError", "desc": "Invalid parameter type for 'size', expected: integer"}}
{"execute": "block_resize", "arguments": {"node-name": "node1", "size": 18446744073709551104}}
diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264
index e725cefd47..4f96825a22 100755
--- a/tests/qemu-iotests/264
+++ b/tests/qemu-iotests/264
@@ -20,60 +20,102 @@
#
import time
+import os
import iotests
-from iotests import qemu_img_create, file_path, qemu_nbd_popen, log
-
-iotests.script_initialize(
- supported_fmts=['qcow2'],
-)
+from iotests import qemu_img_create, file_path, qemu_nbd_popen
disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
nbd_uri = 'nbd+unix:///?socket=' + nbd_sock
-size = 5 * 1024 * 1024
wait_limit = 3.0
wait_step = 0.2
-qemu_img_create('-f', iotests.imgfmt, disk_a, str(size))
-qemu_img_create('-f', iotests.imgfmt, disk_b, str(size))
-
-with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b):
- vm = iotests.VM().add_drive(disk_a)
- vm.launch()
- vm.hmp_qemu_io('drive0', 'write 0 {}'.format(size))
-
- vm.qmp_log('blockdev-add', filters=[iotests.filter_qmp_testfiles],
- **{'node_name': 'backup0',
- 'driver': 'raw',
- 'file': {'driver': 'nbd',
- 'server': {'type': 'unix', 'path': nbd_sock},
- 'reconnect-delay': 10}})
- vm.qmp_log('blockdev-backup', device='drive0', sync='full',
- target='backup0', speed=(1 * 1024 * 1024))
-
- # Wait for some progress
- t = 0.0
- while t < wait_limit:
- jobs = vm.qmp('query-block-jobs')['return']
- if jobs and jobs[0]['offset'] > 0:
- break
- time.sleep(wait_step)
- t += wait_step
-
- if jobs and jobs[0]['offset'] > 0:
- log('Backup job is started')
-
-jobs = vm.qmp('query-block-jobs')['return']
-if jobs and jobs[0]['offset'] < jobs[0]['len']:
- log('Backup job is still in progress')
-
-vm.qmp_log('block-job-set-speed', device='drive0', speed=0)
-
-# Emulate server down time for 1 second
-time.sleep(1)
-
-with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b):
- e = vm.event_wait('BLOCK_JOB_COMPLETED')
- log('Backup completed: {}'.format(e['data']['offset']))
- vm.qmp_log('blockdev-del', node_name='backup0')
- vm.shutdown()
+
+class TestNbdReconnect(iotests.QMPTestCase):
+ def init_vm(self, disk_size):
+ qemu_img_create('-f', iotests.imgfmt, disk_a, str(disk_size))
+ qemu_img_create('-f', iotests.imgfmt, disk_b, str(disk_size))
+ self.vm = iotests.VM().add_drive(disk_a)
+ self.vm.launch()
+ self.vm.hmp_qemu_io('drive0', 'write 0 {}'.format(disk_size))
+
+ def tearDown(self):
+ self.vm.shutdown()
+ os.remove(disk_a)
+ os.remove(disk_b)
+
+ def start_job(self, job):
+ """Stat job with nbd target and kill the server"""
+ assert job in ('blockdev-backup', 'blockdev-mirror')
+ with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b):
+ result = self.vm.qmp('blockdev-add',
+ **{'node_name': 'backup0',
+ 'driver': 'raw',
+ 'file': {'driver': 'nbd',
+ 'server': {'type': 'unix',
+ 'path': nbd_sock},
+ 'reconnect-delay': 10}})
+ self.assert_qmp(result, 'return', {})
+ result = self.vm.qmp(job, device='drive0',
+ sync='full', target='backup0',
+ speed=(1 * 1024 * 1024))
+ self.assert_qmp(result, 'return', {})
+
+ # Wait for some progress
+ t = 0.0
+ while t < wait_limit:
+ jobs = self.vm.qmp('query-block-jobs')['return']
+ if jobs and jobs[0]['offset'] > 0:
+ break
+ time.sleep(wait_step)
+ t += wait_step
+
+ self.assertTrue(jobs and jobs[0]['offset'] > 0) # job started
+
+ jobs = self.vm.qmp('query-block-jobs')['return']
+ # Check that job is still in progress
+ self.assertTrue(jobs)
+ self.assertTrue(jobs[0]['offset'] < jobs[0]['len'])
+
+ result = self.vm.qmp('block-job-set-speed', device='drive0', speed=0)
+ self.assert_qmp(result, 'return', {})
+
+ # Emulate server down time for 1 second
+ time.sleep(1)
+
+ def test_backup(self):
+ size = 5 * 1024 * 1024
+ self.init_vm(size)
+ self.start_job('blockdev-backup')
+
+ with qemu_nbd_popen('-k', nbd_sock, '-f', iotests.imgfmt, disk_b):
+ e = self.vm.event_wait('BLOCK_JOB_COMPLETED')
+ self.assertEqual(e['data']['offset'], size)
+ result = self.vm.qmp('blockdev-del', node_name='backup0')
+ self.assert_qmp(result, 'return', {})
+
+ def cancel_job(self):
+ result = self.vm.qmp('block-job-cancel', device='drive0')
+ self.assert_qmp(result, 'return', {})
+
+ start_t = time.time()
+ self.vm.event_wait('BLOCK_JOB_CANCELLED')
+ delta_t = time.time() - start_t
+ self.assertTrue(delta_t < 2.0)
+
+ def test_mirror_cancel(self):
+ # Mirror speed limit doesn't work well enough; it seems that mirror
+ # will run many parallel requests anyway. MAX_IN_FLIGHT is 16 and
+ # MAX_IO_BYTES is 1M in mirror.c, so let's use 20M disk.
+ self.init_vm(20 * 1024 * 1024)
+ self.start_job('blockdev-mirror')
+ self.cancel_job()
+
+ def test_backup_cancel(self):
+ self.init_vm(5 * 1024 * 1024)
+ self.start_job('blockdev-backup')
+ self.cancel_job()
+
+
+if __name__ == '__main__':
+ iotests.main(supported_fmts=['qcow2'])
diff --git a/tests/qemu-iotests/264.out b/tests/qemu-iotests/264.out
index c45b1e81ef..8d7e996700 100644
--- a/tests/qemu-iotests/264.out
+++ b/tests/qemu-iotests/264.out
@@ -1,15 +1,5 @@
-Start NBD server
-{"execute": "blockdev-add", "arguments": {"driver": "raw", "file": {"driver": "nbd", "reconnect-delay": 10, "server": {"path": "TEST_DIR/PID-nbd-sock", "type": "unix"}}, "node-name": "backup0"}}
-{"return": {}}
-{"execute": "blockdev-backup", "arguments": {"device": "drive0", "speed": 1048576, "sync": "full", "target": "backup0"}}
-{"return": {}}
-Backup job is started
-Kill NBD server
-Backup job is still in progress
-{"execute": "block-job-set-speed", "arguments": {"device": "drive0", "speed": 0}}
-{"return": {}}
-Start NBD server
-Backup completed: 5242880
-{"execute": "blockdev-del", "arguments": {"node-name": "backup0"}}
-{"return": {}}
-Kill NBD server
+...
+----------------------------------------------------------------------
+Ran 3 tests
+
+OK
diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out
index 27471ffae8..7176e376e1 100644
--- a/tests/qemu-iotests/267.out
+++ b/tests/qemu-iotests/267.out
@@ -6,11 +6,11 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
Testing:
QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0
-Error: No block device can accept snapshots
+Error: no block device can store vmstate for snapshot
(qemu) info snapshots
-No available block device supports snapshots
+no block device can store vmstate for snapshot
(qemu) loadvm snap0
-Error: No block device supports snapshots
+Error: no block device can store vmstate for snapshot
(qemu) quit
@@ -22,7 +22,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0
Error: Device 'none0' is writable but does not support snapshots
(qemu) info snapshots
-No available block device supports snapshots
+no block device can store vmstate for snapshot
(qemu) loadvm snap0
Error: Device 'none0' is writable but does not support snapshots
(qemu) quit
@@ -58,7 +58,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0
Error: Device 'virtio0' is writable but does not support snapshots
(qemu) info snapshots
-No available block device supports snapshots
+no block device can store vmstate for snapshot
(qemu) loadvm snap0
Error: Device 'virtio0' is writable but does not support snapshots
(qemu) quit
@@ -83,7 +83,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
(qemu) savevm snap0
Error: Device 'file' is writable but does not support snapshots
(qemu) info snapshots
-No available block device supports snapshots
+no block device can store vmstate for snapshot
(qemu) loadvm snap0
Error: Device 'file' is writable but does not support snapshots
(qemu) quit
diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
index ef105dfc39..0fc52d20d7 100644
--- a/tests/qemu-iotests/common.qemu
+++ b/tests/qemu-iotests/common.qemu
@@ -53,6 +53,15 @@ _in_fd=4
# If $mismatch_only is set, only non-matching responses will
# be echoed.
#
+# If $capture_events is non-empty, then any QMP event names it lists
+# will not be echoed out, but instead collected in the $QEMU_EVENTS
+# variable. The _wait_event function can later be used to receive
+# the cached events.
+#
+# If $only_capture_events is set to anything but an empty string,
+# then an error will be raised if a QMP message is seen which is
+# not an event listed in $capture_events.
+#
# If $success_or_failure is set, the meaning of the arguments is
# changed as follows:
# $2: A string to search for in the response; if found, this indicates
@@ -78,6 +87,31 @@ _timed_wait_for()
QEMU_STATUS[$h]=0
while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]}
do
+ if [ -n "$capture_events" ]; then
+ capture=0
+ local evname
+ for evname in $capture_events
+ do
+ case ${resp} in
+ *\"event\":\ \"${evname}\"* ) capture=1 ;;
+ esac
+ done
+ if [ $capture = 1 ];
+ then
+ ev=$(echo "${resp}" | tr -d '\r' | tr % .)
+ QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}"
+ if [ -n "$only_capture_events" ]; then
+ return
+ else
+ continue
+ fi
+ fi
+ fi
+ if [ -n "$only_capture_events" ]; then
+ echo "Only expected $capture_events but got ${resp}"
+ exit 1
+ fi
+
if [ -z "${silent}" ] && [ -z "${mismatch_only}" ]; then
echo "${resp}" | _filter_testdir | _filter_qemu \
| _filter_qemu_io | _filter_qmp | _filter_hmp
@@ -172,12 +206,82 @@ _send_qemu_cmd()
let count--;
done
if [ ${QEMU_STATUS[$h]} -ne 0 ] && [ -z "${qemu_error_no_exit}" ]; then
- echo "Timeout waiting for ${1} on handle ${h}"
+ echo "Timeout waiting for command ${1} response on handle ${h}"
exit 1 #Timeout means the test failed
fi
}
+# Check event cache for a named QMP event
+#
+# Input parameters:
+# $1: Name of the QMP event to check for
+#
+# Checks whether the named QMP event was previously captured
+# into $QEMU_EVENTS. When matched, the QMP event will be echoed
+# and the $matched variable set to 1.
+#
+# _wait_event is more suitable for test usage in most cases
+_check_cached_events()
+{
+ local evname=${1}
+
+ local match="\"event\": \"$evname\""
+
+ matched=0
+ if [ -n "$QEMU_EVENTS" ]; then
+ CURRENT_QEMU_EVENTS=$QEMU_EVENTS
+ QEMU_EVENTS=
+ old_IFS=$IFS
+ IFS="%"
+ for ev in $CURRENT_QEMU_EVENTS
+ do
+ grep -q "$match" < <(echo "${ev}")
+ if [ $? -eq 0 ] && [ $matched = 0 ]; then
+ echo "${ev}" | _filter_testdir | _filter_qemu \
+ | _filter_qemu_io | _filter_qmp | _filter_hmp
+ matched=1
+ else
+ QEMU_EVENTS="${QEMU_EVENTS:+${QEMU_EVENTS}%}${ev}"
+ fi
+ done
+ IFS=$old_IFS
+ fi
+}
+
+# Wait for a named QMP event
+#
+# Input parameters:
+# $1: QEMU handle to use
+# $2: Name of the QMP event to wait for
+#
+# Checks whether the named QMP event was previously captured
+# into $QEMU_EVENTS. If none are present, then waits for the
+# event to arrive on the QMP channel. When matched, the QMP
+# event will be echoed
+_wait_event()
+{
+ local h=${1}
+ local evname=${2}
+
+ while true
+ do
+ _check_cached_events $evname
+
+ if [ $matched = 1 ];
+ then
+ return
+ fi
+
+ only_capture_events=1 qemu_error_no_exit=1 _timed_wait_for ${h}
+
+ if [ ${QEMU_STATUS[$h]} -ne 0 ] ; then
+ echo "Timeout waiting for event ${evname} on handle ${h}"
+ exit 1 #Timeout means the test failed
+ fi
+ done
+}
+
# Launch a QEMU process.
#
# Input parameters:
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 297acf9b6a..77c37e8312 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -109,8 +109,14 @@ peek_file_raw()
dd if="$1" bs=1 skip="$2" count="$3" status=none
}
-
-if ! . ./common.config
+config=common.config
+test -f $config || config=../common.config
+if ! test -f $config
+then
+ echo "$0: failed to find common.config"
+ exit 1
+fi
+if ! . $config
then
echo "$0: failed to source common.config"
exit 1
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 00be68eca3..4e758308f2 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -296,7 +296,9 @@ def qemu_nbd_list_log(*args: str) -> str:
@contextmanager
def qemu_nbd_popen(*args):
'''Context manager running qemu-nbd within the context'''
- pid_file = file_path("pid")
+ pid_file = file_path("qemu_nbd_popen-nbd-pid-file")
+
+ assert not os.path.exists(pid_file)
cmd = list(qemu_nbd_args)
cmd.extend(('--persistent', '--pid-file', pid_file))
@@ -314,6 +316,8 @@ def qemu_nbd_popen(*args):
time.sleep(0.01)
yield
finally:
+ if os.path.exists(pid_file):
+ os.remove(pid_file)
log('Kill NBD server')
p.kill()
p.wait()
diff --git a/tests/qtest/fuzz/fuzz.c b/tests/qtest/fuzz/fuzz.c
index 238866a037..496d11a231 100644
--- a/tests/qtest/fuzz/fuzz.c
+++ b/tests/qtest/fuzz/fuzz.c
@@ -159,6 +159,8 @@ int LLVMFuzzerInitialize(int *argc, char ***argv, char ***envp)
char *target_name;
const char *bindir;
char *datadir;
+ GString *cmd_line;
+ gchar *pretty_cmd_line;
bool serialize = false;
/* Initialize qgraph and modules */
@@ -217,7 +219,7 @@ int LLVMFuzzerInitialize(int *argc, char ***argv, char ***envp)
}
/* Run QEMU's softmmu main with the fuzz-target dependent arguments */
- GString *cmd_line = fuzz_target->get_init_cmdline(fuzz_target);
+ cmd_line = fuzz_target->get_init_cmdline(fuzz_target);
g_string_append_printf(cmd_line, " %s -qtest /dev/null ",
getenv("QTEST_LOG") ? "" : "-qtest-log none");
@@ -226,6 +228,13 @@ int LLVMFuzzerInitialize(int *argc, char ***argv, char ***envp)
wordexp(cmd_line->str, &result, 0);
g_string_free(cmd_line, true);
+ if (getenv("QTEST_LOG")) {
+ pretty_cmd_line = g_strjoinv(" ", result.we_wordv + 1);
+ printf("Starting %s with Arguments: %s\n",
+ result.we_wordv[0], pretty_cmd_line);
+ g_free(pretty_cmd_line);
+ }
+
qemu_init(result.we_wordc, result.we_wordv, NULL);
/* re-enable the rcu atfork, which was previously disabled in qemu_init */
diff --git a/tests/qtest/fuzz/generic_fuzz.c b/tests/qtest/fuzz/generic_fuzz.c
index be76d47d2d..ee8c17a04c 100644
--- a/tests/qtest/fuzz/generic_fuzz.c
+++ b/tests/qtest/fuzz/generic_fuzz.c
@@ -175,7 +175,7 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
* generic_fuzz(), avoiding potential race-conditions, which we don't have
* a good way for reproducing right now.
*/
-void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr, bool is_write)
+void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr)
{
/* Are we in the generic-fuzzer or are we using another fuzz-target? */
if (!qts_global) {
@@ -187,14 +187,11 @@ void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr, bool is_write)
* - We have no DMA patterns defined
* - The length of the DMA read request is zero
* - The DMA read is hitting an MR other than the machine's main RAM
- * - The DMA request is not a read (what happens for a address_space_map
- * with is_write=True? Can the device use the same pointer to do reads?)
* - The DMA request hits past the bounds of our RAM
*/
if (dma_patterns->len == 0
|| len == 0
|| mr != current_machine->ram
- || is_write
|| addr > current_machine->ram_size) {
return;
}
@@ -213,12 +210,12 @@ void fuzz_dma_read_cb(size_t addr, size_t len, MemoryRegion *mr, bool is_write)
double_fetch = true;
if (addr < region.addr
&& avoid_double_fetches) {
- fuzz_dma_read_cb(addr, region.addr - addr, mr, is_write);
+ fuzz_dma_read_cb(addr, region.addr - addr, mr);
}
if (addr + len > region.addr + region.size
&& avoid_double_fetches) {
fuzz_dma_read_cb(region.addr + region.size,
- addr + len - (region.addr + region.size), mr, is_write);
+ addr + len - (region.addr + region.size), mr);
}
return;
}
@@ -936,12 +933,20 @@ static GString *generic_fuzz_cmdline(FuzzTarget *t)
static GString *generic_fuzz_predefined_config_cmdline(FuzzTarget *t)
{
+ gchar *args;
const generic_fuzz_config *config;
g_assert(t->opaque);
config = t->opaque;
setenv("QEMU_AVOID_DOUBLE_FETCH", "1", 1);
- setenv("QEMU_FUZZ_ARGS", config->args, 1);
+ if (config->argfunc) {
+ args = config->argfunc();
+ setenv("QEMU_FUZZ_ARGS", args, 1);
+ g_free(args);
+ } else {
+ g_assert_nonnull(config->args);
+ setenv("QEMU_FUZZ_ARGS", config->args, 1);
+ }
setenv("QEMU_FUZZ_OBJECTS", config->objects, 1);
return generic_fuzz_cmdline(t);
}
diff --git a/tests/qtest/fuzz/generic_fuzz_configs.h b/tests/qtest/fuzz/generic_fuzz_configs.h
index 7fed035345..5d599765c4 100644
--- a/tests/qtest/fuzz/generic_fuzz_configs.h
+++ b/tests/qtest/fuzz/generic_fuzz_configs.h
@@ -16,8 +16,19 @@
typedef struct generic_fuzz_config {
const char *name, *args, *objects;
+ gchar* (*argfunc)(void); /* Result must be freeable by g_free() */
} generic_fuzz_config;
+static inline gchar *generic_fuzzer_virtio_9p_args(void)
+{
+ char tmpdir[] = "/tmp/qemu-fuzz.XXXXXX";
+ g_assert_nonnull(mkdtemp(tmpdir));
+
+ return g_strdup_printf("-machine q35 -nodefaults "
+ "-device virtio-9p,fsdev=hshare,mount_tag=hshare "
+ "-fsdev local,id=hshare,path=%s,security_model=mapped-xattr,"
+ "writeout=immediate,fmode=0600,dmode=0700", tmpdir);
+}
+
const generic_fuzz_config predefined_configs[] = {
{
.name = "virtio-net-pci-slirp",
@@ -60,6 +71,16 @@ const generic_fuzz_config predefined_configs[] = {
.args = "-machine q35 -nodefaults -device virtio-mouse",
.objects = "virtio*",
},{
+ .name = "virtio-9p",
+ .argfunc = generic_fuzzer_virtio_9p_args,
+ .objects = "virtio*",
+ },{
+ .name = "virtio-9p-synth",
+ .args = "-machine q35 -nodefaults "
+ "-device virtio-9p,fsdev=hshare,mount_tag=hshare "
+ "-fsdev synth,id=hshare",
+ .objects = "virtio*",
+ },{
.name = "e1000",
.args = "-M q35 -nodefaults "
"-device e1000,netdev=net0 -netdev user,id=net0",
@@ -85,10 +106,28 @@ const generic_fuzz_config predefined_configs[] = {
.objects = "intel-hda",
},{
.name = "ide-hd",
+ .args = "-machine pc -nodefaults "
+ "-drive file=null-co://,if=none,format=raw,id=disk0 "
+ "-device ide-hd,drive=disk0",
+ .objects = "*ide*",
+ },{
+ .name = "ide-atapi",
+ .args = "-machine pc -nodefaults "
+ "-drive file=null-co://,if=none,format=raw,id=disk0 "
+ "-device ide-cd,drive=disk0",
+ .objects = "*ide*",
+ },{
+ .name = "ahci-hd",
.args = "-machine q35 -nodefaults "
"-drive file=null-co://,if=none,format=raw,id=disk0 "
"-device ide-hd,drive=disk0",
- .objects = "ahci*",
+ .objects = "*ahci*",
+ },{
+ .name = "ahci-atapi",
+ .args = "-machine q35 -nodefaults "
+ "-drive file=null-co://,if=none,format=raw,id=disk0 "
+ "-device ide-cd,drive=disk0",
+ .objects = "*ahci*",
},{
.name = "floppy",
.args = "-machine pc -nodefaults -device floppy,id=floppy0 "
diff --git a/tests/tcg/Makefile.qemu b/tests/tcg/Makefile.qemu
index c096c611a2..a56564660c 100644
--- a/tests/tcg/Makefile.qemu
+++ b/tests/tcg/Makefile.qemu
@@ -90,11 +90,11 @@ run-guest-tests: guest-tests
else
guest-tests:
- $(call quiet-command, /bin/true, "BUILD", \
+ $(call quiet-command, true, "BUILD", \
"$(TARGET) guest-tests SKIPPED")
run-guest-tests:
- $(call quiet-command, /bin/true, "RUN", \
+ $(call quiet-command, true, "RUN", \
"tests for $(TARGET) SKIPPED")
endif
diff --git a/tests/tcg/multiarch/Makefile.target b/tests/tcg/multiarch/Makefile.target
index 1dd0f64d23..abbdb2e126 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -63,8 +63,11 @@ run-gdbstub-qxfer-auxv-read: sha1
--bin $< --test $(MULTIARCH_SRC)/gdbstub/test-qxfer-auxv-read.py, \
"basic gdbstub qXfer:auxv:read support")
-EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read
+else
+run-gdbstub-%:
+ $(call skip-test, "gdbstub test $*", "need working gdb")
endif
+EXTRA_RUNS += run-gdbstub-sha1 run-gdbstub-qxfer-auxv-read
# Update TESTS
diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c
index 00d93204f9..5b2110e861 100644
--- a/util/event_notifier-posix.c
+++ b/util/event_notifier-posix.c
@@ -29,6 +29,7 @@ void event_notifier_init_fd(EventNotifier *e, int fd)
{
e->rfd = fd;
e->wfd = fd;
+ e->initialized = true;
}
#endif
@@ -68,6 +69,7 @@ int event_notifier_init(EventNotifier *e, int active)
if (active) {
event_notifier_set(e);
}
+ e->initialized = true;
return 0;
fail:
@@ -78,12 +80,18 @@ fail:
void event_notifier_cleanup(EventNotifier *e)
{
+ if (!e->initialized) {
+ return;
+ }
+
if (e->rfd != e->wfd) {
close(e->rfd);
}
+
e->rfd = -1;
close(e->wfd);
e->wfd = -1;
+ e->initialized = false;
}
int event_notifier_get_fd(const EventNotifier *e)
@@ -96,6 +104,10 @@ int event_notifier_set(EventNotifier *e)
static const uint64_t value = 1;
ssize_t ret;
+ if (!e->initialized) {
+ return -1;
+ }
+
do {
ret = write(e->wfd, &value, sizeof(value));
} while (ret < 0 && errno == EINTR);
@@ -113,6 +125,10 @@ int event_notifier_test_and_clear(EventNotifier *e)
ssize_t len;
char buffer[512];
+ if (!e->initialized) {
+ return 0;
+ }
+
/* Drain the notify pipe. For eventfd, only 8 bytes will be read. */
value = 0;
do {
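
With the new initialized flag, a zero-filled EventNotifier can go through event_notifier_cleanup() safely even if initialization never happened or failed; event_notifier_set() fails cleanly and event_notifier_test_and_clear() reports no event instead of touching a stale descriptor. A minimal lifecycle sketch, assuming the API as modified in this hunk:

    /* Sketch of the lifecycle the initialized flag enables. */
    static void notifier_lifecycle_sketch(void)
    {
        EventNotifier e = { 0 };        /* never initialized */

        event_notifier_cleanup(&e);     /* safe no-op now */

        if (event_notifier_init(&e, 0) == 0) {
            event_notifier_set(&e);            /* works: initialized */
            event_notifier_test_and_clear(&e); /* drains the event */
            event_notifier_cleanup(&e);        /* closes fds, clears flag */
        }
        event_notifier_cleanup(&e);     /* second cleanup is a no-op */
    }
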
diff --git a/util/fifo8.c b/util/fifo8.c
index a5dd789ce5..d4d1c135e0 100644
--- a/util/fifo8.c
+++ b/util/fifo8.c
@@ -31,9 +31,7 @@ void fifo8_destroy(Fifo8 *fifo)
void fifo8_push(Fifo8 *fifo, uint8_t data)
{
- if (fifo->num == fifo->capacity) {
- abort();
- }
+ assert(fifo->num < fifo->capacity);
fifo->data[(fifo->head + fifo->num) % fifo->capacity] = data;
fifo->num++;
}
@@ -42,9 +40,7 @@ void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num)
{
uint32_t start, avail;
- if (fifo->num + num > fifo->capacity) {
- abort();
- }
+ assert(fifo->num + num <= fifo->capacity);
start = (fifo->head + fifo->num) % fifo->capacity;
@@ -63,9 +59,7 @@ uint8_t fifo8_pop(Fifo8 *fifo)
{
uint8_t ret;
- if (fifo->num == 0) {
- abort();
- }
+ assert(fifo->num > 0);
ret = fifo->data[fifo->head++];
fifo->head %= fifo->capacity;
fifo->num--;
@@ -76,9 +70,7 @@ const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *num)
{
uint8_t *ret;
- if (max == 0 || max > fifo->num) {
- abort();
- }
+ assert(max > 0 && max <= fifo->num);
*num = MIN(fifo->capacity - fifo->head, max);
ret = &fifo->data[fifo->head];
fifo->head += *num;
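
Converting the open-coded abort() calls to assert() makes the preconditions explicit: callers are expected to check occupancy before pushing or popping. A short usage sketch, assuming the existing Fifo8 API from include/qemu/fifo8.h:

    /* Sketch: gate on occupancy so the asserts above never fire. */
    static void fifo8_usage_sketch(void)
    {
        Fifo8 fifo;
        fifo8_create(&fifo, 16);

        if (!fifo8_is_full(&fifo)) {
            fifo8_push(&fifo, 0xab);         /* asserts num < capacity */
        }
        if (!fifo8_is_empty(&fifo)) {
            uint8_t byte = fifo8_pop(&fifo); /* asserts num > 0 */
            (void)byte;
        }
        fifo8_destroy(&fifo);
    }
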
diff --git a/util/meson.build b/util/meson.build
index 3eccdbe596..984fba965f 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -52,6 +52,7 @@ if have_system
util_ss.add(files('crc-ccitt.c'))
util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
util_ss.add(files('yank.c'))
+ util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
endif
if have_block
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 890fda6a35..e6fa8b598b 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -87,7 +87,8 @@ void *qemu_ram_mmap(int fd,
size_t align,
bool readonly,
bool shared,
- bool is_pmem)
+ bool is_pmem,
+ off_t map_offset)
{
int prot;
int flags;
@@ -150,7 +151,8 @@ void *qemu_ram_mmap(int fd,
prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
- ptr = mmap(guardptr + offset, size, prot, flags | map_sync_flags, fd, 0);
+ ptr = mmap(guardptr + offset, size, prot,
+ flags | map_sync_flags, fd, map_offset);
if (ptr == MAP_FAILED && map_sync_flags) {
if (errno == ENOTSUP) {
@@ -174,7 +176,7 @@ void *qemu_ram_mmap(int fd,
* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
* we will remove these flags to handle compatibility.
*/
- ptr = mmap(guardptr + offset, size, prot, flags, fd, 0);
+ ptr = mmap(guardptr + offset, size, prot, flags, fd, map_offset);
}
if (ptr == MAP_FAILED) {
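
The new map_offset parameter lets callers map a window that starts partway into the backing file descriptor instead of always at offset 0; both the MAP_SYNC attempt and the plain fallback honour it. A hedged call sketch (the wrapper name is illustrative; MiB comes from qemu/units.h):

    /* Sketch: map 'size' bytes starting 2 MiB into the backing fd. */
    static void *map_window(int fd, size_t size, size_t align)
    {
        return qemu_ram_mmap(fd, size, align,
                             false,      /* readonly */
                             true,       /* shared */
                             false,      /* is_pmem */
                             2 * MiB);   /* map_offset */
    }
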
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index bf57d3b030..36820fec16 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -230,7 +230,7 @@ void *qemu_memalign(size_t alignment, size_t size)
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
{
size_t align = QEMU_VMALLOC_ALIGN;
- void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false);
+ void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false, 0);
if (ptr == MAP_FAILED) {
return NULL;
diff --git a/util/trace-events b/util/trace-events
index 61e0d4bcdf..bac0924899 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -91,3 +91,12 @@ qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uin
qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
+
+# userfaultfd.c
+uffd_query_features_nosys(int err) "errno: %i"
+uffd_query_features_api_failed(int err) "errno: %i"
+uffd_create_fd_nosys(int err) "errno: %i"
+uffd_create_fd_api_failed(int err) "errno: %i"
+uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 " ioctl_supp: 0x%" PRIx64
+uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i"
+uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i"
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
new file mode 100644
index 0000000000..f1cd6af2b1
--- /dev/null
+++ b/util/userfaultfd.c
@@ -0,0 +1,345 @@
+/*
+ * Linux UFFD-WP support
+ *
+ * Copyright Virtuozzo GmbH, 2020
+ *
+ * Authors:
+ * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/userfaultfd.h"
+#include "trace.h"
+#include <poll.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+
+/**
+ * uffd_query_features: query UFFD features
+ *
+ * Returns: 0 on success, negative value in case of an error
+ *
+ * @features: parameter to receive 'uffdio_api.features'
+ */
+int uffd_query_features(uint64_t *features)
+{
+ int uffd_fd;
+ struct uffdio_api api_struct = { 0 };
+ int ret = -1;
+
+ uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ if (uffd_fd < 0) {
+ trace_uffd_query_features_nosys(errno);
+ return -1;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+
+ if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
+ trace_uffd_query_features_api_failed(errno);
+ goto out;
+ }
+ *features = api_struct.features;
+ ret = 0;
+
+out:
+ close(uffd_fd);
+ return ret;
+}
+
+/**
+ * uffd_create_fd: create UFFD file descriptor
+ *
+ * Returns non-negative file descriptor or negative value in case of an error
+ *
+ * @features: UFFD features to request
+ * @non_blocking: create UFFD file descriptor for non-blocking operation
+ */
+int uffd_create_fd(uint64_t features, bool non_blocking)
+{
+ int uffd_fd;
+ int flags;
+ struct uffdio_api api_struct = { 0 };
+ uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
+
+ flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
+ uffd_fd = syscall(__NR_userfaultfd, flags);
+ if (uffd_fd < 0) {
+ trace_uffd_create_fd_nosys(errno);
+ return -1;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = features;
+ if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
+ trace_uffd_create_fd_api_failed(errno);
+ goto fail;
+ }
+ if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+ trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
+ goto fail;
+ }
+
+ return uffd_fd;
+
+fail:
+ close(uffd_fd);
+ return -1;
+}
+
+/**
+ * uffd_close_fd: close UFFD file descriptor
+ *
+ * @uffd_fd: UFFD file descriptor
+ */
+void uffd_close_fd(int uffd_fd)
+{
+ assert(uffd_fd >= 0);
+ close(uffd_fd);
+}
+
+/**
+ * uffd_register_memory: register memory range via UFFD-IO
+ *
+ * Returns 0 in case of success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
+ * @ioctls: optional pointer to receive supported IOCTL mask
+ */
+int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
+ uint64_t mode, uint64_t *ioctls)
+{
+ struct uffdio_register uffd_register;
+
+ uffd_register.range.start = (uintptr_t) addr;
+ uffd_register.range.len = length;
+ uffd_register.mode = mode;
+
+ if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
+ trace_uffd_register_memory_failed(addr, length, mode, errno);
+ return -1;
+ }
+ if (ioctls) {
+ *ioctls = uffd_register.ioctls;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_unregister_memory: un-register memory range with UFFD-IO
+ *
+ * Returns 0 in case of success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ */
+int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
+{
+ struct uffdio_range uffd_range;
+
+ uffd_range.start = (uintptr_t) addr;
+ uffd_range.len = length;
+
+ if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
+ trace_uffd_unregister_memory_failed(addr, length, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_change_protection: write-protect or unprotect a memory range via UFFD-IO
+ *
+ * Returns 0 on success, negative value in case of error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ * @wp: write-protect/unprotect
+ * @dont_wake: do not wake threads waiting on the write-protected page
+ */
+int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
+ bool wp, bool dont_wake)
+{
+ struct uffdio_writeprotect uffd_writeprotect;
+
+ uffd_writeprotect.range.start = (uintptr_t) addr;
+ uffd_writeprotect.range.len = length;
+ if (!wp && dont_wake) {
+ /* DONTWAKE is meaningful only on protection release */
+ uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+ } else {
+ uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
+ }
+
+ if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+ error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", addr, length,
+ (uint64_t) uffd_writeprotect.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
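+/*
+ * Example usage (illustrative sketch): write-protect a registered range to
+ * start tracking writes; 'host' and 'size' are the registered base address
+ * and length. Once the protected pages have been preserved, protection is
+ * released with dont_wake=true, and the blocked threads are woken later by
+ * an explicit uffd_wakeup() over the range.
+ *
+ *     if (uffd_change_protection(uffd_fd, host, size, true, false)) {
+ *         return -1;
+ *     }
+ *
+ *     uffd_change_protection(uffd_fd, host, size, false, true);
+ */
+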
+/**
+ * uffd_copy_page: copy range of pages to destination via UFFD-IO
+ *
+ * Copy a range of source pages to the destination to resolve a
+ * missing page fault somewhere in the destination range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @dst_addr: destination base address
+ * @src_addr: source base address
+ * @length: length of the range to copy
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
+ uint64_t length, bool dont_wake)
+{
+ struct uffdio_copy uffd_copy;
+
+ uffd_copy.dst = (uintptr_t) dst_addr;
+ uffd_copy.src = (uintptr_t) src_addr;
+ uffd_copy.len = length;
+ uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
+
+ if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
+ error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
+ length, (uint64_t) uffd_copy.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
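+/*
+ * Example usage (illustrative sketch): resolve a missing-page fault by
+ * copying preserved content from a staging buffer; 'fault_addr', 'staging'
+ * and 'page_size' are hypothetical caller-side values.
+ *
+ *     if (uffd_copy_page(uffd_fd, fault_addr, staging, page_size, false)) {
+ *         return -1;
+ *     }
+ */
+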
+/**
+ * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
+ *
+ * Fill a range of pages with zeroes to resolve a missing page fault
+ * within the range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range to fill with zeroes
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
+{
+ struct uffdio_zeropage uffd_zeropage;
+
+ uffd_zeropage.range.start = (uintptr_t) addr;
+ uffd_zeropage.range.len = length;
+ uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
+
+ if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
+ error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", addr, length,
+ (uint64_t) uffd_zeropage.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
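+/*
+ * Example usage (illustrative sketch): a page known to contain only zeroes
+ * can be resolved without a staging copy; with dont_wake=true the faulting
+ * thread stays blocked until a later uffd_wakeup() covers the page.
+ *
+ *     if (uffd_zero_page(uffd_fd, fault_addr, page_size, true)) {
+ *         return -1;
+ *     }
+ */
+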
+/**
+ * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
+ *
+ * Wake up threads waiting on any page/pages from the designated range.
+ * The main use case is when, for some period, page faults are resolved
+ * via UFFD-IO IOCTLs with the MODE_DONTWAKE flag set; afterwards all waits
+ * for the whole memory range are satisfied in a single call to uffd_wakeup().
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range
+ */
+int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
+{
+ struct uffdio_range uffd_range;
+
+ uffd_range.start = (uintptr_t) addr;
+ uffd_range.len = length;
+
+ if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
+ error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
+ addr, length, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
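+/*
+ * Example usage (illustrative sketch): after a batch of pages has been
+ * resolved with the DONTWAKE variants of the IOCTLs above, all pending
+ * waits on the range are satisfied in a single call.
+ *
+ *     if (uffd_wakeup(uffd_fd, host, size)) {
+ *         return -1;
+ *     }
+ */
+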
+/**
+ * uffd_read_events: read pending UFFD events
+ *
+ * Returns the number of fetched messages, 0 if none are available, or
+ * a negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @msgs: pointer to message buffer
+ * @count: number of messages that can fit in the buffer
+ */
+int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
+{
+ ssize_t res;
+ do {
+ res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
+ } while (res < 0 && errno == EINTR);
+
+ if (res < 0 && errno == EAGAIN) {
+ return 0;
+ }
+ if (res < 0) {
+ error_report("uffd_read_events() failed: errno=%i", errno);
+ return -1;
+ }
+
+ return (int) (res / sizeof(struct uffd_msg));
+}
+
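+/*
+ * Example usage (illustrative sketch): a fault-handling loop combining
+ * uffd_read_events() with uffd_poll_events() defined below. handle_fault()
+ * and 'running' are hypothetical caller-side logic and state; the
+ * descriptor is assumed to have been created non-blocking, so the inner
+ * drain loop terminates once the event queue is empty.
+ *
+ *     struct uffd_msg msgs[64];
+ *
+ *     while (running) {
+ *         if (!uffd_poll_events(uffd_fd, 500)) {
+ *             continue;
+ *         }
+ *         int nr = uffd_read_events(uffd_fd, msgs, 64);
+ *         for (int i = 0; i < nr; i++) {
+ *             if (msgs[i].event == UFFD_EVENT_PAGEFAULT) {
+ *                 handle_fault((void *)(uintptr_t)
+ *                              msgs[i].arg.pagefault.address);
+ *             }
+ *         }
+ *     }
+ */
+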
+/**
+ * uffd_poll_events: poll UFFD file descriptor for read
+ *
+ * Returns true if events are available for read, false otherwise
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @tmo: timeout in milliseconds, as for poll(2)
+ */
+bool uffd_poll_events(int uffd_fd, int tmo)
+{
+ int res;
+ struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
+
+ do {
+ res = poll(&poll_fd, 1, tmo);
+ } while (res < 0 && errno == EINTR);
+
+ if (res == 0) {
+ return false;
+ }
+ if (res < 0) {
+ error_report("uffd_poll_events() failed: errno=%i", errno);
+ return false;
+ }
+
+ return (poll_fd.revents & POLLIN) != 0;
+}