-rw-r--r--  .gitignore | 1
-rw-r--r--  .gitlab-ci-edk2.yml | 49
-rw-r--r--  .gitlab-ci.d/edk2/Dockerfile | 27
-rw-r--r--  .gitlab-ci.yml | 4
-rw-r--r--  MAINTAINERS | 29
-rw-r--r--  Makefile | 50
-rw-r--r--  Makefile.objs | 1
-rw-r--r--  accel/tcg/atomic_template.h | 67
-rw-r--r--  accel/tcg/cpu-exec.c | 2
-rw-r--r--  accel/tcg/cputlb.c | 602
-rw-r--r--  accel/tcg/tcg-runtime-gvec.c | 2
-rw-r--r--  accel/tcg/tcg-runtime.c | 1
-rw-r--r--  accel/tcg/translate-all.c | 2
-rw-r--r--  accel/tcg/user-exec.c | 238
-rw-r--r--  backends/dbus-vmstate.c | 3
-rw-r--r--  bsd-user/main.c | 2
-rwxr-xr-x  configure | 26
-rw-r--r--  contrib/libvhost-user/libvhost-user.c | 57
-rw-r--r--  contrib/libvhost-user/libvhost-user.h | 6
-rw-r--r--  cpus.c | 2
-rw-r--r--  default-configs/arm-softmmu.mak | 1
-rw-r--r--  docs/devel/loads-stores.rst | 213
-rw-r--r--  docs/index.html.in | 1
-rw-r--r--  docs/index.rst | 2
-rw-r--r--  docs/interop/conf.py | 4
-rw-r--r--  docs/interop/index.rst | 1
-rw-r--r--  docs/interop/qemu-nbd.rst | 263
-rw-r--r--  docs/interop/qemu-option-trace.rst.inc | 30
-rw-r--r--  docs/interop/vhost-user.json | 4
-rw-r--r--  docs/qemu-block-drivers.texi | 889
-rw-r--r--  docs/specs/acpi_cpu_hotplug.txt | 89
-rw-r--r--  docs/system/conf.py | 22
-rw-r--r--  docs/system/index.rst | 17
-rw-r--r--  docs/system/qemu-block-drivers.rst | 985
-rw-r--r--  exec.c | 6
-rw-r--r--  fsdev/virtfs-proxy-helper.c | 4
-rw-r--r--  hw/9pfs/9p-local.c | 16
-rw-r--r--  hw/9pfs/9p.c | 42
-rw-r--r--  hw/9pfs/9p.h | 2
-rw-r--r--  hw/9pfs/virtio-9p-device.c | 12
-rw-r--r--  hw/9pfs/xen-9p-backend.c | 13
-rw-r--r--  hw/acpi/cpu.c | 18
-rw-r--r--  hw/acpi/generic_event_device.c | 2
-rw-r--r--  hw/acpi/trace-events | 1
-rw-r--r--  hw/arm/Kconfig | 10
-rw-r--r--  hw/arm/Makefile.objs | 2
-rw-r--r--  hw/arm/allwinner-a10.c | 39
-rw-r--r--  hw/arm/exynos4210.c | 77
-rw-r--r--  hw/arm/fsl-imx25.c | 11
-rw-r--r--  hw/arm/netduinoplus2.c | 52
-rw-r--r--  hw/arm/stellaris.c | 2
-rw-r--r--  hw/arm/stm32f405_soc.c | 302
-rw-r--r--  hw/arm/virt.c | 6
-rw-r--r--  hw/char/exynos4210_uart.c | 245
-rw-r--r--  hw/char/trace-events | 20
-rw-r--r--  hw/core/qdev.c | 3
-rw-r--r--  hw/display/ads7846.c | 2
-rw-r--r--  hw/dma/pl330.c | 88
-rw-r--r--  hw/dma/trace-events | 24
-rw-r--r--  hw/i2c/core.c | 2
-rw-r--r--  hw/i386/acpi-build.c | 1
-rw-r--r--  hw/i386/pc.c | 4
-rw-r--r--  hw/input/stellaris_input.c | 3
-rw-r--r--  hw/intc/apic_common.c | 7
-rw-r--r--  hw/intc/arm_gicv3_cpuif.c | 3
-rw-r--r--  hw/misc/Kconfig | 6
-rw-r--r--  hw/misc/Makefile.objs | 3
-rw-r--r--  hw/misc/imx_rngc.c | 278
-rw-r--r--  hw/misc/max111x.c | 3
-rw-r--r--  hw/misc/stm32f4xx_exti.c | 188
-rw-r--r--  hw/misc/stm32f4xx_syscfg.c | 171
-rw-r--r--  hw/misc/trace-events | 11
-rw-r--r--  hw/net/eepro100.c | 3
-rw-r--r--  hw/pci-host/q35.c | 84
-rw-r--r--  hw/pci/pci.c | 2
-rw-r--r--  hw/ppc/spapr.c | 2
-rw-r--r--  hw/scsi/virtio-scsi.c | 6
-rw-r--r--  hw/timer/arm_timer.c | 2
-rw-r--r--  hw/tpm/tpm_emulator.c | 3
-rw-r--r--  hw/virtio/vhost-user.c | 2
-rw-r--r--  hw/virtio/vhost-vsock.c | 12
-rw-r--r--  hw/virtio/vhost.c | 39
-rw-r--r--  include/elf.h | 1
-rw-r--r--  include/exec/cpu_ldst.h | 441
-rw-r--r--  include/exec/cpu_ldst_template.h | 211
-rw-r--r--  include/exec/cpu_ldst_useronly_template.h | 159
-rw-r--r--  include/exec/translator.h | 48
-rw-r--r--  include/hw/arm/allwinner-a10.h | 7
-rw-r--r--  include/hw/arm/exynos4210.h | 4
-rw-r--r--  include/hw/arm/fsl-imx25.h | 5
-rw-r--r--  include/hw/arm/stm32f405_soc.h | 73
-rw-r--r--  include/hw/misc/imx_rngc.h | 35
-rw-r--r--  include/hw/misc/stm32f4xx_exti.h | 60
-rw-r--r--  include/hw/misc/stm32f4xx_syscfg.h | 61
-rw-r--r--  include/hw/or-irq.h | 2
-rw-r--r--  include/hw/pci-host/q35.h | 10
-rw-r--r--  include/hw/virtio/vhost-vsock.h | 2
-rw-r--r--  include/migration/register.h | 2
-rw-r--r--  include/migration/vmstate.h | 25
-rw-r--r--  include/qemu/queue.h | 39
-rw-r--r--  include/standard-headers/linux/fuse.h | 891
-rw-r--r--  include/tcg/tcg-gvec-desc.h (renamed from tcg/tcg-gvec-desc.h) | 0
-rw-r--r--  include/tcg/tcg-mo.h (renamed from tcg/tcg-mo.h) | 0
-rw-r--r--  include/tcg/tcg-op-gvec.h (renamed from tcg/tcg-op-gvec.h) | 0
-rw-r--r--  include/tcg/tcg-op.h (renamed from tcg/tcg-op.h) | 2
-rw-r--r--  include/tcg/tcg-opc.h (renamed from tcg/tcg-opc.h) | 0
-rw-r--r--  include/tcg/tcg.h (renamed from tcg/tcg.h) | 33
-rw-r--r--  include/user/syscall-trace.h | 2
-rw-r--r--  linux-user/elfload.c | 75
-rw-r--r--  linux-user/ioctls.h | 41
-rw-r--r--  linux-user/main.c | 2
-rw-r--r--  linux-user/qemu.h | 1
-rw-r--r--  linux-user/syscall.c | 7
-rw-r--r--  linux-user/syscall_defs.h | 59
-rw-r--r--  linux-user/syscall_types.h | 37
-rw-r--r--  migration/migration.c | 72
-rw-r--r--  migration/migration.h | 1
-rw-r--r--  migration/ram.c | 196
-rw-r--r--  migration/savevm.c | 61
-rw-r--r--  migration/trace-events | 9
-rw-r--r--  migration/vmstate-types.c | 70
-rw-r--r--  plugins/api.c | 1
-rw-r--r--  qemu-doc.texi | 18
-rw-r--r--  qemu-nbd.texi | 214
-rw-r--r--  qemu-option-trace.texi | 4
-rw-r--r--  qemu-options.hx | 2
-rw-r--r--  roms/edk2-funcs.sh | 3
-rw-r--r--  scripts/git.orderfile | 3
-rw-r--r--  scripts/qapi/schema.py | 2
-rwxr-xr-x  scripts/update-linux-headers.sh | 1
-rw-r--r--  stubs/vmstate.c | 2
-rw-r--r--  target/alpha/cpu.h | 2
-rw-r--r--  target/alpha/translate.c | 2
-rw-r--r--  target/arm/arch_dump.c | 124
-rw-r--r--  target/arm/arm-semi.c | 5
-rw-r--r--  target/arm/cpu.c | 1
-rw-r--r--  target/arm/cpu.h | 25
-rw-r--r--  target/arm/helper-a64.c | 2
-rw-r--r--  target/arm/kvm64.c | 24
-rw-r--r--  target/arm/op_helper.c | 7
-rw-r--r--  target/arm/pauth_helper.c | 4
-rw-r--r--  target/arm/sve_helper.c | 1
-rw-r--r--  target/arm/tlb_helper.c | 2
-rw-r--r--  target/arm/translate-a64.c | 4
-rw-r--r--  target/arm/translate-sve.c | 6
-rw-r--r--  target/arm/translate.c | 7
-rw-r--r--  target/cris/cpu.h | 2
-rw-r--r--  target/cris/translate.c | 2
-rw-r--r--  target/hppa/translate.c | 2
-rw-r--r--  target/i386/cpu.h | 3
-rw-r--r--  target/i386/mem_helper.c | 2
-rw-r--r--  target/i386/seg_helper.c | 62
-rw-r--r--  target/i386/translate.c | 2
-rw-r--r--  target/lm32/translate.c | 2
-rw-r--r--  target/m68k/cpu.h | 2
-rw-r--r--  target/m68k/op_helper.c | 77
-rw-r--r--  target/m68k/translate.c | 44
-rw-r--r--  target/microblaze/cpu.h | 3
-rw-r--r--  target/microblaze/translate.c | 2
-rw-r--r--  target/mips/cpu.h | 4
-rw-r--r--  target/mips/op_helper.c | 182
-rw-r--r--  target/mips/translate.c | 2
-rw-r--r--  target/moxie/translate.c | 2
-rw-r--r--  target/nios2/cpu.h | 2
-rw-r--r--  target/nios2/translate.c | 2
-rw-r--r--  target/openrisc/fpu_helper.c | 2
-rw-r--r--  target/openrisc/translate.c | 2
-rw-r--r--  target/ppc/cpu.h | 2
-rw-r--r--  target/ppc/mem_helper.c | 13
-rw-r--r--  target/ppc/translate.c | 4
-rw-r--r--  target/riscv/cpu_helper.c | 2
-rw-r--r--  target/riscv/translate.c | 2
-rw-r--r--  target/s390x/cpu.h | 5
-rw-r--r--  target/s390x/mem_helper.c | 11
-rw-r--r--  target/s390x/translate.c | 4
-rw-r--r--  target/sh4/cpu.h | 2
-rw-r--r--  target/sh4/translate.c | 2
-rw-r--r--  target/sparc/ldst_helper.c | 2
-rw-r--r--  target/sparc/translate.c | 2
-rw-r--r--  target/tilegx/translate.c | 2
-rw-r--r--  target/tricore/translate.c | 2
-rw-r--r--  target/unicore32/cpu.h | 2
-rw-r--r--  target/unicore32/translate.c | 2
-rw-r--r--  target/xtensa/cpu.h | 4
-rw-r--r--  target/xtensa/mmu_helper.c | 5
-rw-r--r--  target/xtensa/translate.c | 2
-rw-r--r--  tcg/aarch64/tcg-target.inc.c | 4
-rw-r--r--  tcg/arm/tcg-target.inc.c | 4
-rw-r--r--  tcg/i386/tcg-target.h | 2
-rw-r--r--  tcg/i386/tcg-target.inc.c | 4
-rw-r--r--  tcg/mips/tcg-target.inc.c | 2
-rw-r--r--  tcg/optimize.c | 2
-rw-r--r--  tcg/ppc/tcg-target.inc.c | 4
-rw-r--r--  tcg/riscv/tcg-target.inc.c | 4
-rw-r--r--  tcg/s390/tcg-target.inc.c | 4
-rw-r--r--  tcg/sparc/tcg-target.inc.c | 2
-rw-r--r--  tcg/tcg-common.c | 2
-rw-r--r--  tcg/tcg-op-gvec.c | 8
-rw-r--r--  tcg/tcg-op-vec.c | 6
-rw-r--r--  tcg/tcg-op.c | 6
-rw-r--r--  tcg/tcg.c | 2
-rw-r--r--  tcg/tci.c | 2
-rw-r--r--  tests/acceptance/boot_linux_console.py | 85
-rw-r--r--  tests/data/acpi/q35/DSDT | bin 7879 -> 7869 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.acpihmat | bin 9203 -> 9193 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.bridge | bin 7896 -> 7886 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.cphp | bin 8342 -> 8332 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.dimmpxm | bin 9532 -> 9522 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.ipmibt | bin 7954 -> 7944 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.memhp | bin 9238 -> 9228 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.mmio64 | bin 9009 -> 8999 bytes
-rw-r--r--  tests/data/acpi/q35/DSDT.numamem | bin 7885 -> 7875 bytes
-rwxr-xr-x  tests/data/acpi/rebuild-expected-aml.sh | 6
-rw-r--r--  tests/qtest/bios-tables-test.c | 23
-rw-r--r--  tests/qtest/migration-test.c | 97
-rw-r--r--  tests/qtest/q35-test.c | 105
-rw-r--r--  tests/qtest/vhost-user-test.c | 11
-rw-r--r--  tests/tcg/aarch64/Makefile.softmmu-target | 5
-rw-r--r--  tests/tcg/aarch64/Makefile.target | 3
-rw-r--r--  tests/tcg/aarch64/pauth-1.c | 2
-rw-r--r--  tests/tcg/aarch64/pauth-2.c | 2
-rw-r--r--  tests/tcg/aarch64/pauth-4.c | 25
-rw-r--r--  tests/tcg/aarch64/system/pauth-3.c | 40
-rw-r--r--  tests/test-vmstate.c | 170
-rw-r--r--  tools/virtiofsd/50-qemu-virtiofsd.json.in | 5
-rw-r--r--  tools/virtiofsd/Makefile.objs | 12
-rw-r--r--  tools/virtiofsd/buffer.c | 351
-rw-r--r--  tools/virtiofsd/fuse.h | 1249
-rw-r--r--  tools/virtiofsd/fuse_common.h | 816
-rw-r--r--  tools/virtiofsd/fuse_i.h | 115
-rw-r--r--  tools/virtiofsd/fuse_log.c | 41
-rw-r--r--  tools/virtiofsd/fuse_log.h | 74
-rw-r--r--  tools/virtiofsd/fuse_lowlevel.c | 2761
-rw-r--r--  tools/virtiofsd/fuse_lowlevel.h | 1991
-rw-r--r--  tools/virtiofsd/fuse_misc.h | 60
-rw-r--r--  tools/virtiofsd/fuse_opt.c | 450
-rw-r--r--  tools/virtiofsd/fuse_opt.h | 272
-rw-r--r--  tools/virtiofsd/fuse_signals.c | 98
-rw-r--r--  tools/virtiofsd/fuse_virtio.c | 986
-rw-r--r--  tools/virtiofsd/fuse_virtio.h | 33
-rw-r--r--  tools/virtiofsd/helper.c | 349
-rw-r--r--  tools/virtiofsd/passthrough_helpers.h | 51
-rw-r--r--  tools/virtiofsd/passthrough_ll.c | 3006
-rw-r--r--  tools/virtiofsd/seccomp.c | 165
-rw-r--r--  tools/virtiofsd/seccomp.h | 16
-rw-r--r--  trace/mem-internal.h | 17
-rw-r--r--  ui/console.c | 1
-rw-r--r--  ui/vnc-enc-zrle.c | 4
-rw-r--r--  ui/vnc.c | 31
-rw-r--r--  util/cacheinfo.c | 10
-rw-r--r--  vl.c | 37
251 files changed, 19846 insertions, 2996 deletions
diff --git a/.gitignore b/.gitignore
index efad605e1a..bc0a035f9c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
/config-target.*
/config.status
/config-temp
+/tools/virtiofsd/50-qemu-virtiofsd.json
/elf2dmp
/trace-events-all
/trace/generated-events.h
diff --git a/.gitlab-ci-edk2.yml b/.gitlab-ci-edk2.yml
new file mode 100644
index 0000000000..088ba4b43a
--- /dev/null
+++ b/.gitlab-ci-edk2.yml
@@ -0,0 +1,49 @@
+docker-edk2:
+ stage: build
+ rules: # Only run this job when the Dockerfile is modified
+ - changes:
+ - .gitlab-ci-edk2.yml
+ - .gitlab-ci.d/edk2/Dockerfile
+ when: always
+ image: docker:19.03.1
+ services:
+ - docker:19.03.1-dind
+ variables:
+ GIT_DEPTH: 3
+ IMAGE_TAG: $CI_REGISTRY_IMAGE:edk2-cross-build
+ # We don't use TLS
+ DOCKER_HOST: tcp://docker:2375
+ DOCKER_TLS_CERTDIR: ""
+ before_script:
+ - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
+ script:
+ - docker pull $IMAGE_TAG || true
+ - docker build --cache-from $IMAGE_TAG --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
+ --tag $IMAGE_TAG .gitlab-ci.d/edk2
+ - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
+ - docker push $IMAGE_TAG
+
+build-edk2:
+ rules: # Only run this job when ...
+ - changes: # ... roms/edk2/ is modified (submodule updated)
+ - roms/edk2/*
+ when: always
+ - if: '$CI_COMMIT_REF_NAME =~ /^edk2/' # or the branch/tag starts with 'edk2'
+ when: always
+ - if: '$CI_COMMIT_MESSAGE =~ /edk2/i' # or last commit description contains 'EDK2'
+ when: always
+ artifacts:
+ paths: # 'artifacts.zip' will contain the following files:
+ - pc-bios/edk2*bz2
+ - pc-bios/edk2-licenses.txt
+ - edk2-stdout.log
+ - edk2-stderr.log
+ image: $CI_REGISTRY_IMAGE:edk2-cross-build
+ variables:
+ GIT_DEPTH: 3
+ script: # Clone the required submodules and build EDK2
+ - git submodule update --init roms/edk2
+ - git -C roms/edk2 submodule update --init
+ - export JOBS=$(($(getconf _NPROCESSORS_ONLN) + 1))
+ - echo "=== Using ${JOBS} simultaneous jobs ==="
+ - make -j${JOBS} -C roms efi 2>&1 1>edk2-stdout.log | tee -a edk2-stderr.log >&2
diff --git a/.gitlab-ci.d/edk2/Dockerfile b/.gitlab-ci.d/edk2/Dockerfile
new file mode 100644
index 0000000000..b4584d1cf6
--- /dev/null
+++ b/.gitlab-ci.d/edk2/Dockerfile
@@ -0,0 +1,27 @@
+#
+# Docker image to cross-compile EDK2 firmware binaries
+#
+FROM ubuntu:16.04
+
+MAINTAINER Philippe Mathieu-Daudé <philmd@redhat.com>
+
+# Install packages required to build EDK2
+RUN apt update \
+ && \
+ \
+ DEBIAN_FRONTEND=noninteractive \
+ apt install --assume-yes --no-install-recommends \
+ build-essential \
+ ca-certificates \
+ dos2unix \
+ gcc-aarch64-linux-gnu \
+ gcc-arm-linux-gnueabi \
+ git \
+ iasl \
+ make \
+ nasm \
+ python \
+ uuid-dev \
+ && \
+ \
+ rm -rf /var/lib/apt/lists/*
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index dce8f2d3f5..228783993e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,3 +1,6 @@
+include:
+ - local: '/.gitlab-ci-edk2.yml'
+
before_script:
- apt-get update -qq
- apt-get install -y -qq flex bison libglib2.0-dev libpixman-1-dev genisoimage
@@ -87,6 +90,7 @@ build-tci:
- ../configure --enable-tcg-interpreter
--target-list="$(for tg in $TARGETS; do echo -n ${tg}'-softmmu '; done)"
- make -j2
+ - make run-tcg-tests-x86_64-softmmu
- make tests/qtest/boot-serial-test tests/qtest/cdrom-test tests/qtest/pxe-test
- for tg in $TARGETS ; do
export QTEST_QEMU_BINARY="${tg}-softmmu/qemu-system-${tg}" ;
diff --git a/MAINTAINERS b/MAINTAINERS
index 483edfbc0b..f6511d5120 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -816,12 +816,26 @@ F: hw/adc/*
F: hw/ssi/stm32f2xx_spi.c
F: include/hw/*/stm32*.h
+STM32F405
+M: Alistair Francis <alistair@alistair23.me>
+M: Peter Maydell <peter.maydell@linaro.org>
+S: Maintained
+F: hw/arm/stm32f405_soc.c
+F: hw/misc/stm32f4xx_syscfg.c
+F: hw/misc/stm32f4xx_exti.c
+
Netduino 2
M: Alistair Francis <alistair@alistair23.me>
M: Peter Maydell <peter.maydell@linaro.org>
S: Maintained
F: hw/arm/netduino2.c
+Netduino Plus 2
+M: Alistair Francis <alistair@alistair23.me>
+M: Peter Maydell <peter.maydell@linaro.org>
+S: Maintained
+F: hw/arm/netduinoplus2.c
+
SmartFusion2
M: Subbaraya Sundeep <sundeep.lkml@gmail.com>
M: Peter Maydell <peter.maydell@linaro.org>
@@ -1581,6 +1595,14 @@ T: git https://github.com/cohuck/qemu.git s390-next
T: git https://github.com/borntraeger/qemu.git s390-next
L: qemu-s390x@nongnu.org
+virtiofs
+M: Dr. David Alan Gilbert <dgilbert@redhat.com>
+M: Stefan Hajnoczi <stefanha@redhat.com>
+S: Supported
+F: tools/virtiofsd/*
+F: hw/virtio/vhost-user-fs*
+F: include/hw/virtio/vhost-user-fs.h
+
virtio-input
M: Gerd Hoffmann <kraxel@redhat.com>
S: Maintained
@@ -2354,6 +2376,8 @@ F: roms/edk2
F: roms/edk2-*
F: tests/data/uefi-boot-images/
F: tests/uefi-test-tools/
+F: .gitlab-ci-edk2.yml
+F: .gitlab-ci.d/edk2/
Usermode Emulation
------------------
@@ -2382,6 +2406,7 @@ Common TCG code
M: Richard Henderson <rth@twiddle.net>
S: Maintained
F: tcg/
+F: include/tcg/
TCG Plugins
M: Alex Bennée <alex.bennee@linaro.org>
@@ -2391,8 +2416,7 @@ F: plugins/
F: tests/plugin
AArch64 TCG target
-M: Claudio Fontana <claudio.fontana@huawei.com>
-M: Claudio Fontana <claudio.fontana@gmail.com>
+M: Richard Henderson <richard.henderson@linaro.org>
S: Maintained
L: qemu-arm@nongnu.org
F: tcg/aarch64/
@@ -2503,6 +2527,7 @@ F: include/block/nbd*
F: qemu-nbd.*
F: blockdev-nbd.c
F: docs/interop/nbd.txt
+F: docs/interop/qemu-nbd.rst
T: git https://repo.or.cz/qemu/ericb.git nbd
NFS
diff --git a/Makefile b/Makefile
index afa5cb6548..04c77d3b96 100644
--- a/Makefile
+++ b/Makefile
@@ -327,6 +327,11 @@ HELPERS-y += vhost-user-gpu$(EXESUF)
vhost-user-json-y += contrib/vhost-user-gpu/50-qemu-gpu.json
endif
+ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy)
+HELPERS-y += virtiofsd$(EXESUF)
+vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json
+endif
+
# Sphinx does not allow building manuals into the same directory as
# the source files, so if we're doing an in-tree QEMU build we must
# build the manuals into a subdirectory (and then install them from
@@ -339,10 +344,12 @@ MANUAL_BUILDDIR := docs
endif
ifdef BUILD_DOCS
-DOCS=qemu-doc.html qemu-doc.txt qemu.1 qemu-img.1 qemu-nbd.8 $(MANUAL_BUILDDIR)/interop/qemu-ga.8
+DOCS=qemu-doc.html qemu-doc.txt qemu.1 qemu-img.1
+DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-nbd.8
+DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-ga.8
+DOCS+=$(MANUAL_BUILDDIR)/system/qemu-block-drivers.7
DOCS+=docs/interop/qemu-qmp-ref.html docs/interop/qemu-qmp-ref.txt docs/interop/qemu-qmp-ref.7
DOCS+=docs/interop/qemu-ga-ref.html docs/interop/qemu-ga-ref.txt docs/interop/qemu-ga-ref.7
-DOCS+=docs/qemu-block-drivers.7
DOCS+=docs/qemu-cpu-models.7
DOCS+=$(MANUAL_BUILDDIR)/index.html
ifdef CONFIG_VIRTFS
@@ -429,6 +436,7 @@ dummy := $(call unnest-vars,, \
elf2dmp-obj-y \
ivshmem-client-obj-y \
ivshmem-server-obj-y \
+ virtiofsd-obj-y \
rdmacm-mux-obj-y \
libvhost-user-obj-y \
vhost-user-scsi-obj-y \
@@ -668,6 +676,12 @@ rdmacm-mux$(EXESUF): LIBS += "-libumad"
rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS)
$(call LINK, $^)
+# relies on Linux-specific syscalls
+ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy)
+virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a $(COMMON_LDADDS)
+ $(call LINK, $^)
+endif
+
vhost-user-gpu$(EXESUF): $(vhost-user-gpu-obj-y) $(libvhost-user-obj-y) libqemuutil.a libqemustub.a
$(call LINK, $^)
@@ -749,12 +763,12 @@ distclean: clean
rm -f docs/interop/qemu-qmp-ref.txt docs/interop/qemu-ga-ref.txt
rm -f docs/interop/qemu-qmp-ref.pdf docs/interop/qemu-ga-ref.pdf
rm -f docs/interop/qemu-qmp-ref.html docs/interop/qemu-ga-ref.html
- rm -f docs/qemu-block-drivers.7
rm -f docs/qemu-cpu-models.7
rm -rf .doctrees
$(call clean-manual,devel)
$(call clean-manual,interop)
$(call clean-manual,specs)
+ $(call clean-manual,system)
for d in $(TARGET_DIRS); do \
rm -rf $$d || exit 1 ; \
done
@@ -811,6 +825,7 @@ endef
install-sphinxdocs: sphinxdocs
$(call install-manual,interop)
$(call install-manual,specs)
+ $(call install-manual,system)
install-doc: $(DOCS) install-sphinxdocs
$(INSTALL_DIR) "$(DESTDIR)$(qemu_docdir)"
@@ -824,12 +839,12 @@ ifdef CONFIG_POSIX
$(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1"
$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man7"
$(INSTALL_DATA) docs/interop/qemu-qmp-ref.7 "$(DESTDIR)$(mandir)/man7"
- $(INSTALL_DATA) docs/qemu-block-drivers.7 "$(DESTDIR)$(mandir)/man7"
+ $(INSTALL_DATA) $(MANUAL_BUILDDIR)/system/qemu-block-drivers.7 "$(DESTDIR)$(mandir)/man7"
$(INSTALL_DATA) docs/qemu-cpu-models.7 "$(DESTDIR)$(mandir)/man7"
ifeq ($(CONFIG_TOOLS),y)
$(INSTALL_DATA) qemu-img.1 "$(DESTDIR)$(mandir)/man1"
$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man8"
- $(INSTALL_DATA) qemu-nbd.8 "$(DESTDIR)$(mandir)/man8"
+ $(INSTALL_DATA) $(MANUAL_BUILDDIR)/interop/qemu-nbd.8 "$(DESTDIR)$(mandir)/man8"
endif
ifdef CONFIG_TRACE_SYSTEMTAP
$(INSTALL_DATA) scripts/qemu-trace-stap.1 "$(DESTDIR)$(mandir)/man1"
@@ -998,7 +1013,10 @@ docs/version.texi: $(SRC_PATH)/VERSION config-host.mak
# and handles "don't rebuild things unless necessary" itself.
# The '.doctrees' files are cached information to speed this up.
.PHONY: sphinxdocs
-sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html $(MANUAL_BUILDDIR)/interop/index.html $(MANUAL_BUILDDIR)/specs/index.html
+sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html \
+ $(MANUAL_BUILDDIR)/interop/index.html \
+ $(MANUAL_BUILDDIR)/specs/index.html \
+ $(MANUAL_BUILDDIR)/system/index.html
# Canned command to build a single manual
# Arguments: $1 = manual name, $2 = Sphinx builder ('html' or 'man')
@@ -1007,7 +1025,9 @@ sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html $(MANUAL_BUILDDIR)/interop/index
# a single doctree: https://github.com/sphinx-doc/sphinx/issues/2946
build-manual = $(call quiet-command,CONFDIR="$(qemu_confdir)" sphinx-build $(if $(V),,-q) -W -b $2 -D version=$(VERSION) -D release="$(FULL_VERSION)" -d .doctrees/$1-$2 $(SRC_PATH)/docs/$1 $(MANUAL_BUILDDIR)/$1 ,"SPHINX","$(MANUAL_BUILDDIR)/$1")
# We assume all RST files in the manual's directory are used in it
-manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst) $(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py
+manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst) \
+ $(wildcard $(SRC_PATH)/docs/$1/*.rst.inc) \
+ $(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py
$(MANUAL_BUILDDIR)/devel/index.html: $(call manual-deps,devel)
$(call build-manual,devel,html)
@@ -1018,10 +1038,20 @@ $(MANUAL_BUILDDIR)/interop/index.html: $(call manual-deps,interop)
$(MANUAL_BUILDDIR)/specs/index.html: $(call manual-deps,specs)
$(call build-manual,specs,html)
+$(MANUAL_BUILDDIR)/system/index.html: $(call manual-deps,system)
+ $(call build-manual,system,html)
+
$(MANUAL_BUILDDIR)/interop/qemu-ga.8: $(call manual-deps,interop)
$(call build-manual,interop,man)
+$(MANUAL_BUILDDIR)/interop/qemu-nbd.8: $(call manual-deps,interop)
+ $(call build-manual,interop,man)
+
+$(MANUAL_BUILDDIR)/system/qemu-block-drivers.7: $(call manual-deps,system)
+ $(call build-manual,system,man)
+
$(MANUAL_BUILDDIR)/index.html: $(SRC_PATH)/docs/index.html.in qemu-version.h
+ @mkdir -p "$(MANUAL_BUILDDIR)"
$(call quiet-command, sed "s|@@VERSION@@|${VERSION}|g" $< >$@, \
"GEN","$@")
@@ -1047,8 +1077,6 @@ qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
qemu.1: qemu-option-trace.texi
qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi
fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi
-qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi
-docs/qemu-block-drivers.7: docs/qemu-block-drivers.texi
docs/qemu-cpu-models.7: docs/qemu-cpu-models.texi
scripts/qemu-trace-stap.1: scripts/qemu-trace-stap.texi
@@ -1058,10 +1086,10 @@ pdf: qemu-doc.pdf docs/interop/qemu-qmp-ref.pdf docs/interop/qemu-ga-ref.pdf
txt: qemu-doc.txt docs/interop/qemu-qmp-ref.txt docs/interop/qemu-ga-ref.txt
qemu-doc.html qemu-doc.info qemu-doc.pdf qemu-doc.txt: \
- qemu-img.texi qemu-nbd.texi qemu-options.texi \
+ qemu-img.texi qemu-options.texi \
qemu-tech.texi qemu-option-trace.texi \
qemu-deprecated.texi qemu-monitor.texi qemu-img-cmds.texi \
- qemu-monitor-info.texi docs/qemu-block-drivers.texi \
+ qemu-monitor-info.texi \
docs/qemu-cpu-models.texi docs/security.texi
docs/interop/qemu-ga-ref.dvi docs/interop/qemu-ga-ref.html \
diff --git a/Makefile.objs b/Makefile.objs
index 7c1e50f9d6..ff396b9209 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -123,6 +123,7 @@ vhost-user-blk-obj-y = contrib/vhost-user-blk/
rdmacm-mux-obj-y = contrib/rdmacm-mux/
vhost-user-input-obj-y = contrib/vhost-user-input/
vhost-user-gpu-obj-y = contrib/vhost-user-gpu/
+virtiofsd-obj-y = tools/virtiofsd/
######################################################################
trace-events-subdirs =
diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
index 837676231f..26969487d6 100644
--- a/accel/tcg/atomic_template.h
+++ b/accel/tcg/atomic_template.h
@@ -64,13 +64,10 @@
the ATOMIC_NAME macro, and redefined below. */
#if DATA_SIZE == 1
# define END
-# define MEND _be /* either le or be would be fine */
#elif defined(HOST_WORDS_BIGENDIAN)
# define END _be
-# define MEND _be
#else
# define END _le
-# define MEND _le
#endif
ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
@@ -79,8 +76,8 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE ret;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
+ ATOMIC_MMU_IDX);
atomic_trace_rmw_pre(env, addr, info);
#if DATA_SIZE == 16
@@ -99,8 +96,8 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
+ ATOMIC_MMU_IDX);
atomic_trace_ld_pre(env, addr, info);
val = atomic16_read(haddr);
@@ -114,8 +111,8 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, true,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, 0, true,
+ ATOMIC_MMU_IDX);
atomic_trace_st_pre(env, addr, info);
atomic16_set(haddr, val);
@@ -130,8 +127,8 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE ret;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, 0, false,
+ ATOMIC_MMU_IDX);
atomic_trace_rmw_pre(env, addr, info);
ret = atomic_xchg__nocheck(haddr, val);
@@ -147,10 +144,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ATOMIC_MMU_DECLS; \
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
DATA_TYPE ret; \
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \
- false, \
- ATOMIC_MMU_IDX); \
- \
+ uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, \
+ ATOMIC_MMU_IDX); \
atomic_trace_rmw_pre(env, addr, info); \
ret = atomic_##X(haddr, val); \
ATOMIC_MMU_CLEANUP; \
@@ -183,10 +178,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ATOMIC_MMU_DECLS; \
XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
XDATA_TYPE cmp, old, new, val = xval; \
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \
- false, \
- ATOMIC_MMU_IDX); \
- \
+ uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, \
+ ATOMIC_MMU_IDX); \
atomic_trace_rmw_pre(env, addr, info); \
smp_mb(); \
cmp = atomic_read__nocheck(haddr); \
@@ -213,7 +206,6 @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX, DATA_TYPE, new)
#endif /* DATA SIZE >= 16 */
#undef END
-#undef MEND
#if DATA_SIZE > 1
@@ -221,10 +213,8 @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX, DATA_TYPE, new)
within the ATOMIC_NAME macro. */
#ifdef HOST_WORDS_BIGENDIAN
# define END _le
-# define MEND _le
#else
# define END _be
-# define MEND _be
#endif
ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
@@ -233,9 +223,8 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr,
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
DATA_TYPE ret;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
- false,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
+ ATOMIC_MMU_IDX);
atomic_trace_rmw_pre(env, addr, info);
#if DATA_SIZE == 16
@@ -254,9 +243,8 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS)
{
ATOMIC_MMU_DECLS;
DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
- false,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
+ ATOMIC_MMU_IDX);
atomic_trace_ld_pre(env, addr, info);
val = atomic16_read(haddr);
@@ -270,9 +258,8 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr,
{
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
- true,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, true,
+ ATOMIC_MMU_IDX);
val = BSWAP(val);
atomic_trace_st_pre(env, addr, info);
@@ -289,9 +276,8 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr,
ATOMIC_MMU_DECLS;
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;
ABI_TYPE ret;
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT,
- false,
- ATOMIC_MMU_IDX);
+ uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
+ ATOMIC_MMU_IDX);
atomic_trace_rmw_pre(env, addr, info);
ret = atomic_xchg__nocheck(haddr, BSWAP(val));
@@ -307,10 +293,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ATOMIC_MMU_DECLS; \
DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
DATA_TYPE ret; \
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \
- false, \
- ATOMIC_MMU_IDX); \
- \
+ uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, \
+ false, ATOMIC_MMU_IDX); \
atomic_trace_rmw_pre(env, addr, info); \
ret = atomic_##X(haddr, BSWAP(val)); \
ATOMIC_MMU_CLEANUP; \
@@ -341,10 +325,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \
ATOMIC_MMU_DECLS; \
XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \
XDATA_TYPE ldo, ldn, old, new, val = xval; \
- uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \
- false, \
- ATOMIC_MMU_IDX); \
- \
+ uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, \
+ false, ATOMIC_MMU_IDX); \
atomic_trace_rmw_pre(env, addr, info); \
smp_mb(); \
ldn = atomic_read__nocheck(haddr); \
@@ -378,7 +360,6 @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new)
#endif /* DATA_SIZE >= 16 */
#undef END
-#undef MEND
#endif /* DATA_SIZE > 1 */
#undef BSWAP
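With the MEND suffix gone, the trace-info setup in the two halves of this template differs only in the endianness value handed to trace_mem_build_info(). A minimal sketch of the new call shape, reusing the SHIFT and ATOMIC_MMU_IDX names from the template context above (the bit layout packed into the returned info word is not spelled out here):

    /* Host-endian (and 1-byte) half: no byte swap requested. */
    uint16_t info_nat = trace_mem_build_info(SHIFT, false, 0, false,
                                             ATOMIC_MMU_IDX);
    /* Byte-swapped half: MO_BSWAP replaces the old _le/_be (MEND) selection. */
    uint16_t info_swp = trace_mem_build_info(SHIFT, false, MO_BSWAP, false,
                                             ATOMIC_MMU_IDX);
    /* Store variants pass true for the fourth argument instead of false. */
    uint16_t info_st  = trace_mem_build_info(SHIFT, false, 0, true,
                                             ATOMIC_MMU_IDX);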
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 62068d10c3..2560c90eec 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -23,7 +23,7 @@
#include "trace.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "qemu/atomic.h"
#include "sysemu/qtest.h"
#include "qemu/timer.h"
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 98221948d6..e3b5750c3b 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -34,6 +34,8 @@
#include "qemu/atomic.h"
#include "qemu/atomic128.h"
#include "translate-all.h"
+#include "trace-root.h"
+#include "trace/mem.h"
#ifdef CONFIG_PLUGIN
#include "qemu/plugin-memory.h"
#endif
@@ -78,9 +80,14 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
-static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
+static inline size_t tlb_n_entries(CPUTLBDescFast *fast)
{
- return env_tlb(env)->f[mmu_idx].mask + (1 << CPU_TLB_ENTRY_BITS);
+ return (fast->mask >> CPU_TLB_ENTRY_BITS) + 1;
+}
+
+static inline size_t sizeof_tlb(CPUTLBDescFast *fast)
+{
+ return fast->mask + (1 << CPU_TLB_ENTRY_BITS);
}
static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
@@ -90,26 +97,10 @@ static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
desc->window_max_entries = max_entries;
}
-static void tlb_dyn_init(CPUArchState *env)
-{
- int i;
-
- for (i = 0; i < NB_MMU_MODES; i++) {
- CPUTLBDesc *desc = &env_tlb(env)->d[i];
- size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
-
- tlb_window_reset(desc, get_clock_realtime(), 0);
- desc->n_used_entries = 0;
- env_tlb(env)->f[i].mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
- env_tlb(env)->f[i].table = g_new(CPUTLBEntry, n_entries);
- env_tlb(env)->d[i].iotlb = g_new(CPUIOTLBEntry, n_entries);
- }
-}
-
/**
* tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
- * @env: CPU that owns the TLB
- * @mmu_idx: MMU index of the TLB
+ * @desc: The CPUTLBDesc portion of the TLB
+ * @fast: The CPUTLBDescFast portion of the same TLB
*
* Called with tlb_lock_held.
*
@@ -146,13 +137,12 @@ static void tlb_dyn_init(CPUArchState *env)
* high), since otherwise we are likely to have a significant amount of
* conflict misses.
*/
-static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
+static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
+ int64_t now)
{
- CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
- size_t old_size = tlb_n_entries(env, mmu_idx);
+ size_t old_size = tlb_n_entries(fast);
size_t rate;
size_t new_size = old_size;
- int64_t now = get_clock_realtime();
int64_t window_len_ms = 100;
int64_t window_len_ns = window_len_ms * 1000 * 1000;
bool window_expired = now > desc->window_begin_ns + window_len_ns;
@@ -191,14 +181,15 @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
return;
}
- g_free(env_tlb(env)->f[mmu_idx].table);
- g_free(env_tlb(env)->d[mmu_idx].iotlb);
+ g_free(fast->table);
+ g_free(desc->iotlb);
tlb_window_reset(desc, now, 0);
/* desc->n_used_entries is cleared by the caller */
- env_tlb(env)->f[mmu_idx].mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
- env_tlb(env)->f[mmu_idx].table = g_try_new(CPUTLBEntry, new_size);
- env_tlb(env)->d[mmu_idx].iotlb = g_try_new(CPUIOTLBEntry, new_size);
+ fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+ fast->table = g_try_new(CPUTLBEntry, new_size);
+ desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
+
/*
* If the allocations fail, try smaller sizes. We just freed some
* memory, so going back to half of new_size has a good chance of working.
@@ -206,27 +197,51 @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
* allocations to fail though, so we progressively reduce the allocation
* size, aborting if we cannot even allocate the smallest TLB we support.
*/
- while (env_tlb(env)->f[mmu_idx].table == NULL ||
- env_tlb(env)->d[mmu_idx].iotlb == NULL) {
+ while (fast->table == NULL || desc->iotlb == NULL) {
if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
error_report("%s: %s", __func__, strerror(errno));
abort();
}
new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
- env_tlb(env)->f[mmu_idx].mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+ fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
- g_free(env_tlb(env)->f[mmu_idx].table);
- g_free(env_tlb(env)->d[mmu_idx].iotlb);
- env_tlb(env)->f[mmu_idx].table = g_try_new(CPUTLBEntry, new_size);
- env_tlb(env)->d[mmu_idx].iotlb = g_try_new(CPUIOTLBEntry, new_size);
+ g_free(fast->table);
+ g_free(desc->iotlb);
+ fast->table = g_try_new(CPUTLBEntry, new_size);
+ desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
}
}
-static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
+static void tlb_mmu_flush_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast)
{
- tlb_mmu_resize_locked(env, mmu_idx);
- memset(env_tlb(env)->f[mmu_idx].table, -1, sizeof_tlb(env, mmu_idx));
- env_tlb(env)->d[mmu_idx].n_used_entries = 0;
+ desc->n_used_entries = 0;
+ desc->large_page_addr = -1;
+ desc->large_page_mask = -1;
+ desc->vindex = 0;
+ memset(fast->table, -1, sizeof_tlb(fast));
+ memset(desc->vtable, -1, sizeof(desc->vtable));
+}
+
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx,
+ int64_t now)
+{
+ CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx];
+ CPUTLBDescFast *fast = &env_tlb(env)->f[mmu_idx];
+
+ tlb_mmu_resize_locked(desc, fast, now);
+ tlb_mmu_flush_locked(desc, fast);
+}
+
+static void tlb_mmu_init(CPUTLBDesc *desc, CPUTLBDescFast *fast, int64_t now)
+{
+ size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
+
+ tlb_window_reset(desc, now, 0);
+ desc->n_used_entries = 0;
+ fast->mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
+ fast->table = g_new(CPUTLBEntry, n_entries);
+ desc->iotlb = g_new(CPUIOTLBEntry, n_entries);
+ tlb_mmu_flush_locked(desc, fast);
}
static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
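    /*
     * The mask arithmetic shared by tlb_n_entries(), sizeof_tlb() and
     * tlb_mmu_init() above is easiest to check with a concrete value.
     * A self-contained sketch, assuming an illustrative CPU_TLB_ENTRY_BITS
     * of 5 (32-byte TLB entries); the real value is target-dependent.
     */
    #include <assert.h>
    #include <stddef.h>

    #define CPU_TLB_ENTRY_BITS 5            /* assumed for illustration */

    int main(void)
    {
        /* Example entry count (tlb_mmu_init uses 1 << CPU_TLB_DYN_DEFAULT_BITS). */
        size_t n_entries = 256;

        /* tlb_mmu_init()/tlb_mmu_resize_locked() store the size as a mask: */
        size_t mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;   /* 255 * 32 = 8160 */

        /* tlb_n_entries() recovers the entry count from that mask ... */
        assert(((mask >> CPU_TLB_ENTRY_BITS) + 1) == n_entries);

        /* ... and sizeof_tlb() recovers the table size in bytes. */
        assert((mask + (1 << CPU_TLB_ENTRY_BITS)) == (n_entries << CPU_TLB_ENTRY_BITS));
        return 0;
    }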
@@ -242,13 +257,17 @@ static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
void tlb_init(CPUState *cpu)
{
CPUArchState *env = cpu->env_ptr;
+ int64_t now = get_clock_realtime();
+ int i;
qemu_spin_init(&env_tlb(env)->c.lock);
- /* Ensure that cpu_reset performs a full flush. */
- env_tlb(env)->c.dirty = ALL_MMUIDX_BITS;
+ /* All tlbs are initialized flushed. */
+ env_tlb(env)->c.dirty = 0;
- tlb_dyn_init(env);
+ for (i = 0; i < NB_MMU_MODES; i++) {
+ tlb_mmu_init(&env_tlb(env)->d[i], &env_tlb(env)->f[i], now);
+ }
}
/* flush_all_helper: run fn across all cpus
@@ -287,21 +306,12 @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
*pelide = elide;
}
-static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
-{
- tlb_table_flush_by_mmuidx(env, mmu_idx);
- env_tlb(env)->d[mmu_idx].large_page_addr = -1;
- env_tlb(env)->d[mmu_idx].large_page_mask = -1;
- env_tlb(env)->d[mmu_idx].vindex = 0;
- memset(env_tlb(env)->d[mmu_idx].vtable, -1,
- sizeof(env_tlb(env)->d[0].vtable));
-}
-
static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
{
CPUArchState *env = cpu->env_ptr;
uint16_t asked = data.host_int;
uint16_t all_dirty, work, to_clean;
+ int64_t now = get_clock_realtime();
assert_cpu_is_self(cpu);
@@ -316,7 +326,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
for (work = to_clean; work != 0; work &= work - 1) {
int mmu_idx = ctz32(work);
- tlb_flush_one_mmuidx_locked(env, mmu_idx);
+ tlb_flush_one_mmuidx_locked(env, mmu_idx, now);
}
qemu_spin_unlock(&env_tlb(env)->c.lock);
@@ -438,7 +448,7 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
tlb_debug("forcing full flush midx %d ("
TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
midx, lp_addr, lp_mask);
- tlb_flush_one_mmuidx_locked(env, midx);
+ tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime());
} else {
if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
tlb_n_used_entries_dec(env, midx);
@@ -447,28 +457,29 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
}
}
-/* As we are going to hijack the bottom bits of the page address for a
- * mmuidx bit mask we need to fail to build if we can't do that
+/**
+ * tlb_flush_page_by_mmuidx_async_0:
+ * @cpu: cpu on which to flush
+ * @addr: page of virtual address to flush
+ * @idxmap: set of mmu_idx to flush
+ *
+ * Helper for tlb_flush_page_by_mmuidx and friends, flush one page
+ * at @addr from the tlbs indicated by @idxmap from @cpu.
*/
-QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);
-
-static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
- run_on_cpu_data data)
+static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu,
+ target_ulong addr,
+ uint16_t idxmap)
{
CPUArchState *env = cpu->env_ptr;
- target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
- target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
- unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
int mmu_idx;
assert_cpu_is_self(cpu);
- tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%lx\n",
- addr, mmu_idx_bitmap);
+ tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%x\n", addr, idxmap);
qemu_spin_lock(&env_tlb(env)->c.lock);
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
- if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
+ if ((idxmap >> mmu_idx) & 1) {
tlb_flush_page_locked(env, mmu_idx, addr);
}
}
@@ -477,22 +488,75 @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
tb_flush_jmp_cache(cpu, addr);
}
-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
+/**
+ * tlb_flush_page_by_mmuidx_async_1:
+ * @cpu: cpu on which to flush
+ * @data: encoded addr + idxmap
+ *
+ * Helper for tlb_flush_page_by_mmuidx and friends, called through
+ * async_run_on_cpu. The idxmap parameter is encoded in the page
+ * offset of the target_ptr field. This limits the set of mmu_idx
+ * that can be passed via this method.
+ */
+static void tlb_flush_page_by_mmuidx_async_1(CPUState *cpu,
+ run_on_cpu_data data)
{
- target_ulong addr_and_mmu_idx;
+ target_ulong addr_and_idxmap = (target_ulong) data.target_ptr;
+ target_ulong addr = addr_and_idxmap & TARGET_PAGE_MASK;
+ uint16_t idxmap = addr_and_idxmap & ~TARGET_PAGE_MASK;
+ tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap);
+}
+
+typedef struct {
+ target_ulong addr;
+ uint16_t idxmap;
+} TLBFlushPageByMMUIdxData;
+
+/**
+ * tlb_flush_page_by_mmuidx_async_2:
+ * @cpu: cpu on which to flush
+ * @data: allocated addr + idxmap
+ *
+ * Helper for tlb_flush_page_by_mmuidx and friends, called through
+ * async_run_on_cpu. The addr+idxmap parameters are stored in a
+ * TLBFlushPageByMMUIdxData structure that has been allocated
+ * specifically for this helper. Free the structure when done.
+ */
+static void tlb_flush_page_by_mmuidx_async_2(CPUState *cpu,
+ run_on_cpu_data data)
+{
+ TLBFlushPageByMMUIdxData *d = data.host_ptr;
+
+ tlb_flush_page_by_mmuidx_async_0(cpu, d->addr, d->idxmap);
+ g_free(d);
+}
+
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
+{
tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap);
/* This should already be page aligned */
- addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
- addr_and_mmu_idx |= idxmap;
+ addr &= TARGET_PAGE_MASK;
- if (!qemu_cpu_is_self(cpu)) {
- async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_work,
- RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+ if (qemu_cpu_is_self(cpu)) {
+ tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap);
+ } else if (idxmap < TARGET_PAGE_SIZE) {
+ /*
+ * Most targets have only a few mmu_idx. In the case where
+ * we can stuff idxmap into the low TARGET_PAGE_BITS, avoid
+ * allocating memory for this operation.
+ */
+ async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_1,
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
} else {
- tlb_flush_page_by_mmuidx_async_work(
- cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+ TLBFlushPageByMMUIdxData *d = g_new(TLBFlushPageByMMUIdxData, 1);
+
+ /* Otherwise allocate a structure, freed by the worker. */
+ d->addr = addr;
+ d->idxmap = idxmap;
+ async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_2,
+ RUN_ON_CPU_HOST_PTR(d));
}
}
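    /*
     * The fast path above works because the flush address is page aligned, so
     * a small idxmap can ride in the page-offset bits of one pointer-sized
     * value. A self-contained sketch of the encode/decode round trip, assuming
     * an illustrative 12-bit page size (TARGET_PAGE_BITS is target-dependent):
     */
    #include <assert.h>
    #include <stdint.h>

    #define TARGET_PAGE_BITS 12                        /* assumed for illustration */
    #define TARGET_PAGE_SIZE (UINT64_C(1) << TARGET_PAGE_BITS)
    #define TARGET_PAGE_MASK (~(TARGET_PAGE_SIZE - 1))

    int main(void)
    {
        uint64_t addr = 0x7f321000;        /* already page aligned by the caller */
        uint16_t idxmap = 0x0005;          /* flush mmu_idx 0 and 2 */

        assert(idxmap < TARGET_PAGE_SIZE); /* precondition for the _async_1 path */
        uint64_t encoded = addr | idxmap;  /* what RUN_ON_CPU_TARGET_PTR carries */

        /* tlb_flush_page_by_mmuidx_async_1() splits the fields back apart: */
        assert((encoded & TARGET_PAGE_MASK) == addr);
        assert((encoded & ~TARGET_PAGE_MASK) == idxmap);
        return 0;
    }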
@@ -504,17 +568,36 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr)
void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
uint16_t idxmap)
{
- const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
- target_ulong addr_and_mmu_idx;
-
tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
/* This should already be page aligned */
- addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
- addr_and_mmu_idx |= idxmap;
+ addr &= TARGET_PAGE_MASK;
- flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
- fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+ /*
+ * Allocate memory to hold addr+idxmap only when needed.
+ * See tlb_flush_page_by_mmuidx for details.
+ */
+ if (idxmap < TARGET_PAGE_SIZE) {
+ flush_all_helper(src_cpu, tlb_flush_page_by_mmuidx_async_1,
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
+ } else {
+ CPUState *dst_cpu;
+
+ /* Allocate a separate data block for each destination cpu. */
+ CPU_FOREACH(dst_cpu) {
+ if (dst_cpu != src_cpu) {
+ TLBFlushPageByMMUIdxData *d
+ = g_new(TLBFlushPageByMMUIdxData, 1);
+
+ d->addr = addr;
+ d->idxmap = idxmap;
+ async_run_on_cpu(dst_cpu, tlb_flush_page_by_mmuidx_async_2,
+ RUN_ON_CPU_HOST_PTR(d));
+ }
+ }
+ }
+
+ tlb_flush_page_by_mmuidx_async_0(src_cpu, addr, idxmap);
}
void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
@@ -526,17 +609,41 @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
target_ulong addr,
uint16_t idxmap)
{
- const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
- target_ulong addr_and_mmu_idx;
-
tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
/* This should already be page aligned */
- addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
- addr_and_mmu_idx |= idxmap;
+ addr &= TARGET_PAGE_MASK;
+
+ /*
+ * Allocate memory to hold addr+idxmap only when needed.
+ * See tlb_flush_page_by_mmuidx for details.
+ */
+ if (idxmap < TARGET_PAGE_SIZE) {
+ flush_all_helper(src_cpu, tlb_flush_page_by_mmuidx_async_1,
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
+ async_safe_run_on_cpu(src_cpu, tlb_flush_page_by_mmuidx_async_1,
+ RUN_ON_CPU_TARGET_PTR(addr | idxmap));
+ } else {
+ CPUState *dst_cpu;
+ TLBFlushPageByMMUIdxData *d;
+
+ /* Allocate a separate data block for each destination cpu. */
+ CPU_FOREACH(dst_cpu) {
+ if (dst_cpu != src_cpu) {
+ d = g_new(TLBFlushPageByMMUIdxData, 1);
+ d->addr = addr;
+ d->idxmap = idxmap;
+ async_run_on_cpu(dst_cpu, tlb_flush_page_by_mmuidx_async_2,
+ RUN_ON_CPU_HOST_PTR(d));
+ }
+ }
- flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
- async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+ d = g_new(TLBFlushPageByMMUIdxData, 1);
+ d->addr = addr;
+ d->idxmap = idxmap;
+ async_safe_run_on_cpu(src_cpu, tlb_flush_page_by_mmuidx_async_2,
+ RUN_ON_CPU_HOST_PTR(d));
+ }
}
void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr)
@@ -620,7 +727,7 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
qemu_spin_lock(&env_tlb(env)->c.lock);
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
unsigned int i;
- unsigned int n = tlb_n_entries(env, mmu_idx);
+ unsigned int n = tlb_n_entries(&env_tlb(env)->f[mmu_idx]);
for (i = 0; i < n; i++) {
tlb_reset_dirty_range_locked(&env_tlb(env)->f[mmu_idx].table[i],
@@ -1626,6 +1733,137 @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
}
/*
+ * Load helpers for cpu_ldst.h.
+ */
+
+static inline uint64_t cpu_load_helper(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t retaddr,
+ MemOp op, FullLoadHelper *full_load)
+{
+ uint16_t meminfo;
+ TCGMemOpIdx oi;
+ uint64_t ret;
+
+ meminfo = trace_mem_get_info(op, mmu_idx, false);
+ trace_guest_mem_before_exec(env_cpu(env), addr, meminfo);
+
+ op &= ~MO_SIGN;
+ oi = make_memop_idx(op, mmu_idx);
+ ret = full_load(env, addr, oi, retaddr);
+
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo);
+
+ return ret;
+}
+
+uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_load_helper(env, addr, mmu_idx, ra, MO_UB, full_ldub_mmu);
+}
+
+int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return (int8_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_SB,
+ full_ldub_mmu);
+}
+
+uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUW,
+ MO_TE == MO_LE
+ ? full_le_lduw_mmu : full_be_lduw_mmu);
+}
+
+int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return (int16_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_TESW,
+ MO_TE == MO_LE
+ ? full_le_lduw_mmu : full_be_lduw_mmu);
+}
+
+uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUL,
+ MO_TE == MO_LE
+ ? full_le_ldul_mmu : full_be_ldul_mmu);
+}
+
+uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEQ,
+ MO_TE == MO_LE
+ ? helper_le_ldq_mmu : helper_be_ldq_mmu);
+}
+
+uint32_t cpu_ldub_data_ra(CPUArchState *env, target_ulong ptr,
+ uintptr_t retaddr)
+{
+ return cpu_ldub_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+int cpu_ldsb_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+ return cpu_ldsb_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint32_t cpu_lduw_data_ra(CPUArchState *env, target_ulong ptr,
+ uintptr_t retaddr)
+{
+ return cpu_lduw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+int cpu_ldsw_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+ return cpu_ldsw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint32_t cpu_ldl_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+ return cpu_ldl_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint64_t cpu_ldq_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr)
+{
+ return cpu_ldq_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr);
+}
+
+uint32_t cpu_ldub_data(CPUArchState *env, target_ulong ptr)
+{
+ return cpu_ldub_data_ra(env, ptr, 0);
+}
+
+int cpu_ldsb_data(CPUArchState *env, target_ulong ptr)
+{
+ return cpu_ldsb_data_ra(env, ptr, 0);
+}
+
+uint32_t cpu_lduw_data(CPUArchState *env, target_ulong ptr)
+{
+ return cpu_lduw_data_ra(env, ptr, 0);
+}
+
+int cpu_ldsw_data(CPUArchState *env, target_ulong ptr)
+{
+ return cpu_ldsw_data_ra(env, ptr, 0);
+}
+
+uint32_t cpu_ldl_data(CPUArchState *env, target_ulong ptr)
+{
+ return cpu_ldl_data_ra(env, ptr, 0);
+}
+
+uint64_t cpu_ldq_data(CPUArchState *env, target_ulong ptr)
+{
+ return cpu_ldq_data_ra(env, ptr, 0);
+}
+
+/*
* Store Helpers
*/
@@ -1854,6 +2092,94 @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
store_helper(env, addr, val, oi, retaddr, MO_BEQ);
}
+/*
+ * Store Helpers for cpu_ldst.h
+ */
+
+static inline void QEMU_ALWAYS_INLINE
+cpu_store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+ int mmu_idx, uintptr_t retaddr, MemOp op)
+{
+ TCGMemOpIdx oi;
+ uint16_t meminfo;
+
+ meminfo = trace_mem_get_info(op, mmu_idx, true);
+ trace_guest_mem_before_exec(env_cpu(env), addr, meminfo);
+
+ oi = make_memop_idx(op, mmu_idx);
+ store_helper(env, addr, val, oi, retaddr, op);
+
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo);
+}
+
+void cpu_stb_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+ int mmu_idx, uintptr_t retaddr)
+{
+ cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_UB);
+}
+
+void cpu_stw_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+ int mmu_idx, uintptr_t retaddr)
+{
+ cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUW);
+}
+
+void cpu_stl_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val,
+ int mmu_idx, uintptr_t retaddr)
+{
+ cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUL);
+}
+
+void cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val,
+ int mmu_idx, uintptr_t retaddr)
+{
+ cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEQ);
+}
+
+void cpu_stb_data_ra(CPUArchState *env, target_ulong ptr,
+ uint32_t val, uintptr_t retaddr)
+{
+ cpu_stb_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stw_data_ra(CPUArchState *env, target_ulong ptr,
+ uint32_t val, uintptr_t retaddr)
+{
+ cpu_stw_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stl_data_ra(CPUArchState *env, target_ulong ptr,
+ uint32_t val, uintptr_t retaddr)
+{
+ cpu_stl_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stq_data_ra(CPUArchState *env, target_ulong ptr,
+ uint64_t val, uintptr_t retaddr)
+{
+ cpu_stq_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr);
+}
+
+void cpu_stb_data(CPUArchState *env, target_ulong ptr, uint32_t val)
+{
+ cpu_stb_data_ra(env, ptr, val, 0);
+}
+
+void cpu_stw_data(CPUArchState *env, target_ulong ptr, uint32_t val)
+{
+ cpu_stw_data_ra(env, ptr, val, 0);
+}
+
+void cpu_stl_data(CPUArchState *env, target_ulong ptr, uint32_t val)
+{
+ cpu_stl_data_ra(env, ptr, val, 0);
+}
+
+void cpu_stq_data(CPUArchState *env, target_ulong ptr, uint64_t val)
+{
+ cpu_stq_data_ra(env, ptr, val, 0);
+}
+
/* First set of helpers allows passing in of OI and RETADDR. This makes
them callable from other helpers. */
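    /*
     * Usage pattern for the cpu_*_mmuidx_ra / cpu_*_data_ra / cpu_*_data entry
     * points defined above. The helper below is hypothetical (its name and
     * semantics are invented for illustration); GETPC() and the cpu_ldst calls
     * are the interfaces this series provides, and the snippet assumes the
     * usual QEMU target-helper environment rather than being standalone.
     */
    void helper_example_copy_word(CPUArchState *env, target_ulong dst,
                                  target_ulong src)
    {
        uintptr_t ra = GETPC();            /* host return address for unwinding */

        /* Load and store through the current data mmu index: */
        uint32_t val = cpu_ldl_data_ra(env, src, ra);
        cpu_stl_data_ra(env, dst, val, ra);

        /* Or name an mmu index explicitly (index 0 used purely as an example): */
        (void)cpu_ldl_mmuidx_ra(env, src, 0, ra);
    }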
@@ -1912,98 +2238,50 @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
/* Code access functions. */
-static uint64_t full_ldub_cmmu(CPUArchState *env, target_ulong addr,
+static uint64_t full_ldub_code(CPUArchState *env, target_ulong addr,
TCGMemOpIdx oi, uintptr_t retaddr)
{
- return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_cmmu);
-}
-
-uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return full_ldub_cmmu(env, addr, oi, retaddr);
-}
-
-int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return (int8_t) full_ldub_cmmu(env, addr, oi, retaddr);
-}
-
-static uint64_t full_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return load_helper(env, addr, oi, retaddr, MO_LEUW, true,
- full_le_lduw_cmmu);
-}
-
-uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return full_le_lduw_cmmu(env, addr, oi, retaddr);
+ return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_code);
}
-int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr)
{
- return (int16_t) full_le_lduw_cmmu(env, addr, oi, retaddr);
+ TCGMemOpIdx oi = make_memop_idx(MO_UB, cpu_mmu_index(env, true));
+ return full_ldub_code(env, addr, oi, 0);
}
-static uint64_t full_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return load_helper(env, addr, oi, retaddr, MO_BEUW, true,
- full_be_lduw_cmmu);
-}
-
-uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return full_be_lduw_cmmu(env, addr, oi, retaddr);
-}
-
-int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
-{
- return (int16_t) full_be_lduw_cmmu(env, addr, oi, retaddr);
-}
-
-static uint64_t full_le_ldul_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+static uint64_t full_lduw_code(CPUArchState *env, target_ulong addr,
+ TCGMemOpIdx oi, uintptr_t retaddr)
{
- return load_helper(env, addr, oi, retaddr, MO_LEUL, true,
- full_le_ldul_cmmu);
+ return load_helper(env, addr, oi, retaddr, MO_TEUW, true, full_lduw_code);
}
-uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr)
{
- return full_le_ldul_cmmu(env, addr, oi, retaddr);
+ TCGMemOpIdx oi = make_memop_idx(MO_TEUW, cpu_mmu_index(env, true));
+ return full_lduw_code(env, addr, oi, 0);
}
-static uint64_t full_be_ldul_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+static uint64_t full_ldl_code(CPUArchState *env, target_ulong addr,
+ TCGMemOpIdx oi, uintptr_t retaddr)
{
- return load_helper(env, addr, oi, retaddr, MO_BEUL, true,
- full_be_ldul_cmmu);
+ return load_helper(env, addr, oi, retaddr, MO_TEUL, true, full_ldl_code);
}
-uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr)
{
- return full_be_ldul_cmmu(env, addr, oi, retaddr);
+ TCGMemOpIdx oi = make_memop_idx(MO_TEUL, cpu_mmu_index(env, true));
+ return full_ldl_code(env, addr, oi, 0);
}
-uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+static uint64_t full_ldq_code(CPUArchState *env, target_ulong addr,
+ TCGMemOpIdx oi, uintptr_t retaddr)
{
- return load_helper(env, addr, oi, retaddr, MO_LEQ, true,
- helper_le_ldq_cmmu);
+ return load_helper(env, addr, oi, retaddr, MO_TEQ, true, full_ldq_code);
}
-uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr)
+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr)
{
- return load_helper(env, addr, oi, retaddr, MO_BEQ, true,
- helper_be_ldq_cmmu);
+ TCGMemOpIdx oi = make_memop_idx(MO_TEQ, cpu_mmu_index(env, true));
+ return full_ldq_code(env, addr, oi, 0);
}
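With the helper_*_cmmu entry points folded away, instruction fetches go through the cpu_ld*_code() helpers above, which build the TCGMemOpIdx themselves from cpu_mmu_index(env, true). A hypothetical fetch routine (the function itself is invented for illustration and assumes the usual QEMU translator environment) shows the call shape:

    /* Fetch one 32-bit instruction word for a hypothetical fixed-width ISA. */
    static uint32_t example_fetch_insn(CPUArchState *env, target_ulong pc)
    {
        /* Target endianness and the "code" mmu index are handled internally. */
        return cpu_ldl_code(env, pc);
    }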
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 51cb29ca79..5b1902d591 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -21,7 +21,7 @@
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
-#include "tcg-gvec-desc.h"
+#include "tcg/tcg-gvec-desc.h"
/* Virtually all hosts support 16-byte vectors. Those that don't can emulate
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index 8a1e408e31..446465a09a 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -30,6 +30,7 @@
#include "exec/tb-lookup.h"
#include "disas/disas.h"
#include "exec/log.h"
+#include "tcg/tcg.h"
/* 32-bit helpers */
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index bb325a2bc4..a08ab11f65 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -25,7 +25,7 @@
#include "trace.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#if defined(CONFIG_USER_ONLY)
#include "qemu.h"
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index b09f7a1577..4be78eb9b3 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -20,12 +20,14 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "qemu/bitops.h"
#include "exec/cpu_ldst.h"
#include "translate-all.h"
#include "exec/helper-proto.h"
#include "qemu/atomic128.h"
+#include "trace-root.h"
+#include "trace/mem.h"
#undef EAX
#undef ECX
@@ -734,6 +736,240 @@ int cpu_signal_handler(int host_signum, void *pinfo,
/* The softmmu versions of these helpers are in cputlb.c. */
+uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr)
+{
+ uint32_t ret;
+ uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, false);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ ret = ldub_p(g2h(ptr));
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+ return ret;
+}
+
+int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr)
+{
+ int ret;
+ uint16_t meminfo = trace_mem_get_info(MO_SB, MMU_USER_IDX, false);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ ret = ldsb_p(g2h(ptr));
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+ return ret;
+}
+
+uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr)
+{
+ uint32_t ret;
+ uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, false);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ ret = lduw_p(g2h(ptr));
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+ return ret;
+}
+
+int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr)
+{
+ int ret;
+ uint16_t meminfo = trace_mem_get_info(MO_TESW, MMU_USER_IDX, false);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ ret = ldsw_p(g2h(ptr));
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+ return ret;
+}
+
+uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr)
+{
+ uint32_t ret;
+ uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, false);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ ret = ldl_p(g2h(ptr));
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+ return ret;
+}
+
+uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr)
+{
+ uint64_t ret;
+ uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, false);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ ret = ldq_p(g2h(ptr));
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+ return ret;
+}
+
+uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+ uint32_t ret;
+
+ set_helper_retaddr(retaddr);
+ ret = cpu_ldub_data(env, ptr);
+ clear_helper_retaddr();
+ return ret;
+}
+
+int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+ int ret;
+
+ set_helper_retaddr(retaddr);
+ ret = cpu_ldsb_data(env, ptr);
+ clear_helper_retaddr();
+ return ret;
+}
+
+uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+ uint32_t ret;
+
+ set_helper_retaddr(retaddr);
+ ret = cpu_lduw_data(env, ptr);
+ clear_helper_retaddr();
+ return ret;
+}
+
+int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+ int ret;
+
+ set_helper_retaddr(retaddr);
+ ret = cpu_ldsw_data(env, ptr);
+ clear_helper_retaddr();
+ return ret;
+}
+
+uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+ uint32_t ret;
+
+ set_helper_retaddr(retaddr);
+ ret = cpu_ldl_data(env, ptr);
+ clear_helper_retaddr();
+ return ret;
+}
+
+uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr)
+{
+ uint64_t ret;
+
+ set_helper_retaddr(retaddr);
+ ret = cpu_ldq_data(env, ptr);
+ clear_helper_retaddr();
+ return ret;
+}
+
+void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
+{
+ uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, true);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ stb_p(g2h(ptr), val);
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
+{
+ uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, true);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ stw_p(g2h(ptr), val);
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val)
+{
+ uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, true);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ stl_p(g2h(ptr), val);
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val)
+{
+ uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, true);
+
+ trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
+ stq_p(g2h(ptr), val);
+ qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
+}
+
+void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint32_t val, uintptr_t retaddr)
+{
+ set_helper_retaddr(retaddr);
+ cpu_stb_data(env, ptr, val);
+ clear_helper_retaddr();
+}
+
+void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint32_t val, uintptr_t retaddr)
+{
+ set_helper_retaddr(retaddr);
+ cpu_stw_data(env, ptr, val);
+ clear_helper_retaddr();
+}
+
+void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint32_t val, uintptr_t retaddr)
+{
+ set_helper_retaddr(retaddr);
+ cpu_stl_data(env, ptr, val);
+ clear_helper_retaddr();
+}
+
+void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint64_t val, uintptr_t retaddr)
+{
+ set_helper_retaddr(retaddr);
+ cpu_stq_data(env, ptr, val);
+ clear_helper_retaddr();
+}
+
+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr ptr)
+{
+ uint32_t ret;
+
+ set_helper_retaddr(1);
+ ret = ldub_p(g2h(ptr));
+ clear_helper_retaddr();
+ return ret;
+}
+
+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr ptr)
+{
+ uint32_t ret;
+
+ set_helper_retaddr(1);
+ ret = lduw_p(g2h(ptr));
+ clear_helper_retaddr();
+ return ret;
+}
+
+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr ptr)
+{
+ uint32_t ret;
+
+ set_helper_retaddr(1);
+ ret = ldl_p(g2h(ptr));
+ clear_helper_retaddr();
+ return ret;
+}
+
+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr)
+{
+ uint64_t ret;
+
+ set_helper_retaddr(1);
+ ret = ldq_p(g2h(ptr));
+ clear_helper_retaddr();
+ return ret;
+}
+
/* Do not allow unaligned operations to proceed. Return the host address. */
static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
int size, uintptr_t retaddr)
diff --git a/backends/dbus-vmstate.c b/backends/dbus-vmstate.c
index 56b482a7d6..cc594a722e 100644
--- a/backends/dbus-vmstate.c
+++ b/backends/dbus-vmstate.c
@@ -412,7 +412,8 @@ dbus_vmstate_complete(UserCreatable *uc, Error **errp)
return;
}
- if (vmstate_register(VMSTATE_IF(self), -1, &dbus_vmstate, self) < 0) {
+ if (vmstate_register(VMSTATE_IF(self), VMSTATE_INSTANCE_ID_ANY,
+ &dbus_vmstate, self) < 0) {
error_setg(errp, "Failed to register vmstate");
}
}
diff --git a/bsd-user/main.c b/bsd-user/main.c
index 7f4e3cd627..770c2b267a 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -33,7 +33,7 @@
#include "qemu/module.h"
#include "cpu.h"
#include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "qemu/timer.h"
#include "qemu/envlist.h"
#include "exec/log.h"
diff --git a/configure b/configure
index 08c3a1c1f0..6ab028dd0d 100755
--- a/configure
+++ b/configure
@@ -4761,6 +4761,12 @@ if compile_prog "" "" ; then
syncfs=yes
fi
+# check for kcov support (kernel must be 4.4+, compiled with certain options)
+kcov=no
+if check_include sys/kcov.h ; then
+ kcov=yes
+fi
+
# Check we have a new enough version of sphinx-build
has_sphinx_build() {
# This is a bit awkward but works: create a trivial document and
@@ -5191,6 +5197,19 @@ if compile_prog "" "" ; then
strchrnul=yes
fi
+#########################################
+# check if we have st_atim
+
+st_atim=no
+cat > $TMPC << EOF
+#include <sys/stat.h>
+#include <stddef.h>
+int main(void) { return offsetof(struct stat, st_atim); }
+EOF
+if compile_prog "" "" ; then
+ st_atim=yes
+fi
+
##########################################
# check if trace backend exists
@@ -6874,6 +6893,9 @@ fi
if test "$syncfs" = "yes" ; then
echo "CONFIG_SYNCFS=y" >> $config_host_mak
fi
+if test "$kcov" = "yes" ; then
+ echo "CONFIG_KCOV=y" >> $config_host_mak
+fi
if test "$inotify" = "yes" ; then
echo "CONFIG_INOTIFY=y" >> $config_host_mak
fi
@@ -6886,6 +6908,9 @@ fi
if test "$strchrnul" = "yes" ; then
echo "HAVE_STRCHRNUL=y" >> $config_host_mak
fi
+if test "$st_atim" = "yes" ; then
+ echo "HAVE_STRUCT_STAT_ST_ATIM=y" >> $config_host_mak
+fi
if test "$byteswap_h" = "yes" ; then
echo "CONFIG_BYTESWAP_H=y" >> $config_host_mak
fi
@@ -7437,7 +7462,6 @@ elif test "$ARCH" = "riscv32" || test "$ARCH" = "riscv64" ; then
else
QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg/\$(ARCH) $QEMU_INCLUDES"
fi
-QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg $QEMU_INCLUDES"
echo "TOOLS=$tools" >> $config_host_mak
echo "ROMS=$roms" >> $config_host_mak
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index ec27b78ff1..b89bf18501 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -392,26 +392,37 @@ vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
return vu_message_write(dev, conn_fd, vmsg);
}
+/*
+ * Processes a reply on the slave channel.
+ * Entered with slave_mutex held and releases it before exit.
+ * Returns true on success.
+ */
static bool
vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg)
{
VhostUserMsg msg_reply;
+ bool result = false;
if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
- return true;
+ result = true;
+ goto out;
}
if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) {
- return false;
+ goto out;
}
if (msg_reply.request != vmsg->request) {
DPRINT("Received unexpected msg type. Expected %d received %d",
vmsg->request, msg_reply.request);
- return false;
+ goto out;
}
- return msg_reply.payload.u64 == 0;
+ result = msg_reply.payload.u64 == 0;
+
+out:
+ pthread_mutex_unlock(&dev->slave_mutex);
+ return result;
}
/* Kick the log_call_fd if required. */
@@ -554,6 +565,21 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
}
static bool
+map_ring(VuDev *dev, VuVirtq *vq)
+{
+ vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
+ vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
+ vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
+
+ DPRINT("Setting virtq addresses:\n");
+ DPRINT(" vring_desc at %p\n", vq->vring.desc);
+ DPRINT(" vring_used at %p\n", vq->vring.used);
+ DPRINT(" vring_avail at %p\n", vq->vring.avail);
+
+ return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
+}
+
+static bool
vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
{
int i;
@@ -756,6 +782,14 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
close(vmsg->fds[i]);
}
+ for (i = 0; i < dev->max_queues; i++) {
+ if (dev->vq[i].vring.desc) {
+ if (map_ring(dev, &dev->vq[i])) {
+ vu_panic(dev, "remapping queue %d during setmemtable", i);
+ }
+ }
+ }
+
return false;
}
@@ -842,18 +876,12 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", vra->avail_user_addr);
DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", vra->log_guest_addr);
+ vq->vra = *vra;
vq->vring.flags = vra->flags;
- vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
- vq->vring.used = qva_to_va(dev, vra->used_user_addr);
- vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
vq->vring.log_guest_addr = vra->log_guest_addr;
- DPRINT("Setting virtq addresses:\n");
- DPRINT(" vring_desc at %p\n", vq->vring.desc);
- DPRINT(" vring_used at %p\n", vq->vring.used);
- DPRINT(" vring_avail at %p\n", vq->vring.avail);
- if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
+ if (map_ring(dev, vq)) {
vu_panic(dev, "Invalid vring_addr message");
return false;
}
@@ -1105,10 +1133,13 @@ bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd,
return false;
}
+ pthread_mutex_lock(&dev->slave_mutex);
if (!vu_message_write(dev, dev->slave_fd, &vmsg)) {
+ pthread_mutex_unlock(&dev->slave_mutex);
return false;
}
+ /* Also unlocks the slave_mutex */
return vu_process_message_reply(dev, &vmsg);
}
@@ -1628,6 +1659,7 @@ vu_deinit(VuDev *dev)
close(dev->slave_fd);
dev->slave_fd = -1;
}
+ pthread_mutex_destroy(&dev->slave_mutex);
if (dev->sock != -1) {
close(dev->sock);
@@ -1663,6 +1695,7 @@ vu_init(VuDev *dev,
dev->remove_watch = remove_watch;
dev->iface = iface;
dev->log_call_fd = -1;
+ pthread_mutex_init(&dev->slave_mutex, NULL);
dev->slave_fd = -1;
dev->max_queues = max_queues;
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 46b600799b..5cb7708559 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -19,6 +19,7 @@
#include <stddef.h>
#include <sys/poll.h>
#include <linux/vhost.h>
+#include <pthread.h>
#include "standard-headers/linux/virtio_ring.h"
/* Based on qemu/hw/virtio/vhost-user.c */
@@ -326,6 +327,9 @@ typedef struct VuVirtq {
int err_fd;
unsigned int enable;
bool started;
+
+ /* Guest addresses of our ring */
+ struct vhost_vring_addr vra;
} VuVirtq;
enum VuWatchCondtion {
@@ -355,6 +359,8 @@ struct VuDev {
VuVirtq *vq;
VuDevInflightInfo inflight_info;
int log_call_fd;
+ /* Must be held while using slave_fd */
+ pthread_mutex_t slave_mutex;
int slave_fd;
uint64_t log_size;
uint8_t *log_table;
diff --git a/cpus.c b/cpus.c
index be2d655f37..b612116f95 100644
--- a/cpus.c
+++ b/cpus.c
@@ -53,7 +53,7 @@
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qemu/guest-random.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"
#include "sysemu/runstate.h"
diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak
index 1f2e0e7fde..645e6201bb 100644
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@@ -30,6 +30,7 @@ CONFIG_Z2=y
CONFIG_COLLIE=y
CONFIG_ASPEED_SOC=y
CONFIG_NETDUINO2=y
+CONFIG_NETDUINOPLUS2=y
CONFIG_MPS2=y
CONFIG_RASPI=y
CONFIG_DIGIC=y
diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
index c74cd090e6..03aa9e7ff8 100644
--- a/docs/devel/loads-stores.rst
+++ b/docs/devel/loads-stores.rst
@@ -72,31 +72,66 @@ Regexes for git grep
- ``\<ldn_\([hbl]e\)?_p\>``
- ``\<stn_\([hbl]e\)?_p\>``
-``cpu_{ld,st}_*``
-~~~~~~~~~~~~~~~~~
+``cpu_{ld,st}*_mmuidx_ra``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions operate on a guest virtual address plus a context,
+known as an "mmu index" or ``mmuidx``, which controls how that virtual
+address is translated. The meaning of the indexes is target-specific,
+but specifying a particular index might be necessary if, for instance,
+the helper requires an "always as non-privileged" access rather than
+the default access for the current state of the guest CPU.
+
+These functions may cause a guest CPU exception to be taken
+(e.g. for an alignment fault or MMU fault) which will result in
+guest CPU state being updated and control longjmp'ing out of the
+function call. They should therefore only be used in code that is
+implementing emulation of the guest CPU.
+
+The ``retaddr`` parameter is used to control unwinding of the
+guest CPU state in case of a guest CPU exception. This is passed
+to ``cpu_restore_state()``. Therefore the value should either be 0,
+to indicate that the guest CPU state is already synchronized, or
+the result of ``GETPC()`` from the top level ``HELPER(foo)``
+function, which is a return address into the generated code.
+
+Function names follow the pattern:
+
+load: ``cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmuidx, retaddr)``
+
+store: ``cpu_st{size}_mmuidx_ra(env, ptr, val, mmuidx, retaddr)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
+
+Regexes for git grep:
+ - ``\<cpu_ld[us]\?[bwlq]_mmuidx_ra\>``
+ - ``\<cpu_st[bwlq]_mmuidx_ra\>``
+
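
As a hedged illustration (a hypothetical helper, not taken from QEMU), an "always as non-privileged" load might look like this; the mmu index value is target-specific and purely illustrative::

    /* MMU_USER_IDX stands in for whatever unprivileged index the target defines. */
    uint32_t HELPER(unpriv_ldl)(CPUArchState *env, target_ulong addr)
    {
        return cpu_ldl_mmuidx_ra(env, addr, MMU_USER_IDX, GETPC());
    }
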
+``cpu_{ld,st}*_data_ra``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions work like the ``cpu_{ld,st}*_mmuidx_ra`` functions
+except that the ``mmuidx`` parameter is taken from the current mode
+of the guest CPU, as determined by ``cpu_mmu_index(env, false)``.
-These functions operate on a guest virtual address. Be aware
-that these functions may cause a guest CPU exception to be
-taken (e.g. for an alignment fault or MMU fault) which will
-result in guest CPU state being updated and control longjumping
-out of the function call. They should therefore only be used
-in code that is implementing emulation of the target CPU.
-
-These functions may throw an exception (longjmp() back out
-to the top level TCG loop). This means they must only be used
-from helper functions where the translator has saved all
-necessary CPU state before generating the helper function call.
-It's usually better to use the ``_ra`` variants described below
-from helper functions, but these functions are the right choice
-for calls made from hooks like the CPU do_interrupt hook or
-when you know for certain that the translator had to save all
-the CPU state that ``cpu_restore_state()`` would restore anyway.
+These are generally the preferred way to do accesses by guest
+virtual address from helper functions, unless the access should
+be performed with a context other than the default.
Function names follow the pattern:
-load: ``cpu_ld{sign}{size}_{mmusuffix}(env, ptr)``
+load: ``cpu_ld{sign}{size}_data_ra(env, ptr, ra)``
-store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)``
+store: ``cpu_st{size}_data_ra(env, ptr, val, ra)``
``sign``
- (empty) : for 32 or 64 bit sizes
@@ -109,56 +144,119 @@ store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)``
- ``l`` : 32 bits
- ``q`` : 64 bits
-``mmusuffix`` is one of the generic suffixes ``data`` or ``code``, or
-(for softmmu configs) a target-specific MMU mode suffix as defined
-in the target's ``cpu.h``.
+Regexes for git grep:
+ - ``\<cpu_ld[us]\?[bwlq]_data_ra\>``
+ - ``\<cpu_st[bwlq]_data_ra\>``
+
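
A minimal sketch of the ``_data_ra`` pattern (hypothetical helper; only the call shape follows the documentation above)::

    void HELPER(store_pair)(CPUArchState *env, target_ulong addr,
                            uint32_t lo, uint32_t hi)
    {
        uintptr_t ra = GETPC();   /* capture once, in the top-level helper */

        cpu_stl_data_ra(env, addr, lo, ra);
        cpu_stl_data_ra(env, addr + 4, hi, ra);
    }
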
+``cpu_{ld,st}*_data``
+~~~~~~~~~~~~~~~~~~~~~
+
+These functions work like the ``cpu_{ld,st}*_data_ra`` functions
+except that the ``retaddr`` parameter is 0, and thus a fault does
+not unwind guest CPU state.
+
+This means they must only be used from helper functions where the
+translator has saved all necessary CPU state. These functions are
+the right choice for calls made from hooks like the CPU ``do_interrupt``
+hook or when you know for certain that the translator had to save all
+the CPU state anyway.
+
+Function names follow the pattern:
+
+load: ``cpu_ld{sign}{size}_data(env, ptr)``
+
+store: ``cpu_st{size}_data(env, ptr, val)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
Regexes for git grep
- - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+\>``
- - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+\>``
+ - ``\<cpu_ld[us]\?[bwlq]_data\>``
+ - ``\<cpu_st[bwlq]_data\>``
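
For example, interrupt-entry code called from a ``do_interrupt`` hook, where CPU state is already synchronized, might use the plain ``_data`` forms (sketch only; ``env->sp`` is a made-up register field)::

    static void push_word(CPUArchState *env, uint32_t value)
    {
        env->sp -= 4;                      /* hypothetical stack pointer */
        cpu_stl_data(env, env->sp, value); /* retaddr 0: no unwinding needed */
    }
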
-``cpu_{ld,st}_*_ra``
-~~~~~~~~~~~~~~~~~~~~
+``cpu_ld*_code``
+~~~~~~~~~~~~~~~~
-These functions work like the ``cpu_{ld,st}_*`` functions except
-that they also take a ``retaddr`` argument. This extra argument
-allows for correct unwinding of any exception that is taken,
-and should generally be the result of GETPC() called directly
-from the top level HELPER(foo) function (i.e. the return address
-in the generated code).
+These functions perform a read for instruction execution. The ``mmuidx``
+parameter is taken from the current mode of the guest CPU, as determined
+by ``cpu_mmu_index(env, true)``. The ``retaddr`` parameter is 0, and
+thus does not unwind guest CPU state, because CPU state is always
+synchronized while translating instructions. Any guest CPU exception
+that is raised will indicate an instruction execution fault rather than
+a data read fault.
-These are generally the preferred way to do accesses by guest
-virtual address from helper functions; see the documentation
-of the non-``_ra`` variants for when those would be better.
+In general these functions should not be used directly during translation.
+Instead, use the ``translator_ld*`` wrapper functions described below,
+which also take care of any tracing plugins.
+
+Function names follow the pattern:
+
+load: ``cpu_ld{sign}{size}_code(env, ptr)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
+
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
+
+Regexes for git grep:
+ - ``\<cpu_ld[us]\?[bwlq]_code\>``
+
+``translator_ld*``
+~~~~~~~~~~~~~~~~~~
-Calling these functions with a ``retaddr`` argument of 0 is
-equivalent to calling the non-``_ra`` version of the function.
+These functions are wrappers for ``cpu_ld*_code`` which also perform
+any actions required by tracing plugins. They are only to be
+called during the translator callback ``translate_insn``.
+
+There is a set of functions ending in ``_swap`` which, if the parameter
+is true, return the value in the endianness that is the reverse of
+the guest's native endianness, as determined by ``TARGET_WORDS_BIGENDIAN``.
Function names follow the pattern:
-load: ``cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)``
+load: ``translator_ld{sign}{size}(env, ptr)``
+
+swap: ``translator_ld{sign}{size}_swap(env, ptr, swap)``
+
+``sign``
+ - (empty) : for 32 or 64 bit sizes
+ - ``u`` : unsigned
+ - ``s`` : signed
-store: ``cpu_st{sign}{size}_{mmusuffix}_ra(env, ptr, val, retaddr)``
+``size``
+ - ``b`` : 8 bits
+ - ``w`` : 16 bits
+ - ``l`` : 32 bits
+ - ``q`` : 64 bits
Regexes for git grep
- - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+_ra\>``
- - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+_ra\>``
+ - ``\<translator_ld[us]\?[bwlq]\(_swap\)\?\>``
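
A hedged sketch of a ``translate_insn`` callback using these wrappers (structure and function names are illustrative; only the load call shapes follow the pattern above)::

    static void my_translate_insn(DisasContextBase *dcbase, CPUState *cs)
    {
        CPUArchState *env = cs->env_ptr;

        /* Fetch one 16-bit instruction unit in guest-native byte order. */
        uint16_t insn = translator_lduw(env, dcbase->pc_next);
        dcbase->pc_next += 2;

        /* Or request the reverse byte order explicitly when needed. */
        uint16_t insn_rev = translator_lduw_swap(env, dcbase->pc_next, true);
        (void)insn;
        (void)insn_rev;
    }
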
-``helper_*_{ld,st}*mmu``
-~~~~~~~~~~~~~~~~~~~~~~~~
+``helper_*_{ld,st}*_mmu``
+~~~~~~~~~~~~~~~~~~~~~~~~~
These functions are intended primarily to be called by the code
generated by the TCG backend. They may also be called by target
-CPU helper function code. Like the ``cpu_{ld,st}_*_ra`` functions
-they perform accesses by guest virtual address; the difference is
-that these functions allow you to specify an ``opindex`` parameter
-which encodes (among other things) the mmu index to use for the
-access. This is necessary if your helper needs to make an access
-via a specific mmu index (for instance, an "always as non-privileged"
-access) rather than using the default mmu index for the current state
-of the guest CPU.
+CPU helper function code. Like the ``cpu_{ld,st}*_mmuidx_ra`` functions
+they perform accesses by guest virtual address, with a given ``mmuidx``.
-The ``opindex`` parameter should be created by calling ``make_memop_idx()``.
+These functions specify an ``opindex`` parameter which encodes
+(among other things) the mmu index to use for the access. This parameter
+should be created by calling ``make_memop_idx()``.
The ``retaddr`` parameter should be the result of GETPC() called directly
from the top level HELPER(foo) function (or 0 if no guest CPU state
@@ -166,13 +264,12 @@ unwinding is required).
**TODO** The names of these functions are a bit odd for historical
reasons because they were originally expected to be called only from
-within generated code. We should rename them to bring them
-more in line with the other memory access functions.
+within generated code. We should rename them to bring them more in
+line with the other memory access functions. The explicit endianness
+is the only feature they have beyond ``*_mmuidx_ra``.
load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)``
-load (code): ``helper_{endian}_ld{sign}{size}_cmmu(env, addr, opindex, retaddr)``
-
store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
``sign``
@@ -192,7 +289,7 @@ store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)``
- ``ret`` : target endianness
Regexes for git grep
- - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_c\?mmu\>``
+ - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_mmu\>``
- ``\<helper_\(le\|be\|ret\)_st[bwlq]_mmu\>``
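
As a non-authoritative sketch, a target helper performing an explicitly little-endian 64-bit load with a caller-supplied mmu index could look like::

    uint64_t HELPER(ldq_le_idx)(CPUArchState *env, target_ulong addr,
                                uint32_t mmu_idx)
    {
        TCGMemOpIdx oi = make_memop_idx(MO_LEQ, mmu_idx);

        return helper_le_ldq_mmu(env, addr, oi, GETPC());
    }
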
``address_space_*``
diff --git a/docs/index.html.in b/docs/index.html.in
index 94eb782cf7..8512933d14 100644
--- a/docs/index.html.in
+++ b/docs/index.html.in
@@ -12,6 +12,7 @@
<li><a href="qemu-ga-ref.html">Guest Agent Protocol Reference</a></li>
<li><a href="interop/index.html">System Emulation Management and Interoperability Guide</a></li>
<li><a href="specs/index.html">System Emulation Guest Hardware Specifications</a></li>
+ <li><a href="system/index.html">System Emulation User's Guide</a></li>
</ul>
</body>
</html>
diff --git a/docs/index.rst b/docs/index.rst
index baa5791c17..46405d4f07 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,4 +13,4 @@ Welcome to QEMU's documentation!
interop/index
devel/index
specs/index
-
+ system/index
diff --git a/docs/interop/conf.py b/docs/interop/conf.py
index e87b8c22be..40b1ad811d 100644
--- a/docs/interop/conf.py
+++ b/docs/interop/conf.py
@@ -18,5 +18,7 @@ html_theme_options['description'] = u'System Emulation Management and Interopera
# (source start file, name, description, authors, manual section).
man_pages = [
('qemu-ga', 'qemu-ga', u'QEMU Guest Agent',
- ['Michael Roth <mdroth@linux.vnet.ibm.com>'], 8)
+ ['Michael Roth <mdroth@linux.vnet.ibm.com>'], 8),
+ ('qemu-nbd', 'qemu-nbd', u'QEMU Disk Network Block Device Server',
+ ['Anthony Liguori <anthony@codemonkey.ws>'], 8)
]
diff --git a/docs/interop/index.rst b/docs/interop/index.rst
index 049387ac6d..c28f7785a5 100644
--- a/docs/interop/index.rst
+++ b/docs/interop/index.rst
@@ -18,5 +18,6 @@ Contents:
live-block-operations
pr-helper
qemu-ga
+ qemu-nbd
vhost-user
vhost-user-gpu
diff --git a/docs/interop/qemu-nbd.rst b/docs/interop/qemu-nbd.rst
new file mode 100644
index 0000000000..873bb9e17d
--- /dev/null
+++ b/docs/interop/qemu-nbd.rst
@@ -0,0 +1,263 @@
+QEMU Disk Network Block Device Server
+=====================================
+
+Synopsis
+--------
+
+**qemu-nbd** [*OPTION*]... *filename*
+
+**qemu-nbd** -L [*OPTION*]...
+
+**qemu-nbd** -d *dev*
+
+Description
+-----------
+
+Export a QEMU disk image using the NBD protocol.
+
+Other uses:
+
+- Bind a /dev/nbdX block device to a QEMU server (on Linux).
+- As a client to query exports of a remote NBD server.
+
+Options
+-------
+
+.. program:: qemu-nbd
+
+*filename* is a disk image filename, or a set of block
+driver options if ``--image-opts`` is specified.
+
+*dev* is an NBD device.
+
+.. option:: --object type,id=ID,...props...
+
+ Define a new instance of the *type* object class identified by *ID*.
+ See the :manpage:`qemu(1)` manual page for full details of the properties
+ supported. The common object types that it makes sense to define are the
+ ``secret`` object, which is used to supply passwords and/or encryption
+ keys, and the ``tls-creds`` object, which is used to supply TLS
+ credentials for the qemu-nbd server or client.
+
+.. option:: -p, --port=PORT
+
+ TCP port to listen on as a server, or connect to as a client
+ (default ``10809``).
+
+.. option:: -o, --offset=OFFSET
+
+ The offset into the image.
+
+.. option:: -b, --bind=IFACE
+
+ The interface to bind to as a server, or connect to as a client
+ (default ``0.0.0.0``).
+
+.. option:: -k, --socket=PATH
+
+ Use a unix socket with path *PATH*.
+
+.. option:: --image-opts
+
+ Treat *filename* as a set of image options, instead of a plain
+ filename. If this flag is specified, the ``-f`` flag should
+ not be used; instead the :option:`format=` option should be set.
+
+.. option:: -f, --format=FMT
+
+ Force the use of the block driver for format *FMT* instead of
+ auto-detecting.
+
+.. option:: -r, --read-only
+
+ Export the disk as read-only.
+
+.. option:: -P, --partition=NUM
+
+ Deprecated: Only expose MBR partition *NUM*. Understands physical
+ partitions 1-4 and logical partition 5. New code should instead use
+ :option:`--image-opts` with the raw driver wrapping a subset of the
+ original image.
+
+.. option:: -B, --bitmap=NAME
+
+ If *filename* has a qcow2 persistent bitmap *NAME*, expose
+ that bitmap via the ``qemu:dirty-bitmap:NAME`` context
+ accessible through NBD_OPT_SET_META_CONTEXT.
+
+.. option:: -s, --snapshot
+
+ Use *filename* as an external snapshot: create a temporary
+ file with ``backing_file=``\ *filename* and redirect writes to
+ the temporary one.
+
+.. option:: -l, --load-snapshot=SNAPSHOT_PARAM
+
+ Load an internal snapshot inside *filename* and export it
+ as a read-only device. The *SNAPSHOT_PARAM* format is
+ ``snapshot.id=[ID],snapshot.name=[NAME]`` or ``[ID_OR_NAME]``.
+
+.. option:: --cache=CACHE
+
+ The cache mode to be used with the file. See the documentation of
+ the emulator's ``-drive cache=...`` option for allowed values.
+
+.. option:: -n, --nocache
+
+ Equivalent to :option:`--cache=none`.
+
+.. option:: --aio=AIO
+
+ Set the asynchronous I/O mode between ``threads`` (the default)
+ and ``native`` (Linux only).
+
+.. option:: --discard=DISCARD
+
+ Control whether ``discard`` (also known as ``trim`` or ``unmap``)
+ requests are ignored or passed to the filesystem. *DISCARD* is one of
+ ``ignore`` (or ``off``), ``unmap`` (or ``on``). The default is
+ ``ignore``.
+
+.. option:: --detect-zeroes=DETECT_ZEROES
+
+ Control the automatic conversion of plain zero writes by the OS to
+ driver-specific optimized zero write commands. *DETECT_ZEROES* is one of
+ ``off``, ``on``, or ``unmap``. ``unmap``
+ converts a zero write to an unmap operation and can only be used if
+ *DISCARD* is set to ``unmap``. The default is ``off``.
+
+.. option:: -c, --connect=DEV
+
+ Connect *filename* to NBD device *DEV* (Linux only).
+
+.. option:: -d, --disconnect
+
+ Disconnect the device *DEV* (Linux only).
+
+.. option:: -e, --shared=NUM
+
+ Allow up to *NUM* clients to share the device (default
+ ``1``). Safe for readers, but for now, consistency is not
+ guaranteed between multiple writers.
+
+.. option:: -t, --persistent
+
+ Don't exit on the last connection.
+
+.. option:: -x, --export-name=NAME
+
+ Set the NBD volume export name (default of a zero-length string).
+
+.. option:: -D, --description=DESCRIPTION
+
+ Set the NBD volume export description, as a human-readable
+ string.
+
+.. option:: -L, --list
+
+ Connect as a client and list all details about the exports exposed by
+ a remote NBD server. This enables list mode, and is incompatible
+ with options that change behavior related to a specific export (such as
+ :option:`--export-name`, :option:`--offset`, ...).
+
+.. option:: --tls-creds=ID
+
+ Enable mandatory TLS encryption for the server by setting the ID
+ of the TLS credentials object previously created with the
+ :option:`--object` option; or provide the credentials needed for
+ connecting as a client in list mode.
+
+.. option:: --fork
+
+ Fork off the server process and exit the parent once the server is running.
+
+.. option:: --pid-file=PATH
+
+ Store the server's process ID in the given file.
+
+.. option:: --tls-authz=ID
+
+ Specify the ID of a qauthz object previously created with the
+ :option:`--object` option. This will be used to authorize connecting users
+ against their x509 distinguished name.
+
+.. option:: -v, --verbose
+
+ Display extra debugging information.
+
+.. option:: -h, --help
+
+ Display this help and exit.
+
+.. option:: -V, --version
+
+ Display version information and exit.
+
+.. option:: -T, --trace [[enable=]PATTERN][,events=FILE][,file=FILE]
+
+ .. include:: qemu-option-trace.rst.inc
+
+Examples
+--------
+
+Start a server listening on port 10809 that exposes only the
+guest-visible contents of a qcow2 file, with no TLS encryption, and
+with the default export name (an empty string). The command is
+one-shot, and will block until the first successful client
+disconnects:
+
+::
+
+ qemu-nbd -f qcow2 file.qcow2
+
+Start a long-running server listening with encryption on port 10810,
+and whitelist clients with a specific X.509 certificate to connect to
+a 1 megabyte subset of a raw file, using the export name 'subset':
+
+::
+
+ qemu-nbd \
+ --object tls-creds-x509,id=tls0,endpoint=server,dir=/path/to/qemutls \
+ --object 'authz-simple,id=auth0,identity=CN=laptop.example.com,,\
+ O=Example Org,,L=London,,ST=London,,C=GB' \
+ --tls-creds tls0 --tls-authz auth0 \
+ -t -x subset -p 10810 \
+ --image-opts driver=raw,offset=1M,size=1M,file.driver=file,file.filename=file.raw
+
+Serve a read-only copy of just the first MBR partition of a guest
+image over a Unix socket with as many as 5 simultaneous readers, with
+a persistent process forked as a daemon:
+
+::
+
+ qemu-nbd --fork --persistent --shared=5 --socket=/path/to/sock \
+ --partition=1 --read-only --format=qcow2 file.qcow2
+
+Expose the guest-visible contents of a qcow2 file via a block device
+/dev/nbd0 (and possibly creating /dev/nbd0p1 and friends for
+partitions found within), then disconnect the device when done.
+Binding qemu-nbd to a /dev/nbd device generally requires root
+privileges, and may also require the execution of ``modprobe nbd``
+to enable the kernel NBD client module. *CAUTION*: Do not use
+this method to mount filesystems from an untrusted guest image - a
+malicious guest may have prepared the image to attempt to trigger
+kernel bugs in partition probing or file system mounting.
+
+::
+
+ qemu-nbd -c /dev/nbd0 -f qcow2 file.qcow2
+ qemu-nbd -d /dev/nbd0
+
+Query a remote server to see details about what export(s) it is
+serving on port 10809, authenticating via PSK:
+
+::
+
+ qemu-nbd \
+ --object tls-creds-psk,id=tls0,dir=/tmp/keys,username=eblake,endpoint=client \
+ --tls-creds tls0 -L -b remote.example.com
+
+See also
+--------
+
+:manpage:`qemu(1)`, :manpage:`qemu-img(1)`
diff --git a/docs/interop/qemu-option-trace.rst.inc b/docs/interop/qemu-option-trace.rst.inc
new file mode 100644
index 0000000000..23cfcb4853
--- /dev/null
+++ b/docs/interop/qemu-option-trace.rst.inc
@@ -0,0 +1,30 @@
+..
+ The contents of this file must be kept in sync with qemu-option-trace.texi
+ until all the users of the texi file have been converted to rst and
+ the texi file can be removed.
+
+Specify tracing options.
+
+.. option:: [enable=]PATTERN
+
+ Immediately enable events matching *PATTERN*
+ (either event name or a globbing pattern). This option is only
+ available if QEMU has been compiled with the ``simple``, ``log``
+ or ``ftrace`` tracing backend. To specify multiple events or patterns,
+ specify the :option:`-trace` option multiple times.
+
+ Use :option:`-trace help` to print a list of names of trace points.
+
+.. option:: events=FILE
+
+ Immediately enable events listed in *FILE*.
+ The file must contain one event name (as listed in the ``trace-events-all``
+ file) per line; globbing patterns are accepted too. This option is only
+ available if QEMU has been compiled with the ``simple``, ``log`` or
+ ``ftrace`` tracing backend.
+
+.. option:: file=FILE
+
+ Log output traces to *FILE*.
+ This option is only available if QEMU has been compiled with
+ the ``simple`` tracing backend.
diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json
index ce0ef74db5..ef8ac5941f 100644
--- a/docs/interop/vhost-user.json
+++ b/docs/interop/vhost-user.json
@@ -31,6 +31,7 @@
# @rproc-serial: virtio remoteproc serial link
# @scsi: virtio scsi
# @vsock: virtio vsock transport
+# @fs: virtio fs (since 4.2)
#
# Since: 4.0
##
@@ -50,7 +51,8 @@
'rpmsg',
'rproc-serial',
'scsi',
- 'vsock'
+ 'vsock',
+ 'fs'
]
}
diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi
deleted file mode 100644
index 2c7ea49c32..0000000000
--- a/docs/qemu-block-drivers.texi
+++ /dev/null
@@ -1,889 +0,0 @@
-@c man begin SYNOPSIS
-QEMU block driver reference manual
-@c man end
-
-@set qemu_system qemu-system-x86_64
-
-@c man begin DESCRIPTION
-
-@node disk_images_formats
-@subsection Disk image file formats
-
-QEMU supports many image file formats that can be used with VMs as well as with
-any of the tools (like @code{qemu-img}). This includes the preferred formats
-raw and qcow2 as well as formats that are supported for compatibility with
-older QEMU versions or other hypervisors.
-
-Depending on the image format, different options can be passed to
-@code{qemu-img create} and @code{qemu-img convert} using the @code{-o} option.
-This section describes each format and the options that are supported for it.
-
-@table @option
-@item raw
-
-Raw disk image format. This format has the advantage of
-being simple and easily exportable to all other emulators. If your
-file system supports @emph{holes} (for example in ext2 or ext3 on
-Linux or NTFS on Windows), then only the written sectors will reserve
-space. Use @code{qemu-img info} to know the real size used by the
-image or @code{ls -ls} on Unix/Linux.
-
-Supported options:
-@table @code
-@item preallocation
-Preallocation mode (allowed values: @code{off}, @code{falloc}, @code{full}).
-@code{falloc} mode preallocates space for image by calling posix_fallocate().
-@code{full} mode preallocates space for image by writing data to underlying
-storage. This data may or may not be zero, depending on the storage location.
-@end table
-
-@item qcow2
-QEMU image format, the most versatile format. Use it to have smaller
-images (useful if your filesystem does not supports holes, for example
-on Windows), zlib based compression and support of multiple VM
-snapshots.
-
-Supported options:
-@table @code
-@item compat
-Determines the qcow2 version to use. @code{compat=0.10} uses the
-traditional image format that can be read by any QEMU since 0.10.
-@code{compat=1.1} enables image format extensions that only QEMU 1.1 and
-newer understand (this is the default). Amongst others, this includes
-zero clusters, which allow efficient copy-on-read for sparse images.
-
-@item backing_file
-File name of a base image (see @option{create} subcommand)
-@item backing_fmt
-Image format of the base image
-@item encryption
-This option is deprecated and equivalent to @code{encrypt.format=aes}
-
-@item encrypt.format
-
-If this is set to @code{luks}, it requests that the qcow2 payload (not
-qcow2 header) be encrypted using the LUKS format. The passphrase to
-use to unlock the LUKS key slot is given by the @code{encrypt.key-secret}
-parameter. LUKS encryption parameters can be tuned with the other
-@code{encrypt.*} parameters.
-
-If this is set to @code{aes}, the image is encrypted with 128-bit AES-CBC.
-The encryption key is given by the @code{encrypt.key-secret} parameter.
-This encryption format is considered to be flawed by modern cryptography
-standards, suffering from a number of design problems:
-
-@itemize @minus
-@item The AES-CBC cipher is used with predictable initialization vectors based
-on the sector number. This makes it vulnerable to chosen plaintext attacks
-which can reveal the existence of encrypted data.
-@item The user passphrase is directly used as the encryption key. A poorly
-chosen or short passphrase will compromise the security of the encryption.
-@item In the event of the passphrase being compromised there is no way to
-change the passphrase to protect data in any qcow images. The files must
-be cloned, using a different encryption passphrase in the new file. The
-original file must then be securely erased using a program like shred,
-though even this is ineffective with many modern storage technologies.
-@end itemize
-
-The use of this is no longer supported in system emulators. Support only
-remains in the command line utilities, for the purposes of data liberation
-and interoperability with old versions of QEMU. The @code{luks} format
-should be used instead.
-
-@item encrypt.key-secret
-
-Provides the ID of a @code{secret} object that contains the passphrase
-(@code{encrypt.format=luks}) or encryption key (@code{encrypt.format=aes}).
-
-@item encrypt.cipher-alg
-
-Name of the cipher algorithm and key length. Currently defaults
-to @code{aes-256}. Only used when @code{encrypt.format=luks}.
-
-@item encrypt.cipher-mode
-
-Name of the encryption mode to use. Currently defaults to @code{xts}.
-Only used when @code{encrypt.format=luks}.
-
-@item encrypt.ivgen-alg
-
-Name of the initialization vector generator algorithm. Currently defaults
-to @code{plain64}. Only used when @code{encrypt.format=luks}.
-
-@item encrypt.ivgen-hash-alg
-
-Name of the hash algorithm to use with the initialization vector generator
-(if required). Defaults to @code{sha256}. Only used when @code{encrypt.format=luks}.
-
-@item encrypt.hash-alg
-
-Name of the hash algorithm to use for PBKDF algorithm
-Defaults to @code{sha256}. Only used when @code{encrypt.format=luks}.
-
-@item encrypt.iter-time
-
-Amount of time, in milliseconds, to use for PBKDF algorithm per key slot.
-Defaults to @code{2000}. Only used when @code{encrypt.format=luks}.
-
-@item cluster_size
-Changes the qcow2 cluster size (must be between 512 and 2M). Smaller cluster
-sizes can improve the image file size whereas larger cluster sizes generally
-provide better performance.
-
-@item preallocation
-Preallocation mode (allowed values: @code{off}, @code{metadata}, @code{falloc},
-@code{full}). An image with preallocated metadata is initially larger but can
-improve performance when the image needs to grow. @code{falloc} and @code{full}
-preallocations are like the same options of @code{raw} format, but sets up
-metadata also.
-
-@item lazy_refcounts
-If this option is set to @code{on}, reference count updates are postponed with
-the goal of avoiding metadata I/O and improving performance. This is
-particularly interesting with @option{cache=writethrough} which doesn't batch
-metadata updates. The tradeoff is that after a host crash, the reference count
-tables must be rebuilt, i.e. on the next open an (automatic) @code{qemu-img
-check -r all} is required, which may take some time.
-
-This option can only be enabled if @code{compat=1.1} is specified.
-
-@item nocow
-If this option is set to @code{on}, it will turn off COW of the file. It's only
-valid on btrfs, no effect on other file systems.
-
-Btrfs has low performance when hosting a VM image file, even more when the guest
-on the VM also using btrfs as file system. Turning off COW is a way to mitigate
-this bad performance. Generally there are two ways to turn off COW on btrfs:
-a) Disable it by mounting with nodatacow, then all newly created files will be
-NOCOW. b) For an empty file, add the NOCOW file attribute. That's what this option
-does.
-
-Note: this option is only valid to new or empty files. If there is an existing
-file which is COW and has data blocks already, it couldn't be changed to NOCOW
-by setting @code{nocow=on}. One can issue @code{lsattr filename} to check if
-the NOCOW flag is set or not (Capital 'C' is NOCOW flag).
-
-@end table
-
-@item qed
-Old QEMU image format with support for backing files and compact image files
-(when your filesystem or transport medium does not support holes).
-
-When converting QED images to qcow2, you might want to consider using the
-@code{lazy_refcounts=on} option to get a more QED-like behaviour.
-
-Supported options:
-@table @code
-@item backing_file
-File name of a base image (see @option{create} subcommand).
-@item backing_fmt
-Image file format of backing file (optional). Useful if the format cannot be
-autodetected because it has no header, like some vhd/vpc files.
-@item cluster_size
-Changes the cluster size (must be power-of-2 between 4K and 64K). Smaller
-cluster sizes can improve the image file size whereas larger cluster sizes
-generally provide better performance.
-@item table_size
-Changes the number of clusters per L1/L2 table (must be power-of-2 between 1
-and 16). There is normally no need to change this value but this option can be
-used for performance benchmarking.
-@end table
-
-@item qcow
-Old QEMU image format with support for backing files, compact image files,
-encryption and compression.
-
-Supported options:
-@table @code
-@item backing_file
-File name of a base image (see @option{create} subcommand)
-@item encryption
-This option is deprecated and equivalent to @code{encrypt.format=aes}
-
-@item encrypt.format
-If this is set to @code{aes}, the image is encrypted with 128-bit AES-CBC.
-The encryption key is given by the @code{encrypt.key-secret} parameter.
-This encryption format is considered to be flawed by modern cryptography
-standards, suffering from a number of design problems enumerated previously
-against the @code{qcow2} image format.
-
-The use of this is no longer supported in system emulators. Support only
-remains in the command line utilities, for the purposes of data liberation
-and interoperability with old versions of QEMU.
-
-Users requiring native encryption should use the @code{qcow2} format
-instead with @code{encrypt.format=luks}.
-
-@item encrypt.key-secret
-
-Provides the ID of a @code{secret} object that contains the encryption
-key (@code{encrypt.format=aes}).
-
-@end table
-
-@item luks
-
-LUKS v1 encryption format, compatible with Linux dm-crypt/cryptsetup
-
-Supported options:
-@table @code
-
-@item key-secret
-
-Provides the ID of a @code{secret} object that contains the passphrase.
-
-@item cipher-alg
-
-Name of the cipher algorithm and key length. Currently defaults
-to @code{aes-256}.
-
-@item cipher-mode
-
-Name of the encryption mode to use. Currently defaults to @code{xts}.
-
-@item ivgen-alg
-
-Name of the initialization vector generator algorithm. Currently defaults
-to @code{plain64}.
-
-@item ivgen-hash-alg
-
-Name of the hash algorithm to use with the initialization vector generator
-(if required). Defaults to @code{sha256}.
-
-@item hash-alg
-
-Name of the hash algorithm to use for PBKDF algorithm
-Defaults to @code{sha256}.
-
-@item iter-time
-
-Amount of time, in milliseconds, to use for PBKDF algorithm per key slot.
-Defaults to @code{2000}.
-
-@end table
-
-@item vdi
-VirtualBox 1.1 compatible image format.
-Supported options:
-@table @code
-@item static
-If this option is set to @code{on}, the image is created with metadata
-preallocation.
-@end table
-
-@item vmdk
-VMware 3 and 4 compatible image format.
-
-Supported options:
-@table @code
-@item backing_file
-File name of a base image (see @option{create} subcommand).
-@item compat6
-Create a VMDK version 6 image (instead of version 4)
-@item hwversion
-Specify vmdk virtual hardware version. Compat6 flag cannot be enabled
-if hwversion is specified.
-@item subformat
-Specifies which VMDK subformat to use. Valid options are
-@code{monolithicSparse} (default),
-@code{monolithicFlat},
-@code{twoGbMaxExtentSparse},
-@code{twoGbMaxExtentFlat} and
-@code{streamOptimized}.
-@end table
-
-@item vpc
-VirtualPC compatible image format (VHD).
-Supported options:
-@table @code
-@item subformat
-Specifies which VHD subformat to use. Valid options are
-@code{dynamic} (default) and @code{fixed}.
-@end table
-
-@item VHDX
-Hyper-V compatible image format (VHDX).
-Supported options:
-@table @code
-@item subformat
-Specifies which VHDX subformat to use. Valid options are
-@code{dynamic} (default) and @code{fixed}.
-@item block_state_zero
-Force use of payload blocks of type 'ZERO'. Can be set to @code{on} (default)
-or @code{off}. When set to @code{off}, new blocks will be created as
-@code{PAYLOAD_BLOCK_NOT_PRESENT}, which means parsers are free to return
-arbitrary data for those blocks. Do not set to @code{off} when using
-@code{qemu-img convert} with @code{subformat=dynamic}.
-@item block_size
-Block size; min 1 MB, max 256 MB. 0 means auto-calculate based on image size.
-@item log_size
-Log size; min 1 MB.
-@end table
-@end table
-
-@subsubsection Read-only formats
-More disk image file formats are supported in a read-only mode.
-@table @option
-@item bochs
-Bochs images of @code{growing} type.
-@item cloop
-Linux Compressed Loop image, useful only to reuse directly compressed
-CD-ROM images present for example in the Knoppix CD-ROMs.
-@item dmg
-Apple disk image.
-@item parallels
-Parallels disk image format.
-@end table
-
-
-@node host_drives
-@subsection Using host drives
-
-In addition to disk image files, QEMU can directly access host
-devices. We describe here the usage for QEMU version >= 0.8.3.
-
-@subsubsection Linux
-
-On Linux, you can directly use the host device filename instead of a
-disk image filename provided you have enough privileges to access
-it. For example, use @file{/dev/cdrom} to access to the CDROM.
-
-@table @code
-@item CD
-You can specify a CDROM device even if no CDROM is loaded. QEMU has
-specific code to detect CDROM insertion or removal. CDROM ejection by
-the guest OS is supported. Currently only data CDs are supported.
-@item Floppy
-You can specify a floppy device even if no floppy is loaded. Floppy
-removal is currently not detected accurately (if you change floppy
-without doing floppy access while the floppy is not loaded, the guest
-OS will think that the same floppy is loaded).
-Use of the host's floppy device is deprecated, and support for it will
-be removed in a future release.
-@item Hard disks
-Hard disks can be used. Normally you must specify the whole disk
-(@file{/dev/hdb} instead of @file{/dev/hdb1}) so that the guest OS can
-see it as a partitioned disk. WARNING: unless you know what you do, it
-is better to only make READ-ONLY accesses to the hard disk otherwise
-you may corrupt your host data (use the @option{-snapshot} command
-line option or modify the device permissions accordingly).
-@end table
-
-@subsubsection Windows
-
-@table @code
-@item CD
-The preferred syntax is the drive letter (e.g. @file{d:}). The
-alternate syntax @file{\\.\d:} is supported. @file{/dev/cdrom} is
-supported as an alias to the first CDROM drive.
-
-Currently there is no specific code to handle removable media, so it
-is better to use the @code{change} or @code{eject} monitor commands to
-change or eject media.
-@item Hard disks
-Hard disks can be used with the syntax: @file{\\.\PhysicalDrive@var{N}}
-where @var{N} is the drive number (0 is the first hard disk).
-
-WARNING: unless you know what you do, it is better to only make
-READ-ONLY accesses to the hard disk otherwise you may corrupt your
-host data (use the @option{-snapshot} command line so that the
-modifications are written in a temporary file).
-@end table
-
-
-@subsubsection Mac OS X
-
-@file{/dev/cdrom} is an alias to the first CDROM.
-
-Currently there is no specific code to handle removable media, so it
-is better to use the @code{change} or @code{eject} monitor commands to
-change or eject media.
-
-@node disk_images_fat_images
-@subsection Virtual FAT disk images
-
-QEMU can automatically create a virtual FAT disk image from a
-directory tree. In order to use it, just type:
-
-@example
-@value{qemu_system} linux.img -hdb fat:/my_directory
-@end example
-
-Then you access access to all the files in the @file{/my_directory}
-directory without having to copy them in a disk image or to export
-them via SAMBA or NFS. The default access is @emph{read-only}.
-
-Floppies can be emulated with the @code{:floppy:} option:
-
-@example
-@value{qemu_system} linux.img -fda fat:floppy:/my_directory
-@end example
-
-A read/write support is available for testing (beta stage) with the
-@code{:rw:} option:
-
-@example
-@value{qemu_system} linux.img -fda fat:floppy:rw:/my_directory
-@end example
-
-What you should @emph{never} do:
-@itemize
-@item use non-ASCII filenames ;
-@item use "-snapshot" together with ":rw:" ;
-@item expect it to work when loadvm'ing ;
-@item write to the FAT directory on the host system while accessing it with the guest system.
-@end itemize
-
-@node disk_images_nbd
-@subsection NBD access
-
-QEMU can access directly to block device exported using the Network Block Device
-protocol.
-
-@example
-@value{qemu_system} linux.img -hdb nbd://my_nbd_server.mydomain.org:1024/
-@end example
-
-If the NBD server is located on the same host, you can use an unix socket instead
-of an inet socket:
-
-@example
-@value{qemu_system} linux.img -hdb nbd+unix://?socket=/tmp/my_socket
-@end example
-
-In this case, the block device must be exported using qemu-nbd:
-
-@example
-qemu-nbd --socket=/tmp/my_socket my_disk.qcow2
-@end example
-
-The use of qemu-nbd allows sharing of a disk between several guests:
-@example
-qemu-nbd --socket=/tmp/my_socket --share=2 my_disk.qcow2
-@end example
-
-@noindent
-and then you can use it with two guests:
-@example
-@value{qemu_system} linux1.img -hdb nbd+unix://?socket=/tmp/my_socket
-@value{qemu_system} linux2.img -hdb nbd+unix://?socket=/tmp/my_socket
-@end example
-
-If the nbd-server uses named exports (supported since NBD 2.9.18, or with QEMU's
-own embedded NBD server), you must specify an export name in the URI:
-@example
-@value{qemu_system} -cdrom nbd://localhost/debian-500-ppc-netinst
-@value{qemu_system} -cdrom nbd://localhost/openSUSE-11.1-ppc-netinst
-@end example
-
-The URI syntax for NBD is supported since QEMU 1.3. An alternative syntax is
-also available. Here are some example of the older syntax:
-@example
-@value{qemu_system} linux.img -hdb nbd:my_nbd_server.mydomain.org:1024
-@value{qemu_system} linux2.img -hdb nbd:unix:/tmp/my_socket
-@value{qemu_system} -cdrom nbd:localhost:10809:exportname=debian-500-ppc-netinst
-@end example
-
-@node disk_images_sheepdog
-@subsection Sheepdog disk images
-
-Sheepdog is a distributed storage system for QEMU. It provides highly
-available block level storage volumes that can be attached to
-QEMU-based virtual machines.
-
-You can create a Sheepdog disk image with the command:
-@example
-qemu-img create sheepdog:///@var{image} @var{size}
-@end example
-where @var{image} is the Sheepdog image name and @var{size} is its
-size.
-
-To import the existing @var{filename} to Sheepdog, you can use a
-convert command.
-@example
-qemu-img convert @var{filename} sheepdog:///@var{image}
-@end example
-
-You can boot from the Sheepdog disk image with the command:
-@example
-@value{qemu_system} sheepdog:///@var{image}
-@end example
-
-You can also create a snapshot of the Sheepdog image like qcow2.
-@example
-qemu-img snapshot -c @var{tag} sheepdog:///@var{image}
-@end example
-where @var{tag} is a tag name of the newly created snapshot.
-
-To boot from the Sheepdog snapshot, specify the tag name of the
-snapshot.
-@example
-@value{qemu_system} sheepdog:///@var{image}#@var{tag}
-@end example
-
-You can create a cloned image from the existing snapshot.
-@example
-qemu-img create -b sheepdog:///@var{base}#@var{tag} sheepdog:///@var{image}
-@end example
-where @var{base} is an image name of the source snapshot and @var{tag}
-is its tag name.
-
-You can use an unix socket instead of an inet socket:
-
-@example
-@value{qemu_system} sheepdog+unix:///@var{image}?socket=@var{path}
-@end example
-
-If the Sheepdog daemon doesn't run on the local host, you need to
-specify one of the Sheepdog servers to connect to.
-@example
-qemu-img create sheepdog://@var{hostname}:@var{port}/@var{image} @var{size}
-@value{qemu_system} sheepdog://@var{hostname}:@var{port}/@var{image}
-@end example
-
-@node disk_images_iscsi
-@subsection iSCSI LUNs
-
-iSCSI is a popular protocol used to access SCSI devices across a computer
-network.
-
-There are two different ways iSCSI devices can be used by QEMU.
-
-The first method is to mount the iSCSI LUN on the host, and make it appear as
-any other ordinary SCSI device on the host and then to access this device as a
-/dev/sd device from QEMU. How to do this differs between host OSes.
-
-The second method involves using the iSCSI initiator that is built into
-QEMU. This provides a mechanism that works the same way regardless of which
-host OS you are running QEMU on. This section will describe this second method
-of using iSCSI together with QEMU.
-
-In QEMU, iSCSI devices are described using special iSCSI URLs
-
-@example
-URL syntax:
-iscsi://[<username>[%<password>]@@]<host>[:<port>]/<target-iqn-name>/<lun>
-@end example
-
-Username and password are optional and only used if your target is set up
-using CHAP authentication for access control.
-Alternatively the username and password can also be set via environment
-variables to have these not show up in the process list
-
-@example
-export LIBISCSI_CHAP_USERNAME=<username>
-export LIBISCSI_CHAP_PASSWORD=<password>
-iscsi://<host>/<target-iqn-name>/<lun>
-@end example
-
-Various session related parameters can be set via special options, either
-in a configuration file provided via '-readconfig' or directly on the
-command line.
-
-If the initiator-name is not specified qemu will use a default name
-of 'iqn.2008-11.org.linux-kvm[:<uuid>'] where <uuid> is the UUID of the
-virtual machine. If the UUID is not specified qemu will use
-'iqn.2008-11.org.linux-kvm[:<name>'] where <name> is the name of the
-virtual machine.
-
-@example
-Setting a specific initiator name to use when logging in to the target
--iscsi initiator-name=iqn.qemu.test:my-initiator
-@end example
-
-@example
-Controlling which type of header digest to negotiate with the target
--iscsi header-digest=CRC32C|CRC32C-NONE|NONE-CRC32C|NONE
-@end example
-
-These can also be set via a configuration file
-@example
-[iscsi]
- user = "CHAP username"
- password = "CHAP password"
- initiator-name = "iqn.qemu.test:my-initiator"
- # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE
- header-digest = "CRC32C"
-@end example
-
-
-Setting the target name allows different options for different targets
-@example
-[iscsi "iqn.target.name"]
- user = "CHAP username"
- password = "CHAP password"
- initiator-name = "iqn.qemu.test:my-initiator"
- # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE
- header-digest = "CRC32C"
-@end example
-
-
-Howto use a configuration file to set iSCSI configuration options:
-@example
-cat >iscsi.conf <<EOF
-[iscsi]
- user = "me"
- password = "my password"
- initiator-name = "iqn.qemu.test:my-initiator"
- header-digest = "CRC32C"
-EOF
-
-@value{qemu_system} -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \
- -readconfig iscsi.conf
-@end example
-
-
-How to set up a simple iSCSI target on loopback and access it via QEMU:
-@example
-This example shows how to set up an iSCSI target with one CDROM and one DISK
-using the Linux STGT software target. This target is available on Red Hat based
-systems as the package 'scsi-target-utils'.
-
-tgtd --iscsi portal=127.0.0.1:3260
-tgtadm --lld iscsi --op new --mode target --tid 1 -T iqn.qemu.test
-tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 1 \
- -b /IMAGES/disk.img --device-type=disk
-tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 2 \
- -b /IMAGES/cd.iso --device-type=cd
-tgtadm --lld iscsi --op bind --mode target --tid 1 -I ALL
-
-@value{qemu_system} -iscsi initiator-name=iqn.qemu.test:my-initiator \
- -boot d -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \
- -cdrom iscsi://127.0.0.1/iqn.qemu.test/2
-@end example
-
-@node disk_images_gluster
-@subsection GlusterFS disk images
-
-GlusterFS is a user space distributed file system.
-
-You can boot from the GlusterFS disk image with the command:
-@example
-URI:
-@value{qemu_system} -drive file=gluster[+@var{type}]://[@var{host}[:@var{port}]]/@var{volume}/@var{path}
- [?socket=...][,file.debug=9][,file.logfile=...]
-
-JSON:
-@value{qemu_system} 'json:@{"driver":"qcow2",
- "file":@{"driver":"gluster",
- "volume":"testvol","path":"a.img","debug":9,"logfile":"...",
- "server":[@{"type":"tcp","host":"...","port":"..."@},
- @{"type":"unix","socket":"..."@}]@}@}'
-@end example
-
-@var{gluster} is the protocol.
-
-@var{type} specifies the transport type used to connect to gluster
-management daemon (glusterd). Valid transport types are
-tcp and unix. In the URI form, if a transport type isn't specified,
-then tcp type is assumed.
-
-@var{host} specifies the server where the volume file specification for
-the given volume resides. This can be either a hostname or an ipv4 address.
-If transport type is unix, then @var{host} field should not be specified.
-Instead @var{socket} field needs to be populated with the path to unix domain
-socket.
-
-@var{port} is the port number on which glusterd is listening. This is optional
-and if not specified, it defaults to port 24007. If the transport type is unix,
-then @var{port} should not be specified.
-
-@var{volume} is the name of the gluster volume which contains the disk image.
-
-@var{path} is the path to the actual disk image that resides on gluster volume.
-
-@var{debug} is the logging level of the gluster protocol driver. Debug levels
-are 0-9, with 9 being the most verbose, and 0 representing no debugging output.
-The default level is 4. The current logging levels defined in the gluster source
-are 0 - None, 1 - Emergency, 2 - Alert, 3 - Critical, 4 - Error, 5 - Warning,
-6 - Notice, 7 - Info, 8 - Debug, 9 - Trace
-
-@var{logfile} is a commandline option to mention log file path which helps in
-logging to the specified file and also help in persisting the gfapi logs. The
-default is stderr.
-
-
-
-
-You can create a GlusterFS disk image with the command:
-@example
-qemu-img create gluster://@var{host}/@var{volume}/@var{path} @var{size}
-@end example
-
-Examples
-@example
-@value{qemu_system} -drive file=gluster://1.2.3.4/testvol/a.img
-@value{qemu_system} -drive file=gluster+tcp://1.2.3.4/testvol/a.img
-@value{qemu_system} -drive file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
-@value{qemu_system} -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
-@value{qemu_system} -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
-@value{qemu_system} -drive file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
-@value{qemu_system} -drive file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
-@value{qemu_system} -drive file=gluster+rdma://1.2.3.4:24007/testvol/a.img
-@value{qemu_system} -drive file=gluster://1.2.3.4/testvol/a.img,file.debug=9,file.logfile=/var/log/qemu-gluster.log
-@value{qemu_system} 'json:@{"driver":"qcow2",
- "file":@{"driver":"gluster",
- "volume":"testvol","path":"a.img",
- "debug":9,"logfile":"/var/log/qemu-gluster.log",
- "server":[@{"type":"tcp","host":"1.2.3.4","port":24007@},
- @{"type":"unix","socket":"/var/run/glusterd.socket"@}]@}@}'
-@value{qemu_system} -drive driver=qcow2,file.driver=gluster,file.volume=testvol,file.path=/path/a.img,
- file.debug=9,file.logfile=/var/log/qemu-gluster.log,
- file.server.0.type=tcp,file.server.0.host=1.2.3.4,file.server.0.port=24007,
- file.server.1.type=unix,file.server.1.socket=/var/run/glusterd.socket
-@end example
-
-@node disk_images_ssh
-@subsection Secure Shell (ssh) disk images
-
-You can access disk images located on a remote ssh server
-by using the ssh protocol:
-
-@example
-@value{qemu_system} -drive file=ssh://[@var{user}@@]@var{server}[:@var{port}]/@var{path}[?host_key_check=@var{host_key_check}]
-@end example
-
-Alternative syntax using properties:
-
-@example
-@value{qemu_system} -drive file.driver=ssh[,file.user=@var{user}],file.host=@var{server}[,file.port=@var{port}],file.path=@var{path}[,file.host_key_check=@var{host_key_check}]
-@end example
-
-@var{ssh} is the protocol.
-
-@var{user} is the remote user. If not specified, then the local
-username is tried.
-
-@var{server} specifies the remote ssh server. Any ssh server can be
-used, but it must implement the sftp-server protocol. Most Unix/Linux
-systems should work without requiring any extra configuration.
-
-@var{port} is the port number on which sshd is listening. By default
-the standard ssh port (22) is used.
-
-@var{path} is the path to the disk image.
-
-The optional @var{host_key_check} parameter controls how the remote
-host's key is checked. The default is @code{yes} which means to use
-the local @file{.ssh/known_hosts} file. Setting this to @code{no}
-turns off known-hosts checking. Or you can check that the host key
-matches a specific fingerprint:
-@code{host_key_check=md5:78:45:8e:14:57:4f:d5:45:83:0a:0e:f3:49:82:c9:c8}
-(@code{sha1:} can also be used as a prefix, but note that OpenSSH
-tools only use MD5 to print fingerprints).
-
-Currently authentication must be done using ssh-agent. Other
-authentication methods may be supported in future.
-
-Note: Many ssh servers do not support an @code{fsync}-style operation.
-The ssh driver cannot guarantee that disk flush requests are
-obeyed, and this causes a risk of disk corruption if the remote
-server or network goes down during writes. The driver will
-print a warning when @code{fsync} is not supported:
-
-warning: ssh server @code{ssh.example.com:22} does not support fsync
-
-With sufficiently new versions of libssh and OpenSSH, @code{fsync} is
-supported.
-
-@node disk_images_nvme
-@subsection NVMe disk images
-
-NVM Express (NVMe) storage controllers can be accessed directly by a userspace
-driver in QEMU. This bypasses the host kernel file system and block layers
-while retaining QEMU block layer functionalities, such as block jobs, I/O
-throttling, image formats, etc. Disk I/O performance is typically higher than
-with @code{-drive file=/dev/sda} using either thread pool or linux-aio.
-
-The controller will be exclusively used by the QEMU process once started. To be
-able to share storage between multiple VMs and other applications on the host,
-please use the file based protocols.
-
-Before starting QEMU, bind the host NVMe controller to the host vfio-pci
-driver. For example:
-
-@example
-# modprobe vfio-pci
-# lspci -n -s 0000:06:0d.0
-06:0d.0 0401: 1102:0002 (rev 08)
-# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
-# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
-
-# @value{qemu_system} -drive file=nvme://@var{host}:@var{bus}:@var{slot}.@var{func}/@var{namespace}
-@end example
-
-Alternative syntax using properties:
-
-@example
-@value{qemu_system} -drive file.driver=nvme,file.device=@var{host}:@var{bus}:@var{slot}.@var{func},file.namespace=@var{namespace}
-@end example
-
-@var{host}:@var{bus}:@var{slot}.@var{func} is the NVMe controller's PCI device
-address on the host.
-
-@var{namespace} is the NVMe namespace number, starting from 1.
-
-@node disk_image_locking
-@subsection Disk image file locking
-
-By default, QEMU tries to protect image files from unexpected concurrent
-access, as long as it's supported by the block protocol driver and host
-operating system. If multiple QEMU processes (including QEMU emulators and
-utilities) try to open the same image with conflicting accessing modes, all but
-the first one will get an error.
-
-This feature is currently supported by the file protocol on Linux with the Open
-File Descriptor (OFD) locking API, and can be configured to fall back to POSIX
-locking if the POSIX host doesn't support Linux OFD locking.
-
-To explicitly enable image locking, specify "locking=on" in the file protocol
-driver options. If OFD locking is not possible, a warning will be printed and
-the POSIX locking API will be used. In this case there is a risk that the lock
-will get silently lost when doing hot plugging and block jobs, due to the
-shortcomings of the POSIX locking API.
-
-QEMU transparently handles lock handover during shared storage migration. For
-shared virtual disk images between multiple VMs, the "share-rw" device option
-should be used.
-
-By default, the guest has exclusive write access to its disk image. If the
-guest can safely share the disk image with other writers the @code{-device
-...,share-rw=on} parameter can be used. This is only safe if the guest is
-running software, such as a cluster file system, that coordinates disk accesses
-to avoid corruption.
-
-Note that share-rw=on only declares the guest's ability to share the disk.
-Some QEMU features, such as image file formats, require exclusive write access
-to the disk image and this is unaffected by the share-rw=on option.
-
-Alternatively, locking can be fully disabled by "locking=off" block device
-option. In the command line, the option is usually in the form of
-"file.locking=off" as the protocol driver is normally placed as a "file" child
-under a format driver. For example:
-
-@code{-blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file}
-
-To check if image locking is active, check the output of the "lslocks" command
-on host and see if there are locks held by the QEMU process on the image file.
-More than one byte could be locked by the QEMU instance, each byte of which
-reflects a particular permission that is acquired or protected by the running
-block driver.
-
-@c man end
-
-@ignore
-
-@setfilename qemu-block-drivers
-@settitle QEMU block drivers reference
-
-@c man begin SEEALSO
-The HTML documentation of QEMU for more precise information and Linux
-user mode emulator invocation.
-@c man end
-
-@c man begin AUTHOR
-Fabrice Bellard and the QEMU Project developers
-@c man end
-
-@end ignore
diff --git a/docs/specs/acpi_cpu_hotplug.txt b/docs/specs/acpi_cpu_hotplug.txt
index ee219c8358..a8ce5e7402 100644
--- a/docs/specs/acpi_cpu_hotplug.txt
+++ b/docs/specs/acpi_cpu_hotplug.txt
@@ -15,14 +15,14 @@ CPU present bitmap for:
PIIX-PM (IO port 0xaf00-0xaf1f, 1-byte access)
One bit per CPU. Bit position reflects corresponding CPU APIC ID. Read-only.
The first DWORD in bitmap is used in write mode to switch from legacy
- to new CPU hotplug interface, write 0 into it to do switch.
+ to modern CPU hotplug interface, write 0 into it to do switch.
---------------------------------------------------------------
QEMU sets corresponding CPU bit on hot-add event and issues SCI
with GPE.2 event set. CPU present map is read by ACPI BIOS GPE.2 handler
to notify OS about CPU hot-add events. CPU hot-remove isn't supported.
=====================================
-ACPI CPU hotplug interface registers:
+Modern ACPI CPU hotplug interface registers:
-------------------------------------
Register block base address:
ICH9-LPC IO port 0x0cd8
@@ -30,9 +30,25 @@ Register block base address:
Register block size:
ACPI_CPU_HOTPLUG_REG_LEN = 12
+All accesses to the registers described below imply little-endian byte order.
+
+Reserved registers behavior:
+ - write accesses are ignored
+ - read accesses return all bits set to 0.
+
+The last stored value in 'CPU selector' must refer to a possible CPU, otherwise
+ - reads from any register return 0
+ - writes to any other register are ignored until a valid value is stored into it
+On QEMU start, 'CPU selector' is initialized to a valid value; on reset it
+keeps its current value.
+
read access:
offset:
- [0x0-0x3] reserved
+ [0x0-0x3] Command data 2: (DWORD access)
+ if value last stored in 'Command field':
+ 0: reads as 0x0
+ 3: upper 32 bits of architecture specific CPU ID value
+ other values: reserved
[0x4] CPU device status fields: (1 byte access)
bits:
0: Device is enabled and may be used by guest
@@ -44,15 +60,17 @@ read access:
3-7: reserved and should be ignored by OSPM
[0x5-0x7] reserved
[0x8] Command data: (DWORD access)
- in case of error or unsupported command reads is 0xFFFFFFFF
- current 'Command field' value:
- 0: returns PXM value corresponding to device
+ contains 0 unless value last stored in 'Command field' is one of:
+ 0: contains 'CPU selector' value of a CPU with pending event[s]
+ 3: lower 32 bits of architecture specific CPU ID value
+ (in x86 case: APIC ID)
write access:
offset:
[0x0-0x3] CPU selector: (DWORD access)
selects active CPU device. All following accesses to other
registers will read/store data from/to selected CPU.
+ Valid values: [0 .. max_cpus)
[0x4] CPU device control fields: (1 byte access)
bits:
0: reserved, OSPM must clear it before writing to register.
@@ -69,9 +87,9 @@ write access:
value:
0: selects a CPU device with inserting/removing events and
following reads from 'Command data' register return
- selected CPU (CPU selector value). If no CPU with events
- found, the current CPU selector doesn't change and
- corresponding insert/remove event flags are not set.
+ selected CPU ('CPU selector' value).
+ If no CPU with events found, the current 'CPU selector' doesn't
+ change and corresponding insert/remove event flags are not modified.
1: following writes to 'Command data' register set OST event
register in QEMU
2: following writes to 'Command data' register set OST status
@@ -79,16 +97,53 @@ write access:
other values: reserved
[0x6-0x7] reserved
[0x8] Command data: (DWORD access)
- current 'Command field' value:
- 0: OSPM reads value of CPU selector
+ if last stored 'Command field' value:
1: stores value into OST event register
2: stores value into OST status register, triggers
ACPI_DEVICE_OST QMP event from QEMU to external applications
with current values of OST event and status registers.
- other values: reserved
+ other values: reserved
+
+Typical use cases:
+ - (x86) Detecting and enabling modern CPU hotplug interface.
+ QEMU starts with the legacy CPU hotplug interface enabled. Detecting and
+ switching to the modern interface is based on two properties of the legacy
+ CPU hotplug interface:
+ 1. Writes into CPU bitmap are ignored.
+ 2. CPU bitmap always has bit#0 set, corresponding to boot CPU.
+
+ Use the following steps to detect and enable the modern CPU hotplug interface:
+ 1. Store 0x0 to the 'CPU selector' register,
+ attempting to switch to the modern mode
+ 2. Store 0x0 to the 'CPU selector' register,
+ to ensure a valid selector value
+ 3. Store 0x0 to the 'Command field' register.
+ 4. Read the 'Command data 2' register.
+ If the value read is 0x0, the modern interface is enabled.
+ Otherwise the legacy interface is in use or no CPU hotplug
+ interface is available.
+
+ - Get a CPU with a pending event
+ 1. Store 0x0 to the 'CPU selector' register.
+ 2. Store 0x0 to the 'Command field' register.
+ 3. Read the 'CPU device status fields' register.
+ 4. If both bit#1 and bit#2 are clear in the value read, there is no CPU
+ with a pending event and the selected CPU remains unchanged.
+ 5. Otherwise, read the 'Command data' register. The value read is the
+ selector of the CPU with the pending event (which is already
+ selected).
-Selecting CPU device beyond possible range has no effect on platform:
- - write accesses to CPU hot-plug registers not documented above are
- ignored
- - read accesses to CPU hot-plug registers not documented above return
- all bits set to 0.
+ - Enumerate present and non-present CPUs (see the sketch after this list)
+ 01. Set the present CPU count to 0.
+ 02. Set the iterator to 0.
+ 03. Store 0x0 to the 'CPU selector' register, to ensure that it's in
+ a valid state and that access to other registers won't be ignored.
+ 04. Store 0x0 to the 'Command field' register to make the 'Command data'
+ register return the 'CPU selector' value of the selected CPU.
+ 05. Read the 'CPU device status fields' register.
+ 06. If bit#0 is set, increment the present CPU count.
+ 07. Increment the iterator.
+ 08. Store the iterator to the 'CPU selector' register.
+ 09. Read the 'Command data' register.
+ 10. If the value read is not zero, goto 05.
+ 11. Otherwise store 0x0 to the 'CPU selector' register, to put it
+ into a valid state and exit.
+ The iterator at this point equals "max_cpus".
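+
+Illustrative example (not part of QEMU): the detection and enumeration steps
+above map directly onto port I/O. The following rough C sketch assumes a
+Linux/x86 host, raw port access via iopl() and the ICH9-LPC register block
+base 0x0cd8 documented above:
+
+  #include <stdio.h>
+  #include <sys/io.h>              /* inb/inl/outb/outl, iopl (glibc, x86) */
+
+  #define CPUHP_BASE  0x0cd8       /* ICH9-LPC register block base */
+  #define R_CMD_DATA2 0x0          /* read:  'Command data 2' (DWORD) */
+  #define R_SELECTOR  0x0          /* write: 'CPU selector'   (DWORD) */
+  #define R_STATUS    0x4          /* read:  'CPU device status fields' */
+  #define R_CMD_FIELD 0x5          /* write: 'Command field'  (byte) */
+  #define R_CMD_DATA  0x8          /* read:  'Command data'   (DWORD) */
+
+  int main(void)
+  {
+      unsigned present = 0, iter = 0;
+
+      if (iopl(3) < 0) {           /* raw port access, needs root */
+          perror("iopl");
+          return 1;
+      }
+
+      /* Detect and enable the modern interface (steps 1-4 above). */
+      outl(0, CPUHP_BASE + R_SELECTOR);        /* switch away from legacy */
+      outl(0, CPUHP_BASE + R_SELECTOR);        /* ensure a valid selector */
+      outb(0, CPUHP_BASE + R_CMD_FIELD);
+      if (inl(CPUHP_BASE + R_CMD_DATA2) != 0) {
+          fprintf(stderr, "modern CPU hotplug interface not available\n");
+          return 1;
+      }
+
+      /* Enumerate present CPUs (steps 01-11 above). */
+      outl(0, CPUHP_BASE + R_SELECTOR);
+      outb(0, CPUHP_BASE + R_CMD_FIELD); /* 'Command data' returns selector */
+      do {
+          if (inb(CPUHP_BASE + R_STATUS) & 1) { /* bit#0: CPU is enabled */
+              present++;
+          }
+          outl(++iter, CPUHP_BASE + R_SELECTOR);
+      } while (inl(CPUHP_BASE + R_CMD_DATA) != 0);
+      outl(0, CPUHP_BASE + R_SELECTOR);  /* leave selector in a valid state */
+
+      printf("max_cpus=%u present=%u\n", iter, present);
+      return 0;
+  }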
diff --git a/docs/system/conf.py b/docs/system/conf.py
new file mode 100644
index 0000000000..7ca115f5e0
--- /dev/null
+++ b/docs/system/conf.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+#
+# QEMU documentation build configuration file for the 'system' manual.
+#
+# This includes the top level conf file and then makes any necessary tweaks.
+import sys
+import os
+
+qemu_docdir = os.path.abspath("..")
+parent_config = os.path.join(qemu_docdir, "conf.py")
+exec(compile(open(parent_config, "rb").read(), parent_config, 'exec'))
+
+# This slightly misuses the 'description', but is the best way to get
+# the manual title to appear in the sidebar.
+html_theme_options['description'] = u"System Emulation User's Guide"
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('qemu-block-drivers', 'qemu-block-drivers',
+ u'QEMU block drivers reference',
+ ['Fabrice Bellard and the QEMU Project developers'], 7)
+]
diff --git a/docs/system/index.rst b/docs/system/index.rst
new file mode 100644
index 0000000000..f66e6ea585
--- /dev/null
+++ b/docs/system/index.rst
@@ -0,0 +1,17 @@
+.. This is the top level page for the 'system' manual.
+
+
+QEMU System Emulation User's Guide
+==================================
+
+This manual is the overall guide for users of QEMU
+for full system emulation (as opposed to user-mode emulation).
+This includes working with hypervisors such as KVM, Xen, Hax
+or Hypervisor.Framework.
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ qemu-block-drivers
diff --git a/docs/system/qemu-block-drivers.rst b/docs/system/qemu-block-drivers.rst
new file mode 100644
index 0000000000..388adbefbf
--- /dev/null
+++ b/docs/system/qemu-block-drivers.rst
@@ -0,0 +1,985 @@
+QEMU block drivers reference
+============================
+
+.. |qemu_system| replace:: qemu-system-x86_64
+
+..
+ We put the 'Synopsis' and 'See also' sections into the manpage, but not
+ the HTML. This makes the HTML docs read better and means the ToC in
+ the index has a more useful set of entries. Ideally, the section
+ headings 'Disk image file formats' would be top-level headings for
+ the HTML, but sub-headings of the conventional manpage 'Description'
+ header for the manpage. Unfortunately, due to deficiencies in
+ the Sphinx 'only' directive, this isn't possible: they must be headers
+ at the same level as 'Synopsis' and 'See also', otherwise Sphinx's
+ identification of which header underline style is which gets confused.
+
+.. only:: man
+
+ Synopsis
+ --------
+
+ QEMU block driver reference manual
+
+Disk image file formats
+-----------------------
+
+QEMU supports many image file formats that can be used with VMs as well as with
+any of the tools (like ``qemu-img``). This includes the preferred formats
+raw and qcow2 as well as formats that are supported for compatibility with
+older QEMU versions or other hypervisors.
+
+Depending on the image format, different options can be passed to
+``qemu-img create`` and ``qemu-img convert`` using the ``-o`` option.
+This section describes each format and the options that are supported for it.
+
+.. program:: image-formats
+.. option:: raw
+
+ Raw disk image format. This format has the advantage of
+ being simple and easily exportable to all other emulators. If your
+ file system supports *holes* (for example in ext2 or ext3 on
+ Linux or NTFS on Windows), then only the written sectors will reserve
+  space. Use ``qemu-img info`` to obtain the real size used by the
+  image, or ``ls -ls`` on Unix/Linux.
+
+ Supported options:
+
+ .. program:: raw
+ .. option:: preallocation
+
+ Preallocation mode (allowed values: ``off``, ``falloc``,
+ ``full``). ``falloc`` mode preallocates space for image by
+ calling ``posix_fallocate()``. ``full`` mode preallocates space
+ for image by writing data to underlying storage. This data may or
+ may not be zero, depending on the storage location.
+
+.. program:: image-formats
+.. option:: qcow2
+
+ QEMU image format, the most versatile format. Use it to have smaller
+  images (useful if your filesystem does not support holes, for example
+ on Windows), zlib based compression and support of multiple VM
+ snapshots.
+
+ Supported options:
+
+ .. program:: qcow2
+ .. option:: compat
+
+ Determines the qcow2 version to use. ``compat=0.10`` uses the
+ traditional image format that can be read by any QEMU since 0.10.
+ ``compat=1.1`` enables image format extensions that only QEMU 1.1 and
+ newer understand (this is the default). Amongst others, this includes
+ zero clusters, which allow efficient copy-on-read for sparse images.
+
+ .. option:: backing_file
+
+ File name of a base image (see ``create`` subcommand)
+
+ .. option:: backing_fmt
+
+ Image format of the base image
+
+ .. option:: encryption
+
+ This option is deprecated and equivalent to ``encrypt.format=aes``
+
+ .. option:: encrypt.format
+
+ If this is set to ``luks``, it requests that the qcow2 payload (not
+ qcow2 header) be encrypted using the LUKS format. The passphrase to
+ use to unlock the LUKS key slot is given by the ``encrypt.key-secret``
+ parameter. LUKS encryption parameters can be tuned with the other
+ ``encrypt.*`` parameters.
+
+ If this is set to ``aes``, the image is encrypted with 128-bit AES-CBC.
+ The encryption key is given by the ``encrypt.key-secret`` parameter.
+ This encryption format is considered to be flawed by modern cryptography
+ standards, suffering from a number of design problems:
+
+ - The AES-CBC cipher is used with predictable initialization vectors based
+ on the sector number. This makes it vulnerable to chosen plaintext attacks
+ which can reveal the existence of encrypted data.
+ - The user passphrase is directly used as the encryption key. A poorly
+ chosen or short passphrase will compromise the security of the encryption.
+ - In the event of the passphrase being compromised there is no way to
+ change the passphrase to protect data in any qcow images. The files must
+ be cloned, using a different encryption passphrase in the new file. The
+ original file must then be securely erased using a program like shred,
+ though even this is ineffective with many modern storage technologies.
+
+    Use of this format is no longer supported in system emulators. Support only
+ remains in the command line utilities, for the purposes of data liberation
+ and interoperability with old versions of QEMU. The ``luks`` format
+ should be used instead.
+
+ .. option:: encrypt.key-secret
+
+ Provides the ID of a ``secret`` object that contains the passphrase
+ (``encrypt.format=luks``) or encryption key (``encrypt.format=aes``).
+
+ .. option:: encrypt.cipher-alg
+
+ Name of the cipher algorithm and key length. Currently defaults
+ to ``aes-256``. Only used when ``encrypt.format=luks``.
+
+ .. option:: encrypt.cipher-mode
+
+ Name of the encryption mode to use. Currently defaults to ``xts``.
+ Only used when ``encrypt.format=luks``.
+
+ .. option:: encrypt.ivgen-alg
+
+ Name of the initialization vector generator algorithm. Currently defaults
+ to ``plain64``. Only used when ``encrypt.format=luks``.
+
+ .. option:: encrypt.ivgen-hash-alg
+
+ Name of the hash algorithm to use with the initialization vector generator
+ (if required). Defaults to ``sha256``. Only used when ``encrypt.format=luks``.
+
+ .. option:: encrypt.hash-alg
+
+    Name of the hash algorithm to use for the PBKDF algorithm.
+    Defaults to ``sha256``. Only used when ``encrypt.format=luks``.
+
+ .. option:: encrypt.iter-time
+
+ Amount of time, in milliseconds, to use for PBKDF algorithm per key slot.
+ Defaults to ``2000``. Only used when ``encrypt.format=luks``.
+
+ .. option:: cluster_size
+
+ Changes the qcow2 cluster size (must be between 512 and 2M). Smaller cluster
+ sizes can improve the image file size whereas larger cluster sizes generally
+ provide better performance.
+
+ .. option:: preallocation
+
+ Preallocation mode (allowed values: ``off``, ``metadata``, ``falloc``,
+ ``full``). An image with preallocated metadata is initially larger but can
+ improve performance when the image needs to grow. ``falloc`` and ``full``
+    preallocations are like the same options of the ``raw`` format, but they
+    also set up the metadata.
+
+ .. option:: lazy_refcounts
+
+ If this option is set to ``on``, reference count updates are postponed with
+ the goal of avoiding metadata I/O and improving performance. This is
+ particularly interesting with :option:`cache=writethrough` which doesn't batch
+ metadata updates. The tradeoff is that after a host crash, the reference count
+ tables must be rebuilt, i.e. on the next open an (automatic) ``qemu-img
+ check -r all`` is required, which may take some time.
+
+ This option can only be enabled if ``compat=1.1`` is specified.
+
+ .. option:: nocow
+
+    If this option is set to ``on``, it turns off COW of the file. It is only
+    valid on btrfs and has no effect on other file systems.
+
+    Btrfs has low performance when hosting a VM image file, even more so
+    when the guest in the VM is also using btrfs as its file system. Turning
+    off COW is a way to mitigate this performance problem. Generally there
+    are two ways to turn off COW on btrfs:
+
+ - Disable it by mounting with nodatacow, then all newly created files
+ will be NOCOW.
+ - For an empty file, add the NOCOW file attribute. That's what this
+ option does.
+
+    Note: this option is only valid for new or empty files. An existing file
+    that is COW and already has data blocks cannot be changed to NOCOW by
+    setting ``nocow=on``. One can issue ``lsattr filename`` to check whether
+    the NOCOW flag is set (a capital 'C' indicates the NOCOW flag).
+
+.. program:: image-formats
+.. option:: qed
+
+ Old QEMU image format with support for backing files and compact image files
+ (when your filesystem or transport medium does not support holes).
+
+ When converting QED images to qcow2, you might want to consider using the
+ ``lazy_refcounts=on`` option to get a more QED-like behaviour.
+
+ Supported options:
+
+ .. program:: qed
+ .. option:: backing_file
+
+ File name of a base image (see ``create`` subcommand).
+
+ .. option:: backing_fmt
+
+ Image file format of backing file (optional). Useful if the format cannot be
+ autodetected because it has no header, like some vhd/vpc files.
+
+ .. option:: cluster_size
+
+ Changes the cluster size (must be power-of-2 between 4K and 64K). Smaller
+ cluster sizes can improve the image file size whereas larger cluster sizes
+ generally provide better performance.
+
+ .. option:: table_size
+
+ Changes the number of clusters per L1/L2 table (must be
+ power-of-2 between 1 and 16). There is normally no need to
+    change this value but this option can be used for
+ performance benchmarking.
+
+.. program:: image-formats
+.. option:: qcow
+
+ Old QEMU image format with support for backing files, compact image files,
+ encryption and compression.
+
+ Supported options:
+
+ .. program:: qcow
+ .. option:: backing_file
+
+ File name of a base image (see ``create`` subcommand)
+
+ .. option:: encryption
+
+ This option is deprecated and equivalent to ``encrypt.format=aes``
+
+ .. option:: encrypt.format
+
+ If this is set to ``aes``, the image is encrypted with 128-bit AES-CBC.
+ The encryption key is given by the ``encrypt.key-secret`` parameter.
+ This encryption format is considered to be flawed by modern cryptography
+ standards, suffering from a number of design problems enumerated previously
+ against the ``qcow2`` image format.
+
+    Use of this format is no longer supported in system emulators. Support only
+ remains in the command line utilities, for the purposes of data liberation
+ and interoperability with old versions of QEMU.
+
+ Users requiring native encryption should use the ``qcow2`` format
+ instead with ``encrypt.format=luks``.
+
+ .. option:: encrypt.key-secret
+
+ Provides the ID of a ``secret`` object that contains the encryption
+ key (``encrypt.format=aes``).
+
+.. program:: image-formats
+.. option:: luks
+
+ LUKS v1 encryption format, compatible with Linux dm-crypt/cryptsetup
+
+ Supported options:
+
+ .. program:: luks
+ .. option:: key-secret
+
+ Provides the ID of a ``secret`` object that contains the passphrase.
+
+ .. option:: cipher-alg
+
+ Name of the cipher algorithm and key length. Currently defaults
+ to ``aes-256``.
+
+ .. option:: cipher-mode
+
+ Name of the encryption mode to use. Currently defaults to ``xts``.
+
+ .. option:: ivgen-alg
+
+ Name of the initialization vector generator algorithm. Currently defaults
+ to ``plain64``.
+
+ .. option:: ivgen-hash-alg
+
+ Name of the hash algorithm to use with the initialization vector generator
+ (if required). Defaults to ``sha256``.
+
+ .. option:: hash-alg
+
+    Name of the hash algorithm to use for the PBKDF algorithm.
+    Defaults to ``sha256``.
+
+ .. option:: iter-time
+
+ Amount of time, in milliseconds, to use for PBKDF algorithm per key slot.
+ Defaults to ``2000``.
+
+.. program:: image-formats
+.. option:: vdi
+
+ VirtualBox 1.1 compatible image format.
+
+ Supported options:
+
+ .. program:: vdi
+ .. option:: static
+
+ If this option is set to ``on``, the image is created with metadata
+ preallocation.
+
+.. program:: image-formats
+.. option:: vmdk
+
+ VMware 3 and 4 compatible image format.
+
+ Supported options:
+
+  .. program:: vmdk
+ .. option:: backing_file
+
+ File name of a base image (see ``create`` subcommand).
+
+ .. option:: compat6
+
+ Create a VMDK version 6 image (instead of version 4)
+
+ .. option:: hwversion
+
+    Specify the VMDK virtual hardware version. The compat6 flag cannot be
+    enabled if hwversion is specified.
+
+ .. option:: subformat
+
+ Specifies which VMDK subformat to use. Valid options are
+ ``monolithicSparse`` (default),
+ ``monolithicFlat``,
+ ``twoGbMaxExtentSparse``,
+ ``twoGbMaxExtentFlat`` and
+ ``streamOptimized``.
+
+.. program:: image-formats
+.. option:: vpc
+
+ VirtualPC compatible image format (VHD).
+
+ Supported options:
+
+ .. program:: vpc
+ .. option:: subformat
+
+ Specifies which VHD subformat to use. Valid options are
+ ``dynamic`` (default) and ``fixed``.
+
+.. program:: image-formats
+.. option:: VHDX
+
+ Hyper-V compatible image format (VHDX).
+
+ Supported options:
+
+ .. program:: VHDX
+ .. option:: subformat
+
+ Specifies which VHDX subformat to use. Valid options are
+ ``dynamic`` (default) and ``fixed``.
+
+ .. option:: block_state_zero
+
+ Force use of payload blocks of type 'ZERO'. Can be set to ``on`` (default)
+ or ``off``. When set to ``off``, new blocks will be created as
+ ``PAYLOAD_BLOCK_NOT_PRESENT``, which means parsers are free to return
+ arbitrary data for those blocks. Do not set to ``off`` when using
+ ``qemu-img convert`` with ``subformat=dynamic``.
+
+ .. option:: block_size
+
+ Block size; min 1 MB, max 256 MB. 0 means auto-calculate based on
+ image size.
+
+ .. option:: log_size
+
+ Log size; min 1 MB.
+
+Read-only formats
+-----------------
+
+More disk image file formats are supported in a read-only mode.
+
+.. program:: image-formats
+.. option:: bochs
+
+ Bochs images of ``growing`` type.
+
+.. program:: image-formats
+.. option:: cloop
+
+ Linux Compressed Loop image, useful only to reuse directly compressed
+ CD-ROM images present for example in the Knoppix CD-ROMs.
+
+.. program:: image-formats
+.. option:: dmg
+
+ Apple disk image.
+
+.. program:: image-formats
+.. option:: parallels
+
+ Parallels disk image format.
+
+Using host drives
+-----------------
+
+In addition to disk image files, QEMU can directly access host
+devices. We describe here the usage for QEMU version >= 0.8.3.
+
+Linux
+'''''
+
+On Linux, you can directly use the host device filename instead of a
+disk image filename provided you have enough privileges to access
+it. For example, use ``/dev/cdrom`` to access the CDROM.
+
+CD
+ You can specify a CDROM device even if no CDROM is loaded. QEMU has
+ specific code to detect CDROM insertion or removal. CDROM ejection by
+ the guest OS is supported. Currently only data CDs are supported.
+
+Floppy
+ You can specify a floppy device even if no floppy is loaded. Floppy
+  removal is currently not detected accurately (if you change the floppy
+  without performing any floppy access while the floppy is not loaded, the
+  guest OS will think that the same floppy is loaded).
+ Use of the host's floppy device is deprecated, and support for it will
+ be removed in a future release.
+
+Hard disks
+ Hard disks can be used. Normally you must specify the whole disk
+ (``/dev/hdb`` instead of ``/dev/hdb1``) so that the guest OS can
+  see it as a partitioned disk. WARNING: unless you know what you are doing,
+  it is better to only make READ-ONLY accesses to the hard disk, otherwise
+  you may corrupt your host data (use the ``-snapshot`` command
+  line option or modify the device permissions accordingly).
+
+Windows
+'''''''
+
+CD
+ The preferred syntax is the drive letter (e.g. ``d:``). The
+ alternate syntax ``\\.\d:`` is supported. ``/dev/cdrom`` is
+ supported as an alias to the first CDROM drive.
+
+ Currently there is no specific code to handle removable media, so it
+ is better to use the ``change`` or ``eject`` monitor commands to
+ change or eject media.
+
+Hard disks
+ Hard disks can be used with the syntax: ``\\.\PhysicalDriveN``
+ where *N* is the drive number (0 is the first hard disk).
+
+  WARNING: unless you know what you are doing, it is better to only make
+  READ-ONLY accesses to the hard disk, otherwise you may corrupt your
+  host data (use the ``-snapshot`` command line option so that the
+  modifications are written to a temporary file).
+
+Mac OS X
+''''''''
+
+``/dev/cdrom`` is an alias to the first CDROM.
+
+Currently there is no specific code to handle removable media, so it
+is better to use the ``change`` or ``eject`` monitor commands to
+change or eject media.
+
+Virtual FAT disk images
+-----------------------
+
+QEMU can automatically create a virtual FAT disk image from a
+directory tree. In order to use it, just type:
+
+.. parsed-literal::
+
+ |qemu_system| linux.img -hdb fat:/my_directory
+
+Then you can access all the files in the ``/my_directory``
+directory without having to copy them into a disk image or to export
+them via SAMBA or NFS. The default access is *read-only*.
+
+Floppies can be emulated with the ``:floppy:`` option:
+
+.. parsed-literal::
+
+ |qemu_system| linux.img -fda fat:floppy:/my_directory
+
+Read/write support is available for testing (beta stage) with the
+``:rw:`` option:
+
+.. parsed-literal::
+
+ |qemu_system| linux.img -fda fat:floppy:rw:/my_directory
+
+What you should *never* do:
+
+- use non-ASCII filenames
+- use "-snapshot" together with ":rw:"
+- expect it to work when loadvm'ing
+- write to the FAT directory on the host system while accessing it with the guest system
+
+NBD access
+----------
+
+QEMU can directly access block devices exported using the Network Block Device
+protocol.
+
+.. parsed-literal::
+
+ |qemu_system| linux.img -hdb nbd://my_nbd_server.mydomain.org:1024/
+
+If the NBD server is located on the same host, you can use a unix socket instead
+of an inet socket:
+
+.. parsed-literal::
+
+ |qemu_system| linux.img -hdb nbd+unix://?socket=/tmp/my_socket
+
+In this case, the block device must be exported using qemu-nbd:
+
+.. parsed-literal::
+
+ qemu-nbd --socket=/tmp/my_socket my_disk.qcow2
+
+The use of qemu-nbd allows sharing of a disk between several guests:
+
+.. parsed-literal::
+
+ qemu-nbd --socket=/tmp/my_socket --share=2 my_disk.qcow2
+
+and then you can use it with two guests:
+
+.. parsed-literal::
+
+ |qemu_system| linux1.img -hdb nbd+unix://?socket=/tmp/my_socket
+ |qemu_system| linux2.img -hdb nbd+unix://?socket=/tmp/my_socket
+
+If the nbd-server uses named exports (supported since NBD 2.9.18, or with QEMU's
+own embedded NBD server), you must specify an export name in the URI:
+
+.. parsed-literal::
+
+ |qemu_system| -cdrom nbd://localhost/debian-500-ppc-netinst
+ |qemu_system| -cdrom nbd://localhost/openSUSE-11.1-ppc-netinst
+
+The URI syntax for NBD is supported since QEMU 1.3. An alternative syntax is
+also available. Here are some examples of the older syntax:
+
+.. parsed-literal::
+
+ |qemu_system| linux.img -hdb nbd:my_nbd_server.mydomain.org:1024
+ |qemu_system| linux2.img -hdb nbd:unix:/tmp/my_socket
+ |qemu_system| -cdrom nbd:localhost:10809:exportname=debian-500-ppc-netinst
+
+
+
+Sheepdog disk images
+--------------------
+
+Sheepdog is a distributed storage system for QEMU. It provides highly
+available block level storage volumes that can be attached to
+QEMU-based virtual machines.
+
+You can create a Sheepdog disk image with the command:
+
+.. parsed-literal::
+
+ qemu-img create sheepdog:///IMAGE SIZE
+
+where *IMAGE* is the Sheepdog image name and *SIZE* is its
+size.
+
+To import an existing *FILENAME* into Sheepdog, you can use the
+convert command:
+
+.. parsed-literal::
+
+ qemu-img convert FILENAME sheepdog:///IMAGE
+
+You can boot from the Sheepdog disk image with the command:
+
+.. parsed-literal::
+
+ |qemu_system| sheepdog:///IMAGE
+
+You can also create a snapshot of a Sheepdog image, as with qcow2.
+
+.. parsed-literal::
+
+ qemu-img snapshot -c TAG sheepdog:///IMAGE
+
+where *TAG* is a tag name of the newly created snapshot.
+
+To boot from the Sheepdog snapshot, specify the tag name of the
+snapshot.
+
+.. parsed-literal::
+
+ |qemu_system| sheepdog:///IMAGE#TAG
+
+You can create a cloned image from an existing snapshot.
+
+.. parsed-literal::
+
+ qemu-img create -b sheepdog:///BASE#TAG sheepdog:///IMAGE
+
+where *BASE* is an image name of the source snapshot and *TAG*
+is its tag name.
+
+You can use a unix socket instead of an inet socket:
+
+.. parsed-literal::
+
+ |qemu_system| sheepdog+unix:///IMAGE?socket=PATH
+
+If the Sheepdog daemon doesn't run on the local host, you need to
+specify one of the Sheepdog servers to connect to.
+
+.. parsed-literal::
+
+ qemu-img create sheepdog://HOSTNAME:PORT/IMAGE SIZE
+ |qemu_system| sheepdog://HOSTNAME:PORT/IMAGE
+
+iSCSI LUNs
+----------
+
+iSCSI is a popular protocol used to access SCSI devices across a computer
+network.
+
+There are two different ways iSCSI devices can be used by QEMU.
+
+The first method is to mount the iSCSI LUN on the host, and make it appear as
+any other ordinary SCSI device on the host and then to access this device as a
+/dev/sd device from QEMU. How to do this differs between host OSes.
+
+The second method involves using the iSCSI initiator that is built into
+QEMU. This provides a mechanism that works the same way regardless of which
+host OS you are running QEMU on. This section will describe this second method
+of using iSCSI together with QEMU.
+
+In QEMU, iSCSI devices are described using special iSCSI URLs. URL syntax:
+
+::
+
+ iscsi://[<username>[%<password>]@]<host>[:<port>]/<target-iqn-name>/<lun>
+
+Username and password are optional and only used if your target is set up
+using CHAP authentication for access control.
+Alternatively, the username and password can also be set via environment
+variables so that they do not show up in the process list:
+
+::
+
+ export LIBISCSI_CHAP_USERNAME=<username>
+ export LIBISCSI_CHAP_PASSWORD=<password>
+ iscsi://<host>/<target-iqn-name>/<lun>
+
+Various session-related parameters can be set via special options, either
+in a configuration file provided via ``-readconfig`` or directly on the
+command line.
+
+If the initiator-name is not specified, QEMU will use a default name
+of 'iqn.2008-11.org.linux-kvm[:<uuid>]' where <uuid> is the UUID of the
+virtual machine. If the UUID is not specified, QEMU will use
+'iqn.2008-11.org.linux-kvm[:<name>]' where <name> is the name of the
+virtual machine.
+
+Setting a specific initiator name to use when logging in to the target:
+
+::
+
+ -iscsi initiator-name=iqn.qemu.test:my-initiator
+
+Controlling which type of header digest to negotiate with the target:
+
+::
+
+ -iscsi header-digest=CRC32C|CRC32C-NONE|NONE-CRC32C|NONE
+
+These can also be set via a configuration file:
+
+::
+
+ [iscsi]
+ user = "CHAP username"
+ password = "CHAP password"
+ initiator-name = "iqn.qemu.test:my-initiator"
+ # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE
+ header-digest = "CRC32C"
+
+Setting the target name allows different options for different targets:
+
+::
+
+ [iscsi "iqn.target.name"]
+ user = "CHAP username"
+ password = "CHAP password"
+ initiator-name = "iqn.qemu.test:my-initiator"
+ # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE
+ header-digest = "CRC32C"
+
+How to use a configuration file to set iSCSI configuration options:
+
+.. parsed-literal::
+
+ cat >iscsi.conf <<EOF
+ [iscsi]
+ user = "me"
+ password = "my password"
+ initiator-name = "iqn.qemu.test:my-initiator"
+ header-digest = "CRC32C"
+ EOF
+
+ |qemu_system| -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \\
+ -readconfig iscsi.conf
+
+How to set up a simple iSCSI target on loopback and access it via QEMU:
+this example shows how to set up an iSCSI target with one CDROM and one DISK
+using the Linux STGT software target. This target is available on Red Hat based
+systems as the package 'scsi-target-utils'.
+
+.. parsed-literal::
+
+ tgtd --iscsi portal=127.0.0.1:3260
+ tgtadm --lld iscsi --op new --mode target --tid 1 -T iqn.qemu.test
+ tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 1 \\
+ -b /IMAGES/disk.img --device-type=disk
+ tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 2 \\
+ -b /IMAGES/cd.iso --device-type=cd
+ tgtadm --lld iscsi --op bind --mode target --tid 1 -I ALL
+
+ |qemu_system| -iscsi initiator-name=iqn.qemu.test:my-initiator \\
+ -boot d -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \\
+ -cdrom iscsi://127.0.0.1/iqn.qemu.test/2
+
+GlusterFS disk images
+---------------------
+
+GlusterFS is a user space distributed file system.
+
+You can boot from the GlusterFS disk image with the command:
+
+URI:
+
+.. parsed-literal::
+
+  |qemu_system| -drive file=gluster[+TYPE]://[HOST[:PORT]]/VOLUME/PATH
+ [?socket=...][,file.debug=9][,file.logfile=...]
+
+JSON:
+
+.. parsed-literal::
+
+ |qemu_system| 'json:{"driver":"qcow2",
+ "file":{"driver":"gluster",
+ "volume":"testvol","path":"a.img","debug":9,"logfile":"...",
+ "server":[{"type":"tcp","host":"...","port":"..."},
+ {"type":"unix","socket":"..."}]}}'
+
+*gluster* is the protocol.
+
+*TYPE* specifies the transport type used to connect to the gluster
+management daemon (glusterd). Valid transport types are
+tcp and unix. In the URI form, if a transport type isn't specified,
+then the tcp type is assumed.
+
+*HOST* specifies the server where the volume file specification for
+the given volume resides. This can be either a hostname or an IPv4 address.
+If the transport type is unix, then the *HOST* field should not be specified.
+Instead, the *socket* field needs to be populated with the path to the unix
+domain socket.
+
+*PORT* is the port number on which glusterd is listening. This is optional
+and if not specified, it defaults to port 24007. If the transport type is unix,
+then *PORT* should not be specified.
+
+*VOLUME* is the name of the gluster volume which contains the disk image.
+
+*PATH* is the path to the actual disk image that resides on gluster volume.
+
+*debug* is the logging level of the gluster protocol driver. Debug levels
+are 0-9, with 9 being the most verbose, and 0 representing no debugging output.
+The default level is 4. The current logging levels defined in the gluster source
+are 0 - None, 1 - Emergency, 2 - Alert, 3 - Critical, 4 - Error, 5 - Warning,
+6 - Notice, 7 - Info, 8 - Debug, 9 - Trace.
+
+*logfile* specifies the path of a log file. It directs logging of the gluster
+protocol driver (gfapi) to the specified file and also helps in persisting
+these logs. The default is stderr.
+
+You can create a GlusterFS disk image with the command:
+
+.. parsed-literal::
+
+ qemu-img create gluster://HOST/VOLUME/PATH SIZE
+
+Examples
+
+.. parsed-literal::
+
+ |qemu_system| -drive file=gluster://1.2.3.4/testvol/a.img
+ |qemu_system| -drive file=gluster+tcp://1.2.3.4/testvol/a.img
+ |qemu_system| -drive file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
+ |qemu_system| -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ |qemu_system| -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
+ |qemu_system| -drive file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
+ |qemu_system| -drive file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
+ |qemu_system| -drive file=gluster+rdma://1.2.3.4:24007/testvol/a.img
+ |qemu_system| -drive file=gluster://1.2.3.4/testvol/a.img,file.debug=9,file.logfile=/var/log/qemu-gluster.log
+ |qemu_system| 'json:{"driver":"qcow2",
+ "file":{"driver":"gluster",
+ "volume":"testvol","path":"a.img",
+ "debug":9,"logfile":"/var/log/qemu-gluster.log",
+ "server":[{"type":"tcp","host":"1.2.3.4","port":24007},
+ {"type":"unix","socket":"/var/run/glusterd.socket"}]}}'
+ |qemu_system| -drive driver=qcow2,file.driver=gluster,file.volume=testvol,file.path=/path/a.img,
+ file.debug=9,file.logfile=/var/log/qemu-gluster.log,
+ file.server.0.type=tcp,file.server.0.host=1.2.3.4,file.server.0.port=24007,
+ file.server.1.type=unix,file.server.1.socket=/var/run/glusterd.socket
+
+Secure Shell (ssh) disk images
+------------------------------
+
+You can access disk images located on a remote ssh server
+by using the ssh protocol:
+
+.. parsed-literal::
+
+ |qemu_system| -drive file=ssh://[USER@]SERVER[:PORT]/PATH[?host_key_check=HOST_KEY_CHECK]
+
+Alternative syntax using properties:
+
+.. parsed-literal::
+
+ |qemu_system| -drive file.driver=ssh[,file.user=USER],file.host=SERVER[,file.port=PORT],file.path=PATH[,file.host_key_check=HOST_KEY_CHECK]
+
+*ssh* is the protocol.
+
+*USER* is the remote user. If not specified, then the local
+username is tried.
+
+*SERVER* specifies the remote ssh server. Any ssh server can be
+used, but it must implement the sftp-server protocol. Most Unix/Linux
+systems should work without requiring any extra configuration.
+
+*PORT* is the port number on which sshd is listening. By default
+the standard ssh port (22) is used.
+
+*PATH* is the path to the disk image.
+
+The optional *HOST_KEY_CHECK* parameter controls how the remote
+host's key is checked. The default is ``yes`` which means to use
+the local ``.ssh/known_hosts`` file. Setting this to ``no``
+turns off known-hosts checking. Or you can check that the host key
+matches a specific fingerprint:
+``host_key_check=md5:78:45:8e:14:57:4f:d5:45:83:0a:0e:f3:49:82:c9:c8``
+(``sha1:`` can also be used as a prefix, but note that OpenSSH
+tools only use MD5 to print fingerprints).
+
+Currently authentication must be done using ssh-agent. Other
+authentication methods may be supported in future.
+
+Note: Many ssh servers do not support an ``fsync``-style operation.
+The ssh driver cannot guarantee that disk flush requests are
+obeyed, and this causes a risk of disk corruption if the remote
+server or network goes down during writes. The driver will
+print a warning when ``fsync`` is not supported:
+
+::
+
+ warning: ssh server ssh.example.com:22 does not support fsync
+
+With sufficiently new versions of libssh and OpenSSH, ``fsync`` is
+supported.
+
+NVMe disk images
+----------------
+
+NVM Express (NVMe) storage controllers can be accessed directly by a userspace
+driver in QEMU. This bypasses the host kernel file system and block layers
+while retaining QEMU block layer functionalities, such as block jobs, I/O
+throttling, image formats, etc. Disk I/O performance is typically higher than
+with ``-drive file=/dev/sda`` using either thread pool or linux-aio.
+
+The controller will be exclusively used by the QEMU process once started. To be
+able to share storage between multiple VMs and other applications on the host,
+please use the file based protocols.
+
+Before starting QEMU, bind the host NVMe controller to the host vfio-pci
+driver. For example:
+
+.. parsed-literal::
+
+ # modprobe vfio-pci
+ # lspci -n -s 0000:06:0d.0
+ 06:0d.0 0401: 1102:0002 (rev 08)
+ # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
+ # echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
+
+ # |qemu_system| -drive file=nvme://HOST:BUS:SLOT.FUNC/NAMESPACE
+
+Alternative syntax using properties:
+
+.. parsed-literal::
+
+ |qemu_system| -drive file.driver=nvme,file.device=HOST:BUS:SLOT.FUNC,file.namespace=NAMESPACE
+
+*HOST*:*BUS*:*SLOT*.\ *FUNC* is the NVMe controller's PCI device
+address on the host.
+
+*NAMESPACE* is the NVMe namespace number, starting from 1.
+
+Disk image file locking
+-----------------------
+
+By default, QEMU tries to protect image files from unexpected concurrent
+access, as long as it's supported by the block protocol driver and host
+operating system. If multiple QEMU processes (including QEMU emulators and
+utilities) try to open the same image with conflicting accessing modes, all but
+the first one will get an error.
+
+This feature is currently supported by the file protocol on Linux with the Open
+File Descriptor (OFD) locking API, and can be configured to fall back to POSIX
+locking if the host doesn't support Linux OFD locking.
+
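+The following C sketch (illustrative only, not how QEMU itself implements
+locking) shows the primitive involved: it takes a shared lock on one byte of
+an image file, preferring Linux OFD locks and falling back to POSIX locks;
+the byte offset used here is arbitrary:
+
+::
+
+  #define _GNU_SOURCE              /* for F_OFD_SETLK on glibc */
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <unistd.h>
+
+  static int lock_image_byte(int fd, off_t byte)
+  {
+      struct flock fl = {
+          .l_type = F_RDLCK, .l_whence = SEEK_SET, .l_start = byte, .l_len = 1,
+      };
+  #ifdef F_OFD_SETLK
+      /* OFD locks belong to the open file description and are not dropped
+       * when some other descriptor for the same file is closed. */
+      if (fcntl(fd, F_OFD_SETLK, &fl) == 0) {
+          return 0;
+      }
+  #endif
+      /* POSIX locks are per-process and lost when any descriptor for the
+       * file is closed, hence the risk of silently lost locks. */
+      return fcntl(fd, F_SETLK, &fl);
+  }
+
+  int main(int argc, char **argv)
+  {
+      int fd = argc > 1 ? open(argv[1], O_RDONLY) : -1;
+
+      if (fd < 0 || lock_image_byte(fd, 100) < 0) { /* arbitrary offset */
+          perror("lock");
+          return 1;
+      }
+      puts("lock held; inspect it with lslocks from another terminal");
+      pause();
+      return 0;
+  }
+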
+To explicitly enable image locking, specify ``locking=on`` in the file protocol
+driver options. If OFD locking is not possible, a warning will be printed and
+the POSIX locking API will be used. In this case there is a risk that the lock
+will get silently lost when doing hot plugging and block jobs, due to the
+shortcomings of the POSIX locking API.
+
+QEMU transparently handles lock handover during shared storage migration. For
+shared virtual disk images between multiple VMs, the "share-rw" device option
+should be used.
+
+By default, the guest has exclusive write access to its disk image. If the
+guest can safely share the disk image with other writers the
+``-device ...,share-rw=on`` parameter can be used. This is only safe if
+the guest is running software, such as a cluster file system, that
+coordinates disk accesses to avoid corruption.
+
+Note that share-rw=on only declares the guest's ability to share the disk.
+Some QEMU features, such as image file formats, require exclusive write access
+to the disk image and this is unaffected by the share-rw=on option.
+
+Alternatively, locking can be fully disabled with the ``locking=off`` block
+device option. On the command line, the option usually takes the form
+``file.locking=off``, as the protocol driver is normally placed as a ``file``
+child under a format driver. For example:
+
+::
+
+ -blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file
+
+To check whether image locking is active, check the output of the ``lslocks``
+command on the host and see if there are locks held by the QEMU process on the
+image file. More than one byte may be locked by the QEMU instance; each locked
+byte reflects a particular permission that is acquired or protected by the
+running block driver.
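+
+For a programmatic check, the sketch below (again illustrative, not part of
+QEMU) asks the kernel whether any other open file description holds a lock
+on a given byte of the image, using Linux's ``F_OFD_GETLK``:
+
+::
+
+  #define _GNU_SOURCE              /* for F_OFD_GETLK on glibc */
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  int main(int argc, char **argv)
+  {
+      /* usage: probe-lock IMAGE BYTE_OFFSET */
+      struct flock fl = {
+          .l_type = F_WRLCK,       /* conflicts with read and write locks */
+          .l_whence = SEEK_SET,
+          .l_start = argc > 2 ? atoll(argv[2]) : 0,
+          .l_len = 1,              /* l_pid is left zeroed, as required */
+      };
+      int fd = argc > 1 ? open(argv[1], O_RDONLY) : -1;
+
+      if (fd < 0 || fcntl(fd, F_OFD_GETLK, &fl) < 0) {
+          perror("probe");
+          return 1;
+      }
+      printf("%s\n", fl.l_type == F_UNLCK ? "byte is not locked"
+                                          : "byte is locked by another process");
+      return 0;
+  }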
+
+.. only:: man
+
+ See also
+ --------
+
+ The HTML documentation of QEMU for more precise information and Linux
+ user mode emulator invocation.
diff --git a/exec.c b/exec.c
index d4b769d0d4..67e520d18e 100644
--- a/exec.c
+++ b/exec.c
@@ -25,7 +25,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/target_page.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#if !defined(CONFIG_USER_ONLY)
@@ -3895,7 +3895,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
uint8_t *host_startaddr = rb->host + start;
- if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
+ if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
error_report("ram_block_discard_range: Unaligned start address: %p",
host_startaddr);
goto err;
@@ -3903,7 +3903,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
if ((start + length) <= rb->used_length) {
bool need_madvise, need_fallocate;
- if (length & (rb->page_size - 1)) {
+ if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
error_report("ram_block_discard_range: Unaligned length: %zx",
length);
goto err;
diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c
index 0d4de49dcf..aa1ab2590d 100644
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -287,8 +287,7 @@ static int setugid(int uid, int gid, int *suid, int *sgid)
*sgid = getegid();
if (setresgid(-1, gid, *sgid) == -1) {
- retval = -errno;
- goto err_out;
+ return -errno;
}
if (setresuid(-1, uid, *suid) == -1) {
@@ -322,7 +321,6 @@ err_sgid:
if (setresgid(-1, *sgid, *sgid) == -1) {
abort();
}
-err_out:
return retval;
}
diff --git a/hw/9pfs/9p-local.c b/hw/9pfs/9p-local.c
index ca641390fb..54e012e5b4 100644
--- a/hw/9pfs/9p-local.c
+++ b/hw/9pfs/9p-local.c
@@ -947,7 +947,7 @@ static int local_link(FsContext *ctx, V9fsPath *oldpath,
if (ctx->export_flags & V9FS_SM_MAPPED_FILE &&
local_is_mapped_file_metadata(ctx, name)) {
errno = EINVAL;
- return -1;
+ goto out;
}
odirfd = local_opendir_nofollow(ctx, odirpath);
@@ -1076,7 +1076,7 @@ out:
static int local_unlinkat_common(FsContext *ctx, int dirfd, const char *name,
int flags)
{
- int ret = -1;
+ int ret;
if (ctx->export_flags & V9FS_SM_MAPPED_FILE) {
int map_dirfd;
@@ -1094,12 +1094,12 @@ static int local_unlinkat_common(FsContext *ctx, int dirfd, const char *name,
fd = openat_dir(dirfd, name);
if (fd == -1) {
- goto err_out;
+ return -1;
}
ret = unlinkat(fd, VIRTFS_META_DIR, AT_REMOVEDIR);
close_preserve_errno(fd);
if (ret < 0 && errno != ENOENT) {
- goto err_out;
+ return -1;
}
}
map_dirfd = openat_dir(dirfd, VIRTFS_META_DIR);
@@ -1107,16 +1107,14 @@ static int local_unlinkat_common(FsContext *ctx, int dirfd, const char *name,
ret = unlinkat(map_dirfd, name, 0);
close_preserve_errno(map_dirfd);
if (ret < 0 && errno != ENOENT) {
- goto err_out;
+ return -1;
}
} else if (errno != ENOENT) {
- goto err_out;
+ return -1;
}
}
- ret = unlinkat(dirfd, name, flags);
-err_out:
- return ret;
+ return unlinkat(dirfd, name, flags);
}
static int local_remove(FsContext *ctx, const char *path)
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 520177f40c..b0e445d6bd 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -2090,22 +2090,29 @@ out_nofid:
* with qemu_iovec_destroy().
*/
static void v9fs_init_qiov_from_pdu(QEMUIOVector *qiov, V9fsPDU *pdu,
- size_t skip, size_t size,
+ size_t skip, size_t *size,
bool is_write)
{
QEMUIOVector elem;
struct iovec *iov;
unsigned int niov;
+ size_t alloc_size = *size + skip;
if (is_write) {
- pdu->s->transport->init_out_iov_from_pdu(pdu, &iov, &niov, size + skip);
+ pdu->s->transport->init_out_iov_from_pdu(pdu, &iov, &niov, alloc_size);
} else {
- pdu->s->transport->init_in_iov_from_pdu(pdu, &iov, &niov, size + skip);
+ pdu->s->transport->init_in_iov_from_pdu(pdu, &iov, &niov, &alloc_size);
+ }
+
+ if (alloc_size < skip) {
+ *size = 0;
+ } else {
+ *size = alloc_size - skip;
}
qemu_iovec_init_external(&elem, iov, niov);
qemu_iovec_init(qiov, niov);
- qemu_iovec_concat(qiov, &elem, skip, size);
+ qemu_iovec_concat(qiov, &elem, skip, *size);
}
static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
@@ -2113,15 +2120,14 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
{
ssize_t err;
size_t offset = 7;
- uint64_t read_count;
+ size_t read_count;
QEMUIOVector qiov_full;
if (fidp->fs.xattr.len < off) {
read_count = 0;
- } else {
+ } else if (fidp->fs.xattr.len - off < max_count) {
read_count = fidp->fs.xattr.len - off;
- }
- if (read_count > max_count) {
+ } else {
read_count = max_count;
}
err = pdu_marshal(pdu, offset, "d", read_count);
@@ -2130,7 +2136,7 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
}
offset += err;
- v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, read_count, false);
+ v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, &read_count, false);
err = v9fs_pack(qiov_full.iov, qiov_full.niov, 0,
((char *)fidp->fs.xattr.value) + off,
read_count);
@@ -2259,9 +2265,11 @@ static void coroutine_fn v9fs_read(void *opaque)
QEMUIOVector qiov_full;
QEMUIOVector qiov;
int32_t len;
+ size_t size = max_count;
- v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset + 4, max_count, false);
+ v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset + 4, &size, false);
qemu_iovec_init(&qiov, qiov_full.niov);
+ max_count = size;
do {
qemu_iovec_reset(&qiov);
qemu_iovec_concat(&qiov, &qiov_full, count, qiov_full.size - count);
@@ -2464,8 +2472,7 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
if (fidp->fs.xattr.len < off) {
- err = -ENOSPC;
- goto out;
+ return -ENOSPC;
}
write_count = fidp->fs.xattr.len - off;
if (write_count > count) {
@@ -2491,7 +2498,7 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
off += to_copy;
write_count -= to_copy;
}
-out:
+
return err;
}
@@ -2504,6 +2511,7 @@ static void coroutine_fn v9fs_write(void *opaque)
int32_t len = 0;
int32_t total = 0;
size_t offset = 7;
+ size_t size;
V9fsFidState *fidp;
V9fsPDU *pdu = opaque;
V9fsState *s = pdu->s;
@@ -2516,7 +2524,9 @@ static void coroutine_fn v9fs_write(void *opaque)
return;
}
offset += err;
- v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, count, true);
+ size = count;
+ v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, &size, true);
+ count = size;
trace_v9fs_write(pdu->tag, pdu->id, fid, off, count, qiov_full.niov);
fidp = get_fid(pdu, fid);
@@ -3056,8 +3066,7 @@ static int coroutine_fn v9fs_complete_rename(V9fsPDU *pdu, V9fsFidState *fidp,
if (newdirfid != -1) {
dirfidp = get_fid(pdu, newdirfid);
if (dirfidp == NULL) {
- err = -ENOENT;
- goto out_nofid;
+ return -ENOENT;
}
if (fidp->fid_type != P9_FID_NONE) {
err = -EINVAL;
@@ -3100,7 +3109,6 @@ out:
put_fid(pdu, dirfidp);
}
v9fs_path_free(&new_path);
-out_nofid:
return err;
}
diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h
index 3904f82901..8d07a0b301 100644
--- a/hw/9pfs/9p.h
+++ b/hw/9pfs/9p.h
@@ -425,7 +425,7 @@ struct V9fsTransport {
ssize_t (*pdu_vunmarshal)(V9fsPDU *pdu, size_t offset, const char *fmt,
va_list ap);
void (*init_in_iov_from_pdu)(V9fsPDU *pdu, struct iovec **piov,
- unsigned int *pniov, size_t size);
+ unsigned int *pniov, size_t *size);
void (*init_out_iov_from_pdu)(V9fsPDU *pdu, struct iovec **piov,
unsigned int *pniov, size_t size);
void (*push_and_notify)(V9fsPDU *pdu);
diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index b5a7c03f26..1d1c50409c 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -147,19 +147,22 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
}
static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
- unsigned int *pniov, size_t size)
+ unsigned int *pniov, size_t *size)
{
V9fsState *s = pdu->s;
V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
VirtQueueElement *elem = v->elems[pdu->idx];
size_t buf_size = iov_size(elem->in_sg, elem->in_num);
- if (buf_size < size) {
+ if (buf_size < P9_IOHDRSZ) {
VirtIODevice *vdev = VIRTIO_DEVICE(v);
virtio_error(vdev,
- "VirtFS reply type %d needs %zu bytes, buffer has %zu",
- pdu->id + 1, size, buf_size);
+ "VirtFS reply type %d needs %zu bytes, buffer has %zu, less than minimum",
+ pdu->id + 1, *size, buf_size);
+ }
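+ /* Clamp the reply size to the space the guest actually provided. */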
+ if (buf_size < *size) {
+ *size = buf_size;
}
*piov = elem->in_sg;
@@ -215,6 +218,7 @@ static void virtio_9p_device_unrealize(DeviceState *dev, Error **errp)
V9fsVirtioState *v = VIRTIO_9P(dev);
V9fsState *s = &v->state;
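+ /* Tear down the request virtqueue before the rest of the virtio state. */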
+ virtio_delete_queue(v->vq);
virtio_cleanup(vdev);
v9fs_device_unrealize_common(s, errp);
}
diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c
index 71eebe12dd..18fe5b7c92 100644
--- a/hw/9pfs/xen-9p-backend.c
+++ b/hw/9pfs/xen-9p-backend.c
@@ -187,7 +187,7 @@ static void xen_9pfs_init_out_iov_from_pdu(V9fsPDU *pdu,
static void xen_9pfs_init_in_iov_from_pdu(V9fsPDU *pdu,
struct iovec **piov,
unsigned int *pniov,
- size_t size)
+ size_t *size)
{
Xen9pfsDev *xen_9pfs = container_of(pdu->s, Xen9pfsDev, state);
Xen9pfsRing *ring = &xen_9pfs->rings[pdu->tag % xen_9pfs->num_rings];
@@ -197,16 +197,19 @@ static void xen_9pfs_init_in_iov_from_pdu(V9fsPDU *pdu,
g_free(ring->sg);
ring->sg = g_new0(struct iovec, 2);
- xen_9pfs_in_sg(ring, ring->sg, &num, pdu->idx, size);
+ xen_9pfs_in_sg(ring, ring->sg, &num, pdu->idx, *size);
buf_size = iov_size(ring->sg, num);
- if (buf_size < size) {
+ if (buf_size < P9_IOHDRSZ) {
xen_pv_printf(&xen_9pfs->xendev, 0, "Xen 9pfs request type %d"
- "needs %zu bytes, buffer has %zu\n", pdu->id, size,
- buf_size);
+ "needs %zu bytes, buffer has %zu, less than minimum\n",
+ pdu->id, *size, buf_size);
xen_be_set_state(&xen_9pfs->xendev, XenbusStateClosing);
xen_9pfs_disconnect(&xen_9pfs->xendev);
}
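+ /* Clamp the reply size to the space available in the ring. */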
+ if (buf_size < *size) {
+ *size = buf_size;
+ }
*piov = ring->sg;
*pniov = num;
diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index 87f30a31d7..e2c957ce00 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -12,11 +12,13 @@
#define ACPI_CPU_FLAGS_OFFSET_RW 4
#define ACPI_CPU_CMD_OFFSET_WR 5
#define ACPI_CPU_CMD_DATA_OFFSET_RW 8
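+/* read-only register at offset 0: high 32 bits of the command data,
+ * e.g. arch_id[63:32] for CPHP_GET_CPU_ID_CMD */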
+#define ACPI_CPU_CMD_DATA2_OFFSET_R 0
enum {
CPHP_GET_NEXT_CPU_WITH_EVENT_CMD = 0,
CPHP_OST_EVENT_CMD = 1,
CPHP_OST_STATUS_CMD = 2,
+ CPHP_GET_CPU_ID_CMD = 3,
CPHP_CMD_MAX
};
@@ -74,11 +76,27 @@ static uint64_t cpu_hotplug_rd(void *opaque, hwaddr addr, unsigned size)
case CPHP_GET_NEXT_CPU_WITH_EVENT_CMD:
val = cpu_st->selector;
break;
+ case CPHP_GET_CPU_ID_CMD:
+ val = cdev->arch_id & 0xFFFFFFFF;
+ break;
default:
break;
}
trace_cpuhp_acpi_read_cmd_data(cpu_st->selector, val);
break;
+ case ACPI_CPU_CMD_DATA2_OFFSET_R:
+ switch (cpu_st->command) {
+ case CPHP_GET_NEXT_CPU_WITH_EVENT_CMD:
+ val = 0;
+ break;
+ case CPHP_GET_CPU_ID_CMD:
+ val = cdev->arch_id >> 32;
+ break;
+ default:
+ break;
+ }
+ trace_cpuhp_acpi_read_cmd_data2(cpu_st->selector, val);
+ break;
default:
break;
}
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 9cee90cc70..55eb29d80a 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -175,7 +175,7 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev,
AcpiGedState *s = ACPI_GED(hotplug_dev);
if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
- acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
+ acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp);
} else {
error_setg(errp, "virt: device plug request for unsupported device"
" type: %s", object_get_typename(OBJECT(dev)));
diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events
index 96b8273297..afbc77de1c 100644
--- a/hw/acpi/trace-events
+++ b/hw/acpi/trace-events
@@ -23,6 +23,7 @@ cpuhp_acpi_read_flags(uint32_t idx, uint8_t flags) "idx[0x%"PRIx32"] flags: 0x%"
cpuhp_acpi_write_idx(uint32_t idx) "set active cpu idx: 0x%"PRIx32
cpuhp_acpi_write_cmd(uint32_t idx, uint8_t cmd) "idx[0x%"PRIx32"] cmd: 0x%"PRIx8
cpuhp_acpi_read_cmd_data(uint32_t idx, uint32_t data) "idx[0x%"PRIx32"] data: 0x%"PRIx32
+cpuhp_acpi_read_cmd_data2(uint32_t idx, uint32_t data) "idx[0x%"PRIx32"] data: 0x%"PRIx32
cpuhp_acpi_cpu_has_events(uint32_t idx, bool ins, bool rm) "idx[0x%"PRIx32"] inserting: %d, removing: %d"
cpuhp_acpi_clear_inserting_evt(uint32_t idx) "idx[0x%"PRIx32"]"
cpuhp_acpi_clear_remove_evt(uint32_t idx) "idx[0x%"PRIx32"]"
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index c6e7782580..3d86691ae0 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -101,6 +101,10 @@ config NETDUINO2
bool
select STM32F205_SOC
+config NETDUINOPLUS2
+ bool
+ select STM32F405_SOC
+
config NSERIES
bool
select OMAP
@@ -307,6 +311,12 @@ config STM32F205_SOC
select STM32F2XX_ADC
select STM32F2XX_SPI
+config STM32F405_SOC
+ bool
+ select ARM_V7M
+ select STM32F4XX_SYSCFG
+ select STM32F4XX_EXTI
+
config XLNX_ZYNQMP_ARM
bool
select AHCI
diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
index fe749f65fd..336f6dd374 100644
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -11,6 +11,7 @@ obj-$(CONFIG_MAINSTONE) += mainstone.o
obj-$(CONFIG_MICROBIT) += microbit.o
obj-$(CONFIG_MUSICPAL) += musicpal.o
obj-$(CONFIG_NETDUINO2) += netduino2.o
+obj-$(CONFIG_NETDUINOPLUS2) += netduinoplus2.o
obj-$(CONFIG_NSERIES) += nseries.o
obj-$(CONFIG_SX1) += omap_sx1.o
obj-$(CONFIG_CHEETAH) += palm.o
@@ -36,6 +37,7 @@ obj-$(CONFIG_STRONGARM) += strongarm.o
obj-$(CONFIG_ALLWINNER_A10) += allwinner-a10.o cubieboard.o
obj-$(CONFIG_RASPI) += bcm2835_peripherals.o bcm2836.o raspi.o
obj-$(CONFIG_STM32F205_SOC) += stm32f205_soc.o
+obj-$(CONFIG_STM32F405_SOC) += stm32f405_soc.o
obj-$(CONFIG_XLNX_ZYNQMP_ARM) += xlnx-zynqmp.o xlnx-zcu102.o
obj-$(CONFIG_XLNX_VERSAL) += xlnx-versal.o xlnx-versal-virt.o
obj-$(CONFIG_FSL_IMX25) += fsl-imx25.o imx25_pdk.o
diff --git a/hw/arm/allwinner-a10.c b/hw/arm/allwinner-a10.c
index 118032c8c7..1cde165611 100644
--- a/hw/arm/allwinner-a10.c
+++ b/hw/arm/allwinner-a10.c
@@ -25,6 +25,12 @@
#include "hw/misc/unimp.h"
#include "sysemu/sysemu.h"
+#define AW_A10_PIC_REG_BASE 0x01c20400
+#define AW_A10_PIT_REG_BASE 0x01c20c00
+#define AW_A10_UART0_REG_BASE 0x01c28000
+#define AW_A10_EMAC_BASE 0x01c0b000
+#define AW_A10_SATA_BASE 0x01c18000
+
static void aw_a10_init(Object *obj)
{
AwA10State *s = AW_A10(obj);
@@ -49,8 +55,6 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
{
AwA10State *s = AW_A10(dev);
SysBusDevice *sysbusdev;
- uint8_t i;
- qemu_irq fiq, irq;
Error *err = NULL;
object_property_set_bool(OBJECT(&s->cpu), true, "realized", &err);
@@ -58,8 +62,6 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
error_propagate(errp, err);
return;
}
- irq = qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_IRQ);
- fiq = qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_FIQ);
object_property_set_bool(OBJECT(&s->intc), true, "realized", &err);
if (err != NULL) {
@@ -68,11 +70,11 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
}
sysbusdev = SYS_BUS_DEVICE(&s->intc);
sysbus_mmio_map(sysbusdev, 0, AW_A10_PIC_REG_BASE);
- sysbus_connect_irq(sysbusdev, 0, irq);
- sysbus_connect_irq(sysbusdev, 1, fiq);
- for (i = 0; i < AW_A10_PIC_INT_NR; i++) {
- s->irq[i] = qdev_get_gpio_in(DEVICE(&s->intc), i);
- }
+ sysbus_connect_irq(sysbusdev, 0,
+ qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_IRQ));
+ sysbus_connect_irq(sysbusdev, 1,
+ qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_FIQ));
+ qdev_pass_gpios(DEVICE(&s->intc), dev, NULL);
object_property_set_bool(OBJECT(&s->timer), true, "realized", &err);
if (err != NULL) {
@@ -81,12 +83,12 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
}
sysbusdev = SYS_BUS_DEVICE(&s->timer);
sysbus_mmio_map(sysbusdev, 0, AW_A10_PIT_REG_BASE);
- sysbus_connect_irq(sysbusdev, 0, s->irq[22]);
- sysbus_connect_irq(sysbusdev, 1, s->irq[23]);
- sysbus_connect_irq(sysbusdev, 2, s->irq[24]);
- sysbus_connect_irq(sysbusdev, 3, s->irq[25]);
- sysbus_connect_irq(sysbusdev, 4, s->irq[67]);
- sysbus_connect_irq(sysbusdev, 5, s->irq[68]);
+ sysbus_connect_irq(sysbusdev, 0, qdev_get_gpio_in(dev, 22));
+ sysbus_connect_irq(sysbusdev, 1, qdev_get_gpio_in(dev, 23));
+ sysbus_connect_irq(sysbusdev, 2, qdev_get_gpio_in(dev, 24));
+ sysbus_connect_irq(sysbusdev, 3, qdev_get_gpio_in(dev, 25));
+ sysbus_connect_irq(sysbusdev, 4, qdev_get_gpio_in(dev, 67));
+ sysbus_connect_irq(sysbusdev, 5, qdev_get_gpio_in(dev, 68));
memory_region_init_ram(&s->sram_a, OBJECT(dev), "sram A", 48 * KiB,
&error_fatal);
@@ -105,7 +107,7 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
}
sysbusdev = SYS_BUS_DEVICE(&s->emac);
sysbus_mmio_map(sysbusdev, 0, AW_A10_EMAC_BASE);
- sysbus_connect_irq(sysbusdev, 0, s->irq[55]);
+ sysbus_connect_irq(sysbusdev, 0, qdev_get_gpio_in(dev, 55));
object_property_set_bool(OBJECT(&s->sata), true, "realized", &err);
if (err) {
@@ -113,10 +115,11 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
return;
}
sysbus_mmio_map(SYS_BUS_DEVICE(&s->sata), 0, AW_A10_SATA_BASE);
- sysbus_connect_irq(SYS_BUS_DEVICE(&s->sata), 0, s->irq[56]);
+ sysbus_connect_irq(SYS_BUS_DEVICE(&s->sata), 0, qdev_get_gpio_in(dev, 56));
/* FIXME use a qdev chardev prop instead of serial_hd() */
- serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2, s->irq[1],
+ serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2,
+ qdev_get_gpio_in(dev, 1),
115200, serial_hd(0), DEVICE_NATIVE_ENDIAN);
}
diff --git a/hw/arm/exynos4210.c b/hw/arm/exynos4210.c
index 77fbe1baab..59a27bdd68 100644
--- a/hw/arm/exynos4210.c
+++ b/hw/arm/exynos4210.c
@@ -166,17 +166,37 @@ static uint64_t exynos4210_calc_affinity(int cpu)
return (0x9 << ARM_AFF1_SHIFT) | cpu;
}
-static void pl330_create(uint32_t base, qemu_irq irq, int nreq)
+static DeviceState *pl330_create(uint32_t base, qemu_or_irq *orgate,
+ qemu_irq irq, int nreq, int nevents, int width)
{
SysBusDevice *busdev;
DeviceState *dev;
+ int i;
dev = qdev_create(NULL, "pl330");
+ qdev_prop_set_uint8(dev, "num_events", nevents);
+ qdev_prop_set_uint8(dev, "num_chnls", 8);
qdev_prop_set_uint8(dev, "num_periph_req", nreq);
+
+ qdev_prop_set_uint8(dev, "wr_cap", 4);
+ qdev_prop_set_uint8(dev, "wr_q_dep", 8);
+ qdev_prop_set_uint8(dev, "rd_cap", 4);
+ qdev_prop_set_uint8(dev, "rd_q_dep", 8);
+ qdev_prop_set_uint8(dev, "data_width", width);
+ qdev_prop_set_uint16(dev, "data_buffer_dep", width);
qdev_init_nofail(dev);
busdev = SYS_BUS_DEVICE(dev);
sysbus_mmio_map(busdev, 0, base);
- sysbus_connect_irq(busdev, 0, irq);
+
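+ /* Combine the controller's nevents + 1 interrupt outputs into a single
+  * line via an OR gate. */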
+ object_property_set_int(OBJECT(orgate), nevents + 1, "num-lines",
+ &error_abort);
+ object_property_set_bool(OBJECT(orgate), true, "realized", &error_abort);
+
+ for (i = 0; i < nevents + 1; i++) {
+ sysbus_connect_irq(busdev, i, qdev_get_gpio_in(DEVICE(orgate), i));
+ }
+ qdev_connect_gpio_out(DEVICE(orgate), 0, irq);
+ return dev;
}
static void exynos4210_realize(DeviceState *socdev, Error **errp)
@@ -185,7 +205,7 @@ static void exynos4210_realize(DeviceState *socdev, Error **errp)
MemoryRegion *system_mem = get_system_memory();
qemu_irq gate_irq[EXYNOS4210_NCPUS][EXYNOS4210_IRQ_GATE_NINPUTS];
SysBusDevice *busdev;
- DeviceState *dev;
+ DeviceState *dev, *uart[4], *pl330[3];
int i, n;
for (n = 0; n < EXYNOS4210_NCPUS; n++) {
@@ -371,19 +391,19 @@ static void exynos4210_realize(DeviceState *socdev, Error **errp)
/*** UARTs ***/
- exynos4210_uart_create(EXYNOS4210_UART0_BASE_ADDR,
+ uart[0] = exynos4210_uart_create(EXYNOS4210_UART0_BASE_ADDR,
EXYNOS4210_UART0_FIFO_SIZE, 0, serial_hd(0),
s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 0)]);
- exynos4210_uart_create(EXYNOS4210_UART1_BASE_ADDR,
+ uart[1] = exynos4210_uart_create(EXYNOS4210_UART1_BASE_ADDR,
EXYNOS4210_UART1_FIFO_SIZE, 1, serial_hd(1),
s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 1)]);
- exynos4210_uart_create(EXYNOS4210_UART2_BASE_ADDR,
+ uart[2] = exynos4210_uart_create(EXYNOS4210_UART2_BASE_ADDR,
EXYNOS4210_UART2_FIFO_SIZE, 2, serial_hd(2),
s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 2)]);
- exynos4210_uart_create(EXYNOS4210_UART3_BASE_ADDR,
+ uart[3] = exynos4210_uart_create(EXYNOS4210_UART3_BASE_ADDR,
EXYNOS4210_UART3_FIFO_SIZE, 3, serial_hd(3),
s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 3)]);
@@ -431,12 +451,42 @@ static void exynos4210_realize(DeviceState *socdev, Error **errp)
s->irq_table[exynos4210_get_irq(28, 3)]);
/*** DMA controllers ***/
- pl330_create(EXYNOS4210_PL330_BASE0_ADDR,
- qemu_irq_invert(s->irq_table[exynos4210_get_irq(35, 1)]), 32);
- pl330_create(EXYNOS4210_PL330_BASE1_ADDR,
- qemu_irq_invert(s->irq_table[exynos4210_get_irq(36, 1)]), 32);
- pl330_create(EXYNOS4210_PL330_BASE2_ADDR,
- qemu_irq_invert(s->irq_table[exynos4210_get_irq(34, 1)]), 1);
+ pl330[0] = pl330_create(EXYNOS4210_PL330_BASE0_ADDR,
+ &s->pl330_irq_orgate[0],
+ s->irq_table[exynos4210_get_irq(21, 0)],
+ 32, 32, 32);
+ pl330[1] = pl330_create(EXYNOS4210_PL330_BASE1_ADDR,
+ &s->pl330_irq_orgate[1],
+ s->irq_table[exynos4210_get_irq(21, 1)],
+ 32, 32, 32);
+ pl330[2] = pl330_create(EXYNOS4210_PL330_BASE2_ADDR,
+ &s->pl330_irq_orgate[2],
+ s->irq_table[exynos4210_get_irq(20, 1)],
+ 1, 31, 64);
+
+ sysbus_connect_irq(SYS_BUS_DEVICE(uart[0]), 1,
+ qdev_get_gpio_in(pl330[0], 15));
+ sysbus_connect_irq(SYS_BUS_DEVICE(uart[1]), 1,
+ qdev_get_gpio_in(pl330[1], 15));
+ sysbus_connect_irq(SYS_BUS_DEVICE(uart[2]), 1,
+ qdev_get_gpio_in(pl330[0], 17));
+ sysbus_connect_irq(SYS_BUS_DEVICE(uart[3]), 1,
+ qdev_get_gpio_in(pl330[1], 17));
+}
+
+static void exynos4210_init(Object *obj)
+{
+ Exynos4210State *s = EXYNOS4210_SOC(obj);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(s->pl330_irq_orgate); i++) {
+ char *name = g_strdup_printf("pl330-irq-orgate%d", i);
+ qemu_or_irq *orgate = &s->pl330_irq_orgate[i];
+
+ object_initialize_child(obj, name, orgate, sizeof(*orgate),
+ TYPE_OR_IRQ, &error_abort, NULL);
+ g_free(name);
+ }
}
static void exynos4210_class_init(ObjectClass *klass, void *data)
@@ -450,6 +500,7 @@ static const TypeInfo exynos4210_info = {
.name = TYPE_EXYNOS4210_SOC,
.parent = TYPE_SYS_BUS_DEVICE,
.instance_size = sizeof(Exynos4210State),
+ .instance_init = exynos4210_init,
.class_init = exynos4210_class_init,
};
diff --git a/hw/arm/fsl-imx25.c b/hw/arm/fsl-imx25.c
index 3cb5a8fdfd..da3471b395 100644
--- a/hw/arm/fsl-imx25.c
+++ b/hw/arm/fsl-imx25.c
@@ -62,6 +62,9 @@ static void fsl_imx25_init(Object *obj)
sysbus_init_child_obj(obj, "fec", &s->fec, sizeof(s->fec), TYPE_IMX_FEC);
+ sysbus_init_child_obj(obj, "rngc", &s->rngc, sizeof(s->rngc),
+ TYPE_IMX_RNGC);
+
for (i = 0; i < FSL_IMX25_NUM_I2CS; i++) {
sysbus_init_child_obj(obj, "i2c[*]", &s->i2c[i], sizeof(s->i2c[i]),
TYPE_IMX_I2C);
@@ -188,6 +191,14 @@ static void fsl_imx25_realize(DeviceState *dev, Error **errp)
sysbus_connect_irq(SYS_BUS_DEVICE(&s->fec), 0,
qdev_get_gpio_in(DEVICE(&s->avic), FSL_IMX25_FEC_IRQ));
+ object_property_set_bool(OBJECT(&s->rngc), true, "realized", &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+ sysbus_mmio_map(SYS_BUS_DEVICE(&s->rngc), 0, FSL_IMX25_RNGC_ADDR);
+ sysbus_connect_irq(SYS_BUS_DEVICE(&s->rngc), 0,
+ qdev_get_gpio_in(DEVICE(&s->avic), FSL_IMX25_RNGC_IRQ));
/* Initialize all I2C */
for (i = 0; i < FSL_IMX25_NUM_I2CS; i++) {
diff --git a/hw/arm/netduinoplus2.c b/hw/arm/netduinoplus2.c
new file mode 100644
index 0000000000..e5e247edbe
--- /dev/null
+++ b/hw/arm/netduinoplus2.c
@@ -0,0 +1,52 @@
+/*
+ * Netduino Plus 2 Machine Model
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/boards.h"
+#include "hw/qdev-properties.h"
+#include "qemu/error-report.h"
+#include "hw/arm/stm32f405_soc.h"
+#include "hw/arm/boot.h"
+
+static void netduinoplus2_init(MachineState *machine)
+{
+ DeviceState *dev;
+
+ dev = qdev_create(NULL, TYPE_STM32F405_SOC);
+ qdev_prop_set_string(dev, "cpu-type", ARM_CPU_TYPE_NAME("cortex-m4"));
+ object_property_set_bool(OBJECT(dev), true, "realized", &error_fatal);
+
+ armv7m_load_kernel(ARM_CPU(first_cpu),
+ machine->kernel_filename,
+ FLASH_SIZE);
+}
+
+static void netduinoplus2_machine_init(MachineClass *mc)
+{
+ mc->desc = "Netduino Plus 2 Machine";
+ mc->init = netduinoplus2_init;
+}
+
+DEFINE_MACHINE("netduinoplus2", netduinoplus2_machine_init)
diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c
index b198066b54..bb025e0bd0 100644
--- a/hw/arm/stellaris.c
+++ b/hw/arm/stellaris.c
@@ -708,7 +708,7 @@ static int stellaris_sys_init(uint32_t base, qemu_irq irq,
memory_region_init_io(&s->iomem, NULL, &ssys_ops, s, "ssys", 0x00001000);
memory_region_add_subregion(get_system_memory(), base, &s->iomem);
ssys_reset(s);
- vmstate_register(NULL, -1, &vmstate_stellaris_sys, s);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_stellaris_sys, s);
return 0;
}
diff --git a/hw/arm/stm32f405_soc.c b/hw/arm/stm32f405_soc.c
new file mode 100644
index 0000000000..f22516fdf7
--- /dev/null
+++ b/hw/arm/stm32f405_soc.c
@@ -0,0 +1,302 @@
+/*
+ * STM32F405 SoC
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
+#include "hw/arm/stm32f405_soc.h"
+#include "hw/misc/unimp.h"
+
+#define SYSCFG_ADD 0x40013800
+static const uint32_t usart_addr[] = { 0x40011000, 0x40004400, 0x40004800,
+ 0x40004C00, 0x40005000, 0x40011400,
+ 0x40007800, 0x40007C00 };
+/* At the moment only Timer 2 to 5 are modelled */
+static const uint32_t timer_addr[] = { 0x40000000, 0x40000400,
+ 0x40000800, 0x40000C00 };
+#define ADC_ADDR 0x40012000
+static const uint32_t spi_addr[] = { 0x40013000, 0x40003800, 0x40003C00,
+ 0x40013400, 0x40015000, 0x40015400 };
+#define EXTI_ADDR 0x40013C00
+
+#define SYSCFG_IRQ 71
+static const int usart_irq[] = { 37, 38, 39, 52, 53, 71, 82, 83 };
+static const int timer_irq[] = { 28, 29, 30, 50 };
+#define ADC_IRQ 18
+static const int spi_irq[] = { 35, 36, 51, 0, 0, 0 };
+static const int exti_irq[] = { 6, 7, 8, 9, 10, 23, 23, 23, 23, 23, 40,
+ 40, 40, 40, 40, 40} ;
+
+
+static void stm32f405_soc_initfn(Object *obj)
+{
+ STM32F405State *s = STM32F405_SOC(obj);
+ int i;
+
+ sysbus_init_child_obj(obj, "armv7m", &s->armv7m, sizeof(s->armv7m),
+ TYPE_ARMV7M);
+
+ sysbus_init_child_obj(obj, "syscfg", &s->syscfg, sizeof(s->syscfg),
+ TYPE_STM32F4XX_SYSCFG);
+
+ for (i = 0; i < STM_NUM_USARTS; i++) {
+ sysbus_init_child_obj(obj, "usart[*]", &s->usart[i],
+ sizeof(s->usart[i]), TYPE_STM32F2XX_USART);
+ }
+
+ for (i = 0; i < STM_NUM_TIMERS; i++) {
+ sysbus_init_child_obj(obj, "timer[*]", &s->timer[i],
+ sizeof(s->timer[i]), TYPE_STM32F2XX_TIMER);
+ }
+
+ for (i = 0; i < STM_NUM_ADCS; i++) {
+ sysbus_init_child_obj(obj, "adc[*]", &s->adc[i], sizeof(s->adc[i]),
+ TYPE_STM32F2XX_ADC);
+ }
+
+ for (i = 0; i < STM_NUM_SPIS; i++) {
+ sysbus_init_child_obj(obj, "spi[*]", &s->spi[i], sizeof(s->spi[i]),
+ TYPE_STM32F2XX_SPI);
+ }
+
+ sysbus_init_child_obj(obj, "exti", &s->exti, sizeof(s->exti),
+ TYPE_STM32F4XX_EXTI);
+}
+
+static void stm32f405_soc_realize(DeviceState *dev_soc, Error **errp)
+{
+ STM32F405State *s = STM32F405_SOC(dev_soc);
+ MemoryRegion *system_memory = get_system_memory();
+ DeviceState *dev, *armv7m;
+ SysBusDevice *busdev;
+ Error *err = NULL;
+ int i;
+
+ memory_region_init_ram(&s->flash, NULL, "STM32F405.flash", FLASH_SIZE,
+ &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
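+ /* Alias the flash at address 0 so the core fetches its vector table
+  * from it after reset. */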
+ memory_region_init_alias(&s->flash_alias, NULL, "STM32F405.flash.alias",
+ &s->flash, 0, FLASH_SIZE);
+
+ memory_region_set_readonly(&s->flash, true);
+ memory_region_set_readonly(&s->flash_alias, true);
+
+ memory_region_add_subregion(system_memory, FLASH_BASE_ADDRESS, &s->flash);
+ memory_region_add_subregion(system_memory, 0, &s->flash_alias);
+
+ memory_region_init_ram(&s->sram, NULL, "STM32F405.sram", SRAM_SIZE,
+ &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ memory_region_add_subregion(system_memory, SRAM_BASE_ADDRESS, &s->sram);
+
+ armv7m = DEVICE(&s->armv7m);
+ qdev_prop_set_uint32(armv7m, "num-irq", 96);
+ qdev_prop_set_string(armv7m, "cpu-type", s->cpu_type);
+ qdev_prop_set_bit(armv7m, "enable-bitband", true);
+ object_property_set_link(OBJECT(&s->armv7m), OBJECT(system_memory),
+ "memory", &error_abort);
+ object_property_set_bool(OBJECT(&s->armv7m), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+
+ /* System configuration controller */
+ dev = DEVICE(&s->syscfg);
+ object_property_set_bool(OBJECT(&s->syscfg), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ busdev = SYS_BUS_DEVICE(dev);
+ sysbus_mmio_map(busdev, 0, SYSCFG_ADD);
+ sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, SYSCFG_IRQ));
+
+ /* Attach UART (uses USART registers) and USART controllers */
+ for (i = 0; i < STM_NUM_USARTS; i++) {
+ dev = DEVICE(&(s->usart[i]));
+ qdev_prop_set_chr(dev, "chardev", serial_hd(i));
+ object_property_set_bool(OBJECT(&s->usart[i]), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ busdev = SYS_BUS_DEVICE(dev);
+ sysbus_mmio_map(busdev, 0, usart_addr[i]);
+ sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, usart_irq[i]));
+ }
+
+ /* Timer 2 to 5 */
+ for (i = 0; i < STM_NUM_TIMERS; i++) {
+ dev = DEVICE(&(s->timer[i]));
+ qdev_prop_set_uint64(dev, "clock-frequency", 1000000000);
+ object_property_set_bool(OBJECT(&s->timer[i]), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ busdev = SYS_BUS_DEVICE(dev);
+ sysbus_mmio_map(busdev, 0, timer_addr[i]);
+ sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, timer_irq[i]));
+ }
+
+ /* ADC device, the IRQs are ORed together */
+ object_initialize_child(OBJECT(s), "adc-orirq", &s->adc_irqs,
+ sizeof(s->adc_irqs), TYPE_OR_IRQ,
+ &err, NULL);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ object_property_set_int(OBJECT(&s->adc_irqs), STM_NUM_ADCS,
+ "num-lines", &err);
+ object_property_set_bool(OBJECT(&s->adc_irqs), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ qdev_connect_gpio_out(DEVICE(&s->adc_irqs), 0,
+ qdev_get_gpio_in(armv7m, ADC_IRQ));
+
+ for (i = 0; i < STM_NUM_ADCS; i++) {
+ dev = DEVICE(&(s->adc[i]));
+ object_property_set_bool(OBJECT(&s->adc[i]), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ busdev = SYS_BUS_DEVICE(dev);
+ sysbus_mmio_map(busdev, 0, ADC_ADDR);
+ sysbus_connect_irq(busdev, 0,
+ qdev_get_gpio_in(DEVICE(&s->adc_irqs), i));
+ }
+
+ /* SPI devices */
+ for (i = 0; i < STM_NUM_SPIS; i++) {
+ dev = DEVICE(&(s->spi[i]));
+ object_property_set_bool(OBJECT(&s->spi[i]), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ busdev = SYS_BUS_DEVICE(dev);
+ sysbus_mmio_map(busdev, 0, spi_addr[i]);
+ sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, spi_irq[i]));
+ }
+
+ /* EXTI device */
+ dev = DEVICE(&s->exti);
+ object_property_set_bool(OBJECT(&s->exti), true, "realized", &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ busdev = SYS_BUS_DEVICE(dev);
+ sysbus_mmio_map(busdev, 0, EXTI_ADDR);
+ for (i = 0; i < 16; i++) {
+ sysbus_connect_irq(busdev, i, qdev_get_gpio_in(armv7m, exti_irq[i]));
+ }
+ for (i = 0; i < 16; i++) {
+ qdev_connect_gpio_out(DEVICE(&s->syscfg), i, qdev_get_gpio_in(dev, i));
+ }
+
+ create_unimplemented_device("timer[7]", 0x40001400, 0x400);
+ create_unimplemented_device("timer[12]", 0x40001800, 0x400);
+ create_unimplemented_device("timer[6]", 0x40001000, 0x400);
+ create_unimplemented_device("timer[13]", 0x40001C00, 0x400);
+ create_unimplemented_device("timer[14]", 0x40002000, 0x400);
+ create_unimplemented_device("RTC and BKP", 0x40002800, 0x400);
+ create_unimplemented_device("WWDG", 0x40002C00, 0x400);
+ create_unimplemented_device("IWDG", 0x40003000, 0x400);
+ create_unimplemented_device("I2S2ext", 0x40003000, 0x400);
+ create_unimplemented_device("I2S3ext", 0x40004000, 0x400);
+ create_unimplemented_device("I2C1", 0x40005400, 0x400);
+ create_unimplemented_device("I2C2", 0x40005800, 0x400);
+ create_unimplemented_device("I2C3", 0x40005C00, 0x400);
+ create_unimplemented_device("CAN1", 0x40006400, 0x400);
+ create_unimplemented_device("CAN2", 0x40006800, 0x400);
+ create_unimplemented_device("PWR", 0x40007000, 0x400);
+ create_unimplemented_device("DAC", 0x40007400, 0x400);
+ create_unimplemented_device("timer[1]", 0x40010000, 0x400);
+ create_unimplemented_device("timer[8]", 0x40010400, 0x400);
+ create_unimplemented_device("SDIO", 0x40012C00, 0x400);
+ create_unimplemented_device("timer[9]", 0x40014000, 0x400);
+ create_unimplemented_device("timer[10]", 0x40014400, 0x400);
+ create_unimplemented_device("timer[11]", 0x40014800, 0x400);
+ create_unimplemented_device("GPIOA", 0x40020000, 0x400);
+ create_unimplemented_device("GPIOB", 0x40020400, 0x400);
+ create_unimplemented_device("GPIOC", 0x40020800, 0x400);
+ create_unimplemented_device("GPIOD", 0x40020C00, 0x400);
+ create_unimplemented_device("GPIOE", 0x40021000, 0x400);
+ create_unimplemented_device("GPIOF", 0x40021400, 0x400);
+ create_unimplemented_device("GPIOG", 0x40021800, 0x400);
+ create_unimplemented_device("GPIOH", 0x40021C00, 0x400);
+ create_unimplemented_device("GPIOI", 0x40022000, 0x400);
+ create_unimplemented_device("CRC", 0x40023000, 0x400);
+ create_unimplemented_device("RCC", 0x40023800, 0x400);
+ create_unimplemented_device("Flash Int", 0x40023C00, 0x400);
+ create_unimplemented_device("BKPSRAM", 0x40024000, 0x400);
+ create_unimplemented_device("DMA1", 0x40026000, 0x400);
+ create_unimplemented_device("DMA2", 0x40026400, 0x400);
+ create_unimplemented_device("Ethernet", 0x40028000, 0x1400);
+ create_unimplemented_device("USB OTG HS", 0x40040000, 0x30000);
+ create_unimplemented_device("USB OTG FS", 0x50000000, 0x31000);
+ create_unimplemented_device("DCMI", 0x50050000, 0x400);
+ create_unimplemented_device("RNG", 0x50060800, 0x400);
+}
+
+static Property stm32f405_soc_properties[] = {
+ DEFINE_PROP_STRING("cpu-type", STM32F405State, cpu_type),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void stm32f405_soc_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = stm32f405_soc_realize;
+ dc->props = stm32f405_soc_properties;
+ /* No vmstate or reset required: device has no internal state */
+}
+
+static const TypeInfo stm32f405_soc_info = {
+ .name = TYPE_STM32F405_SOC,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(STM32F405State),
+ .instance_init = stm32f405_soc_initfn,
+ .class_init = stm32f405_soc_class_init,
+};
+
+static void stm32f405_soc_types(void)
+{
+ type_register_static(&stm32f405_soc_info);
+}
+
+type_init(stm32f405_soc_types)
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 39ab5f47e0..656b0081c2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1934,7 +1934,6 @@ static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
static void virt_memory_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
- HotplugHandlerClass *hhc;
VirtMachineState *vms = VIRT_MACHINE(hotplug_dev);
Error *local_err = NULL;
@@ -1943,8 +1942,9 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev,
goto out;
}
- hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev);
- hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &error_abort);
+ hotplug_handler_plug(HOTPLUG_HANDLER(vms->acpi_dev),
+ dev, &error_abort);
+
out:
error_propagate(errp, local_err);
}
diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c
index 7e5c5ce789..20d8509107 100644
--- a/hw/char/exynos4210_uart.c
+++ b/hw/char/exynos4210_uart.c
@@ -24,6 +24,7 @@
#include "migration/vmstate.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
+#include "qemu/timer.h"
#include "chardev/char-fe.h"
#include "chardev/char-serial.h"
@@ -31,45 +32,7 @@
#include "hw/irq.h"
#include "hw/qdev-properties.h"
-#undef DEBUG_UART
-#undef DEBUG_UART_EXTEND
-#undef DEBUG_IRQ
-#undef DEBUG_Rx_DATA
-#undef DEBUG_Tx_DATA
-
-#define DEBUG_UART 0
-#define DEBUG_UART_EXTEND 0
-#define DEBUG_IRQ 0
-#define DEBUG_Rx_DATA 0
-#define DEBUG_Tx_DATA 0
-
-#if DEBUG_UART
-#define PRINT_DEBUG(fmt, args...) \
- do { \
- fprintf(stderr, " [%s:%d] "fmt, __func__, __LINE__, ##args); \
- } while (0)
-
-#if DEBUG_UART_EXTEND
-#define PRINT_DEBUG_EXTEND(fmt, args...) \
- do { \
- fprintf(stderr, " [%s:%d] "fmt, __func__, __LINE__, ##args); \
- } while (0)
-#else
-#define PRINT_DEBUG_EXTEND(fmt, args...) \
- do {} while (0)
-#endif /* EXTEND */
-
-#else
-#define PRINT_DEBUG(fmt, args...) \
- do {} while (0)
-#define PRINT_DEBUG_EXTEND(fmt, args...) \
- do {} while (0)
-#endif
-
-#define PRINT_ERROR(fmt, args...) \
- do { \
- fprintf(stderr, " [%s:%d] "fmt, __func__, __LINE__, ##args); \
- } while (0)
+#include "trace.h"
/*
* Offsets for UART registers relative to SFR base address
@@ -156,6 +119,7 @@ static const Exynos4210UartReg exynos4210_uart_regs[] = {
#define ULCON_STOP_BIT_SHIFT 1
/* UART Tx/Rx Status */
+#define UTRSTAT_Rx_TIMEOUT 0x8
#define UTRSTAT_TRANSMITTER_EMPTY 0x4
#define UTRSTAT_Tx_BUFFER_EMPTY 0x2
#define UTRSTAT_Rx_BUFFER_DATA_READY 0x1
@@ -185,16 +149,19 @@ typedef struct Exynos4210UartState {
Exynos4210UartFIFO rx;
Exynos4210UartFIFO tx;
+ QEMUTimer *fifo_timeout_timer;
+ uint64_t wordtime; /* word time in ns */
+
CharBackend chr;
qemu_irq irq;
+ qemu_irq dmairq;
uint32_t channel;
} Exynos4210UartState;
-#if DEBUG_UART
-/* Used only for debugging inside PRINT_DEBUG_... macros */
+/* Used only for tracing */
static const char *exynos4210_uart_regname(hwaddr offset)
{
@@ -208,7 +175,6 @@ static const char *exynos4210_uart_regname(hwaddr offset)
return NULL;
}
-#endif
static void fifo_store(Exynos4210UartFIFO *q, uint8_t ch)
@@ -249,15 +215,12 @@ static void fifo_reset(Exynos4210UartFIFO *q)
q->rp = 0;
}
-static uint32_t exynos4210_uart_Tx_FIFO_trigger_level(const Exynos4210UartState *s)
+static uint32_t exynos4210_uart_FIFO_trigger_level(uint32_t channel,
+ uint32_t reg)
{
- uint32_t level = 0;
- uint32_t reg;
-
- reg = (s->reg[I_(UFCON)] & UFCON_Tx_FIFO_TRIGGER_LEVEL) >>
- UFCON_Tx_FIFO_TRIGGER_LEVEL_SHIFT;
+ uint32_t level;
- switch (s->channel) {
+ switch (channel) {
case 0:
level = reg * 32;
break;
@@ -271,12 +234,52 @@ static uint32_t exynos4210_uart_Tx_FIFO_trigger_level(const Exynos4210UartState
break;
default:
level = 0;
- PRINT_ERROR("Wrong UART channel number: %d\n", s->channel);
+ trace_exynos_uart_channel_error(channel);
+ break;
}
-
return level;
}
+static uint32_t
+exynos4210_uart_Tx_FIFO_trigger_level(const Exynos4210UartState *s)
+{
+ uint32_t reg;
+
+ reg = (s->reg[I_(UFCON)] & UFCON_Tx_FIFO_TRIGGER_LEVEL) >>
+ UFCON_Tx_FIFO_TRIGGER_LEVEL_SHIFT;
+
+ return exynos4210_uart_FIFO_trigger_level(s->channel, reg);
+}
+
+static uint32_t
+exynos4210_uart_Rx_FIFO_trigger_level(const Exynos4210UartState *s)
+{
+ uint32_t reg;
+
+ reg = ((s->reg[I_(UFCON)] & UFCON_Rx_FIFO_TRIGGER_LEVEL) >>
+ UFCON_Rx_FIFO_TRIGGER_LEVEL_SHIFT) + 1;
+
+ return exynos4210_uart_FIFO_trigger_level(s->channel, reg);
+}
+
+/*
+ * Update Rx DMA busy signal if Rx DMA is enabled. For simplicity,
+ * mark DMA as busy if DMA is enabled and the receive buffer is empty.
+ */
+static void exynos4210_uart_update_dmabusy(Exynos4210UartState *s)
+{
+ bool rx_dma_enabled = (s->reg[I_(UCON)] & 0x03) == 0x02;
+ uint32_t count = fifo_elements_number(&s->rx);
+
+ if (rx_dma_enabled && !count) {
+ qemu_irq_raise(s->dmairq);
+ trace_exynos_uart_dmabusy(s->channel);
+ } else {
+ qemu_irq_lower(s->dmairq);
+ trace_exynos_uart_dmaready(s->channel);
+ }
+}
+
static void exynos4210_uart_update_irq(Exynos4210UartState *s)
{
/*
@@ -284,27 +287,53 @@ static void exynos4210_uart_update_irq(Exynos4210UartState *s)
* transmit FIFO is smaller than the trigger level.
*/
if (s->reg[I_(UFCON)] & UFCON_FIFO_ENABLE) {
-
uint32_t count = (s->reg[I_(UFSTAT)] & UFSTAT_Tx_FIFO_COUNT) >>
UFSTAT_Tx_FIFO_COUNT_SHIFT;
if (count <= exynos4210_uart_Tx_FIFO_trigger_level(s)) {
s->reg[I_(UINTSP)] |= UINTSP_TXD;
}
+
+ /*
+ * Rx interrupt if trigger level is reached or if rx timeout
+ * interrupt is disabled and there is data in the receive buffer
+ */
+ count = fifo_elements_number(&s->rx);
+ if ((count && !(s->reg[I_(UCON)] & 0x80)) ||
+ count >= exynos4210_uart_Rx_FIFO_trigger_level(s)) {
+ exynos4210_uart_update_dmabusy(s);
+ s->reg[I_(UINTSP)] |= UINTSP_RXD;
+ timer_del(s->fifo_timeout_timer);
+ }
+ } else if (s->reg[I_(UTRSTAT)] & UTRSTAT_Rx_BUFFER_DATA_READY) {
+ exynos4210_uart_update_dmabusy(s);
+ s->reg[I_(UINTSP)] |= UINTSP_RXD;
}
s->reg[I_(UINTP)] = s->reg[I_(UINTSP)] & ~s->reg[I_(UINTM)];
if (s->reg[I_(UINTP)]) {
qemu_irq_raise(s->irq);
-
-#if DEBUG_IRQ
- fprintf(stderr, "UART%d: IRQ has been raised: %08x\n",
- s->channel, s->reg[I_(UINTP)]);
-#endif
-
+ trace_exynos_uart_irq_raised(s->channel, s->reg[I_(UINTP)]);
} else {
qemu_irq_lower(s->irq);
+ trace_exynos_uart_irq_lowered(s->channel);
+ }
+}
+
+static void exynos4210_uart_timeout_int(void *opaque)
+{
+ Exynos4210UartState *s = opaque;
+
+ trace_exynos_uart_rx_timeout(s->channel, s->reg[I_(UTRSTAT)],
+ s->reg[I_(UINTSP)]);
+
+ if ((s->reg[I_(UTRSTAT)] & UTRSTAT_Rx_BUFFER_DATA_READY) ||
+ (s->reg[I_(UCON)] & (1 << 11))) {
+ s->reg[I_(UINTSP)] |= UINTSP_RXD;
+ s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_TIMEOUT;
+ exynos4210_uart_update_dmabusy(s);
+ exynos4210_uart_update_irq(s);
}
}
@@ -346,10 +375,24 @@ static void exynos4210_uart_update_parameters(Exynos4210UartState *s)
ssp.data_bits = data_bits;
ssp.stop_bits = stop_bits;
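+ /* Time to transfer one character (start bit + data bits + stop bits)
+  * at the current baud rate. */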
+ s->wordtime = NANOSECONDS_PER_SECOND * (data_bits + stop_bits + 1) / speed;
+
qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_PARAMS, &ssp);
- PRINT_DEBUG("UART%d: speed: %d, parity: %c, data: %d, stop: %d\n",
- s->channel, speed, parity, data_bits, stop_bits);
+ trace_exynos_uart_update_params(
+ s->channel, speed, parity, data_bits, stop_bits, s->wordtime);
+}
+
+static void exynos4210_uart_rx_timeout_set(Exynos4210UartState *s)
+{
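+ /* UCON[7] enables the Rx timeout; the interval is UCON[15:12] word times. */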
+ if (s->reg[I_(UCON)] & 0x80) {
+ uint32_t timeout = ((s->reg[I_(UCON)] >> 12) & 0x0f) * s->wordtime;
+
+ timer_mod(s->fifo_timeout_timer,
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + timeout);
+ } else {
+ timer_del(s->fifo_timeout_timer);
+ }
}
static void exynos4210_uart_write(void *opaque, hwaddr offset,
@@ -358,8 +401,8 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset,
Exynos4210UartState *s = (Exynos4210UartState *)opaque;
uint8_t ch;
- PRINT_DEBUG_EXTEND("UART%d: <0x%04x> %s <- 0x%08llx\n", s->channel,
- offset, exynos4210_uart_regname(offset), (long long unsigned int)val);
+ trace_exynos_uart_write(s->channel, offset,
+ exynos4210_uart_regname(offset), val);
switch (offset) {
case ULCON:
@@ -373,12 +416,12 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset,
if (val & UFCON_Rx_FIFO_RESET) {
fifo_reset(&s->rx);
s->reg[I_(UFCON)] &= ~UFCON_Rx_FIFO_RESET;
- PRINT_DEBUG("UART%d: Rx FIFO Reset\n", s->channel);
+ trace_exynos_uart_rx_fifo_reset(s->channel);
}
if (val & UFCON_Tx_FIFO_RESET) {
fifo_reset(&s->tx);
s->reg[I_(UFCON)] &= ~UFCON_Tx_FIFO_RESET;
- PRINT_DEBUG("UART%d: Tx FIFO Reset\n", s->channel);
+ trace_exynos_uart_tx_fifo_reset(s->channel);
}
break;
@@ -390,9 +433,7 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset,
/* XXX this blocks entire thread. Rewrite to use
* qemu_chr_fe_write and background I/O callbacks */
qemu_chr_fe_write_all(&s->chr, &ch, 1);
-#if DEBUG_Tx_DATA
- fprintf(stderr, "%c", ch);
-#endif
+ trace_exynos_uart_tx(s->channel, ch);
s->reg[I_(UTRSTAT)] |= UTRSTAT_TRANSMITTER_EMPTY |
UTRSTAT_Tx_BUFFER_EMPTY;
s->reg[I_(UINTSP)] |= UINTSP_TXD;
@@ -403,16 +444,19 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset,
case UINTP:
s->reg[I_(UINTP)] &= ~val;
s->reg[I_(UINTSP)] &= ~val;
- PRINT_DEBUG("UART%d: UINTP [%04x] have been cleared: %08x\n",
- s->channel, offset, s->reg[I_(UINTP)]);
+ trace_exynos_uart_intclr(s->channel, s->reg[I_(UINTP)]);
exynos4210_uart_update_irq(s);
break;
case UTRSTAT:
+ if (val & UTRSTAT_Rx_TIMEOUT) {
+ s->reg[I_(UTRSTAT)] &= ~UTRSTAT_Rx_TIMEOUT;
+ }
+ break;
case UERSTAT:
case UFSTAT:
case UMSTAT:
case URXH:
- PRINT_DEBUG("UART%d: Trying to write into RO register: %s [%04x]\n",
+ trace_exynos_uart_ro_write(
s->channel, exynos4210_uart_regname(offset), offset);
break;
case UINTSP:
@@ -429,6 +473,7 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset,
break;
}
}
+
static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset,
unsigned size)
{
@@ -439,6 +484,8 @@ static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset,
case UERSTAT: /* Read Only */
res = s->reg[I_(UERSTAT)];
s->reg[I_(UERSTAT)] = 0;
+ trace_exynos_uart_read(s->channel, offset,
+ exynos4210_uart_regname(offset), res);
return res;
case UFSTAT: /* Read Only */
s->reg[I_(UFSTAT)] = fifo_elements_number(&s->rx) & 0xff;
@@ -446,20 +493,22 @@ static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset,
s->reg[I_(UFSTAT)] |= UFSTAT_Rx_FIFO_FULL;
s->reg[I_(UFSTAT)] &= ~0xff;
}
+ trace_exynos_uart_read(s->channel, offset,
+ exynos4210_uart_regname(offset),
+ s->reg[I_(UFSTAT)]);
return s->reg[I_(UFSTAT)];
case URXH:
if (s->reg[I_(UFCON)] & UFCON_FIFO_ENABLE) {
if (fifo_elements_number(&s->rx)) {
res = fifo_retrieve(&s->rx);
-#if DEBUG_Rx_DATA
- fprintf(stderr, "%c", res);
-#endif
+ trace_exynos_uart_rx(s->channel, res);
if (!fifo_elements_number(&s->rx)) {
s->reg[I_(UTRSTAT)] &= ~UTRSTAT_Rx_BUFFER_DATA_READY;
} else {
s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY;
}
} else {
+ trace_exynos_uart_rx_error(s->channel);
s->reg[I_(UINTSP)] |= UINTSP_ERROR;
exynos4210_uart_update_irq(s);
res = 0;
@@ -468,15 +517,23 @@ static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset,
s->reg[I_(UTRSTAT)] &= ~UTRSTAT_Rx_BUFFER_DATA_READY;
res = s->reg[I_(URXH)];
}
+ exynos4210_uart_update_dmabusy(s);
+ trace_exynos_uart_read(s->channel, offset,
+ exynos4210_uart_regname(offset), res);
return res;
case UTXH:
- PRINT_DEBUG("UART%d: Trying to read from WO register: %s [%04x]\n",
- s->channel, exynos4210_uart_regname(offset), offset);
+ trace_exynos_uart_wo_read(s->channel, exynos4210_uart_regname(offset),
+ offset);
break;
default:
+ trace_exynos_uart_read(s->channel, offset,
+ exynos4210_uart_regname(offset),
+ s->reg[I_(offset)]);
return s->reg[I_(offset)];
}
+ trace_exynos_uart_read(s->channel, offset, exynos4210_uart_regname(offset),
+ 0);
return 0;
}
@@ -497,7 +554,6 @@ static int exynos4210_uart_can_receive(void *opaque)
return fifo_empty_elements_number(&s->rx);
}
-
static void exynos4210_uart_receive(void *opaque, const uint8_t *buf, int size)
{
Exynos4210UartState *s = (Exynos4210UartState *)opaque;
@@ -505,24 +561,17 @@ static void exynos4210_uart_receive(void *opaque, const uint8_t *buf, int size)
if (s->reg[I_(UFCON)] & UFCON_FIFO_ENABLE) {
if (fifo_empty_elements_number(&s->rx) < size) {
- for (i = 0; i < fifo_empty_elements_number(&s->rx); i++) {
- fifo_store(&s->rx, buf[i]);
- }
+ size = fifo_empty_elements_number(&s->rx);
s->reg[I_(UINTSP)] |= UINTSP_ERROR;
- s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY;
- } else {
- for (i = 0; i < size; i++) {
- fifo_store(&s->rx, buf[i]);
- }
- s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY;
}
- /* XXX: Around here we maybe should check Rx trigger level */
- s->reg[I_(UINTSP)] |= UINTSP_RXD;
+ for (i = 0; i < size; i++) {
+ fifo_store(&s->rx, buf[i]);
+ }
+ exynos4210_uart_rx_timeout_set(s);
} else {
s->reg[I_(URXH)] = buf[0];
- s->reg[I_(UINTSP)] |= UINTSP_RXD;
- s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY;
}
+ s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY;
exynos4210_uart_update_irq(s);
}
@@ -555,13 +604,24 @@ static void exynos4210_uart_reset(DeviceState *dev)
fifo_reset(&s->rx);
fifo_reset(&s->tx);
- PRINT_DEBUG("UART%d: Rx FIFO size: %d\n", s->channel, s->rx.size);
+ trace_exynos_uart_rxsize(s->channel, s->rx.size);
+}
+
+static int exynos4210_uart_post_load(void *opaque, int version_id)
+{
+ Exynos4210UartState *s = (Exynos4210UartState *)opaque;
+
+ exynos4210_uart_update_parameters(s);
+ exynos4210_uart_rx_timeout_set(s);
+
+ return 0;
}
static const VMStateDescription vmstate_exynos4210_uart_fifo = {
.name = "exynos4210.uart.fifo",
.version_id = 1,
.minimum_version_id = 1,
+ .post_load = exynos4210_uart_post_load,
.fields = (VMStateField[]) {
VMSTATE_UINT32(sp, Exynos4210UartFIFO),
VMSTATE_UINT32(rp, Exynos4210UartFIFO),
@@ -614,12 +674,17 @@ static void exynos4210_uart_init(Object *obj)
SysBusDevice *dev = SYS_BUS_DEVICE(obj);
Exynos4210UartState *s = EXYNOS4210_UART(dev);
+ s->fifo_timeout_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+ exynos4210_uart_timeout_int, s);
+ s->wordtime = NANOSECONDS_PER_SECOND * 10 / 9600;
+
/* memory mapping */
memory_region_init_io(&s->iomem, obj, &exynos4210_uart_ops, s,
"exynos4210.uart", EXYNOS4210_UART_REGS_MEM_SIZE);
sysbus_init_mmio(dev, &s->iomem);
sysbus_init_irq(dev, &s->irq);
+ sysbus_init_irq(dev, &s->dmairq);
}
static void exynos4210_uart_realize(DeviceState *dev, Error **errp)
diff --git a/hw/char/trace-events b/hw/char/trace-events
index 2ce7f2f998..6f938301d9 100644
--- a/hw/char/trace-events
+++ b/hw/char/trace-events
@@ -77,3 +77,23 @@ cmsdk_apb_uart_set_params(int speed) "CMSDK APB UART: params set to %d 8N1"
# nrf51_uart.c
nrf51_uart_read(uint64_t addr, uint64_t r, unsigned int size) "addr 0x%" PRIx64 " value 0x%" PRIx64 " size %u"
nrf51_uart_write(uint64_t addr, uint64_t value, unsigned int size) "addr 0x%" PRIx64 " value 0x%" PRIx64 " size %u"
+
+# exynos4210_uart.c
+exynos_uart_dmabusy(uint32_t channel) "UART%d: DMA busy (Rx buffer empty)"
+exynos_uart_dmaready(uint32_t channel) "UART%d: DMA ready"
+exynos_uart_irq_raised(uint32_t channel, uint32_t reg) "UART%d: IRQ raised: 0x%08"PRIx32
+exynos_uart_irq_lowered(uint32_t channel) "UART%d: IRQ lowered"
+exynos_uart_update_params(uint32_t channel, int speed, uint8_t parity, int data, int stop, uint64_t wordtime) "UART%d: speed: %d, parity: %c, data bits: %d, stop bits: %d wordtime: %"PRId64"ns"
+exynos_uart_write(uint32_t channel, uint32_t offset, const char *name, uint64_t val) "UART%d: <0x%04x> %s <- 0x%" PRIx64
+exynos_uart_read(uint32_t channel, uint32_t offset, const char *name, uint64_t val) "UART%d: <0x%04x> %s -> 0x%" PRIx64
+exynos_uart_rx_fifo_reset(uint32_t channel) "UART%d: Rx FIFO Reset"
+exynos_uart_tx_fifo_reset(uint32_t channel) "UART%d: Tx FIFO Reset"
+exynos_uart_tx(uint32_t channel, uint8_t ch) "UART%d: Tx 0x%02"PRIx32
+exynos_uart_intclr(uint32_t channel, uint32_t reg) "UART%d: interrupts cleared: 0x%08"PRIx32
+exynos_uart_ro_write(uint32_t channel, const char *name, uint32_t reg) "UART%d: Trying to write into RO register: %s [0x%04"PRIx32"]"
+exynos_uart_rx(uint32_t channel, uint8_t ch) "UART%d: Rx 0x%02"PRIx32
+exynos_uart_rx_error(uint32_t channel) "UART%d: Rx error"
+exynos_uart_wo_read(uint32_t channel, const char *name, uint32_t reg) "UART%d: Trying to read from WO register: %s [0x%04"PRIx32"]"
+exynos_uart_rxsize(uint32_t channel, uint32_t size) "UART%d: Rx FIFO size: %d"
+exynos_uart_channel_error(uint32_t channel) "Wrong UART channel number: %d"
+exynos_uart_rx_timeout(uint32_t channel, uint32_t stat, uint32_t intsp) "UART%d: Rx timeout stat=0x%x intsp=0x%x"
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index 9f1753f5cf..58e87d336d 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -879,7 +879,8 @@ static void device_set_realized(Object *obj, bool value, Error **errp)
if (qdev_get_vmsd(dev)) {
if (vmstate_register_with_alias_id(VMSTATE_IF(dev),
- -1, qdev_get_vmsd(dev), dev,
+ VMSTATE_INSTANCE_ID_ANY,
+ qdev_get_vmsd(dev), dev,
dev->instance_id_alias,
dev->alias_required_for_version,
&local_err) < 0) {
diff --git a/hw/display/ads7846.c b/hw/display/ads7846.c
index c12272ae72..9228b40b1a 100644
--- a/hw/display/ads7846.c
+++ b/hw/display/ads7846.c
@@ -154,7 +154,7 @@ static void ads7846_realize(SSISlave *d, Error **errp)
ads7846_int_update(s);
- vmstate_register(NULL, -1, &vmstate_ads7846, s);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_ads7846, s);
}
static void ads7846_class_init(ObjectClass *klass, void *data)
diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c
index f2bb2d9ac1..64519971ef 100644
--- a/hw/dma/pl330.c
+++ b/hw/dma/pl330.c
@@ -25,19 +25,12 @@
#include "sysemu/dma.h"
#include "qemu/log.h"
#include "qemu/module.h"
+#include "trace.h"
#ifndef PL330_ERR_DEBUG
#define PL330_ERR_DEBUG 0
#endif
-#define DB_PRINT_L(lvl, fmt, args...) do {\
- if (PL330_ERR_DEBUG >= lvl) {\
- fprintf(stderr, "PL330: %s:" fmt, __func__, ## args);\
- } \
-} while (0)
-
-#define DB_PRINT(fmt, args...) DB_PRINT_L(1, fmt, ## args)
-
#define PL330_PERIPH_NUM 32
#define PL330_MAX_BURST_LEN 128
#define PL330_INSN_MAXSIZE 6
@@ -319,6 +312,26 @@ typedef struct PL330InsnDesc {
void (*exec)(PL330Chan *, uint8_t opcode, uint8_t *args, int len);
} PL330InsnDesc;
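+/* Dump a buffer as hex via the pl330_hexdump trace event, 16 bytes per line. */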
+static void pl330_hexdump(uint8_t *buf, size_t size)
+{
+ unsigned int b, i, len;
+ char tmpbuf[80];
+
+ for (b = 0; b < size; b += 16) {
+ len = size - b;
+ if (len > 16) {
+ len = 16;
+ }
+ tmpbuf[0] = '\0';
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0) {
+ strcat(tmpbuf, " ");
+ }
+ sprintf(tmpbuf + strlen(tmpbuf), " %02x", buf[b + i]);
+ }
+ trace_pl330_hexdump(b, tmpbuf);
+ }
+}
/* MFIFO Implementation
*
@@ -582,7 +595,7 @@ static inline void pl330_queue_remove_tagged(PL330Queue *s, uint8_t tag)
static inline void pl330_fault(PL330Chan *ch, uint32_t flags)
{
- DB_PRINT("ch: %p, flags: %" PRIx32 "\n", ch, flags);
+ trace_pl330_fault(ch, flags);
ch->fault_type |= flags;
if (ch->state == pl330_chan_fault) {
return;
@@ -590,7 +603,7 @@ static inline void pl330_fault(PL330Chan *ch, uint32_t flags)
ch->state = pl330_chan_fault;
ch->parent->num_faulting++;
if (ch->parent->num_faulting == 1) {
- DB_PRINT("abort interrupt raised\n");
+ trace_pl330_fault_abort();
qemu_irq_raise(ch->parent->irq_abort);
}
}
@@ -648,7 +661,7 @@ static void pl330_dmaend(PL330Chan *ch, uint8_t opcode,
return;
}
}
- DB_PRINT("DMA ending!\n");
+ trace_pl330_dmaend();
pl330_fifo_tagged_remove(&s->fifo, ch->tag);
pl330_queue_remove_tagged(&s->read_queue, ch->tag);
pl330_queue_remove_tagged(&s->write_queue, ch->tag);
@@ -683,7 +696,7 @@ static void pl330_dmago(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len)
uint32_t pc;
PL330Chan *s;
- DB_PRINT("\n");
+ trace_pl330_dmago();
if (!ch->is_manager) {
pl330_fault(ch, PL330_FAULT_UNDEF_INSTR);
@@ -740,9 +753,7 @@ static void pl330_dmald(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len)
ch->stall = pl330_queue_put_insn(&ch->parent->read_queue, ch->src,
size, num, inc, 0, ch->tag);
if (!ch->stall) {
- DB_PRINT("channel:%" PRId8 " address:%08" PRIx32 " size:%" PRIx32
- " num:%" PRId32 " %c\n",
- ch->tag, ch->src, size, num, inc ? 'Y' : 'N');
+ trace_pl330_dmald(ch->tag, ch->src, size, num, inc ? 'Y' : 'N');
ch->src += inc ? size * num - (ch->src & (size - 1)) : 0;
}
}
@@ -782,7 +793,7 @@ static void pl330_dmakill(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len)
ch->fault_type = 0;
ch->parent->num_faulting--;
if (ch->parent->num_faulting == 0) {
- DB_PRINT("abort interrupt lowered\n");
+ trace_pl330_dmakill();
qemu_irq_lower(ch->parent->irq_abort);
}
}
@@ -800,6 +811,8 @@ static void pl330_dmalpend(PL330Chan *ch, uint8_t opcode,
uint8_t bs = opcode & 3;
uint8_t lc = (opcode & 4) >> 2;
+ trace_pl330_dmalpend(nf, bs, lc, ch->lc[lc], ch->request_flag);
+
if (bs == 2) {
pl330_fault(ch, PL330_FAULT_OPERAND_INVALID);
return;
@@ -813,12 +826,12 @@ static void pl330_dmalpend(PL330Chan *ch, uint8_t opcode,
if (nf) {
ch->lc[lc]--;
}
- DB_PRINT("loop reiteration\n");
+ trace_pl330_dmalpiter();
ch->pc -= args[0];
ch->pc -= len + 1;
/* "ch->pc -= args[0] + len + 1" is incorrect when args[0] == 256 */
} else {
- DB_PRINT("loop fallthrough\n");
+ trace_pl330_dmalpfallthrough();
}
}
@@ -886,10 +899,10 @@ static void pl330_dmasev(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len)
}
if (ch->parent->inten & (1 << ev_id)) {
ch->parent->int_status |= (1 << ev_id);
- DB_PRINT("event interrupt raised %" PRId8 "\n", ev_id);
+ trace_pl330_dmasev_evirq(ev_id);
qemu_irq_raise(ch->parent->irq[ev_id]);
}
- DB_PRINT("event raised %" PRId8 "\n", ev_id);
+ trace_pl330_dmasev_event(ev_id);
ch->parent->ev_status |= (1 << ev_id);
}
@@ -914,9 +927,7 @@ static void pl330_dmast(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len)
ch->stall = pl330_queue_put_insn(&ch->parent->write_queue, ch->dst,
size, num, inc, 0, ch->tag);
if (!ch->stall) {
- DB_PRINT("channel:%" PRId8 " address:%08" PRIx32 " size:%" PRIx32
- " num:%" PRId32 " %c\n",
- ch->tag, ch->dst, size, num, inc ? 'Y' : 'N');
+ trace_pl330_dmast(ch->tag, ch->dst, size, num, inc ? 'Y' : 'N');
ch->dst += inc ? size * num - (ch->dst & (size - 1)) : 0;
}
}
@@ -992,7 +1003,7 @@ static void pl330_dmawfe(PL330Chan *ch, uint8_t opcode,
}
}
ch->parent->ev_status &= ~(1 << ev_id);
- DB_PRINT("event lowered %" PRIx8 "\n", ev_id);
+ trace_pl330_dmawfe(ev_id);
} else {
ch->stall = 1;
}
@@ -1135,7 +1146,7 @@ static int pl330_chan_exec(PL330Chan *ch)
ch->stall = 0;
insn = pl330_fetch_insn(ch);
if (!insn) {
- DB_PRINT("pl330 undefined instruction\n");
+ trace_pl330_chan_exec_undef();
pl330_fault(ch, PL330_FAULT_UNDEF_INSTR);
return 0;
}
@@ -1175,10 +1186,9 @@ static int pl330_exec_cycle(PL330Chan *channel)
int len = q->len - (q->addr & (q->len - 1));
dma_memory_read(&address_space_memory, q->addr, buf, len);
- if (PL330_ERR_DEBUG > 1) {
- DB_PRINT("PL330 read from memory @%08" PRIx32 " (size = %08x):\n",
- q->addr, len);
- qemu_hexdump((char *)buf, stderr, "", len);
+ trace_pl330_exec_cycle(q->addr, len);
+ if (trace_event_get_state_backends(TRACE_PL330_HEXDUMP)) {
+ pl330_hexdump(buf, len);
}
fifo_res = pl330_fifo_push(&s->fifo, buf, len, q->tag);
if (fifo_res == PL330_FIFO_OK) {
@@ -1207,10 +1217,9 @@ static int pl330_exec_cycle(PL330Chan *channel)
}
if (fifo_res == PL330_FIFO_OK || q->z) {
dma_memory_write(&address_space_memory, q->addr, buf, len);
- if (PL330_ERR_DEBUG > 1) {
- DB_PRINT("PL330 read from memory @%08" PRIx32
- " (size = %08x):\n", q->addr, len);
- qemu_hexdump((char *)buf, stderr, "", len);
+ trace_pl330_exec_cycle(q->addr, len);
+ if (trace_event_get_state_backends(TRACE_PL330_HEXDUMP)) {
+ pl330_hexdump(buf, len);
}
if (q->inc) {
q->addr += len;
@@ -1252,8 +1261,8 @@ static int pl330_exec_channel(PL330Chan *channel)
static inline void pl330_exec(PL330State *s)
{
- DB_PRINT("\n");
int i, insr_exec;
+ trace_pl330_exec();
do {
insr_exec = pl330_exec_channel(&s->manager);
@@ -1298,7 +1307,7 @@ static void pl330_debug_exec(PL330State *s)
args[2] = (s->dbg[1] >> 8) & 0xff;
args[3] = (s->dbg[1] >> 16) & 0xff;
args[4] = (s->dbg[1] >> 24) & 0xff;
- DB_PRINT("chan id: %" PRIx8 "\n", chan_id);
+ trace_pl330_debug_exec(chan_id);
if (s->dbg[0] & 1) {
ch = &s->chan[chan_id];
} else {
@@ -1320,6 +1329,7 @@ static void pl330_debug_exec(PL330State *s)
ch->fault_type |= PL330_FAULT_DBG_INSTR;
}
if (ch->stall) {
+ trace_pl330_debug_exec_stall();
qemu_log_mask(LOG_UNIMP, "pl330: stall of debug instruction not "
"implemented\n");
}
@@ -1334,7 +1344,7 @@ static void pl330_iomem_write(void *opaque, hwaddr offset,
PL330State *s = (PL330State *) opaque;
int i;
- DB_PRINT("addr: %08x data: %08x\n", (unsigned)offset, (unsigned)value);
+ trace_pl330_iomem_write((unsigned)offset, (unsigned)value);
switch (offset) {
case PL330_REG_INTEN:
@@ -1343,7 +1353,7 @@ static void pl330_iomem_write(void *opaque, hwaddr offset,
case PL330_REG_INTCLR:
for (i = 0; i < s->num_events; i++) {
if (s->int_status & s->inten & value & (1 << i)) {
- DB_PRINT("event interrupt lowered %d\n", i);
+ trace_pl330_iomem_write_clr(i);
qemu_irq_lower(s->irq[i]);
}
}
@@ -1361,11 +1371,9 @@ static void pl330_iomem_write(void *opaque, hwaddr offset,
}
break;
case PL330_REG_DBGINST0:
- DB_PRINT("s->dbg[0] = %08x\n", (unsigned)value);
s->dbg[0] = value;
break;
case PL330_REG_DBGINST1:
- DB_PRINT("s->dbg[1] = %08x\n", (unsigned)value);
s->dbg[1] = value;
break;
default:
@@ -1489,7 +1497,7 @@ static uint64_t pl330_iomem_read(void *opaque, hwaddr offset,
unsigned size)
{
uint32_t ret = pl330_iomem_read_imp(opaque, offset);
- DB_PRINT("addr: %08" HWADDR_PRIx " data: %08" PRIx32 "\n", offset, ret);
+ trace_pl330_iomem_read((uint32_t)offset, ret);
return ret;
}
diff --git a/hw/dma/trace-events b/hw/dma/trace-events
index e4498428c5..44893995f6 100644
--- a/hw/dma/trace-events
+++ b/hw/dma/trace-events
@@ -20,3 +20,27 @@ sparc32_dma_enable_lower(void) "Lower DMA enable"
# i8257.c
i8257_unregistered_dma(int nchan, int dma_pos, int dma_len) "unregistered DMA channel used nchan=%d dma_pos=%d dma_len=%d"
+
+# pl330.c
+pl330_fault(void *ptr, uint32_t flags) "ch: %p, flags: 0x%"PRIx32
+pl330_fault_abort(void) "abort interrupt raised"
+pl330_dmaend(void) "DMA ending"
+pl330_dmago(void) "DMA run"
+pl330_dmald(uint8_t chan, uint32_t addr, uint32_t size, uint32_t num, char ch) "channel:%"PRId8" address:0x%08"PRIx32" size:0x%"PRIx32" num:%"PRId32"%c"
+pl330_dmakill(void) "abort interrupt lowered"
+pl330_dmalpend(uint8_t nf, uint8_t bs, uint8_t lc, uint8_t ch, uint8_t flag) "nf=0x%02x bs=0x%02x lc=0x%02x ch=0x%02x flag=0x%02x"
+pl330_dmalpiter(void) "loop reiteration"
+pl330_dmalpfallthrough(void) "loop fallthrough"
+pl330_dmasev_evirq(uint8_t ev_id) "event interrupt raised %"PRId8
+pl330_dmasev_event(uint8_t ev_id) "event raised %"PRId8
+pl330_dmast(uint8_t chan, uint32_t addr, uint32_t sz, uint32_t num, char ch) "channel:%"PRId8" address:0x%08"PRIx32" size:0x%"PRIx32" num:%"PRId32" %c"
+pl330_dmawfe(uint8_t ev_id) "event lowered 0x%"PRIx8
+pl330_chan_exec_undef(void) "undefined instruction"
+pl330_exec_cycle(uint32_t addr, uint32_t size) "PL330 read from memory @0x%08"PRIx32" (size = 0x%08"PRIx32")"
+pl330_hexdump(uint32_t offset, char *str) " 0x%04"PRIx32":%s"
+pl330_exec(void) "pl330_exec"
+pl330_debug_exec(uint8_t ch) "chan id: 0x%"PRIx8
+pl330_debug_exec_stall(void) "stall of debug instruction not implemented"
+pl330_iomem_write(uint32_t offset, uint32_t value) "addr: 0x%08"PRIx32" data: 0x%08"PRIx32
+pl330_iomem_write_clr(int i) "event interrupt lowered %d"
+pl330_iomem_read(uint32_t addr, uint32_t data) "addr: 0x%08"PRIx32" data: 0x%08"PRIx32
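The TRACE_PL330_HEXDUMP guard used in the pl330.c hunks above feeds a small pl330_hexdump() helper that splits the DMA buffer into 16-byte rows and emits one pl330_hexdump trace line per row. A rough sketch of such a helper, assuming this row-based formatting (the exact code in hw/dma/pl330.c may differ):

    static void pl330_hexdump(uint8_t *buf, size_t size)
    {
        unsigned int b, i, len;
        char tmpbuf[80];

        for (b = 0; b < size; b += 16) {
            len = MIN(size - b, 16u);
            tmpbuf[0] = '\0';
            for (i = 0; i < len; i++) {
                /* append " xx" for each byte of the current row */
                snprintf(tmpbuf + strlen(tmpbuf),
                         sizeof(tmpbuf) - strlen(tmpbuf), " %02x", buf[b + i]);
            }
            trace_pl330_hexdump(b, tmpbuf);
        }
    }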
diff --git a/hw/i2c/core.c b/hw/i2c/core.c
index 92cd489069..d770035ba0 100644
--- a/hw/i2c/core.c
+++ b/hw/i2c/core.c
@@ -61,7 +61,7 @@ I2CBus *i2c_init_bus(DeviceState *parent, const char *name)
bus = I2C_BUS(qbus_create(TYPE_I2C_BUS, parent, name));
QLIST_INIT(&bus->current_devs);
- vmstate_register(NULL, -1, &vmstate_i2c_bus, bus);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_i2c_bus, bus);
return bus;
}
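This and the matching hunks below replace the magic -1 instance id with the named VMSTATE_INSTANCE_ID_ANY, i.e. "let the migration core pick the next free instance id for this vmstate name". The constant lives in include/migration/vmstate.h (also updated by this series) and is presumably just a named spelling of the old value:

    #define VMSTATE_INSTANCE_ID_ANY  -1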
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index e25df838f0..9c4e46fa74 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1816,7 +1816,6 @@ static void build_smb0(Aml *table, I2CBus *smbus, int devnr, int func)
Aml *scope = aml_scope("_SB.PCI0");
Aml *dev = aml_device("SMB0");
- aml_append(dev, aml_name_decl("_HID", aml_eisaid("APP0005")));
aml_append(dev, aml_name_decl("_ADR", aml_int(devnr << 16 | func)));
build_acpi_ipmi_devices(dev, BUS(smbus), "\\_SB.PCI0.SMB0");
aml_append(scope, dev);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 8054bc4147..a6302a772d 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -93,7 +93,9 @@
#include "fw_cfg.h"
#include "trace.h"
-GlobalProperty pc_compat_4_2[] = {};
+GlobalProperty pc_compat_4_2[] = {
+ { "mch", "smbase-smram", "off" },
+};
const size_t pc_compat_4_2_len = G_N_ELEMENTS(pc_compat_4_2);
GlobalProperty pc_compat_4_1[] = {};
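The "smbase-smram" property referenced here is added to the q35 MCH further down in this series and defaults to on, so the compat entry above only disables it for pc-*-4.2 and older machine types to keep cross-version migration working. For ad-hoc testing the property can also be flipped from the command line, e.g. (illustrative invocation):

    qemu-system-x86_64 -M q35 -global mch.smbase-smram=off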
diff --git a/hw/input/stellaris_input.c b/hw/input/stellaris_input.c
index 59892b07fc..e6ee5e11f1 100644
--- a/hw/input/stellaris_input.c
+++ b/hw/input/stellaris_input.c
@@ -88,5 +88,6 @@ void stellaris_gamepad_init(int n, qemu_irq *irq, const int *keycode)
}
s->num_buttons = n;
qemu_add_kbd_event_handler(stellaris_gamepad_put_key, s);
- vmstate_register(NULL, -1, &vmstate_stellaris_gamepad, s);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY,
+ &vmstate_stellaris_gamepad, s);
}
diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
index 375cb6abe9..b5dbeb6206 100644
--- a/hw/intc/apic_common.c
+++ b/hw/intc/apic_common.c
@@ -268,7 +268,10 @@ static void apic_common_realize(DeviceState *dev, Error **errp)
APICCommonState *s = APIC_COMMON(dev);
APICCommonClass *info;
static DeviceState *vapic;
- int instance_id = s->id;
+ uint32_t instance_id = s->initial_apic_id;
+
+ /* Normally the initial APIC ID should be no more than a few hundred */
+ assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
info = APIC_COMMON_GET_CLASS(s);
info->realize(dev, errp);
@@ -284,7 +287,7 @@ static void apic_common_realize(DeviceState *dev, Error **errp)
}
if (s->legacy_instance_id) {
- instance_id = -1;
+ instance_id = VMSTATE_INSTANCE_ID_ANY;
}
vmstate_register_with_alias_id(NULL, instance_id, &vmstate_apic_common,
s, -1, 0, NULL);
diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index a254b0ce87..08e000e33c 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -664,6 +664,9 @@ static uint64_t icv_iar_read(CPUARMState *env, const ARMCPRegInfo *ri)
trace_gicv3_icv_iar_read(ri->crm == 8 ? 0 : 1,
gicv3_redist_affid(cs), intid);
+
+ gicv3_cpuif_virt_update(cs);
+
return intid;
}
diff --git a/hw/misc/Kconfig b/hw/misc/Kconfig
index 2164646553..bdd77d8020 100644
--- a/hw/misc/Kconfig
+++ b/hw/misc/Kconfig
@@ -82,6 +82,12 @@ config IMX
config STM32F2XX_SYSCFG
bool
+config STM32F4XX_SYSCFG
+ bool
+
+config STM32F4XX_EXTI
+ bool
+
config MIPS_ITU
bool
diff --git a/hw/misc/Makefile.objs b/hw/misc/Makefile.objs
index ba898a5781..da993f45b7 100644
--- a/hw/misc/Makefile.objs
+++ b/hw/misc/Makefile.objs
@@ -42,6 +42,7 @@ common-obj-$(CONFIG_IMX) += imx7_ccm.o
common-obj-$(CONFIG_IMX) += imx2_wdt.o
common-obj-$(CONFIG_IMX) += imx7_snvs.o
common-obj-$(CONFIG_IMX) += imx7_gpr.o
+common-obj-$(CONFIG_IMX) += imx_rngc.o
common-obj-$(CONFIG_MILKYMIST) += milkymist-hpdmc.o
common-obj-$(CONFIG_MILKYMIST) += milkymist-pfpu.o
common-obj-$(CONFIG_MAINSTONE) += mst_fpga.o
@@ -58,6 +59,8 @@ common-obj-$(CONFIG_SLAVIO) += slavio_misc.o
common-obj-$(CONFIG_ZYNQ) += zynq_slcr.o
common-obj-$(CONFIG_ZYNQ) += zynq-xadc.o
common-obj-$(CONFIG_STM32F2XX_SYSCFG) += stm32f2xx_syscfg.o
+common-obj-$(CONFIG_STM32F4XX_SYSCFG) += stm32f4xx_syscfg.o
+common-obj-$(CONFIG_STM32F4XX_EXTI) += stm32f4xx_exti.o
obj-$(CONFIG_MIPS_CPS) += mips_cmgcr.o
obj-$(CONFIG_MIPS_CPS) += mips_cpc.o
obj-$(CONFIG_MIPS_ITU) += mips_itu.o
diff --git a/hw/misc/imx_rngc.c b/hw/misc/imx_rngc.c
new file mode 100644
index 0000000000..4c270df2db
--- /dev/null
+++ b/hw/misc/imx_rngc.c
@@ -0,0 +1,278 @@
+/*
+ * Freescale i.MX RNGC emulation
+ *
+ * Copyright (C) 2020 Martin Kaiser <martin@kaiser.cx>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * This driver provides the minimum functionality to initialize and seed
+ * an rngc and to read random numbers. The rngb that is found in imx25
+ * chipsets is also supported.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qemu/log.h"
+#include "qemu/guest-random.h"
+#include "hw/irq.h"
+#include "hw/misc/imx_rngc.h"
+#include "migration/vmstate.h"
+
+#define RNGC_NAME "i.MX RNGC"
+
+#define RNGC_VER_ID 0x00
+#define RNGC_COMMAND 0x04
+#define RNGC_CONTROL 0x08
+#define RNGC_STATUS 0x0C
+#define RNGC_FIFO 0x14
+
+/* This version info is reported by the rngb in an imx258 chip. */
+#define RNG_TYPE_RNGB 0x1
+#define V_MAJ 0x2
+#define V_MIN 0x40
+
+#define RNGC_CMD_BIT_SW_RST 0x40
+#define RNGC_CMD_BIT_CLR_ERR 0x20
+#define RNGC_CMD_BIT_CLR_INT 0x10
+#define RNGC_CMD_BIT_SEED 0x02
+#define RNGC_CMD_BIT_SELF_TEST 0x01
+
+#define RNGC_CTRL_BIT_MASK_ERR 0x40
+#define RNGC_CTRL_BIT_MASK_DONE 0x20
+#define RNGC_CTRL_BIT_AUTO_SEED 0x10
+
+/* the current status for self-test and seed operations */
+#define OP_IDLE 0
+#define OP_RUN 1
+#define OP_DONE 2
+
+static uint64_t imx_rngc_read(void *opaque, hwaddr offset, unsigned size)
+{
+ IMXRNGCState *s = IMX_RNGC(opaque);
+ uint64_t val = 0;
+
+ switch (offset) {
+ case RNGC_VER_ID:
+ val |= RNG_TYPE_RNGB << 28 | V_MAJ << 8 | V_MIN;
+ break;
+
+ case RNGC_COMMAND:
+ if (s->op_seed == OP_RUN) {
+ val |= RNGC_CMD_BIT_SEED;
+ }
+ if (s->op_self_test == OP_RUN) {
+ val |= RNGC_CMD_BIT_SELF_TEST;
+ }
+ break;
+
+ case RNGC_CONTROL:
+ /*
+ * The CTL_ACC and VERIF_MODE bits are not supported yet.
+ * They read as 0.
+ */
+ val |= s->mask;
+ if (s->auto_seed) {
+ val |= RNGC_CTRL_BIT_AUTO_SEED;
+ }
+ /*
+ * We don't have an internal fifo like the real hardware.
+ * There's no need for a strategy to handle fifo underflows.
+ * We return the FIFO_UFLOW_RESPONSE bits as 0.
+ */
+ break;
+
+ case RNGC_STATUS:
+ /*
+ * We never report any statistics test or self-test errors or any
+ * other errors. STAT_TEST_PF, ST_PF and ERROR are always 0.
+ */
+
+ /*
+ * We don't have an internal fifo, see above. Therefore, we
+ * report back the default fifo size (5 32-bit words) and
+ * indicate that our fifo is always full.
+ */
+ val |= 5 << 12 | 5 << 8;
+
+ /* We always have a new seed available. */
+ val |= 1 << 6;
+
+ if (s->op_seed == OP_DONE) {
+ val |= 1 << 5;
+ }
+ if (s->op_self_test == OP_DONE) {
+ val |= 1 << 4;
+ }
+ if (s->op_seed == OP_RUN || s->op_self_test == OP_RUN) {
+ /*
+ * We're busy if self-test is running or if we're
+ * seeding the prng.
+ */
+ val |= 1 << 1;
+ } else {
+ /*
+ * We're ready to provide secure random numbers whenever
+ * we're not busy.
+ */
+ val |= 1;
+ }
+ break;
+
+ case RNGC_FIFO:
+ qemu_guest_getrandom_nofail(&val, sizeof(val));
+ break;
+ }
+
+ return val;
+}
+
+static void imx_rngc_do_reset(IMXRNGCState *s)
+{
+ s->op_self_test = OP_IDLE;
+ s->op_seed = OP_IDLE;
+ s->mask = 0;
+ s->auto_seed = false;
+}
+
+static void imx_rngc_write(void *opaque, hwaddr offset, uint64_t value,
+ unsigned size)
+{
+ IMXRNGCState *s = IMX_RNGC(opaque);
+
+ switch (offset) {
+ case RNGC_COMMAND:
+ if (value & RNGC_CMD_BIT_SW_RST) {
+ imx_rngc_do_reset(s);
+ }
+
+ /*
+ * For now, both CLR_ERR and CLR_INT clear the interrupt. We
+ * don't report any errors yet.
+ */
+ if (value & (RNGC_CMD_BIT_CLR_ERR | RNGC_CMD_BIT_CLR_INT)) {
+ qemu_irq_lower(s->irq);
+ }
+
+ if (value & RNGC_CMD_BIT_SEED) {
+ s->op_seed = OP_RUN;
+ qemu_bh_schedule(s->seed_bh);
+ }
+
+ if (value & RNGC_CMD_BIT_SELF_TEST) {
+ s->op_self_test = OP_RUN;
+ qemu_bh_schedule(s->self_test_bh);
+ }
+ break;
+
+ case RNGC_CONTROL:
+ /*
+ * The CTL_ACC and VERIF_MODE bits are not supported yet.
+ * We ignore them if they're set by the caller.
+ */
+
+ if (value & RNGC_CTRL_BIT_MASK_ERR) {
+ s->mask |= RNGC_CTRL_BIT_MASK_ERR;
+ } else {
+ s->mask &= ~RNGC_CTRL_BIT_MASK_ERR;
+ }
+
+ if (value & RNGC_CTRL_BIT_MASK_DONE) {
+ s->mask |= RNGC_CTRL_BIT_MASK_DONE;
+ } else {
+ s->mask &= ~RNGC_CTRL_BIT_MASK_DONE;
+ }
+
+ if (value & RNGC_CTRL_BIT_AUTO_SEED) {
+ s->auto_seed = true;
+ } else {
+ s->auto_seed = false;
+ }
+ break;
+ }
+}
+
+static const MemoryRegionOps imx_rngc_ops = {
+ .read = imx_rngc_read,
+ .write = imx_rngc_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void imx_rngc_self_test(void *opaque)
+{
+ IMXRNGCState *s = IMX_RNGC(opaque);
+
+ s->op_self_test = OP_DONE;
+ if (!(s->mask & RNGC_CTRL_BIT_MASK_DONE)) {
+ qemu_irq_raise(s->irq);
+ }
+}
+
+static void imx_rngc_seed(void *opaque)
+{
+ IMXRNGCState *s = IMX_RNGC(opaque);
+
+ s->op_seed = OP_DONE;
+ if (!(s->mask & RNGC_CTRL_BIT_MASK_DONE)) {
+ qemu_irq_raise(s->irq);
+ }
+}
+
+static void imx_rngc_realize(DeviceState *dev, Error **errp)
+{
+ IMXRNGCState *s = IMX_RNGC(dev);
+ SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+
+ memory_region_init_io(&s->iomem, OBJECT(s), &imx_rngc_ops, s,
+ TYPE_IMX_RNGC, 0x1000);
+ sysbus_init_mmio(sbd, &s->iomem);
+
+ sysbus_init_irq(sbd, &s->irq);
+ s->self_test_bh = qemu_bh_new(imx_rngc_self_test, s);
+ s->seed_bh = qemu_bh_new(imx_rngc_seed, s);
+}
+
+static void imx_rngc_reset(DeviceState *dev)
+{
+ IMXRNGCState *s = IMX_RNGC(dev);
+
+ imx_rngc_do_reset(s);
+}
+
+static const VMStateDescription vmstate_imx_rngc = {
+ .name = RNGC_NAME,
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8(op_self_test, IMXRNGCState),
+ VMSTATE_UINT8(op_seed, IMXRNGCState),
+ VMSTATE_UINT8(mask, IMXRNGCState),
+ VMSTATE_BOOL(auto_seed, IMXRNGCState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void imx_rngc_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = imx_rngc_realize;
+ dc->reset = imx_rngc_reset;
+ dc->desc = RNGC_NAME;
+ dc->vmsd = &vmstate_imx_rngc;
+}
+
+static const TypeInfo imx_rngc_info = {
+ .name = TYPE_IMX_RNGC,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(IMXRNGCState),
+ .class_init = imx_rngc_class_init,
+};
+
+static void imx_rngc_register_types(void)
+{
+ type_register_static(&imx_rngc_info);
+}
+
+type_init(imx_rngc_register_types)
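Once registered, the RNGC is wired up like any other sysbus peripheral. A minimal sketch of board/SoC integration (base address and interrupt number below are placeholders; the real wiring in this series is done in hw/arm/fsl-imx25.c with its own constants):

    /* Hypothetical wiring: 0x53fb0000 and IRQ 22 are placeholder values. */
    DeviceState *rngc = sysbus_create_simple(TYPE_IMX_RNGC, 0x53fb0000,
                                             qdev_get_gpio_in(avic_dev, 22));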
diff --git a/hw/misc/max111x.c b/hw/misc/max111x.c
index 211008ce02..2b87bdee5b 100644
--- a/hw/misc/max111x.c
+++ b/hw/misc/max111x.c
@@ -146,7 +146,8 @@ static int max111x_init(SSISlave *d, int inputs)
s->input[7] = 0x80;
s->com = 0;
- vmstate_register(VMSTATE_IF(dev), -1, &vmstate_max111x, s);
+ vmstate_register(VMSTATE_IF(dev), VMSTATE_INSTANCE_ID_ANY,
+ &vmstate_max111x, s);
return 0;
}
diff --git a/hw/misc/stm32f4xx_exti.c b/hw/misc/stm32f4xx_exti.c
new file mode 100644
index 0000000000..02e7810046
--- /dev/null
+++ b/hw/misc/stm32f4xx_exti.c
@@ -0,0 +1,188 @@
+/*
+ * STM32F4XX EXTI
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "trace.h"
+#include "hw/irq.h"
+#include "migration/vmstate.h"
+#include "hw/misc/stm32f4xx_exti.h"
+
+static void stm32f4xx_exti_reset(DeviceState *dev)
+{
+ STM32F4xxExtiState *s = STM32F4XX_EXTI(dev);
+
+ s->exti_imr = 0x00000000;
+ s->exti_emr = 0x00000000;
+ s->exti_rtsr = 0x00000000;
+ s->exti_ftsr = 0x00000000;
+ s->exti_swier = 0x00000000;
+ s->exti_pr = 0x00000000;
+}
+
+static void stm32f4xx_exti_set_irq(void *opaque, int irq, int level)
+{
+ STM32F4xxExtiState *s = opaque;
+
+ trace_stm32f4xx_exti_set_irq(irq, level);
+
+ if (((1 << irq) & s->exti_rtsr) && level) {
+ /* Rising Edge */
+ s->exti_pr |= 1 << irq;
+ }
+
+ if (((1 << irq) & s->exti_ftsr) && !level) {
+ /* Falling Edge */
+ s->exti_pr |= 1 << irq;
+ }
+
+ if (!((1 << irq) & s->exti_imr)) {
+ /* Interrupt is masked */
+ return;
+ }
+ qemu_irq_pulse(s->irq[irq]);
+}
+
+static uint64_t stm32f4xx_exti_read(void *opaque, hwaddr addr,
+ unsigned int size)
+{
+ STM32F4xxExtiState *s = opaque;
+
+ trace_stm32f4xx_exti_read(addr);
+
+ switch (addr) {
+ case EXTI_IMR:
+ return s->exti_imr;
+ case EXTI_EMR:
+ return s->exti_emr;
+ case EXTI_RTSR:
+ return s->exti_rtsr;
+ case EXTI_FTSR:
+ return s->exti_ftsr;
+ case EXTI_SWIER:
+ return s->exti_swier;
+ case EXTI_PR:
+ return s->exti_pr;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "STM32F4XX_exti_read: Bad offset %x\n", (int)addr);
+ return 0;
+ }
+ return 0;
+}
+
+static void stm32f4xx_exti_write(void *opaque, hwaddr addr,
+ uint64_t val64, unsigned int size)
+{
+ STM32F4xxExtiState *s = opaque;
+ uint32_t value = (uint32_t) val64;
+
+ trace_stm32f4xx_exti_write(addr, value);
+
+ switch (addr) {
+ case EXTI_IMR:
+ s->exti_imr = value;
+ return;
+ case EXTI_EMR:
+ s->exti_emr = value;
+ return;
+ case EXTI_RTSR:
+ s->exti_rtsr = value;
+ return;
+ case EXTI_FTSR:
+ s->exti_ftsr = value;
+ return;
+ case EXTI_SWIER:
+ s->exti_swier = value;
+ return;
+ case EXTI_PR:
+ /* This bit is cleared by writing a 1 to it */
+ s->exti_pr &= ~value;
+ return;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "STM32F4XX_exti_write: Bad offset %x\n", (int)addr);
+ }
+}
+
+static const MemoryRegionOps stm32f4xx_exti_ops = {
+ .read = stm32f4xx_exti_read,
+ .write = stm32f4xx_exti_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void stm32f4xx_exti_init(Object *obj)
+{
+ STM32F4xxExtiState *s = STM32F4XX_EXTI(obj);
+ int i;
+
+ for (i = 0; i < NUM_INTERRUPT_OUT_LINES; i++) {
+ sysbus_init_irq(SYS_BUS_DEVICE(obj), &s->irq[i]);
+ }
+
+ memory_region_init_io(&s->mmio, obj, &stm32f4xx_exti_ops, s,
+ TYPE_STM32F4XX_EXTI, 0x400);
+ sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->mmio);
+
+ qdev_init_gpio_in(DEVICE(obj), stm32f4xx_exti_set_irq,
+ NUM_GPIO_EVENT_IN_LINES);
+}
+
+static const VMStateDescription vmstate_stm32f4xx_exti = {
+ .name = TYPE_STM32F4XX_EXTI,
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(exti_imr, STM32F4xxExtiState),
+ VMSTATE_UINT32(exti_emr, STM32F4xxExtiState),
+ VMSTATE_UINT32(exti_rtsr, STM32F4xxExtiState),
+ VMSTATE_UINT32(exti_ftsr, STM32F4xxExtiState),
+ VMSTATE_UINT32(exti_swier, STM32F4xxExtiState),
+ VMSTATE_UINT32(exti_pr, STM32F4xxExtiState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void stm32f4xx_exti_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->reset = stm32f4xx_exti_reset;
+ dc->vmsd = &vmstate_stm32f4xx_exti;
+}
+
+static const TypeInfo stm32f4xx_exti_info = {
+ .name = TYPE_STM32F4XX_EXTI,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(STM32F4xxExtiState),
+ .instance_init = stm32f4xx_exti_init,
+ .class_init = stm32f4xx_exti_class_init,
+};
+
+static void stm32f4xx_exti_register_types(void)
+{
+ type_register_static(&stm32f4xx_exti_info);
+}
+
+type_init(stm32f4xx_exti_register_types)
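At SoC level the EXTI block sits between SYSCFG and the NVIC; a sketch of the expected wiring (the real code is in hw/arm/stm32f405_soc.c from this series; exti_irq[] here stands in for whatever NVIC interrupt-number table that file uses):

    /* Each SYSCFG GPIO output drives the matching EXTI input line... */
    for (i = 0; i < 16; i++) {
        qdev_connect_gpio_out(DEVICE(&s->syscfg), i,
                              qdev_get_gpio_in(DEVICE(&s->exti), i));
    }
    /* ...and each EXTI output is routed to an NVIC interrupt. */
    for (i = 0; i < ARRAY_SIZE(exti_irq); i++) {
        sysbus_connect_irq(SYS_BUS_DEVICE(&s->exti), i,
                           qdev_get_gpio_in(DEVICE(&s->armv7m), exti_irq[i]));
    }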
diff --git a/hw/misc/stm32f4xx_syscfg.c b/hw/misc/stm32f4xx_syscfg.c
new file mode 100644
index 0000000000..f960e4ea1e
--- /dev/null
+++ b/hw/misc/stm32f4xx_syscfg.c
@@ -0,0 +1,171 @@
+/*
+ * STM32F4xx SYSCFG
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "trace.h"
+#include "hw/irq.h"
+#include "migration/vmstate.h"
+#include "hw/misc/stm32f4xx_syscfg.h"
+
+static void stm32f4xx_syscfg_reset(DeviceState *dev)
+{
+ STM32F4xxSyscfgState *s = STM32F4XX_SYSCFG(dev);
+
+ s->syscfg_memrmp = 0x00000000;
+ s->syscfg_pmc = 0x00000000;
+ s->syscfg_exticr[0] = 0x00000000;
+ s->syscfg_exticr[1] = 0x00000000;
+ s->syscfg_exticr[2] = 0x00000000;
+ s->syscfg_exticr[3] = 0x00000000;
+ s->syscfg_cmpcr = 0x00000000;
+}
+
+static void stm32f4xx_syscfg_set_irq(void *opaque, int irq, int level)
+{
+ STM32F4xxSyscfgState *s = opaque;
+ int icrreg = irq / 4;
+ int startbit = (irq & 3) * 4;
+ uint8_t config = irq / 16;
+
+ trace_stm32f4xx_syscfg_set_irq(irq / 16, irq % 16, level);
+
+ g_assert(icrreg < SYSCFG_NUM_EXTICR);
+
+ if (extract32(s->syscfg_exticr[icrreg], startbit, 4) == config) {
+ qemu_set_irq(s->gpio_out[irq], level);
+ trace_stm32f4xx_pulse_exti(irq);
+ }
+}
+
+static uint64_t stm32f4xx_syscfg_read(void *opaque, hwaddr addr,
+ unsigned int size)
+{
+ STM32F4xxSyscfgState *s = opaque;
+
+ trace_stm32f4xx_syscfg_read(addr);
+
+ switch (addr) {
+ case SYSCFG_MEMRMP:
+ return s->syscfg_memrmp;
+ case SYSCFG_PMC:
+ return s->syscfg_pmc;
+ case SYSCFG_EXTICR1...SYSCFG_EXTICR4:
+ return s->syscfg_exticr[addr / 4 - SYSCFG_EXTICR1 / 4];
+ case SYSCFG_CMPCR:
+ return s->syscfg_cmpcr;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: Bad offset 0x%"HWADDR_PRIx"\n", __func__, addr);
+ return 0;
+ }
+}
+
+static void stm32f4xx_syscfg_write(void *opaque, hwaddr addr,
+ uint64_t val64, unsigned int size)
+{
+ STM32F4xxSyscfgState *s = opaque;
+ uint32_t value = val64;
+
+ trace_stm32f4xx_syscfg_write(addr, value);
+
+ switch (addr) {
+ case SYSCFG_MEMRMP:
+ qemu_log_mask(LOG_UNIMP,
+ "%s: Changing the memory mapping isn't supported " \
+ "in QEMU\n", __func__);
+ return;
+ case SYSCFG_PMC:
+ qemu_log_mask(LOG_UNIMP,
+ "%s: Changing the memory mapping isn't supported " \
+ "in QEMU\n", __func__);
+ return;
+ case SYSCFG_EXTICR1...SYSCFG_EXTICR4:
+ s->syscfg_exticr[addr / 4 - SYSCFG_EXTICR1 / 4] = (value & 0xFFFF);
+ return;
+ case SYSCFG_CMPCR:
+ s->syscfg_cmpcr = value;
+ return;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: Bad offset 0x%"HWADDR_PRIx"\n", __func__, addr);
+ }
+}
+
+static const MemoryRegionOps stm32f4xx_syscfg_ops = {
+ .read = stm32f4xx_syscfg_read,
+ .write = stm32f4xx_syscfg_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void stm32f4xx_syscfg_init(Object *obj)
+{
+ STM32F4xxSyscfgState *s = STM32F4XX_SYSCFG(obj);
+
+ sysbus_init_irq(SYS_BUS_DEVICE(obj), &s->irq);
+
+ memory_region_init_io(&s->mmio, obj, &stm32f4xx_syscfg_ops, s,
+ TYPE_STM32F4XX_SYSCFG, 0x400);
+ sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->mmio);
+
+ qdev_init_gpio_in(DEVICE(obj), stm32f4xx_syscfg_set_irq, 16 * 9);
+ qdev_init_gpio_out(DEVICE(obj), s->gpio_out, 16);
+}
+
+static const VMStateDescription vmstate_stm32f4xx_syscfg = {
+ .name = TYPE_STM32F4XX_SYSCFG,
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(syscfg_memrmp, STM32F4xxSyscfgState),
+ VMSTATE_UINT32(syscfg_pmc, STM32F4xxSyscfgState),
+ VMSTATE_UINT32_ARRAY(syscfg_exticr, STM32F4xxSyscfgState,
+ SYSCFG_NUM_EXTICR),
+ VMSTATE_UINT32(syscfg_cmpcr, STM32F4xxSyscfgState),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static void stm32f4xx_syscfg_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->reset = stm32f4xx_syscfg_reset;
+ dc->vmsd = &vmstate_stm32f4xx_syscfg;
+}
+
+static const TypeInfo stm32f4xx_syscfg_info = {
+ .name = TYPE_STM32F4XX_SYSCFG,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(STM32F4xxSyscfgState),
+ .instance_init = stm32f4xx_syscfg_init,
+ .class_init = stm32f4xx_syscfg_class_init,
+};
+
+static void stm32f4xx_syscfg_register_types(void)
+{
+ type_register_static(&stm32f4xx_syscfg_info);
+}
+
+type_init(stm32f4xx_syscfg_register_types)
diff --git a/hw/misc/trace-events b/hw/misc/trace-events
index 2e0c820834..7f0f5dff3a 100644
--- a/hw/misc/trace-events
+++ b/hw/misc/trace-events
@@ -84,6 +84,17 @@ mos6522_set_sr_int(void) "set sr_int"
mos6522_write(uint64_t addr, uint64_t val) "reg=0x%"PRIx64 " val=0x%"PRIx64
mos6522_read(uint64_t addr, unsigned val) "reg=0x%"PRIx64 " val=0x%x"
+# stm32f4xx_syscfg
+stm32f4xx_syscfg_set_irq(int gpio, int line, int level) "Interrupt: GPIO: %d, Line: %d, Level: %d"
+stm32f4xx_pulse_exti(int irq) "Pulse EXTI: %d"
+stm32f4xx_syscfg_read(uint64_t addr) "reg read: addr: 0x%" PRIx64 " "
+stm32f4xx_syscfg_write(uint64_t addr, uint64_t data) "reg write: addr: 0x%" PRIx64 " val: 0x%" PRIx64 ""
+
+# stm32f4xx_exti
+stm32f4xx_exti_set_irq(int irq, int level) "Set EXTI: %d to %d"
+stm32f4xx_exti_read(uint64_t addr) "reg read: addr: 0x%" PRIx64 " "
+stm32f4xx_exti_write(uint64_t addr, uint64_t data) "reg write: addr: 0x%" PRIx64 " val: 0x%" PRIx64 ""
+
# tz-mpc.c
tz_mpc_reg_read(uint32_t offset, uint64_t data, unsigned size) "TZ MPC regs read: offset 0x%x data 0x%" PRIx64 " size %u"
tz_mpc_reg_write(uint32_t offset, uint64_t data, unsigned size) "TZ MPC regs write: offset 0x%x data 0x%" PRIx64 " size %u"
diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
index cc71a7a036..6cc97769d9 100644
--- a/hw/net/eepro100.c
+++ b/hw/net/eepro100.c
@@ -1874,7 +1874,8 @@ static void e100_nic_realize(PCIDevice *pci_dev, Error **errp)
s->vmstate = g_memdup(&vmstate_eepro100, sizeof(vmstate_eepro100));
s->vmstate->name = qemu_get_queue(s->nic)->model;
- vmstate_register(VMSTATE_IF(&pci_dev->qdev), -1, s->vmstate, s);
+ vmstate_register(VMSTATE_IF(&pci_dev->qdev), VMSTATE_INSTANCE_ID_ANY,
+ s->vmstate, s);
}
static void eepro100_instance_init(Object *obj)
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 158d270b9f..6342f73b9f 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -275,20 +275,20 @@ static const TypeInfo q35_host_info = {
* MCH D0:F0
*/
-static uint64_t tseg_blackhole_read(void *ptr, hwaddr reg, unsigned size)
+static uint64_t blackhole_read(void *ptr, hwaddr reg, unsigned size)
{
return 0xffffffff;
}
-static void tseg_blackhole_write(void *opaque, hwaddr addr, uint64_t val,
- unsigned width)
+static void blackhole_write(void *opaque, hwaddr addr, uint64_t val,
+ unsigned width)
{
/* nothing */
}
-static const MemoryRegionOps tseg_blackhole_ops = {
- .read = tseg_blackhole_read,
- .write = tseg_blackhole_write,
+static const MemoryRegionOps blackhole_ops = {
+ .read = blackhole_read,
+ .write = blackhole_write,
.endianness = DEVICE_NATIVE_ENDIAN,
.valid.min_access_size = 1,
.valid.max_access_size = 4,
@@ -430,6 +430,46 @@ static void mch_update_ext_tseg_mbytes(MCHPCIState *mch)
}
}
+static void mch_update_smbase_smram(MCHPCIState *mch)
+{
+ PCIDevice *pd = PCI_DEVICE(mch);
+ uint8_t *reg = pd->config + MCH_HOST_BRIDGE_F_SMBASE;
+ bool lck;
+
+ if (!mch->has_smram_at_smbase) {
+ return;
+ }
+
+ if (*reg == MCH_HOST_BRIDGE_F_SMBASE_QUERY) {
+ pd->wmask[MCH_HOST_BRIDGE_F_SMBASE] =
+ MCH_HOST_BRIDGE_F_SMBASE_LCK;
+ *reg = MCH_HOST_BRIDGE_F_SMBASE_IN_RAM;
+ return;
+ }
+
+ /*
+ * default/reset state, discard the written value,
+ * which will disable the SMRAM blackhole at SMBASE
+ */
+ if (pd->wmask[MCH_HOST_BRIDGE_F_SMBASE] == 0xff) {
+ *reg = 0x00;
+ }
+
+ memory_region_transaction_begin();
+ if (*reg & MCH_HOST_BRIDGE_F_SMBASE_LCK) {
+ /* disable all writes */
+ pd->wmask[MCH_HOST_BRIDGE_F_SMBASE] &=
+ ~MCH_HOST_BRIDGE_F_SMBASE_LCK;
+ *reg = MCH_HOST_BRIDGE_F_SMBASE_LCK;
+ lck = true;
+ } else {
+ lck = false;
+ }
+ memory_region_set_enabled(&mch->smbase_blackhole, lck);
+ memory_region_set_enabled(&mch->smbase_window, lck);
+ memory_region_transaction_commit();
+}
+
static void mch_write_config(PCIDevice *d,
uint32_t address, uint32_t val, int len)
{
@@ -456,6 +496,10 @@ static void mch_write_config(PCIDevice *d,
MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_SIZE)) {
mch_update_ext_tseg_mbytes(mch);
}
+
+ if (ranges_overlap(address, len, MCH_HOST_BRIDGE_F_SMBASE, 1)) {
+ mch_update_smbase_smram(mch);
+ }
}
static void mch_update(MCHPCIState *mch)
@@ -464,6 +508,7 @@ static void mch_update(MCHPCIState *mch)
mch_update_pam(mch);
mch_update_smram(mch);
mch_update_ext_tseg_mbytes(mch);
+ mch_update_smbase_smram(mch);
/*
* pci hole goes from end-of-low-ram to io-apic.
@@ -514,6 +559,9 @@ static void mch_reset(DeviceState *qdev)
MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY);
}
+ d->config[MCH_HOST_BRIDGE_F_SMBASE] = 0;
+ d->wmask[MCH_HOST_BRIDGE_F_SMBASE] = 0xff;
+
mch_update(mch);
}
@@ -563,7 +611,7 @@ static void mch_realize(PCIDevice *d, Error **errp)
memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram);
memory_region_init_io(&mch->tseg_blackhole, OBJECT(mch),
- &tseg_blackhole_ops, NULL,
+ &blackhole_ops, NULL,
"tseg-blackhole", 0);
memory_region_set_enabled(&mch->tseg_blackhole, false);
memory_region_add_subregion_overlap(mch->system_memory,
@@ -575,6 +623,27 @@ static void mch_realize(PCIDevice *d, Error **errp)
memory_region_set_enabled(&mch->tseg_window, false);
memory_region_add_subregion(&mch->smram, mch->below_4g_mem_size,
&mch->tseg_window);
+
+ /*
+ * This is not what real hardware does, so it's a QEMU-specific hack.
+ * See the commit message for details.
+ */
+ memory_region_init_io(&mch->smbase_blackhole, OBJECT(mch), &blackhole_ops,
+ NULL, "smbase-blackhole",
+ MCH_HOST_BRIDGE_SMBASE_SIZE);
+ memory_region_set_enabled(&mch->smbase_blackhole, false);
+ memory_region_add_subregion_overlap(mch->system_memory,
+ MCH_HOST_BRIDGE_SMBASE_ADDR,
+ &mch->smbase_blackhole, 1);
+
+ memory_region_init_alias(&mch->smbase_window, OBJECT(mch),
+ "smbase-window", mch->ram_memory,
+ MCH_HOST_BRIDGE_SMBASE_ADDR,
+ MCH_HOST_BRIDGE_SMBASE_SIZE);
+ memory_region_set_enabled(&mch->smbase_window, false);
+ memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMBASE_ADDR,
+ &mch->smbase_window);
+
object_property_add_const_link(qdev_get_machine(), "smram",
OBJECT(&mch->smram), &error_abort);
@@ -601,6 +670,7 @@ uint64_t mch_mcfg_base(void)
static Property mch_props[] = {
DEFINE_PROP_UINT16("extended-tseg-mbytes", MCHPCIState, ext_tseg_mbytes,
16),
+ DEFINE_PROP_BOOL("smbase-smram", MCHPCIState, has_smram_at_smbase, true),
DEFINE_PROP_END_OF_LIST(),
};
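From the firmware side the new register implements a small negotiation protocol: write the query value, read back whether SMRAM-at-SMBASE is implemented, then write the lock bit. A sketch using SeaBIOS-style config-space helpers (the 0x9c offset and the 0xff/0x01/0x02 values are assumed to match the MCH_HOST_BRIDGE_F_SMBASE* definitions in include/hw/pci-host/q35.h):

    static int q35_lock_smbase_smram(u16 mch_bdf)
    {
        pci_config_writeb(mch_bdf, 0x9c, 0xff);          /* F_SMBASE_QUERY */
        if (pci_config_readb(mch_bdf, 0x9c) != 0x01) {   /* F_SMBASE_IN_RAM? */
            return 0;                                    /* feature not present */
        }
        pci_config_writeb(mch_bdf, 0x9c, 0x02);          /* F_SMBASE_LCK */
        /* From here until reset, the 128KiB at 0x30000 reads as RAM only in
         * SMM and as a blackhole (0xff) from normal mode. */
        return 1;
    }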
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e3d310365d..3ac7961451 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -122,7 +122,7 @@ static void pci_bus_realize(BusState *qbus, Error **errp)
bus->machine_done.notify = pcibus_machine_done;
qemu_add_machine_init_done_notifier(&bus->machine_done);
- vmstate_register(NULL, -1, &vmstate_pcibus, bus);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_pcibus, bus);
}
static void pcie_bus_realize(BusState *qbus, Error **errp)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 30a5fbd3be..02cf53fc5b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2948,7 +2948,7 @@ static void spapr_machine_init(MachineState *machine)
* interface, this is a legacy from the sPAPREnvironment structure
* which predated MachineState but had a similar function */
vmstate_register(NULL, 0, &vmstate_spapr, spapr);
- register_savevm_live("spapr/htab", -1, 1,
+ register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
&savevm_htab_handlers, spapr);
qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine),
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 4bc73a370e..d3af42ef92 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -943,7 +943,13 @@ void virtio_scsi_common_unrealize(DeviceState *dev)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
+ int i;
+ virtio_delete_queue(vs->ctrl_vq);
+ virtio_delete_queue(vs->event_vq);
+ for (i = 0; i < vs->conf.num_queues; i++) {
+ virtio_delete_queue(vs->cmd_vqs[i]);
+ }
g_free(vs->cmd_vqs);
virtio_cleanup(vdev);
}
diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index af524fabf7..beaa285685 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -180,7 +180,7 @@ static arm_timer_state *arm_timer_init(uint32_t freq)
s->control = TIMER_CTRL_IE;
s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_DEFAULT);
- vmstate_register(NULL, -1, &vmstate_arm_timer, s);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s);
return s;
}
diff --git a/hw/tpm/tpm_emulator.c b/hw/tpm/tpm_emulator.c
index 10d587ed40..3a0fc442f3 100644
--- a/hw/tpm/tpm_emulator.c
+++ b/hw/tpm/tpm_emulator.c
@@ -914,7 +914,8 @@ static void tpm_emulator_inst_init(Object *obj)
tpm_emu->cur_locty_number = ~0;
qemu_mutex_init(&tpm_emu->mutex);
- vmstate_register(NULL, -1, &vmstate_tpm_emulator, obj);
+ vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY,
+ &vmstate_tpm_emulator, obj);
}
/*
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index d27a10fcc6..2e81f5514f 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -1061,7 +1061,7 @@ static void slave_read(void *opaque)
fd[0]);
break;
default:
- error_report("Received unexpected msg type.");
+ error_report("Received unexpected msg type: %d.", hdr.request);
ret = -EINVAL;
}
diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c
index f5744363a8..b6cee479bb 100644
--- a/hw/virtio/vhost-vsock.c
+++ b/hw/virtio/vhost-vsock.c
@@ -335,8 +335,10 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp)
sizeof(struct virtio_vsock_config));
/* Receive and transmit queues belong to vhost */
- virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, vhost_vsock_handle_output);
- virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, vhost_vsock_handle_output);
+ vsock->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+ vhost_vsock_handle_output);
+ vsock->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+ vhost_vsock_handle_output);
/* The event queue belongs to QEMU */
vsock->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
@@ -363,6 +365,9 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp)
err_vhost_dev:
vhost_dev_cleanup(&vsock->vhost_dev);
err_virtio:
+ virtio_delete_queue(vsock->recv_vq);
+ virtio_delete_queue(vsock->trans_vq);
+ virtio_delete_queue(vsock->event_vq);
virtio_cleanup(vdev);
close(vhostfd);
return;
@@ -379,6 +384,9 @@ static void vhost_vsock_device_unrealize(DeviceState *dev, Error **errp)
vhost_vsock_set_status(vdev, 0);
vhost_dev_cleanup(&vsock->vhost_dev);
+ virtio_delete_queue(vsock->recv_vq);
+ virtio_delete_queue(vsock->trans_vq);
+ virtio_delete_queue(vsock->event_vq);
virtio_cleanup(vdev);
}
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 4da0d5a6c5..9edfadc81d 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -547,26 +547,28 @@ static void vhost_region_add_section(struct vhost_dev *dev,
uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
section->offset_within_region;
RAMBlock *mrs_rb = section->mr->ram_block;
- size_t mrs_page = qemu_ram_pagesize(mrs_rb);
trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
mrs_host);
- /* Round the section to it's page size */
- /* First align the start down to a page boundary */
- uint64_t alignage = mrs_host & (mrs_page - 1);
- if (alignage) {
- mrs_host -= alignage;
- mrs_size += alignage;
- mrs_gpa -= alignage;
- }
- /* Now align the size up to a page boundary */
- alignage = mrs_size & (mrs_page - 1);
- if (alignage) {
- mrs_size += mrs_page - alignage;
+ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
+ /* Round the section to its page size */
+ /* First align the start down to a page boundary */
+ size_t mrs_page = qemu_ram_pagesize(mrs_rb);
+ uint64_t alignage = mrs_host & (mrs_page - 1);
+ if (alignage) {
+ mrs_host -= alignage;
+ mrs_size += alignage;
+ mrs_gpa -= alignage;
+ }
+ /* Now align the size up to a page boundary */
+ alignage = mrs_size & (mrs_page - 1);
+ if (alignage) {
+ mrs_size += mrs_page - alignage;
+ }
+ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
+ mrs_size, mrs_host);
}
- trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size,
- mrs_host);
if (dev->n_tmp_sections) {
/* Since we already have at least one section, lets see if
@@ -590,9 +592,10 @@ static void vhost_region_add_section(struct vhost_dev *dev,
* match up in the same RAMBlock if they do.
*/
if (mrs_gpa < prev_gpa_start) {
- error_report("%s:Section rounded to %"PRIx64
- " prior to previous %"PRIx64,
- __func__, mrs_gpa, prev_gpa_start);
+ error_report("%s:Section '%s' rounded to %"PRIx64
+ " prior to previous '%s' %"PRIx64,
+ __func__, section->mr->name, mrs_gpa,
+ prev_sec->mr->name, prev_gpa_start);
/* A way to cleanly fail here would be better */
return;
}
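As a concrete example of the rounding that now only applies to vhost-user: with 4 KiB pages, a section whose host mapping starts at ...1200 with size 0x2e00 gets alignage = 0x200 in the first step (host address, GPA and size all adjusted by 0x200), and the resulting size 0x3000 is already a page multiple, so the second step adds nothing; kernel vhost backends skip this rounding entirely.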
diff --git a/include/elf.h b/include/elf.h
index 3501e0c8d0..8fbfe60e09 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -1650,6 +1650,7 @@ typedef struct elf64_shdr {
#define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */
#define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */
#define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */
+#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension regs */
/*
* Physical entry point into the kernel.
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index fd499f7e2f..53de19753a 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -25,9 +25,13 @@
*
* The syntax for the accessors is:
*
- * load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
+ * load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
+ * cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)
+ * cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmu_idx, retaddr)
*
- * store: cpu_st{sign}{size}_{mmusuffix}(env, ptr, val)
+ * store: cpu_st{size}_{mmusuffix}(env, ptr, val)
+ * cpu_st{size}_{mmusuffix}_ra(env, ptr, val, retaddr)
+ * cpu_st{size}_mmuidx_ra(env, ptr, val, mmu_idx, retaddr)
*
* sign is:
* (empty): for 32 and 64 bit sizes
@@ -40,9 +44,10 @@
* l: 32 bits
* q: 64 bits
*
- * mmusuffix is one of the generic suffixes "data" or "code", or
- * (for softmmu configs) a target-specific MMU mode suffix as defined
- * in target cpu.h.
+ * mmusuffix is one of the generic suffixes "data" or "code", or "mmuidx".
+ * The "mmuidx" suffix carries an extra mmu_idx argument that specifies
+ * the index to use; the "data" and "code" suffixes take the index from
+ * cpu_mmu_index().
*/
#ifndef CPU_LDST_H
#define CPU_LDST_H
@@ -89,6 +94,34 @@ typedef target_ulong abi_ptr;
#define TARGET_ABI_FMT_ptr TARGET_ABI_FMT_lx
#endif
+uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr);
+uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr);
+uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr);
+uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr);
+int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr);
+int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr);
+
+uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr);
+
+void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
+void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
+void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val);
+void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val);
+
+void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint32_t val, uintptr_t retaddr);
+void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint32_t val, uintptr_t retaddr);
+void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint32_t val, uintptr_t retaddr);
+void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr,
+ uint64_t val, uintptr_t retaddr);
+
#if defined(CONFIG_USER_ONLY)
extern __thread uintptr_t helper_retaddr;
@@ -113,47 +146,75 @@ static inline void clear_helper_retaddr(void)
helper_retaddr = 0;
}
-/* In user-only mode we provide only the _code and _data accessors. */
+/*
+ * Provide the same *_mmuidx_ra interface as for softmmu.
+ * The mmu_idx argument is ignored.
+ */
-#define MEMSUFFIX _data
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_useronly_template.h"
+static inline uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_ldub_data_ra(env, addr, ra);
+}
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_useronly_template.h"
+static inline uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_lduw_data_ra(env, addr, ra);
+}
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_useronly_template.h"
+static inline uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_ldl_data_ra(env, addr, ra);
+}
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_useronly_template.h"
-#undef MEMSUFFIX
+static inline uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_ldq_data_ra(env, addr, ra);
+}
-/*
- * Code access is deprecated in favour of translator_ld* functions
- * (see translator.h). However there are still users that need to
- * converted so for now these stay.
- */
-#define MEMSUFFIX _code
-#define CODE_ACCESS
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_useronly_template.h"
+static inline int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_ldsb_data_ra(env, addr, ra);
+}
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_useronly_template.h"
+static inline int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra)
+{
+ return cpu_ldsw_data_ra(env, addr, ra);
+}
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_useronly_template.h"
+static inline void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ uint32_t val, int mmu_idx, uintptr_t ra)
+{
+ cpu_stb_data_ra(env, addr, val, ra);
+}
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_useronly_template.h"
-#undef MEMSUFFIX
-#undef CODE_ACCESS
+static inline void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ uint32_t val, int mmu_idx, uintptr_t ra)
+{
+ cpu_stw_data_ra(env, addr, val, ra);
+}
+
+static inline void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ uint32_t val, int mmu_idx, uintptr_t ra)
+{
+ cpu_stl_data_ra(env, addr, val, ra);
+}
+
+static inline void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ uint64_t val, int mmu_idx, uintptr_t ra)
+{
+ cpu_stq_data_ra(env, addr, val, ra);
+}
#else
-/* The memory helpers for tcg-generated code need tcg_target_long etc. */
-#include "tcg.h"
+/* Needed for TCG_OVERSIZED_GUEST */
+#include "tcg/tcg.h"
static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
{
@@ -173,11 +234,6 @@ static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
return (addr >> TARGET_PAGE_BITS) & size_mask;
}
-static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
-{
- return (env_tlb(env)->f[mmu_idx].mask >> CPU_TLB_ENTRY_BITS) + 1;
-}
-
/* Find the TLB entry corresponding to the mmu_idx + address pair. */
static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
target_ulong addr)
@@ -185,280 +241,45 @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
return &env_tlb(env)->f[mmu_idx].table[tlb_index(env, mmu_idx, addr)];
}
-#ifdef MMU_MODE0_SUFFIX
-#define CPU_MMU_INDEX 0
-#define MEMSUFFIX MMU_MODE0_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif
-
-#if (NB_MMU_MODES >= 2) && defined(MMU_MODE1_SUFFIX)
-#define CPU_MMU_INDEX 1
-#define MEMSUFFIX MMU_MODE1_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif
-
-#if (NB_MMU_MODES >= 3) && defined(MMU_MODE2_SUFFIX)
-
-#define CPU_MMU_INDEX 2
-#define MEMSUFFIX MMU_MODE2_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 3) */
-
-#if (NB_MMU_MODES >= 4) && defined(MMU_MODE3_SUFFIX)
-
-#define CPU_MMU_INDEX 3
-#define MEMSUFFIX MMU_MODE3_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 4) */
-
-#if (NB_MMU_MODES >= 5) && defined(MMU_MODE4_SUFFIX)
-
-#define CPU_MMU_INDEX 4
-#define MEMSUFFIX MMU_MODE4_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
+uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra);
+uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra);
+uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra);
+uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra);
+
+int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra);
+int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr,
+ int mmu_idx, uintptr_t ra);
+
+void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+ int mmu_idx, uintptr_t retaddr);
+void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+ int mmu_idx, uintptr_t retaddr);
+void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val,
+ int mmu_idx, uintptr_t retaddr);
+void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val,
+ int mmu_idx, uintptr_t retaddr);
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 5) */
-
-#if (NB_MMU_MODES >= 6) && defined(MMU_MODE5_SUFFIX)
-
-#define CPU_MMU_INDEX 5
-#define MEMSUFFIX MMU_MODE5_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 6) */
-
-#if (NB_MMU_MODES >= 7) && defined(MMU_MODE6_SUFFIX)
-
-#define CPU_MMU_INDEX 6
-#define MEMSUFFIX MMU_MODE6_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 7) */
-
-#if (NB_MMU_MODES >= 8) && defined(MMU_MODE7_SUFFIX)
-
-#define CPU_MMU_INDEX 7
-#define MEMSUFFIX MMU_MODE7_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 8) */
-
-#if (NB_MMU_MODES >= 9) && defined(MMU_MODE8_SUFFIX)
-
-#define CPU_MMU_INDEX 8
-#define MEMSUFFIX MMU_MODE8_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 9) */
-
-#if (NB_MMU_MODES >= 10) && defined(MMU_MODE9_SUFFIX)
-
-#define CPU_MMU_INDEX 9
-#define MEMSUFFIX MMU_MODE9_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 10) */
-
-#if (NB_MMU_MODES >= 11) && defined(MMU_MODE10_SUFFIX)
-
-#define CPU_MMU_INDEX 10
-#define MEMSUFFIX MMU_MODE10_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 11) */
-
-#if (NB_MMU_MODES >= 12) && defined(MMU_MODE11_SUFFIX)
-
-#define CPU_MMU_INDEX 11
-#define MEMSUFFIX MMU_MODE11_SUFFIX
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif /* (NB_MMU_MODES >= 12) */
-
-#if (NB_MMU_MODES > 12)
-#error "NB_MMU_MODES > 12 is not supported for now"
-#endif /* (NB_MMU_MODES > 12) */
-
-/* these access are slower, they must be as rare as possible */
-#define CPU_MMU_INDEX (cpu_mmu_index(env, false))
-#define MEMSUFFIX _data
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-
-/*
- * Code access is deprecated in favour of translator_ld* functions
- * (see translator.h). However there are still users that need to
- * converted so for now these stay.
- */
-
-#define CPU_MMU_INDEX (cpu_mmu_index(env, true))
-#define MEMSUFFIX _code
-#define SOFTMMU_CODE_ACCESS
-
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
+#endif /* defined(CONFIG_USER_ONLY) */
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
+uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr);
+uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr);
+uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr);
+uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr);
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#undef SOFTMMU_CODE_ACCESS
+static inline int cpu_ldsb_code(CPUArchState *env, abi_ptr addr)
+{
+ return (int8_t)cpu_ldub_code(env, addr);
+}
-#endif /* defined(CONFIG_USER_ONLY) */
+static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr addr)
+{
+ return (int16_t)cpu_lduw_code(env, addr);
+}
/**
* tlb_vaddr_to_host:
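For target code the net effect of this rework is that helpers call ordinary functions instead of per-target template expansions. An illustrative (not target-specific) helper using the new mmuidx variant, attributing any fault to the TCG caller via GETPC():

    uint32_t HELPER(illustrative_ldl)(CPUArchState *env, target_ulong addr,
                                      uint32_t mmu_idx)
    {
        return cpu_ldl_mmuidx_ra(env, addr, mmu_idx, GETPC());
    }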
diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h
deleted file mode 100644
index 54b5e858ce..0000000000
--- a/include/exec/cpu_ldst_template.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Software MMU support
- *
- * Generate inline load/store functions for one MMU mode and data
- * size.
- *
- * Generate a store function as well as signed and unsigned loads.
- *
- * Not used directly but included from cpu_ldst.h.
- *
- * Copyright (c) 2003 Fabrice Bellard
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#if !defined(SOFTMMU_CODE_ACCESS)
-#include "trace-root.h"
-#endif
-
-#include "qemu/plugin.h"
-#include "trace/mem.h"
-
-#if DATA_SIZE == 8
-#define SUFFIX q
-#define USUFFIX q
-#define DATA_TYPE uint64_t
-#define SHIFT 3
-#elif DATA_SIZE == 4
-#define SUFFIX l
-#define USUFFIX l
-#define DATA_TYPE uint32_t
-#define SHIFT 2
-#elif DATA_SIZE == 2
-#define SUFFIX w
-#define USUFFIX uw
-#define DATA_TYPE uint16_t
-#define DATA_STYPE int16_t
-#define SHIFT 1
-#elif DATA_SIZE == 1
-#define SUFFIX b
-#define USUFFIX ub
-#define DATA_TYPE uint8_t
-#define DATA_STYPE int8_t
-#define SHIFT 0
-#else
-#error unsupported data size
-#endif
-
-#if DATA_SIZE == 8
-#define RES_TYPE uint64_t
-#else
-#define RES_TYPE uint32_t
-#endif
-
-#ifdef SOFTMMU_CODE_ACCESS
-#define ADDR_READ addr_code
-#define MMUSUFFIX _cmmu
-#define URETSUFFIX USUFFIX
-#define SRETSUFFIX glue(s, SUFFIX)
-#else
-#define ADDR_READ addr_read
-#define MMUSUFFIX _mmu
-#define URETSUFFIX USUFFIX
-#define SRETSUFFIX glue(s, SUFFIX)
-#endif
-
-/* generic load/store macros */
-
-static inline RES_TYPE
-glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
- target_ulong ptr,
- uintptr_t retaddr)
-{
- CPUTLBEntry *entry;
- RES_TYPE res;
- target_ulong addr;
- int mmu_idx = CPU_MMU_INDEX;
- TCGMemOpIdx oi;
-#if !defined(SOFTMMU_CODE_ACCESS)
- uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false, mmu_idx);
- trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-#endif
-
- addr = ptr;
- entry = tlb_entry(env, mmu_idx, addr);
- if (unlikely(entry->ADDR_READ !=
- (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
- oi = make_memop_idx(SHIFT, mmu_idx);
- res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
- oi, retaddr);
- } else {
- uintptr_t hostaddr = addr + entry->addend;
- res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
- }
-#ifndef SOFTMMU_CODE_ACCESS
- qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
- return res;
-}
-
-static inline RES_TYPE
-glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
-{
- return glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
-}
-
-#if DATA_SIZE <= 2
-static inline int
-glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
- target_ulong ptr,
- uintptr_t retaddr)
-{
- CPUTLBEntry *entry;
- int res;
- target_ulong addr;
- int mmu_idx = CPU_MMU_INDEX;
- TCGMemOpIdx oi;
-#if !defined(SOFTMMU_CODE_ACCESS)
- uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false, mmu_idx);
- trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-#endif
-
- addr = ptr;
- entry = tlb_entry(env, mmu_idx, addr);
- if (unlikely(entry->ADDR_READ !=
- (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
- oi = make_memop_idx(SHIFT, mmu_idx);
- res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
- MMUSUFFIX)(env, addr, oi, retaddr);
- } else {
- uintptr_t hostaddr = addr + entry->addend;
- res = glue(glue(lds, SUFFIX), _p)((uint8_t *)hostaddr);
- }
-#ifndef SOFTMMU_CODE_ACCESS
- qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
- return res;
-}
-
-static inline int
-glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
-{
- return glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
-}
-#endif
-
-#ifndef SOFTMMU_CODE_ACCESS
-
-/* generic store macro */
-
-static inline void
-glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
- target_ulong ptr,
- RES_TYPE v, uintptr_t retaddr)
-{
- CPUTLBEntry *entry;
- target_ulong addr;
- int mmu_idx = CPU_MMU_INDEX;
- TCGMemOpIdx oi;
-#if !defined(SOFTMMU_CODE_ACCESS)
- uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true, mmu_idx);
- trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
-#endif
-
- addr = ptr;
- entry = tlb_entry(env, mmu_idx, addr);
- if (unlikely(tlb_addr_write(entry) !=
- (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
- oi = make_memop_idx(SHIFT, mmu_idx);
- glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
- retaddr);
- } else {
- uintptr_t hostaddr = addr + entry->addend;
- glue(glue(st, SUFFIX), _p)((uint8_t *)hostaddr, v);
- }
-#ifndef SOFTMMU_CODE_ACCESS
- qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
-}
-
-static inline void
-glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
- RES_TYPE v)
-{
- glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(env, ptr, v, 0);
-}
-
-#endif /* !SOFTMMU_CODE_ACCESS */
-
-#undef RES_TYPE
-#undef DATA_TYPE
-#undef DATA_STYPE
-#undef SUFFIX
-#undef USUFFIX
-#undef DATA_SIZE
-#undef MMUSUFFIX
-#undef ADDR_READ
-#undef URETSUFFIX
-#undef SRETSUFFIX
-#undef SHIFT
diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h
deleted file mode 100644
index dbdc7a845d..0000000000
--- a/include/exec/cpu_ldst_useronly_template.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * User-only accessor function support
- *
- * Generate inline load/store functions for one data size.
- *
- * Generate a store function as well as signed and unsigned loads.
- *
- * Not used directly but included from cpu_ldst.h.
- *
- * Copyright (c) 2015 Linaro Limited
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#if !defined(CODE_ACCESS)
-#include "trace-root.h"
-#endif
-
-#include "trace/mem.h"
-
-#if DATA_SIZE == 8
-#define SUFFIX q
-#define USUFFIX q
-#define DATA_TYPE uint64_t
-#define SHIFT 3
-#elif DATA_SIZE == 4
-#define SUFFIX l
-#define USUFFIX l
-#define DATA_TYPE uint32_t
-#define SHIFT 2
-#elif DATA_SIZE == 2
-#define SUFFIX w
-#define USUFFIX uw
-#define DATA_TYPE uint16_t
-#define DATA_STYPE int16_t
-#define SHIFT 1
-#elif DATA_SIZE == 1
-#define SUFFIX b
-#define USUFFIX ub
-#define DATA_TYPE uint8_t
-#define DATA_STYPE int8_t
-#define SHIFT 0
-#else
-#error unsupported data size
-#endif
-
-#if DATA_SIZE == 8
-#define RES_TYPE uint64_t
-#else
-#define RES_TYPE uint32_t
-#endif
-
-static inline RES_TYPE
-glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
-{
- RES_TYPE ret;
-#ifdef CODE_ACCESS
- set_helper_retaddr(1);
- ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
- clear_helper_retaddr();
-#else
- uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false,
- MMU_USER_IDX);
- trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
- ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr));
-#endif
- return ret;
-}
-
-#ifndef CODE_ACCESS
-static inline RES_TYPE
-glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
- abi_ptr ptr,
- uintptr_t retaddr)
-{
- RES_TYPE ret;
- set_helper_retaddr(retaddr);
- ret = glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr);
- clear_helper_retaddr();
- return ret;
-}
-#endif
-
-#if DATA_SIZE <= 2
-static inline int
-glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr)
-{
- int ret;
-#ifdef CODE_ACCESS
- set_helper_retaddr(1);
- ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
- clear_helper_retaddr();
-#else
- uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false,
- MMU_USER_IDX);
- trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
- ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr));
- qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-#endif
- return ret;
-}
-
-#ifndef CODE_ACCESS
-static inline int
-glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
- abi_ptr ptr,
- uintptr_t retaddr)
-{
- int ret;
- set_helper_retaddr(retaddr);
- ret = glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr);
- clear_helper_retaddr();
- return ret;
-}
-#endif /* CODE_ACCESS */
-#endif /* DATA_SIZE <= 2 */
-
-#ifndef CODE_ACCESS
-static inline void
-glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr,
- RES_TYPE v)
-{
- uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true,
- MMU_USER_IDX);
- trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo);
- glue(glue(st, SUFFIX), _p)(g2h(ptr), v);
- qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo);
-}
-
-static inline void
-glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
- abi_ptr ptr,
- RES_TYPE v,
- uintptr_t retaddr)
-{
- set_helper_retaddr(retaddr);
- glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(env, ptr, v);
- clear_helper_retaddr();
-}
-#endif
-
-#undef RES_TYPE
-#undef DATA_TYPE
-#undef DATA_STYPE
-#undef SUFFIX
-#undef USUFFIX
-#undef DATA_SIZE
-#undef SHIFT
diff --git a/include/exec/translator.h b/include/exec/translator.h
index 459dd72aab..638e1529c5 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -148,41 +148,19 @@ void translator_loop_temp_check(DisasContextBase *db);
/*
* Translator Load Functions
*
- * These are intended to replace the old cpu_ld*_code functions and
- * are mandatory for front-ends that have been migrated to the common
- * translator_loop. These functions are only intended to be called
- * from the translation stage and should not be called from helper
- * functions. Those functions should be converted to encode the
- * relevant information at translation time.
+ * These are intended to replace the direct usage of the cpu_ld*_code
+ * functions and are mandatory for front-ends that have been migrated
+ * to the common translator_loop. These functions are only intended
+ * to be called from the translation stage and should not be called
+ * from helper functions. Those functions should be converted to encode
+ * the relevant information at translation time.
*/
-#ifdef CONFIG_USER_ONLY
-
-#define DO_LOAD(type, name, shift) \
- do { \
- set_helper_retaddr(1); \
- ret = name ## _p(g2h(pc)); \
- clear_helper_retaddr(); \
- } while (0)
-
-#else
-
-#define DO_LOAD(type, name, shift) \
- do { \
- int mmu_idx = cpu_mmu_index(env, true); \
- TCGMemOpIdx oi = make_memop_idx(shift, mmu_idx); \
- ret = helper_ret_ ## name ## _cmmu(env, pc, oi, 0); \
- } while (0)
-
-#endif
-
-#define GEN_TRANSLATOR_LD(fullname, name, type, shift, swap_fn) \
+#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn) \
static inline type \
fullname ## _swap(CPUArchState *env, abi_ptr pc, bool do_swap) \
{ \
- type ret; \
- DO_LOAD(type, name, shift); \
- \
+ type ret = load_fn(env, pc); \
if (do_swap) { \
ret = swap_fn(ret); \
} \
@@ -195,11 +173,11 @@ void translator_loop_temp_check(DisasContextBase *db);
return fullname ## _swap(env, pc, false); \
}
-GEN_TRANSLATOR_LD(translator_ldub, ldub, uint8_t, 0, /* no swap */ )
-GEN_TRANSLATOR_LD(translator_ldsw, ldsw, int16_t, 1, bswap16)
-GEN_TRANSLATOR_LD(translator_lduw, lduw, uint16_t, 1, bswap16)
-GEN_TRANSLATOR_LD(translator_ldl, ldl, uint32_t, 2, bswap32)
-GEN_TRANSLATOR_LD(translator_ldq, ldq, uint64_t, 3, bswap64)
+GEN_TRANSLATOR_LD(translator_ldub, uint8_t, cpu_ldub_code, /* no swap */)
+GEN_TRANSLATOR_LD(translator_ldsw, int16_t, cpu_ldsw_code, bswap16)
+GEN_TRANSLATOR_LD(translator_lduw, uint16_t, cpu_lduw_code, bswap16)
+GEN_TRANSLATOR_LD(translator_ldl, uint32_t, cpu_ldl_code, bswap32)
+GEN_TRANSLATOR_LD(translator_ldq, uint64_t, cpu_ldq_code, bswap64)
#undef GEN_TRANSLATOR_LD
#endif /* EXEC__TRANSLATOR_H */
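Editor's note: for illustration, a front-end converted to the common translator_loop fetches code bytes through the wrappers above instead of calling cpu_ld*_code directly. A minimal sketch, with a hypothetical DisasContext handling that is not taken from any particular target:

    /* Sketch: fetch one 32-bit instruction at translation time, optionally
     * byte-swapping when the guest code endianness differs from the target
     * default.  translator_ldl_swap() is generated by GEN_TRANSLATOR_LD. */
    static uint32_t fetch_insn32(CPUArchState *env, DisasContextBase *db,
                                 bool bswap_code)
    {
        uint32_t insn = translator_ldl_swap(env, db->pc_next, bswap_code);
        db->pc_next += 4;
        return insn;
    }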
diff --git a/include/hw/arm/allwinner-a10.h b/include/hw/arm/allwinner-a10.h
index 7d2d215630..40d0b1d9c0 100644
--- a/include/hw/arm/allwinner-a10.h
+++ b/include/hw/arm/allwinner-a10.h
@@ -12,12 +12,6 @@
#include "target/arm/cpu.h"
-#define AW_A10_PIC_REG_BASE 0x01c20400
-#define AW_A10_PIT_REG_BASE 0x01c20c00
-#define AW_A10_UART0_REG_BASE 0x01c28000
-#define AW_A10_EMAC_BASE 0x01c0b000
-#define AW_A10_SATA_BASE 0x01c18000
-
#define AW_A10_SDRAM_BASE 0x40000000
#define TYPE_AW_A10 "allwinner-a10"
@@ -29,7 +23,6 @@ typedef struct AwA10State {
/*< public >*/
ARMCPU cpu;
- qemu_irq irq[AW_A10_PIC_INT_NR];
AwA10PITState timer;
AwA10PICState intc;
AwEmacState emac;
diff --git a/include/hw/arm/exynos4210.h b/include/hw/arm/exynos4210.h
index f0f23b0e9b..55260394af 100644
--- a/include/hw/arm/exynos4210.h
+++ b/include/hw/arm/exynos4210.h
@@ -24,6 +24,7 @@
#ifndef EXYNOS4210_H
#define EXYNOS4210_H
+#include "hw/or-irq.h"
#include "hw/sysbus.h"
#include "target/arm/cpu-qom.h"
@@ -74,6 +75,8 @@
#define EXYNOS4210_I2C_NUMBER 9
+#define EXYNOS4210_NUM_DMA 3
+
typedef struct Exynos4210Irq {
qemu_irq int_combiner_irq[EXYNOS4210_MAX_INT_COMBINER_IN_IRQ];
qemu_irq ext_combiner_irq[EXYNOS4210_MAX_EXT_COMBINER_IN_IRQ];
@@ -97,6 +100,7 @@ typedef struct Exynos4210State {
MemoryRegion boot_secondary;
MemoryRegion bootreg_mem;
I2CBus *i2c_if[EXYNOS4210_I2C_NUMBER];
+ qemu_or_irq pl330_irq_orgate[EXYNOS4210_NUM_DMA];
} Exynos4210State;
#define TYPE_EXYNOS4210_SOC "exynos4210"
diff --git a/include/hw/arm/fsl-imx25.h b/include/hw/arm/fsl-imx25.h
index 241efb52ae..1c86bb55fb 100644
--- a/include/hw/arm/fsl-imx25.h
+++ b/include/hw/arm/fsl-imx25.h
@@ -24,6 +24,7 @@
#include "hw/timer/imx_gpt.h"
#include "hw/timer/imx_epit.h"
#include "hw/net/imx_fec.h"
+#include "hw/misc/imx_rngc.h"
#include "hw/i2c/imx_i2c.h"
#include "hw/gpio/imx_gpio.h"
#include "exec/memory.h"
@@ -50,6 +51,7 @@ typedef struct FslIMX25State {
IMXGPTState gpt[FSL_IMX25_NUM_GPTS];
IMXEPITState epit[FSL_IMX25_NUM_EPITS];
IMXFECState fec;
+ IMXRNGCState rngc;
IMXI2CState i2c[FSL_IMX25_NUM_I2CS];
IMXGPIOState gpio[FSL_IMX25_NUM_GPIOS];
MemoryRegion rom[2];
@@ -211,6 +213,8 @@ typedef struct FslIMX25State {
#define FSL_IMX25_GPIO4_SIZE 0x4000
#define FSL_IMX25_GPIO3_ADDR 0x53FA4000
#define FSL_IMX25_GPIO3_SIZE 0x4000
+#define FSL_IMX25_RNGC_ADDR 0x53FB0000
+#define FSL_IMX25_RNGC_SIZE 0x4000
#define FSL_IMX25_GPIO1_ADDR 0x53FCC000
#define FSL_IMX25_GPIO1_SIZE 0x4000
#define FSL_IMX25_GPIO2_ADDR 0x53FD0000
@@ -238,6 +242,7 @@ typedef struct FslIMX25State {
#define FSL_IMX25_EPIT1_IRQ 28
#define FSL_IMX25_EPIT2_IRQ 27
#define FSL_IMX25_FEC_IRQ 57
+#define FSL_IMX25_RNGC_IRQ 22
#define FSL_IMX25_I2C1_IRQ 3
#define FSL_IMX25_I2C2_IRQ 4
#define FSL_IMX25_I2C3_IRQ 10
diff --git a/include/hw/arm/stm32f405_soc.h b/include/hw/arm/stm32f405_soc.h
new file mode 100644
index 0000000000..1fe97f8c3a
--- /dev/null
+++ b/include/hw/arm/stm32f405_soc.h
@@ -0,0 +1,73 @@
+/*
+ * STM32F405 SoC
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef HW_ARM_STM32F405_SOC_H
+#define HW_ARM_STM32F405_SOC_H
+
+#include "hw/misc/stm32f4xx_syscfg.h"
+#include "hw/timer/stm32f2xx_timer.h"
+#include "hw/char/stm32f2xx_usart.h"
+#include "hw/adc/stm32f2xx_adc.h"
+#include "hw/misc/stm32f4xx_exti.h"
+#include "hw/or-irq.h"
+#include "hw/ssi/stm32f2xx_spi.h"
+#include "hw/arm/armv7m.h"
+
+#define TYPE_STM32F405_SOC "stm32f405-soc"
+#define STM32F405_SOC(obj) \
+ OBJECT_CHECK(STM32F405State, (obj), TYPE_STM32F405_SOC)
+
+#define STM_NUM_USARTS 7
+#define STM_NUM_TIMERS 4
+#define STM_NUM_ADCS 6
+#define STM_NUM_SPIS 6
+
+#define FLASH_BASE_ADDRESS 0x08000000
+#define FLASH_SIZE (1024 * 1024)
+#define SRAM_BASE_ADDRESS 0x20000000
+#define SRAM_SIZE (192 * 1024)
+
+typedef struct STM32F405State {
+ /*< private >*/
+ SysBusDevice parent_obj;
+ /*< public >*/
+
+ char *cpu_type;
+
+ ARMv7MState armv7m;
+
+ STM32F4xxSyscfgState syscfg;
+ STM32F4xxExtiState exti;
+ STM32F2XXUsartState usart[STM_NUM_USARTS];
+ STM32F2XXTimerState timer[STM_NUM_TIMERS];
+ qemu_or_irq adc_irqs;
+ STM32F2XXADCState adc[STM_NUM_ADCS];
+ STM32F2XXSPIState spi[STM_NUM_SPIS];
+
+ MemoryRegion sram;
+ MemoryRegion flash;
+ MemoryRegion flash_alias;
+} STM32F405State;
+
+#endif
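Editor's note: a rough sketch of how board code might instantiate this SoC. The "cpu-type" property name is an assumption based on the cpu_type field above, and the qdev calls follow the usual board-bringup pattern of this QEMU era; this is not copied from the netduinoplus2 board file in the series.

    /* Hypothetical board init fragment, assuming a "cpu-type" property. */
    DeviceState *soc = qdev_create(NULL, TYPE_STM32F405_SOC);
    qdev_prop_set_string(soc, "cpu-type", ARM_CPU_TYPE_NAME("cortex-m4"));
    object_property_set_bool(OBJECT(soc), true, "realized", &error_fatal);
    /* Load the guest image into the 1 MiB flash defined above. */
    armv7m_load_kernel(ARM_CPU(first_cpu), machine->kernel_filename, FLASH_SIZE);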
diff --git a/include/hw/misc/imx_rngc.h b/include/hw/misc/imx_rngc.h
new file mode 100644
index 0000000000..f0d2b44d4f
--- /dev/null
+++ b/include/hw/misc/imx_rngc.h
@@ -0,0 +1,35 @@
+/*
+ * Freescale i.MX RNGC emulation
+ *
+ * Copyright (C) 2020 Martin Kaiser <martin@kaiser.cx>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef IMX_RNGC_H
+#define IMX_RNGC_H
+
+#include "hw/sysbus.h"
+
+#define TYPE_IMX_RNGC "imx.rngc"
+#define IMX_RNGC(obj) OBJECT_CHECK(IMXRNGCState, (obj), TYPE_IMX_RNGC)
+
+typedef struct IMXRNGCState {
+ /*< private >*/
+ SysBusDevice parent_obj;
+
+ /*< public >*/
+ MemoryRegion iomem;
+
+ uint8_t op_self_test;
+ uint8_t op_seed;
+ uint8_t mask;
+ bool auto_seed;
+
+ QEMUBH *self_test_bh;
+ QEMUBH *seed_bh;
+ qemu_irq irq;
+} IMXRNGCState;
+
+#endif /* IMX_RNGC_H */
diff --git a/include/hw/misc/stm32f4xx_exti.h b/include/hw/misc/stm32f4xx_exti.h
new file mode 100644
index 0000000000..707036a41b
--- /dev/null
+++ b/include/hw/misc/stm32f4xx_exti.h
@@ -0,0 +1,60 @@
+/*
+ * STM32F4XX EXTI
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef HW_STM_EXTI_H
+#define HW_STM_EXTI_H
+
+#include "hw/sysbus.h"
+#include "hw/hw.h"
+
+#define EXTI_IMR 0x00
+#define EXTI_EMR 0x04
+#define EXTI_RTSR 0x08
+#define EXTI_FTSR 0x0C
+#define EXTI_SWIER 0x10
+#define EXTI_PR 0x14
+
+#define TYPE_STM32F4XX_EXTI "stm32f4xx-exti"
+#define STM32F4XX_EXTI(obj) \
+ OBJECT_CHECK(STM32F4xxExtiState, (obj), TYPE_STM32F4XX_EXTI)
+
+#define NUM_GPIO_EVENT_IN_LINES 16
+#define NUM_INTERRUPT_OUT_LINES 16
+
+typedef struct {
+ SysBusDevice parent_obj;
+
+ MemoryRegion mmio;
+
+ uint32_t exti_imr;
+ uint32_t exti_emr;
+ uint32_t exti_rtsr;
+ uint32_t exti_ftsr;
+ uint32_t exti_swier;
+ uint32_t exti_pr;
+
+ qemu_irq irq[NUM_INTERRUPT_OUT_LINES];
+} STM32F4xxExtiState;
+
+#endif
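Editor's note: the register offsets above are enough to poke the model from guest code. A bare-metal guest sketch that software-triggers EXTI line 0; the 0x40013C00 base address is the usual STM32F4 EXTI location and is an assumption here, since this header does not define it.

    /* Unmask line 0 in IMR, then raise it via the software interrupt
     * event register (SWIER); the model latches it in PR and asserts
     * the corresponding output IRQ line. */
    static void exti_software_trigger_line0(void)
    {
        volatile uint32_t *exti = (volatile uint32_t *)0x40013C00UL; /* assumed base */

        exti[EXTI_IMR / 4]   |= 1u << 0;   /* unmask line 0 */
        exti[EXTI_SWIER / 4] |= 1u << 0;   /* software-trigger line 0 */
    }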
diff --git a/include/hw/misc/stm32f4xx_syscfg.h b/include/hw/misc/stm32f4xx_syscfg.h
new file mode 100644
index 0000000000..c62c6629e5
--- /dev/null
+++ b/include/hw/misc/stm32f4xx_syscfg.h
@@ -0,0 +1,61 @@
+/*
+ * STM32F4xx SYSCFG
+ *
+ * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef HW_STM_SYSCFG_H
+#define HW_STM_SYSCFG_H
+
+#include "hw/sysbus.h"
+#include "hw/hw.h"
+
+#define SYSCFG_MEMRMP 0x00
+#define SYSCFG_PMC 0x04
+#define SYSCFG_EXTICR1 0x08
+#define SYSCFG_EXTICR2 0x0C
+#define SYSCFG_EXTICR3 0x10
+#define SYSCFG_EXTICR4 0x14
+#define SYSCFG_CMPCR 0x20
+
+#define TYPE_STM32F4XX_SYSCFG "stm32f4xx-syscfg"
+#define STM32F4XX_SYSCFG(obj) \
+ OBJECT_CHECK(STM32F4xxSyscfgState, (obj), TYPE_STM32F4XX_SYSCFG)
+
+#define SYSCFG_NUM_EXTICR 4
+
+typedef struct {
+ /*< private >*/
+ SysBusDevice parent_obj;
+
+ /*< public >*/
+ MemoryRegion mmio;
+
+ uint32_t syscfg_memrmp;
+ uint32_t syscfg_pmc;
+ uint32_t syscfg_exticr[SYSCFG_NUM_EXTICR];
+ uint32_t syscfg_cmpcr;
+
+ qemu_irq irq;
+ qemu_irq gpio_out[16];
+} STM32F4xxSyscfgState;
+
+#endif
diff --git a/include/hw/or-irq.h b/include/hw/or-irq.h
index 3a3230dd84..0038bfbe3d 100644
--- a/include/hw/or-irq.h
+++ b/include/hw/or-irq.h
@@ -33,7 +33,7 @@
/* This can safely be increased if necessary without breaking
* migration compatibility (as long as it remains greater than 15).
*/
-#define MAX_OR_LINES 32
+#define MAX_OR_LINES 48
typedef struct OrIRQState qemu_or_irq;
diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h
index b3bcf2e632..976fbae599 100644
--- a/include/hw/pci-host/q35.h
+++ b/include/hw/pci-host/q35.h
@@ -32,6 +32,7 @@
#include "hw/acpi/ich9.h"
#include "hw/pci-host/pam.h"
#include "hw/i386/intel_iommu.h"
+#include "qemu/units.h"
#define TYPE_Q35_HOST_DEVICE "q35-pcihost"
#define Q35_HOST_DEVICE(obj) \
@@ -54,6 +55,8 @@ typedef struct MCHPCIState {
MemoryRegion smram_region, open_high_smram;
MemoryRegion smram, low_smram, high_smram;
MemoryRegion tseg_blackhole, tseg_window;
+ MemoryRegion smbase_blackhole, smbase_window;
+ bool has_smram_at_smbase;
Range pci_hole;
uint64_t below_4g_mem_size;
uint64_t above_4g_mem_size;
@@ -97,6 +100,13 @@ typedef struct Q35PCIHost {
#define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY 0xffff
#define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX 0xfff
+#define MCH_HOST_BRIDGE_SMBASE_SIZE (128 * KiB)
+#define MCH_HOST_BRIDGE_SMBASE_ADDR 0x30000
+#define MCH_HOST_BRIDGE_F_SMBASE 0x9c
+#define MCH_HOST_BRIDGE_F_SMBASE_QUERY 0xff
+#define MCH_HOST_BRIDGE_F_SMBASE_IN_RAM 0x01
+#define MCH_HOST_BRIDGE_F_SMBASE_LCK 0x02
+
#define MCH_HOST_BRIDGE_PCIEXBAR 0x60 /* 64bit register */
#define MCH_HOST_BRIDGE_PCIEXBAR_SIZE 8 /* 64bit register */
#define MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT 0xb0000000
diff --git a/include/hw/virtio/vhost-vsock.h b/include/hw/virtio/vhost-vsock.h
index d509d67c4a..bc5a988ee5 100644
--- a/include/hw/virtio/vhost-vsock.h
+++ b/include/hw/virtio/vhost-vsock.h
@@ -33,6 +33,8 @@ typedef struct {
struct vhost_virtqueue vhost_vqs[2];
struct vhost_dev vhost_dev;
VirtQueue *event_vq;
+ VirtQueue *recv_vq;
+ VirtQueue *trans_vq;
QEMUTimer *post_load_timer;
/*< public >*/
diff --git a/include/migration/register.h b/include/migration/register.h
index 00c38ebe9f..c1dcff0f90 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -71,7 +71,7 @@ typedef struct SaveVMHandlers {
} SaveVMHandlers;
int register_savevm_live(const char *idstr,
- int instance_id,
+ uint32_t instance_id,
int version_id,
const SaveVMHandlers *ops,
void *opaque);
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 4aef72c426..30667631bc 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -229,6 +229,7 @@ extern const VMStateInfo vmstate_info_tmp;
extern const VMStateInfo vmstate_info_bitmap;
extern const VMStateInfo vmstate_info_qtailq;
extern const VMStateInfo vmstate_info_gtree;
+extern const VMStateInfo vmstate_info_qlist;
#define type_check_2darray(t1,t2,n,m) ((t1(*)[n][m])0 - (t2*)0)
/*
@@ -798,6 +799,26 @@ extern const VMStateInfo vmstate_info_gtree;
.offset = offsetof(_state, _field), \
}
+/*
+ * For migrating a QLIST
+ * Target QLIST needs to be properly initialized.
+ * _type: type of QLIST element
+ * _next: name of QLIST_ENTRY entry field in QLIST element
+ * _vmsd: VMSD for QLIST element
+ * size: size of QLIST element
+ * start: offset of QLIST_ENTRY in QLIST element
+ */
+#define VMSTATE_QLIST_V(_field, _state, _version, _vmsd, _type, _next) \
+{ \
+ .name = (stringify(_field)), \
+ .version_id = (_version), \
+ .vmsd = &(_vmsd), \
+ .size = sizeof(_type), \
+ .info = &vmstate_info_qlist, \
+ .offset = offsetof(_state, _field), \
+ .start = offsetof(_type, _next), \
+}
+
/* _f : field name
_f_n : num of elements field_name
_n : num of elements
@@ -1157,8 +1178,10 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd,
bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque);
+#define VMSTATE_INSTANCE_ID_ANY -1
+
/* Returns: 0 on success, -1 on failure */
-int vmstate_register_with_alias_id(VMStateIf *obj, int instance_id,
+int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
const VMStateDescription *vmsd,
void *base, int alias_id,
int required_for_version,
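Editor's note: a hedged sketch of how a device might use the new QLIST support; the device and field names are purely illustrative. As the comment above notes, the destination QLIST must already be initialized (e.g. with QLIST_INIT in instance_init) before incoming state is loaded.

    typedef struct Request {
        uint32_t tag;
        QLIST_ENTRY(Request) next;
    } Request;

    static const VMStateDescription vmstate_request = {
        .name = "request",
        .version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_UINT32(tag, Request),
            VMSTATE_END_OF_LIST()
        }
    };

    typedef struct MyDeviceState {
        DeviceState parent_obj;
        QLIST_HEAD(, Request) requests;   /* QLIST_INIT'd in instance_init */
    } MyDeviceState;

    static const VMStateDescription vmstate_mydevice = {
        .name = "mydevice",
        .version_id = 1,
        .fields = (VMStateField[]) {
            /* field, state, version, element vmsd, element type, entry field */
            VMSTATE_QLIST_V(requests, MyDeviceState, 1,
                            vmstate_request, Request, next),
            VMSTATE_END_OF_LIST()
        }
    };

The instance_id widening to uint32_t is independent of this: passing the new VMSTATE_INSTANCE_ID_ANY asks the migration core to pick a free instance id instead of the caller supplying one.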
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index 4764d93ea3..4d4554a7ce 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -501,4 +501,43 @@ union { \
QTAILQ_RAW_TQH_CIRC(head)->tql_prev = QTAILQ_RAW_TQE_CIRC(elm, entry); \
} while (/*CONSTCOND*/0)
+#define QLIST_RAW_FIRST(head) \
+ field_at_offset(head, 0, void *)
+
+#define QLIST_RAW_NEXT(elm, entry) \
+ field_at_offset(elm, entry, void *)
+
+#define QLIST_RAW_PREVIOUS(elm, entry) \
+ field_at_offset(elm, entry + sizeof(void *), void *)
+
+#define QLIST_RAW_FOREACH(elm, head, entry) \
+ for ((elm) = *QLIST_RAW_FIRST(head); \
+ (elm); \
+ (elm) = *QLIST_RAW_NEXT(elm, entry))
+
+#define QLIST_RAW_INSERT_HEAD(head, elm, entry) do { \
+ void *first = *QLIST_RAW_FIRST(head); \
+ *QLIST_RAW_FIRST(head) = elm; \
+ *QLIST_RAW_PREVIOUS(elm, entry) = QLIST_RAW_FIRST(head); \
+ if (first) { \
+ *QLIST_RAW_NEXT(elm, entry) = first; \
+ *QLIST_RAW_PREVIOUS(first, entry) = QLIST_RAW_NEXT(elm, entry); \
+ } else { \
+ *QLIST_RAW_NEXT(elm, entry) = NULL; \
+ } \
+} while (0)
+
+#define QLIST_RAW_REVERSE(head, elm, entry) do { \
+ void *iter = *QLIST_RAW_FIRST(head), *prev = NULL, *next; \
+ while (iter) { \
+ next = *QLIST_RAW_NEXT(iter, entry); \
+ *QLIST_RAW_PREVIOUS(iter, entry) = QLIST_RAW_NEXT(next, entry); \
+ *QLIST_RAW_NEXT(iter, entry) = prev; \
+ prev = iter; \
+ iter = next; \
+ } \
+ *QLIST_RAW_FIRST(head) = prev; \
+ *QLIST_RAW_PREVIOUS(prev, entry) = QLIST_RAW_FIRST(head); \
+} while (0)
+
#endif /* QEMU_SYS_QUEUE_H */
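Editor's note: these offset-based accessors exist for callers, such as the migration code, that only know the element layout as byte offsets; the two pointer slots they touch line up with the le_next/le_prev pair inside QLIST_ENTRY. A small hypothetical sketch:

    typedef struct Node {
        int value;
        QLIST_ENTRY(Node) link;
    } Node;

    /* 'entry' is the byte offset of the QLIST_ENTRY field, the same value
     * the migration code derives from VMStateField.start. */
    static void dump_nodes(void *list_head)
    {
        size_t entry = offsetof(Node, link);
        void *elm;

        QLIST_RAW_FOREACH(elm, list_head, entry) {
            printf("%d\n", ((Node *)elm)->value);
        }
    }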
diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h
new file mode 100644
index 0000000000..f4df0a40f6
--- /dev/null
+++ b/include/standard-headers/linux/fuse.h
@@ -0,0 +1,891 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+/*
+ This file defines the kernel interface of FUSE
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+
+ This -- and only this -- header file may also be distributed under
+ the terms of the BSD Licence as follows:
+
+ Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
+*/
+
+/*
+ * This file defines the kernel interface of FUSE
+ *
+ * Protocol changelog:
+ *
+ * 7.1:
+ * - add the following messages:
+ * FUSE_SETATTR, FUSE_SYMLINK, FUSE_MKNOD, FUSE_MKDIR, FUSE_UNLINK,
+ * FUSE_RMDIR, FUSE_RENAME, FUSE_LINK, FUSE_OPEN, FUSE_READ, FUSE_WRITE,
+ * FUSE_RELEASE, FUSE_FSYNC, FUSE_FLUSH, FUSE_SETXATTR, FUSE_GETXATTR,
+ * FUSE_LISTXATTR, FUSE_REMOVEXATTR, FUSE_OPENDIR, FUSE_READDIR,
+ * FUSE_RELEASEDIR
+ * - add padding to messages to accommodate 32-bit servers on 64-bit kernels
+ *
+ * 7.2:
+ * - add FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE flags
+ * - add FUSE_FSYNCDIR message
+ *
+ * 7.3:
+ * - add FUSE_ACCESS message
+ * - add FUSE_CREATE message
+ * - add filehandle to fuse_setattr_in
+ *
+ * 7.4:
+ * - add frsize to fuse_kstatfs
+ * - clean up request size limit checking
+ *
+ * 7.5:
+ * - add flags and max_write to fuse_init_out
+ *
+ * 7.6:
+ * - add max_readahead to fuse_init_in and fuse_init_out
+ *
+ * 7.7:
+ * - add FUSE_INTERRUPT message
+ * - add POSIX file lock support
+ *
+ * 7.8:
+ * - add lock_owner and flags fields to fuse_release_in
+ * - add FUSE_BMAP message
+ * - add FUSE_DESTROY message
+ *
+ * 7.9:
+ * - new fuse_getattr_in input argument of GETATTR
+ * - add lk_flags in fuse_lk_in
+ * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
+ * - add blksize field to fuse_attr
+ * - add file flags field to fuse_read_in and fuse_write_in
+ * - Add ATIME_NOW and MTIME_NOW flags to fuse_setattr_in
+ *
+ * 7.10
+ * - add nonseekable open flag
+ *
+ * 7.11
+ * - add IOCTL message
+ * - add unsolicited notification support
+ * - add POLL message and NOTIFY_POLL notification
+ *
+ * 7.12
+ * - add umask flag to input argument of create, mknod and mkdir
+ * - add notification messages for invalidation of inodes and
+ * directory entries
+ *
+ * 7.13
+ * - make max number of background requests and congestion threshold
+ * tunables
+ *
+ * 7.14
+ * - add splice support to fuse device
+ *
+ * 7.15
+ * - add store notify
+ * - add retrieve notify
+ *
+ * 7.16
+ * - add BATCH_FORGET request
+ * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
+ * fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
+ * - add FUSE_IOCTL_32BIT flag
+ *
+ * 7.17
+ * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
+ *
+ * 7.18
+ * - add FUSE_IOCTL_DIR flag
+ * - add FUSE_NOTIFY_DELETE
+ *
+ * 7.19
+ * - add FUSE_FALLOCATE
+ *
+ * 7.20
+ * - add FUSE_AUTO_INVAL_DATA
+ *
+ * 7.21
+ * - add FUSE_READDIRPLUS
+ * - send the requested events in POLL request
+ *
+ * 7.22
+ * - add FUSE_ASYNC_DIO
+ *
+ * 7.23
+ * - add FUSE_WRITEBACK_CACHE
+ * - add time_gran to fuse_init_out
+ * - add reserved space to fuse_init_out
+ * - add FATTR_CTIME
+ * - add ctime and ctimensec to fuse_setattr_in
+ * - add FUSE_RENAME2 request
+ * - add FUSE_NO_OPEN_SUPPORT flag
+ *
+ * 7.24
+ * - add FUSE_LSEEK for SEEK_HOLE and SEEK_DATA support
+ *
+ * 7.25
+ * - add FUSE_PARALLEL_DIROPS
+ *
+ * 7.26
+ * - add FUSE_HANDLE_KILLPRIV
+ * - add FUSE_POSIX_ACL
+ *
+ * 7.27
+ * - add FUSE_ABORT_ERROR
+ *
+ * 7.28
+ * - add FUSE_COPY_FILE_RANGE
+ * - add FOPEN_CACHE_DIR
+ * - add FUSE_MAX_PAGES, add max_pages to init_out
+ * - add FUSE_CACHE_SYMLINKS
+ *
+ * 7.29
+ * - add FUSE_NO_OPENDIR_SUPPORT flag
+ *
+ * 7.30
+ * - add FUSE_EXPLICIT_INVAL_DATA
+ * - add FUSE_IOCTL_COMPAT_X32
+ *
+ * 7.31
+ * - add FUSE_WRITE_KILL_PRIV flag
+ * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING
+ * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag
+ */
+
+#ifndef _LINUX_FUSE_H
+#define _LINUX_FUSE_H
+
+#include <stdint.h>
+
+/*
+ * Version negotiation:
+ *
+ * Both the kernel and userspace send the version they support in the
+ * INIT request and reply respectively.
+ *
+ * If the major versions match then both shall use the smallest
+ * of the two minor versions for communication.
+ *
+ * If the kernel supports a larger major version, then userspace shall
+ * reply with the major version it supports, ignore the rest of the
+ * INIT message and expect a new INIT message from the kernel with a
+ * matching major version.
+ *
+ * If the library supports a larger major version, then it shall fall
+ * back to the major protocol version sent by the kernel for
+ * communication and reply with that major version (and an arbitrary
+ * supported minor version).
+ */
+
+/** Version number of this interface */
+#define FUSE_KERNEL_VERSION 7
+
+/** Minor version number of this interface */
+#define FUSE_KERNEL_MINOR_VERSION 31
+
+/** The node ID of the root inode */
+#define FUSE_ROOT_ID 1
+
+/* Make sure all structures are padded to 64bit boundary, so 32bit
+ userspace works under 64bit kernels */
+
+struct fuse_attr {
+ uint64_t ino;
+ uint64_t size;
+ uint64_t blocks;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t nlink;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t rdev;
+ uint32_t blksize;
+ uint32_t padding;
+};
+
+struct fuse_kstatfs {
+ uint64_t blocks;
+ uint64_t bfree;
+ uint64_t bavail;
+ uint64_t files;
+ uint64_t ffree;
+ uint32_t bsize;
+ uint32_t namelen;
+ uint32_t frsize;
+ uint32_t padding;
+ uint32_t spare[6];
+};
+
+struct fuse_file_lock {
+ uint64_t start;
+ uint64_t end;
+ uint32_t type;
+ uint32_t pid; /* tgid */
+};
+
+/**
+ * Bitmasks for fuse_setattr_in.valid
+ */
+#define FATTR_MODE (1 << 0)
+#define FATTR_UID (1 << 1)
+#define FATTR_GID (1 << 2)
+#define FATTR_SIZE (1 << 3)
+#define FATTR_ATIME (1 << 4)
+#define FATTR_MTIME (1 << 5)
+#define FATTR_FH (1 << 6)
+#define FATTR_ATIME_NOW (1 << 7)
+#define FATTR_MTIME_NOW (1 << 8)
+#define FATTR_LOCKOWNER (1 << 9)
+#define FATTR_CTIME (1 << 10)
+
+/**
+ * Flags returned by the OPEN request
+ *
+ * FOPEN_DIRECT_IO: bypass page cache for this open file
+ * FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ * FOPEN_NONSEEKABLE: the file is not seekable
+ * FOPEN_CACHE_DIR: allow caching this directory
+ * FOPEN_STREAM: the file is stream-like (no file position at all)
+ */
+#define FOPEN_DIRECT_IO (1 << 0)
+#define FOPEN_KEEP_CACHE (1 << 1)
+#define FOPEN_NONSEEKABLE (1 << 2)
+#define FOPEN_CACHE_DIR (1 << 3)
+#define FOPEN_STREAM (1 << 4)
+
+/**
+ * INIT request/reply flags
+ *
+ * FUSE_ASYNC_READ: asynchronous read requests
+ * FUSE_POSIX_LOCKS: remote locking for POSIX file locks
+ * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported)
+ * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem
+ * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
+ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB
+ * FUSE_DONT_MASK: don't apply umask to file mode on create operations
+ * FUSE_SPLICE_WRITE: kernel supports splice write on the device
+ * FUSE_SPLICE_MOVE: kernel supports splice move on the device
+ * FUSE_SPLICE_READ: kernel supports splice read on the device
+ * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
+ * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories
+ * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages
+ * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one)
+ * FUSE_READDIRPLUS_AUTO: adaptive readdirplus
+ * FUSE_ASYNC_DIO: asynchronous direct I/O submission
+ * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes
+ * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens
+ * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir
+ * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc
+ * FUSE_POSIX_ACL: filesystem supports posix acls
+ * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED
+ * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages
+ * FUSE_CACHE_SYMLINKS: cache READLINK responses
+ * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
+ * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
+ * FUSE_MAP_ALIGNMENT: map_alignment field is valid
+ */
+#define FUSE_ASYNC_READ (1 << 0)
+#define FUSE_POSIX_LOCKS (1 << 1)
+#define FUSE_FILE_OPS (1 << 2)
+#define FUSE_ATOMIC_O_TRUNC (1 << 3)
+#define FUSE_EXPORT_SUPPORT (1 << 4)
+#define FUSE_BIG_WRITES (1 << 5)
+#define FUSE_DONT_MASK (1 << 6)
+#define FUSE_SPLICE_WRITE (1 << 7)
+#define FUSE_SPLICE_MOVE (1 << 8)
+#define FUSE_SPLICE_READ (1 << 9)
+#define FUSE_FLOCK_LOCKS (1 << 10)
+#define FUSE_HAS_IOCTL_DIR (1 << 11)
+#define FUSE_AUTO_INVAL_DATA (1 << 12)
+#define FUSE_DO_READDIRPLUS (1 << 13)
+#define FUSE_READDIRPLUS_AUTO (1 << 14)
+#define FUSE_ASYNC_DIO (1 << 15)
+#define FUSE_WRITEBACK_CACHE (1 << 16)
+#define FUSE_NO_OPEN_SUPPORT (1 << 17)
+#define FUSE_PARALLEL_DIROPS (1 << 18)
+#define FUSE_HANDLE_KILLPRIV (1 << 19)
+#define FUSE_POSIX_ACL (1 << 20)
+#define FUSE_ABORT_ERROR (1 << 21)
+#define FUSE_MAX_PAGES (1 << 22)
+#define FUSE_CACHE_SYMLINKS (1 << 23)
+#define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
+#define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
+#define FUSE_MAP_ALIGNMENT (1 << 26)
+
+/**
+ * CUSE INIT request/reply flags
+ *
+ * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl
+ */
+#define CUSE_UNRESTRICTED_IOCTL (1 << 0)
+
+/**
+ * Release flags
+ */
+#define FUSE_RELEASE_FLUSH (1 << 0)
+#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1)
+
+/**
+ * Getattr flags
+ */
+#define FUSE_GETATTR_FH (1 << 0)
+
+/**
+ * Lock flags
+ */
+#define FUSE_LK_FLOCK (1 << 0)
+
+/**
+ * WRITE flags
+ *
+ * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed
+ * FUSE_WRITE_LOCKOWNER: lock_owner field is valid
+ * FUSE_WRITE_KILL_PRIV: kill suid and sgid bits
+ */
+#define FUSE_WRITE_CACHE (1 << 0)
+#define FUSE_WRITE_LOCKOWNER (1 << 1)
+#define FUSE_WRITE_KILL_PRIV (1 << 2)
+
+/**
+ * Read flags
+ */
+#define FUSE_READ_LOCKOWNER (1 << 1)
+
+/**
+ * Ioctl flags
+ *
+ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
+ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
+ * FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_32BIT: 32bit ioctl
+ * FUSE_IOCTL_DIR: is a directory
+ * FUSE_IOCTL_COMPAT_X32: x32 compat ioctl on 64bit machine (64bit time_t)
+ *
+ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
+ */
+#define FUSE_IOCTL_COMPAT (1 << 0)
+#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
+#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_32BIT (1 << 3)
+#define FUSE_IOCTL_DIR (1 << 4)
+#define FUSE_IOCTL_COMPAT_X32 (1 << 5)
+
+#define FUSE_IOCTL_MAX_IOV 256
+
+/**
+ * Poll flags
+ *
+ * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify
+ */
+#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
+
+/**
+ * Fsync flags
+ *
+ * FUSE_FSYNC_FDATASYNC: Sync data only, not metadata
+ */
+#define FUSE_FSYNC_FDATASYNC (1 << 0)
+
+enum fuse_opcode {
+ FUSE_LOOKUP = 1,
+ FUSE_FORGET = 2, /* no reply */
+ FUSE_GETATTR = 3,
+ FUSE_SETATTR = 4,
+ FUSE_READLINK = 5,
+ FUSE_SYMLINK = 6,
+ FUSE_MKNOD = 8,
+ FUSE_MKDIR = 9,
+ FUSE_UNLINK = 10,
+ FUSE_RMDIR = 11,
+ FUSE_RENAME = 12,
+ FUSE_LINK = 13,
+ FUSE_OPEN = 14,
+ FUSE_READ = 15,
+ FUSE_WRITE = 16,
+ FUSE_STATFS = 17,
+ FUSE_RELEASE = 18,
+ FUSE_FSYNC = 20,
+ FUSE_SETXATTR = 21,
+ FUSE_GETXATTR = 22,
+ FUSE_LISTXATTR = 23,
+ FUSE_REMOVEXATTR = 24,
+ FUSE_FLUSH = 25,
+ FUSE_INIT = 26,
+ FUSE_OPENDIR = 27,
+ FUSE_READDIR = 28,
+ FUSE_RELEASEDIR = 29,
+ FUSE_FSYNCDIR = 30,
+ FUSE_GETLK = 31,
+ FUSE_SETLK = 32,
+ FUSE_SETLKW = 33,
+ FUSE_ACCESS = 34,
+ FUSE_CREATE = 35,
+ FUSE_INTERRUPT = 36,
+ FUSE_BMAP = 37,
+ FUSE_DESTROY = 38,
+ FUSE_IOCTL = 39,
+ FUSE_POLL = 40,
+ FUSE_NOTIFY_REPLY = 41,
+ FUSE_BATCH_FORGET = 42,
+ FUSE_FALLOCATE = 43,
+ FUSE_READDIRPLUS = 44,
+ FUSE_RENAME2 = 45,
+ FUSE_LSEEK = 46,
+ FUSE_COPY_FILE_RANGE = 47,
+ FUSE_SETUPMAPPING = 48,
+ FUSE_REMOVEMAPPING = 49,
+
+ /* CUSE specific operations */
+ CUSE_INIT = 4096,
+
+ /* Reserved opcodes: helpful to detect structure endian-ness */
+ CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */
+ FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */
+};
+
+enum fuse_notify_code {
+ FUSE_NOTIFY_POLL = 1,
+ FUSE_NOTIFY_INVAL_INODE = 2,
+ FUSE_NOTIFY_INVAL_ENTRY = 3,
+ FUSE_NOTIFY_STORE = 4,
+ FUSE_NOTIFY_RETRIEVE = 5,
+ FUSE_NOTIFY_DELETE = 6,
+ FUSE_NOTIFY_CODE_MAX,
+};
+
+/* The read buffer is required to be at least 8k, but may be much larger */
+#define FUSE_MIN_READ_BUFFER 8192
+
+#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
+
+struct fuse_entry_out {
+ uint64_t nodeid; /* Inode ID */
+ uint64_t generation; /* Inode generation: nodeid:gen must
+ be unique for the fs's lifetime */
+ uint64_t entry_valid; /* Cache timeout for the name */
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t entry_valid_nsec;
+ uint32_t attr_valid_nsec;
+ struct fuse_attr attr;
+};
+
+struct fuse_forget_in {
+ uint64_t nlookup;
+};
+
+struct fuse_forget_one {
+ uint64_t nodeid;
+ uint64_t nlookup;
+};
+
+struct fuse_batch_forget_in {
+ uint32_t count;
+ uint32_t dummy;
+};
+
+struct fuse_getattr_in {
+ uint32_t getattr_flags;
+ uint32_t dummy;
+ uint64_t fh;
+};
+
+#define FUSE_COMPAT_ATTR_OUT_SIZE 96
+
+struct fuse_attr_out {
+ uint64_t attr_valid; /* Cache timeout for the attributes */
+ uint32_t attr_valid_nsec;
+ uint32_t dummy;
+ struct fuse_attr attr;
+};
+
+#define FUSE_COMPAT_MKNOD_IN_SIZE 8
+
+struct fuse_mknod_in {
+ uint32_t mode;
+ uint32_t rdev;
+ uint32_t umask;
+ uint32_t padding;
+};
+
+struct fuse_mkdir_in {
+ uint32_t mode;
+ uint32_t umask;
+};
+
+struct fuse_rename_in {
+ uint64_t newdir;
+};
+
+struct fuse_rename2_in {
+ uint64_t newdir;
+ uint32_t flags;
+ uint32_t padding;
+};
+
+struct fuse_link_in {
+ uint64_t oldnodeid;
+};
+
+struct fuse_setattr_in {
+ uint32_t valid;
+ uint32_t padding;
+ uint64_t fh;
+ uint64_t size;
+ uint64_t lock_owner;
+ uint64_t atime;
+ uint64_t mtime;
+ uint64_t ctime;
+ uint32_t atimensec;
+ uint32_t mtimensec;
+ uint32_t ctimensec;
+ uint32_t mode;
+ uint32_t unused4;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t unused5;
+};
+
+struct fuse_open_in {
+ uint32_t flags;
+ uint32_t unused;
+};
+
+struct fuse_create_in {
+ uint32_t flags;
+ uint32_t mode;
+ uint32_t umask;
+ uint32_t padding;
+};
+
+struct fuse_open_out {
+ uint64_t fh;
+ uint32_t open_flags;
+ uint32_t padding;
+};
+
+struct fuse_release_in {
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t release_flags;
+ uint64_t lock_owner;
+};
+
+struct fuse_flush_in {
+ uint64_t fh;
+ uint32_t unused;
+ uint32_t padding;
+ uint64_t lock_owner;
+};
+
+struct fuse_read_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t read_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
+};
+
+#define FUSE_COMPAT_WRITE_IN_SIZE 24
+
+struct fuse_write_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t write_flags;
+ uint64_t lock_owner;
+ uint32_t flags;
+ uint32_t padding;
+};
+
+struct fuse_write_out {
+ uint32_t size;
+ uint32_t padding;
+};
+
+#define FUSE_COMPAT_STATFS_SIZE 48
+
+struct fuse_statfs_out {
+ struct fuse_kstatfs st;
+};
+
+struct fuse_fsync_in {
+ uint64_t fh;
+ uint32_t fsync_flags;
+ uint32_t padding;
+};
+
+struct fuse_setxattr_in {
+ uint32_t size;
+ uint32_t flags;
+};
+
+struct fuse_getxattr_in {
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_getxattr_out {
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_lk_in {
+ uint64_t fh;
+ uint64_t owner;
+ struct fuse_file_lock lk;
+ uint32_t lk_flags;
+ uint32_t padding;
+};
+
+struct fuse_lk_out {
+ struct fuse_file_lock lk;
+};
+
+struct fuse_access_in {
+ uint32_t mask;
+ uint32_t padding;
+};
+
+struct fuse_init_in {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
+};
+
+#define FUSE_COMPAT_INIT_OUT_SIZE 8
+#define FUSE_COMPAT_22_INIT_OUT_SIZE 24
+
+struct fuse_init_out {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t max_readahead;
+ uint32_t flags;
+ uint16_t max_background;
+ uint16_t congestion_threshold;
+ uint32_t max_write;
+ uint32_t time_gran;
+ uint16_t max_pages;
+ uint16_t map_alignment;
+ uint32_t unused[8];
+};
+
+#define CUSE_INIT_INFO_MAX 4096
+
+struct cuse_init_in {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+};
+
+struct cuse_init_out {
+ uint32_t major;
+ uint32_t minor;
+ uint32_t unused;
+ uint32_t flags;
+ uint32_t max_read;
+ uint32_t max_write;
+ uint32_t dev_major; /* chardev major */
+ uint32_t dev_minor; /* chardev minor */
+ uint32_t spare[10];
+};
+
+struct fuse_interrupt_in {
+ uint64_t unique;
+};
+
+struct fuse_bmap_in {
+ uint64_t block;
+ uint32_t blocksize;
+ uint32_t padding;
+};
+
+struct fuse_bmap_out {
+ uint64_t block;
+};
+
+struct fuse_ioctl_in {
+ uint64_t fh;
+ uint32_t flags;
+ uint32_t cmd;
+ uint64_t arg;
+ uint32_t in_size;
+ uint32_t out_size;
+};
+
+struct fuse_ioctl_iovec {
+ uint64_t base;
+ uint64_t len;
+};
+
+struct fuse_ioctl_out {
+ int32_t result;
+ uint32_t flags;
+ uint32_t in_iovs;
+ uint32_t out_iovs;
+};
+
+struct fuse_poll_in {
+ uint64_t fh;
+ uint64_t kh;
+ uint32_t flags;
+ uint32_t events;
+};
+
+struct fuse_poll_out {
+ uint32_t revents;
+ uint32_t padding;
+};
+
+struct fuse_notify_poll_wakeup_out {
+ uint64_t kh;
+};
+
+struct fuse_fallocate_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t mode;
+ uint32_t padding;
+};
+
+struct fuse_in_header {
+ uint32_t len;
+ uint32_t opcode;
+ uint64_t unique;
+ uint64_t nodeid;
+ uint32_t uid;
+ uint32_t gid;
+ uint32_t pid;
+ uint32_t padding;
+};
+
+struct fuse_out_header {
+ uint32_t len;
+ int32_t error;
+ uint64_t unique;
+};
+
+struct fuse_dirent {
+ uint64_t ino;
+ uint64_t off;
+ uint32_t namelen;
+ uint32_t type;
+ char name[];
+};
+
+#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
+#define FUSE_DIRENT_ALIGN(x) \
+ (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
+#define FUSE_DIRENT_SIZE(d) \
+ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+struct fuse_direntplus {
+ struct fuse_entry_out entry_out;
+ struct fuse_dirent dirent;
+};
+
+#define FUSE_NAME_OFFSET_DIRENTPLUS \
+ offsetof(struct fuse_direntplus, dirent.name)
+#define FUSE_DIRENTPLUS_SIZE(d) \
+ FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
+
+struct fuse_notify_inval_inode_out {
+ uint64_t ino;
+ int64_t off;
+ int64_t len;
+};
+
+struct fuse_notify_inval_entry_out {
+ uint64_t parent;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_delete_out {
+ uint64_t parent;
+ uint64_t child;
+ uint32_t namelen;
+ uint32_t padding;
+};
+
+struct fuse_notify_store_out {
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+struct fuse_notify_retrieve_out {
+ uint64_t notify_unique;
+ uint64_t nodeid;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t padding;
+};
+
+/* Matches the size of fuse_write_in */
+struct fuse_notify_retrieve_in {
+ uint64_t dummy1;
+ uint64_t offset;
+ uint32_t size;
+ uint32_t dummy2;
+ uint64_t dummy3;
+ uint64_t dummy4;
+};
+
+/* Device ioctls: */
+#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t)
+
+struct fuse_lseek_in {
+ uint64_t fh;
+ uint64_t offset;
+ uint32_t whence;
+ uint32_t padding;
+};
+
+struct fuse_lseek_out {
+ uint64_t offset;
+};
+
+struct fuse_copy_file_range_in {
+ uint64_t fh_in;
+ uint64_t off_in;
+ uint64_t nodeid_out;
+ uint64_t fh_out;
+ uint64_t off_out;
+ uint64_t len;
+ uint64_t flags;
+};
+
+#endif /* _LINUX_FUSE_H */
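Editor's note: for orientation, the dirent macros at the end of this header are what a server uses to walk the packed record stream exchanged for FUSE_READDIR. A minimal consumer sketch, not taken from virtiofsd or libfuse:

    /* Each record is a struct fuse_dirent followed by 'namelen' bytes of
     * name, padded to an 8-byte boundary by FUSE_DIRENT_ALIGN. */
    static void walk_dirents(const char *buf, size_t len)
    {
        size_t off = 0;

        while (off + FUSE_NAME_OFFSET <= len) {
            const struct fuse_dirent *d = (const struct fuse_dirent *)(buf + off);
            size_t rec = FUSE_DIRENT_SIZE(d);

            if (off + rec > len) {
                break;              /* truncated final record */
            }
            printf("%.*s (ino %" PRIu64 ")\n", (int)d->namelen, d->name, d->ino);
            off += rec;
        }
    }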
diff --git a/tcg/tcg-gvec-desc.h b/include/tcg/tcg-gvec-desc.h
index 0224ac3e78..0224ac3e78 100644
--- a/tcg/tcg-gvec-desc.h
+++ b/include/tcg/tcg-gvec-desc.h
diff --git a/tcg/tcg-mo.h b/include/tcg/tcg-mo.h
index c2c55704e1..c2c55704e1 100644
--- a/tcg/tcg-mo.h
+++ b/include/tcg/tcg-mo.h
diff --git a/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index 830d68f697..830d68f697 100644
--- a/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
diff --git a/tcg/tcg-op.h b/include/tcg/tcg-op.h
index 4af272daa5..230db6e022 100644
--- a/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -25,7 +25,7 @@
#ifndef TCG_TCG_OP_H
#define TCG_TCG_OP_H
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "exec/helper-proto.h"
#include "exec/helper-gen.h"
diff --git a/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index 9288a04946..9288a04946 100644
--- a/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
diff --git a/tcg/tcg.h b/include/tcg/tcg.h
index 92ca10dffc..54e5446880 100644
--- a/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -31,7 +31,7 @@
#include "qemu/bitops.h"
#include "qemu/plugin.h"
#include "qemu/queue.h"
-#include "tcg-mo.h"
+#include "tcg/tcg-mo.h"
#include "tcg-target.h"
#include "qemu/int128.h"
@@ -211,7 +211,7 @@ typedef uint64_t TCGRegSet;
typedef enum TCGOpcode {
#define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name,
-#include "tcg-opc.h"
+#include "tcg/tcg-opc.h"
#undef DEF
NB_OPS,
} TCGOpcode;
@@ -1290,27 +1290,6 @@ void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
TCGMemOpIdx oi, uintptr_t retaddr);
-uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
- TCGMemOpIdx oi, uintptr_t retaddr);
-
/* Temporary aliases until backends are converted. */
#ifdef TARGET_WORDS_BIGENDIAN
# define helper_ret_ldsw_mmu helper_be_ldsw_mmu
@@ -1322,10 +1301,6 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
# define helper_ret_stw_mmu helper_be_stw_mmu
# define helper_ret_stl_mmu helper_be_stl_mmu
# define helper_ret_stq_mmu helper_be_stq_mmu
-# define helper_ret_lduw_cmmu helper_be_lduw_cmmu
-# define helper_ret_ldsw_cmmu helper_be_ldsw_cmmu
-# define helper_ret_ldl_cmmu helper_be_ldl_cmmu
-# define helper_ret_ldq_cmmu helper_be_ldq_cmmu
#else
# define helper_ret_ldsw_mmu helper_le_ldsw_mmu
# define helper_ret_lduw_mmu helper_le_lduw_mmu
@@ -1336,10 +1311,6 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr,
# define helper_ret_stw_mmu helper_le_stw_mmu
# define helper_ret_stl_mmu helper_le_stl_mmu
# define helper_ret_stq_mmu helper_le_stq_mmu
-# define helper_ret_lduw_cmmu helper_le_lduw_cmmu
-# define helper_ret_ldsw_cmmu helper_le_ldsw_cmmu
-# define helper_ret_ldl_cmmu helper_le_ldl_cmmu
-# define helper_ret_ldq_cmmu helper_le_ldq_cmmu
#endif
uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
diff --git a/include/user/syscall-trace.h b/include/user/syscall-trace.h
index 9e60473643..79fd3e5aa9 100644
--- a/include/user/syscall-trace.h
+++ b/include/user/syscall-trace.h
@@ -10,6 +10,8 @@
#ifndef _SYSCALL_TRACE_H_
#define _SYSCALL_TRACE_H_
+#include "trace-root.h"
+
/*
* These helpers just provide a common place for the various
* subsystems that want to track syscalls to put their hooks in. We
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 07b16cc0f4..f3080a1635 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -10,6 +10,7 @@
#include "qemu/path.h"
#include "qemu/queue.h"
#include "qemu/guest-random.h"
+#include "qemu/units.h"
#ifdef _ARCH_PPC64
#undef ARCH_DLINFO
@@ -2191,7 +2192,7 @@ unsigned long init_guest_space(unsigned long host_start,
* to where we need to put the commpage.
*/
munmap((void *)real_start, host_size);
- real_size = aligned_size + qemu_host_page_size;
+ real_size = aligned_size + align;
real_start = (unsigned long)
mmap((void *)real_start, real_size, PROT_NONE, flags, -1, 0);
if (real_start == (unsigned long)-1) {
@@ -2364,24 +2365,51 @@ static void load_elf_image(const char *image_name, int image_fd,
}
}
- load_addr = loaddr;
- if (ehdr->e_type == ET_DYN) {
- /* The image indicates that it can be loaded anywhere. Find a
- location that can hold the memory space required. If the
- image is pre-linked, LOADDR will be non-zero. Since we do
- not supply MAP_FIXED here we'll use that address if and
- only if it remains available. */
- load_addr = target_mmap(loaddr, hiaddr - loaddr, PROT_NONE,
- MAP_PRIVATE | MAP_ANON | MAP_NORESERVE,
- -1, 0);
- if (load_addr == -1) {
- goto exit_perror;
+ if (pinterp_name != NULL) {
+ /*
+ * This is the main executable.
+ *
+ * Reserve extra space for brk.
+ * We hold on to this space while placing the interpreter
+ * and the stack, lest they be placed immediately after
+ * the data segment and block allocation from the brk.
+ *
+ * 16MB is chosen as "large enough" without being so large
+ * that the resulting layout can no longer fit a 32-bit guest
+ * on a 32-bit host.
+ */
+ info->reserve_brk = 16 * MiB;
+ hiaddr += info->reserve_brk;
+
+ if (ehdr->e_type == ET_EXEC) {
+ /*
+ * Make sure that the low address does not conflict with
+ * MMAP_MIN_ADDR or the QEMU application itself.
+ */
+ probe_guest_base(image_name, loaddr, hiaddr);
}
- } else if (pinterp_name != NULL) {
- /* This is the main executable. Make sure that the low
- address does not conflict with MMAP_MIN_ADDR or the
- QEMU application itself. */
- probe_guest_base(image_name, loaddr, hiaddr);
+ }
+
+ /*
+ * Reserve address space for all of this.
+ *
+ * In the case of ET_EXEC, we supply MAP_FIXED so that we get
+ * exactly the address range that is required.
+ *
+ * Otherwise this is ET_DYN, and we are searching for a location
+ * that can hold the memory space required. If the image is
+ * pre-linked, LOADDR will be non-zero, and the kernel should
+ * honor that address if it happens to be free.
+ *
+ * In both cases, we will overwrite pages in this range with mappings
+ * from the executable.
+ */
+ load_addr = target_mmap(loaddr, hiaddr - loaddr, PROT_NONE,
+ MAP_PRIVATE | MAP_ANON | MAP_NORESERVE |
+ (ehdr->e_type == ET_EXEC ? MAP_FIXED : 0),
+ -1, 0);
+ if (load_addr == -1) {
+ goto exit_perror;
}
load_bias = load_addr - loaddr;
@@ -2860,6 +2888,17 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info)
bprm->core_dump = &elf_core_dump;
#endif
+ /*
+ * If we reserved extra space for brk, release it now.
+ * The implementation of do_brk in syscalls.c expects to be able
+ * to mmap pages in this space.
+ */
+ if (info->reserve_brk) {
+ abi_ulong start_brk = HOST_PAGE_ALIGN(info->brk);
+ abi_ulong end_brk = HOST_PAGE_ALIGN(info->brk + info->reserve_brk);
+ target_munmap(start_brk, end_brk - start_brk);
+ }
+
return 0;
}
diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h
index c6b9d6ad66..73dcc761e6 100644
--- a/linux-user/ioctls.h
+++ b/linux-user/ioctls.h
@@ -69,6 +69,29 @@
IOCTL(KDSETLED, 0, TYPE_INT)
IOCTL_SPECIAL(KDSIGACCEPT, 0, do_ioctl_kdsigaccept, TYPE_INT)
+ IOCTL(RTC_AIE_ON, 0, TYPE_NULL)
+ IOCTL(RTC_AIE_OFF, 0, TYPE_NULL)
+ IOCTL(RTC_UIE_ON, 0, TYPE_NULL)
+ IOCTL(RTC_UIE_OFF, 0, TYPE_NULL)
+ IOCTL(RTC_PIE_ON, 0, TYPE_NULL)
+ IOCTL(RTC_PIE_OFF, 0, TYPE_NULL)
+ IOCTL(RTC_WIE_ON, 0, TYPE_NULL)
+ IOCTL(RTC_WIE_OFF, 0, TYPE_NULL)
+ IOCTL(RTC_ALM_READ, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time)))
+ IOCTL(RTC_ALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time)))
+ IOCTL(RTC_RD_TIME, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time)))
+ IOCTL(RTC_SET_TIME, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time)))
+ IOCTL(RTC_IRQP_READ, IOC_R, MK_PTR(TYPE_ULONG))
+ IOCTL(RTC_IRQP_SET, IOC_W, TYPE_ULONG)
+ IOCTL(RTC_EPOCH_READ, IOC_R, MK_PTR(TYPE_ULONG))
+ IOCTL(RTC_EPOCH_SET, IOC_W, TYPE_ULONG)
+ IOCTL(RTC_WKALM_RD, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm)))
+ IOCTL(RTC_WKALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm)))
+ IOCTL(RTC_PLL_GET, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info)))
+ IOCTL(RTC_PLL_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info)))
+ IOCTL(RTC_VL_READ, IOC_R, MK_PTR(TYPE_INT))
+ IOCTL(RTC_VL_CLR, 0, TYPE_NULL)
+
IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT))
IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT))
IOCTL(BLKRRPART, 0, TYPE_NULL)
@@ -114,7 +137,13 @@
IOCTL(FDMSGON, 0, TYPE_NULL)
IOCTL(FDMSGOFF, 0, TYPE_NULL)
+ IOCTL(FDSETEMSGTRESH, 0, TYPE_NULL)
+ IOCTL(FDFMTBEG, 0, TYPE_NULL)
+ IOCTL(FDFMTTRK, IOC_W, MK_PTR(MK_STRUCT(STRUCT_format_descr)))
+ IOCTL(FDFMTEND, 0, TYPE_NULL)
IOCTL(FDFLUSH, 0, TYPE_NULL)
+ IOCTL(FDSETMAXERRS, IOC_W, MK_PTR(MK_STRUCT(STRUCT_floppy_max_errors)))
+ IOCTL(FDGETMAXERRS, IOC_R, MK_PTR(MK_STRUCT(STRUCT_floppy_max_errors)))
IOCTL(FDRESET, 0, TYPE_NULL)
IOCTL(FDRAWCMD, 0, TYPE_NULL)
IOCTL(FDTWADDLE, 0, TYPE_NULL)
@@ -138,6 +167,12 @@
IOCTL(FS_IOC_GETFLAGS, IOC_R, MK_PTR(TYPE_INT))
IOCTL(FS_IOC_SETFLAGS, IOC_W, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC_GETVERSION, IOC_R, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC_SETVERSION, IOC_W, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC32_GETFLAGS, IOC_R, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC32_SETFLAGS, IOC_W, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC32_GETVERSION, IOC_R, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC32_SETVERSION, IOC_W, MK_PTR(TYPE_INT))
#ifdef CONFIG_USBFS
/* USB ioctls */
@@ -522,3 +557,9 @@
IOCTL_IGNORE(TIOCSTART)
IOCTL_IGNORE(TIOCSTOP)
#endif
+
+#ifdef CONFIG_KCOV
+ IOCTL(KCOV_ENABLE, 0, TYPE_NULL)
+ IOCTL(KCOV_DISABLE, 0, TYPE_NULL)
+ IOCTL(KCOV_INIT_TRACE, IOC_R, TYPE_ULONG)
+#endif
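Editor's note: with the RTC entries above in place, a program running under linux-user emulation can drive the host RTC through the usual kernel interface. A guest-side sketch using only the standard Linux API, nothing QEMU-specific:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/rtc.h>

    int main(void)
    {
        struct rtc_time tm;
        int fd = open("/dev/rtc0", O_RDONLY);

        if (fd < 0 || ioctl(fd, RTC_RD_TIME, &tm) < 0) {
            perror("rtc");
            return 1;
        }
        printf("%04d-%02d-%02d %02d:%02d:%02d\n",
               tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
               tm.tm_hour, tm.tm_min, tm.tm_sec);
        close(fd);
        return 0;
    }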
diff --git a/linux-user/main.c b/linux-user/main.c
index 8718d03ee2..fba833aac9 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -37,7 +37,7 @@
#include "qemu/plugin.h"
#include "cpu.h"
#include "exec/exec-all.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "qemu/timer.h"
#include "qemu/envlist.h"
#include "qemu/guest-random.h"
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index f6f5fe5fbb..560a68090e 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -35,6 +35,7 @@ struct image_info {
abi_ulong end_data;
abi_ulong start_brk;
abi_ulong brk;
+ abi_ulong reserve_brk;
abi_ulong start_mmap;
abi_ulong start_stack;
abi_ulong stack_limit;
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 171c0caef3..d60142f069 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -73,6 +73,9 @@
#ifdef CONFIG_SENDFILE
#include <sys/sendfile.h>
#endif
+#ifdef CONFIG_KCOV
+#include <sys/kcov.h>
+#endif
#define termios host_termios
#define winsize host_winsize
@@ -107,6 +110,7 @@
#include <netpacket/packet.h>
#include <linux/netlink.h>
#include <linux/if_alg.h>
+#include <linux/rtc.h>
#include "linux_loop.h"
#include "uname.h"
@@ -115,6 +119,7 @@
#include "user/syscall-trace.h"
#include "qapi/error.h"
#include "fd-trans.h"
+#include "tcg/tcg.h"
#ifndef CLONE_IO
#define CLONE_IO 0x80000000 /* Clone io context */
@@ -5174,6 +5179,8 @@ static abi_long do_ioctl(int fd, int cmd, abi_long arg)
break;
case TYPE_PTRVOID:
case TYPE_INT:
+ case TYPE_LONG:
+ case TYPE_ULONG:
ret = get_errno(safe_ioctl(fd, ie->host_cmd, arg));
break;
case TYPE_PTR:
diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h
index 98c2119de9..9b61ae8547 100644
--- a/linux-user/syscall_defs.h
+++ b/linux-user/syscall_defs.h
@@ -763,6 +763,42 @@ struct target_pollfd {
#define TARGET_KDSETLED 0x4B32 /* set led state [lights, not flags] */
#define TARGET_KDSIGACCEPT 0x4B4E
+struct target_rtc_pll_info {
+ int pll_ctrl;
+ int pll_value;
+ int pll_max;
+ int pll_min;
+ int pll_posmult;
+ int pll_negmult;
+ abi_long pll_clock;
+};
+
+/* real time clock ioctls */
+#define TARGET_RTC_AIE_ON TARGET_IO('p', 0x01)
+#define TARGET_RTC_AIE_OFF TARGET_IO('p', 0x02)
+#define TARGET_RTC_UIE_ON TARGET_IO('p', 0x03)
+#define TARGET_RTC_UIE_OFF TARGET_IO('p', 0x04)
+#define TARGET_RTC_PIE_ON TARGET_IO('p', 0x05)
+#define TARGET_RTC_PIE_OFF TARGET_IO('p', 0x06)
+#define TARGET_RTC_WIE_ON TARGET_IO('p', 0x0f)
+#define TARGET_RTC_WIE_OFF TARGET_IO('p', 0x10)
+#define TARGET_RTC_ALM_READ TARGET_IOR('p', 0x08, struct rtc_time)
+#define TARGET_RTC_ALM_SET TARGET_IOW('p', 0x07, struct rtc_time)
+#define TARGET_RTC_RD_TIME TARGET_IOR('p', 0x09, struct rtc_time)
+#define TARGET_RTC_SET_TIME TARGET_IOW('p', 0x0a, struct rtc_time)
+#define TARGET_RTC_IRQP_READ TARGET_IOR('p', 0x0b, abi_ulong)
+#define TARGET_RTC_IRQP_SET TARGET_IOW('p', 0x0c, abi_ulong)
+#define TARGET_RTC_EPOCH_READ TARGET_IOR('p', 0x0d, abi_ulong)
+#define TARGET_RTC_EPOCH_SET TARGET_IOW('p', 0x0e, abi_ulong)
+#define TARGET_RTC_WKALM_RD TARGET_IOR('p', 0x10, struct rtc_wkalrm)
+#define TARGET_RTC_WKALM_SET TARGET_IOW('p', 0x0f, struct rtc_wkalrm)
+#define TARGET_RTC_PLL_GET TARGET_IOR('p', 0x11, \
+ struct target_rtc_pll_info)
+#define TARGET_RTC_PLL_SET TARGET_IOW('p', 0x12, \
+ struct target_rtc_pll_info)
+#define TARGET_RTC_VL_READ TARGET_IOR('p', 0x13, int)
+#define TARGET_RTC_VL_CLR TARGET_IO('p', 0x14)
+
#if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \
defined(TARGET_XTENSA)
#define TARGET_FIOGETOWN TARGET_IOR('f', 123, int)
@@ -899,7 +935,13 @@ struct target_pollfd {
#define TARGET_FDMSGON TARGET_IO(2, 0x45)
#define TARGET_FDMSGOFF TARGET_IO(2, 0x46)
+#define TARGET_FDFMTBEG TARGET_IO(2, 0x47)
+#define TARGET_FDFMTTRK TARGET_IOW(2, 0x48, struct format_descr)
+#define TARGET_FDFMTEND TARGET_IO(2, 0x49)
+#define TARGET_FDSETEMSGTRESH TARGET_IO(2, 0x4a)
#define TARGET_FDFLUSH TARGET_IO(2, 0x4b)
+#define TARGET_FDSETMAXERRS TARGET_IOW(2, 0x4c, struct floppy_max_errors)
+#define TARGET_FDGETMAXERRS TARGET_IOR(2, 0x0e, struct floppy_max_errors)
#define TARGET_FDRESET TARGET_IO(2, 0x54)
#define TARGET_FDRAWCMD TARGET_IO(2, 0x58)
#define TARGET_FDTWADDLE TARGET_IO(2, 0x59)
@@ -911,13 +953,19 @@ struct target_pollfd {
#define TARGET_FICLONE TARGET_IOW(0x94, 9, int)
#define TARGET_FICLONERANGE TARGET_IOW(0x94, 13, struct file_clone_range)
-/* Note that the ioctl numbers claim type "long" but the actual type
- * used by the kernel is "int".
+/*
+ * Note that the ioctl numbers for FS_IOC_<GET|SET><FLAGS|VERSION>
+ * claim type "long" but the actual type used by the kernel is "int".
*/
#define TARGET_FS_IOC_GETFLAGS TARGET_IOR('f', 1, abi_long)
#define TARGET_FS_IOC_SETFLAGS TARGET_IOW('f', 2, abi_long)
-
+#define TARGET_FS_IOC_GETVERSION TARGET_IOR('v', 1, abi_long)
+#define TARGET_FS_IOC_SETVERSION TARGET_IOW('v', 2, abi_long)
#define TARGET_FS_IOC_FIEMAP TARGET_IOWR('f',11,struct fiemap)
+#define TARGET_FS_IOC32_GETFLAGS TARGET_IOR('f', 1, int)
+#define TARGET_FS_IOC32_SETFLAGS TARGET_IOW('f', 2, int)
+#define TARGET_FS_IOC32_GETVERSION TARGET_IOR('v', 1, int)
+#define TARGET_FS_IOC32_SETVERSION TARGET_IOW('v', 2, int)
/* usb ioctls */
#define TARGET_USBDEVFS_CONTROL TARGET_IOWRU('U', 0)
@@ -2422,6 +2470,11 @@ struct target_mtpos {
#define TARGET_MTIOCGET TARGET_IOR('m', 2, struct target_mtget)
#define TARGET_MTIOCPOS TARGET_IOR('m', 3, struct target_mtpos)
+/* kcov ioctls */
+#define TARGET_KCOV_ENABLE TARGET_IO('c', 100)
+#define TARGET_KCOV_DISABLE TARGET_IO('c', 101)
+#define TARGET_KCOV_INIT_TRACE TARGET_IOR('c', 1, abi_ulong)
+
struct target_sysinfo {
abi_long uptime; /* Seconds since boot */
abi_ulong loads[3]; /* 1, 5, and 15 minute load averages */
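
The TARGET_RTC_*, TARGET_FD* and TARGET_FS_IOC* definitions above follow the kernel's _IO/_IOR/_IOW request encoding, which packs a direction, the argument size, a type character and a command number into one request code (the exact bit layout differs on a few targets). As a reference point only, here is a small Linux-only C sketch that decodes the host's RTC_RD_TIME with the _IOC_* helpers; it assumes <linux/rtc.h> and <sys/ioctl.h> are available:

#include <linux/rtc.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(void)
{
    unsigned long req = RTC_RD_TIME;   /* _IOR('p', 0x09, struct rtc_time) */

    printf("RTC_RD_TIME = 0x%08lx\n", req);
    printf("  dir  = %lu (_IOC_READ == %u)\n", _IOC_DIR(req), _IOC_READ);
    printf("  type = '%c'\n", (char)_IOC_TYPE(req));
    printf("  nr   = 0x%02lx\n", _IOC_NR(req));
    printf("  size = %lu (sizeof(struct rtc_time) == %zu)\n",
           _IOC_SIZE(req), sizeof(struct rtc_time));
    return 0;
}
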
diff --git a/linux-user/syscall_types.h b/linux-user/syscall_types.h
index 4e36983826..5ba4155047 100644
--- a/linux-user/syscall_types.h
+++ b/linux-user/syscall_types.h
@@ -255,12 +255,49 @@ STRUCT(blkpg_partition,
MK_ARRAY(TYPE_CHAR, BLKPG_DEVNAMELTH), /* devname */
MK_ARRAY(TYPE_CHAR, BLKPG_VOLNAMELTH)) /* volname */
+STRUCT(rtc_time,
+ TYPE_INT, /* tm_sec */
+ TYPE_INT, /* tm_min */
+ TYPE_INT, /* tm_hour */
+ TYPE_INT, /* tm_mday */
+ TYPE_INT, /* tm_mon */
+ TYPE_INT, /* tm_year */
+ TYPE_INT, /* tm_wday */
+ TYPE_INT, /* tm_yday */
+ TYPE_INT) /* tm_isdst */
+
+STRUCT(rtc_wkalrm,
+ TYPE_CHAR, /* enabled */
+ TYPE_CHAR, /* pending */
+ MK_STRUCT(STRUCT_rtc_time)) /* time */
+
+STRUCT(rtc_pll_info,
+ TYPE_INT, /* pll_ctrl */
+ TYPE_INT, /* pll_value */
+ TYPE_INT, /* pll_max */
+ TYPE_INT, /* pll_min */
+ TYPE_INT, /* pll_posmult */
+ TYPE_INT, /* pll_negmult */
+ TYPE_LONG) /* pll_clock */
+
STRUCT(blkpg_ioctl_arg,
TYPE_INT, /* op */
TYPE_INT, /* flags */
TYPE_INT, /* datalen */
TYPE_PTRVOID) /* data */
+STRUCT(format_descr,
+ TYPE_INT, /* device */
+ TYPE_INT, /* head */
+ TYPE_INT) /* track */
+
+STRUCT(floppy_max_errors,
+ TYPE_INT, /* abort */
+ TYPE_INT, /* read_track */
+ TYPE_INT, /* reset */
+ TYPE_INT, /* recal */
+ TYPE_INT) /* reporting */
+
#if defined(CONFIG_USBFS)
/* usb device ioctls */
STRUCT(usbdevfs_ctrltransfer,
diff --git a/migration/migration.c b/migration/migration.c
index 354ad072fa..990bff00c0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1005,17 +1005,6 @@ static bool migrate_caps_check(bool *cap_list,
#endif
if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
- if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
- /* The decompression threads asynchronously write into RAM
- * rather than use the atomic copies needed to avoid
- * userfaulting. It should be possible to fix the decompression
- * threads for compatibility in future.
- */
- error_setg(errp, "Postcopy is not currently compatible "
- "with compression");
- return false;
- }
-
/* This check is reasonably expensive, so only when it's being
* set the first time, also it's only the destination that needs
* special support.
@@ -1784,6 +1773,7 @@ void qmp_migrate_incoming(const char *uri, Error **errp)
}
if (!once) {
error_setg(errp, "The incoming migration has already been started");
+ return;
}
qemu_start_incoming_migration(uri, &local_err);
@@ -2035,11 +2025,10 @@ void qmp_migrate_set_downtime(double value, Error **errp)
}
value *= 1000; /* Convert to milliseconds */
- value = MAX(0, MIN(INT64_MAX, value));
MigrateSetParameters p = {
.has_downtime_limit = true,
- .downtime_limit = value,
+ .downtime_limit = (int64_t)value,
};
qmp_migrate_set_parameters(&p, errp);
@@ -3224,6 +3213,37 @@ void migration_consume_urgent_request(void)
qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
}
+/* Returns true if the rate limiting was broken by an urgent request */
+bool migration_rate_limit(void)
+{
+ int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+ MigrationState *s = migrate_get_current();
+
+ bool urgent = false;
+ migration_update_counters(s, now);
+ if (qemu_file_rate_limit(s->to_dst_file)) {
+ /*
+ * Wait for a delay to do rate limiting OR
+ * something urgent to post the semaphore.
+ */
+ int ms = s->iteration_start_time + BUFFER_DELAY - now;
+ trace_migration_rate_limit_pre(ms);
+ if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
+ /*
+ * We were woken by one or more urgent requests, but
+ * the timedwait will have consumed one of them.
+ * The service routine for the urgent wake decrements
+ * the semaphore itself for each item it consumes,
+ * so post back the one we just consumed here.
+ */
+ qemu_sem_post(&s->rate_limit_sem);
+ urgent = true;
+ }
+ trace_migration_rate_limit_post(urgent);
+ }
+ return urgent;
+}
+
/*
* Master migration thread on the source VM.
* It drives the migration and pumps the data down the outgoing channel.
@@ -3290,8 +3310,6 @@ static void *migration_thread(void *opaque)
trace_migration_thread_setup_complete();
while (migration_is_active(s)) {
- int64_t current_time;
-
if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
MigIterateState iter_state = migration_iteration_run(s);
if (iter_state == MIG_ITERATE_SKIP) {
@@ -3318,29 +3336,7 @@ static void *migration_thread(void *opaque)
update_iteration_initial_status(s);
}
- current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-
- migration_update_counters(s, current_time);
-
- urgent = false;
- if (qemu_file_rate_limit(s->to_dst_file)) {
- /* Wait for a delay to do rate limiting OR
- * something urgent to post the semaphore.
- */
- int ms = s->iteration_start_time + BUFFER_DELAY - current_time;
- trace_migration_thread_ratelimit_pre(ms);
- if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
- /* We were worken by one or more urgent things but
- * the timedwait will have consumed one of them.
- * The service routine for the urgent wake will dec
- * the semaphore itself for each item it consumes,
- * so add this one we just eat back.
- */
- qemu_sem_post(&s->rate_limit_sem);
- urgent = true;
- }
- trace_migration_thread_ratelimit_post(urgent);
- }
+ urgent = migration_rate_limit();
}
trace_migration_thread_after_loop();
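
The new migration_rate_limit() above centralises the trick the old migration_thread() loop used: a timed wait on rate_limit_sem that an urgent request can cut short, after which the consumed token is posted back for the urgent-request service code. The following is a minimal POSIX sketch of the same pattern, using plain sem_t/pthreads rather than QEMU's qemu_sem_* wrappers; the 50 ms/500 ms delays and the function names are invented for the example:

#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static sem_t rate_limit_sem;

/*
 * Sleep for up to delay_ms, but return early if another thread posted the
 * semaphore to flag an urgent request.  The token consumed by sem_timedwait()
 * is posted back so the urgent-request service code still sees it.
 */
static bool rate_limit(int delay_ms)
{
    struct timespec ts;
    bool urgent = false;

    clock_gettime(CLOCK_REALTIME, &ts);
    ts.tv_sec += delay_ms / 1000;
    ts.tv_nsec += (long)(delay_ms % 1000) * 1000000L;
    if (ts.tv_nsec >= 1000000000L) {
        ts.tv_sec++;
        ts.tv_nsec -= 1000000000L;
    }

    if (sem_timedwait(&rate_limit_sem, &ts) == 0) {
        sem_post(&rate_limit_sem);   /* give the consumed token back */
        urgent = true;
    }
    return urgent;
}

static void *urgent_thread(void *arg)
{
    (void)arg;
    usleep(50 * 1000);               /* urgent request arrives after 50 ms */
    sem_post(&rate_limit_sem);
    return NULL;
}

int main(void)
{
    pthread_t t;

    sem_init(&rate_limit_sem, 0, 0);
    pthread_create(&t, NULL, urgent_thread, NULL);
    printf("urgent: %d\n", rate_limit(500));   /* prints 1 well before 500 ms */
    pthread_join(&t, NULL);
    sem_destroy(&rate_limit_sem);
    return 0;
}
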
diff --git a/migration/migration.h b/migration/migration.h
index 79b3dda146..aa9ff6f27b 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -341,5 +341,6 @@ int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);
void migration_make_urgent_request(void);
void migration_consume_urgent_request(void);
+bool migration_rate_limit(void);
#endif
diff --git a/migration/ram.c b/migration/ram.c
index 96feb4062c..d2208b5534 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -703,7 +703,7 @@ typedef struct {
static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
{
- MultiFDInit_t msg;
+ MultiFDInit_t msg = {};
int ret;
msg.magic = cpu_to_be32(MULTIFD_MAGIC);
@@ -803,7 +803,10 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
}
for (i = 0; i < p->pages->used; i++) {
- packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
+ /* there are architectures where ram_addr_t is 32 bits */
+ uint64_t temp = p->pages->offset[i];
+
+ packet->offset[i] = cpu_to_be64(temp);
}
}
@@ -877,10 +880,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
}
for (i = 0; i < p->pages->used; i++) {
- ram_addr_t offset = be64_to_cpu(packet->offset[i]);
+ uint64_t offset = be64_to_cpu(packet->offset[i]);
if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
- error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
+ error_setg(errp, "multifd: offset too long %" PRIu64
" (max " RAM_ADDR_FMT ")",
offset, block->max_length);
return -1;
@@ -900,6 +903,12 @@ struct {
uint64_t packet_num;
/* send channels ready */
QemuSemaphore channels_ready;
+ /*
+ * Have we already run the terminate-threads path?  There is a race
+ * when an error arrives while we are already exiting.
+ * Accessed with atomic operations; the only valid values are 0 and 1.
+ */
+ int exiting;
} *multifd_send_state;
/*
@@ -928,6 +937,10 @@ static int multifd_send_pages(RAMState *rs)
MultiFDPages_t *pages = multifd_send_state->pages;
uint64_t transferred;
+ if (atomic_read(&multifd_send_state->exiting)) {
+ return -1;
+ }
+
qemu_sem_wait(&multifd_send_state->channels_ready);
for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
p = &multifd_send_state->params[i];
@@ -945,10 +958,10 @@ static int multifd_send_pages(RAMState *rs)
}
qemu_mutex_unlock(&p->mutex);
}
- p->pages->used = 0;
+ assert(!p->pages->used);
+ assert(!p->pages->block);
p->packet_num = multifd_send_state->packet_num++;
- p->pages->block = NULL;
multifd_send_state->pages = p->pages;
p->pages = pages;
transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
@@ -1009,6 +1022,16 @@ static void multifd_send_terminate_threads(Error *err)
}
}
+ /*
+ * We don't want to terminate the threads twice.  Depending on where
+ * the error occurs, or if two independent errors happen in two
+ * threads at the same time, this function can end up being called
+ * twice.
+ */
+ if (atomic_xchg(&multifd_send_state->exiting, 1)) {
+ return;
+ }
+
for (i = 0; i < migrate_multifd_channels(); i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
@@ -1033,6 +1056,10 @@ void multifd_save_cleanup(void)
if (p->running) {
qemu_thread_join(&p->thread);
}
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDSendParams *p = &multifd_send_state->params[i];
+
socket_send_channel_destroy(p->c);
p->c = NULL;
qemu_mutex_destroy(&p->mutex);
@@ -1118,6 +1145,10 @@ static void *multifd_send_thread(void *opaque)
while (true) {
qemu_sem_wait(&p->sem);
+
+ if (atomic_read(&multifd_send_state->exiting)) {
+ break;
+ }
qemu_mutex_lock(&p->mutex);
if (p->pending_job) {
@@ -1130,6 +1161,8 @@ static void *multifd_send_thread(void *opaque)
p->flags = 0;
p->num_packets++;
p->num_pages += used;
+ p->pages->used = 0;
+ p->pages->block = NULL;
qemu_mutex_unlock(&p->mutex);
trace_multifd_send(p->id, packet_num, used, flags,
@@ -1224,6 +1257,7 @@ int multifd_save_setup(void)
multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
multifd_send_state->pages = multifd_pages_init(page_count);
qemu_sem_init(&multifd_send_state->channels_ready, 0);
+ atomic_set(&multifd_send_state->exiting, 0);
for (i = 0; i < thread_count; i++) {
MultiFDSendParams *p = &multifd_send_state->params[i];
@@ -1236,7 +1270,7 @@ int multifd_save_setup(void)
p->id = i;
p->pages = multifd_pages_init(page_count);
p->packet_len = sizeof(MultiFDPacket_t)
- + sizeof(ram_addr_t) * page_count;
+ + sizeof(uint64_t) * page_count;
p->packet = g_malloc0(p->packet_len);
p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
p->packet->version = cpu_to_be32(MULTIFD_VERSION);
@@ -1281,7 +1315,9 @@ static void multifd_recv_terminate_threads(Error *err)
- normal quit, i.e. everything went fine, just finished
- error quit: We close the channels so the channel threads
finish the qio_channel_read_all_eof() */
- qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ if (p->c) {
+ qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ }
qemu_mutex_unlock(&p->mutex);
}
}
@@ -1307,6 +1343,10 @@ int multifd_load_cleanup(Error **errp)
qemu_sem_post(&p->sem_sync);
qemu_thread_join(&p->thread);
}
+ }
+ for (i = 0; i < migrate_multifd_channels(); i++) {
+ MultiFDRecvParams *p = &multifd_recv_state->params[i];
+
object_unref(OBJECT(p->c));
p->c = NULL;
qemu_mutex_destroy(&p->mutex);
@@ -1447,7 +1487,7 @@ int multifd_load_setup(void)
p->id = i;
p->pages = multifd_pages_init(page_count);
p->packet_len = sizeof(MultiFDPacket_t)
- + sizeof(ram_addr_t) * page_count;
+ + sizeof(uint64_t) * page_count;
p->packet = g_malloc0(p->packet_len);
p->name = g_strdup_printf("multifdrecv_%d", i);
}
@@ -1731,7 +1771,7 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
uint8_t shift = rb->clear_bmap_shift;
hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
- hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
+ hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
/*
* CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
@@ -1968,7 +2008,7 @@ static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
return;
}
- ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
+ ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}
/*
@@ -2056,7 +2096,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
uint8_t *p;
bool send_async = true;
RAMBlock *block = pss->block;
- ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
+ ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
ram_addr_t current_addr = block->offset + offset;
p = block->host + offset;
@@ -2243,7 +2283,8 @@ static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
*again = false;
return false;
}
- if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
+ if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
+ >= pss->block->used_length) {
/* Didn't find anything in this RAM Block */
pss->page = 0;
pss->block = QLIST_NEXT_RCU(pss->block, next);
@@ -2434,7 +2475,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
* it's the 1st request.
*/
error_report("ram_save_queue_pages no previous block");
- goto err;
+ return -1;
}
} else {
ramblock = qemu_ram_block_by_name(rbname);
@@ -2442,7 +2483,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
if (!ramblock) {
/* We shouldn't be asked for a non-existent RAMBlock */
error_report("ram_save_queue_pages no block '%s'", rbname);
- goto err;
+ return -1;
}
rs->last_req_rb = ramblock;
}
@@ -2451,7 +2492,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
error_report("%s request overrun start=" RAM_ADDR_FMT " len="
RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
__func__, start, len, ramblock->used_length);
- goto err;
+ return -1;
}
struct RAMSrcPageRequest *new_entry =
@@ -2467,9 +2508,6 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
qemu_mutex_unlock(&rs->src_page_req_mutex);
return 0;
-
-err:
- return -1;
}
static bool save_page_use_compression(RAMState *rs)
@@ -2537,7 +2575,7 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
bool last_stage)
{
RAMBlock *block = pss->block;
- ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
+ ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
int res;
if (control_save_page(rs, block, offset, &res)) {
@@ -2563,10 +2601,13 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
}
/*
- * do not use multifd for compression as the first page in the new
- * block should be posted out before sending the compressed page
+ * Do not use multifd for:
+ * 1. Compression, as the first page in a new block should be posted out
+ * before sending the compressed page
+ * 2. Postcopy, as one whole host page should be placed atomically
*/
- if (!save_page_use_compression(rs) && migrate_use_multifd()) {
+ if (!save_page_use_compression(rs) && migrate_use_multifd()
+ && !migration_in_postcopy()) {
return ram_save_multifd_page(rs, block, offset);
}
@@ -2617,8 +2658,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
pages += tmppages;
pss->page++;
+ /* Allow rate limiting to happen in the middle of huge pages */
+ migration_rate_limit();
} while ((pss->page & (pagesize_bits - 1)) &&
- offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
+ offset_in_ramblock(pss->block,
+ ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
/* The offset we leave with is the last one we looked at */
pss->page--;
@@ -2835,8 +2879,10 @@ void ram_postcopy_migrated_memory_release(MigrationState *ms)
while (run_start < range) {
unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
- ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
- (run_end - run_start) << TARGET_PAGE_BITS);
+ ram_discard_range(block->idstr,
+ ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
+ ((ram_addr_t)(run_end - run_start))
+ << TARGET_PAGE_BITS);
run_start = find_next_zero_bit(bitmap, range, run_end + 1);
}
}
@@ -3072,8 +3118,6 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms)
*/
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
- int ret = -1;
-
trace_ram_discard_range(rbname, start, length);
RCU_READ_LOCK_GUARD();
@@ -3081,7 +3125,7 @@ int ram_discard_range(const char *rbname, uint64_t start, size_t length)
if (!rb) {
error_report("ram_discard_range: Failed to find block '%s'", rbname);
- goto err;
+ return -1;
}
/*
@@ -3093,10 +3137,7 @@ int ram_discard_range(const char *rbname, uint64_t start, size_t length)
length >> qemu_target_page_bits());
}
- ret = ram_block_discard_range(rb, start, length);
-
-err:
- return ret;
+ return ram_block_discard_range(rb, start, length);
}
/*
@@ -3451,6 +3492,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
rs->target_page_count += pages;
/*
+ * During postcopy, it is necessary to make sure one whole host
+ * page is sent in one chunk.
+ */
+ if (migrate_postcopy_ram()) {
+ flush_compressed_data(rs);
+ }
+
+ /*
* we want to check in the 1st loop, just in case it was the 1st
* time and we had to sync the dirty bitmap.
* qemu_clock_get_ns() is a bit expensive, so we only check each
@@ -4031,8 +4080,9 @@ static int ram_load_postcopy(QEMUFile *f)
MigrationIncomingState *mis = migration_incoming_get_current();
/* Temporary page that is later 'placed' */
void *postcopy_host_page = mis->postcopy_tmp_page;
- void *last_host = NULL;
+ void *this_host = NULL;
bool all_zero = false;
+ int target_pages = 0;
while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr;
@@ -4041,6 +4091,7 @@ static int ram_load_postcopy(QEMUFile *f)
void *place_source = NULL;
RAMBlock *block = NULL;
uint8_t ch;
+ int len;
addr = qemu_get_be64(f);
@@ -4058,7 +4109,8 @@ static int ram_load_postcopy(QEMUFile *f)
trace_ram_load_postcopy_loop((uint64_t)addr, flags);
place_needed = false;
- if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
+ if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
+ RAM_SAVE_FLAG_COMPRESS_PAGE)) {
block = ram_block_from_stream(f, flags);
host = host_from_ram_block_offset(block, addr);
@@ -4067,6 +4119,7 @@ static int ram_load_postcopy(QEMUFile *f)
ret = -EINVAL;
break;
}
+ target_pages++;
matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
/*
* Postcopy requires that we place whole host pages atomically;
@@ -4076,38 +4129,47 @@ static int ram_load_postcopy(QEMUFile *f)
* that's moved into place later.
* The migration protocol uses, possibly smaller, target-pages
* however the source ensures it always sends all the components
- * of a host page in order.
+ * of a host page in one chunk.
*/
page_buffer = postcopy_host_page +
((uintptr_t)host & (block->page_size - 1));
/* If all TP are zero then we can optimise the place */
- if (!((uintptr_t)host & (block->page_size - 1))) {
+ if (target_pages == 1) {
all_zero = true;
+ this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
+ block->page_size);
} else {
/* not the 1st TP within the HP */
- if (host != (last_host + TARGET_PAGE_SIZE)) {
- error_report("Non-sequential target page %p/%p",
- host, last_host);
+ if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
+ (uintptr_t)this_host) {
+ error_report("Non-same host page %p/%p",
+ host, this_host);
ret = -EINVAL;
break;
}
}
-
/*
* If it's the last part of a host page then we place the host
* page
*/
- place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
- (block->page_size - 1)) == 0;
+ if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
+ place_needed = true;
+ target_pages = 0;
+ }
place_source = postcopy_host_page;
}
- last_host = host;
switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
case RAM_SAVE_FLAG_ZERO:
ch = qemu_get_byte(f);
- memset(page_buffer, ch, TARGET_PAGE_SIZE);
+ /*
+ * We can skip setting page_buffer when this is a zero page
+ * and block->page_size == TARGET_PAGE_SIZE.
+ */
+ if (ch || !matches_target_page_size) {
+ memset(page_buffer, ch, TARGET_PAGE_SIZE);
+ }
if (ch) {
all_zero = false;
}
@@ -4131,6 +4193,17 @@ static int ram_load_postcopy(QEMUFile *f)
TARGET_PAGE_SIZE);
}
break;
+ case RAM_SAVE_FLAG_COMPRESS_PAGE:
+ all_zero = false;
+ len = qemu_get_be32(f);
+ if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
+ error_report("Invalid compressed data length: %d", len);
+ ret = -EINVAL;
+ break;
+ }
+ decompress_data_with_multi_threads(f, page_buffer, len);
+ break;
+
case RAM_SAVE_FLAG_EOS:
/* normal exit */
multifd_recv_sync_main();
@@ -4142,6 +4215,11 @@ static int ram_load_postcopy(QEMUFile *f)
break;
}
+ /* Got the whole host page, wait for decompress before placing. */
+ if (place_needed) {
+ ret |= wait_for_decompress_done();
+ }
+
/* Detect for any possible file errors */
if (!ret && qemu_file_get_error(f)) {
ret = qemu_file_get_error(f);
@@ -4149,7 +4227,8 @@ static int ram_load_postcopy(QEMUFile *f)
if (!ret && place_needed) {
/* This gets called at the last target page in the host page */
- void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
+ void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
+ block->page_size);
if (all_zero) {
ret = postcopy_place_page_zero(mis, place_dest,
@@ -4201,13 +4280,16 @@ static void colo_flush_ram_cache(void)
while (block) {
offset = migration_bitmap_find_dirty(ram_state, block, offset);
- if (offset << TARGET_PAGE_BITS >= block->used_length) {
+ if (((ram_addr_t)offset) << TARGET_PAGE_BITS
+ >= block->used_length) {
offset = 0;
block = QLIST_NEXT_RCU(block, next);
} else {
migration_bitmap_clear_dirty(ram_state, block, offset);
- dst_host = block->host + (offset << TARGET_PAGE_BITS);
- src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
+ dst_host = block->host
+ + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
+ src_host = block->colo_cache
+ + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
}
}
@@ -4227,7 +4309,7 @@ static void colo_flush_ram_cache(void)
*/
static int ram_load_precopy(QEMUFile *f)
{
- int flags = 0, ret = 0, invalid_flags = 0, len = 0;
+ int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
/* ADVISE is earlier, it shows the source has the postcopy capability on */
bool postcopy_advised = postcopy_is_advised();
if (!migrate_use_compression()) {
@@ -4239,6 +4321,17 @@ static int ram_load_precopy(QEMUFile *f)
void *host = NULL;
uint8_t ch;
+ /*
+ * Yield periodically to let the main loop run, but an iteration of
+ * the main loop is expensive, so only do it every so many iterations.
+ */
+ if ((i & 32767) == 0 && qemu_in_coroutine()) {
+ aio_co_schedule(qemu_get_current_aio_context(),
+ qemu_coroutine_self());
+ qemu_coroutine_yield();
+ }
+ i++;
+
addr = qemu_get_be64(f);
flags = addr & ~TARGET_PAGE_MASK;
addr &= TARGET_PAGE_MASK;
@@ -4385,6 +4478,7 @@ static int ram_load_precopy(QEMUFile *f)
}
}
+ ret |= wait_for_decompress_done();
return ret;
}
@@ -4416,8 +4510,6 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
} else {
ret = ram_load_precopy(f);
}
-
- ret |= wait_for_decompress_done();
}
trace_ram_load_complete(ret, seq_iter);
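
Several of the ram.c hunks above insert (ram_addr_t) casts before << TARGET_PAGE_BITS. The reason is that the page index is a narrower integer than ram_addr_t on some 32-bit hosts, so without the cast the shift is evaluated in 32 bits and wraps for RAM beyond 4 GiB. A standalone illustration, with uint32_t standing in for the narrow index type:

#include <inttypes.h>
#include <stdio.h>

#define TARGET_PAGE_BITS 12

int main(void)
{
    uint32_t page = 0x200000;   /* page index of an offset 8 GiB into RAM */

    uint32_t truncated = page << TARGET_PAGE_BITS;             /* shift done in 32 bits */
    uint64_t widened = ((uint64_t)page) << TARGET_PAGE_BITS;   /* cast first, as in the patch */

    printf("truncated: 0x%08" PRIx32 "\n", truncated);   /* 0x00000000 */
    printf("widened:   0x%010" PRIx64 "\n", widened);    /* 0x200000000 */
    return 0;
}
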
diff --git a/migration/savevm.c b/migration/savevm.c
index 59efc1981d..adfdca26ac 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -233,7 +233,7 @@ typedef struct CompatEntry {
typedef struct SaveStateEntry {
QTAILQ_ENTRY(SaveStateEntry) entry;
char idstr[256];
- int instance_id;
+ uint32_t instance_id;
int alias_id;
int version_id;
/* version id read from the stream */
@@ -250,6 +250,7 @@ typedef struct SaveStateEntry {
typedef struct SaveState {
QTAILQ_HEAD(, SaveStateEntry) handlers;
+ SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
int global_section_id;
uint32_t len;
const char *name;
@@ -261,6 +262,7 @@ typedef struct SaveState {
static SaveState savevm_state = {
.handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
+ .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
.global_section_id = 0,
};
@@ -665,10 +667,10 @@ void dump_vmstate_json_to_file(FILE *out_file)
fclose(out_file);
}
-static int calculate_new_instance_id(const char *idstr)
+static uint32_t calculate_new_instance_id(const char *idstr)
{
SaveStateEntry *se;
- int instance_id = 0;
+ uint32_t instance_id = 0;
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
if (strcmp(idstr, se->idstr) == 0
@@ -676,6 +678,8 @@ static int calculate_new_instance_id(const char *idstr)
instance_id = se->instance_id + 1;
}
}
+ /* Make sure we never wrap around without it being noticed */
+ assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
return instance_id;
}
@@ -709,20 +713,43 @@ static void savevm_state_handler_insert(SaveStateEntry *nse)
{
MigrationPriority priority = save_state_priority(nse);
SaveStateEntry *se;
+ int i;
assert(priority <= MIG_PRI_MAX);
- QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
- if (save_state_priority(se) < priority) {
+ for (i = priority - 1; i >= 0; i--) {
+ se = savevm_state.handler_pri_head[i];
+ if (se != NULL) {
+ assert(save_state_priority(se) < priority);
break;
}
}
- if (se) {
+ if (i >= 0) {
QTAILQ_INSERT_BEFORE(se, nse, entry);
} else {
QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
}
+
+ if (savevm_state.handler_pri_head[priority] == NULL) {
+ savevm_state.handler_pri_head[priority] = nse;
+ }
+}
+
+static void savevm_state_handler_remove(SaveStateEntry *se)
+{
+ SaveStateEntry *next;
+ MigrationPriority priority = save_state_priority(se);
+
+ if (se == savevm_state.handler_pri_head[priority]) {
+ next = QTAILQ_NEXT(se, entry);
+ if (next != NULL && save_state_priority(next) == priority) {
+ savevm_state.handler_pri_head[priority] = next;
+ } else {
+ savevm_state.handler_pri_head[priority] = NULL;
+ }
+ }
+ QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
}
/* TODO: Individual devices generally have very little idea about the rest
@@ -730,7 +757,7 @@ static void savevm_state_handler_insert(SaveStateEntry *nse)
Meanwhile pass -1 as instance_id if you do not already have a clearly
distinguishing id for all instances of your device class. */
int register_savevm_live(const char *idstr,
- int instance_id,
+ uint32_t instance_id,
int version_id,
const SaveVMHandlers *ops,
void *opaque)
@@ -750,7 +777,7 @@ int register_savevm_live(const char *idstr,
pstrcat(se->idstr, sizeof(se->idstr), idstr);
- if (instance_id == -1) {
+ if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
se->instance_id = calculate_new_instance_id(se->idstr);
} else {
se->instance_id = instance_id;
@@ -777,14 +804,14 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
- QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
+ savevm_state_handler_remove(se);
g_free(se->compat);
g_free(se);
}
}
}
-int vmstate_register_with_alias_id(VMStateIf *obj, int instance_id,
+int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
const VMStateDescription *vmsd,
void *opaque, int alias_id,
int required_for_version,
@@ -817,14 +844,14 @@ int vmstate_register_with_alias_id(VMStateIf *obj, int instance_id,
se->compat = g_new0(CompatEntry, 1);
pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
- se->compat->instance_id = instance_id == -1 ?
+ se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
calculate_compat_instance_id(vmsd->name) : instance_id;
- instance_id = -1;
+ instance_id = VMSTATE_INSTANCE_ID_ANY;
}
}
pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
- if (instance_id == -1) {
+ if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
se->instance_id = calculate_new_instance_id(se->idstr);
} else {
se->instance_id = instance_id;
@@ -841,7 +868,7 @@ void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
if (se->vmsd == vmsd && se->opaque == opaque) {
- QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
+ savevm_state_handler_remove(se);
g_free(se->compat);
g_free(se);
}
@@ -1600,7 +1627,7 @@ int qemu_save_device_state(QEMUFile *f)
return qemu_file_get_error(f);
}
-static SaveStateEntry *find_se(const char *idstr, int instance_id)
+static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
{
SaveStateEntry *se;
@@ -2267,7 +2294,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
/* Find savevm section */
se = find_se(idstr, instance_id);
if (se == NULL) {
- error_report("Unknown savevm section or instance '%s' %d. "
+ error_report("Unknown savevm section or instance '%s' %"PRIu32". "
"Make sure that your current VM setup matches your "
"saved VM setup, including any hotplugged devices",
idstr, instance_id);
@@ -2291,7 +2318,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
ret = vmstate_load(f, se);
if (ret < 0) {
- error_report("error while loading state for instance 0x%x of"
+ error_report("error while loading state for instance 0x%"PRIx32" of"
" device '%s'", instance_id, idstr);
return ret;
}
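
The savevm.c change replaces the linear scan for an insertion point with a handler_pri_head[] cache holding the first entry of each priority, and adds savevm_state_handler_remove() to keep that cache coherent on removal. Below is a self-contained sketch of the same bookkeeping; it assumes the BSD <sys/queue.h> TAILQ macros (on which QEMU's QTAILQ is modelled), and PRI_MAX plus the sample entries are invented for the example:

#include <assert.h>
#include <stdio.h>
#include <sys/queue.h>

#define PRI_MAX 3

typedef struct Entry {
    int priority;
    const char *name;
    TAILQ_ENTRY(Entry) link;
} Entry;

static TAILQ_HEAD(, Entry) handlers = TAILQ_HEAD_INITIALIZER(handlers);
static Entry *pri_head[PRI_MAX + 1];   /* first entry of each priority, or NULL */

/* Keep the list sorted by descending priority; only PRI_MAX probes per insert. */
static void handler_insert(Entry *e)
{
    int i;

    assert(e->priority <= PRI_MAX);
    for (i = e->priority - 1; i >= 0; i--) {
        if (pri_head[i]) {
            TAILQ_INSERT_BEFORE(pri_head[i], e, link);
            break;
        }
    }
    if (i < 0) {
        TAILQ_INSERT_TAIL(&handlers, e, link);
    }
    if (!pri_head[e->priority]) {
        pri_head[e->priority] = e;
    }
}

static void handler_remove(Entry *e)
{
    if (pri_head[e->priority] == e) {
        Entry *next = TAILQ_NEXT(e, link);

        pri_head[e->priority] =
            (next && next->priority == e->priority) ? next : NULL;
    }
    TAILQ_REMOVE(&handlers, e, link);
}

int main(void)
{
    Entry a = { 0, "ram" }, b = { 2, "early" }, c = { 0, "dev" }, d = { 1, "mid" };
    Entry *e;

    handler_insert(&a);
    handler_insert(&b);
    handler_insert(&c);
    handler_insert(&d);
    handler_remove(&d);

    TAILQ_FOREACH(e, &handlers, link) {
        printf("%s (pri %d)\n", e->name, e->priority);
    }
    return 0;
}
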
diff --git a/migration/trace-events b/migration/trace-events
index 6dee7b5389..4ab0a503d2 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -76,6 +76,11 @@ get_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val
put_gtree(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, uint32_t nnodes) "%s(%s/%s) nnodes=%d"
put_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, int ret) "%s(%s/%s) %d"
+get_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)"
+get_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
+put_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)"
+put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
+
# qemu-file.c
qemu_file_fclose(void) ""
@@ -138,12 +143,12 @@ migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi6
migration_completion_file_err(void) ""
migration_completion_postcopy_end(void) ""
migration_completion_postcopy_end_after_complete(void) ""
+migration_rate_limit_pre(int ms) "%d ms"
+migration_rate_limit_post(int urgent) "urgent: %d"
migration_return_path_end_before(void) ""
migration_return_path_end_after(int rp_error) "%d"
migration_thread_after_loop(void) ""
migration_thread_file_err(void) ""
-migration_thread_ratelimit_pre(int ms) "%d ms"
-migration_thread_ratelimit_post(int urgent) "urgent: %d"
migration_thread_setup_complete(void) ""
open_return_path_on_source(void) ""
open_return_path_on_source_continue(void) ""
diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c
index 7236cf92bc..1eee36773a 100644
--- a/migration/vmstate-types.c
+++ b/migration/vmstate-types.c
@@ -843,3 +843,73 @@ const VMStateInfo vmstate_info_gtree = {
.get = get_gtree,
.put = put_gtree,
};
+
+static int put_qlist(QEMUFile *f, void *pv, size_t unused_size,
+ const VMStateField *field, QJSON *vmdesc)
+{
+ const VMStateDescription *vmsd = field->vmsd;
+ /* offset of the QLIST entry in a QLIST element */
+ size_t entry_offset = field->start;
+ void *elm;
+ int ret;
+
+ trace_put_qlist(field->name, vmsd->name, vmsd->version_id);
+ QLIST_RAW_FOREACH(elm, pv, entry_offset) {
+ qemu_put_byte(f, true);
+ ret = vmstate_save_state(f, vmsd, elm, vmdesc);
+ if (ret) {
+ error_report("%s: failed to save %s (%d)", field->name,
+ vmsd->name, ret);
+ return ret;
+ }
+ }
+ qemu_put_byte(f, false);
+ trace_put_qlist_end(field->name, vmsd->name);
+
+ return 0;
+}
+
+static int get_qlist(QEMUFile *f, void *pv, size_t unused_size,
+ const VMStateField *field)
+{
+ int ret = 0;
+ const VMStateDescription *vmsd = field->vmsd;
+ /* size of a QLIST element */
+ size_t size = field->size;
+ /* offset of the QLIST entry in a QLIST element */
+ size_t entry_offset = field->start;
+ int version_id = field->version_id;
+ void *elm;
+
+ trace_get_qlist(field->name, vmsd->name, vmsd->version_id);
+ if (version_id > vmsd->version_id) {
+ error_report("%s %s", vmsd->name, "too new");
+ return -EINVAL;
+ }
+ if (version_id < vmsd->minimum_version_id) {
+ error_report("%s %s", vmsd->name, "too old");
+ return -EINVAL;
+ }
+
+ while (qemu_get_byte(f)) {
+ elm = g_malloc(size);
+ ret = vmstate_load_state(f, vmsd, elm, version_id);
+ if (ret) {
+ error_report("%s: failed to load %s (%d)", field->name,
+ vmsd->name, ret);
+ g_free(elm);
+ return ret;
+ }
+ QLIST_RAW_INSERT_HEAD(pv, elm, entry_offset);
+ }
+ QLIST_RAW_REVERSE(pv, elm, entry_offset);
+ trace_get_qlist_end(field->name, vmsd->name);
+
+ return ret;
+}
+
+const VMStateInfo vmstate_info_qlist = {
+ .name = "qlist",
+ .get = get_qlist,
+ .put = put_qlist,
+};
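
put_qlist()/get_qlist() above frame every element with a leading 0x01 byte and terminate the stream with a 0x00 byte; on load the elements are inserted at the head and the list reversed once so the restored order matches the saved order. The sketch below shows just that framing: the element body is a raw int32_t for brevity (the real code delegates it to vmstate_save_state()/vmstate_load_state()), and the loader simply appends, which has the same effect as insert-head plus a final reverse:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Node {
    int32_t value;
    struct Node *next;
} Node;

static void put_list(FILE *f, const Node *head)
{
    for (const Node *n = head; n; n = n->next) {
        fputc(1, f);                                /* "one more element" marker */
        fwrite(&n->value, sizeof(n->value), 1, f);  /* element body */
    }
    fputc(0, f);                                    /* end-of-list marker */
}

static Node *get_list(FILE *f)
{
    Node *head = NULL, *tail = NULL;

    while (fgetc(f) == 1) {
        Node *n = calloc(1, sizeof(*n));

        if (fread(&n->value, sizeof(n->value), 1, f) != 1) {
            free(n);
            break;
        }
        /* append, so the restored order matches the saved order */
        if (tail) {
            tail->next = n;
        } else {
            head = n;
        }
        tail = n;
    }
    return head;
}

int main(void)
{
    Node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    FILE *f = tmpfile();

    if (!f) {
        return 1;
    }
    put_list(f, &a);
    rewind(f);
    for (Node *n = get_list(f); n; n = n->next) {
        printf("%d\n", n->value);    /* prints 1, 2, 3 */
    }
    fclose(f);
    return 0;
}
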
diff --git a/plugins/api.c b/plugins/api.c
index fa1d9f276d..53c8a73582 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -46,6 +46,7 @@
#include "qemu/plugin-memory.h"
#include "hw/boards.h"
#endif
+#include "trace/mem.h"
/* Uninstall and Reset handlers */
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 39f950471f..2328e7ea47 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -633,17 +633,6 @@ encrypted disk images.
* disk_images_snapshot_mode:: Snapshot mode
* vm_snapshots:: VM snapshots
* qemu_img_invocation:: qemu-img Invocation
-* qemu_nbd_invocation:: qemu-nbd Invocation
-* disk_images_formats:: Disk image file formats
-* host_drives:: Using host drives
-* disk_images_fat_images:: Virtual FAT disk images
-* disk_images_nbd:: NBD access
-* disk_images_sheepdog:: Sheepdog disk images
-* disk_images_iscsi:: iSCSI LUNs
-* disk_images_gluster:: GlusterFS disk images
-* disk_images_ssh:: Secure Shell (ssh) disk images
-* disk_images_nvme:: NVMe userspace driver
-* disk_image_locking:: Disk image file locking
@end menu
@node disk_images_quickstart
@@ -724,13 +713,6 @@ state is not saved or restored properly (in particular USB).
@include qemu-img.texi
-@node qemu_nbd_invocation
-@subsection @code{qemu-nbd} Invocation
-
-@include qemu-nbd.texi
-
-@include docs/qemu-block-drivers.texi
-
@node pcsys_network
@section Network emulation
diff --git a/qemu-nbd.texi b/qemu-nbd.texi
deleted file mode 100644
index 7f55657722..0000000000
--- a/qemu-nbd.texi
+++ /dev/null
@@ -1,214 +0,0 @@
-@example
-@c man begin SYNOPSIS
-@command{qemu-nbd} [OPTION]... @var{filename}
-
-@command{qemu-nbd} @option{-L} [OPTION]...
-
-@command{qemu-nbd} @option{-d} @var{dev}
-@c man end
-@end example
-
-@c man begin DESCRIPTION
-
-Export a QEMU disk image using the NBD protocol.
-
-Other uses:
-@itemize
-@item
-Bind a /dev/nbdX block device to a QEMU server (on Linux).
-@item
-As a client to query exports of a remote NBD server.
-@end itemize
-
-@c man end
-
-@c man begin OPTIONS
-@var{filename} is a disk image filename, or a set of block
-driver options if @option{--image-opts} is specified.
-
-@var{dev} is an NBD device.
-
-@table @option
-@item --object type,id=@var{id},...props...
-Define a new instance of the @var{type} object class identified by @var{id}.
-See the @code{qemu(1)} manual page for full details of the properties
-supported. The common object types that it makes sense to define are the
-@code{secret} object, which is used to supply passwords and/or encryption
-keys, and the @code{tls-creds} object, which is used to supply TLS
-credentials for the qemu-nbd server or client.
-@item -p, --port=@var{port}
-The TCP port to listen on as a server, or connect to as a client
-(default @samp{10809}).
-@item -o, --offset=@var{offset}
-The offset into the image.
-@item -b, --bind=@var{iface}
-The interface to bind to as a server, or connect to as a client
-(default @samp{0.0.0.0}).
-@item -k, --socket=@var{path}
-Use a unix socket with path @var{path}.
-@item --image-opts
-Treat @var{filename} as a set of image options, instead of a plain
-filename. If this flag is specified, the @var{-f} flag should
-not be used, instead the '@code{format=}' option should be set.
-@item -f, --format=@var{fmt}
-Force the use of the block driver for format @var{fmt} instead of
-auto-detecting.
-@item -r, --read-only
-Export the disk as read-only.
-@item -P, --partition=@var{num}
-Deprecated: Only expose MBR partition @var{num}. Understands physical
-partitions 1-4 and logical partition 5. New code should instead use
-@option{--image-opts} with the raw driver wrapping a subset of the
-original image.
-@item -B, --bitmap=@var{name}
-If @var{filename} has a qcow2 persistent bitmap @var{name}, expose
-that bitmap via the ``qemu:dirty-bitmap:@var{name}'' context
-accessible through NBD_OPT_SET_META_CONTEXT.
-@item -s, --snapshot
-Use @var{filename} as an external snapshot, create a temporary
-file with backing_file=@var{filename}, redirect the write to
-the temporary one.
-@item -l, --load-snapshot=@var{snapshot_param}
-Load an internal snapshot inside @var{filename} and export it
-as an read-only device, @var{snapshot_param} format is
-'snapshot.id=[ID],snapshot.name=[NAME]' or '[ID_OR_NAME]'
-@item -n, --nocache
-@itemx --cache=@var{cache}
-The cache mode to be used with the file. See the documentation of
-the emulator's @code{-drive cache=...} option for allowed values.
-@item --aio=@var{aio}
-Set the asynchronous I/O mode between @samp{threads} (the default)
-and @samp{native} (Linux only).
-@item --discard=@var{discard}
-Control whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap})
-requests are ignored or passed to the filesystem. @var{discard} is one of
-@samp{ignore} (or @samp{off}), @samp{unmap} (or @samp{on}). The default is
-@samp{ignore}.
-@item --detect-zeroes=@var{detect-zeroes}
-Control the automatic conversion of plain zero writes by the OS to
-driver-specific optimized zero write commands. @var{detect-zeroes} is one of
-@samp{off}, @samp{on} or @samp{unmap}. @samp{unmap}
-converts a zero write to an unmap operation and can only be used if
-@var{discard} is set to @samp{unmap}. The default is @samp{off}.
-@item -c, --connect=@var{dev}
-Connect @var{filename} to NBD device @var{dev} (Linux only).
-@item -d, --disconnect
-Disconnect the device @var{dev} (Linux only).
-@item -e, --shared=@var{num}
-Allow up to @var{num} clients to share the device (default
-@samp{1}). Safe for readers, but for now, consistency is not
-guaranteed between multiple writers.
-@item -t, --persistent
-Don't exit on the last connection.
-@item -x, --export-name=@var{name}
-Set the NBD volume export name (default of a zero-length string).
-@item -D, --description=@var{description}
-Set the NBD volume export description, as a human-readable
-string.
-@item -L, --list
-Connect as a client and list all details about the exports exposed by
-a remote NBD server. This enables list mode, and is incompatible
-with options that change behavior related to a specific export (such as
-@option{--export-name}, @option{--offset}, ...).
-@item --tls-creds=ID
-Enable mandatory TLS encryption for the server by setting the ID
-of the TLS credentials object previously created with the --object
-option; or provide the credentials needed for connecting as a client
-in list mode.
-@item --fork
-Fork off the server process and exit the parent once the server is running.
-@item --pid-file=PATH
-Store the server's process ID in the given file.
-@item --tls-authz=ID
-Specify the ID of a qauthz object previously created with the
---object option. This will be used to authorize connecting users
-against their x509 distinguished name.
-@item -v, --verbose
-Display extra debugging information.
-@item -h, --help
-Display this help and exit.
-@item -V, --version
-Display version information and exit.
-@item -T, --trace [[enable=]@var{pattern}][,events=@var{file}][,file=@var{file}]
-@findex --trace
-@include qemu-option-trace.texi
-@end table
-
-@c man end
-
-@c man begin EXAMPLES
-Start a server listening on port 10809 that exposes only the
-guest-visible contents of a qcow2 file, with no TLS encryption, and
-with the default export name (an empty string). The command is
-one-shot, and will block until the first successful client
-disconnects:
-
-@example
-qemu-nbd -f qcow2 file.qcow2
-@end example
-
-Start a long-running server listening with encryption on port 10810,
-and whitelist clients with a specific X.509 certificate to connect to
-a 1 megabyte subset of a raw file, using the export name 'subset':
-
-@example
-qemu-nbd \
- --object tls-creds-x509,id=tls0,endpoint=server,dir=/path/to/qemutls \
- --object 'authz-simple,id=auth0,identity=CN=laptop.example.com,,\
- O=Example Org,,L=London,,ST=London,,C=GB' \
- --tls-creds tls0 --tls-authz auth0 \
- -t -x subset -p 10810 \
- --image-opts driver=raw,offset=1M,size=1M,file.driver=file,file.filename=file.raw
-@end example
-
-Serve a read-only copy of just the first MBR partition of a guest
-image over a Unix socket with as many as 5 simultaneous readers, with
-a persistent process forked as a daemon:
-
-@example
-qemu-nbd --fork --persistent --shared=5 --socket=/path/to/sock \
- --partition=1 --read-only --format=qcow2 file.qcow2
-@end example
-
-Expose the guest-visible contents of a qcow2 file via a block device
-/dev/nbd0 (and possibly creating /dev/nbd0p1 and friends for
-partitions found within), then disconnect the device when done.
-Access to bind qemu-nbd to an /dev/nbd device generally requires root
-privileges, and may also require the execution of @code{modprobe nbd}
-to enable the kernel NBD client module. @emph{CAUTION}: Do not use
-this method to mount filesystems from an untrusted guest image - a
-malicious guest may have prepared the image to attempt to trigger
-kernel bugs in partition probing or file system mounting.
-
-@example
-qemu-nbd -c /dev/nbd0 -f qcow2 file.qcow2
-qemu-nbd -d /dev/nbd0
-@end example
-
-Query a remote server to see details about what export(s) it is
-serving on port 10809, and authenticating via PSK:
-
-@example
-qemu-nbd \
- --object tls-creds-psk,id=tls0,dir=/tmp/keys,username=eblake,endpoint=client \
- --tls-creds tls0 -L -b remote.example.com
-@end example
-
-@c man end
-
-@ignore
-
-@setfilename qemu-nbd
-@settitle QEMU Disk Network Block Device Server
-
-@c man begin AUTHOR
-Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>.
-This is free software; see the source for copying conditions. There is NO
-warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-@c man end
-
-@c man begin SEEALSO
-qemu(1), qemu-img(1)
-@c man end
-
-@end ignore
diff --git a/qemu-option-trace.texi b/qemu-option-trace.texi
index 7d1b7f05c5..162f1528d2 100644
--- a/qemu-option-trace.texi
+++ b/qemu-option-trace.texi
@@ -1,3 +1,7 @@
+@c The contents of this file must be kept in sync with qemu-option-trace.rst.inc
+@c until all the users of the texi file have been converted to rst and
+@c the texi file can be removed.
+
Specify tracing options.
@table @option
diff --git a/qemu-options.hx b/qemu-options.hx
index 709162c159..224a8e8712 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -953,7 +953,7 @@ STEXI
@findex -cdrom
Use @var{file} as CD-ROM image (you cannot use @option{-hdc} and
@option{-cdrom} at the same time). You can use the host CD-ROM by
-using @file{/dev/cdrom} as filename (@pxref{host_drives}).
+using @file{/dev/cdrom} as filename.
ETEXI
DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev,
diff --git a/roms/edk2-funcs.sh b/roms/edk2-funcs.sh
index 3f4485b201..cd6e4f2c82 100644
--- a/roms/edk2-funcs.sh
+++ b/roms/edk2-funcs.sh
@@ -112,6 +112,9 @@ qemu_edk2_get_cross_prefix()
( [ "$gcc_arch" == i686 ] && [ "$host_arch" == x86_64 ] ); then
# no cross-compiler needed
:
+ elif ( [ -e /etc/debian_version ] && [ "$gcc_arch" == arm ] ); then
+ # force soft-float cross-compiler on Debian
+ printf 'arm-linux-gnueabi-'
else
printf '%s-linux-gnu-\n' "$gcc_arch"
fi
diff --git a/scripts/git.orderfile b/scripts/git.orderfile
index e89790941c..1f747b583a 100644
--- a/scripts/git.orderfile
+++ b/scripts/git.orderfile
@@ -25,5 +25,8 @@ qga/*.json
# headers
*.h
+# decoding tree specification
+*.decode
+
# code
*.c
diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py
index 0bfc5256fb..5100110fa2 100644
--- a/scripts/qapi/schema.py
+++ b/scripts/qapi/schema.py
@@ -795,7 +795,7 @@ class QAPISchema(object):
self.docs = parser.docs
self._entity_list = []
self._entity_dict = {}
- self._module_dict = {}
+ self._module_dict = OrderedDict()
self._schema_dir = os.path.dirname(fname)
self._make_module(None) # built-ins
self._make_module(fname)
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index f76d77363b..29c27f4681 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -186,6 +186,7 @@ rm -rf "$output/include/standard-headers/linux"
mkdir -p "$output/include/standard-headers/linux"
for i in "$tmpdir"/include/linux/*virtio*.h \
"$tmpdir/include/linux/qemu_fw_cfg.h" \
+ "$tmpdir/include/linux/fuse.h" \
"$tmpdir/include/linux/input.h" \
"$tmpdir/include/linux/input-event-codes.h" \
"$tmpdir/include/linux/pci_regs.h" \
diff --git a/stubs/vmstate.c b/stubs/vmstate.c
index 6951d9fdc5..cc4fe41dfc 100644
--- a/stubs/vmstate.c
+++ b/stubs/vmstate.c
@@ -4,7 +4,7 @@
const VMStateDescription vmstate_dummy = {};
int vmstate_register_with_alias_id(VMStateIf *obj,
- int instance_id,
+ uint32_t instance_id,
const VMStateDescription *vmsd,
void *base, int alias_id,
int required_for_version,
diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
index a530249a5b..3f782c0efe 100644
--- a/target/alpha/cpu.h
+++ b/target/alpha/cpu.h
@@ -193,8 +193,6 @@ enum {
PALcode cheats and usees the KSEG mapping for its code+data rather than
physical addresses. */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
#define MMU_KERNEL_IDX 0
#define MMU_USER_IDX 1
#define MMU_PHYS_IDX 2
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index f7f1ed0f41..8870284f57 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -23,7 +23,7 @@
#include "disas/disas.h"
#include "qemu/host-utils.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "exec/helper-gen.h"
diff --git a/target/arm/arch_dump.c b/target/arm/arch_dump.c
index 26a2c09868..2345dec3c2 100644
--- a/target/arm/arch_dump.c
+++ b/target/arm/arch_dump.c
@@ -62,12 +62,23 @@ struct aarch64_user_vfp_state {
QEMU_BUILD_BUG_ON(sizeof(struct aarch64_user_vfp_state) != 528);
+/* struct user_sve_header from arch/arm64/include/uapi/asm/ptrace.h */
+struct aarch64_user_sve_header {
+ uint32_t size;
+ uint32_t max_size;
+ uint16_t vl;
+ uint16_t max_vl;
+ uint16_t flags;
+ uint16_t reserved;
+} QEMU_PACKED;
+
struct aarch64_note {
Elf64_Nhdr hdr;
char name[8]; /* align_up(sizeof("CORE"), 4) */
union {
struct aarch64_elf_prstatus prstatus;
struct aarch64_user_vfp_state vfp;
+ struct aarch64_user_sve_header sve;
};
} QEMU_PACKED;
@@ -76,6 +87,8 @@ struct aarch64_note {
(AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_elf_prstatus))
#define AARCH64_PRFPREG_NOTE_SIZE \
(AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_user_vfp_state))
+#define AARCH64_SVE_NOTE_SIZE(env) \
+ (AARCH64_NOTE_HEADER_SIZE + sve_size(env))
static void aarch64_note_init(struct aarch64_note *note, DumpState *s,
const char *name, Elf64_Word namesz,
@@ -128,11 +141,102 @@ static int aarch64_write_elf64_prfpreg(WriteCoreDumpFunction f,
return 0;
}
+#ifdef TARGET_AARCH64
+static off_t sve_zreg_offset(uint32_t vq, int n)
+{
+ off_t off = sizeof(struct aarch64_user_sve_header);
+ return ROUND_UP(off, 16) + vq * 16 * n;
+}
+
+static off_t sve_preg_offset(uint32_t vq, int n)
+{
+ return sve_zreg_offset(vq, 32) + vq * 16 / 8 * n;
+}
+
+static off_t sve_fpsr_offset(uint32_t vq)
+{
+ off_t off = sve_preg_offset(vq, 17);
+ return ROUND_UP(off, 16);
+}
+
+static off_t sve_fpcr_offset(uint32_t vq)
+{
+ return sve_fpsr_offset(vq) + sizeof(uint32_t);
+}
+
+static uint32_t sve_current_vq(CPUARMState *env)
+{
+ return sve_zcr_len_for_el(env, arm_current_el(env)) + 1;
+}
+
+static size_t sve_size_vq(uint32_t vq)
+{
+ off_t off = sve_fpcr_offset(vq) + sizeof(uint32_t);
+ return ROUND_UP(off, 16);
+}
+
+static size_t sve_size(CPUARMState *env)
+{
+ return sve_size_vq(sve_current_vq(env));
+}
+
+static int aarch64_write_elf64_sve(WriteCoreDumpFunction f,
+ CPUARMState *env, int cpuid,
+ DumpState *s)
+{
+ struct aarch64_note *note;
+ ARMCPU *cpu = env_archcpu(env);
+ uint32_t vq = sve_current_vq(env);
+ uint64_t tmp[ARM_MAX_VQ * 2], *r;
+ uint32_t fpr;
+ uint8_t *buf;
+ int ret, i;
+
+ note = g_malloc0(AARCH64_SVE_NOTE_SIZE(env));
+ buf = (uint8_t *)&note->sve;
+
+ aarch64_note_init(note, s, "LINUX", 6, NT_ARM_SVE, sve_size_vq(vq));
+
+ note->sve.size = cpu_to_dump32(s, sve_size_vq(vq));
+ note->sve.max_size = cpu_to_dump32(s, sve_size_vq(cpu->sve_max_vq));
+ note->sve.vl = cpu_to_dump16(s, vq * 16);
+ note->sve.max_vl = cpu_to_dump16(s, cpu->sve_max_vq * 16);
+ note->sve.flags = cpu_to_dump16(s, 1);
+
+ for (i = 0; i < 32; ++i) {
+ r = sve_bswap64(tmp, &env->vfp.zregs[i].d[0], vq * 2);
+ memcpy(&buf[sve_zreg_offset(vq, i)], r, vq * 16);
+ }
+
+ for (i = 0; i < 17; ++i) {
+ r = sve_bswap64(tmp, &env->vfp.pregs[i].p[0],
+ DIV_ROUND_UP(vq * 2, 8));
+ memcpy(&buf[sve_preg_offset(vq, i)], r, vq * 16 / 8);
+ }
+
+ fpr = cpu_to_dump32(s, vfp_get_fpsr(env));
+ memcpy(&buf[sve_fpsr_offset(vq)], &fpr, sizeof(uint32_t));
+
+ fpr = cpu_to_dump32(s, vfp_get_fpcr(env));
+ memcpy(&buf[sve_fpcr_offset(vq)], &fpr, sizeof(uint32_t));
+
+ ret = f(note, AARCH64_SVE_NOTE_SIZE(env), s);
+ g_free(note);
+
+ if (ret < 0) {
+ return -1;
+ }
+
+ return 0;
+}
+#endif
+
int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
int cpuid, void *opaque)
{
struct aarch64_note note;
- CPUARMState *env = &ARM_CPU(cs)->env;
+ ARMCPU *cpu = ARM_CPU(cs);
+ CPUARMState *env = &cpu->env;
DumpState *s = opaque;
uint64_t pstate, sp;
int ret, i;
@@ -163,7 +267,18 @@ int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
return -1;
}
- return aarch64_write_elf64_prfpreg(f, env, cpuid, s);
+ ret = aarch64_write_elf64_prfpreg(f, env, cpuid, s);
+ if (ret) {
+ return ret;
+ }
+
+#ifdef TARGET_AARCH64
+ if (cpu_isar_feature(aa64_sve, cpu)) {
+ ret = aarch64_write_elf64_sve(f, env, cpuid, s);
+ }
+#endif
+
+ return ret;
}
/* struct pt_regs from arch/arm/include/asm/ptrace.h */
@@ -335,6 +450,11 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus)
if (class == ELFCLASS64) {
note_size = AARCH64_PRSTATUS_NOTE_SIZE;
note_size += AARCH64_PRFPREG_NOTE_SIZE;
+#ifdef TARGET_AARCH64
+ if (cpu_isar_feature(aa64_sve, cpu)) {
+ note_size += AARCH64_SVE_NOTE_SIZE(env);
+ }
+#endif
} else {
note_size = ARM_PRSTATUS_NOTE_SIZE;
if (arm_feature(env, ARM_FEATURE_VFP)) {
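
The NT_ARM_SVE note payload built above is laid out as a 16-byte header, 32 Z registers of vq*16 bytes, 17 P registers of vq*2 bytes, then FPSR and FPCR, with 16-byte alignment between sections. The small program below merely recomputes those offsets for a few vector lengths; the 16-byte header size is taken from aarch64_user_sve_header as defined above:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(n, m) (((n) + (m) - 1) / (m) * (m))

#define SVE_HEADER_SIZE 16   /* sizeof(struct aarch64_user_sve_header) above */

static size_t zreg_offset(uint32_t vq, int n)
{
    return ROUND_UP((size_t)SVE_HEADER_SIZE, 16) + (size_t)vq * 16 * n;
}

static size_t preg_offset(uint32_t vq, int n)
{
    return zreg_offset(vq, 32) + (size_t)vq * 16 / 8 * n;
}

static size_t fpsr_offset(uint32_t vq)
{
    return ROUND_UP(preg_offset(vq, 17), (size_t)16);
}

static size_t payload_size(uint32_t vq)
{
    return ROUND_UP(fpsr_offset(vq) + 2 * sizeof(uint32_t), (size_t)16);
}

int main(void)
{
    for (uint32_t vq = 1; vq <= 16; vq *= 2) {
        printf("vq=%2" PRIu32 " (VL=%4" PRIu32 " bits): "
               "Z at %4zu, P at %4zu, FPSR at %4zu, payload %4zu bytes\n",
               vq, vq * 128, zreg_offset(vq, 0), preg_offset(vq, 0),
               fpsr_offset(vq), payload_size(vq));
    }
    return 0;
}
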
diff --git a/target/arm/arm-semi.c b/target/arm/arm-semi.c
index 47d61f6fe1..788fe61b51 100644
--- a/target/arm/arm-semi.c
+++ b/target/arm/arm-semi.c
@@ -144,7 +144,8 @@ static int alloc_guestfd(void)
guestfd_array = g_array_new(FALSE, TRUE, sizeof(GuestFD));
}
- for (i = 0; i < guestfd_array->len; i++) {
+ /* SYS_OPEN should return a nonzero handle on success. Start guestfd from 1 */
+ for (i = 1; i < guestfd_array->len; i++) {
GuestFD *gf = &g_array_index(guestfd_array, GuestFD, i);
if (gf->type == GuestFDUnused) {
@@ -168,7 +169,7 @@ static GuestFD *do_get_guestfd(int guestfd)
return NULL;
}
- if (guestfd < 0 || guestfd >= guestfd_array->len) {
+ if (guestfd <= 0 || guestfd >= guestfd_array->len) {
return NULL;
}
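
alloc_guestfd() now starts scanning at index 1 because semihosting SYS_OPEN reports failure as handle 0, so handle 0 must never be handed out as a valid guest file descriptor. A trivial sketch of such an allocator (the table size and names are invented for the example):

#include <stdio.h>

#define MAX_FDS 8

static int fd_used[MAX_FDS];   /* slot 0 is intentionally never handed out */

static int alloc_guestfd(void)
{
    for (int i = 1; i < MAX_FDS; i++) {
        if (!fd_used[i]) {
            fd_used[i] = 1;
            return i;
        }
    }
    return 0;                  /* 0 == "no handle", matching SYS_OPEN failure */
}

static void free_guestfd(int fd)
{
    if (fd > 0 && fd < MAX_FDS) {
        fd_used[fd] = 0;
    }
}

int main(void)
{
    int a = alloc_guestfd();
    int b = alloc_guestfd();

    printf("a=%d b=%d\n", a, b);             /* 1 and 2, never 0 */
    free_guestfd(a);
    printf("reused=%d\n", alloc_guestfd());  /* 1 again */
    return 0;
}
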
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index d62fd5fdc6..64cd0a7d73 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2121,6 +2121,7 @@ static void cortex_r5_initfn(Object *obj)
set_feature(&cpu->env, ARM_FEATURE_V7);
set_feature(&cpu->env, ARM_FEATURE_V7MP);
set_feature(&cpu->env, ARM_FEATURE_PMSA);
+ set_feature(&cpu->env, ARM_FEATURE_PMU);
cpu->midr = 0x411fc153; /* r1p3 */
cpu->id_pfr0 = 0x0131;
cpu->id_pfr1 = 0x001;
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 40f2c45e17..c1aedbeac0 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -980,6 +980,31 @@ void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq);
void aarch64_sve_change_el(CPUARMState *env, int old_el,
int new_el, bool el0_a64);
void aarch64_add_sve_properties(Object *obj);
+
+/*
+ * SVE registers are encoded in KVM's memory in an endianness-invariant format.
+ * The byte at offset i from the start of the in-memory representation contains
+ * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the
+ * lowest offsets are stored in the lowest memory addresses, then that nearly
+ * matches QEMU's representation, which is to use an array of host-endian
+ * uint64_t's, where the lower offsets are at the lower indices. To complete
+ * the translation we just need to byte swap the uint64_t's on big-endian hosts.
+ */
+static inline uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr)
+{
+#ifdef HOST_WORDS_BIGENDIAN
+ int i;
+
+ for (i = 0; i < nr; ++i) {
+ dst[i] = bswap64(src[i]);
+ }
+
+ return dst;
+#else
+ return src;
+#endif
+}
+
#else
static inline void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq) { }
static inline void aarch64_sve_change_el(CPUARMState *env, int o,
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index b4cd680fc4..36aa6badfd 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -31,7 +31,7 @@
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "qemu/atomic128.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "fpu/softfloat.h"
#include <zlib.h> /* For crc32 */
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 876184b8fe..e2da756e65 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -877,30 +877,6 @@ static int kvm_arch_put_fpsimd(CPUState *cs)
}
/*
- * SVE registers are encoded in KVM's memory in an endianness-invariant format.
- * The byte at offset i from the start of the in-memory representation contains
- * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the
- * lowest offsets are stored in the lowest memory addresses, then that nearly
- * matches QEMU's representation, which is to use an array of host-endian
- * uint64_t's, where the lower offsets are at the lower indices. To complete
- * the translation we just need to byte swap the uint64_t's on big-endian hosts.
- */
-static uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr)
-{
-#ifdef HOST_WORDS_BIGENDIAN
- int i;
-
- for (i = 0; i < nr; ++i) {
- dst[i] = bswap64(src[i]);
- }
-
- return dst;
-#else
- return src;
-#endif
-}
-
-/*
* KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
* and PREGS and the FFR have a slice size of 256 bits. However we simply hard
* code the slice index to zero for now as it's unlikely we'll need more than
diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c
index e5a346cb87..27d16ad9ad 100644
--- a/target/arm/op_helper.c
+++ b/target/arm/op_helper.c
@@ -295,7 +295,12 @@ void HELPER(wfi)(CPUARMState *env, uint32_t insn_len)
}
if (target_el) {
- env->pc -= insn_len;
+ if (env->aarch64) {
+ env->pc -= insn_len;
+ } else {
+ env->regs[15] -= insn_len;
+ }
+
raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2),
target_el);
}
diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c
index d3194f2043..0a5f41e10c 100644
--- a/target/arm/pauth_helper.c
+++ b/target/arm/pauth_helper.c
@@ -89,7 +89,7 @@ static uint64_t pac_sub(uint64_t i)
uint64_t o = 0;
int b;
- for (b = 0; b < 64; b += 16) {
+ for (b = 0; b < 64; b += 4) {
o |= (uint64_t)sub[(i >> b) & 0xf] << b;
}
return o;
@@ -104,7 +104,7 @@ static uint64_t pac_inv_sub(uint64_t i)
uint64_t o = 0;
int b;
- for (b = 0; b < 64; b += 16) {
+ for (b = 0; b < 64; b += 4) {
o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b;
}
return o;
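The fix above changes the loop stride from 16 to 4: the substitution is defined on 4-bit nibbles, so a 64-bit value contains sixteen of them, and a stride of 16 transformed only four while silently dropping the rest. A standalone sketch, not part of the patch, using a made-up substitution table (not the architectural PAC sigma box):

#include <assert.h>
#include <stdint.h>

/* Placeholder 4-bit permutation; the real PAC tables differ. */
static const uint8_t demo_sub[16] = {
    3, 0, 7, 12, 5, 10, 1, 14, 9, 2, 15, 4, 11, 6, 13, 8
};

static uint64_t nibble_sub(uint64_t i)
{
    uint64_t o = 0;
    int b;

    /* b += 4 visits bits [3:0], [7:4], ..., [63:60]: all 16 nibbles. */
    for (b = 0; b < 64; b += 4) {
        o |= (uint64_t)demo_sub[(i >> b) & 0xf] << b;
    }
    return o;
}

int main(void)
{
    /* Every nibble of the input is substituted, none are dropped. */
    assert(nibble_sub(0x0000000000000000ULL) == 0x3333333333333333ULL);
    assert(nibble_sub(0xffffffffffffffffULL) == 0x8888888888888888ULL);
    return 0;
}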
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index fc0c1755d2..fdfa652094 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -25,6 +25,7 @@
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
+#include "tcg/tcg.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c
index 5feb312941..e63f8bda29 100644
--- a/target/arm/tlb_helper.c
+++ b/target/arm/tlb_helper.c
@@ -44,7 +44,7 @@ static inline uint32_t merge_syn_data_abort(uint32_t template_syn,
syn = syn_data_abort_with_iss(same_el,
0, 0, 0, 0, 0,
ea, 0, s1ptw, is_write, fsc,
- false);
+ true);
/* Merge the runtime syndrome with the template syndrome. */
syn |= template_syn;
}
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 8c18cdff87..96a5be2b37 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -20,8 +20,8 @@
#include "cpu.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "qemu/log.h"
#include "arm_ldst.h"
#include "translate.h"
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 5d7edd0907..b35bad245e 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -20,9 +20,9 @@
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
-#include "tcg-gvec-desc.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "tcg/tcg-gvec-desc.h"
#include "qemu/log.h"
#include "arm_ldst.h"
#include "translate.h"
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 5185e08641..2f4aea927f 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -24,8 +24,8 @@
#include "internals.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "qemu/log.h"
#include "qemu/bitops.h"
#include "arm_ldst.h"
@@ -8556,6 +8556,9 @@ static ISSInfo make_issinfo(DisasContext *s, int rd, bool p, bool w)
/* ISS not valid if writeback */
if (p && !w) {
ret = rd;
+ if (s->base.pc_next - s->pc_curr == 2) {
+ ret |= ISSIs16Bit;
+ }
} else {
ret = ISSInvalid;
}
diff --git a/target/cris/cpu.h b/target/cris/cpu.h
index a7c2a8e15b..ca240bc788 100644
--- a/target/cris/cpu.h
+++ b/target/cris/cpu.h
@@ -253,8 +253,6 @@ enum {
#define cpu_signal_handler cpu_cris_signal_handler
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
#define MMU_USER_IDX 1
static inline int cpu_mmu_index (CPUCRISState *env, bool ifetch)
{
diff --git a/target/cris/translate.c b/target/cris/translate.c
index cb57516a44..aaa46b5bca 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -27,7 +27,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/helper-proto.h"
#include "mmu.h"
#include "exec/cpu_ldst.h"
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 2f8d407a82..f25927aeca 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -22,7 +22,7 @@
#include "disas/disas.h"
#include "qemu/host-utils.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "exec/helper-gen.h"
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 594326a794..e6de38ae02 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1955,9 +1955,6 @@ uint64_t cpu_get_tsc(CPUX86State *env);
#define cpu_list x86_cpu_list
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _ksmap
-#define MMU_MODE1_SUFFIX _user
-#define MMU_MODE2_SUFFIX _knosmap /* SMAP disabled or CPL<3 && AC=1 */
#define MMU_KSMAP_IDX 0
#define MMU_USER_IDX 1
#define MMU_KNOSMAP_IDX 2
diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c
index d50d4b0c40..acf41f8885 100644
--- a/target/i386/mem_helper.c
+++ b/target/i386/mem_helper.c
@@ -24,7 +24,7 @@
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "qemu/atomic128.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
{
diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
index 87a627f9dc..b96de068ca 100644
--- a/target/i386/seg_helper.c
+++ b/target/i386/seg_helper.c
@@ -37,37 +37,37 @@
# define LOG_PCALL_STATE(cpu) do { } while (0)
#endif
-#ifdef CONFIG_USER_ONLY
-#define MEMSUFFIX _kernel
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_useronly_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_useronly_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_useronly_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_useronly_template.h"
-#undef MEMSUFFIX
-#else
-#define CPU_MMU_INDEX (cpu_mmu_index_kernel(env))
-#define MEMSUFFIX _kernel
-#define DATA_SIZE 1
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 2
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 4
-#include "exec/cpu_ldst_template.h"
-
-#define DATA_SIZE 8
-#include "exec/cpu_ldst_template.h"
-#undef CPU_MMU_INDEX
-#undef MEMSUFFIX
-#endif
+/*
+ * TODO: Convert callers to compute cpu_mmu_index_kernel once
+ * and use *_mmuidx_ra directly.
+ */
+#define cpu_ldub_kernel_ra(e, p, r) \
+ cpu_ldub_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+#define cpu_lduw_kernel_ra(e, p, r) \
+ cpu_lduw_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+#define cpu_ldl_kernel_ra(e, p, r) \
+ cpu_ldl_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+#define cpu_ldq_kernel_ra(e, p, r) \
+ cpu_ldq_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r)
+
+#define cpu_stb_kernel_ra(e, p, v, r) \
+ cpu_stb_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+#define cpu_stw_kernel_ra(e, p, v, r) \
+ cpu_stw_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+#define cpu_stl_kernel_ra(e, p, v, r) \
+ cpu_stl_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+#define cpu_stq_kernel_ra(e, p, v, r) \
+ cpu_stq_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r)
+
+#define cpu_ldub_kernel(e, p) cpu_ldub_kernel_ra(e, p, 0)
+#define cpu_lduw_kernel(e, p) cpu_lduw_kernel_ra(e, p, 0)
+#define cpu_ldl_kernel(e, p) cpu_ldl_kernel_ra(e, p, 0)
+#define cpu_ldq_kernel(e, p) cpu_ldq_kernel_ra(e, p, 0)
+
+#define cpu_stb_kernel(e, p, v) cpu_stb_kernel_ra(e, p, v, 0)
+#define cpu_stw_kernel(e, p, v) cpu_stw_kernel_ra(e, p, v, 0)
+#define cpu_stl_kernel(e, p, v) cpu_stl_kernel_ra(e, p, v, 0)
+#define cpu_stq_kernel(e, p, v) cpu_stq_kernel_ra(e, p, v, 0)
/* return non zero if error */
static inline int load_segment_ra(CPUX86State *env, uint32_t *e1_ptr,
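The replacement block above keeps the old cpu_*_kernel()/cpu_*_kernel_ra() spelling at the call sites but turns each name into a thin macro over the mmu-index-parameterised accessors, recomputing cpu_mmu_index_kernel() on every access (hence the TODO). Below is a standalone sketch, not part of the patch, of that wrapper shape; the env layout, index values and accessor are toy stand-ins rather than QEMU's real types:

#include <stdint.h>
#include <stdio.h>

typedef struct {
    int smap_enabled;           /* toy stand-in for the CR4/EFLAGS bits used */
    uint8_t ram[16];            /* toy "guest memory" */
} ToyEnv;

/* Stand-in for cpu_mmu_index_kernel(): derive an index from env state. */
static int toy_mmu_index_kernel(ToyEnv *env)
{
    return env->smap_enabled ? 0 /* "KSMAP" */ : 2 /* "KNOSMAP" */;
}

/* Stand-in for cpu_ldub_mmuidx_ra(): the accessor that takes mmu_idx. */
static uint8_t toy_ldub_mmuidx_ra(ToyEnv *env, unsigned addr,
                                  int mmu_idx, uintptr_t ra)
{
    (void)mmu_idx;              /* a real softmmu would select a TLB here */
    (void)ra;                   /* ...and unwind to this return address */
    return env->ram[addr & 0xf];
}

/* The wrapper shape from the hunk above: fix the index, forward the rest. */
#define toy_ldub_kernel_ra(e, p, r) \
    toy_ldub_mmuidx_ra(e, p, toy_mmu_index_kernel(e), r)
#define toy_ldub_kernel(e, p) toy_ldub_kernel_ra(e, p, 0)

int main(void)
{
    ToyEnv env = { .smap_enabled = 1, .ram = { 0x42 } };

    printf("byte at 0 = 0x%02x (mmu_idx %d)\n",
           toy_ldub_kernel(&env, 0), toy_mmu_index_kernel(&env));
    return 0;
}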
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 7c99ef1385..d9af8f4078 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -22,7 +22,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "exec/translator.h"
diff --git a/target/lm32/translate.c b/target/lm32/translate.c
index 73db9654d6..e583d52d03 100644
--- a/target/lm32/translate.c
+++ b/target/lm32/translate.c
@@ -23,7 +23,7 @@
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/translator.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "qemu/qemu-print.h"
#include "exec/cpu_ldst.h"
diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
index 11c71fa962..3de8e06dfe 100644
--- a/target/m68k/cpu.h
+++ b/target/m68k/cpu.h
@@ -519,8 +519,6 @@ enum {
#define cpu_list m68k_cpu_list
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
#define MMU_KERNEL_IDX 0
#define MMU_USER_IDX 1
static inline int cpu_mmu_index (CPUM68KState *env, bool ifetch)
diff --git a/target/m68k/op_helper.c b/target/m68k/op_helper.c
index bc4f845e3f..202498deb5 100644
--- a/target/m68k/op_helper.c
+++ b/target/m68k/op_helper.c
@@ -42,8 +42,8 @@ static void cf_rte(CPUM68KState *env)
uint32_t fmt;
sp = env->aregs[7];
- fmt = cpu_ldl_kernel(env, sp);
- env->pc = cpu_ldl_kernel(env, sp + 4);
+ fmt = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
+ env->pc = cpu_ldl_mmuidx_ra(env, sp + 4, MMU_KERNEL_IDX, 0);
sp |= (fmt >> 28) & 3;
env->aregs[7] = sp + 8;
@@ -58,13 +58,13 @@ static void m68k_rte(CPUM68KState *env)
sp = env->aregs[7];
throwaway:
- sr = cpu_lduw_kernel(env, sp);
+ sr = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
sp += 2;
- env->pc = cpu_ldl_kernel(env, sp);
+ env->pc = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
sp += 4;
if (m68k_feature(env, M68K_FEATURE_QUAD_MULDIV)) {
/* all except 68000 */
- fmt = cpu_lduw_kernel(env, sp);
+ fmt = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0);
sp += 2;
switch (fmt >> 12) {
case 0:
@@ -260,12 +260,12 @@ static void cf_interrupt_all(CPUM68KState *env, int is_hw)
/* ??? This could cause MMU faults. */
sp &= ~3;
sp -= 4;
- cpu_stl_kernel(env, sp, retaddr);
+ cpu_stl_mmuidx_ra(env, sp, retaddr, MMU_KERNEL_IDX, 0);
sp -= 4;
- cpu_stl_kernel(env, sp, fmt);
+ cpu_stl_mmuidx_ra(env, sp, fmt, MMU_KERNEL_IDX, 0);
env->aregs[7] = sp;
/* Jump to vector. */
- env->pc = cpu_ldl_kernel(env, env->vbr + vector);
+ env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0);
}
static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp,
@@ -278,23 +278,24 @@ static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp,
switch (format) {
case 4:
*sp -= 4;
- cpu_stl_kernel(env, *sp, env->pc);
+ cpu_stl_mmuidx_ra(env, *sp, env->pc, MMU_KERNEL_IDX, 0);
*sp -= 4;
- cpu_stl_kernel(env, *sp, addr);
+ cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0);
break;
case 3:
case 2:
*sp -= 4;
- cpu_stl_kernel(env, *sp, addr);
+ cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0);
break;
}
*sp -= 2;
- cpu_stw_kernel(env, *sp, (format << 12) + (cs->exception_index << 2));
+ cpu_stw_mmuidx_ra(env, *sp, (format << 12) + (cs->exception_index << 2),
+ MMU_KERNEL_IDX, 0);
}
*sp -= 4;
- cpu_stl_kernel(env, *sp, retaddr);
+ cpu_stl_mmuidx_ra(env, *sp, retaddr, MMU_KERNEL_IDX, 0);
*sp -= 2;
- cpu_stw_kernel(env, *sp, sr);
+ cpu_stw_mmuidx_ra(env, *sp, sr, MMU_KERNEL_IDX, 0);
}
static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
@@ -353,36 +354,52 @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
cpu_abort(cs, "DOUBLE MMU FAULT\n");
}
env->mmu.fault = true;
+ /* push data 3 */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* push data 3 */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* push data 2 */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* push data 2 */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* push data 1 */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* push data 1 */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 1 / push data 0 */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* write back 1 / push data 0 */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 1 address */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* write back 1 address */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 2 data */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* write back 2 data */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 2 address */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* write back 2 address */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 3 data */
sp -= 4;
- cpu_stl_kernel(env, sp, 0); /* write back 3 data */
+ cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 3 address */
sp -= 4;
- cpu_stl_kernel(env, sp, env->mmu.ar); /* write back 3 address */
+ cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
+ /* fault address */
sp -= 4;
- cpu_stl_kernel(env, sp, env->mmu.ar); /* fault address */
+ cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
+ /* write back 1 status */
sp -= 2;
- cpu_stw_kernel(env, sp, 0); /* write back 1 status */
+ cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 2 status */
sp -= 2;
- cpu_stw_kernel(env, sp, 0); /* write back 2 status */
+ cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* write back 3 status */
sp -= 2;
- cpu_stw_kernel(env, sp, 0); /* write back 3 status */
+ cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0);
+ /* special status word */
sp -= 2;
- cpu_stw_kernel(env, sp, env->mmu.ssw); /* special status word */
+ cpu_stw_mmuidx_ra(env, sp, env->mmu.ssw, MMU_KERNEL_IDX, 0);
+ /* effective address */
sp -= 4;
- cpu_stl_kernel(env, sp, env->mmu.ar); /* effective address */
+ cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0);
+
do_stack_frame(env, &sp, 7, oldsr, 0, retaddr);
env->mmu.fault = false;
if (qemu_loglevel_mask(CPU_LOG_INT)) {
@@ -414,7 +431,7 @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw)
env->aregs[7] = sp;
/* Jump to vector. */
- env->pc = cpu_ldl_kernel(env, env->vbr + vector);
+ env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0);
}
static void do_interrupt_all(CPUM68KState *env, int is_hw)
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index fcdb7bc8e4..0f80888203 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -22,7 +22,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "qemu/log.h"
#include "qemu/qemu-print.h"
#include "exec/cpu_ldst.h"
@@ -289,16 +289,21 @@ static void gen_jmp(DisasContext *s, TCGv dest)
s->base.is_jmp = DISAS_JUMP;
}
-static void gen_exception(DisasContext *s, uint32_t dest, int nr)
+static void gen_raise_exception(int nr)
{
TCGv_i32 tmp;
- update_cc_op(s);
- tcg_gen_movi_i32(QREG_PC, dest);
-
tmp = tcg_const_i32(nr);
gen_helper_raise_exception(cpu_env, tmp);
tcg_temp_free_i32(tmp);
+}
+
+static void gen_exception(DisasContext *s, uint32_t dest, int nr)
+{
+ update_cc_op(s);
+ tcg_gen_movi_i32(QREG_PC, dest);
+
+ gen_raise_exception(nr);
s->base.is_jmp = DISAS_NORETURN;
}
@@ -6198,29 +6203,36 @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
{
DisasContext *dc = container_of(dcbase, DisasContext, base);
- if (dc->base.is_jmp == DISAS_NORETURN) {
- return;
- }
- if (dc->base.singlestep_enabled) {
- gen_helper_raise_exception(cpu_env, tcg_const_i32(EXCP_DEBUG));
- return;
- }
-
switch (dc->base.is_jmp) {
+ case DISAS_NORETURN:
+ break;
case DISAS_TOO_MANY:
update_cc_op(dc);
- gen_jmp_tb(dc, 0, dc->pc);
+ if (dc->base.singlestep_enabled) {
+ tcg_gen_movi_i32(QREG_PC, dc->pc);
+ gen_raise_exception(EXCP_DEBUG);
+ } else {
+ gen_jmp_tb(dc, 0, dc->pc);
+ }
break;
case DISAS_JUMP:
/* We updated CC_OP and PC in gen_jmp/gen_jmp_im. */
- tcg_gen_lookup_and_goto_ptr();
+ if (dc->base.singlestep_enabled) {
+ gen_raise_exception(EXCP_DEBUG);
+ } else {
+ tcg_gen_lookup_and_goto_ptr();
+ }
break;
case DISAS_EXIT:
/*
* We updated CC_OP and PC in gen_exit_tb, but also modified
* other state that may require returning to the main loop.
*/
- tcg_gen_exit_tb(NULL, 0);
+ if (dc->base.singlestep_enabled) {
+ gen_raise_exception(EXCP_DEBUG);
+ } else {
+ tcg_gen_exit_tb(NULL, 0);
+ }
break;
default:
g_assert_not_reached();
diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h
index 95773089aa..32522f606b 100644
--- a/target/microblaze/cpu.h
+++ b/target/microblaze/cpu.h
@@ -328,9 +328,6 @@ int cpu_mb_signal_handler(int host_signum, void *pinfo,
#define cpu_signal_handler cpu_mb_signal_handler
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _nommu
-#define MMU_MODE1_SUFFIX _kernel
-#define MMU_MODE2_SUFFIX _user
#define MMU_NOMMU_IDX 0
#define MMU_KERNEL_IDX 1
#define MMU_USER_IDX 2
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index 525115b041..37a844db99 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -22,7 +22,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/helper-proto.h"
#include "microblaze-decode.h"
#include "exec/cpu_ldst.h"
diff --git a/target/mips/cpu.h b/target/mips/cpu.h
index ca00f41daf..c218ccc4a8 100644
--- a/target/mips/cpu.h
+++ b/target/mips/cpu.h
@@ -1147,10 +1147,6 @@ extern uint32_t cpu_rddsp(uint32_t mask_num, CPUMIPSState *env);
* MMU modes definitions. We carefully match the indices with our
* hflags layout.
*/
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _super
-#define MMU_MODE2_SUFFIX _user
-#define MMU_MODE3_SUFFIX _error
#define MMU_USER_IDX 2
static inline int hflags_mmu_index(uint32_t hflags)
diff --git a/target/mips/op_helper.c b/target/mips/op_helper.c
index 18fcee4a78..79d44da6fa 100644
--- a/target/mips/op_helper.c
+++ b/target/mips/op_helper.c
@@ -52,69 +52,6 @@ static void raise_exception(CPUMIPSState *env, uint32_t exception)
do_raise_exception(env, exception, 0);
}
-#if defined(CONFIG_USER_ONLY)
-#define HELPER_LD(name, insn, type) \
-static inline type do_##name(CPUMIPSState *env, target_ulong addr, \
- int mem_idx, uintptr_t retaddr) \
-{ \
- return (type) cpu_##insn##_data_ra(env, addr, retaddr); \
-}
-#else
-#define HELPER_LD(name, insn, type) \
-static inline type do_##name(CPUMIPSState *env, target_ulong addr, \
- int mem_idx, uintptr_t retaddr) \
-{ \
- switch (mem_idx) { \
- case 0: return (type) cpu_##insn##_kernel_ra(env, addr, retaddr); \
- case 1: return (type) cpu_##insn##_super_ra(env, addr, retaddr); \
- default: \
- case 2: return (type) cpu_##insn##_user_ra(env, addr, retaddr); \
- case 3: return (type) cpu_##insn##_error_ra(env, addr, retaddr); \
- } \
-}
-#endif
-HELPER_LD(lw, ldl, int32_t)
-#if defined(TARGET_MIPS64)
-HELPER_LD(ld, ldq, int64_t)
-#endif
-#undef HELPER_LD
-
-#if defined(CONFIG_USER_ONLY)
-#define HELPER_ST(name, insn, type) \
-static inline void do_##name(CPUMIPSState *env, target_ulong addr, \
- type val, int mem_idx, uintptr_t retaddr) \
-{ \
- cpu_##insn##_data_ra(env, addr, val, retaddr); \
-}
-#else
-#define HELPER_ST(name, insn, type) \
-static inline void do_##name(CPUMIPSState *env, target_ulong addr, \
- type val, int mem_idx, uintptr_t retaddr) \
-{ \
- switch (mem_idx) { \
- case 0: \
- cpu_##insn##_kernel_ra(env, addr, val, retaddr); \
- break; \
- case 1: \
- cpu_##insn##_super_ra(env, addr, val, retaddr); \
- break; \
- default: \
- case 2: \
- cpu_##insn##_user_ra(env, addr, val, retaddr); \
- break; \
- case 3: \
- cpu_##insn##_error_ra(env, addr, val, retaddr); \
- break; \
- } \
-}
-#endif
-HELPER_ST(sb, stb, uint8_t)
-HELPER_ST(sw, stl, uint32_t)
-#if defined(TARGET_MIPS64)
-HELPER_ST(sd, stq, uint64_t)
-#endif
-#undef HELPER_ST
-
/* 64 bits arithmetic for 32 bits hosts */
static inline uint64_t get_HILO(CPUMIPSState *env)
{
@@ -379,12 +316,12 @@ target_ulong helper_##name(CPUMIPSState *env, target_ulong arg, int mem_idx) \
} \
env->CP0_LLAddr = do_translate_address(env, arg, 0, GETPC()); \
env->lladdr = arg; \
- env->llval = do_##insn(env, arg, mem_idx, GETPC()); \
+ env->llval = cpu_##insn##_mmuidx_ra(env, arg, mem_idx, GETPC()); \
return env->llval; \
}
-HELPER_LD_ATOMIC(ll, lw, 0x3)
+HELPER_LD_ATOMIC(ll, ldl, 0x3)
#ifdef TARGET_MIPS64
-HELPER_LD_ATOMIC(lld, ld, 0x7)
+HELPER_LD_ATOMIC(lld, ldq, 0x7)
#endif
#undef HELPER_LD_ATOMIC
#endif
@@ -400,42 +337,42 @@ HELPER_LD_ATOMIC(lld, ld, 0x7)
void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
int mem_idx)
{
- do_sb(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC());
+ cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC());
if (GET_LMASK(arg2) <= 2) {
- do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16),
+ mem_idx, GETPC());
}
if (GET_LMASK(arg2) <= 1) {
- do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8),
+ mem_idx, GETPC());
}
if (GET_LMASK(arg2) == 0) {
- do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)arg1, mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)arg1,
+ mem_idx, GETPC());
}
}
void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
int mem_idx)
{
- do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+ cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
if (GET_LMASK(arg2) >= 1) {
- do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8),
+ mem_idx, GETPC());
}
if (GET_LMASK(arg2) >= 2) {
- do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16),
+ mem_idx, GETPC());
}
if (GET_LMASK(arg2) == 3) {
- do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24),
+ mem_idx, GETPC());
}
}
@@ -453,82 +390,82 @@ void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
int mem_idx)
{
- do_sb(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC());
+ cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC());
if (GET_LMASK64(arg2) <= 6) {
- do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) <= 5) {
- do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) <= 4) {
- do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) <= 3) {
- do_sb(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) <= 2) {
- do_sb(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) <= 1) {
- do_sb(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) <= 0) {
- do_sb(env, GET_OFFSET(arg2, 7), (uint8_t)arg1, mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 7), (uint8_t)arg1,
+ mem_idx, GETPC());
}
}
void helper_sdr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2,
int mem_idx)
{
- do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
+ cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC());
if (GET_LMASK64(arg2) >= 1) {
- do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) >= 2) {
- do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) >= 3) {
- do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) >= 4) {
- do_sb(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) >= 5) {
- do_sb(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) >= 6) {
- do_sb(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48),
+ mem_idx, GETPC());
}
if (GET_LMASK64(arg2) == 7) {
- do_sb(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56), mem_idx,
- GETPC());
+ cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56),
+ mem_idx, GETPC());
}
}
#endif /* TARGET_MIPS64 */
@@ -546,14 +483,14 @@ void helper_lwm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
for (i = 0; i < base_reglist; i++) {
env->active_tc.gpr[multiple_regs[i]] =
- (target_long)do_lw(env, addr, mem_idx, GETPC());
+ (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC());
addr += 4;
}
}
if (do_r31) {
- env->active_tc.gpr[31] = (target_long)do_lw(env, addr, mem_idx,
- GETPC());
+ env->active_tc.gpr[31] =
+ (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC());
}
}
@@ -567,14 +504,14 @@ void helper_swm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
target_ulong i;
for (i = 0; i < base_reglist; i++) {
- do_sw(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx,
- GETPC());
+ cpu_stl_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]],
+ mem_idx, GETPC());
addr += 4;
}
}
if (do_r31) {
- do_sw(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
+ cpu_stl_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
}
}
@@ -589,14 +526,15 @@ void helper_ldm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
target_ulong i;
for (i = 0; i < base_reglist; i++) {
- env->active_tc.gpr[multiple_regs[i]] = do_ld(env, addr, mem_idx,
- GETPC());
+ env->active_tc.gpr[multiple_regs[i]] =
+ cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC());
addr += 8;
}
}
if (do_r31) {
- env->active_tc.gpr[31] = do_ld(env, addr, mem_idx, GETPC());
+ env->active_tc.gpr[31] =
+ cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC());
}
}
@@ -610,14 +548,14 @@ void helper_sdm(CPUMIPSState *env, target_ulong addr, target_ulong reglist,
target_ulong i;
for (i = 0; i < base_reglist; i++) {
- do_sd(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx,
- GETPC());
+ cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]],
+ mem_idx, GETPC());
addr += 8;
}
}
if (do_r31) {
- do_sd(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
+ cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC());
}
}
#endif
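The deleted HELPER_LD/HELPER_ST machinery existed only to pick a per-mode accessor from mem_idx; the replacement passes mem_idx straight into the *_mmuidx_ra accessors, so every do_sb/do_lw/... call site collapses to a single call. Below is a standalone sketch, not part of the patch, of that calling shape; the types, the address-space-per-index model and the byte order are toy choices, not QEMU's softmmu:

#include <stdint.h>
#include <stdio.h>

enum { TOY_KERNEL = 0, TOY_SUPER = 1, TOY_USER = 2, TOY_ERROR = 3 };

typedef struct {
    uint8_t ram[4][16];         /* one toy address space per MMU index */
} ToyEnv;

/* Stand-in for cpu_stb_mmuidx_ra(): mmu_idx picks the address space. */
static void toy_stb_mmuidx_ra(ToyEnv *env, unsigned addr, uint8_t val,
                              int mmu_idx, uintptr_t ra)
{
    (void)ra;
    env->ram[mmu_idx & 3][addr & 0xf] = val;
}

/*
 * Shape of the converted helpers: no switch on mem_idx, the index is
 * just another argument, as in helper_swl()/helper_sdr() above.
 */
static void toy_store_word_bytes(ToyEnv *env, unsigned addr, uint32_t val,
                                 int mem_idx)
{
    int i;

    for (i = 0; i < 4; i++) {
        toy_stb_mmuidx_ra(env, addr + i, (uint8_t)(val >> (24 - 8 * i)),
                          mem_idx, 0);
    }
}

int main(void)
{
    ToyEnv env = { { { 0 } } };

    toy_store_word_bytes(&env, 0, 0x11223344, TOY_USER);
    printf("user byte 0 = 0x%02x\n", env.ram[TOY_USER][0]);  /* 0x11 */
    return 0;
}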
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 4bff585bd6..efe75e6be0 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -26,7 +26,7 @@
#include "internal.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "hw/mips/cpudevs.h"
diff --git a/target/moxie/translate.c b/target/moxie/translate.c
index c87e9ec2b1..d5fb27dfb8 100644
--- a/target/moxie/translate.c
+++ b/target/moxie/translate.c
@@ -26,7 +26,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "disas/disas.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "qemu/qemu-print.h"
diff --git a/target/nios2/cpu.h b/target/nios2/cpu.h
index 361b06ffeb..78f633f970 100644
--- a/target/nios2/cpu.h
+++ b/target/nios2/cpu.h
@@ -217,8 +217,6 @@ void do_nios2_semihosting(CPUNios2State *env);
#define CPU_SAVE_VERSION 1
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
#define MMU_SUPERVISOR_IDX 0
#define MMU_USER_IDX 1
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
index 82107bf270..6c34cd3193 100644
--- a/target/nios2/translate.c
+++ b/target/nios2/translate.c
@@ -23,7 +23,7 @@
#include "qemu/osdep.h"
#include "cpu.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/exec-all.h"
#include "disas/disas.h"
#include "exec/helper-proto.h"
diff --git a/target/openrisc/fpu_helper.c b/target/openrisc/fpu_helper.c
index 59e1413279..6f75ea0505 100644
--- a/target/openrisc/fpu_helper.c
+++ b/target/openrisc/fpu_helper.c
@@ -70,7 +70,7 @@ void cpu_set_fpcsr(CPUOpenRISCState *env, uint32_t val)
float_round_down
};
- env->fpcsr = val & 0x7ff;
+ env->fpcsr = val & 0xfff;
set_float_rounding_mode(rm_to_sf[extract32(val, 1, 2)], &env->fp_status);
}
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index 8dd28d6cf1..52323a16df 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -22,7 +22,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "disas/disas.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "qemu/log.h"
#include "qemu/bitops.h"
#include "qemu/qemu-print.h"
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 103bfe9dc2..8ebeaba649 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -951,8 +951,6 @@ struct ppc_radix_page_info {
* + real/paged mode combinations. The other two modes are for
* external PID load/store.
*/
-#define MMU_MODE8_SUFFIX _epl
-#define MMU_MODE9_SUFFIX _eps
#define PPC_TLB_EPID_LOAD 8
#define PPC_TLB_EPID_STORE 9
diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c
index 1351b53f28..e8e2a8ac2a 100644
--- a/target/ppc/mem_helper.c
+++ b/target/ppc/mem_helper.c
@@ -25,7 +25,7 @@
#include "exec/helper-proto.h"
#include "helper_regs.h"
#include "exec/cpu_ldst.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "internal.h"
#include "qemu/atomic128.h"
@@ -177,14 +177,7 @@ static void dcbz_common(CPUPPCState *env, target_ulong addr,
} else {
/* Slow path */
for (i = 0; i < dcbz_size; i += 8) {
- if (epid) {
-#if !defined(CONFIG_USER_ONLY)
- /* Does not make sense on USER_ONLY config */
- cpu_stq_eps_ra(env, addr + i, 0, retaddr);
-#endif
- } else {
- cpu_stq_data_ra(env, addr + i, 0, retaddr);
- }
+ cpu_stq_mmuidx_ra(env, addr + i, 0, mmu_idx, retaddr);
}
}
}
@@ -216,7 +209,7 @@ void helper_icbiep(CPUPPCState *env, target_ulong addr)
#if !defined(CONFIG_USER_ONLY)
/* See comments above */
addr &= ~(env->dcache_line_size - 1);
- cpu_ldl_epl_ra(env, addr, GETPC());
+ cpu_ldl_mmuidx_ra(env, addr, PPC_TLB_EPID_LOAD, GETPC());
#endif
}
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index f5fe5d0611..9dcf8dc261 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -23,8 +23,8 @@
#include "internal.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "qemu/host-utils.h"
#include "qemu/main-loop.h"
#include "exec/cpu_ldst.h"
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 767c8762ac..85403da9c8 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -22,7 +22,7 @@
#include "qemu/main-loop.h"
#include "cpu.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "trace.h"
int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch)
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 8e40ed3ac4..14dc71156b 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -19,7 +19,7 @@
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "cpu.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "disas/disas.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index e195e5c7c8..8a557fd8d1 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -36,11 +36,6 @@
#define TARGET_INSN_START_EXTRA_WORDS 2
-#define MMU_MODE0_SUFFIX _primary
-#define MMU_MODE1_SUFFIX _secondary
-#define MMU_MODE2_SUFFIX _home
-#define MMU_MODE3_SUFFIX _real
-
#define MMU_USER_IDX 0
#define S390_MAX_CPUS 248
diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c
index 2921419c27..a237dec757 100644
--- a/target/s390x/mem_helper.c
+++ b/target/s390x/mem_helper.c
@@ -27,6 +27,7 @@
#include "exec/cpu_ldst.h"
#include "qemu/int128.h"
#include "qemu/atomic128.h"
+#include "tcg/tcg.h"
#if !defined(CONFIG_USER_ONLY)
#include "hw/s390x/storage-keys.h"
@@ -2025,7 +2026,7 @@ uint32_t HELPER(testblock)(CPUS390XState *env, uint64_t real_addr)
real_addr = wrap_address(env, real_addr) & TARGET_PAGE_MASK;
for (i = 0; i < TARGET_PAGE_SIZE; i += 8) {
- cpu_stq_real_ra(env, real_addr + i, 0, ra);
+ cpu_stq_mmuidx_ra(env, real_addr + i, 0, MMU_REAL_IDX, ra);
}
return 0;
@@ -2259,11 +2260,11 @@ void HELPER(idte)(CPUS390XState *env, uint64_t r1, uint64_t r2, uint32_t m4)
for (i = 0; i < entries; i++) {
/* addresses are not wrapped in 24/31bit mode but table index is */
raddr = table + ((index + i) & 0x7ff) * sizeof(entry);
- entry = cpu_ldq_real_ra(env, raddr, ra);
+ entry = cpu_ldq_mmuidx_ra(env, raddr, MMU_REAL_IDX, ra);
if (!(entry & REGION_ENTRY_I)) {
/* we are allowed to not store if already invalid */
entry |= REGION_ENTRY_I;
- cpu_stq_real_ra(env, raddr, entry, ra);
+ cpu_stq_mmuidx_ra(env, raddr, entry, MMU_REAL_IDX, ra);
}
}
}
@@ -2290,9 +2291,9 @@ void HELPER(ipte)(CPUS390XState *env, uint64_t pto, uint64_t vaddr,
pte_addr += VADDR_PAGE_TX(vaddr) * 8;
/* Mark the page table entry as invalid */
- pte = cpu_ldq_real_ra(env, pte_addr, ra);
+ pte = cpu_ldq_mmuidx_ra(env, pte_addr, MMU_REAL_IDX, ra);
pte |= PAGE_ENTRY_I;
- cpu_stq_real_ra(env, pte_addr, pte, ra);
+ cpu_stq_mmuidx_ra(env, pte_addr, pte, MMU_REAL_IDX, ra);
/* XXX we exploit the fact that Linux passes the exact virtual
address here - it's not obliged to! */
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
index 4292bb0dd0..b764ec3140 100644
--- a/target/s390x/translate.c
+++ b/target/s390x/translate.c
@@ -33,8 +33,8 @@
#include "internal.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "qemu/log.h"
#include "qemu/host-utils.h"
#include "exec/cpu_ldst.h"
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index ecaa7a18a9..452a596e67 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -254,8 +254,6 @@ void cpu_load_tlb(CPUSH4State * env);
#define cpu_list sh4_cpu_list
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
#define MMU_USER_IDX 1
static inline int cpu_mmu_index (CPUSH4State *env, bool ifetch)
{
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 922785e225..6192d83e8c 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -23,7 +23,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "exec/helper-gen.h"
diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c
index 7345827a96..e91cfdecd3 100644
--- a/target/sparc/ldst_helper.c
+++ b/target/sparc/ldst_helper.c
@@ -19,7 +19,7 @@
#include "qemu/osdep.h"
#include "cpu.h"
-#include "tcg.h"
+#include "tcg/tcg.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index edc23a7c40..9416a551cf 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -24,7 +24,7 @@
#include "disas/disas.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-gen.h"
diff --git a/target/tilegx/translate.c b/target/tilegx/translate.c
index abce7e1c75..65f1c91f4f 100644
--- a/target/tilegx/translate.c
+++ b/target/tilegx/translate.c
@@ -24,7 +24,7 @@
#include "exec/log.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "linux-user/syscall_defs.h"
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index c574638c9f..609d75ae8a 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -22,7 +22,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "exec/cpu_ldst.h"
#include "qemu/qemu-print.h"
diff --git a/target/unicore32/cpu.h b/target/unicore32/cpu.h
index 50ed9dd99c..7a32e086ed 100644
--- a/target/unicore32/cpu.h
+++ b/target/unicore32/cpu.h
@@ -133,8 +133,6 @@ void cpu_asr_write(CPUUniCore32State *env1, target_ulong val, target_ulong mask)
int uc32_cpu_signal_handler(int host_signum, void *pinfo, void *puc);
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _kernel
-#define MMU_MODE1_SUFFIX _user
#define MMU_USER_IDX 1
static inline int cpu_mmu_index(CPUUniCore32State *env, bool ifetch)
{
diff --git a/target/unicore32/translate.c b/target/unicore32/translate.c
index 0f6891b8aa..d4b06df672 100644
--- a/target/unicore32/translate.c
+++ b/target/unicore32/translate.c
@@ -13,7 +13,7 @@
#include "cpu.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "qemu/log.h"
#include "exec/cpu_ldst.h"
#include "exec/translator.h"
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index 75e65df597..493f4fc80c 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -689,10 +689,6 @@ static inline uint32_t xtensa_replicate_windowstart(CPUXtensaState *env)
}
/* MMU modes definitions */
-#define MMU_MODE0_SUFFIX _ring0
-#define MMU_MODE1_SUFFIX _ring1
-#define MMU_MODE2_SUFFIX _ring2
-#define MMU_MODE3_SUFFIX _ring3
#define MMU_USER_IDX 3
static inline int cpu_mmu_index(CPUXtensaState *env, bool ifetch)
diff --git a/target/xtensa/mmu_helper.c b/target/xtensa/mmu_helper.c
index f15bff306f..b01ff9399a 100644
--- a/target/xtensa/mmu_helper.c
+++ b/target/xtensa/mmu_helper.c
@@ -63,10 +63,11 @@
void HELPER(itlb_hit_test)(CPUXtensaState *env, uint32_t vaddr)
{
/*
- * Attempt the memory load; we don't care about the result but
+ * Probe the memory; we don't care about the result but
* only the side-effects (ie any MMU or other exception)
*/
- cpu_ldub_code_ra(env, vaddr, GETPC());
+ probe_access(env, vaddr, 1, MMU_INST_FETCH,
+ cpu_mmu_index(env, true), GETPC());
}
void HELPER(wsr_rasid)(CPUXtensaState *env, uint32_t v)
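probe_access() gives the helper exactly what the updated comment asks for: the address translation and any resulting exception, with no data transfer, whereas the old cpu_ldub_code_ra() also performed the fetch. Below is a standalone sketch, not part of the patch, of the probe-only idea; the single-entry "ITLB" and the setjmp/longjmp fault path are toy stand-ins for QEMU's softmmu and exception unwinding:

#include <setjmp.h>
#include <stdint.h>
#include <stdio.h>

static jmp_buf fault_jmp;

typedef struct {
    uint32_t mapped_page;       /* the one page our toy ITLB covers */
} ToyEnv;

/* Stand-in for raising a guest exception and unwinding out of the helper. */
static void toy_raise_itlb_miss(uint32_t vaddr)
{
    printf("ITLB miss at 0x%08x\n", (unsigned)vaddr);
    longjmp(fault_jmp, 1);
}

/* Stand-in for probe_access(MMU_INST_FETCH): translate only, fetch nothing. */
static void toy_probe_ifetch(ToyEnv *env, uint32_t vaddr)
{
    if ((vaddr & ~(uint32_t)0xfff) != env->mapped_page) {
        toy_raise_itlb_miss(vaddr);
    }
    /* Hit: nothing to return -- the caller only wanted the side effects. */
}

int main(void)
{
    ToyEnv env = { .mapped_page = 0x1000 };

    if (setjmp(fault_jmp) == 0) {
        toy_probe_ifetch(&env, 0x1abc);   /* hit: silent */
        toy_probe_ifetch(&env, 0x5000);   /* miss: reports and unwinds */
    }
    return 0;
}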
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index e6d910786c..8aa972cafd 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -33,7 +33,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "disas/disas.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#include "qemu/log.h"
#include "qemu/qemu-print.h"
#include "exec/cpu_ldst.h"
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 3f921015d3..843fd0ca69 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -10,7 +10,7 @@
* See the COPYING file in the top-level directory for details.
*/
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
#include "qemu/bitops.h"
/* We're going to re-use TCGType in setting of the SF bit, which controls
@@ -1541,7 +1541,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
}
#ifdef CONFIG_SOFTMMU
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
* TCGMemOpIdx oi, uintptr_t ra)
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 94d80d79d1..fffb6611e2 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -23,7 +23,7 @@
*/
#include "elf.h"
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
int arm_arch = __ARM_ARCH;
@@ -1131,7 +1131,7 @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args,
}
#ifdef CONFIG_SOFTMMU
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
* int mmu_idx, uintptr_t ra)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 928e8b87bb..bfb3f5f6e9 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -223,7 +223,7 @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
* The x86 has a pretty strong memory ordering which only really
* allows for some stores to be re-ordered after loads.
*/
-#include "tcg-mo.h"
+#include "tcg/tcg-mo.h"
#define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 9d8ed974e0..cdedcb2b25 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
@@ -1647,7 +1647,7 @@ static void tcg_out_nopn(TCGContext *s, int n)
}
#if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
* int mmu_idx, uintptr_t ra)
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 5442167045..1da663ce84 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -1107,7 +1107,7 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *arg)
}
#if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
static void * const qemu_ld_helpers[16] = {
[MO_UB] = helper_ret_ldub_mmu,
diff --git a/tcg/optimize.c b/tcg/optimize.c
index f7f4e873c9..53aa8e5329 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -24,7 +24,7 @@
*/
#include "qemu/osdep.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#define CASE_OP_32_64(x) \
glue(glue(case INDEX_op_, x), _i32): \
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index d308d69aba..ee1f9227c1 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -23,7 +23,7 @@
*/
#include "elf.h"
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
#if defined _CALL_DARWIN || defined __APPLE__
#define TCG_TARGET_CALL_DARWIN
@@ -1845,7 +1845,7 @@ static const uint32_t qemu_exts_opc[4] = {
};
#if defined (CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
* int mmu_idx, uintptr_t ra)
diff --git a/tcg/riscv/tcg-target.inc.c b/tcg/riscv/tcg-target.inc.c
index 7018509693..2bc0ba71f2 100644
--- a/tcg/riscv/tcg-target.inc.c
+++ b/tcg/riscv/tcg-target.inc.c
@@ -27,7 +27,7 @@
* THE SOFTWARE.
*/
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
@@ -921,7 +921,7 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
*/
#if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
* TCGMemOpIdx oi, uintptr_t ra)
diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c
index 8aaa4cebe8..b07e9ff7d6 100644
--- a/tcg/s390/tcg-target.inc.c
+++ b/tcg/s390/tcg-target.inc.c
@@ -29,7 +29,7 @@
#error "unsupported code generation mode"
#endif
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
#include "elf.h"
/* ??? The translation blocks produced by TCG are generally small enough to
@@ -1536,7 +1536,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data,
}
#if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "../tcg-ldst.inc.c"
/* We're expecting to use a 20-bit negative offset on the tlb memory ops. */
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
diff --git a/tcg/sparc/tcg-target.inc.c b/tcg/sparc/tcg-target.inc.c
index d7986cda5c..65fddb310d 100644
--- a/tcg/sparc/tcg-target.inc.c
+++ b/tcg/sparc/tcg-target.inc.c
@@ -22,7 +22,7 @@
* THE SOFTWARE.
*/
-#include "tcg-pool.inc.c"
+#include "../tcg-pool.inc.c"
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
diff --git a/tcg/tcg-common.c b/tcg/tcg-common.c
index 97305a3efc..7e1992e79e 100644
--- a/tcg/tcg-common.c
+++ b/tcg/tcg-common.c
@@ -32,7 +32,7 @@ uintptr_t tci_tb_ptr;
TCGOpDef tcg_op_defs[] = {
#define DEF(s, oargs, iargs, cargs, flags) \
{ #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
-#include "tcg-opc.h"
+#include "tcg/tcg-opc.h"
#undef DEF
};
const size_t tcg_op_defs_max = ARRAY_SIZE(tcg_op_defs);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 5c95ecd51c..41b4a3c661 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -18,11 +18,11 @@
*/
#include "qemu/osdep.h"
-#include "tcg.h"
-#include "tcg-op.h"
-#include "tcg-op-gvec.h"
+#include "tcg/tcg.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
#include "qemu/main-loop.h"
-#include "tcg-gvec-desc.h"
+#include "tcg/tcg-gvec-desc.h"
#define MAX_UNROLL 4
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index 6714991bf4..b6937e8d64 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -19,9 +19,9 @@
#include "qemu/osdep.h"
#include "cpu.h"
-#include "tcg.h"
-#include "tcg-op.h"
-#include "tcg-mo.h"
+#include "tcg/tcg.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-mo.h"
/* Reduce the number of ifdefs below. This assumes that all uses of
TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index c245126f98..7d782002e3 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -25,9 +25,9 @@
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
-#include "tcg.h"
-#include "tcg-op.h"
-#include "tcg-mo.h"
+#include "tcg/tcg.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-mo.h"
#include "trace-tcg.h"
#include "trace/mem.h"
#include "exec/plugin-gen.h"
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 4f616ba38b..dd4b3d7684 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -48,7 +48,7 @@
#include "hw/boards.h"
#endif
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
#if UINTPTR_MAX == UINT32_MAX
# define ELF_CLASS ELFCLASS32
diff --git a/tcg/tci.c b/tcg/tci.c
index a6208653e8..46fe9ce63f 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -30,7 +30,7 @@
#include "qemu-common.h"
#include "tcg/tcg.h" /* MAX_OPC_PARAM_IARGS */
#include "exec/cpu_ldst.h"
-#include "tcg-op.h"
+#include "tcg/tcg-op.h"
/* Marker for missing code. */
#define TODO() \
diff --git a/tests/acceptance/boot_linux_console.py b/tests/acceptance/boot_linux_console.py
index 9c6aa2040a..e40b84651b 100644
--- a/tests/acceptance/boot_linux_console.py
+++ b/tests/acceptance/boot_linux_console.py
@@ -400,6 +400,91 @@ class BootLinuxConsole(Test):
self.wait_for_console_pattern('Boot successful.')
# TODO user command, for now the uart is stuck
+ def test_arm_cubieboard_initrd(self):
+ """
+ :avocado: tags=arch:arm
+ :avocado: tags=machine:cubieboard
+ """
+ deb_url = ('https://apt.armbian.com/pool/main/l/'
+ 'linux-4.20.7-sunxi/linux-image-dev-sunxi_5.75_armhf.deb')
+ deb_hash = '1334c29c44d984ffa05ed10de8c3361f33d78315'
+ deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash)
+ kernel_path = self.extract_from_deb(deb_path,
+ '/boot/vmlinuz-4.20.7-sunxi')
+ dtb_path = '/usr/lib/linux-image-dev-sunxi/sun4i-a10-cubieboard.dtb'
+ dtb_path = self.extract_from_deb(deb_path, dtb_path)
+ initrd_url = ('https://github.com/groeck/linux-build-test/raw/'
+ '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/'
+ 'arm/rootfs-armv5.cpio.gz')
+ initrd_hash = '2b50f1873e113523967806f4da2afe385462ff9b'
+ initrd_path_gz = self.fetch_asset(initrd_url, asset_hash=initrd_hash)
+ initrd_path = os.path.join(self.workdir, 'rootfs.cpio')
+ archive.gzip_uncompress(initrd_path_gz, initrd_path)
+
+ self.vm.set_console()
+ kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
+ 'console=ttyS0,115200 '
+ 'usbcore.nousb '
+ 'panic=-1 noreboot')
+ self.vm.add_args('-kernel', kernel_path,
+ '-dtb', dtb_path,
+ '-initrd', initrd_path,
+ '-append', kernel_command_line,
+ '-no-reboot')
+ self.vm.launch()
+ self.wait_for_console_pattern('Boot successful.')
+
+ exec_command_and_wait_for_pattern(self, 'cat /proc/cpuinfo',
+ 'Allwinner sun4i/sun5i')
+ exec_command_and_wait_for_pattern(self, 'cat /proc/iomem',
+ 'system-control@1c00000')
+ exec_command_and_wait_for_pattern(self, 'reboot',
+ 'reboot: Restarting system')
+
+ def test_arm_cubieboard_sata(self):
+ """
+ :avocado: tags=arch:arm
+ :avocado: tags=machine:cubieboard
+ """
+ deb_url = ('https://apt.armbian.com/pool/main/l/'
+ 'linux-4.20.7-sunxi/linux-image-dev-sunxi_5.75_armhf.deb')
+ deb_hash = '1334c29c44d984ffa05ed10de8c3361f33d78315'
+ deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash)
+ kernel_path = self.extract_from_deb(deb_path,
+ '/boot/vmlinuz-4.20.7-sunxi')
+ dtb_path = '/usr/lib/linux-image-dev-sunxi/sun4i-a10-cubieboard.dtb'
+ dtb_path = self.extract_from_deb(deb_path, dtb_path)
+ rootfs_url = ('https://github.com/groeck/linux-build-test/raw/'
+ '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/'
+ 'arm/rootfs-armv5.ext2.gz')
+ rootfs_hash = '093e89d2b4d982234bf528bc9fb2f2f17a9d1f93'
+ rootfs_path_gz = self.fetch_asset(rootfs_url, asset_hash=rootfs_hash)
+ rootfs_path = os.path.join(self.workdir, 'rootfs.cpio')
+ archive.gzip_uncompress(rootfs_path_gz, rootfs_path)
+
+ self.vm.set_console()
+ kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE +
+ 'console=ttyS0,115200 '
+ 'usbcore.nousb '
+ 'root=/dev/sda ro '
+ 'panic=-1 noreboot')
+ self.vm.add_args('-kernel', kernel_path,
+ '-dtb', dtb_path,
+ '-drive', 'if=none,format=raw,id=disk0,file='
+ + rootfs_path,
+ '-device', 'ide-hd,bus=ide.0,drive=disk0',
+ '-append', kernel_command_line,
+ '-no-reboot')
+ self.vm.launch()
+ self.wait_for_console_pattern('Boot successful.')
+
+ exec_command_and_wait_for_pattern(self, 'cat /proc/cpuinfo',
+ 'Allwinner sun4i/sun5i')
+ exec_command_and_wait_for_pattern(self, 'cat /proc/partitions',
+ 'sda')
+ exec_command_and_wait_for_pattern(self, 'reboot',
+ 'reboot: Restarting system')
+
def test_s390x_s390_ccw_virtio(self):
"""
:avocado: tags=arch:s390x
diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT
index 77ea60ffed..1f91888d7a 100644
--- a/tests/data/acpi/q35/DSDT
+++ b/tests/data/acpi/q35/DSDT
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat
index 30e3717b5b..3586f6368a 100644
--- a/tests/data/acpi/q35/DSDT.acpihmat
+++ b/tests/data/acpi/q35/DSDT.acpihmat
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.bridge b/tests/data/acpi/q35/DSDT.bridge
index fbc2d40000..eae3a2a865 100644
--- a/tests/data/acpi/q35/DSDT.bridge
+++ b/tests/data/acpi/q35/DSDT.bridge
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp
index 6a896cb214..53d735a4de 100644
--- a/tests/data/acpi/q35/DSDT.cphp
+++ b/tests/data/acpi/q35/DSDT.cphp
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.dimmpxm b/tests/data/acpi/q35/DSDT.dimmpxm
index 23fdf5e60a..02ccdd5f38 100644
--- a/tests/data/acpi/q35/DSDT.dimmpxm
+++ b/tests/data/acpi/q35/DSDT.dimmpxm
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt
index c3fca0a71e..9e2d4f785c 100644
--- a/tests/data/acpi/q35/DSDT.ipmibt
+++ b/tests/data/acpi/q35/DSDT.ipmibt
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.memhp b/tests/data/acpi/q35/DSDT.memhp
index 2abd0e36cd..baefa611ac 100644
--- a/tests/data/acpi/q35/DSDT.memhp
+++ b/tests/data/acpi/q35/DSDT.memhp
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64
index b32034a11c..aae0ea2110 100644
--- a/tests/data/acpi/q35/DSDT.mmio64
+++ b/tests/data/acpi/q35/DSDT.mmio64
Binary files differ
diff --git a/tests/data/acpi/q35/DSDT.numamem b/tests/data/acpi/q35/DSDT.numamem
index d8b2b47f8b..859a2e0871 100644
--- a/tests/data/acpi/q35/DSDT.numamem
+++ b/tests/data/acpi/q35/DSDT.numamem
Binary files differ
diff --git a/tests/data/acpi/rebuild-expected-aml.sh b/tests/data/acpi/rebuild-expected-aml.sh
index f89d4624bc..d44e511533 100755
--- a/tests/data/acpi/rebuild-expected-aml.sh
+++ b/tests/data/acpi/rebuild-expected-aml.sh
@@ -14,7 +14,7 @@
qemu_bins="x86_64-softmmu/qemu-system-x86_64 aarch64-softmmu/qemu-system-aarch64"
-if [ ! -e "tests/bios-tables-test" ]; then
+if [ ! -e "tests/qtest/bios-tables-test" ]; then
echo "Test: bios-tables-test is required! Run make check before this script."
echo "Run this script from the build directory."
exit 1;
@@ -26,11 +26,11 @@ for qemu in $qemu_bins; do
echo "Also, run this script from the build directory."
exit 1;
fi
- TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test
+ TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/qtest/bios-tables-test
done
eval `grep SRC_PATH= config-host.mak`
-echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/bios-tables-test-allowed-diff.h
+echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h
echo "The files were rebuilt and can be added to git."
diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index f1ac2d7e96..3ab4872bd7 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -16,7 +16,10 @@
* 1. add empty files for new tables, if any, under tests/data/acpi
* 2. list any changed files in tests/bios-tables-test-allowed-diff.h
* 3. commit the above *before* making changes that affect the tables
- * Maintainer:
+ *
+ * Contributor or ACPI Maintainer (steps 4-7 need to be redone to resolve conflicts
+ * in the binary commit created in step 6):
+ *
* After 1-3 above tests will pass but ignore differences with the expected files.
* You will also notice that tests/bios-tables-test-allowed-diff.h lists
* a bunch of files. This is your hint that you need to do the below:
@@ -28,13 +31,23 @@
* output. If not - disassemble them yourself in any way you like.
* Look at the differences - make sure they make sense and match what the
* changes you are merging are supposed to do.
+ * Save the changes, preferably in the form of an ASL diff, for the commit log in
+ * step 6.
*
* 5. From build directory, run:
* $(SRC_PATH)/tests/data/acpi/rebuild-expected-aml.sh
- * 6. Now commit any changes.
- * 7. Before doing a pull request, make sure tests/bios-tables-test-allowed-diff.h
- * is empty - this will ensure following changes to ACPI tables will
- * be noticed.
+ * 6. Now commit any changes to the expected binary, include diff from step 4
+ * in commit log.
+ * 7. Before sending patches to the list (Contributor)
+ * or before doing a pull request (Maintainer), make sure
+ * tests/bios-tables-test-allowed-diff.h is empty - this will ensure
+ * following changes to ACPI tables will be noticed.
+ *
+ * The resulting patchset/pull request then looks like this:
+ * - patch 1: list changed files in tests/bios-tables-test-allowed-diff.h.
+ * - patches 2 - n: real changes, may contain multiple patches.
+ * - patch n + 1: update golden master binaries and empty
+ * tests/bios-tables-test-allowed-diff.h
*/
#include "qemu/osdep.h"
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 53afec4395..26e2e77289 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -356,6 +356,43 @@ static void migrate_set_parameter_int(QTestState *who, const char *parameter,
migrate_check_parameter_int(who, parameter, value);
}
+static char *migrate_get_parameter_str(QTestState *who,
+ const char *parameter)
+{
+ QDict *rsp;
+ char *result;
+
+ rsp = wait_command(who, "{ 'execute': 'query-migrate-parameters' }");
+ result = g_strdup(qdict_get_str(rsp, parameter));
+ qobject_unref(rsp);
+ return result;
+}
+
+static void migrate_check_parameter_str(QTestState *who, const char *parameter,
+ const char *value)
+{
+ char *result;
+
+ result = migrate_get_parameter_str(who, parameter);
+ g_assert_cmpstr(result, ==, value);
+ g_free(result);
+}
+
+__attribute__((unused))
+static void migrate_set_parameter_str(QTestState *who, const char *parameter,
+ const char *value)
+{
+ QDict *rsp;
+
+ rsp = qtest_qmp(who,
+ "{ 'execute': 'migrate-set-parameters',"
+ "'arguments': { %s: %s } }",
+ parameter, value);
+ g_assert(qdict_haskey(rsp, "return"));
+ qobject_unref(rsp);
+ migrate_check_parameter_str(who, parameter, value);
+}
+
static void migrate_pause(QTestState *who)
{
QDict *rsp;
@@ -480,14 +517,14 @@ static int test_migrate_start(QTestState **from, QTestState **to,
} else if (strcmp(arch, "ppc64") == 0) {
machine_opts = "vsmt=8";
memory_size = "256M";
+ start_address = PPC_TEST_MEM_START;
+ end_address = PPC_TEST_MEM_END;
arch_source = g_strdup_printf("-nodefaults "
"-prom-env 'use-nvramrc?=true' -prom-env "
"'nvramrc=hex .\" _\" begin %x %x "
"do i c@ 1 + i c! 1000 +loop .\" B\" 0 "
"until'", end_address, start_address);
arch_target = g_strdup("");
- start_address = PPC_TEST_MEM_START;
- end_address = PPC_TEST_MEM_END;
} else if (strcmp(arch, "aarch64") == 0) {
init_bootfile(bootpath, aarch64_kernel, sizeof(aarch64_kernel));
machine_opts = "virt,gic-version=max";
@@ -1202,6 +1239,61 @@ static void test_migrate_auto_converge(void)
test_migrate_end(from, to, true);
}
+static void test_multifd_tcp(void)
+{
+ MigrateStart *args = migrate_start_new();
+ QTestState *from, *to;
+ QDict *rsp;
+ char *uri;
+
+ if (test_migrate_start(&from, &to, "defer", args)) {
+ return;
+ }
+
+ /*
+ * We want the transfer rate to be high enough that the test completes
+ * quickly, but the downtime limit low enough that migration does not
+ * converge during precopy, even on a slow machine.
+ */
+ /* 1 ms should make it not converge */
+ migrate_set_parameter_int(from, "downtime-limit", 1);
+ /* 1GB/s */
+ migrate_set_parameter_int(from, "max-bandwidth", 1000000000);
+
+ migrate_set_parameter_int(from, "multifd-channels", 16);
+ migrate_set_parameter_int(to, "multifd-channels", 16);
+
+ migrate_set_capability(from, "multifd", "true");
+ migrate_set_capability(to, "multifd", "true");
+
+ /* Start incoming migration from the 1st socket */
+ rsp = wait_command(to, "{ 'execute': 'migrate-incoming',"
+ " 'arguments': { 'uri': 'tcp:127.0.0.1:0' }}");
+ qobject_unref(rsp);
+
+ /* Wait for the first serial output from the source */
+ wait_for_serial("src_serial");
+
+ uri = migrate_get_socket_address(to, "socket-address");
+
+ migrate_qmp(from, uri, "{}");
+
+ wait_for_migration_pass(from);
+
+ /* with a 300ms downtime limit it should converge */
+ migrate_set_parameter_int(from, "downtime-limit", 300);
+
+ if (!got_stop) {
+ qtest_qmp_eventwait(from, "STOP");
+ }
+ qtest_qmp_eventwait(to, "RESUME");
+
+ wait_for_serial("dest_serial");
+ wait_for_migration_complete(from);
+ test_migrate_end(from, to, true);
+ free(uri);
+}
+
int main(int argc, char **argv)
{
char template[] = "/tmp/migration-test-XXXXXX";
@@ -1266,6 +1358,7 @@ int main(int argc, char **argv)
test_validate_uuid_dst_not_set);
qtest_add_func("/migration/auto_converge", test_migrate_auto_converge);
+ qtest_add_func("/migration/multifd/tcp", test_multifd_tcp);
ret = g_test_run();
diff --git a/tests/qtest/q35-test.c b/tests/qtest/q35-test.c
index a68183d513..c922d81bc0 100644
--- a/tests/qtest/q35-test.c
+++ b/tests/qtest/q35-test.c
@@ -186,6 +186,109 @@ static void test_tseg_size(const void *data)
qtest_quit(qts);
}
+#define SMBASE 0x30000
+#define SMRAM_TEST_PATTERN 0x32
+#define SMRAM_TEST_RESET_PATTERN 0x23
+
+static void test_smram_smbase_lock(void)
+{
+ QPCIBus *pcibus;
+ QPCIDevice *pcidev;
+ QDict *response;
+ QTestState *qts;
+ int i;
+
+ qts = qtest_init("-M q35");
+
+ pcibus = qpci_new_pc(qts, NULL);
+ g_assert(pcibus != NULL);
+
+ pcidev = qpci_device_find(pcibus, 0);
+ g_assert(pcidev != NULL);
+
+ /* check that SMRAM is not enabled by default */
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0);
+ qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN);
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_PATTERN);
+
+ /* check that writing junk to 0x9c before negotiating is ignored */
+ for (i = 0; i < 0xff; i++) {
+ qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, i);
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0);
+ }
+
+ /* enable SMRAM at SMBASE */
+ qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, 0xff);
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0x01);
+ /* lock SMRAM at SMBASE */
+ qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, 0x02);
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0x02);
+
+ /* check that SMRAM at SMBASE is locked and can't be unlocked */
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, 0xff);
+ for (i = 0; i <= 0xff; i++) {
+ /* make sure register is immutable */
+ qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, i);
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0x02);
+
+ /* RAM access should go into black hole */
+ qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN);
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, 0xff);
+ }
+
+ /* reset */
+ response = qtest_qmp(qts, "{'execute': 'system_reset', 'arguments': {} }");
+ g_assert(response);
+ g_assert(!qdict_haskey(response, "error"));
+ qobject_unref(response);
+
+ /* check RAM at SMBASE is available after reset */
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_PATTERN);
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0);
+ qtest_writeb(qts, SMBASE, SMRAM_TEST_RESET_PATTERN);
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_RESET_PATTERN);
+
+ g_free(pcidev);
+ qpci_free_pc(pcibus);
+
+ qtest_quit(qts);
+}
+
+static void test_without_smram_base(void)
+{
+ QPCIBus *pcibus;
+ QPCIDevice *pcidev;
+ QTestState *qts;
+ int i;
+
+ qts = qtest_init("-M pc-q35-4.1");
+
+ pcibus = qpci_new_pc(qts, NULL);
+ g_assert(pcibus != NULL);
+
+ pcidev = qpci_device_find(pcibus, 0);
+ g_assert(pcidev != NULL);
+
+ /* check that RAM is accessible */
+ qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN);
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_PATTERN);
+
+ /* check that writing to 0x9c succeeds */
+ for (i = 0; i <= 0xff; i++) {
+ qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, i);
+ g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == i);
+ }
+
+ /* check that RAM is still accessible */
+ qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN + 1);
+ g_assert_cmpint(qtest_readb(qts, SMBASE), ==, (SMRAM_TEST_PATTERN + 1));
+
+ g_free(pcidev);
+ qpci_free_pc(pcibus);
+
+ qtest_quit(qts);
+}
+
int main(int argc, char **argv)
{
g_test_init(&argc, &argv, NULL);
@@ -197,5 +300,7 @@ int main(int argc, char **argv)
qtest_add_data_func("/q35/tseg-size/8mb", &tseg_8mb, test_tseg_size);
qtest_add_data_func("/q35/tseg-size/ext/16mb", &tseg_ext_16mb,
test_tseg_size);
+ qtest_add_func("/q35/smram/smbase_lock", test_smram_smbase_lock);
+ qtest_add_func("/q35/smram/legacy_smbase", test_without_smram_base);
return g_test_run();
}
diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c
index 2324b964ad..9ee0f1e4fd 100644
--- a/tests/qtest/vhost-user-test.c
+++ b/tests/qtest/vhost-user-test.c
@@ -707,9 +707,9 @@ static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc)
static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc)
{
TestServer *s = arg;
- TestServer *dest = test_server_new("dest");
- GString *dest_cmdline = g_string_new(qos_get_current_command_line());
- char *uri = g_strdup_printf("%s%s", "unix:", dest->mig_path);
+ TestServer *dest;
+ GString *dest_cmdline;
+ char *uri;
QTestState *to;
GSource *source;
QDict *rsp;
@@ -720,6 +720,10 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc)
return;
}
+ dest = test_server_new("dest");
+ dest_cmdline = g_string_new(qos_get_current_command_line());
+ uri = g_strdup_printf("%s%s", "unix:", dest->mig_path);
+
size = get_log_size(s);
g_assert_cmpint(size, ==, (256 * 1024 * 1024) / (VHOST_LOG_PAGE * 8));
@@ -778,6 +782,7 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc)
qtest_quit(to);
test_server_free(dest);
g_free(uri);
+ g_string_free(dest_cmdline, true);
}
static void wait_for_rings_started(TestServer *s, size_t count)
diff --git a/tests/tcg/aarch64/Makefile.softmmu-target b/tests/tcg/aarch64/Makefile.softmmu-target
index 7b4eede3f0..f6b5121f5c 100644
--- a/tests/tcg/aarch64/Makefile.softmmu-target
+++ b/tests/tcg/aarch64/Makefile.softmmu-target
@@ -61,4 +61,7 @@ run-memory-replay: memory-replay run-memory-record
$(QEMU_OPTS) memory, \
"$< on $(TARGET_NAME)")
-EXTRA_TESTS+=memory-record memory-replay
+run-pauth-3: pauth-3
+pauth-3: CFLAGS += -march=armv8.3-a
+
+EXTRA_TESTS+=memory-record memory-replay pauth-3
diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target
index df3fe8032c..efa67cf1e9 100644
--- a/tests/tcg/aarch64/Makefile.target
+++ b/tests/tcg/aarch64/Makefile.target
@@ -18,8 +18,9 @@ run-fcvt: fcvt
$(call diff-out,$<,$(AARCH64_SRC)/fcvt.ref)
# Pauth Tests
-AARCH64_TESTS += pauth-1 pauth-2
+AARCH64_TESTS += pauth-1 pauth-2 pauth-4
run-pauth-%: QEMU_OPTS += -cpu max
+pauth-%: CFLAGS += -march=armv8.3-a
# Semihosting smoke test for linux-user
AARCH64_TESTS += semihosting
diff --git a/tests/tcg/aarch64/pauth-1.c b/tests/tcg/aarch64/pauth-1.c
index a3c1443cd0..ea0984ea82 100644
--- a/tests/tcg/aarch64/pauth-1.c
+++ b/tests/tcg/aarch64/pauth-1.c
@@ -2,8 +2,6 @@
#include <sys/prctl.h>
#include <stdio.h>
-asm(".arch armv8.4-a");
-
#ifndef PR_PAC_RESET_KEYS
#define PR_PAC_RESET_KEYS 54
#define PR_PAC_APDAKEY (1 << 2)
diff --git a/tests/tcg/aarch64/pauth-2.c b/tests/tcg/aarch64/pauth-2.c
index 2fe030ba3d..9bba0beb63 100644
--- a/tests/tcg/aarch64/pauth-2.c
+++ b/tests/tcg/aarch64/pauth-2.c
@@ -1,8 +1,6 @@
#include <stdint.h>
#include <assert.h>
-asm(".arch armv8.4-a");
-
void do_test(uint64_t value)
{
uint64_t salt1, salt2;
diff --git a/tests/tcg/aarch64/pauth-4.c b/tests/tcg/aarch64/pauth-4.c
new file mode 100644
index 0000000000..1040e92aec
--- /dev/null
+++ b/tests/tcg/aarch64/pauth-4.c
@@ -0,0 +1,25 @@
+#include <stdint.h>
+#include <assert.h>
+
+int main()
+{
+ uintptr_t x, y;
+
+ asm("mov %0, lr\n\t"
+ "pacia %0, sp\n\t" /* sigill if pauth not supported */
+ "eor %0, %0, #4\n\t" /* corrupt single bit */
+ "mov %1, %0\n\t"
+ "autia %1, sp\n\t" /* validate corrupted pointer */
+ "xpaci %0\n\t" /* strip pac from corrupted pointer */
+ : "=r"(x), "=r"(y));
+
+ /*
+ * Once stripped, the corrupted pointer is of the form 0x0000...wxyz.
+ * We expect the autia to indicate failure, producing a pointer of the
+ * form 0x000e....wxyz. Use xpaci and != for the test, rather than
+ * extracting explicit bits from the top, because the location of the
+ * error code "e" depends on the configuration of virtual memory.
+ */
+ assert(x != y);
+ return 0;
+}
diff --git a/tests/tcg/aarch64/system/pauth-3.c b/tests/tcg/aarch64/system/pauth-3.c
new file mode 100644
index 0000000000..42eff4d5ea
--- /dev/null
+++ b/tests/tcg/aarch64/system/pauth-3.c
@@ -0,0 +1,40 @@
+#include <inttypes.h>
+#include <minilib.h>
+
+int main()
+{
+ /*
+ * Test vector from QARMA paper (https://eprint.iacr.org/2016/444.pdf)
+ * to verify one computation of the pauth_computepac() function,
+ * which uses sbox2.
+ *
+ * Use PACGA, because it returns the most bits from ComputePAC.
+ * We still only get the most significant 32-bits of the result.
+ */
+
+ static const uint64_t d[5] = {
+ 0xfb623599da6e8127ull,
+ 0x477d469dec0b8762ull,
+ 0x84be85ce9804e94bull,
+ 0xec2802d4e0a488e9ull,
+ 0xc003b93999b33765ull & 0xffffffff00000000ull
+ };
+ uint64_t r;
+
+ asm("msr apgakeyhi_el1, %[w0]\n\t"
+ "msr apgakeylo_el1, %[k0]\n\t"
+ "pacga %[r], %[P], %[T]"
+ : [r] "=r"(r)
+ : [P] "r" (d[0]),
+ [T] "r" (d[1]),
+ [w0] "r" (d[2]),
+ [k0] "r" (d[3]));
+
+ if (r == d[4]) {
+ ml_printf("OK\n");
+ return 0;
+ } else {
+ ml_printf("FAIL: %lx != %lx\n", r, d[4]);
+ return 1;
+ }
+}
diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c
index 8f184f3556..cea363dd69 100644
--- a/tests/test-vmstate.c
+++ b/tests/test-vmstate.c
@@ -926,6 +926,28 @@ static const VMStateDescription vmstate_domain = {
}
};
+/* test QLIST Migration */
+
+typedef struct TestQListElement {
+ uint32_t id;
+ QLIST_ENTRY(TestQListElement) next;
+} TestQListElement;
+
+typedef struct TestQListContainer {
+ uint32_t id;
+ QLIST_HEAD(, TestQListElement) list;
+} TestQListContainer;
+
+static const VMStateDescription vmstate_qlist_element = {
+ .name = "test/queue list",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(id, TestQListElement),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
static const VMStateDescription vmstate_iommu = {
.name = "iommu",
.version_id = 1,
@@ -939,6 +961,18 @@ static const VMStateDescription vmstate_iommu = {
}
};
+static const VMStateDescription vmstate_container = {
+ .name = "test/container/qlist",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(id, TestQListContainer),
+ VMSTATE_QLIST_V(list, TestQListContainer, 1, vmstate_qlist_element,
+ TestQListElement, next),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
uint8_t first_domain_dump[] = {
/* id */
0x00, 0x0, 0x0, 0x6,
@@ -1229,6 +1263,140 @@ static void test_gtree_load_iommu(void)
qemu_fclose(fload);
}
+static uint8_t qlist_dump[] = {
+ 0x00, 0x00, 0x00, 0x01, /* container id */
+ 0x1, /* start of a */
+ 0x00, 0x00, 0x00, 0x0a,
+ 0x1, /* start of b */
+ 0x00, 0x00, 0x0b, 0x00,
+ 0x1, /* start of c */
+ 0x00, 0x0c, 0x00, 0x00,
+ 0x1, /* start of d */
+ 0x0d, 0x00, 0x00, 0x00,
+ 0x0, /* end of list */
+ QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */
+};
+
+static TestQListContainer *alloc_container(void)
+{
+ TestQListElement *a = g_malloc(sizeof(TestQListElement));
+ TestQListElement *b = g_malloc(sizeof(TestQListElement));
+ TestQListElement *c = g_malloc(sizeof(TestQListElement));
+ TestQListElement *d = g_malloc(sizeof(TestQListElement));
+ TestQListContainer *container = g_malloc(sizeof(TestQListContainer));
+
+ a->id = 0x0a;
+ b->id = 0x0b00;
+ c->id = 0xc0000;
+ d->id = 0xd000000;
+ container->id = 1;
+
+ QLIST_INIT(&container->list);
+ QLIST_INSERT_HEAD(&container->list, d, next);
+ QLIST_INSERT_HEAD(&container->list, c, next);
+ QLIST_INSERT_HEAD(&container->list, b, next);
+ QLIST_INSERT_HEAD(&container->list, a, next);
+ return container;
+}
+
+static void free_container(TestQListContainer *container)
+{
+ TestQListElement *iter, *tmp;
+
+ QLIST_FOREACH_SAFE(iter, &container->list, next, tmp) {
+ QLIST_REMOVE(iter, next);
+ g_free(iter);
+ }
+ g_free(container);
+}
+
+static void compare_containers(TestQListContainer *c1, TestQListContainer *c2)
+{
+ TestQListElement *first_item_c1, *first_item_c2;
+
+ while (!QLIST_EMPTY(&c1->list)) {
+ first_item_c1 = QLIST_FIRST(&c1->list);
+ first_item_c2 = QLIST_FIRST(&c2->list);
+ assert(first_item_c2);
+ assert(first_item_c1->id == first_item_c2->id);
+ QLIST_REMOVE(first_item_c1, next);
+ QLIST_REMOVE(first_item_c2, next);
+ g_free(first_item_c1);
+ g_free(first_item_c2);
+ }
+ assert(QLIST_EMPTY(&c2->list));
+}
+
+/*
+ * Check the prev & next fields are correct by doing list
+ * manipulations on the container. We will do that for both
+ * the source and the destination containers
+ */
+static void manipulate_container(TestQListContainer *c)
+{
+ TestQListElement *prev = NULL, *iter = QLIST_FIRST(&c->list);
+ TestQListElement *elem;
+
+ elem = g_malloc(sizeof(TestQListElement));
+ elem->id = 0x12;
+ QLIST_INSERT_AFTER(iter, elem, next);
+
+ elem = g_malloc(sizeof(TestQListElement));
+ elem->id = 0x13;
+ QLIST_INSERT_HEAD(&c->list, elem, next);
+
+ while (iter) {
+ prev = iter;
+ iter = QLIST_NEXT(iter, next);
+ }
+
+ elem = g_malloc(sizeof(TestQListElement));
+ elem->id = 0x14;
+ QLIST_INSERT_BEFORE(prev, elem, next);
+
+ elem = g_malloc(sizeof(TestQListElement));
+ elem->id = 0x15;
+ QLIST_INSERT_AFTER(prev, elem, next);
+
+ QLIST_REMOVE(prev, next);
+ g_free(prev);
+}
+
+static void test_save_qlist(void)
+{
+ TestQListContainer *container = alloc_container();
+
+ save_vmstate(&vmstate_container, container);
+ compare_vmstate(qlist_dump, sizeof(qlist_dump));
+ free_container(container);
+}
+
+static void test_load_qlist(void)
+{
+ QEMUFile *fsave, *fload;
+ TestQListContainer *orig_container = alloc_container();
+ TestQListContainer *dest_container = g_malloc0(sizeof(TestQListContainer));
+ char eof;
+
+ QLIST_INIT(&dest_container->list);
+
+ fsave = open_test_file(true);
+ qemu_put_buffer(fsave, qlist_dump, sizeof(qlist_dump));
+ g_assert(!qemu_file_get_error(fsave));
+ qemu_fclose(fsave);
+
+ fload = open_test_file(false);
+ vmstate_load_state(fload, &vmstate_container, dest_container, 1);
+ eof = qemu_get_byte(fload);
+ g_assert(!qemu_file_get_error(fload));
+ g_assert_cmpint(eof, ==, QEMU_VM_EOF);
+ manipulate_container(orig_container);
+ manipulate_container(dest_container);
+ compare_containers(orig_container, dest_container);
+ free_container(orig_container);
+ free_container(dest_container);
+}
+
typedef struct TmpTestStruct {
TestStruct *parent;
int64_t diff;
@@ -1353,6 +1521,8 @@ int main(int argc, char **argv)
g_test_add_func("/vmstate/gtree/load/loaddomain", test_gtree_load_domain);
g_test_add_func("/vmstate/gtree/save/saveiommu", test_gtree_save_iommu);
g_test_add_func("/vmstate/gtree/load/loadiommu", test_gtree_load_iommu);
+ g_test_add_func("/vmstate/qlist/save/saveqlist", test_save_qlist);
+ g_test_add_func("/vmstate/qlist/load/loadqlist", test_load_qlist);
g_test_add_func("/vmstate/tmp_struct", test_tmp_struct);
g_test_run();
diff --git a/tools/virtiofsd/50-qemu-virtiofsd.json.in b/tools/virtiofsd/50-qemu-virtiofsd.json.in
new file mode 100644
index 0000000000..9bcd86f8dc
--- /dev/null
+++ b/tools/virtiofsd/50-qemu-virtiofsd.json.in
@@ -0,0 +1,5 @@
+{
+ "description": "QEMU virtiofsd vhost-user-fs",
+ "type": "fs",
+ "binary": "@libexecdir@/virtiofsd"
+}
diff --git a/tools/virtiofsd/Makefile.objs b/tools/virtiofsd/Makefile.objs
new file mode 100644
index 0000000000..076f667e46
--- /dev/null
+++ b/tools/virtiofsd/Makefile.objs
@@ -0,0 +1,12 @@
+virtiofsd-obj-y = buffer.o \
+ fuse_opt.o \
+ fuse_log.o \
+ fuse_lowlevel.o \
+ fuse_signals.o \
+ fuse_virtio.o \
+ helper.o \
+ passthrough_ll.o \
+ seccomp.o
+
+seccomp.o-cflags := $(SECCOMP_CFLAGS)
+seccomp.o-libs := $(SECCOMP_LIBS)
diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c
new file mode 100644
index 0000000000..27c1377f22
--- /dev/null
+++ b/tools/virtiofsd/buffer.c
@@ -0,0 +1,351 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2010 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Functions for dealing with `struct fuse_buf` and `struct
+ * fuse_bufvec`.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "fuse_lowlevel.h"
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+size_t fuse_buf_size(const struct fuse_bufvec *bufv)
+{
+ size_t i;
+ size_t size = 0;
+
+ for (i = 0; i < bufv->count; i++) {
+ if (bufv->buf[i].size == SIZE_MAX) {
+ size = SIZE_MAX;
+ } else {
+ size += bufv->buf[i].size;
+ }
+ }
+
+ return size;
+}
+
+static ssize_t fuse_buf_writev(struct fuse_buf *out_buf,
+ struct fuse_bufvec *in_buf)
+{
+ ssize_t res, i, j;
+ size_t iovcnt = in_buf->count;
+ struct iovec *iov;
+ int fd = out_buf->fd;
+
+ iov = calloc(iovcnt, sizeof(struct iovec));
+ if (!iov) {
+ return -ENOMEM;
+ }
+
+ for (i = 0, j = 0; i < iovcnt; i++) {
+ /* Skip the buf with 0 size */
+ if (in_buf->buf[i].size) {
+ iov[j].iov_base = in_buf->buf[i].mem;
+ iov[j].iov_len = in_buf->buf[i].size;
+ j++;
+ }
+ }
+
+ if (out_buf->flags & FUSE_BUF_FD_SEEK) {
+ res = pwritev(fd, iov, iovcnt, out_buf->pos);
+ } else {
+ res = writev(fd, iov, iovcnt);
+ }
+
+ if (res == -1) {
+ res = -errno;
+ }
+
+ free(iov);
+ return res;
+}
+
+static size_t min_size(size_t s1, size_t s2)
+{
+ return s1 < s2 ? s1 : s2;
+}
+
+static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ ssize_t res = 0;
+ size_t copied = 0;
+
+ while (len) {
+ if (dst->flags & FUSE_BUF_FD_SEEK) {
+ res = pwrite(dst->fd, (char *)src->mem + src_off, len,
+ dst->pos + dst_off);
+ } else {
+ res = write(dst->fd, (char *)src->mem + src_off, len);
+ }
+ if (res == -1) {
+ if (!copied) {
+ return -errno;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ copied += res;
+ if (!(dst->flags & FUSE_BUF_FD_RETRY)) {
+ break;
+ }
+
+ src_off += res;
+ dst_off += res;
+ len -= res;
+ }
+
+ return copied;
+}
+
+static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ ssize_t res = 0;
+ size_t copied = 0;
+
+ while (len) {
+ if (src->flags & FUSE_BUF_FD_SEEK) {
+ res = pread(src->fd, (char *)dst->mem + dst_off, len,
+ src->pos + src_off);
+ } else {
+ res = read(src->fd, (char *)dst->mem + dst_off, len);
+ }
+ if (res == -1) {
+ if (!copied) {
+ return -errno;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ copied += res;
+ if (!(src->flags & FUSE_BUF_FD_RETRY)) {
+ break;
+ }
+
+ dst_off += res;
+ src_off += res;
+ len -= res;
+ }
+
+ return copied;
+}
+
+static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ char buf[4096];
+ struct fuse_buf tmp = {
+ .size = sizeof(buf),
+ .flags = 0,
+ };
+ ssize_t res;
+ size_t copied = 0;
+
+ tmp.mem = buf;
+
+ while (len) {
+ size_t this_len = min_size(tmp.size, len);
+ size_t read_len;
+
+ res = fuse_buf_read(&tmp, 0, src, src_off, this_len);
+ if (res < 0) {
+ if (!copied) {
+ return res;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ read_len = res;
+ res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len);
+ if (res < 0) {
+ if (!copied) {
+ return res;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ copied += res;
+
+ if (res < this_len) {
+ break;
+ }
+
+ dst_off += res;
+ src_off += res;
+ len -= res;
+ }
+
+ return copied;
+}
+
+static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ int src_is_fd = src->flags & FUSE_BUF_IS_FD;
+ int dst_is_fd = dst->flags & FUSE_BUF_IS_FD;
+
+ if (!src_is_fd && !dst_is_fd) {
+ char *dstmem = (char *)dst->mem + dst_off;
+ char *srcmem = (char *)src->mem + src_off;
+
+ if (dstmem != srcmem) {
+ if (dstmem + len <= srcmem || srcmem + len <= dstmem) {
+ memcpy(dstmem, srcmem, len);
+ } else {
+ memmove(dstmem, srcmem, len);
+ }
+ }
+
+ return len;
+ } else if (!src_is_fd) {
+ return fuse_buf_write(dst, dst_off, src, src_off, len);
+ } else if (!dst_is_fd) {
+ return fuse_buf_read(dst, dst_off, src, src_off, len);
+ } else {
+ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len);
+ }
+}
+
+static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv)
+{
+ if (bufv->idx < bufv->count) {
+ return &bufv->buf[bufv->idx];
+ } else {
+ return NULL;
+ }
+}
+
+static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len)
+{
+ const struct fuse_buf *buf = fuse_bufvec_current(bufv);
+
+ bufv->off += len;
+ assert(bufv->off <= buf->size);
+ if (bufv->off == buf->size) {
+ assert(bufv->idx < bufv->count);
+ bufv->idx++;
+ if (bufv->idx == bufv->count) {
+ return 0;
+ }
+ bufv->off = 0;
+ }
+ return 1;
+}
+
+ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv)
+{
+ size_t copied = 0, i;
+
+ if (dstv == srcv) {
+ return fuse_buf_size(dstv);
+ }
+
+ /*
+ * Use writev to improve bandwidth when all the src buffers are
+ * already mapped by the daemon process.
+ */
+ for (i = 0; i < srcv->count; i++) {
+ if (srcv->buf[i].flags & FUSE_BUF_IS_FD) {
+ break;
+ }
+ }
+ if ((i == srcv->count) && (dstv->count == 1) &&
+ (dstv->idx == 0) &&
+ (dstv->buf[0].flags & FUSE_BUF_IS_FD)) {
+ dstv->buf[0].pos += dstv->off;
+ return fuse_buf_writev(&dstv->buf[0], srcv);
+ }
+
+ for (;;) {
+ const struct fuse_buf *src = fuse_bufvec_current(srcv);
+ const struct fuse_buf *dst = fuse_bufvec_current(dstv);
+ size_t src_len;
+ size_t dst_len;
+ size_t len;
+ ssize_t res;
+
+ if (src == NULL || dst == NULL) {
+ break;
+ }
+
+ src_len = src->size - srcv->off;
+ dst_len = dst->size - dstv->off;
+ len = min_size(src_len, dst_len);
+
+ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len);
+ if (res < 0) {
+ if (!copied) {
+ return res;
+ }
+ break;
+ }
+ copied += res;
+
+ if (!fuse_bufvec_advance(srcv, res) ||
+ !fuse_bufvec_advance(dstv, res)) {
+ break;
+ }
+
+ if (res < len) {
+ break;
+ }
+ }
+
+ return copied;
+}
+
+void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len)
+{
+ void *ptr;
+
+ if (len > iter->size - iter->pos) {
+ return NULL;
+ }
+
+ ptr = iter->mem + iter->pos;
+ iter->pos += len;
+ return ptr;
+}
+
+const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter)
+{
+ const char *str = iter->mem + iter->pos;
+ size_t remaining = iter->size - iter->pos;
+ size_t i;
+
+ for (i = 0; i < remaining; i++) {
+ if (str[i] == '\0') {
+ iter->pos += i + 1;
+ return str;
+ }
+ }
+ return NULL;
+}
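For orientation, a minimal sketch of how a caller is expected to drive fuse_buf_copy(): an in-memory source bufvec copied into a single fd-backed destination takes the writev fast path above. The FUSE_BUFVEC_INIT initializer is assumed to come from fuse_common.h; the fd, data pointer and offset are placeholders.

/* Sketch only: write `len` bytes from `data` to `fd` at offset `off`. */
static ssize_t sketch_write_payload(int fd, void *data, size_t len, off_t off)
{
    struct fuse_bufvec src = FUSE_BUFVEC_INIT(len);
    struct fuse_bufvec dst = FUSE_BUFVEC_INIT(len);

    src.buf[0].mem = data;                         /* memory-backed source */

    dst.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
    dst.buf[0].fd = fd;                            /* fd-backed destination */
    dst.buf[0].pos = off;

    /* no FD source buffers + single fd destination => fuse_buf_writev() path */
    return fuse_buf_copy(&dst, &src);
}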
diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h
new file mode 100644
index 0000000000..7a4c713559
--- /dev/null
+++ b/tools/virtiofsd/fuse.h
@@ -0,0 +1,1249 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_H_
+#define FUSE_H_
+
+/*
+ *
+ * This file defines the library interface of FUSE
+ *
+ * IMPORTANT: you should define FUSE_USE_VERSION before including this header.
+ */
+
+#include "fuse_common.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <time.h>
+
+/*
+ * Basic FUSE API
+ */
+
+/** Handle for a FUSE filesystem */
+struct fuse;
+
+/**
+ * Readdir flags, passed to ->readdir()
+ */
+enum fuse_readdir_flags {
+ /**
+ * "Plus" mode.
+ *
+ * The kernel wants to prefill the inode cache during readdir. The
+ * filesystem may honour this by filling in the attributes and setting
+ * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also
+ * just ignore this flag completely.
+ */
+ FUSE_READDIR_PLUS = (1 << 0),
+};
+
+enum fuse_fill_dir_flags {
+ /**
+ * "Plus" mode: all file attributes are valid
+ *
+ * The attributes are used by the kernel to prefill the inode cache
+ * during a readdir.
+ *
+ * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set
+ * and vice versa.
+ */
+ FUSE_FILL_DIR_PLUS = (1 << 1),
+};
+
+/**
+ * Function to add an entry in a readdir() operation
+ *
+ * The *off* parameter can be any non-zero value that enables the
+ * filesystem to identify the current point in the directory
+ * stream. It does not need to be the actual physical position. A
+ * value of zero is reserved to indicate that seeking in directories
+ * is not supported.
+ *
+ * @param buf the buffer passed to the readdir() operation
+ * @param name the file name of the directory entry
+ * @param stbuf file attributes, can be NULL
+ * @param off offset of the next entry or zero
+ * @param flags fill flags
+ * @return 1 if buffer is full, zero otherwise
+ */
+typedef int (*fuse_fill_dir_t)(void *buf, const char *name,
+ const struct stat *stbuf, off_t off,
+ enum fuse_fill_dir_flags flags);
+/**
+ * Configuration of the high-level API
+ *
+ * This structure is initialized from the arguments passed to
+ * fuse_new(), and then passed to the file system's init() handler
+ * which should ensure that the configuration is compatible with the
+ * file system implementation.
+ */
+struct fuse_config {
+ /**
+ * If `set_gid` is non-zero, the st_gid attribute of each file
+ * is overwritten with the value of `gid`.
+ */
+ int set_gid;
+ unsigned int gid;
+
+ /**
+ * If `set_uid` is non-zero, the st_uid attribute of each file
+ * is overwritten with the value of `uid`.
+ */
+ int set_uid;
+ unsigned int uid;
+
+ /**
+ * If `set_mode` is non-zero, any permission bits set in
+ * `umask` are unset in the st_mode attribute of each file.
+ */
+ int set_mode;
+ unsigned int umask;
+
+ /**
+ * The timeout in seconds for which name lookups will be
+ * cached.
+ */
+ double entry_timeout;
+
+ /**
+ * The timeout in seconds for which a negative lookup will be
+ * cached. This means that if a file did not exist (the lookup
+ * returned ENOENT), the lookup will only be redone after the
+ * timeout, and the file/directory will be assumed not to
+ * exist until then. A value of zero means that negative
+ * lookups are not cached.
+ */
+ double negative_timeout;
+
+ /**
+ * The timeout in seconds for which file/directory attributes
+ * (as returned by e.g. the `getattr` handler) are cached.
+ */
+ double attr_timeout;
+
+ /**
+ * Allow requests to be interrupted
+ */
+ int intr;
+
+ /**
+ * Specify which signal number to send to the filesystem when
+ * a request is interrupted. The default is hardcoded to
+ * USR1.
+ */
+ int intr_signal;
+
+ /**
+ * Normally, FUSE assigns inodes to paths only for as long as
+ * the kernel is aware of them. With this option inodes are
+ * instead remembered for at least this many seconds. This
+ * will require more memory, but may be necessary when using
+ * applications that make use of inode numbers.
+ *
+ * A value of -1 means that inodes will be remembered for the
+ * entire life-time of the file-system process.
+ */
+ int remember;
+
+ /**
+ * The default behavior is that if an open file is deleted,
+ * the file is renamed to a hidden file (.fuse_hiddenXXX), and
+ * only removed when the file is finally released. This
+ * relieves the filesystem implementation of having to deal
+ * with this problem. This option disables the hiding
+ * behavior, and files are removed immediately in an unlink
+ * operation (or in a rename operation which overwrites an
+ * existing file).
+ *
+ * It is recommended that you not use the hard_remove
+ * option. When hard_remove is set, the following libc
+ * functions fail on unlinked files (returning errno of
+ * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2),
+ * ftruncate(2), fstat(2), fchmod(2), fchown(2)
+ */
+ int hard_remove;
+
+ /**
+ * Honor the st_ino field in the functions getattr() and
+ * fill_dir(). This value is used to fill in the st_ino field
+ * in the stat(2), lstat(2), fstat(2) functions and the d_ino
+ * field in the readdir(2) function. The filesystem does not
+ * have to guarantee uniqueness, however some applications
+ * rely on this value being unique for the whole filesystem.
+ *
+ * Note that this does *not* affect the inode that libfuse
+ * and the kernel use internally (also called the "nodeid").
+ */
+ int use_ino;
+
+ /**
+ * If use_ino option is not given, still try to fill in the
+ * d_ino field in readdir(2). If the name was previously
+ * looked up, and is still in the cache, the inode number
+ * found there will be used. Otherwise it will be set to -1.
+ * If use_ino option is given, this option is ignored.
+ */
+ int readdir_ino;
+
+ /**
+ * This option disables the use of page cache (file content cache)
+ * in the kernel for this filesystem. This has several effects:
+ *
+ * 1. Each read(2) or write(2) system call will initiate one
+ * or more read or write operations, data will not be
+ * cached in the kernel.
+ *
+ * 2. The return value of the read() and write() system calls
+ * will correspond to the return values of the read and
+ * write operations. This is useful for example if the
+ * file size is not known in advance (before reading it).
+ *
+ * Internally, enabling this option causes fuse to set the
+ * `direct_io` field of `struct fuse_file_info` - overwriting
+ * any value that was put there by the file system.
+ */
+ int direct_io;
+
+ /**
+ * This option disables flushing the cache of the file
+ * contents on every open(2). This should only be enabled on
+ * filesystems where the file data is never changed
+ * externally (not through the mounted FUSE filesystem). Thus
+ * it is not suitable for network filesystems and other
+ * intermediate filesystems.
+ *
+ * NOTE: if this option is not specified (and neither is
+ * direct_io), data is still cached after the open(2), so a
+ * read(2) system call will not always initiate a read
+ * operation.
+ *
+ * Internally, enabling this option causes fuse to set the
+ * `keep_cache` field of `struct fuse_file_info` - overwriting
+ * any value that was put there by the file system.
+ */
+ int kernel_cache;
+
+ /**
+ * This option is an alternative to `kernel_cache`. Instead of
+ * unconditionally keeping cached data, the cached data is
+ * invalidated on open(2) if if the modification time or the
+ * size of the file has changed since it was last opened.
+ */
+ int auto_cache;
+
+ /**
+ * The timeout in seconds for which file attributes are cached
+ * for the purpose of checking if auto_cache should flush the
+ * file data on open.
+ */
+ int ac_attr_timeout_set;
+ double ac_attr_timeout;
+
+ /**
+ * If this option is given the file-system handlers for the
+ * following operations will not receive path information:
+ * read, write, flush, release, fsync, readdir, releasedir,
+ * fsyncdir, lock, ioctl and poll.
+ *
+ * For the truncate, getattr, chmod, chown and utimens
+ * operations the path will be provided only if the struct
+ * fuse_file_info argument is NULL.
+ */
+ int nullpath_ok;
+
+ /**
+ * The remaining options are used by libfuse internally and
+ * should not be touched.
+ */
+ int show_help;
+ char *modules;
+ int debug;
+};
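Since this structure is handed to the filesystem's init() handler (declared further down in struct fuse_operations), the caching knobs can be tuned there. A minimal sketch; the timeout values are arbitrary examples, not recommendations:

static void *sketch_init(struct fuse_conn_info *conn, struct fuse_config *cfg)
{
    (void)conn;

    cfg->entry_timeout = 1.0;       /* cache name lookups for one second */
    cfg->attr_timeout = 1.0;        /* cache attributes for one second */
    cfg->negative_timeout = 0.0;    /* do not cache negative lookups */
    cfg->use_ino = 1;               /* honour st_ino from getattr()/readdir() */

    return NULL;                    /* becomes fuse_get_context()->private_data */
}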
+
+
+/**
+ * The file system operations:
+ *
+ * Most of these should work very similarly to the well known UNIX
+ * file system operations. A major exception is that instead of
+ * returning an error in 'errno', the operation should return the
+ * negated error value (-errno) directly.
+ *
+ * All methods are optional, but some are essential for a useful
+ * filesystem (e.g. getattr). Open, flush, release, fsync, opendir,
+ * releasedir, fsyncdir, access, create, truncate, lock, init and
+ * destroy are special purpose methods, without which a full featured
+ * filesystem can still be implemented.
+ *
+ * In general, all methods are expected to perform any necessary
+ * permission checking. However, a filesystem may delegate this task
+ * to the kernel by passing the `default_permissions` mount option to
+ * `fuse_new()`. In this case, methods will only be called if
+ * the kernel's permission check has succeeded.
+ *
+ * Almost all operations take a path which can be of any length.
+ */
+struct fuse_operations {
+ /**
+ * Get file attributes.
+ *
+ * Similar to stat(). The 'st_dev' and 'st_blksize' fields are
+ * ignored. The 'st_ino' field is ignored except if the 'use_ino'
+ * mount option is given. In that case it is passed to userspace,
+ * but libfuse and the kernel will still assign a different
+ * inode for internal use (called the "nodeid").
+ *
+ * `fi` will always be NULL if the file is not currently open, but
+ * may also be NULL if the file is open.
+ */
+ int (*getattr)(const char *, struct stat *, struct fuse_file_info *fi);
+
+ /**
+ * Read the target of a symbolic link
+ *
+ * The buffer should be filled with a null terminated string. The
+ * buffer size argument includes the space for the terminating
+ * null character. If the linkname is too long to fit in the
+ * buffer, it should be truncated. The return value should be 0
+ * for success.
+ */
+ int (*readlink)(const char *, char *, size_t);
+
+ /**
+ * Create a file node
+ *
+ * This is called for creation of all non-directory, non-symlink
+ * nodes. If the filesystem defines a create() method, then for
+ * regular files that will be called instead.
+ */
+ int (*mknod)(const char *, mode_t, dev_t);
+
+ /**
+ * Create a directory
+ *
+ * Note that the mode argument may not have the type specification
+ * bits set, i.e. S_ISDIR(mode) can be false. To obtain the
+ * correct directory type bits use mode|S_IFDIR
+ */
+ int (*mkdir)(const char *, mode_t);
+
+ /** Remove a file */
+ int (*unlink)(const char *);
+
+ /** Remove a directory */
+ int (*rmdir)(const char *);
+
+ /** Create a symbolic link */
+ int (*symlink)(const char *, const char *);
+
+ /**
+ * Rename a file
+ *
+ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If
+ * RENAME_NOREPLACE is specified, the filesystem must not
+ * overwrite *newname* if it exists and must return an error
+ * instead. If `RENAME_EXCHANGE` is specified, the filesystem
+ * must atomically exchange the two files, i.e. both must
+ * exist and neither may be deleted.
+ */
+ int (*rename)(const char *, const char *, unsigned int flags);
+
+ /** Create a hard link to a file */
+ int (*link)(const char *, const char *);
+
+ /**
+ * Change the permission bits of a file
+ *
+ * `fi` will always be NULL if the file is not currently open, but
+ * may also be NULL if the file is open.
+ */
+ int (*chmod)(const char *, mode_t, struct fuse_file_info *fi);
+
+ /**
+ * Change the owner and group of a file
+ *
+ * `fi` will always be NULL if the file is not currently open, but
+ * may also be NULL if the file is open.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ */
+ int (*chown)(const char *, uid_t, gid_t, struct fuse_file_info *fi);
+
+ /**
+ * Change the size of a file
+ *
+ * `fi` will always be NULL if the file is not currently open, but
+ * may also be NULL if the file is open.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ */
+ int (*truncate)(const char *, off_t, struct fuse_file_info *fi);
+
+ /**
+ * Open a file
+ *
+ * Open flags are available in fi->flags. The following rules
+ * apply.
+ *
+ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be
+ * filtered out / handled by the kernel.
+ *
+ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH)
+ * should be used by the filesystem to check if the operation is
+ * permitted. If the ``-o default_permissions`` mount option is
+ * given, this check is already done by the kernel before calling
+ * open() and may thus be omitted by the filesystem.
+ *
+ * - When writeback caching is enabled, the kernel may send
+ * read requests even for files opened with O_WRONLY. The
+ * filesystem should be prepared to handle this.
+ *
+ * - When writeback caching is disabled, the filesystem is
+ * expected to properly handle the O_APPEND flag and ensure
+ * that each write is appending to the end of the file.
+ *
+ * - When writeback caching is enabled, the kernel will
+ * handle O_APPEND. However, unless all changes to the file
+ * come through the kernel this will not work reliably. The
+ * filesystem should thus either ignore the O_APPEND flag
+ * (and let the kernel handle it), or return an error
+ * (indicating that reliable O_APPEND is not available).
+ *
+ * Filesystem may store an arbitrary file handle (pointer,
+ * index, etc) in fi->fh, and use this in all other file
+ * operations (read, write, flush, release, fsync).
+ *
+ * Filesystem may also implement stateless file I/O and not store
+ * anything in fi->fh.
+ *
+ * There are also some flags (direct_io, keep_cache) which the
+ * filesystem may set in fi, to change the way the file is opened.
+ * See fuse_file_info structure in <fuse_common.h> for more details.
+ *
+ * If this request is answered with an error code of ENOSYS
+ * and FUSE_CAP_NO_OPEN_SUPPORT is set in
+ * `fuse_conn_info.capable`, this is treated as success and
+ * future calls to open will also succeed without being sent
+ * to the filesystem process.
+ *
+ */
+ int (*open)(const char *, struct fuse_file_info *);
+
+ /**
+ * Read data from an open file
+ *
+ * Read should return exactly the number of bytes requested except
+ * on EOF or error, otherwise the rest of the data will be
+ * substituted with zeroes. An exception to this is when the
+ * 'direct_io' mount option is specified, in which case the return
+ * value of the read system call will reflect the return value of
+ * this operation.
+ */
+ int (*read)(const char *, char *, size_t, off_t, struct fuse_file_info *);
+
+ /**
+ * Write data to an open file
+ *
+ * Write should return exactly the number of bytes requested
+ * except on error. An exception to this is when the 'direct_io'
+ * mount option is specified (see read operation).
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ */
+ int (*write)(const char *, const char *, size_t, off_t,
+ struct fuse_file_info *);
+
+ /**
+ * Get file system statistics
+ *
+ * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored
+ */
+ int (*statfs)(const char *, struct statvfs *);
+
+ /**
+ * Possibly flush cached data
+ *
+ * BIG NOTE: This is not equivalent to fsync(). It's not a
+ * request to sync dirty data.
+ *
+ * Flush is called on each close() of a file descriptor, as opposed to
+ * release which is called on the close of the last file descriptor for
+ * a file. Under Linux, errors returned by flush() will be passed to
+ * userspace as errors from close(), so flush() is a good place to write
+ * back any cached dirty data. However, many applications ignore errors
+ * on close(), and on non-Linux systems, close() may succeed even if flush()
+ * returns an error. For these reasons, filesystems should not assume
+ * that errors returned by flush will ever be noticed or even
+ * delivered.
+ *
+ * NOTE: The flush() method may be called more than once for each
+ * open(). This happens if more than one file descriptor refers to an
+ * open file handle, e.g. due to dup(), dup2() or fork() calls. It is
+ * not possible to determine if a flush is final, so each flush should
+ * be treated equally. Multiple write-flush sequences are relatively
+ * rare, so this shouldn't be a problem.
+ *
+ * Filesystems shouldn't assume that flush will be called at any
+ * particular point. It may be called more times than expected, or not
+ * at all.
+ *
+ * [close]:
+ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html
+ */
+ int (*flush)(const char *, struct fuse_file_info *);
+
+ /**
+ * Release an open file
+ *
+ * Release is called when there are no more references to an open
+ * file: all file descriptors are closed and all memory mappings
+ * are unmapped.
+ *
+ * For every open() call there will be exactly one release() call
+ * with the same flags and file handle. It is possible to
+ * have a file opened more than once, in which case only the last
+ * release will mean that no more reads/writes will happen on the
+ * file. The return value of release is ignored.
+ */
+ int (*release)(const char *, struct fuse_file_info *);
+
+ /*
+ * Synchronize file contents
+ *
+ * If the datasync parameter is non-zero, then only the user data
+ * should be flushed, not the meta data.
+ */
+ int (*fsync)(const char *, int, struct fuse_file_info *);
+
+ /** Set extended attributes */
+ int (*setxattr)(const char *, const char *, const char *, size_t, int);
+
+ /** Get extended attributes */
+ int (*getxattr)(const char *, const char *, char *, size_t);
+
+ /** List extended attributes */
+ int (*listxattr)(const char *, char *, size_t);
+
+ /** Remove extended attributes */
+ int (*removexattr)(const char *, const char *);
+
+ /*
+ * Open directory
+ *
+ * Unless the 'default_permissions' mount option is given,
+ * this method should check if opendir is permitted for this
+ * directory. Optionally opendir may also return an arbitrary
+ * filehandle in the fuse_file_info structure, which will be
+ * passed to readdir, releasedir and fsyncdir.
+ */
+ int (*opendir)(const char *, struct fuse_file_info *);
+
+ /*
+ * Read directory
+ *
+ * The filesystem may choose between two modes of operation:
+ *
+ * 1) The readdir implementation ignores the offset parameter, and
+ * passes zero to the filler function's offset. The filler
+ * function will not return '1' (unless an error happens), so the
+ * whole directory is read in a single readdir operation.
+ *
+ * 2) The readdir implementation keeps track of the offsets of the
+ * directory entries. It uses the offset parameter and always
+ * passes non-zero offset to the filler function. When the buffer
+ * is full (or an error happens) the filler function will return
+ * '1'.
+ */
+ int (*readdir)(const char *, void *, fuse_fill_dir_t, off_t,
+ struct fuse_file_info *, enum fuse_readdir_flags);
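A sketch of mode 1 from the comment above (the offset is ignored and the whole directory is emitted in one call); the single "hello" entry and the helper name are placeholders:

static int sketch_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
                          off_t offset, struct fuse_file_info *fi,
                          enum fuse_readdir_flags flags)
{
    (void)path; (void)offset; (void)fi; (void)flags;

    /* Mode 1: always pass offset 0 to the filler and emit every entry. */
    filler(buf, ".", NULL, 0, 0);
    filler(buf, "..", NULL, 0, 0);
    filler(buf, "hello", NULL, 0, 0);   /* placeholder entry */
    return 0;
}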
+
+ /**
+ * Release directory
+ */
+ int (*releasedir)(const char *, struct fuse_file_info *);
+
+ /**
+ * Synchronize directory contents
+ *
+ * If the datasync parameter is non-zero, then only the user data
+ * should be flushed, not the meta data
+ */
+ int (*fsyncdir)(const char *, int, struct fuse_file_info *);
+
+ /**
+ * Initialize filesystem
+ *
+ * The return value will be passed in the `private_data` field of
+ * `struct fuse_context` to all file operations, and as a
+ * parameter to the destroy() method. It overrides the initial
+ * value provided to fuse_main() / fuse_new().
+ */
+ void *(*init)(struct fuse_conn_info *conn, struct fuse_config *cfg);
+
+ /**
+ * Clean up filesystem
+ *
+ * Called on filesystem exit.
+ */
+ void (*destroy)(void *private_data);
+
+ /**
+ * Check file access permissions
+ *
+ * This will be called for the access() system call. If the
+ * 'default_permissions' mount option is given, this method is not
+ * called.
+ *
+ * This method is not called under Linux kernel versions 2.4.x
+ */
+ int (*access)(const char *, int);
+
+ /**
+ * Create and open a file
+ *
+ * If the file does not exist, first create it with the specified
+ * mode, and then open it.
+ *
+ * If this method is not implemented or under Linux kernel
+ * versions earlier than 2.6.15, the mknod() and open() methods
+ * will be called instead.
+ */
+ int (*create)(const char *, mode_t, struct fuse_file_info *);
+
+ /**
+ * Perform POSIX file locking operation
+ *
+ * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW.
+ *
+ * For the meaning of fields in 'struct flock' see the man page
+ * for fcntl(2). The l_whence field will always be set to
+ * SEEK_SET.
+ *
+ * For checking lock ownership, the 'fuse_file_info->owner'
+ * argument must be used.
+ *
+ * For F_GETLK operation, the library will first check currently
+ * held locks, and if a conflicting lock is found it will return
+ * information without calling this method. This ensures that
+ * for local locks the l_pid field is correctly filled in. The
+ * results may not be accurate in case of race conditions and in
+ * the presence of hard links, but it's unlikely that an
+ * application would rely on accurate GETLK results in these
+ * cases. If a conflicting lock is not found, this method will be
+ * called, and the filesystem may fill out l_pid by a meaningful
+ * value, or it may leave this field zero.
+ *
+ * For F_SETLK and F_SETLKW the l_pid field will be set to the pid
+ * of the process performing the locking operation.
+ *
+ * Note: if this method is not implemented, the kernel will still
+ * allow file locking to work locally. Hence it is only
+ * interesting for network filesystems and similar.
+ */
+ int (*lock)(const char *, struct fuse_file_info *, int cmd, struct flock *);
+
+ /**
+ * Change the access and modification times of a file with
+ * nanosecond resolution
+ *
+ * This supersedes the old utime() interface. New applications
+ * should use this.
+ *
+ * `fi` will always be NULL if the file is not currently open, but
+ * may also be NULL if the file is open.
+ *
+ * See the utimensat(2) man page for details.
+ */
+ int (*utimens)(const char *, const struct timespec tv[2],
+ struct fuse_file_info *fi);
+
+ /**
+ * Map block index within file to block index within device
+ *
+ * Note: This makes sense only for block device backed filesystems
+ * mounted with the 'blkdev' option
+ */
+ int (*bmap)(const char *, size_t blocksize, uint64_t *idx);
+
+ /**
+ * Ioctl
+ *
+ * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in
+ * 64bit environment. The size and direction of data is
+ * determined by _IOC_*() decoding of cmd. For _IOC_NONE,
+ * data will be NULL, for _IOC_WRITE data is out area, for
+ * _IOC_READ in area and if both are set in/out area. In all
+ * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes.
+ *
+ * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a
+ * directory file handle.
+ *
+ * Note: the unsigned long request submitted by the application
+ * is truncated to 32 bits.
+ */
+ int (*ioctl)(const char *, unsigned int cmd, void *arg,
+ struct fuse_file_info *, unsigned int flags, void *data);
+
+ /**
+ * Poll for IO readiness events
+ *
+ * Note: If ph is non-NULL, the client should notify
+ * when IO readiness events occur by calling
+ * fuse_notify_poll() with the specified ph.
+ *
+ * Regardless of the number of times poll with a non-NULL ph
+ * is received, a single notification is enough to clear all of them.
+ * Notifying more times incurs overhead but doesn't harm
+ * correctness.
+ *
+ * The callee is responsible for destroying ph with
+ * fuse_pollhandle_destroy() when no longer in use.
+ */
+ int (*poll)(const char *, struct fuse_file_info *,
+ struct fuse_pollhandle *ph, unsigned *reventsp);
+
+ /*
+ * Write contents of buffer to an open file
+ *
+ * Similar to the write() method, but data is supplied in a
+ * generic buffer. Use fuse_buf_copy() to transfer data to
+ * the destination.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ */
+ int (*write_buf)(const char *, struct fuse_bufvec *buf, off_t off,
+ struct fuse_file_info *);
+
+ /*
+ * Store data from an open file in a buffer
+ *
+ * Similar to the read() method, but data is stored and
+ * returned in a generic buffer.
+ *
+ * No actual copying of data has to take place, the source
+ * file descriptor may simply be stored in the buffer for
+ * later data transfer.
+ *
+ * The buffer must be allocated dynamically and stored at the
+ * location pointed to by bufp. If the buffer contains memory
+ * regions, they too must be allocated using malloc(). The
+ * allocated memory will be freed by the caller.
+ */
+ int (*read_buf)(const char *, struct fuse_bufvec **bufp, size_t size,
+ off_t off, struct fuse_file_info *);
+ /**
+ * Perform BSD file locking operation
+ *
+ * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN
+ *
+ * Nonblocking requests will be indicated by ORing LOCK_NB to
+ * the above operations
+ *
+ * For more information see the flock(2) manual page.
+ *
+ * Additionally fi->owner will be set to a value unique to
+ * this open file. This same value will be supplied to
+ * ->release() when the file is released.
+ *
+ * Note: if this method is not implemented, the kernel will still
+ * allow file locking to work locally. Hence it is only
+ * interesting for network filesystems and similar.
+ */
+ int (*flock)(const char *, struct fuse_file_info *, int op);
+
+ /**
+ * Allocates space for an open file
+ *
+ * This function ensures that the required space is allocated for the
+ * specified file. If this function returns success then any subsequent
+ * write request to the specified range is guaranteed not to fail because
+ * of a lack of space on the file system media.
+ */
+ int (*fallocate)(const char *, int, off_t, off_t, struct fuse_file_info *);
+
+ /**
+ * Copy a range of data from one file to another
+ *
+ * Performs an optimized copy between two file descriptors without the
+ * additional cost of transferring data through the FUSE kernel module
+ * to user space (glibc) and then back into the FUSE filesystem again.
+ *
+ * In case this method is not implemented, glibc falls back to reading
+ * data from the source and writing to the destination, effectively
+ * doing an inefficient copy of the data.
+ */
+ ssize_t (*copy_file_range)(const char *path_in,
+ struct fuse_file_info *fi_in, off_t offset_in,
+ const char *path_out,
+ struct fuse_file_info *fi_out, off_t offset_out,
+ size_t size, int flags);
+
+ /**
+ * Find next data or hole after the specified offset
+ */
+ off_t (*lseek)(const char *, off_t off, int whence,
+ struct fuse_file_info *);
+};
+
+/*
+ * Extra context that may be needed by some filesystems
+ *
+ * The uid, gid and pid fields are not filled in case of a writepage
+ * operation.
+ */
+struct fuse_context {
+ /** Pointer to the fuse object */
+ struct fuse *fuse;
+
+ /** User ID of the calling process */
+ uid_t uid;
+
+ /** Group ID of the calling process */
+ gid_t gid;
+
+ /** Process ID of the calling thread */
+ pid_t pid;
+
+ /** Private filesystem data */
+ void *private_data;
+
+ /** Umask of the calling process */
+ mode_t umask;
+};
+
+/**
+ * Main function of FUSE.
+ *
+ * This is for the lazy. This is all that has to be called from the
+ * main() function.
+ *
+ * This function does the following:
+ * - parses command line options, and handles --help and
+ * --version
+ * - installs signal handlers for INT, HUP, TERM and PIPE
+ * - registers an exit handler to unmount the filesystem on program exit
+ * - creates a fuse handle
+ * - registers the operations
+ * - calls either the single-threaded or the multi-threaded event loop
+ *
+ * Most file systems will have to parse some file-system specific
+ * arguments before calling this function. It is recommended to do
+ * this with fuse_opt_parse() and a processing function that passes
+ * through any unknown options (this can also be achieved by just
+ * passing NULL as the processing function). That way, the remaining
+ * options can be passed directly to fuse_main().
+ *
+ * fuse_main() accepts all options that can be passed to
+ * fuse_parse_cmdline(), fuse_new(), or fuse_session_new().
+ *
+ * Option parsing skips argv[0], which is assumed to contain the
+ * program name. This element must always be present and is used to
+ * construct a basic ``usage: `` message for the --help
+ * output. argv[0] may also be set to the empty string. In this case
+ * the usage message is suppressed. This can be used by file systems
+ * to print their own usage line first. See hello.c for an example of
+ * how to do this.
+ *
+ * Note: this is currently implemented as a macro.
+ *
+ * The following error codes may be returned from fuse_main():
+ * 1: Invalid option arguments
+ * 2: No mount point specified
+ * 3: FUSE setup failed
+ * 4: Mounting failed
+ * 5: Failed to daemonize (detach from session)
+ * 6: Failed to set up signal handlers
+ * 7: An error occurred during the life of the file system
+ *
+ * @param argc the argument counter passed to the main() function
+ * @param argv the argument vector passed to the main() function
+ * @param op the file system operation
+ * @param private_data Initial value for the `private_data`
+ * field of `struct fuse_context`. May be overridden by the
+ * `struct fuse_operations.init` handler.
+ * @return 0 on success, nonzero on failure
+ *
+ * Example usage, see hello.c
+ */
+/*
+ * int fuse_main(int argc, char *argv[], const struct fuse_operations *op,
+ * void *private_data);
+ */
+#define fuse_main(argc, argv, op, private_data) \
+ fuse_main_real(argc, argv, op, sizeof(*(op)), private_data)
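A minimal sketch of the "lazy" path described above, written as if against an installed libfuse; the FUSE_USE_VERSION value and the getattr-only filesystem are assumptions of the example, not part of this header:

#define FUSE_USE_VERSION 31          /* must be defined before the fuse header */
#include <fuse.h>
#include <errno.h>
#include <string.h>

/* Expose a single empty root directory. */
static int sketch_getattr(const char *path, struct stat *st,
                          struct fuse_file_info *fi)
{
    (void)fi;
    memset(st, 0, sizeof(*st));
    if (strcmp(path, "/") == 0) {
        st->st_mode = S_IFDIR | 0755;
        st->st_nlink = 2;
        return 0;
    }
    return -ENOENT;
}

static const struct fuse_operations sketch_ops = {
    .getattr = sketch_getattr,
};

int main(int argc, char *argv[])
{
    /* Parses options, mounts, runs the event loop, and cleans up on exit. */
    return fuse_main(argc, argv, &sketch_ops, NULL);
}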
+
+/*
+ * More detailed API
+ */
+
+/**
+ * Print available options (high- and low-level) to stdout. This is
+ * not an exhaustive list, but includes only those options that may be
+ * of interest to an end-user of a file system.
+ *
+ * The function looks at the argument vector only to determine if
+ * there are additional modules to be loaded (module=foo option),
+ * and attempts to call their help functions as well.
+ *
+ * @param args the argument vector.
+ */
+void fuse_lib_help(struct fuse_args *args);
+
+/**
+ * Create a new FUSE filesystem.
+ *
+ * This function accepts most file-system independent mount options
+ * (like context, nodev, ro - see mount(8)), as well as the
+ * FUSE-specific mount options from mount.fuse(8).
+ *
+ * If the --help option is specified, the function writes a help text
+ * to stdout and returns NULL.
+ *
+ * Option parsing skips argv[0], which is assumed to contain the
+ * program name. This element must always be present and is used to
+ * construct a basic ``usage: `` message for the --help output. If
+ * argv[0] is set to the empty string, no usage message is included in
+ * the --help output.
+ *
+ * If an unknown option is passed in, an error message is written to
+ * stderr and the function returns NULL.
+ *
+ * @param args argument vector
+ * @param op the filesystem operations
+ * @param op_size the size of the fuse_operations structure
+ * @param private_data Initial value for the `private_data`
+ * field of `struct fuse_context`. May be overridden by the
+ * `struct fuse_operations.init` handler.
+ * @return the created FUSE handle
+ */
+#if FUSE_USE_VERSION == 30
+struct fuse *fuse_new_30(struct fuse_args *args,
+ const struct fuse_operations *op, size_t op_size,
+ void *private_data);
+#define fuse_new(args, op, size, data) fuse_new_30(args, op, size, data)
+#else
+struct fuse *fuse_new(struct fuse_args *args, const struct fuse_operations *op,
+ size_t op_size, void *private_data);
+#endif
+
+/**
+ * Mount a FUSE file system.
+ *
+ * @param mountpoint the mount point path
+ * @param f the FUSE handle
+ *
+ * @return 0 on success, -1 on failure.
+ **/
+int fuse_mount(struct fuse *f, const char *mountpoint);
+
+/**
+ * Unmount a FUSE file system.
+ *
+ * See fuse_session_unmount() for additional information.
+ *
+ * @param f the FUSE handle
+ **/
+void fuse_unmount(struct fuse *f);
+
+/**
+ * Destroy the FUSE handle.
+ *
+ * NOTE: This function does not unmount the filesystem. If this is
+ * needed, call fuse_unmount() before calling this function.
+ *
+ * @param f the FUSE handle
+ */
+void fuse_destroy(struct fuse *f);
+
+/**
+ * FUSE event loop.
+ *
+ * Requests from the kernel are processed, and the appropriate
+ * operations are called.
+ *
+ * For a description of the return value and the conditions when the
+ * event loop exits, refer to the documentation of
+ * fuse_session_loop().
+ *
+ * @param f the FUSE handle
+ * @return see fuse_session_loop()
+ *
+ * See also: fuse_loop_mt()
+ */
+int fuse_loop(struct fuse *f);
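+
+/*
+ * Putting the detailed API together (a sketch with error handling trimmed;
+ * "args" is assumed to be a struct fuse_args built via fuse_opt.h and
+ * "mountpoint" a path parsed from the command line):
+ *
+ *   struct fuse *f = fuse_new(&args, &ops, sizeof(ops), NULL);
+ *   if (!f) {
+ *       return 1;
+ *   }
+ *   if (fuse_mount(f, mountpoint) != 0) {
+ *       fuse_destroy(f);
+ *       return 1;
+ *   }
+ *   ret = fuse_loop(f);
+ *   fuse_unmount(f);
+ *   fuse_destroy(f);
+ */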
+
+/**
+ * Flag session as terminated
+ *
+ * This function will cause any running event loops to exit on
+ * the next opportunity.
+ *
+ * @param f the FUSE handle
+ */
+void fuse_exit(struct fuse *f);
+
+/**
+ * Get the current context
+ *
+ * The context is only valid for the duration of a filesystem
+ * operation, and thus must not be stored and used later.
+ *
+ * @return the context
+ */
+struct fuse_context *fuse_get_context(void);
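+
+/*
+ * Example (a hypothetical handler): the context is normally consulted from
+ * inside an operation callback, e.g. to look at the caller's credentials:
+ *
+ *   static int myfs_open(const char *path, struct fuse_file_info *fi)
+ *   {
+ *       struct fuse_context *ctx = fuse_get_context();
+ *
+ *       if (ctx->uid != 0) {
+ *           return -EACCES;
+ *       }
+ *       ...
+ *   }
+ */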
+
+/**
+ * Get the current supplementary group IDs for the current request
+ *
+ * Similar to the getgroups(2) system call, except the return value is
+ * always the total number of group IDs, even if it is larger than the
+ * specified size.
+ *
+ * The current fuse kernel module in Linux (as of 2.6.30) doesn't pass
+ * the group list to userspace, hence this function needs to parse
+ * "/proc/$TID/task/$TID/status" to get the group IDs.
+ *
+ * This feature may not be supported on all operating systems. In
+ * such a case this function will return -ENOSYS.
+ *
+ * @param size size of given array
+ * @param list array of group IDs to be filled in
+ * @return the total number of supplementary group IDs or -errno on failure
+ */
+int fuse_getgroups(int size, gid_t list[]);
+
+/**
+ * Check if the current request has already been interrupted
+ *
+ * @return 1 if the request has been interrupted, 0 otherwise
+ */
+int fuse_interrupted(void);
+
+/**
+ * Invalidates cache for the given path.
+ *
+ * This calls fuse_lowlevel_notify_inval_inode internally.
+ *
+ * @return 0 on successful invalidation, negative error value otherwise.
+ * This routine may return -ENOENT to indicate that there was
+ * no entry to be invalidated, e.g., because the path has not
+ * been seen before or has been forgotten; this should not be
+ * considered to be an error.
+ */
+int fuse_invalidate_path(struct fuse *f, const char *path);
+
+/**
+ * The real main function
+ *
+ * Do not call this directly, use fuse_main()
+ */
+int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op,
+ size_t op_size, void *private_data);
+
+/**
+ * Start the cleanup thread when using option "remember".
+ *
+ * This is done automatically by fuse_loop_mt()
+ * @param fuse struct fuse pointer for fuse instance
+ * @return 0 on success and -1 on error
+ */
+int fuse_start_cleanup_thread(struct fuse *fuse);
+
+/**
+ * Stop the cleanup thread when using option "remember".
+ *
+ * This is done automatically by fuse_loop_mt()
+ * @param fuse struct fuse pointer for fuse instance
+ */
+void fuse_stop_cleanup_thread(struct fuse *fuse);
+
+/**
+ * Iterate over the cache, removing stale entries;
+ * use in conjunction with "-oremember".
+ *
+ * NOTE: This is already done for the standard sessions
+ *
+ * @param fuse struct fuse pointer for fuse instance
+ * @return the number of seconds until the next cleanup
+ */
+int fuse_clean_cache(struct fuse *fuse);
+
+/*
+ * Stacking API
+ */
+
+/**
+ * Fuse filesystem object
+ *
+ * This opaque object represents a filesystem layer
+ */
+struct fuse_fs;
+
+/*
+ * These functions call the relevant filesystem operation, and return
+ * the result.
+ *
+ * If the operation is not defined, they return -ENOSYS, with the
+ * exception of fuse_fs_open, fuse_fs_release, fuse_fs_opendir,
+ * fuse_fs_releasedir and fuse_fs_statfs, which return 0.
+ */
+
+int fuse_fs_getattr(struct fuse_fs *fs, const char *path, struct stat *buf,
+ struct fuse_file_info *fi);
+int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, const char *newpath,
+ unsigned int flags);
+int fuse_fs_unlink(struct fuse_fs *fs, const char *path);
+int fuse_fs_rmdir(struct fuse_fs *fs, const char *path);
+int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, const char *path);
+int fuse_fs_link(struct fuse_fs *fs, const char *oldpath, const char *newpath);
+int fuse_fs_release(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi);
+int fuse_fs_open(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi);
+int fuse_fs_read(struct fuse_fs *fs, const char *path, char *buf, size_t size,
+ off_t off, struct fuse_file_info *fi);
+int fuse_fs_read_buf(struct fuse_fs *fs, const char *path,
+ struct fuse_bufvec **bufp, size_t size, off_t off,
+ struct fuse_file_info *fi);
+int fuse_fs_write(struct fuse_fs *fs, const char *path, const char *buf,
+ size_t size, off_t off, struct fuse_file_info *fi);
+int fuse_fs_write_buf(struct fuse_fs *fs, const char *path,
+ struct fuse_bufvec *buf, off_t off,
+ struct fuse_file_info *fi);
+int fuse_fs_fsync(struct fuse_fs *fs, const char *path, int datasync,
+ struct fuse_file_info *fi);
+int fuse_fs_flush(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi);
+int fuse_fs_statfs(struct fuse_fs *fs, const char *path, struct statvfs *buf);
+int fuse_fs_opendir(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi);
+int fuse_fs_readdir(struct fuse_fs *fs, const char *path, void *buf,
+ fuse_fill_dir_t filler, off_t off,
+ struct fuse_file_info *fi, enum fuse_readdir_flags flags);
+int fuse_fs_fsyncdir(struct fuse_fs *fs, const char *path, int datasync,
+ struct fuse_file_info *fi);
+int fuse_fs_releasedir(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi);
+int fuse_fs_create(struct fuse_fs *fs, const char *path, mode_t mode,
+ struct fuse_file_info *fi);
+int fuse_fs_lock(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi, int cmd, struct flock *lock);
+int fuse_fs_flock(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi, int op);
+int fuse_fs_chmod(struct fuse_fs *fs, const char *path, mode_t mode,
+ struct fuse_file_info *fi);
+int fuse_fs_chown(struct fuse_fs *fs, const char *path, uid_t uid, gid_t gid,
+ struct fuse_file_info *fi);
+int fuse_fs_truncate(struct fuse_fs *fs, const char *path, off_t size,
+ struct fuse_file_info *fi);
+int fuse_fs_utimens(struct fuse_fs *fs, const char *path,
+ const struct timespec tv[2], struct fuse_file_info *fi);
+int fuse_fs_access(struct fuse_fs *fs, const char *path, int mask);
+int fuse_fs_readlink(struct fuse_fs *fs, const char *path, char *buf,
+ size_t len);
+int fuse_fs_mknod(struct fuse_fs *fs, const char *path, mode_t mode,
+ dev_t rdev);
+int fuse_fs_mkdir(struct fuse_fs *fs, const char *path, mode_t mode);
+int fuse_fs_setxattr(struct fuse_fs *fs, const char *path, const char *name,
+ const char *value, size_t size, int flags);
+int fuse_fs_getxattr(struct fuse_fs *fs, const char *path, const char *name,
+ char *value, size_t size);
+int fuse_fs_listxattr(struct fuse_fs *fs, const char *path, char *list,
+ size_t size);
+int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, const char *name);
+int fuse_fs_bmap(struct fuse_fs *fs, const char *path, size_t blocksize,
+ uint64_t *idx);
+int fuse_fs_ioctl(struct fuse_fs *fs, const char *path, unsigned int cmd,
+ void *arg, struct fuse_file_info *fi, unsigned int flags,
+ void *data);
+int fuse_fs_poll(struct fuse_fs *fs, const char *path,
+ struct fuse_file_info *fi, struct fuse_pollhandle *ph,
+ unsigned *reventsp);
+int fuse_fs_fallocate(struct fuse_fs *fs, const char *path, int mode,
+ off_t offset, off_t length, struct fuse_file_info *fi);
+ssize_t fuse_fs_copy_file_range(struct fuse_fs *fs, const char *path_in,
+ struct fuse_file_info *fi_in, off_t off_in,
+ const char *path_out,
+ struct fuse_file_info *fi_out, off_t off_out,
+ size_t len, int flags);
+off_t fuse_fs_lseek(struct fuse_fs *fs, const char *path, off_t off, int whence,
+ struct fuse_file_info *fi);
+void fuse_fs_init(struct fuse_fs *fs, struct fuse_conn_info *conn,
+ struct fuse_config *cfg);
+void fuse_fs_destroy(struct fuse_fs *fs);
+
+int fuse_notify_poll(struct fuse_pollhandle *ph);
+
+/**
+ * Create a new fuse filesystem object
+ *
+ * This is usually called from the factory of a fuse module to create
+ * a new instance of a filesystem.
+ *
+ * @param op the filesystem operations
+ * @param op_size the size of the fuse_operations structure
+ * @param private_data Initial value for the `private_data`
+ * field of `struct fuse_context`. May be overridden by the
+ * `struct fuse_operations.init` handler.
+ * @return a new filesystem object
+ */
+struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size,
+ void *private_data);
+
+/**
+ * Factory for creating filesystem objects
+ *
+ * The function may use and remove options from 'args' that belong
+ * to this module.
+ *
+ * For now the 'fs' vector always contains exactly one filesystem.
+ * This is the filesystem which will be below the newly created
+ * filesystem in the stack.
+ *
+ * @param args the command line arguments
+ * @param fs NULL terminated filesystem object vector
+ * @return the new filesystem object
+ */
+typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args,
+ struct fuse_fs *fs[]);
+/**
+ * Register filesystem module
+ *
+ * If the "-omodules=*name*_:..." option is present, filesystem
+ * objects are created and pushed onto the stack with the *factory_*
+ * function.
+ *
+ * @param name_ the name of this filesystem module
+ * @param factory_ the factory function for this filesystem module
+ */
+#define FUSE_REGISTER_MODULE(name_, factory_) \
+ fuse_module_factory_t fuse_module_##name_##_factory = factory_
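+
+/*
+ * Sketch of a hypothetical stacking module called "logmod": the factory
+ * receives the filesystem below it in fs[0] and wraps it (logmod_ops is
+ * assumed to be defined elsewhere and to forward calls via fuse_fs_*()):
+ *
+ *   static struct fuse_fs *logmod_factory(struct fuse_args *args,
+ *                                         struct fuse_fs *fs[])
+ *   {
+ *       (void)args;
+ *       return fuse_fs_new(&logmod_ops, sizeof(logmod_ops), fs[0]);
+ *   }
+ *
+ *   FUSE_REGISTER_MODULE(logmod, logmod_factory);
+ */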
+
+/** Get session from fuse object */
+struct fuse_session *fuse_get_session(struct fuse *f);
+
+/**
+ * Open a FUSE file descriptor and set up the mount for the given
+ * mountpoint and flags.
+ *
+ * @param mountpoint reference to the mount in the file system
+ * @param options mount options
+ * @return the FUSE file descriptor or -1 upon error
+ */
+int fuse_open_channel(const char *mountpoint, const char *options);
+
+#endif /* FUSE_H_ */
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
new file mode 100644
index 0000000000..686c42c0a5
--- /dev/null
+++ b/tools/virtiofsd/fuse_common.h
@@ -0,0 +1,816 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+/** @file */
+
+#if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_)
+#error \
+ "Never include <fuse_common.h> directly; use <fuse.h> or <fuse_lowlevel.h> instead."
+#endif
+
+#ifndef FUSE_COMMON_H_
+#define FUSE_COMMON_H_
+
+#include "fuse_log.h"
+#include "fuse_opt.h"
+#include <stdint.h>
+#include <sys/types.h>
+
+/** Major version of FUSE library interface */
+#define FUSE_MAJOR_VERSION 3
+
+/** Minor version of FUSE library interface */
+#define FUSE_MINOR_VERSION 2
+
+#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min))
+#define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION)
+
+/**
+ * Information about an open file.
+ *
+ * File Handles are created by the open, opendir, and create methods and closed
+ * by the release and releasedir methods. Multiple file handles may be
+ * concurrently open for the same file. Generally, a client will create one
+ * file handle per file descriptor, though in some cases multiple file
+ * descriptors can share a single file handle.
+ */
+struct fuse_file_info {
+ /** Open flags. Available in open() and release() */
+ int flags;
+
+ /*
+ * In case of a write operation, indicates whether it was caused
+ * by a delayed write from the page cache. If so, then the
+ * context's pid, uid, and gid fields will not be valid, and
+ * the *fh* value may not match the *fh* value that would
+ * have been sent with the corresponding individual write
+ * requests if write caching had been disabled.
+ */
+ unsigned int writepage:1;
+
+ /** Can be filled in by open, to use direct I/O on this file. */
+ unsigned int direct_io:1;
+
+ /*
+ * Can be filled in by open. It signals the kernel that any
+ * currently cached file data (i.e., data that the filesystem
+ * provided the last time the file was open) need not be
+ * invalidated. Has no effect when set in other contexts (in
+ * particular it does nothing when set by opendir()).
+ */
+ unsigned int keep_cache:1;
+
+ /*
+ * Indicates a flush operation. Set in flush operation, also
+ * maybe set in highlevel lock operation and lowlevel release
+ * operation.
+ */
+ unsigned int flush:1;
+
+ /*
+ * Can be filled in by open, to indicate that the file is not
+ * seekable.
+ */
+ unsigned int nonseekable:1;
+
+ /*
+ * Indicates that flock locks for this file should be
+ * released. If set, lock_owner shall contain a valid value.
+ * May only be set in ->release().
+ */
+ unsigned int flock_release:1;
+
+ /*
+ * Can be filled in by opendir. It signals the kernel to
+ * enable caching of entries returned by readdir(). Has no
+ * effect when set in other contexts (in particular it does
+ * nothing when set by open()).
+ */
+ unsigned int cache_readdir:1;
+
+ /* Indicates that suid/sgid bits should be removed upon write */
+ unsigned int kill_priv:1;
+
+
+ /** Padding. Reserved for future use */
+ unsigned int padding:24;
+ unsigned int padding2:32;
+
+ /*
+ * File handle id. May be filled in by filesystem in create,
+ * open, and opendir(). Available in most other file operations on the
+ * same file handle.
+ */
+ uint64_t fh;
+
+ /** Lock owner id. Available in locking operations and flush */
+ uint64_t lock_owner;
+
+ /*
+ * Requested poll events. Available in ->poll. Only set on kernels
+ * which support it. If unsupported, this field is set to zero.
+ */
+ uint32_t poll_events;
+};
+
+/*
+ * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want'
+ */
+
+/**
+ * Indicates that the filesystem supports asynchronous read requests.
+ *
+ * If this capability is not requested/available, the kernel will
+ * ensure that there is at most one pending read request per
+ * file-handle at any time, and will attempt to order read requests by
+ * increasing offset.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_ASYNC_READ (1 << 0)
+
+/**
+ * Indicates that the filesystem supports "remote" locking.
+ *
+ * This feature is enabled by default when supported by the kernel,
+ * and if getlk() and setlk() handlers are implemented.
+ */
+#define FUSE_CAP_POSIX_LOCKS (1 << 1)
+
+/**
+ * Indicates that the filesystem supports the O_TRUNC open flag. If
+ * disabled, and an application specifies O_TRUNC, fuse first calls
+ * truncate() and then open() with O_TRUNC filtered out.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3)
+
+/**
+ * Indicates that the filesystem supports lookups of "." and "..".
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_EXPORT_SUPPORT (1 << 4)
+
+/**
+ * Indicates that the kernel should not apply the umask to the
+ * file mode on create operations.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_DONT_MASK (1 << 6)
+
+/**
+ * Indicates that libfuse should try to use splice() when writing to
+ * the fuse device. This may improve performance.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_SPLICE_WRITE (1 << 7)
+
+/**
+ * Indicates that libfuse should try to move pages instead of copying when
+ * writing to / reading from the fuse device. This may improve performance.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_SPLICE_MOVE (1 << 8)
+
+/**
+ * Indicates that libfuse should try to use splice() when reading from
+ * the fuse device. This may improve performance.
+ *
+ * This feature is enabled by default when supported by the kernel and
+ * if the filesystem implements a write_buf() handler.
+ */
+#define FUSE_CAP_SPLICE_READ (1 << 9)
+
+/**
+ * If set, the calls to flock(2) will be emulated using POSIX locks and must
+ * then be handled by the filesystem's setlock() handler.
+ *
+ * If not set, flock(2) calls will be handled by the FUSE kernel module
+ * internally (so any access that does not go through the kernel cannot be taken
+ * into account).
+ *
+ * This feature is enabled by default when supported by the kernel and
+ * if the filesystem implements a flock() handler.
+ */
+#define FUSE_CAP_FLOCK_LOCKS (1 << 10)
+
+/**
+ * Indicates that the filesystem supports ioctl's on directories.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_IOCTL_DIR (1 << 11)
+
+/**
+ * Traditionally, while a file is open the FUSE kernel module only
+ * asks the filesystem for an update of the file's attributes when a
+ * client attempts to read beyond EOF. This is unsuitable for
+ * e.g. network filesystems, where the file contents may change
+ * without the kernel knowing about it.
+ *
+ * If this flag is set, FUSE will check the validity of the attributes
+ * on every read. If the attributes are no longer valid (i.e., if the
+ * *attr_timeout* passed to fuse_reply_attr() or set in `struct
+ * fuse_entry_param` has passed), it will first issue a `getattr`
+ * request. If the new mtime differs from the previous value, any
+ * cached file *contents* will be invalidated as well.
+ *
+ * This flag should always be set when available. If all file changes
+ * go through the kernel, *attr_timeout* should be set to a very large
+ * number to avoid unnecessary getattr() calls.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12)
+
+/**
+ * Indicates that the filesystem supports readdirplus.
+ *
+ * This feature is enabled by default when supported by the kernel and if the
+ * filesystem implements a readdirplus() handler.
+ */
+#define FUSE_CAP_READDIRPLUS (1 << 13)
+
+/**
+ * Indicates that the filesystem supports adaptive readdirplus.
+ *
+ * If FUSE_CAP_READDIRPLUS is not set, this flag has no effect.
+ *
+ * If FUSE_CAP_READDIRPLUS is set and this flag is not set, the kernel
+ * will always issue readdirplus() requests to retrieve directory
+ * contents.
+ *
+ * If FUSE_CAP_READDIRPLUS is set and this flag is set, the kernel
+ * will issue both readdir() and readdirplus() requests, depending on
+ * how much information is expected to be required.
+ *
+ * As of Linux 4.20, the algorithm is as follows: when userspace
+ * starts to read directory entries, issue a READDIRPLUS request to
+ * the filesystem. If any entry attributes have been looked up by the
+ * time userspace requests the next batch of entries continue with
+ * READDIRPLUS, otherwise switch to plain READDIR. This will result
+ * in e.g. plain "ls" triggering READDIRPLUS first and then READDIR after
+ * that because it doesn't do lookups. "ls -l" should result in all
+ * READDIRPLUS, except if dentries are already cached.
+ *
+ * This feature is enabled by default when supported by the kernel and
+ * if the filesystem implements both a readdirplus() and a readdir()
+ * handler.
+ */
+#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14)
+
+/**
+ * Indicates that the filesystem supports asynchronous direct I/O submission.
+ *
+ * If this capability is not requested/available, the kernel will ensure that
+ * there is at most one pending read and one pending write request per direct
+ * I/O file-handle at any time.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_ASYNC_DIO (1 << 15)
+
+/**
+ * Indicates that writeback caching should be enabled. This means that
+ * individual write requests may be buffered and merged in the kernel
+ * before they are sent to the filesystem.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_WRITEBACK_CACHE (1 << 16)
+
+/**
+ * Indicates support for zero-message opens. If this flag is set in
+ * the `capable` field of the `fuse_conn_info` structure, then the
+ * filesystem may return `ENOSYS` from the open() handler to indicate
+ * success. Further attempts to open files will be handled in the
+ * kernel. (If this flag is not set, returning ENOSYS will be treated
+ * as an error and signaled to the caller).
+ *
+ * Setting (or unsetting) this flag in the `want` field has *no
+ * effect*.
+ */
+#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17)
+
+/**
+ * Indicates support for parallel directory operations. If this flag
+ * is unset, the FUSE kernel module will ensure that lookup() and
+ * readdir() requests are never issued concurrently for the same
+ * directory.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_PARALLEL_DIROPS (1 << 18)
+
+/**
+ * Indicates support for POSIX ACLs.
+ *
+ * If this feature is enabled, the kernel will cache and have
+ * responsibility for enforcing ACLs. ACL will be stored as xattrs and
+ * passed to userspace, which is responsible for updating the ACLs in
+ * the filesystem, keeping the file mode in sync with the ACL, and
+ * ensuring inheritance of default ACLs when new filesystem nodes are
+ * created. Note that this requires that the file system is able to
+ * parse and interpret the xattr representation of ACLs.
+ *
+ * Enabling this feature implicitly turns on the
+ * ``default_permissions`` mount option (even if it was not passed to
+ * mount(2)).
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_POSIX_ACL (1 << 19)
+
+/**
+ * Indicates that the filesystem is responsible for unsetting
+ * setuid and setgid bits when a file is written, truncated, or
+ * its owner is changed.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20)
+
+/**
+ * Indicates support for zero-message opendirs. If this flag is set in
+ * the `capable` field of the `fuse_conn_info` structure, then the filesystem
+ * may return `ENOSYS` from the opendir() handler to indicate success. Further
+ * opendir and releasedir messages will be handled in the kernel. (If this
+ * flag is not set, returning ENOSYS will be treated as an error and signalled
+ * to the caller.)
+ *
+ * Setting (or unsetting) this flag in the `want` field has *no effect*.
+ */
+#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24)
+
+/**
+ * Ioctl flags
+ *
+ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
+ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
+ * FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_DIR: is a directory
+ *
+ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
+ */
+#define FUSE_IOCTL_COMPAT (1 << 0)
+#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
+#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_DIR (1 << 4)
+
+#define FUSE_IOCTL_MAX_IOV 256
+
+/**
+ * Connection information, passed to the ->init() method
+ *
+ * Some of the elements are read-write; these can be changed to
+ * indicate the value requested by the filesystem. The requested
+ * value must usually be smaller than the indicated value.
+ */
+struct fuse_conn_info {
+ /**
+ * Major version of the protocol (read-only)
+ */
+ unsigned proto_major;
+
+ /**
+ * Minor version of the protocol (read-only)
+ */
+ unsigned proto_minor;
+
+ /**
+ * Maximum size of the write buffer
+ */
+ unsigned max_write;
+
+ /**
+ * Maximum size of read requests. A value of zero indicates no
+ * limit. However, even if the filesystem does not specify a
+ * limit, the maximum size of read requests will still be
+ * limited by the kernel.
+ *
+ * NOTE: For the time being, the maximum size of read requests
+ * must be set both here *and* passed to fuse_session_new()
+ * using the ``-o max_read=<n>`` mount option. At some point
+ * in the future, specifying the mount option will no longer
+ * be necessary.
+ */
+ unsigned max_read;
+
+ /**
+ * Maximum readahead
+ */
+ unsigned max_readahead;
+
+ /**
+ * Capability flags that the kernel supports (read-only)
+ */
+ unsigned capable;
+
+ /**
+ * Capability flags that the filesystem wants to enable.
+ *
+ * libfuse attempts to initialize this field with
+ * reasonable default values before calling the init() handler.
+ */
+ unsigned want;
+
+ /**
+ * Maximum number of pending "background" requests. A
+ * background request is any type of request for which the
+ * total number is not limited by other means. As of kernel
+ * 4.8, only two types of requests fall into this category:
+ *
+ * 1. Read-ahead requests
+ * 2. Asynchronous direct I/O requests
+ *
+ * Read-ahead requests are generated (if max_readahead is
+ * non-zero) by the kernel to preemptively fill its caches
+ * when it anticipates that userspace will soon read more
+ * data.
+ *
+ * Asynchronous direct I/O requests are generated if
+ * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large
+ * direct I/O request. In this case the kernel will internally
+ * split it up into multiple smaller requests and submit them
+ * to the filesystem concurrently.
+ *
+ * Note that the following requests are *not* background
+ * requests: writeback requests (limited by the kernel's
+ * flusher algorithm), regular (i.e., synchronous and
+ * buffered) userspace read/write requests (limited to one per
+ * thread), asynchronous read requests (Linux's io_submit(2)
+ * call actually blocks, so these are also limited to one per
+ * thread).
+ */
+ unsigned max_background;
+
+ /**
+ * Kernel congestion threshold parameter. If the number of pending
+ * background requests exceeds this number, the FUSE kernel module will
+ * mark the filesystem as "congested". This instructs the kernel to
+ * expect that queued requests will take some time to complete, and to
+ * adjust its algorithms accordingly (e.g. by putting a waiting thread
+ * to sleep instead of using a busy-loop).
+ */
+ unsigned congestion_threshold;
+
+ /**
+ * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible
+ * for updating mtime and ctime when write requests are received. The
+ * updated values are passed to the filesystem with setattr() requests.
+ * However, if the filesystem does not support the full resolution of
+ * the kernel timestamps (nanoseconds), the mtime and ctime values used
+ * by kernel and filesystem will differ (and result in an apparent
+ * change of times after a cache flush).
+ *
+ * To prevent this problem, this variable can be used to inform the
+ * kernel about the timestamp granularity supported by the file-system.
+ * The value should be a power of 10. The default is 1, i.e. full
+ * nanosecond resolution. Filesystems supporting only second resolution
+ * should set this to 1000000000.
+ */
+ unsigned time_gran;
+
+ /**
+ * For future use.
+ */
+ unsigned reserved[22];
+};
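+
+/*
+ * Typical use from an init() handler (a sketch, using the high-level
+ * init() signature from fuse.h; checking `capable` before setting bits in
+ * `want` avoids requesting features the kernel cannot provide):
+ *
+ *   static void *myfs_init(struct fuse_conn_info *conn,
+ *                          struct fuse_config *cfg)
+ *   {
+ *       if (conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
+ *           conn->want |= FUSE_CAP_WRITEBACK_CACHE;
+ *       }
+ *       conn->max_readahead = 128 * 1024;
+ *       return NULL;
+ *   }
+ */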
+
+struct fuse_session;
+struct fuse_pollhandle;
+struct fuse_conn_info_opts;
+
+/**
+ * This function parses several command-line options that can be used
+ * to override elements of struct fuse_conn_info. The pointer returned
+ * by this function should be passed to the
+ * fuse_apply_conn_info_opts() method by the file system's init()
+ * handler.
+ *
+ * Before using this function, think twice if you really want these
+ * parameters to be adjustable from the command line. In most cases,
+ * they should be determined by the file system internally.
+ *
+ * The following options are recognized:
+ *
+ * -o max_write=N sets conn->max_write
+ * -o max_readahead=N sets conn->max_readahead
+ * -o max_background=N sets conn->max_background
+ * -o congestion_threshold=N sets conn->congestion_threshold
+ * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want
+ * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want
+ * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want
+ * -o no_remote_lock          Equivalent to -o no_remote_flock,no_remote_posix_lock
+ * -o no_remote_flock         Unsets FUSE_CAP_FLOCK_LOCKS in conn->want
+ * -o no_remote_posix_lock    Unsets FUSE_CAP_POSIX_LOCKS in conn->want
+ * -o [no_]splice_write       (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want
+ * -o [no_]splice_move        (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want
+ * -o [no_]splice_read        (un-)sets FUSE_CAP_SPLICE_READ in conn->want
+ * -o [no_]auto_inval_data    (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want
+ * -o readdirplus=no          unsets FUSE_CAP_READDIRPLUS in conn->want
+ * -o readdirplus=yes         sets FUSE_CAP_READDIRPLUS and unsets
+ *                            FUSE_CAP_READDIRPLUS_AUTO in conn->want
+ * -o readdirplus=auto        sets FUSE_CAP_READDIRPLUS and
+ *                            FUSE_CAP_READDIRPLUS_AUTO in conn->want
+ * -o [no_]async_dio          (un-)sets FUSE_CAP_ASYNC_DIO in conn->want
+ * -o [no_]writeback_cache    (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want
+ * -o time_gran=N             sets conn->time_gran
+ *
+ * Known options will be removed from *args*, unknown options will be
+ * passed through unchanged.
+ *
+ * @param args argument vector (input+output)
+ * @return parsed options
+ **/
+struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args);
+
+/**
+ * This function applies the (parsed) parameters in *opts* to the
+ * *conn* pointer. It may modify the following fields: want,
+ * max_write, max_readahead, congestion_threshold, max_background,
+ * time_gran. A field is only set (or unset) if the corresponding
+ * option has been explicitly set.
+ */
+void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts,
+ struct fuse_conn_info *conn);
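+
+/*
+ * Intended call pattern (a sketch): parse the options once during startup
+ * and apply them from the init() handler; "opts" is assumed to be made
+ * reachable from the handler, e.g. through a global or private_data:
+ *
+ *   opts = fuse_parse_conn_info_opts(&args);
+ *   ...
+ *   static void *myfs_init(struct fuse_conn_info *conn,
+ *                          struct fuse_config *cfg)
+ *   {
+ *       fuse_apply_conn_info_opts(opts, conn);
+ *       return NULL;
+ *   }
+ */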
+
+/**
+ * Go into the background
+ *
+ * @param foreground if true, stay in the foreground
+ * @return 0 on success, -1 on failure
+ */
+int fuse_daemonize(int foreground);
+
+/**
+ * Get the version of the library
+ *
+ * @return the version
+ */
+int fuse_version(void);
+
+/**
+ * Get the full package version string of the library
+ *
+ * @return the package version
+ */
+const char *fuse_pkgversion(void);
+
+/**
+ * Destroy poll handle
+ *
+ * @param ph the poll handle
+ */
+void fuse_pollhandle_destroy(struct fuse_pollhandle *ph);
+
+/*
+ * Data buffer
+ */
+
+/**
+ * Buffer flags
+ */
+enum fuse_buf_flags {
+ /**
+ * Buffer contains a file descriptor
+ *
+ * If this flag is set, the .fd field is valid; otherwise the
+ * .mem field is valid.
+ */
+ FUSE_BUF_IS_FD = (1 << 1),
+
+ /**
+ * Seek on the file descriptor
+ *
+ * If this flag is set then the .pos field is valid and is
+ * used to seek to the given offset before performing
+ * operation on file descriptor.
+ */
+ FUSE_BUF_FD_SEEK = (1 << 2),
+
+ /**
+ * Retry operation on file descriptor
+ *
+ * If this flag is set then retry operation on file descriptor
+ * until .size bytes have been copied or an error or EOF is
+ * detected.
+ */
+ FUSE_BUF_FD_RETRY = (1 << 3),
+};
+
+/**
+ * Single data buffer
+ *
+ * Generic data buffer for I/O, extended attributes, etc... Data may
+ * be supplied as a memory pointer or as a file descriptor
+ */
+struct fuse_buf {
+ /**
+ * Size of data in bytes
+ */
+ size_t size;
+
+ /**
+ * Buffer flags
+ */
+ enum fuse_buf_flags flags;
+
+ /**
+ * Memory pointer
+ *
+ * Used unless FUSE_BUF_IS_FD flag is set.
+ */
+ void *mem;
+
+ /**
+ * File descriptor
+ *
+ * Used if FUSE_BUF_IS_FD flag is set.
+ */
+ int fd;
+
+ /**
+ * File position
+ *
+ * Used if FUSE_BUF_FD_SEEK flag is set.
+ */
+ off_t pos;
+};
+
+/**
+ * Data buffer vector
+ *
+ * An array of data buffers, each containing a memory pointer or a
+ * file descriptor.
+ *
+ * Allocate dynamically to add more than one buffer.
+ */
+struct fuse_bufvec {
+ /**
+ * Number of buffers in the array
+ */
+ size_t count;
+
+ /**
+ * Index of current buffer within the array
+ */
+ size_t idx;
+
+ /**
+ * Current offset within the current buffer
+ */
+ size_t off;
+
+ /**
+ * Array of buffers
+ */
+ struct fuse_buf buf[1];
+};
+
+/* Initialize bufvec with a single buffer of given size */
+#define FUSE_BUFVEC_INIT(size__) \
+ ((struct fuse_bufvec){ /* .count= */ 1, \
+ /* .idx = */ 0, \
+ /* .off = */ 0, /* .buf = */ \
+ { /* [0] = */ { \
+ /* .size = */ (size__), \
+ /* .flags = */ (enum fuse_buf_flags)0, \
+ /* .mem = */ NULL, \
+ /* .fd = */ -1, \
+ /* .pos = */ 0, \
+ } } })
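+
+/*
+ * Example (a sketch): answer a read request straight from an open file
+ * descriptor, letting the library copy or splice as appropriate;
+ * fuse_reply_data() is declared in fuse_lowlevel.h:
+ *
+ *   struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
+ *
+ *   buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+ *   buf.buf[0].fd = fi->fh;
+ *   buf.buf[0].pos = off;
+ *   return fuse_reply_data(req, &buf);
+ */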
+
+/**
+ * Get total size of data in a fuse buffer vector
+ *
+ * @param bufv buffer vector
+ * @return size of data
+ */
+size_t fuse_buf_size(const struct fuse_bufvec *bufv);
+
+/**
+ * Copy data from one buffer vector to another
+ *
+ * @param dst destination buffer vector
+ * @param src source buffer vector
+ * @return actual number of bytes copied or -errno on error
+ */
+ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src);
+
+/**
+ * Memory buffer iterator
+ *
+ */
+struct fuse_mbuf_iter {
+ /**
+ * Data pointer
+ */
+ void *mem;
+
+ /**
+ * Total length, in bytes
+ */
+ size_t size;
+
+ /**
+ * Offset from start of buffer
+ */
+ size_t pos;
+};
+
+/* Initialize memory buffer iterator from a fuse_buf */
+#define FUSE_MBUF_ITER_INIT(fbuf) \
+ ((struct fuse_mbuf_iter){ \
+ .mem = fbuf->mem, \
+ .size = fbuf->size, \
+ .pos = 0, \
+ })
+
+/**
+ * Consume bytes from a memory buffer iterator
+ *
+ * @param iter memory buffer iterator
+ * @param len number of bytes to consume
+ * @return pointer to start of consumed bytes or
+ * NULL if advancing beyond end of buffer
+ */
+void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len);
+
+/**
+ * Consume a NUL-terminated string from a memory buffer iterator
+ *
+ * @param iter memory buffer iterator
+ * @return pointer to the string or
+ * NULL if advancing beyond end of buffer or there is no NUL-terminator
+ */
+const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter);
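+
+/*
+ * Parsing sketch (mirrors how the request handlers in fuse_lowlevel.c use
+ * the iterator; "fbuf" is assumed to be a struct fuse_buf * holding the
+ * raw request): fixed-size arguments are consumed with
+ * fuse_mbuf_iter_advance() and trailing names with
+ * fuse_mbuf_iter_advance_str(); a NULL result means a truncated request:
+ *
+ *   struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(fbuf);
+ *   struct fuse_mknod_in *arg;
+ *   const char *name;
+ *
+ *   arg = fuse_mbuf_iter_advance(&iter, sizeof(*arg));
+ *   name = fuse_mbuf_iter_advance_str(&iter);
+ *   if (!arg || !name) {
+ *       return -EINVAL;
+ *   }
+ */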
+
+/*
+ * Signal handling
+ */
+/**
+ * Exit session on HUP, TERM and INT signals and ignore PIPE signal
+ *
+ * Stores session in a global variable. May only be called once per
+ * process until fuse_remove_signal_handlers() is called.
+ *
+ * Once any of these signals arrives, the signal handler calls
+ * fuse_session_exit().
+ *
+ * @param se the session to exit
+ * @return 0 on success, -1 on failure
+ *
+ * See also:
+ * fuse_remove_signal_handlers()
+ */
+int fuse_set_signal_handlers(struct fuse_session *se);
+
+/**
+ * Restore default signal handlers
+ *
+ * Resets global session. After this fuse_set_signal_handlers() may
+ * be called again.
+ *
+ * @param se the same session as given in fuse_set_signal_handlers()
+ *
+ * See also:
+ * fuse_set_signal_handlers()
+ */
+void fuse_remove_signal_handlers(struct fuse_session *se);
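+
+/*
+ * Typical use around the event loop (a sketch using the high-level handle;
+ * fuse_get_session() is declared in fuse.h):
+ *
+ *   struct fuse_session *se = fuse_get_session(f);
+ *
+ *   if (fuse_set_signal_handlers(se) != 0) {
+ *       goto err;
+ *   }
+ *   ret = fuse_loop(f);
+ *   fuse_remove_signal_handlers(se);
+ */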
+
+/*
+ * Compatibility stuff
+ */
+
+#if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30
+#error only API version 30 or greater is supported
+#endif
+
+
+/*
+ * This interface uses 64 bit off_t.
+ *
+ * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags!
+ */
+
+#if defined(__GNUC__) && \
+ (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \
+ !defined __cplusplus
+_Static_assert(sizeof(off_t) == 8, "fuse: off_t must be 64bit");
+#else
+struct _fuse_off_t_must_be_64bit_dummy_struct {
+ unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1);
+};
+#endif
+
+#endif /* FUSE_COMMON_H_ */
diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h
new file mode 100644
index 0000000000..4e47e5880d
--- /dev/null
+++ b/tools/virtiofsd/fuse_i.h
@@ -0,0 +1,115 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#ifndef FUSE_I_H
+#define FUSE_I_H
+
+#define FUSE_USE_VERSION 31
+#include "fuse.h"
+#include "fuse_lowlevel.h"
+
+struct fv_VuDev;
+struct fv_QueueInfo;
+
+struct fuse_req {
+ struct fuse_session *se;
+ uint64_t unique;
+ int ctr;
+ pthread_mutex_t lock;
+ struct fuse_ctx ctx;
+ struct fuse_chan *ch;
+ int interrupted;
+ unsigned int ioctl_64bit:1;
+ union {
+ struct {
+ uint64_t unique;
+ } i;
+ struct {
+ fuse_interrupt_func_t func;
+ void *data;
+ } ni;
+ } u;
+ struct fuse_req *next;
+ struct fuse_req *prev;
+};
+
+struct fuse_notify_req {
+ uint64_t unique;
+ void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t,
+ const void *, const struct fuse_buf *);
+ struct fuse_notify_req *next;
+ struct fuse_notify_req *prev;
+};
+
+struct fuse_session {
+ char *mountpoint;
+ volatile int exited;
+ int fd;
+ int debug;
+ int deny_others;
+ struct fuse_lowlevel_ops op;
+ int got_init;
+ struct cuse_data *cuse_data;
+ void *userdata;
+ uid_t owner;
+ struct fuse_conn_info conn;
+ struct fuse_req list;
+ struct fuse_req interrupts;
+ pthread_mutex_t lock;
+ pthread_rwlock_t init_rwlock;
+ int got_destroy;
+ int broken_splice_nonblock;
+ uint64_t notify_ctr;
+ struct fuse_notify_req notify_list;
+ size_t bufsize;
+ int error;
+ char *vu_socket_path;
+ int vu_listen_fd;
+ int vu_socketfd;
+ struct fv_VuDev *virtio_dev;
+ int thread_pool_size;
+};
+
+struct fuse_chan {
+ pthread_mutex_t lock;
+ int ctr;
+ int fd;
+ struct fv_QueueInfo *qi;
+};
+
+/**
+ * Filesystem module
+ *
+ * Filesystem modules are registered with the FUSE_REGISTER_MODULE()
+ * macro.
+ *
+ */
+struct fuse_module {
+ char *name;
+ fuse_module_factory_t factory;
+ struct fuse_module *next;
+ struct fusemod_so *so;
+ int ctr;
+};
+
+int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov,
+ int count);
+void fuse_free_req(fuse_req_t req);
+
+void fuse_session_process_buf_int(struct fuse_session *se,
+ struct fuse_bufvec *bufv,
+ struct fuse_chan *ch);
+
+
+#define FUSE_MAX_MAX_PAGES 256
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/* room needed in buffer to accommodate header */
+#define FUSE_BUFFER_HEADER_SIZE 0x1000
+
+#endif
diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c
new file mode 100644
index 0000000000..c301ff6da1
--- /dev/null
+++ b/tools/virtiofsd/fuse_log.c
@@ -0,0 +1,41 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * Logging API.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_log.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+static void default_log_func(__attribute__((unused)) enum fuse_log_level level,
+ const char *fmt, va_list ap)
+{
+ vfprintf(stderr, fmt, ap);
+}
+
+static fuse_log_func_t log_func = default_log_func;
+
+void fuse_set_log_func(fuse_log_func_t func)
+{
+ if (!func) {
+ func = default_log_func;
+ }
+
+ log_func = func;
+}
+
+void fuse_log(enum fuse_log_level level, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ log_func(level, fmt, ap);
+ va_end(ap);
+}
diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h
new file mode 100644
index 0000000000..bf6c11ff11
--- /dev/null
+++ b/tools/virtiofsd/fuse_log.h
@@ -0,0 +1,74 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_LOG_H_
+#define FUSE_LOG_H_
+
+/** @file
+ *
+ * This file defines the logging interface of FUSE
+ */
+
+#include <stdarg.h>
+
+/**
+ * Log severity level
+ *
+ * These levels correspond to syslog(2) log levels since they are widely used.
+ */
+enum fuse_log_level {
+ FUSE_LOG_EMERG,
+ FUSE_LOG_ALERT,
+ FUSE_LOG_CRIT,
+ FUSE_LOG_ERR,
+ FUSE_LOG_WARNING,
+ FUSE_LOG_NOTICE,
+ FUSE_LOG_INFO,
+ FUSE_LOG_DEBUG
+};
+
+/**
+ * Log message handler function.
+ *
+ * This function must be thread-safe. It may be called from any libfuse
+ * function, including fuse_parse_cmdline() and other functions invoked before
+ * a FUSE filesystem is created.
+ *
+ * Install a custom log message handler function using fuse_set_log_func().
+ *
+ * @param level log severity level
+ * @param fmt sprintf-style format string including newline
+ * @param ap format string arguments
+ */
+typedef void (*fuse_log_func_t)(enum fuse_log_level level, const char *fmt,
+ va_list ap);
+
+/**
+ * Install a custom log handler function.
+ *
+ * Log messages are emitted by libfuse functions to report errors and debug
+ * information. Messages are printed to stderr by default but this can be
+ * overridden by installing a custom log message handler function.
+ *
+ * The log message handler function is global and affects all FUSE filesystems
+ * created within this process.
+ *
+ * @param func a custom log message handler function or NULL to revert to
+ * the default
+ */
+void fuse_set_log_func(fuse_log_func_t func);
+
+/**
+ * Emit a log message
+ *
+ * @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc)
+ * @param fmt sprintf-style format string including newline
+ */
+void fuse_log(enum fuse_log_level level, const char *fmt, ...);
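+
+/*
+ * Example (a sketch): route libfuse messages to syslog instead of stderr.
+ * The enum values above correspond to the syslog(3) levels, so they can be
+ * passed through directly:
+ *
+ *   #include <syslog.h>
+ *
+ *   static void log_to_syslog(enum fuse_log_level level, const char *fmt,
+ *                             va_list ap)
+ *   {
+ *       vsyslog(level, fmt, ap);
+ *   }
+ *
+ *   ...
+ *   fuse_set_log_func(log_to_syslog);
+ *   fuse_log(FUSE_LOG_INFO, "virtiofsd: starting up\n");
+ */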
+
+#endif /* FUSE_LOG_H_ */
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
new file mode 100644
index 0000000000..de2e2e0c65
--- /dev/null
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -0,0 +1,2761 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Implementation of (most of) the low-level FUSE API. The session loop
+ * functions are implemented in separate files.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "standard-headers/linux/fuse.h"
+#include "fuse_misc.h"
+#include "fuse_opt.h"
+#include "fuse_virtio.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <glib.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/file.h>
+#include <unistd.h>
+
+#define THREAD_POOL_SIZE 64
+
+#define OFFSET_MAX 0x7fffffffffffffffLL
+
+struct fuse_pollhandle {
+ uint64_t kh;
+ struct fuse_session *se;
+};
+
+static size_t pagesize;
+
+static __attribute__((constructor)) void fuse_ll_init_pagesize(void)
+{
+ pagesize = getpagesize();
+}
+
+static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr)
+{
+ *attr = (struct fuse_attr){
+ .ino = stbuf->st_ino,
+ .mode = stbuf->st_mode,
+ .nlink = stbuf->st_nlink,
+ .uid = stbuf->st_uid,
+ .gid = stbuf->st_gid,
+ .rdev = stbuf->st_rdev,
+ .size = stbuf->st_size,
+ .blksize = stbuf->st_blksize,
+ .blocks = stbuf->st_blocks,
+ .atime = stbuf->st_atime,
+ .mtime = stbuf->st_mtime,
+ .ctime = stbuf->st_ctime,
+ .atimensec = ST_ATIM_NSEC(stbuf),
+ .mtimensec = ST_MTIM_NSEC(stbuf),
+ .ctimensec = ST_CTIM_NSEC(stbuf),
+ };
+}
+
+static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf)
+{
+ stbuf->st_mode = attr->mode;
+ stbuf->st_uid = attr->uid;
+ stbuf->st_gid = attr->gid;
+ stbuf->st_size = attr->size;
+ stbuf->st_atime = attr->atime;
+ stbuf->st_mtime = attr->mtime;
+ stbuf->st_ctime = attr->ctime;
+ ST_ATIM_NSEC_SET(stbuf, attr->atimensec);
+ ST_MTIM_NSEC_SET(stbuf, attr->mtimensec);
+ ST_CTIM_NSEC_SET(stbuf, attr->ctimensec);
+}
+
+static size_t iov_length(const struct iovec *iov, size_t count)
+{
+ size_t seg;
+ size_t ret = 0;
+
+ for (seg = 0; seg < count; seg++) {
+ ret += iov[seg].iov_len;
+ }
+ return ret;
+}
+
+static void list_init_req(struct fuse_req *req)
+{
+ req->next = req;
+ req->prev = req;
+}
+
+static void list_del_req(struct fuse_req *req)
+{
+ struct fuse_req *prev = req->prev;
+ struct fuse_req *next = req->next;
+ prev->next = next;
+ next->prev = prev;
+}
+
+static void list_add_req(struct fuse_req *req, struct fuse_req *next)
+{
+ struct fuse_req *prev = next->prev;
+ req->next = next;
+ req->prev = prev;
+ prev->next = req;
+ next->prev = req;
+}
+
+static void destroy_req(fuse_req_t req)
+{
+ pthread_mutex_destroy(&req->lock);
+ free(req);
+}
+
+void fuse_free_req(fuse_req_t req)
+{
+ int ctr;
+ struct fuse_session *se = req->se;
+
+ pthread_mutex_lock(&se->lock);
+ req->u.ni.func = NULL;
+ req->u.ni.data = NULL;
+ list_del_req(req);
+ ctr = --req->ctr;
+ req->ch = NULL;
+ pthread_mutex_unlock(&se->lock);
+ if (!ctr) {
+ destroy_req(req);
+ }
+}
+
+static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se)
+{
+ struct fuse_req *req;
+
+ req = (struct fuse_req *)calloc(1, sizeof(struct fuse_req));
+ if (req == NULL) {
+ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n");
+ } else {
+ req->se = se;
+ req->ctr = 1;
+ list_init_req(req);
+ fuse_mutex_init(&req->lock);
+ }
+
+ return req;
+}
+
+/* Send data. If *ch* is NULL, send via session master fd */
+static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count)
+{
+ struct fuse_out_header *out = iov[0].iov_base;
+
+ out->len = iov_length(iov, count);
+ if (out->unique == 0) {
+ fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error,
+ out->len);
+ } else if (out->error) {
+ fuse_log(FUSE_LOG_DEBUG,
+ " unique: %llu, error: %i (%s), outsize: %i\n",
+ (unsigned long long)out->unique, out->error,
+ strerror(-out->error), out->len);
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n",
+ (unsigned long long)out->unique, out->len);
+ }
+
+ if (fuse_lowlevel_is_virtio(se)) {
+ return virtio_send_msg(se, ch, iov, count);
+ }
+
+ abort(); /* virtio should have taken it before here */
+ return 0;
+}
+
+
+int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov,
+ int count)
+{
+ struct fuse_out_header out = {
+ .unique = req->unique,
+ .error = error,
+ };
+
+ if (error <= -1000 || error > 0) {
+ fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error);
+ error = -ERANGE;
+ }
+
+ iov[0].iov_base = &out;
+ iov[0].iov_len = sizeof(struct fuse_out_header);
+
+ return fuse_send_msg(req->se, req->ch, iov, count);
+}
+
+static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov,
+ int count)
+{
+ int res;
+
+ res = fuse_send_reply_iov_nofree(req, error, iov, count);
+ fuse_free_req(req);
+ return res;
+}
+
+static int send_reply(fuse_req_t req, int error, const void *arg,
+ size_t argsize)
+{
+ struct iovec iov[2];
+ int count = 1;
+ if (argsize) {
+ iov[1].iov_base = (void *)arg;
+ iov[1].iov_len = argsize;
+ count++;
+ }
+ return send_reply_iov(req, error, iov, count);
+}
+
+int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count)
+{
+ int res;
+ struct iovec *padded_iov;
+
+ padded_iov = malloc((count + 1) * sizeof(struct iovec));
+ if (padded_iov == NULL) {
+ return fuse_reply_err(req, ENOMEM);
+ }
+
+ memcpy(padded_iov + 1, iov, count * sizeof(struct iovec));
+ count++;
+
+ res = send_reply_iov(req, 0, padded_iov, count);
+ free(padded_iov);
+
+ return res;
+}
+
+
+/*
+ * `buf` is allowed to be empty so that the proper size may be
+ * allocated by the caller
+ */
+size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize,
+ const char *name, const struct stat *stbuf, off_t off)
+{
+ (void)req;
+ size_t namelen;
+ size_t entlen;
+ size_t entlen_padded;
+ struct fuse_dirent *dirent;
+
+ namelen = strlen(name);
+ entlen = FUSE_NAME_OFFSET + namelen;
+ entlen_padded = FUSE_DIRENT_ALIGN(entlen);
+
+ if ((buf == NULL) || (entlen_padded > bufsize)) {
+ return entlen_padded;
+ }
+
+ dirent = (struct fuse_dirent *)buf;
+ dirent->ino = stbuf->st_ino;
+ dirent->off = off;
+ dirent->namelen = namelen;
+ dirent->type = (stbuf->st_mode & S_IFMT) >> 12;
+ memcpy(dirent->name, name, namelen);
+ memset(dirent->name + namelen, 0, entlen_padded - entlen);
+
+ return entlen_padded;
+}
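+
+/*
+ * Typical caller pattern (a sketch): call once with a NULL buffer to learn
+ * the padded entry size, make sure the reply buffer has room, then call
+ * again to emit the entry:
+ *
+ *   size_t entsize = fuse_add_direntry(req, NULL, 0, name, &st, next_off);
+ *   if (used + entsize > bufsize) {
+ *       break;
+ *   }
+ *   fuse_add_direntry(req, buf + used, bufsize - used, name, &st, next_off);
+ *   used += entsize;
+ */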
+
+static void convert_statfs(const struct statvfs *stbuf,
+ struct fuse_kstatfs *kstatfs)
+{
+ *kstatfs = (struct fuse_kstatfs){
+ .bsize = stbuf->f_bsize,
+ .frsize = stbuf->f_frsize,
+ .blocks = stbuf->f_blocks,
+ .bfree = stbuf->f_bfree,
+ .bavail = stbuf->f_bavail,
+ .files = stbuf->f_files,
+ .ffree = stbuf->f_ffree,
+ .namelen = stbuf->f_namemax,
+ };
+}
+
+static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize)
+{
+ return send_reply(req, 0, arg, argsize);
+}
+
+int fuse_reply_err(fuse_req_t req, int err)
+{
+ return send_reply(req, -err, NULL, 0);
+}
+
+void fuse_reply_none(fuse_req_t req)
+{
+ fuse_free_req(req);
+}
+
+static unsigned long calc_timeout_sec(double t)
+{
+ if (t > (double)ULONG_MAX) {
+ return ULONG_MAX;
+ } else if (t < 0.0) {
+ return 0;
+ } else {
+ return (unsigned long)t;
+ }
+}
+
+static unsigned int calc_timeout_nsec(double t)
+{
+ double f = t - (double)calc_timeout_sec(t);
+ if (f < 0.0) {
+ return 0;
+ } else if (f >= 0.999999999) {
+ return 999999999;
+ } else {
+ return (unsigned int)(f * 1.0e9);
+ }
+}
+
+static void fill_entry(struct fuse_entry_out *arg,
+ const struct fuse_entry_param *e)
+{
+ *arg = (struct fuse_entry_out){
+ .nodeid = e->ino,
+ .generation = e->generation,
+ .entry_valid = calc_timeout_sec(e->entry_timeout),
+ .entry_valid_nsec = calc_timeout_nsec(e->entry_timeout),
+ .attr_valid = calc_timeout_sec(e->attr_timeout),
+ .attr_valid_nsec = calc_timeout_nsec(e->attr_timeout),
+ };
+ convert_stat(&e->attr, &arg->attr);
+}
+
+/*
+ * `buf` is allowed to be empty so that the proper size may be
+ * allocated by the caller
+ */
+size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize,
+ const char *name,
+ const struct fuse_entry_param *e, off_t off)
+{
+ (void)req;
+ size_t namelen;
+ size_t entlen;
+ size_t entlen_padded;
+
+ namelen = strlen(name);
+ entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen;
+ entlen_padded = FUSE_DIRENT_ALIGN(entlen);
+ if ((buf == NULL) || (entlen_padded > bufsize)) {
+ return entlen_padded;
+ }
+
+ struct fuse_direntplus *dp = (struct fuse_direntplus *)buf;
+ memset(&dp->entry_out, 0, sizeof(dp->entry_out));
+ fill_entry(&dp->entry_out, e);
+
+ struct fuse_dirent *dirent = &dp->dirent;
+ *dirent = (struct fuse_dirent){
+ .ino = e->attr.st_ino,
+ .off = off,
+ .namelen = namelen,
+ .type = (e->attr.st_mode & S_IFMT) >> 12,
+ };
+ memcpy(dirent->name, name, namelen);
+ memset(dirent->name + namelen, 0, entlen_padded - entlen);
+
+ return entlen_padded;
+}
+
+static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f)
+{
+ arg->fh = f->fh;
+ if (f->direct_io) {
+ arg->open_flags |= FOPEN_DIRECT_IO;
+ }
+ if (f->keep_cache) {
+ arg->open_flags |= FOPEN_KEEP_CACHE;
+ }
+ if (f->cache_readdir) {
+ arg->open_flags |= FOPEN_CACHE_DIR;
+ }
+ if (f->nonseekable) {
+ arg->open_flags |= FOPEN_NONSEEKABLE;
+ }
+}
+
+int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e)
+{
+ struct fuse_entry_out arg;
+ size_t size = sizeof(arg);
+
+ memset(&arg, 0, sizeof(arg));
+ fill_entry(&arg, e);
+ return send_reply_ok(req, &arg, size);
+}
+
+int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e,
+ const struct fuse_file_info *f)
+{
+ char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)];
+ size_t entrysize = sizeof(struct fuse_entry_out);
+ struct fuse_entry_out *earg = (struct fuse_entry_out *)buf;
+ struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize);
+
+ memset(buf, 0, sizeof(buf));
+ fill_entry(earg, e);
+ fill_open(oarg, f);
+ return send_reply_ok(req, buf, entrysize + sizeof(struct fuse_open_out));
+}
+
+int fuse_reply_attr(fuse_req_t req, const struct stat *attr,
+ double attr_timeout)
+{
+ struct fuse_attr_out arg;
+ size_t size = sizeof(arg);
+
+ memset(&arg, 0, sizeof(arg));
+ arg.attr_valid = calc_timeout_sec(attr_timeout);
+ arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout);
+ convert_stat(attr, &arg.attr);
+
+ return send_reply_ok(req, &arg, size);
+}
+
+int fuse_reply_readlink(fuse_req_t req, const char *linkname)
+{
+ return send_reply_ok(req, linkname, strlen(linkname));
+}
+
+int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f)
+{
+ struct fuse_open_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ fill_open(&arg, f);
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+int fuse_reply_write(fuse_req_t req, size_t count)
+{
+ struct fuse_write_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.size = count;
+
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size)
+{
+ return send_reply_ok(req, buf, size);
+}
+
+static int fuse_send_data_iov_fallback(struct fuse_session *se,
+ struct fuse_chan *ch, struct iovec *iov,
+ int iov_count, struct fuse_bufvec *buf,
+ size_t len)
+{
+ /* Optimize common case */
+ if (buf->count == 1 && buf->idx == 0 && buf->off == 0 &&
+ !(buf->buf[0].flags & FUSE_BUF_IS_FD)) {
+ /*
+ * FIXME: also avoid memory copy if there are multiple buffers
+ * but none of them contain an fd
+ */
+
+ iov[iov_count].iov_base = buf->buf[0].mem;
+ iov[iov_count].iov_len = len;
+ iov_count++;
+ return fuse_send_msg(se, ch, iov, iov_count);
+ }
+
+ if (fuse_lowlevel_is_virtio(se) && buf->count == 1 &&
+ buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) {
+ return virtio_send_data_iov(se, ch, iov, iov_count, buf, len);
+ }
+
+ abort(); /* Will have taken vhost path */
+ return 0;
+}
+
+static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int iov_count,
+ struct fuse_bufvec *buf)
+{
+ size_t len = fuse_buf_size(buf);
+
+ return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len);
+}
+
+int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv)
+{
+ struct iovec iov[2];
+ struct fuse_out_header out = {
+ .unique = req->unique,
+ };
+ int res;
+
+ iov[0].iov_base = &out;
+ iov[0].iov_len = sizeof(struct fuse_out_header);
+
+ res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv);
+ if (res <= 0) {
+ fuse_free_req(req);
+ return res;
+ } else {
+ return fuse_reply_err(req, res);
+ }
+}
+
+int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf)
+{
+ struct fuse_statfs_out arg;
+ size_t size = sizeof(arg);
+
+ memset(&arg, 0, sizeof(arg));
+ convert_statfs(stbuf, &arg.st);
+
+ return send_reply_ok(req, &arg, size);
+}
+
+int fuse_reply_xattr(fuse_req_t req, size_t count)
+{
+ struct fuse_getxattr_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.size = count;
+
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+int fuse_reply_lock(fuse_req_t req, const struct flock *lock)
+{
+ struct fuse_lk_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.lk.type = lock->l_type;
+ if (lock->l_type != F_UNLCK) {
+ arg.lk.start = lock->l_start;
+ if (lock->l_len == 0) {
+ arg.lk.end = OFFSET_MAX;
+ } else {
+ arg.lk.end = lock->l_start + lock->l_len - 1;
+ }
+ }
+ arg.lk.pid = lock->l_pid;
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+int fuse_reply_bmap(fuse_req_t req, uint64_t idx)
+{
+ struct fuse_bmap_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.block = idx;
+
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov,
+ size_t count)
+{
+ struct fuse_ioctl_iovec *fiov;
+ size_t i;
+
+ fiov = malloc(sizeof(fiov[0]) * count);
+ if (!fiov) {
+ return NULL;
+ }
+
+ for (i = 0; i < count; i++) {
+ fiov[i].base = (uintptr_t)iov[i].iov_base;
+ fiov[i].len = iov[i].iov_len;
+ }
+
+ return fiov;
+}
+
+int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov,
+ size_t in_count, const struct iovec *out_iov,
+ size_t out_count)
+{
+ struct fuse_ioctl_out arg;
+ struct fuse_ioctl_iovec *in_fiov = NULL;
+ struct fuse_ioctl_iovec *out_fiov = NULL;
+ struct iovec iov[4];
+ size_t count = 1;
+ int res;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.flags |= FUSE_IOCTL_RETRY;
+ arg.in_iovs = in_count;
+ arg.out_iovs = out_count;
+ iov[count].iov_base = &arg;
+ iov[count].iov_len = sizeof(arg);
+ count++;
+
+ /* Can't handle non-compat 64bit ioctls on 32bit */
+ if (sizeof(void *) == 4 && req->ioctl_64bit) {
+ res = fuse_reply_err(req, EINVAL);
+ goto out;
+ }
+
+ if (in_count) {
+ in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count);
+ if (!in_fiov) {
+ goto enomem;
+ }
+
+ iov[count].iov_base = (void *)in_fiov;
+ iov[count].iov_len = sizeof(in_fiov[0]) * in_count;
+ count++;
+ }
+ if (out_count) {
+ out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count);
+ if (!out_fiov) {
+ goto enomem;
+ }
+
+ iov[count].iov_base = (void *)out_fiov;
+ iov[count].iov_len = sizeof(out_fiov[0]) * out_count;
+ count++;
+ }
+
+ res = send_reply_iov(req, 0, iov, count);
+out:
+ free(in_fiov);
+ free(out_fiov);
+
+ return res;
+
+enomem:
+ res = fuse_reply_err(req, ENOMEM);
+ goto out;
+}
+
+int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size)
+{
+ struct fuse_ioctl_out arg;
+ struct iovec iov[3];
+ size_t count = 1;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.result = result;
+ iov[count].iov_base = &arg;
+ iov[count].iov_len = sizeof(arg);
+ count++;
+
+ if (size) {
+ iov[count].iov_base = (char *)buf;
+ iov[count].iov_len = size;
+ count++;
+ }
+
+ return send_reply_iov(req, 0, iov, count);
+}
+
+int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov,
+ int count)
+{
+ struct iovec *padded_iov;
+ struct fuse_ioctl_out arg;
+ int res;
+
+ padded_iov = malloc((count + 2) * sizeof(struct iovec));
+ if (padded_iov == NULL) {
+ return fuse_reply_err(req, ENOMEM);
+ }
+
+ memset(&arg, 0, sizeof(arg));
+ arg.result = result;
+ padded_iov[1].iov_base = &arg;
+ padded_iov[1].iov_len = sizeof(arg);
+
+ memcpy(&padded_iov[2], iov, count * sizeof(struct iovec));
+
+ res = send_reply_iov(req, 0, padded_iov, count + 2);
+ free(padded_iov);
+
+ return res;
+}
+
+int fuse_reply_poll(fuse_req_t req, unsigned revents)
+{
+ struct fuse_poll_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.revents = revents;
+
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+int fuse_reply_lseek(fuse_req_t req, off_t off)
+{
+ struct fuse_lseek_out arg;
+
+ memset(&arg, 0, sizeof(arg));
+ arg.offset = off;
+
+ return send_reply_ok(req, &arg, sizeof(arg));
+}
+
+static void do_lookup(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ const char *name = fuse_mbuf_iter_advance_str(iter);
+ if (!name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.lookup) {
+ req->se->op.lookup(req, nodeid, name);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_forget(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_forget_in *arg;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.forget) {
+ req->se->op.forget(req, nodeid, arg->nlookup);
+ } else {
+ fuse_reply_none(req);
+ }
+}
+
+static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_batch_forget_in *arg;
+ struct fuse_forget_data *forgets;
+ size_t scount;
+
+ (void)nodeid;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_none(req);
+ return;
+ }
+
+ /*
+ * Prevent integer overflow. The compiler emits the following warning
+ * unless we use the scount local variable:
+ *
+ * error: comparison is always false due to limited range of data type
+ * [-Werror=type-limits]
+ *
+ * This may be true on 64-bit hosts but we need this check for 32-bit
+ * hosts.
+ */
+ scount = arg->count;
+ if (scount > SIZE_MAX / sizeof(forgets[0])) {
+ fuse_reply_none(req);
+ return;
+ }
+
+ forgets = fuse_mbuf_iter_advance(iter, arg->count * sizeof(forgets[0]));
+ if (!forgets) {
+ fuse_reply_none(req);
+ return;
+ }
+
+ if (req->se->op.forget_multi) {
+ req->se->op.forget_multi(req, arg->count, forgets);
+ } else if (req->se->op.forget) {
+ unsigned int i;
+
+ for (i = 0; i < arg->count; i++) {
+ struct fuse_req *dummy_req;
+
+ dummy_req = fuse_ll_alloc_req(req->se);
+ if (dummy_req == NULL) {
+ break;
+ }
+
+ dummy_req->unique = req->unique;
+ dummy_req->ctx = req->ctx;
+ dummy_req->ch = NULL;
+
+ req->se->op.forget(dummy_req, forgets[i].ino, forgets[i].nlookup);
+ }
+ fuse_reply_none(req);
+ } else {
+ fuse_reply_none(req);
+ }
+}
+
+static void do_getattr(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_file_info *fip = NULL;
+ struct fuse_file_info fi;
+
+ struct fuse_getattr_in *arg;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (arg->getattr_flags & FUSE_GETATTR_FH) {
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fip = &fi;
+ }
+
+ if (req->se->op.getattr) {
+ req->se->op.getattr(req, nodeid, fip);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_setattr(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ if (req->se->op.setattr) {
+ struct fuse_setattr_in *arg;
+ struct fuse_file_info *fi = NULL;
+ struct fuse_file_info fi_store;
+ struct stat stbuf;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&stbuf, 0, sizeof(stbuf));
+ convert_attr(arg, &stbuf);
+ if (arg->valid & FATTR_FH) {
+ arg->valid &= ~FATTR_FH;
+ memset(&fi_store, 0, sizeof(fi_store));
+ fi = &fi_store;
+ fi->fh = arg->fh;
+ }
+ arg->valid &= FUSE_SET_ATTR_MODE | FUSE_SET_ATTR_UID |
+ FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE |
+ FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME |
+ FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW |
+ FUSE_SET_ATTR_CTIME;
+
+ req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_access(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_access_in *arg;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.access) {
+ req->se->op.access(req, nodeid, arg->mask);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_readlink(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ (void)iter;
+
+ if (req->se->op.readlink) {
+ req->se->op.readlink(req, nodeid);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_mknod(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_mknod_in *arg;
+ const char *name;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ name = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ req->ctx.umask = arg->umask;
+
+ if (req->se->op.mknod) {
+ req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_mkdir_in *arg;
+ const char *name;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ name = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ req->ctx.umask = arg->umask;
+
+ if (req->se->op.mkdir) {
+ req->se->op.mkdir(req, nodeid, name, arg->mode);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_unlink(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ const char *name = fuse_mbuf_iter_advance_str(iter);
+
+ if (!name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.unlink) {
+ req->se->op.unlink(req, nodeid, name);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ const char *name = fuse_mbuf_iter_advance_str(iter);
+
+ if (!name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.rmdir) {
+ req->se->op.rmdir(req, nodeid, name);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_symlink(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ const char *name = fuse_mbuf_iter_advance_str(iter);
+ const char *linkname = fuse_mbuf_iter_advance_str(iter);
+
+ if (!name || !linkname) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.symlink) {
+ req->se->op.symlink(req, linkname, nodeid, name);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_rename(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_rename_in *arg;
+ const char *oldname;
+ const char *newname;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ oldname = fuse_mbuf_iter_advance_str(iter);
+ newname = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !oldname || !newname) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.rename) {
+ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_rename2(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_rename2_in *arg;
+ const char *oldname;
+ const char *newname;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ oldname = fuse_mbuf_iter_advance_str(iter);
+ newname = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !oldname || !newname) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.rename) {
+ req->se->op.rename(req, nodeid, oldname, arg->newdir, newname,
+ arg->flags);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_link(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_link_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ const char *name = fuse_mbuf_iter_advance_str(iter);
+
+ if (!arg || !name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.link) {
+ req->se->op.link(req, arg->oldnodeid, nodeid, name);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_create(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ if (req->se->op.create) {
+ struct fuse_create_in *arg;
+ struct fuse_file_info fi;
+ const char *name;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ name = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.flags = arg->flags;
+
+ req->ctx.umask = arg->umask;
+
+ req->se->op.create(req, nodeid, name, arg->mode, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_open(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_open_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.flags = arg->flags;
+
+ if (req->se->op.open) {
+ req->se->op.open(req, nodeid, &fi);
+ } else {
+ fuse_reply_open(req, &fi);
+ }
+}
+
+static void do_read(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ if (req->se->op.read) {
+ struct fuse_read_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fi.lock_owner = arg->lock_owner;
+ fi.flags = arg->flags;
+ req->se->op.read(req, nodeid, arg->size, arg->offset, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_write(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_write_in *arg;
+ struct fuse_file_info fi;
+ const char *param;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ param = fuse_mbuf_iter_advance(iter, arg->size);
+ if (!param) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0;
+ fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV);
+
+ fi.lock_owner = arg->lock_owner;
+ fi.flags = arg->flags;
+
+ if (req->se->op.write) {
+ req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter, struct fuse_bufvec *ibufv)
+{
+ struct fuse_session *se = req->se;
+ struct fuse_bufvec *pbufv = ibufv;
+ struct fuse_bufvec tmpbufv = {
+ .buf[0] = ibufv->buf[0],
+ .count = 1,
+ };
+ struct fuse_write_in *arg;
+ size_t arg_size = sizeof(*arg);
+ struct fuse_file_info fi;
+
+ memset(&fi, 0, sizeof(fi));
+
+ arg = fuse_mbuf_iter_advance(iter, arg_size);
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ fi.lock_owner = arg->lock_owner;
+ fi.flags = arg->flags;
+ fi.fh = arg->fh;
+ fi.writepage = !!(arg->write_flags & FUSE_WRITE_CACHE);
+ fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV);
+
+ if (ibufv->count == 1) {
+ assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD));
+ tmpbufv.buf[0].mem = ((char *)arg) + arg_size;
+ tmpbufv.buf[0].size -= sizeof(struct fuse_in_header) + arg_size;
+ pbufv = &tmpbufv;
+ } else {
+ /*
+         * Input bufv contains the headers in the first element
+         * and the data in the remaining elements; we need to skip
+         * that first element here.
+ */
+ ibufv->buf[0].size = 0;
+ }
+
+ if (fuse_buf_size(pbufv) != arg->size) {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: do_write_buf: buffer size doesn't match arg->size\n");
+ fuse_reply_err(req, EIO);
+ return;
+ }
+
+ se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi);
+}
+
+static void do_flush(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_flush_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fi.flush = 1;
+ fi.lock_owner = arg->lock_owner;
+
+ if (req->se->op.flush) {
+ req->se->op.flush(req, nodeid, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_release(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_release_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.flags = arg->flags;
+ fi.fh = arg->fh;
+ fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0;
+ fi.lock_owner = arg->lock_owner;
+
+ if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) {
+ fi.flock_release = 1;
+ }
+
+ if (req->se->op.release) {
+ req->se->op.release(req, nodeid, &fi);
+ } else {
+ fuse_reply_err(req, 0);
+ }
+}
+
+static void do_fsync(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_fsync_in *arg;
+ struct fuse_file_info fi;
+ int datasync;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ datasync = arg->fsync_flags & 1;
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (req->se->op.fsync) {
+ if (fi.fh == (uint64_t)-1) {
+ req->se->op.fsync(req, nodeid, datasync, NULL);
+ } else {
+ req->se->op.fsync(req, nodeid, datasync, &fi);
+ }
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_opendir(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_open_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.flags = arg->flags;
+
+ if (req->se->op.opendir) {
+ req->se->op.opendir(req, nodeid, &fi);
+ } else {
+ fuse_reply_open(req, &fi);
+ }
+}
+
+static void do_readdir(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_read_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (req->se->op.readdir) {
+ req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_read_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (req->se->op.readdirplus) {
+ req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_release_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.flags = arg->flags;
+ fi.fh = arg->fh;
+
+ if (req->se->op.releasedir) {
+ req->se->op.releasedir(req, nodeid, &fi);
+ } else {
+ fuse_reply_err(req, 0);
+ }
+}
+
+static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_fsync_in *arg;
+ struct fuse_file_info fi;
+ int datasync;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ datasync = arg->fsync_flags & 1;
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (req->se->op.fsyncdir) {
+ req->se->op.fsyncdir(req, nodeid, datasync, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_statfs(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ (void)nodeid;
+ (void)iter;
+
+ if (req->se->op.statfs) {
+ req->se->op.statfs(req, nodeid);
+ } else {
+ struct statvfs buf = {
+ .f_namemax = 255,
+ .f_bsize = 512,
+ };
+ fuse_reply_statfs(req, &buf);
+ }
+}
+
+static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_setxattr_in *arg;
+ const char *name;
+ const char *value;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ name = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ value = fuse_mbuf_iter_advance(iter, arg->size);
+ if (!value) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.setxattr) {
+ req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_getxattr_in *arg;
+ const char *name;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ name = fuse_mbuf_iter_advance_str(iter);
+ if (!arg || !name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.getxattr) {
+ req->se->op.getxattr(req, nodeid, name, arg->size);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_getxattr_in *arg;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.listxattr) {
+ req->se->op.listxattr(req, nodeid, arg->size);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ const char *name = fuse_mbuf_iter_advance_str(iter);
+
+ if (!name) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.removexattr) {
+ req->se->op.removexattr(req, nodeid, name);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void convert_fuse_file_lock(struct fuse_file_lock *fl,
+ struct flock *flock)
+{
+ memset(flock, 0, sizeof(struct flock));
+ flock->l_type = fl->type;
+ flock->l_whence = SEEK_SET;
+ flock->l_start = fl->start;
+ if (fl->end == OFFSET_MAX) {
+ flock->l_len = 0;
+ } else {
+ flock->l_len = fl->end - fl->start + 1;
+ }
+ flock->l_pid = fl->pid;
+}
+
+static void do_getlk(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_lk_in *arg;
+ struct fuse_file_info fi;
+ struct flock flock;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fi.lock_owner = arg->owner;
+
+ convert_fuse_file_lock(&arg->lk, &flock);
+ if (req->se->op.getlk) {
+ req->se->op.getlk(req, nodeid, &fi, &flock);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter, int sleep)
+{
+ struct fuse_lk_in *arg;
+ struct fuse_file_info fi;
+ struct flock flock;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fi.lock_owner = arg->owner;
+
+ if (arg->lk_flags & FUSE_LK_FLOCK) {
+ int op = 0;
+
+ switch (arg->lk.type) {
+ case F_RDLCK:
+ op = LOCK_SH;
+ break;
+ case F_WRLCK:
+ op = LOCK_EX;
+ break;
+ case F_UNLCK:
+ op = LOCK_UN;
+ break;
+ }
+ if (!sleep) {
+ op |= LOCK_NB;
+ }
+
+ if (req->se->op.flock) {
+ req->se->op.flock(req, nodeid, &fi, op);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+ } else {
+ convert_fuse_file_lock(&arg->lk, &flock);
+ if (req->se->op.setlk) {
+ req->se->op.setlk(req, nodeid, &fi, &flock, sleep);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+ }
+}
+
+static void do_setlk(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ do_setlk_common(req, nodeid, iter, 0);
+}
+
+static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ do_setlk_common(req, nodeid, iter, 1);
+}
+
+static int find_interrupted(struct fuse_session *se, struct fuse_req *req)
+{
+ struct fuse_req *curr;
+
+ for (curr = se->list.next; curr != &se->list; curr = curr->next) {
+ if (curr->unique == req->u.i.unique) {
+ fuse_interrupt_func_t func;
+ void *data;
+
+ curr->ctr++;
+ pthread_mutex_unlock(&se->lock);
+
+ /* Ugh, ugly locking */
+ pthread_mutex_lock(&curr->lock);
+ pthread_mutex_lock(&se->lock);
+ curr->interrupted = 1;
+ func = curr->u.ni.func;
+ data = curr->u.ni.data;
+ pthread_mutex_unlock(&se->lock);
+ if (func) {
+ func(curr, data);
+ }
+ pthread_mutex_unlock(&curr->lock);
+
+ pthread_mutex_lock(&se->lock);
+ curr->ctr--;
+ if (!curr->ctr) {
+ destroy_req(curr);
+ }
+
+ return 1;
+ }
+ }
+ for (curr = se->interrupts.next; curr != &se->interrupts;
+ curr = curr->next) {
+ if (curr->u.i.unique == req->u.i.unique) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_interrupt_in *arg;
+ struct fuse_session *se = req->se;
+
+ (void)nodeid;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n",
+ (unsigned long long)arg->unique);
+
+ req->u.i.unique = arg->unique;
+
+ pthread_mutex_lock(&se->lock);
+ if (find_interrupted(se, req)) {
+ destroy_req(req);
+ } else {
+ list_add_req(req, &se->interrupts);
+ }
+ pthread_mutex_unlock(&se->lock);
+}
+
+static struct fuse_req *check_interrupt(struct fuse_session *se,
+ struct fuse_req *req)
+{
+ struct fuse_req *curr;
+
+ for (curr = se->interrupts.next; curr != &se->interrupts;
+ curr = curr->next) {
+ if (curr->u.i.unique == req->unique) {
+ req->interrupted = 1;
+ list_del_req(curr);
+ free(curr);
+ return NULL;
+ }
+ }
+ curr = se->interrupts.next;
+ if (curr != &se->interrupts) {
+ list_del_req(curr);
+ list_init_req(curr);
+ return curr;
+ } else {
+ return NULL;
+ }
+}
+
+static void do_bmap(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_bmap_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (req->se->op.bmap) {
+ req->se->op.bmap(req, nodeid, arg->blocksize, arg->block);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_ioctl_in *arg;
+ unsigned int flags;
+ void *in_buf = NULL;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ flags = arg->flags;
+ if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) {
+ fuse_reply_err(req, ENOTTY);
+ return;
+ }
+
+ if (arg->in_size) {
+ in_buf = fuse_mbuf_iter_advance(iter, arg->in_size);
+ if (!in_buf) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (sizeof(void *) == 4 && !(flags & FUSE_IOCTL_32BIT)) {
+ req->ioctl_64bit = 1;
+ }
+
+ if (req->se->op.ioctl) {
+ req->se->op.ioctl(req, nodeid, arg->cmd, (void *)(uintptr_t)arg->arg,
+ &fi, flags, in_buf, arg->in_size, arg->out_size);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+void fuse_pollhandle_destroy(struct fuse_pollhandle *ph)
+{
+ free(ph);
+}
+
+static void do_poll(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_poll_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+ fi.poll_events = arg->events;
+
+ if (req->se->op.poll) {
+ struct fuse_pollhandle *ph = NULL;
+
+ if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) {
+ ph = malloc(sizeof(struct fuse_pollhandle));
+ if (ph == NULL) {
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+ ph->kh = arg->kh;
+ ph->se = req->se;
+ }
+
+ req->se->op.poll(req, nodeid, &fi, ph);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_fallocate_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (req->se->op.fallocate) {
+ req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length,
+ &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_copy_file_range_in *arg;
+ struct fuse_file_info fi_in, fi_out;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ memset(&fi_in, 0, sizeof(fi_in));
+ fi_in.fh = arg->fh_in;
+
+ memset(&fi_out, 0, sizeof(fi_out));
+ fi_out.fh = arg->fh_out;
+
+
+ if (req->se->op.copy_file_range) {
+ req->se->op.copy_file_range(req, nodeid_in, arg->off_in, &fi_in,
+ arg->nodeid_out, arg->off_out, &fi_out,
+ arg->len, arg->flags);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_lseek(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_lseek_in *arg;
+ struct fuse_file_info fi;
+
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ memset(&fi, 0, sizeof(fi));
+ fi.fh = arg->fh;
+
+ if (req->se->op.lseek) {
+ req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
+static void do_init(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ size_t compat_size = offsetof(struct fuse_init_in, max_readahead);
+ struct fuse_init_in *arg;
+ struct fuse_init_out outarg;
+ struct fuse_session *se = req->se;
+ size_t bufsize = se->bufsize;
+ size_t outargsize = sizeof(outarg);
+
+ (void)nodeid;
+
+ /* First consume the old fields... */
+ arg = fuse_mbuf_iter_advance(iter, compat_size);
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ /* ...and now consume the new fields. */
+ if (arg->major == 7 && arg->minor >= 6) {
+ if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor);
+ if (arg->major == 7 && arg->minor >= 6) {
+ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags);
+ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead);
+ }
+ se->conn.proto_major = arg->major;
+ se->conn.proto_minor = arg->minor;
+ se->conn.capable = 0;
+ se->conn.want = 0;
+
+ memset(&outarg, 0, sizeof(outarg));
+ outarg.major = FUSE_KERNEL_VERSION;
+ outarg.minor = FUSE_KERNEL_MINOR_VERSION;
+
+ if (arg->major < 7 || (arg->major == 7 && arg->minor < 31)) {
+ fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n",
+ arg->major, arg->minor);
+ fuse_reply_err(req, EPROTO);
+ return;
+ }
+
+ if (arg->major > 7) {
+ /* Wait for a second INIT request with a 7.X version */
+ send_reply_ok(req, &outarg, sizeof(outarg));
+ return;
+ }
+
+ if (arg->max_readahead < se->conn.max_readahead) {
+ se->conn.max_readahead = arg->max_readahead;
+ }
+ if (arg->flags & FUSE_ASYNC_READ) {
+ se->conn.capable |= FUSE_CAP_ASYNC_READ;
+ }
+ if (arg->flags & FUSE_POSIX_LOCKS) {
+ se->conn.capable |= FUSE_CAP_POSIX_LOCKS;
+ }
+ if (arg->flags & FUSE_ATOMIC_O_TRUNC) {
+ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC;
+ }
+ if (arg->flags & FUSE_EXPORT_SUPPORT) {
+ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT;
+ }
+ if (arg->flags & FUSE_DONT_MASK) {
+ se->conn.capable |= FUSE_CAP_DONT_MASK;
+ }
+ if (arg->flags & FUSE_FLOCK_LOCKS) {
+ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS;
+ }
+ if (arg->flags & FUSE_AUTO_INVAL_DATA) {
+ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA;
+ }
+ if (arg->flags & FUSE_DO_READDIRPLUS) {
+ se->conn.capable |= FUSE_CAP_READDIRPLUS;
+ }
+ if (arg->flags & FUSE_READDIRPLUS_AUTO) {
+ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO;
+ }
+ if (arg->flags & FUSE_ASYNC_DIO) {
+ se->conn.capable |= FUSE_CAP_ASYNC_DIO;
+ }
+ if (arg->flags & FUSE_WRITEBACK_CACHE) {
+ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE;
+ }
+ if (arg->flags & FUSE_NO_OPEN_SUPPORT) {
+ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT;
+ }
+ if (arg->flags & FUSE_PARALLEL_DIROPS) {
+ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS;
+ }
+ if (arg->flags & FUSE_POSIX_ACL) {
+ se->conn.capable |= FUSE_CAP_POSIX_ACL;
+ }
+ if (arg->flags & FUSE_HANDLE_KILLPRIV) {
+ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV;
+ }
+ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) {
+ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT;
+ }
+ if (!(arg->flags & FUSE_MAX_PAGES)) {
+ size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() +
+ FUSE_BUFFER_HEADER_SIZE;
+ if (bufsize > max_bufsize) {
+ bufsize = max_bufsize;
+ }
+ }
+#ifdef HAVE_SPLICE
+#ifdef HAVE_VMSPLICE
+ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
+#endif
+ se->conn.capable |= FUSE_CAP_SPLICE_READ;
+#endif
+ se->conn.capable |= FUSE_CAP_IOCTL_DIR;
+
+ /*
+ * Default settings for modern filesystems.
+ *
+ * Most of these capabilities were disabled by default in
+ * libfuse2 for backwards compatibility reasons. In libfuse3,
+ * we can finally enable them by default (as long as they're
+ * supported by the kernel).
+ */
+#define LL_SET_DEFAULT(cond, cap) \
+ if ((cond) && (se->conn.capable & (cap))) \
+ se->conn.want |= (cap)
+ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ);
+ LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS);
+ LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA);
+ LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV);
+ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO);
+ LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR);
+ LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC);
+ LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ);
+ LL_SET_DEFAULT(se->op.getlk && se->op.setlk, FUSE_CAP_POSIX_LOCKS);
+ LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS);
+ LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS);
+ LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir,
+ FUSE_CAP_READDIRPLUS_AUTO);
+ se->conn.time_gran = 1;
+
+ if (bufsize < FUSE_MIN_READ_BUFFER) {
+ fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n",
+ bufsize);
+ bufsize = FUSE_MIN_READ_BUFFER;
+ }
+ se->bufsize = bufsize;
+
+ if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) {
+ se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE;
+ }
+
+ se->got_init = 1;
+ se->got_destroy = 0;
+ if (se->op.init) {
+ se->op.init(se->userdata, &se->conn);
+ }
+
+ if (se->conn.want & (~se->conn.capable)) {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: error: filesystem requested capabilities "
+ "0x%x that are not supported by kernel, aborting.\n",
+ se->conn.want & (~se->conn.capable));
+ fuse_reply_err(req, EPROTO);
+ se->error = -EPROTO;
+ fuse_session_exit(se);
+ return;
+ }
+
+ if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) {
+ se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE;
+ }
+ if (arg->flags & FUSE_MAX_PAGES) {
+ outarg.flags |= FUSE_MAX_PAGES;
+ outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1;
+ }
+
+ /*
+ * Always enable big writes, this is superseded
+     * Always enable big writes; this is superseded
+     * by the max_write option.
+ outarg.flags |= FUSE_BIG_WRITES;
+
+ if (se->conn.want & FUSE_CAP_ASYNC_READ) {
+ outarg.flags |= FUSE_ASYNC_READ;
+ }
+ if (se->conn.want & FUSE_CAP_PARALLEL_DIROPS) {
+ outarg.flags |= FUSE_PARALLEL_DIROPS;
+ }
+ if (se->conn.want & FUSE_CAP_POSIX_LOCKS) {
+ outarg.flags |= FUSE_POSIX_LOCKS;
+ }
+ if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) {
+ outarg.flags |= FUSE_ATOMIC_O_TRUNC;
+ }
+ if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) {
+ outarg.flags |= FUSE_EXPORT_SUPPORT;
+ }
+ if (se->conn.want & FUSE_CAP_DONT_MASK) {
+ outarg.flags |= FUSE_DONT_MASK;
+ }
+ if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) {
+ outarg.flags |= FUSE_FLOCK_LOCKS;
+ }
+ if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) {
+ outarg.flags |= FUSE_AUTO_INVAL_DATA;
+ }
+ if (se->conn.want & FUSE_CAP_READDIRPLUS) {
+ outarg.flags |= FUSE_DO_READDIRPLUS;
+ }
+ if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) {
+ outarg.flags |= FUSE_READDIRPLUS_AUTO;
+ }
+ if (se->conn.want & FUSE_CAP_ASYNC_DIO) {
+ outarg.flags |= FUSE_ASYNC_DIO;
+ }
+ if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) {
+ outarg.flags |= FUSE_WRITEBACK_CACHE;
+ }
+ if (se->conn.want & FUSE_CAP_POSIX_ACL) {
+ outarg.flags |= FUSE_POSIX_ACL;
+ }
+ outarg.max_readahead = se->conn.max_readahead;
+ outarg.max_write = se->conn.max_write;
+ if (se->conn.max_background >= (1 << 16)) {
+ se->conn.max_background = (1 << 16) - 1;
+ }
+ if (se->conn.congestion_threshold > se->conn.max_background) {
+ se->conn.congestion_threshold = se->conn.max_background;
+ }
+ if (!se->conn.congestion_threshold) {
+ se->conn.congestion_threshold = se->conn.max_background * 3 / 4;
+ }
+
+ outarg.max_background = se->conn.max_background;
+ outarg.congestion_threshold = se->conn.congestion_threshold;
+ outarg.time_gran = se->conn.time_gran;
+
+ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor);
+ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags);
+ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead);
+ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write);
+ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background);
+ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n",
+ outarg.congestion_threshold);
+ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran);
+
+ send_reply_ok(req, &outarg, outargsize);
+}
+
+static void do_destroy(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ struct fuse_session *se = req->se;
+
+ (void)nodeid;
+ (void)iter;
+
+ se->got_destroy = 1;
+ se->got_init = 0;
+ if (se->op.destroy) {
+ se->op.destroy(se->userdata);
+ }
+
+ send_reply_ok(req, NULL, 0);
+}
+
+static int send_notify_iov(struct fuse_session *se, int notify_code,
+ struct iovec *iov, int count)
+{
+ struct fuse_out_header out = {
+ .error = notify_code,
+ };
+
+ if (!se->got_init) {
+ return -ENOTCONN;
+ }
+
+ iov[0].iov_base = &out;
+ iov[0].iov_len = sizeof(struct fuse_out_header);
+
+ return fuse_send_msg(se, NULL, iov, count);
+}
+
+int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph)
+{
+ if (ph != NULL) {
+ struct fuse_notify_poll_wakeup_out outarg = {
+ .kh = ph->kh,
+ };
+ struct iovec iov[2];
+
+ iov[1].iov_base = &outarg;
+ iov[1].iov_len = sizeof(outarg);
+
+ return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2);
+ } else {
+ return 0;
+ }
+}
+
+int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino,
+ off_t off, off_t len)
+{
+ struct fuse_notify_inval_inode_out outarg = {
+ .ino = ino,
+ .off = off,
+ .len = len,
+ };
+ struct iovec iov[2];
+
+ if (!se) {
+ return -EINVAL;
+ }
+
+ iov[1].iov_base = &outarg;
+ iov[1].iov_len = sizeof(outarg);
+
+ return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2);
+}
+
+int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent,
+ const char *name, size_t namelen)
+{
+ struct fuse_notify_inval_entry_out outarg = {
+ .parent = parent,
+ .namelen = namelen,
+ };
+ struct iovec iov[3];
+
+ if (!se) {
+ return -EINVAL;
+ }
+
+ iov[1].iov_base = &outarg;
+ iov[1].iov_len = sizeof(outarg);
+ iov[2].iov_base = (void *)name;
+ iov[2].iov_len = namelen + 1;
+
+ return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3);
+}
+
+int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent,
+ fuse_ino_t child, const char *name,
+ size_t namelen)
+{
+ struct fuse_notify_delete_out outarg = {
+ .parent = parent,
+ .child = child,
+ .namelen = namelen,
+ };
+ struct iovec iov[3];
+
+ if (!se) {
+ return -EINVAL;
+ }
+
+ iov[1].iov_base = &outarg;
+ iov[1].iov_len = sizeof(outarg);
+ iov[2].iov_base = (void *)name;
+ iov[2].iov_len = namelen + 1;
+
+ return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3);
+}
+
+int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino,
+ off_t offset, struct fuse_bufvec *bufv)
+{
+ struct fuse_out_header out = {
+ .error = FUSE_NOTIFY_STORE,
+ };
+ struct fuse_notify_store_out outarg = {
+ .nodeid = ino,
+ .offset = offset,
+ .size = fuse_buf_size(bufv),
+ };
+ struct iovec iov[3];
+ int res;
+
+ if (!se) {
+ return -EINVAL;
+ }
+
+ iov[0].iov_base = &out;
+ iov[0].iov_len = sizeof(out);
+ iov[1].iov_base = &outarg;
+ iov[1].iov_len = sizeof(outarg);
+
+ res = fuse_send_data_iov(se, NULL, iov, 2, bufv);
+ if (res > 0) {
+ res = -res;
+ }
+
+ return res;
+}
+
+void *fuse_req_userdata(fuse_req_t req)
+{
+ return req->se->userdata;
+}
+
+const struct fuse_ctx *fuse_req_ctx(fuse_req_t req)
+{
+ return &req->ctx;
+}
+
+void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func,
+ void *data)
+{
+ pthread_mutex_lock(&req->lock);
+ pthread_mutex_lock(&req->se->lock);
+ req->u.ni.func = func;
+ req->u.ni.data = data;
+ pthread_mutex_unlock(&req->se->lock);
+ if (req->interrupted && func) {
+ func(req, data);
+ }
+ pthread_mutex_unlock(&req->lock);
+}
+
+int fuse_req_interrupted(fuse_req_t req)
+{
+ int interrupted;
+
+ pthread_mutex_lock(&req->se->lock);
+ interrupted = req->interrupted;
+ pthread_mutex_unlock(&req->se->lock);
+
+ return interrupted;
+}
+
+static struct {
+ void (*func)(fuse_req_t, fuse_ino_t, struct fuse_mbuf_iter *);
+ const char *name;
+} fuse_ll_ops[] = {
+ [FUSE_LOOKUP] = { do_lookup, "LOOKUP" },
+ [FUSE_FORGET] = { do_forget, "FORGET" },
+ [FUSE_GETATTR] = { do_getattr, "GETATTR" },
+ [FUSE_SETATTR] = { do_setattr, "SETATTR" },
+ [FUSE_READLINK] = { do_readlink, "READLINK" },
+ [FUSE_SYMLINK] = { do_symlink, "SYMLINK" },
+ [FUSE_MKNOD] = { do_mknod, "MKNOD" },
+ [FUSE_MKDIR] = { do_mkdir, "MKDIR" },
+ [FUSE_UNLINK] = { do_unlink, "UNLINK" },
+ [FUSE_RMDIR] = { do_rmdir, "RMDIR" },
+ [FUSE_RENAME] = { do_rename, "RENAME" },
+ [FUSE_LINK] = { do_link, "LINK" },
+ [FUSE_OPEN] = { do_open, "OPEN" },
+ [FUSE_READ] = { do_read, "READ" },
+ [FUSE_WRITE] = { do_write, "WRITE" },
+ [FUSE_STATFS] = { do_statfs, "STATFS" },
+ [FUSE_RELEASE] = { do_release, "RELEASE" },
+ [FUSE_FSYNC] = { do_fsync, "FSYNC" },
+ [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" },
+ [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" },
+ [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" },
+ [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" },
+ [FUSE_FLUSH] = { do_flush, "FLUSH" },
+ [FUSE_INIT] = { do_init, "INIT" },
+ [FUSE_OPENDIR] = { do_opendir, "OPENDIR" },
+ [FUSE_READDIR] = { do_readdir, "READDIR" },
+ [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" },
+ [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" },
+ [FUSE_GETLK] = { do_getlk, "GETLK" },
+ [FUSE_SETLK] = { do_setlk, "SETLK" },
+ [FUSE_SETLKW] = { do_setlkw, "SETLKW" },
+ [FUSE_ACCESS] = { do_access, "ACCESS" },
+ [FUSE_CREATE] = { do_create, "CREATE" },
+ [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" },
+ [FUSE_BMAP] = { do_bmap, "BMAP" },
+ [FUSE_IOCTL] = { do_ioctl, "IOCTL" },
+ [FUSE_POLL] = { do_poll, "POLL" },
+ [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" },
+ [FUSE_DESTROY] = { do_destroy, "DESTROY" },
+ [FUSE_NOTIFY_REPLY] = { NULL, "NOTIFY_REPLY" },
+ [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" },
+ [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" },
+ [FUSE_RENAME2] = { do_rename2, "RENAME2" },
+ [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" },
+ [FUSE_LSEEK] = { do_lseek, "LSEEK" },
+};
+
+#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0]))
+
+static const char *opname(enum fuse_opcode opcode)
+{
+ if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) {
+ return "???";
+ } else {
+ return fuse_ll_ops[opcode].name;
+ }
+}
+
+void fuse_session_process_buf(struct fuse_session *se,
+ const struct fuse_buf *buf)
+{
+ struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 };
+ fuse_session_process_buf_int(se, &bufv, NULL);
+}
+
+/*
+ * Restriction:
+ * bufv is normally a single-entry buffer, except for a write request,
+ * where (if it is in memory) the bufv may have multiple entries: the
+ * first entry contains all the headers and subsequent entries contain
+ * the data.
+ * bufv shall not use any offsets etc. to make the data anything other
+ * than contiguous starting from 0.
+ */
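+/*
+ * As a layout sketch of what this restriction implies for a FUSE_WRITE
+ * delivered out of guest memory (illustrative, not a wire-format
+ * specification):
+ *
+ *   bufv->buf[0]          fuse_in_header + fuse_write_in (in memory)
+ *   bufv->buf[1..count-1] the write payload
+ *
+ * do_write_buf() relies on exactly this layout when it zeroes
+ * buf[0].size to skip the header element.
+ */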
+void fuse_session_process_buf_int(struct fuse_session *se,
+ struct fuse_bufvec *bufv,
+ struct fuse_chan *ch)
+{
+ const struct fuse_buf *buf = bufv->buf;
+ struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf);
+ struct fuse_in_header *in;
+ struct fuse_req *req;
+ int err;
+
+ /* The first buffer must be a memory buffer */
+ assert(!(buf->flags & FUSE_BUF_IS_FD));
+
+ in = fuse_mbuf_iter_advance(&iter, sizeof(*in));
+ assert(in); /* caller guarantees the input buffer is large enough */
+
+ fuse_log(
+ FUSE_LOG_DEBUG,
+ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n",
+ (unsigned long long)in->unique, opname((enum fuse_opcode)in->opcode),
+ in->opcode, (unsigned long long)in->nodeid, buf->size, in->pid);
+
+ req = fuse_ll_alloc_req(se);
+ if (req == NULL) {
+ struct fuse_out_header out = {
+ .unique = in->unique,
+ .error = -ENOMEM,
+ };
+ struct iovec iov = {
+ .iov_base = &out,
+ .iov_len = sizeof(struct fuse_out_header),
+ };
+
+ fuse_send_msg(se, ch, &iov, 1);
+ return;
+ }
+
+ req->unique = in->unique;
+ req->ctx.uid = in->uid;
+ req->ctx.gid = in->gid;
+ req->ctx.pid = in->pid;
+ req->ch = ch;
+
+ /*
+ * INIT and DESTROY requests are serialized, all other request types
+ * run in parallel. This prevents races between FUSE_INIT and ordinary
+ * requests, FUSE_INIT and FUSE_INIT, FUSE_INIT and FUSE_DESTROY, and
+ * FUSE_DESTROY and FUSE_DESTROY.
+ */
+ if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT ||
+ in->opcode == FUSE_DESTROY) {
+ pthread_rwlock_wrlock(&se->init_rwlock);
+ } else {
+ pthread_rwlock_rdlock(&se->init_rwlock);
+ }
+
+ err = EIO;
+ if (!se->got_init) {
+ enum fuse_opcode expected;
+
+ expected = se->cuse_data ? CUSE_INIT : FUSE_INIT;
+ if (in->opcode != expected) {
+ goto reply_err;
+ }
+ } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) {
+ if (fuse_lowlevel_is_virtio(se)) {
+ /*
+             * TODO: This typically happens after a hard reboot. We need
+             * to do a destroy, but we can't reply to this request yet,
+             * so we can't use do_destroy.
+ */
+ fuse_log(FUSE_LOG_DEBUG, "%s: reinit\n", __func__);
+ se->got_destroy = 1;
+ se->got_init = 0;
+ if (se->op.destroy) {
+ se->op.destroy(se->userdata);
+ }
+ } else {
+ goto reply_err;
+ }
+ }
+
+ err = EACCES;
+ /* Implement -o allow_root */
+ if (se->deny_others && in->uid != se->owner && in->uid != 0 &&
+ in->opcode != FUSE_INIT && in->opcode != FUSE_READ &&
+ in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC &&
+ in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR &&
+ in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR &&
+ in->opcode != FUSE_NOTIFY_REPLY && in->opcode != FUSE_READDIRPLUS) {
+ goto reply_err;
+ }
+
+ err = ENOSYS;
+ if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) {
+ goto reply_err;
+ }
+ if (in->opcode != FUSE_INTERRUPT) {
+ struct fuse_req *intr;
+ pthread_mutex_lock(&se->lock);
+ intr = check_interrupt(se, req);
+ list_add_req(req, &se->list);
+ pthread_mutex_unlock(&se->lock);
+ if (intr) {
+ fuse_reply_err(intr, EAGAIN);
+ }
+ }
+
+ if (in->opcode == FUSE_WRITE && se->op.write_buf) {
+ do_write_buf(req, in->nodeid, &iter, bufv);
+ } else {
+ fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter);
+ }
+
+ pthread_rwlock_unlock(&se->init_rwlock);
+ return;
+
+reply_err:
+ fuse_reply_err(req, err);
+ pthread_rwlock_unlock(&se->init_rwlock);
+}
+
+#define LL_OPTION(n, o, v) \
+ { \
+ n, offsetof(struct fuse_session, o), v \
+ }
+
+static const struct fuse_opt fuse_ll_opts[] = {
+ LL_OPTION("debug", debug, 1),
+ LL_OPTION("-d", debug, 1),
+ LL_OPTION("--debug", debug, 1),
+ LL_OPTION("allow_root", deny_others, 1),
+ LL_OPTION("--socket-path=%s", vu_socket_path, 0),
+ LL_OPTION("--fd=%d", vu_listen_fd, 0),
+ LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0),
+ FUSE_OPT_END
+};
+
+void fuse_lowlevel_version(void)
+{
+ printf("using FUSE kernel interface version %i.%i\n", FUSE_KERNEL_VERSION,
+ FUSE_KERNEL_MINOR_VERSION);
+}
+
+void fuse_lowlevel_help(void)
+{
+ /*
+ * These are not all options, but the ones that are
+ * potentially of interest to an end-user
+ */
+ printf(
+ " -o allow_root allow access by root\n"
+ " --socket-path=PATH path for the vhost-user socket\n"
+ " --fd=FDNUM fd number of vhost-user socket\n"
+ " --thread-pool-size=NUM thread pool size limit (default %d)\n",
+ THREAD_POOL_SIZE);
+}
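+/*
+ * For example, a daemon built on this layer (binary name and paths are
+ * illustrative) might be started as:
+ *
+ *   ./vhost-user-fs-daemon --socket-path=/tmp/vhostqemu --thread-pool-size=16
+ *
+ * --socket-path and --fd are mutually exclusive; fuse_session_new()
+ * below rejects the combination.
+ */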
+
+void fuse_session_destroy(struct fuse_session *se)
+{
+ if (se->got_init && !se->got_destroy) {
+ if (se->op.destroy) {
+ se->op.destroy(se->userdata);
+ }
+ }
+ pthread_rwlock_destroy(&se->init_rwlock);
+ pthread_mutex_destroy(&se->lock);
+ free(se->cuse_data);
+ if (se->fd != -1) {
+ close(se->fd);
+ }
+
+ if (fuse_lowlevel_is_virtio(se)) {
+ virtio_session_close(se);
+ }
+
+ free(se->vu_socket_path);
+ se->vu_socket_path = NULL;
+
+ free(se);
+}
+
+
+struct fuse_session *fuse_session_new(struct fuse_args *args,
+ const struct fuse_lowlevel_ops *op,
+ size_t op_size, void *userdata)
+{
+ struct fuse_session *se;
+
+ if (sizeof(struct fuse_lowlevel_ops) < op_size) {
+ fuse_log(
+ FUSE_LOG_ERR,
+ "fuse: warning: library too old, some operations may not work\n");
+ op_size = sizeof(struct fuse_lowlevel_ops);
+ }
+
+ if (args->argc == 0) {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: empty argv passed to fuse_session_new().\n");
+ return NULL;
+ }
+
+ se = (struct fuse_session *)calloc(1, sizeof(struct fuse_session));
+ if (se == NULL) {
+ fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n");
+ goto out1;
+ }
+ se->fd = -1;
+ se->vu_listen_fd = -1;
+ se->thread_pool_size = THREAD_POOL_SIZE;
+ se->conn.max_write = UINT_MAX;
+ se->conn.max_readahead = UINT_MAX;
+
+ /* Parse options */
+ if (fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) {
+ goto out2;
+ }
+ if (args->argc == 1 && args->argv[0][0] == '-') {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: warning: argv[0] looks like an option, but "
+ "will be ignored\n");
+ } else if (args->argc != 1) {
+ int i;
+ fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `");
+ for (i = 1; i < args->argc - 1; i++) {
+ fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]);
+ }
+ fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]);
+ goto out4;
+ }
+
+ if (!se->vu_socket_path && se->vu_listen_fd < 0) {
+ fuse_log(FUSE_LOG_ERR, "fuse: missing --socket-path or --fd option\n");
+ goto out4;
+ }
+ if (se->vu_socket_path && se->vu_listen_fd >= 0) {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: --socket-path and --fd cannot be given together\n");
+ goto out4;
+ }
+
+ se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE;
+
+ list_init_req(&se->list);
+ list_init_req(&se->interrupts);
+ fuse_mutex_init(&se->lock);
+ pthread_rwlock_init(&se->init_rwlock, NULL);
+
+ memcpy(&se->op, op, op_size);
+ se->owner = getuid();
+ se->userdata = userdata;
+
+ return se;
+
+out4:
+ fuse_opt_free_args(args);
+out2:
+ free(se);
+out1:
+ return NULL;
+}
+
+int fuse_session_mount(struct fuse_session *se)
+{
+ return virtio_session_mount(se);
+}
+
+int fuse_session_fd(struct fuse_session *se)
+{
+ return se->fd;
+}
+
+void fuse_session_unmount(struct fuse_session *se)
+{
+}
+
+int fuse_lowlevel_is_virtio(struct fuse_session *se)
+{
+ return !!se->virtio_dev;
+}
+
+#ifdef linux
+int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[])
+{
+ char *buf;
+ size_t bufsize = 1024;
+ char path[128];
+ int ret;
+ int fd;
+ unsigned long pid = req->ctx.pid;
+ char *s;
+
+ sprintf(path, "/proc/%lu/task/%lu/status", pid, pid);
+
+retry:
+ buf = malloc(bufsize);
+ if (buf == NULL) {
+ return -ENOMEM;
+ }
+
+ ret = -EIO;
+ fd = open(path, O_RDONLY);
+ if (fd == -1) {
+ goto out_free;
+ }
+
+ ret = read(fd, buf, bufsize);
+ close(fd);
+ if (ret < 0) {
+ ret = -EIO;
+ goto out_free;
+ }
+
+ if ((size_t)ret == bufsize) {
+ free(buf);
+ bufsize *= 4;
+ goto retry;
+ }
+
+ ret = -EIO;
+ s = strstr(buf, "\nGroups:");
+ if (s == NULL) {
+ goto out_free;
+ }
+
+ s += 8;
+ ret = 0;
+ while (1) {
+ char *end;
+ unsigned long val = strtoul(s, &end, 0);
+ if (end == s) {
+ break;
+ }
+
+ s = end;
+ if (ret < size) {
+ list[ret] = val;
+ }
+ ret++;
+ }
+
+out_free:
+ free(buf);
+ return ret;
+}
+#else /* linux */
+/*
+ * This is currently implemented only on Linux; other platforms
+ * return -ENOSYS.
+ */
+int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[])
+{
+ (void)req;
+ (void)size;
+ (void)list;
+ return -ENOSYS;
+}
+#endif
+
+void fuse_session_exit(struct fuse_session *se)
+{
+ se->exited = 1;
+}
+
+void fuse_session_reset(struct fuse_session *se)
+{
+ se->exited = 0;
+ se->error = 0;
+}
+
+int fuse_session_exited(struct fuse_session *se)
+{
+ return se->exited;
+}
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
new file mode 100644
index 0000000000..138041e5f1
--- /dev/null
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -0,0 +1,1991 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_LOWLEVEL_H_
+#define FUSE_LOWLEVEL_H_
+
+/**
+ * @file
+ *
+ * Low level API
+ *
+ * IMPORTANT: you should define FUSE_USE_VERSION before including this
+ * header. To use the newest API define it to 31 (recommended for any
+ * new application).
+ */
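+/*
+ * A minimal sketch of the version requirement described above:
+ *
+ *   #define FUSE_USE_VERSION 31
+ *   #include "fuse_lowlevel.h"
+ */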
+
+#ifndef FUSE_USE_VERSION
+#error FUSE_USE_VERSION not defined
+#endif
+
+#include "fuse_common.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <utime.h>
+
+/*
+ * Miscellaneous definitions
+ */
+
+/** The node ID of the root inode */
+#define FUSE_ROOT_ID 1
+
+/** Inode number type */
+typedef uint64_t fuse_ino_t;
+
+/** Request pointer type */
+typedef struct fuse_req *fuse_req_t;
+
+/**
+ * Session
+ *
+ * This provides hooks for processing requests, and exiting
+ */
+struct fuse_session;
+
+/** Directory entry parameters supplied to fuse_reply_entry() */
+struct fuse_entry_param {
+ /**
+ * Unique inode number
+ *
+     * In lookup, zero means negative entry (from version 2.5).
+     * Returning ENOENT also means negative entry, but by setting a
+     * zero ino the kernel may cache negative entries for
+     * entry_timeout seconds.
+ */
+ fuse_ino_t ino;
+
+ /**
+ * Generation number for this entry.
+ *
+ * If the file system will be exported over NFS, the
+ * ino/generation pairs need to be unique over the file
+ * system's lifetime (rather than just the mount time). So if
+ * the file system reuses an inode after it has been deleted,
+ * it must assign a new, previously unused generation number
+ * to the inode at the same time.
+ *
+ */
+ uint64_t generation;
+
+ /**
+ * Inode attributes.
+ *
+ * Even if attr_timeout == 0, attr must be correct. For example,
+ * for open(), FUSE uses attr.st_size from lookup() to determine
+ * how many bytes to request. If this value is not correct,
+ * incorrect data will be returned.
+ */
+ struct stat attr;
+
+ /**
+ * Validity timeout (in seconds) for inode attributes. If
+ * attributes only change as a result of requests that come
+ * through the kernel, this should be set to a very large
+ * value.
+ */
+ double attr_timeout;
+
+ /**
+ * Validity timeout (in seconds) for the name. If directory
+ * entries are changed/deleted only as a result of requests
+ * that come through the kernel, this should be set to a very
+ * large value.
+ */
+ double entry_timeout;
+};
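+/*
+ * A minimal sketch of how a lookup handler might fill this structure and
+ * pass it to fuse_reply_entry(); lo_resolve() is a hypothetical helper
+ * that maps (parent, name) to an inode number and attributes:
+ *
+ *   static void xmp_lookup(fuse_req_t req, fuse_ino_t parent,
+ *                          const char *name)
+ *   {
+ *       struct fuse_entry_param e = { 0 };
+ *
+ *       if (lo_resolve(parent, name, &e.ino, &e.attr) < 0) {
+ *           fuse_reply_err(req, ENOENT);
+ *           return;
+ *       }
+ *       e.attr_timeout = 1.0;
+ *       e.entry_timeout = 1.0;
+ *       fuse_reply_entry(req, &e);
+ *   }
+ */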
+
+/**
+ * Additional context associated with requests.
+ *
+ * Note that the reported client uid, gid and pid may be zero in some
+ * situations. For example, if the FUSE file system is running in a
+ * PID or user namespace but then accessed from outside the namespace,
+ * there is no valid uid/pid/gid that could be reported.
+ */
+struct fuse_ctx {
+ /** User ID of the calling process */
+ uid_t uid;
+
+ /** Group ID of the calling process */
+ gid_t gid;
+
+ /** Thread ID of the calling process */
+ pid_t pid;
+
+ /** Umask of the calling process */
+ mode_t umask;
+};
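+/*
+ * Handlers obtain this context through fuse_req_ctx(). A sketch (the
+ * access policy shown is purely illustrative):
+ *
+ *   static void xmp_access(fuse_req_t req, fuse_ino_t ino, int mask)
+ *   {
+ *       const struct fuse_ctx *ctx = fuse_req_ctx(req);
+ *
+ *       if (ctx->uid != 0 && ctx->uid != getuid()) {
+ *           fuse_reply_err(req, EACCES);
+ *           return;
+ *       }
+ *       fuse_reply_err(req, 0);
+ *   }
+ */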
+
+struct fuse_forget_data {
+ fuse_ino_t ino;
+ uint64_t nlookup;
+};
+
+/* 'to_set' flags in setattr */
+#define FUSE_SET_ATTR_MODE (1 << 0)
+#define FUSE_SET_ATTR_UID (1 << 1)
+#define FUSE_SET_ATTR_GID (1 << 2)
+#define FUSE_SET_ATTR_SIZE (1 << 3)
+#define FUSE_SET_ATTR_ATIME (1 << 4)
+#define FUSE_SET_ATTR_MTIME (1 << 5)
+#define FUSE_SET_ATTR_ATIME_NOW (1 << 7)
+#define FUSE_SET_ATTR_MTIME_NOW (1 << 8)
+#define FUSE_SET_ATTR_CTIME (1 << 10)
+
+/*
+ * Request methods and replies
+ */
+
+/**
+ * Low level filesystem operations
+ *
+ * Most of the methods (with the exception of init and destroy)
+ * receive a request handle (fuse_req_t) as their first argument.
+ * This handle must be passed to one of the specified reply functions.
+ *
+ * This may be done inside the method invocation, or after the call
+ * has returned. The request handle is valid until one of the reply
+ * functions is called.
+ *
+ * Other pointer arguments (name, fuse_file_info, etc) are not valid
+ * after the call has returned, so if they are needed later, their
+ * contents have to be copied.
+ *
+ * In general, all methods are expected to perform any necessary
+ * permission checking. However, a filesystem may delegate this task
+ * to the kernel by passing the `default_permissions` mount option to
+ * `fuse_session_new()`. In this case, methods will only be called if
+ * the kernel's permission check has succeeded.
+ *
+ * The filesystem sometimes needs to handle a return value of -ENOENT
+ * from the reply function, which means that the request was
+ * interrupted and the reply discarded. For example, if
+ * fuse_reply_open() returns -ENOENT, the release method for this
+ * file will not be called.
+ */
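+/*
+ * A sketch of how such an operations table is typically wired up; the
+ * xmp_* handlers are hypothetical and "args" is a parsed struct
+ * fuse_args (fuse_session_new() is declared later in this header):
+ *
+ *   static const struct fuse_lowlevel_ops xmp_ops = {
+ *       .init    = xmp_init,
+ *       .lookup  = xmp_lookup,
+ *       .getattr = xmp_getattr,
+ *   };
+ *
+ *   struct fuse_session *se;
+ *   se = fuse_session_new(&args, &xmp_ops, sizeof(xmp_ops), NULL);
+ */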
+struct fuse_lowlevel_ops {
+ /**
+ * Initialize filesystem
+ *
+ * This function is called when libfuse establishes
+ * communication with the FUSE kernel module. The file system
+ * should use this module to inspect and/or modify the
+ * connection parameters provided in the `conn` structure.
+ *
+ * Note that some parameters may be overwritten by options
+ * passed to fuse_session_new() which take precedence over the
+ * values set in this handler.
+ *
+ * There's no reply to this function
+ *
+ * @param userdata the user data passed to fuse_session_new()
+ */
+ void (*init)(void *userdata, struct fuse_conn_info *conn);
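+    /*
+     * A sketch of an init handler that opts into a capability only when
+     * the kernel advertises it (purely illustrative):
+     *
+     *   static void xmp_init(void *userdata, struct fuse_conn_info *conn)
+     *   {
+     *       if (conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
+     *           conn->want |= FUSE_CAP_WRITEBACK_CACHE;
+     *       }
+     *   }
+     */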
+
+ /**
+ * Clean up filesystem.
+ *
+ * Called on filesystem exit. When this method is called, the
+ * connection to the kernel may be gone already, so that eg. calls
+ * to fuse_lowlevel_notify_* will fail.
+ *
+ * There's no reply to this function
+ *
+ * @param userdata the user data passed to fuse_session_new()
+ */
+ void (*destroy)(void *userdata);
+
+ /**
+ * Look up a directory entry by name and get its attributes.
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name the name to look up
+ */
+ void (*lookup)(fuse_req_t req, fuse_ino_t parent, const char *name);
+
+ /**
+ * Forget about an inode
+ *
+ * This function is called when the kernel removes an inode
+ * from its internal caches.
+ *
+ * The inode's lookup count increases by one for every call to
+ * fuse_reply_entry and fuse_reply_create. The nlookup parameter
+ * indicates by how much the lookup count should be decreased.
+ *
+     * Inodes with a non-zero lookup count may receive requests from
+ * the kernel even after calls to unlink, rmdir or (when
+ * overwriting an existing file) rename. Filesystems must handle
+ * such requests properly and it is recommended to defer removal
+ * of the inode until the lookup count reaches zero. Calls to
+ * unlink, rmdir or rename will be followed closely by forget
+ * unless the file or directory is open, in which case the
+ * kernel issues forget only after the release or releasedir
+ * calls.
+ *
+ * Note that if a file system will be exported over NFS the
+ * inode's lifetime must extend even beyond forget. See the
+ * generation field in struct fuse_entry_param above.
+ *
+ * On unmount the lookup count for all inodes implicitly drops
+ * to zero. It is not guaranteed that the file system will
+ * receive corresponding forget messages for the affected
+ * inodes.
+ *
+ * Valid replies:
+ * fuse_reply_none
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param nlookup the number of lookups to forget
+ */
+ void (*forget)(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup);
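+
+ /*
+ * An illustrative sketch of a forget handler for a file system that keeps
+ * an in-memory inode table; xmp_unref_inode() is a hypothetical helper
+ * that drops 'nlookup' references and frees the inode once the count
+ * reaches zero.
+ *
+ *   static void xmp_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
+ *   {
+ *       xmp_unref_inode(ino, nlookup);
+ *       fuse_reply_none(req);  // forget must not send an ordinary reply
+ *   }
+ */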
+
+ /**
+ * Get file attributes.
+ *
+ * If writeback caching is enabled, the kernel may have a
+ * better idea of a file's length than the FUSE file system
+ * (e.g. if there has been a write that extended the file size,
+ * but that has not yet been passed to the filesystem).
+ *
+ * In this case, the st_size value provided by the file system
+ * will be ignored.
+ *
+ * Valid replies:
+ * fuse_reply_attr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi for future use, currently always NULL
+ */
+ void (*getattr)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
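+
+ /*
+ * An illustrative sketch of a passthrough-style getattr handler;
+ * xmp_fd() is a hypothetical helper returning a backing file
+ * descriptor for the inode.
+ *
+ *   static void xmp_getattr(fuse_req_t req, fuse_ino_t ino,
+ *                           struct fuse_file_info *fi)
+ *   {
+ *       struct stat st;
+ *
+ *       (void)fi;  // currently always NULL, see above
+ *       if (fstat(xmp_fd(ino), &st) == -1) {
+ *           fuse_reply_err(req, errno);
+ *       } else {
+ *           fuse_reply_attr(req, &st, 1.0);  // one second attribute timeout
+ *       }
+ *   }
+ */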
+
+ /**
+ * Set file attributes
+ *
+ * In the 'attr' argument only members indicated by the 'to_set'
+ * bitmask contain valid values. Other members contain undefined
+ * values.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits if the file
+ * size or owner is being changed.
+ *
+ * If the setattr was invoked from the ftruncate() system call
+ * under Linux kernel versions 2.6.15 or later, the fi->fh will
+ * contain the value set by the open method or will be undefined
+ * if the open method didn't set any value. Otherwise (not
+ * ftruncate call, or kernel version earlier than 2.6.15) the fi
+ * parameter will be NULL.
+ *
+ * Valid replies:
+ * fuse_reply_attr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param attr the attributes
+ * @param to_set bit mask of attributes which should be set
+ * @param fi file information, or NULL
+ */
+ void (*setattr)(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
+ int to_set, struct fuse_file_info *fi);
+
+ /**
+ * Read symbolic link
+ *
+ * Valid replies:
+ * fuse_reply_readlink
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ */
+ void (*readlink)(fuse_req_t req, fuse_ino_t ino);
+
+ /**
+ * Create file node
+ *
+ * Create a regular file, character device, block device, fifo or
+ * socket node.
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to create
+ * @param mode file type and mode with which to create the new file
+ * @param rdev the device number (only valid if created file is a device)
+ */
+ void (*mknod)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, dev_t rdev);
+
+ /**
+ * Create a directory
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to create
+ * @param mode with which to create the new file
+ */
+ void (*mkdir)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode);
+
+ /**
+ * Remove a file
+ *
+ * If the file's inode's lookup count is non-zero, the file
+ * system is expected to postpone any removal of the inode
+ * until the lookup count reaches zero (see description of the
+ * forget function).
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to remove
+ */
+ void (*unlink)(fuse_req_t req, fuse_ino_t parent, const char *name);
+
+ /**
+ * Remove a directory
+ *
+ * If the directory's inode's lookup count is non-zero, the
+ * file system is expected to postpone any removal of the
+ * inode until the lookup count reaches zero (see description
+ * of the forget function).
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to remove
+ */
+ void (*rmdir)(fuse_req_t req, fuse_ino_t parent, const char *name);
+
+ /**
+ * Create a symbolic link
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param link the contents of the symbolic link
+ * @param parent inode number of the parent directory
+ * @param name to create
+ */
+ void (*symlink)(fuse_req_t req, const char *link, fuse_ino_t parent,
+ const char *name);
+
+ /**
+ * Rename a file
+ *
+ * If the target exists it should be atomically replaced. If
+ * the target's inode's lookup count is non-zero, the file
+ * system is expected to postpone any removal of the inode
+ * until the lookup count reaches zero (see description of the
+ * forget function).
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EINVAL, i.e. all
+ * future rename requests with flags will fail with EINVAL without being
+ * sent to the filesystem process.
+ *
+ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If
+ * RENAME_NOREPLACE is specified, the filesystem must not
+ * overwrite *newname* if it exists and return an error
+ * instead. If `RENAME_EXCHANGE` is specified, the filesystem
+ * must atomically exchange the two files, i.e. both must
+ * exist and neither may be deleted.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the old parent directory
+ * @param name old name
+ * @param newparent inode number of the new parent directory
+ * @param newname new name
+ */
+ void (*rename)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ fuse_ino_t newparent, const char *newname,
+ unsigned int flags);
+
+ /**
+ * Create a hard link
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the old inode number
+ * @param newparent inode number of the new parent directory
+ * @param newname new name to create
+ */
+ void (*link)(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent,
+ const char *newname);
+
+ /**
+ * Open a file
+ *
+ * Open flags are available in fi->flags. The following rules
+ * apply.
+ *
+ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be
+ * filtered out / handled by the kernel.
+ *
+ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used
+ * by the filesystem to check if the operation is
+ * permitted. If the ``-o default_permissions`` mount
+ * option is given, this check is already done by the
+ * kernel before calling open() and may thus be omitted by
+ * the filesystem.
+ *
+ * - When writeback caching is enabled, the kernel may send
+ * read requests even for files opened with O_WRONLY. The
+ * filesystem should be prepared to handle this.
+ *
+ * - When writeback caching is disabled, the filesystem is
+ * expected to properly handle the O_APPEND flag and ensure
+ * that each write is appending to the end of the file.
+ *
+ * - When writeback caching is enabled, the kernel will
+ * handle O_APPEND. However, unless all changes to the file
+ * come through the kernel this will not work reliably. The
+ * filesystem should thus either ignore the O_APPEND flag
+ * (and let the kernel handle it), or return an error
+ * (indicating that reliable O_APPEND is not available).
+ *
+ * The filesystem may store an arbitrary file handle (pointer,
+ * index, etc) in fi->fh, and use this in all other file
+ * operations (read, write, flush, release, fsync).
+ *
+ * Filesystem may also implement stateless file I/O and not store
+ * anything in fi->fh.
+ *
+ * There are also some flags (direct_io, keep_cache) which the
+ * filesystem may set in fi, to change the way the file is opened.
+ * See fuse_file_info structure in <fuse_common.h> for more details.
+ *
+ * If this request is answered with an error code of ENOSYS
+ * and FUSE_CAP_NO_OPEN_SUPPORT is set in
+ * `fuse_conn_info.capable`, this is treated as success and
+ * future calls to open and release will also succeed without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_open
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*open)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
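+
+ /*
+ * An illustrative sketch of an open handler that keeps a host file
+ * descriptor in fi->fh so that later read/write/release calls can reuse
+ * it; xmp_path() is a hypothetical helper mapping the inode to a path.
+ *
+ *   static void xmp_open(fuse_req_t req, fuse_ino_t ino,
+ *                        struct fuse_file_info *fi)
+ *   {
+ *       int fd = open(xmp_path(ino), fi->flags);
+ *       if (fd == -1) {
+ *           fuse_reply_err(req, errno);
+ *           return;
+ *       }
+ *       fi->fh = fd;               // arbitrary per-open handle
+ *       fuse_reply_open(req, fi);  // hands fi (including fh) to the kernel
+ *   }
+ */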
+
+ /**
+ * Read data
+ *
+ * Read should send exactly the number of bytes requested except
+ * on EOF or error, otherwise the rest of the data will be
+ * substituted with zeroes. An exception to this is when the file
+ * has been opened in 'direct_io' mode, in which case the return
+ * value of the read system call will reflect the return value of
+ * this operation.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_iov
+ * fuse_reply_data
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size number of bytes to read
+ * @param off offset to read from
+ * @param fi file information
+ */
+ void (*read)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+ struct fuse_file_info *fi);
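+
+ /*
+ * An illustrative sketch of a read handler matching the open sketch
+ * above, which stored a file descriptor in fi->fh; it replies with at
+ * most 'size' bytes read from offset 'off'.
+ *
+ *   static void xmp_read(fuse_req_t req, fuse_ino_t ino, size_t size,
+ *                        off_t off, struct fuse_file_info *fi)
+ *   {
+ *       char *tmp = malloc(size);
+ *       ssize_t res;
+ *
+ *       (void)ino;
+ *       if (!tmp) {
+ *           fuse_reply_err(req, ENOMEM);
+ *           return;
+ *       }
+ *       res = pread(fi->fh, tmp, size, off);
+ *       if (res == -1) {
+ *           fuse_reply_err(req, errno);
+ *       } else {
+ *           fuse_reply_buf(req, tmp, res);  // a short count signals EOF
+ *       }
+ *       free(tmp);  // the reply has already been sent at this point
+ *   }
+ */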
+
+ /**
+ * Write data
+ *
+ * Write should return exactly the number of bytes requested
+ * except on error. An exception to this is when the file has
+ * been opened in 'direct_io' mode, in which case the return value
+ * of the write system call will reflect the return value of this
+ * operation.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ *
+ * Valid replies:
+ * fuse_reply_write
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param buf data to write
+ * @param size number of bytes to write
+ * @param off offset to write to
+ * @param fi file information
+ */
+ void (*write)(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size,
+ off_t off, struct fuse_file_info *fi);
+
+ /**
+ * Flush method
+ *
+ * This is called on each close() of the opened file.
+ *
+ * Since file descriptors can be duplicated (dup, dup2, fork), for
+ * one open call there may be many flush calls.
+ *
+ * Filesystems shouldn't assume that flush will always be called
+ * after some writes, or that it will be called at all.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ *
+ * NOTE: the name of the method is misleading, since (unlike
+ * fsync) the filesystem is not forced to flush pending writes.
+ * One reason to flush data is if the filesystem wants to return
+ * write errors during close. However, such use is non-portable
+ * because POSIX does not require [close] to wait for delayed I/O to
+ * complete.
+ *
+ * If the filesystem supports file locking operations (setlk,
+ * getlk) it should remove all locks belonging to 'fi->owner'.
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to flush() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ *
+ * [close]:
+ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html
+ */
+ void (*flush)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Release an open file
+ *
+ * Release is called when there are no more references to an open
+ * file: all file descriptors are closed and all memory mappings
+ * are unmapped.
+ *
+ * For every open call there will be exactly one release call (unless
+ * the filesystem is force-unmounted).
+ *
+ * The filesystem may reply with an error, but error values are
+ * not returned to close() or munmap() which triggered the
+ * release.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ * fi->flags will contain the same flags as for open.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*release)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Synchronize file contents
+ *
+ * If the datasync parameter is non-zero, then only the user data
+ * should be flushed, not the meta data.
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to fsync() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param datasync flag indicating if only data should be flushed
+ * @param fi file information
+ */
+ void (*fsync)(fuse_req_t req, fuse_ino_t ino, int datasync,
+ struct fuse_file_info *fi);
+
+ /**
+ * Open a directory
+ *
+ * The filesystem may store an arbitrary file handle (pointer, index,
+ * etc) in fi->fh, and use this in all other directory
+ * stream operations (readdir, releasedir, fsyncdir).
+ *
+ * If this request is answered with an error code of ENOSYS and
+ * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`,
+ * this is treated as success and future calls to opendir and
+ * releasedir will also succeed without being sent to the filesystem
+ * process. In addition, the kernel will cache readdir results
+ * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR.
+ *
+ * Valid replies:
+ * fuse_reply_open
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*opendir)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Read directory
+ *
+ * Send a buffer filled using fuse_add_direntry(), with size not
+ * exceeding the requested size. Send an empty buffer on end of
+ * stream.
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * Returning a directory entry from readdir() does not affect
+ * its lookup count.
+ *
+ * If off_t is non-zero, then it will correspond to one of the off_t
+ * values that was previously returned by readdir() for the same
+ * directory handle. In this case, readdir() should skip over entries
+ * coming before the position defined by the off_t value. If entries
+ * are added or removed while the directory handle is open, the filesystem
+ * may still include the entries that have been removed, and may not
+ * report the entries that have been created. However, addition or
+ * removal of entries must never cause readdir() to skip over unrelated
+ * entries or to report them more than once. This means
+ * that off_t can not be a simple index that enumerates the entries
+ * that have been returned but must contain sufficient information to
+ * uniquely determine the next directory entry to return even when the
+ * set of entries is changing.
+ *
+ * The function does not have to report the '.' and '..'
+ * entries, but is allowed to do so. Note that, if readdir does
+ * not return '.' or '..', they will not be implicitly returned,
+ * and this behavior is observable by the caller.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size maximum number of bytes to send
+ * @param off offset to continue reading the directory stream
+ * @param fi file information
+ */
+ void (*readdir)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+ struct fuse_file_info *fi);
+
+ /**
+ * Release an open directory
+ *
+ * For every opendir call there will be exactly one releasedir
+ * call (unless the filesystem is force-unmounted).
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*releasedir)(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi);
+
+ /**
+ * Synchronize directory contents
+ *
+ * If the datasync parameter is non-zero, then only the directory
+ * contents should be flushed, not the meta data.
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to fsyncdir() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param datasync flag indicating if only data should be flushed
+ * @param fi file information
+ */
+ void (*fsyncdir)(fuse_req_t req, fuse_ino_t ino, int datasync,
+ struct fuse_file_info *fi);
+
+ /**
+ * Get file system statistics
+ *
+ * Valid replies:
+ * fuse_reply_statfs
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number, zero means "undefined"
+ */
+ void (*statfs)(fuse_req_t req, fuse_ino_t ino);
+
+ /**
+ * Set an extended attribute
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future setxattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ */
+ void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name,
+ const char *value, size_t size, int flags);
+
+ /**
+ * Get an extended attribute
+ *
+ * If size is zero, the size of the value should be sent with
+ * fuse_reply_xattr.
+ *
+ * If the size is non-zero, and the value fits in the buffer, the
+ * value should be sent with fuse_reply_buf.
+ *
+ * If the size is too small for the value, the ERANGE error should
+ * be sent.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future getxattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_xattr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param name of the extended attribute
+ * @param size maximum size of the value to send
+ */
+ void (*getxattr)(fuse_req_t req, fuse_ino_t ino, const char *name,
+ size_t size);
+
+ /**
+ * List extended attribute names
+ *
+ * If size is zero, the total size of the attribute list should be
+ * sent with fuse_reply_xattr.
+ *
+ * If the size is non-zero, and the null character separated
+ * attribute list fits in the buffer, the list should be sent with
+ * fuse_reply_buf.
+ *
+ * If the size is too small for the list, the ERANGE error should
+ * be sent.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future listxattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_xattr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size maximum size of the list to send
+ */
+ void (*listxattr)(fuse_req_t req, fuse_ino_t ino, size_t size);
+
+ /**
+ * Remove an extended attribute
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future removexattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param name of the extended attribute
+ */
+ void (*removexattr)(fuse_req_t req, fuse_ino_t ino, const char *name);
+
+ /**
+ * Check file access permissions
+ *
+ * This will be called for the access() and chdir() system
+ * calls. If the 'default_permissions' mount option is given,
+ * this method is not called.
+ *
+ * This method is not called under Linux kernel versions 2.4.x
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent success, i.e. this and all future access()
+ * requests will succeed without being sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param mask requested access mode
+ */
+ void (*access)(fuse_req_t req, fuse_ino_t ino, int mask);
+
+ /**
+ * Create and open a file
+ *
+ * If the file does not exist, first create it with the specified
+ * mode, and then open it.
+ *
+ * See the description of the open handler for more
+ * information.
+ *
+ * If this method is not implemented or under Linux kernel
+ * versions earlier than 2.6.15, the mknod() and open() methods
+ * will be called instead.
+ *
+ * If this request is answered with an error code of ENOSYS, the handler
+ * is treated as not implemented (i.e., for this and future requests the
+ * mknod() and open() handlers will be called instead).
+ *
+ * Valid replies:
+ * fuse_reply_create
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to create
+ * @param mode file type and mode with which to create the new file
+ * @param fi file information
+ */
+ void (*create)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, struct fuse_file_info *fi);
+
+ /**
+ * Test for a POSIX file lock
+ *
+ * Valid replies:
+ * fuse_reply_lock
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param lock the region/type to test
+ */
+ void (*getlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct flock *lock);
+
+ /**
+ * Acquire, modify or release a POSIX file lock
+ *
+ * For POSIX threads (NPTL) there's a 1-1 relation between pid and
+ * owner, but otherwise this is not always the case. For checking
+ * lock ownership, 'fi->owner' must be used. The l_pid field in
+ * 'struct flock' should only be used to fill in this field in
+ * getlk().
+ *
+ * Note: if the locking methods are not implemented, the kernel
+ * will still allow file locking to work locally. Hence these are
+ * only interesting for network filesystems and similar.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param lock the region/type to set
+ * @param sleep locking operation may sleep
+ */
+ void (*setlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct flock *lock, int sleep);
+
+ /**
+ * Map block index within file to block index within device
+ *
+ * Note: This makes sense only for block device backed filesystems
+ * mounted with the 'blkdev' option
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure, i.e. all future bmap() requests will
+ * fail with the same error code without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_bmap
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param blocksize unit of block index
+ * @param idx block index within file
+ */
+ void (*bmap)(fuse_req_t req, fuse_ino_t ino, size_t blocksize,
+ uint64_t idx);
+
+ /**
+ * Ioctl
+ *
+ * Note: For unrestricted ioctls (not allowed for FUSE
+ * servers), data in and out areas can be discovered by giving
+ * iovs and setting FUSE_IOCTL_RETRY in *flags*. For
+ * restricted ioctls, kernel prepares in/out data area
+ * according to the information encoded in cmd.
+ *
+ * Valid replies:
+ * fuse_reply_ioctl_retry
+ * fuse_reply_ioctl
+ * fuse_reply_ioctl_iov
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param cmd ioctl command
+ * @param arg ioctl argument
+ * @param fi file information
+ * @param flags for FUSE_IOCTL_* flags
+ * @param in_buf data fetched from the caller
+ * @param in_bufsz number of fetched bytes
+ * @param out_bufsz maximum size of output data
+ *
+ * Note: the unsigned long request submitted by the application
+ * is truncated to 32 bits.
+ */
+ void (*ioctl)(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags, const void *in_buf,
+ size_t in_bufsz, size_t out_bufsz);
+
+ /**
+ * Poll for IO readiness
+ *
+ * Note: If ph is non-NULL, the client should notify
+ * when IO readiness events occur by calling
+ * fuse_lowlevel_notify_poll() with the specified ph.
+ *
+ * Regardless of the number of times poll with a non-NULL ph
+ * is received, a single notification is enough to clear them all.
+ * Notifying more times incurs overhead but doesn't harm
+ * correctness.
+ *
+ * The callee is responsible for destroying ph with
+ * fuse_pollhandle_destroy() when no longer in use.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as success (with a kernel-defined default poll-mask) and
+ * future calls to poll() will succeed the same way without being sent
+ * to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_poll
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param ph poll handle to be used for notification
+ */
+ void (*poll)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct fuse_pollhandle *ph);
+
+ /**
+ * Write data made available in a buffer
+ *
+ * This is a more generic version of the ->write() method. If
+ * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the
+ * kernel supports splicing from the fuse device, then the
+ * data will be made available in a pipe to support zero-copy
+ * data transfer.
+ *
+ * buf->count is guaranteed to be one (and thus buf->idx is
+ * always zero). The write_buf handler must ensure that
+ * bufv->off is correctly updated (reflecting the number of
+ * bytes read from bufv->buf[0]).
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ *
+ * Valid replies:
+ * fuse_reply_write
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param bufv buffer containing the data
+ * @param off offset to write to
+ * @param fi file information
+ */
+ void (*write_buf)(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv,
+ off_t off, struct fuse_file_info *fi);
+
+ /**
+ * Forget about multiple inodes
+ *
+ * See description of the forget function for more
+ * information.
+ *
+ * Valid replies:
+ * fuse_reply_none
+ *
+ * @param req request handle
+ */
+ void (*forget_multi)(fuse_req_t req, size_t count,
+ struct fuse_forget_data *forgets);
+
+ /**
+ * Acquire, modify or release a BSD file lock
+ *
+ * Note: if the locking methods are not implemented, the kernel
+ * will still allow file locking to work locally. Hence these are
+ * only interesting for network filesystems and similar.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param op the locking operation, see flock(2)
+ */
+ void (*flock)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ int op);
+
+ /**
+ * Allocate requested space. If this function returns success then
+ * subsequent writes to the specified range shall not fail due to the lack
+ * of free space on the file system storage media.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future fallocate() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param offset starting point for allocated region
+ * @param length size of allocated region
+ * @param mode determines the operation to be performed on the given range,
+ * see fallocate(2)
+ */
+ void (*fallocate)(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
+ off_t length, struct fuse_file_info *fi);
+
+ /**
+ * Read directory with attributes
+ *
+ * Send a buffer filled using fuse_add_direntry_plus(), with size not
+ * exceeding the requested size. Send an empty buffer on end of
+ * stream.
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * In contrast to readdir() (which does not affect the lookup counts),
+ * the lookup count of every entry returned by readdirplus(), except "."
+ * and "..", is incremented by one.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size maximum number of bytes to send
+ * @param off offset to continue reading the directory stream
+ * @param fi file information
+ */
+ void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+ struct fuse_file_info *fi);
+
+ /**
+ * Copy a range of data from one file to another
+ *
+ * Performs an optimized copy between two file descriptors without the
+ * additional cost of transferring data through the FUSE kernel module
+ * to user space (glibc) and then back into the FUSE filesystem again.
+ *
+ * In case this method is not implemented, glibc falls back to reading
+ * data from the source and writing to the destination, effectively
+ * doing an inefficient copy of the data.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future copy_file_range() requests will fail with EOPNOTSUPP without
+ * being sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_write
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino_in the inode number of the source file
+ * @param off_in starting point from where the data should be read
+ * @param fi_in file information of the source file
+ * @param ino_out the inode number of the destination file
+ * @param off_out starting point where the data should be written
+ * @param fi_out file information of the destination file
+ * @param len maximum size of the data to copy
+ * @param flags passed along with the copy_file_range() syscall
+ */
+ void (*copy_file_range)(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
+ struct fuse_file_info *fi_in, fuse_ino_t ino_out,
+ off_t off_out, struct fuse_file_info *fi_out,
+ size_t len, int flags);
+
+ /**
+ * Find next data or hole after the specified offset
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure, i.e. all future lseek() requests will
+ * fail with the same error code without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_lseek
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param off offset to start search from
+ * @param whence either SEEK_DATA or SEEK_HOLE
+ * @param fi file information
+ */
+ void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
+ struct fuse_file_info *fi);
+};
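+
+/*
+ * An illustrative sketch of how the handlers from the examples above
+ * would be wired into an operations table; members left out stay NULL
+ * and get the ENOSYS/default behaviour described per member.
+ *
+ *   static const struct fuse_lowlevel_ops xmp_ops = {
+ *       .lookup  = xmp_lookup,
+ *       .forget  = xmp_forget,
+ *       .getattr = xmp_getattr,
+ *       .open    = xmp_open,
+ *       .read    = xmp_read,
+ *   };
+ */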
+
+/**
+ * Reply with an error code or success.
+ *
+ * Possible requests:
+ * all except forget
+ *
+ * Wherever possible, error codes should be chosen from the list of
+ * documented error conditions in the corresponding system calls
+ * manpage.
+ *
+ * An error code of ENOSYS is sometimes treated specially. This is
+ * indicated in the documentation of the affected handler functions.
+ *
+ * The following requests may be answered with a zero error code:
+ * unlink, rmdir, rename, flush, release, fsync, fsyncdir, setxattr,
+ * removexattr, setlk.
+ *
+ * @param req request handle
+ * @param err the positive error value, or zero for success
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_err(fuse_req_t req, int err);
+
+/**
+ * Don't send reply
+ *
+ * Possible requests:
+ * forget
+ * forget_multi
+ * retrieve_reply
+ *
+ * @param req request handle
+ */
+void fuse_reply_none(fuse_req_t req);
+
+/**
+ * Reply with a directory entry
+ *
+ * Possible requests:
+ * lookup, mknod, mkdir, symlink, link
+ *
+ * Side effects:
+ * increments the lookup count on success
+ *
+ * @param req request handle
+ * @param e the entry parameters
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e);
+
+/**
+ * Reply with a directory entry and open parameters
+ *
+ * currently the following members of 'fi' are used:
+ * fh, direct_io, keep_cache
+ *
+ * Possible requests:
+ * create
+ *
+ * Side effects:
+ * increments the lookup count on success
+ *
+ * @param req request handle
+ * @param e the entry parameters
+ * @param fi file information
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e,
+ const struct fuse_file_info *fi);
+
+/**
+ * Reply with attributes
+ *
+ * Possible requests:
+ * getattr, setattr
+ *
+ * @param req request handle
+ * @param attr the attributes
+ * @param attr_timeout validity timeout (in seconds) for the attributes
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_attr(fuse_req_t req, const struct stat *attr,
+ double attr_timeout);
+
+/**
+ * Reply with the contents of a symbolic link
+ *
+ * Possible requests:
+ * readlink
+ *
+ * @param req request handle
+ * @param link symbolic link contents
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_readlink(fuse_req_t req, const char *link);
+
+/**
+ * Reply with open parameters
+ *
+ * currently the following members of 'fi' are used:
+ * fh, direct_io, keep_cache
+ *
+ * Possible requests:
+ * open, opendir
+ *
+ * @param req request handle
+ * @param fi file information
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *fi);
+
+/**
+ * Reply with number of bytes written
+ *
+ * Possible requests:
+ * write
+ *
+ * @param req request handle
+ * @param count the number of bytes written
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_write(fuse_req_t req, size_t count);
+
+/**
+ * Reply with data
+ *
+ * Possible requests:
+ * read, readdir, getxattr, listxattr
+ *
+ * @param req request handle
+ * @param buf buffer containing data
+ * @param size the size of data in bytes
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size);
+
+/**
+ * Reply with data copied/moved from buffer(s)
+ *
+ * Possible requests:
+ * read, readdir, getxattr, listxattr
+ *
+ * Side effects:
+ * when used to return data from a readdirplus() (but not readdir())
+ * call, increments the lookup count of each returned entry by one
+ * on success.
+ *
+ * @param req request handle
+ * @param bufv buffer vector
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv);
+
+/**
+ * Reply with data vector
+ *
+ * Possible requests:
+ * read, readdir, getxattr, listxattr
+ *
+ * @param req request handle
+ * @param iov the vector containing the data
+ * @param count the size of vector
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count);
+
+/**
+ * Reply with filesystem statistics
+ *
+ * Possible requests:
+ * statfs
+ *
+ * @param req request handle
+ * @param stbuf filesystem statistics
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf);
+
+/**
+ * Reply with needed buffer size
+ *
+ * Possible requests:
+ * getxattr, listxattr
+ *
+ * @param req request handle
+ * @param count the buffer size needed in bytes
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_xattr(fuse_req_t req, size_t count);
+
+/**
+ * Reply with file lock information
+ *
+ * Possible requests:
+ * getlk
+ *
+ * @param req request handle
+ * @param lock the lock information
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_lock(fuse_req_t req, const struct flock *lock);
+
+/**
+ * Reply with block index
+ *
+ * Possible requests:
+ * bmap
+ *
+ * @param req request handle
+ * @param idx block index within device
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_bmap(fuse_req_t req, uint64_t idx);
+
+/*
+ * Filling a buffer in readdir
+ */
+
+/**
+ * Add a directory entry to the buffer
+ *
+ * Buffer needs to be large enough to hold the entry. If it's not,
+ * then the entry is not filled in but the size of the entry is still
+ * returned. The caller can check this by comparing the bufsize
+ * parameter with the returned entry size. If the entry size is
+ * larger than the buffer size, the operation failed.
+ *
+ * From the 'stbuf' argument the st_ino field and bits 12-15 of the
+ * st_mode field are used. The other fields are ignored.
+ *
+ * *off* should be any non-zero value that the filesystem can use to
+ * identify the current point in the directory stream. It does not
+ * need to be the actual physical position. A value of zero is
+ * reserved to mean "from the beginning", and should therefore never
+ * be used (the first call to fuse_add_direntry should be passed the
+ * offset of the second directory entry).
+ *
+ * @param req request handle
+ * @param buf the point where the new entry will be added to the buffer
+ * @param bufsize remaining size of the buffer
+ * @param name the name of the entry
+ * @param stbuf the file attributes
+ * @param off the offset of the next entry
+ * @return the space needed for the entry
+ */
+size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize,
+ const char *name, const struct stat *stbuf, off_t off);
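+
+/*
+ * An illustrative sketch of a readdir handler built on fuse_add_direntry();
+ * xmp_dir_next() is a hypothetical iterator that, starting at 'off',
+ * yields one entry name, its attributes and the offset of the following
+ * entry, returning 0 at end of stream.
+ *
+ *   static void xmp_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
+ *                           off_t off, struct fuse_file_info *fi)
+ *   {
+ *       char *buf = calloc(1, size);
+ *       size_t used = 0;
+ *       const char *name;
+ *       struct stat st;
+ *       off_t next;
+ *
+ *       (void)ino;
+ *       if (!buf) {
+ *           fuse_reply_err(req, ENOMEM);
+ *           return;
+ *       }
+ *       while (xmp_dir_next(fi->fh, off, &name, &st, &next)) {
+ *           size_t len = fuse_add_direntry(req, buf + used, size - used,
+ *                                          name, &st, next);
+ *           if (len > size - used) {
+ *               break;  // did not fit; the entry was not added
+ *           }
+ *           used += len;
+ *           off = next;
+ *       }
+ *       fuse_reply_buf(req, buf, used);  // empty buffer ends the stream
+ *       free(buf);
+ *   }
+ */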
+
+/**
+ * Add a directory entry to the buffer with the attributes
+ *
+ * See documentation of `fuse_add_direntry()` for more details.
+ *
+ * @param req request handle
+ * @param buf the point where the new entry will be added to the buffer
+ * @param bufsize remaining size of the buffer
+ * @param name the name of the entry
+ * @param e the directory entry
+ * @param off the offset of the next entry
+ * @return the space needed for the entry
+ */
+size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize,
+ const char *name,
+ const struct fuse_entry_param *e, off_t off);
+
+/**
+ * Reply to ask for data fetch and output buffer preparation. ioctl
+ * will be retried with the specified input data fetched and output
+ * buffer prepared.
+ *
+ * Possible requests:
+ * ioctl
+ *
+ * @param req request handle
+ * @param in_iov iovec specifying data to fetch from the caller
+ * @param in_count number of entries in in_iov
+ * @param out_iov iovec specifying addresses to write output to
+ * @param out_count number of entries in out_iov
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov,
+ size_t in_count, const struct iovec *out_iov,
+ size_t out_count);
+
+/**
+ * Reply to finish ioctl
+ *
+ * Possible requests:
+ * ioctl
+ *
+ * @param req request handle
+ * @param result result to be passed to the caller
+ * @param buf buffer containing output data
+ * @param size length of output data
+ */
+int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size);
+
+/**
+ * Reply to finish ioctl with iov buffer
+ *
+ * Possible requests:
+ * ioctl
+ *
+ * @param req request handle
+ * @param result result to be passed to the caller
+ * @param iov the vector containing the data
+ * @param count the size of vector
+ */
+int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov,
+ int count);
+
+/**
+ * Reply with poll result event mask
+ *
+ * @param req request handle
+ * @param revents poll result event mask
+ */
+int fuse_reply_poll(fuse_req_t req, unsigned revents);
+
+/**
+ * Reply with offset
+ *
+ * Possible requests:
+ * lseek
+ *
+ * @param req request handle
+ * @param off offset of next data or hole
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_lseek(fuse_req_t req, off_t off);
+
+/*
+ * Notification
+ */
+
+/**
+ * Notify IO readiness event
+ *
+ * For more information, please read the comment for the poll operation.
+ *
+ * @param ph poll handle to notify IO readiness event for
+ */
+int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph);
+
+/**
+ * Notify to invalidate cache for an inode.
+ *
+ * Added in FUSE protocol version 7.12. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * If the filesystem has writeback caching enabled, invalidating an
+ * inode will first trigger a writeback of all dirty pages. The call
+ * will block until all writeback requests have completed and the
+ * inode has been invalidated. It will, however, not wait for
+ * completion of pending writeback requests that have been issued
+ * before.
+ *
+ * If there are no dirty pages, this function will never block.
+ *
+ * @param se the session object
+ * @param ino the inode number
+ * @param off the offset in the inode where to start invalidating
+ * or negative to invalidate attributes only
+ * @param len the amount of cache to invalidate or 0 for all
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino,
+ off_t off, off_t len);
+
+/**
+ * Notify to invalidate parent attributes and the dentry matching
+ * parent/name
+ *
+ * To avoid a deadlock this function must not be called in the
+ * execution path of a related filesystem operation or within any code
+ * that could hold a lock that could be needed to execute such an
+ * operation. As of kernel 4.18, a "related operation" is a lookup(),
+ * symlink(), mknod(), mkdir(), unlink(), rename(), link() or create()
+ * request for the parent, and a setattr(), unlink(), rmdir(),
+ * rename(), setxattr(), removexattr(), readdir() or readdirplus()
+ * request for the inode itself.
+ *
+ * When called correctly, this function will never block.
+ *
+ * Added in FUSE protocol version 7.12. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * @param se the session object
+ * @param parent inode number
+ * @param name file name
+ * @param namelen strlen() of file name
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent,
+ const char *name, size_t namelen);
+
+/**
+ * This function behaves like fuse_lowlevel_notify_inval_entry() with
+ * the following additional effect (at least as of Linux kernel 4.8):
+ *
+ * If the provided *child* inode matches the inode that is currently
+ * associated with the cached dentry, and if there are any inotify
+ * watches registered for the dentry, then the watchers are informed
+ * that the dentry has been deleted.
+ *
+ * To avoid a deadlock this function must not be called while
+ * executing a related filesystem operation or while holding a lock
+ * that could be needed to execute such an operation (see the
+ * description of fuse_lowlevel_notify_inval_entry() for more
+ * details).
+ *
+ * When called correctly, this function will never block.
+ *
+ * Added in FUSE protocol version 7.18. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * @param se the session object
+ * @param parent inode number
+ * @param child inode number
+ * @param name file name
+ * @param namelen strlen() of file name
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent,
+ fuse_ino_t child, const char *name,
+ size_t namelen);
+
+/**
+ * Store data to the kernel buffers
+ *
+ * Synchronously store data in the kernel buffers belonging to the
+ * given inode. The stored data is marked up-to-date (no read will be
+ * performed against it, unless it's invalidated or evicted from the
+ * cache).
+ *
+ * If the stored data overflows the current file size, then the size
+ * is extended, similarly to a write(2) on the filesystem.
+ *
+ * If this function returns an error, then the store wasn't fully
+ * completed, but it may have been partially completed.
+ *
+ * Added in FUSE protocol version 7.15. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * @param se the session object
+ * @param ino the inode number
+ * @param offset the starting offset into the file to store to
+ * @param bufv buffer vector
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino,
+ off_t offset, struct fuse_bufvec *bufv);
+
+/*
+ * Utility functions
+ */
+
+/**
+ * Get the userdata from the request
+ *
+ * @param req request handle
+ * @return the user data passed to fuse_session_new()
+ */
+void *fuse_req_userdata(fuse_req_t req);
+
+/**
+ * Get the context from the request
+ *
+ * The pointer returned by this function will only be valid for the
+ * request's lifetime
+ *
+ * @param req request handle
+ * @return the context structure
+ */
+const struct fuse_ctx *fuse_req_ctx(fuse_req_t req);
+
+/**
+ * Get the current supplementary group IDs for the specified request
+ *
+ * Similar to the getgroups(2) system call, except the return value is
+ * always the total number of group IDs, even if it is larger than the
+ * specified size.
+ *
+ * The current FUSE kernel module in Linux (as of 2.6.30) doesn't pass
+ * the group list to userspace, hence this function needs to parse
+ * "/proc/$TID/task/$TID/status" to get the group IDs.
+ *
+ * This feature may not be supported on all operating systems. In
+ * such a case this function will return -ENOSYS.
+ *
+ * @param req request handle
+ * @param size size of given array
+ * @param list array of group IDs to be filled in
+ * @return the total number of supplementary group IDs or -errno on failure
+ */
+int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]);
+
+/**
+ * Callback function for an interrupt
+ *
+ * @param req interrupted request
+ * @param data user data
+ */
+typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data);
+
+/**
+ * Register/unregister callback for an interrupt
+ *
+ * If an interrupt has already happened, then the callback function is
+ * called from within this function, hence it's not possible for
+ * interrupts to be lost.
+ *
+ * @param req request handle
+ * @param func the callback function or NULL for unregister
+ * @param data user data passed to the callback function
+ */
+void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func,
+ void *data);
+
+/**
+ * Check if a request has already been interrupted
+ *
+ * @param req request handle
+ * @return 1 if the request has been interrupted, 0 otherwise
+ */
+int fuse_req_interrupted(fuse_req_t req);
+
+/**
+ * Check if the session is connected via virtio
+ *
+ * @param se session object
+ * @return 1 if the session is a virtio session
+ */
+int fuse_lowlevel_is_virtio(struct fuse_session *se);
+
+/*
+ * Inquiry functions
+ */
+
+/**
+ * Print low-level version information to stdout.
+ */
+void fuse_lowlevel_version(void);
+
+/**
+ * Print available low-level options to stdout. This is not an
+ * exhaustive list, but includes only those options that may be of
+ * interest to an end-user of a file system.
+ */
+void fuse_lowlevel_help(void);
+
+/**
+ * Print available options for `fuse_parse_cmdline()`.
+ */
+void fuse_cmdline_help(void);
+
+/*
+ * Filesystem setup & teardown
+ */
+
+struct fuse_cmdline_opts {
+ int foreground;
+ int debug;
+ int nodefault_subtype;
+ int show_version;
+ int show_help;
+ int print_capabilities;
+ int syslog;
+ int log_level;
+ unsigned int max_idle_threads;
+};
+
+/**
+ * Utility function to parse common options for simple file systems
+ * using the low-level API. A help text that describes the available
+ * options can be printed with `fuse_cmdline_help`. A single
+ * non-option argument is treated as the mountpoint. Multiple
+ * non-option arguments will result in an error.
+ *
+ * If neither the -o subtype= nor the -o fsname= option is given, a new
+ * subtype option will be added and set to the basename of the program
+ * (the fsname will remain unset, and then defaults to "fuse").
+ *
+ * Known options will be removed from *args*, unknown options will
+ * remain.
+ *
+ * @param args argument vector (input+output)
+ * @param opts output argument for parsed options
+ * @return 0 on success, -1 on failure
+ */
+int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts);
+
+/**
+ * Create a low level session.
+ *
+ * Returns a session structure suitable for passing to
+ * fuse_session_mount() and fuse_session_loop().
+ *
+ * This function accepts most file-system independent mount options
+ * (like context, nodev, ro - see mount(8)), as well as the general
+ * fuse mount options listed in mount.fuse(8) (e.g. -o allow_root and
+ * -o default_permissions, but not ``-o use_ino``). Instead of `-o
+ * debug`, debugging may also be enabled with `-d` or `--debug`.
+ *
+ * If not all options are known, an error message is written to stderr
+ * and the function returns NULL.
+ *
+ * Option parsing skips argv[0], which is assumed to contain the
+ * program name. To prevent accidentally passing an option in
+ * argv[0], this element must always be present (even if no options
+ * are specified). It may be set to the empty string ('\0') if no
+ * reasonable value can be provided.
+ *
+ * @param args argument vector
+ * @param op the (low-level) filesystem operations
+ * @param op_size sizeof(struct fuse_lowlevel_ops)
+ * @param userdata user data
+ *
+ * @return the fuse session on success, NULL on failure
+ **/
+struct fuse_session *fuse_session_new(struct fuse_args *args,
+ const struct fuse_lowlevel_ops *op,
+ size_t op_size, void *userdata);
+
+/**
+ * Mount a FUSE file system.
+ *
+ * @param se session object
+ *
+ * @return 0 on success, -1 on failure.
+ **/
+int fuse_session_mount(struct fuse_session *se);
+
+/**
+ * Enter a single threaded, blocking event loop.
+ *
+ * When the event loop terminates because the connection to the FUSE
+ * kernel module has been closed, this function returns zero. This
+ * happens when the filesystem is unmounted regularly (by the
+ * filesystem owner or root running the umount(8) or fusermount(1)
+ * command), or if the connection is explicitly severed by writing ``1``
+ * to the ``abort`` file in ``/sys/fs/fuse/connections/NNN``. The only
+ * way to distinguish between these two conditions is to check if the
+ * filesystem is still mounted after the session loop returns.
+ *
+ * When some error occurs during request processing, the function
+ * returns a negated errno(3) value.
+ *
+ * If the loop has been terminated because of a signal handler
+ * installed by fuse_set_signal_handlers(), this function returns the
+ * (positive) signal value that triggered the exit.
+ *
+ * @param se the session
+ * @return 0, -errno, or a signal value
+ */
+int fuse_session_loop(struct fuse_session *se);
+
+/**
+ * Flag a session as terminated.
+ *
+ * This function is invoked by the POSIX signal handlers, when
+ * registered using fuse_set_signal_handlers(). It will cause any
+ * running event loops to terminate on the next opportunity.
+ *
+ * @param se the session
+ */
+void fuse_session_exit(struct fuse_session *se);
+
+/**
+ * Reset the terminated flag of a session
+ *
+ * @param se the session
+ */
+void fuse_session_reset(struct fuse_session *se);
+
+/**
+ * Query the terminated flag of a session
+ *
+ * @param se the session
+ * @return 1 if exited, 0 if not exited
+ */
+int fuse_session_exited(struct fuse_session *se);
+
+/**
+ * Ensure that file system is unmounted.
+ *
+ * In regular operation, the file system is typically unmounted by the
+ * user calling umount(8) or fusermount(1), which then terminates the
+ * FUSE session loop. However, the session loop may also terminate as
+ * a result of an explicit call to fuse_session_exit() (e.g. by a
+ * signal handler installed by fuse_set_signal_handler()). In this
+ * case the filesystem remains mounted, but any attempt to access it
+ * will block (while the filesystem process is still running) or give
+ * an ESHUTDOWN error (after the filesystem process has terminated).
+ *
+ * If the communication channel with the FUSE kernel module is still
+ * open (i.e., if the session loop was terminated by an explicit call
+ * to fuse_session_exit()), this function will close it and unmount
+ * the filesystem. If the communication channel has been closed by the
+ * kernel, this method will do (almost) nothing.
+ *
+ * NOTE: The above semantics mean that if the connection to the kernel
+ * is terminated via the ``/sys/fs/fuse/connections/NNN/abort`` file,
+ * this method will *not* unmount the filesystem.
+ *
+ * @param se the session
+ */
+void fuse_session_unmount(struct fuse_session *se);
+
+/**
+ * Destroy a session
+ *
+ * @param se the session
+ */
+void fuse_session_destroy(struct fuse_session *se);
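+
+/*
+ * An illustrative sketch of the typical setup/teardown sequence using the
+ * functions declared above; FUSE_ARGS_INIT is assumed to come from
+ * fuse_opt.h, and xmp_ops is the hypothetical operations table from the
+ * earlier example.
+ *
+ *   int main(int argc, char *argv[])
+ *   {
+ *       struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ *       struct fuse_cmdline_opts opts;
+ *       struct fuse_session *se = NULL;
+ *       int ret = 1;
+ *
+ *       if (fuse_parse_cmdline(&args, &opts) != 0) {
+ *           goto out;
+ *       }
+ *       se = fuse_session_new(&args, &xmp_ops, sizeof(xmp_ops), NULL);
+ *       if (se && fuse_session_mount(se) == 0) {
+ *           ret = fuse_session_loop(se);  // 0, -errno, or a signal value
+ *           fuse_session_unmount(se);
+ *       }
+ *   out:
+ *       if (se) {
+ *           fuse_session_destroy(se);
+ *       }
+ *       fuse_opt_free_args(&args);
+ *       return ret != 0;
+ *   }
+ */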
+
+/*
+ * Custom event loop support
+ */
+
+/**
+ * Return file descriptor for communication with kernel.
+ *
+ * The file descriptor can be used to integrate FUSE with a custom event
+ * loop. Whenever data is available for reading on the provided fd,
+ * the event loop should call `fuse_session_receive_buf` followed by
+ * `fuse_session_process_buf` to process the request.
+ *
+ * The returned file descriptor is valid until `fuse_session_unmount`
+ * is called.
+ *
+ * @param se the session
+ * @return a file descriptor
+ */
+int fuse_session_fd(struct fuse_session *se);
+
+/**
+ * Process a raw request supplied in a generic buffer
+ *
+ * The fuse_buf may contain a memory buffer or a pipe file descriptor.
+ *
+ * @param se the session
+ * @param buf the fuse_buf containing the request
+ */
+void fuse_session_process_buf(struct fuse_session *se,
+ const struct fuse_buf *buf);
+
+/**
+ * Read a raw request from the kernel into the supplied buffer.
+ *
+ * Depending on file system options, system capabilities, and request
+ * size the request is either read into a memory buffer or spliced
+ * into a temporary pipe.
+ *
+ * @param se the session
+ * @param buf the fuse_buf to store the request in
+ * @return the actual size of the raw request, or -errno on error
+ */
+int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf);
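+
+/*
+ * An illustrative sketch of a custom poll(2)-driven event loop built on
+ * the three functions above; the zero-initialized struct fuse_buf is
+ * assumed to be (re)allocated internally by fuse_session_receive_buf(),
+ * as in the built-in single-threaded loop.
+ *
+ *   static void xmp_custom_loop(struct fuse_session *se)
+ *   {
+ *       struct fuse_buf fbuf = { .mem = NULL };
+ *       struct pollfd pfd = { .fd = fuse_session_fd(se), .events = POLLIN };
+ *
+ *       while (!fuse_session_exited(se)) {
+ *           int res = poll(&pfd, 1, -1);
+ *           if (res == -1) {
+ *               if (errno == EINTR) {
+ *                   continue;
+ *               }
+ *               break;
+ *           }
+ *           res = fuse_session_receive_buf(se, &fbuf);
+ *           if (res == -EINTR) {
+ *               continue;
+ *           }
+ *           if (res <= 0) {
+ *               break;  // error or connection closed
+ *           }
+ *           fuse_session_process_buf(se, &fbuf);
+ *       }
+ *       free(fbuf.mem);
+ *   }
+ */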
+
+#endif /* FUSE_LOWLEVEL_H_ */
diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h
new file mode 100644
index 0000000000..5c618ce21f
--- /dev/null
+++ b/tools/virtiofsd/fuse_misc.h
@@ -0,0 +1,60 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include <pthread.h>
+#include "config-host.h"
+
+/*
+ * Versioned symbols cannot be used in some cases because they
+ * - confuse the dynamic linker in uClibc
+ * - are not supported on macOS (in the Mach-O binary format)
+ */
+#if (!defined(__UCLIBC__) && !defined(__APPLE__))
+#define FUSE_SYMVER(x) __asm__(x)
+#else
+#define FUSE_SYMVER(x)
+#endif
+
+#ifndef USE_UCLIBC
+#define fuse_mutex_init(mut) pthread_mutex_init(mut, NULL)
+#else
+/* Is this hack still needed? */
+static inline void fuse_mutex_init(pthread_mutex_t *mut)
+{
+ pthread_mutexattr_t attr;
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+ pthread_mutex_init(mut, &attr);
+ pthread_mutexattr_destroy(&attr);
+}
+#endif
+
+#ifdef HAVE_STRUCT_STAT_ST_ATIM
+/* Linux */
+#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec)
+#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec)
+#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec)
+#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val)
+#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val)
+#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val)
+#elif defined(HAVE_STRUCT_STAT_ST_ATIMESPEC)
+/* FreeBSD */
+#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec)
+#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec)
+#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec)
+#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val)
+#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val)
+#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val)
+#else
+#define ST_ATIM_NSEC(stbuf) 0
+#define ST_CTIM_NSEC(stbuf) 0
+#define ST_MTIM_NSEC(stbuf) 0
+#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0)
+#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0)
+#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0)
+#endif
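+
+/*
+ * A minimal usage sketch for the timestamp macros above: copying the
+ * modification time out of a stat buffer without caring which flavour
+ * of 'struct stat' the platform provides. The variable names are only
+ * for illustration.
+ *
+ *   struct stat stbuf;
+ *   struct timespec mtime;
+ *
+ *   if (fstat(fd, &stbuf) == 0) {
+ *       mtime.tv_sec = stbuf.st_mtime;
+ *       mtime.tv_nsec = ST_MTIM_NSEC(&stbuf);
+ *   }
+ */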
diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c
new file mode 100644
index 0000000000..28922361a2
--- /dev/null
+++ b/tools/virtiofsd/fuse_opt.c
@@ -0,0 +1,450 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Implementation of option parsing routines (dealing with `struct
+ * fuse_args`).
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_opt.h"
+#include "fuse_i.h"
+#include "fuse_misc.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct fuse_opt_context {
+ void *data;
+ const struct fuse_opt *opt;
+ fuse_opt_proc_t proc;
+ int argctr;
+ int argc;
+ char **argv;
+ struct fuse_args outargs;
+ char *opts;
+ int nonopt;
+};
+
+void fuse_opt_free_args(struct fuse_args *args)
+{
+ if (args) {
+ if (args->argv && args->allocated) {
+ int i;
+ for (i = 0; i < args->argc; i++) {
+ free(args->argv[i]);
+ }
+ free(args->argv);
+ }
+ args->argc = 0;
+ args->argv = NULL;
+ args->allocated = 0;
+ }
+}
+
+static int alloc_failed(void)
+{
+ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n");
+ return -1;
+}
+
+int fuse_opt_add_arg(struct fuse_args *args, const char *arg)
+{
+ char **newargv;
+ char *newarg;
+
+ assert(!args->argv || args->allocated);
+
+ newarg = strdup(arg);
+ if (!newarg) {
+ return alloc_failed();
+ }
+
+ newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *));
+ if (!newargv) {
+ free(newarg);
+ return alloc_failed();
+ }
+
+ args->argv = newargv;
+ args->allocated = 1;
+ args->argv[args->argc++] = newarg;
+ args->argv[args->argc] = NULL;
+ return 0;
+}
+
+static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos,
+ const char *arg)
+{
+ assert(pos <= args->argc);
+ if (fuse_opt_add_arg(args, arg) == -1) {
+ return -1;
+ }
+
+ if (pos != args->argc - 1) {
+ char *newarg = args->argv[args->argc - 1];
+ memmove(&args->argv[pos + 1], &args->argv[pos],
+ sizeof(char *) * (args->argc - pos - 1));
+ args->argv[pos] = newarg;
+ }
+ return 0;
+}
+
+int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg)
+{
+ return fuse_opt_insert_arg_common(args, pos, arg);
+}
+
+static int next_arg(struct fuse_opt_context *ctx, const char *opt)
+{
+ if (ctx->argctr + 1 >= ctx->argc) {
+ fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt);
+ return -1;
+ }
+ ctx->argctr++;
+ return 0;
+}
+
+static int add_arg(struct fuse_opt_context *ctx, const char *arg)
+{
+ return fuse_opt_add_arg(&ctx->outargs, arg);
+}
+
+static int add_opt_common(char **opts, const char *opt, int esc)
+{
+ unsigned oldlen = *opts ? strlen(*opts) : 0;
+ char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1);
+
+ if (!d) {
+ return alloc_failed();
+ }
+
+ *opts = d;
+ if (oldlen) {
+ d += oldlen;
+ *d++ = ',';
+ }
+
+ for (; *opt; opt++) {
+ if (esc && (*opt == ',' || *opt == '\\')) {
+ *d++ = '\\';
+ }
+ *d++ = *opt;
+ }
+ *d = '\0';
+
+ return 0;
+}
+
+int fuse_opt_add_opt(char **opts, const char *opt)
+{
+ return add_opt_common(opts, opt, 0);
+}
+
+int fuse_opt_add_opt_escaped(char **opts, const char *opt)
+{
+ return add_opt_common(opts, opt, 1);
+}
+
+static int add_opt(struct fuse_opt_context *ctx, const char *opt)
+{
+ return add_opt_common(&ctx->opts, opt, 1);
+}
+
+static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key,
+ int iso)
+{
+ if (key == FUSE_OPT_KEY_DISCARD) {
+ return 0;
+ }
+
+ if (key != FUSE_OPT_KEY_KEEP && ctx->proc) {
+ int res = ctx->proc(ctx->data, arg, key, &ctx->outargs);
+ if (res == -1 || !res) {
+ return res;
+ }
+ }
+ if (iso) {
+ return add_opt(ctx, arg);
+ } else {
+ return add_arg(ctx, arg);
+ }
+}
+
+static int match_template(const char *t, const char *arg, unsigned *sepp)
+{
+ int arglen = strlen(arg);
+ const char *sep = strchr(t, '=');
+ sep = sep ? sep : strchr(t, ' ');
+ if (sep && (!sep[1] || sep[1] == '%')) {
+ int tlen = sep - t;
+ if (sep[0] == '=') {
+ tlen++;
+ }
+ if (arglen >= tlen && strncmp(arg, t, tlen) == 0) {
+ *sepp = sep - t;
+ return 1;
+ }
+ }
+ if (strcmp(t, arg) == 0) {
+ *sepp = 0;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct fuse_opt *find_opt(const struct fuse_opt *opt,
+ const char *arg, unsigned *sepp)
+{
+ for (; opt && opt->templ; opt++) {
+ if (match_template(opt->templ, arg, sepp)) {
+ return opt;
+ }
+ }
+ return NULL;
+}
+
+int fuse_opt_match(const struct fuse_opt *opts, const char *opt)
+{
+ unsigned dummy;
+ return find_opt(opts, opt, &dummy) ? 1 : 0;
+}
+
+static int process_opt_param(void *var, const char *format, const char *param,
+ const char *arg)
+{
+ assert(format[0] == '%');
+ if (format[1] == 's') {
+ char **s = var;
+ char *copy = strdup(param);
+ if (!copy) {
+ return alloc_failed();
+ }
+
+ free(*s);
+ *s = copy;
+ } else {
+ if (sscanf(param, format, var) != 1) {
+ fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n",
+ arg);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int process_opt(struct fuse_opt_context *ctx, const struct fuse_opt *opt,
+ unsigned sep, const char *arg, int iso)
+{
+ if (opt->offset == -1U) {
+ if (call_proc(ctx, arg, opt->value, iso) == -1) {
+ return -1;
+ }
+ } else {
+ void *var = (char *)ctx->data + opt->offset;
+ if (sep && opt->templ[sep + 1]) {
+ const char *param = arg + sep;
+ if (opt->templ[sep] == '=') {
+ param++;
+ }
+ if (process_opt_param(var, opt->templ + sep + 1, param, arg) ==
+ -1) {
+ return -1;
+ }
+ } else {
+ *(int *)var = opt->value;
+ }
+ }
+ return 0;
+}
+
+static int process_opt_sep_arg(struct fuse_opt_context *ctx,
+ const struct fuse_opt *opt, unsigned sep,
+ const char *arg, int iso)
+{
+ int res;
+ char *newarg;
+ char *param;
+
+ if (next_arg(ctx, arg) == -1) {
+ return -1;
+ }
+
+ param = ctx->argv[ctx->argctr];
+ newarg = malloc(sep + strlen(param) + 1);
+ if (!newarg) {
+ return alloc_failed();
+ }
+
+ memcpy(newarg, arg, sep);
+ strcpy(newarg + sep, param);
+ res = process_opt(ctx, opt, sep, newarg, iso);
+ free(newarg);
+
+ return res;
+}
+
+static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso)
+{
+ unsigned sep;
+ const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep);
+ if (opt) {
+ for (; opt; opt = find_opt(opt + 1, arg, &sep)) {
+ int res;
+ if (sep && opt->templ[sep] == ' ' && !arg[sep]) {
+ res = process_opt_sep_arg(ctx, opt, sep, arg, iso);
+ } else {
+ res = process_opt(ctx, opt, sep, arg, iso);
+ }
+ if (res == -1) {
+ return -1;
+ }
+ }
+ return 0;
+ } else {
+ return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso);
+ }
+}
+
+static int process_real_option_group(struct fuse_opt_context *ctx, char *opts)
+{
+ char *s = opts;
+ char *d = s;
+ int end = 0;
+
+ while (!end) {
+ if (*s == '\0') {
+ end = 1;
+ }
+ if (*s == ',' || end) {
+ int res;
+
+ *d = '\0';
+ res = process_gopt(ctx, opts, 1);
+ if (res == -1) {
+ return -1;
+ }
+ d = opts;
+ } else {
+ if (s[0] == '\\' && s[1] != '\0') {
+ s++;
+ if (s[0] >= '0' && s[0] <= '3' && s[1] >= '0' && s[1] <= '7' &&
+ s[2] >= '0' && s[2] <= '7') {
+ *d++ = (s[0] - '0') * 0100 + (s[1] - '0') * 0010 +
+ (s[2] - '0');
+ s += 2;
+ } else {
+ *d++ = *s;
+ }
+ } else {
+ *d++ = *s;
+ }
+ }
+ s++;
+ }
+
+ return 0;
+}
+
+static int process_option_group(struct fuse_opt_context *ctx, const char *opts)
+{
+ int res;
+ char *copy = strdup(opts);
+
+ if (!copy) {
+ fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n");
+ return -1;
+ }
+ res = process_real_option_group(ctx, copy);
+ free(copy);
+ return res;
+}
+
+static int process_one(struct fuse_opt_context *ctx, const char *arg)
+{
+ if (ctx->nonopt || arg[0] != '-') {
+ return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0);
+ } else if (arg[1] == 'o') {
+ if (arg[2]) {
+ return process_option_group(ctx, arg + 2);
+ } else {
+ if (next_arg(ctx, arg) == -1) {
+ return -1;
+ }
+
+ return process_option_group(ctx, ctx->argv[ctx->argctr]);
+ }
+ } else if (arg[1] == '-' && !arg[2]) {
+ if (add_arg(ctx, arg) == -1) {
+ return -1;
+ }
+ ctx->nonopt = ctx->outargs.argc;
+ return 0;
+ } else {
+ return process_gopt(ctx, arg, 0);
+ }
+}
+
+static int opt_parse(struct fuse_opt_context *ctx)
+{
+ if (ctx->argc) {
+ if (add_arg(ctx, ctx->argv[0]) == -1) {
+ return -1;
+ }
+ }
+
+ for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) {
+ if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) {
+ return -1;
+ }
+ }
+
+ if (ctx->opts) {
+ if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 ||
+ fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) {
+ return -1;
+ }
+ }
+
+ /* If option separator ("--") is the last argument, remove it */
+ if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc &&
+ strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) {
+ free(ctx->outargs.argv[ctx->outargs.argc - 1]);
+ ctx->outargs.argv[--ctx->outargs.argc] = NULL;
+ }
+
+ return 0;
+}
+
+int fuse_opt_parse(struct fuse_args *args, void *data,
+ const struct fuse_opt opts[], fuse_opt_proc_t proc)
+{
+ int res;
+ struct fuse_opt_context ctx = {
+ .data = data,
+ .opt = opts,
+ .proc = proc,
+ };
+
+ if (!args || !args->argv || !args->argc) {
+ return 0;
+ }
+
+ ctx.argc = args->argc;
+ ctx.argv = args->argv;
+
+ res = opt_parse(&ctx);
+ if (res != -1) {
+ struct fuse_args tmp = *args;
+ *args = ctx.outargs;
+ ctx.outargs = tmp;
+ }
+ free(ctx.opts);
+ fuse_opt_free_args(&ctx.outargs);
+ return res;
+}
diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h
new file mode 100644
index 0000000000..8f59b4d301
--- /dev/null
+++ b/tools/virtiofsd/fuse_opt.h
@@ -0,0 +1,272 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_OPT_H_
+#define FUSE_OPT_H_
+
+/** @file
+ *
+ * This file defines the option parsing interface of FUSE
+ */
+
+/**
+ * Option description
+ *
+ * This structure describes a single option and the action associated
+ * with it in case it matches.
+ *
+ * More than one such match may occur, in which case the action for
+ * each match is executed.
+ *
+ * There are three possible actions in case of a match:
+ *
+ * i) An integer (int or unsigned) variable determined by 'offset' is
+ * set to 'value'
+ *
+ * ii) The processing function is called, with 'value' as the key
+ *
+ * iii) An integer (any) or string (char *) variable determined by
+ * 'offset' is set to the value of an option parameter
+ *
+ * 'offset' should normally be either set to
+ *
+ * - 'offsetof(struct foo, member)', for actions i) and iii)
+ *
+ * - -1, for action ii)
+ *
+ * The 'offsetof()' macro is defined in the <stddef.h> header.
+ *
+ * The template determines which options match, and also has an
+ * effect on the action. Normally the action is either i) or ii), but
+ * if a format is present in the template, then action iii) is
+ * performed.
+ *
+ * The types of templates are:
+ *
+ * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only
+ * themselves. Invalid values are "--" and anything beginning
+ * with "-o"
+ *
+ * 2) "foo", "foo-bar", etc. These match "-ofoo", "-ofoo-bar" or
+ * the relevant option in a comma separated option list
+ *
+ * 3) "bar=", "--foo=", etc. These are variations of 1) and 2)
+ * which have a parameter
+ *
+ * 4) "bar=%s", "--foo=%lu", etc. Same matching as above but perform
+ * action iii).
+ *
+ * 5) "-x ", etc. Matches either "-xparam" or "-x param" as
+ * two separate arguments
+ *
+ * 6) "-x %s", etc. Combination of 4) and 5)
+ *
+ * If the format is "%s", memory is allocated for the string unlike with
+ * scanf(). The previous value (if non-NULL) stored at this location is
+ * freed.
+ */
+struct fuse_opt {
+ /** Matching template and optional parameter formatting */
+ const char *templ;
+
+ /**
+ * Offset of variable within 'data' parameter of fuse_opt_parse()
+ * or -1
+ */
+ unsigned long offset;
+
+ /**
+ * Value to set the variable to, or to be passed as 'key' to the
+ * processing function. Ignored if template has a format
+ */
+ int value;
+};
+
+/**
+ * Key option. In case of a match, the processing function will be
+ * called with the specified key.
+ */
+#define FUSE_OPT_KEY(templ, key) \
+ { \
+ templ, -1U, key \
+ }
+
+/**
+ * Last option. An array of 'struct fuse_opt' must end with a NULL
+ * template value
+ */
+#define FUSE_OPT_END \
+ { \
+ NULL, 0, 0 \
+ }
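+
+/*
+ * A minimal sketch of an option table using the templates described
+ * above (types 2 and 4) plus a key option. The struct, field and key
+ * names are hypothetical and only serve to illustrate how 'offset'
+ * and 'value' are filled in.
+ *
+ *   struct my_opts {
+ *       int debug;
+ *       unsigned long timeout;
+ *       char *source;
+ *   };
+ *
+ *   enum { KEY_VERSION };
+ *
+ *   static const struct fuse_opt my_opt_spec[] = {
+ *       { "debug",       offsetof(struct my_opts, debug),   1 },
+ *       { "timeout=%lu", offsetof(struct my_opts, timeout), 0 },
+ *       { "source=%s",   offsetof(struct my_opts, source),  0 },
+ *       FUSE_OPT_KEY("-V", KEY_VERSION),
+ *       FUSE_OPT_END
+ *   };
+ */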
+
+/**
+ * Argument list
+ */
+struct fuse_args {
+ /** Argument count */
+ int argc;
+
+ /** Argument vector. NULL terminated */
+ char **argv;
+
+ /** Is 'argv' allocated? */
+ int allocated;
+};
+
+/**
+ * Initializer for 'struct fuse_args'
+ */
+#define FUSE_ARGS_INIT(argc, argv) \
+ { \
+ argc, argv, 0 \
+ }
+
+/**
+ * Key value passed to the processing function if an option did not
+ * match any template
+ */
+#define FUSE_OPT_KEY_OPT -1
+
+/**
+ * Key value passed to the processing function for all non-options
+ *
+ * Non-options are the arguments beginning with a character other than
+ * '-' or all arguments after the special '--' option
+ */
+#define FUSE_OPT_KEY_NONOPT -2
+
+/**
+ * Special key value for options to keep
+ *
+ * The argument is not passed to the processing function, but behaves
+ * as if the processing function returned 1
+ */
+#define FUSE_OPT_KEY_KEEP -3
+
+/**
+ * Special key value for options to discard
+ *
+ * The argument is not passed to the processing function, but behaves
+ * as if the processing function returned zero
+ */
+#define FUSE_OPT_KEY_DISCARD -4
+
+/**
+ * Processing function
+ *
+ * This function is called if
+ * - option did not match any 'struct fuse_opt'
+ * - argument is a non-option
+ * - option did match and offset was set to -1
+ *
+ * The 'arg' parameter will always contain the whole argument or
+ * option including the parameter if one exists. A two-argument option
+ * ("-x foo") is always converted to a single-argument option of the
+ * form "-xfoo" before this function is called.
+ *
+ * Options of the form '-ofoo' are passed to this function without the
+ * '-o' prefix.
+ *
+ * The return value of this function determines whether this argument
+ * is to be inserted into the output argument vector, or discarded.
+ *
+ * @param data is the user data passed to the fuse_opt_parse() function
+ * @param arg is the whole argument or option
+ * @param key determines why the processing function was called
+ * @param outargs the current output argument list
+ * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept
+ */
+typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key,
+ struct fuse_args *outargs);
+
+/**
+ * Option parsing function
+ *
+ * If 'args' was returned from a previous call to fuse_opt_parse() or
+ * it was constructed with FUSE_ARGS_INIT(), its previous contents are
+ * replaced by the parsed output arguments
+ *
+ * A NULL 'args' is equivalent to an empty argument vector
+ *
+ * A NULL 'opts' is equivalent to an 'opts' array containing a single
+ * end marker
+ *
+ * A NULL 'proc' is equivalent to a processing function always
+ * returning '1'
+ *
+ * @param args is the input and output argument list
+ * @param data is the user data
+ * @param opts is the option description array
+ * @param proc is the processing function
+ * @return -1 on error, 0 on success
+ */
+int fuse_opt_parse(struct fuse_args *args, void *data,
+ const struct fuse_opt opts[], fuse_opt_proc_t proc);
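+
+/*
+ * A minimal usage sketch (hypothetical names, reusing the my_opts and
+ * my_opt_spec example shown after FUSE_OPT_END above). The processing
+ * function keeps every argument it is asked about; a real filesystem
+ * would typically handle FUSE_OPT_KEY_NONOPT (e.g. the mountpoint) there.
+ *
+ *   static int my_opt_proc(void *data, const char *arg, int key,
+ *                          struct fuse_args *outargs)
+ *   {
+ *       return 1;    // keep unmatched options and non-options
+ *   }
+ *
+ *   // in main(), with argc/argv from the command line:
+ *   struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ *   struct my_opts opts = { 0 };
+ *
+ *   if (fuse_opt_parse(&args, &opts, my_opt_spec, my_opt_proc) == -1) {
+ *       // handle the error
+ *   }
+ *   // ... use opts.debug, opts.timeout, opts.source ...
+ *   fuse_opt_free_args(&args);
+ */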
+
+/**
+ * Add an option to a comma separated option list
+ *
+ * @param opts is a pointer to an option list, may point to a NULL value
+ * @param opt is the option to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_add_opt(char **opts, const char *opt);
+
+/**
+ * Add an option, escaping commas, to a comma separated option list
+ *
+ * @param opts is a pointer to an option list, may point to a NULL value
+ * @param opt is the option to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_add_opt_escaped(char **opts, const char *opt);
+
+/**
+ * Add an argument to a NULL terminated argument vector
+ *
+ * @param args is the structure containing the current argument list
+ * @param arg is the new argument to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_add_arg(struct fuse_args *args, const char *arg);
+
+/**
+ * Add an argument at the specified position in a NULL terminated
+ * argument vector
+ *
+ * Adds the argument to the N-th position. This is useful for adding
+ * options at the beginning of the array which must not come after the
+ * special '--' option.
+ *
+ * @param args is the structure containing the current argument list
+ * @param pos is the position at which to add the argument
+ * @param arg is the new argument to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg);
+
+/**
+ * Free the contents of argument list
+ *
+ * The structure itself is not freed
+ *
+ * @param args is the structure containing the argument list
+ */
+void fuse_opt_free_args(struct fuse_args *args);
+
+
+/**
+ * Check if an option matches
+ *
+ * @param opts is the option description array
+ * @param opt is the option to match
+ * @return 1 if a match is found, 0 if not
+ */
+int fuse_opt_match(const struct fuse_opt opts[], const char *opt);
+
+#endif /* FUSE_OPT_H_ */
diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c
new file mode 100644
index 0000000000..f18625b6e2
--- /dev/null
+++ b/tools/virtiofsd/fuse_signals.c
@@ -0,0 +1,98 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Utility functions for setting signal handlers.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "fuse_lowlevel.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static struct fuse_session *fuse_instance;
+
+static void exit_handler(int sig)
+{
+ if (fuse_instance) {
+ fuse_session_exit(fuse_instance);
+ if (sig <= 0) {
+ fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n");
+ abort();
+ }
+ fuse_instance->error = sig;
+ }
+}
+
+static void do_nothing(int sig)
+{
+ (void)sig;
+}
+
+static int set_one_signal_handler(int sig, void (*handler)(int), int remove)
+{
+ struct sigaction sa;
+ struct sigaction old_sa;
+
+ memset(&sa, 0, sizeof(struct sigaction));
+ sa.sa_handler = remove ? SIG_DFL : handler;
+ sigemptyset(&(sa.sa_mask));
+ sa.sa_flags = 0;
+
+ if (sigaction(sig, NULL, &old_sa) == -1) {
+ fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ if (old_sa.sa_handler == (remove ? handler : SIG_DFL) &&
+ sigaction(sig, &sa, NULL) == -1) {
+ fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n",
+ strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+int fuse_set_signal_handlers(struct fuse_session *se)
+{
+ /*
+ * If we used SIG_IGN instead of the do_nothing function,
+ * then we would be unable to tell if we set SIG_IGN (and
+ * thus should reset to SIG_DFL in fuse_remove_signal_handlers)
+ * or if it was already set to SIG_IGN (and should be left
+     * untouched).
+ */
+ if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 ||
+ set_one_signal_handler(SIGINT, exit_handler, 0) == -1 ||
+ set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 ||
+ set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) {
+ return -1;
+ }
+
+ fuse_instance = se;
+ return 0;
+}
+
+void fuse_remove_signal_handlers(struct fuse_session *se)
+{
+ if (fuse_instance != se) {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: fuse_remove_signal_handlers: unknown session\n");
+ } else {
+ fuse_instance = NULL;
+ }
+
+ set_one_signal_handler(SIGHUP, exit_handler, 1);
+ set_one_signal_handler(SIGINT, exit_handler, 1);
+ set_one_signal_handler(SIGTERM, exit_handler, 1);
+ set_one_signal_handler(SIGPIPE, do_nothing, 1);
+}
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
new file mode 100644
index 0000000000..80a6e929df
--- /dev/null
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -0,0 +1,986 @@
+/*
+ * virtio-fs glue for FUSE
+ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * Implements the glue between libfuse and libvhost-user
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qapi/error.h"
+#include "fuse_i.h"
+#include "standard-headers/linux/fuse.h"
+#include "fuse_misc.h"
+#include "fuse_opt.h"
+#include "fuse_virtio.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <glib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include "contrib/libvhost-user/libvhost-user.h"
+
+struct fv_VuDev;
+struct fv_QueueInfo {
+ pthread_t thread;
+ /*
+ * This lock protects the VuVirtq preventing races between
+ * fv_queue_thread() and fv_queue_worker().
+ */
+ pthread_mutex_t vq_lock;
+
+ struct fv_VuDev *virtio_dev;
+
+ /* Our queue index, corresponds to array position */
+ int qidx;
+ int kick_fd;
+ int kill_fd; /* For killing the thread */
+};
+
+/* A FUSE request */
+typedef struct {
+ VuVirtqElement elem;
+ struct fuse_chan ch;
+
+ /* Used to complete requests that involve no reply */
+ bool reply_sent;
+} FVRequest;
+
+/*
+ * We pass the dev element into libvhost-user
+ * and then use it to get back to the outer
+ * container for other data.
+ */
+struct fv_VuDev {
+ VuDev dev;
+ struct fuse_session *se;
+
+ /*
+ * Either handle virtqueues or vhost-user protocol messages. Don't do
+ * both at the same time since that could lead to race conditions if
+ * virtqueues or memory tables change while another thread is accessing
+ * them.
+ *
+ * The assumptions are:
+ * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
+ * 2. virtio_loop() reads/writes virtqueues and VuDev.
+ */
+ pthread_rwlock_t vu_dispatch_rwlock;
+
+ /*
+ * The following pair of fields are only accessed in the main
+ * virtio_loop
+ */
+ size_t nqueues;
+ struct fv_QueueInfo **qi;
+};
+
+/* From spec */
+struct virtio_fs_config {
+ char tag[36];
+ uint32_t num_queues;
+};
+
+/* Callback from libvhost-user */
+static uint64_t fv_get_features(VuDev *dev)
+{
+ return 1ULL << VIRTIO_F_VERSION_1;
+}
+
+/* Callback from libvhost-user */
+static void fv_set_features(VuDev *dev, uint64_t features)
+{
+}
+
+/*
+ * Callback from libvhost-user if there's a new fd we're supposed to listen
+ * to, typically a queue kick?
+ */
+static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
+ void *data)
+{
+ fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
+}
+
+/*
+ * Callback from libvhost-user if we're no longer supposed to listen on an fd
+ */
+static void fv_remove_watch(VuDev *dev, int fd)
+{
+ fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
+}
+
+/* Callback from libvhost-user to panic */
+static void fv_panic(VuDev *dev, const char *err)
+{
+ fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
+ /* TODO: Allow reconnects?? */
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * Copy from an iovec into a fuse_buf (memory only)
+ * Caller must ensure there is space
+ */
+static void copy_from_iov(struct fuse_buf *buf, size_t out_num,
+ const struct iovec *out_sg)
+{
+ void *dest = buf->mem;
+
+ while (out_num) {
+ size_t onelen = out_sg->iov_len;
+ memcpy(dest, out_sg->iov_base, onelen);
+ dest += onelen;
+ out_sg++;
+ out_num--;
+ }
+}
+
+/*
+ * Copy from one iov to another, the given number of bytes
+ * The caller must have checked sizes.
+ */
+static void copy_iov(struct iovec *src_iov, int src_count,
+ struct iovec *dst_iov, int dst_count, size_t to_copy)
+{
+ size_t dst_offset = 0;
+ /* Outer loop copies 'src' elements */
+ while (to_copy) {
+ assert(src_count);
+ size_t src_len = src_iov[0].iov_len;
+ size_t src_offset = 0;
+
+ if (src_len > to_copy) {
+ src_len = to_copy;
+ }
+ /* Inner loop copies contents of one 'src' to maybe multiple dst. */
+ while (src_len) {
+ assert(dst_count);
+ size_t dst_len = dst_iov[0].iov_len - dst_offset;
+ if (dst_len > src_len) {
+ dst_len = src_len;
+ }
+
+ memcpy(dst_iov[0].iov_base + dst_offset,
+ src_iov[0].iov_base + src_offset, dst_len);
+ src_len -= dst_len;
+ to_copy -= dst_len;
+ src_offset += dst_len;
+ dst_offset += dst_len;
+
+ assert(dst_offset <= dst_iov[0].iov_len);
+ if (dst_offset == dst_iov[0].iov_len) {
+ dst_offset = 0;
+ dst_iov++;
+ dst_count--;
+ }
+ }
+ src_iov++;
+ src_count--;
+ }
+}
+
+/*
+ * Called back by ll whenever it wants to send a reply/message back.
+ * The 1st element of the iov starts with the fuse_out_header;
+ * 'unique' == 0 means it's a notify message.
+ */
+int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count)
+{
+ FVRequest *req = container_of(ch, FVRequest, ch);
+ struct fv_QueueInfo *qi = ch->qi;
+ VuDev *dev = &se->virtio_dev->dev;
+ VuVirtq *q = vu_get_queue(dev, qi->qidx);
+ VuVirtqElement *elem = &req->elem;
+ int ret = 0;
+
+ assert(count >= 1);
+ assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
+
+ struct fuse_out_header *out = iov[0].iov_base;
+ /* TODO: Endianness! */
+
+ size_t tosend_len = iov_size(iov, count);
+
+ /* unique == 0 is notification, which we don't support */
+ assert(out->unique);
+ assert(!req->reply_sent);
+
+ /* The 'in' part of the elem is to qemu */
+ unsigned int in_num = elem->in_num;
+ struct iovec *in_sg = elem->in_sg;
+ size_t in_len = iov_size(in_sg, in_num);
+ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
+ __func__, elem->index, in_num, in_len);
+
+ /*
+ * The elem should have room for a 'fuse_out_header' (out from fuse)
+ * plus the data based on the len in the header.
+ */
+ if (in_len < sizeof(struct fuse_out_header)) {
+ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
+ __func__, elem->index);
+ ret = -E2BIG;
+ goto err;
+ }
+ if (in_len < tosend_len) {
+ fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
+ __func__, elem->index, tosend_len);
+ ret = -E2BIG;
+ goto err;
+ }
+
+ copy_iov(iov, count, in_sg, in_num, tosend_len);
+
+ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ pthread_mutex_lock(&qi->vq_lock);
+ vu_queue_push(dev, q, elem, tosend_len);
+ vu_queue_notify(dev, q);
+ pthread_mutex_unlock(&qi->vq_lock);
+ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
+
+ req->reply_sent = true;
+
+err:
+ return ret;
+}
+
+/*
+ * Callback from fuse_send_data_iov_* when it's virtio and the buffer
+ * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK
+ * We need to send the iov and then the buffer.
+ * Return 0 on success
+ */
+int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count, struct fuse_bufvec *buf,
+ size_t len)
+{
+ FVRequest *req = container_of(ch, FVRequest, ch);
+ struct fv_QueueInfo *qi = ch->qi;
+ VuDev *dev = &se->virtio_dev->dev;
+ VuVirtq *q = vu_get_queue(dev, qi->qidx);
+ VuVirtqElement *elem = &req->elem;
+ int ret = 0;
+
+ assert(count >= 1);
+ assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
+
+ struct fuse_out_header *out = iov[0].iov_base;
+ /* TODO: Endianness! */
+
+ size_t iov_len = iov_size(iov, count);
+ size_t tosend_len = iov_len + len;
+
+ out->len = tosend_len;
+
+ fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
+ count, len, iov_len);
+
+ /* unique == 0 is notification which we don't support */
+ assert(out->unique);
+
+ assert(!req->reply_sent);
+
+ /* The 'in' part of the elem is to qemu */
+ unsigned int in_num = elem->in_num;
+ struct iovec *in_sg = elem->in_sg;
+ size_t in_len = iov_size(in_sg, in_num);
+ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
+ __func__, elem->index, in_num, in_len);
+
+ /*
+ * The elem should have room for a 'fuse_out_header' (out from fuse)
+ * plus the data based on the len in the header.
+ */
+ if (in_len < sizeof(struct fuse_out_header)) {
+ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
+ __func__, elem->index);
+ ret = E2BIG;
+ goto err;
+ }
+ if (in_len < tosend_len) {
+ fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
+ __func__, elem->index, tosend_len);
+ ret = E2BIG;
+ goto err;
+ }
+
+ /* TODO: Limit to 'len' */
+
+ /* First copy the header data from iov->in_sg */
+ copy_iov(iov, count, in_sg, in_num, iov_len);
+
+ /*
+     * Build a copy of the in_sg iov so we can skip bits in it,
+ * including changing the offsets
+ */
+ struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num);
+ assert(in_sg_cpy);
+ memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
+ /* These get updated as we skip */
+ struct iovec *in_sg_ptr = in_sg_cpy;
+ int in_sg_cpy_count = in_num;
+
+ /* skip over parts of in_sg that contained the header iov */
+ size_t skip_size = iov_len;
+
+ size_t in_sg_left = 0;
+ do {
+ while (skip_size != 0 && in_sg_cpy_count) {
+ if (skip_size >= in_sg_ptr[0].iov_len) {
+ skip_size -= in_sg_ptr[0].iov_len;
+ in_sg_ptr++;
+ in_sg_cpy_count--;
+ } else {
+ in_sg_ptr[0].iov_len -= skip_size;
+ in_sg_ptr[0].iov_base += skip_size;
+ break;
+ }
+ }
+
+ int i;
+ for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) {
+ in_sg_left += in_sg_ptr[i].iov_len;
+ }
+ fuse_log(FUSE_LOG_DEBUG,
+ "%s: after skip skip_size=%zd in_sg_cpy_count=%d "
+ "in_sg_left=%zd\n",
+ __func__, skip_size, in_sg_cpy_count, in_sg_left);
+ ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
+ buf->buf[0].pos);
+
+ if (ret == -1) {
+ ret = errno;
+ fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
+ __func__, len);
+ free(in_sg_cpy);
+ goto err;
+ }
+ fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
+ ret, len);
+ if (ret < len && ret) {
+ fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
+ /* Skip over this much next time around */
+ skip_size = ret;
+ buf->buf[0].pos += ret;
+ len -= ret;
+
+            /* Let's do another read */
+ continue;
+ }
+ if (!ret) {
+ /* EOF case? */
+ fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__,
+ in_sg_left);
+ break;
+ }
+ if (ret != len) {
+ fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__);
+ ret = EIO;
+ free(in_sg_cpy);
+ goto err;
+ }
+ in_sg_left -= ret;
+ len -= ret;
+ } while (in_sg_left);
+ free(in_sg_cpy);
+
+ /* Need to fix out->len on EOF */
+ if (len) {
+ struct fuse_out_header *out_sg = in_sg[0].iov_base;
+
+ tosend_len -= len;
+ out_sg->len = tosend_len;
+ }
+
+ ret = 0;
+
+ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ pthread_mutex_lock(&qi->vq_lock);
+ vu_queue_push(dev, q, elem, tosend_len);
+ vu_queue_notify(dev, q);
+ pthread_mutex_unlock(&qi->vq_lock);
+ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
+
+err:
+ if (ret == 0) {
+ req->reply_sent = true;
+ }
+
+ return ret;
+}
+
+/* Process one FVRequest in a thread pool */
+static void fv_queue_worker(gpointer data, gpointer user_data)
+{
+ struct fv_QueueInfo *qi = user_data;
+ struct fuse_session *se = qi->virtio_dev->se;
+ struct VuDev *dev = &qi->virtio_dev->dev;
+ FVRequest *req = data;
+ VuVirtqElement *elem = &req->elem;
+ struct fuse_buf fbuf = {};
+ bool allocated_bufv = false;
+ struct fuse_bufvec bufv;
+ struct fuse_bufvec *pbufv;
+
+ assert(se->bufsize > sizeof(struct fuse_in_header));
+
+ /*
+ * An element contains one request and the space to send our response
+ * They're spread over multiple descriptors in a scatter/gather set
+ * and we can't trust the guest to keep them still; so copy in/out.
+ */
+ fbuf.mem = malloc(se->bufsize);
+ assert(fbuf.mem);
+
+ fuse_mutex_init(&req->ch.lock);
+ req->ch.fd = -1;
+ req->ch.qi = qi;
+
+ /* The 'out' part of the elem is from qemu */
+ unsigned int out_num = elem->out_num;
+ struct iovec *out_sg = elem->out_sg;
+ size_t out_len = iov_size(out_sg, out_num);
+ fuse_log(FUSE_LOG_DEBUG,
+ "%s: elem %d: with %d out desc of length %zd\n",
+ __func__, elem->index, out_num, out_len);
+
+ /*
+ * The elem should contain a 'fuse_in_header' (in to fuse)
+ * plus the data based on the len in the header.
+ */
+ if (out_len < sizeof(struct fuse_in_header)) {
+ fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
+ __func__, elem->index);
+ assert(0); /* TODO */
+ }
+ if (out_len > se->bufsize) {
+ fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
+ elem->index);
+ assert(0); /* TODO */
+ }
+ /* Copy just the first element and look at it */
+ copy_from_iov(&fbuf, 1, out_sg);
+
+    pbufv = NULL; /* Compiler thinks there is an uninitialised path */
+ if (out_num > 2 &&
+ out_sg[0].iov_len == sizeof(struct fuse_in_header) &&
+ ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE &&
+ out_sg[1].iov_len == sizeof(struct fuse_write_in)) {
+ /*
+ * For a write we don't actually need to copy the
+ * data, we can just do it straight out of guest memory
+ * but we must still copy the headers in case the guest
+ * was nasty and changed them while we were using them.
+ */
+ fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
+
+        /* copy the fuse_write_in header after the fuse_in_header */
+ fbuf.mem += out_sg->iov_len;
+ copy_from_iov(&fbuf, 1, out_sg + 1);
+ fbuf.mem -= out_sg->iov_len;
+ fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len;
+
+ /* Allocate the bufv, with space for the rest of the iov */
+ pbufv = malloc(sizeof(struct fuse_bufvec) +
+ sizeof(struct fuse_buf) * (out_num - 2));
+ if (!pbufv) {
+ fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
+ __func__);
+ goto out;
+ }
+
+ allocated_bufv = true;
+ pbufv->count = 1;
+ pbufv->buf[0] = fbuf;
+
+ size_t iovindex, pbufvindex;
+ iovindex = 2; /* 2 headers, separate iovs */
+ pbufvindex = 1; /* 2 headers, 1 fusebuf */
+
+ for (; iovindex < out_num; iovindex++, pbufvindex++) {
+ pbufv->count++;
+ pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
+ pbufv->buf[pbufvindex].flags = 0;
+ pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
+ pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
+ }
+ } else {
+ /* Normal (non fast write) path */
+
+ /* Copy the rest of the buffer */
+ fbuf.mem += out_sg->iov_len;
+ copy_from_iov(&fbuf, out_num - 1, out_sg + 1);
+ fbuf.mem -= out_sg->iov_len;
+ fbuf.size = out_len;
+
+ /* TODO! Endianness of header */
+
+ /* TODO: Add checks for fuse_session_exited */
+ bufv.buf[0] = fbuf;
+ bufv.count = 1;
+ pbufv = &bufv;
+ }
+ pbufv->idx = 0;
+ pbufv->off = 0;
+ fuse_session_process_buf_int(se, pbufv, &req->ch);
+
+out:
+ if (allocated_bufv) {
+ free(pbufv);
+ }
+
+ /* If the request has no reply, still recycle the virtqueue element */
+ if (!req->reply_sent) {
+ struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
+
+ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
+ elem->index);
+
+ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ pthread_mutex_lock(&qi->vq_lock);
+ vu_queue_push(dev, q, elem, 0);
+ vu_queue_notify(dev, q);
+ pthread_mutex_unlock(&qi->vq_lock);
+ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ }
+
+ pthread_mutex_destroy(&req->ch.lock);
+ free(fbuf.mem);
+ free(req);
+}
+
+/* Thread function for individual queues, created when a queue is 'started' */
+static void *fv_queue_thread(void *opaque)
+{
+ struct fv_QueueInfo *qi = opaque;
+ struct VuDev *dev = &qi->virtio_dev->dev;
+ struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
+ struct fuse_session *se = qi->virtio_dev->se;
+ GThreadPool *pool;
+
+ pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, TRUE,
+ NULL);
+ if (!pool) {
+ fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
+ return NULL;
+ }
+
+ fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
+ qi->qidx, qi->kick_fd);
+ while (1) {
+ struct pollfd pf[2];
+ int ret;
+
+ pf[0].fd = qi->kick_fd;
+ pf[0].events = POLLIN;
+ pf[0].revents = 0;
+ pf[1].fd = qi->kill_fd;
+ pf[1].events = POLLIN;
+ pf[1].revents = 0;
+
+ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
+ qi->qidx);
+ int poll_res = ppoll(pf, 2, NULL, NULL);
+
+ if (poll_res == -1) {
+ if (errno == EINTR) {
+ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
+ __func__);
+ continue;
+ }
+ fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
+ break;
+ }
+ assert(poll_res >= 1);
+ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
+ fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
+ __func__, pf[0].revents, qi->qidx);
+ break;
+ }
+ if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Unexpected poll revents %x Queue %d killfd\n",
+ __func__, pf[1].revents, qi->qidx);
+ break;
+ }
+ if (pf[1].revents) {
+ fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
+ __func__, qi->qidx);
+ break;
+ }
+ assert(pf[0].revents & POLLIN);
+ fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
+ qi->qidx);
+
+ eventfd_t evalue;
+ if (eventfd_read(qi->kick_fd, &evalue)) {
+ fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
+ break;
+ }
+ /* Mutual exclusion with virtio_loop() */
+ ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ assert(ret == 0); /* there is no possible error case */
+ pthread_mutex_lock(&qi->vq_lock);
+        /* 'out' is from the guest, 'in' is to the guest */
+ unsigned int in_bytes, out_bytes;
+ vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
+ __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
+
+ while (1) {
+ FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
+ if (!req) {
+ break;
+ }
+
+ req->reply_sent = false;
+
+ g_thread_pool_push(pool, req, NULL);
+ }
+
+ pthread_mutex_unlock(&qi->vq_lock);
+ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ }
+
+ g_thread_pool_free(pool, FALSE, TRUE);
+
+ return NULL;
+}
+
+static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
+{
+ int ret;
+ struct fv_QueueInfo *ourqi;
+
+ assert(qidx < vud->nqueues);
+ ourqi = vud->qi[qidx];
+
+ /* Kill the thread */
+ if (eventfd_write(ourqi->kill_fd, 1)) {
+ fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
+ qidx, strerror(errno));
+ }
+ ret = pthread_join(ourqi->thread, NULL);
+ if (ret) {
+ fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
+ __func__, qidx, ret);
+ }
+ pthread_mutex_destroy(&ourqi->vq_lock);
+ close(ourqi->kill_fd);
+ ourqi->kick_fd = -1;
+ free(vud->qi[qidx]);
+ vud->qi[qidx] = NULL;
+}
+
+/* Callback from libvhost-user on start or stop of a queue */
+static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
+{
+ struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
+ struct fv_QueueInfo *ourqi;
+
+ fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
+ started);
+ assert(qidx >= 0);
+
+ /*
+ * Ignore additional request queues for now. passthrough_ll.c must be
+ * audited for thread-safety issues first. It was written with a
+ * well-behaved client in mind and may not protect against all types of
+ * races yet.
+ */
+ if (qidx > 1) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: multiple request queues not yet implemented, please only "
+ "configure 1 request queue\n",
+ __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ if (started) {
+ /* Fire up a thread to watch this queue */
+ if (qidx >= vud->nqueues) {
+ vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0]));
+ assert(vud->qi);
+ memset(vud->qi + vud->nqueues, 0,
+ sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
+ vud->nqueues = qidx + 1;
+ }
+ if (!vud->qi[qidx]) {
+ vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1);
+ assert(vud->qi[qidx]);
+ vud->qi[qidx]->virtio_dev = vud;
+ vud->qi[qidx]->qidx = qidx;
+ } else {
+ /* Shouldn't have been started */
+ assert(vud->qi[qidx]->kick_fd == -1);
+ }
+ ourqi = vud->qi[qidx];
+ ourqi->kick_fd = dev->vq[qidx].kick_fd;
+
+ ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
+ assert(ourqi->kill_fd != -1);
+ pthread_mutex_init(&ourqi->vq_lock, NULL);
+
+ if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
+ fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
+ __func__, qidx);
+ assert(0);
+ }
+ } else {
+ fv_queue_cleanup_thread(vud, qidx);
+ }
+}
+
+static bool fv_queue_order(VuDev *dev, int qidx)
+{
+ return false;
+}
+
+static const VuDevIface fv_iface = {
+ .get_features = fv_get_features,
+ .set_features = fv_set_features,
+
+    /* No process-message callback needed; none exist at the vhost-user level */
+ .queue_set_started = fv_queue_set_started,
+
+ .queue_is_processed_in_order = fv_queue_order,
+};
+
+/*
+ * Main loop; this mostly deals with events on the vhost-user
+ * socket itself, and not actual fuse data.
+ */
+int virtio_loop(struct fuse_session *se)
+{
+ fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
+
+ while (!fuse_session_exited(se)) {
+ struct pollfd pf[1];
+ bool ok;
+ int ret;
+ pf[0].fd = se->vu_socketfd;
+ pf[0].events = POLLIN;
+ pf[0].revents = 0;
+
+ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
+ int poll_res = ppoll(pf, 1, NULL, NULL);
+
+ if (poll_res == -1) {
+ if (errno == EINTR) {
+ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
+ __func__);
+ continue;
+ }
+ fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
+ break;
+ }
+ assert(poll_res == 1);
+ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
+ fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
+ pf[0].revents);
+ break;
+ }
+ assert(pf[0].revents & POLLIN);
+ fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
+ /* Mutual exclusion with fv_queue_thread() */
+ ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock);
+ assert(ret == 0); /* there is no possible error case */
+
+ ok = vu_dispatch(&se->virtio_dev->dev);
+
+ pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock);
+
+ if (!ok) {
+ fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
+ break;
+ }
+ }
+
+ /*
+ * Make sure all fv_queue_thread()s quit on exit, as we're about to
+     * free the virtio dev and fuse session; no one should access
+     * them anymore.
+ */
+ for (int i = 0; i < se->virtio_dev->nqueues; i++) {
+ if (!se->virtio_dev->qi[i]) {
+ continue;
+ }
+
+ fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i);
+ fv_queue_cleanup_thread(se->virtio_dev, i);
+ }
+
+ fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
+
+ return 0;
+}
+
+static void strreplace(char *s, char old, char new)
+{
+ for (; *s; ++s) {
+ if (*s == old) {
+ *s = new;
+ }
+ }
+}
+
+static bool fv_socket_lock(struct fuse_session *se)
+{
+ g_autofree gchar *sk_name = NULL;
+ g_autofree gchar *pidfile = NULL;
+ g_autofree gchar *dir = NULL;
+ Error *local_err = NULL;
+
+ dir = qemu_get_local_state_pathname("run/virtiofsd");
+
+ if (g_mkdir_with_parents(dir, S_IRWXU) < 0) {
+ fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s",
+ __func__, dir, strerror(errno));
+ return false;
+ }
+
+ sk_name = g_strdup(se->vu_socket_path);
+ strreplace(sk_name, '/', '.');
+ pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name);
+
+ if (!qemu_write_pidfile(pidfile, &local_err)) {
+ error_report_err(local_err);
+ return false;
+ }
+
+ return true;
+}
+
+static int fv_create_listen_socket(struct fuse_session *se)
+{
+ struct sockaddr_un un;
+ mode_t old_umask;
+
+ /* Nothing to do if fd is already initialized */
+ if (se->vu_listen_fd >= 0) {
+ return 0;
+ }
+
+ if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
+ fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
+ return -1;
+ }
+
+ if (!strlen(se->vu_socket_path)) {
+ fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
+ return -1;
+ }
+
+    /* Check that the vu_socket_path is not already in use */
+ if (!fv_socket_lock(se)) {
+ return -1;
+ }
+
+ /*
+ * Create the Unix socket to communicate with qemu
+ * based on QEMU's vhost-user-bridge
+ */
+ unlink(se->vu_socket_path);
+ strcpy(un.sun_path, se->vu_socket_path);
+ size_t addr_len = sizeof(un);
+
+ int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (listen_sock == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
+ return -1;
+ }
+ un.sun_family = AF_UNIX;
+
+ /*
+ * Unfortunately bind doesn't let you set the mask on the socket,
+ * so set umask to 077 and restore it later.
+ */
+ old_umask = umask(0077);
+ if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
+ umask(old_umask);
+ return -1;
+ }
+ umask(old_umask);
+
+ if (listen(listen_sock, 1) == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
+ return -1;
+ }
+
+ se->vu_listen_fd = listen_sock;
+ return 0;
+}
+
+int virtio_session_mount(struct fuse_session *se)
+{
+ int ret;
+
+ ret = fv_create_listen_socket(se);
+ if (ret < 0) {
+ return ret;
+ }
+
+ se->fd = -1;
+
+ fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
+ __func__);
+ int data_sock = accept(se->vu_listen_fd, NULL, NULL);
+ if (data_sock == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
+ close(se->vu_listen_fd);
+ return -1;
+ }
+ close(se->vu_listen_fd);
+ se->vu_listen_fd = -1;
+ fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
+ __func__);
+
+ /* TODO: Some cleanup/deallocation! */
+ se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1);
+ if (!se->virtio_dev) {
+ fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__);
+ close(data_sock);
+ return -1;
+ }
+
+ se->vu_socketfd = data_sock;
+ se->virtio_dev->se = se;
+ pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
+ vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch,
+ fv_remove_watch, &fv_iface);
+
+ return 0;
+}
+
+void virtio_session_close(struct fuse_session *se)
+{
+ close(se->vu_socketfd);
+
+ if (!se->virtio_dev) {
+ return;
+ }
+
+ free(se->virtio_dev->qi);
+ pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock);
+ free(se->virtio_dev);
+ se->virtio_dev = NULL;
+}
diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h
new file mode 100644
index 0000000000..111684032c
--- /dev/null
+++ b/tools/virtiofsd/fuse_virtio.h
@@ -0,0 +1,33 @@
+/*
+ * virtio-fs glue for FUSE
+ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * Implements the glue between libfuse and libvhost-user
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#ifndef FUSE_VIRTIO_H
+#define FUSE_VIRTIO_H
+
+#include "fuse_i.h"
+
+struct fuse_session;
+
+int virtio_session_mount(struct fuse_session *se);
+void virtio_session_close(struct fuse_session *se);
+int virtio_loop(struct fuse_session *se);
+
+
+int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count);
+
+int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count,
+ struct fuse_bufvec *buf, size_t len);
+
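+/*
+ * A minimal sketch of how this glue is expected to be driven, based on
+ * the implementation in fuse_virtio.c ('se' stands for a fully set up
+ * struct fuse_session; error handling is reduced to early returns):
+ *
+ *   if (virtio_session_mount(se) < 0) {   // create/accept vhost-user socket
+ *       return -1;
+ *   }
+ *   ret = virtio_loop(se);                // dispatch vhost-user messages and
+ *                                         // hand requests to worker threads
+ *   virtio_session_close(se);             // free the VuDev glue state
+ */
+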
+#endif
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
new file mode 100644
index 0000000000..0801cf752c
--- /dev/null
+++ b/tools/virtiofsd/helper.c
@@ -0,0 +1,349 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Helper functions to create (simple) standalone programs. With the
+ * aid of these functions it should be possible to create a full FUSE
+ * file system by implementing nothing but the request handlers.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "fuse_lowlevel.h"
+#include "fuse_misc.h"
+#include "fuse_opt.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
+#include <unistd.h>
+
+#define FUSE_HELPER_OPT(t, p) \
+ { \
+ t, offsetof(struct fuse_cmdline_opts, p), 1 \
+ }
+#define FUSE_HELPER_OPT_VALUE(t, p, v) \
+ { \
+ t, offsetof(struct fuse_cmdline_opts, p), v \
+ }
+
+static const struct fuse_opt fuse_helper_opts[] = {
+ FUSE_HELPER_OPT("-h", show_help),
+ FUSE_HELPER_OPT("--help", show_help),
+ FUSE_HELPER_OPT("-V", show_version),
+ FUSE_HELPER_OPT("--version", show_version),
+ FUSE_HELPER_OPT("--print-capabilities", print_capabilities),
+ FUSE_HELPER_OPT("-d", debug),
+ FUSE_HELPER_OPT("debug", debug),
+ FUSE_HELPER_OPT("-d", foreground),
+ FUSE_HELPER_OPT("debug", foreground),
+ FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP),
+ FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP),
+ FUSE_HELPER_OPT("-f", foreground),
+ FUSE_HELPER_OPT_VALUE("--daemonize", foreground, 0),
+ FUSE_HELPER_OPT("fsname=", nodefault_subtype),
+ FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP),
+ FUSE_HELPER_OPT("subtype=", nodefault_subtype),
+ FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP),
+ FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads),
+ FUSE_HELPER_OPT("--syslog", syslog),
+ FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG),
+ FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO),
+ FUSE_HELPER_OPT_VALUE("log_level=warn", log_level, FUSE_LOG_WARNING),
+ FUSE_HELPER_OPT_VALUE("log_level=err", log_level, FUSE_LOG_ERR),
+ FUSE_OPT_END
+};
+
+struct fuse_conn_info_opts {
+ int atomic_o_trunc;
+ int no_remote_posix_lock;
+ int no_remote_flock;
+ int splice_write;
+ int splice_move;
+ int splice_read;
+ int no_splice_write;
+ int no_splice_move;
+ int no_splice_read;
+ int auto_inval_data;
+ int no_auto_inval_data;
+ int no_readdirplus;
+ int no_readdirplus_auto;
+ int async_dio;
+ int no_async_dio;
+ int writeback_cache;
+ int no_writeback_cache;
+ int async_read;
+ int sync_read;
+ unsigned max_write;
+ unsigned max_readahead;
+ unsigned max_background;
+ unsigned congestion_threshold;
+ unsigned time_gran;
+ int set_max_write;
+ int set_max_readahead;
+ int set_max_background;
+ int set_congestion_threshold;
+ int set_time_gran;
+};
+
+#define CONN_OPTION(t, p, v) \
+ { \
+ t, offsetof(struct fuse_conn_info_opts, p), v \
+ }
+static const struct fuse_opt conn_info_opt_spec[] = {
+ CONN_OPTION("max_write=%u", max_write, 0),
+ CONN_OPTION("max_write=", set_max_write, 1),
+ CONN_OPTION("max_readahead=%u", max_readahead, 0),
+ CONN_OPTION("max_readahead=", set_max_readahead, 1),
+ CONN_OPTION("max_background=%u", max_background, 0),
+ CONN_OPTION("max_background=", set_max_background, 1),
+ CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0),
+ CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1),
+ CONN_OPTION("sync_read", sync_read, 1),
+ CONN_OPTION("async_read", async_read, 1),
+ CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1),
+ CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1),
+ CONN_OPTION("no_remote_lock", no_remote_flock, 1),
+ CONN_OPTION("no_remote_flock", no_remote_flock, 1),
+ CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1),
+ CONN_OPTION("splice_write", splice_write, 1),
+ CONN_OPTION("no_splice_write", no_splice_write, 1),
+ CONN_OPTION("splice_move", splice_move, 1),
+ CONN_OPTION("no_splice_move", no_splice_move, 1),
+ CONN_OPTION("splice_read", splice_read, 1),
+ CONN_OPTION("no_splice_read", no_splice_read, 1),
+ CONN_OPTION("auto_inval_data", auto_inval_data, 1),
+ CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1),
+ CONN_OPTION("readdirplus=no", no_readdirplus, 1),
+ CONN_OPTION("readdirplus=yes", no_readdirplus, 0),
+ CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1),
+ CONN_OPTION("readdirplus=auto", no_readdirplus, 0),
+ CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0),
+ CONN_OPTION("async_dio", async_dio, 1),
+ CONN_OPTION("no_async_dio", no_async_dio, 1),
+ CONN_OPTION("writeback_cache", writeback_cache, 1),
+ CONN_OPTION("no_writeback_cache", no_writeback_cache, 1),
+ CONN_OPTION("time_gran=%u", time_gran, 0),
+ CONN_OPTION("time_gran=", set_time_gran, 1),
+ FUSE_OPT_END
+};
+
+
+void fuse_cmdline_help(void)
+{
+ printf(" -h --help print help\n"
+ " -V --version print version\n"
+ " --print-capabilities print vhost-user.json\n"
+ " -d -o debug enable debug output (implies -f)\n"
+ " --syslog log to syslog (default stderr)\n"
+ " -f foreground operation\n"
+ " --daemonize run in background\n"
+ " -o cache=<mode> cache mode. could be one of \"auto, "
+ "always, none\"\n"
+ " default: auto\n"
+ " -o flock|no_flock enable/disable flock\n"
+ " default: no_flock\n"
+ " -o log_level=<level> log level, default to \"info\"\n"
+ " level could be one of \"debug, "
+ "info, warn, err\"\n"
+ " -o max_idle_threads the maximum number of idle worker "
+ "threads\n"
+ " allowed (default: 10)\n"
+ " -o norace disable racy fallback\n"
+ " default: false\n"
+ " -o posix_lock|no_posix_lock\n"
+ " enable/disable remote posix lock\n"
+ " default: posix_lock\n"
+ " -o readdirplus|no_readdirplus\n"
+           "                               enable/disable readdirplus\n"
+ " default: readdirplus except with "
+ "cache=none\n"
+ " -o timeout=<number> I/O timeout (second)\n"
+ " default: depends on cache= option.\n"
+ " -o writeback|no_writeback enable/disable writeback cache\n"
+ " default: no_writeback\n"
+ " -o xattr|no_xattr enable/disable xattr\n"
+ " default: no_xattr\n"
+ );
+}
+
+static int fuse_helper_opt_proc(void *data, const char *arg, int key,
+ struct fuse_args *outargs)
+{
+ (void)data;
+ (void)outargs;
+
+ switch (key) {
+ case FUSE_OPT_KEY_NONOPT:
+ fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg);
+ return -1;
+
+ default:
+ /* Pass through unknown options */
+ return 1;
+ }
+}
+
+int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts)
+{
+ memset(opts, 0, sizeof(struct fuse_cmdline_opts));
+
+ opts->max_idle_threads = 10;
+ opts->foreground = 1;
+
+ if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) ==
+ -1) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
+int fuse_daemonize(int foreground)
+{
+ int ret = 0, rett;
+ if (!foreground) {
+ int nullfd;
+ int waiter[2];
+ char completed;
+
+ if (pipe(waiter)) {
+ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ /*
+         * Daemonize the current process by forking it and killing the
+         * parent. This makes the current process a child of 'init'.
+ */
+ switch (fork()) {
+ case -1:
+ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n",
+ strerror(errno));
+ return -1;
+ case 0:
+ break;
+ default:
+            _exit(read(waiter[0], &completed,
+                       sizeof(completed)) != sizeof(completed));
+ }
+
+ if (setsid() == -1) {
+ fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ ret = chdir("/");
+
+ nullfd = open("/dev/null", O_RDWR, 0);
+ if (nullfd != -1) {
+ rett = dup2(nullfd, 0);
+ if (!ret) {
+ ret = rett;
+ }
+ rett = dup2(nullfd, 1);
+ if (!ret) {
+ ret = rett;
+ }
+ rett = dup2(nullfd, 2);
+ if (!ret) {
+ ret = rett;
+ }
+ if (nullfd > 2) {
+ close(nullfd);
+ }
+ }
+
+ /* Propagate completion of daemon initialization */
+ completed = 1;
+ rett = write(waiter[1], &completed, sizeof(completed));
+ if (!ret) {
+ ret = rett;
+ }
+ close(waiter[0]);
+ close(waiter[1]);
+ } else {
+ ret = chdir("/");
+ }
+ return ret;
+}
+
+void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts,
+ struct fuse_conn_info *conn)
+{
+ if (opts->set_max_write) {
+ conn->max_write = opts->max_write;
+ }
+ if (opts->set_max_background) {
+ conn->max_background = opts->max_background;
+ }
+ if (opts->set_congestion_threshold) {
+ conn->congestion_threshold = opts->congestion_threshold;
+ }
+ if (opts->set_time_gran) {
+ conn->time_gran = opts->time_gran;
+ }
+ if (opts->set_max_readahead) {
+ conn->max_readahead = opts->max_readahead;
+ }
+
+#define LL_ENABLE(cond, cap) \
+ if (cond) \
+ conn->want |= (cap)
+#define LL_DISABLE(cond, cap) \
+ if (cond) \
+ conn->want &= ~(cap)
+
+ LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ);
+ LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ);
+
+ LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE);
+ LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE);
+
+ LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE);
+ LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE);
+
+ LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA);
+ LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA);
+
+ LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS);
+ LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO);
+
+ LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO);
+ LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO);
+
+ LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE);
+ LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE);
+
+ LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ);
+ LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ);
+
+ LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS);
+ LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS);
+}
+
+struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args)
+{
+ struct fuse_conn_info_opts *opts;
+
+ opts = calloc(1, sizeof(struct fuse_conn_info_opts));
+ if (opts == NULL) {
+ fuse_log(FUSE_LOG_ERR, "calloc failed\n");
+ return NULL;
+ }
+ if (fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) {
+ free(opts);
+ return NULL;
+ }
+ return opts;
+}
diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h
new file mode 100644
index 0000000000..0b98275ed5
--- /dev/null
+++ b/tools/virtiofsd/passthrough_helpers.h
@@ -0,0 +1,51 @@
+/*
+ * FUSE: Filesystem in Userspace
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+/*
+ * Creates files on the underlying file system in response to a FUSE_MKNOD
+ * operation
+ */
+static int mknod_wrapper(int dirfd, const char *path, const char *link,
+ int mode, dev_t rdev)
+{
+ int res;
+
+ if (S_ISREG(mode)) {
+ res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode);
+ if (res >= 0) {
+ res = close(res);
+ }
+ } else if (S_ISDIR(mode)) {
+ res = mkdirat(dirfd, path, mode);
+ } else if (S_ISLNK(mode) && link != NULL) {
+ res = symlinkat(link, dirfd, path);
+ } else if (S_ISFIFO(mode)) {
+ res = mkfifoat(dirfd, path, mode);
+ } else {
+ res = mknodat(dirfd, path, mode, rdev);
+ }
+
+ return res;
+}
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
new file mode 100644
index 0000000000..e6f2399efc
--- /dev/null
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -0,0 +1,3006 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU GPLv2.
+ * See the file COPYING.
+ */
+
+/*
+ *
+ * This file system mirrors the existing file system hierarchy of the
+ * system, starting at the root file system. This is implemented by
+ * just "passing through" all requests to the corresponding user-space
+ * libc functions. In contrast to passthrough.c and passthrough_fh.c,
+ * this implementation uses the low-level API. Unlike the libfuse
+ * passthrough_ll example it is derived from, file and directory
+ * removal is implemented here (see lo_unlink() and lo_rmdir() below).
+ *
+ * When writeback caching is enabled (-o writeback mount option), it
+ * is only possible to write to files for which the mounting user has
+ * read permissions. This is because the writeback cache requires the
+ * kernel to be able to issue read requests for all files (which the
+ * passthrough filesystem cannot satisfy if it can't read the file in
+ * the underlying filesystem).
+ *
+ * Compile with:
+ *
+ * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
+ * passthrough_ll
+ *
+ * ## Source code ##
+ * \include passthrough_ll.c
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/timer.h"
+#include "fuse_virtio.h"
+#include "fuse_log.h"
+#include "fuse_lowlevel.h"
+#include <assert.h>
+#include <cap-ng.h>
+#include <dirent.h>
+#include <errno.h>
+#include <glib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/xattr.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include "passthrough_helpers.h"
+#include "seccomp.h"
+
+/* Keep track of inode posix locks for each owner. */
+struct lo_inode_plock {
+ uint64_t lock_owner;
+ int fd; /* fd for OFD locks */
+};
+
+struct lo_map_elem {
+ union {
+ struct lo_inode *inode;
+ struct lo_dirp *dirp;
+ int fd;
+ ssize_t freelist;
+ };
+ bool in_use;
+};
+
+/* Maps FUSE fh or ino values to internal objects */
+struct lo_map {
+ struct lo_map_elem *elems;
+ size_t nelems;
+ ssize_t freelist;
+};
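+/*
+ * Free slots are chained through the 'freelist' member of lo_map_elem:
+ * map->freelist holds the index of the first free slot (or -1 if there
+ * is none) and each free element stores the index of the next one.
+ * lo_map_alloc_elem() pops the head of this list, lo_map_remove() pushes
+ * a freed slot back onto it, and lo_map_grow() chains newly allocated
+ * slots together.
+ */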
+
+struct lo_key {
+ ino_t ino;
+ dev_t dev;
+};
+
+struct lo_inode {
+ int fd;
+
+ /*
+ * Atomic reference count for this object. The nlookup field holds a
+ * reference and releases it when nlookup reaches 0.
+ */
+ gint refcount;
+
+ struct lo_key key;
+
+ /*
+ * This counter keeps the inode alive during the FUSE session.
+ * Incremented when the FUSE inode number is sent in a reply
+ * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
+ * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
+ *
+ * Note that this value is untrusted because the client can manipulate
+ * it arbitrarily using FUSE_FORGET requests.
+ *
+ * Protected by lo->mutex.
+ */
+ uint64_t nlookup;
+
+ fuse_ino_t fuse_ino;
+ pthread_mutex_t plock_mutex;
+ GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
+
+ bool is_symlink;
+};
+
+struct lo_cred {
+ uid_t euid;
+ gid_t egid;
+};
+
+enum {
+ CACHE_NONE,
+ CACHE_AUTO,
+ CACHE_ALWAYS,
+};
+
+struct lo_data {
+ pthread_mutex_t mutex;
+ int debug;
+ int norace;
+ int writeback;
+ int flock;
+ int posix_lock;
+ int xattr;
+ char *source;
+ double timeout;
+ int cache;
+ int timeout_set;
+ int readdirplus_set;
+ int readdirplus_clear;
+ struct lo_inode root;
+ GHashTable *inodes; /* protected by lo->mutex */
+ struct lo_map ino_map; /* protected by lo->mutex */
+ struct lo_map dirp_map; /* protected by lo->mutex */
+ struct lo_map fd_map; /* protected by lo->mutex */
+
+ /* An O_PATH file descriptor to /proc/self/fd/ */
+ int proc_self_fd;
+};
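+/*
+ * Inode file descriptors are opened with O_PATH (see lo_do_lookup()), so
+ * they cannot be used directly for I/O or for syscalls that reject
+ * O_PATH descriptors.  Whenever a regular descriptor is needed, the
+ * inode is re-opened through proc_self_fd, e.g.:
+ *
+ *     sprintf(procname, "%i", inode->fd);
+ *     fd = openat(lo->proc_self_fd, procname, O_RDWR);
+ *
+ * as done in lo_setattr(), lo_open(), lo_getxattr() and others.
+ */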
+
+static const struct fuse_opt lo_opts[] = {
+ { "writeback", offsetof(struct lo_data, writeback), 1 },
+ { "no_writeback", offsetof(struct lo_data, writeback), 0 },
+ { "source=%s", offsetof(struct lo_data, source), 0 },
+ { "flock", offsetof(struct lo_data, flock), 1 },
+ { "no_flock", offsetof(struct lo_data, flock), 0 },
+ { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
+ { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
+ { "xattr", offsetof(struct lo_data, xattr), 1 },
+ { "no_xattr", offsetof(struct lo_data, xattr), 0 },
+ { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
+ { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
+ { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
+ { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
+ { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
+ { "norace", offsetof(struct lo_data, norace), 1 },
+ { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
+ { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
+ FUSE_OPT_END
+};
+static bool use_syslog = false;
+static int current_log_level;
+static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
+ uint64_t n);
+
+static struct {
+ pthread_mutex_t mutex;
+ void *saved;
+} cap;
+/* Whether the current thread has loaded the cap-ng state from the saved copy */
+static __thread bool cap_loaded = 0;
+
+static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st);
+
+static int is_dot_or_dotdot(const char *name)
+{
+ return name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
+}
+
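+/*
+ * Path components received in modifying requests (create, mknod, rename,
+ * link, unlink, rmdir, ...) are validated with the helper below: "foo"
+ * is accepted while ".", ".." and anything containing '/' is rejected,
+ * so a request cannot use the name alone to escape its parent directory.
+ */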
+/* Is `path` a single path component that is not "." or ".."? */
+static int is_safe_path_component(const char *path)
+{
+ if (strchr(path, '/')) {
+ return 0;
+ }
+
+ return !is_dot_or_dotdot(path);
+}
+
+static struct lo_data *lo_data(fuse_req_t req)
+{
+ return (struct lo_data *)fuse_req_userdata(req);
+}
+
+/*
+ * Load capng's state from our saved state if the current thread
+ * hasn't previously loaded it.
+ * Returns 0 on success.
+ */
+static int load_capng(void)
+{
+ if (!cap_loaded) {
+ pthread_mutex_lock(&cap.mutex);
+ capng_restore_state(&cap.saved);
+ /*
+ * capng_restore_state() frees the saved copy,
+ * so make another.
+ */
+ cap.saved = capng_save_state();
+ if (!cap.saved) {
+ fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
+ return -EINVAL;
+ }
+ pthread_mutex_unlock(&cap.mutex);
+
+ /*
+ * We want to use the loaded state for our pid,
+ * not the original
+ */
+ capng_setpid(syscall(SYS_gettid));
+ cap_loaded = true;
+ }
+ return 0;
+}
+
+/*
+ * Helpers for dropping and regaining effective capabilities. They return
+ * 0 on success and an errno value otherwise.
+ */
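+/*
+ * The usual pattern is to bracket a privileged operation with the two
+ * helpers, as lo_write_buf() does:
+ *
+ *     drop_effective_cap("FSETID", &cap_fsetid_dropped);
+ *     ...do the write...
+ *     if (cap_fsetid_dropped)
+ *         gain_effective_cap("FSETID");
+ */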
+static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
+{
+ int cap, ret;
+
+ cap = capng_name_to_capability(cap_name);
+ if (cap < 0) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
+ cap_name, strerror(errno));
+ goto out;
+ }
+
+ if (load_capng()) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
+ goto out;
+ }
+
+ /* We don't have this capability in the effective set; nothing to drop. */
+ if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
+ goto out;
+ }
+
+ if (capng_apply(CAPNG_SELECT_CAPS)) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
+ goto out;
+ }
+
+ ret = 0;
+ if (cap_dropped) {
+ *cap_dropped = true;
+ }
+
+out:
+ return ret;
+}
+
+static int gain_effective_cap(const char *cap_name)
+{
+ int cap;
+ int ret = 0;
+
+ cap = capng_name_to_capability(cap_name);
+ if (cap < 0) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
+ cap_name, strerror(errno));
+ goto out;
+ }
+
+ if (load_capng()) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
+ goto out;
+ }
+
+ if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
+ goto out;
+ }
+
+ if (capng_apply(CAPNG_SELECT_CAPS)) {
+ ret = errno;
+ fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
+ goto out;
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static void lo_map_init(struct lo_map *map)
+{
+ map->elems = NULL;
+ map->nelems = 0;
+ map->freelist = -1;
+}
+
+static void lo_map_destroy(struct lo_map *map)
+{
+ free(map->elems);
+}
+
+static int lo_map_grow(struct lo_map *map, size_t new_nelems)
+{
+ struct lo_map_elem *new_elems;
+ size_t i;
+
+ if (new_nelems <= map->nelems) {
+ return 1;
+ }
+
+ new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
+ if (!new_elems) {
+ return 0;
+ }
+
+ for (i = map->nelems; i < new_nelems; i++) {
+ new_elems[i].freelist = i + 1;
+ new_elems[i].in_use = false;
+ }
+ new_elems[new_nelems - 1].freelist = -1;
+
+ map->elems = new_elems;
+ map->freelist = map->nelems;
+ map->nelems = new_nelems;
+ return 1;
+}
+
+static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
+{
+ struct lo_map_elem *elem;
+
+ if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
+ return NULL;
+ }
+
+ elem = &map->elems[map->freelist];
+ map->freelist = elem->freelist;
+
+ elem->in_use = true;
+
+ return elem;
+}
+
+static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
+{
+ ssize_t *prev;
+
+ if (!lo_map_grow(map, key + 1)) {
+ return NULL;
+ }
+
+ for (prev = &map->freelist; *prev != -1;
+ prev = &map->elems[*prev].freelist) {
+ if (*prev == key) {
+ struct lo_map_elem *elem = &map->elems[key];
+
+ *prev = elem->freelist;
+ elem->in_use = true;
+ return elem;
+ }
+ }
+ return NULL;
+}
+
+static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
+{
+ if (key >= map->nelems) {
+ return NULL;
+ }
+ if (!map->elems[key].in_use) {
+ return NULL;
+ }
+ return &map->elems[key];
+}
+
+static void lo_map_remove(struct lo_map *map, size_t key)
+{
+ struct lo_map_elem *elem;
+
+ if (key >= map->nelems) {
+ return;
+ }
+
+ elem = &map->elems[key];
+ if (!elem->in_use) {
+ return;
+ }
+
+ elem->in_use = false;
+
+ elem->freelist = map->freelist;
+ map->freelist = key;
+}
+
+/* Assumes lo->mutex is held */
+static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
+{
+ struct lo_map_elem *elem;
+
+ elem = lo_map_alloc_elem(&lo_data(req)->fd_map);
+ if (!elem) {
+ return -1;
+ }
+
+ elem->fd = fd;
+ return elem - lo_data(req)->fd_map.elems;
+}
+
+/* Assumes lo->mutex is held */
+static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
+{
+ struct lo_map_elem *elem;
+
+ elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
+ if (!elem) {
+ return -1;
+ }
+
+ elem->dirp = dirp;
+ return elem - lo_data(req)->dirp_map.elems;
+}
+
+/* Assumes lo->mutex is held */
+static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
+{
+ struct lo_map_elem *elem;
+
+ elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
+ if (!elem) {
+ return -1;
+ }
+
+ elem->inode = inode;
+ return elem - lo_data(req)->ino_map.elems;
+}
+
+static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
+{
+ struct lo_inode *inode = *inodep;
+
+ if (!inode) {
+ return;
+ }
+
+ *inodep = NULL;
+
+ if (g_atomic_int_dec_and_test(&inode->refcount)) {
+ close(inode->fd);
+ free(inode);
+ }
+}
+
+/* Caller must release refcount using lo_inode_put() */
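+/*
+ * Typical usage:
+ *
+ *     inode = lo_inode(req, ino);
+ *     if (!inode) {
+ *         fuse_reply_err(req, EBADF);
+ *         return;
+ *     }
+ *     ...use inode->fd...
+ *     lo_inode_put(lo, &inode);
+ */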
+static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_map_elem *elem;
+
+ pthread_mutex_lock(&lo->mutex);
+ elem = lo_map_get(&lo->ino_map, ino);
+ if (elem) {
+ g_atomic_int_inc(&elem->inode->refcount);
+ }
+ pthread_mutex_unlock(&lo->mutex);
+
+ if (!elem) {
+ return NULL;
+ }
+
+ return elem->inode;
+}
+
+/*
+ * TODO Remove this helper and force callers to hold an inode refcount until
+ * they are done with the fd. This will be done in a later patch to make
+ * review easier.
+ */
+static int lo_fd(fuse_req_t req, fuse_ino_t ino)
+{
+ struct lo_inode *inode = lo_inode(req, ino);
+ int fd;
+
+ if (!inode) {
+ return -1;
+ }
+
+ fd = inode->fd;
+ lo_inode_put(lo_data(req), &inode);
+ return fd;
+}
+
+static void lo_init(void *userdata, struct fuse_conn_info *conn)
+{
+ struct lo_data *lo = (struct lo_data *)userdata;
+
+ if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
+ conn->want |= FUSE_CAP_EXPORT_SUPPORT;
+ }
+
+ if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
+ conn->want |= FUSE_CAP_WRITEBACK_CACHE;
+ }
+ if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
+ if (lo->flock) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
+ conn->want |= FUSE_CAP_FLOCK_LOCKS;
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
+ conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
+ }
+ }
+
+ if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
+ if (lo->posix_lock) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
+ conn->want |= FUSE_CAP_POSIX_LOCKS;
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
+ conn->want &= ~FUSE_CAP_POSIX_LOCKS;
+ }
+ }
+
+ if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
+ lo->readdirplus_clear) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
+ conn->want &= ~FUSE_CAP_READDIRPLUS;
+ }
+}
+
+static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi)
+{
+ int res;
+ struct stat buf;
+ struct lo_data *lo = lo_data(req);
+
+ (void)fi;
+
+ res =
+ fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+
+ fuse_reply_attr(req, &buf, lo->timeout);
+}
+
+/*
+ * Increments parent->nlookup and caller must release refcount using
+ * lo_inode_put(&parent).
+ */
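+/*
+ * This is the "racy fallback" that the norace option disables: the
+ * inode's current path is read back through /proc/self/fd and then
+ * re-verified with fstatat() against the inode's st_dev/st_ino.  Because
+ * the file may be renamed between those two steps, the whole sequence is
+ * retried up to two times before giving up with EIO.
+ */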
+static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode,
+ char path[PATH_MAX], struct lo_inode **parent)
+{
+ char procname[64];
+ char *last;
+ struct stat stat;
+ struct lo_inode *p;
+ int retries = 2;
+ int res;
+
+retry:
+ sprintf(procname, "%i", inode->fd);
+
+ res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX);
+ if (res < 0) {
+ fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__);
+ goto fail_noretry;
+ }
+
+ if (res >= PATH_MAX) {
+ fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__);
+ goto fail_noretry;
+ }
+ path[res] = '\0';
+
+ last = strrchr(path, '/');
+ if (last == NULL) {
+ /* Shouldn't happen */
+ fuse_log(
+ FUSE_LOG_WARNING,
+ "%s: INTERNAL ERROR: bad path read from proc\n", __func__);
+ goto fail_noretry;
+ }
+ if (last == path) {
+ p = &lo->root;
+ pthread_mutex_lock(&lo->mutex);
+ p->nlookup++;
+ g_atomic_int_inc(&p->refcount);
+ pthread_mutex_unlock(&lo->mutex);
+ } else {
+ *last = '\0';
+ res = fstatat(AT_FDCWD, last == path ? "/" : path, &stat, 0);
+ if (res == -1) {
+ if (!retries) {
+ fuse_log(FUSE_LOG_WARNING,
+ "%s: failed to stat parent: %m\n", __func__);
+ }
+ goto fail;
+ }
+ p = lo_find(lo, &stat);
+ if (p == NULL) {
+ if (!retries) {
+ fuse_log(FUSE_LOG_WARNING,
+ "%s: failed to find parent\n", __func__);
+ }
+ goto fail;
+ }
+ }
+ last++;
+ res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ if (!retries) {
+ fuse_log(FUSE_LOG_WARNING,
+ "%s: failed to stat last\n", __func__);
+ }
+ goto fail_unref;
+ }
+ if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) {
+ if (!retries) {
+ fuse_log(FUSE_LOG_WARNING,
+ "%s: failed to match last\n", __func__);
+ }
+ goto fail_unref;
+ }
+ *parent = p;
+ memmove(path, last, strlen(last) + 1);
+
+ return 0;
+
+fail_unref:
+ unref_inode_lolocked(lo, p, 1);
+ lo_inode_put(lo, &p);
+fail:
+ if (retries) {
+ retries--;
+ goto retry;
+ }
+fail_noretry:
+ errno = EIO;
+ return -1;
+}
+
+static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode,
+ const struct timespec *tv)
+{
+ int res;
+ struct lo_inode *parent;
+ char path[PATH_MAX];
+
+ if (inode->is_symlink) {
+ res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH);
+ if (res == -1 && errno == EINVAL) {
+ /* Sorry, no race free way to set times on symlink. */
+ if (lo->norace) {
+ errno = EPERM;
+ } else {
+ goto fallback;
+ }
+ }
+ return res;
+ }
+ sprintf(path, "%i", inode->fd);
+
+ return utimensat(lo->proc_self_fd, path, tv, 0);
+
+fallback:
+ res = lo_parent_and_name(lo, inode, path, &parent);
+ if (res != -1) {
+ res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW);
+ unref_inode_lolocked(lo, parent, 1);
+ lo_inode_put(lo, &parent);
+ }
+
+ return res;
+}
+
+static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_map_elem *elem;
+
+ pthread_mutex_lock(&lo->mutex);
+ elem = lo_map_get(&lo->fd_map, fi->fh);
+ pthread_mutex_unlock(&lo->mutex);
+
+ if (!elem) {
+ return -1;
+ }
+
+ return elem->fd;
+}
+
+static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
+ int valid, struct fuse_file_info *fi)
+{
+ int saverr;
+ char procname[64];
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+ int ifd;
+ int res;
+ int fd;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ ifd = inode->fd;
+
+ /* If fi->fh is invalid we'll report EBADF later */
+ if (fi) {
+ fd = lo_fi_fd(req, fi);
+ }
+
+ if (valid & FUSE_SET_ATTR_MODE) {
+ if (fi) {
+ res = fchmod(fd, attr->st_mode);
+ } else {
+ sprintf(procname, "%i", ifd);
+ res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
+ }
+ if (res == -1) {
+ goto out_err;
+ }
+ }
+ if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
+ uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
+ gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
+
+ res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ goto out_err;
+ }
+ }
+ if (valid & FUSE_SET_ATTR_SIZE) {
+ int truncfd;
+
+ if (fi) {
+ truncfd = fd;
+ } else {
+ sprintf(procname, "%i", ifd);
+ truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
+ if (truncfd < 0) {
+ goto out_err;
+ }
+ }
+
+ res = ftruncate(truncfd, attr->st_size);
+ if (!fi) {
+ saverr = errno;
+ close(truncfd);
+ errno = saverr;
+ }
+ if (res == -1) {
+ goto out_err;
+ }
+ }
+ if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
+ struct timespec tv[2];
+
+ tv[0].tv_sec = 0;
+ tv[1].tv_sec = 0;
+ tv[0].tv_nsec = UTIME_OMIT;
+ tv[1].tv_nsec = UTIME_OMIT;
+
+ if (valid & FUSE_SET_ATTR_ATIME_NOW) {
+ tv[0].tv_nsec = UTIME_NOW;
+ } else if (valid & FUSE_SET_ATTR_ATIME) {
+ tv[0] = attr->st_atim;
+ }
+
+ if (valid & FUSE_SET_ATTR_MTIME_NOW) {
+ tv[1].tv_nsec = UTIME_NOW;
+ } else if (valid & FUSE_SET_ATTR_MTIME) {
+ tv[1] = attr->st_mtim;
+ }
+
+ if (fi) {
+ res = futimens(fd, tv);
+ } else {
+ res = utimensat_empty(lo, inode, tv);
+ }
+ if (res == -1) {
+ goto out_err;
+ }
+ }
+ lo_inode_put(lo, &inode);
+
+ return lo_getattr(req, ino, fi);
+
+out_err:
+ saverr = errno;
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+}
+
+static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
+{
+ struct lo_inode *p;
+ struct lo_key key = {
+ .ino = st->st_ino,
+ .dev = st->st_dev,
+ };
+
+ pthread_mutex_lock(&lo->mutex);
+ p = g_hash_table_lookup(lo->inodes, &key);
+ if (p) {
+ assert(p->nlookup > 0);
+ p->nlookup++;
+ g_atomic_int_inc(&p->refcount);
+ }
+ pthread_mutex_unlock(&lo->mutex);
+
+ return p;
+}
+
+/* value_destroy_func for posix_locks GHashTable */
+static void posix_locks_value_destroy(gpointer data)
+{
+ struct lo_inode_plock *plock = data;
+
+ /*
+ * Each lo_inode_plock entry keeps a single fd for its OFD locks, so
+ * closing that fd releases all of the owner's locks on the inode.
+ */
+ close(plock->fd);
+ free(plock);
+}
+
+/*
+ * Increments nlookup and caller must release refcount using
+ * lo_inode_put(&parent).
+ */
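+/*
+ * The lookup works on file descriptors only: the name is opened with
+ * O_PATH | O_NOFOLLOW relative to the parent's fd, fstatat() provides
+ * the st_dev/st_ino key, and an existing lo_inode is reused if that key
+ * is already in lo->inodes; otherwise a new lo_inode takes ownership of
+ * the O_PATH fd and is entered into both lo->inodes and lo->ino_map.
+ */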
+static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
+ struct fuse_entry_param *e)
+{
+ int newfd;
+ int res;
+ int saverr;
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode = NULL;
+ struct lo_inode *dir = lo_inode(req, parent);
+
+ /*
+ * name_to_handle_at() and open_by_handle_at() can reach here with fuse
+ * mount point in guest, but we don't have its inode info in the
+ * ino_map.
+ */
+ if (!dir) {
+ return ENOENT;
+ }
+
+ memset(e, 0, sizeof(*e));
+ e->attr_timeout = lo->timeout;
+ e->entry_timeout = lo->timeout;
+
+ /* Do not allow escaping root directory */
+ if (dir == &lo->root && strcmp(name, "..") == 0) {
+ name = ".";
+ }
+
+ newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
+ if (newfd == -1) {
+ goto out_err;
+ }
+
+ res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ goto out_err;
+ }
+
+ inode = lo_find(lo, &e->attr);
+ if (inode) {
+ close(newfd);
+ newfd = -1;
+ } else {
+ inode = calloc(1, sizeof(struct lo_inode));
+ if (!inode) {
+ goto out_err;
+ }
+
+ inode->is_symlink = S_ISLNK(e->attr.st_mode);
+
+ /*
+ * One for the caller and one for nlookup (released in
+ * unref_inode_lolocked())
+ */
+ g_atomic_int_set(&inode->refcount, 2);
+
+ inode->nlookup = 1;
+ inode->fd = newfd;
+ newfd = -1;
+ inode->key.ino = e->attr.st_ino;
+ inode->key.dev = e->attr.st_dev;
+ pthread_mutex_init(&inode->plock_mutex, NULL);
+ inode->posix_locks = g_hash_table_new_full(
+ g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
+
+ pthread_mutex_lock(&lo->mutex);
+ inode->fuse_ino = lo_add_inode_mapping(req, inode);
+ g_hash_table_insert(lo->inodes, &inode->key, inode);
+ pthread_mutex_unlock(&lo->mutex);
+ }
+ e->ino = inode->fuse_ino;
+ lo_inode_put(lo, &inode);
+ lo_inode_put(lo, &dir);
+
+ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
+ name, (unsigned long long)e->ino);
+
+ return 0;
+
+out_err:
+ saverr = errno;
+ if (newfd != -1) {
+ close(newfd);
+ }
+ lo_inode_put(lo, &inode);
+ lo_inode_put(lo, &dir);
+ return saverr;
+}
+
+static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+ struct fuse_entry_param e;
+ int err;
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
+ name);
+
+ /*
+ * Don't use is_safe_path_component(), allow "." and ".." for NFS export
+ * support.
+ */
+ if (strchr(name, '/')) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ err = lo_do_lookup(req, parent, name, &e);
+ if (err) {
+ fuse_reply_err(req, err);
+ } else {
+ fuse_reply_entry(req, &e);
+ }
+}
+
+/*
+ * On some archs, setres*id is limited to 2^16 but they
+ * provide setres*id32 variants that allow 2^32.
+ * Others just let setres*id do 2^32 anyway.
+ */
+#ifdef SYS_setresgid32
+#define OURSYS_setresgid SYS_setresgid32
+#else
+#define OURSYS_setresgid SYS_setresgid
+#endif
+
+#ifdef SYS_setresuid32
+#define OURSYS_setresuid SYS_setresuid32
+#else
+#define OURSYS_setresuid SYS_setresuid
+#endif
+
+/*
+ * Change to uid/gid of caller so that file is created with
+ * ownership of caller.
+ * TODO: What about selinux context?
+ */
+static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
+{
+ int res;
+
+ old->euid = geteuid();
+ old->egid = getegid();
+
+ res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
+ if (res == -1) {
+ return errno;
+ }
+
+ res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
+ if (res == -1) {
+ int errno_save = errno;
+
+ syscall(OURSYS_setresgid, -1, old->egid, -1);
+ return errno_save;
+ }
+
+ return 0;
+}
+
+/* Regain Privileges */
+static void lo_restore_cred(struct lo_cred *old)
+{
+ int res;
+
+ res = syscall(OURSYS_setresuid, -1, old->euid, -1);
+ if (res == -1) {
+ fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
+ exit(1);
+ }
+
+ res = syscall(OURSYS_setresgid, -1, old->egid, -1);
+ if (res == -1) {
+ fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
+ exit(1);
+ }
+}
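+/*
+ * Note that the raw syscall()s above and in lo_change_cred() switch the
+ * effective uid/gid of the calling thread only (the glibc wrappers would
+ * apply the change to every thread of the process), so other
+ * request-processing threads keep the daemon's credentials.
+ */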
+
+static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
+ const char *name, mode_t mode, dev_t rdev,
+ const char *link)
+{
+ int res;
+ int saverr;
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *dir;
+ struct fuse_entry_param e;
+ struct lo_cred old = {};
+
+ if (!is_safe_path_component(name)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ dir = lo_inode(req, parent);
+ if (!dir) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ saverr = ENOMEM;
+
+ saverr = lo_change_cred(req, &old);
+ if (saverr) {
+ goto out;
+ }
+
+ res = mknod_wrapper(dir->fd, name, link, mode, rdev);
+
+ saverr = errno;
+
+ lo_restore_cred(&old);
+
+ if (res == -1) {
+ goto out;
+ }
+
+ saverr = lo_do_lookup(req, parent, name, &e);
+ if (saverr) {
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
+ name, (unsigned long long)e.ino);
+
+ fuse_reply_entry(req, &e);
+ lo_inode_put(lo, &dir);
+ return;
+
+out:
+ lo_inode_put(lo, &dir);
+ fuse_reply_err(req, saverr);
+}
+
+static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, dev_t rdev)
+{
+ lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
+}
+
+static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode)
+{
+ lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
+}
+
+static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
+ const char *name)
+{
+ lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
+}
+
+static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode,
+ int dfd, const char *name)
+{
+ int res;
+ struct lo_inode *parent;
+ char path[PATH_MAX];
+
+ if (inode->is_symlink) {
+ res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH);
+ if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
+ /* Sorry, no race free way to hard-link a symlink. */
+ if (lo->norace) {
+ errno = EPERM;
+ } else {
+ goto fallback;
+ }
+ }
+ return res;
+ }
+
+ sprintf(path, "%i", inode->fd);
+
+ return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW);
+
+fallback:
+ res = lo_parent_and_name(lo, inode, path, &parent);
+ if (res != -1) {
+ res = linkat(parent->fd, path, dfd, name, 0);
+ unref_inode_lolocked(lo, parent, 1);
+ lo_inode_put(lo, &parent);
+ }
+
+ return res;
+}
+
+static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
+ const char *name)
+{
+ int res;
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *parent_inode;
+ struct lo_inode *inode;
+ struct fuse_entry_param e;
+ int saverr;
+
+ if (!is_safe_path_component(name)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ parent_inode = lo_inode(req, parent);
+ inode = lo_inode(req, ino);
+ if (!parent_inode || !inode) {
+ errno = EBADF;
+ goto out_err;
+ }
+
+ memset(&e, 0, sizeof(struct fuse_entry_param));
+ e.attr_timeout = lo->timeout;
+ e.entry_timeout = lo->timeout;
+
+ res = linkat_empty_nofollow(lo, inode, parent_inode->fd, name);
+ if (res == -1) {
+ goto out_err;
+ }
+
+ res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ goto out_err;
+ }
+
+ pthread_mutex_lock(&lo->mutex);
+ inode->nlookup++;
+ pthread_mutex_unlock(&lo->mutex);
+ e.ino = inode->fuse_ino;
+
+ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
+ name, (unsigned long long)e.ino);
+
+ fuse_reply_entry(req, &e);
+ lo_inode_put(lo, &parent_inode);
+ lo_inode_put(lo, &inode);
+ return;
+
+out_err:
+ saverr = errno;
+ lo_inode_put(lo, &parent_inode);
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+}
+
+/* Increments nlookup and caller must release refcount using lo_inode_put() */
+static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
+ const char *name)
+{
+ int res;
+ struct stat attr;
+
+ res = fstatat(lo_fd(req, parent), name, &attr,
+ AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ return NULL;
+ }
+
+ return lo_find(lo_data(req), &attr);
+}
+
+static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+ int res;
+ struct lo_inode *inode;
+ struct lo_data *lo = lo_data(req);
+
+ if (!is_safe_path_component(name)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ inode = lookup_name(req, parent, name);
+ if (!inode) {
+ fuse_reply_err(req, EIO);
+ return;
+ }
+
+ res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
+
+ fuse_reply_err(req, res == -1 ? errno : 0);
+ unref_inode_lolocked(lo, inode, 1);
+ lo_inode_put(lo, &inode);
+}
+
+static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
+ fuse_ino_t newparent, const char *newname,
+ unsigned int flags)
+{
+ int res;
+ struct lo_inode *parent_inode;
+ struct lo_inode *newparent_inode;
+ struct lo_inode *oldinode = NULL;
+ struct lo_inode *newinode = NULL;
+ struct lo_data *lo = lo_data(req);
+
+ if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ parent_inode = lo_inode(req, parent);
+ newparent_inode = lo_inode(req, newparent);
+ if (!parent_inode || !newparent_inode) {
+ fuse_reply_err(req, EBADF);
+ goto out;
+ }
+
+ oldinode = lookup_name(req, parent, name);
+ newinode = lookup_name(req, newparent, newname);
+
+ if (!oldinode) {
+ fuse_reply_err(req, EIO);
+ goto out;
+ }
+
+ if (flags) {
+#ifndef SYS_renameat2
+ fuse_reply_err(req, EINVAL);
+#else
+ res = syscall(SYS_renameat2, parent_inode->fd, name,
+ newparent_inode->fd, newname, flags);
+ if (res == -1 && errno == ENOSYS) {
+ fuse_reply_err(req, EINVAL);
+ } else {
+ fuse_reply_err(req, res == -1 ? errno : 0);
+ }
+#endif
+ goto out;
+ }
+
+ res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
+
+ fuse_reply_err(req, res == -1 ? errno : 0);
+out:
+ unref_inode_lolocked(lo, oldinode, 1);
+ unref_inode_lolocked(lo, newinode, 1);
+ lo_inode_put(lo, &oldinode);
+ lo_inode_put(lo, &newinode);
+ lo_inode_put(lo, &parent_inode);
+ lo_inode_put(lo, &newparent_inode);
+}
+
+static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+ int res;
+ struct lo_inode *inode;
+ struct lo_data *lo = lo_data(req);
+
+ if (!is_safe_path_component(name)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ inode = lookup_name(req, parent, name);
+ if (!inode) {
+ fuse_reply_err(req, EIO);
+ return;
+ }
+
+ res = unlinkat(lo_fd(req, parent), name, 0);
+
+ fuse_reply_err(req, res == -1 ? errno : 0);
+ unref_inode_lolocked(lo, inode, 1);
+ lo_inode_put(lo, &inode);
+}
+
+/* To be called with lo->mutex held */
+static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
+{
+ if (!inode) {
+ return;
+ }
+
+ assert(inode->nlookup >= n);
+ inode->nlookup -= n;
+ if (!inode->nlookup) {
+ lo_map_remove(&lo->ino_map, inode->fuse_ino);
+ g_hash_table_remove(lo->inodes, &inode->key);
+ if (g_hash_table_size(inode->posix_locks)) {
+ fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
+ }
+ g_hash_table_destroy(inode->posix_locks);
+ pthread_mutex_destroy(&inode->plock_mutex);
+
+ /* Drop our refcount from lo_do_lookup() */
+ lo_inode_put(lo, &inode);
+ }
+}
+
+static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
+ uint64_t n)
+{
+ if (!inode) {
+ return;
+ }
+
+ pthread_mutex_lock(&lo->mutex);
+ unref_inode(lo, inode, n);
+ pthread_mutex_unlock(&lo->mutex);
+}
+
+static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ return;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
+ (unsigned long long)ino, (unsigned long long)inode->nlookup,
+ (unsigned long long)nlookup);
+
+ unref_inode_lolocked(lo, inode, nlookup);
+ lo_inode_put(lo, &inode);
+}
+
+static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
+{
+ lo_forget_one(req, ino, nlookup);
+ fuse_reply_none(req);
+}
+
+static void lo_forget_multi(fuse_req_t req, size_t count,
+ struct fuse_forget_data *forgets)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
+ }
+ fuse_reply_none(req);
+}
+
+static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
+{
+ char buf[PATH_MAX + 1];
+ int res;
+
+ res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
+ if (res == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+
+ if (res == sizeof(buf)) {
+ return (void)fuse_reply_err(req, ENAMETOOLONG);
+ }
+
+ buf[res] = '\0';
+
+ fuse_reply_readlink(req, buf);
+}
+
+struct lo_dirp {
+ gint refcount;
+ DIR *dp;
+ struct dirent *entry;
+ off_t offset;
+};
+
+static void lo_dirp_put(struct lo_dirp **dp)
+{
+ struct lo_dirp *d = *dp;
+
+ if (!d) {
+ return;
+ }
+ *dp = NULL;
+
+ if (g_atomic_int_dec_and_test(&d->refcount)) {
+ closedir(d->dp);
+ free(d);
+ }
+}
+
+/* Call lo_dirp_put() on the return value when no longer needed */
+static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_map_elem *elem;
+
+ pthread_mutex_lock(&lo->mutex);
+ elem = lo_map_get(&lo->dirp_map, fi->fh);
+ if (elem) {
+ g_atomic_int_inc(&elem->dirp->refcount);
+ }
+ pthread_mutex_unlock(&lo->mutex);
+ if (!elem) {
+ return NULL;
+ }
+
+ return elem->dirp;
+}
+
+static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi)
+{
+ int error = ENOMEM;
+ struct lo_data *lo = lo_data(req);
+ struct lo_dirp *d;
+ int fd;
+ ssize_t fh;
+
+ d = calloc(1, sizeof(struct lo_dirp));
+ if (d == NULL) {
+ goto out_err;
+ }
+
+ fd = openat(lo_fd(req, ino), ".", O_RDONLY);
+ if (fd == -1) {
+ goto out_errno;
+ }
+
+ d->dp = fdopendir(fd);
+ if (d->dp == NULL) {
+ goto out_errno;
+ }
+
+ d->offset = 0;
+ d->entry = NULL;
+
+ g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
+ pthread_mutex_lock(&lo->mutex);
+ fh = lo_add_dirp_mapping(req, d);
+ pthread_mutex_unlock(&lo->mutex);
+ if (fh == -1) {
+ goto out_err;
+ }
+
+ fi->fh = fh;
+ if (lo->cache == CACHE_ALWAYS) {
+ fi->cache_readdir = 1;
+ }
+ fuse_reply_open(req, fi);
+ return;
+
+out_errno:
+ error = errno;
+out_err:
+ if (d) {
+ if (d->dp) {
+ closedir(d->dp);
+ }
+ if (fd != -1) {
+ close(fd);
+ }
+ free(d);
+ }
+ fuse_reply_err(req, error);
+}
+
+static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
+ off_t offset, struct fuse_file_info *fi, int plus)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_dirp *d = NULL;
+ struct lo_inode *dinode;
+ char *buf = NULL;
+ char *p;
+ size_t rem = size;
+ int err = EBADF;
+
+ dinode = lo_inode(req, ino);
+ if (!dinode) {
+ goto error;
+ }
+
+ d = lo_dirp(req, fi);
+ if (!d) {
+ goto error;
+ }
+
+ err = ENOMEM;
+ buf = calloc(1, size);
+ if (!buf) {
+ goto error;
+ }
+ p = buf;
+
+ if (offset != d->offset) {
+ seekdir(d->dp, offset);
+ d->entry = NULL;
+ d->offset = offset;
+ }
+ while (1) {
+ size_t entsize;
+ off_t nextoff;
+ const char *name;
+
+ if (!d->entry) {
+ errno = 0;
+ d->entry = readdir(d->dp);
+ if (!d->entry) {
+ if (errno) { /* Error */
+ err = errno;
+ goto error;
+ } else { /* End of stream */
+ break;
+ }
+ }
+ }
+ nextoff = d->entry->d_off;
+ name = d->entry->d_name;
+
+ fuse_ino_t entry_ino = 0;
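+ /*
+ * d_type shifted left by 12 gives the matching S_IFMT bits of
+ * st_mode (e.g. DT_DIR << 12 == S_IFDIR), which is enough for the
+ * non-plus readdir reply that only reports entry type and inode.
+ */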
+ struct fuse_entry_param e = (struct fuse_entry_param){
+ .attr.st_ino = d->entry->d_ino,
+ .attr.st_mode = d->entry->d_type << 12,
+ };
+
+ /* Hide root's parent directory */
+ if (dinode == &lo->root && strcmp(name, "..") == 0) {
+ e.attr.st_ino = lo->root.key.ino;
+ e.attr.st_mode = DT_DIR << 12;
+ }
+
+ if (plus) {
+ if (!is_dot_or_dotdot(name)) {
+ err = lo_do_lookup(req, ino, name, &e);
+ if (err) {
+ goto error;
+ }
+ entry_ino = e.ino;
+ }
+
+ entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
+ } else {
+ entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
+ }
+ if (entsize > rem) {
+ if (entry_ino != 0) {
+ lo_forget_one(req, entry_ino, 1);
+ }
+ break;
+ }
+
+ p += entsize;
+ rem -= entsize;
+
+ d->entry = NULL;
+ d->offset = nextoff;
+ }
+
+ err = 0;
+error:
+ lo_dirp_put(&d);
+ lo_inode_put(lo, &dinode);
+
+ /*
+ * If there's an error, we can only signal it if we haven't stored
+ * any entries yet - otherwise we'd end up with wrong lookup
+ * counts for the entries that are already in the buffer. So we
+ * return what we've collected until that point.
+ */
+ if (err && rem == size) {
+ fuse_reply_err(req, err);
+ } else {
+ fuse_reply_buf(req, buf, size - rem);
+ }
+ free(buf);
+}
+
+static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ lo_do_readdir(req, ino, size, offset, fi, 0);
+}
+
+static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ lo_do_readdir(req, ino, size, offset, fi, 1);
+}
+
+static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_map_elem *elem;
+ struct lo_dirp *d;
+
+ (void)ino;
+
+ pthread_mutex_lock(&lo->mutex);
+ elem = lo_map_get(&lo->dirp_map, fi->fh);
+ if (!elem) {
+ pthread_mutex_unlock(&lo->mutex);
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ d = elem->dirp;
+ lo_map_remove(&lo->dirp_map, fi->fh);
+ pthread_mutex_unlock(&lo->mutex);
+
+ lo_dirp_put(&d); /* paired with lo_opendir() */
+
+ fuse_reply_err(req, 0);
+}
+
+static void update_open_flags(int writeback, struct fuse_file_info *fi)
+{
+ /*
+ * With writeback cache, kernel may send read requests even
+ * when userspace opened write-only
+ */
+ if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
+ fi->flags &= ~O_ACCMODE;
+ fi->flags |= O_RDWR;
+ }
+
+ /*
+ * With writeback cache, O_APPEND is handled by the kernel.
+ * This breaks atomicity (since the file may change in the
+ * underlying filesystem, so that the kernel's idea of the
+ * end of the file isn't accurate anymore). This implementation
+ * just accepts that; a more rigorous filesystem may want
+ * to return an error here.
+ */
+ if (writeback && (fi->flags & O_APPEND)) {
+ fi->flags &= ~O_APPEND;
+ }
+
+ /*
+ * O_DIRECT in guest should not necessarily mean bypassing page
+ * cache on host as well. If somebody needs that behavior, it
+ * probably should be a configuration knob in daemon.
+ */
+ fi->flags &= ~O_DIRECT;
+}
+
+static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, struct fuse_file_info *fi)
+{
+ int fd;
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *parent_inode;
+ struct fuse_entry_param e;
+ int err;
+ struct lo_cred old = {};
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
+ name);
+
+ if (!is_safe_path_component(name)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ parent_inode = lo_inode(req, parent);
+ if (!parent_inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ err = lo_change_cred(req, &old);
+ if (err) {
+ goto out;
+ }
+
+ update_open_flags(lo->writeback, fi);
+
+ fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
+ mode);
+ err = fd == -1 ? errno : 0;
+ lo_restore_cred(&old);
+
+ if (!err) {
+ ssize_t fh;
+
+ pthread_mutex_lock(&lo->mutex);
+ fh = lo_add_fd_mapping(req, fd);
+ pthread_mutex_unlock(&lo->mutex);
+ if (fh == -1) {
+ close(fd);
+ err = ENOMEM;
+ goto out;
+ }
+
+ fi->fh = fh;
+ err = lo_do_lookup(req, parent, name, &e);
+ }
+ if (lo->cache == CACHE_NONE) {
+ fi->direct_io = 1;
+ } else if (lo->cache == CACHE_ALWAYS) {
+ fi->keep_cache = 1;
+ }
+
+out:
+ lo_inode_put(lo, &parent_inode);
+
+ if (err) {
+ fuse_reply_err(req, err);
+ } else {
+ fuse_reply_create(req, &e, fi);
+ }
+}
+
+/* Should be called with inode->plock_mutex held */
+static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
+ struct lo_inode *inode,
+ uint64_t lock_owner,
+ pid_t pid, int *err)
+{
+ struct lo_inode_plock *plock;
+ char procname[64];
+ int fd;
+
+ plock =
+ g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
+
+ if (plock) {
+ return plock;
+ }
+
+ plock = malloc(sizeof(struct lo_inode_plock));
+ if (!plock) {
+ *err = ENOMEM;
+ return NULL;
+ }
+
+ /* Open another instance of file which can be used for ofd locks. */
+ sprintf(procname, "%i", inode->fd);
+
+ /* TODO: What if file is not writable? */
+ fd = openat(lo->proc_self_fd, procname, O_RDWR);
+ if (fd == -1) {
+ *err = errno;
+ free(plock);
+ return NULL;
+ }
+
+ plock->lock_owner = lock_owner;
+ plock->fd = fd;
+ g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
+ plock);
+ return plock;
+}
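+/*
+ * Locks are placed with open file description (OFD) locks,
+ * F_OFD_GETLK/F_OFD_SETLK below, on the private fd created per
+ * (inode, lock_owner) pair by the helper above.  OFD locks belong to the
+ * open file description rather than to the process, so each guest lock
+ * owner is tracked independently even though every lock is taken by the
+ * same daemon, and closing the fd (posix_locks_value_destroy()) releases
+ * that owner's locks.
+ */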
+
+static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct flock *lock)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+ struct lo_inode_plock *plock;
+ int ret, saverr = 0;
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_getlk(ino=%" PRIu64 ", flags=%d)"
+ " owner=0x%lx, l_type=%d l_start=0x%lx"
+ " l_len=0x%lx\n",
+ ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
+ lock->l_len);
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ pthread_mutex_lock(&inode->plock_mutex);
+ plock =
+ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
+ if (!plock) {
+ saverr = ret;
+ goto out;
+ }
+
+ ret = fcntl(plock->fd, F_OFD_GETLK, lock);
+ if (ret == -1) {
+ saverr = errno;
+ }
+
+out:
+ pthread_mutex_unlock(&inode->plock_mutex);
+ lo_inode_put(lo, &inode);
+
+ if (saverr) {
+ fuse_reply_err(req, saverr);
+ } else {
+ fuse_reply_lock(req, lock);
+ }
+}
+
+static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct flock *lock, int sleep)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+ struct lo_inode_plock *plock;
+ int ret, saverr = 0;
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_setlk(ino=%" PRIu64 ", flags=%d)"
+ " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
+ " l_start=0x%lx l_len=0x%lx\n",
+ ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
+ lock->l_whence, lock->l_start, lock->l_len);
+
+ if (sleep) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ pthread_mutex_lock(&inode->plock_mutex);
+ plock =
+ lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
+
+ if (!plock) {
+ saverr = ret;
+ goto out;
+ }
+
+ /* TODO: Is it alright to modify flock? */
+ lock->l_pid = 0;
+ ret = fcntl(plock->fd, F_OFD_SETLK, lock);
+ if (ret == -1) {
+ saverr = errno;
+ }
+
+out:
+ pthread_mutex_unlock(&inode->plock_mutex);
+ lo_inode_put(lo, &inode);
+
+ fuse_reply_err(req, saverr);
+}
+
+static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
+ struct fuse_file_info *fi)
+{
+ int res;
+ struct lo_dirp *d;
+ int fd;
+
+ (void)ino;
+
+ d = lo_dirp(req, fi);
+ if (!d) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ fd = dirfd(d->dp);
+ if (datasync) {
+ res = fdatasync(fd);
+ } else {
+ res = fsync(fd);
+ }
+
+ lo_dirp_put(&d);
+
+ fuse_reply_err(req, res == -1 ? errno : 0);
+}
+
+static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+ int fd;
+ ssize_t fh;
+ char buf[64];
+ struct lo_data *lo = lo_data(req);
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
+ fi->flags);
+
+ update_open_flags(lo->writeback, fi);
+
+ sprintf(buf, "%i", lo_fd(req, ino));
+ fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
+ if (fd == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+
+ pthread_mutex_lock(&lo->mutex);
+ fh = lo_add_fd_mapping(req, fd);
+ pthread_mutex_unlock(&lo->mutex);
+ if (fh == -1) {
+ close(fd);
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+
+ fi->fh = fh;
+ if (lo->cache == CACHE_NONE) {
+ fi->direct_io = 1;
+ } else if (lo->cache == CACHE_ALWAYS) {
+ fi->keep_cache = 1;
+ }
+ fuse_reply_open(req, fi);
+}
+
+static void lo_release(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_map_elem *elem;
+ int fd = -1;
+
+ (void)ino;
+
+ pthread_mutex_lock(&lo->mutex);
+ elem = lo_map_get(&lo->fd_map, fi->fh);
+ if (elem) {
+ fd = elem->fd;
+ elem = NULL;
+ lo_map_remove(&lo->fd_map, fi->fh);
+ }
+ pthread_mutex_unlock(&lo->mutex);
+
+ close(fd);
+ fuse_reply_err(req, 0);
+}
+
+static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+ int res;
+ struct lo_inode *inode;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ /* An fd is going away. Cleanup associated posix locks */
+ pthread_mutex_lock(&inode->plock_mutex);
+ g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
+ pthread_mutex_unlock(&inode->plock_mutex);
+
+ res = close(dup(lo_fi_fd(req, fi)));
+ lo_inode_put(lo_data(req), &inode);
+ fuse_reply_err(req, res == -1 ? errno : 0);
+}
+
+static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
+ struct fuse_file_info *fi)
+{
+ int res;
+ int fd;
+ char *buf;
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
+ (void *)fi);
+
+ if (!fi) {
+ struct lo_data *lo = lo_data(req);
+
+ res = asprintf(&buf, "%i", lo_fd(req, ino));
+ if (res == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+
+ fd = openat(lo->proc_self_fd, buf, O_RDWR);
+ free(buf);
+ if (fd == -1) {
+ return (void)fuse_reply_err(req, errno);
+ }
+ } else {
+ fd = lo_fi_fd(req, fi);
+ }
+
+ if (datasync) {
+ res = fdatasync(fd);
+ } else {
+ res = fsync(fd);
+ }
+ if (!fi) {
+ close(fd);
+ }
+ fuse_reply_err(req, res == -1 ? errno : 0);
+}
+
+static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
+ struct fuse_file_info *fi)
+{
+ struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_read(ino=%" PRIu64 ", size=%zd, "
+ "off=%lu)\n",
+ ino, size, (unsigned long)offset);
+
+ buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+ buf.buf[0].fd = lo_fi_fd(req, fi);
+ buf.buf[0].pos = offset;
+
+ fuse_reply_data(req, &buf);
+}
+
+static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_bufvec *in_buf, off_t off,
+ struct fuse_file_info *fi)
+{
+ (void)ino;
+ ssize_t res;
+ struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
+ bool cap_fsetid_dropped = false;
+
+ out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+ out_buf.buf[0].fd = lo_fi_fd(req, fi);
+ out_buf.buf[0].pos = off;
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
+ out_buf.buf[0].size, (unsigned long)off);
+
+ /*
+ * If kill_priv is set, drop CAP_FSETID which should lead to kernel
+ * clearing setuid/setgid on file.
+ */
+ if (fi->kill_priv) {
+ res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
+ if (res != 0) {
+ fuse_reply_err(req, res);
+ return;
+ }
+ }
+
+ res = fuse_buf_copy(&out_buf, in_buf);
+ if (res < 0) {
+ fuse_reply_err(req, -res);
+ } else {
+ fuse_reply_write(req, (size_t)res);
+ }
+
+ if (cap_fsetid_dropped) {
+ res = gain_effective_cap("FSETID");
+ if (res) {
+ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
+ }
+ }
+}
+
+static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
+{
+ int res;
+ struct statvfs stbuf;
+
+ res = fstatvfs(lo_fd(req, ino), &stbuf);
+ if (res == -1) {
+ fuse_reply_err(req, errno);
+ } else {
+ fuse_reply_statfs(req, &stbuf);
+ }
+}
+
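+/*
+ * fallocate() modes such as FALLOC_FL_PUNCH_HOLE are only usable when the
+ * host provides fallocate() itself; the posix_fallocate() fallback can
+ * only do plain preallocation, so any non-zero mode is rejected with
+ * EOPNOTSUPP, and hosts with neither interface always get EOPNOTSUPP.
+ */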
+static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
+ off_t length, struct fuse_file_info *fi)
+{
+ int err = EOPNOTSUPP;
+ (void)ino;
+
+#ifdef CONFIG_FALLOCATE
+ err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
+ if (err < 0) {
+ err = errno;
+ }
+
+#elif defined(CONFIG_POSIX_FALLOCATE)
+ if (mode) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
+ err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
+#endif
+
+ fuse_reply_err(req, err);
+}
+
+static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ int op)
+{
+ int res;
+ (void)ino;
+
+ res = flock(lo_fi_fd(req, fi), op);
+
+ fuse_reply_err(req, res == -1 ? errno : 0);
+}
+
+static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
+ size_t size)
+{
+ struct lo_data *lo = lo_data(req);
+ char *value = NULL;
+ char procname[64];
+ struct lo_inode *inode;
+ ssize_t ret;
+ int saverr;
+ int fd = -1;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ saverr = ENOSYS;
+ if (!lo_data(req)->xattr) {
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
+ ino, name, size);
+
+ if (inode->is_symlink) {
+ /* Sorry, no race free way to getxattr on symlink. */
+ saverr = EPERM;
+ goto out;
+ }
+
+ sprintf(procname, "%i", inode->fd);
+ fd = openat(lo->proc_self_fd, procname, O_RDONLY);
+ if (fd < 0) {
+ goto out_err;
+ }
+
+ if (size) {
+ value = malloc(size);
+ if (!value) {
+ goto out_err;
+ }
+
+ ret = fgetxattr(fd, name, value, size);
+ if (ret == -1) {
+ goto out_err;
+ }
+ saverr = 0;
+ if (ret == 0) {
+ goto out;
+ }
+
+ fuse_reply_buf(req, value, ret);
+ } else {
+ ret = fgetxattr(fd, name, NULL, 0);
+ if (ret == -1) {
+ goto out_err;
+ }
+
+ fuse_reply_xattr(req, ret);
+ }
+out_free:
+ free(value);
+
+ if (fd >= 0) {
+ close(fd);
+ }
+
+ lo_inode_put(lo, &inode);
+ return;
+
+out_err:
+ saverr = errno;
+out:
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+ goto out_free;
+}
+
+static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
+{
+ struct lo_data *lo = lo_data(req);
+ char *value = NULL;
+ char procname[64];
+ struct lo_inode *inode;
+ ssize_t ret;
+ int saverr;
+ int fd = -1;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ saverr = ENOSYS;
+ if (!lo_data(req)->xattr) {
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
+ size);
+
+ if (inode->is_symlink) {
+ /* Sorry, no race free way to listxattr on symlink. */
+ saverr = EPERM;
+ goto out;
+ }
+
+ sprintf(procname, "%i", inode->fd);
+ fd = openat(lo->proc_self_fd, procname, O_RDONLY);
+ if (fd < 0) {
+ goto out_err;
+ }
+
+ if (size) {
+ value = malloc(size);
+ if (!value) {
+ goto out_err;
+ }
+
+ ret = flistxattr(fd, value, size);
+ if (ret == -1) {
+ goto out_err;
+ }
+ saverr = 0;
+ if (ret == 0) {
+ goto out;
+ }
+
+ fuse_reply_buf(req, value, ret);
+ } else {
+ ret = flistxattr(fd, NULL, 0);
+ if (ret == -1) {
+ goto out_err;
+ }
+
+ fuse_reply_xattr(req, ret);
+ }
+out_free:
+ free(value);
+
+ if (fd >= 0) {
+ close(fd);
+ }
+
+ lo_inode_put(lo, &inode);
+ return;
+
+out_err:
+ saverr = errno;
+out:
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+ goto out_free;
+}
+
+static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
+ const char *value, size_t size, int flags)
+{
+ char procname[64];
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+ ssize_t ret;
+ int saverr;
+ int fd = -1;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ saverr = ENOSYS;
+ if (!lo_data(req)->xattr) {
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
+ ", name=%s value=%s size=%zd)\n", ino, name, value, size);
+
+ if (inode->is_symlink) {
+ /* Sorry, no race free way to setxattr on symlink. */
+ saverr = EPERM;
+ goto out;
+ }
+
+ sprintf(procname, "%i", inode->fd);
+ fd = openat(lo->proc_self_fd, procname, O_RDWR);
+ if (fd < 0) {
+ saverr = errno;
+ goto out;
+ }
+
+ ret = fsetxattr(fd, name, value, size, flags);
+ saverr = ret == -1 ? errno : 0;
+
+out:
+ if (fd >= 0) {
+ close(fd);
+ }
+
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+}
+
+static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name)
+{
+ char procname[64];
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+ ssize_t ret;
+ int saverr;
+ int fd = -1;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ saverr = ENOSYS;
+ if (!lo_data(req)->xattr) {
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
+ name);
+
+ if (inode->is_symlink) {
+ /* Sorry, no race free way to removexattr on symlink. */
+ saverr = EPERM;
+ goto out;
+ }
+
+ sprintf(procname, "%i", inode->fd);
+ fd = openat(lo->proc_self_fd, procname, O_RDWR);
+ if (fd < 0) {
+ saverr = errno;
+ goto out;
+ }
+
+ ret = fremovexattr(fd, name);
+ saverr = ret == -1 ? errno : 0;
+
+out:
+ if (fd >= 0) {
+ close(fd);
+ }
+
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+}
+
+#ifdef HAVE_COPY_FILE_RANGE
+static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
+ struct fuse_file_info *fi_in, fuse_ino_t ino_out,
+ off_t off_out, struct fuse_file_info *fi_out,
+ size_t len, int flags)
+{
+ int in_fd, out_fd;
+ ssize_t res;
+
+ in_fd = lo_fi_fd(req, fi_in);
+ out_fd = lo_fi_fd(req, fi_out);
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
+ "off=%lu, ino=%" PRIu64 "/fd=%d, "
+ "off=%lu, size=%zd, flags=0x%x)\n",
+ ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags);
+
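+ /* Explicit offsets are passed, so the fds' own file offsets are not updated */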
+ res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
+ if (res < 0) {
+ fuse_reply_err(req, errno);
+ } else {
+ fuse_reply_write(req, res);
+ }
+}
+#endif
+
+static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
+ struct fuse_file_info *fi)
+{
+ off_t res;
+
+ (void)ino;
+ res = lseek(lo_fi_fd(req, fi), off, whence);
+ if (res != -1) {
+ fuse_reply_lseek(req, res);
+ } else {
+ fuse_reply_err(req, errno);
+ }
+}
+
+static void lo_destroy(void *userdata)
+{
+ struct lo_data *lo = (struct lo_data *)userdata;
+
+ pthread_mutex_lock(&lo->mutex);
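+ /*
+ * unref_inode() removes entries from lo->inodes, which invalidates the
+ * iterator, so restart the iteration after each removal.
+ */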
+ while (true) {
+ GHashTableIter iter;
+ gpointer key, value;
+
+ g_hash_table_iter_init(&iter, lo->inodes);
+ if (!g_hash_table_iter_next(&iter, &key, &value)) {
+ break;
+ }
+
+ struct lo_inode *inode = value;
+ unref_inode(lo, inode, inode->nlookup);
+ }
+ pthread_mutex_unlock(&lo->mutex);
+}
+
+static struct fuse_lowlevel_ops lo_oper = {
+ .init = lo_init,
+ .lookup = lo_lookup,
+ .mkdir = lo_mkdir,
+ .mknod = lo_mknod,
+ .symlink = lo_symlink,
+ .link = lo_link,
+ .unlink = lo_unlink,
+ .rmdir = lo_rmdir,
+ .rename = lo_rename,
+ .forget = lo_forget,
+ .forget_multi = lo_forget_multi,
+ .getattr = lo_getattr,
+ .setattr = lo_setattr,
+ .readlink = lo_readlink,
+ .opendir = lo_opendir,
+ .readdir = lo_readdir,
+ .readdirplus = lo_readdirplus,
+ .releasedir = lo_releasedir,
+ .fsyncdir = lo_fsyncdir,
+ .create = lo_create,
+ .getlk = lo_getlk,
+ .setlk = lo_setlk,
+ .open = lo_open,
+ .release = lo_release,
+ .flush = lo_flush,
+ .fsync = lo_fsync,
+ .read = lo_read,
+ .write_buf = lo_write_buf,
+ .statfs = lo_statfs,
+ .fallocate = lo_fallocate,
+ .flock = lo_flock,
+ .getxattr = lo_getxattr,
+ .listxattr = lo_listxattr,
+ .setxattr = lo_setxattr,
+ .removexattr = lo_removexattr,
+#ifdef HAVE_COPY_FILE_RANGE
+ .copy_file_range = lo_copy_file_range,
+#endif
+ .lseek = lo_lseek,
+ .destroy = lo_destroy,
+};
+
+/* Print vhost-user.json backend program capabilities */
+static void print_capabilities(void)
+{
+ printf("{\n");
+ printf(" \"type\": \"fs\"\n");
+ printf("}\n");
+}
+
+/*
+ * Move to new mount, net, and pid namespaces to isolate this process.
+ */
+static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
+{
+ pid_t child;
+
+ /*
+ * Create a new pid namespace for *child* processes. We'll have to
+ * fork in order to enter the new pid namespace. A new mount namespace
+ * is also needed so that we can remount /proc for the new pid
+ * namespace.
+ *
+ * Our UNIX domain sockets have been created. Now we can move to
+ * an empty network namespace to prevent TCP/IP and other network
+ * activity in case this process is compromised.
+ */
+ if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
+ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
+ exit(1);
+ }
+
+ child = fork();
+ if (child < 0) {
+ fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
+ exit(1);
+ }
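+ /*
+ * The parent simply mirrors the child's exit status; the child continues
+ * as the actual daemon inside the new pid namespace.
+ */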
+ if (child > 0) {
+ pid_t waited;
+ int wstatus;
+
+ /* The parent waits for the child */
+ do {
+ waited = waitpid(child, &wstatus, 0);
+ } while (waited < 0 && errno == EINTR && !se->exited);
+
+ /* We were terminated by a signal, see fuse_signals.c */
+ if (se->exited) {
+ exit(0);
+ }
+
+ if (WIFEXITED(wstatus)) {
+ exit(WEXITSTATUS(wstatus));
+ }
+
+ exit(1);
+ }
+
+ /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
+ prctl(PR_SET_PDEATHSIG, SIGTERM);
+
+ /*
+ * If the mounts have shared propagation then we want to opt out so our
+ * mount changes don't affect the parent mount namespace.
+ */
+ if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
+ exit(1);
+ }
+
+ /* The child must remount /proc to use the new pid namespace */
+ if (mount("proc", "/proc", "proc",
+ MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
+ exit(1);
+ }
+
+ /* Now we can get our /proc/self/fd directory file descriptor */
+ lo->proc_self_fd = open("/proc/self/fd", O_PATH);
+ if (lo->proc_self_fd == -1) {
+ fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
+ exit(1);
+ }
+}
+
+/*
+ * Capture the capability state; we'll need to restore this for individual
+ * threads later, see load_capng().
+ */
+static void setup_capng(void)
+{
+ /* Note: this accesses /proc, so it must happen before the sandbox is set up */
+ if (capng_get_caps_process()) {
+ fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
+ exit(1);
+ }
+ pthread_mutex_init(&cap.mutex, NULL);
+ pthread_mutex_lock(&cap.mutex);
+ cap.saved = capng_save_state();
+ if (!cap.saved) {
+ fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
+ exit(1);
+ }
+ pthread_mutex_unlock(&cap.mutex);
+}
+
+static void cleanup_capng(void)
+{
+ free(cap.saved);
+ cap.saved = NULL;
+ pthread_mutex_destroy(&cap.mutex);
+}
+
+
+/*
+ * Make the source directory our root so symlinks cannot escape and no other
+ * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
+ */
+static void setup_mounts(const char *source)
+{
+ int oldroot;
+ int newroot;
+
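+ /*
+ * Bind-mount the shared directory onto itself so it becomes a mount
+ * point; pivot_root(2) requires the new root to be a mount point.
+ */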
+ if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
+ exit(1);
+ }
+
+ /* This magic is based on lxc's lxc_pivot_root() */
+ oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ if (oldroot < 0) {
+ fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
+ exit(1);
+ }
+
+ newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ if (newroot < 0) {
+ fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
+ exit(1);
+ }
+
+ if (fchdir(newroot) < 0) {
+ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
+ exit(1);
+ }
+
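+ /*
+ * pivot_root(".", ".") makes the shared directory the new root and
+ * stacks the old root on top of the same directory; the old root is
+ * then detached below via the saved oldroot fd.
+ */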
+ if (syscall(__NR_pivot_root, ".", ".") < 0) {
+ fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
+ exit(1);
+ }
+
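+ /*
+ * Enter the old root via the saved fd, mark it slave so the detach does
+ * not propagate back to the host, then lazily unmount it.
+ */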
+ if (fchdir(oldroot) < 0) {
+ fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
+ exit(1);
+ }
+
+ if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
+ exit(1);
+ }
+
+ if (umount2(".", MNT_DETACH) < 0) {
+ fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
+ exit(1);
+ }
+
+ if (fchdir(newroot) < 0) {
+ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
+ exit(1);
+ }
+
+ close(newroot);
+ close(oldroot);
+}
+
+/*
+ * Lock down this process to prevent access to other processes or files
+ * outside the source directory. This reduces the impact of arbitrary code
+ * execution bugs.
+ */
+static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
+ bool enable_syslog)
+{
+ setup_namespaces(lo, se);
+ setup_mounts(lo->source);
+ setup_seccomp(enable_syslog);
+}
+
+/* Raise the maximum number of open file descriptors */
+static void setup_nofile_rlimit(void)
+{
+ const rlim_t max_fds = 1000000;
+ struct rlimit rlim;
+
+ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
+ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
+ exit(1);
+ }
+
+ if (rlim.rlim_cur >= max_fds) {
+ return; /* nothing to do */
+ }
+
+ rlim.rlim_cur = max_fds;
+ rlim.rlim_max = max_fds;
+
+ if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
+ /* Ignore SELinux denials */
+ if (errno == EPERM) {
+ return;
+ }
+
+ fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
+ exit(1);
+ }
+}
+
+static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
+{
+ g_autofree char *localfmt = NULL;
+
+ if (current_log_level < level) {
+ return;
+ }
+
+ if (current_log_level == FUSE_LOG_DEBUG) {
+ if (!use_syslog) {
+ localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
+ get_clock(), syscall(__NR_gettid), fmt);
+ } else {
+ localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
+ fmt);
+ }
+ fmt = localfmt;
+ }
+
+ if (use_syslog) {
+ int priority = LOG_ERR;
+ switch (level) {
+ case FUSE_LOG_EMERG:
+ priority = LOG_EMERG;
+ break;
+ case FUSE_LOG_ALERT:
+ priority = LOG_ALERT;
+ break;
+ case FUSE_LOG_CRIT:
+ priority = LOG_CRIT;
+ break;
+ case FUSE_LOG_ERR:
+ priority = LOG_ERR;
+ break;
+ case FUSE_LOG_WARNING:
+ priority = LOG_WARNING;
+ break;
+ case FUSE_LOG_NOTICE:
+ priority = LOG_NOTICE;
+ break;
+ case FUSE_LOG_INFO:
+ priority = LOG_INFO;
+ break;
+ case FUSE_LOG_DEBUG:
+ priority = LOG_DEBUG;
+ break;
+ }
+ vsyslog(priority, fmt, ap);
+ } else {
+ vfprintf(stderr, fmt, ap);
+ }
+}
+
+static void setup_root(struct lo_data *lo, struct lo_inode *root)
+{
+ int fd, res;
+ struct stat stat;
+
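+ /*
+ * setup_mounts() has already made the shared directory our root, so
+ * opening "/" yields the export root itself.
+ */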
+ fd = open("/", O_PATH);
+ if (fd == -1) {
+ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
+ exit(1);
+ }
+
+ res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
+ exit(1);
+ }
+
+ root->is_symlink = false;
+ root->fd = fd;
+ root->key.ino = stat.st_ino;
+ root->key.dev = stat.st_dev;
+ root->nlookup = 2;
+ g_atomic_int_set(&root->refcount, 2);
+}
+
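+/* Inodes are keyed by (st_dev, st_ino); fold both into the hash value */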
+static guint lo_key_hash(gconstpointer key)
+{
+ const struct lo_key *lkey = key;
+
+ return (guint)lkey->ino + (guint)lkey->dev;
+}
+
+static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
+{
+ const struct lo_key *la = a;
+ const struct lo_key *lb = b;
+
+ return la->ino == lb->ino && la->dev == lb->dev;
+}
+
+static void fuse_lo_data_cleanup(struct lo_data *lo)
+{
+ if (lo->inodes) {
+ g_hash_table_destroy(lo->inodes);
+ }
+ lo_map_destroy(&lo->fd_map);
+ lo_map_destroy(&lo->dirp_map);
+ lo_map_destroy(&lo->ino_map);
+
+ if (lo->proc_self_fd >= 0) {
+ close(lo->proc_self_fd);
+ }
+
+ if (lo->root.fd >= 0) {
+ close(lo->root.fd);
+ }
+
+ free(lo->source);
+}
+
+int main(int argc, char *argv[])
+{
+ struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ struct fuse_session *se;
+ struct fuse_cmdline_opts opts;
+ struct lo_data lo = {
+ .debug = 0,
+ .writeback = 0,
+ .posix_lock = 1,
+ .proc_self_fd = -1,
+ };
+ struct lo_map_elem *root_elem;
+ int ret = -1;
+
+ /* Don't mask creation mode; the kernel already did that */
+ umask(0);
+
+ pthread_mutex_init(&lo.mutex, NULL);
+ lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
+ lo.root.fd = -1;
+ lo.root.fuse_ino = FUSE_ROOT_ID;
+ lo.cache = CACHE_AUTO;
+
+ /*
+ * Set up the ino map like this:
+ * [0] Reserved (will not be used)
+ * [1] Root inode
+ */
+ lo_map_init(&lo.ino_map);
+ lo_map_reserve(&lo.ino_map, 0)->in_use = false;
+ root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
+ root_elem->inode = &lo.root;
+
+ lo_map_init(&lo.dirp_map);
+ lo_map_init(&lo.fd_map);
+
+ if (fuse_parse_cmdline(&args, &opts) != 0) {
+ goto err_out1;
+ }
+ fuse_set_log_func(log_func);
+ use_syslog = opts.syslog;
+ if (use_syslog) {
+ openlog("virtiofsd", LOG_PID, LOG_DAEMON);
+ }
+
+ if (opts.show_help) {
+ printf("usage: %s [options]\n\n", argv[0]);
+ fuse_cmdline_help();
+ printf(" -o source=PATH shared directory tree\n");
+ fuse_lowlevel_help();
+ ret = 0;
+ goto err_out1;
+ } else if (opts.show_version) {
+ fuse_lowlevel_version();
+ ret = 0;
+ goto err_out1;
+ } else if (opts.print_capabilities) {
+ print_capabilities();
+ ret = 0;
+ goto err_out1;
+ }
+
+ if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
+ goto err_out1;
+ }
+
+ /*
+ * log_level is 0 if not configured via cmd options (0 is LOG_EMERG,
+ * and we don't use this log level).
+ */
+ if (opts.log_level != 0) {
+ current_log_level = opts.log_level;
+ }
+ lo.debug = opts.debug;
+ if (lo.debug) {
+ current_log_level = FUSE_LOG_DEBUG;
+ }
+ if (lo.source) {
+ struct stat stat;
+ int res;
+
+ res = lstat(lo.source, &stat);
+ if (res == -1) {
+ fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
+ lo.source);
+ exit(1);
+ }
+ if (!S_ISDIR(stat.st_mode)) {
+ fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
+ exit(1);
+ }
+ } else {
+ lo.source = strdup("/");
+ }
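+ /*
+ * No explicit timeout given: derive the entry/attribute timeout (in
+ * seconds) from the selected cache mode.
+ */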
+ if (!lo.timeout_set) {
+ switch (lo.cache) {
+ case CACHE_NONE:
+ lo.timeout = 0.0;
+ break;
+
+ case CACHE_AUTO:
+ lo.timeout = 1.0;
+ break;
+
+ case CACHE_ALWAYS:
+ lo.timeout = 86400.0;
+ break;
+ }
+ } else if (lo.timeout < 0) {
+ fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
+ exit(1);
+ }
+
+ se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
+ if (se == NULL) {
+ goto err_out1;
+ }
+
+ if (fuse_set_signal_handlers(se) != 0) {
+ goto err_out2;
+ }
+
+ if (fuse_session_mount(se) != 0) {
+ goto err_out3;
+ }
+
+ fuse_daemonize(opts.foreground);
+
+ setup_nofile_rlimit();
+
+ /* Must be before sandbox since it wants /proc */
+ setup_capng();
+
+ setup_sandbox(&lo, se, opts.syslog);
+
+ setup_root(&lo, &lo.root);
+ /* Block until ctrl+c or fusermount -u */
+ ret = virtio_loop(se);
+
+ fuse_session_unmount(se);
+ cleanup_capng();
+err_out3:
+ fuse_remove_signal_handlers(se);
+err_out2:
+ fuse_session_destroy(se);
+err_out1:
+ fuse_opt_free_args(&args);
+
+ fuse_lo_data_cleanup(&lo);
+
+ return ret ? 1 : 0;
+}
diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c
new file mode 100644
index 0000000000..2d9d4a7ec0
--- /dev/null
+++ b/tools/virtiofsd/seccomp.c
@@ -0,0 +1,165 @@
+/*
+ * Seccomp sandboxing for virtiofsd
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "seccomp.h"
+#include "fuse_i.h"
+#include "fuse_log.h"
+#include <errno.h>
+#include <glib.h>
+#include <seccomp.h>
+#include <stdlib.h>
+
+/* Bodge for libseccomp 2.4.2 which broke ppoll */
+#if !defined(__SNR_ppoll) && defined(__SNR_brk)
+#ifdef __NR_ppoll
+#define __SNR_ppoll __NR_ppoll
+#else
+#define __SNR_ppoll __PNR_ppoll
+#endif
+#endif
+
+static const int syscall_whitelist[] = {
+ /* TODO ireg sem*() syscalls */
+ SCMP_SYS(brk),
+ SCMP_SYS(capget), /* For CAP_FSETID */
+ SCMP_SYS(capset),
+ SCMP_SYS(clock_gettime),
+ SCMP_SYS(clone),
+#ifdef __NR_clone3
+ SCMP_SYS(clone3),
+#endif
+ SCMP_SYS(close),
+ SCMP_SYS(copy_file_range),
+ SCMP_SYS(dup),
+ SCMP_SYS(eventfd2),
+ SCMP_SYS(exit),
+ SCMP_SYS(exit_group),
+ SCMP_SYS(fallocate),
+ SCMP_SYS(fchmodat),
+ SCMP_SYS(fchownat),
+ SCMP_SYS(fcntl),
+ SCMP_SYS(fdatasync),
+ SCMP_SYS(fgetxattr),
+ SCMP_SYS(flistxattr),
+ SCMP_SYS(flock),
+ SCMP_SYS(fremovexattr),
+ SCMP_SYS(fsetxattr),
+ SCMP_SYS(fstat),
+ SCMP_SYS(fstatfs),
+ SCMP_SYS(fsync),
+ SCMP_SYS(ftruncate),
+ SCMP_SYS(futex),
+ SCMP_SYS(getdents),
+ SCMP_SYS(getdents64),
+ SCMP_SYS(getegid),
+ SCMP_SYS(geteuid),
+ SCMP_SYS(getpid),
+ SCMP_SYS(gettid),
+ SCMP_SYS(gettimeofday),
+ SCMP_SYS(linkat),
+ SCMP_SYS(lseek),
+ SCMP_SYS(madvise),
+ SCMP_SYS(mkdirat),
+ SCMP_SYS(mknodat),
+ SCMP_SYS(mmap),
+ SCMP_SYS(mprotect),
+ SCMP_SYS(mremap),
+ SCMP_SYS(munmap),
+ SCMP_SYS(newfstatat),
+ SCMP_SYS(open),
+ SCMP_SYS(openat),
+ SCMP_SYS(ppoll),
+ SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */
+ SCMP_SYS(preadv),
+ SCMP_SYS(pread64),
+ SCMP_SYS(pwritev),
+ SCMP_SYS(pwrite64),
+ SCMP_SYS(read),
+ SCMP_SYS(readlinkat),
+ SCMP_SYS(recvmsg),
+ SCMP_SYS(renameat),
+ SCMP_SYS(renameat2),
+ SCMP_SYS(rt_sigaction),
+ SCMP_SYS(rt_sigprocmask),
+ SCMP_SYS(rt_sigreturn),
+ SCMP_SYS(sendmsg),
+ SCMP_SYS(setresgid),
+ SCMP_SYS(setresuid),
+#ifdef __NR_setresgid32
+ SCMP_SYS(setresgid32),
+#endif
+#ifdef __NR_setresuid32
+ SCMP_SYS(setresuid32),
+#endif
+ SCMP_SYS(set_robust_list),
+ SCMP_SYS(symlinkat),
+ SCMP_SYS(time), /* Rarely needed, except on static builds */
+ SCMP_SYS(tgkill),
+ SCMP_SYS(unlinkat),
+ SCMP_SYS(utimensat),
+ SCMP_SYS(write),
+ SCMP_SYS(writev),
+};
+
+/* Syscalls used when --syslog is enabled */
+static const int syscall_whitelist_syslog[] = {
+ SCMP_SYS(sendto),
+};
+
+static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscalls[i], 0) != 0) {
+ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d failed\n",
+ syscalls[i]);
+ exit(1);
+ }
+ }
+}
+
+void setup_seccomp(bool enable_syslog)
+{
+ scmp_filter_ctx ctx;
+
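+ /*
+ * Any syscall not explicitly whitelisted below kills the process (or
+ * raises SIGSYS where SCMP_ACT_KILL_PROCESS is unavailable).
+ */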
+#ifdef SCMP_ACT_KILL_PROCESS
+ ctx = seccomp_init(SCMP_ACT_KILL_PROCESS);
+ /* Handle a newer libseccomp but an older kernel */
+ if (!ctx && errno == EOPNOTSUPP) {
+ ctx = seccomp_init(SCMP_ACT_TRAP);
+ }
+#else
+ ctx = seccomp_init(SCMP_ACT_TRAP);
+#endif
+ if (!ctx) {
+ fuse_log(FUSE_LOG_ERR, "seccomp_init() failed\n");
+ exit(1);
+ }
+
+ add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist));
+ if (enable_syslog) {
+ add_whitelist(ctx, syscall_whitelist_syslog,
+ G_N_ELEMENTS(syscall_whitelist_syslog));
+ }
+
+ /* libvhost-user calls this for post-copy migration; we don't need it */
+ if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOSYS),
+ SCMP_SYS(userfaultfd), 0) != 0) {
+ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add userfaultfd failed\n");
+ exit(1);
+ }
+
+ if (seccomp_load(ctx) < 0) {
+ fuse_log(FUSE_LOG_ERR, "seccomp_load() failed\n");
+ exit(1);
+ }
+
+ seccomp_release(ctx);
+}
diff --git a/tools/virtiofsd/seccomp.h b/tools/virtiofsd/seccomp.h
new file mode 100644
index 0000000000..d47c8eade6
--- /dev/null
+++ b/tools/virtiofsd/seccomp.h
@@ -0,0 +1,16 @@
+/*
+ * Seccomp sandboxing for virtiofsd
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VIRTIOFSD_SECCOMP_H
+#define VIRTIOFSD_SECCOMP_H
+
+#include <stdbool.h>
+
+void setup_seccomp(bool enable_syslog);
+
+#endif /* VIRTIOFSD_SECCOMP_H */
diff --git a/trace/mem-internal.h b/trace/mem-internal.h
index 0a32aa22ca..8b72b678fa 100644
--- a/trace/mem-internal.h
+++ b/trace/mem-internal.h
@@ -47,21 +47,4 @@ static inline uint16_t trace_mem_get_info(MemOp op,
mmu_idx);
}
-/* Used by the atomic helpers */
-static inline
-uint16_t trace_mem_build_info_no_se_be(int size_shift, bool store,
- TCGMemOpIdx oi)
-{
- return trace_mem_build_info(size_shift, false, MO_BE, store,
- get_mmuidx(oi));
-}
-
-static inline
-uint16_t trace_mem_build_info_no_se_le(int size_shift, bool store,
- TCGMemOpIdx oi)
-{
- return trace_mem_build_info(size_shift, false, MO_LE, store,
- get_mmuidx(oi));
-}
-
#endif /* TRACE__MEM_INTERNAL_H */
diff --git a/ui/console.c b/ui/console.c
index 69339b028b..179901c35e 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -2338,6 +2338,7 @@ void qemu_display_help(void)
int idx;
printf("Available display backend types:\n");
+ printf("none\n");
for (idx = DISPLAY_TYPE_NONE; idx < DISPLAY_TYPE__MAX; idx++) {
if (!dpys[idx]) {
ui_module_load_one(DisplayType_str(idx));
diff --git a/ui/vnc-enc-zrle.c b/ui/vnc-enc-zrle.c
index 17fd28a2e2..b4f71e32cf 100644
--- a/ui/vnc-enc-zrle.c
+++ b/ui/vnc-enc-zrle.c
@@ -98,8 +98,8 @@ static int zrle_compress_data(VncState *vs, int level)
/* set pointers */
zstream->next_in = vs->zrle->zrle.buffer;
zstream->avail_in = vs->zrle->zrle.offset;
- zstream->next_out = vs->zrle->zlib.buffer + vs->zrle->zlib.offset;
- zstream->avail_out = vs->zrle->zlib.capacity - vs->zrle->zlib.offset;
+ zstream->next_out = vs->zrle->zlib.buffer;
+ zstream->avail_out = vs->zrle->zlib.capacity;
zstream->data_type = Z_BINARY;
/* start encoding */
diff --git a/ui/vnc.c b/ui/vnc.c
index 4100d6e404..1d7138a3a0 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -898,8 +898,6 @@ int vnc_raw_send_framebuffer_update(VncState *vs, int x, int y, int w, int h)
int vnc_send_framebuffer_update(VncState *vs, int x, int y, int w, int h)
{
int n = 0;
- bool encode_raw = false;
- size_t saved_offs = vs->output.offset;
switch(vs->vnc_encoding) {
case VNC_ENCODING_ZLIB:
@@ -922,24 +920,10 @@ int vnc_send_framebuffer_update(VncState *vs, int x, int y, int w, int h)
n = vnc_zywrle_send_framebuffer_update(vs, x, y, w, h);
break;
default:
- encode_raw = true;
+ vnc_framebuffer_update(vs, x, y, w, h, VNC_ENCODING_RAW);
+ n = vnc_raw_send_framebuffer_update(vs, x, y, w, h);
break;
}
-
- /* If the client has the same pixel format as our internal buffer and
- * a RAW encoding would need less space fall back to RAW encoding to
- * save bandwidth and processing power in the client. */
- if (!encode_raw && vs->write_pixels == vnc_write_pixels_copy &&
- 12 + h * w * VNC_SERVER_FB_BYTES <= (vs->output.offset - saved_offs)) {
- vs->output.offset = saved_offs;
- encode_raw = true;
- }
-
- if (encode_raw) {
- vnc_framebuffer_update(vs, x, y, w, h, VNC_ENCODING_RAW);
- n = vnc_raw_send_framebuffer_update(vs, x, y, w, h);
- }
-
return n;
}
@@ -2087,8 +2071,15 @@ static void set_encodings(VncState *vs, int32_t *encodings, size_t n_encodings)
break;
#endif
case VNC_ENCODING_ZLIB:
- vs->features |= VNC_FEATURE_ZLIB_MASK;
- vs->vnc_encoding = enc;
+ /*
+ * VNC_ENCODING_ZRLE compresses better than VNC_ENCODING_ZLIB.
+ * So prioritize ZRLE, even if the client hints that it prefers
+ * ZLIB.
+ */
+ if ((vs->features & VNC_FEATURE_ZRLE_MASK) == 0) {
+ vs->features |= VNC_FEATURE_ZLIB_MASK;
+ vs->vnc_encoding = enc;
+ }
break;
case VNC_ENCODING_ZRLE:
vs->features |= VNC_FEATURE_ZRLE_MASK;
diff --git a/util/cacheinfo.c b/util/cacheinfo.c
index ea6f3e99bf..d94dc6adc8 100644
--- a/util/cacheinfo.c
+++ b/util/cacheinfo.c
@@ -93,10 +93,16 @@ static void sys_cache_info(int *isize, int *dsize)
static void sys_cache_info(int *isize, int *dsize)
{
# ifdef _SC_LEVEL1_ICACHE_LINESIZE
- *isize = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+ int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+ if (tmp_isize > 0) {
+ *isize = tmp_isize;
+ }
# endif
# ifdef _SC_LEVEL1_DCACHE_LINESIZE
- *dsize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+ int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+ if (tmp_dsize > 0) {
+ *dsize = tmp_dsize;
+ }
# endif
}
#endif /* sys_cache_info */
diff --git a/vl.c b/vl.c
index 751401214c..4c5033842c 100644
--- a/vl.c
+++ b/vl.c
@@ -1604,9 +1604,6 @@ static bool main_loop_should_exit(void)
RunState r;
ShutdownCause request;
- if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
- return false;
- }
if (preconfig_exit_requested) {
if (runstate_check(RUN_STATE_PRECONFIG)) {
runstate_set(RUN_STATE_PRELAUNCH);
@@ -1635,8 +1632,13 @@ static bool main_loop_should_exit(void)
pause_all_vcpus();
qemu_system_reset(request);
resume_all_vcpus();
+ /*
+ * runstate can change in pause_all_vcpus()
+ * as the iothread mutex is unlocked
+ */
if (!runstate_check(RUN_STATE_RUNNING) &&
- !runstate_check(RUN_STATE_INMIGRATE)) {
+ !runstate_check(RUN_STATE_INMIGRATE) &&
+ !runstate_check(RUN_STATE_FINISH_MIGRATE)) {
runstate_set(RUN_STATE_PRELAUNCH);
}
}
@@ -2753,8 +2755,6 @@ static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp)
static void configure_accelerators(const char *progname)
{
const char *accel;
- char **accel_list, **tmp;
- bool accel_initialised = false;
bool init_failed = false;
qemu_opts_foreach(qemu_find_opts("icount"),
@@ -2762,26 +2762,33 @@ static void configure_accelerators(const char *progname)
accel = qemu_opt_get(qemu_get_machine_opts(), "accel");
if (QTAILQ_EMPTY(&qemu_accel_opts.head)) {
+ char **accel_list, **tmp;
+
if (accel == NULL) {
/* Select the default accelerator */
- if (!accel_find("tcg") && !accel_find("kvm")) {
- error_report("No accelerator selected and"
- " no default accelerator available");
- exit(1);
- } else {
- int pnlen = strlen(progname);
- if (pnlen >= 3 && g_str_equal(&progname[pnlen - 3], "kvm")) {
+ bool have_tcg = accel_find("tcg");
+ bool have_kvm = accel_find("kvm");
+
+ if (have_tcg && have_kvm) {
+ if (g_str_has_suffix(progname, "kvm")) {
/* If the program name ends with "kvm", we prefer KVM */
accel = "kvm:tcg";
} else {
accel = "tcg:kvm";
}
+ } else if (have_kvm) {
+ accel = "kvm";
+ } else if (have_tcg) {
+ accel = "tcg";
+ } else {
+ error_report("No accelerator selected and"
+ " no default accelerator available");
+ exit(1);
}
}
-
accel_list = g_strsplit(accel, ":", 0);
- for (tmp = accel_list; !accel_initialised && tmp && *tmp; tmp++) {
+ for (tmp = accel_list; *tmp; tmp++) {
/*
* Filter invalid accelerators here, to prevent obscenities
* such as "-machine accel=tcg,,thread=single".