251 files changed, 19846 insertions(+), 2996 deletions(-)
diff --git a/.gitignore b/.gitignore index efad605e1a..bc0a035f9c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ /config-target.* /config.status /config-temp +/tools/virtiofsd/50-qemu-virtiofsd.json /elf2dmp /trace-events-all /trace/generated-events.h diff --git a/.gitlab-ci-edk2.yml b/.gitlab-ci-edk2.yml new file mode 100644 index 0000000000..088ba4b43a --- /dev/null +++ b/.gitlab-ci-edk2.yml @@ -0,0 +1,49 @@ +docker-edk2: + stage: build + rules: # Only run this job when the Dockerfile is modified + - changes: + - .gitlab-ci-edk2.yml + - .gitlab-ci.d/edk2/Dockerfile + when: always + image: docker:19.03.1 + services: + - docker:19.03.1-dind + variables: + GIT_DEPTH: 3 + IMAGE_TAG: $CI_REGISTRY_IMAGE:edk2-cross-build + # We don't use TLS + DOCKER_HOST: tcp://docker:2375 + DOCKER_TLS_CERTDIR: "" + before_script: + - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + script: + - docker pull $IMAGE_TAG || true + - docker build --cache-from $IMAGE_TAG --tag $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA + --tag $IMAGE_TAG .gitlab-ci.d/edk2 + - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA + - docker push $IMAGE_TAG + +build-edk2: + rules: # Only run this job when ... + - changes: # ... roms/edk2/ is modified (submodule updated) + - roms/edk2/* + when: always + - if: '$CI_COMMIT_REF_NAME =~ /^edk2/' # or the branch/tag starts with 'edk2' + when: always + - if: '$CI_COMMIT_MESSAGE =~ /edk2/i' # or last commit description contains 'EDK2' + when: always + artifacts: + paths: # 'artifacts.zip' will contains the following files: + - pc-bios/edk2*bz2 + - pc-bios/edk2-licenses.txt + - edk2-stdout.log + - edk2-stderr.log + image: $CI_REGISTRY_IMAGE:edk2-cross-build + variables: + GIT_DEPTH: 3 + script: # Clone the required submodules and build EDK2 + - git submodule update --init roms/edk2 + - git -C roms/edk2 submodule update --init + - export JOBS=$(($(getconf _NPROCESSORS_ONLN) + 1)) + - echo "=== Using ${JOBS} simultaneous jobs ===" + - make -j${JOBS} -C roms efi 2>&1 1>edk2-stdout.log | tee -a edk2-stderr.log >&2 diff --git a/.gitlab-ci.d/edk2/Dockerfile b/.gitlab-ci.d/edk2/Dockerfile new file mode 100644 index 0000000000..b4584d1cf6 --- /dev/null +++ b/.gitlab-ci.d/edk2/Dockerfile @@ -0,0 +1,27 @@ +# +# Docker image to cross-compile EDK2 firmware binaries +# +FROM ubuntu:16.04 + +MAINTAINER Philippe Mathieu-Daudé <philmd@redhat.com> + +# Install packages required to build EDK2 +RUN apt update \ + && \ + \ + DEBIAN_FRONTEND=noninteractive \ + apt install --assume-yes --no-install-recommends \ + build-essential \ + ca-certificates \ + dos2unix \ + gcc-aarch64-linux-gnu \ + gcc-arm-linux-gnueabi \ + git \ + iasl \ + make \ + nasm \ + python \ + uuid-dev \ + && \ + \ + rm -rf /var/lib/apt/lists/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dce8f2d3f5..228783993e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,6 @@ +include: + - local: '/.gitlab-ci-edk2.yml' + before_script: - apt-get update -qq - apt-get install -y -qq flex bison libglib2.0-dev libpixman-1-dev genisoimage @@ -87,6 +90,7 @@ build-tci: - ../configure --enable-tcg-interpreter --target-list="$(for tg in $TARGETS; do echo -n ${tg}'-softmmu '; done)" - make -j2 + - make run-tcg-tests-x86_64-softmmu - make tests/qtest/boot-serial-test tests/qtest/cdrom-test tests/qtest/pxe-test - for tg in $TARGETS ; do export QTEST_QEMU_BINARY="${tg}-softmmu/qemu-system-${tg}" ; diff --git a/MAINTAINERS b/MAINTAINERS index 483edfbc0b..f6511d5120 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -816,12 +816,26 @@ F: 
hw/adc/* F: hw/ssi/stm32f2xx_spi.c F: include/hw/*/stm32*.h +STM32F405 +M: Alistair Francis <alistair@alistair23.me> +M: Peter Maydell <peter.maydell@linaro.org> +S: Maintained +F: hw/arm/stm32f405_soc.c +F: hw/misc/stm32f4xx_syscfg.c +F: hw/misc/stm32f4xx_exti.c + Netduino 2 M: Alistair Francis <alistair@alistair23.me> M: Peter Maydell <peter.maydell@linaro.org> S: Maintained F: hw/arm/netduino2.c +Netduino Plus 2 +M: Alistair Francis <alistair@alistair23.me> +M: Peter Maydell <peter.maydell@linaro.org> +S: Maintained +F: hw/arm/netduinoplus2.c + SmartFusion2 M: Subbaraya Sundeep <sundeep.lkml@gmail.com> M: Peter Maydell <peter.maydell@linaro.org> @@ -1581,6 +1595,14 @@ T: git https://github.com/cohuck/qemu.git s390-next T: git https://github.com/borntraeger/qemu.git s390-next L: qemu-s390x@nongnu.org +virtiofs +M: Dr. David Alan Gilbert <dgilbert@redhat.com> +M: Stefan Hajnoczi <stefanha@redhat.com> +S: Supported +F: tools/virtiofsd/* +F: hw/virtio/vhost-user-fs* +F: include/hw/virtio/vhost-user-fs.h + virtio-input M: Gerd Hoffmann <kraxel@redhat.com> S: Maintained @@ -2354,6 +2376,8 @@ F: roms/edk2 F: roms/edk2-* F: tests/data/uefi-boot-images/ F: tests/uefi-test-tools/ +F: .gitlab-ci-edk2.yml +F: .gitlab-ci.d/edk2/ Usermode Emulation ------------------ @@ -2382,6 +2406,7 @@ Common TCG code M: Richard Henderson <rth@twiddle.net> S: Maintained F: tcg/ +F: include/tcg/ TCG Plugins M: Alex Bennée <alex.bennee@linaro.org> @@ -2391,8 +2416,7 @@ F: plugins/ F: tests/plugin AArch64 TCG target -M: Claudio Fontana <claudio.fontana@huawei.com> -M: Claudio Fontana <claudio.fontana@gmail.com> +M: Richard Henderson <richard.henderson@linaro.org> S: Maintained L: qemu-arm@nongnu.org F: tcg/aarch64/ @@ -2503,6 +2527,7 @@ F: include/block/nbd* F: qemu-nbd.* F: blockdev-nbd.c F: docs/interop/nbd.txt +F: docs/interop/qemu-nbd.rst T: git https://repo.or.cz/qemu/ericb.git nbd NFS @@ -327,6 +327,11 @@ HELPERS-y += vhost-user-gpu$(EXESUF) vhost-user-json-y += contrib/vhost-user-gpu/50-qemu-gpu.json endif +ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy) +HELPERS-y += virtiofsd$(EXESUF) +vhost-user-json-y += tools/virtiofsd/50-qemu-virtiofsd.json +endif + # Sphinx does not allow building manuals into the same directory as # the source files, so if we're doing an in-tree QEMU build we must # build the manuals into a subdirectory (and then install them from @@ -339,10 +344,12 @@ MANUAL_BUILDDIR := docs endif ifdef BUILD_DOCS -DOCS=qemu-doc.html qemu-doc.txt qemu.1 qemu-img.1 qemu-nbd.8 $(MANUAL_BUILDDIR)/interop/qemu-ga.8 +DOCS=qemu-doc.html qemu-doc.txt qemu.1 qemu-img.1 +DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-nbd.8 +DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-ga.8 +DOCS+=$(MANUAL_BUILDDIR)/system/qemu-block-drivers.7 DOCS+=docs/interop/qemu-qmp-ref.html docs/interop/qemu-qmp-ref.txt docs/interop/qemu-qmp-ref.7 DOCS+=docs/interop/qemu-ga-ref.html docs/interop/qemu-ga-ref.txt docs/interop/qemu-ga-ref.7 -DOCS+=docs/qemu-block-drivers.7 DOCS+=docs/qemu-cpu-models.7 DOCS+=$(MANUAL_BUILDDIR)/index.html ifdef CONFIG_VIRTFS @@ -429,6 +436,7 @@ dummy := $(call unnest-vars,, \ elf2dmp-obj-y \ ivshmem-client-obj-y \ ivshmem-server-obj-y \ + virtiofsd-obj-y \ rdmacm-mux-obj-y \ libvhost-user-obj-y \ vhost-user-scsi-obj-y \ @@ -668,6 +676,12 @@ rdmacm-mux$(EXESUF): LIBS += "-libumad" rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS) $(call LINK, $^) +# relies on Linux-specific syscalls +ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy) +virtiofsd$(EXESUF): $(virtiofsd-obj-y) libvhost-user.a 
$(COMMON_LDADDS) + $(call LINK, $^) +endif + vhost-user-gpu$(EXESUF): $(vhost-user-gpu-obj-y) $(libvhost-user-obj-y) libqemuutil.a libqemustub.a $(call LINK, $^) @@ -749,12 +763,12 @@ distclean: clean rm -f docs/interop/qemu-qmp-ref.txt docs/interop/qemu-ga-ref.txt rm -f docs/interop/qemu-qmp-ref.pdf docs/interop/qemu-ga-ref.pdf rm -f docs/interop/qemu-qmp-ref.html docs/interop/qemu-ga-ref.html - rm -f docs/qemu-block-drivers.7 rm -f docs/qemu-cpu-models.7 rm -rf .doctrees $(call clean-manual,devel) $(call clean-manual,interop) $(call clean-manual,specs) + $(call clean-manual,system) for d in $(TARGET_DIRS); do \ rm -rf $$d || exit 1 ; \ done @@ -811,6 +825,7 @@ endef install-sphinxdocs: sphinxdocs $(call install-manual,interop) $(call install-manual,specs) + $(call install-manual,system) install-doc: $(DOCS) install-sphinxdocs $(INSTALL_DIR) "$(DESTDIR)$(qemu_docdir)" @@ -824,12 +839,12 @@ ifdef CONFIG_POSIX $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1" $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man7" $(INSTALL_DATA) docs/interop/qemu-qmp-ref.7 "$(DESTDIR)$(mandir)/man7" - $(INSTALL_DATA) docs/qemu-block-drivers.7 "$(DESTDIR)$(mandir)/man7" + $(INSTALL_DATA) $(MANUAL_BUILDDIR)/system/qemu-block-drivers.7 "$(DESTDIR)$(mandir)/man7" $(INSTALL_DATA) docs/qemu-cpu-models.7 "$(DESTDIR)$(mandir)/man7" ifeq ($(CONFIG_TOOLS),y) $(INSTALL_DATA) qemu-img.1 "$(DESTDIR)$(mandir)/man1" $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man8" - $(INSTALL_DATA) qemu-nbd.8 "$(DESTDIR)$(mandir)/man8" + $(INSTALL_DATA) $(MANUAL_BUILDDIR)/interop/qemu-nbd.8 "$(DESTDIR)$(mandir)/man8" endif ifdef CONFIG_TRACE_SYSTEMTAP $(INSTALL_DATA) scripts/qemu-trace-stap.1 "$(DESTDIR)$(mandir)/man1" @@ -998,7 +1013,10 @@ docs/version.texi: $(SRC_PATH)/VERSION config-host.mak # and handles "don't rebuild things unless necessary" itself. # The '.doctrees' files are cached information to speed this up. 
.PHONY: sphinxdocs -sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html $(MANUAL_BUILDDIR)/interop/index.html $(MANUAL_BUILDDIR)/specs/index.html +sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html \ + $(MANUAL_BUILDDIR)/interop/index.html \ + $(MANUAL_BUILDDIR)/specs/index.html \ + $(MANUAL_BUILDDIR)/system/index.html # Canned command to build a single manual # Arguments: $1 = manual name, $2 = Sphinx builder ('html' or 'man') @@ -1007,7 +1025,9 @@ sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html $(MANUAL_BUILDDIR)/interop/index # a single doctree: https://github.com/sphinx-doc/sphinx/issues/2946 build-manual = $(call quiet-command,CONFDIR="$(qemu_confdir)" sphinx-build $(if $(V),,-q) -W -b $2 -D version=$(VERSION) -D release="$(FULL_VERSION)" -d .doctrees/$1-$2 $(SRC_PATH)/docs/$1 $(MANUAL_BUILDDIR)/$1 ,"SPHINX","$(MANUAL_BUILDDIR)/$1") # We assume all RST files in the manual's directory are used in it -manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst) $(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py +manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst) \ + $(wildcard $(SRC_PATH)/docs/$1/*.rst.inc) \ + $(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py $(MANUAL_BUILDDIR)/devel/index.html: $(call manual-deps,devel) $(call build-manual,devel,html) @@ -1018,10 +1038,20 @@ $(MANUAL_BUILDDIR)/interop/index.html: $(call manual-deps,interop) $(MANUAL_BUILDDIR)/specs/index.html: $(call manual-deps,specs) $(call build-manual,specs,html) +$(MANUAL_BUILDDIR)/system/index.html: $(call manual-deps,system) + $(call build-manual,system,html) + $(MANUAL_BUILDDIR)/interop/qemu-ga.8: $(call manual-deps,interop) $(call build-manual,interop,man) +$(MANUAL_BUILDDIR)/interop/qemu-nbd.8: $(call manual-deps,interop) + $(call build-manual,interop,man) + +$(MANUAL_BUILDDIR)/system/qemu-block-drivers.7: $(call manual-deps,system) + $(call build-manual,system,man) + $(MANUAL_BUILDDIR)/index.html: $(SRC_PATH)/docs/index.html.in qemu-version.h + @mkdir -p "$(MANUAL_BUILDDIR)" $(call quiet-command, sed "s|@@VERSION@@|${VERSION}|g" $< >$@, \ "GEN","$@") @@ -1047,8 +1077,6 @@ qemu.1: qemu-doc.texi qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi qemu.1: qemu-option-trace.texi qemu-img.1: qemu-img.texi qemu-option-trace.texi qemu-img-cmds.texi fsdev/virtfs-proxy-helper.1: fsdev/virtfs-proxy-helper.texi -qemu-nbd.8: qemu-nbd.texi qemu-option-trace.texi -docs/qemu-block-drivers.7: docs/qemu-block-drivers.texi docs/qemu-cpu-models.7: docs/qemu-cpu-models.texi scripts/qemu-trace-stap.1: scripts/qemu-trace-stap.texi @@ -1058,10 +1086,10 @@ pdf: qemu-doc.pdf docs/interop/qemu-qmp-ref.pdf docs/interop/qemu-ga-ref.pdf txt: qemu-doc.txt docs/interop/qemu-qmp-ref.txt docs/interop/qemu-ga-ref.txt qemu-doc.html qemu-doc.info qemu-doc.pdf qemu-doc.txt: \ - qemu-img.texi qemu-nbd.texi qemu-options.texi \ + qemu-img.texi qemu-options.texi \ qemu-tech.texi qemu-option-trace.texi \ qemu-deprecated.texi qemu-monitor.texi qemu-img-cmds.texi \ - qemu-monitor-info.texi docs/qemu-block-drivers.texi \ + qemu-monitor-info.texi \ docs/qemu-cpu-models.texi docs/security.texi docs/interop/qemu-ga-ref.dvi docs/interop/qemu-ga-ref.html \ diff --git a/Makefile.objs b/Makefile.objs index 7c1e50f9d6..ff396b9209 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -123,6 +123,7 @@ vhost-user-blk-obj-y = contrib/vhost-user-blk/ rdmacm-mux-obj-y = contrib/rdmacm-mux/ vhost-user-input-obj-y = contrib/vhost-user-input/ vhost-user-gpu-obj-y = contrib/vhost-user-gpu/ +virtiofsd-obj-y = tools/virtiofsd/ 
###################################################################### trace-events-subdirs = diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h index 837676231f..26969487d6 100644 --- a/accel/tcg/atomic_template.h +++ b/accel/tcg/atomic_template.h @@ -64,13 +64,10 @@ the ATOMIC_NAME macro, and redefined below. */ #if DATA_SIZE == 1 # define END -# define MEND _be /* either le or be would be fine */ #elif defined(HOST_WORDS_BIGENDIAN) # define END _be -# define MEND _be #else # define END _le -# define MEND _le #endif ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, @@ -79,8 +76,8 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; DATA_TYPE ret; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, + ATOMIC_MMU_IDX); atomic_trace_rmw_pre(env, addr, info); #if DATA_SIZE == 16 @@ -99,8 +96,8 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS) { ATOMIC_MMU_DECLS; DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, + ATOMIC_MMU_IDX); atomic_trace_ld_pre(env, addr, info); val = atomic16_read(haddr); @@ -114,8 +111,8 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, { ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, true, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, 0, true, + ATOMIC_MMU_IDX); atomic_trace_st_pre(env, addr, info); atomic16_set(haddr, val); @@ -130,8 +127,8 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; DATA_TYPE ret; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, false, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, + ATOMIC_MMU_IDX); atomic_trace_rmw_pre(env, addr, info); ret = atomic_xchg__nocheck(haddr, val); @@ -147,10 +144,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ DATA_TYPE ret; \ - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ - false, \ - ATOMIC_MMU_IDX); \ - \ + uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, \ + ATOMIC_MMU_IDX); \ atomic_trace_rmw_pre(env, addr, info); \ ret = atomic_##X(haddr, val); \ ATOMIC_MMU_CLEANUP; \ @@ -183,10 +178,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ XDATA_TYPE cmp, old, new, val = xval; \ - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ - false, \ - ATOMIC_MMU_IDX); \ - \ + uint16_t info = trace_mem_build_info(SHIFT, false, 0, false, \ + ATOMIC_MMU_IDX); \ atomic_trace_rmw_pre(env, addr, info); \ smp_mb(); \ cmp = atomic_read__nocheck(haddr); \ @@ -213,7 +206,6 @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX, DATA_TYPE, new) #endif /* DATA SIZE >= 16 */ #undef END -#undef MEND #if DATA_SIZE > 1 @@ -221,10 +213,8 @@ GEN_ATOMIC_HELPER_FN(umax_fetch, MAX, DATA_TYPE, new) within the ATOMIC_NAME macro. 
*/ #ifdef HOST_WORDS_BIGENDIAN # define END _le -# define MEND _le #else # define END _be -# define MEND _be #endif ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, @@ -233,9 +223,8 @@ ABI_TYPE ATOMIC_NAME(cmpxchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; DATA_TYPE ret; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, - false, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false, + ATOMIC_MMU_IDX); atomic_trace_rmw_pre(env, addr, info); #if DATA_SIZE == 16 @@ -254,9 +243,8 @@ ABI_TYPE ATOMIC_NAME(ld)(CPUArchState *env, target_ulong addr EXTRA_ARGS) { ATOMIC_MMU_DECLS; DATA_TYPE val, *haddr = ATOMIC_MMU_LOOKUP; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, - false, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false, + ATOMIC_MMU_IDX); atomic_trace_ld_pre(env, addr, info); val = atomic16_read(haddr); @@ -270,9 +258,8 @@ void ATOMIC_NAME(st)(CPUArchState *env, target_ulong addr, { ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, - true, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, true, + ATOMIC_MMU_IDX); val = BSWAP(val); atomic_trace_st_pre(env, addr, info); @@ -289,9 +276,8 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, target_ulong addr, ATOMIC_MMU_DECLS; DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; ABI_TYPE ret; - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, - false, - ATOMIC_MMU_IDX); + uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, false, + ATOMIC_MMU_IDX); atomic_trace_rmw_pre(env, addr, info); ret = atomic_xchg__nocheck(haddr, BSWAP(val)); @@ -307,10 +293,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ DATA_TYPE ret; \ - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ - false, \ - ATOMIC_MMU_IDX); \ - \ + uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, \ + false, ATOMIC_MMU_IDX); \ atomic_trace_rmw_pre(env, addr, info); \ ret = atomic_##X(haddr, BSWAP(val)); \ ATOMIC_MMU_CLEANUP; \ @@ -341,10 +325,8 @@ ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr, \ ATOMIC_MMU_DECLS; \ XDATA_TYPE *haddr = ATOMIC_MMU_LOOKUP; \ XDATA_TYPE ldo, ldn, old, new, val = xval; \ - uint16_t info = glue(trace_mem_build_info_no_se, MEND)(SHIFT, \ - false, \ - ATOMIC_MMU_IDX); \ - \ + uint16_t info = trace_mem_build_info(SHIFT, false, MO_BSWAP, \ + false, ATOMIC_MMU_IDX); \ atomic_trace_rmw_pre(env, addr, info); \ smp_mb(); \ ldn = atomic_read__nocheck(haddr); \ @@ -378,7 +360,6 @@ GEN_ATOMIC_HELPER_FN(add_fetch, ADD, DATA_TYPE, new) #endif /* DATA_SIZE >= 16 */ #undef END -#undef MEND #endif /* DATA_SIZE > 1 */ #undef BSWAP diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 62068d10c3..2560c90eec 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -23,7 +23,7 @@ #include "trace.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "qemu/atomic.h" #include "sysemu/qtest.h" #include "qemu/timer.h" diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 98221948d6..e3b5750c3b 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -34,6 +34,8 @@ #include "qemu/atomic.h" #include "qemu/atomic128.h" #include "translate-all.h" +#include "trace-root.h" +#include "trace/mem.h" #ifdef CONFIG_PLUGIN #include 
"qemu/plugin-memory.h" #endif @@ -78,9 +80,14 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data)); QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16); #define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1) -static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx) +static inline size_t tlb_n_entries(CPUTLBDescFast *fast) { - return env_tlb(env)->f[mmu_idx].mask + (1 << CPU_TLB_ENTRY_BITS); + return (fast->mask >> CPU_TLB_ENTRY_BITS) + 1; +} + +static inline size_t sizeof_tlb(CPUTLBDescFast *fast) +{ + return fast->mask + (1 << CPU_TLB_ENTRY_BITS); } static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns, @@ -90,26 +97,10 @@ static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns, desc->window_max_entries = max_entries; } -static void tlb_dyn_init(CPUArchState *env) -{ - int i; - - for (i = 0; i < NB_MMU_MODES; i++) { - CPUTLBDesc *desc = &env_tlb(env)->d[i]; - size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS; - - tlb_window_reset(desc, get_clock_realtime(), 0); - desc->n_used_entries = 0; - env_tlb(env)->f[i].mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS; - env_tlb(env)->f[i].table = g_new(CPUTLBEntry, n_entries); - env_tlb(env)->d[i].iotlb = g_new(CPUIOTLBEntry, n_entries); - } -} - /** * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary - * @env: CPU that owns the TLB - * @mmu_idx: MMU index of the TLB + * @desc: The CPUTLBDesc portion of the TLB + * @fast: The CPUTLBDescFast portion of the same TLB * * Called with tlb_lock_held. * @@ -146,13 +137,12 @@ static void tlb_dyn_init(CPUArchState *env) * high), since otherwise we are likely to have a significant amount of * conflict misses. */ -static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx) +static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast, + int64_t now) { - CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx]; - size_t old_size = tlb_n_entries(env, mmu_idx); + size_t old_size = tlb_n_entries(fast); size_t rate; size_t new_size = old_size; - int64_t now = get_clock_realtime(); int64_t window_len_ms = 100; int64_t window_len_ns = window_len_ms * 1000 * 1000; bool window_expired = now > desc->window_begin_ns + window_len_ns; @@ -191,14 +181,15 @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx) return; } - g_free(env_tlb(env)->f[mmu_idx].table); - g_free(env_tlb(env)->d[mmu_idx].iotlb); + g_free(fast->table); + g_free(desc->iotlb); tlb_window_reset(desc, now, 0); /* desc->n_used_entries is cleared by the caller */ - env_tlb(env)->f[mmu_idx].mask = (new_size - 1) << CPU_TLB_ENTRY_BITS; - env_tlb(env)->f[mmu_idx].table = g_try_new(CPUTLBEntry, new_size); - env_tlb(env)->d[mmu_idx].iotlb = g_try_new(CPUIOTLBEntry, new_size); + fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS; + fast->table = g_try_new(CPUTLBEntry, new_size); + desc->iotlb = g_try_new(CPUIOTLBEntry, new_size); + /* * If the allocations fail, try smaller sizes. We just freed some * memory, so going back to half of new_size has a good chance of working. @@ -206,27 +197,51 @@ static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx) * allocations to fail though, so we progressively reduce the allocation * size, aborting if we cannot even allocate the smallest TLB we support. 
*/ - while (env_tlb(env)->f[mmu_idx].table == NULL || - env_tlb(env)->d[mmu_idx].iotlb == NULL) { + while (fast->table == NULL || desc->iotlb == NULL) { if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) { error_report("%s: %s", __func__, strerror(errno)); abort(); } new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS); - env_tlb(env)->f[mmu_idx].mask = (new_size - 1) << CPU_TLB_ENTRY_BITS; + fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS; - g_free(env_tlb(env)->f[mmu_idx].table); - g_free(env_tlb(env)->d[mmu_idx].iotlb); - env_tlb(env)->f[mmu_idx].table = g_try_new(CPUTLBEntry, new_size); - env_tlb(env)->d[mmu_idx].iotlb = g_try_new(CPUIOTLBEntry, new_size); + g_free(fast->table); + g_free(desc->iotlb); + fast->table = g_try_new(CPUTLBEntry, new_size); + desc->iotlb = g_try_new(CPUIOTLBEntry, new_size); } } -static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx) +static void tlb_mmu_flush_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast) { - tlb_mmu_resize_locked(env, mmu_idx); - memset(env_tlb(env)->f[mmu_idx].table, -1, sizeof_tlb(env, mmu_idx)); - env_tlb(env)->d[mmu_idx].n_used_entries = 0; + desc->n_used_entries = 0; + desc->large_page_addr = -1; + desc->large_page_mask = -1; + desc->vindex = 0; + memset(fast->table, -1, sizeof_tlb(fast)); + memset(desc->vtable, -1, sizeof(desc->vtable)); +} + +static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx, + int64_t now) +{ + CPUTLBDesc *desc = &env_tlb(env)->d[mmu_idx]; + CPUTLBDescFast *fast = &env_tlb(env)->f[mmu_idx]; + + tlb_mmu_resize_locked(desc, fast, now); + tlb_mmu_flush_locked(desc, fast); +} + +static void tlb_mmu_init(CPUTLBDesc *desc, CPUTLBDescFast *fast, int64_t now) +{ + size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS; + + tlb_window_reset(desc, now, 0); + desc->n_used_entries = 0; + fast->mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS; + fast->table = g_new(CPUTLBEntry, n_entries); + desc->iotlb = g_new(CPUIOTLBEntry, n_entries); + tlb_mmu_flush_locked(desc, fast); } static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx) @@ -242,13 +257,17 @@ static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx) void tlb_init(CPUState *cpu) { CPUArchState *env = cpu->env_ptr; + int64_t now = get_clock_realtime(); + int i; qemu_spin_init(&env_tlb(env)->c.lock); - /* Ensure that cpu_reset performs a full flush. */ - env_tlb(env)->c.dirty = ALL_MMUIDX_BITS; + /* All tlbs are initialized flushed. 
*/ + env_tlb(env)->c.dirty = 0; - tlb_dyn_init(env); + for (i = 0; i < NB_MMU_MODES; i++) { + tlb_mmu_init(&env_tlb(env)->d[i], &env_tlb(env)->f[i], now); + } } /* flush_all_helper: run fn across all cpus @@ -287,21 +306,12 @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide) *pelide = elide; } -static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx) -{ - tlb_table_flush_by_mmuidx(env, mmu_idx); - env_tlb(env)->d[mmu_idx].large_page_addr = -1; - env_tlb(env)->d[mmu_idx].large_page_mask = -1; - env_tlb(env)->d[mmu_idx].vindex = 0; - memset(env_tlb(env)->d[mmu_idx].vtable, -1, - sizeof(env_tlb(env)->d[0].vtable)); -} - static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data) { CPUArchState *env = cpu->env_ptr; uint16_t asked = data.host_int; uint16_t all_dirty, work, to_clean; + int64_t now = get_clock_realtime(); assert_cpu_is_self(cpu); @@ -316,7 +326,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data) for (work = to_clean; work != 0; work &= work - 1) { int mmu_idx = ctz32(work); - tlb_flush_one_mmuidx_locked(env, mmu_idx); + tlb_flush_one_mmuidx_locked(env, mmu_idx, now); } qemu_spin_unlock(&env_tlb(env)->c.lock); @@ -438,7 +448,7 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx, tlb_debug("forcing full flush midx %d (" TARGET_FMT_lx "/" TARGET_FMT_lx ")\n", midx, lp_addr, lp_mask); - tlb_flush_one_mmuidx_locked(env, midx); + tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime()); } else { if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) { tlb_n_used_entries_dec(env, midx); @@ -447,28 +457,29 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx, } } -/* As we are going to hijack the bottom bits of the page address for a - * mmuidx bit mask we need to fail to build if we can't do that +/** + * tlb_flush_page_by_mmuidx_async_0: + * @cpu: cpu on which to flush + * @addr: page of virtual address to flush + * @idxmap: set of mmu_idx to flush + * + * Helper for tlb_flush_page_by_mmuidx and friends, flush one page + * at @addr from the tlbs indicated by @idxmap from @cpu. */ -QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN); - -static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu, - run_on_cpu_data data) +static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu, + target_ulong addr, + uint16_t idxmap) { CPUArchState *env = cpu->env_ptr; - target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr; - target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK; - unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS; int mmu_idx; assert_cpu_is_self(cpu); - tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%lx\n", - addr, mmu_idx_bitmap); + tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%x\n", addr, idxmap); qemu_spin_lock(&env_tlb(env)->c.lock); for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { - if (test_bit(mmu_idx, &mmu_idx_bitmap)) { + if ((idxmap >> mmu_idx) & 1) { tlb_flush_page_locked(env, mmu_idx, addr); } } @@ -477,22 +488,75 @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu, tb_flush_jmp_cache(cpu, addr); } -void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap) +/** + * tlb_flush_page_by_mmuidx_async_1: + * @cpu: cpu on which to flush + * @data: encoded addr + idxmap + * + * Helper for tlb_flush_page_by_mmuidx and friends, called through + * async_run_on_cpu. The idxmap parameter is encoded in the page + * offset of the target_ptr field. 
This limits the set of mmu_idx + * that can be passed via this method. + */ +static void tlb_flush_page_by_mmuidx_async_1(CPUState *cpu, + run_on_cpu_data data) { - target_ulong addr_and_mmu_idx; + target_ulong addr_and_idxmap = (target_ulong) data.target_ptr; + target_ulong addr = addr_and_idxmap & TARGET_PAGE_MASK; + uint16_t idxmap = addr_and_idxmap & ~TARGET_PAGE_MASK; + tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap); +} + +typedef struct { + target_ulong addr; + uint16_t idxmap; +} TLBFlushPageByMMUIdxData; + +/** + * tlb_flush_page_by_mmuidx_async_2: + * @cpu: cpu on which to flush + * @data: allocated addr + idxmap + * + * Helper for tlb_flush_page_by_mmuidx and friends, called through + * async_run_on_cpu. The addr+idxmap parameters are stored in a + * TLBFlushPageByMMUIdxData structure that has been allocated + * specifically for this helper. Free the structure when done. + */ +static void tlb_flush_page_by_mmuidx_async_2(CPUState *cpu, + run_on_cpu_data data) +{ + TLBFlushPageByMMUIdxData *d = data.host_ptr; + + tlb_flush_page_by_mmuidx_async_0(cpu, d->addr, d->idxmap); + g_free(d); +} + +void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap) +{ tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap); /* This should already be page aligned */ - addr_and_mmu_idx = addr & TARGET_PAGE_MASK; - addr_and_mmu_idx |= idxmap; + addr &= TARGET_PAGE_MASK; - if (!qemu_cpu_is_self(cpu)) { - async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_work, - RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + if (qemu_cpu_is_self(cpu)) { + tlb_flush_page_by_mmuidx_async_0(cpu, addr, idxmap); + } else if (idxmap < TARGET_PAGE_SIZE) { + /* + * Most targets have only a few mmu_idx. In the case where + * we can stuff idxmap into the low TARGET_PAGE_BITS, avoid + * allocating memory for this operation. + */ + async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_1, + RUN_ON_CPU_TARGET_PTR(addr | idxmap)); } else { - tlb_flush_page_by_mmuidx_async_work( - cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + TLBFlushPageByMMUIdxData *d = g_new(TLBFlushPageByMMUIdxData, 1); + + /* Otherwise allocate a structure, freed by the worker. */ + d->addr = addr; + d->idxmap = idxmap; + async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_2, + RUN_ON_CPU_HOST_PTR(d)); } } @@ -504,17 +568,36 @@ void tlb_flush_page(CPUState *cpu, target_ulong addr) void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr, uint16_t idxmap) { - const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work; - target_ulong addr_and_mmu_idx; - tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap); /* This should already be page aligned */ - addr_and_mmu_idx = addr & TARGET_PAGE_MASK; - addr_and_mmu_idx |= idxmap; + addr &= TARGET_PAGE_MASK; - flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); - fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + /* + * Allocate memory to hold addr+idxmap only when needed. + * See tlb_flush_page_by_mmuidx for details. + */ + if (idxmap < TARGET_PAGE_SIZE) { + flush_all_helper(src_cpu, tlb_flush_page_by_mmuidx_async_1, + RUN_ON_CPU_TARGET_PTR(addr | idxmap)); + } else { + CPUState *dst_cpu; + + /* Allocate a separate data block for each destination cpu. 
*/ + CPU_FOREACH(dst_cpu) { + if (dst_cpu != src_cpu) { + TLBFlushPageByMMUIdxData *d + = g_new(TLBFlushPageByMMUIdxData, 1); + + d->addr = addr; + d->idxmap = idxmap; + async_run_on_cpu(dst_cpu, tlb_flush_page_by_mmuidx_async_2, + RUN_ON_CPU_HOST_PTR(d)); + } + } + } + + tlb_flush_page_by_mmuidx_async_0(src_cpu, addr, idxmap); } void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr) @@ -526,17 +609,41 @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu, target_ulong addr, uint16_t idxmap) { - const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work; - target_ulong addr_and_mmu_idx; - tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap); /* This should already be page aligned */ - addr_and_mmu_idx = addr & TARGET_PAGE_MASK; - addr_and_mmu_idx |= idxmap; + addr &= TARGET_PAGE_MASK; + + /* + * Allocate memory to hold addr+idxmap only when needed. + * See tlb_flush_page_by_mmuidx for details. + */ + if (idxmap < TARGET_PAGE_SIZE) { + flush_all_helper(src_cpu, tlb_flush_page_by_mmuidx_async_1, + RUN_ON_CPU_TARGET_PTR(addr | idxmap)); + async_safe_run_on_cpu(src_cpu, tlb_flush_page_by_mmuidx_async_1, + RUN_ON_CPU_TARGET_PTR(addr | idxmap)); + } else { + CPUState *dst_cpu; + TLBFlushPageByMMUIdxData *d; + + /* Allocate a separate data block for each destination cpu. */ + CPU_FOREACH(dst_cpu) { + if (dst_cpu != src_cpu) { + d = g_new(TLBFlushPageByMMUIdxData, 1); + d->addr = addr; + d->idxmap = idxmap; + async_run_on_cpu(dst_cpu, tlb_flush_page_by_mmuidx_async_2, + RUN_ON_CPU_HOST_PTR(d)); + } + } - flush_all_helper(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); - async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx)); + d = g_new(TLBFlushPageByMMUIdxData, 1); + d->addr = addr; + d->idxmap = idxmap; + async_safe_run_on_cpu(src_cpu, tlb_flush_page_by_mmuidx_async_2, + RUN_ON_CPU_HOST_PTR(d)); + } } void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr) @@ -620,7 +727,7 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length) qemu_spin_lock(&env_tlb(env)->c.lock); for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { unsigned int i; - unsigned int n = tlb_n_entries(env, mmu_idx); + unsigned int n = tlb_n_entries(&env_tlb(env)->f[mmu_idx]); for (i = 0; i < n; i++) { tlb_reset_dirty_range_locked(&env_tlb(env)->f[mmu_idx].table[i], @@ -1626,6 +1733,137 @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr, } /* + * Load helpers for cpu_ldst.h. + */ + +static inline uint64_t cpu_load_helper(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t retaddr, + MemOp op, FullLoadHelper *full_load) +{ + uint16_t meminfo; + TCGMemOpIdx oi; + uint64_t ret; + + meminfo = trace_mem_get_info(op, mmu_idx, false); + trace_guest_mem_before_exec(env_cpu(env), addr, meminfo); + + op &= ~MO_SIGN; + oi = make_memop_idx(op, mmu_idx); + ret = full_load(env, addr, oi, retaddr); + + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo); + + return ret; +} + +uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_load_helper(env, addr, mmu_idx, ra, MO_UB, full_ldub_mmu); +} + +int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return (int8_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_SB, + full_ldub_mmu); +} + +uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUW, + MO_TE == MO_LE + ? 
full_le_lduw_mmu : full_be_lduw_mmu); +} + +int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return (int16_t)cpu_load_helper(env, addr, mmu_idx, ra, MO_TESW, + MO_TE == MO_LE + ? full_le_lduw_mmu : full_be_lduw_mmu); +} + +uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEUL, + MO_TE == MO_LE + ? full_le_ldul_mmu : full_be_ldul_mmu); +} + +uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_load_helper(env, addr, mmu_idx, ra, MO_TEQ, + MO_TE == MO_LE + ? helper_le_ldq_mmu : helper_be_ldq_mmu); +} + +uint32_t cpu_ldub_data_ra(CPUArchState *env, target_ulong ptr, + uintptr_t retaddr) +{ + return cpu_ldub_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr); +} + +int cpu_ldsb_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr) +{ + return cpu_ldsb_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr); +} + +uint32_t cpu_lduw_data_ra(CPUArchState *env, target_ulong ptr, + uintptr_t retaddr) +{ + return cpu_lduw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr); +} + +int cpu_ldsw_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr) +{ + return cpu_ldsw_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr); +} + +uint32_t cpu_ldl_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr) +{ + return cpu_ldl_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr); +} + +uint64_t cpu_ldq_data_ra(CPUArchState *env, target_ulong ptr, uintptr_t retaddr) +{ + return cpu_ldq_mmuidx_ra(env, ptr, cpu_mmu_index(env, false), retaddr); +} + +uint32_t cpu_ldub_data(CPUArchState *env, target_ulong ptr) +{ + return cpu_ldub_data_ra(env, ptr, 0); +} + +int cpu_ldsb_data(CPUArchState *env, target_ulong ptr) +{ + return cpu_ldsb_data_ra(env, ptr, 0); +} + +uint32_t cpu_lduw_data(CPUArchState *env, target_ulong ptr) +{ + return cpu_lduw_data_ra(env, ptr, 0); +} + +int cpu_ldsw_data(CPUArchState *env, target_ulong ptr) +{ + return cpu_ldsw_data_ra(env, ptr, 0); +} + +uint32_t cpu_ldl_data(CPUArchState *env, target_ulong ptr) +{ + return cpu_ldl_data_ra(env, ptr, 0); +} + +uint64_t cpu_ldq_data(CPUArchState *env, target_ulong ptr) +{ + return cpu_ldq_data_ra(env, ptr, 0); +} + +/* * Store Helpers */ @@ -1854,6 +2092,94 @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, store_helper(env, addr, val, oi, retaddr, MO_BEQ); } +/* + * Store Helpers for cpu_ldst.h + */ + +static inline void QEMU_ALWAYS_INLINE +cpu_store_helper(CPUArchState *env, target_ulong addr, uint64_t val, + int mmu_idx, uintptr_t retaddr, MemOp op) +{ + TCGMemOpIdx oi; + uint16_t meminfo; + + meminfo = trace_mem_get_info(op, mmu_idx, true); + trace_guest_mem_before_exec(env_cpu(env), addr, meminfo); + + oi = make_memop_idx(op, mmu_idx); + store_helper(env, addr, val, oi, retaddr, op); + + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, meminfo); +} + +void cpu_stb_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val, + int mmu_idx, uintptr_t retaddr) +{ + cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_UB); +} + +void cpu_stw_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val, + int mmu_idx, uintptr_t retaddr) +{ + cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUW); +} + +void cpu_stl_mmuidx_ra(CPUArchState *env, target_ulong addr, uint32_t val, + int mmu_idx, uintptr_t retaddr) +{ + cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEUL); +} + +void 
cpu_stq_mmuidx_ra(CPUArchState *env, target_ulong addr, uint64_t val, + int mmu_idx, uintptr_t retaddr) +{ + cpu_store_helper(env, addr, val, mmu_idx, retaddr, MO_TEQ); +} + +void cpu_stb_data_ra(CPUArchState *env, target_ulong ptr, + uint32_t val, uintptr_t retaddr) +{ + cpu_stb_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr); +} + +void cpu_stw_data_ra(CPUArchState *env, target_ulong ptr, + uint32_t val, uintptr_t retaddr) +{ + cpu_stw_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr); +} + +void cpu_stl_data_ra(CPUArchState *env, target_ulong ptr, + uint32_t val, uintptr_t retaddr) +{ + cpu_stl_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr); +} + +void cpu_stq_data_ra(CPUArchState *env, target_ulong ptr, + uint64_t val, uintptr_t retaddr) +{ + cpu_stq_mmuidx_ra(env, ptr, val, cpu_mmu_index(env, false), retaddr); +} + +void cpu_stb_data(CPUArchState *env, target_ulong ptr, uint32_t val) +{ + cpu_stb_data_ra(env, ptr, val, 0); +} + +void cpu_stw_data(CPUArchState *env, target_ulong ptr, uint32_t val) +{ + cpu_stw_data_ra(env, ptr, val, 0); +} + +void cpu_stl_data(CPUArchState *env, target_ulong ptr, uint32_t val) +{ + cpu_stl_data_ra(env, ptr, val, 0); +} + +void cpu_stq_data(CPUArchState *env, target_ulong ptr, uint64_t val) +{ + cpu_stq_data_ra(env, ptr, val, 0); +} + /* First set of helpers allows passing in of OI and RETADDR. This makes them callable from other helpers. */ @@ -1912,98 +2238,50 @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, /* Code access functions. */ -static uint64_t full_ldub_cmmu(CPUArchState *env, target_ulong addr, +static uint64_t full_ldub_code(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi, uintptr_t retaddr) { - return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_cmmu); -} - -uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return full_ldub_cmmu(env, addr, oi, retaddr); -} - -int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return (int8_t) full_ldub_cmmu(env, addr, oi, retaddr); -} - -static uint64_t full_le_lduw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return load_helper(env, addr, oi, retaddr, MO_LEUW, true, - full_le_lduw_cmmu); -} - -uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return full_le_lduw_cmmu(env, addr, oi, retaddr); + return load_helper(env, addr, oi, retaddr, MO_8, true, full_ldub_code); } -int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) +uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr) { - return (int16_t) full_le_lduw_cmmu(env, addr, oi, retaddr); + TCGMemOpIdx oi = make_memop_idx(MO_UB, cpu_mmu_index(env, true)); + return full_ldub_code(env, addr, oi, 0); } -static uint64_t full_be_lduw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return load_helper(env, addr, oi, retaddr, MO_BEUW, true, - full_be_lduw_cmmu); -} - -uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return full_be_lduw_cmmu(env, addr, oi, retaddr); -} - -int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) -{ - return (int16_t) full_be_lduw_cmmu(env, addr, oi, retaddr); -} - -static uint64_t full_le_ldul_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx 
oi, uintptr_t retaddr) +static uint64_t full_lduw_code(CPUArchState *env, target_ulong addr, + TCGMemOpIdx oi, uintptr_t retaddr) { - return load_helper(env, addr, oi, retaddr, MO_LEUL, true, - full_le_ldul_cmmu); + return load_helper(env, addr, oi, retaddr, MO_TEUW, true, full_lduw_code); } -uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) +uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr) { - return full_le_ldul_cmmu(env, addr, oi, retaddr); + TCGMemOpIdx oi = make_memop_idx(MO_TEUW, cpu_mmu_index(env, true)); + return full_lduw_code(env, addr, oi, 0); } -static uint64_t full_be_ldul_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) +static uint64_t full_ldl_code(CPUArchState *env, target_ulong addr, + TCGMemOpIdx oi, uintptr_t retaddr) { - return load_helper(env, addr, oi, retaddr, MO_BEUL, true, - full_be_ldul_cmmu); + return load_helper(env, addr, oi, retaddr, MO_TEUL, true, full_ldl_code); } -uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) +uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr) { - return full_be_ldul_cmmu(env, addr, oi, retaddr); + TCGMemOpIdx oi = make_memop_idx(MO_TEUL, cpu_mmu_index(env, true)); + return full_ldl_code(env, addr, oi, 0); } -uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) +static uint64_t full_ldq_code(CPUArchState *env, target_ulong addr, + TCGMemOpIdx oi, uintptr_t retaddr) { - return load_helper(env, addr, oi, retaddr, MO_LEQ, true, - helper_le_ldq_cmmu); + return load_helper(env, addr, oi, retaddr, MO_TEQ, true, full_ldq_code); } -uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr) +uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr) { - return load_helper(env, addr, oi, retaddr, MO_BEQ, true, - helper_be_ldq_cmmu); + TCGMemOpIdx oi = make_memop_idx(MO_TEQ, cpu_mmu_index(env, true)); + return full_ldq_code(env, addr, oi, 0); } diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c index 51cb29ca79..5b1902d591 100644 --- a/accel/tcg/tcg-runtime-gvec.c +++ b/accel/tcg/tcg-runtime-gvec.c @@ -21,7 +21,7 @@ #include "qemu/host-utils.h" #include "cpu.h" #include "exec/helper-proto.h" -#include "tcg-gvec-desc.h" +#include "tcg/tcg-gvec-desc.h" /* Virtually all hosts support 16-byte vectors. 
Those that don't can emulate diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c index 8a1e408e31..446465a09a 100644 --- a/accel/tcg/tcg-runtime.c +++ b/accel/tcg/tcg-runtime.c @@ -30,6 +30,7 @@ #include "exec/tb-lookup.h" #include "disas/disas.h" #include "exec/log.h" +#include "tcg/tcg.h" /* 32-bit helpers */ diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index bb325a2bc4..a08ab11f65 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -25,7 +25,7 @@ #include "trace.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg.h" +#include "tcg/tcg.h" #if defined(CONFIG_USER_ONLY) #include "qemu.h" #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index b09f7a1577..4be78eb9b3 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -20,12 +20,14 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "qemu/bitops.h" #include "exec/cpu_ldst.h" #include "translate-all.h" #include "exec/helper-proto.h" #include "qemu/atomic128.h" +#include "trace-root.h" +#include "trace/mem.h" #undef EAX #undef ECX @@ -734,6 +736,240 @@ int cpu_signal_handler(int host_signum, void *pinfo, /* The softmmu versions of these helpers are in cputlb.c. */ +uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr) +{ + uint32_t ret; + uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, false); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + ret = ldub_p(g2h(ptr)); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); + return ret; +} + +int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr) +{ + int ret; + uint16_t meminfo = trace_mem_get_info(MO_SB, MMU_USER_IDX, false); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + ret = ldsb_p(g2h(ptr)); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); + return ret; +} + +uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr) +{ + uint32_t ret; + uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, false); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + ret = lduw_p(g2h(ptr)); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); + return ret; +} + +int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr) +{ + int ret; + uint16_t meminfo = trace_mem_get_info(MO_TESW, MMU_USER_IDX, false); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + ret = ldsw_p(g2h(ptr)); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); + return ret; +} + +uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr) +{ + uint32_t ret; + uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, false); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + ret = ldl_p(g2h(ptr)); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); + return ret; +} + +uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr) +{ + uint64_t ret; + uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, false); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + ret = ldq_p(g2h(ptr)); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); + return ret; +} + +uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr) +{ + uint32_t ret; + + set_helper_retaddr(retaddr); + ret = cpu_ldub_data(env, ptr); + clear_helper_retaddr(); + return ret; +} + +int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr) +{ + int ret; + + set_helper_retaddr(retaddr); + ret = cpu_ldsb_data(env, ptr); + clear_helper_retaddr(); + 
return ret; +} + +uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr) +{ + uint32_t ret; + + set_helper_retaddr(retaddr); + ret = cpu_lduw_data(env, ptr); + clear_helper_retaddr(); + return ret; +} + +int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr) +{ + int ret; + + set_helper_retaddr(retaddr); + ret = cpu_ldsw_data(env, ptr); + clear_helper_retaddr(); + return ret; +} + +uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr) +{ + uint32_t ret; + + set_helper_retaddr(retaddr); + ret = cpu_ldl_data(env, ptr); + clear_helper_retaddr(); + return ret; +} + +uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr) +{ + uint64_t ret; + + set_helper_retaddr(retaddr); + ret = cpu_ldq_data(env, ptr); + clear_helper_retaddr(); + return ret; +} + +void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val) +{ + uint16_t meminfo = trace_mem_get_info(MO_UB, MMU_USER_IDX, true); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + stb_p(g2h(ptr), val); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); +} + +void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val) +{ + uint16_t meminfo = trace_mem_get_info(MO_TEUW, MMU_USER_IDX, true); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + stw_p(g2h(ptr), val); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); +} + +void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val) +{ + uint16_t meminfo = trace_mem_get_info(MO_TEUL, MMU_USER_IDX, true); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + stl_p(g2h(ptr), val); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); +} + +void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val) +{ + uint16_t meminfo = trace_mem_get_info(MO_TEQ, MMU_USER_IDX, true); + + trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); + stq_p(g2h(ptr), val); + qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); +} + +void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr, + uint32_t val, uintptr_t retaddr) +{ + set_helper_retaddr(retaddr); + cpu_stb_data(env, ptr, val); + clear_helper_retaddr(); +} + +void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr, + uint32_t val, uintptr_t retaddr) +{ + set_helper_retaddr(retaddr); + cpu_stw_data(env, ptr, val); + clear_helper_retaddr(); +} + +void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr, + uint32_t val, uintptr_t retaddr) +{ + set_helper_retaddr(retaddr); + cpu_stl_data(env, ptr, val); + clear_helper_retaddr(); +} + +void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr, + uint64_t val, uintptr_t retaddr) +{ + set_helper_retaddr(retaddr); + cpu_stq_data(env, ptr, val); + clear_helper_retaddr(); +} + +uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr ptr) +{ + uint32_t ret; + + set_helper_retaddr(1); + ret = ldub_p(g2h(ptr)); + clear_helper_retaddr(); + return ret; +} + +uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr ptr) +{ + uint32_t ret; + + set_helper_retaddr(1); + ret = lduw_p(g2h(ptr)); + clear_helper_retaddr(); + return ret; +} + +uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr ptr) +{ + uint32_t ret; + + set_helper_retaddr(1); + ret = ldl_p(g2h(ptr)); + clear_helper_retaddr(); + return ret; +} + +uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr ptr) +{ + uint64_t ret; + + set_helper_retaddr(1); + ret = ldq_p(g2h(ptr)); + clear_helper_retaddr(); + return ret; +} + /* Do not allow unaligned operations to proceed. Return the host address. 
*/ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr, int size, uintptr_t retaddr) diff --git a/backends/dbus-vmstate.c b/backends/dbus-vmstate.c index 56b482a7d6..cc594a722e 100644 --- a/backends/dbus-vmstate.c +++ b/backends/dbus-vmstate.c @@ -412,7 +412,8 @@ dbus_vmstate_complete(UserCreatable *uc, Error **errp) return; } - if (vmstate_register(VMSTATE_IF(self), -1, &dbus_vmstate, self) < 0) { + if (vmstate_register(VMSTATE_IF(self), VMSTATE_INSTANCE_ID_ANY, + &dbus_vmstate, self) < 0) { error_setg(errp, "Failed to register vmstate"); } } diff --git a/bsd-user/main.c b/bsd-user/main.c index 7f4e3cd627..770c2b267a 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -33,7 +33,7 @@ #include "qemu/module.h" #include "cpu.h" #include "exec/exec-all.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "qemu/timer.h" #include "qemu/envlist.h" #include "exec/log.h" @@ -4761,6 +4761,12 @@ if compile_prog "" "" ; then syncfs=yes fi +# check for kcov support (kernel must be 4.4+, compiled with certain options) +kcov=no +if check_include sys/kcov.h ; then + kcov=yes +fi + # Check we have a new enough version of sphinx-build has_sphinx_build() { # This is a bit awkward but works: create a trivial document and @@ -5191,6 +5197,19 @@ if compile_prog "" "" ; then strchrnul=yes fi +######################################### +# check if we have st_atim + +st_atim=no +cat > $TMPC << EOF +#include <sys/stat.h> +#include <stddef.h> +int main(void) { return offsetof(struct stat, st_atim); } +EOF +if compile_prog "" "" ; then + st_atim=yes +fi + ########################################## # check if trace backend exists @@ -6874,6 +6893,9 @@ fi if test "$syncfs" = "yes" ; then echo "CONFIG_SYNCFS=y" >> $config_host_mak fi +if test "$kcov" = "yes" ; then + echo "CONFIG_KCOV=y" >> $config_host_mak +fi if test "$inotify" = "yes" ; then echo "CONFIG_INOTIFY=y" >> $config_host_mak fi @@ -6886,6 +6908,9 @@ fi if test "$strchrnul" = "yes" ; then echo "HAVE_STRCHRNUL=y" >> $config_host_mak fi +if test "$st_atim" = "yes" ; then + echo "HAVE_STRUCT_STAT_ST_ATIM=y" >> $config_host_mak +fi if test "$byteswap_h" = "yes" ; then echo "CONFIG_BYTESWAP_H=y" >> $config_host_mak fi @@ -7437,7 +7462,6 @@ elif test "$ARCH" = "riscv32" || test "$ARCH" = "riscv64" ; then else QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg/\$(ARCH) $QEMU_INCLUDES" fi -QEMU_INCLUDES="-iquote \$(SRC_PATH)/tcg $QEMU_INCLUDES" echo "TOOLS=$tools" >> $config_host_mak echo "ROMS=$roms" >> $config_host_mak diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index ec27b78ff1..b89bf18501 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -392,26 +392,37 @@ vu_send_reply(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) return vu_message_write(dev, conn_fd, vmsg); } +/* + * Processes a reply on the slave channel. + * Entered with slave_mutex held and releases it before exit. + * Returns true on success. + */ static bool vu_process_message_reply(VuDev *dev, const VhostUserMsg *vmsg) { VhostUserMsg msg_reply; + bool result = false; if ((vmsg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) { - return true; + result = true; + goto out; } if (!vu_message_read(dev, dev->slave_fd, &msg_reply)) { - return false; + goto out; } if (msg_reply.request != vmsg->request) { DPRINT("Received unexpected msg type. 
Expected %d received %d", vmsg->request, msg_reply.request); - return false; + goto out; } - return msg_reply.payload.u64 == 0; + result = msg_reply.payload.u64 == 0; + +out: + pthread_mutex_unlock(&dev->slave_mutex); + return result; } /* Kick the log_call_fd if required. */ @@ -554,6 +565,21 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) } static bool +map_ring(VuDev *dev, VuVirtq *vq) +{ + vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr); + vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr); + vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr); + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->vring.desc); + DPRINT(" vring_used at %p\n", vq->vring.used); + DPRINT(" vring_avail at %p\n", vq->vring.avail); + + return !(vq->vring.desc && vq->vring.used && vq->vring.avail); +} + +static bool vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) { int i; @@ -756,6 +782,14 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) close(vmsg->fds[i]); } + for (i = 0; i < dev->max_queues; i++) { + if (dev->vq[i].vring.desc) { + if (map_ring(dev, &dev->vq[i])) { + vu_panic(dev, "remaping queue %d during setmemtable", i); + } + } + } + return false; } @@ -842,18 +876,12 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) DPRINT(" avail_user_addr: 0x%016" PRIx64 "\n", vra->avail_user_addr); DPRINT(" log_guest_addr: 0x%016" PRIx64 "\n", vra->log_guest_addr); + vq->vra = *vra; vq->vring.flags = vra->flags; - vq->vring.desc = qva_to_va(dev, vra->desc_user_addr); - vq->vring.used = qva_to_va(dev, vra->used_user_addr); - vq->vring.avail = qva_to_va(dev, vra->avail_user_addr); vq->vring.log_guest_addr = vra->log_guest_addr; - DPRINT("Setting virtq addresses:\n"); - DPRINT(" vring_desc at %p\n", vq->vring.desc); - DPRINT(" vring_used at %p\n", vq->vring.used); - DPRINT(" vring_avail at %p\n", vq->vring.avail); - if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) { + if (map_ring(dev, vq)) { vu_panic(dev, "Invalid vring_addr message"); return false; } @@ -1105,10 +1133,13 @@ bool vu_set_queue_host_notifier(VuDev *dev, VuVirtq *vq, int fd, return false; } + pthread_mutex_lock(&dev->slave_mutex); if (!vu_message_write(dev, dev->slave_fd, &vmsg)) { + pthread_mutex_unlock(&dev->slave_mutex); return false; } + /* Also unlocks the slave_mutex */ return vu_process_message_reply(dev, &vmsg); } @@ -1628,6 +1659,7 @@ vu_deinit(VuDev *dev) close(dev->slave_fd); dev->slave_fd = -1; } + pthread_mutex_destroy(&dev->slave_mutex); if (dev->sock != -1) { close(dev->sock); @@ -1663,6 +1695,7 @@ vu_init(VuDev *dev, dev->remove_watch = remove_watch; dev->iface = iface; dev->log_call_fd = -1; + pthread_mutex_init(&dev->slave_mutex, NULL); dev->slave_fd = -1; dev->max_queues = max_queues; diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 46b600799b..5cb7708559 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -19,6 +19,7 @@ #include <stddef.h> #include <sys/poll.h> #include <linux/vhost.h> +#include <pthread.h> #include "standard-headers/linux/virtio_ring.h" /* Based on qemu/hw/virtio/vhost-user.c */ @@ -326,6 +327,9 @@ typedef struct VuVirtq { int err_fd; unsigned int enable; bool started; + + /* Guest addresses of our ring */ + struct vhost_vring_addr vra; } VuVirtq; enum VuWatchCondtion { @@ -355,6 +359,8 @@ struct VuDev { VuVirtq *vq; VuDevInflightInfo inflight_info; int log_call_fd; + /* Must be held while using slave_fd */ + pthread_mutex_t 
slave_mutex; int slave_fd; uint64_t log_size; uint8_t *log_table; @@ -53,7 +53,7 @@ #include "qemu/bitmap.h" #include "qemu/seqlock.h" #include "qemu/guest-random.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "hw/nmi.h" #include "sysemu/replay.h" #include "sysemu/runstate.h" diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak index 1f2e0e7fde..645e6201bb 100644 --- a/default-configs/arm-softmmu.mak +++ b/default-configs/arm-softmmu.mak @@ -30,6 +30,7 @@ CONFIG_Z2=y CONFIG_COLLIE=y CONFIG_ASPEED_SOC=y CONFIG_NETDUINO2=y +CONFIG_NETDUINOPLUS2=y CONFIG_MPS2=y CONFIG_RASPI=y CONFIG_DIGIC=y diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst index c74cd090e6..03aa9e7ff8 100644 --- a/docs/devel/loads-stores.rst +++ b/docs/devel/loads-stores.rst @@ -72,31 +72,66 @@ Regexes for git grep - ``\<ldn_\([hbl]e\)?_p\>`` - ``\<stn_\([hbl]e\)?_p\>`` -``cpu_{ld,st}_*`` -~~~~~~~~~~~~~~~~~ +``cpu_{ld,st}*_mmuidx_ra`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions operate on a guest virtual address plus a context, +known as a "mmu index" or ``mmuidx``, which controls how that virtual +address is translated. The meaning of the indexes are target specific, +but specifying a particular index might be necessary if, for instance, +the helper requires an "always as non-privileged" access rather that +the default access for the current state of the guest CPU. + +These functions may cause a guest CPU exception to be taken +(e.g. for an alignment fault or MMU fault) which will result in +guest CPU state being updated and control longjmp'ing out of the +function call. They should therefore only be used in code that is +implementing emulation of the guest CPU. + +The ``retaddr`` parameter is used to control unwinding of the +guest CPU state in case of a guest CPU exception. This is passed +to ``cpu_restore_state()``. Therefore the value should either be 0, +to indicate that the guest CPU state is already synchronized, or +the result of ``GETPC()`` from the top level ``HELPER(foo)`` +function, which is a return address into the generated code. + +Function names follow the pattern: + +load: ``cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmuidx, retaddr)`` + +store: ``cpu_st{size}_mmuidx_ra(env, ptr, val, mmuidx, retaddr)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + - ``s`` : signed + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +Regexes for git grep: + - ``\<cpu_ld[us]\?[bwlq]_mmuidx_ra\>`` + - ``\<cpu_st[bwlq]_mmuidx_ra\>`` + +``cpu_{ld,st}*_data_ra`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +These functions work like the ``cpu_{ld,st}_mmuidx_ra`` functions +except that the ``mmuidx`` parameter is taken from the current mode +of the guest CPU, as determined by ``cpu_mmu_index(env, false)``. -These functions operate on a guest virtual address. Be aware -that these functions may cause a guest CPU exception to be -taken (e.g. for an alignment fault or MMU fault) which will -result in guest CPU state being updated and control longjumping -out of the function call. They should therefore only be used -in code that is implementing emulation of the target CPU. - -These functions may throw an exception (longjmp() back out -to the top level TCG loop). This means they must only be used -from helper functions where the translator has saved all -necessary CPU state before generating the helper function call. 
-It's usually better to use the ``_ra`` variants described below -from helper functions, but these functions are the right choice -for calls made from hooks like the CPU do_interrupt hook or -when you know for certain that the translator had to save all -the CPU state that ``cpu_restore_state()`` would restore anyway. +These are generally the preferred way to do accesses by guest +virtual address from helper functions, unless the access should +be performed with a context other than the default. Function names follow the pattern: -load: ``cpu_ld{sign}{size}_{mmusuffix}(env, ptr)`` +load: ``cpu_ld{sign}{size}_data_ra(env, ptr, ra)`` -store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)`` +store: ``cpu_st{size}_data_ra(env, ptr, val, ra)`` ``sign`` - (empty) : for 32 or 64 bit sizes @@ -109,56 +144,119 @@ store: ``cpu_st{size}_{mmusuffix}(env, ptr, val)`` - ``l`` : 32 bits - ``q`` : 64 bits -``mmusuffix`` is one of the generic suffixes ``data`` or ``code``, or -(for softmmu configs) a target-specific MMU mode suffix as defined -in the target's ``cpu.h``. +Regexes for git grep: + - ``\<cpu_ld[us]\?[bwlq]_data_ra\>`` + - ``\<cpu_st[bwlq]_data_ra\>`` + +``cpu_{ld,st}*_data`` +~~~~~~~~~~~~~~~~~~~~~ + +These functions work like the ``cpu_{ld,st}_data_ra`` functions +except that the ``retaddr`` parameter is 0, and thus does not +unwind guest CPU state. + +This means they must only be used from helper functions where the +translator has saved all necessary CPU state. These functions are +the right choice for calls made from hooks like the CPU ``do_interrupt`` +hook or when you know for certain that the translator had to save all +the CPU state anyway. + +Function names follow the pattern: + +load: ``cpu_ld{sign}{size}_data(env, ptr)`` + +store: ``cpu_st{size}_data(env, ptr, val)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + - ``s`` : signed + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits Regexes for git grep - - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+\>`` - - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+\>`` + - ``\<cpu_ld[us]\?[bwlq]_data\>`` + - ``\<cpu_st[bwlq]_data\+\>`` -``cpu_{ld,st}_*_ra`` -~~~~~~~~~~~~~~~~~~~~ +``cpu_ld*_code`` +~~~~~~~~~~~~~~~~ -These functions work like the ``cpu_{ld,st}_*`` functions except -that they also take a ``retaddr`` argument. This extra argument -allows for correct unwinding of any exception that is taken, -and should generally be the result of GETPC() called directly -from the top level HELPER(foo) function (i.e. the return address -in the generated code). +These functions perform a read for instruction execution. The ``mmuidx`` +parameter is taken from the current mode of the guest CPU, as determined +by ``cpu_mmu_index(env, true)``. The ``retaddr`` parameter is 0, and +thus does not unwind guest CPU state, because CPU state is always +synchronized while translating instructions. Any guest CPU exception +that is raised will indicate an instruction execution fault rather than +a data read fault. -These are generally the preferred way to do accesses by guest -virtual address from helper functions; see the documentation -of the non-``_ra`` variants for when those would be better. +In general these functions should not be used directly during translation. +There are wrapper functions that are to be used which also take care of +plugins for tracing. 
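As an illustrative sketch (not part of this patch), a target hook such as ``do_interrupt`` may legitimately call these accessors directly, following the naming pattern given just below. ``cpu_ldl_code()`` is the 32-bit accessor documented in this section; the ``env->pc`` field and the helper function are invented for the example and stand in for whatever the real target's CPU state provides::

    static uint32_t fetch_faulting_insn(CPUArchState *env)
    {
        /*
         * CPU state is already synchronized in hooks like do_interrupt,
         * so the implicit retaddr of 0 is acceptable here; any fault
         * raised by this read is reported as an instruction execution
         * fault rather than a data read fault.
         */
        return cpu_ldl_code(env, env->pc);
    }
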
+ +Function names follow the pattern: + +load: ``cpu_ld{sign}{size}_code(env, ptr)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + - ``s`` : signed + +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits + +Regexes for git grep: + - ``\<cpu_ld[us]\?[bwlq]_code\>`` + +``translator_ld*`` +~~~~~~~~~~~~~~~~~~ -Calling these functions with a ``retaddr`` argument of 0 is -equivalent to calling the non-``_ra`` version of the function. +These functions are a wrapper for ``cpu_ld*_code`` which also perform +any actions required by any tracing plugins. They are only to be +called during the translator callback ``translate_insn``. + +There is a set of functions ending in ``_swap`` which, if the parameter +is true, returns the value in the endianness that is the reverse of +the guest native endianness, as determined by ``TARGET_WORDS_BIGENDIAN``. Function names follow the pattern: -load: ``cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr)`` +load: ``translator_ld{sign}{size}(env, ptr)`` + +swap: ``translator_ld{sign}{size}_swap(env, ptr, swap)`` + +``sign`` + - (empty) : for 32 or 64 bit sizes + - ``u`` : unsigned + - ``s`` : signed -store: ``cpu_st{sign}{size}_{mmusuffix}_ra(env, ptr, val, retaddr)`` +``size`` + - ``b`` : 8 bits + - ``w`` : 16 bits + - ``l`` : 32 bits + - ``q`` : 64 bits Regexes for git grep - - ``\<cpu_ld[us]\?[bwlq]_[a-zA-Z0-9]\+_ra\>`` - - ``\<cpu_st[bwlq]_[a-zA-Z0-9]\+_ra\>`` + - ``\<translator_ld[us]\?[bwlq]\(_swap\)\?\>`` -``helper_*_{ld,st}*mmu`` -~~~~~~~~~~~~~~~~~~~~~~~~ +``helper_*_{ld,st}*_mmu`` +~~~~~~~~~~~~~~~~~~~~~~~~~ These functions are intended primarily to be called by the code generated by the TCG backend. They may also be called by target -CPU helper function code. Like the ``cpu_{ld,st}_*_ra`` functions -they perform accesses by guest virtual address; the difference is -that these functions allow you to specify an ``opindex`` parameter -which encodes (among other things) the mmu index to use for the -access. This is necessary if your helper needs to make an access -via a specific mmu index (for instance, an "always as non-privileged" -access) rather than using the default mmu index for the current state -of the guest CPU. +CPU helper function code. Like the ``cpu_{ld,st}_mmuidx_ra`` functions +they perform accesses by guest virtual address, with a given ``mmuidx``. -The ``opindex`` parameter should be created by calling ``make_memop_idx()``. +These functions specify an ``opindex`` parameter which encodes +(among other things) the mmu index to use for the access. This parameter +should be created by calling ``make_memop_idx()``. The ``retaddr`` parameter should be the result of GETPC() called directly from the top level HELPER(foo) function (or 0 if no guest CPU state @@ -166,13 +264,12 @@ unwinding is required). **TODO** The names of these functions are a bit odd for historical reasons because they were originally expected to be called only from -within generated code. We should rename them to bring them -more in line with the other memory access functions. +within generated code. We should rename them to bring them more in +line with the other memory access functions. The explicit endianness +is the only feature they have beyond ``*_mmuidx_ra``. 
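As an illustrative sketch (not part of this patch), a helper performing an explicitly little-endian 64-bit load through a caller-chosen mmu index might look as follows, using the calling convention listed just below. The mmu-index macro ``MMU_NONPRIV_IDX`` and the helper name are invented for the example, and ``MO_LEQ`` is assumed to be the little-endian 64-bit ``MemOp`` value::

    uint64_t HELPER(ld64_le_unpriv)(CPUArchState *env, target_ulong addr)
    {
        uintptr_t ra = GETPC();

        /* opindex packs the memory operation together with the mmu index */
        return helper_le_ldq_mmu(env, addr,
                                 make_memop_idx(MO_LEQ, MMU_NONPRIV_IDX), ra);
    }
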
load: ``helper_{endian}_ld{sign}{size}_mmu(env, addr, opindex, retaddr)`` -load (code): ``helper_{endian}_ld{sign}{size}_cmmu(env, addr, opindex, retaddr)`` - store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)`` ``sign`` @@ -192,7 +289,7 @@ store: ``helper_{endian}_st{size}_mmu(env, addr, val, opindex, retaddr)`` - ``ret`` : target endianness Regexes for git grep - - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_c\?mmu\>`` + - ``\<helper_\(le\|be\|ret\)_ld[us]\?[bwlq]_mmu\>`` - ``\<helper_\(le\|be\|ret\)_st[bwlq]_mmu\>`` ``address_space_*`` diff --git a/docs/index.html.in b/docs/index.html.in index 94eb782cf7..8512933d14 100644 --- a/docs/index.html.in +++ b/docs/index.html.in @@ -12,6 +12,7 @@ <li><a href="qemu-ga-ref.html">Guest Agent Protocol Reference</a></li> <li><a href="interop/index.html">System Emulation Management and Interoperability Guide</a></li> <li><a href="specs/index.html">System Emulation Guest Hardware Specifications</a></li> + <li><a href="system/index.html">System Emulation User's Guide</a></li> </ul> </body> </html> diff --git a/docs/index.rst b/docs/index.rst index baa5791c17..46405d4f07 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,4 +13,4 @@ Welcome to QEMU's documentation! interop/index devel/index specs/index - + system/index diff --git a/docs/interop/conf.py b/docs/interop/conf.py index e87b8c22be..40b1ad811d 100644 --- a/docs/interop/conf.py +++ b/docs/interop/conf.py @@ -18,5 +18,7 @@ html_theme_options['description'] = u'System Emulation Management and Interopera # (source start file, name, description, authors, manual section). man_pages = [ ('qemu-ga', 'qemu-ga', u'QEMU Guest Agent', - ['Michael Roth <mdroth@linux.vnet.ibm.com>'], 8) + ['Michael Roth <mdroth@linux.vnet.ibm.com>'], 8), + ('qemu-nbd', 'qemu-nbd', u'QEMU Disk Network Block Device Server', + ['Anthony Liguori <anthony@codemonkey.ws>'], 8) ] diff --git a/docs/interop/index.rst b/docs/interop/index.rst index 049387ac6d..c28f7785a5 100644 --- a/docs/interop/index.rst +++ b/docs/interop/index.rst @@ -18,5 +18,6 @@ Contents: live-block-operations pr-helper qemu-ga + qemu-nbd vhost-user vhost-user-gpu diff --git a/docs/interop/qemu-nbd.rst b/docs/interop/qemu-nbd.rst new file mode 100644 index 0000000000..873bb9e17d --- /dev/null +++ b/docs/interop/qemu-nbd.rst @@ -0,0 +1,263 @@ +QEMU Disk Network Block Device Server +===================================== + +Synopsis +-------- + +**qemu-nbd** [*OPTION*]... *filename* + +**qemu-nbd** -L [*OPTION*]... + +**qemu-nbd** -d *dev* + +Description +----------- + +Export a QEMU disk image using the NBD protocol. + +Other uses: + +- Bind a /dev/nbdX block device to a QEMU server (on Linux). +- As a client to query exports of a remote NBD server. + +Options +------- + +.. program:: qemu-nbd + +*filename* is a disk image filename, or a set of block +driver options if ``--image-opts`` is specified. + +*dev* is an NBD device. + +.. option:: --object type,id=ID,...props... + + Define a new instance of the *type* object class identified by *ID*. + See the :manpage:`qemu(1)` manual page for full details of the properties + supported. The common object types that it makes sense to define are the + ``secret`` object, which is used to supply passwords and/or encryption + keys, and the ``tls-creds`` object, which is used to supply TLS + credentials for the qemu-nbd server or client. + +.. option:: -p, --port=PORT + + TCP port to listen on as a server, or connect to as a client + (default ``10809``). + +.. 
option:: -o, --offset=OFFSET + + The offset into the image. + +.. option:: -b, --bind=IFACE + + The interface to bind to as a server, or connect to as a client + (default ``0.0.0.0``). + +.. option:: -k, --socket=PATH + + Use a unix socket with path *PATH*. + +.. option:: --image-opts + + Treat *filename* as a set of image options, instead of a plain + filename. If this flag is specified, the ``-f`` flag should + not be used, instead the :option:`format=` option should be set. + +.. option:: -f, --format=FMT + + Force the use of the block driver for format *FMT* instead of + auto-detecting. + +.. option:: -r, --read-only + + Export the disk as read-only. + +.. option:: -P, --partition=NUM + + Deprecated: Only expose MBR partition *NUM*. Understands physical + partitions 1-4 and logical partition 5. New code should instead use + :option:`--image-opts` with the raw driver wrapping a subset of the + original image. + +.. option:: -B, --bitmap=NAME + + If *filename* has a qcow2 persistent bitmap *NAME*, expose + that bitmap via the ``qemu:dirty-bitmap:NAME`` context + accessible through NBD_OPT_SET_META_CONTEXT. + +.. option:: -s, --snapshot + + Use *filename* as an external snapshot, create a temporary + file with ``backing_file=``\ *filename*, redirect the write to + the temporary one. + +.. option:: -l, --load-snapshot=SNAPSHOT_PARAM + + Load an internal snapshot inside *filename* and export it + as an read-only device, SNAPSHOT_PARAM format is + ``snapshot.id=[ID],snapshot.name=[NAME]`` or ``[ID_OR_NAME]`` + +.. option:: --cache=CACHE + + The cache mode to be used with the file. See the documentation of + the emulator's ``-drive cache=...`` option for allowed values. + +.. option:: -n, --nocache + + Equivalent to :option:`--cache=none`. + +.. option:: --aio=AIO + + Set the asynchronous I/O mode between ``threads`` (the default) + and ``native`` (Linux only). + +.. option:: --discard=DISCARD + + Control whether ``discard`` (also known as ``trim`` or ``unmap``) + requests are ignored or passed to the filesystem. *DISCARD* is one of + ``ignore`` (or ``off``), ``unmap`` (or ``on``). The default is + ``ignore``. + +.. option:: --detect-zeroes=DETECT_ZEROES + + Control the automatic conversion of plain zero writes by the OS to + driver-specific optimized zero write commands. *DETECT_ZEROES* is one of + ``off``, ``on``, or ``unmap``. ``unmap`` + converts a zero write to an unmap operation and can only be used if + *DISCARD* is set to ``unmap``. The default is ``off``. + +.. option:: -c, --connect=DEV + + Connect *filename* to NBD device *DEV* (Linux only). + +.. option:: -d, --disconnect + + Disconnect the device *DEV* (Linux only). + +.. option:: -e, --shared=NUM + + Allow up to *NUM* clients to share the device (default + ``1``). Safe for readers, but for now, consistency is not + guaranteed between multiple writers. + +.. option:: -t, --persistent + + Don't exit on the last connection. + +.. option:: -x, --export-name=NAME + + Set the NBD volume export name (default of a zero-length string). + +.. option:: -D, --description=DESCRIPTION + + Set the NBD volume export description, as a human-readable + string. + +.. option:: -L, --list + + Connect as a client and list all details about the exports exposed by + a remote NBD server. This enables list mode, and is incompatible + with options that change behavior related to a specific export (such as + :option:`--export-name`, :option:`--offset`, ...). + +.. 
option:: --tls-creds=ID + + Enable mandatory TLS encryption for the server by setting the ID + of the TLS credentials object previously created with the --object + option; or provide the credentials needed for connecting as a client + in list mode. + +.. option:: --fork + + Fork off the server process and exit the parent once the server is running. + +.. option:: --pid-file=PATH + + Store the server's process ID in the given file. + +.. option:: --tls-authz=ID + + Specify the ID of a qauthz object previously created with the + :option:`--object` option. This will be used to authorize connecting users + against their x509 distinguished name. + +.. option:: -v, --verbose + + Display extra debugging information. + +.. option:: -h, --help + + Display this help and exit. + +.. option:: -V, --version + + Display version information and exit. + +.. option:: -T, --trace [[enable=]PATTERN][,events=FILE][,file=FILE] + + .. include:: qemu-option-trace.rst.inc + +Examples +-------- + +Start a server listening on port 10809 that exposes only the +guest-visible contents of a qcow2 file, with no TLS encryption, and +with the default export name (an empty string). The command is +one-shot, and will block until the first successful client +disconnects: + +:: + + qemu-nbd -f qcow2 file.qcow2 + +Start a long-running server listening with encryption on port 10810, +and whitelist clients with a specific X.509 certificate to connect to +a 1 megabyte subset of a raw file, using the export name 'subset': + +:: + + qemu-nbd \ + --object tls-creds-x509,id=tls0,endpoint=server,dir=/path/to/qemutls \ + --object 'authz-simple,id=auth0,identity=CN=laptop.example.com,,\ + O=Example Org,,L=London,,ST=London,,C=GB' \ + --tls-creds tls0 --tls-authz auth0 \ + -t -x subset -p 10810 \ + --image-opts driver=raw,offset=1M,size=1M,file.driver=file,file.filename=file.raw + +Serve a read-only copy of just the first MBR partition of a guest +image over a Unix socket with as many as 5 simultaneous readers, with +a persistent process forked as a daemon: + +:: + + qemu-nbd --fork --persistent --shared=5 --socket=/path/to/sock \ + --partition=1 --read-only --format=qcow2 file.qcow2 + +Expose the guest-visible contents of a qcow2 file via a block device +/dev/nbd0 (and possibly creating /dev/nbd0p1 and friends for +partitions found within), then disconnect the device when done. +Access to bind qemu-nbd to an /dev/nbd device generally requires root +privileges, and may also require the execution of ``modprobe nbd`` +to enable the kernel NBD client module. *CAUTION*: Do not use +this method to mount filesystems from an untrusted guest image - a +malicious guest may have prepared the image to attempt to trigger +kernel bugs in partition probing or file system mounting. + +:: + + qemu-nbd -c /dev/nbd0 -f qcow2 file.qcow2 + qemu-nbd -d /dev/nbd0 + +Query a remote server to see details about what export(s) it is +serving on port 10809, and authenticating via PSK: + +:: + + qemu-nbd \ + --object tls-creds-psk,id=tls0,dir=/tmp/keys,username=eblake,endpoint=client \ + --tls-creds tls0 -L -b remote.example.com + +See also +-------- + +:manpage:`qemu(1)`, :manpage:`qemu-img(1)` diff --git a/docs/interop/qemu-option-trace.rst.inc b/docs/interop/qemu-option-trace.rst.inc new file mode 100644 index 0000000000..23cfcb4853 --- /dev/null +++ b/docs/interop/qemu-option-trace.rst.inc @@ -0,0 +1,30 @@ +.. 
+ The contents of this file must be kept in sync with qemu-option-trace.texi + until all the users of the texi file have been converted to rst and + the texi file can be removed. + +Specify tracing options. + +.. option:: [enable=]PATTERN + + Immediately enable events matching *PATTERN* + (either event name or a globbing pattern). This option is only + available if QEMU has been compiled with the ``simple``, ``log`` + or ``ftrace`` tracing backend. To specify multiple events or patterns, + specify the :option:`-trace` option multiple times. + + Use :option:`-trace help` to print a list of names of trace points. + +.. option:: events=FILE + + Immediately enable events listed in *FILE*. + The file must contain one event name (as listed in the ``trace-events-all`` + file) per line; globbing patterns are accepted too. This option is only + available if QEMU has been compiled with the ``simple``, ``log`` or + ``ftrace`` tracing backend. + +.. option:: file=FILE + + Log output traces to *FILE*. + This option is only available if QEMU has been compiled with + the ``simple`` tracing backend. diff --git a/docs/interop/vhost-user.json b/docs/interop/vhost-user.json index ce0ef74db5..ef8ac5941f 100644 --- a/docs/interop/vhost-user.json +++ b/docs/interop/vhost-user.json @@ -31,6 +31,7 @@ # @rproc-serial: virtio remoteproc serial link # @scsi: virtio scsi # @vsock: virtio vsock transport +# @fs: virtio fs (since 4.2) # # Since: 4.0 ## @@ -50,7 +51,8 @@ 'rpmsg', 'rproc-serial', 'scsi', - 'vsock' + 'vsock', + 'fs' ] } diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi deleted file mode 100644 index 2c7ea49c32..0000000000 --- a/docs/qemu-block-drivers.texi +++ /dev/null @@ -1,889 +0,0 @@ -@c man begin SYNOPSIS -QEMU block driver reference manual -@c man end - -@set qemu_system qemu-system-x86_64 - -@c man begin DESCRIPTION - -@node disk_images_formats -@subsection Disk image file formats - -QEMU supports many image file formats that can be used with VMs as well as with -any of the tools (like @code{qemu-img}). This includes the preferred formats -raw and qcow2 as well as formats that are supported for compatibility with -older QEMU versions or other hypervisors. - -Depending on the image format, different options can be passed to -@code{qemu-img create} and @code{qemu-img convert} using the @code{-o} option. -This section describes each format and the options that are supported for it. - -@table @option -@item raw - -Raw disk image format. This format has the advantage of -being simple and easily exportable to all other emulators. If your -file system supports @emph{holes} (for example in ext2 or ext3 on -Linux or NTFS on Windows), then only the written sectors will reserve -space. Use @code{qemu-img info} to know the real size used by the -image or @code{ls -ls} on Unix/Linux. - -Supported options: -@table @code -@item preallocation -Preallocation mode (allowed values: @code{off}, @code{falloc}, @code{full}). -@code{falloc} mode preallocates space for image by calling posix_fallocate(). -@code{full} mode preallocates space for image by writing data to underlying -storage. This data may or may not be zero, depending on the storage location. -@end table - -@item qcow2 -QEMU image format, the most versatile format. Use it to have smaller -images (useful if your filesystem does not supports holes, for example -on Windows), zlib based compression and support of multiple VM -snapshots. - -Supported options: -@table @code -@item compat -Determines the qcow2 version to use. 
@code{compat=0.10} uses the -traditional image format that can be read by any QEMU since 0.10. -@code{compat=1.1} enables image format extensions that only QEMU 1.1 and -newer understand (this is the default). Amongst others, this includes -zero clusters, which allow efficient copy-on-read for sparse images. - -@item backing_file -File name of a base image (see @option{create} subcommand) -@item backing_fmt -Image format of the base image -@item encryption -This option is deprecated and equivalent to @code{encrypt.format=aes} - -@item encrypt.format - -If this is set to @code{luks}, it requests that the qcow2 payload (not -qcow2 header) be encrypted using the LUKS format. The passphrase to -use to unlock the LUKS key slot is given by the @code{encrypt.key-secret} -parameter. LUKS encryption parameters can be tuned with the other -@code{encrypt.*} parameters. - -If this is set to @code{aes}, the image is encrypted with 128-bit AES-CBC. -The encryption key is given by the @code{encrypt.key-secret} parameter. -This encryption format is considered to be flawed by modern cryptography -standards, suffering from a number of design problems: - -@itemize @minus -@item The AES-CBC cipher is used with predictable initialization vectors based -on the sector number. This makes it vulnerable to chosen plaintext attacks -which can reveal the existence of encrypted data. -@item The user passphrase is directly used as the encryption key. A poorly -chosen or short passphrase will compromise the security of the encryption. -@item In the event of the passphrase being compromised there is no way to -change the passphrase to protect data in any qcow images. The files must -be cloned, using a different encryption passphrase in the new file. The -original file must then be securely erased using a program like shred, -though even this is ineffective with many modern storage technologies. -@end itemize - -The use of this is no longer supported in system emulators. Support only -remains in the command line utilities, for the purposes of data liberation -and interoperability with old versions of QEMU. The @code{luks} format -should be used instead. - -@item encrypt.key-secret - -Provides the ID of a @code{secret} object that contains the passphrase -(@code{encrypt.format=luks}) or encryption key (@code{encrypt.format=aes}). - -@item encrypt.cipher-alg - -Name of the cipher algorithm and key length. Currently defaults -to @code{aes-256}. Only used when @code{encrypt.format=luks}. - -@item encrypt.cipher-mode - -Name of the encryption mode to use. Currently defaults to @code{xts}. -Only used when @code{encrypt.format=luks}. - -@item encrypt.ivgen-alg - -Name of the initialization vector generator algorithm. Currently defaults -to @code{plain64}. Only used when @code{encrypt.format=luks}. - -@item encrypt.ivgen-hash-alg - -Name of the hash algorithm to use with the initialization vector generator -(if required). Defaults to @code{sha256}. Only used when @code{encrypt.format=luks}. - -@item encrypt.hash-alg - -Name of the hash algorithm to use for PBKDF algorithm -Defaults to @code{sha256}. Only used when @code{encrypt.format=luks}. - -@item encrypt.iter-time - -Amount of time, in milliseconds, to use for PBKDF algorithm per key slot. -Defaults to @code{2000}. Only used when @code{encrypt.format=luks}. - -@item cluster_size -Changes the qcow2 cluster size (must be between 512 and 2M). Smaller cluster -sizes can improve the image file size whereas larger cluster sizes generally -provide better performance. 
- -@item preallocation -Preallocation mode (allowed values: @code{off}, @code{metadata}, @code{falloc}, -@code{full}). An image with preallocated metadata is initially larger but can -improve performance when the image needs to grow. @code{falloc} and @code{full} -preallocations are like the same options of @code{raw} format, but sets up -metadata also. - -@item lazy_refcounts -If this option is set to @code{on}, reference count updates are postponed with -the goal of avoiding metadata I/O and improving performance. This is -particularly interesting with @option{cache=writethrough} which doesn't batch -metadata updates. The tradeoff is that after a host crash, the reference count -tables must be rebuilt, i.e. on the next open an (automatic) @code{qemu-img -check -r all} is required, which may take some time. - -This option can only be enabled if @code{compat=1.1} is specified. - -@item nocow -If this option is set to @code{on}, it will turn off COW of the file. It's only -valid on btrfs, no effect on other file systems. - -Btrfs has low performance when hosting a VM image file, even more when the guest -on the VM also using btrfs as file system. Turning off COW is a way to mitigate -this bad performance. Generally there are two ways to turn off COW on btrfs: -a) Disable it by mounting with nodatacow, then all newly created files will be -NOCOW. b) For an empty file, add the NOCOW file attribute. That's what this option -does. - -Note: this option is only valid to new or empty files. If there is an existing -file which is COW and has data blocks already, it couldn't be changed to NOCOW -by setting @code{nocow=on}. One can issue @code{lsattr filename} to check if -the NOCOW flag is set or not (Capital 'C' is NOCOW flag). - -@end table - -@item qed -Old QEMU image format with support for backing files and compact image files -(when your filesystem or transport medium does not support holes). - -When converting QED images to qcow2, you might want to consider using the -@code{lazy_refcounts=on} option to get a more QED-like behaviour. - -Supported options: -@table @code -@item backing_file -File name of a base image (see @option{create} subcommand). -@item backing_fmt -Image file format of backing file (optional). Useful if the format cannot be -autodetected because it has no header, like some vhd/vpc files. -@item cluster_size -Changes the cluster size (must be power-of-2 between 4K and 64K). Smaller -cluster sizes can improve the image file size whereas larger cluster sizes -generally provide better performance. -@item table_size -Changes the number of clusters per L1/L2 table (must be power-of-2 between 1 -and 16). There is normally no need to change this value but this option can be -used for performance benchmarking. -@end table - -@item qcow -Old QEMU image format with support for backing files, compact image files, -encryption and compression. - -Supported options: -@table @code -@item backing_file -File name of a base image (see @option{create} subcommand) -@item encryption -This option is deprecated and equivalent to @code{encrypt.format=aes} - -@item encrypt.format -If this is set to @code{aes}, the image is encrypted with 128-bit AES-CBC. -The encryption key is given by the @code{encrypt.key-secret} parameter. -This encryption format is considered to be flawed by modern cryptography -standards, suffering from a number of design problems enumerated previously -against the @code{qcow2} image format. - -The use of this is no longer supported in system emulators. 
Support only -remains in the command line utilities, for the purposes of data liberation -and interoperability with old versions of QEMU. - -Users requiring native encryption should use the @code{qcow2} format -instead with @code{encrypt.format=luks}. - -@item encrypt.key-secret - -Provides the ID of a @code{secret} object that contains the encryption -key (@code{encrypt.format=aes}). - -@end table - -@item luks - -LUKS v1 encryption format, compatible with Linux dm-crypt/cryptsetup - -Supported options: -@table @code - -@item key-secret - -Provides the ID of a @code{secret} object that contains the passphrase. - -@item cipher-alg - -Name of the cipher algorithm and key length. Currently defaults -to @code{aes-256}. - -@item cipher-mode - -Name of the encryption mode to use. Currently defaults to @code{xts}. - -@item ivgen-alg - -Name of the initialization vector generator algorithm. Currently defaults -to @code{plain64}. - -@item ivgen-hash-alg - -Name of the hash algorithm to use with the initialization vector generator -(if required). Defaults to @code{sha256}. - -@item hash-alg - -Name of the hash algorithm to use for PBKDF algorithm -Defaults to @code{sha256}. - -@item iter-time - -Amount of time, in milliseconds, to use for PBKDF algorithm per key slot. -Defaults to @code{2000}. - -@end table - -@item vdi -VirtualBox 1.1 compatible image format. -Supported options: -@table @code -@item static -If this option is set to @code{on}, the image is created with metadata -preallocation. -@end table - -@item vmdk -VMware 3 and 4 compatible image format. - -Supported options: -@table @code -@item backing_file -File name of a base image (see @option{create} subcommand). -@item compat6 -Create a VMDK version 6 image (instead of version 4) -@item hwversion -Specify vmdk virtual hardware version. Compat6 flag cannot be enabled -if hwversion is specified. -@item subformat -Specifies which VMDK subformat to use. Valid options are -@code{monolithicSparse} (default), -@code{monolithicFlat}, -@code{twoGbMaxExtentSparse}, -@code{twoGbMaxExtentFlat} and -@code{streamOptimized}. -@end table - -@item vpc -VirtualPC compatible image format (VHD). -Supported options: -@table @code -@item subformat -Specifies which VHD subformat to use. Valid options are -@code{dynamic} (default) and @code{fixed}. -@end table - -@item VHDX -Hyper-V compatible image format (VHDX). -Supported options: -@table @code -@item subformat -Specifies which VHDX subformat to use. Valid options are -@code{dynamic} (default) and @code{fixed}. -@item block_state_zero -Force use of payload blocks of type 'ZERO'. Can be set to @code{on} (default) -or @code{off}. When set to @code{off}, new blocks will be created as -@code{PAYLOAD_BLOCK_NOT_PRESENT}, which means parsers are free to return -arbitrary data for those blocks. Do not set to @code{off} when using -@code{qemu-img convert} with @code{subformat=dynamic}. -@item block_size -Block size; min 1 MB, max 256 MB. 0 means auto-calculate based on image size. -@item log_size -Log size; min 1 MB. -@end table -@end table - -@subsubsection Read-only formats -More disk image file formats are supported in a read-only mode. -@table @option -@item bochs -Bochs images of @code{growing} type. -@item cloop -Linux Compressed Loop image, useful only to reuse directly compressed -CD-ROM images present for example in the Knoppix CD-ROMs. -@item dmg -Apple disk image. -@item parallels -Parallels disk image format. 
-@end table - - -@node host_drives -@subsection Using host drives - -In addition to disk image files, QEMU can directly access host -devices. We describe here the usage for QEMU version >= 0.8.3. - -@subsubsection Linux - -On Linux, you can directly use the host device filename instead of a -disk image filename provided you have enough privileges to access -it. For example, use @file{/dev/cdrom} to access to the CDROM. - -@table @code -@item CD -You can specify a CDROM device even if no CDROM is loaded. QEMU has -specific code to detect CDROM insertion or removal. CDROM ejection by -the guest OS is supported. Currently only data CDs are supported. -@item Floppy -You can specify a floppy device even if no floppy is loaded. Floppy -removal is currently not detected accurately (if you change floppy -without doing floppy access while the floppy is not loaded, the guest -OS will think that the same floppy is loaded). -Use of the host's floppy device is deprecated, and support for it will -be removed in a future release. -@item Hard disks -Hard disks can be used. Normally you must specify the whole disk -(@file{/dev/hdb} instead of @file{/dev/hdb1}) so that the guest OS can -see it as a partitioned disk. WARNING: unless you know what you do, it -is better to only make READ-ONLY accesses to the hard disk otherwise -you may corrupt your host data (use the @option{-snapshot} command -line option or modify the device permissions accordingly). -@end table - -@subsubsection Windows - -@table @code -@item CD -The preferred syntax is the drive letter (e.g. @file{d:}). The -alternate syntax @file{\\.\d:} is supported. @file{/dev/cdrom} is -supported as an alias to the first CDROM drive. - -Currently there is no specific code to handle removable media, so it -is better to use the @code{change} or @code{eject} monitor commands to -change or eject media. -@item Hard disks -Hard disks can be used with the syntax: @file{\\.\PhysicalDrive@var{N}} -where @var{N} is the drive number (0 is the first hard disk). - -WARNING: unless you know what you do, it is better to only make -READ-ONLY accesses to the hard disk otherwise you may corrupt your -host data (use the @option{-snapshot} command line so that the -modifications are written in a temporary file). -@end table - - -@subsubsection Mac OS X - -@file{/dev/cdrom} is an alias to the first CDROM. - -Currently there is no specific code to handle removable media, so it -is better to use the @code{change} or @code{eject} monitor commands to -change or eject media. - -@node disk_images_fat_images -@subsection Virtual FAT disk images - -QEMU can automatically create a virtual FAT disk image from a -directory tree. In order to use it, just type: - -@example -@value{qemu_system} linux.img -hdb fat:/my_directory -@end example - -Then you access access to all the files in the @file{/my_directory} -directory without having to copy them in a disk image or to export -them via SAMBA or NFS. The default access is @emph{read-only}. 
- -Floppies can be emulated with the @code{:floppy:} option: - -@example -@value{qemu_system} linux.img -fda fat:floppy:/my_directory -@end example - -A read/write support is available for testing (beta stage) with the -@code{:rw:} option: - -@example -@value{qemu_system} linux.img -fda fat:floppy:rw:/my_directory -@end example - -What you should @emph{never} do: -@itemize -@item use non-ASCII filenames ; -@item use "-snapshot" together with ":rw:" ; -@item expect it to work when loadvm'ing ; -@item write to the FAT directory on the host system while accessing it with the guest system. -@end itemize - -@node disk_images_nbd -@subsection NBD access - -QEMU can access directly to block device exported using the Network Block Device -protocol. - -@example -@value{qemu_system} linux.img -hdb nbd://my_nbd_server.mydomain.org:1024/ -@end example - -If the NBD server is located on the same host, you can use an unix socket instead -of an inet socket: - -@example -@value{qemu_system} linux.img -hdb nbd+unix://?socket=/tmp/my_socket -@end example - -In this case, the block device must be exported using qemu-nbd: - -@example -qemu-nbd --socket=/tmp/my_socket my_disk.qcow2 -@end example - -The use of qemu-nbd allows sharing of a disk between several guests: -@example -qemu-nbd --socket=/tmp/my_socket --share=2 my_disk.qcow2 -@end example - -@noindent -and then you can use it with two guests: -@example -@value{qemu_system} linux1.img -hdb nbd+unix://?socket=/tmp/my_socket -@value{qemu_system} linux2.img -hdb nbd+unix://?socket=/tmp/my_socket -@end example - -If the nbd-server uses named exports (supported since NBD 2.9.18, or with QEMU's -own embedded NBD server), you must specify an export name in the URI: -@example -@value{qemu_system} -cdrom nbd://localhost/debian-500-ppc-netinst -@value{qemu_system} -cdrom nbd://localhost/openSUSE-11.1-ppc-netinst -@end example - -The URI syntax for NBD is supported since QEMU 1.3. An alternative syntax is -also available. Here are some example of the older syntax: -@example -@value{qemu_system} linux.img -hdb nbd:my_nbd_server.mydomain.org:1024 -@value{qemu_system} linux2.img -hdb nbd:unix:/tmp/my_socket -@value{qemu_system} -cdrom nbd:localhost:10809:exportname=debian-500-ppc-netinst -@end example - -@node disk_images_sheepdog -@subsection Sheepdog disk images - -Sheepdog is a distributed storage system for QEMU. It provides highly -available block level storage volumes that can be attached to -QEMU-based virtual machines. - -You can create a Sheepdog disk image with the command: -@example -qemu-img create sheepdog:///@var{image} @var{size} -@end example -where @var{image} is the Sheepdog image name and @var{size} is its -size. - -To import the existing @var{filename} to Sheepdog, you can use a -convert command. -@example -qemu-img convert @var{filename} sheepdog:///@var{image} -@end example - -You can boot from the Sheepdog disk image with the command: -@example -@value{qemu_system} sheepdog:///@var{image} -@end example - -You can also create a snapshot of the Sheepdog image like qcow2. -@example -qemu-img snapshot -c @var{tag} sheepdog:///@var{image} -@end example -where @var{tag} is a tag name of the newly created snapshot. - -To boot from the Sheepdog snapshot, specify the tag name of the -snapshot. -@example -@value{qemu_system} sheepdog:///@var{image}#@var{tag} -@end example - -You can create a cloned image from the existing snapshot. 
-@example -qemu-img create -b sheepdog:///@var{base}#@var{tag} sheepdog:///@var{image} -@end example -where @var{base} is an image name of the source snapshot and @var{tag} -is its tag name. - -You can use an unix socket instead of an inet socket: - -@example -@value{qemu_system} sheepdog+unix:///@var{image}?socket=@var{path} -@end example - -If the Sheepdog daemon doesn't run on the local host, you need to -specify one of the Sheepdog servers to connect to. -@example -qemu-img create sheepdog://@var{hostname}:@var{port}/@var{image} @var{size} -@value{qemu_system} sheepdog://@var{hostname}:@var{port}/@var{image} -@end example - -@node disk_images_iscsi -@subsection iSCSI LUNs - -iSCSI is a popular protocol used to access SCSI devices across a computer -network. - -There are two different ways iSCSI devices can be used by QEMU. - -The first method is to mount the iSCSI LUN on the host, and make it appear as -any other ordinary SCSI device on the host and then to access this device as a -/dev/sd device from QEMU. How to do this differs between host OSes. - -The second method involves using the iSCSI initiator that is built into -QEMU. This provides a mechanism that works the same way regardless of which -host OS you are running QEMU on. This section will describe this second method -of using iSCSI together with QEMU. - -In QEMU, iSCSI devices are described using special iSCSI URLs - -@example -URL syntax: -iscsi://[<username>[%<password>]@@]<host>[:<port>]/<target-iqn-name>/<lun> -@end example - -Username and password are optional and only used if your target is set up -using CHAP authentication for access control. -Alternatively the username and password can also be set via environment -variables to have these not show up in the process list - -@example -export LIBISCSI_CHAP_USERNAME=<username> -export LIBISCSI_CHAP_PASSWORD=<password> -iscsi://<host>/<target-iqn-name>/<lun> -@end example - -Various session related parameters can be set via special options, either -in a configuration file provided via '-readconfig' or directly on the -command line. - -If the initiator-name is not specified qemu will use a default name -of 'iqn.2008-11.org.linux-kvm[:<uuid>'] where <uuid> is the UUID of the -virtual machine. If the UUID is not specified qemu will use -'iqn.2008-11.org.linux-kvm[:<name>'] where <name> is the name of the -virtual machine. 
- -@example -Setting a specific initiator name to use when logging in to the target --iscsi initiator-name=iqn.qemu.test:my-initiator -@end example - -@example -Controlling which type of header digest to negotiate with the target --iscsi header-digest=CRC32C|CRC32C-NONE|NONE-CRC32C|NONE -@end example - -These can also be set via a configuration file -@example -[iscsi] - user = "CHAP username" - password = "CHAP password" - initiator-name = "iqn.qemu.test:my-initiator" - # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE - header-digest = "CRC32C" -@end example - - -Setting the target name allows different options for different targets -@example -[iscsi "iqn.target.name"] - user = "CHAP username" - password = "CHAP password" - initiator-name = "iqn.qemu.test:my-initiator" - # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE - header-digest = "CRC32C" -@end example - - -Howto use a configuration file to set iSCSI configuration options: -@example -cat >iscsi.conf <<EOF -[iscsi] - user = "me" - password = "my password" - initiator-name = "iqn.qemu.test:my-initiator" - header-digest = "CRC32C" -EOF - -@value{qemu_system} -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \ - -readconfig iscsi.conf -@end example - - -How to set up a simple iSCSI target on loopback and access it via QEMU: -@example -This example shows how to set up an iSCSI target with one CDROM and one DISK -using the Linux STGT software target. This target is available on Red Hat based -systems as the package 'scsi-target-utils'. - -tgtd --iscsi portal=127.0.0.1:3260 -tgtadm --lld iscsi --op new --mode target --tid 1 -T iqn.qemu.test -tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 1 \ - -b /IMAGES/disk.img --device-type=disk -tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 2 \ - -b /IMAGES/cd.iso --device-type=cd -tgtadm --lld iscsi --op bind --mode target --tid 1 -I ALL - -@value{qemu_system} -iscsi initiator-name=iqn.qemu.test:my-initiator \ - -boot d -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \ - -cdrom iscsi://127.0.0.1/iqn.qemu.test/2 -@end example - -@node disk_images_gluster -@subsection GlusterFS disk images - -GlusterFS is a user space distributed file system. - -You can boot from the GlusterFS disk image with the command: -@example -URI: -@value{qemu_system} -drive file=gluster[+@var{type}]://[@var{host}[:@var{port}]]/@var{volume}/@var{path} - [?socket=...][,file.debug=9][,file.logfile=...] - -JSON: -@value{qemu_system} 'json:@{"driver":"qcow2", - "file":@{"driver":"gluster", - "volume":"testvol","path":"a.img","debug":9,"logfile":"...", - "server":[@{"type":"tcp","host":"...","port":"..."@}, - @{"type":"unix","socket":"..."@}]@}@}' -@end example - -@var{gluster} is the protocol. - -@var{type} specifies the transport type used to connect to gluster -management daemon (glusterd). Valid transport types are -tcp and unix. In the URI form, if a transport type isn't specified, -then tcp type is assumed. - -@var{host} specifies the server where the volume file specification for -the given volume resides. This can be either a hostname or an ipv4 address. -If transport type is unix, then @var{host} field should not be specified. -Instead @var{socket} field needs to be populated with the path to unix domain -socket. - -@var{port} is the port number on which glusterd is listening. This is optional -and if not specified, it defaults to port 24007. If the transport type is unix, -then @var{port} should not be specified. 
- -@var{volume} is the name of the gluster volume which contains the disk image. - -@var{path} is the path to the actual disk image that resides on gluster volume. - -@var{debug} is the logging level of the gluster protocol driver. Debug levels -are 0-9, with 9 being the most verbose, and 0 representing no debugging output. -The default level is 4. The current logging levels defined in the gluster source -are 0 - None, 1 - Emergency, 2 - Alert, 3 - Critical, 4 - Error, 5 - Warning, -6 - Notice, 7 - Info, 8 - Debug, 9 - Trace - -@var{logfile} is a commandline option to mention log file path which helps in -logging to the specified file and also help in persisting the gfapi logs. The -default is stderr. - - - - -You can create a GlusterFS disk image with the command: -@example -qemu-img create gluster://@var{host}/@var{volume}/@var{path} @var{size} -@end example - -Examples -@example -@value{qemu_system} -drive file=gluster://1.2.3.4/testvol/a.img -@value{qemu_system} -drive file=gluster+tcp://1.2.3.4/testvol/a.img -@value{qemu_system} -drive file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img -@value{qemu_system} -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img -@value{qemu_system} -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img -@value{qemu_system} -drive file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img -@value{qemu_system} -drive file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket -@value{qemu_system} -drive file=gluster+rdma://1.2.3.4:24007/testvol/a.img -@value{qemu_system} -drive file=gluster://1.2.3.4/testvol/a.img,file.debug=9,file.logfile=/var/log/qemu-gluster.log -@value{qemu_system} 'json:@{"driver":"qcow2", - "file":@{"driver":"gluster", - "volume":"testvol","path":"a.img", - "debug":9,"logfile":"/var/log/qemu-gluster.log", - "server":[@{"type":"tcp","host":"1.2.3.4","port":24007@}, - @{"type":"unix","socket":"/var/run/glusterd.socket"@}]@}@}' -@value{qemu_system} -drive driver=qcow2,file.driver=gluster,file.volume=testvol,file.path=/path/a.img, - file.debug=9,file.logfile=/var/log/qemu-gluster.log, - file.server.0.type=tcp,file.server.0.host=1.2.3.4,file.server.0.port=24007, - file.server.1.type=unix,file.server.1.socket=/var/run/glusterd.socket -@end example - -@node disk_images_ssh -@subsection Secure Shell (ssh) disk images - -You can access disk images located on a remote ssh server -by using the ssh protocol: - -@example -@value{qemu_system} -drive file=ssh://[@var{user}@@]@var{server}[:@var{port}]/@var{path}[?host_key_check=@var{host_key_check}] -@end example - -Alternative syntax using properties: - -@example -@value{qemu_system} -drive file.driver=ssh[,file.user=@var{user}],file.host=@var{server}[,file.port=@var{port}],file.path=@var{path}[,file.host_key_check=@var{host_key_check}] -@end example - -@var{ssh} is the protocol. - -@var{user} is the remote user. If not specified, then the local -username is tried. - -@var{server} specifies the remote ssh server. Any ssh server can be -used, but it must implement the sftp-server protocol. Most Unix/Linux -systems should work without requiring any extra configuration. - -@var{port} is the port number on which sshd is listening. By default -the standard ssh port (22) is used. - -@var{path} is the path to the disk image. - -The optional @var{host_key_check} parameter controls how the remote -host's key is checked. The default is @code{yes} which means to use -the local @file{.ssh/known_hosts} file. Setting this to @code{no} -turns off known-hosts checking. 
Or you can check that the host key -matches a specific fingerprint: -@code{host_key_check=md5:78:45:8e:14:57:4f:d5:45:83:0a:0e:f3:49:82:c9:c8} -(@code{sha1:} can also be used as a prefix, but note that OpenSSH -tools only use MD5 to print fingerprints). - -Currently authentication must be done using ssh-agent. Other -authentication methods may be supported in future. - -Note: Many ssh servers do not support an @code{fsync}-style operation. -The ssh driver cannot guarantee that disk flush requests are -obeyed, and this causes a risk of disk corruption if the remote -server or network goes down during writes. The driver will -print a warning when @code{fsync} is not supported: - -warning: ssh server @code{ssh.example.com:22} does not support fsync - -With sufficiently new versions of libssh and OpenSSH, @code{fsync} is -supported. - -@node disk_images_nvme -@subsection NVMe disk images - -NVM Express (NVMe) storage controllers can be accessed directly by a userspace -driver in QEMU. This bypasses the host kernel file system and block layers -while retaining QEMU block layer functionalities, such as block jobs, I/O -throttling, image formats, etc. Disk I/O performance is typically higher than -with @code{-drive file=/dev/sda} using either thread pool or linux-aio. - -The controller will be exclusively used by the QEMU process once started. To be -able to share storage between multiple VMs and other applications on the host, -please use the file based protocols. - -Before starting QEMU, bind the host NVMe controller to the host vfio-pci -driver. For example: - -@example -# modprobe vfio-pci -# lspci -n -s 0000:06:0d.0 -06:0d.0 0401: 1102:0002 (rev 08) -# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind -# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id - -# @value{qemu_system} -drive file=nvme://@var{host}:@var{bus}:@var{slot}.@var{func}/@var{namespace} -@end example - -Alternative syntax using properties: - -@example -@value{qemu_system} -drive file.driver=nvme,file.device=@var{host}:@var{bus}:@var{slot}.@var{func},file.namespace=@var{namespace} -@end example - -@var{host}:@var{bus}:@var{slot}.@var{func} is the NVMe controller's PCI device -address on the host. - -@var{namespace} is the NVMe namespace number, starting from 1. - -@node disk_image_locking -@subsection Disk image file locking - -By default, QEMU tries to protect image files from unexpected concurrent -access, as long as it's supported by the block protocol driver and host -operating system. If multiple QEMU processes (including QEMU emulators and -utilities) try to open the same image with conflicting accessing modes, all but -the first one will get an error. - -This feature is currently supported by the file protocol on Linux with the Open -File Descriptor (OFD) locking API, and can be configured to fall back to POSIX -locking if the POSIX host doesn't support Linux OFD locking. - -To explicitly enable image locking, specify "locking=on" in the file protocol -driver options. If OFD locking is not possible, a warning will be printed and -the POSIX locking API will be used. In this case there is a risk that the lock -will get silently lost when doing hot plugging and block jobs, due to the -shortcomings of the POSIX locking API. - -QEMU transparently handles lock handover during shared storage migration. For -shared virtual disk images between multiple VMs, the "share-rw" device option -should be used. - -By default, the guest has exclusive write access to its disk image. 
If the -guest can safely share the disk image with other writers the @code{-device -...,share-rw=on} parameter can be used. This is only safe if the guest is -running software, such as a cluster file system, that coordinates disk accesses -to avoid corruption. - -Note that share-rw=on only declares the guest's ability to share the disk. -Some QEMU features, such as image file formats, require exclusive write access -to the disk image and this is unaffected by the share-rw=on option. - -Alternatively, locking can be fully disabled by "locking=off" block device -option. In the command line, the option is usually in the form of -"file.locking=off" as the protocol driver is normally placed as a "file" child -under a format driver. For example: - -@code{-blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file} - -To check if image locking is active, check the output of the "lslocks" command -on host and see if there are locks held by the QEMU process on the image file. -More than one byte could be locked by the QEMU instance, each byte of which -reflects a particular permission that is acquired or protected by the running -block driver. - -@c man end - -@ignore - -@setfilename qemu-block-drivers -@settitle QEMU block drivers reference - -@c man begin SEEALSO -The HTML documentation of QEMU for more precise information and Linux -user mode emulator invocation. -@c man end - -@c man begin AUTHOR -Fabrice Bellard and the QEMU Project developers -@c man end - -@end ignore diff --git a/docs/specs/acpi_cpu_hotplug.txt b/docs/specs/acpi_cpu_hotplug.txt index ee219c8358..a8ce5e7402 100644 --- a/docs/specs/acpi_cpu_hotplug.txt +++ b/docs/specs/acpi_cpu_hotplug.txt @@ -15,14 +15,14 @@ CPU present bitmap for: PIIX-PM (IO port 0xaf00-0xaf1f, 1-byte access) One bit per CPU. Bit position reflects corresponding CPU APIC ID. Read-only. The first DWORD in bitmap is used in write mode to switch from legacy - to new CPU hotplug interface, write 0 into it to do switch. + to modern CPU hotplug interface, write 0 into it to do switch. --------------------------------------------------------------- QEMU sets corresponding CPU bit on hot-add event and issues SCI with GPE.2 event set. CPU present map is read by ACPI BIOS GPE.2 handler to notify OS about CPU hot-add events. CPU hot-remove isn't supported. ===================================== -ACPI CPU hotplug interface registers: +Modern ACPI CPU hotplug interface registers: ------------------------------------- Register block base address: ICH9-LPC IO port 0x0cd8 @@ -30,9 +30,25 @@ Register block base address: Register block size: ACPI_CPU_HOTPLUG_REG_LEN = 12 +All accesses to registers described below, imply little-endian byte order. + +Reserved resisters behavior: + - write accesses are ignored + - read accesses return all bits set to 0. + +The last stored value in 'CPU selector' must refer to a possible CPU, otherwise + - reads from any register return 0 + - writes to any other register are ignored until valid value is stored into it +On QEMU start, 'CPU selector' is initialized to a valid value, on reset it +keeps the current value. 
+ read access: offset: - [0x0-0x3] reserved + [0x0-0x3] Command data 2: (DWORD access) + if value last stored in 'Command field': + 0: reads as 0x0 + 3: upper 32 bits of architecture specific CPU ID value + other values: reserved [0x4] CPU device status fields: (1 byte access) bits: 0: Device is enabled and may be used by guest @@ -44,15 +60,17 @@ read access: 3-7: reserved and should be ignored by OSPM [0x5-0x7] reserved [0x8] Command data: (DWORD access) - in case of error or unsupported command reads is 0xFFFFFFFF - current 'Command field' value: - 0: returns PXM value corresponding to device + contains 0 unless value last stored in 'Command field' is one of: + 0: contains 'CPU selector' value of a CPU with pending event[s] + 3: lower 32 bits of architecture specific CPU ID value + (in x86 case: APIC ID) write access: offset: [0x0-0x3] CPU selector: (DWORD access) selects active CPU device. All following accesses to other registers will read/store data from/to selected CPU. + Valid values: [0 .. max_cpus) [0x4] CPU device control fields: (1 byte access) bits: 0: reserved, OSPM must clear it before writing to register. @@ -69,9 +87,9 @@ write access: value: 0: selects a CPU device with inserting/removing events and following reads from 'Command data' register return - selected CPU (CPU selector value). If no CPU with events - found, the current CPU selector doesn't change and - corresponding insert/remove event flags are not set. + selected CPU ('CPU selector' value). + If no CPU with events found, the current 'CPU selector' doesn't + change and corresponding insert/remove event flags are not modified. 1: following writes to 'Command data' register set OST event register in QEMU 2: following writes to 'Command data' register set OST status @@ -79,16 +97,53 @@ write access: other values: reserved [0x6-0x7] reserved [0x8] Command data: (DWORD access) - current 'Command field' value: - 0: OSPM reads value of CPU selector + if last stored 'Command field' value: 1: stores value into OST event register 2: stores value into OST status register, triggers ACPI_DEVICE_OST QMP event from QEMU to external applications with current values of OST event and status registers. - other values: reserved + other values: reserved + +Typical usecases: + - (x86) Detecting and enabling modern CPU hotplug interface. + QEMU starts with legacy CPU hotplug interface enabled. Detecting and + switching to modern interface is based on the 2 legacy CPU hotplug features: + 1. Writes into CPU bitmap are ignored. + 2. CPU bitmap always has bit#0 set, corresponding to boot CPU. + + Use following steps to detect and enable modern CPU hotplug interface: + 1. Store 0x0 to the 'CPU selector' register, + attempting to switch to modern mode + 2. Store 0x0 to the 'CPU selector' register, + to ensure valid selector value + 3. Store 0x0 to the 'Command field' register, + 4. Read the 'Command data 2' register. + If read value is 0x0, the modern interface is enabled. + Otherwise legacy or no CPU hotplug interface available + + - Get a cpu with pending event + 1. Store 0x0 to the 'CPU selector' register. + 2. Store 0x0 to the 'Command field' register. + 3. Read the 'CPU device status fields' register. + 4. If both bit#1 and bit#2 are clear in the value read, there is no CPU + with a pending event and selected CPU remains unchanged. + 5. Otherwise, read the 'Command data' register. The value read is the + selector of the CPU with the pending event (which is already + selected). 
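The two use cases above map onto a short register access sequence. The sketch below is illustrative C only (it is not QEMU source and not a real guest driver): the port-I/O helpers and the privileged x86 user-space framing are assumptions made for the example, while the base address (ICH9-LPC IO port 0x0cd8) and the register offsets are the ones documented above. A real OSPM performs the equivalent accesses from AML, but the register-level sequence is the same.

  /*
   * Illustrative sketch, not QEMU code: drive the modern CPU hotplug
   * register block from a privileged x86 user-space test program.
   * outb/outl/inb/inl come from <sys/io.h> (ioperm/iopl required).
   */
  #include <stdbool.h>
  #include <stdint.h>
  #include <sys/io.h>

  #define CPUHP_BASE       0x0cd8              /* ICH9-LPC base from this spec */
  #define REG_SELECTOR     (CPUHP_BASE + 0x0)  /* write: 'CPU selector' (DWORD)    */
  #define REG_CMD_DATA2    (CPUHP_BASE + 0x0)  /* read:  'Command data 2' (DWORD)  */
  #define REG_STATUS       (CPUHP_BASE + 0x4)  /* read:  'CPU device status fields' */
  #define REG_CMD_FIELD    (CPUHP_BASE + 0x5)  /* write: 'Command field' (byte)    */
  #define REG_CMD_DATA     (CPUHP_BASE + 0x8)  /* read/write: 'Command data' (DWORD) */

  #define CMD_GET_PENDING  0x0   /* 'CPU with pending event[s]' command */

  /* Detection sequence from the first use case above. */
  static bool cpuhp_modern_interface(void)
  {
      outl(0x0, REG_SELECTOR);               /* switch away from legacy mode   */
      outl(0x0, REG_SELECTOR);               /* ensure a valid selector value  */
      outb(CMD_GET_PENDING, REG_CMD_FIELD);
      /* Legacy bitmap always has bit#0 set (boot CPU), so it reads non-zero. */
      return inl(REG_CMD_DATA2) == 0x0;
  }

  /* 'Get a cpu with pending event': returns its selector, or -1 if none. */
  static int cpuhp_pending_cpu(void)
  {
      uint8_t status;

      outl(0x0, REG_SELECTOR);
      outb(CMD_GET_PENDING, REG_CMD_FIELD);
      status = inb(REG_STATUS);
      if (!(status & ((1u << 1) | (1u << 2)))) {  /* no insert/remove event */
          return -1;
      }
      return (int)inl(REG_CMD_DATA);  /* the CPU with the event is left selected */
  }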
-Selecting CPU device beyond possible range has no effect on platform: - - write accesses to CPU hot-plug registers not documented above are - ignored - - read accesses to CPU hot-plug registers not documented above return - all bits set to 0. + - Enumerate CPUs present/non present CPUs + 01. Set the present CPU count to 0. + 02. Set the iterator to 0. + 03. Store 0x0 to the 'CPU selector' register, to ensure that it's in + a valid state and that access to other registers won't be ignored. + 04. Store 0x0 to the 'Command field' register to make 'Command data' + register return 'CPU selector' value of selected CPU + 05. Read the 'CPU device status fields' register. + 06. If bit#0 is set, increment the present CPU count. + 07. Increment the iterator. + 08. Store the iterator to the 'CPU selector' register. + 09. Read the 'Command data' register. + 10. If the value read is not zero, goto 05. + 11. Otherwise store 0x0 to the 'CPU selector' register, to put it + into a valid state and exit. + The iterator at this point equals "max_cpus". diff --git a/docs/system/conf.py b/docs/system/conf.py new file mode 100644 index 0000000000..7ca115f5e0 --- /dev/null +++ b/docs/system/conf.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# +# QEMU documentation build configuration file for the 'system' manual. +# +# This includes the top level conf file and then makes any necessary tweaks. +import sys +import os + +qemu_docdir = os.path.abspath("..") +parent_config = os.path.join(qemu_docdir, "conf.py") +exec(compile(open(parent_config, "rb").read(), parent_config, 'exec')) + +# This slightly misuses the 'description', but is the best way to get +# the manual title to appear in the sidebar. +html_theme_options['description'] = u'System Emulation User''s Guide' +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('qemu-block-drivers', 'qemu-block-drivers', + u'QEMU block drivers reference', + ['Fabrice Bellard and the QEMU Project developers'], 7) +] diff --git a/docs/system/index.rst b/docs/system/index.rst new file mode 100644 index 0000000000..f66e6ea585 --- /dev/null +++ b/docs/system/index.rst @@ -0,0 +1,17 @@ +.. This is the top level page for the 'system' manual. + + +QEMU System Emulation User's Guide +================================== + +This manual is the overall guide for users using QEMU +for full system emulation (as opposed to user-mode emulation). +This includes working with hypervisors such as KVM, Xen, Hax +or Hypervisor.Framework. + +Contents: + +.. toctree:: + :maxdepth: 2 + + qemu-block-drivers diff --git a/docs/system/qemu-block-drivers.rst b/docs/system/qemu-block-drivers.rst new file mode 100644 index 0000000000..388adbefbf --- /dev/null +++ b/docs/system/qemu-block-drivers.rst @@ -0,0 +1,985 @@ +QEMU block drivers reference +============================ + +.. |qemu_system| replace:: qemu-system-x86_64 + +.. + We put the 'Synopsis' and 'See also' sections into the manpage, but not + the HTML. This makes the HTML docs read better and means the ToC in + the index has a more useful set of entries. Ideally, the section + headings 'Disk image file formats' would be top-level headings for + the HTML, but sub-headings of the conventional manpage 'Description' + header for the manpage. 
Unfortunately, due to deficiencies in + the Sphinx 'only' directive, this isn't possible: they must be headers + at the same level as 'Synopsis' and 'See also', otherwise Sphinx's + identification of which header underline style is which gets confused. + +.. only:: man + + Synopsis + -------- + + QEMU block driver reference manual + +Disk image file formats +----------------------- + +QEMU supports many image file formats that can be used with VMs as well as with +any of the tools (like ``qemu-img``). This includes the preferred formats +raw and qcow2 as well as formats that are supported for compatibility with +older QEMU versions or other hypervisors. + +Depending on the image format, different options can be passed to +``qemu-img create`` and ``qemu-img convert`` using the ``-o`` option. +This section describes each format and the options that are supported for it. + +.. program:: image-formats +.. option:: raw + + Raw disk image format. This format has the advantage of + being simple and easily exportable to all other emulators. If your + file system supports *holes* (for example in ext2 or ext3 on + Linux or NTFS on Windows), then only the written sectors will reserve + space. Use ``qemu-img info`` to know the real size used by the + image or ``ls -ls`` on Unix/Linux. + + Supported options: + + .. program:: raw + .. option:: preallocation + + Preallocation mode (allowed values: ``off``, ``falloc``, + ``full``). ``falloc`` mode preallocates space for image by + calling ``posix_fallocate()``. ``full`` mode preallocates space + for image by writing data to underlying storage. This data may or + may not be zero, depending on the storage location. + +.. program:: image-formats +.. option:: qcow2 + + QEMU image format, the most versatile format. Use it to have smaller + images (useful if your filesystem does not supports holes, for example + on Windows), zlib based compression and support of multiple VM + snapshots. + + Supported options: + + .. program:: qcow2 + .. option:: compat + + Determines the qcow2 version to use. ``compat=0.10`` uses the + traditional image format that can be read by any QEMU since 0.10. + ``compat=1.1`` enables image format extensions that only QEMU 1.1 and + newer understand (this is the default). Amongst others, this includes + zero clusters, which allow efficient copy-on-read for sparse images. + + .. option:: backing_file + + File name of a base image (see ``create`` subcommand) + + .. option:: backing_fmt + + Image format of the base image + + .. option:: encryption + + This option is deprecated and equivalent to ``encrypt.format=aes`` + + .. option:: encrypt.format + + If this is set to ``luks``, it requests that the qcow2 payload (not + qcow2 header) be encrypted using the LUKS format. The passphrase to + use to unlock the LUKS key slot is given by the ``encrypt.key-secret`` + parameter. LUKS encryption parameters can be tuned with the other + ``encrypt.*`` parameters. + + If this is set to ``aes``, the image is encrypted with 128-bit AES-CBC. + The encryption key is given by the ``encrypt.key-secret`` parameter. + This encryption format is considered to be flawed by modern cryptography + standards, suffering from a number of design problems: + + - The AES-CBC cipher is used with predictable initialization vectors based + on the sector number. This makes it vulnerable to chosen plaintext attacks + which can reveal the existence of encrypted data. + - The user passphrase is directly used as the encryption key. 
A poorly + chosen or short passphrase will compromise the security of the encryption. + - In the event of the passphrase being compromised there is no way to + change the passphrase to protect data in any qcow images. The files must + be cloned, using a different encryption passphrase in the new file. The + original file must then be securely erased using a program like shred, + though even this is ineffective with many modern storage technologies. + + The use of this is no longer supported in system emulators. Support only + remains in the command line utilities, for the purposes of data liberation + and interoperability with old versions of QEMU. The ``luks`` format + should be used instead. + + .. option:: encrypt.key-secret + + Provides the ID of a ``secret`` object that contains the passphrase + (``encrypt.format=luks``) or encryption key (``encrypt.format=aes``). + + .. option:: encrypt.cipher-alg + + Name of the cipher algorithm and key length. Currently defaults + to ``aes-256``. Only used when ``encrypt.format=luks``. + + .. option:: encrypt.cipher-mode + + Name of the encryption mode to use. Currently defaults to ``xts``. + Only used when ``encrypt.format=luks``. + + .. option:: encrypt.ivgen-alg + + Name of the initialization vector generator algorithm. Currently defaults + to ``plain64``. Only used when ``encrypt.format=luks``. + + .. option:: encrypt.ivgen-hash-alg + + Name of the hash algorithm to use with the initialization vector generator + (if required). Defaults to ``sha256``. Only used when ``encrypt.format=luks``. + + .. option:: encrypt.hash-alg + + Name of the hash algorithm to use for PBKDF algorithm + Defaults to ``sha256``. Only used when ``encrypt.format=luks``. + + .. option:: encrypt.iter-time + + Amount of time, in milliseconds, to use for PBKDF algorithm per key slot. + Defaults to ``2000``. Only used when ``encrypt.format=luks``. + + .. option:: cluster_size + + Changes the qcow2 cluster size (must be between 512 and 2M). Smaller cluster + sizes can improve the image file size whereas larger cluster sizes generally + provide better performance. + + .. option:: preallocation + + Preallocation mode (allowed values: ``off``, ``metadata``, ``falloc``, + ``full``). An image with preallocated metadata is initially larger but can + improve performance when the image needs to grow. ``falloc`` and ``full`` + preallocations are like the same options of ``raw`` format, but sets up + metadata also. + + .. option:: lazy_refcounts + + If this option is set to ``on``, reference count updates are postponed with + the goal of avoiding metadata I/O and improving performance. This is + particularly interesting with :option:`cache=writethrough` which doesn't batch + metadata updates. The tradeoff is that after a host crash, the reference count + tables must be rebuilt, i.e. on the next open an (automatic) ``qemu-img + check -r all`` is required, which may take some time. + + This option can only be enabled if ``compat=1.1`` is specified. + + .. option:: nocow + + If this option is set to ``on``, it will turn off COW of the file. It's only + valid on btrfs, no effect on other file systems. + + Btrfs has low performance when hosting a VM image file, even more + when the guest on the VM also using btrfs as file system. Turning off + COW is a way to mitigate this bad performance. Generally there are two + ways to turn off COW on btrfs: + + - Disable it by mounting with nodatacow, then all newly created files + will be NOCOW. + - For an empty file, add the NOCOW file attribute. 
That's what this + option does. + + Note: this option is only valid to new or empty files. If there is + an existing file which is COW and has data blocks already, it couldn't + be changed to NOCOW by setting ``nocow=on``. One can issue ``lsattr + filename`` to check if the NOCOW flag is set or not (Capital 'C' is + NOCOW flag). + +.. program:: image-formats +.. option:: qed + + Old QEMU image format with support for backing files and compact image files + (when your filesystem or transport medium does not support holes). + + When converting QED images to qcow2, you might want to consider using the + ``lazy_refcounts=on`` option to get a more QED-like behaviour. + + Supported options: + + .. program:: qed + .. option:: backing_file + + File name of a base image (see ``create`` subcommand). + + .. option:: backing_fmt + + Image file format of backing file (optional). Useful if the format cannot be + autodetected because it has no header, like some vhd/vpc files. + + .. option:: cluster_size + + Changes the cluster size (must be power-of-2 between 4K and 64K). Smaller + cluster sizes can improve the image file size whereas larger cluster sizes + generally provide better performance. + + .. option:: table_size + + Changes the number of clusters per L1/L2 table (must be + power-of-2 between 1 and 16). There is normally no need to + change this value but this option can between used for + performance benchmarking. + +.. program:: image-formats +.. option:: qcow + + Old QEMU image format with support for backing files, compact image files, + encryption and compression. + + Supported options: + + .. program:: qcow + .. option:: backing_file + + File name of a base image (see ``create`` subcommand) + + .. option:: encryption + + This option is deprecated and equivalent to ``encrypt.format=aes`` + + .. option:: encrypt.format + + If this is set to ``aes``, the image is encrypted with 128-bit AES-CBC. + The encryption key is given by the ``encrypt.key-secret`` parameter. + This encryption format is considered to be flawed by modern cryptography + standards, suffering from a number of design problems enumerated previously + against the ``qcow2`` image format. + + The use of this is no longer supported in system emulators. Support only + remains in the command line utilities, for the purposes of data liberation + and interoperability with old versions of QEMU. + + Users requiring native encryption should use the ``qcow2`` format + instead with ``encrypt.format=luks``. + + .. option:: encrypt.key-secret + + Provides the ID of a ``secret`` object that contains the encryption + key (``encrypt.format=aes``). + +.. program:: image-formats +.. option:: luks + + LUKS v1 encryption format, compatible with Linux dm-crypt/cryptsetup + + Supported options: + + .. program:: luks + .. option:: key-secret + + Provides the ID of a ``secret`` object that contains the passphrase. + + .. option:: cipher-alg + + Name of the cipher algorithm and key length. Currently defaults + to ``aes-256``. + + .. option:: cipher-mode + + Name of the encryption mode to use. Currently defaults to ``xts``. + + .. option:: ivgen-alg + + Name of the initialization vector generator algorithm. Currently defaults + to ``plain64``. + + .. option:: ivgen-hash-alg + + Name of the hash algorithm to use with the initialization vector generator + (if required). Defaults to ``sha256``. + + .. option:: hash-alg + + Name of the hash algorithm to use for PBKDF algorithm + Defaults to ``sha256``. + + .. 
option:: iter-time + + Amount of time, in milliseconds, to use for PBKDF algorithm per key slot. + Defaults to ``2000``. + +.. program:: image-formats +.. option:: vdi + + VirtualBox 1.1 compatible image format. + + Supported options: + + .. program:: vdi + .. option:: static + + If this option is set to ``on``, the image is created with metadata + preallocation. + +.. program:: image-formats +.. option:: vmdk + + VMware 3 and 4 compatible image format. + + Supported options: + + .. program: vmdk + .. option:: backing_file + + File name of a base image (see ``create`` subcommand). + + .. option:: compat6 + + Create a VMDK version 6 image (instead of version 4) + + .. option:: hwversion + + Specify vmdk virtual hardware version. Compat6 flag cannot be enabled + if hwversion is specified. + + .. option:: subformat + + Specifies which VMDK subformat to use. Valid options are + ``monolithicSparse`` (default), + ``monolithicFlat``, + ``twoGbMaxExtentSparse``, + ``twoGbMaxExtentFlat`` and + ``streamOptimized``. + +.. program:: image-formats +.. option:: vpc + + VirtualPC compatible image format (VHD). + + Supported options: + + .. program:: vpc + .. option:: subformat + + Specifies which VHD subformat to use. Valid options are + ``dynamic`` (default) and ``fixed``. + +.. program:: image-formats +.. option:: VHDX + + Hyper-V compatible image format (VHDX). + + Supported options: + + .. program:: VHDX + .. option:: subformat + + Specifies which VHDX subformat to use. Valid options are + ``dynamic`` (default) and ``fixed``. + + .. option:: block_state_zero + + Force use of payload blocks of type 'ZERO'. Can be set to ``on`` (default) + or ``off``. When set to ``off``, new blocks will be created as + ``PAYLOAD_BLOCK_NOT_PRESENT``, which means parsers are free to return + arbitrary data for those blocks. Do not set to ``off`` when using + ``qemu-img convert`` with ``subformat=dynamic``. + + .. option:: block_size + + Block size; min 1 MB, max 256 MB. 0 means auto-calculate based on + image size. + + .. option:: log_size + + Log size; min 1 MB. + +Read-only formats +----------------- + +More disk image file formats are supported in a read-only mode. + +.. program:: image-formats +.. option:: bochs + + Bochs images of ``growing`` type. + +.. program:: image-formats +.. option:: cloop + + Linux Compressed Loop image, useful only to reuse directly compressed + CD-ROM images present for example in the Knoppix CD-ROMs. + +.. program:: image-formats +.. option:: dmg + + Apple disk image. + +.. program:: image-formats +.. option:: parallels + + Parallels disk image format. + +Using host drives +----------------- + +In addition to disk image files, QEMU can directly access host +devices. We describe here the usage for QEMU version >= 0.8.3. + +Linux +''''' + +On Linux, you can directly use the host device filename instead of a +disk image filename provided you have enough privileges to access +it. For example, use ``/dev/cdrom`` to access to the CDROM. + +CD + You can specify a CDROM device even if no CDROM is loaded. QEMU has + specific code to detect CDROM insertion or removal. CDROM ejection by + the guest OS is supported. Currently only data CDs are supported. + +Floppy + You can specify a floppy device even if no floppy is loaded. Floppy + removal is currently not detected accurately (if you change floppy + without doing floppy access while the floppy is not loaded, the guest + OS will think that the same floppy is loaded). 
+ Use of the host's floppy device is deprecated, and support for it will + be removed in a future release. + +Hard disks + Hard disks can be used. Normally you must specify the whole disk + (``/dev/hdb`` instead of ``/dev/hdb1``) so that the guest OS can + see it as a partitioned disk. WARNING: unless you know what you do, it + is better to only make READ-ONLY accesses to the hard disk otherwise + you may corrupt your host data (use the ``-snapshot`` command + line option or modify the device permissions accordingly). + +Windows +''''''' + +CD + The preferred syntax is the drive letter (e.g. ``d:``). The + alternate syntax ``\\.\d:`` is supported. ``/dev/cdrom`` is + supported as an alias to the first CDROM drive. + + Currently there is no specific code to handle removable media, so it + is better to use the ``change`` or ``eject`` monitor commands to + change or eject media. + +Hard disks + Hard disks can be used with the syntax: ``\\.\PhysicalDriveN`` + where *N* is the drive number (0 is the first hard disk). + + WARNING: unless you know what you do, it is better to only make + READ-ONLY accesses to the hard disk otherwise you may corrupt your + host data (use the ``-snapshot`` command line so that the + modifications are written in a temporary file). + +Mac OS X +'''''''' + +``/dev/cdrom`` is an alias to the first CDROM. + +Currently there is no specific code to handle removable media, so it +is better to use the ``change`` or ``eject`` monitor commands to +change or eject media. + +Virtual FAT disk images +----------------------- + +QEMU can automatically create a virtual FAT disk image from a +directory tree. In order to use it, just type: + +.. parsed-literal:: + + |qemu_system| linux.img -hdb fat:/my_directory + +Then you access access to all the files in the ``/my_directory`` +directory without having to copy them in a disk image or to export +them via SAMBA or NFS. The default access is *read-only*. + +Floppies can be emulated with the ``:floppy:`` option: + +.. parsed-literal:: + + |qemu_system| linux.img -fda fat:floppy:/my_directory + +A read/write support is available for testing (beta stage) with the +``:rw:`` option: + +.. parsed-literal:: + + |qemu_system| linux.img -fda fat:floppy:rw:/my_directory + +What you should *never* do: + +- use non-ASCII filenames +- use "-snapshot" together with ":rw:" +- expect it to work when loadvm'ing +- write to the FAT directory on the host system while accessing it with the guest system + +NBD access +---------- + +QEMU can access directly to block device exported using the Network Block Device +protocol. + +.. parsed-literal:: + + |qemu_system| linux.img -hdb nbd://my_nbd_server.mydomain.org:1024/ + +If the NBD server is located on the same host, you can use an unix socket instead +of an inet socket: + +.. parsed-literal:: + + |qemu_system| linux.img -hdb nbd+unix://?socket=/tmp/my_socket + +In this case, the block device must be exported using qemu-nbd: + +.. parsed-literal:: + + qemu-nbd --socket=/tmp/my_socket my_disk.qcow2 + +The use of qemu-nbd allows sharing of a disk between several guests: + +.. parsed-literal:: + + qemu-nbd --socket=/tmp/my_socket --share=2 my_disk.qcow2 + +and then you can use it with two guests: + +.. parsed-literal:: + + |qemu_system| linux1.img -hdb nbd+unix://?socket=/tmp/my_socket + |qemu_system| linux2.img -hdb nbd+unix://?socket=/tmp/my_socket + +If the nbd-server uses named exports (supported since NBD 2.9.18, or with QEMU's +own embedded NBD server), you must specify an export name in the URI: + +.. 
parsed-literal:: + + |qemu_system| -cdrom nbd://localhost/debian-500-ppc-netinst + |qemu_system| -cdrom nbd://localhost/openSUSE-11.1-ppc-netinst + +The URI syntax for NBD is supported since QEMU 1.3. An alternative syntax is +also available. Here are some example of the older syntax: + +.. parsed-literal:: + + |qemu_system| linux.img -hdb nbd:my_nbd_server.mydomain.org:1024 + |qemu_system| linux2.img -hdb nbd:unix:/tmp/my_socket + |qemu_system| -cdrom nbd:localhost:10809:exportname=debian-500-ppc-netinst + + + +Sheepdog disk images +-------------------- + +Sheepdog is a distributed storage system for QEMU. It provides highly +available block level storage volumes that can be attached to +QEMU-based virtual machines. + +You can create a Sheepdog disk image with the command: + +.. parsed-literal:: + + qemu-img create sheepdog:///IMAGE SIZE + +where *IMAGE* is the Sheepdog image name and *SIZE* is its +size. + +To import the existing *FILENAME* to Sheepdog, you can use a +convert command. + +.. parsed-literal:: + + qemu-img convert FILENAME sheepdog:///IMAGE + +You can boot from the Sheepdog disk image with the command: + +.. parsed-literal:: + + |qemu_system| sheepdog:///IMAGE + +You can also create a snapshot of the Sheepdog image like qcow2. + +.. parsed-literal:: + + qemu-img snapshot -c TAG sheepdog:///IMAGE + +where *TAG* is a tag name of the newly created snapshot. + +To boot from the Sheepdog snapshot, specify the tag name of the +snapshot. + +.. parsed-literal:: + + |qemu_system| sheepdog:///IMAGE#TAG + +You can create a cloned image from the existing snapshot. + +.. parsed-literal:: + + qemu-img create -b sheepdog:///BASE#TAG sheepdog:///IMAGE + +where *BASE* is an image name of the source snapshot and *TAG* +is its tag name. + +You can use an unix socket instead of an inet socket: + +.. parsed-literal:: + + |qemu_system| sheepdog+unix:///IMAGE?socket=PATH + +If the Sheepdog daemon doesn't run on the local host, you need to +specify one of the Sheepdog servers to connect to. + +.. parsed-literal:: + + qemu-img create sheepdog://HOSTNAME:PORT/IMAGE SIZE + |qemu_system| sheepdog://HOSTNAME:PORT/IMAGE + +iSCSI LUNs +---------- + +iSCSI is a popular protocol used to access SCSI devices across a computer +network. + +There are two different ways iSCSI devices can be used by QEMU. + +The first method is to mount the iSCSI LUN on the host, and make it appear as +any other ordinary SCSI device on the host and then to access this device as a +/dev/sd device from QEMU. How to do this differs between host OSes. + +The second method involves using the iSCSI initiator that is built into +QEMU. This provides a mechanism that works the same way regardless of which +host OS you are running QEMU on. This section will describe this second method +of using iSCSI together with QEMU. + +In QEMU, iSCSI devices are described using special iSCSI URLs. URL syntax: + +:: + + iscsi://[<username>[%<password>]@]<host>[:<port>]/<target-iqn-name>/<lun> + +Username and password are optional and only used if your target is set up +using CHAP authentication for access control. +Alternatively the username and password can also be set via environment +variables to have these not show up in the process list: + +:: + + export LIBISCSI_CHAP_USERNAME=<username> + export LIBISCSI_CHAP_PASSWORD=<password> + iscsi://<host>/<target-iqn-name>/<lun> + +Various session related parameters can be set via special options, either +in a configuration file provided via '-readconfig' or directly on the +command line. 
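For example, assuming a hypothetical portal at ``192.0.2.1`` exporting LUN 1 of
target ``iqn.2001-04.com.example:storage`` (both names are placeholders), the
CHAP credentials can be supplied through the environment so that they never
appear in the process list:

.. parsed-literal::

    export LIBISCSI_CHAP_USERNAME=myuser
    export LIBISCSI_CHAP_PASSWORD=mysecret
    |qemu_system| -drive file=iscsi://192.0.2.1/iqn.2001-04.com.example:storage/1

The session options described next (initiator name, header digest, and so on)
can be combined with this form directly on the command line or collected in a
``-readconfig`` file.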
+ +If the initiator-name is not specified qemu will use a default name +of 'iqn.2008-11.org.linux-kvm[:<uuid>'] where <uuid> is the UUID of the +virtual machine. If the UUID is not specified qemu will use +'iqn.2008-11.org.linux-kvm[:<name>'] where <name> is the name of the +virtual machine. + +Setting a specific initiator name to use when logging in to the target: + +:: + + -iscsi initiator-name=iqn.qemu.test:my-initiator + +Controlling which type of header digest to negotiate with the target: + +:: + + -iscsi header-digest=CRC32C|CRC32C-NONE|NONE-CRC32C|NONE + +These can also be set via a configuration file: + +:: + + [iscsi] + user = "CHAP username" + password = "CHAP password" + initiator-name = "iqn.qemu.test:my-initiator" + # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE + header-digest = "CRC32C" + +Setting the target name allows different options for different targets: + +:: + + [iscsi "iqn.target.name"] + user = "CHAP username" + password = "CHAP password" + initiator-name = "iqn.qemu.test:my-initiator" + # header digest is one of CRC32C|CRC32C-NONE|NONE-CRC32C|NONE + header-digest = "CRC32C" + +How to use a configuration file to set iSCSI configuration options: + +.. parsed-literal:: + + cat >iscsi.conf <<EOF + [iscsi] + user = "me" + password = "my password" + initiator-name = "iqn.qemu.test:my-initiator" + header-digest = "CRC32C" + EOF + + |qemu_system| -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \\ + -readconfig iscsi.conf + +How to set up a simple iSCSI target on loopback and access it via QEMU: +this example shows how to set up an iSCSI target with one CDROM and one DISK +using the Linux STGT software target. This target is available on Red Hat based +systems as the package 'scsi-target-utils'. + +.. parsed-literal:: + + tgtd --iscsi portal=127.0.0.1:3260 + tgtadm --lld iscsi --op new --mode target --tid 1 -T iqn.qemu.test + tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 1 \\ + -b /IMAGES/disk.img --device-type=disk + tgtadm --lld iscsi --mode logicalunit --op new --tid 1 --lun 2 \\ + -b /IMAGES/cd.iso --device-type=cd + tgtadm --lld iscsi --op bind --mode target --tid 1 -I ALL + + |qemu_system| -iscsi initiator-name=iqn.qemu.test:my-initiator \\ + -boot d -drive file=iscsi://127.0.0.1/iqn.qemu.test/1 \\ + -cdrom iscsi://127.0.0.1/iqn.qemu.test/2 + +GlusterFS disk images +--------------------- + +GlusterFS is a user space distributed file system. + +You can boot from the GlusterFS disk image with the command: + +URI: + +.. parsed-literal:: + + |qemu_system| -drive file=gluster[+TYPE]://[HOST}[:PORT]]/VOLUME/PATH + [?socket=...][,file.debug=9][,file.logfile=...] + +JSON: + +.. parsed-literal:: + + |qemu_system| 'json:{"driver":"qcow2", + "file":{"driver":"gluster", + "volume":"testvol","path":"a.img","debug":9,"logfile":"...", + "server":[{"type":"tcp","host":"...","port":"..."}, + {"type":"unix","socket":"..."}]}}' + +*gluster* is the protocol. + +*TYPE* specifies the transport type used to connect to gluster +management daemon (glusterd). Valid transport types are +tcp and unix. In the URI form, if a transport type isn't specified, +then tcp type is assumed. + +*HOST* specifies the server where the volume file specification for +the given volume resides. This can be either a hostname or an ipv4 address. +If transport type is unix, then *HOST* field should not be specified. +Instead *socket* field needs to be populated with the path to unix domain +socket. + +*PORT* is the port number on which glusterd is listening. 
This is optional +and if not specified, it defaults to port 24007. If the transport type is unix, +then *PORT* should not be specified. + +*VOLUME* is the name of the gluster volume which contains the disk image. + +*PATH* is the path to the actual disk image that resides on gluster volume. + +*debug* is the logging level of the gluster protocol driver. Debug levels +are 0-9, with 9 being the most verbose, and 0 representing no debugging output. +The default level is 4. The current logging levels defined in the gluster source +are 0 - None, 1 - Emergency, 2 - Alert, 3 - Critical, 4 - Error, 5 - Warning, +6 - Notice, 7 - Info, 8 - Debug, 9 - Trace + +*logfile* is a commandline option to mention log file path which helps in +logging to the specified file and also help in persisting the gfapi logs. The +default is stderr. + +You can create a GlusterFS disk image with the command: + +.. parsed-literal:: + + qemu-img create gluster://HOST/VOLUME/PATH SIZE + +Examples + +.. parsed-literal:: + + |qemu_system| -drive file=gluster://1.2.3.4/testvol/a.img + |qemu_system| -drive file=gluster+tcp://1.2.3.4/testvol/a.img + |qemu_system| -drive file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img + |qemu_system| -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img + |qemu_system| -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img + |qemu_system| -drive file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img + |qemu_system| -drive file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket + |qemu_system| -drive file=gluster+rdma://1.2.3.4:24007/testvol/a.img + |qemu_system| -drive file=gluster://1.2.3.4/testvol/a.img,file.debug=9,file.logfile=/var/log/qemu-gluster.log + |qemu_system| 'json:{"driver":"qcow2", + "file":{"driver":"gluster", + "volume":"testvol","path":"a.img", + "debug":9,"logfile":"/var/log/qemu-gluster.log", + "server":[{"type":"tcp","host":"1.2.3.4","port":24007}, + {"type":"unix","socket":"/var/run/glusterd.socket"}]}}' + |qemu_system| -drive driver=qcow2,file.driver=gluster,file.volume=testvol,file.path=/path/a.img, + file.debug=9,file.logfile=/var/log/qemu-gluster.log, + file.server.0.type=tcp,file.server.0.host=1.2.3.4,file.server.0.port=24007, + file.server.1.type=unix,file.server.1.socket=/var/run/glusterd.socket + +Secure Shell (ssh) disk images +------------------------------ + +You can access disk images located on a remote ssh server +by using the ssh protocol: + +.. parsed-literal:: + + |qemu_system| -drive file=ssh://[USER@]SERVER[:PORT]/PATH[?host_key_check=HOST_KEY_CHECK] + +Alternative syntax using properties: + +.. parsed-literal:: + + |qemu_system| -drive file.driver=ssh[,file.user=USER],file.host=SERVER[,file.port=PORT],file.path=PATH[,file.host_key_check=HOST_KEY_CHECK] + +*ssh* is the protocol. + +*USER* is the remote user. If not specified, then the local +username is tried. + +*SERVER* specifies the remote ssh server. Any ssh server can be +used, but it must implement the sftp-server protocol. Most Unix/Linux +systems should work without requiring any extra configuration. + +*PORT* is the port number on which sshd is listening. By default +the standard ssh port (22) is used. + +*PATH* is the path to the disk image. + +The optional *HOST_KEY_CHECK* parameter controls how the remote +host's key is checked. The default is ``yes`` which means to use +the local ``.ssh/known_hosts`` file. Setting this to ``no`` +turns off known-hosts checking. 
Or you can check that the host key +matches a specific fingerprint: +``host_key_check=md5:78:45:8e:14:57:4f:d5:45:83:0a:0e:f3:49:82:c9:c8`` +(``sha1:`` can also be used as a prefix, but note that OpenSSH +tools only use MD5 to print fingerprints). + +Currently authentication must be done using ssh-agent. Other +authentication methods may be supported in future. + +Note: Many ssh servers do not support an ``fsync``-style operation. +The ssh driver cannot guarantee that disk flush requests are +obeyed, and this causes a risk of disk corruption if the remote +server or network goes down during writes. The driver will +print a warning when ``fsync`` is not supported: + +:: + + warning: ssh server ssh.example.com:22 does not support fsync + +With sufficiently new versions of libssh and OpenSSH, ``fsync`` is +supported. + +NVMe disk images +---------------- + +NVM Express (NVMe) storage controllers can be accessed directly by a userspace +driver in QEMU. This bypasses the host kernel file system and block layers +while retaining QEMU block layer functionalities, such as block jobs, I/O +throttling, image formats, etc. Disk I/O performance is typically higher than +with ``-drive file=/dev/sda`` using either thread pool or linux-aio. + +The controller will be exclusively used by the QEMU process once started. To be +able to share storage between multiple VMs and other applications on the host, +please use the file based protocols. + +Before starting QEMU, bind the host NVMe controller to the host vfio-pci +driver. For example: + +.. parsed-literal:: + + # modprobe vfio-pci + # lspci -n -s 0000:06:0d.0 + 06:0d.0 0401: 1102:0002 (rev 08) + # echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind + # echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id + + # |qemu_system| -drive file=nvme://HOST:BUS:SLOT.FUNC/NAMESPACE + +Alternative syntax using properties: + +.. parsed-literal:: + + |qemu_system| -drive file.driver=nvme,file.device=HOST:BUS:SLOT.FUNC,file.namespace=NAMESPACE + +*HOST*:*BUS*:*SLOT*.\ *FUNC* is the NVMe controller's PCI device +address on the host. + +*NAMESPACE* is the NVMe namespace number, starting from 1. + +Disk image file locking +----------------------- + +By default, QEMU tries to protect image files from unexpected concurrent +access, as long as it's supported by the block protocol driver and host +operating system. If multiple QEMU processes (including QEMU emulators and +utilities) try to open the same image with conflicting accessing modes, all but +the first one will get an error. + +This feature is currently supported by the file protocol on Linux with the Open +File Descriptor (OFD) locking API, and can be configured to fall back to POSIX +locking if the POSIX host doesn't support Linux OFD locking. + +To explicitly enable image locking, specify "locking=on" in the file protocol +driver options. If OFD locking is not possible, a warning will be printed and +the POSIX locking API will be used. In this case there is a risk that the lock +will get silently lost when doing hot plugging and block jobs, due to the +shortcomings of the POSIX locking API. + +QEMU transparently handles lock handover during shared storage migration. For +shared virtual disk images between multiple VMs, the "share-rw" device option +should be used. + +By default, the guest has exclusive write access to its disk image. If the +guest can safely share the disk image with other writers the +``-device ...,share-rw=on`` parameter can be used. 
This is only safe if +the guest is running software, such as a cluster file system, that +coordinates disk accesses to avoid corruption. + +Note that share-rw=on only declares the guest's ability to share the disk. +Some QEMU features, such as image file formats, require exclusive write access +to the disk image and this is unaffected by the share-rw=on option. + +Alternatively, locking can be fully disabled by "locking=off" block device +option. In the command line, the option is usually in the form of +"file.locking=off" as the protocol driver is normally placed as a "file" child +under a format driver. For example: + +:: + + -blockdev driver=qcow2,file.filename=/path/to/image,file.locking=off,file.driver=file + +To check if image locking is active, check the output of the "lslocks" command +on host and see if there are locks held by the QEMU process on the image file. +More than one byte could be locked by the QEMU instance, each byte of which +reflects a particular permission that is acquired or protected by the running +block driver. + +.. only:: man + + See also + -------- + + The HTML documentation of QEMU for more precise information and Linux + user mode emulator invocation. @@ -25,7 +25,7 @@ #include "cpu.h" #include "exec/exec-all.h" #include "exec/target_page.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "hw/qdev-core.h" #include "hw/qdev-properties.h" #if !defined(CONFIG_USER_ONLY) @@ -3895,7 +3895,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) uint8_t *host_startaddr = rb->host + start; - if ((uintptr_t)host_startaddr & (rb->page_size - 1)) { + if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) { error_report("ram_block_discard_range: Unaligned start address: %p", host_startaddr); goto err; @@ -3903,7 +3903,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) if ((start + length) <= rb->used_length) { bool need_madvise, need_fallocate; - if (length & (rb->page_size - 1)) { + if (!QEMU_IS_ALIGNED(length, rb->page_size)) { error_report("ram_block_discard_range: Unaligned length: %zx", length); goto err; diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c index 0d4de49dcf..aa1ab2590d 100644 --- a/fsdev/virtfs-proxy-helper.c +++ b/fsdev/virtfs-proxy-helper.c @@ -287,8 +287,7 @@ static int setugid(int uid, int gid, int *suid, int *sgid) *sgid = getegid(); if (setresgid(-1, gid, *sgid) == -1) { - retval = -errno; - goto err_out; + return -errno; } if (setresuid(-1, uid, *suid) == -1) { @@ -322,7 +321,6 @@ err_sgid: if (setresgid(-1, *sgid, *sgid) == -1) { abort(); } -err_out: return retval; } diff --git a/hw/9pfs/9p-local.c b/hw/9pfs/9p-local.c index ca641390fb..54e012e5b4 100644 --- a/hw/9pfs/9p-local.c +++ b/hw/9pfs/9p-local.c @@ -947,7 +947,7 @@ static int local_link(FsContext *ctx, V9fsPath *oldpath, if (ctx->export_flags & V9FS_SM_MAPPED_FILE && local_is_mapped_file_metadata(ctx, name)) { errno = EINVAL; - return -1; + goto out; } odirfd = local_opendir_nofollow(ctx, odirpath); @@ -1076,7 +1076,7 @@ out: static int local_unlinkat_common(FsContext *ctx, int dirfd, const char *name, int flags) { - int ret = -1; + int ret; if (ctx->export_flags & V9FS_SM_MAPPED_FILE) { int map_dirfd; @@ -1094,12 +1094,12 @@ static int local_unlinkat_common(FsContext *ctx, int dirfd, const char *name, fd = openat_dir(dirfd, name); if (fd == -1) { - goto err_out; + return -1; } ret = unlinkat(fd, VIRTFS_META_DIR, AT_REMOVEDIR); close_preserve_errno(fd); if (ret < 0 && errno != ENOENT) { - goto err_out; + return 
-1; } } map_dirfd = openat_dir(dirfd, VIRTFS_META_DIR); @@ -1107,16 +1107,14 @@ static int local_unlinkat_common(FsContext *ctx, int dirfd, const char *name, ret = unlinkat(map_dirfd, name, 0); close_preserve_errno(map_dirfd); if (ret < 0 && errno != ENOENT) { - goto err_out; + return -1; } } else if (errno != ENOENT) { - goto err_out; + return -1; } } - ret = unlinkat(dirfd, name, flags); -err_out: - return ret; + return unlinkat(dirfd, name, flags); } static int local_remove(FsContext *ctx, const char *path) diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 520177f40c..b0e445d6bd 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -2090,22 +2090,29 @@ out_nofid: * with qemu_iovec_destroy(). */ static void v9fs_init_qiov_from_pdu(QEMUIOVector *qiov, V9fsPDU *pdu, - size_t skip, size_t size, + size_t skip, size_t *size, bool is_write) { QEMUIOVector elem; struct iovec *iov; unsigned int niov; + size_t alloc_size = *size + skip; if (is_write) { - pdu->s->transport->init_out_iov_from_pdu(pdu, &iov, &niov, size + skip); + pdu->s->transport->init_out_iov_from_pdu(pdu, &iov, &niov, alloc_size); } else { - pdu->s->transport->init_in_iov_from_pdu(pdu, &iov, &niov, size + skip); + pdu->s->transport->init_in_iov_from_pdu(pdu, &iov, &niov, &alloc_size); + } + + if (alloc_size < skip) { + *size = 0; + } else { + *size = alloc_size - skip; } qemu_iovec_init_external(&elem, iov, niov); qemu_iovec_init(qiov, niov); - qemu_iovec_concat(qiov, &elem, skip, size); + qemu_iovec_concat(qiov, &elem, skip, *size); } static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp, @@ -2113,15 +2120,14 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp, { ssize_t err; size_t offset = 7; - uint64_t read_count; + size_t read_count; QEMUIOVector qiov_full; if (fidp->fs.xattr.len < off) { read_count = 0; - } else { + } else if (fidp->fs.xattr.len - off < max_count) { read_count = fidp->fs.xattr.len - off; - } - if (read_count > max_count) { + } else { read_count = max_count; } err = pdu_marshal(pdu, offset, "d", read_count); @@ -2130,7 +2136,7 @@ static int v9fs_xattr_read(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp, } offset += err; - v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, read_count, false); + v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, &read_count, false); err = v9fs_pack(qiov_full.iov, qiov_full.niov, 0, ((char *)fidp->fs.xattr.value) + off, read_count); @@ -2259,9 +2265,11 @@ static void coroutine_fn v9fs_read(void *opaque) QEMUIOVector qiov_full; QEMUIOVector qiov; int32_t len; + size_t size = max_count; - v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset + 4, max_count, false); + v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset + 4, &size, false); qemu_iovec_init(&qiov, qiov_full.niov); + max_count = size; do { qemu_iovec_reset(&qiov); qemu_iovec_concat(&qiov, &qiov_full, count, qiov_full.size - count); @@ -2464,8 +2472,7 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp, if (fidp->fs.xattr.len < off) { - err = -ENOSPC; - goto out; + return -ENOSPC; } write_count = fidp->fs.xattr.len - off; if (write_count > count) { @@ -2491,7 +2498,7 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp, off += to_copy; write_count -= to_copy; } -out: + return err; } @@ -2504,6 +2511,7 @@ static void coroutine_fn v9fs_write(void *opaque) int32_t len = 0; int32_t total = 0; size_t offset = 7; + size_t size; V9fsFidState *fidp; V9fsPDU *pdu = opaque; V9fsState *s = pdu->s; @@ -2516,7 +2524,9 @@ static void coroutine_fn 
v9fs_write(void *opaque) return; } offset += err; - v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, count, true); + size = count; + v9fs_init_qiov_from_pdu(&qiov_full, pdu, offset, &size, true); + count = size; trace_v9fs_write(pdu->tag, pdu->id, fid, off, count, qiov_full.niov); fidp = get_fid(pdu, fid); @@ -3056,8 +3066,7 @@ static int coroutine_fn v9fs_complete_rename(V9fsPDU *pdu, V9fsFidState *fidp, if (newdirfid != -1) { dirfidp = get_fid(pdu, newdirfid); if (dirfidp == NULL) { - err = -ENOENT; - goto out_nofid; + return -ENOENT; } if (fidp->fid_type != P9_FID_NONE) { err = -EINVAL; @@ -3100,7 +3109,6 @@ out: put_fid(pdu, dirfidp); } v9fs_path_free(&new_path); -out_nofid: return err; } diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h index 3904f82901..8d07a0b301 100644 --- a/hw/9pfs/9p.h +++ b/hw/9pfs/9p.h @@ -425,7 +425,7 @@ struct V9fsTransport { ssize_t (*pdu_vunmarshal)(V9fsPDU *pdu, size_t offset, const char *fmt, va_list ap); void (*init_in_iov_from_pdu)(V9fsPDU *pdu, struct iovec **piov, - unsigned int *pniov, size_t size); + unsigned int *pniov, size_t *size); void (*init_out_iov_from_pdu)(V9fsPDU *pdu, struct iovec **piov, unsigned int *pniov, size_t size); void (*push_and_notify)(V9fsPDU *pdu); diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c index b5a7c03f26..1d1c50409c 100644 --- a/hw/9pfs/virtio-9p-device.c +++ b/hw/9pfs/virtio-9p-device.c @@ -147,19 +147,22 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset, } static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov, - unsigned int *pniov, size_t size) + unsigned int *pniov, size_t *size) { V9fsState *s = pdu->s; V9fsVirtioState *v = container_of(s, V9fsVirtioState, state); VirtQueueElement *elem = v->elems[pdu->idx]; size_t buf_size = iov_size(elem->in_sg, elem->in_num); - if (buf_size < size) { + if (buf_size < P9_IOHDRSZ) { VirtIODevice *vdev = VIRTIO_DEVICE(v); virtio_error(vdev, - "VirtFS reply type %d needs %zu bytes, buffer has %zu", - pdu->id + 1, size, buf_size); + "VirtFS reply type %d needs %zu bytes, buffer has %zu, less than minimum", + pdu->id + 1, *size, buf_size); + } + if (buf_size < *size) { + *size = buf_size; } *piov = elem->in_sg; @@ -215,6 +218,7 @@ static void virtio_9p_device_unrealize(DeviceState *dev, Error **errp) V9fsVirtioState *v = VIRTIO_9P(dev); V9fsState *s = &v->state; + virtio_delete_queue(v->vq); virtio_cleanup(vdev); v9fs_device_unrealize_common(s, errp); } diff --git a/hw/9pfs/xen-9p-backend.c b/hw/9pfs/xen-9p-backend.c index 71eebe12dd..18fe5b7c92 100644 --- a/hw/9pfs/xen-9p-backend.c +++ b/hw/9pfs/xen-9p-backend.c @@ -187,7 +187,7 @@ static void xen_9pfs_init_out_iov_from_pdu(V9fsPDU *pdu, static void xen_9pfs_init_in_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov, unsigned int *pniov, - size_t size) + size_t *size) { Xen9pfsDev *xen_9pfs = container_of(pdu->s, Xen9pfsDev, state); Xen9pfsRing *ring = &xen_9pfs->rings[pdu->tag % xen_9pfs->num_rings]; @@ -197,16 +197,19 @@ static void xen_9pfs_init_in_iov_from_pdu(V9fsPDU *pdu, g_free(ring->sg); ring->sg = g_new0(struct iovec, 2); - xen_9pfs_in_sg(ring, ring->sg, &num, pdu->idx, size); + xen_9pfs_in_sg(ring, ring->sg, &num, pdu->idx, *size); buf_size = iov_size(ring->sg, num); - if (buf_size < size) { + if (buf_size < P9_IOHDRSZ) { xen_pv_printf(&xen_9pfs->xendev, 0, "Xen 9pfs request type %d" - "needs %zu bytes, buffer has %zu\n", pdu->id, size, - buf_size); + "needs %zu bytes, buffer has %zu, less than minimum\n", + pdu->id, *size, buf_size); xen_be_set_state(&xen_9pfs->xendev, 
XenbusStateClosing); xen_9pfs_disconnect(&xen_9pfs->xendev); } + if (buf_size < *size) { + *size = buf_size; + } *piov = ring->sg; *pniov = num; diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 87f30a31d7..e2c957ce00 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -12,11 +12,13 @@ #define ACPI_CPU_FLAGS_OFFSET_RW 4 #define ACPI_CPU_CMD_OFFSET_WR 5 #define ACPI_CPU_CMD_DATA_OFFSET_RW 8 +#define ACPI_CPU_CMD_DATA2_OFFSET_R 0 enum { CPHP_GET_NEXT_CPU_WITH_EVENT_CMD = 0, CPHP_OST_EVENT_CMD = 1, CPHP_OST_STATUS_CMD = 2, + CPHP_GET_CPU_ID_CMD = 3, CPHP_CMD_MAX }; @@ -74,11 +76,27 @@ static uint64_t cpu_hotplug_rd(void *opaque, hwaddr addr, unsigned size) case CPHP_GET_NEXT_CPU_WITH_EVENT_CMD: val = cpu_st->selector; break; + case CPHP_GET_CPU_ID_CMD: + val = cdev->arch_id & 0xFFFFFFFF; + break; default: break; } trace_cpuhp_acpi_read_cmd_data(cpu_st->selector, val); break; + case ACPI_CPU_CMD_DATA2_OFFSET_R: + switch (cpu_st->command) { + case CPHP_GET_NEXT_CPU_WITH_EVENT_CMD: + val = 0; + break; + case CPHP_GET_CPU_ID_CMD: + val = cdev->arch_id >> 32; + break; + default: + break; + } + trace_cpuhp_acpi_read_cmd_data2(cpu_st->selector, val); + break; default: break; } diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 9cee90cc70..55eb29d80a 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -175,7 +175,7 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev, AcpiGedState *s = ACPI_GED(hotplug_dev); if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { - acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); + acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); } else { error_setg(errp, "virt: device plug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events index 96b8273297..afbc77de1c 100644 --- a/hw/acpi/trace-events +++ b/hw/acpi/trace-events @@ -23,6 +23,7 @@ cpuhp_acpi_read_flags(uint32_t idx, uint8_t flags) "idx[0x%"PRIx32"] flags: 0x%" cpuhp_acpi_write_idx(uint32_t idx) "set active cpu idx: 0x%"PRIx32 cpuhp_acpi_write_cmd(uint32_t idx, uint8_t cmd) "idx[0x%"PRIx32"] cmd: 0x%"PRIx8 cpuhp_acpi_read_cmd_data(uint32_t idx, uint32_t data) "idx[0x%"PRIx32"] data: 0x%"PRIx32 +cpuhp_acpi_read_cmd_data2(uint32_t idx, uint32_t data) "idx[0x%"PRIx32"] data: 0x%"PRIx32 cpuhp_acpi_cpu_has_events(uint32_t idx, bool ins, bool rm) "idx[0x%"PRIx32"] inserting: %d, removing: %d" cpuhp_acpi_clear_inserting_evt(uint32_t idx) "idx[0x%"PRIx32"]" cpuhp_acpi_clear_remove_evt(uint32_t idx) "idx[0x%"PRIx32"]" diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig index c6e7782580..3d86691ae0 100644 --- a/hw/arm/Kconfig +++ b/hw/arm/Kconfig @@ -101,6 +101,10 @@ config NETDUINO2 bool select STM32F205_SOC +config NETDUINOPLUS2 + bool + select STM32F405_SOC + config NSERIES bool select OMAP @@ -307,6 +311,12 @@ config STM32F205_SOC select STM32F2XX_ADC select STM32F2XX_SPI +config STM32F405_SOC + bool + select ARM_V7M + select STM32F4XX_SYSCFG + select STM32F4XX_EXTI + config XLNX_ZYNQMP_ARM bool select AHCI diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs index fe749f65fd..336f6dd374 100644 --- a/hw/arm/Makefile.objs +++ b/hw/arm/Makefile.objs @@ -11,6 +11,7 @@ obj-$(CONFIG_MAINSTONE) += mainstone.o obj-$(CONFIG_MICROBIT) += microbit.o obj-$(CONFIG_MUSICPAL) += musicpal.o obj-$(CONFIG_NETDUINO2) += netduino2.o +obj-$(CONFIG_NETDUINOPLUS2) += netduinoplus2.o obj-$(CONFIG_NSERIES) += nseries.o obj-$(CONFIG_SX1) += omap_sx1.o 
obj-$(CONFIG_CHEETAH) += palm.o @@ -36,6 +37,7 @@ obj-$(CONFIG_STRONGARM) += strongarm.o obj-$(CONFIG_ALLWINNER_A10) += allwinner-a10.o cubieboard.o obj-$(CONFIG_RASPI) += bcm2835_peripherals.o bcm2836.o raspi.o obj-$(CONFIG_STM32F205_SOC) += stm32f205_soc.o +obj-$(CONFIG_STM32F405_SOC) += stm32f405_soc.o obj-$(CONFIG_XLNX_ZYNQMP_ARM) += xlnx-zynqmp.o xlnx-zcu102.o obj-$(CONFIG_XLNX_VERSAL) += xlnx-versal.o xlnx-versal-virt.o obj-$(CONFIG_FSL_IMX25) += fsl-imx25.o imx25_pdk.o diff --git a/hw/arm/allwinner-a10.c b/hw/arm/allwinner-a10.c index 118032c8c7..1cde165611 100644 --- a/hw/arm/allwinner-a10.c +++ b/hw/arm/allwinner-a10.c @@ -25,6 +25,12 @@ #include "hw/misc/unimp.h" #include "sysemu/sysemu.h" +#define AW_A10_PIC_REG_BASE 0x01c20400 +#define AW_A10_PIT_REG_BASE 0x01c20c00 +#define AW_A10_UART0_REG_BASE 0x01c28000 +#define AW_A10_EMAC_BASE 0x01c0b000 +#define AW_A10_SATA_BASE 0x01c18000 + static void aw_a10_init(Object *obj) { AwA10State *s = AW_A10(obj); @@ -49,8 +55,6 @@ static void aw_a10_realize(DeviceState *dev, Error **errp) { AwA10State *s = AW_A10(dev); SysBusDevice *sysbusdev; - uint8_t i; - qemu_irq fiq, irq; Error *err = NULL; object_property_set_bool(OBJECT(&s->cpu), true, "realized", &err); @@ -58,8 +62,6 @@ static void aw_a10_realize(DeviceState *dev, Error **errp) error_propagate(errp, err); return; } - irq = qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_IRQ); - fiq = qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_FIQ); object_property_set_bool(OBJECT(&s->intc), true, "realized", &err); if (err != NULL) { @@ -68,11 +70,11 @@ static void aw_a10_realize(DeviceState *dev, Error **errp) } sysbusdev = SYS_BUS_DEVICE(&s->intc); sysbus_mmio_map(sysbusdev, 0, AW_A10_PIC_REG_BASE); - sysbus_connect_irq(sysbusdev, 0, irq); - sysbus_connect_irq(sysbusdev, 1, fiq); - for (i = 0; i < AW_A10_PIC_INT_NR; i++) { - s->irq[i] = qdev_get_gpio_in(DEVICE(&s->intc), i); - } + sysbus_connect_irq(sysbusdev, 0, + qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_IRQ)); + sysbus_connect_irq(sysbusdev, 1, + qdev_get_gpio_in(DEVICE(&s->cpu), ARM_CPU_FIQ)); + qdev_pass_gpios(DEVICE(&s->intc), dev, NULL); object_property_set_bool(OBJECT(&s->timer), true, "realized", &err); if (err != NULL) { @@ -81,12 +83,12 @@ static void aw_a10_realize(DeviceState *dev, Error **errp) } sysbusdev = SYS_BUS_DEVICE(&s->timer); sysbus_mmio_map(sysbusdev, 0, AW_A10_PIT_REG_BASE); - sysbus_connect_irq(sysbusdev, 0, s->irq[22]); - sysbus_connect_irq(sysbusdev, 1, s->irq[23]); - sysbus_connect_irq(sysbusdev, 2, s->irq[24]); - sysbus_connect_irq(sysbusdev, 3, s->irq[25]); - sysbus_connect_irq(sysbusdev, 4, s->irq[67]); - sysbus_connect_irq(sysbusdev, 5, s->irq[68]); + sysbus_connect_irq(sysbusdev, 0, qdev_get_gpio_in(dev, 22)); + sysbus_connect_irq(sysbusdev, 1, qdev_get_gpio_in(dev, 23)); + sysbus_connect_irq(sysbusdev, 2, qdev_get_gpio_in(dev, 24)); + sysbus_connect_irq(sysbusdev, 3, qdev_get_gpio_in(dev, 25)); + sysbus_connect_irq(sysbusdev, 4, qdev_get_gpio_in(dev, 67)); + sysbus_connect_irq(sysbusdev, 5, qdev_get_gpio_in(dev, 68)); memory_region_init_ram(&s->sram_a, OBJECT(dev), "sram A", 48 * KiB, &error_fatal); @@ -105,7 +107,7 @@ static void aw_a10_realize(DeviceState *dev, Error **errp) } sysbusdev = SYS_BUS_DEVICE(&s->emac); sysbus_mmio_map(sysbusdev, 0, AW_A10_EMAC_BASE); - sysbus_connect_irq(sysbusdev, 0, s->irq[55]); + sysbus_connect_irq(sysbusdev, 0, qdev_get_gpio_in(dev, 55)); object_property_set_bool(OBJECT(&s->sata), true, "realized", &err); if (err) { @@ -113,10 +115,11 @@ static void aw_a10_realize(DeviceState *dev, 
Error **errp) return; } sysbus_mmio_map(SYS_BUS_DEVICE(&s->sata), 0, AW_A10_SATA_BASE); - sysbus_connect_irq(SYS_BUS_DEVICE(&s->sata), 0, s->irq[56]); + sysbus_connect_irq(SYS_BUS_DEVICE(&s->sata), 0, qdev_get_gpio_in(dev, 56)); /* FIXME use a qdev chardev prop instead of serial_hd() */ - serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2, s->irq[1], + serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2, + qdev_get_gpio_in(dev, 1), 115200, serial_hd(0), DEVICE_NATIVE_ENDIAN); } diff --git a/hw/arm/exynos4210.c b/hw/arm/exynos4210.c index 77fbe1baab..59a27bdd68 100644 --- a/hw/arm/exynos4210.c +++ b/hw/arm/exynos4210.c @@ -166,17 +166,37 @@ static uint64_t exynos4210_calc_affinity(int cpu) return (0x9 << ARM_AFF1_SHIFT) | cpu; } -static void pl330_create(uint32_t base, qemu_irq irq, int nreq) +static DeviceState *pl330_create(uint32_t base, qemu_or_irq *orgate, + qemu_irq irq, int nreq, int nevents, int width) { SysBusDevice *busdev; DeviceState *dev; + int i; dev = qdev_create(NULL, "pl330"); + qdev_prop_set_uint8(dev, "num_events", nevents); + qdev_prop_set_uint8(dev, "num_chnls", 8); qdev_prop_set_uint8(dev, "num_periph_req", nreq); + + qdev_prop_set_uint8(dev, "wr_cap", 4); + qdev_prop_set_uint8(dev, "wr_q_dep", 8); + qdev_prop_set_uint8(dev, "rd_cap", 4); + qdev_prop_set_uint8(dev, "rd_q_dep", 8); + qdev_prop_set_uint8(dev, "data_width", width); + qdev_prop_set_uint16(dev, "data_buffer_dep", width); qdev_init_nofail(dev); busdev = SYS_BUS_DEVICE(dev); sysbus_mmio_map(busdev, 0, base); - sysbus_connect_irq(busdev, 0, irq); + + object_property_set_int(OBJECT(orgate), nevents + 1, "num-lines", + &error_abort); + object_property_set_bool(OBJECT(orgate), true, "realized", &error_abort); + + for (i = 0; i < nevents + 1; i++) { + sysbus_connect_irq(busdev, i, qdev_get_gpio_in(DEVICE(orgate), i)); + } + qdev_connect_gpio_out(DEVICE(orgate), 0, irq); + return dev; } static void exynos4210_realize(DeviceState *socdev, Error **errp) @@ -185,7 +205,7 @@ static void exynos4210_realize(DeviceState *socdev, Error **errp) MemoryRegion *system_mem = get_system_memory(); qemu_irq gate_irq[EXYNOS4210_NCPUS][EXYNOS4210_IRQ_GATE_NINPUTS]; SysBusDevice *busdev; - DeviceState *dev; + DeviceState *dev, *uart[4], *pl330[3]; int i, n; for (n = 0; n < EXYNOS4210_NCPUS; n++) { @@ -371,19 +391,19 @@ static void exynos4210_realize(DeviceState *socdev, Error **errp) /*** UARTs ***/ - exynos4210_uart_create(EXYNOS4210_UART0_BASE_ADDR, + uart[0] = exynos4210_uart_create(EXYNOS4210_UART0_BASE_ADDR, EXYNOS4210_UART0_FIFO_SIZE, 0, serial_hd(0), s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 0)]); - exynos4210_uart_create(EXYNOS4210_UART1_BASE_ADDR, + uart[1] = exynos4210_uart_create(EXYNOS4210_UART1_BASE_ADDR, EXYNOS4210_UART1_FIFO_SIZE, 1, serial_hd(1), s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 1)]); - exynos4210_uart_create(EXYNOS4210_UART2_BASE_ADDR, + uart[2] = exynos4210_uart_create(EXYNOS4210_UART2_BASE_ADDR, EXYNOS4210_UART2_FIFO_SIZE, 2, serial_hd(2), s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 2)]); - exynos4210_uart_create(EXYNOS4210_UART3_BASE_ADDR, + uart[3] = exynos4210_uart_create(EXYNOS4210_UART3_BASE_ADDR, EXYNOS4210_UART3_FIFO_SIZE, 3, serial_hd(3), s->irq_table[exynos4210_get_irq(EXYNOS4210_UART_INT_GRP, 3)]); @@ -431,12 +451,42 @@ static void exynos4210_realize(DeviceState *socdev, Error **errp) s->irq_table[exynos4210_get_irq(28, 3)]); /*** DMA controllers ***/ - pl330_create(EXYNOS4210_PL330_BASE0_ADDR, - 
qemu_irq_invert(s->irq_table[exynos4210_get_irq(35, 1)]), 32); - pl330_create(EXYNOS4210_PL330_BASE1_ADDR, - qemu_irq_invert(s->irq_table[exynos4210_get_irq(36, 1)]), 32); - pl330_create(EXYNOS4210_PL330_BASE2_ADDR, - qemu_irq_invert(s->irq_table[exynos4210_get_irq(34, 1)]), 1); + pl330[0] = pl330_create(EXYNOS4210_PL330_BASE0_ADDR, + &s->pl330_irq_orgate[0], + s->irq_table[exynos4210_get_irq(21, 0)], + 32, 32, 32); + pl330[1] = pl330_create(EXYNOS4210_PL330_BASE1_ADDR, + &s->pl330_irq_orgate[1], + s->irq_table[exynos4210_get_irq(21, 1)], + 32, 32, 32); + pl330[2] = pl330_create(EXYNOS4210_PL330_BASE2_ADDR, + &s->pl330_irq_orgate[2], + s->irq_table[exynos4210_get_irq(20, 1)], + 1, 31, 64); + + sysbus_connect_irq(SYS_BUS_DEVICE(uart[0]), 1, + qdev_get_gpio_in(pl330[0], 15)); + sysbus_connect_irq(SYS_BUS_DEVICE(uart[1]), 1, + qdev_get_gpio_in(pl330[1], 15)); + sysbus_connect_irq(SYS_BUS_DEVICE(uart[2]), 1, + qdev_get_gpio_in(pl330[0], 17)); + sysbus_connect_irq(SYS_BUS_DEVICE(uart[3]), 1, + qdev_get_gpio_in(pl330[1], 17)); +} + +static void exynos4210_init(Object *obj) +{ + Exynos4210State *s = EXYNOS4210_SOC(obj); + int i; + + for (i = 0; i < ARRAY_SIZE(s->pl330_irq_orgate); i++) { + char *name = g_strdup_printf("pl330-irq-orgate%d", i); + qemu_or_irq *orgate = &s->pl330_irq_orgate[i]; + + object_initialize_child(obj, name, orgate, sizeof(*orgate), + TYPE_OR_IRQ, &error_abort, NULL); + g_free(name); + } } static void exynos4210_class_init(ObjectClass *klass, void *data) @@ -450,6 +500,7 @@ static const TypeInfo exynos4210_info = { .name = TYPE_EXYNOS4210_SOC, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(Exynos4210State), + .instance_init = exynos4210_init, .class_init = exynos4210_class_init, }; diff --git a/hw/arm/fsl-imx25.c b/hw/arm/fsl-imx25.c index 3cb5a8fdfd..da3471b395 100644 --- a/hw/arm/fsl-imx25.c +++ b/hw/arm/fsl-imx25.c @@ -62,6 +62,9 @@ static void fsl_imx25_init(Object *obj) sysbus_init_child_obj(obj, "fec", &s->fec, sizeof(s->fec), TYPE_IMX_FEC); + sysbus_init_child_obj(obj, "rngc", &s->rngc, sizeof(s->rngc), + TYPE_IMX_RNGC); + for (i = 0; i < FSL_IMX25_NUM_I2CS; i++) { sysbus_init_child_obj(obj, "i2c[*]", &s->i2c[i], sizeof(s->i2c[i]), TYPE_IMX_I2C); @@ -188,6 +191,14 @@ static void fsl_imx25_realize(DeviceState *dev, Error **errp) sysbus_connect_irq(SYS_BUS_DEVICE(&s->fec), 0, qdev_get_gpio_in(DEVICE(&s->avic), FSL_IMX25_FEC_IRQ)); + object_property_set_bool(OBJECT(&s->rngc), true, "realized", &err); + if (err) { + error_propagate(errp, err); + return; + } + sysbus_mmio_map(SYS_BUS_DEVICE(&s->rngc), 0, FSL_IMX25_RNGC_ADDR); + sysbus_connect_irq(SYS_BUS_DEVICE(&s->rngc), 0, + qdev_get_gpio_in(DEVICE(&s->avic), FSL_IMX25_RNGC_IRQ)); /* Initialize all I2C */ for (i = 0; i < FSL_IMX25_NUM_I2CS; i++) { diff --git a/hw/arm/netduinoplus2.c b/hw/arm/netduinoplus2.c new file mode 100644 index 0000000000..e5e247edbe --- /dev/null +++ b/hw/arm/netduinoplus2.c @@ -0,0 +1,52 @@ +/* + * Netduino Plus 2 Machine Model + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and 
this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/boards.h" +#include "hw/qdev-properties.h" +#include "qemu/error-report.h" +#include "hw/arm/stm32f405_soc.h" +#include "hw/arm/boot.h" + +static void netduinoplus2_init(MachineState *machine) +{ + DeviceState *dev; + + dev = qdev_create(NULL, TYPE_STM32F405_SOC); + qdev_prop_set_string(dev, "cpu-type", ARM_CPU_TYPE_NAME("cortex-m4")); + object_property_set_bool(OBJECT(dev), true, "realized", &error_fatal); + + armv7m_load_kernel(ARM_CPU(first_cpu), + machine->kernel_filename, + FLASH_SIZE); +} + +static void netduinoplus2_machine_init(MachineClass *mc) +{ + mc->desc = "Netduino Plus 2 Machine"; + mc->init = netduinoplus2_init; +} + +DEFINE_MACHINE("netduinoplus2", netduinoplus2_machine_init) diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c index b198066b54..bb025e0bd0 100644 --- a/hw/arm/stellaris.c +++ b/hw/arm/stellaris.c @@ -708,7 +708,7 @@ static int stellaris_sys_init(uint32_t base, qemu_irq irq, memory_region_init_io(&s->iomem, NULL, &ssys_ops, s, "ssys", 0x00001000); memory_region_add_subregion(get_system_memory(), base, &s->iomem); ssys_reset(s); - vmstate_register(NULL, -1, &vmstate_stellaris_sys, s); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_stellaris_sys, s); return 0; } diff --git a/hw/arm/stm32f405_soc.c b/hw/arm/stm32f405_soc.c new file mode 100644 index 0000000000..f22516fdf7 --- /dev/null +++ b/hw/arm/stm32f405_soc.c @@ -0,0 +1,302 @@ +/* + * STM32F405 SoC + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "exec/address-spaces.h" +#include "sysemu/sysemu.h" +#include "hw/arm/stm32f405_soc.h" +#include "hw/misc/unimp.h" + +#define SYSCFG_ADD 0x40013800 +static const uint32_t usart_addr[] = { 0x40011000, 0x40004400, 0x40004800, + 0x40004C00, 0x40005000, 0x40011400, + 0x40007800, 0x40007C00 }; +/* At the moment only Timer 2 to 5 are modelled */ +static const uint32_t timer_addr[] = { 0x40000000, 0x40000400, + 0x40000800, 0x40000C00 }; +#define ADC_ADDR 0x40012000 +static const uint32_t spi_addr[] = { 0x40013000, 0x40003800, 0x40003C00, + 0x40013400, 0x40015000, 0x40015400 }; +#define EXTI_ADDR 0x40013C00 + +#define SYSCFG_IRQ 71 +static const int usart_irq[] = { 37, 38, 39, 52, 53, 71, 82, 83 }; +static const int timer_irq[] = { 28, 29, 30, 50 }; +#define ADC_IRQ 18 +static const int spi_irq[] = { 35, 36, 51, 0, 0, 0 }; +static const int exti_irq[] = { 6, 7, 8, 9, 10, 23, 23, 23, 23, 23, 40, + 40, 40, 40, 40, 40} ; + + +static void stm32f405_soc_initfn(Object *obj) +{ + STM32F405State *s = STM32F405_SOC(obj); + int i; + + sysbus_init_child_obj(obj, "armv7m", &s->armv7m, sizeof(s->armv7m), + TYPE_ARMV7M); + + sysbus_init_child_obj(obj, "syscfg", &s->syscfg, sizeof(s->syscfg), + TYPE_STM32F4XX_SYSCFG); + + for (i = 0; i < STM_NUM_USARTS; i++) { + sysbus_init_child_obj(obj, "usart[*]", &s->usart[i], + sizeof(s->usart[i]), TYPE_STM32F2XX_USART); + } + + for (i = 0; i < STM_NUM_TIMERS; i++) { + sysbus_init_child_obj(obj, "timer[*]", &s->timer[i], + sizeof(s->timer[i]), TYPE_STM32F2XX_TIMER); + } + + for (i = 0; i < STM_NUM_ADCS; i++) { + sysbus_init_child_obj(obj, "adc[*]", &s->adc[i], sizeof(s->adc[i]), + TYPE_STM32F2XX_ADC); + } + + for (i = 0; i < STM_NUM_SPIS; i++) { + sysbus_init_child_obj(obj, "spi[*]", &s->spi[i], sizeof(s->spi[i]), + TYPE_STM32F2XX_SPI); + } + + sysbus_init_child_obj(obj, "exti", &s->exti, sizeof(s->exti), + TYPE_STM32F4XX_EXTI); +} + +static void stm32f405_soc_realize(DeviceState *dev_soc, Error **errp) +{ + STM32F405State *s = STM32F405_SOC(dev_soc); + MemoryRegion *system_memory = get_system_memory(); + DeviceState *dev, *armv7m; + SysBusDevice *busdev; + Error *err = NULL; + int i; + + memory_region_init_ram(&s->flash, NULL, "STM32F405.flash", FLASH_SIZE, + &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + memory_region_init_alias(&s->flash_alias, NULL, "STM32F405.flash.alias", + &s->flash, 0, FLASH_SIZE); + + memory_region_set_readonly(&s->flash, true); + memory_region_set_readonly(&s->flash_alias, true); + + memory_region_add_subregion(system_memory, FLASH_BASE_ADDRESS, &s->flash); + memory_region_add_subregion(system_memory, 0, &s->flash_alias); + + memory_region_init_ram(&s->sram, NULL, "STM32F405.sram", SRAM_SIZE, + &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + memory_region_add_subregion(system_memory, SRAM_BASE_ADDRESS, &s->sram); + + armv7m = DEVICE(&s->armv7m); + qdev_prop_set_uint32(armv7m, "num-irq", 96); + qdev_prop_set_string(armv7m, "cpu-type", s->cpu_type); + qdev_prop_set_bit(armv7m, "enable-bitband", true); + object_property_set_link(OBJECT(&s->armv7m), OBJECT(system_memory), + "memory", &error_abort); + object_property_set_bool(OBJECT(&s->armv7m), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + + /* System configuration controller */ + dev = DEVICE(&s->syscfg); + object_property_set_bool(OBJECT(&s->syscfg), true, "realized", &err); + if (err != NULL) { + 
error_propagate(errp, err); + return; + } + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, SYSCFG_ADD); + sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, SYSCFG_IRQ)); + + /* Attach UART (uses USART registers) and USART controllers */ + for (i = 0; i < STM_NUM_USARTS; i++) { + dev = DEVICE(&(s->usart[i])); + qdev_prop_set_chr(dev, "chardev", serial_hd(i)); + object_property_set_bool(OBJECT(&s->usart[i]), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, usart_addr[i]); + sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, usart_irq[i])); + } + + /* Timer 2 to 5 */ + for (i = 0; i < STM_NUM_TIMERS; i++) { + dev = DEVICE(&(s->timer[i])); + qdev_prop_set_uint64(dev, "clock-frequency", 1000000000); + object_property_set_bool(OBJECT(&s->timer[i]), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, timer_addr[i]); + sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, timer_irq[i])); + } + + /* ADC device, the IRQs are ORed together */ + object_initialize_child(OBJECT(s), "adc-orirq", &s->adc_irqs, + sizeof(s->adc_irqs), TYPE_OR_IRQ, + &err, NULL); + if (err != NULL) { + error_propagate(errp, err); + return; + } + object_property_set_int(OBJECT(&s->adc_irqs), STM_NUM_ADCS, + "num-lines", &err); + object_property_set_bool(OBJECT(&s->adc_irqs), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + qdev_connect_gpio_out(DEVICE(&s->adc_irqs), 0, + qdev_get_gpio_in(armv7m, ADC_IRQ)); + + dev = DEVICE(&(s->adc[i])); + object_property_set_bool(OBJECT(&s->adc[i]), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, ADC_ADDR); + sysbus_connect_irq(busdev, 0, + qdev_get_gpio_in(DEVICE(&s->adc_irqs), i)); + + /* SPI devices */ + for (i = 0; i < STM_NUM_SPIS; i++) { + dev = DEVICE(&(s->spi[i])); + object_property_set_bool(OBJECT(&s->spi[i]), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, spi_addr[i]); + sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, spi_irq[i])); + } + + /* EXTI device */ + dev = DEVICE(&s->exti); + object_property_set_bool(OBJECT(&s->exti), true, "realized", &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, EXTI_ADDR); + for (i = 0; i < 16; i++) { + sysbus_connect_irq(busdev, i, qdev_get_gpio_in(armv7m, exti_irq[i])); + } + for (i = 0; i < 16; i++) { + qdev_connect_gpio_out(DEVICE(&s->syscfg), i, qdev_get_gpio_in(dev, i)); + } + + create_unimplemented_device("timer[7]", 0x40001400, 0x400); + create_unimplemented_device("timer[12]", 0x40001800, 0x400); + create_unimplemented_device("timer[6]", 0x40001000, 0x400); + create_unimplemented_device("timer[13]", 0x40001C00, 0x400); + create_unimplemented_device("timer[14]", 0x40002000, 0x400); + create_unimplemented_device("RTC and BKP", 0x40002800, 0x400); + create_unimplemented_device("WWDG", 0x40002C00, 0x400); + create_unimplemented_device("IWDG", 0x40003000, 0x400); + create_unimplemented_device("I2S2ext", 0x40003000, 0x400); + create_unimplemented_device("I2S3ext", 0x40004000, 0x400); + create_unimplemented_device("I2C1", 0x40005400, 0x400); + create_unimplemented_device("I2C2", 0x40005800, 
0x400); + create_unimplemented_device("I2C3", 0x40005C00, 0x400); + create_unimplemented_device("CAN1", 0x40006400, 0x400); + create_unimplemented_device("CAN2", 0x40006800, 0x400); + create_unimplemented_device("PWR", 0x40007000, 0x400); + create_unimplemented_device("DAC", 0x40007400, 0x400); + create_unimplemented_device("timer[1]", 0x40010000, 0x400); + create_unimplemented_device("timer[8]", 0x40010400, 0x400); + create_unimplemented_device("SDIO", 0x40012C00, 0x400); + create_unimplemented_device("timer[9]", 0x40014000, 0x400); + create_unimplemented_device("timer[10]", 0x40014400, 0x400); + create_unimplemented_device("timer[11]", 0x40014800, 0x400); + create_unimplemented_device("GPIOA", 0x40020000, 0x400); + create_unimplemented_device("GPIOB", 0x40020400, 0x400); + create_unimplemented_device("GPIOC", 0x40020800, 0x400); + create_unimplemented_device("GPIOD", 0x40020C00, 0x400); + create_unimplemented_device("GPIOE", 0x40021000, 0x400); + create_unimplemented_device("GPIOF", 0x40021400, 0x400); + create_unimplemented_device("GPIOG", 0x40021800, 0x400); + create_unimplemented_device("GPIOH", 0x40021C00, 0x400); + create_unimplemented_device("GPIOI", 0x40022000, 0x400); + create_unimplemented_device("CRC", 0x40023000, 0x400); + create_unimplemented_device("RCC", 0x40023800, 0x400); + create_unimplemented_device("Flash Int", 0x40023C00, 0x400); + create_unimplemented_device("BKPSRAM", 0x40024000, 0x400); + create_unimplemented_device("DMA1", 0x40026000, 0x400); + create_unimplemented_device("DMA2", 0x40026400, 0x400); + create_unimplemented_device("Ethernet", 0x40028000, 0x1400); + create_unimplemented_device("USB OTG HS", 0x40040000, 0x30000); + create_unimplemented_device("USB OTG FS", 0x50000000, 0x31000); + create_unimplemented_device("DCMI", 0x50050000, 0x400); + create_unimplemented_device("RNG", 0x50060800, 0x400); +} + +static Property stm32f405_soc_properties[] = { + DEFINE_PROP_STRING("cpu-type", STM32F405State, cpu_type), + DEFINE_PROP_END_OF_LIST(), +}; + +static void stm32f405_soc_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = stm32f405_soc_realize; + dc->props = stm32f405_soc_properties; + /* No vmstate or reset required: device has no internal state */ +} + +static const TypeInfo stm32f405_soc_info = { + .name = TYPE_STM32F405_SOC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(STM32F405State), + .instance_init = stm32f405_soc_initfn, + .class_init = stm32f405_soc_class_init, +}; + +static void stm32f405_soc_types(void) +{ + type_register_static(&stm32f405_soc_info); +} + +type_init(stm32f405_soc_types) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 39ab5f47e0..656b0081c2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1934,7 +1934,6 @@ static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, static void virt_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { - HotplugHandlerClass *hhc; VirtMachineState *vms = VIRT_MACHINE(hotplug_dev); Error *local_err = NULL; @@ -1943,8 +1942,9 @@ static void virt_memory_plug(HotplugHandler *hotplug_dev, goto out; } - hhc = HOTPLUG_HANDLER_GET_CLASS(vms->acpi_dev); - hhc->plug(HOTPLUG_HANDLER(vms->acpi_dev), dev, &error_abort); + hotplug_handler_plug(HOTPLUG_HANDLER(vms->acpi_dev), + dev, &error_abort); + out: error_propagate(errp, local_err); } diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c index 7e5c5ce789..20d8509107 100644 --- a/hw/char/exynos4210_uart.c +++ b/hw/char/exynos4210_uart.c @@ 
-24,6 +24,7 @@ #include "migration/vmstate.h" #include "qemu/error-report.h" #include "qemu/module.h" +#include "qemu/timer.h" #include "chardev/char-fe.h" #include "chardev/char-serial.h" @@ -31,45 +32,7 @@ #include "hw/irq.h" #include "hw/qdev-properties.h" -#undef DEBUG_UART -#undef DEBUG_UART_EXTEND -#undef DEBUG_IRQ -#undef DEBUG_Rx_DATA -#undef DEBUG_Tx_DATA - -#define DEBUG_UART 0 -#define DEBUG_UART_EXTEND 0 -#define DEBUG_IRQ 0 -#define DEBUG_Rx_DATA 0 -#define DEBUG_Tx_DATA 0 - -#if DEBUG_UART -#define PRINT_DEBUG(fmt, args...) \ - do { \ - fprintf(stderr, " [%s:%d] "fmt, __func__, __LINE__, ##args); \ - } while (0) - -#if DEBUG_UART_EXTEND -#define PRINT_DEBUG_EXTEND(fmt, args...) \ - do { \ - fprintf(stderr, " [%s:%d] "fmt, __func__, __LINE__, ##args); \ - } while (0) -#else -#define PRINT_DEBUG_EXTEND(fmt, args...) \ - do {} while (0) -#endif /* EXTEND */ - -#else -#define PRINT_DEBUG(fmt, args...) \ - do {} while (0) -#define PRINT_DEBUG_EXTEND(fmt, args...) \ - do {} while (0) -#endif - -#define PRINT_ERROR(fmt, args...) \ - do { \ - fprintf(stderr, " [%s:%d] "fmt, __func__, __LINE__, ##args); \ - } while (0) +#include "trace.h" /* * Offsets for UART registers relative to SFR base address @@ -156,6 +119,7 @@ static const Exynos4210UartReg exynos4210_uart_regs[] = { #define ULCON_STOP_BIT_SHIFT 1 /* UART Tx/Rx Status */ +#define UTRSTAT_Rx_TIMEOUT 0x8 #define UTRSTAT_TRANSMITTER_EMPTY 0x4 #define UTRSTAT_Tx_BUFFER_EMPTY 0x2 #define UTRSTAT_Rx_BUFFER_DATA_READY 0x1 @@ -185,16 +149,19 @@ typedef struct Exynos4210UartState { Exynos4210UartFIFO rx; Exynos4210UartFIFO tx; + QEMUTimer *fifo_timeout_timer; + uint64_t wordtime; /* word time in ns */ + CharBackend chr; qemu_irq irq; + qemu_irq dmairq; uint32_t channel; } Exynos4210UartState; -#if DEBUG_UART -/* Used only for debugging inside PRINT_DEBUG_... macros */ +/* Used only for tracing */ static const char *exynos4210_uart_regname(hwaddr offset) { @@ -208,7 +175,6 @@ static const char *exynos4210_uart_regname(hwaddr offset) return NULL; } -#endif static void fifo_store(Exynos4210UartFIFO *q, uint8_t ch) @@ -249,15 +215,12 @@ static void fifo_reset(Exynos4210UartFIFO *q) q->rp = 0; } -static uint32_t exynos4210_uart_Tx_FIFO_trigger_level(const Exynos4210UartState *s) +static uint32_t exynos4210_uart_FIFO_trigger_level(uint32_t channel, + uint32_t reg) { - uint32_t level = 0; - uint32_t reg; - - reg = (s->reg[I_(UFCON)] & UFCON_Tx_FIFO_TRIGGER_LEVEL) >> - UFCON_Tx_FIFO_TRIGGER_LEVEL_SHIFT; + uint32_t level; - switch (s->channel) { + switch (channel) { case 0: level = reg * 32; break; @@ -271,12 +234,52 @@ static uint32_t exynos4210_uart_Tx_FIFO_trigger_level(const Exynos4210UartState break; default: level = 0; - PRINT_ERROR("Wrong UART channel number: %d\n", s->channel); + trace_exynos_uart_channel_error(channel); + break; } - return level; } +static uint32_t +exynos4210_uart_Tx_FIFO_trigger_level(const Exynos4210UartState *s) +{ + uint32_t reg; + + reg = (s->reg[I_(UFCON)] & UFCON_Tx_FIFO_TRIGGER_LEVEL) >> + UFCON_Tx_FIFO_TRIGGER_LEVEL_SHIFT; + + return exynos4210_uart_FIFO_trigger_level(s->channel, reg); +} + +static uint32_t +exynos4210_uart_Rx_FIFO_trigger_level(const Exynos4210UartState *s) +{ + uint32_t reg; + + reg = ((s->reg[I_(UFCON)] & UFCON_Rx_FIFO_TRIGGER_LEVEL) >> + UFCON_Rx_FIFO_TRIGGER_LEVEL_SHIFT) + 1; + + return exynos4210_uart_FIFO_trigger_level(s->channel, reg); +} + +/* + * Update Rx DMA busy signal if Rx DMA is enabled. For simplicity, + * mark DMA as busy if DMA is enabled and the receive buffer is empty. 
+ */ +static void exynos4210_uart_update_dmabusy(Exynos4210UartState *s) +{ + bool rx_dma_enabled = (s->reg[I_(UCON)] & 0x03) == 0x02; + uint32_t count = fifo_elements_number(&s->rx); + + if (rx_dma_enabled && !count) { + qemu_irq_raise(s->dmairq); + trace_exynos_uart_dmabusy(s->channel); + } else { + qemu_irq_lower(s->dmairq); + trace_exynos_uart_dmaready(s->channel); + } +} + static void exynos4210_uart_update_irq(Exynos4210UartState *s) { /* @@ -284,27 +287,53 @@ static void exynos4210_uart_update_irq(Exynos4210UartState *s) * transmit FIFO is smaller than the trigger level. */ if (s->reg[I_(UFCON)] & UFCON_FIFO_ENABLE) { - uint32_t count = (s->reg[I_(UFSTAT)] & UFSTAT_Tx_FIFO_COUNT) >> UFSTAT_Tx_FIFO_COUNT_SHIFT; if (count <= exynos4210_uart_Tx_FIFO_trigger_level(s)) { s->reg[I_(UINTSP)] |= UINTSP_TXD; } + + /* + * Rx interrupt if trigger level is reached or if rx timeout + * interrupt is disabled and there is data in the receive buffer + */ + count = fifo_elements_number(&s->rx); + if ((count && !(s->reg[I_(UCON)] & 0x80)) || + count >= exynos4210_uart_Rx_FIFO_trigger_level(s)) { + exynos4210_uart_update_dmabusy(s); + s->reg[I_(UINTSP)] |= UINTSP_RXD; + timer_del(s->fifo_timeout_timer); + } + } else if (s->reg[I_(UTRSTAT)] & UTRSTAT_Rx_BUFFER_DATA_READY) { + exynos4210_uart_update_dmabusy(s); + s->reg[I_(UINTSP)] |= UINTSP_RXD; } s->reg[I_(UINTP)] = s->reg[I_(UINTSP)] & ~s->reg[I_(UINTM)]; if (s->reg[I_(UINTP)]) { qemu_irq_raise(s->irq); - -#if DEBUG_IRQ - fprintf(stderr, "UART%d: IRQ has been raised: %08x\n", - s->channel, s->reg[I_(UINTP)]); -#endif - + trace_exynos_uart_irq_raised(s->channel, s->reg[I_(UINTP)]); } else { qemu_irq_lower(s->irq); + trace_exynos_uart_irq_lowered(s->channel); + } +} + +static void exynos4210_uart_timeout_int(void *opaque) +{ + Exynos4210UartState *s = opaque; + + trace_exynos_uart_rx_timeout(s->channel, s->reg[I_(UTRSTAT)], + s->reg[I_(UINTSP)]); + + if ((s->reg[I_(UTRSTAT)] & UTRSTAT_Rx_BUFFER_DATA_READY) || + (s->reg[I_(UCON)] & (1 << 11))) { + s->reg[I_(UINTSP)] |= UINTSP_RXD; + s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_TIMEOUT; + exynos4210_uart_update_dmabusy(s); + exynos4210_uart_update_irq(s); } } @@ -346,10 +375,24 @@ static void exynos4210_uart_update_parameters(Exynos4210UartState *s) ssp.data_bits = data_bits; ssp.stop_bits = stop_bits; + s->wordtime = NANOSECONDS_PER_SECOND * (data_bits + stop_bits + 1) / speed; + qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_PARAMS, &ssp); - PRINT_DEBUG("UART%d: speed: %d, parity: %c, data: %d, stop: %d\n", - s->channel, speed, parity, data_bits, stop_bits); + trace_exynos_uart_update_params( + s->channel, speed, parity, data_bits, stop_bits, s->wordtime); +} + +static void exynos4210_uart_rx_timeout_set(Exynos4210UartState *s) +{ + if (s->reg[I_(UCON)] & 0x80) { + uint32_t timeout = ((s->reg[I_(UCON)] >> 12) & 0x0f) * s->wordtime; + + timer_mod(s->fifo_timeout_timer, + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + timeout); + } else { + timer_del(s->fifo_timeout_timer); + } } static void exynos4210_uart_write(void *opaque, hwaddr offset, @@ -358,8 +401,8 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset, Exynos4210UartState *s = (Exynos4210UartState *)opaque; uint8_t ch; - PRINT_DEBUG_EXTEND("UART%d: <0x%04x> %s <- 0x%08llx\n", s->channel, - offset, exynos4210_uart_regname(offset), (long long unsigned int)val); + trace_exynos_uart_write(s->channel, offset, + exynos4210_uart_regname(offset), val); switch (offset) { case ULCON: @@ -373,12 +416,12 @@ static void exynos4210_uart_write(void *opaque, hwaddr 
offset, if (val & UFCON_Rx_FIFO_RESET) { fifo_reset(&s->rx); s->reg[I_(UFCON)] &= ~UFCON_Rx_FIFO_RESET; - PRINT_DEBUG("UART%d: Rx FIFO Reset\n", s->channel); + trace_exynos_uart_rx_fifo_reset(s->channel); } if (val & UFCON_Tx_FIFO_RESET) { fifo_reset(&s->tx); s->reg[I_(UFCON)] &= ~UFCON_Tx_FIFO_RESET; - PRINT_DEBUG("UART%d: Tx FIFO Reset\n", s->channel); + trace_exynos_uart_tx_fifo_reset(s->channel); } break; @@ -390,9 +433,7 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset, /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &ch, 1); -#if DEBUG_Tx_DATA - fprintf(stderr, "%c", ch); -#endif + trace_exynos_uart_tx(s->channel, ch); s->reg[I_(UTRSTAT)] |= UTRSTAT_TRANSMITTER_EMPTY | UTRSTAT_Tx_BUFFER_EMPTY; s->reg[I_(UINTSP)] |= UINTSP_TXD; @@ -403,16 +444,19 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset, case UINTP: s->reg[I_(UINTP)] &= ~val; s->reg[I_(UINTSP)] &= ~val; - PRINT_DEBUG("UART%d: UINTP [%04x] have been cleared: %08x\n", - s->channel, offset, s->reg[I_(UINTP)]); + trace_exynos_uart_intclr(s->channel, s->reg[I_(UINTP)]); exynos4210_uart_update_irq(s); break; case UTRSTAT: + if (val & UTRSTAT_Rx_TIMEOUT) { + s->reg[I_(UTRSTAT)] &= ~UTRSTAT_Rx_TIMEOUT; + } + break; case UERSTAT: case UFSTAT: case UMSTAT: case URXH: - PRINT_DEBUG("UART%d: Trying to write into RO register: %s [%04x]\n", + trace_exynos_uart_ro_write( s->channel, exynos4210_uart_regname(offset), offset); break; case UINTSP: @@ -429,6 +473,7 @@ static void exynos4210_uart_write(void *opaque, hwaddr offset, break; } } + static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset, unsigned size) { @@ -439,6 +484,8 @@ static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset, case UERSTAT: /* Read Only */ res = s->reg[I_(UERSTAT)]; s->reg[I_(UERSTAT)] = 0; + trace_exynos_uart_read(s->channel, offset, + exynos4210_uart_regname(offset), res); return res; case UFSTAT: /* Read Only */ s->reg[I_(UFSTAT)] = fifo_elements_number(&s->rx) & 0xff; @@ -446,20 +493,22 @@ static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset, s->reg[I_(UFSTAT)] |= UFSTAT_Rx_FIFO_FULL; s->reg[I_(UFSTAT)] &= ~0xff; } + trace_exynos_uart_read(s->channel, offset, + exynos4210_uart_regname(offset), + s->reg[I_(UFSTAT)]); return s->reg[I_(UFSTAT)]; case URXH: if (s->reg[I_(UFCON)] & UFCON_FIFO_ENABLE) { if (fifo_elements_number(&s->rx)) { res = fifo_retrieve(&s->rx); -#if DEBUG_Rx_DATA - fprintf(stderr, "%c", res); -#endif + trace_exynos_uart_rx(s->channel, res); if (!fifo_elements_number(&s->rx)) { s->reg[I_(UTRSTAT)] &= ~UTRSTAT_Rx_BUFFER_DATA_READY; } else { s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY; } } else { + trace_exynos_uart_rx_error(s->channel); s->reg[I_(UINTSP)] |= UINTSP_ERROR; exynos4210_uart_update_irq(s); res = 0; @@ -468,15 +517,23 @@ static uint64_t exynos4210_uart_read(void *opaque, hwaddr offset, s->reg[I_(UTRSTAT)] &= ~UTRSTAT_Rx_BUFFER_DATA_READY; res = s->reg[I_(URXH)]; } + exynos4210_uart_update_dmabusy(s); + trace_exynos_uart_read(s->channel, offset, + exynos4210_uart_regname(offset), res); return res; case UTXH: - PRINT_DEBUG("UART%d: Trying to read from WO register: %s [%04x]\n", - s->channel, exynos4210_uart_regname(offset), offset); + trace_exynos_uart_wo_read(s->channel, exynos4210_uart_regname(offset), + offset); break; default: + trace_exynos_uart_read(s->channel, offset, + exynos4210_uart_regname(offset), + s->reg[I_(offset)]); return s->reg[I_(offset)]; } + 
trace_exynos_uart_read(s->channel, offset, exynos4210_uart_regname(offset), + 0); return 0; } @@ -497,7 +554,6 @@ static int exynos4210_uart_can_receive(void *opaque) return fifo_empty_elements_number(&s->rx); } - static void exynos4210_uart_receive(void *opaque, const uint8_t *buf, int size) { Exynos4210UartState *s = (Exynos4210UartState *)opaque; @@ -505,24 +561,17 @@ static void exynos4210_uart_receive(void *opaque, const uint8_t *buf, int size) if (s->reg[I_(UFCON)] & UFCON_FIFO_ENABLE) { if (fifo_empty_elements_number(&s->rx) < size) { - for (i = 0; i < fifo_empty_elements_number(&s->rx); i++) { - fifo_store(&s->rx, buf[i]); - } + size = fifo_empty_elements_number(&s->rx); s->reg[I_(UINTSP)] |= UINTSP_ERROR; - s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY; - } else { - for (i = 0; i < size; i++) { - fifo_store(&s->rx, buf[i]); - } - s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY; } - /* XXX: Around here we maybe should check Rx trigger level */ - s->reg[I_(UINTSP)] |= UINTSP_RXD; + for (i = 0; i < size; i++) { + fifo_store(&s->rx, buf[i]); + } + exynos4210_uart_rx_timeout_set(s); } else { s->reg[I_(URXH)] = buf[0]; - s->reg[I_(UINTSP)] |= UINTSP_RXD; - s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY; } + s->reg[I_(UTRSTAT)] |= UTRSTAT_Rx_BUFFER_DATA_READY; exynos4210_uart_update_irq(s); } @@ -555,13 +604,24 @@ static void exynos4210_uart_reset(DeviceState *dev) fifo_reset(&s->rx); fifo_reset(&s->tx); - PRINT_DEBUG("UART%d: Rx FIFO size: %d\n", s->channel, s->rx.size); + trace_exynos_uart_rxsize(s->channel, s->rx.size); +} + +static int exynos4210_uart_post_load(void *opaque, int version_id) +{ + Exynos4210UartState *s = (Exynos4210UartState *)opaque; + + exynos4210_uart_update_parameters(s); + exynos4210_uart_rx_timeout_set(s); + + return 0; } static const VMStateDescription vmstate_exynos4210_uart_fifo = { .name = "exynos4210.uart.fifo", .version_id = 1, .minimum_version_id = 1, + .post_load = exynos4210_uart_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(sp, Exynos4210UartFIFO), VMSTATE_UINT32(rp, Exynos4210UartFIFO), @@ -614,12 +674,17 @@ static void exynos4210_uart_init(Object *obj) SysBusDevice *dev = SYS_BUS_DEVICE(obj); Exynos4210UartState *s = EXYNOS4210_UART(dev); + s->fifo_timeout_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, + exynos4210_uart_timeout_int, s); + s->wordtime = NANOSECONDS_PER_SECOND * 10 / 9600; + /* memory mapping */ memory_region_init_io(&s->iomem, obj, &exynos4210_uart_ops, s, "exynos4210.uart", EXYNOS4210_UART_REGS_MEM_SIZE); sysbus_init_mmio(dev, &s->iomem); sysbus_init_irq(dev, &s->irq); + sysbus_init_irq(dev, &s->dmairq); } static void exynos4210_uart_realize(DeviceState *dev, Error **errp) diff --git a/hw/char/trace-events b/hw/char/trace-events index 2ce7f2f998..6f938301d9 100644 --- a/hw/char/trace-events +++ b/hw/char/trace-events @@ -77,3 +77,23 @@ cmsdk_apb_uart_set_params(int speed) "CMSDK APB UART: params set to %d 8N1" # nrf51_uart.c nrf51_uart_read(uint64_t addr, uint64_t r, unsigned int size) "addr 0x%" PRIx64 " value 0x%" PRIx64 " size %u" nrf51_uart_write(uint64_t addr, uint64_t value, unsigned int size) "addr 0x%" PRIx64 " value 0x%" PRIx64 " size %u" + +# exynos4210_uart.c +exynos_uart_dmabusy(uint32_t channel) "UART%d: DMA busy (Rx buffer empty)" +exynos_uart_dmaready(uint32_t channel) "UART%d: DMA ready" +exynos_uart_irq_raised(uint32_t channel, uint32_t reg) "UART%d: IRQ raised: 0x%08"PRIx32 +exynos_uart_irq_lowered(uint32_t channel) "UART%d: IRQ lowered" +exynos_uart_update_params(uint32_t channel, int speed, 
uint8_t parity, int data, int stop, uint64_t wordtime) "UART%d: speed: %d, parity: %c, data bits: %d, stop bits: %d wordtime: %"PRId64"ns" +exynos_uart_write(uint32_t channel, uint32_t offset, const char *name, uint64_t val) "UART%d: <0x%04x> %s <- 0x%" PRIx64 +exynos_uart_read(uint32_t channel, uint32_t offset, const char *name, uint64_t val) "UART%d: <0x%04x> %s -> 0x%" PRIx64 +exynos_uart_rx_fifo_reset(uint32_t channel) "UART%d: Rx FIFO Reset" +exynos_uart_tx_fifo_reset(uint32_t channel) "UART%d: Tx FIFO Reset" +exynos_uart_tx(uint32_t channel, uint8_t ch) "UART%d: Tx 0x%02"PRIx32 +exynos_uart_intclr(uint32_t channel, uint32_t reg) "UART%d: interrupts cleared: 0x%08"PRIx32 +exynos_uart_ro_write(uint32_t channel, const char *name, uint32_t reg) "UART%d: Trying to write into RO register: %s [0x%04"PRIx32"]" +exynos_uart_rx(uint32_t channel, uint8_t ch) "UART%d: Rx 0x%02"PRIx32 +exynos_uart_rx_error(uint32_t channel) "UART%d: Rx error" +exynos_uart_wo_read(uint32_t channel, const char *name, uint32_t reg) "UART%d: Trying to read from WO register: %s [0x%04"PRIx32"]" +exynos_uart_rxsize(uint32_t channel, uint32_t size) "UART%d: Rx FIFO size: %d" +exynos_uart_channel_error(uint32_t channel) "Wrong UART channel number: %d" +exynos_uart_rx_timeout(uint32_t channel, uint32_t stat, uint32_t intsp) "UART%d: Rx timeout stat=0x%x intsp=0x%x" diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 9f1753f5cf..58e87d336d 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -879,7 +879,8 @@ static void device_set_realized(Object *obj, bool value, Error **errp) if (qdev_get_vmsd(dev)) { if (vmstate_register_with_alias_id(VMSTATE_IF(dev), - -1, qdev_get_vmsd(dev), dev, + VMSTATE_INSTANCE_ID_ANY, + qdev_get_vmsd(dev), dev, dev->instance_id_alias, dev->alias_required_for_version, &local_err) < 0) { diff --git a/hw/display/ads7846.c b/hw/display/ads7846.c index c12272ae72..9228b40b1a 100644 --- a/hw/display/ads7846.c +++ b/hw/display/ads7846.c @@ -154,7 +154,7 @@ static void ads7846_realize(SSISlave *d, Error **errp) ads7846_int_update(s); - vmstate_register(NULL, -1, &vmstate_ads7846, s); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_ads7846, s); } static void ads7846_class_init(ObjectClass *klass, void *data) diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c index f2bb2d9ac1..64519971ef 100644 --- a/hw/dma/pl330.c +++ b/hw/dma/pl330.c @@ -25,19 +25,12 @@ #include "sysemu/dma.h" #include "qemu/log.h" #include "qemu/module.h" +#include "trace.h" #ifndef PL330_ERR_DEBUG #define PL330_ERR_DEBUG 0 #endif -#define DB_PRINT_L(lvl, fmt, args...) do {\ - if (PL330_ERR_DEBUG >= lvl) {\ - fprintf(stderr, "PL330: %s:" fmt, __func__, ## args);\ - } \ -} while (0) - -#define DB_PRINT(fmt, args...) 
DB_PRINT_L(1, fmt, ## args) - #define PL330_PERIPH_NUM 32 #define PL330_MAX_BURST_LEN 128 #define PL330_INSN_MAXSIZE 6 @@ -319,6 +312,26 @@ typedef struct PL330InsnDesc { void (*exec)(PL330Chan *, uint8_t opcode, uint8_t *args, int len); } PL330InsnDesc; +static void pl330_hexdump(uint8_t *buf, size_t size) +{ + unsigned int b, i, len; + char tmpbuf[80]; + + for (b = 0; b < size; b += 16) { + len = size - b; + if (len > 16) { + len = 16; + } + tmpbuf[0] = '\0'; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) { + strcat(tmpbuf, " "); + } + sprintf(tmpbuf + strlen(tmpbuf), " %02x", buf[b + i]); + } + trace_pl330_hexdump(b, tmpbuf); + } +} /* MFIFO Implementation * @@ -582,7 +595,7 @@ static inline void pl330_queue_remove_tagged(PL330Queue *s, uint8_t tag) static inline void pl330_fault(PL330Chan *ch, uint32_t flags) { - DB_PRINT("ch: %p, flags: %" PRIx32 "\n", ch, flags); + trace_pl330_fault(ch, flags); ch->fault_type |= flags; if (ch->state == pl330_chan_fault) { return; @@ -590,7 +603,7 @@ static inline void pl330_fault(PL330Chan *ch, uint32_t flags) ch->state = pl330_chan_fault; ch->parent->num_faulting++; if (ch->parent->num_faulting == 1) { - DB_PRINT("abort interrupt raised\n"); + trace_pl330_fault_abort(); qemu_irq_raise(ch->parent->irq_abort); } } @@ -648,7 +661,7 @@ static void pl330_dmaend(PL330Chan *ch, uint8_t opcode, return; } } - DB_PRINT("DMA ending!\n"); + trace_pl330_dmaend(); pl330_fifo_tagged_remove(&s->fifo, ch->tag); pl330_queue_remove_tagged(&s->read_queue, ch->tag); pl330_queue_remove_tagged(&s->write_queue, ch->tag); @@ -683,7 +696,7 @@ static void pl330_dmago(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len) uint32_t pc; PL330Chan *s; - DB_PRINT("\n"); + trace_pl330_dmago(); if (!ch->is_manager) { pl330_fault(ch, PL330_FAULT_UNDEF_INSTR); @@ -740,9 +753,7 @@ static void pl330_dmald(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len) ch->stall = pl330_queue_put_insn(&ch->parent->read_queue, ch->src, size, num, inc, 0, ch->tag); if (!ch->stall) { - DB_PRINT("channel:%" PRId8 " address:%08" PRIx32 " size:%" PRIx32 - " num:%" PRId32 " %c\n", - ch->tag, ch->src, size, num, inc ? 'Y' : 'N'); + trace_pl330_dmald(ch->tag, ch->src, size, num, inc ? 'Y' : 'N'); ch->src += inc ? 
size * num - (ch->src & (size - 1)) : 0; } } @@ -782,7 +793,7 @@ static void pl330_dmakill(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len) ch->fault_type = 0; ch->parent->num_faulting--; if (ch->parent->num_faulting == 0) { - DB_PRINT("abort interrupt lowered\n"); + trace_pl330_dmakill(); qemu_irq_lower(ch->parent->irq_abort); } } @@ -800,6 +811,8 @@ static void pl330_dmalpend(PL330Chan *ch, uint8_t opcode, uint8_t bs = opcode & 3; uint8_t lc = (opcode & 4) >> 2; + trace_pl330_dmalpend(nf, bs, lc, ch->lc[lc], ch->request_flag); + if (bs == 2) { pl330_fault(ch, PL330_FAULT_OPERAND_INVALID); return; @@ -813,12 +826,12 @@ static void pl330_dmalpend(PL330Chan *ch, uint8_t opcode, if (nf) { ch->lc[lc]--; } - DB_PRINT("loop reiteration\n"); + trace_pl330_dmalpiter(); ch->pc -= args[0]; ch->pc -= len + 1; /* "ch->pc -= args[0] + len + 1" is incorrect when args[0] == 256 */ } else { - DB_PRINT("loop fallthrough\n"); + trace_pl330_dmalpfallthrough(); } } @@ -886,10 +899,10 @@ static void pl330_dmasev(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len) } if (ch->parent->inten & (1 << ev_id)) { ch->parent->int_status |= (1 << ev_id); - DB_PRINT("event interrupt raised %" PRId8 "\n", ev_id); + trace_pl330_dmasev_evirq(ev_id); qemu_irq_raise(ch->parent->irq[ev_id]); } - DB_PRINT("event raised %" PRId8 "\n", ev_id); + trace_pl330_dmasev_event(ev_id); ch->parent->ev_status |= (1 << ev_id); } @@ -914,9 +927,7 @@ static void pl330_dmast(PL330Chan *ch, uint8_t opcode, uint8_t *args, int len) ch->stall = pl330_queue_put_insn(&ch->parent->write_queue, ch->dst, size, num, inc, 0, ch->tag); if (!ch->stall) { - DB_PRINT("channel:%" PRId8 " address:%08" PRIx32 " size:%" PRIx32 - " num:%" PRId32 " %c\n", - ch->tag, ch->dst, size, num, inc ? 'Y' : 'N'); + trace_pl330_dmast(ch->tag, ch->dst, size, num, inc ? 'Y' : 'N'); ch->dst += inc ? 
size * num - (ch->dst & (size - 1)) : 0; } } @@ -992,7 +1003,7 @@ static void pl330_dmawfe(PL330Chan *ch, uint8_t opcode, } } ch->parent->ev_status &= ~(1 << ev_id); - DB_PRINT("event lowered %" PRIx8 "\n", ev_id); + trace_pl330_dmawfe(ev_id); } else { ch->stall = 1; } @@ -1135,7 +1146,7 @@ static int pl330_chan_exec(PL330Chan *ch) ch->stall = 0; insn = pl330_fetch_insn(ch); if (!insn) { - DB_PRINT("pl330 undefined instruction\n"); + trace_pl330_chan_exec_undef(); pl330_fault(ch, PL330_FAULT_UNDEF_INSTR); return 0; } @@ -1175,10 +1186,9 @@ static int pl330_exec_cycle(PL330Chan *channel) int len = q->len - (q->addr & (q->len - 1)); dma_memory_read(&address_space_memory, q->addr, buf, len); - if (PL330_ERR_DEBUG > 1) { - DB_PRINT("PL330 read from memory @%08" PRIx32 " (size = %08x):\n", - q->addr, len); - qemu_hexdump((char *)buf, stderr, "", len); + trace_pl330_exec_cycle(q->addr, len); + if (trace_event_get_state_backends(TRACE_PL330_HEXDUMP)) { + pl330_hexdump(buf, len); } fifo_res = pl330_fifo_push(&s->fifo, buf, len, q->tag); if (fifo_res == PL330_FIFO_OK) { @@ -1207,10 +1217,9 @@ static int pl330_exec_cycle(PL330Chan *channel) } if (fifo_res == PL330_FIFO_OK || q->z) { dma_memory_write(&address_space_memory, q->addr, buf, len); - if (PL330_ERR_DEBUG > 1) { - DB_PRINT("PL330 read from memory @%08" PRIx32 - " (size = %08x):\n", q->addr, len); - qemu_hexdump((char *)buf, stderr, "", len); + trace_pl330_exec_cycle(q->addr, len); + if (trace_event_get_state_backends(TRACE_PL330_HEXDUMP)) { + pl330_hexdump(buf, len); } if (q->inc) { q->addr += len; @@ -1252,8 +1261,8 @@ static int pl330_exec_channel(PL330Chan *channel) static inline void pl330_exec(PL330State *s) { - DB_PRINT("\n"); int i, insr_exec; + trace_pl330_exec(); do { insr_exec = pl330_exec_channel(&s->manager); @@ -1298,7 +1307,7 @@ static void pl330_debug_exec(PL330State *s) args[2] = (s->dbg[1] >> 8) & 0xff; args[3] = (s->dbg[1] >> 16) & 0xff; args[4] = (s->dbg[1] >> 24) & 0xff; - DB_PRINT("chan id: %" PRIx8 "\n", chan_id); + trace_pl330_debug_exec(chan_id); if (s->dbg[0] & 1) { ch = &s->chan[chan_id]; } else { @@ -1320,6 +1329,7 @@ static void pl330_debug_exec(PL330State *s) ch->fault_type |= PL330_FAULT_DBG_INSTR; } if (ch->stall) { + trace_pl330_debug_exec_stall(); qemu_log_mask(LOG_UNIMP, "pl330: stall of debug instruction not " "implemented\n"); } @@ -1334,7 +1344,7 @@ static void pl330_iomem_write(void *opaque, hwaddr offset, PL330State *s = (PL330State *) opaque; int i; - DB_PRINT("addr: %08x data: %08x\n", (unsigned)offset, (unsigned)value); + trace_pl330_iomem_write((unsigned)offset, (unsigned)value); switch (offset) { case PL330_REG_INTEN: @@ -1343,7 +1353,7 @@ static void pl330_iomem_write(void *opaque, hwaddr offset, case PL330_REG_INTCLR: for (i = 0; i < s->num_events; i++) { if (s->int_status & s->inten & value & (1 << i)) { - DB_PRINT("event interrupt lowered %d\n", i); + trace_pl330_iomem_write_clr(i); qemu_irq_lower(s->irq[i]); } } @@ -1361,11 +1371,9 @@ static void pl330_iomem_write(void *opaque, hwaddr offset, } break; case PL330_REG_DBGINST0: - DB_PRINT("s->dbg[0] = %08x\n", (unsigned)value); s->dbg[0] = value; break; case PL330_REG_DBGINST1: - DB_PRINT("s->dbg[1] = %08x\n", (unsigned)value); s->dbg[1] = value; break; default: @@ -1489,7 +1497,7 @@ static uint64_t pl330_iomem_read(void *opaque, hwaddr offset, unsigned size) { uint32_t ret = pl330_iomem_read_imp(opaque, offset); - DB_PRINT("addr: %08" HWADDR_PRIx " data: %08" PRIx32 "\n", offset, ret); + trace_pl330_iomem_read((uint32_t)offset, ret); return 
ret; } diff --git a/hw/dma/trace-events b/hw/dma/trace-events index e4498428c5..44893995f6 100644 --- a/hw/dma/trace-events +++ b/hw/dma/trace-events @@ -20,3 +20,27 @@ sparc32_dma_enable_lower(void) "Lower DMA enable" # i8257.c i8257_unregistered_dma(int nchan, int dma_pos, int dma_len) "unregistered DMA channel used nchan=%d dma_pos=%d dma_len=%d" + +# pl330.c +pl330_fault(void *ptr, uint32_t flags) "ch: %p, flags: 0x%"PRIx32 +pl330_fault_abort(void) "abort interrupt raised" +pl330_dmaend(void) "DMA ending" +pl330_dmago(void) "DMA run" +pl330_dmald(uint8_t chan, uint32_t addr, uint32_t size, uint32_t num, char ch) "channel:%"PRId8" address:0x%08"PRIx32" size:0x%"PRIx32" num:%"PRId32"%c" +pl330_dmakill(void) "abort interrupt lowered" +pl330_dmalpend(uint8_t nf, uint8_t bs, uint8_t lc, uint8_t ch, uint8_t flag) "nf=0x%02x bs=0x%02x lc=0x%02x ch=0x%02x flag=0x%02x" +pl330_dmalpiter(void) "loop reiteration" +pl330_dmalpfallthrough(void) "loop fallthrough" +pl330_dmasev_evirq(uint8_t ev_id) "event interrupt raised %"PRId8 +pl330_dmasev_event(uint8_t ev_id) "event raised %"PRId8 +pl330_dmast(uint8_t chan, uint32_t addr, uint32_t sz, uint32_t num, char ch) "channel:%"PRId8" address:0x%08"PRIx32" size:0x%"PRIx32" num:%"PRId32" %c" +pl330_dmawfe(uint8_t ev_id) "event lowered 0x%"PRIx8 +pl330_chan_exec_undef(void) "undefined instruction" +pl330_exec_cycle(uint32_t addr, uint32_t size) "PL330 read from memory @0x%08"PRIx32" (size = 0x%08"PRIx32")" +pl330_hexdump(uint32_t offset, char *str) " 0x%04"PRIx32":%s" +pl330_exec(void) "pl330_exec" +pl330_debug_exec(uint8_t ch) "chan id: 0x%"PRIx8 +pl330_debug_exec_stall(void) "stall of debug instruction not implemented" +pl330_iomem_write(uint32_t offset, uint32_t value) "addr: 0x%08"PRIx32" data: 0x%08"PRIx32 +pl330_iomem_write_clr(int i) "event interrupt lowered %d" +pl330_iomem_read(uint32_t addr, uint32_t data) "addr: 0x%08"PRIx32" data: 0x%08"PRIx32 diff --git a/hw/i2c/core.c b/hw/i2c/core.c index 92cd489069..d770035ba0 100644 --- a/hw/i2c/core.c +++ b/hw/i2c/core.c @@ -61,7 +61,7 @@ I2CBus *i2c_init_bus(DeviceState *parent, const char *name) bus = I2C_BUS(qbus_create(TYPE_I2C_BUS, parent, name)); QLIST_INIT(&bus->current_devs); - vmstate_register(NULL, -1, &vmstate_i2c_bus, bus); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_i2c_bus, bus); return bus; } diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index e25df838f0..9c4e46fa74 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1816,7 +1816,6 @@ static void build_smb0(Aml *table, I2CBus *smbus, int devnr, int func) Aml *scope = aml_scope("_SB.PCI0"); Aml *dev = aml_device("SMB0"); - aml_append(dev, aml_name_decl("_HID", aml_eisaid("APP0005"))); aml_append(dev, aml_name_decl("_ADR", aml_int(devnr << 16 | func))); build_acpi_ipmi_devices(dev, BUS(smbus), "\\_SB.PCI0.SMB0"); aml_append(scope, dev); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 8054bc4147..a6302a772d 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -93,7 +93,9 @@ #include "fw_cfg.h" #include "trace.h" -GlobalProperty pc_compat_4_2[] = {}; +GlobalProperty pc_compat_4_2[] = { + { "mch", "smbase-smram", "off" }, +}; const size_t pc_compat_4_2_len = G_N_ELEMENTS(pc_compat_4_2); GlobalProperty pc_compat_4_1[] = {}; diff --git a/hw/input/stellaris_input.c b/hw/input/stellaris_input.c index 59892b07fc..e6ee5e11f1 100644 --- a/hw/input/stellaris_input.c +++ b/hw/input/stellaris_input.c @@ -88,5 +88,6 @@ void stellaris_gamepad_init(int n, qemu_irq *irq, const int *keycode) } s->num_buttons = n; 
qemu_add_kbd_event_handler(stellaris_gamepad_put_key, s); - vmstate_register(NULL, -1, &vmstate_stellaris_gamepad, s); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, + &vmstate_stellaris_gamepad, s); } diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c index 375cb6abe9..b5dbeb6206 100644 --- a/hw/intc/apic_common.c +++ b/hw/intc/apic_common.c @@ -268,7 +268,10 @@ static void apic_common_realize(DeviceState *dev, Error **errp) APICCommonState *s = APIC_COMMON(dev); APICCommonClass *info; static DeviceState *vapic; - int instance_id = s->id; + uint32_t instance_id = s->initial_apic_id; + + /* Normally initial APIC ID should be no more than hundreds */ + assert(instance_id != VMSTATE_INSTANCE_ID_ANY); info = APIC_COMMON_GET_CLASS(s); info->realize(dev, errp); @@ -284,7 +287,7 @@ static void apic_common_realize(DeviceState *dev, Error **errp) } if (s->legacy_instance_id) { - instance_id = -1; + instance_id = VMSTATE_INSTANCE_ID_ANY; } vmstate_register_with_alias_id(NULL, instance_id, &vmstate_apic_common, s, -1, 0, NULL); diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index a254b0ce87..08e000e33c 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -664,6 +664,9 @@ static uint64_t icv_iar_read(CPUARMState *env, const ARMCPRegInfo *ri) trace_gicv3_icv_iar_read(ri->crm == 8 ? 0 : 1, gicv3_redist_affid(cs), intid); + + gicv3_cpuif_virt_update(cs); + return intid; } diff --git a/hw/misc/Kconfig b/hw/misc/Kconfig index 2164646553..bdd77d8020 100644 --- a/hw/misc/Kconfig +++ b/hw/misc/Kconfig @@ -82,6 +82,12 @@ config IMX config STM32F2XX_SYSCFG bool +config STM32F4XX_SYSCFG + bool + +config STM32F4XX_EXTI + bool + config MIPS_ITU bool diff --git a/hw/misc/Makefile.objs b/hw/misc/Makefile.objs index ba898a5781..da993f45b7 100644 --- a/hw/misc/Makefile.objs +++ b/hw/misc/Makefile.objs @@ -42,6 +42,7 @@ common-obj-$(CONFIG_IMX) += imx7_ccm.o common-obj-$(CONFIG_IMX) += imx2_wdt.o common-obj-$(CONFIG_IMX) += imx7_snvs.o common-obj-$(CONFIG_IMX) += imx7_gpr.o +common-obj-$(CONFIG_IMX) += imx_rngc.o common-obj-$(CONFIG_MILKYMIST) += milkymist-hpdmc.o common-obj-$(CONFIG_MILKYMIST) += milkymist-pfpu.o common-obj-$(CONFIG_MAINSTONE) += mst_fpga.o @@ -58,6 +59,8 @@ common-obj-$(CONFIG_SLAVIO) += slavio_misc.o common-obj-$(CONFIG_ZYNQ) += zynq_slcr.o common-obj-$(CONFIG_ZYNQ) += zynq-xadc.o common-obj-$(CONFIG_STM32F2XX_SYSCFG) += stm32f2xx_syscfg.o +common-obj-$(CONFIG_STM32F4XX_SYSCFG) += stm32f4xx_syscfg.o +common-obj-$(CONFIG_STM32F4XX_EXTI) += stm32f4xx_exti.o obj-$(CONFIG_MIPS_CPS) += mips_cmgcr.o obj-$(CONFIG_MIPS_CPS) += mips_cpc.o obj-$(CONFIG_MIPS_ITU) += mips_itu.o diff --git a/hw/misc/imx_rngc.c b/hw/misc/imx_rngc.c new file mode 100644 index 0000000000..4c270df2db --- /dev/null +++ b/hw/misc/imx_rngc.c @@ -0,0 +1,278 @@ +/* + * Freescale i.MX RNGC emulation + * + * Copyright (C) 2020 Martin Kaiser <martin@kaiser.cx> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * This driver provides the minimum functionality to initialize and seed + * an rngc and to read random numbers. The rngb that is found in imx25 + * chipsets is also supported. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/module.h" +#include "qemu/log.h" +#include "qemu/guest-random.h" +#include "hw/irq.h" +#include "hw/misc/imx_rngc.h" +#include "migration/vmstate.h" + +#define RNGC_NAME "i.MX RNGC" + +#define RNGC_VER_ID 0x00 +#define RNGC_COMMAND 0x04 +#define RNGC_CONTROL 0x08 +#define RNGC_STATUS 0x0C +#define RNGC_FIFO 0x14 + +/* These version info are reported by the rngb in an imx258 chip. */ +#define RNG_TYPE_RNGB 0x1 +#define V_MAJ 0x2 +#define V_MIN 0x40 + +#define RNGC_CMD_BIT_SW_RST 0x40 +#define RNGC_CMD_BIT_CLR_ERR 0x20 +#define RNGC_CMD_BIT_CLR_INT 0x10 +#define RNGC_CMD_BIT_SEED 0x02 +#define RNGC_CMD_BIT_SELF_TEST 0x01 + +#define RNGC_CTRL_BIT_MASK_ERR 0x40 +#define RNGC_CTRL_BIT_MASK_DONE 0x20 +#define RNGC_CTRL_BIT_AUTO_SEED 0x10 + +/* the current status for self-test and seed operations */ +#define OP_IDLE 0 +#define OP_RUN 1 +#define OP_DONE 2 + +static uint64_t imx_rngc_read(void *opaque, hwaddr offset, unsigned size) +{ + IMXRNGCState *s = IMX_RNGC(opaque); + uint64_t val = 0; + + switch (offset) { + case RNGC_VER_ID: + val |= RNG_TYPE_RNGB << 28 | V_MAJ << 8 | V_MIN; + break; + + case RNGC_COMMAND: + if (s->op_seed == OP_RUN) { + val |= RNGC_CMD_BIT_SEED; + } + if (s->op_self_test == OP_RUN) { + val |= RNGC_CMD_BIT_SELF_TEST; + } + break; + + case RNGC_CONTROL: + /* + * The CTL_ACC and VERIF_MODE bits are not supported yet. + * They read as 0. + */ + val |= s->mask; + if (s->auto_seed) { + val |= RNGC_CTRL_BIT_AUTO_SEED; + } + /* + * We don't have an internal fifo like the real hardware. + * There's no need for strategy to handle fifo underflows. + * We return the FIFO_UFLOW_RESPONSE bits as 0. + */ + break; + + case RNGC_STATUS: + /* + * We never report any statistics test or self-test errors or any + * other errors. STAT_TEST_PF, ST_PF and ERROR are always 0. + */ + + /* + * We don't have an internal fifo, see above. Therefore, we + * report back the default fifo size (5 32-bit words) and + * indicate that our fifo is always full. + */ + val |= 5 << 12 | 5 << 8; + + /* We always have a new seed available. */ + val |= 1 << 6; + + if (s->op_seed == OP_DONE) { + val |= 1 << 5; + } + if (s->op_self_test == OP_DONE) { + val |= 1 << 4; + } + if (s->op_seed == OP_RUN || s->op_self_test == OP_RUN) { + /* + * We're busy if self-test is running or if we're + * seeding the prng. + */ + val |= 1 << 1; + } else { + /* + * We're ready to provide secure random numbers whenever + * we're not busy. + */ + val |= 1; + } + break; + + case RNGC_FIFO: + qemu_guest_getrandom_nofail(&val, sizeof(val)); + break; + } + + return val; +} + +static void imx_rngc_do_reset(IMXRNGCState *s) +{ + s->op_self_test = OP_IDLE; + s->op_seed = OP_IDLE; + s->mask = 0; + s->auto_seed = false; +} + +static void imx_rngc_write(void *opaque, hwaddr offset, uint64_t value, + unsigned size) +{ + IMXRNGCState *s = IMX_RNGC(opaque); + + switch (offset) { + case RNGC_COMMAND: + if (value & RNGC_CMD_BIT_SW_RST) { + imx_rngc_do_reset(s); + } + + /* + * For now, both CLR_ERR and CLR_INT clear the interrupt. We + * don't report any errors yet. + */ + if (value & (RNGC_CMD_BIT_CLR_ERR | RNGC_CMD_BIT_CLR_INT)) { + qemu_irq_lower(s->irq); + } + + if (value & RNGC_CMD_BIT_SEED) { + s->op_seed = OP_RUN; + qemu_bh_schedule(s->seed_bh); + } + + if (value & RNGC_CMD_BIT_SELF_TEST) { + s->op_self_test = OP_RUN; + qemu_bh_schedule(s->self_test_bh); + } + break; + + case RNGC_CONTROL: + /* + * The CTL_ACC and VERIF_MODE bits are not supported yet. 
+ * We ignore them if they're set by the caller. + */ + + if (value & RNGC_CTRL_BIT_MASK_ERR) { + s->mask |= RNGC_CTRL_BIT_MASK_ERR; + } else { + s->mask &= ~RNGC_CTRL_BIT_MASK_ERR; + } + + if (value & RNGC_CTRL_BIT_MASK_DONE) { + s->mask |= RNGC_CTRL_BIT_MASK_DONE; + } else { + s->mask &= ~RNGC_CTRL_BIT_MASK_DONE; + } + + if (value & RNGC_CTRL_BIT_AUTO_SEED) { + s->auto_seed = true; + } else { + s->auto_seed = false; + } + break; + } +} + +static const MemoryRegionOps imx_rngc_ops = { + .read = imx_rngc_read, + .write = imx_rngc_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void imx_rngc_self_test(void *opaque) +{ + IMXRNGCState *s = IMX_RNGC(opaque); + + s->op_self_test = OP_DONE; + if (!(s->mask & RNGC_CTRL_BIT_MASK_DONE)) { + qemu_irq_raise(s->irq); + } +} + +static void imx_rngc_seed(void *opaque) +{ + IMXRNGCState *s = IMX_RNGC(opaque); + + s->op_seed = OP_DONE; + if (!(s->mask & RNGC_CTRL_BIT_MASK_DONE)) { + qemu_irq_raise(s->irq); + } +} + +static void imx_rngc_realize(DeviceState *dev, Error **errp) +{ + IMXRNGCState *s = IMX_RNGC(dev); + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + + memory_region_init_io(&s->iomem, OBJECT(s), &imx_rngc_ops, s, + TYPE_IMX_RNGC, 0x1000); + sysbus_init_mmio(sbd, &s->iomem); + + sysbus_init_irq(sbd, &s->irq); + s->self_test_bh = qemu_bh_new(imx_rngc_self_test, s); + s->seed_bh = qemu_bh_new(imx_rngc_seed, s); +} + +static void imx_rngc_reset(DeviceState *dev) +{ + IMXRNGCState *s = IMX_RNGC(dev); + + imx_rngc_do_reset(s); +} + +static const VMStateDescription vmstate_imx_rngc = { + .name = RNGC_NAME, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT8(op_self_test, IMXRNGCState), + VMSTATE_UINT8(op_seed, IMXRNGCState), + VMSTATE_UINT8(mask, IMXRNGCState), + VMSTATE_BOOL(auto_seed, IMXRNGCState), + VMSTATE_END_OF_LIST() + } +}; + +static void imx_rngc_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = imx_rngc_realize; + dc->reset = imx_rngc_reset; + dc->desc = RNGC_NAME, + dc->vmsd = &vmstate_imx_rngc; +} + +static const TypeInfo imx_rngc_info = { + .name = TYPE_IMX_RNGC, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(IMXRNGCState), + .class_init = imx_rngc_class_init, +}; + +static void imx_rngc_register_types(void) +{ + type_register_static(&imx_rngc_info); +} + +type_init(imx_rngc_register_types) diff --git a/hw/misc/max111x.c b/hw/misc/max111x.c index 211008ce02..2b87bdee5b 100644 --- a/hw/misc/max111x.c +++ b/hw/misc/max111x.c @@ -146,7 +146,8 @@ static int max111x_init(SSISlave *d, int inputs) s->input[7] = 0x80; s->com = 0; - vmstate_register(VMSTATE_IF(dev), -1, &vmstate_max111x, s); + vmstate_register(VMSTATE_IF(dev), VMSTATE_INSTANCE_ID_ANY, + &vmstate_max111x, s); return 0; } diff --git a/hw/misc/stm32f4xx_exti.c b/hw/misc/stm32f4xx_exti.c new file mode 100644 index 0000000000..02e7810046 --- /dev/null +++ b/hw/misc/stm32f4xx_exti.c @@ -0,0 +1,188 @@ +/* + * STM32F4XX EXTI + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "trace.h" +#include "hw/irq.h" +#include "migration/vmstate.h" +#include "hw/misc/stm32f4xx_exti.h" + +static void stm32f4xx_exti_reset(DeviceState *dev) +{ + STM32F4xxExtiState *s = STM32F4XX_EXTI(dev); + + s->exti_imr = 0x00000000; + s->exti_emr = 0x00000000; + s->exti_rtsr = 0x00000000; + s->exti_ftsr = 0x00000000; + s->exti_swier = 0x00000000; + s->exti_pr = 0x00000000; +} + +static void stm32f4xx_exti_set_irq(void *opaque, int irq, int level) +{ + STM32F4xxExtiState *s = opaque; + + trace_stm32f4xx_exti_set_irq(irq, level); + + if (((1 << irq) & s->exti_rtsr) && level) { + /* Rising Edge */ + s->exti_pr |= 1 << irq; + } + + if (((1 << irq) & s->exti_ftsr) && !level) { + /* Falling Edge */ + s->exti_pr |= 1 << irq; + } + + if (!((1 << irq) & s->exti_imr)) { + /* Interrupt is masked */ + return; + } + qemu_irq_pulse(s->irq[irq]); +} + +static uint64_t stm32f4xx_exti_read(void *opaque, hwaddr addr, + unsigned int size) +{ + STM32F4xxExtiState *s = opaque; + + trace_stm32f4xx_exti_read(addr); + + switch (addr) { + case EXTI_IMR: + return s->exti_imr; + case EXTI_EMR: + return s->exti_emr; + case EXTI_RTSR: + return s->exti_rtsr; + case EXTI_FTSR: + return s->exti_ftsr; + case EXTI_SWIER: + return s->exti_swier; + case EXTI_PR: + return s->exti_pr; + default: + qemu_log_mask(LOG_GUEST_ERROR, + "STM32F4XX_exti_read: Bad offset %x\n", (int)addr); + return 0; + } + return 0; +} + +static void stm32f4xx_exti_write(void *opaque, hwaddr addr, + uint64_t val64, unsigned int size) +{ + STM32F4xxExtiState *s = opaque; + uint32_t value = (uint32_t) val64; + + trace_stm32f4xx_exti_write(addr, value); + + switch (addr) { + case EXTI_IMR: + s->exti_imr = value; + return; + case EXTI_EMR: + s->exti_emr = value; + return; + case EXTI_RTSR: + s->exti_rtsr = value; + return; + case EXTI_FTSR: + s->exti_ftsr = value; + return; + case EXTI_SWIER: + s->exti_swier = value; + return; + case EXTI_PR: + /* This bit is cleared by writing a 1 to it */ + s->exti_pr &= ~value; + return; + default: + qemu_log_mask(LOG_GUEST_ERROR, + "STM32F4XX_exti_write: Bad offset %x\n", (int)addr); + } +} + +static const MemoryRegionOps stm32f4xx_exti_ops = { + .read = stm32f4xx_exti_read, + .write = stm32f4xx_exti_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void stm32f4xx_exti_init(Object *obj) +{ + STM32F4xxExtiState *s = STM32F4XX_EXTI(obj); + int i; + + for (i = 0; i < NUM_INTERRUPT_OUT_LINES; i++) { + sysbus_init_irq(SYS_BUS_DEVICE(obj), &s->irq[i]); + } + + memory_region_init_io(&s->mmio, obj, &stm32f4xx_exti_ops, s, + TYPE_STM32F4XX_EXTI, 0x400); + sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->mmio); + + qdev_init_gpio_in(DEVICE(obj), stm32f4xx_exti_set_irq, + NUM_GPIO_EVENT_IN_LINES); +} + +static const VMStateDescription vmstate_stm32f4xx_exti = { + .name = TYPE_STM32F4XX_EXTI, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + 
VMSTATE_UINT32(exti_imr, STM32F4xxExtiState), + VMSTATE_UINT32(exti_emr, STM32F4xxExtiState), + VMSTATE_UINT32(exti_rtsr, STM32F4xxExtiState), + VMSTATE_UINT32(exti_ftsr, STM32F4xxExtiState), + VMSTATE_UINT32(exti_swier, STM32F4xxExtiState), + VMSTATE_UINT32(exti_pr, STM32F4xxExtiState), + VMSTATE_END_OF_LIST() + } +}; + +static void stm32f4xx_exti_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = stm32f4xx_exti_reset; + dc->vmsd = &vmstate_stm32f4xx_exti; +} + +static const TypeInfo stm32f4xx_exti_info = { + .name = TYPE_STM32F4XX_EXTI, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(STM32F4xxExtiState), + .instance_init = stm32f4xx_exti_init, + .class_init = stm32f4xx_exti_class_init, +}; + +static void stm32f4xx_exti_register_types(void) +{ + type_register_static(&stm32f4xx_exti_info); +} + +type_init(stm32f4xx_exti_register_types) diff --git a/hw/misc/stm32f4xx_syscfg.c b/hw/misc/stm32f4xx_syscfg.c new file mode 100644 index 0000000000..f960e4ea1e --- /dev/null +++ b/hw/misc/stm32f4xx_syscfg.c @@ -0,0 +1,171 @@ +/* + * STM32F4xx SYSCFG + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "trace.h" +#include "hw/irq.h" +#include "migration/vmstate.h" +#include "hw/misc/stm32f4xx_syscfg.h" + +static void stm32f4xx_syscfg_reset(DeviceState *dev) +{ + STM32F4xxSyscfgState *s = STM32F4XX_SYSCFG(dev); + + s->syscfg_memrmp = 0x00000000; + s->syscfg_pmc = 0x00000000; + s->syscfg_exticr[0] = 0x00000000; + s->syscfg_exticr[1] = 0x00000000; + s->syscfg_exticr[2] = 0x00000000; + s->syscfg_exticr[3] = 0x00000000; + s->syscfg_cmpcr = 0x00000000; +} + +static void stm32f4xx_syscfg_set_irq(void *opaque, int irq, int level) +{ + STM32F4xxSyscfgState *s = opaque; + int icrreg = irq / 4; + int startbit = (irq & 3) * 4; + uint8_t config = irq / 16; + + trace_stm32f4xx_syscfg_set_irq(irq / 16, irq % 16, level); + + g_assert(icrreg < SYSCFG_NUM_EXTICR); + + if (extract32(s->syscfg_exticr[icrreg], startbit, 4) == config) { + qemu_set_irq(s->gpio_out[irq], level); + trace_stm32f4xx_pulse_exti(irq); + } +} + +static uint64_t stm32f4xx_syscfg_read(void *opaque, hwaddr addr, + unsigned int size) +{ + STM32F4xxSyscfgState *s = opaque; + + trace_stm32f4xx_syscfg_read(addr); + + switch (addr) { + case SYSCFG_MEMRMP: + return s->syscfg_memrmp; + case SYSCFG_PMC: + return s->syscfg_pmc; + case SYSCFG_EXTICR1...SYSCFG_EXTICR4: + return s->syscfg_exticr[addr / 4 - SYSCFG_EXTICR1 / 4]; + case SYSCFG_CMPCR: + return s->syscfg_cmpcr; + default: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: Bad offset 0x%"HWADDR_PRIx"\n", __func__, addr); + return 0; + } +} + +static void stm32f4xx_syscfg_write(void *opaque, hwaddr addr, + uint64_t val64, unsigned int size) +{ + STM32F4xxSyscfgState *s = opaque; + uint32_t value = val64; + + trace_stm32f4xx_syscfg_write(value, addr); + + switch (addr) { + case SYSCFG_MEMRMP: + qemu_log_mask(LOG_UNIMP, + "%s: Changing the memory mapping isn't supported " \ + "in QEMU\n", __func__); + return; + case SYSCFG_PMC: + qemu_log_mask(LOG_UNIMP, + "%s: Changing the memory mapping isn't supported " \ + "in QEMU\n", __func__); + return; + case SYSCFG_EXTICR1...SYSCFG_EXTICR4: + s->syscfg_exticr[addr / 4 - SYSCFG_EXTICR1 / 4] = (value & 0xFFFF); + return; + case SYSCFG_CMPCR: + s->syscfg_cmpcr = value; + return; + default: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: Bad offset 0x%"HWADDR_PRIx"\n", __func__, addr); + } +} + +static const MemoryRegionOps stm32f4xx_syscfg_ops = { + .read = stm32f4xx_syscfg_read, + .write = stm32f4xx_syscfg_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static void stm32f4xx_syscfg_init(Object *obj) +{ + STM32F4xxSyscfgState *s = STM32F4XX_SYSCFG(obj); + + sysbus_init_irq(SYS_BUS_DEVICE(obj), &s->irq); + + memory_region_init_io(&s->mmio, obj, &stm32f4xx_syscfg_ops, s, + TYPE_STM32F4XX_SYSCFG, 0x400); + sysbus_init_mmio(SYS_BUS_DEVICE(obj), &s->mmio); + + qdev_init_gpio_in(DEVICE(obj), stm32f4xx_syscfg_set_irq, 16 * 9); + qdev_init_gpio_out(DEVICE(obj), s->gpio_out, 16); +} + +static const VMStateDescription vmstate_stm32f4xx_syscfg = { + .name = TYPE_STM32F4XX_SYSCFG, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(syscfg_memrmp, STM32F4xxSyscfgState), + VMSTATE_UINT32(syscfg_pmc, STM32F4xxSyscfgState), + VMSTATE_UINT32_ARRAY(syscfg_exticr, STM32F4xxSyscfgState, + SYSCFG_NUM_EXTICR), + VMSTATE_UINT32(syscfg_cmpcr, STM32F4xxSyscfgState), + VMSTATE_END_OF_LIST() + } +}; + +static void stm32f4xx_syscfg_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = stm32f4xx_syscfg_reset; + dc->vmsd = 
&vmstate_stm32f4xx_syscfg; +} + +static const TypeInfo stm32f4xx_syscfg_info = { + .name = TYPE_STM32F4XX_SYSCFG, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(STM32F4xxSyscfgState), + .instance_init = stm32f4xx_syscfg_init, + .class_init = stm32f4xx_syscfg_class_init, +}; + +static void stm32f4xx_syscfg_register_types(void) +{ + type_register_static(&stm32f4xx_syscfg_info); +} + +type_init(stm32f4xx_syscfg_register_types) diff --git a/hw/misc/trace-events b/hw/misc/trace-events index 2e0c820834..7f0f5dff3a 100644 --- a/hw/misc/trace-events +++ b/hw/misc/trace-events @@ -84,6 +84,17 @@ mos6522_set_sr_int(void) "set sr_int" mos6522_write(uint64_t addr, uint64_t val) "reg=0x%"PRIx64 " val=0x%"PRIx64 mos6522_read(uint64_t addr, unsigned val) "reg=0x%"PRIx64 " val=0x%x" +# stm32f4xx_syscfg +stm32f4xx_syscfg_set_irq(int gpio, int line, int level) "Interupt: GPIO: %d, Line: %d; Level: %d" +stm32f4xx_pulse_exti(int irq) "Pulse EXTI: %d" +stm32f4xx_syscfg_read(uint64_t addr) "reg read: addr: 0x%" PRIx64 " " +stm32f4xx_syscfg_write(uint64_t addr, uint64_t data) "reg write: addr: 0x%" PRIx64 " val: 0x%" PRIx64 "" + +# stm32f4xx_exti +stm32f4xx_exti_set_irq(int irq, int leve) "Set EXTI: %d to %d" +stm32f4xx_exti_read(uint64_t addr) "reg read: addr: 0x%" PRIx64 " " +stm32f4xx_exti_write(uint64_t addr, uint64_t data) "reg write: addr: 0x%" PRIx64 " val: 0x%" PRIx64 "" + # tz-mpc.c tz_mpc_reg_read(uint32_t offset, uint64_t data, unsigned size) "TZ MPC regs read: offset 0x%x data 0x%" PRIx64 " size %u" tz_mpc_reg_write(uint32_t offset, uint64_t data, unsigned size) "TZ MPC regs write: offset 0x%x data 0x%" PRIx64 " size %u" diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index cc71a7a036..6cc97769d9 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -1874,7 +1874,8 @@ static void e100_nic_realize(PCIDevice *pci_dev, Error **errp) s->vmstate = g_memdup(&vmstate_eepro100, sizeof(vmstate_eepro100)); s->vmstate->name = qemu_get_queue(s->nic)->model; - vmstate_register(VMSTATE_IF(&pci_dev->qdev), -1, s->vmstate, s); + vmstate_register(VMSTATE_IF(&pci_dev->qdev), VMSTATE_INSTANCE_ID_ANY, + s->vmstate, s); } static void eepro100_instance_init(Object *obj) diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 158d270b9f..6342f73b9f 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -275,20 +275,20 @@ static const TypeInfo q35_host_info = { * MCH D0:F0 */ -static uint64_t tseg_blackhole_read(void *ptr, hwaddr reg, unsigned size) +static uint64_t blackhole_read(void *ptr, hwaddr reg, unsigned size) { return 0xffffffff; } -static void tseg_blackhole_write(void *opaque, hwaddr addr, uint64_t val, - unsigned width) +static void blackhole_write(void *opaque, hwaddr addr, uint64_t val, + unsigned width) { /* nothing */ } -static const MemoryRegionOps tseg_blackhole_ops = { - .read = tseg_blackhole_read, - .write = tseg_blackhole_write, +static const MemoryRegionOps blackhole_ops = { + .read = blackhole_read, + .write = blackhole_write, .endianness = DEVICE_NATIVE_ENDIAN, .valid.min_access_size = 1, .valid.max_access_size = 4, @@ -430,6 +430,46 @@ static void mch_update_ext_tseg_mbytes(MCHPCIState *mch) } } +static void mch_update_smbase_smram(MCHPCIState *mch) +{ + PCIDevice *pd = PCI_DEVICE(mch); + uint8_t *reg = pd->config + MCH_HOST_BRIDGE_F_SMBASE; + bool lck; + + if (!mch->has_smram_at_smbase) { + return; + } + + if (*reg == MCH_HOST_BRIDGE_F_SMBASE_QUERY) { + pd->wmask[MCH_HOST_BRIDGE_F_SMBASE] = + MCH_HOST_BRIDGE_F_SMBASE_LCK; + *reg = MCH_HOST_BRIDGE_F_SMBASE_IN_RAM; + return; + 
} + + /* + * default/reset state, discard written value + * which will disable SMRAM balackhole at SMBASE + */ + if (pd->wmask[MCH_HOST_BRIDGE_F_SMBASE] == 0xff) { + *reg = 0x00; + } + + memory_region_transaction_begin(); + if (*reg & MCH_HOST_BRIDGE_F_SMBASE_LCK) { + /* disable all writes */ + pd->wmask[MCH_HOST_BRIDGE_F_SMBASE] &= + ~MCH_HOST_BRIDGE_F_SMBASE_LCK; + *reg = MCH_HOST_BRIDGE_F_SMBASE_LCK; + lck = true; + } else { + lck = false; + } + memory_region_set_enabled(&mch->smbase_blackhole, lck); + memory_region_set_enabled(&mch->smbase_window, lck); + memory_region_transaction_commit(); +} + static void mch_write_config(PCIDevice *d, uint32_t address, uint32_t val, int len) { @@ -456,6 +496,10 @@ static void mch_write_config(PCIDevice *d, MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_SIZE)) { mch_update_ext_tseg_mbytes(mch); } + + if (ranges_overlap(address, len, MCH_HOST_BRIDGE_F_SMBASE, 1)) { + mch_update_smbase_smram(mch); + } } static void mch_update(MCHPCIState *mch) @@ -464,6 +508,7 @@ static void mch_update(MCHPCIState *mch) mch_update_pam(mch); mch_update_smram(mch); mch_update_ext_tseg_mbytes(mch); + mch_update_smbase_smram(mch); /* * pci hole goes from end-of-low-ram to io-apic. @@ -514,6 +559,9 @@ static void mch_reset(DeviceState *qdev) MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY); } + d->config[MCH_HOST_BRIDGE_F_SMBASE] = 0; + d->wmask[MCH_HOST_BRIDGE_F_SMBASE] = 0xff; + mch_update(mch); } @@ -563,7 +611,7 @@ static void mch_realize(PCIDevice *d, Error **errp) memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram); memory_region_init_io(&mch->tseg_blackhole, OBJECT(mch), - &tseg_blackhole_ops, NULL, + &blackhole_ops, NULL, "tseg-blackhole", 0); memory_region_set_enabled(&mch->tseg_blackhole, false); memory_region_add_subregion_overlap(mch->system_memory, @@ -575,6 +623,27 @@ static void mch_realize(PCIDevice *d, Error **errp) memory_region_set_enabled(&mch->tseg_window, false); memory_region_add_subregion(&mch->smram, mch->below_4g_mem_size, &mch->tseg_window); + + /* + * This is not what hardware does, so it's QEMU specific hack. + * See commit message for details. 
+ */ + memory_region_init_io(&mch->smbase_blackhole, OBJECT(mch), &blackhole_ops, + NULL, "smbase-blackhole", + MCH_HOST_BRIDGE_SMBASE_SIZE); + memory_region_set_enabled(&mch->smbase_blackhole, false); + memory_region_add_subregion_overlap(mch->system_memory, + MCH_HOST_BRIDGE_SMBASE_ADDR, + &mch->smbase_blackhole, 1); + + memory_region_init_alias(&mch->smbase_window, OBJECT(mch), + "smbase-window", mch->ram_memory, + MCH_HOST_BRIDGE_SMBASE_ADDR, + MCH_HOST_BRIDGE_SMBASE_SIZE); + memory_region_set_enabled(&mch->smbase_window, false); + memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMBASE_ADDR, + &mch->smbase_window); + object_property_add_const_link(qdev_get_machine(), "smram", OBJECT(&mch->smram), &error_abort); @@ -601,6 +670,7 @@ uint64_t mch_mcfg_base(void) static Property mch_props[] = { DEFINE_PROP_UINT16("extended-tseg-mbytes", MCHPCIState, ext_tseg_mbytes, 16), + DEFINE_PROP_BOOL("smbase-smram", MCHPCIState, has_smram_at_smbase, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/pci/pci.c b/hw/pci/pci.c index e3d310365d..3ac7961451 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -122,7 +122,7 @@ static void pci_bus_realize(BusState *qbus, Error **errp) bus->machine_done.notify = pcibus_machine_done; qemu_add_machine_init_done_notifier(&bus->machine_done); - vmstate_register(NULL, -1, &vmstate_pcibus, bus); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_pcibus, bus); } static void pcie_bus_realize(BusState *qbus, Error **errp) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 30a5fbd3be..02cf53fc5b 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2948,7 +2948,7 @@ static void spapr_machine_init(MachineState *machine) * interface, this is a legacy from the sPAPREnvironment structure * which predated MachineState but had a similar function */ vmstate_register(NULL, 0, &vmstate_spapr, spapr); - register_savevm_live("spapr/htab", -1, 1, + register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1, &savevm_htab_handlers, spapr); qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine), diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 4bc73a370e..d3af42ef92 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -943,7 +943,13 @@ void virtio_scsi_common_unrealize(DeviceState *dev) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev); + int i; + virtio_delete_queue(vs->ctrl_vq); + virtio_delete_queue(vs->event_vq); + for (i = 0; i < vs->conf.num_queues; i++) { + virtio_delete_queue(vs->cmd_vqs[i]); + } g_free(vs->cmd_vqs); virtio_cleanup(vdev); } diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c index af524fabf7..beaa285685 100644 --- a/hw/timer/arm_timer.c +++ b/hw/timer/arm_timer.c @@ -180,7 +180,7 @@ static arm_timer_state *arm_timer_init(uint32_t freq) s->control = TIMER_CTRL_IE; s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_DEFAULT); - vmstate_register(NULL, -1, &vmstate_arm_timer, s); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s); return s; } diff --git a/hw/tpm/tpm_emulator.c b/hw/tpm/tpm_emulator.c index 10d587ed40..3a0fc442f3 100644 --- a/hw/tpm/tpm_emulator.c +++ b/hw/tpm/tpm_emulator.c @@ -914,7 +914,8 @@ static void tpm_emulator_inst_init(Object *obj) tpm_emu->cur_locty_number = ~0; qemu_mutex_init(&tpm_emu->mutex); - vmstate_register(NULL, -1, &vmstate_tpm_emulator, obj); + vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, + &vmstate_tpm_emulator, obj); } /* diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 
d27a10fcc6..2e81f5514f 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1061,7 +1061,7 @@ static void slave_read(void *opaque) fd[0]); break; default: - error_report("Received unexpected msg type."); + error_report("Received unexpected msg type: %d.", hdr.request); ret = -EINVAL; } diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c index f5744363a8..b6cee479bb 100644 --- a/hw/virtio/vhost-vsock.c +++ b/hw/virtio/vhost-vsock.c @@ -335,8 +335,10 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp) sizeof(struct virtio_vsock_config)); /* Receive and transmit queues belong to vhost */ - virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, vhost_vsock_handle_output); - virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, vhost_vsock_handle_output); + vsock->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_handle_output); + vsock->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_handle_output); /* The event queue belongs to QEMU */ vsock->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, @@ -363,6 +365,9 @@ static void vhost_vsock_device_realize(DeviceState *dev, Error **errp) err_vhost_dev: vhost_dev_cleanup(&vsock->vhost_dev); err_virtio: + virtio_delete_queue(vsock->recv_vq); + virtio_delete_queue(vsock->trans_vq); + virtio_delete_queue(vsock->event_vq); virtio_cleanup(vdev); close(vhostfd); return; @@ -379,6 +384,9 @@ static void vhost_vsock_device_unrealize(DeviceState *dev, Error **errp) vhost_vsock_set_status(vdev, 0); vhost_dev_cleanup(&vsock->vhost_dev); + virtio_delete_queue(vsock->recv_vq); + virtio_delete_queue(vsock->trans_vq); + virtio_delete_queue(vsock->event_vq); virtio_cleanup(vdev); } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 4da0d5a6c5..9edfadc81d 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -547,26 +547,28 @@ static void vhost_region_add_section(struct vhost_dev *dev, uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + section->offset_within_region; RAMBlock *mrs_rb = section->mr->ram_block; - size_t mrs_page = qemu_ram_pagesize(mrs_rb); trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, mrs_host); - /* Round the section to it's page size */ - /* First align the start down to a page boundary */ - uint64_t alignage = mrs_host & (mrs_page - 1); - if (alignage) { - mrs_host -= alignage; - mrs_size += alignage; - mrs_gpa -= alignage; - } - /* Now align the size up to a page boundary */ - alignage = mrs_size & (mrs_page - 1); - if (alignage) { - mrs_size += mrs_page - alignage; + if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { + /* Round the section to it's page size */ + /* First align the start down to a page boundary */ + size_t mrs_page = qemu_ram_pagesize(mrs_rb); + uint64_t alignage = mrs_host & (mrs_page - 1); + if (alignage) { + mrs_host -= alignage; + mrs_size += alignage; + mrs_gpa -= alignage; + } + /* Now align the size up to a page boundary */ + alignage = mrs_size & (mrs_page - 1); + if (alignage) { + mrs_size += mrs_page - alignage; + } + trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, + mrs_size, mrs_host); } - trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size, - mrs_host); if (dev->n_tmp_sections) { /* Since we already have at least one section, lets see if @@ -590,9 +592,10 @@ static void vhost_region_add_section(struct vhost_dev *dev, * match up in the same RAMBlock if they do. 
*/ if (mrs_gpa < prev_gpa_start) { - error_report("%s:Section rounded to %"PRIx64 - " prior to previous %"PRIx64, - __func__, mrs_gpa, prev_gpa_start); + error_report("%s:Section '%s' rounded to %"PRIx64 + " prior to previous '%s' %"PRIx64, + __func__, section->mr->name, mrs_gpa, + prev_sec->mr->name, prev_gpa_start); /* A way to cleanly fail here would be better */ return; } diff --git a/include/elf.h b/include/elf.h index 3501e0c8d0..8fbfe60e09 100644 --- a/include/elf.h +++ b/include/elf.h @@ -1650,6 +1650,7 @@ typedef struct elf64_shdr { #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension regs */ /* * Physical entry point into the kernel. diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h index fd499f7e2f..53de19753a 100644 --- a/include/exec/cpu_ldst.h +++ b/include/exec/cpu_ldst.h @@ -25,9 +25,13 @@ * * The syntax for the accessors is: * - * load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr) + * load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr) + * cpu_ld{sign}{size}_{mmusuffix}_ra(env, ptr, retaddr) + * cpu_ld{sign}{size}_mmuidx_ra(env, ptr, mmu_idx, retaddr) * - * store: cpu_st{sign}{size}_{mmusuffix}(env, ptr, val) + * store: cpu_st{size}_{mmusuffix}(env, ptr, val) + * cpu_st{size}_{mmusuffix}_ra(env, ptr, val, retaddr) + * cpu_st{size}_mmuidx_ra(env, ptr, val, mmu_idx, retaddr) * * sign is: * (empty): for 32 and 64 bit sizes @@ -40,9 +44,10 @@ * l: 32 bits * q: 64 bits * - * mmusuffix is one of the generic suffixes "data" or "code", or - * (for softmmu configs) a target-specific MMU mode suffix as defined - * in target cpu.h. + * mmusuffix is one of the generic suffixes "data" or "code", or "mmuidx". + * The "mmuidx" suffix carries an extra mmu_idx argument that specifies + * the index to use; the "data" and "code" suffixes take the index from + * cpu_mmu_index(). 
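As a reader aid for the accessor naming summarized in the comment above, here is a minimal, hypothetical helper (not part of this patch; the name demo_copy32 and its use are illustrative only) contrasting the "data" form, whose MMU index comes from cpu_mmu_index(), with the new "mmuidx" form, which takes an explicit index:

    /* Sketch only; assumes "exec/cpu_ldst.h" is included by the target. */
    static void demo_copy32(CPUArchState *env, abi_ptr src, abi_ptr dst,
                            int mmu_idx, uintptr_t retaddr)
    {
        /* "data" form: MMU index taken from cpu_mmu_index(env, false). */
        uint32_t val = cpu_ldl_data_ra(env, src, retaddr);
        /* "mmuidx" form: the caller supplies the MMU index explicitly. */
        cpu_stl_mmuidx_ra(env, dst, val, mmu_idx, retaddr);
    }
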
*/ #ifndef CPU_LDST_H #define CPU_LDST_H @@ -89,6 +94,34 @@ typedef target_ulong abi_ptr; #define TARGET_ABI_FMT_ptr TARGET_ABI_FMT_lx #endif +uint32_t cpu_ldub_data(CPUArchState *env, abi_ptr ptr); +uint32_t cpu_lduw_data(CPUArchState *env, abi_ptr ptr); +uint32_t cpu_ldl_data(CPUArchState *env, abi_ptr ptr); +uint64_t cpu_ldq_data(CPUArchState *env, abi_ptr ptr); +int cpu_ldsb_data(CPUArchState *env, abi_ptr ptr); +int cpu_ldsw_data(CPUArchState *env, abi_ptr ptr); + +uint32_t cpu_ldub_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr); +uint32_t cpu_lduw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr); +uint32_t cpu_ldl_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr); +uint64_t cpu_ldq_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr); +int cpu_ldsb_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr); +int cpu_ldsw_data_ra(CPUArchState *env, abi_ptr ptr, uintptr_t retaddr); + +void cpu_stb_data(CPUArchState *env, abi_ptr ptr, uint32_t val); +void cpu_stw_data(CPUArchState *env, abi_ptr ptr, uint32_t val); +void cpu_stl_data(CPUArchState *env, abi_ptr ptr, uint32_t val); +void cpu_stq_data(CPUArchState *env, abi_ptr ptr, uint64_t val); + +void cpu_stb_data_ra(CPUArchState *env, abi_ptr ptr, + uint32_t val, uintptr_t retaddr); +void cpu_stw_data_ra(CPUArchState *env, abi_ptr ptr, + uint32_t val, uintptr_t retaddr); +void cpu_stl_data_ra(CPUArchState *env, abi_ptr ptr, + uint32_t val, uintptr_t retaddr); +void cpu_stq_data_ra(CPUArchState *env, abi_ptr ptr, + uint64_t val, uintptr_t retaddr); + #if defined(CONFIG_USER_ONLY) extern __thread uintptr_t helper_retaddr; @@ -113,47 +146,75 @@ static inline void clear_helper_retaddr(void) helper_retaddr = 0; } -/* In user-only mode we provide only the _code and _data accessors. */ +/* + * Provide the same *_mmuidx_ra interface as for softmmu. + * The mmu_idx argument is ignored. + */ -#define MEMSUFFIX _data -#define DATA_SIZE 1 -#include "exec/cpu_ldst_useronly_template.h" +static inline uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_ldub_data_ra(env, addr, ra); +} -#define DATA_SIZE 2 -#include "exec/cpu_ldst_useronly_template.h" +static inline uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_lduw_data_ra(env, addr, ra); +} -#define DATA_SIZE 4 -#include "exec/cpu_ldst_useronly_template.h" +static inline uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_ldl_data_ra(env, addr, ra); +} -#define DATA_SIZE 8 -#include "exec/cpu_ldst_useronly_template.h" -#undef MEMSUFFIX +static inline uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_ldq_data_ra(env, addr, ra); +} -/* - * Code access is deprecated in favour of translator_ld* functions - * (see translator.h). However there are still users that need to - * converted so for now these stay. 
- */ -#define MEMSUFFIX _code -#define CODE_ACCESS -#define DATA_SIZE 1 -#include "exec/cpu_ldst_useronly_template.h" +static inline int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_ldsb_data_ra(env, addr, ra); +} -#define DATA_SIZE 2 -#include "exec/cpu_ldst_useronly_template.h" +static inline int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra) +{ + return cpu_ldsw_data_ra(env, addr, ra); +} -#define DATA_SIZE 4 -#include "exec/cpu_ldst_useronly_template.h" +static inline void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, + uint32_t val, int mmu_idx, uintptr_t ra) +{ + cpu_stb_data_ra(env, addr, val, ra); +} -#define DATA_SIZE 8 -#include "exec/cpu_ldst_useronly_template.h" -#undef MEMSUFFIX -#undef CODE_ACCESS +static inline void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + uint32_t val, int mmu_idx, uintptr_t ra) +{ + cpu_stw_data_ra(env, addr, val, ra); +} + +static inline void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, + uint32_t val, int mmu_idx, uintptr_t ra) +{ + cpu_stl_data_ra(env, addr, val, ra); +} + +static inline void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, + uint64_t val, int mmu_idx, uintptr_t ra) +{ + cpu_stq_data_ra(env, addr, val, ra); +} #else -/* The memory helpers for tcg-generated code need tcg_target_long etc. */ -#include "tcg.h" +/* Needed for TCG_OVERSIZED_GUEST */ +#include "tcg/tcg.h" static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry) { @@ -173,11 +234,6 @@ static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx, return (addr >> TARGET_PAGE_BITS) & size_mask; } -static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx) -{ - return (env_tlb(env)->f[mmu_idx].mask >> CPU_TLB_ENTRY_BITS) + 1; -} - /* Find the TLB entry corresponding to the mmu_idx + address pair. 
*/ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx, target_ulong addr) @@ -185,280 +241,45 @@ static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx, return &env_tlb(env)->f[mmu_idx].table[tlb_index(env, mmu_idx, addr)]; } -#ifdef MMU_MODE0_SUFFIX -#define CPU_MMU_INDEX 0 -#define MEMSUFFIX MMU_MODE0_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif - -#if (NB_MMU_MODES >= 2) && defined(MMU_MODE1_SUFFIX) -#define CPU_MMU_INDEX 1 -#define MEMSUFFIX MMU_MODE1_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif - -#if (NB_MMU_MODES >= 3) && defined(MMU_MODE2_SUFFIX) - -#define CPU_MMU_INDEX 2 -#define MEMSUFFIX MMU_MODE2_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 3) */ - -#if (NB_MMU_MODES >= 4) && defined(MMU_MODE3_SUFFIX) - -#define CPU_MMU_INDEX 3 -#define MEMSUFFIX MMU_MODE3_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 4) */ - -#if (NB_MMU_MODES >= 5) && defined(MMU_MODE4_SUFFIX) - -#define CPU_MMU_INDEX 4 -#define MEMSUFFIX MMU_MODE4_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" +uint32_t cpu_ldub_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra); +uint32_t cpu_lduw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra); +uint32_t cpu_ldl_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra); +uint64_t cpu_ldq_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra); + +int cpu_ldsb_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra); +int cpu_ldsw_mmuidx_ra(CPUArchState *env, abi_ptr addr, + int mmu_idx, uintptr_t ra); + +void cpu_stb_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val, + int mmu_idx, uintptr_t retaddr); +void cpu_stw_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val, + int mmu_idx, uintptr_t retaddr); +void cpu_stl_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint32_t val, + int mmu_idx, uintptr_t retaddr); +void cpu_stq_mmuidx_ra(CPUArchState *env, abi_ptr addr, uint64_t val, + int mmu_idx, uintptr_t retaddr); -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 5) */ - -#if (NB_MMU_MODES >= 6) && defined(MMU_MODE5_SUFFIX) - -#define CPU_MMU_INDEX 5 -#define MEMSUFFIX MMU_MODE5_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define 
DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 6) */ - -#if (NB_MMU_MODES >= 7) && defined(MMU_MODE6_SUFFIX) - -#define CPU_MMU_INDEX 6 -#define MEMSUFFIX MMU_MODE6_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 7) */ - -#if (NB_MMU_MODES >= 8) && defined(MMU_MODE7_SUFFIX) - -#define CPU_MMU_INDEX 7 -#define MEMSUFFIX MMU_MODE7_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 8) */ - -#if (NB_MMU_MODES >= 9) && defined(MMU_MODE8_SUFFIX) - -#define CPU_MMU_INDEX 8 -#define MEMSUFFIX MMU_MODE8_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 9) */ - -#if (NB_MMU_MODES >= 10) && defined(MMU_MODE9_SUFFIX) - -#define CPU_MMU_INDEX 9 -#define MEMSUFFIX MMU_MODE9_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 10) */ - -#if (NB_MMU_MODES >= 11) && defined(MMU_MODE10_SUFFIX) - -#define CPU_MMU_INDEX 10 -#define MEMSUFFIX MMU_MODE10_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 11) */ - -#if (NB_MMU_MODES >= 12) && defined(MMU_MODE11_SUFFIX) - -#define CPU_MMU_INDEX 11 -#define MEMSUFFIX MMU_MODE11_SUFFIX -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif /* (NB_MMU_MODES >= 12) */ - -#if (NB_MMU_MODES > 12) -#error "NB_MMU_MODES > 12 is not supported for now" -#endif /* (NB_MMU_MODES > 12) */ - -/* these access are slower, they must be as rare as possible */ -#define CPU_MMU_INDEX (cpu_mmu_index(env, false)) -#define MEMSUFFIX _data -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX - -/* - * Code access is deprecated in favour of translator_ld* functions - * (see translator.h). However there are still users that need to - * converted so for now these stay. 
- */ - -#define CPU_MMU_INDEX (cpu_mmu_index(env, true)) -#define MEMSUFFIX _code -#define SOFTMMU_CODE_ACCESS - -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" +#endif /* defined(CONFIG_USER_ONLY) */ -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" +uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr addr); +uint32_t cpu_lduw_code(CPUArchState *env, abi_ptr addr); +uint32_t cpu_ldl_code(CPUArchState *env, abi_ptr addr); +uint64_t cpu_ldq_code(CPUArchState *env, abi_ptr addr); -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#undef SOFTMMU_CODE_ACCESS +static inline int cpu_ldsb_code(CPUArchState *env, abi_ptr addr) +{ + return (int8_t)cpu_ldub_code(env, addr); +} -#endif /* defined(CONFIG_USER_ONLY) */ +static inline int cpu_ldsw_code(CPUArchState *env, abi_ptr addr) +{ + return (int16_t)cpu_lduw_code(env, addr); +} /** * tlb_vaddr_to_host: diff --git a/include/exec/cpu_ldst_template.h b/include/exec/cpu_ldst_template.h deleted file mode 100644 index 54b5e858ce..0000000000 --- a/include/exec/cpu_ldst_template.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Software MMU support - * - * Generate inline load/store functions for one MMU mode and data - * size. - * - * Generate a store function as well as signed and unsigned loads. - * - * Not used directly but included from cpu_ldst.h. - * - * Copyright (c) 2003 Fabrice Bellard - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
- */ - -#if !defined(SOFTMMU_CODE_ACCESS) -#include "trace-root.h" -#endif - -#include "qemu/plugin.h" -#include "trace/mem.h" - -#if DATA_SIZE == 8 -#define SUFFIX q -#define USUFFIX q -#define DATA_TYPE uint64_t -#define SHIFT 3 -#elif DATA_SIZE == 4 -#define SUFFIX l -#define USUFFIX l -#define DATA_TYPE uint32_t -#define SHIFT 2 -#elif DATA_SIZE == 2 -#define SUFFIX w -#define USUFFIX uw -#define DATA_TYPE uint16_t -#define DATA_STYPE int16_t -#define SHIFT 1 -#elif DATA_SIZE == 1 -#define SUFFIX b -#define USUFFIX ub -#define DATA_TYPE uint8_t -#define DATA_STYPE int8_t -#define SHIFT 0 -#else -#error unsupported data size -#endif - -#if DATA_SIZE == 8 -#define RES_TYPE uint64_t -#else -#define RES_TYPE uint32_t -#endif - -#ifdef SOFTMMU_CODE_ACCESS -#define ADDR_READ addr_code -#define MMUSUFFIX _cmmu -#define URETSUFFIX USUFFIX -#define SRETSUFFIX glue(s, SUFFIX) -#else -#define ADDR_READ addr_read -#define MMUSUFFIX _mmu -#define URETSUFFIX USUFFIX -#define SRETSUFFIX glue(s, SUFFIX) -#endif - -/* generic load/store macros */ - -static inline RES_TYPE -glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env, - target_ulong ptr, - uintptr_t retaddr) -{ - CPUTLBEntry *entry; - RES_TYPE res; - target_ulong addr; - int mmu_idx = CPU_MMU_INDEX; - TCGMemOpIdx oi; -#if !defined(SOFTMMU_CODE_ACCESS) - uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false, mmu_idx); - trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); -#endif - - addr = ptr; - entry = tlb_entry(env, mmu_idx, addr); - if (unlikely(entry->ADDR_READ != - (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) { - oi = make_memop_idx(SHIFT, mmu_idx); - res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr, - oi, retaddr); - } else { - uintptr_t hostaddr = addr + entry->addend; - res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr); - } -#ifndef SOFTMMU_CODE_ACCESS - qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); -#endif - return res; -} - -static inline RES_TYPE -glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr) -{ - return glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(env, ptr, 0); -} - -#if DATA_SIZE <= 2 -static inline int -glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env, - target_ulong ptr, - uintptr_t retaddr) -{ - CPUTLBEntry *entry; - int res; - target_ulong addr; - int mmu_idx = CPU_MMU_INDEX; - TCGMemOpIdx oi; -#if !defined(SOFTMMU_CODE_ACCESS) - uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false, mmu_idx); - trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); -#endif - - addr = ptr; - entry = tlb_entry(env, mmu_idx, addr); - if (unlikely(entry->ADDR_READ != - (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) { - oi = make_memop_idx(SHIFT, mmu_idx); - res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX), - MMUSUFFIX)(env, addr, oi, retaddr); - } else { - uintptr_t hostaddr = addr + entry->addend; - res = glue(glue(lds, SUFFIX), _p)((uint8_t *)hostaddr); - } -#ifndef SOFTMMU_CODE_ACCESS - qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); -#endif - return res; -} - -static inline int -glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr) -{ - return glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(env, ptr, 0); -} -#endif - -#ifndef SOFTMMU_CODE_ACCESS - -/* generic store macro */ - -static inline void -glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env, - target_ulong ptr, - RES_TYPE v, uintptr_t retaddr) -{ - CPUTLBEntry *entry; - target_ulong addr; - int mmu_idx = 
CPU_MMU_INDEX; - TCGMemOpIdx oi; -#if !defined(SOFTMMU_CODE_ACCESS) - uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true, mmu_idx); - trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); -#endif - - addr = ptr; - entry = tlb_entry(env, mmu_idx, addr); - if (unlikely(tlb_addr_write(entry) != - (addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) { - oi = make_memop_idx(SHIFT, mmu_idx); - glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi, - retaddr); - } else { - uintptr_t hostaddr = addr + entry->addend; - glue(glue(st, SUFFIX), _p)((uint8_t *)hostaddr, v); - } -#ifndef SOFTMMU_CODE_ACCESS - qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); -#endif -} - -static inline void -glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr, - RES_TYPE v) -{ - glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(env, ptr, v, 0); -} - -#endif /* !SOFTMMU_CODE_ACCESS */ - -#undef RES_TYPE -#undef DATA_TYPE -#undef DATA_STYPE -#undef SUFFIX -#undef USUFFIX -#undef DATA_SIZE -#undef MMUSUFFIX -#undef ADDR_READ -#undef URETSUFFIX -#undef SRETSUFFIX -#undef SHIFT diff --git a/include/exec/cpu_ldst_useronly_template.h b/include/exec/cpu_ldst_useronly_template.h deleted file mode 100644 index dbdc7a845d..0000000000 --- a/include/exec/cpu_ldst_useronly_template.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * User-only accessor function support - * - * Generate inline load/store functions for one data size. - * - * Generate a store function as well as signed and unsigned loads. - * - * Not used directly but included from cpu_ldst.h. - * - * Copyright (c) 2015 Linaro Limited - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
- */ - -#if !defined(CODE_ACCESS) -#include "trace-root.h" -#endif - -#include "trace/mem.h" - -#if DATA_SIZE == 8 -#define SUFFIX q -#define USUFFIX q -#define DATA_TYPE uint64_t -#define SHIFT 3 -#elif DATA_SIZE == 4 -#define SUFFIX l -#define USUFFIX l -#define DATA_TYPE uint32_t -#define SHIFT 2 -#elif DATA_SIZE == 2 -#define SUFFIX w -#define USUFFIX uw -#define DATA_TYPE uint16_t -#define DATA_STYPE int16_t -#define SHIFT 1 -#elif DATA_SIZE == 1 -#define SUFFIX b -#define USUFFIX ub -#define DATA_TYPE uint8_t -#define DATA_STYPE int8_t -#define SHIFT 0 -#else -#error unsupported data size -#endif - -#if DATA_SIZE == 8 -#define RES_TYPE uint64_t -#else -#define RES_TYPE uint32_t -#endif - -static inline RES_TYPE -glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr) -{ - RES_TYPE ret; -#ifdef CODE_ACCESS - set_helper_retaddr(1); - ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr)); - clear_helper_retaddr(); -#else - uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, false, - MMU_USER_IDX); - trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); - ret = glue(glue(ld, USUFFIX), _p)(g2h(ptr)); -#endif - return ret; -} - -#ifndef CODE_ACCESS -static inline RES_TYPE -glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env, - abi_ptr ptr, - uintptr_t retaddr) -{ - RES_TYPE ret; - set_helper_retaddr(retaddr); - ret = glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(env, ptr); - clear_helper_retaddr(); - return ret; -} -#endif - -#if DATA_SIZE <= 2 -static inline int -glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr) -{ - int ret; -#ifdef CODE_ACCESS - set_helper_retaddr(1); - ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr)); - clear_helper_retaddr(); -#else - uint16_t meminfo = trace_mem_build_info(SHIFT, true, MO_TE, false, - MMU_USER_IDX); - trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); - ret = glue(glue(lds, SUFFIX), _p)(g2h(ptr)); - qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); -#endif - return ret; -} - -#ifndef CODE_ACCESS -static inline int -glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env, - abi_ptr ptr, - uintptr_t retaddr) -{ - int ret; - set_helper_retaddr(retaddr); - ret = glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(env, ptr); - clear_helper_retaddr(); - return ret; -} -#endif /* CODE_ACCESS */ -#endif /* DATA_SIZE <= 2 */ - -#ifndef CODE_ACCESS -static inline void -glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, abi_ptr ptr, - RES_TYPE v) -{ - uint16_t meminfo = trace_mem_build_info(SHIFT, false, MO_TE, true, - MMU_USER_IDX); - trace_guest_mem_before_exec(env_cpu(env), ptr, meminfo); - glue(glue(st, SUFFIX), _p)(g2h(ptr), v); - qemu_plugin_vcpu_mem_cb(env_cpu(env), ptr, meminfo); -} - -static inline void -glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env, - abi_ptr ptr, - RES_TYPE v, - uintptr_t retaddr) -{ - set_helper_retaddr(retaddr); - glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(env, ptr, v); - clear_helper_retaddr(); -} -#endif - -#undef RES_TYPE -#undef DATA_TYPE -#undef DATA_STYPE -#undef SUFFIX -#undef USUFFIX -#undef DATA_SIZE -#undef SHIFT diff --git a/include/exec/translator.h b/include/exec/translator.h index 459dd72aab..638e1529c5 100644 --- a/include/exec/translator.h +++ b/include/exec/translator.h @@ -148,41 +148,19 @@ void translator_loop_temp_check(DisasContextBase *db); /* * Translator Load Functions * - * These are intended to replace the old cpu_ld*_code functions and - * are mandatory for front-ends that have been migrated to the common - * translator_loop. 
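To illustrate the replacement being described here, a hedged sketch of how a front end fetches an instruction through the translator interface instead of calling cpu_ld*_code directly; the fetch helper and the way it advances pc_next are illustrative, not taken from this patch:

    /* Sketch only; assumes "exec/translator.h" is included. */
    static uint32_t demo_fetch16(CPUArchState *env, DisasContextBase *base)
    {
        /* translator_lduw() now expands to cpu_lduw_code() under the hood. */
        uint16_t insn = translator_lduw(env, base->pc_next);
        base->pc_next += 2;
        return insn;
    }
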
These functions are only intended to be called - * from the translation stage and should not be called from helper - * functions. Those functions should be converted to encode the - * relevant information at translation time. + * These are intended to replace the direct usage of the cpu_ld*_code + * functions and are mandatory for front-ends that have been migrated + * to the common translator_loop. These functions are only intended + * to be called from the translation stage and should not be called + * from helper functions. Those functions should be converted to encode + * the relevant information at translation time. */ -#ifdef CONFIG_USER_ONLY - -#define DO_LOAD(type, name, shift) \ - do { \ - set_helper_retaddr(1); \ - ret = name ## _p(g2h(pc)); \ - clear_helper_retaddr(); \ - } while (0) - -#else - -#define DO_LOAD(type, name, shift) \ - do { \ - int mmu_idx = cpu_mmu_index(env, true); \ - TCGMemOpIdx oi = make_memop_idx(shift, mmu_idx); \ - ret = helper_ret_ ## name ## _cmmu(env, pc, oi, 0); \ - } while (0) - -#endif - -#define GEN_TRANSLATOR_LD(fullname, name, type, shift, swap_fn) \ +#define GEN_TRANSLATOR_LD(fullname, type, load_fn, swap_fn) \ static inline type \ fullname ## _swap(CPUArchState *env, abi_ptr pc, bool do_swap) \ { \ - type ret; \ - DO_LOAD(type, name, shift); \ - \ + type ret = load_fn(env, pc); \ if (do_swap) { \ ret = swap_fn(ret); \ } \ @@ -195,11 +173,11 @@ void translator_loop_temp_check(DisasContextBase *db); return fullname ## _swap(env, pc, false); \ } -GEN_TRANSLATOR_LD(translator_ldub, ldub, uint8_t, 0, /* no swap */ ) -GEN_TRANSLATOR_LD(translator_ldsw, ldsw, int16_t, 1, bswap16) -GEN_TRANSLATOR_LD(translator_lduw, lduw, uint16_t, 1, bswap16) -GEN_TRANSLATOR_LD(translator_ldl, ldl, uint32_t, 2, bswap32) -GEN_TRANSLATOR_LD(translator_ldq, ldq, uint64_t, 3, bswap64) +GEN_TRANSLATOR_LD(translator_ldub, uint8_t, cpu_ldub_code, /* no swap */) +GEN_TRANSLATOR_LD(translator_ldsw, int16_t, cpu_ldsw_code, bswap16) +GEN_TRANSLATOR_LD(translator_lduw, uint16_t, cpu_lduw_code, bswap16) +GEN_TRANSLATOR_LD(translator_ldl, uint32_t, cpu_ldl_code, bswap32) +GEN_TRANSLATOR_LD(translator_ldq, uint64_t, cpu_ldq_code, bswap64) #undef GEN_TRANSLATOR_LD #endif /* EXEC__TRANSLATOR_H */ diff --git a/include/hw/arm/allwinner-a10.h b/include/hw/arm/allwinner-a10.h index 7d2d215630..40d0b1d9c0 100644 --- a/include/hw/arm/allwinner-a10.h +++ b/include/hw/arm/allwinner-a10.h @@ -12,12 +12,6 @@ #include "target/arm/cpu.h" -#define AW_A10_PIC_REG_BASE 0x01c20400 -#define AW_A10_PIT_REG_BASE 0x01c20c00 -#define AW_A10_UART0_REG_BASE 0x01c28000 -#define AW_A10_EMAC_BASE 0x01c0b000 -#define AW_A10_SATA_BASE 0x01c18000 - #define AW_A10_SDRAM_BASE 0x40000000 #define TYPE_AW_A10 "allwinner-a10" @@ -29,7 +23,6 @@ typedef struct AwA10State { /*< public >*/ ARMCPU cpu; - qemu_irq irq[AW_A10_PIC_INT_NR]; AwA10PITState timer; AwA10PICState intc; AwEmacState emac; diff --git a/include/hw/arm/exynos4210.h b/include/hw/arm/exynos4210.h index f0f23b0e9b..55260394af 100644 --- a/include/hw/arm/exynos4210.h +++ b/include/hw/arm/exynos4210.h @@ -24,6 +24,7 @@ #ifndef EXYNOS4210_H #define EXYNOS4210_H +#include "hw/or-irq.h" #include "hw/sysbus.h" #include "target/arm/cpu-qom.h" @@ -74,6 +75,8 @@ #define EXYNOS4210_I2C_NUMBER 9 +#define EXYNOS4210_NUM_DMA 3 + typedef struct Exynos4210Irq { qemu_irq int_combiner_irq[EXYNOS4210_MAX_INT_COMBINER_IN_IRQ]; qemu_irq ext_combiner_irq[EXYNOS4210_MAX_EXT_COMBINER_IN_IRQ]; @@ -97,6 +100,7 @@ typedef struct Exynos4210State { MemoryRegion boot_secondary; 
MemoryRegion bootreg_mem; I2CBus *i2c_if[EXYNOS4210_I2C_NUMBER]; + qemu_or_irq pl330_irq_orgate[EXYNOS4210_NUM_DMA]; } Exynos4210State; #define TYPE_EXYNOS4210_SOC "exynos4210" diff --git a/include/hw/arm/fsl-imx25.h b/include/hw/arm/fsl-imx25.h index 241efb52ae..1c86bb55fb 100644 --- a/include/hw/arm/fsl-imx25.h +++ b/include/hw/arm/fsl-imx25.h @@ -24,6 +24,7 @@ #include "hw/timer/imx_gpt.h" #include "hw/timer/imx_epit.h" #include "hw/net/imx_fec.h" +#include "hw/misc/imx_rngc.h" #include "hw/i2c/imx_i2c.h" #include "hw/gpio/imx_gpio.h" #include "exec/memory.h" @@ -50,6 +51,7 @@ typedef struct FslIMX25State { IMXGPTState gpt[FSL_IMX25_NUM_GPTS]; IMXEPITState epit[FSL_IMX25_NUM_EPITS]; IMXFECState fec; + IMXRNGCState rngc; IMXI2CState i2c[FSL_IMX25_NUM_I2CS]; IMXGPIOState gpio[FSL_IMX25_NUM_GPIOS]; MemoryRegion rom[2]; @@ -211,6 +213,8 @@ typedef struct FslIMX25State { #define FSL_IMX25_GPIO4_SIZE 0x4000 #define FSL_IMX25_GPIO3_ADDR 0x53FA4000 #define FSL_IMX25_GPIO3_SIZE 0x4000 +#define FSL_IMX25_RNGC_ADDR 0x53FB0000 +#define FSL_IMX25_RNGC_SIZE 0x4000 #define FSL_IMX25_GPIO1_ADDR 0x53FCC000 #define FSL_IMX25_GPIO1_SIZE 0x4000 #define FSL_IMX25_GPIO2_ADDR 0x53FD0000 @@ -238,6 +242,7 @@ typedef struct FslIMX25State { #define FSL_IMX25_EPIT1_IRQ 28 #define FSL_IMX25_EPIT2_IRQ 27 #define FSL_IMX25_FEC_IRQ 57 +#define FSL_IMX25_RNGC_IRQ 22 #define FSL_IMX25_I2C1_IRQ 3 #define FSL_IMX25_I2C2_IRQ 4 #define FSL_IMX25_I2C3_IRQ 10 diff --git a/include/hw/arm/stm32f405_soc.h b/include/hw/arm/stm32f405_soc.h new file mode 100644 index 0000000000..1fe97f8c3a --- /dev/null +++ b/include/hw/arm/stm32f405_soc.h @@ -0,0 +1,73 @@ +/* + * STM32F405 SoC + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef HW_ARM_STM32F405_SOC_H +#define HW_ARM_STM32F405_SOC_H + +#include "hw/misc/stm32f4xx_syscfg.h" +#include "hw/timer/stm32f2xx_timer.h" +#include "hw/char/stm32f2xx_usart.h" +#include "hw/adc/stm32f2xx_adc.h" +#include "hw/misc/stm32f4xx_exti.h" +#include "hw/or-irq.h" +#include "hw/ssi/stm32f2xx_spi.h" +#include "hw/arm/armv7m.h" + +#define TYPE_STM32F405_SOC "stm32f405-soc" +#define STM32F405_SOC(obj) \ + OBJECT_CHECK(STM32F405State, (obj), TYPE_STM32F405_SOC) + +#define STM_NUM_USARTS 7 +#define STM_NUM_TIMERS 4 +#define STM_NUM_ADCS 6 +#define STM_NUM_SPIS 6 + +#define FLASH_BASE_ADDRESS 0x08000000 +#define FLASH_SIZE (1024 * 1024) +#define SRAM_BASE_ADDRESS 0x20000000 +#define SRAM_SIZE (192 * 1024) + +typedef struct STM32F405State { + /*< private >*/ + SysBusDevice parent_obj; + /*< public >*/ + + char *cpu_type; + + ARMv7MState armv7m; + + STM32F4xxSyscfgState syscfg; + STM32F4xxExtiState exti; + STM32F2XXUsartState usart[STM_NUM_USARTS]; + STM32F2XXTimerState timer[STM_NUM_TIMERS]; + qemu_or_irq adc_irqs; + STM32F2XXADCState adc[STM_NUM_ADCS]; + STM32F2XXSPIState spi[STM_NUM_SPIS]; + + MemoryRegion sram; + MemoryRegion flash; + MemoryRegion flash_alias; +} STM32F405State; + +#endif diff --git a/include/hw/misc/imx_rngc.h b/include/hw/misc/imx_rngc.h new file mode 100644 index 0000000000..f0d2b44d4f --- /dev/null +++ b/include/hw/misc/imx_rngc.h @@ -0,0 +1,35 @@ +/* + * Freescale i.MX RNGC emulation + * + * Copyright (C) 2020 Martin Kaiser <martin@kaiser.cx> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef IMX_RNGC_H +#define IMX_RNGC_H + +#include "hw/sysbus.h" + +#define TYPE_IMX_RNGC "imx.rngc" +#define IMX_RNGC(obj) OBJECT_CHECK(IMXRNGCState, (obj), TYPE_IMX_RNGC) + +typedef struct IMXRNGCState { + /*< private >*/ + SysBusDevice parent_obj; + + /*< public >*/ + MemoryRegion iomem; + + uint8_t op_self_test; + uint8_t op_seed; + uint8_t mask; + bool auto_seed; + + QEMUBH *self_test_bh; + QEMUBH *seed_bh; + qemu_irq irq; +} IMXRNGCState; + +#endif /* IMX_RNGC_H */ diff --git a/include/hw/misc/stm32f4xx_exti.h b/include/hw/misc/stm32f4xx_exti.h new file mode 100644 index 0000000000..707036a41b --- /dev/null +++ b/include/hw/misc/stm32f4xx_exti.h @@ -0,0 +1,60 @@ +/* + * STM32F4XX EXTI + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef HW_STM_EXTI_H +#define HW_STM_EXTI_H + +#include "hw/sysbus.h" +#include "hw/hw.h" + +#define EXTI_IMR 0x00 +#define EXTI_EMR 0x04 +#define EXTI_RTSR 0x08 +#define EXTI_FTSR 0x0C +#define EXTI_SWIER 0x10 +#define EXTI_PR 0x14 + +#define TYPE_STM32F4XX_EXTI "stm32f4xx-exti" +#define STM32F4XX_EXTI(obj) \ + OBJECT_CHECK(STM32F4xxExtiState, (obj), TYPE_STM32F4XX_EXTI) + +#define NUM_GPIO_EVENT_IN_LINES 16 +#define NUM_INTERRUPT_OUT_LINES 16 + +typedef struct { + SysBusDevice parent_obj; + + MemoryRegion mmio; + + uint32_t exti_imr; + uint32_t exti_emr; + uint32_t exti_rtsr; + uint32_t exti_ftsr; + uint32_t exti_swier; + uint32_t exti_pr; + + qemu_irq irq[NUM_INTERRUPT_OUT_LINES]; +} STM32F4xxExtiState; + +#endif diff --git a/include/hw/misc/stm32f4xx_syscfg.h b/include/hw/misc/stm32f4xx_syscfg.h new file mode 100644 index 0000000000..c62c6629e5 --- /dev/null +++ b/include/hw/misc/stm32f4xx_syscfg.h @@ -0,0 +1,61 @@ +/* + * STM32F4xx SYSCFG + * + * Copyright (c) 2014 Alistair Francis <alistair@alistair23.me> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef HW_STM_SYSCFG_H +#define HW_STM_SYSCFG_H + +#include "hw/sysbus.h" +#include "hw/hw.h" + +#define SYSCFG_MEMRMP 0x00 +#define SYSCFG_PMC 0x04 +#define SYSCFG_EXTICR1 0x08 +#define SYSCFG_EXTICR2 0x0C +#define SYSCFG_EXTICR3 0x10 +#define SYSCFG_EXTICR4 0x14 +#define SYSCFG_CMPCR 0x20 + +#define TYPE_STM32F4XX_SYSCFG "stm32f4xx-syscfg" +#define STM32F4XX_SYSCFG(obj) \ + OBJECT_CHECK(STM32F4xxSyscfgState, (obj), TYPE_STM32F4XX_SYSCFG) + +#define SYSCFG_NUM_EXTICR 4 + +typedef struct { + /* <private> */ + SysBusDevice parent_obj; + + /* <public> */ + MemoryRegion mmio; + + uint32_t syscfg_memrmp; + uint32_t syscfg_pmc; + uint32_t syscfg_exticr[SYSCFG_NUM_EXTICR]; + uint32_t syscfg_cmpcr; + + qemu_irq irq; + qemu_irq gpio_out[16]; +} STM32F4xxSyscfgState; + +#endif diff --git a/include/hw/or-irq.h b/include/hw/or-irq.h index 3a3230dd84..0038bfbe3d 100644 --- a/include/hw/or-irq.h +++ b/include/hw/or-irq.h @@ -33,7 +33,7 @@ /* This can safely be increased if necessary without breaking * migration compatibility (as long as it remains greater than 15). 
*/ -#define MAX_OR_LINES 32 +#define MAX_OR_LINES 48 typedef struct OrIRQState qemu_or_irq; diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h index b3bcf2e632..976fbae599 100644 --- a/include/hw/pci-host/q35.h +++ b/include/hw/pci-host/q35.h @@ -32,6 +32,7 @@ #include "hw/acpi/ich9.h" #include "hw/pci-host/pam.h" #include "hw/i386/intel_iommu.h" +#include "qemu/units.h" #define TYPE_Q35_HOST_DEVICE "q35-pcihost" #define Q35_HOST_DEVICE(obj) \ @@ -54,6 +55,8 @@ typedef struct MCHPCIState { MemoryRegion smram_region, open_high_smram; MemoryRegion smram, low_smram, high_smram; MemoryRegion tseg_blackhole, tseg_window; + MemoryRegion smbase_blackhole, smbase_window; + bool has_smram_at_smbase; Range pci_hole; uint64_t below_4g_mem_size; uint64_t above_4g_mem_size; @@ -97,6 +100,13 @@ typedef struct Q35PCIHost { #define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY 0xffff #define MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_MAX 0xfff +#define MCH_HOST_BRIDGE_SMBASE_SIZE (128 * KiB) +#define MCH_HOST_BRIDGE_SMBASE_ADDR 0x30000 +#define MCH_HOST_BRIDGE_F_SMBASE 0x9c +#define MCH_HOST_BRIDGE_F_SMBASE_QUERY 0xff +#define MCH_HOST_BRIDGE_F_SMBASE_IN_RAM 0x01 +#define MCH_HOST_BRIDGE_F_SMBASE_LCK 0x02 + #define MCH_HOST_BRIDGE_PCIEXBAR 0x60 /* 64bit register */ #define MCH_HOST_BRIDGE_PCIEXBAR_SIZE 8 /* 64bit register */ #define MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT 0xb0000000 diff --git a/include/hw/virtio/vhost-vsock.h b/include/hw/virtio/vhost-vsock.h index d509d67c4a..bc5a988ee5 100644 --- a/include/hw/virtio/vhost-vsock.h +++ b/include/hw/virtio/vhost-vsock.h @@ -33,6 +33,8 @@ typedef struct { struct vhost_virtqueue vhost_vqs[2]; struct vhost_dev vhost_dev; VirtQueue *event_vq; + VirtQueue *recv_vq; + VirtQueue *trans_vq; QEMUTimer *post_load_timer; /*< public >*/ diff --git a/include/migration/register.h b/include/migration/register.h index 00c38ebe9f..c1dcff0f90 100644 --- a/include/migration/register.h +++ b/include/migration/register.h @@ -71,7 +71,7 @@ typedef struct SaveVMHandlers { } SaveVMHandlers; int register_savevm_live(const char *idstr, - int instance_id, + uint32_t instance_id, int version_id, const SaveVMHandlers *ops, void *opaque); diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 4aef72c426..30667631bc 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -229,6 +229,7 @@ extern const VMStateInfo vmstate_info_tmp; extern const VMStateInfo vmstate_info_bitmap; extern const VMStateInfo vmstate_info_qtailq; extern const VMStateInfo vmstate_info_gtree; +extern const VMStateInfo vmstate_info_qlist; #define type_check_2darray(t1,t2,n,m) ((t1(*)[n][m])0 - (t2*)0) /* @@ -798,6 +799,26 @@ extern const VMStateInfo vmstate_info_gtree; .offset = offsetof(_state, _field), \ } +/* + * For migrating a QLIST + * Target QLIST needs be properly initialized. 
+ * _type: type of QLIST element + * _next: name of QLIST_ENTRY entry field in QLIST element + * _vmsd: VMSD for QLIST element + * size: size of QLIST element + * start: offset of QLIST_ENTRY in QTAILQ element + */ +#define VMSTATE_QLIST_V(_field, _state, _version, _vmsd, _type, _next) \ +{ \ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .vmsd = &(_vmsd), \ + .size = sizeof(_type), \ + .info = &vmstate_info_qlist, \ + .offset = offsetof(_state, _field), \ + .start = offsetof(_type, _next), \ +} + /* _f : field name _f_n : num of elements field_name _n : num of elements @@ -1157,8 +1178,10 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); +#define VMSTATE_INSTANCE_ID_ANY -1 + /* Returns: 0 on success, -1 on failure */ -int vmstate_register_with_alias_id(VMStateIf *obj, int instance_id, +int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id, const VMStateDescription *vmsd, void *base, int alias_id, int required_for_version, diff --git a/include/qemu/queue.h b/include/qemu/queue.h index 4764d93ea3..4d4554a7ce 100644 --- a/include/qemu/queue.h +++ b/include/qemu/queue.h @@ -501,4 +501,43 @@ union { \ QTAILQ_RAW_TQH_CIRC(head)->tql_prev = QTAILQ_RAW_TQE_CIRC(elm, entry); \ } while (/*CONSTCOND*/0) +#define QLIST_RAW_FIRST(head) \ + field_at_offset(head, 0, void *) + +#define QLIST_RAW_NEXT(elm, entry) \ + field_at_offset(elm, entry, void *) + +#define QLIST_RAW_PREVIOUS(elm, entry) \ + field_at_offset(elm, entry + sizeof(void *), void *) + +#define QLIST_RAW_FOREACH(elm, head, entry) \ + for ((elm) = *QLIST_RAW_FIRST(head); \ + (elm); \ + (elm) = *QLIST_RAW_NEXT(elm, entry)) + +#define QLIST_RAW_INSERT_HEAD(head, elm, entry) do { \ + void *first = *QLIST_RAW_FIRST(head); \ + *QLIST_RAW_FIRST(head) = elm; \ + *QLIST_RAW_PREVIOUS(elm, entry) = QLIST_RAW_FIRST(head); \ + if (first) { \ + *QLIST_RAW_NEXT(elm, entry) = first; \ + *QLIST_RAW_PREVIOUS(first, entry) = QLIST_RAW_NEXT(elm, entry); \ + } else { \ + *QLIST_RAW_NEXT(elm, entry) = NULL; \ + } \ +} while (0) + +#define QLIST_RAW_REVERSE(head, elm, entry) do { \ + void *iter = *QLIST_RAW_FIRST(head), *prev = NULL, *next; \ + while (iter) { \ + next = *QLIST_RAW_NEXT(iter, entry); \ + *QLIST_RAW_PREVIOUS(iter, entry) = QLIST_RAW_NEXT(next, entry); \ + *QLIST_RAW_NEXT(iter, entry) = prev; \ + prev = iter; \ + iter = next; \ + } \ + *QLIST_RAW_FIRST(head) = prev; \ + *QLIST_RAW_PREVIOUS(prev, entry) = QLIST_RAW_FIRST(head); \ +} while (0) + #endif /* QEMU_SYS_QUEUE_H */ diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h new file mode 100644 index 0000000000..f4df0a40f6 --- /dev/null +++ b/include/standard-headers/linux/fuse.h @@ -0,0 +1,891 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ +/* + This file defines the kernel interface of FUSE + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. + + This -- and only this -- header file may also be distributed under + the terms of the BSD Licence as follows: + + Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. +*/ + +/* + * This file defines the kernel interface of FUSE + * + * Protocol changelog: + * + * 7.1: + * - add the following messages: + * FUSE_SETATTR, FUSE_SYMLINK, FUSE_MKNOD, FUSE_MKDIR, FUSE_UNLINK, + * FUSE_RMDIR, FUSE_RENAME, FUSE_LINK, FUSE_OPEN, FUSE_READ, FUSE_WRITE, + * FUSE_RELEASE, FUSE_FSYNC, FUSE_FLUSH, FUSE_SETXATTR, FUSE_GETXATTR, + * FUSE_LISTXATTR, FUSE_REMOVEXATTR, FUSE_OPENDIR, FUSE_READDIR, + * FUSE_RELEASEDIR + * - add padding to messages to accommodate 32-bit servers on 64-bit kernels + * + * 7.2: + * - add FOPEN_DIRECT_IO and FOPEN_KEEP_CACHE flags + * - add FUSE_FSYNCDIR message + * + * 7.3: + * - add FUSE_ACCESS message + * - add FUSE_CREATE message + * - add filehandle to fuse_setattr_in + * + * 7.4: + * - add frsize to fuse_kstatfs + * - clean up request size limit checking + * + * 7.5: + * - add flags and max_write to fuse_init_out + * + * 7.6: + * - add max_readahead to fuse_init_in and fuse_init_out + * + * 7.7: + * - add FUSE_INTERRUPT message + * - add POSIX file lock support + * + * 7.8: + * - add lock_owner and flags fields to fuse_release_in + * - add FUSE_BMAP message + * - add FUSE_DESTROY message + * + * 7.9: + * - new fuse_getattr_in input argument of GETATTR + * - add lk_flags in fuse_lk_in + * - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in + * - add blksize field to fuse_attr + * - add file flags field to fuse_read_in and fuse_write_in + * - Add ATIME_NOW and MTIME_NOW flags to fuse_setattr_in + * + * 7.10 + * - add nonseekable open flag + * + * 7.11 + * - add IOCTL message + * - add unsolicited notification support + * - add POLL message and NOTIFY_POLL notification + * + * 7.12 + * - add umask flag to input argument of create, mknod and mkdir + * - add notification messages for invalidation of inodes and + * directory entries + * + * 7.13 + * - make max number of background requests and congestion threshold + * tunables + * + * 7.14 + * - add splice support to fuse device + * + * 7.15 + * - add store notify + * - add retrieve notify + * + * 7.16 + * - add BATCH_FORGET request + * - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct + * fuse_ioctl_iovec' instead of ambiguous 'struct iovec' + * - add FUSE_IOCTL_32BIT flag + * + * 7.17 + * - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK + * + * 7.18 + * - add FUSE_IOCTL_DIR flag + * - add FUSE_NOTIFY_DELETE + * + * 7.19 + * - add FUSE_FALLOCATE + * + * 7.20 + * - add 
FUSE_AUTO_INVAL_DATA + * + * 7.21 + * - add FUSE_READDIRPLUS + * - send the requested events in POLL request + * + * 7.22 + * - add FUSE_ASYNC_DIO + * + * 7.23 + * - add FUSE_WRITEBACK_CACHE + * - add time_gran to fuse_init_out + * - add reserved space to fuse_init_out + * - add FATTR_CTIME + * - add ctime and ctimensec to fuse_setattr_in + * - add FUSE_RENAME2 request + * - add FUSE_NO_OPEN_SUPPORT flag + * + * 7.24 + * - add FUSE_LSEEK for SEEK_HOLE and SEEK_DATA support + * + * 7.25 + * - add FUSE_PARALLEL_DIROPS + * + * 7.26 + * - add FUSE_HANDLE_KILLPRIV + * - add FUSE_POSIX_ACL + * + * 7.27 + * - add FUSE_ABORT_ERROR + * + * 7.28 + * - add FUSE_COPY_FILE_RANGE + * - add FOPEN_CACHE_DIR + * - add FUSE_MAX_PAGES, add max_pages to init_out + * - add FUSE_CACHE_SYMLINKS + * + * 7.29 + * - add FUSE_NO_OPENDIR_SUPPORT flag + * + * 7.30 + * - add FUSE_EXPLICIT_INVAL_DATA + * - add FUSE_IOCTL_COMPAT_X32 + * + * 7.31 + * - add FUSE_WRITE_KILL_PRIV flag + * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING + * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag + */ + +#ifndef _LINUX_FUSE_H +#define _LINUX_FUSE_H + +#include <stdint.h> + +/* + * Version negotiation: + * + * Both the kernel and userspace send the version they support in the + * INIT request and reply respectively. + * + * If the major versions match then both shall use the smallest + * of the two minor versions for communication. + * + * If the kernel supports a larger major version, then userspace shall + * reply with the major version it supports, ignore the rest of the + * INIT message and expect a new INIT message from the kernel with a + * matching major version. + * + * If the library supports a larger major version, then it shall fall + * back to the major protocol version sent by the kernel for + * communication and reply with that major version (and an arbitrary + * supported minor version). 
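In code form, the negotiation rules above reduce to something like the following on the userspace side, using only the fuse_init_in/fuse_init_out fields declared later in this header (a sketch, not part of the header itself):

    /* in = INIT request from the kernel, out = reply we build */
    if (in->major > FUSE_KERNEL_VERSION) {
        /* kernel is newer: reply with our major, expect a second INIT */
        out->major = FUSE_KERNEL_VERSION;
    } else if (in->major < FUSE_KERNEL_VERSION) {
        /* we are newer: fall back to the kernel's major version */
        out->major = in->major;
        out->minor = FUSE_KERNEL_MINOR_VERSION; /* any minor we support */
    } else {
        out->major = FUSE_KERNEL_VERSION;
        out->minor = MIN(in->minor, FUSE_KERNEL_MINOR_VERSION);
    }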
+ */ + +/** Version number of this interface */ +#define FUSE_KERNEL_VERSION 7 + +/** Minor version number of this interface */ +#define FUSE_KERNEL_MINOR_VERSION 31 + +/** The node ID of the root inode */ +#define FUSE_ROOT_ID 1 + +/* Make sure all structures are padded to 64bit boundary, so 32bit + userspace works under 64bit kernels */ + +struct fuse_attr { + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint32_t rdev; + uint32_t blksize; + uint32_t padding; +}; + +struct fuse_kstatfs { + uint64_t blocks; + uint64_t bfree; + uint64_t bavail; + uint64_t files; + uint64_t ffree; + uint32_t bsize; + uint32_t namelen; + uint32_t frsize; + uint32_t padding; + uint32_t spare[6]; +}; + +struct fuse_file_lock { + uint64_t start; + uint64_t end; + uint32_t type; + uint32_t pid; /* tgid */ +}; + +/** + * Bitmasks for fuse_setattr_in.valid + */ +#define FATTR_MODE (1 << 0) +#define FATTR_UID (1 << 1) +#define FATTR_GID (1 << 2) +#define FATTR_SIZE (1 << 3) +#define FATTR_ATIME (1 << 4) +#define FATTR_MTIME (1 << 5) +#define FATTR_FH (1 << 6) +#define FATTR_ATIME_NOW (1 << 7) +#define FATTR_MTIME_NOW (1 << 8) +#define FATTR_LOCKOWNER (1 << 9) +#define FATTR_CTIME (1 << 10) + +/** + * Flags returned by the OPEN request + * + * FOPEN_DIRECT_IO: bypass page cache for this open file + * FOPEN_KEEP_CACHE: don't invalidate the data cache on open + * FOPEN_NONSEEKABLE: the file is not seekable + * FOPEN_CACHE_DIR: allow caching this directory + * FOPEN_STREAM: the file is stream-like (no file position at all) + */ +#define FOPEN_DIRECT_IO (1 << 0) +#define FOPEN_KEEP_CACHE (1 << 1) +#define FOPEN_NONSEEKABLE (1 << 2) +#define FOPEN_CACHE_DIR (1 << 3) +#define FOPEN_STREAM (1 << 4) + +/** + * INIT request/reply flags + * + * FUSE_ASYNC_READ: asynchronous read requests + * FUSE_POSIX_LOCKS: remote locking for POSIX file locks + * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) + * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem + * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." 
+ * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB + * FUSE_DONT_MASK: don't apply umask to file mode on create operations + * FUSE_SPLICE_WRITE: kernel supports splice write on the device + * FUSE_SPLICE_MOVE: kernel supports splice move on the device + * FUSE_SPLICE_READ: kernel supports splice read on the device + * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories + * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages + * FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one) + * FUSE_READDIRPLUS_AUTO: adaptive readdirplus + * FUSE_ASYNC_DIO: asynchronous direct I/O submission + * FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes + * FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens + * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir + * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc + * FUSE_POSIX_ACL: filesystem supports posix acls + * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED + * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages + * FUSE_CACHE_SYMLINKS: cache READLINK responses + * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir + * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request + * FUSE_MAP_ALIGNMENT: map_alignment field is valid + */ +#define FUSE_ASYNC_READ (1 << 0) +#define FUSE_POSIX_LOCKS (1 << 1) +#define FUSE_FILE_OPS (1 << 2) +#define FUSE_ATOMIC_O_TRUNC (1 << 3) +#define FUSE_EXPORT_SUPPORT (1 << 4) +#define FUSE_BIG_WRITES (1 << 5) +#define FUSE_DONT_MASK (1 << 6) +#define FUSE_SPLICE_WRITE (1 << 7) +#define FUSE_SPLICE_MOVE (1 << 8) +#define FUSE_SPLICE_READ (1 << 9) +#define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_HAS_IOCTL_DIR (1 << 11) +#define FUSE_AUTO_INVAL_DATA (1 << 12) +#define FUSE_DO_READDIRPLUS (1 << 13) +#define FUSE_READDIRPLUS_AUTO (1 << 14) +#define FUSE_ASYNC_DIO (1 << 15) +#define FUSE_WRITEBACK_CACHE (1 << 16) +#define FUSE_NO_OPEN_SUPPORT (1 << 17) +#define FUSE_PARALLEL_DIROPS (1 << 18) +#define FUSE_HANDLE_KILLPRIV (1 << 19) +#define FUSE_POSIX_ACL (1 << 20) +#define FUSE_ABORT_ERROR (1 << 21) +#define FUSE_MAX_PAGES (1 << 22) +#define FUSE_CACHE_SYMLINKS (1 << 23) +#define FUSE_NO_OPENDIR_SUPPORT (1 << 24) +#define FUSE_EXPLICIT_INVAL_DATA (1 << 25) +#define FUSE_MAP_ALIGNMENT (1 << 26) + +/** + * CUSE INIT request/reply flags + * + * CUSE_UNRESTRICTED_IOCTL: use unrestricted ioctl + */ +#define CUSE_UNRESTRICTED_IOCTL (1 << 0) + +/** + * Release flags + */ +#define FUSE_RELEASE_FLUSH (1 << 0) +#define FUSE_RELEASE_FLOCK_UNLOCK (1 << 1) + +/** + * Getattr flags + */ +#define FUSE_GETATTR_FH (1 << 0) + +/** + * Lock flags + */ +#define FUSE_LK_FLOCK (1 << 0) + +/** + * WRITE flags + * + * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed + * FUSE_WRITE_LOCKOWNER: lock_owner field is valid + * FUSE_WRITE_KILL_PRIV: kill suid and sgid bits + */ +#define FUSE_WRITE_CACHE (1 << 0) +#define FUSE_WRITE_LOCKOWNER (1 << 1) +#define FUSE_WRITE_KILL_PRIV (1 << 2) + +/** + * Read flags + */ +#define FUSE_READ_LOCKOWNER (1 << 1) + +/** + * Ioctl flags + * + * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine + * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed + * FUSE_IOCTL_RETRY: retry with new iovecs + * FUSE_IOCTL_32BIT: 32bit ioctl + * FUSE_IOCTL_DIR: is a directory + * FUSE_IOCTL_COMPAT_X32: x32 compat ioctl on 64bit machine (64bit time_t) + * + * 
FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +#define FUSE_IOCTL_COMPAT (1 << 0) +#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +#define FUSE_IOCTL_RETRY (1 << 2) +#define FUSE_IOCTL_32BIT (1 << 3) +#define FUSE_IOCTL_DIR (1 << 4) +#define FUSE_IOCTL_COMPAT_X32 (1 << 5) + +#define FUSE_IOCTL_MAX_IOV 256 + +/** + * Poll flags + * + * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify + */ +#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0) + +/** + * Fsync flags + * + * FUSE_FSYNC_FDATASYNC: Sync data only, not metadata + */ +#define FUSE_FSYNC_FDATASYNC (1 << 0) + +enum fuse_opcode { + FUSE_LOOKUP = 1, + FUSE_FORGET = 2, /* no reply */ + FUSE_GETATTR = 3, + FUSE_SETATTR = 4, + FUSE_READLINK = 5, + FUSE_SYMLINK = 6, + FUSE_MKNOD = 8, + FUSE_MKDIR = 9, + FUSE_UNLINK = 10, + FUSE_RMDIR = 11, + FUSE_RENAME = 12, + FUSE_LINK = 13, + FUSE_OPEN = 14, + FUSE_READ = 15, + FUSE_WRITE = 16, + FUSE_STATFS = 17, + FUSE_RELEASE = 18, + FUSE_FSYNC = 20, + FUSE_SETXATTR = 21, + FUSE_GETXATTR = 22, + FUSE_LISTXATTR = 23, + FUSE_REMOVEXATTR = 24, + FUSE_FLUSH = 25, + FUSE_INIT = 26, + FUSE_OPENDIR = 27, + FUSE_READDIR = 28, + FUSE_RELEASEDIR = 29, + FUSE_FSYNCDIR = 30, + FUSE_GETLK = 31, + FUSE_SETLK = 32, + FUSE_SETLKW = 33, + FUSE_ACCESS = 34, + FUSE_CREATE = 35, + FUSE_INTERRUPT = 36, + FUSE_BMAP = 37, + FUSE_DESTROY = 38, + FUSE_IOCTL = 39, + FUSE_POLL = 40, + FUSE_NOTIFY_REPLY = 41, + FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, + FUSE_READDIRPLUS = 44, + FUSE_RENAME2 = 45, + FUSE_LSEEK = 46, + FUSE_COPY_FILE_RANGE = 47, + FUSE_SETUPMAPPING = 48, + FUSE_REMOVEMAPPING = 49, + + /* CUSE specific operations */ + CUSE_INIT = 4096, + + /* Reserved opcodes: helpful to detect structure endian-ness */ + CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ + FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ +}; + +enum fuse_notify_code { + FUSE_NOTIFY_POLL = 1, + FUSE_NOTIFY_INVAL_INODE = 2, + FUSE_NOTIFY_INVAL_ENTRY = 3, + FUSE_NOTIFY_STORE = 4, + FUSE_NOTIFY_RETRIEVE = 5, + FUSE_NOTIFY_DELETE = 6, + FUSE_NOTIFY_CODE_MAX, +}; + +/* The read buffer is required to be at least 8k, but may be much larger */ +#define FUSE_MIN_READ_BUFFER 8192 + +#define FUSE_COMPAT_ENTRY_OUT_SIZE 120 + +struct fuse_entry_out { + uint64_t nodeid; /* Inode ID */ + uint64_t generation; /* Inode generation: nodeid:gen must + be unique for the fs's lifetime */ + uint64_t entry_valid; /* Cache timeout for the name */ + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t entry_valid_nsec; + uint32_t attr_valid_nsec; + struct fuse_attr attr; +}; + +struct fuse_forget_in { + uint64_t nlookup; +}; + +struct fuse_forget_one { + uint64_t nodeid; + uint64_t nlookup; +}; + +struct fuse_batch_forget_in { + uint32_t count; + uint32_t dummy; +}; + +struct fuse_getattr_in { + uint32_t getattr_flags; + uint32_t dummy; + uint64_t fh; +}; + +#define FUSE_COMPAT_ATTR_OUT_SIZE 96 + +struct fuse_attr_out { + uint64_t attr_valid; /* Cache timeout for the attributes */ + uint32_t attr_valid_nsec; + uint32_t dummy; + struct fuse_attr attr; +}; + +#define FUSE_COMPAT_MKNOD_IN_SIZE 8 + +struct fuse_mknod_in { + uint32_t mode; + uint32_t rdev; + uint32_t umask; + uint32_t padding; +}; + +struct fuse_mkdir_in { + uint32_t mode; + uint32_t umask; +}; + +struct fuse_rename_in { + uint64_t newdir; +}; + +struct fuse_rename2_in { + uint64_t newdir; + uint32_t flags; + uint32_t padding; +}; + +struct fuse_link_in { + uint64_t oldnodeid; +}; + +struct fuse_setattr_in { + uint32_t valid; + uint32_t padding; + uint64_t fh; + uint64_t 
size; + uint64_t lock_owner; + uint64_t atime; + uint64_t mtime; + uint64_t ctime; + uint32_t atimensec; + uint32_t mtimensec; + uint32_t ctimensec; + uint32_t mode; + uint32_t unused4; + uint32_t uid; + uint32_t gid; + uint32_t unused5; +}; + +struct fuse_open_in { + uint32_t flags; + uint32_t unused; +}; + +struct fuse_create_in { + uint32_t flags; + uint32_t mode; + uint32_t umask; + uint32_t padding; +}; + +struct fuse_open_out { + uint64_t fh; + uint32_t open_flags; + uint32_t padding; +}; + +struct fuse_release_in { + uint64_t fh; + uint32_t flags; + uint32_t release_flags; + uint64_t lock_owner; +}; + +struct fuse_flush_in { + uint64_t fh; + uint32_t unused; + uint32_t padding; + uint64_t lock_owner; +}; + +struct fuse_read_in { + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t read_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; +}; + +#define FUSE_COMPAT_WRITE_IN_SIZE 24 + +struct fuse_write_in { + uint64_t fh; + uint64_t offset; + uint32_t size; + uint32_t write_flags; + uint64_t lock_owner; + uint32_t flags; + uint32_t padding; +}; + +struct fuse_write_out { + uint32_t size; + uint32_t padding; +}; + +#define FUSE_COMPAT_STATFS_SIZE 48 + +struct fuse_statfs_out { + struct fuse_kstatfs st; +}; + +struct fuse_fsync_in { + uint64_t fh; + uint32_t fsync_flags; + uint32_t padding; +}; + +struct fuse_setxattr_in { + uint32_t size; + uint32_t flags; +}; + +struct fuse_getxattr_in { + uint32_t size; + uint32_t padding; +}; + +struct fuse_getxattr_out { + uint32_t size; + uint32_t padding; +}; + +struct fuse_lk_in { + uint64_t fh; + uint64_t owner; + struct fuse_file_lock lk; + uint32_t lk_flags; + uint32_t padding; +}; + +struct fuse_lk_out { + struct fuse_file_lock lk; +}; + +struct fuse_access_in { + uint32_t mask; + uint32_t padding; +}; + +struct fuse_init_in { + uint32_t major; + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; +}; + +#define FUSE_COMPAT_INIT_OUT_SIZE 8 +#define FUSE_COMPAT_22_INIT_OUT_SIZE 24 + +struct fuse_init_out { + uint32_t major; + uint32_t minor; + uint32_t max_readahead; + uint32_t flags; + uint16_t max_background; + uint16_t congestion_threshold; + uint32_t max_write; + uint32_t time_gran; + uint16_t max_pages; + uint16_t map_alignment; + uint32_t unused[8]; +}; + +#define CUSE_INIT_INFO_MAX 4096 + +struct cuse_init_in { + uint32_t major; + uint32_t minor; + uint32_t unused; + uint32_t flags; +}; + +struct cuse_init_out { + uint32_t major; + uint32_t minor; + uint32_t unused; + uint32_t flags; + uint32_t max_read; + uint32_t max_write; + uint32_t dev_major; /* chardev major */ + uint32_t dev_minor; /* chardev minor */ + uint32_t spare[10]; +}; + +struct fuse_interrupt_in { + uint64_t unique; +}; + +struct fuse_bmap_in { + uint64_t block; + uint32_t blocksize; + uint32_t padding; +}; + +struct fuse_bmap_out { + uint64_t block; +}; + +struct fuse_ioctl_in { + uint64_t fh; + uint32_t flags; + uint32_t cmd; + uint64_t arg; + uint32_t in_size; + uint32_t out_size; +}; + +struct fuse_ioctl_iovec { + uint64_t base; + uint64_t len; +}; + +struct fuse_ioctl_out { + int32_t result; + uint32_t flags; + uint32_t in_iovs; + uint32_t out_iovs; +}; + +struct fuse_poll_in { + uint64_t fh; + uint64_t kh; + uint32_t flags; + uint32_t events; +}; + +struct fuse_poll_out { + uint32_t revents; + uint32_t padding; +}; + +struct fuse_notify_poll_wakeup_out { + uint64_t kh; +}; + +struct fuse_fallocate_in { + uint64_t fh; + uint64_t offset; + uint64_t length; + uint32_t mode; + uint32_t padding; +}; + +struct fuse_in_header { + uint32_t 
len; + uint32_t opcode; + uint64_t unique; + uint64_t nodeid; + uint32_t uid; + uint32_t gid; + uint32_t pid; + uint32_t padding; +}; + +struct fuse_out_header { + uint32_t len; + int32_t error; + uint64_t unique; +}; + +struct fuse_dirent { + uint64_t ino; + uint64_t off; + uint32_t namelen; + uint32_t type; + char name[]; +}; + +#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) +#define FUSE_DIRENT_ALIGN(x) \ + (((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1)) +#define FUSE_DIRENT_SIZE(d) \ + FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) + +struct fuse_direntplus { + struct fuse_entry_out entry_out; + struct fuse_dirent dirent; +}; + +#define FUSE_NAME_OFFSET_DIRENTPLUS \ + offsetof(struct fuse_direntplus, dirent.name) +#define FUSE_DIRENTPLUS_SIZE(d) \ + FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen) + +struct fuse_notify_inval_inode_out { + uint64_t ino; + int64_t off; + int64_t len; +}; + +struct fuse_notify_inval_entry_out { + uint64_t parent; + uint32_t namelen; + uint32_t padding; +}; + +struct fuse_notify_delete_out { + uint64_t parent; + uint64_t child; + uint32_t namelen; + uint32_t padding; +}; + +struct fuse_notify_store_out { + uint64_t nodeid; + uint64_t offset; + uint32_t size; + uint32_t padding; +}; + +struct fuse_notify_retrieve_out { + uint64_t notify_unique; + uint64_t nodeid; + uint64_t offset; + uint32_t size; + uint32_t padding; +}; + +/* Matches the size of fuse_write_in */ +struct fuse_notify_retrieve_in { + uint64_t dummy1; + uint64_t offset; + uint32_t size; + uint32_t dummy2; + uint64_t dummy3; + uint64_t dummy4; +}; + +/* Device ioctls: */ +#define FUSE_DEV_IOC_CLONE _IOR(229, 0, uint32_t) + +struct fuse_lseek_in { + uint64_t fh; + uint64_t offset; + uint32_t whence; + uint32_t padding; +}; + +struct fuse_lseek_out { + uint64_t offset; +}; + +struct fuse_copy_file_range_in { + uint64_t fh_in; + uint64_t off_in; + uint64_t nodeid_out; + uint64_t fh_out; + uint64_t off_out; + uint64_t len; + uint64_t flags; +}; + +#endif /* _LINUX_FUSE_H */ diff --git a/tcg/tcg-gvec-desc.h b/include/tcg/tcg-gvec-desc.h index 0224ac3e78..0224ac3e78 100644 --- a/tcg/tcg-gvec-desc.h +++ b/include/tcg/tcg-gvec-desc.h diff --git a/tcg/tcg-mo.h b/include/tcg/tcg-mo.h index c2c55704e1..c2c55704e1 100644 --- a/tcg/tcg-mo.h +++ b/include/tcg/tcg-mo.h diff --git a/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h index 830d68f697..830d68f697 100644 --- a/tcg/tcg-op-gvec.h +++ b/include/tcg/tcg-op-gvec.h diff --git a/tcg/tcg-op.h b/include/tcg/tcg-op.h index 4af272daa5..230db6e022 100644 --- a/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -25,7 +25,7 @@ #ifndef TCG_TCG_OP_H #define TCG_TCG_OP_H -#include "tcg.h" +#include "tcg/tcg.h" #include "exec/helper-proto.h" #include "exec/helper-gen.h" diff --git a/tcg/tcg-opc.h b/include/tcg/tcg-opc.h index 9288a04946..9288a04946 100644 --- a/tcg/tcg-opc.h +++ b/include/tcg/tcg-opc.h diff --git a/tcg/tcg.h b/include/tcg/tcg.h index 92ca10dffc..54e5446880 100644 --- a/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -31,7 +31,7 @@ #include "qemu/bitops.h" #include "qemu/plugin.h" #include "qemu/queue.h" -#include "tcg-mo.h" +#include "tcg/tcg-mo.h" #include "tcg-target.h" #include "qemu/int128.h" @@ -211,7 +211,7 @@ typedef uint64_t TCGRegSet; typedef enum TCGOpcode { #define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name, -#include "tcg-opc.h" +#include "tcg/tcg-opc.h" #undef DEF NB_OPS, } TCGOpcode; @@ -1290,27 +1290,6 @@ void helper_be_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val, void 
helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val, TCGMemOpIdx oi, uintptr_t retaddr); -uint8_t helper_ret_ldub_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -int8_t helper_ret_ldsb_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -uint16_t helper_le_lduw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -int16_t helper_le_ldsw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -uint32_t helper_le_ldl_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -uint64_t helper_le_ldq_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -uint16_t helper_be_lduw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -int16_t helper_be_ldsw_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -uint32_t helper_be_ldl_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); -uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr, - TCGMemOpIdx oi, uintptr_t retaddr); - /* Temporary aliases until backends are converted. */ #ifdef TARGET_WORDS_BIGENDIAN # define helper_ret_ldsw_mmu helper_be_ldsw_mmu @@ -1322,10 +1301,6 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr, # define helper_ret_stw_mmu helper_be_stw_mmu # define helper_ret_stl_mmu helper_be_stl_mmu # define helper_ret_stq_mmu helper_be_stq_mmu -# define helper_ret_lduw_cmmu helper_be_lduw_cmmu -# define helper_ret_ldsw_cmmu helper_be_ldsw_cmmu -# define helper_ret_ldl_cmmu helper_be_ldl_cmmu -# define helper_ret_ldq_cmmu helper_be_ldq_cmmu #else # define helper_ret_ldsw_mmu helper_le_ldsw_mmu # define helper_ret_lduw_mmu helper_le_lduw_mmu @@ -1336,10 +1311,6 @@ uint64_t helper_be_ldq_cmmu(CPUArchState *env, target_ulong addr, # define helper_ret_stw_mmu helper_le_stw_mmu # define helper_ret_stl_mmu helper_le_stl_mmu # define helper_ret_stq_mmu helper_le_stq_mmu -# define helper_ret_lduw_cmmu helper_le_lduw_cmmu -# define helper_ret_ldsw_cmmu helper_le_ldsw_cmmu -# define helper_ret_ldl_cmmu helper_le_ldl_cmmu -# define helper_ret_ldq_cmmu helper_le_ldq_cmmu #endif uint32_t helper_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr, diff --git a/include/user/syscall-trace.h b/include/user/syscall-trace.h index 9e60473643..79fd3e5aa9 100644 --- a/include/user/syscall-trace.h +++ b/include/user/syscall-trace.h @@ -10,6 +10,8 @@ #ifndef _SYSCALL_TRACE_H_ #define _SYSCALL_TRACE_H_ +#include "trace-root.h" + /* * These helpers just provide a common place for the various * subsystems that want to track syscalls to put their hooks in. We diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 07b16cc0f4..f3080a1635 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -10,6 +10,7 @@ #include "qemu/path.h" #include "qemu/queue.h" #include "qemu/guest-random.h" +#include "qemu/units.h" #ifdef _ARCH_PPC64 #undef ARCH_DLINFO @@ -2191,7 +2192,7 @@ unsigned long init_guest_space(unsigned long host_start, * to where we need to put the commpage. 
*/ munmap((void *)real_start, host_size); - real_size = aligned_size + qemu_host_page_size; + real_size = aligned_size + align; real_start = (unsigned long) mmap((void *)real_start, real_size, PROT_NONE, flags, -1, 0); if (real_start == (unsigned long)-1) { @@ -2364,24 +2365,51 @@ static void load_elf_image(const char *image_name, int image_fd, } } - load_addr = loaddr; - if (ehdr->e_type == ET_DYN) { - /* The image indicates that it can be loaded anywhere. Find a - location that can hold the memory space required. If the - image is pre-linked, LOADDR will be non-zero. Since we do - not supply MAP_FIXED here we'll use that address if and - only if it remains available. */ - load_addr = target_mmap(loaddr, hiaddr - loaddr, PROT_NONE, - MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, - -1, 0); - if (load_addr == -1) { - goto exit_perror; + if (pinterp_name != NULL) { + /* + * This is the main executable. + * + * Reserve extra space for brk. + * We hold on to this space while placing the interpreter + * and the stack, lest they be placed immediately after + * the data segment and block allocation from the brk. + * + * 16MB is chosen as "large enough" without being so large + * as to allow the result to not fit with a 32-bit guest on + * a 32-bit host. + */ + info->reserve_brk = 16 * MiB; + hiaddr += info->reserve_brk; + + if (ehdr->e_type == ET_EXEC) { + /* + * Make sure that the low address does not conflict with + * MMAP_MIN_ADDR or the QEMU application itself. + */ + probe_guest_base(image_name, loaddr, hiaddr); } - } else if (pinterp_name != NULL) { - /* This is the main executable. Make sure that the low - address does not conflict with MMAP_MIN_ADDR or the - QEMU application itself. */ - probe_guest_base(image_name, loaddr, hiaddr); + } + + /* + * Reserve address space for all of this. + * + * In the case of ET_EXEC, we supply MAP_FIXED so that we get + * exactly the address range that is required. + * + * Otherwise this is ET_DYN, and we are searching for a location + * that can hold the memory space required. If the image is + * pre-linked, LOADDR will be non-zero, and the kernel should + * honor that address if it happens to be free. + * + * In both cases, we will overwrite pages in this range with mappings + * from the executable. + */ + load_addr = target_mmap(loaddr, hiaddr - loaddr, PROT_NONE, + MAP_PRIVATE | MAP_ANON | MAP_NORESERVE | + (ehdr->e_type == ET_EXEC ? MAP_FIXED : 0), + -1, 0); + if (load_addr == -1) { + goto exit_perror; } load_bias = load_addr - loaddr; @@ -2860,6 +2888,17 @@ int load_elf_binary(struct linux_binprm *bprm, struct image_info *info) bprm->core_dump = &elf_core_dump; #endif + /* + * If we reserved extra space for brk, release it now. + * The implementation of do_brk in syscalls.c expects to be able + * to mmap pages in this space. 
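The 16 MiB brk headroom uses a standard reservation trick: map the whole range inaccessible and non-committing so nothing else can land there, then replace pieces of it with real mappings as the segments (and later do_brk) need them. In isolation the reservation looks like this (illustrative, not the elfload.c code itself):

    /* Reserve 'size' bytes of address space without committing memory;
     * later mmap(..., MAP_FIXED, ...) calls over sub-ranges make it real. */
    void *resv = mmap(NULL, size, PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);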
+ */ + if (info->reserve_brk) { + abi_ulong start_brk = HOST_PAGE_ALIGN(info->brk); + abi_ulong end_brk = HOST_PAGE_ALIGN(info->brk + info->reserve_brk); + target_munmap(start_brk, end_brk - start_brk); + } + return 0; } diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index c6b9d6ad66..73dcc761e6 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -69,6 +69,29 @@ IOCTL(KDSETLED, 0, TYPE_INT) IOCTL_SPECIAL(KDSIGACCEPT, 0, do_ioctl_kdsigaccept, TYPE_INT) + IOCTL(RTC_AIE_ON, 0, TYPE_NULL) + IOCTL(RTC_AIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_UIE_ON, 0, TYPE_NULL) + IOCTL(RTC_UIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_PIE_ON, 0, TYPE_NULL) + IOCTL(RTC_PIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_WIE_ON, 0, TYPE_NULL) + IOCTL(RTC_WIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_ALM_READ, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_ALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_RD_TIME, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_SET_TIME, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_IRQP_READ, IOC_R, MK_PTR(TYPE_ULONG)) + IOCTL(RTC_IRQP_SET, IOC_W, TYPE_ULONG) + IOCTL(RTC_EPOCH_READ, IOC_R, MK_PTR(TYPE_ULONG)) + IOCTL(RTC_EPOCH_SET, IOC_W, TYPE_ULONG) + IOCTL(RTC_WKALM_RD, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) + IOCTL(RTC_WKALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) + IOCTL(RTC_PLL_GET, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info))) + IOCTL(RTC_PLL_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info))) + IOCTL(RTC_VL_READ, IOC_R, MK_PTR(TYPE_INT)) + IOCTL(RTC_VL_CLR, 0, TYPE_NULL) + IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) IOCTL(BLKRRPART, 0, TYPE_NULL) @@ -114,7 +137,13 @@ IOCTL(FDMSGON, 0, TYPE_NULL) IOCTL(FDMSGOFF, 0, TYPE_NULL) + IOCTL(FDSETEMSGTRESH, 0, TYPE_NULL) + IOCTL(FDFMTBEG, 0, TYPE_NULL) + IOCTL(FDFMTTRK, IOC_W, MK_PTR(MK_STRUCT(STRUCT_format_descr))) + IOCTL(FDFMTEND, 0, TYPE_NULL) IOCTL(FDFLUSH, 0, TYPE_NULL) + IOCTL(FDSETMAXERRS, IOC_W, MK_PTR(MK_STRUCT(STRUCT_floppy_max_errors))) + IOCTL(FDGETMAXERRS, IOC_R, MK_PTR(MK_STRUCT(STRUCT_floppy_max_errors))) IOCTL(FDRESET, 0, TYPE_NULL) IOCTL(FDRAWCMD, 0, TYPE_NULL) IOCTL(FDTWADDLE, 0, TYPE_NULL) @@ -138,6 +167,12 @@ IOCTL(FS_IOC_GETFLAGS, IOC_R, MK_PTR(TYPE_INT)) IOCTL(FS_IOC_SETFLAGS, IOC_W, MK_PTR(TYPE_INT)) + IOCTL(FS_IOC_GETVERSION, IOC_R, MK_PTR(TYPE_INT)) + IOCTL(FS_IOC_SETVERSION, IOC_W, MK_PTR(TYPE_INT)) + IOCTL(FS_IOC32_GETFLAGS, IOC_R, MK_PTR(TYPE_INT)) + IOCTL(FS_IOC32_SETFLAGS, IOC_W, MK_PTR(TYPE_INT)) + IOCTL(FS_IOC32_GETVERSION, IOC_R, MK_PTR(TYPE_INT)) + IOCTL(FS_IOC32_SETVERSION, IOC_W, MK_PTR(TYPE_INT)) #ifdef CONFIG_USBFS /* USB ioctls */ @@ -522,3 +557,9 @@ IOCTL_IGNORE(TIOCSTART) IOCTL_IGNORE(TIOCSTOP) #endif + +#ifdef CONFIG_KCOV + IOCTL(KCOV_ENABLE, 0, TYPE_NULL) + IOCTL(KCOV_DISABLE, 0, TYPE_NULL) + IOCTL(KCOV_INIT_TRACE, IOC_R, TYPE_ULONG) +#endif diff --git a/linux-user/main.c b/linux-user/main.c index 8718d03ee2..fba833aac9 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -37,7 +37,7 @@ #include "qemu/plugin.h" #include "cpu.h" #include "exec/exec-all.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "qemu/timer.h" #include "qemu/envlist.h" #include "qemu/guest-random.h" diff --git a/linux-user/qemu.h b/linux-user/qemu.h index f6f5fe5fbb..560a68090e 100644 --- a/linux-user/qemu.h +++ b/linux-user/qemu.h @@ -35,6 +35,7 @@ struct image_info { abi_ulong end_data; abi_ulong start_brk; abi_ulong brk; + abi_ulong reserve_brk; abi_ulong start_mmap; abi_ulong start_stack; abi_ulong stack_limit; diff --git 
a/linux-user/syscall.c b/linux-user/syscall.c index 171c0caef3..d60142f069 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -73,6 +73,9 @@ #ifdef CONFIG_SENDFILE #include <sys/sendfile.h> #endif +#ifdef CONFIG_KCOV +#include <sys/kcov.h> +#endif #define termios host_termios #define winsize host_winsize @@ -107,6 +110,7 @@ #include <netpacket/packet.h> #include <linux/netlink.h> #include <linux/if_alg.h> +#include <linux/rtc.h> #include "linux_loop.h" #include "uname.h" @@ -115,6 +119,7 @@ #include "user/syscall-trace.h" #include "qapi/error.h" #include "fd-trans.h" +#include "tcg/tcg.h" #ifndef CLONE_IO #define CLONE_IO 0x80000000 /* Clone io context */ @@ -5174,6 +5179,8 @@ static abi_long do_ioctl(int fd, int cmd, abi_long arg) break; case TYPE_PTRVOID: case TYPE_INT: + case TYPE_LONG: + case TYPE_ULONG: ret = get_errno(safe_ioctl(fd, ie->host_cmd, arg)); break; case TYPE_PTR: diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index 98c2119de9..9b61ae8547 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -763,6 +763,42 @@ struct target_pollfd { #define TARGET_KDSETLED 0x4B32 /* set led state [lights, not flags] */ #define TARGET_KDSIGACCEPT 0x4B4E +struct target_rtc_pll_info { + int pll_ctrl; + int pll_value; + int pll_max; + int pll_min; + int pll_posmult; + int pll_negmult; + abi_long pll_clock; +}; + +/* real time clock ioctls */ +#define TARGET_RTC_AIE_ON TARGET_IO('p', 0x01) +#define TARGET_RTC_AIE_OFF TARGET_IO('p', 0x02) +#define TARGET_RTC_UIE_ON TARGET_IO('p', 0x03) +#define TARGET_RTC_UIE_OFF TARGET_IO('p', 0x04) +#define TARGET_RTC_PIE_ON TARGET_IO('p', 0x05) +#define TARGET_RTC_PIE_OFF TARGET_IO('p', 0x06) +#define TARGET_RTC_WIE_ON TARGET_IO('p', 0x0f) +#define TARGET_RTC_WIE_OFF TARGET_IO('p', 0x10) +#define TARGET_RTC_ALM_READ TARGET_IOR('p', 0x08, struct rtc_time) +#define TARGET_RTC_ALM_SET TARGET_IOW('p', 0x07, struct rtc_time) +#define TARGET_RTC_RD_TIME TARGET_IOR('p', 0x09, struct rtc_time) +#define TARGET_RTC_SET_TIME TARGET_IOW('p', 0x0a, struct rtc_time) +#define TARGET_RTC_IRQP_READ TARGET_IOR('p', 0x0b, abi_ulong) +#define TARGET_RTC_IRQP_SET TARGET_IOW('p', 0x0c, abi_ulong) +#define TARGET_RTC_EPOCH_READ TARGET_IOR('p', 0x0d, abi_ulong) +#define TARGET_RTC_EPOCH_SET TARGET_IOW('p', 0x0e, abi_ulong) +#define TARGET_RTC_WKALM_RD TARGET_IOR('p', 0x10, struct rtc_wkalrm) +#define TARGET_RTC_WKALM_SET TARGET_IOW('p', 0x0f, struct rtc_wkalrm) +#define TARGET_RTC_PLL_GET TARGET_IOR('p', 0x11, \ + struct target_rtc_pll_info) +#define TARGET_RTC_PLL_SET TARGET_IOW('p', 0x12, \ + struct target_rtc_pll_info) +#define TARGET_RTC_VL_READ TARGET_IOR('p', 0x13, int) +#define TARGET_RTC_VL_CLR TARGET_IO('p', 0x14) + #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) #define TARGET_FIOGETOWN TARGET_IOR('f', 123, int) @@ -899,7 +935,13 @@ struct target_pollfd { #define TARGET_FDMSGON TARGET_IO(2, 0x45) #define TARGET_FDMSGOFF TARGET_IO(2, 0x46) +#define TARGET_FDFMTBEG TARGET_IO(2, 0x47) +#define TARGET_FDFMTTRK TARGET_IOW(2, 0x48, struct format_descr) +#define TARGET_FDFMTEND TARGET_IO(2, 0x49) +#define TARGET_FDSETEMSGTRESH TARGET_IO(2, 0x4a) #define TARGET_FDFLUSH TARGET_IO(2, 0x4b) +#define TARGET_FDSETMAXERRS TARGET_IOW(2, 0x4c, struct floppy_max_errors) +#define TARGET_FDGETMAXERRS TARGET_IOR(2, 0x0e, struct floppy_max_errors) #define TARGET_FDRESET TARGET_IO(2, 0x54) #define TARGET_FDRAWCMD TARGET_IO(2, 0x58) #define TARGET_FDTWADDLE TARGET_IO(2, 0x59) @@ -911,13 
+953,19 @@ struct target_pollfd { #define TARGET_FICLONE TARGET_IOW(0x94, 9, int) #define TARGET_FICLONERANGE TARGET_IOW(0x94, 13, struct file_clone_range) -/* Note that the ioctl numbers claim type "long" but the actual type - * used by the kernel is "int". +/* + * Note that the ioctl numbers for FS_IOC_<GET|SET><FLAGS|VERSION> + * claim type "long" but the actual type used by the kernel is "int". */ #define TARGET_FS_IOC_GETFLAGS TARGET_IOR('f', 1, abi_long) #define TARGET_FS_IOC_SETFLAGS TARGET_IOW('f', 2, abi_long) - +#define TARGET_FS_IOC_GETVERSION TARGET_IOR('v', 1, abi_long) +#define TARGET_FS_IOC_SETVERSION TARGET_IOW('v', 2, abi_long) #define TARGET_FS_IOC_FIEMAP TARGET_IOWR('f',11,struct fiemap) +#define TARGET_FS_IOC32_GETFLAGS TARGET_IOR('f', 1, int) +#define TARGET_FS_IOC32_SETFLAGS TARGET_IOW('f', 2, int) +#define TARGET_FS_IOC32_GETVERSION TARGET_IOR('v', 1, int) +#define TARGET_FS_IOC32_SETVERSION TARGET_IOW('v', 2, int) /* usb ioctls */ #define TARGET_USBDEVFS_CONTROL TARGET_IOWRU('U', 0) @@ -2422,6 +2470,11 @@ struct target_mtpos { #define TARGET_MTIOCGET TARGET_IOR('m', 2, struct target_mtget) #define TARGET_MTIOCPOS TARGET_IOR('m', 3, struct target_mtpos) +/* kcov ioctls */ +#define TARGET_KCOV_ENABLE TARGET_IO('c', 100) +#define TARGET_KCOV_DISABLE TARGET_IO('c', 101) +#define TARGET_KCOV_INIT_TRACE TARGET_IOR('c', 1, abi_ulong) + struct target_sysinfo { abi_long uptime; /* Seconds since boot */ abi_ulong loads[3]; /* 1, 5, and 15 minute load averages */ diff --git a/linux-user/syscall_types.h b/linux-user/syscall_types.h index 4e36983826..5ba4155047 100644 --- a/linux-user/syscall_types.h +++ b/linux-user/syscall_types.h @@ -255,12 +255,49 @@ STRUCT(blkpg_partition, MK_ARRAY(TYPE_CHAR, BLKPG_DEVNAMELTH), /* devname */ MK_ARRAY(TYPE_CHAR, BLKPG_VOLNAMELTH)) /* volname */ +STRUCT(rtc_time, + TYPE_INT, /* tm_sec */ + TYPE_INT, /* tm_min */ + TYPE_INT, /* tm_hour */ + TYPE_INT, /* tm_mday */ + TYPE_INT, /* tm_mon */ + TYPE_INT, /* tm_year */ + TYPE_INT, /* tm_wday */ + TYPE_INT, /* tm_yday */ + TYPE_INT) /* tm_isdst */ + +STRUCT(rtc_wkalrm, + TYPE_CHAR, /* enabled */ + TYPE_CHAR, /* pending */ + MK_STRUCT(STRUCT_rtc_time)) /* time */ + +STRUCT(rtc_pll_info, + TYPE_INT, /* pll_ctrl */ + TYPE_INT, /* pll_value */ + TYPE_INT, /* pll_max */ + TYPE_INT, /* pll_min */ + TYPE_INT, /* pll_posmult */ + TYPE_INT, /* pll_negmult */ + TYPE_LONG) /* pll_clock */ + STRUCT(blkpg_ioctl_arg, TYPE_INT, /* op */ TYPE_INT, /* flags */ TYPE_INT, /* datalen */ TYPE_PTRVOID) /* data */ +STRUCT(format_descr, + TYPE_INT, /* device */ + TYPE_INT, /* head */ + TYPE_INT) /* track */ + +STRUCT(floppy_max_errors, + TYPE_INT, /* abort */ + TYPE_INT, /* read_track */ + TYPE_INT, /* reset */ + TYPE_INT, /* recal */ + TYPE_INT) /* reporting */ + #if defined(CONFIG_USBFS) /* usb device ioctls */ STRUCT(usbdevfs_ctrltransfer, diff --git a/migration/migration.c b/migration/migration.c index 354ad072fa..990bff00c0 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1005,17 +1005,6 @@ static bool migrate_caps_check(bool *cap_list, #endif if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { - if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { - /* The decompression threads asynchronously write into RAM - * rather than use the atomic copies needed to avoid - * userfaulting. It should be possible to fix the decompression - * threads for compatibility in future. 
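Returning to the linux-user hunks above: the STRUCT(rtc_time)/STRUCT(rtc_wkalrm)/STRUCT(rtc_pll_info) descriptors are what let the generic ioctl thunk convert each field between guest and host layouts, so plain guest RTC code now round-trips. A hypothetical guest-side user:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/rtc.h>

    static int read_rtc(struct rtc_time *tm)
    {
        int fd = open("/dev/rtc0", O_RDONLY);
        int ret = (fd < 0) ? -1 : ioctl(fd, RTC_RD_TIME, tm);

        /* every int field is marshalled via the STRUCT_rtc_time thunk */
        if (fd >= 0) {
            close(fd);
        }
        return ret;
    }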
- */ - error_setg(errp, "Postcopy is not currently compatible " - "with compression"); - return false; - } - /* This check is reasonably expensive, so only when it's being * set the first time, also it's only the destination that needs * special support. @@ -1784,6 +1773,7 @@ void qmp_migrate_incoming(const char *uri, Error **errp) } if (!once) { error_setg(errp, "The incoming migration has already been started"); + return; } qemu_start_incoming_migration(uri, &local_err); @@ -2035,11 +2025,10 @@ void qmp_migrate_set_downtime(double value, Error **errp) } value *= 1000; /* Convert to milliseconds */ - value = MAX(0, MIN(INT64_MAX, value)); MigrateSetParameters p = { .has_downtime_limit = true, - .downtime_limit = value, + .downtime_limit = (int64_t)value, }; qmp_migrate_set_parameters(&p, errp); @@ -3224,6 +3213,37 @@ void migration_consume_urgent_request(void) qemu_sem_wait(&migrate_get_current()->rate_limit_sem); } +/* Returns true if the rate limiting was broken by an urgent request */ +bool migration_rate_limit(void) +{ + int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + MigrationState *s = migrate_get_current(); + + bool urgent = false; + migration_update_counters(s, now); + if (qemu_file_rate_limit(s->to_dst_file)) { + /* + * Wait for a delay to do rate limiting OR + * something urgent to post the semaphore. + */ + int ms = s->iteration_start_time + BUFFER_DELAY - now; + trace_migration_rate_limit_pre(ms); + if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { + /* + * We were woken by one or more urgent things but + * the timedwait will have consumed one of them. + * The service routine for the urgent wake will dec + * the semaphore itself for each item it consumes, + * so add this one we just eat back. + */ + qemu_sem_post(&s->rate_limit_sem); + urgent = true; + } + trace_migration_rate_limit_post(urgent); + } + return urgent; +} + /* * Master migration thread on the source VM. * It drives the migration and pumps the data down the outgoing channel. @@ -3290,8 +3310,6 @@ static void *migration_thread(void *opaque) trace_migration_thread_setup_complete(); while (migration_is_active(s)) { - int64_t current_time; - if (urgent || !qemu_file_rate_limit(s->to_dst_file)) { MigIterateState iter_state = migration_iteration_run(s); if (iter_state == MIG_ITERATE_SKIP) { @@ -3318,29 +3336,7 @@ static void *migration_thread(void *opaque) update_iteration_initial_status(s); } - current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - - migration_update_counters(s, current_time); - - urgent = false; - if (qemu_file_rate_limit(s->to_dst_file)) { - /* Wait for a delay to do rate limiting OR - * something urgent to post the semaphore. - */ - int ms = s->iteration_start_time + BUFFER_DELAY - current_time; - trace_migration_thread_ratelimit_pre(ms); - if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { - /* We were worken by one or more urgent things but - * the timedwait will have consumed one of them. - * The service routine for the urgent wake will dec - * the semaphore itself for each item it consumes, - * so add this one we just eat back. 
- */ - qemu_sem_post(&s->rate_limit_sem); - urgent = true; - } - trace_migration_thread_ratelimit_post(urgent); - } + urgent = migration_rate_limit(); } trace_migration_thread_after_loop(); diff --git a/migration/migration.h b/migration/migration.h index 79b3dda146..aa9ff6f27b 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -341,5 +341,6 @@ int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque); void migration_make_urgent_request(void); void migration_consume_urgent_request(void); +bool migration_rate_limit(void); #endif diff --git a/migration/ram.c b/migration/ram.c index 96feb4062c..d2208b5534 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -703,7 +703,7 @@ typedef struct { static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { - MultiFDInit_t msg; + MultiFDInit_t msg = {}; int ret; msg.magic = cpu_to_be32(MULTIFD_MAGIC); @@ -803,7 +803,10 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) } for (i = 0; i < p->pages->used; i++) { - packet->offset[i] = cpu_to_be64(p->pages->offset[i]); + /* there are architectures where ram_addr_t is 32 bit */ + uint64_t temp = p->pages->offset[i]; + + packet->offset[i] = cpu_to_be64(temp); } } @@ -877,10 +880,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) } for (i = 0; i < p->pages->used; i++) { - ram_addr_t offset = be64_to_cpu(packet->offset[i]); + uint64_t offset = be64_to_cpu(packet->offset[i]); if (offset > (block->used_length - TARGET_PAGE_SIZE)) { - error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT + error_setg(errp, "multifd: offset too long %" PRIu64 " (max " RAM_ADDR_FMT ")", offset, block->max_length); return -1; @@ -900,6 +903,12 @@ struct { uint64_t packet_num; /* send channels ready */ QemuSemaphore channels_ready; + /* + * Have we already run terminate threads. There is a race when it + * happens that we got one error while we are exiting. + * We will use atomic operations. Only valid values are 0 and 1. + */ + int exiting; } *multifd_send_state; /* @@ -928,6 +937,10 @@ static int multifd_send_pages(RAMState *rs) MultiFDPages_t *pages = multifd_send_state->pages; uint64_t transferred; + if (atomic_read(&multifd_send_state->exiting)) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { p = &multifd_send_state->params[i]; @@ -945,10 +958,10 @@ static int multifd_send_pages(RAMState *rs) } qemu_mutex_unlock(&p->mutex); } - p->pages->used = 0; + assert(!p->pages->used); + assert(!p->pages->block); p->packet_num = multifd_send_state->packet_num++; - p->pages->block = NULL; multifd_send_state->pages = p->pages; p->pages = pages; transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len; @@ -1009,6 +1022,16 @@ static void multifd_send_terminate_threads(Error *err) } } + /* + * We don't want to exit each threads twice. Depending on where + * we get the error, or if there are two independent errors in two + * threads at the same time, we can end calling this function + * twice. 
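The atomic 'exiting' flag makes the multifd teardown idempotent; the underlying idiom, shown in isolation with hypothetical names (atomic_xchg is QEMU's helper from qemu/atomic.h):

    static int teardown_started;

    static void begin_teardown(void)
    {
        if (atomic_xchg(&teardown_started, 1)) {
            return;             /* a racing caller already started it */
        }
        /* only the first caller ever reaches this point */
    }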
+ */ + if (atomic_xchg(&multifd_send_state->exiting, 1)) { + return; + } + for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -1033,6 +1056,10 @@ void multifd_save_cleanup(void) if (p->running) { qemu_thread_join(&p->thread); } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + socket_send_channel_destroy(p->c); p->c = NULL; qemu_mutex_destroy(&p->mutex); @@ -1118,6 +1145,10 @@ static void *multifd_send_thread(void *opaque) while (true) { qemu_sem_wait(&p->sem); + + if (atomic_read(&multifd_send_state->exiting)) { + break; + } qemu_mutex_lock(&p->mutex); if (p->pending_job) { @@ -1130,6 +1161,8 @@ static void *multifd_send_thread(void *opaque) p->flags = 0; p->num_packets++; p->num_pages += used; + p->pages->used = 0; + p->pages->block = NULL; qemu_mutex_unlock(&p->mutex); trace_multifd_send(p->id, packet_num, used, flags, @@ -1224,6 +1257,7 @@ int multifd_save_setup(void) multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); qemu_sem_init(&multifd_send_state->channels_ready, 0); + atomic_set(&multifd_send_state->exiting, 0); for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -1236,7 +1270,7 @@ int multifd_save_setup(void) p->id = i; p->pages = multifd_pages_init(page_count); p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(ram_addr_t) * page_count; + + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); p->packet->version = cpu_to_be32(MULTIFD_VERSION); @@ -1281,7 +1315,9 @@ static void multifd_recv_terminate_threads(Error *err) - normal quit, i.e. everything went fine, just finished - error quit: We close the channels so the channel threads finish the qio_channel_read_all_eof() */ - qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + if (p->c) { + qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + } qemu_mutex_unlock(&p->mutex); } } @@ -1307,6 +1343,10 @@ int multifd_load_cleanup(Error **errp) qemu_sem_post(&p->sem_sync); qemu_thread_join(&p->thread); } + } + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + object_unref(OBJECT(p->c)); p->c = NULL; qemu_mutex_destroy(&p->mutex); @@ -1447,7 +1487,7 @@ int multifd_load_setup(void) p->id = i; p->pages = multifd_pages_init(page_count); p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(ram_addr_t) * page_count; + + sizeof(uint64_t) * page_count; p->packet = g_malloc0(p->packet_len); p->name = g_strdup_printf("multifdrecv_%d", i); } @@ -1731,7 +1771,7 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs, if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) { uint8_t shift = rb->clear_bmap_shift; hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift); - hwaddr start = (page << TARGET_PAGE_BITS) & (-size); + hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size); /* * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... 
this @@ -1968,7 +2008,7 @@ static void ram_release_pages(const char *rbname, uint64_t offset, int pages) return; } - ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS); + ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS); } /* @@ -2056,7 +2096,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) uint8_t *p; bool send_async = true; RAMBlock *block = pss->block; - ram_addr_t offset = pss->page << TARGET_PAGE_BITS; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; ram_addr_t current_addr = block->offset + offset; p = block->host + offset; @@ -2243,7 +2283,8 @@ static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) *again = false; return false; } - if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) { + if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS) + >= pss->block->used_length) { /* Didn't find anything in this RAM Block */ pss->page = 0; pss->block = QLIST_NEXT_RCU(pss->block, next); @@ -2434,7 +2475,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) * it's the 1st request. */ error_report("ram_save_queue_pages no previous block"); - goto err; + return -1; } } else { ramblock = qemu_ram_block_by_name(rbname); @@ -2442,7 +2483,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) if (!ramblock) { /* We shouldn't be asked for a non-existent RAMBlock */ error_report("ram_save_queue_pages no block '%s'", rbname); - goto err; + return -1; } rs->last_req_rb = ramblock; } @@ -2451,7 +2492,7 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) error_report("%s request overrun start=" RAM_ADDR_FMT " len=" RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, __func__, start, len, ramblock->used_length); - goto err; + return -1; } struct RAMSrcPageRequest *new_entry = @@ -2467,9 +2508,6 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) qemu_mutex_unlock(&rs->src_page_req_mutex); return 0; - -err: - return -1; } static bool save_page_use_compression(RAMState *rs) @@ -2537,7 +2575,7 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, bool last_stage) { RAMBlock *block = pss->block; - ram_addr_t offset = pss->page << TARGET_PAGE_BITS; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; if (control_save_page(rs, block, offset, &res)) { @@ -2563,10 +2601,13 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss, } /* - * do not use multifd for compression as the first page in the new - * block should be posted out before sending the compressed page + * Do not use multifd for: + * 1. Compression as the first page in the new block should be posted out + * before sending the compressed page + * 2. 
In postcopy as one whole host page should be placed */ - if (!save_page_use_compression(rs) && migrate_use_multifd()) { + if (!save_page_use_compression(rs) && migrate_use_multifd() + && !migration_in_postcopy()) { return ram_save_multifd_page(rs, block, offset); } @@ -2617,8 +2658,11 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss, pages += tmppages; pss->page++; + /* Allow rate limiting to happen in the middle of huge pages */ + migration_rate_limit(); } while ((pss->page & (pagesize_bits - 1)) && - offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS)); + offset_in_ramblock(pss->block, + ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)); /* The offset we leave with is the last one we looked at */ pss->page--; @@ -2835,8 +2879,10 @@ void ram_postcopy_migrated_memory_release(MigrationState *ms) while (run_start < range) { unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); - ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS, - (run_end - run_start) << TARGET_PAGE_BITS); + ram_discard_range(block->idstr, + ((ram_addr_t)run_start) << TARGET_PAGE_BITS, + ((ram_addr_t)(run_end - run_start)) + << TARGET_PAGE_BITS); run_start = find_next_zero_bit(bitmap, range, run_end + 1); } } @@ -3072,8 +3118,6 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms) */ int ram_discard_range(const char *rbname, uint64_t start, size_t length) { - int ret = -1; - trace_ram_discard_range(rbname, start, length); RCU_READ_LOCK_GUARD(); @@ -3081,7 +3125,7 @@ int ram_discard_range(const char *rbname, uint64_t start, size_t length) if (!rb) { error_report("ram_discard_range: Failed to find block '%s'", rbname); - goto err; + return -1; } /* @@ -3093,10 +3137,7 @@ int ram_discard_range(const char *rbname, uint64_t start, size_t length) length >> qemu_target_page_bits()); } - ret = ram_block_discard_range(rb, start, length); - -err: - return ret; + return ram_block_discard_range(rb, start, length); } /* @@ -3451,6 +3492,14 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) rs->target_page_count += pages; /* + * During postcopy, it is necessary to make sure one whole host + * page is sent in one chunk. + */ + if (migrate_postcopy_ram()) { + flush_compressed_data(rs); + } + + /* * we want to check in the 1st loop, just in case it was the 1st * time and we had to sync the dirty bitmap. 
* qemu_clock_get_ns() is a bit expensive, so we only check each @@ -4031,8 +4080,9 @@ static int ram_load_postcopy(QEMUFile *f) MigrationIncomingState *mis = migration_incoming_get_current(); /* Temporary page that is later 'placed' */ void *postcopy_host_page = mis->postcopy_tmp_page; - void *last_host = NULL; + void *this_host = NULL; bool all_zero = false; + int target_pages = 0; while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr; @@ -4041,6 +4091,7 @@ static int ram_load_postcopy(QEMUFile *f) void *place_source = NULL; RAMBlock *block = NULL; uint8_t ch; + int len; addr = qemu_get_be64(f); @@ -4058,7 +4109,8 @@ static int ram_load_postcopy(QEMUFile *f) trace_ram_load_postcopy_loop((uint64_t)addr, flags); place_needed = false; - if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { + if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | + RAM_SAVE_FLAG_COMPRESS_PAGE)) { block = ram_block_from_stream(f, flags); host = host_from_ram_block_offset(block, addr); @@ -4067,6 +4119,7 @@ static int ram_load_postcopy(QEMUFile *f) ret = -EINVAL; break; } + target_pages++; matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; /* * Postcopy requires that we place whole host pages atomically; @@ -4076,38 +4129,47 @@ static int ram_load_postcopy(QEMUFile *f) * that's moved into place later. * The migration protocol uses, possibly smaller, target-pages * however the source ensures it always sends all the components - * of a host page in order. + * of a host page in one chunk. */ page_buffer = postcopy_host_page + ((uintptr_t)host & (block->page_size - 1)); /* If all TP are zero then we can optimise the place */ - if (!((uintptr_t)host & (block->page_size - 1))) { + if (target_pages == 1) { all_zero = true; + this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, + block->page_size); } else { /* not the 1st TP within the HP */ - if (host != (last_host + TARGET_PAGE_SIZE)) { - error_report("Non-sequential target page %p/%p", - host, last_host); + if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) != + (uintptr_t)this_host) { + error_report("Non-same host page %p/%p", + host, this_host); ret = -EINVAL; break; } } - /* * If it's the last part of a host page then we place the host * page */ - place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & - (block->page_size - 1)) == 0; + if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) { + place_needed = true; + target_pages = 0; + } place_source = postcopy_host_page; } - last_host = host; switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_ZERO: ch = qemu_get_byte(f); - memset(page_buffer, ch, TARGET_PAGE_SIZE); + /* + * Can skip to set page_buffer when + * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). + */ + if (ch || !matches_target_page_size) { + memset(page_buffer, ch, TARGET_PAGE_SIZE); + } if (ch) { all_zero = false; } @@ -4131,6 +4193,17 @@ static int ram_load_postcopy(QEMUFile *f) TARGET_PAGE_SIZE); } break; + case RAM_SAVE_FLAG_COMPRESS_PAGE: + all_zero = false; + len = qemu_get_be32(f); + if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { + error_report("Invalid compressed data length: %d", len); + ret = -EINVAL; + break; + } + decompress_data_with_multi_threads(f, page_buffer, len); + break; + case RAM_SAVE_FLAG_EOS: /* normal exit */ multifd_recv_sync_main(); @@ -4142,6 +4215,11 @@ static int ram_load_postcopy(QEMUFile *f) break; } + /* Got the whole host page, wait for decompress before placing. 
*/ + if (place_needed) { + ret |= wait_for_decompress_done(); + } + /* Detect for any possible file errors */ if (!ret && qemu_file_get_error(f)) { ret = qemu_file_get_error(f); @@ -4149,7 +4227,8 @@ static int ram_load_postcopy(QEMUFile *f) if (!ret && place_needed) { /* This gets called at the last target page in the host page */ - void *place_dest = host + TARGET_PAGE_SIZE - block->page_size; + void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host, + block->page_size); if (all_zero) { ret = postcopy_place_page_zero(mis, place_dest, @@ -4201,13 +4280,16 @@ static void colo_flush_ram_cache(void) while (block) { offset = migration_bitmap_find_dirty(ram_state, block, offset); - if (offset << TARGET_PAGE_BITS >= block->used_length) { + if (((ram_addr_t)offset) << TARGET_PAGE_BITS + >= block->used_length) { offset = 0; block = QLIST_NEXT_RCU(block, next); } else { migration_bitmap_clear_dirty(ram_state, block, offset); - dst_host = block->host + (offset << TARGET_PAGE_BITS); - src_host = block->colo_cache + (offset << TARGET_PAGE_BITS); + dst_host = block->host + + (((ram_addr_t)offset) << TARGET_PAGE_BITS); + src_host = block->colo_cache + + (((ram_addr_t)offset) << TARGET_PAGE_BITS); memcpy(dst_host, src_host, TARGET_PAGE_SIZE); } } @@ -4227,7 +4309,7 @@ static void colo_flush_ram_cache(void) */ static int ram_load_precopy(QEMUFile *f) { - int flags = 0, ret = 0, invalid_flags = 0, len = 0; + int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0; /* ADVISE is earlier, it shows the source has the postcopy capability on */ bool postcopy_advised = postcopy_is_advised(); if (!migrate_use_compression()) { @@ -4239,6 +4321,17 @@ static int ram_load_precopy(QEMUFile *f) void *host = NULL; uint8_t ch; + /* + * Yield periodically to let main loop run, but an iteration of + * the main loop is expensive, so do it each some iterations + */ + if ((i & 32767) == 0 && qemu_in_coroutine()) { + aio_co_schedule(qemu_get_current_aio_context(), + qemu_coroutine_self()); + qemu_coroutine_yield(); + } + i++; + addr = qemu_get_be64(f); flags = addr & ~TARGET_PAGE_MASK; addr &= TARGET_PAGE_MASK; @@ -4385,6 +4478,7 @@ static int ram_load_precopy(QEMUFile *f) } } + ret |= wait_for_decompress_done(); return ret; } @@ -4416,8 +4510,6 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } else { ret = ram_load_precopy(f); } - - ret |= wait_for_decompress_done(); } trace_ram_load_complete(ret, seq_iter); diff --git a/migration/savevm.c b/migration/savevm.c index 59efc1981d..adfdca26ac 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -233,7 +233,7 @@ typedef struct CompatEntry { typedef struct SaveStateEntry { QTAILQ_ENTRY(SaveStateEntry) entry; char idstr[256]; - int instance_id; + uint32_t instance_id; int alias_id; int version_id; /* version id read from the stream */ @@ -250,6 +250,7 @@ typedef struct SaveStateEntry { typedef struct SaveState { QTAILQ_HEAD(, SaveStateEntry) handlers; + SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1]; int global_section_id; uint32_t len; const char *name; @@ -261,6 +262,7 @@ typedef struct SaveState { static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), + .handler_pri_head = { [MIG_PRI_DEFAULT ... 
MIG_PRI_MAX] = NULL }, .global_section_id = 0, }; @@ -665,10 +667,10 @@ void dump_vmstate_json_to_file(FILE *out_file) fclose(out_file); } -static int calculate_new_instance_id(const char *idstr) +static uint32_t calculate_new_instance_id(const char *idstr) { SaveStateEntry *se; - int instance_id = 0; + uint32_t instance_id = 0; QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (strcmp(idstr, se->idstr) == 0 @@ -676,6 +678,8 @@ static int calculate_new_instance_id(const char *idstr) instance_id = se->instance_id + 1; } } + /* Make sure we never loop over without being noticed */ + assert(instance_id != VMSTATE_INSTANCE_ID_ANY); return instance_id; } @@ -709,20 +713,43 @@ static void savevm_state_handler_insert(SaveStateEntry *nse) { MigrationPriority priority = save_state_priority(nse); SaveStateEntry *se; + int i; assert(priority <= MIG_PRI_MAX); - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (save_state_priority(se) < priority) { + for (i = priority - 1; i >= 0; i--) { + se = savevm_state.handler_pri_head[i]; + if (se != NULL) { + assert(save_state_priority(se) < priority); break; } } - if (se) { + if (i >= 0) { QTAILQ_INSERT_BEFORE(se, nse, entry); } else { QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry); } + + if (savevm_state.handler_pri_head[priority] == NULL) { + savevm_state.handler_pri_head[priority] = nse; + } +} + +static void savevm_state_handler_remove(SaveStateEntry *se) +{ + SaveStateEntry *next; + MigrationPriority priority = save_state_priority(se); + + if (se == savevm_state.handler_pri_head[priority]) { + next = QTAILQ_NEXT(se, entry); + if (next != NULL && save_state_priority(next) == priority) { + savevm_state.handler_pri_head[priority] = next; + } else { + savevm_state.handler_pri_head[priority] = NULL; + } + } + QTAILQ_REMOVE(&savevm_state.handlers, se, entry); } /* TODO: Individual devices generally have very little idea about the rest @@ -730,7 +757,7 @@ static void savevm_state_handler_insert(SaveStateEntry *nse) Meanwhile pass -1 as instance_id if you do not already have a clearly distinguishing id for all instances of your device class. */ int register_savevm_live(const char *idstr, - int instance_id, + uint32_t instance_id, int version_id, const SaveVMHandlers *ops, void *opaque) @@ -750,7 +777,7 @@ int register_savevm_live(const char *idstr, pstrcat(se->idstr, sizeof(se->idstr), idstr); - if (instance_id == -1) { + if (instance_id == VMSTATE_INSTANCE_ID_ANY) { se->instance_id = calculate_new_instance_id(se->idstr); } else { se->instance_id = instance_id; @@ -777,14 +804,14 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque) QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) { - QTAILQ_REMOVE(&savevm_state.handlers, se, entry); + savevm_state_handler_remove(se); g_free(se->compat); g_free(se); } } } -int vmstate_register_with_alias_id(VMStateIf *obj, int instance_id, +int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id, const VMStateDescription *vmsd, void *opaque, int alias_id, int required_for_version, @@ -817,14 +844,14 @@ int vmstate_register_with_alias_id(VMStateIf *obj, int instance_id, se->compat = g_new0(CompatEntry, 1); pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name); - se->compat->instance_id = instance_id == -1 ? + se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ? 
calculate_compat_instance_id(vmsd->name) : instance_id; - instance_id = -1; + instance_id = VMSTATE_INSTANCE_ID_ANY; } } pstrcat(se->idstr, sizeof(se->idstr), vmsd->name); - if (instance_id == -1) { + if (instance_id == VMSTATE_INSTANCE_ID_ANY) { se->instance_id = calculate_new_instance_id(se->idstr); } else { se->instance_id = instance_id; @@ -841,7 +868,7 @@ void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd, QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { if (se->vmsd == vmsd && se->opaque == opaque) { - QTAILQ_REMOVE(&savevm_state.handlers, se, entry); + savevm_state_handler_remove(se); g_free(se->compat); g_free(se); } @@ -1600,7 +1627,7 @@ int qemu_save_device_state(QEMUFile *f) return qemu_file_get_error(f); } -static SaveStateEntry *find_se(const char *idstr, int instance_id) +static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id) { SaveStateEntry *se; @@ -2267,7 +2294,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) /* Find savevm section */ se = find_se(idstr, instance_id); if (se == NULL) { - error_report("Unknown savevm section or instance '%s' %d. " + error_report("Unknown savevm section or instance '%s' %"PRIu32". " "Make sure that your current VM setup matches your " "saved VM setup, including any hotplugged devices", idstr, instance_id); @@ -2291,7 +2318,7 @@ qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) ret = vmstate_load(f, se); if (ret < 0) { - error_report("error while loading state for instance 0x%x of" + error_report("error while loading state for instance 0x%"PRIx32" of" " device '%s'", instance_id, idstr); return ret; } diff --git a/migration/trace-events b/migration/trace-events index 6dee7b5389..4ab0a503d2 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -76,6 +76,11 @@ get_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val put_gtree(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, uint32_t nnodes) "%s(%s/%s) nnodes=%d" put_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, int ret) "%s(%s/%s) %d" +get_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)" +get_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)" +put_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)" +put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)" + # qemu-file.c qemu_file_fclose(void) "" @@ -138,12 +143,12 @@ migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi6 migration_completion_file_err(void) "" migration_completion_postcopy_end(void) "" migration_completion_postcopy_end_after_complete(void) "" +migration_rate_limit_pre(int ms) "%d ms" +migration_rate_limit_post(int urgent) "urgent: %d" migration_return_path_end_before(void) "" migration_return_path_end_after(int rp_error) "%d" migration_thread_after_loop(void) "" migration_thread_file_err(void) "" -migration_thread_ratelimit_pre(int ms) "%d ms" -migration_thread_ratelimit_post(int urgent) "urgent: %d" migration_thread_setup_complete(void) "" open_return_path_on_source(void) "" open_return_path_on_source_continue(void) "" diff --git a/migration/vmstate-types.c b/migration/vmstate-types.c index 7236cf92bc..1eee36773a 100644 --- a/migration/vmstate-types.c +++ b/migration/vmstate-types.c @@ -843,3 +843,73 @@ const VMStateInfo vmstate_info_gtree = { .get = get_gtree, .put = put_gtree, }; + +static int 
put_qlist(QEMUFile *f, void *pv, size_t unused_size, + const VMStateField *field, QJSON *vmdesc) +{ + const VMStateDescription *vmsd = field->vmsd; + /* offset of the QTAILQ entry in a QTAILQ element*/ + size_t entry_offset = field->start; + void *elm; + int ret; + + trace_put_qlist(field->name, vmsd->name, vmsd->version_id); + QLIST_RAW_FOREACH(elm, pv, entry_offset) { + qemu_put_byte(f, true); + ret = vmstate_save_state(f, vmsd, elm, vmdesc); + if (ret) { + error_report("%s: failed to save %s (%d)", field->name, + vmsd->name, ret); + return ret; + } + } + qemu_put_byte(f, false); + trace_put_qlist_end(field->name, vmsd->name); + + return 0; +} + +static int get_qlist(QEMUFile *f, void *pv, size_t unused_size, + const VMStateField *field) +{ + int ret = 0; + const VMStateDescription *vmsd = field->vmsd; + /* size of a QLIST element */ + size_t size = field->size; + /* offset of the QLIST entry in a QLIST element */ + size_t entry_offset = field->start; + int version_id = field->version_id; + void *elm; + + trace_get_qlist(field->name, vmsd->name, vmsd->version_id); + if (version_id > vmsd->version_id) { + error_report("%s %s", vmsd->name, "too new"); + return -EINVAL; + } + if (version_id < vmsd->minimum_version_id) { + error_report("%s %s", vmsd->name, "too old"); + return -EINVAL; + } + + while (qemu_get_byte(f)) { + elm = g_malloc(size); + ret = vmstate_load_state(f, vmsd, elm, version_id); + if (ret) { + error_report("%s: failed to load %s (%d)", field->name, + vmsd->name, ret); + g_free(elm); + return ret; + } + QLIST_RAW_INSERT_HEAD(pv, elm, entry_offset); + } + QLIST_RAW_REVERSE(pv, elm, entry_offset); + trace_get_qlist_end(field->name, vmsd->name); + + return ret; +} + +const VMStateInfo vmstate_info_qlist = { + .name = "qlist", + .get = get_qlist, + .put = put_qlist, +}; diff --git a/plugins/api.c b/plugins/api.c index fa1d9f276d..53c8a73582 100644 --- a/plugins/api.c +++ b/plugins/api.c @@ -46,6 +46,7 @@ #include "qemu/plugin-memory.h" #include "hw/boards.h" #endif +#include "trace/mem.h" /* Uninstall and Reset handlers */ diff --git a/qemu-doc.texi b/qemu-doc.texi index 39f950471f..2328e7ea47 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -633,17 +633,6 @@ encrypted disk images. * disk_images_snapshot_mode:: Snapshot mode * vm_snapshots:: VM snapshots * qemu_img_invocation:: qemu-img Invocation -* qemu_nbd_invocation:: qemu-nbd Invocation -* disk_images_formats:: Disk image file formats -* host_drives:: Using host drives -* disk_images_fat_images:: Virtual FAT disk images -* disk_images_nbd:: NBD access -* disk_images_sheepdog:: Sheepdog disk images -* disk_images_iscsi:: iSCSI LUNs -* disk_images_gluster:: GlusterFS disk images -* disk_images_ssh:: Secure Shell (ssh) disk images -* disk_images_nvme:: NVMe userspace driver -* disk_image_locking:: Disk image file locking @end menu @node disk_images_quickstart @@ -724,13 +713,6 @@ state is not saved or restored properly (in particular USB). @include qemu-img.texi -@node qemu_nbd_invocation -@subsection @code{qemu-nbd} Invocation - -@include qemu-nbd.texi - -@include docs/qemu-block-drivers.texi - @node pcsys_network @section Network emulation diff --git a/qemu-nbd.texi b/qemu-nbd.texi deleted file mode 100644 index 7f55657722..0000000000 --- a/qemu-nbd.texi +++ /dev/null @@ -1,214 +0,0 @@ -@example -@c man begin SYNOPSIS -@command{qemu-nbd} [OPTION]... @var{filename} - -@command{qemu-nbd} @option{-L} [OPTION]... 
- -@command{qemu-nbd} @option{-d} @var{dev} -@c man end -@end example - -@c man begin DESCRIPTION - -Export a QEMU disk image using the NBD protocol. - -Other uses: -@itemize -@item -Bind a /dev/nbdX block device to a QEMU server (on Linux). -@item -As a client to query exports of a remote NBD server. -@end itemize - -@c man end - -@c man begin OPTIONS -@var{filename} is a disk image filename, or a set of block -driver options if @option{--image-opts} is specified. - -@var{dev} is an NBD device. - -@table @option -@item --object type,id=@var{id},...props... -Define a new instance of the @var{type} object class identified by @var{id}. -See the @code{qemu(1)} manual page for full details of the properties -supported. The common object types that it makes sense to define are the -@code{secret} object, which is used to supply passwords and/or encryption -keys, and the @code{tls-creds} object, which is used to supply TLS -credentials for the qemu-nbd server or client. -@item -p, --port=@var{port} -The TCP port to listen on as a server, or connect to as a client -(default @samp{10809}). -@item -o, --offset=@var{offset} -The offset into the image. -@item -b, --bind=@var{iface} -The interface to bind to as a server, or connect to as a client -(default @samp{0.0.0.0}). -@item -k, --socket=@var{path} -Use a unix socket with path @var{path}. -@item --image-opts -Treat @var{filename} as a set of image options, instead of a plain -filename. If this flag is specified, the @var{-f} flag should -not be used, instead the '@code{format=}' option should be set. -@item -f, --format=@var{fmt} -Force the use of the block driver for format @var{fmt} instead of -auto-detecting. -@item -r, --read-only -Export the disk as read-only. -@item -P, --partition=@var{num} -Deprecated: Only expose MBR partition @var{num}. Understands physical -partitions 1-4 and logical partition 5. New code should instead use -@option{--image-opts} with the raw driver wrapping a subset of the -original image. -@item -B, --bitmap=@var{name} -If @var{filename} has a qcow2 persistent bitmap @var{name}, expose -that bitmap via the ``qemu:dirty-bitmap:@var{name}'' context -accessible through NBD_OPT_SET_META_CONTEXT. -@item -s, --snapshot -Use @var{filename} as an external snapshot, create a temporary -file with backing_file=@var{filename}, redirect the write to -the temporary one. -@item -l, --load-snapshot=@var{snapshot_param} -Load an internal snapshot inside @var{filename} and export it -as an read-only device, @var{snapshot_param} format is -'snapshot.id=[ID],snapshot.name=[NAME]' or '[ID_OR_NAME]' -@item -n, --nocache -@itemx --cache=@var{cache} -The cache mode to be used with the file. See the documentation of -the emulator's @code{-drive cache=...} option for allowed values. -@item --aio=@var{aio} -Set the asynchronous I/O mode between @samp{threads} (the default) -and @samp{native} (Linux only). -@item --discard=@var{discard} -Control whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) -requests are ignored or passed to the filesystem. @var{discard} is one of -@samp{ignore} (or @samp{off}), @samp{unmap} (or @samp{on}). The default is -@samp{ignore}. -@item --detect-zeroes=@var{detect-zeroes} -Control the automatic conversion of plain zero writes by the OS to -driver-specific optimized zero write commands. @var{detect-zeroes} is one of -@samp{off}, @samp{on} or @samp{unmap}. @samp{unmap} -converts a zero write to an unmap operation and can only be used if -@var{discard} is set to @samp{unmap}. The default is @samp{off}. 
-@item -c, --connect=@var{dev} -Connect @var{filename} to NBD device @var{dev} (Linux only). -@item -d, --disconnect -Disconnect the device @var{dev} (Linux only). -@item -e, --shared=@var{num} -Allow up to @var{num} clients to share the device (default -@samp{1}). Safe for readers, but for now, consistency is not -guaranteed between multiple writers. -@item -t, --persistent -Don't exit on the last connection. -@item -x, --export-name=@var{name} -Set the NBD volume export name (default of a zero-length string). -@item -D, --description=@var{description} -Set the NBD volume export description, as a human-readable -string. -@item -L, --list -Connect as a client and list all details about the exports exposed by -a remote NBD server. This enables list mode, and is incompatible -with options that change behavior related to a specific export (such as -@option{--export-name}, @option{--offset}, ...). -@item --tls-creds=ID -Enable mandatory TLS encryption for the server by setting the ID -of the TLS credentials object previously created with the --object -option; or provide the credentials needed for connecting as a client -in list mode. -@item --fork -Fork off the server process and exit the parent once the server is running. -@item --pid-file=PATH -Store the server's process ID in the given file. -@item --tls-authz=ID -Specify the ID of a qauthz object previously created with the ---object option. This will be used to authorize connecting users -against their x509 distinguished name. -@item -v, --verbose -Display extra debugging information. -@item -h, --help -Display this help and exit. -@item -V, --version -Display version information and exit. -@item -T, --trace [[enable=]@var{pattern}][,events=@var{file}][,file=@var{file}] -@findex --trace -@include qemu-option-trace.texi -@end table - -@c man end - -@c man begin EXAMPLES -Start a server listening on port 10809 that exposes only the -guest-visible contents of a qcow2 file, with no TLS encryption, and -with the default export name (an empty string). The command is -one-shot, and will block until the first successful client -disconnects: - -@example -qemu-nbd -f qcow2 file.qcow2 -@end example - -Start a long-running server listening with encryption on port 10810, -and whitelist clients with a specific X.509 certificate to connect to -a 1 megabyte subset of a raw file, using the export name 'subset': - -@example -qemu-nbd \ - --object tls-creds-x509,id=tls0,endpoint=server,dir=/path/to/qemutls \ - --object 'authz-simple,id=auth0,identity=CN=laptop.example.com,,\ - O=Example Org,,L=London,,ST=London,,C=GB' \ - --tls-creds tls0 --tls-authz auth0 \ - -t -x subset -p 10810 \ - --image-opts driver=raw,offset=1M,size=1M,file.driver=file,file.filename=file.raw -@end example - -Serve a read-only copy of just the first MBR partition of a guest -image over a Unix socket with as many as 5 simultaneous readers, with -a persistent process forked as a daemon: - -@example -qemu-nbd --fork --persistent --shared=5 --socket=/path/to/sock \ - --partition=1 --read-only --format=qcow2 file.qcow2 -@end example - -Expose the guest-visible contents of a qcow2 file via a block device -/dev/nbd0 (and possibly creating /dev/nbd0p1 and friends for -partitions found within), then disconnect the device when done. -Access to bind qemu-nbd to an /dev/nbd device generally requires root -privileges, and may also require the execution of @code{modprobe nbd} -to enable the kernel NBD client module. 
@emph{CAUTION}: Do not use -this method to mount filesystems from an untrusted guest image - a -malicious guest may have prepared the image to attempt to trigger -kernel bugs in partition probing or file system mounting. - -@example -qemu-nbd -c /dev/nbd0 -f qcow2 file.qcow2 -qemu-nbd -d /dev/nbd0 -@end example - -Query a remote server to see details about what export(s) it is -serving on port 10809, and authenticating via PSK: - -@example -qemu-nbd \ - --object tls-creds-psk,id=tls0,dir=/tmp/keys,username=eblake,endpoint=client \ - --tls-creds tls0 -L -b remote.example.com -@end example - -@c man end - -@ignore - -@setfilename qemu-nbd -@settitle QEMU Disk Network Block Device Server - -@c man begin AUTHOR -Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>. -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -@c man end - -@c man begin SEEALSO -qemu(1), qemu-img(1) -@c man end - -@end ignore diff --git a/qemu-option-trace.texi b/qemu-option-trace.texi index 7d1b7f05c5..162f1528d2 100644 --- a/qemu-option-trace.texi +++ b/qemu-option-trace.texi @@ -1,3 +1,7 @@ +@c The contents of this file must be kept in sync with qemu-option-trace.rst.inc +@c until all the users of the texi file have been converted to rst and +@c the texi file can be removed. + Specify tracing options. @table @option diff --git a/qemu-options.hx b/qemu-options.hx index 709162c159..224a8e8712 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -953,7 +953,7 @@ STEXI @findex -cdrom Use @var{file} as CD-ROM image (you cannot use @option{-hdc} and @option{-cdrom} at the same time). You can use the host CD-ROM by -using @file{/dev/cdrom} as filename (@pxref{host_drives}). +using @file{/dev/cdrom} as filename. 
ETEXI DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev, diff --git a/roms/edk2-funcs.sh b/roms/edk2-funcs.sh index 3f4485b201..cd6e4f2c82 100644 --- a/roms/edk2-funcs.sh +++ b/roms/edk2-funcs.sh @@ -112,6 +112,9 @@ qemu_edk2_get_cross_prefix() ( [ "$gcc_arch" == i686 ] && [ "$host_arch" == x86_64 ] ); then # no cross-compiler needed : + elif ( [ -e /etc/debian_version ] && [ "$gcc_arch" == arm ] ); then + # force soft-float cross-compiler on Debian + printf 'arm-linux-gnueabi-' else printf '%s-linux-gnu-\n' "$gcc_arch" fi diff --git a/scripts/git.orderfile b/scripts/git.orderfile index e89790941c..1f747b583a 100644 --- a/scripts/git.orderfile +++ b/scripts/git.orderfile @@ -25,5 +25,8 @@ qga/*.json # headers *.h +# decoding tree specification +*.decode + # code *.c diff --git a/scripts/qapi/schema.py b/scripts/qapi/schema.py index 0bfc5256fb..5100110fa2 100644 --- a/scripts/qapi/schema.py +++ b/scripts/qapi/schema.py @@ -795,7 +795,7 @@ class QAPISchema(object): self.docs = parser.docs self._entity_list = [] self._entity_dict = {} - self._module_dict = {} + self._module_dict = OrderedDict() self._schema_dir = os.path.dirname(fname) self._make_module(None) # built-ins self._make_module(fname) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index f76d77363b..29c27f4681 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -186,6 +186,7 @@ rm -rf "$output/include/standard-headers/linux" mkdir -p "$output/include/standard-headers/linux" for i in "$tmpdir"/include/linux/*virtio*.h \ "$tmpdir/include/linux/qemu_fw_cfg.h" \ + "$tmpdir/include/linux/fuse.h" \ "$tmpdir/include/linux/input.h" \ "$tmpdir/include/linux/input-event-codes.h" \ "$tmpdir/include/linux/pci_regs.h" \ diff --git a/stubs/vmstate.c b/stubs/vmstate.c index 6951d9fdc5..cc4fe41dfc 100644 --- a/stubs/vmstate.c +++ b/stubs/vmstate.c @@ -4,7 +4,7 @@ const VMStateDescription vmstate_dummy = {}; int vmstate_register_with_alias_id(VMStateIf *obj, - int instance_id, + uint32_t instance_id, const VMStateDescription *vmsd, void *base, int alias_id, int required_for_version, diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h index a530249a5b..3f782c0efe 100644 --- a/target/alpha/cpu.h +++ b/target/alpha/cpu.h @@ -193,8 +193,6 @@ enum { PALcode cheats and usees the KSEG mapping for its code+data rather than physical addresses. 
*/ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _user #define MMU_KERNEL_IDX 0 #define MMU_USER_IDX 1 #define MMU_PHYS_IDX 2 diff --git a/target/alpha/translate.c b/target/alpha/translate.c index f7f1ed0f41..8870284f57 100644 --- a/target/alpha/translate.c +++ b/target/alpha/translate.c @@ -23,7 +23,7 @@ #include "disas/disas.h" #include "qemu/host-utils.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "exec/helper-proto.h" #include "exec/helper-gen.h" diff --git a/target/arm/arch_dump.c b/target/arm/arch_dump.c index 26a2c09868..2345dec3c2 100644 --- a/target/arm/arch_dump.c +++ b/target/arm/arch_dump.c @@ -62,12 +62,23 @@ struct aarch64_user_vfp_state { QEMU_BUILD_BUG_ON(sizeof(struct aarch64_user_vfp_state) != 528); +/* struct user_sve_header from arch/arm64/include/uapi/asm/ptrace.h */ +struct aarch64_user_sve_header { + uint32_t size; + uint32_t max_size; + uint16_t vl; + uint16_t max_vl; + uint16_t flags; + uint16_t reserved; +} QEMU_PACKED; + struct aarch64_note { Elf64_Nhdr hdr; char name[8]; /* align_up(sizeof("CORE"), 4) */ union { struct aarch64_elf_prstatus prstatus; struct aarch64_user_vfp_state vfp; + struct aarch64_user_sve_header sve; }; } QEMU_PACKED; @@ -76,6 +87,8 @@ struct aarch64_note { (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_elf_prstatus)) #define AARCH64_PRFPREG_NOTE_SIZE \ (AARCH64_NOTE_HEADER_SIZE + sizeof(struct aarch64_user_vfp_state)) +#define AARCH64_SVE_NOTE_SIZE(env) \ + (AARCH64_NOTE_HEADER_SIZE + sve_size(env)) static void aarch64_note_init(struct aarch64_note *note, DumpState *s, const char *name, Elf64_Word namesz, @@ -128,11 +141,102 @@ static int aarch64_write_elf64_prfpreg(WriteCoreDumpFunction f, return 0; } +#ifdef TARGET_AARCH64 +static off_t sve_zreg_offset(uint32_t vq, int n) +{ + off_t off = sizeof(struct aarch64_user_sve_header); + return ROUND_UP(off, 16) + vq * 16 * n; +} + +static off_t sve_preg_offset(uint32_t vq, int n) +{ + return sve_zreg_offset(vq, 32) + vq * 16 / 8 * n; +} + +static off_t sve_fpsr_offset(uint32_t vq) +{ + off_t off = sve_preg_offset(vq, 17); + return ROUND_UP(off, 16); +} + +static off_t sve_fpcr_offset(uint32_t vq) +{ + return sve_fpsr_offset(vq) + sizeof(uint32_t); +} + +static uint32_t sve_current_vq(CPUARMState *env) +{ + return sve_zcr_len_for_el(env, arm_current_el(env)) + 1; +} + +static size_t sve_size_vq(uint32_t vq) +{ + off_t off = sve_fpcr_offset(vq) + sizeof(uint32_t); + return ROUND_UP(off, 16); +} + +static size_t sve_size(CPUARMState *env) +{ + return sve_size_vq(sve_current_vq(env)); +} + +static int aarch64_write_elf64_sve(WriteCoreDumpFunction f, + CPUARMState *env, int cpuid, + DumpState *s) +{ + struct aarch64_note *note; + ARMCPU *cpu = env_archcpu(env); + uint32_t vq = sve_current_vq(env); + uint64_t tmp[ARM_MAX_VQ * 2], *r; + uint32_t fpr; + uint8_t *buf; + int ret, i; + + note = g_malloc0(AARCH64_SVE_NOTE_SIZE(env)); + buf = (uint8_t *)¬e->sve; + + aarch64_note_init(note, s, "LINUX", 6, NT_ARM_SVE, sve_size_vq(vq)); + + note->sve.size = cpu_to_dump32(s, sve_size_vq(vq)); + note->sve.max_size = cpu_to_dump32(s, sve_size_vq(cpu->sve_max_vq)); + note->sve.vl = cpu_to_dump16(s, vq * 16); + note->sve.max_vl = cpu_to_dump16(s, cpu->sve_max_vq * 16); + note->sve.flags = cpu_to_dump16(s, 1); + + for (i = 0; i < 32; ++i) { + r = sve_bswap64(tmp, &env->vfp.zregs[i].d[0], vq * 2); + memcpy(&buf[sve_zreg_offset(vq, i)], r, vq * 16); + } + + for (i = 0; i < 17; ++i) { + r = sve_bswap64(tmp, r = &env->vfp.pregs[i].p[0], + 
DIV_ROUND_UP(vq * 2, 8)); + memcpy(&buf[sve_preg_offset(vq, i)], r, vq * 16 / 8); + } + + fpr = cpu_to_dump32(s, vfp_get_fpsr(env)); + memcpy(&buf[sve_fpsr_offset(vq)], &fpr, sizeof(uint32_t)); + + fpr = cpu_to_dump32(s, vfp_get_fpcr(env)); + memcpy(&buf[sve_fpcr_offset(vq)], &fpr, sizeof(uint32_t)); + + ret = f(note, AARCH64_SVE_NOTE_SIZE(env), s); + g_free(note); + + if (ret < 0) { + return -1; + } + + return 0; +} +#endif + int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, int cpuid, void *opaque) { struct aarch64_note note; - CPUARMState *env = &ARM_CPU(cs)->env; + ARMCPU *cpu = ARM_CPU(cs); + CPUARMState *env = &cpu->env; DumpState *s = opaque; uint64_t pstate, sp; int ret, i; @@ -163,7 +267,18 @@ int arm_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, return -1; } - return aarch64_write_elf64_prfpreg(f, env, cpuid, s); + ret = aarch64_write_elf64_prfpreg(f, env, cpuid, s); + if (ret) { + return ret; + } + +#ifdef TARGET_AARCH64 + if (cpu_isar_feature(aa64_sve, cpu)) { + ret = aarch64_write_elf64_sve(f, env, cpuid, s); + } +#endif + + return ret; } /* struct pt_regs from arch/arm/include/asm/ptrace.h */ @@ -335,6 +450,11 @@ ssize_t cpu_get_note_size(int class, int machine, int nr_cpus) if (class == ELFCLASS64) { note_size = AARCH64_PRSTATUS_NOTE_SIZE; note_size += AARCH64_PRFPREG_NOTE_SIZE; +#ifdef TARGET_AARCH64 + if (cpu_isar_feature(aa64_sve, cpu)) { + note_size += AARCH64_SVE_NOTE_SIZE(env); + } +#endif } else { note_size = ARM_PRSTATUS_NOTE_SIZE; if (arm_feature(env, ARM_FEATURE_VFP)) { diff --git a/target/arm/arm-semi.c b/target/arm/arm-semi.c index 47d61f6fe1..788fe61b51 100644 --- a/target/arm/arm-semi.c +++ b/target/arm/arm-semi.c @@ -144,7 +144,8 @@ static int alloc_guestfd(void) guestfd_array = g_array_new(FALSE, TRUE, sizeof(GuestFD)); } - for (i = 0; i < guestfd_array->len; i++) { + /* SYS_OPEN should return nonzero handle on success. Start guestfd from 1 */ + for (i = 1; i < guestfd_array->len; i++) { GuestFD *gf = &g_array_index(guestfd_array, GuestFD, i); if (gf->type == GuestFDUnused) { @@ -168,7 +169,7 @@ static GuestFD *do_get_guestfd(int guestfd) return NULL; } - if (guestfd < 0 || guestfd >= guestfd_array->len) { + if (guestfd <= 0 || guestfd >= guestfd_array->len) { return NULL; } diff --git a/target/arm/cpu.c b/target/arm/cpu.c index d62fd5fdc6..64cd0a7d73 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -2121,6 +2121,7 @@ static void cortex_r5_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_V7); set_feature(&cpu->env, ARM_FEATURE_V7MP); set_feature(&cpu->env, ARM_FEATURE_PMSA); + set_feature(&cpu->env, ARM_FEATURE_PMU); cpu->midr = 0x411fc153; /* r1p3 */ cpu->id_pfr0 = 0x0131; cpu->id_pfr1 = 0x001; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 40f2c45e17..c1aedbeac0 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -980,6 +980,31 @@ void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq); void aarch64_sve_change_el(CPUARMState *env, int old_el, int new_el, bool el0_a64); void aarch64_add_sve_properties(Object *obj); + +/* + * SVE registers are encoded in KVM's memory in an endianness-invariant format. + * The byte at offset i from the start of the in-memory representation contains + * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the + * lowest offsets are stored in the lowest memory addresses, then that nearly + * matches QEMU's representation, which is to use an array of host-endian + * uint64_t's, where the lower offsets are at the lower indices. 
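The comment being moved here describes KVM's endianness-invariant SVE register encoding: byte i of the buffer holds bits [(7 + 8 * i) : (8 * i)] of the register value. The following standalone sketch (not QEMU code) decodes one 64-bit chunk by that rule and shows why a plain byte copy already matches on little-endian hosts, leaving only big-endian hosts to byte swap, which is what sve_bswap64() does:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Rebuild one 64-bit value from the endianness-invariant byte layout. */
static uint64_t decode_invariant_u64(const uint8_t buf[8])
{
    uint64_t v = 0;
    int i;

    for (i = 0; i < 8; i++) {
        v |= (uint64_t)buf[i] << (8 * i);   /* byte i -> bits [8i+7:8i] */
    }
    return v;
}

int main(void)
{
    const uint8_t buf[8] = { 0xef, 0xcd, 0xab, 0x89, 0x67, 0x45, 0x23, 0x01 };
    uint64_t raw;

    /* A straight copy into a host integer equals the decoded value only on
     * little-endian hosts; big-endian hosts must swap each 64-bit chunk. */
    memcpy(&raw, buf, sizeof(raw));

    printf("decoded=0x%016" PRIx64 " raw-copy=0x%016" PRIx64 "\n",
           decode_invariant_u64(buf), raw);
    return 0;
}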
To complete + * the translation we just need to byte swap the uint64_t's on big-endian hosts. + */ +static inline uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) +{ +#ifdef HOST_WORDS_BIGENDIAN + int i; + + for (i = 0; i < nr; ++i) { + dst[i] = bswap64(src[i]); + } + + return dst; +#else + return src; +#endif +} + #else static inline void aarch64_sve_narrow_vq(CPUARMState *env, unsigned vq) { } static inline void aarch64_sve_change_el(CPUARMState *env, int o, diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c index b4cd680fc4..36aa6badfd 100644 --- a/target/arm/helper-a64.c +++ b/target/arm/helper-a64.c @@ -31,7 +31,7 @@ #include "exec/cpu_ldst.h" #include "qemu/int128.h" #include "qemu/atomic128.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "fpu/softfloat.h" #include <zlib.h> /* For crc32 */ diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index 876184b8fe..e2da756e65 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -877,30 +877,6 @@ static int kvm_arch_put_fpsimd(CPUState *cs) } /* - * SVE registers are encoded in KVM's memory in an endianness-invariant format. - * The byte at offset i from the start of the in-memory representation contains - * the bits [(7 + 8 * i) : (8 * i)] of the register value. As this means the - * lowest offsets are stored in the lowest memory addresses, then that nearly - * matches QEMU's representation, which is to use an array of host-endian - * uint64_t's, where the lower offsets are at the lower indices. To complete - * the translation we just need to byte swap the uint64_t's on big-endian hosts. - */ -static uint64_t *sve_bswap64(uint64_t *dst, uint64_t *src, int nr) -{ -#ifdef HOST_WORDS_BIGENDIAN - int i; - - for (i = 0; i < nr; ++i) { - dst[i] = bswap64(src[i]); - } - - return dst; -#else - return src; -#endif -} - -/* * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits * and PREGS and the FFR have a slice size of 256 bits. 
However we simply hard * code the slice index to zero for now as it's unlikely we'll need more than diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c index e5a346cb87..27d16ad9ad 100644 --- a/target/arm/op_helper.c +++ b/target/arm/op_helper.c @@ -295,7 +295,12 @@ void HELPER(wfi)(CPUARMState *env, uint32_t insn_len) } if (target_el) { - env->pc -= insn_len; + if (env->aarch64) { + env->pc -= insn_len; + } else { + env->regs[15] -= insn_len; + } + raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2), target_el); } diff --git a/target/arm/pauth_helper.c b/target/arm/pauth_helper.c index d3194f2043..0a5f41e10c 100644 --- a/target/arm/pauth_helper.c +++ b/target/arm/pauth_helper.c @@ -89,7 +89,7 @@ static uint64_t pac_sub(uint64_t i) uint64_t o = 0; int b; - for (b = 0; b < 64; b += 16) { + for (b = 0; b < 64; b += 4) { o |= (uint64_t)sub[(i >> b) & 0xf] << b; } return o; @@ -104,7 +104,7 @@ static uint64_t pac_inv_sub(uint64_t i) uint64_t o = 0; int b; - for (b = 0; b < 64; b += 16) { + for (b = 0; b < 64; b += 4) { o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b; } return o; diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index fc0c1755d2..fdfa652094 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -25,6 +25,7 @@ #include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" +#include "tcg/tcg.h" /* Note that vector data is stored in host-endian 64-bit chunks, diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c index 5feb312941..e63f8bda29 100644 --- a/target/arm/tlb_helper.c +++ b/target/arm/tlb_helper.c @@ -44,7 +44,7 @@ static inline uint32_t merge_syn_data_abort(uint32_t template_syn, syn = syn_data_abort_with_iss(same_el, 0, 0, 0, 0, 0, ea, 0, s1ptw, is_write, fsc, - false); + true); /* Merge the runtime syndrome with the template syndrome. 
*/ syn |= template_syn; } diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 8c18cdff87..96a5be2b37 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -20,8 +20,8 @@ #include "cpu.h" #include "exec/exec-all.h" -#include "tcg-op.h" -#include "tcg-op-gvec.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "qemu/log.h" #include "arm_ldst.h" #include "translate.h" diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index 5d7edd0907..b35bad245e 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -20,9 +20,9 @@ #include "qemu/osdep.h" #include "cpu.h" #include "exec/exec-all.h" -#include "tcg-op.h" -#include "tcg-op-gvec.h" -#include "tcg-gvec-desc.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" +#include "tcg/tcg-gvec-desc.h" #include "qemu/log.h" #include "arm_ldst.h" #include "translate.h" diff --git a/target/arm/translate.c b/target/arm/translate.c index 5185e08641..2f4aea927f 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -24,8 +24,8 @@ #include "internals.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" -#include "tcg-op-gvec.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "qemu/log.h" #include "qemu/bitops.h" #include "arm_ldst.h" @@ -8556,6 +8556,9 @@ static ISSInfo make_issinfo(DisasContext *s, int rd, bool p, bool w) /* ISS not valid if writeback */ if (p && !w) { ret = rd; + if (s->base.pc_next - s->pc_curr == 2) { + ret |= ISSIs16Bit; + } } else { ret = ISSInvalid; } diff --git a/target/cris/cpu.h b/target/cris/cpu.h index a7c2a8e15b..ca240bc788 100644 --- a/target/cris/cpu.h +++ b/target/cris/cpu.h @@ -253,8 +253,6 @@ enum { #define cpu_signal_handler cpu_cris_signal_handler /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _user #define MMU_USER_IDX 1 static inline int cpu_mmu_index (CPUCRISState *env, bool ifetch) { diff --git a/target/cris/translate.c b/target/cris/translate.c index cb57516a44..aaa46b5bca 100644 --- a/target/cris/translate.c +++ b/target/cris/translate.c @@ -27,7 +27,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/helper-proto.h" #include "mmu.h" #include "exec/cpu_ldst.h" diff --git a/target/hppa/translate.c b/target/hppa/translate.c index 2f8d407a82..f25927aeca 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -22,7 +22,7 @@ #include "disas/disas.h" #include "qemu/host-utils.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "exec/helper-proto.h" #include "exec/helper-gen.h" diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 594326a794..e6de38ae02 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1955,9 +1955,6 @@ uint64_t cpu_get_tsc(CPUX86State *env); #define cpu_list x86_cpu_list /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _ksmap -#define MMU_MODE1_SUFFIX _user -#define MMU_MODE2_SUFFIX _knosmap /* SMAP disabled or CPL<3 && AC=1 */ #define MMU_KSMAP_IDX 0 #define MMU_USER_IDX 1 #define MMU_KNOSMAP_IDX 2 diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c index d50d4b0c40..acf41f8885 100644 --- a/target/i386/mem_helper.c +++ b/target/i386/mem_helper.c @@ -24,7 +24,7 @@ #include "exec/cpu_ldst.h" #include "qemu/int128.h" #include "qemu/atomic128.h" -#include "tcg.h" +#include "tcg/tcg.h" void helper_cmpxchg8b_unlocked(CPUX86State *env, 
target_ulong a0) { diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c index 87a627f9dc..b96de068ca 100644 --- a/target/i386/seg_helper.c +++ b/target/i386/seg_helper.c @@ -37,37 +37,37 @@ # define LOG_PCALL_STATE(cpu) do { } while (0) #endif -#ifdef CONFIG_USER_ONLY -#define MEMSUFFIX _kernel -#define DATA_SIZE 1 -#include "exec/cpu_ldst_useronly_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_useronly_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_useronly_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_useronly_template.h" -#undef MEMSUFFIX -#else -#define CPU_MMU_INDEX (cpu_mmu_index_kernel(env)) -#define MEMSUFFIX _kernel -#define DATA_SIZE 1 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 2 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 4 -#include "exec/cpu_ldst_template.h" - -#define DATA_SIZE 8 -#include "exec/cpu_ldst_template.h" -#undef CPU_MMU_INDEX -#undef MEMSUFFIX -#endif +/* + * TODO: Convert callers to compute cpu_mmu_index_kernel once + * and use *_mmuidx_ra directly. + */ +#define cpu_ldub_kernel_ra(e, p, r) \ + cpu_ldub_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r) +#define cpu_lduw_kernel_ra(e, p, r) \ + cpu_lduw_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r) +#define cpu_ldl_kernel_ra(e, p, r) \ + cpu_ldl_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r) +#define cpu_ldq_kernel_ra(e, p, r) \ + cpu_ldq_mmuidx_ra(e, p, cpu_mmu_index_kernel(e), r) + +#define cpu_stb_kernel_ra(e, p, v, r) \ + cpu_stb_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r) +#define cpu_stw_kernel_ra(e, p, v, r) \ + cpu_stw_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r) +#define cpu_stl_kernel_ra(e, p, v, r) \ + cpu_stl_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r) +#define cpu_stq_kernel_ra(e, p, v, r) \ + cpu_stq_mmuidx_ra(e, p, v, cpu_mmu_index_kernel(e), r) + +#define cpu_ldub_kernel(e, p) cpu_ldub_kernel_ra(e, p, 0) +#define cpu_lduw_kernel(e, p) cpu_lduw_kernel_ra(e, p, 0) +#define cpu_ldl_kernel(e, p) cpu_ldl_kernel_ra(e, p, 0) +#define cpu_ldq_kernel(e, p) cpu_ldq_kernel_ra(e, p, 0) + +#define cpu_stb_kernel(e, p, v) cpu_stb_kernel_ra(e, p, v, 0) +#define cpu_stw_kernel(e, p, v) cpu_stw_kernel_ra(e, p, v, 0) +#define cpu_stl_kernel(e, p, v) cpu_stl_kernel_ra(e, p, v, 0) +#define cpu_stq_kernel(e, p, v) cpu_stq_kernel_ra(e, p, v, 0) /* return non zero if error */ static inline int load_segment_ra(CPUX86State *env, uint32_t *e1_ptr, diff --git a/target/i386/translate.c b/target/i386/translate.c index 7c99ef1385..d9af8f4078 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -22,7 +22,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "exec/translator.h" diff --git a/target/lm32/translate.c b/target/lm32/translate.c index 73db9654d6..e583d52d03 100644 --- a/target/lm32/translate.c +++ b/target/lm32/translate.c @@ -23,7 +23,7 @@ #include "exec/helper-proto.h" #include "exec/exec-all.h" #include "exec/translator.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "qemu/qemu-print.h" #include "exec/cpu_ldst.h" diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h index 11c71fa962..3de8e06dfe 100644 --- a/target/m68k/cpu.h +++ b/target/m68k/cpu.h @@ -519,8 +519,6 @@ enum { #define cpu_list m68k_cpu_list /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _user #define MMU_KERNEL_IDX 0 #define MMU_USER_IDX 1 static inline int cpu_mmu_index (CPUM68KState *env, bool ifetch) diff 
--git a/target/m68k/op_helper.c b/target/m68k/op_helper.c index bc4f845e3f..202498deb5 100644 --- a/target/m68k/op_helper.c +++ b/target/m68k/op_helper.c @@ -42,8 +42,8 @@ static void cf_rte(CPUM68KState *env) uint32_t fmt; sp = env->aregs[7]; - fmt = cpu_ldl_kernel(env, sp); - env->pc = cpu_ldl_kernel(env, sp + 4); + fmt = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0); + env->pc = cpu_ldl_mmuidx_ra(env, sp + 4, MMU_KERNEL_IDX, 0); sp |= (fmt >> 28) & 3; env->aregs[7] = sp + 8; @@ -58,13 +58,13 @@ static void m68k_rte(CPUM68KState *env) sp = env->aregs[7]; throwaway: - sr = cpu_lduw_kernel(env, sp); + sr = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0); sp += 2; - env->pc = cpu_ldl_kernel(env, sp); + env->pc = cpu_ldl_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0); sp += 4; if (m68k_feature(env, M68K_FEATURE_QUAD_MULDIV)) { /* all except 68000 */ - fmt = cpu_lduw_kernel(env, sp); + fmt = cpu_lduw_mmuidx_ra(env, sp, MMU_KERNEL_IDX, 0); sp += 2; switch (fmt >> 12) { case 0: @@ -260,12 +260,12 @@ static void cf_interrupt_all(CPUM68KState *env, int is_hw) /* ??? This could cause MMU faults. */ sp &= ~3; sp -= 4; - cpu_stl_kernel(env, sp, retaddr); + cpu_stl_mmuidx_ra(env, sp, retaddr, MMU_KERNEL_IDX, 0); sp -= 4; - cpu_stl_kernel(env, sp, fmt); + cpu_stl_mmuidx_ra(env, sp, fmt, MMU_KERNEL_IDX, 0); env->aregs[7] = sp; /* Jump to vector. */ - env->pc = cpu_ldl_kernel(env, env->vbr + vector); + env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0); } static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp, @@ -278,23 +278,24 @@ static inline void do_stack_frame(CPUM68KState *env, uint32_t *sp, switch (format) { case 4: *sp -= 4; - cpu_stl_kernel(env, *sp, env->pc); + cpu_stl_mmuidx_ra(env, *sp, env->pc, MMU_KERNEL_IDX, 0); *sp -= 4; - cpu_stl_kernel(env, *sp, addr); + cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0); break; case 3: case 2: *sp -= 4; - cpu_stl_kernel(env, *sp, addr); + cpu_stl_mmuidx_ra(env, *sp, addr, MMU_KERNEL_IDX, 0); break; } *sp -= 2; - cpu_stw_kernel(env, *sp, (format << 12) + (cs->exception_index << 2)); + cpu_stw_mmuidx_ra(env, *sp, (format << 12) + (cs->exception_index << 2), + MMU_KERNEL_IDX, 0); } *sp -= 4; - cpu_stl_kernel(env, *sp, retaddr); + cpu_stl_mmuidx_ra(env, *sp, retaddr, MMU_KERNEL_IDX, 0); *sp -= 2; - cpu_stw_kernel(env, *sp, sr); + cpu_stw_mmuidx_ra(env, *sp, sr, MMU_KERNEL_IDX, 0); } static void m68k_interrupt_all(CPUM68KState *env, int is_hw) @@ -353,36 +354,52 @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw) cpu_abort(cs, "DOUBLE MMU FAULT\n"); } env->mmu.fault = true; + /* push data 3 */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* push data 3 */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* push data 2 */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* push data 2 */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* push data 1 */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* push data 1 */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 1 / push data 0 */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* write back 1 / push data 0 */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 1 address */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* write back 1 address */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 2 data */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* write back 2 data */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 2 address */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* write back 2 address */ + cpu_stl_mmuidx_ra(env, sp, 0, 
MMU_KERNEL_IDX, 0); + /* write back 3 data */ sp -= 4; - cpu_stl_kernel(env, sp, 0); /* write back 3 data */ + cpu_stl_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 3 address */ sp -= 4; - cpu_stl_kernel(env, sp, env->mmu.ar); /* write back 3 address */ + cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0); + /* fault address */ sp -= 4; - cpu_stl_kernel(env, sp, env->mmu.ar); /* fault address */ + cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0); + /* write back 1 status */ sp -= 2; - cpu_stw_kernel(env, sp, 0); /* write back 1 status */ + cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 2 status */ sp -= 2; - cpu_stw_kernel(env, sp, 0); /* write back 2 status */ + cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* write back 3 status */ sp -= 2; - cpu_stw_kernel(env, sp, 0); /* write back 3 status */ + cpu_stw_mmuidx_ra(env, sp, 0, MMU_KERNEL_IDX, 0); + /* special status word */ sp -= 2; - cpu_stw_kernel(env, sp, env->mmu.ssw); /* special status word */ + cpu_stw_mmuidx_ra(env, sp, env->mmu.ssw, MMU_KERNEL_IDX, 0); + /* effective address */ sp -= 4; - cpu_stl_kernel(env, sp, env->mmu.ar); /* effective address */ + cpu_stl_mmuidx_ra(env, sp, env->mmu.ar, MMU_KERNEL_IDX, 0); + do_stack_frame(env, &sp, 7, oldsr, 0, retaddr); env->mmu.fault = false; if (qemu_loglevel_mask(CPU_LOG_INT)) { @@ -414,7 +431,7 @@ static void m68k_interrupt_all(CPUM68KState *env, int is_hw) env->aregs[7] = sp; /* Jump to vector. */ - env->pc = cpu_ldl_kernel(env, env->vbr + vector); + env->pc = cpu_ldl_mmuidx_ra(env, env->vbr + vector, MMU_KERNEL_IDX, 0); } static void do_interrupt_all(CPUM68KState *env, int is_hw) diff --git a/target/m68k/translate.c b/target/m68k/translate.c index fcdb7bc8e4..0f80888203 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -22,7 +22,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "qemu/log.h" #include "qemu/qemu-print.h" #include "exec/cpu_ldst.h" @@ -289,16 +289,21 @@ static void gen_jmp(DisasContext *s, TCGv dest) s->base.is_jmp = DISAS_JUMP; } -static void gen_exception(DisasContext *s, uint32_t dest, int nr) +static void gen_raise_exception(int nr) { TCGv_i32 tmp; - update_cc_op(s); - tcg_gen_movi_i32(QREG_PC, dest); - tmp = tcg_const_i32(nr); gen_helper_raise_exception(cpu_env, tmp); tcg_temp_free_i32(tmp); +} + +static void gen_exception(DisasContext *s, uint32_t dest, int nr) +{ + update_cc_op(s); + tcg_gen_movi_i32(QREG_PC, dest); + + gen_raise_exception(nr); s->base.is_jmp = DISAS_NORETURN; } @@ -6198,29 +6203,36 @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) { DisasContext *dc = container_of(dcbase, DisasContext, base); - if (dc->base.is_jmp == DISAS_NORETURN) { - return; - } - if (dc->base.singlestep_enabled) { - gen_helper_raise_exception(cpu_env, tcg_const_i32(EXCP_DEBUG)); - return; - } - switch (dc->base.is_jmp) { + case DISAS_NORETURN: + break; case DISAS_TOO_MANY: update_cc_op(dc); - gen_jmp_tb(dc, 0, dc->pc); + if (dc->base.singlestep_enabled) { + tcg_gen_movi_i32(QREG_PC, dc->pc); + gen_raise_exception(EXCP_DEBUG); + } else { + gen_jmp_tb(dc, 0, dc->pc); + } break; case DISAS_JUMP: /* We updated CC_OP and PC in gen_jmp/gen_jmp_im. 
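The m68k op_helper.c hunks above follow the same pattern as the i386 seg_helper.c change: the suffixed per-mode accessors generated by the removed MMU_MODE*_SUFFIX machinery are replaced by the generic cpu_*_mmuidx_ra() accessors with an explicit MMU index and a zero return address. If compatibility wrappers were wanted instead of open-coding each call, they could look like the following sketch (it assumes QEMU's exec/cpu_ldst.h accessors and the m68k MMU_KERNEL_IDX constant, and is not standalone code):

/* Hypothetical wrappers, mirroring the i386 approach above -- a sketch only. */
#define cpu_ldl_kernel(env, addr) \
    cpu_ldl_mmuidx_ra(env, addr, MMU_KERNEL_IDX, 0)
#define cpu_lduw_kernel(env, addr) \
    cpu_lduw_mmuidx_ra(env, addr, MMU_KERNEL_IDX, 0)
#define cpu_stl_kernel(env, addr, val) \
    cpu_stl_mmuidx_ra(env, addr, val, MMU_KERNEL_IDX, 0)
#define cpu_stw_kernel(env, addr, val) \
    cpu_stw_mmuidx_ra(env, addr, val, MMU_KERNEL_IDX, 0)

The patch instead open-codes each call, which keeps the MMU index visible at every site.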
*/ - tcg_gen_lookup_and_goto_ptr(); + if (dc->base.singlestep_enabled) { + gen_raise_exception(EXCP_DEBUG); + } else { + tcg_gen_lookup_and_goto_ptr(); + } break; case DISAS_EXIT: /* * We updated CC_OP and PC in gen_exit_tb, but also modified * other state that may require returning to the main loop. */ - tcg_gen_exit_tb(NULL, 0); + if (dc->base.singlestep_enabled) { + gen_raise_exception(EXCP_DEBUG); + } else { + tcg_gen_exit_tb(NULL, 0); + } break; default: g_assert_not_reached(); diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h index 95773089aa..32522f606b 100644 --- a/target/microblaze/cpu.h +++ b/target/microblaze/cpu.h @@ -328,9 +328,6 @@ int cpu_mb_signal_handler(int host_signum, void *pinfo, #define cpu_signal_handler cpu_mb_signal_handler /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _nommu -#define MMU_MODE1_SUFFIX _kernel -#define MMU_MODE2_SUFFIX _user #define MMU_NOMMU_IDX 0 #define MMU_KERNEL_IDX 1 #define MMU_USER_IDX 2 diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c index 525115b041..37a844db99 100644 --- a/target/microblaze/translate.c +++ b/target/microblaze/translate.c @@ -22,7 +22,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/helper-proto.h" #include "microblaze-decode.h" #include "exec/cpu_ldst.h" diff --git a/target/mips/cpu.h b/target/mips/cpu.h index ca00f41daf..c218ccc4a8 100644 --- a/target/mips/cpu.h +++ b/target/mips/cpu.h @@ -1147,10 +1147,6 @@ extern uint32_t cpu_rddsp(uint32_t mask_num, CPUMIPSState *env); * MMU modes definitions. We carefully match the indices with our * hflags layout. */ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _super -#define MMU_MODE2_SUFFIX _user -#define MMU_MODE3_SUFFIX _error #define MMU_USER_IDX 2 static inline int hflags_mmu_index(uint32_t hflags) diff --git a/target/mips/op_helper.c b/target/mips/op_helper.c index 18fcee4a78..79d44da6fa 100644 --- a/target/mips/op_helper.c +++ b/target/mips/op_helper.c @@ -52,69 +52,6 @@ static void raise_exception(CPUMIPSState *env, uint32_t exception) do_raise_exception(env, exception, 0); } -#if defined(CONFIG_USER_ONLY) -#define HELPER_LD(name, insn, type) \ -static inline type do_##name(CPUMIPSState *env, target_ulong addr, \ - int mem_idx, uintptr_t retaddr) \ -{ \ - return (type) cpu_##insn##_data_ra(env, addr, retaddr); \ -} -#else -#define HELPER_LD(name, insn, type) \ -static inline type do_##name(CPUMIPSState *env, target_ulong addr, \ - int mem_idx, uintptr_t retaddr) \ -{ \ - switch (mem_idx) { \ - case 0: return (type) cpu_##insn##_kernel_ra(env, addr, retaddr); \ - case 1: return (type) cpu_##insn##_super_ra(env, addr, retaddr); \ - default: \ - case 2: return (type) cpu_##insn##_user_ra(env, addr, retaddr); \ - case 3: return (type) cpu_##insn##_error_ra(env, addr, retaddr); \ - } \ -} -#endif -HELPER_LD(lw, ldl, int32_t) -#if defined(TARGET_MIPS64) -HELPER_LD(ld, ldq, int64_t) -#endif -#undef HELPER_LD - -#if defined(CONFIG_USER_ONLY) -#define HELPER_ST(name, insn, type) \ -static inline void do_##name(CPUMIPSState *env, target_ulong addr, \ - type val, int mem_idx, uintptr_t retaddr) \ -{ \ - cpu_##insn##_data_ra(env, addr, val, retaddr); \ -} -#else -#define HELPER_ST(name, insn, type) \ -static inline void do_##name(CPUMIPSState *env, target_ulong addr, \ - type val, int mem_idx, uintptr_t retaddr) \ -{ \ - switch (mem_idx) { \ - case 0: \ - cpu_##insn##_kernel_ra(env, addr, val, retaddr); \ - break; \ - case 1: \ - 
cpu_##insn##_super_ra(env, addr, val, retaddr); \ - break; \ - default: \ - case 2: \ - cpu_##insn##_user_ra(env, addr, val, retaddr); \ - break; \ - case 3: \ - cpu_##insn##_error_ra(env, addr, val, retaddr); \ - break; \ - } \ -} -#endif -HELPER_ST(sb, stb, uint8_t) -HELPER_ST(sw, stl, uint32_t) -#if defined(TARGET_MIPS64) -HELPER_ST(sd, stq, uint64_t) -#endif -#undef HELPER_ST - /* 64 bits arithmetic for 32 bits hosts */ static inline uint64_t get_HILO(CPUMIPSState *env) { @@ -379,12 +316,12 @@ target_ulong helper_##name(CPUMIPSState *env, target_ulong arg, int mem_idx) \ } \ env->CP0_LLAddr = do_translate_address(env, arg, 0, GETPC()); \ env->lladdr = arg; \ - env->llval = do_##insn(env, arg, mem_idx, GETPC()); \ + env->llval = cpu_##insn##_mmuidx_ra(env, arg, mem_idx, GETPC()); \ return env->llval; \ } -HELPER_LD_ATOMIC(ll, lw, 0x3) +HELPER_LD_ATOMIC(ll, ldl, 0x3) #ifdef TARGET_MIPS64 -HELPER_LD_ATOMIC(lld, ld, 0x7) +HELPER_LD_ATOMIC(lld, ldq, 0x7) #endif #undef HELPER_LD_ATOMIC #endif @@ -400,42 +337,42 @@ HELPER_LD_ATOMIC(lld, ld, 0x7) void helper_swl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2, int mem_idx) { - do_sb(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC()); + cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 24), mem_idx, GETPC()); if (GET_LMASK(arg2) <= 2) { - do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 16), + mem_idx, GETPC()); } if (GET_LMASK(arg2) <= 1) { - do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 8), + mem_idx, GETPC()); } if (GET_LMASK(arg2) == 0) { - do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)arg1, mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)arg1, + mem_idx, GETPC()); } } void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2, int mem_idx) { - do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC()); + cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC()); if (GET_LMASK(arg2) >= 1) { - do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), + mem_idx, GETPC()); } if (GET_LMASK(arg2) >= 2) { - do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), + mem_idx, GETPC()); } if (GET_LMASK(arg2) == 3) { - do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), + mem_idx, GETPC()); } } @@ -453,82 +390,82 @@ void helper_swr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2, void helper_sdl(CPUMIPSState *env, target_ulong arg1, target_ulong arg2, int mem_idx) { - do_sb(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC()); + cpu_stb_mmuidx_ra(env, arg2, (uint8_t)(arg1 >> 56), mem_idx, GETPC()); if (GET_LMASK64(arg2) <= 6) { - do_sb(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 1), (uint8_t)(arg1 >> 48), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) <= 5) { - do_sb(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 2), (uint8_t)(arg1 >> 40), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) <= 4) { - do_sb(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 32), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 3), (uint8_t)(arg1 >> 
32), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) <= 3) { - do_sb(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 4), (uint8_t)(arg1 >> 24), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) <= 2) { - do_sb(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 5), (uint8_t)(arg1 >> 16), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) <= 1) { - do_sb(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 6), (uint8_t)(arg1 >> 8), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) <= 0) { - do_sb(env, GET_OFFSET(arg2, 7), (uint8_t)arg1, mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, 7), (uint8_t)arg1, + mem_idx, GETPC()); } } void helper_sdr(CPUMIPSState *env, target_ulong arg1, target_ulong arg2, int mem_idx) { - do_sb(env, arg2, (uint8_t)arg1, mem_idx, GETPC()); + cpu_stb_mmuidx_ra(env, arg2, (uint8_t)arg1, mem_idx, GETPC()); if (GET_LMASK64(arg2) >= 1) { - do_sb(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -1), (uint8_t)(arg1 >> 8), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) >= 2) { - do_sb(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -2), (uint8_t)(arg1 >> 16), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) >= 3) { - do_sb(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -3), (uint8_t)(arg1 >> 24), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) >= 4) { - do_sb(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -4), (uint8_t)(arg1 >> 32), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) >= 5) { - do_sb(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -5), (uint8_t)(arg1 >> 40), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) >= 6) { - do_sb(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -6), (uint8_t)(arg1 >> 48), + mem_idx, GETPC()); } if (GET_LMASK64(arg2) == 7) { - do_sb(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56), mem_idx, - GETPC()); + cpu_stb_mmuidx_ra(env, GET_OFFSET(arg2, -7), (uint8_t)(arg1 >> 56), + mem_idx, GETPC()); } } #endif /* TARGET_MIPS64 */ @@ -546,14 +483,14 @@ void helper_lwm(CPUMIPSState *env, target_ulong addr, target_ulong reglist, for (i = 0; i < base_reglist; i++) { env->active_tc.gpr[multiple_regs[i]] = - (target_long)do_lw(env, addr, mem_idx, GETPC()); + (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC()); addr += 4; } } if (do_r31) { - env->active_tc.gpr[31] = (target_long)do_lw(env, addr, mem_idx, - GETPC()); + env->active_tc.gpr[31] = + (target_long)cpu_ldl_mmuidx_ra(env, addr, mem_idx, GETPC()); } } @@ -567,14 +504,14 @@ void helper_swm(CPUMIPSState *env, target_ulong addr, target_ulong reglist, target_ulong i; for (i = 0; i < base_reglist; i++) { - do_sw(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx, - GETPC()); + cpu_stw_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]], + mem_idx, GETPC()); addr += 4; } } if (do_r31) { - do_sw(env, addr, env->active_tc.gpr[31], mem_idx, GETPC()); + cpu_stw_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC()); } } @@ -589,14 +526,15 @@ void helper_ldm(CPUMIPSState *env, target_ulong addr, target_ulong reglist, target_ulong i; for 
(i = 0; i < base_reglist; i++) { - env->active_tc.gpr[multiple_regs[i]] = do_ld(env, addr, mem_idx, - GETPC()); + env->active_tc.gpr[multiple_regs[i]] = + cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC()); addr += 8; } } if (do_r31) { - env->active_tc.gpr[31] = do_ld(env, addr, mem_idx, GETPC()); + env->active_tc.gpr[31] = + cpu_ldq_mmuidx_ra(env, addr, mem_idx, GETPC()); } } @@ -610,14 +548,14 @@ void helper_sdm(CPUMIPSState *env, target_ulong addr, target_ulong reglist, target_ulong i; for (i = 0; i < base_reglist; i++) { - do_sd(env, addr, env->active_tc.gpr[multiple_regs[i]], mem_idx, - GETPC()); + cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[multiple_regs[i]], + mem_idx, GETPC()); addr += 8; } } if (do_r31) { - do_sd(env, addr, env->active_tc.gpr[31], mem_idx, GETPC()); + cpu_stq_mmuidx_ra(env, addr, env->active_tc.gpr[31], mem_idx, GETPC()); } } #endif diff --git a/target/mips/translate.c b/target/mips/translate.c index 4bff585bd6..efe75e6be0 100644 --- a/target/mips/translate.c +++ b/target/mips/translate.c @@ -26,7 +26,7 @@ #include "internal.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "hw/mips/cpudevs.h" diff --git a/target/moxie/translate.c b/target/moxie/translate.c index c87e9ec2b1..d5fb27dfb8 100644 --- a/target/moxie/translate.c +++ b/target/moxie/translate.c @@ -26,7 +26,7 @@ #include "cpu.h" #include "exec/exec-all.h" #include "disas/disas.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "qemu/qemu-print.h" diff --git a/target/nios2/cpu.h b/target/nios2/cpu.h index 361b06ffeb..78f633f970 100644 --- a/target/nios2/cpu.h +++ b/target/nios2/cpu.h @@ -217,8 +217,6 @@ void do_nios2_semihosting(CPUNios2State *env); #define CPU_SAVE_VERSION 1 /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _user #define MMU_SUPERVISOR_IDX 0 #define MMU_USER_IDX 1 diff --git a/target/nios2/translate.c b/target/nios2/translate.c index 82107bf270..6c34cd3193 100644 --- a/target/nios2/translate.c +++ b/target/nios2/translate.c @@ -23,7 +23,7 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/exec-all.h" #include "disas/disas.h" #include "exec/helper-proto.h" diff --git a/target/openrisc/fpu_helper.c b/target/openrisc/fpu_helper.c index 59e1413279..6f75ea0505 100644 --- a/target/openrisc/fpu_helper.c +++ b/target/openrisc/fpu_helper.c @@ -70,7 +70,7 @@ void cpu_set_fpcsr(CPUOpenRISCState *env, uint32_t val) float_round_down }; - env->fpcsr = val & 0x7ff; + env->fpcsr = val & 0xfff; set_float_rounding_mode(rm_to_sf[extract32(val, 1, 2)], &env->fp_status); } diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c index 8dd28d6cf1..52323a16df 100644 --- a/target/openrisc/translate.c +++ b/target/openrisc/translate.c @@ -22,7 +22,7 @@ #include "cpu.h" #include "exec/exec-all.h" #include "disas/disas.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "qemu/log.h" #include "qemu/bitops.h" #include "qemu/qemu-print.h" diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 103bfe9dc2..8ebeaba649 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -951,8 +951,6 @@ struct ppc_radix_page_info { * + real/paged mode combinations. The other two modes are for * external PID load/store. 
*/ -#define MMU_MODE8_SUFFIX _epl -#define MMU_MODE9_SUFFIX _eps #define PPC_TLB_EPID_LOAD 8 #define PPC_TLB_EPID_STORE 9 diff --git a/target/ppc/mem_helper.c b/target/ppc/mem_helper.c index 1351b53f28..e8e2a8ac2a 100644 --- a/target/ppc/mem_helper.c +++ b/target/ppc/mem_helper.c @@ -25,7 +25,7 @@ #include "exec/helper-proto.h" #include "helper_regs.h" #include "exec/cpu_ldst.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "internal.h" #include "qemu/atomic128.h" @@ -177,14 +177,7 @@ static void dcbz_common(CPUPPCState *env, target_ulong addr, } else { /* Slow path */ for (i = 0; i < dcbz_size; i += 8) { - if (epid) { -#if !defined(CONFIG_USER_ONLY) - /* Does not make sense on USER_ONLY config */ - cpu_stq_eps_ra(env, addr + i, 0, retaddr); -#endif - } else { - cpu_stq_data_ra(env, addr + i, 0, retaddr); - } + cpu_stq_mmuidx_ra(env, addr + i, 0, mmu_idx, retaddr); } } } @@ -216,7 +209,7 @@ void helper_icbiep(CPUPPCState *env, target_ulong addr) #if !defined(CONFIG_USER_ONLY) /* See comments above */ addr &= ~(env->dcache_line_size - 1); - cpu_ldl_epl_ra(env, addr, GETPC()); + cpu_ldl_mmuidx_ra(env, addr, PPC_TLB_EPID_LOAD, GETPC()); #endif } diff --git a/target/ppc/translate.c b/target/ppc/translate.c index f5fe5d0611..9dcf8dc261 100644 --- a/target/ppc/translate.c +++ b/target/ppc/translate.c @@ -23,8 +23,8 @@ #include "internal.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" -#include "tcg-op-gvec.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "qemu/host-utils.h" #include "qemu/main-loop.h" #include "exec/cpu_ldst.h" diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index 767c8762ac..85403da9c8 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -22,7 +22,7 @@ #include "qemu/main-loop.h" #include "cpu.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "trace.h" int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch) diff --git a/target/riscv/translate.c b/target/riscv/translate.c index 8e40ed3ac4..14dc71156b 100644 --- a/target/riscv/translate.c +++ b/target/riscv/translate.c @@ -19,7 +19,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" #include "cpu.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "disas/disas.h" #include "exec/cpu_ldst.h" #include "exec/exec-all.h" diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h index e195e5c7c8..8a557fd8d1 100644 --- a/target/s390x/cpu.h +++ b/target/s390x/cpu.h @@ -36,11 +36,6 @@ #define TARGET_INSN_START_EXTRA_WORDS 2 -#define MMU_MODE0_SUFFIX _primary -#define MMU_MODE1_SUFFIX _secondary -#define MMU_MODE2_SUFFIX _home -#define MMU_MODE3_SUFFIX _real - #define MMU_USER_IDX 0 #define S390_MAX_CPUS 248 diff --git a/target/s390x/mem_helper.c b/target/s390x/mem_helper.c index 2921419c27..a237dec757 100644 --- a/target/s390x/mem_helper.c +++ b/target/s390x/mem_helper.c @@ -27,6 +27,7 @@ #include "exec/cpu_ldst.h" #include "qemu/int128.h" #include "qemu/atomic128.h" +#include "tcg/tcg.h" #if !defined(CONFIG_USER_ONLY) #include "hw/s390x/storage-keys.h" @@ -2025,7 +2026,7 @@ uint32_t HELPER(testblock)(CPUS390XState *env, uint64_t real_addr) real_addr = wrap_address(env, real_addr) & TARGET_PAGE_MASK; for (i = 0; i < TARGET_PAGE_SIZE; i += 8) { - cpu_stq_real_ra(env, real_addr + i, 0, ra); + cpu_stq_mmuidx_ra(env, real_addr + i, 0, MMU_REAL_IDX, ra); } return 0; @@ -2259,11 +2260,11 @@ void HELPER(idte)(CPUS390XState *env, uint64_t r1, uint64_t r2, uint32_t m4) for (i = 0; i < entries; i++) { /* addresses are not 
wrapped in 24/31bit mode but table index is */ raddr = table + ((index + i) & 0x7ff) * sizeof(entry); - entry = cpu_ldq_real_ra(env, raddr, ra); + entry = cpu_ldq_mmuidx_ra(env, raddr, MMU_REAL_IDX, ra); if (!(entry & REGION_ENTRY_I)) { /* we are allowed to not store if already invalid */ entry |= REGION_ENTRY_I; - cpu_stq_real_ra(env, raddr, entry, ra); + cpu_stq_mmuidx_ra(env, raddr, entry, MMU_REAL_IDX, ra); } } } @@ -2290,9 +2291,9 @@ void HELPER(ipte)(CPUS390XState *env, uint64_t pto, uint64_t vaddr, pte_addr += VADDR_PAGE_TX(vaddr) * 8; /* Mark the page table entry as invalid */ - pte = cpu_ldq_real_ra(env, pte_addr, ra); + pte = cpu_ldq_mmuidx_ra(env, pte_addr, MMU_REAL_IDX, ra); pte |= PAGE_ENTRY_I; - cpu_stq_real_ra(env, pte_addr, pte, ra); + cpu_stq_mmuidx_ra(env, pte_addr, pte, MMU_REAL_IDX, ra); /* XXX we exploit the fact that Linux passes the exact virtual address here - it's not obliged to! */ diff --git a/target/s390x/translate.c b/target/s390x/translate.c index 4292bb0dd0..b764ec3140 100644 --- a/target/s390x/translate.c +++ b/target/s390x/translate.c @@ -33,8 +33,8 @@ #include "internal.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" -#include "tcg-op-gvec.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "qemu/log.h" #include "qemu/host-utils.h" #include "exec/cpu_ldst.h" diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h index ecaa7a18a9..452a596e67 100644 --- a/target/sh4/cpu.h +++ b/target/sh4/cpu.h @@ -254,8 +254,6 @@ void cpu_load_tlb(CPUSH4State * env); #define cpu_list sh4_cpu_list /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _user #define MMU_USER_IDX 1 static inline int cpu_mmu_index (CPUSH4State *env, bool ifetch) { diff --git a/target/sh4/translate.c b/target/sh4/translate.c index 922785e225..6192d83e8c 100644 --- a/target/sh4/translate.c +++ b/target/sh4/translate.c @@ -23,7 +23,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "exec/helper-proto.h" #include "exec/helper-gen.h" diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c index 7345827a96..e91cfdecd3 100644 --- a/target/sparc/ldst_helper.c +++ b/target/sparc/ldst_helper.c @@ -19,7 +19,7 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "tcg.h" +#include "tcg/tcg.h" #include "exec/helper-proto.h" #include "exec/exec-all.h" #include "exec/cpu_ldst.h" diff --git a/target/sparc/translate.c b/target/sparc/translate.c index edc23a7c40..9416a551cf 100644 --- a/target/sparc/translate.c +++ b/target/sparc/translate.c @@ -24,7 +24,7 @@ #include "disas/disas.h" #include "exec/helper-proto.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "exec/helper-gen.h" diff --git a/target/tilegx/translate.c b/target/tilegx/translate.c index abce7e1c75..65f1c91f4f 100644 --- a/target/tilegx/translate.c +++ b/target/tilegx/translate.c @@ -24,7 +24,7 @@ #include "exec/log.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" #include "linux-user/syscall_defs.h" diff --git a/target/tricore/translate.c b/target/tricore/translate.c index c574638c9f..609d75ae8a 100644 --- a/target/tricore/translate.c +++ b/target/tricore/translate.c @@ -22,7 +22,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "exec/cpu_ldst.h" 
#include "qemu/qemu-print.h" diff --git a/target/unicore32/cpu.h b/target/unicore32/cpu.h index 50ed9dd99c..7a32e086ed 100644 --- a/target/unicore32/cpu.h +++ b/target/unicore32/cpu.h @@ -133,8 +133,6 @@ void cpu_asr_write(CPUUniCore32State *env1, target_ulong val, target_ulong mask) int uc32_cpu_signal_handler(int host_signum, void *pinfo, void *puc); /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _kernel -#define MMU_MODE1_SUFFIX _user #define MMU_USER_IDX 1 static inline int cpu_mmu_index(CPUUniCore32State *env, bool ifetch) { diff --git a/target/unicore32/translate.c b/target/unicore32/translate.c index 0f6891b8aa..d4b06df672 100644 --- a/target/unicore32/translate.c +++ b/target/unicore32/translate.c @@ -13,7 +13,7 @@ #include "cpu.h" #include "disas/disas.h" #include "exec/exec-all.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "qemu/log.h" #include "exec/cpu_ldst.h" #include "exec/translator.h" diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h index 75e65df597..493f4fc80c 100644 --- a/target/xtensa/cpu.h +++ b/target/xtensa/cpu.h @@ -689,10 +689,6 @@ static inline uint32_t xtensa_replicate_windowstart(CPUXtensaState *env) } /* MMU modes definitions */ -#define MMU_MODE0_SUFFIX _ring0 -#define MMU_MODE1_SUFFIX _ring1 -#define MMU_MODE2_SUFFIX _ring2 -#define MMU_MODE3_SUFFIX _ring3 #define MMU_USER_IDX 3 static inline int cpu_mmu_index(CPUXtensaState *env, bool ifetch) diff --git a/target/xtensa/mmu_helper.c b/target/xtensa/mmu_helper.c index f15bff306f..b01ff9399a 100644 --- a/target/xtensa/mmu_helper.c +++ b/target/xtensa/mmu_helper.c @@ -63,10 +63,11 @@ void HELPER(itlb_hit_test)(CPUXtensaState *env, uint32_t vaddr) { /* - * Attempt the memory load; we don't care about the result but + * Probe the memory; we don't care about the result but * only the side-effects (ie any MMU or other exception) */ - cpu_ldub_code_ra(env, vaddr, GETPC()); + probe_access(env, vaddr, 1, MMU_INST_FETCH, + cpu_mmu_index(env, true), GETPC()); } void HELPER(wsr_rasid)(CPUXtensaState *env, uint32_t v) diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c index e6d910786c..8aa972cafd 100644 --- a/target/xtensa/translate.c +++ b/target/xtensa/translate.c @@ -33,7 +33,7 @@ #include "cpu.h" #include "exec/exec-all.h" #include "disas/disas.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #include "qemu/log.h" #include "qemu/qemu-print.h" #include "exec/cpu_ldst.h" diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c index 3f921015d3..843fd0ca69 100644 --- a/tcg/aarch64/tcg-target.inc.c +++ b/tcg/aarch64/tcg-target.inc.c @@ -10,7 +10,7 @@ * See the COPYING file in the top-level directory for details. 
*/ -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" #include "qemu/bitops.h" /* We're going to re-use TCGType in setting of the SF bit, which controls @@ -1541,7 +1541,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d, } #ifdef CONFIG_SOFTMMU -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, * TCGMemOpIdx oi, uintptr_t ra) diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c index 94d80d79d1..fffb6611e2 100644 --- a/tcg/arm/tcg-target.inc.c +++ b/tcg/arm/tcg-target.inc.c @@ -23,7 +23,7 @@ */ #include "elf.h" -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" int arm_arch = __ARM_ARCH; @@ -1131,7 +1131,7 @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args, } #ifdef CONFIG_SOFTMMU -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, * int mmu_idx, uintptr_t ra) diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 928e8b87bb..bfb3f5f6e9 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -223,7 +223,7 @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, * The x86 has a pretty strong memory ordering which only really * allows for some stores to be re-ordered after loads. */ -#include "tcg-mo.h" +#include "tcg/tcg-mo.h" #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD) diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c index 9d8ed974e0..cdedcb2b25 100644 --- a/tcg/i386/tcg-target.inc.c +++ b/tcg/i386/tcg-target.inc.c @@ -22,7 +22,7 @@ * THE SOFTWARE. */ -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { @@ -1647,7 +1647,7 @@ static void tcg_out_nopn(TCGContext *s, int n) } #if defined(CONFIG_SOFTMMU) -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, * int mmu_idx, uintptr_t ra) diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c index 5442167045..1da663ce84 100644 --- a/tcg/mips/tcg-target.inc.c +++ b/tcg/mips/tcg-target.inc.c @@ -1107,7 +1107,7 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *arg) } #if defined(CONFIG_SOFTMMU) -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" static void * const qemu_ld_helpers[16] = { [MO_UB] = helper_ret_ldub_mmu, diff --git a/tcg/optimize.c b/tcg/optimize.c index f7f4e873c9..53aa8e5329 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -24,7 +24,7 @@ */ #include "qemu/osdep.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" #define CASE_OP_32_64(x) \ glue(glue(case INDEX_op_, x), _i32): \ diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c index d308d69aba..ee1f9227c1 100644 --- a/tcg/ppc/tcg-target.inc.c +++ b/tcg/ppc/tcg-target.inc.c @@ -23,7 +23,7 @@ */ #include "elf.h" -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" #if defined _CALL_DARWIN || defined __APPLE__ #define TCG_TARGET_CALL_DARWIN @@ -1845,7 +1845,7 @@ static const uint32_t qemu_exts_opc[4] = { }; #if defined (CONFIG_SOFTMMU) -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, * int mmu_idx, uintptr_t ra) diff --git a/tcg/riscv/tcg-target.inc.c b/tcg/riscv/tcg-target.inc.c index 7018509693..2bc0ba71f2 100644 --- a/tcg/riscv/tcg-target.inc.c +++ b/tcg/riscv/tcg-target.inc.c @@ -27,7 +27,7 @@ * THE SOFTWARE. 
*/ -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { @@ -921,7 +921,7 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0) */ #if defined(CONFIG_SOFTMMU) -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, * TCGMemOpIdx oi, uintptr_t ra) diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c index 8aaa4cebe8..b07e9ff7d6 100644 --- a/tcg/s390/tcg-target.inc.c +++ b/tcg/s390/tcg-target.inc.c @@ -29,7 +29,7 @@ #error "unsupported code generation mode" #endif -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" #include "elf.h" /* ??? The translation blocks produced by TCG are generally small enough to @@ -1536,7 +1536,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg data, } #if defined(CONFIG_SOFTMMU) -#include "tcg-ldst.inc.c" +#include "../tcg-ldst.inc.c" /* We're expecting to use a 20-bit negative offset on the tlb memory ops. */ QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0); diff --git a/tcg/sparc/tcg-target.inc.c b/tcg/sparc/tcg-target.inc.c index d7986cda5c..65fddb310d 100644 --- a/tcg/sparc/tcg-target.inc.c +++ b/tcg/sparc/tcg-target.inc.c @@ -22,7 +22,7 @@ * THE SOFTWARE. */ -#include "tcg-pool.inc.c" +#include "../tcg-pool.inc.c" #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { diff --git a/tcg/tcg-common.c b/tcg/tcg-common.c index 97305a3efc..7e1992e79e 100644 --- a/tcg/tcg-common.c +++ b/tcg/tcg-common.c @@ -32,7 +32,7 @@ uintptr_t tci_tb_ptr; TCGOpDef tcg_op_defs[] = { #define DEF(s, oargs, iargs, cargs, flags) \ { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags }, -#include "tcg-opc.h" +#include "tcg/tcg-opc.h" #undef DEF }; const size_t tcg_op_defs_max = ARRAY_SIZE(tcg_op_defs); diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index 5c95ecd51c..41b4a3c661 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -18,11 +18,11 @@ */ #include "qemu/osdep.h" -#include "tcg.h" -#include "tcg-op.h" -#include "tcg-op-gvec.h" +#include "tcg/tcg.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-op-gvec.h" #include "qemu/main-loop.h" -#include "tcg-gvec-desc.h" +#include "tcg/tcg-gvec-desc.h" #define MAX_UNROLL 4 diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c index 6714991bf4..b6937e8d64 100644 --- a/tcg/tcg-op-vec.c +++ b/tcg/tcg-op-vec.c @@ -19,9 +19,9 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "tcg.h" -#include "tcg-op.h" -#include "tcg-mo.h" +#include "tcg/tcg.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-mo.h" /* Reduce the number of ifdefs below. This assumes that all uses of TCGV_HIGH and TCGV_LOW are properly protected by a conditional that diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index c245126f98..7d782002e3 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -25,9 +25,9 @@ #include "qemu/osdep.h" #include "cpu.h" #include "exec/exec-all.h" -#include "tcg.h" -#include "tcg-op.h" -#include "tcg-mo.h" +#include "tcg/tcg.h" +#include "tcg/tcg-op.h" +#include "tcg/tcg-mo.h" #include "trace-tcg.h" #include "trace/mem.h" #include "exec/plugin-gen.h" @@ -48,7 +48,7 @@ #include "hw/boards.h" #endif -#include "tcg-op.h" +#include "tcg/tcg-op.h" #if UINTPTR_MAX == UINT32_MAX # define ELF_CLASS ELFCLASS32 @@ -30,7 +30,7 @@ #include "qemu-common.h" #include "tcg/tcg.h" /* MAX_OPC_PARAM_IARGS */ #include "exec/cpu_ldst.h" -#include "tcg-op.h" +#include "tcg/tcg-op.h" /* Marker for missing code. 
*/ #define TODO() \ diff --git a/tests/acceptance/boot_linux_console.py b/tests/acceptance/boot_linux_console.py index 9c6aa2040a..e40b84651b 100644 --- a/tests/acceptance/boot_linux_console.py +++ b/tests/acceptance/boot_linux_console.py @@ -400,6 +400,91 @@ class BootLinuxConsole(Test): self.wait_for_console_pattern('Boot successful.') # TODO user command, for now the uart is stuck + def test_arm_cubieboard_initrd(self): + """ + :avocado: tags=arch:arm + :avocado: tags=machine:cubieboard + """ + deb_url = ('https://apt.armbian.com/pool/main/l/' + 'linux-4.20.7-sunxi/linux-image-dev-sunxi_5.75_armhf.deb') + deb_hash = '1334c29c44d984ffa05ed10de8c3361f33d78315' + deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash) + kernel_path = self.extract_from_deb(deb_path, + '/boot/vmlinuz-4.20.7-sunxi') + dtb_path = '/usr/lib/linux-image-dev-sunxi/sun4i-a10-cubieboard.dtb' + dtb_path = self.extract_from_deb(deb_path, dtb_path) + initrd_url = ('https://github.com/groeck/linux-build-test/raw/' + '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/' + 'arm/rootfs-armv5.cpio.gz') + initrd_hash = '2b50f1873e113523967806f4da2afe385462ff9b' + initrd_path_gz = self.fetch_asset(initrd_url, asset_hash=initrd_hash) + initrd_path = os.path.join(self.workdir, 'rootfs.cpio') + archive.gzip_uncompress(initrd_path_gz, initrd_path) + + self.vm.set_console() + kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE + + 'console=ttyS0,115200 ' + 'usbcore.nousb ' + 'panic=-1 noreboot') + self.vm.add_args('-kernel', kernel_path, + '-dtb', dtb_path, + '-initrd', initrd_path, + '-append', kernel_command_line, + '-no-reboot') + self.vm.launch() + self.wait_for_console_pattern('Boot successful.') + + exec_command_and_wait_for_pattern(self, 'cat /proc/cpuinfo', + 'Allwinner sun4i/sun5i') + exec_command_and_wait_for_pattern(self, 'cat /proc/iomem', + 'system-control@1c00000') + exec_command_and_wait_for_pattern(self, 'reboot', + 'reboot: Restarting system') + + def test_arm_cubieboard_sata(self): + """ + :avocado: tags=arch:arm + :avocado: tags=machine:cubieboard + """ + deb_url = ('https://apt.armbian.com/pool/main/l/' + 'linux-4.20.7-sunxi/linux-image-dev-sunxi_5.75_armhf.deb') + deb_hash = '1334c29c44d984ffa05ed10de8c3361f33d78315' + deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash) + kernel_path = self.extract_from_deb(deb_path, + '/boot/vmlinuz-4.20.7-sunxi') + dtb_path = '/usr/lib/linux-image-dev-sunxi/sun4i-a10-cubieboard.dtb' + dtb_path = self.extract_from_deb(deb_path, dtb_path) + rootfs_url = ('https://github.com/groeck/linux-build-test/raw/' + '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/' + 'arm/rootfs-armv5.ext2.gz') + rootfs_hash = '093e89d2b4d982234bf528bc9fb2f2f17a9d1f93' + rootfs_path_gz = self.fetch_asset(rootfs_url, asset_hash=rootfs_hash) + rootfs_path = os.path.join(self.workdir, 'rootfs.cpio') + archive.gzip_uncompress(rootfs_path_gz, rootfs_path) + + self.vm.set_console() + kernel_command_line = (self.KERNEL_COMMON_COMMAND_LINE + + 'console=ttyS0,115200 ' + 'usbcore.nousb ' + 'root=/dev/sda ro ' + 'panic=-1 noreboot') + self.vm.add_args('-kernel', kernel_path, + '-dtb', dtb_path, + '-drive', 'if=none,format=raw,id=disk0,file=' + + rootfs_path, + '-device', 'ide-hd,bus=ide.0,drive=disk0', + '-append', kernel_command_line, + '-no-reboot') + self.vm.launch() + self.wait_for_console_pattern('Boot successful.') + + exec_command_and_wait_for_pattern(self, 'cat /proc/cpuinfo', + 'Allwinner sun4i/sun5i') + exec_command_and_wait_for_pattern(self, 'cat /proc/partitions', + 'sda') + 
exec_command_and_wait_for_pattern(self, 'reboot', + 'reboot: Restarting system') + def test_s390x_s390_ccw_virtio(self): """ :avocado: tags=arch:s390x diff --git a/tests/data/acpi/q35/DSDT b/tests/data/acpi/q35/DSDT Binary files differindex 77ea60ffed..1f91888d7a 100644 --- a/tests/data/acpi/q35/DSDT +++ b/tests/data/acpi/q35/DSDT diff --git a/tests/data/acpi/q35/DSDT.acpihmat b/tests/data/acpi/q35/DSDT.acpihmat Binary files differindex 30e3717b5b..3586f6368a 100644 --- a/tests/data/acpi/q35/DSDT.acpihmat +++ b/tests/data/acpi/q35/DSDT.acpihmat diff --git a/tests/data/acpi/q35/DSDT.bridge b/tests/data/acpi/q35/DSDT.bridge Binary files differindex fbc2d40000..eae3a2a865 100644 --- a/tests/data/acpi/q35/DSDT.bridge +++ b/tests/data/acpi/q35/DSDT.bridge diff --git a/tests/data/acpi/q35/DSDT.cphp b/tests/data/acpi/q35/DSDT.cphp Binary files differindex 6a896cb214..53d735a4de 100644 --- a/tests/data/acpi/q35/DSDT.cphp +++ b/tests/data/acpi/q35/DSDT.cphp diff --git a/tests/data/acpi/q35/DSDT.dimmpxm b/tests/data/acpi/q35/DSDT.dimmpxm Binary files differindex 23fdf5e60a..02ccdd5f38 100644 --- a/tests/data/acpi/q35/DSDT.dimmpxm +++ b/tests/data/acpi/q35/DSDT.dimmpxm diff --git a/tests/data/acpi/q35/DSDT.ipmibt b/tests/data/acpi/q35/DSDT.ipmibt Binary files differindex c3fca0a71e..9e2d4f785c 100644 --- a/tests/data/acpi/q35/DSDT.ipmibt +++ b/tests/data/acpi/q35/DSDT.ipmibt diff --git a/tests/data/acpi/q35/DSDT.memhp b/tests/data/acpi/q35/DSDT.memhp Binary files differindex 2abd0e36cd..baefa611ac 100644 --- a/tests/data/acpi/q35/DSDT.memhp +++ b/tests/data/acpi/q35/DSDT.memhp diff --git a/tests/data/acpi/q35/DSDT.mmio64 b/tests/data/acpi/q35/DSDT.mmio64 Binary files differindex b32034a11c..aae0ea2110 100644 --- a/tests/data/acpi/q35/DSDT.mmio64 +++ b/tests/data/acpi/q35/DSDT.mmio64 diff --git a/tests/data/acpi/q35/DSDT.numamem b/tests/data/acpi/q35/DSDT.numamem Binary files differindex d8b2b47f8b..859a2e0871 100644 --- a/tests/data/acpi/q35/DSDT.numamem +++ b/tests/data/acpi/q35/DSDT.numamem diff --git a/tests/data/acpi/rebuild-expected-aml.sh b/tests/data/acpi/rebuild-expected-aml.sh index f89d4624bc..d44e511533 100755 --- a/tests/data/acpi/rebuild-expected-aml.sh +++ b/tests/data/acpi/rebuild-expected-aml.sh @@ -14,7 +14,7 @@ qemu_bins="x86_64-softmmu/qemu-system-x86_64 aarch64-softmmu/qemu-system-aarch64" -if [ ! -e "tests/bios-tables-test" ]; then +if [ ! -e "tests/qtest/bios-tables-test" ]; then echo "Test: bios-tables-test is required! Run make check before this script." echo "Run this script from the build directory." exit 1; @@ -26,11 +26,11 @@ for qemu in $qemu_bins; do echo "Also, run this script from the build directory." exit 1; fi - TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test + TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/qtest/bios-tables-test done eval `grep SRC_PATH= config-host.mak` -echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/bios-tables-test-allowed-diff.h +echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h echo "The files were rebuilt and can be added to git." diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index f1ac2d7e96..3ab4872bd7 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -16,7 +16,10 @@ * 1. add empty files for new tables, if any, under tests/data/acpi * 2. list any changed files in tests/bios-tables-test-allowed-diff.h * 3. 
commit the above *before* making changes that affect the tables - * Maintainer: + * + * Contributor or ACPI Maintainer (steps 4-7 need to be redone to resolve conflicts + * in binary commit created in step 6): + * * After 1-3 above tests will pass but ignore differences with the expected files. * You will also notice that tests/bios-tables-test-allowed-diff.h lists * a bunch of files. This is your hint that you need to do the below: @@ -28,13 +31,23 @@ * output. If not - disassemble them yourself in any way you like. * Look at the differences - make sure they make sense and match what the * changes you are merging are supposed to do. + * Save the changes, preferably in form of ASL diff for the commit log in + * step 6. * * 5. From build directory, run: * $(SRC_PATH)/tests/data/acpi/rebuild-expected-aml.sh - * 6. Now commit any changes. - * 7. Before doing a pull request, make sure tests/bios-tables-test-allowed-diff.h - * is empty - this will ensure following changes to ACPI tables will - * be noticed. + * 6. Now commit any changes to the expected binary, include diff from step 4 + * in commit log. + * 7. Before sending patches to the list (Contributor) + * or before doing a pull request (Maintainer), make sure + * tests/bios-tables-test-allowed-diff.h is empty - this will ensure + * following changes to ACPI tables will be noticed. + * + * The resulting patchset/pull request then looks like this: + * - patch 1: list changed files in tests/bios-tables-test-allowed-diff.h. + * - patches 2 - n: real changes, may contain multiple patches. + * - patch n + 1: update golden master binaries and empty + * tests/bios-tables-test-allowed-diff.h */ #include "qemu/osdep.h" diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 53afec4395..26e2e77289 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -356,6 +356,43 @@ static void migrate_set_parameter_int(QTestState *who, const char *parameter, migrate_check_parameter_int(who, parameter, value); } +static char *migrate_get_parameter_str(QTestState *who, + const char *parameter) +{ + QDict *rsp; + char *result; + + rsp = wait_command(who, "{ 'execute': 'query-migrate-parameters' }"); + result = g_strdup(qdict_get_str(rsp, parameter)); + qobject_unref(rsp); + return result; +} + +static void migrate_check_parameter_str(QTestState *who, const char *parameter, + const char *value) +{ + char *result; + + result = migrate_get_parameter_str(who, parameter); + g_assert_cmpstr(result, ==, value); + g_free(result); +} + +__attribute__((unused)) +static void migrate_set_parameter_str(QTestState *who, const char *parameter, + const char *value) +{ + QDict *rsp; + + rsp = qtest_qmp(who, + "{ 'execute': 'migrate-set-parameters'," + "'arguments': { %s: %s } }", + parameter, value); + g_assert(qdict_haskey(rsp, "return")); + qobject_unref(rsp); + migrate_check_parameter_str(who, parameter, value); +} + static void migrate_pause(QTestState *who) { QDict *rsp; @@ -480,14 +517,14 @@ static int test_migrate_start(QTestState **from, QTestState **to, } else if (strcmp(arch, "ppc64") == 0) { machine_opts = "vsmt=8"; memory_size = "256M"; + start_address = PPC_TEST_MEM_START; + end_address = PPC_TEST_MEM_END; arch_source = g_strdup_printf("-nodefaults " "-prom-env 'use-nvramrc?=true' -prom-env " "'nvramrc=hex .\" _\" begin %x %x " "do i c@ 1 + i c! 
1000 +loop .\" B\" 0 " "until'", end_address, start_address); arch_target = g_strdup(""); - start_address = PPC_TEST_MEM_START; - end_address = PPC_TEST_MEM_END; } else if (strcmp(arch, "aarch64") == 0) { init_bootfile(bootpath, aarch64_kernel, sizeof(aarch64_kernel)); machine_opts = "virt,gic-version=max"; @@ -1202,6 +1239,61 @@ static void test_migrate_auto_converge(void) test_migrate_end(from, to, true); } +static void test_multifd_tcp(void) +{ + MigrateStart *args = migrate_start_new(); + QTestState *from, *to; + QDict *rsp; + char *uri; + + if (test_migrate_start(&from, &to, "defer", args)) { + return; + } + + /* + * We want to pick a speed slow enough that the test completes + * quickly, but that it doesn't complete precopy even on a slow + * machine, so also set the downtime. + */ + /* 1 ms should make it not converge*/ + migrate_set_parameter_int(from, "downtime-limit", 1); + /* 1GB/s */ + migrate_set_parameter_int(from, "max-bandwidth", 1000000000); + + migrate_set_parameter_int(from, "multifd-channels", 16); + migrate_set_parameter_int(to, "multifd-channels", 16); + + migrate_set_capability(from, "multifd", "true"); + migrate_set_capability(to, "multifd", "true"); + + /* Start incoming migration from the 1st socket */ + rsp = wait_command(to, "{ 'execute': 'migrate-incoming'," + " 'arguments': { 'uri': 'tcp:127.0.0.1:0' }}"); + qobject_unref(rsp); + + /* Wait for the first serial output from the source */ + wait_for_serial("src_serial"); + + uri = migrate_get_socket_address(to, "socket-address"); + + migrate_qmp(from, uri, "{}"); + + wait_for_migration_pass(from); + + /* 300ms it should converge */ + migrate_set_parameter_int(from, "downtime-limit", 300); + + if (!got_stop) { + qtest_qmp_eventwait(from, "STOP"); + } + qtest_qmp_eventwait(to, "RESUME"); + + wait_for_serial("dest_serial"); + wait_for_migration_complete(from); + test_migrate_end(from, to, true); + free(uri); +} + int main(int argc, char **argv) { char template[] = "/tmp/migration-test-XXXXXX"; @@ -1266,6 +1358,7 @@ int main(int argc, char **argv) test_validate_uuid_dst_not_set); qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); + qtest_add_func("/migration/multifd/tcp", test_multifd_tcp); ret = g_test_run(); diff --git a/tests/qtest/q35-test.c b/tests/qtest/q35-test.c index a68183d513..c922d81bc0 100644 --- a/tests/qtest/q35-test.c +++ b/tests/qtest/q35-test.c @@ -186,6 +186,109 @@ static void test_tseg_size(const void *data) qtest_quit(qts); } +#define SMBASE 0x30000 +#define SMRAM_TEST_PATTERN 0x32 +#define SMRAM_TEST_RESET_PATTERN 0x23 + +static void test_smram_smbase_lock(void) +{ + QPCIBus *pcibus; + QPCIDevice *pcidev; + QDict *response; + QTestState *qts; + int i; + + qts = qtest_init("-M q35"); + + pcibus = qpci_new_pc(qts, NULL); + g_assert(pcibus != NULL); + + pcidev = qpci_device_find(pcibus, 0); + g_assert(pcidev != NULL); + + /* check that SMRAM is not enabled by default */ + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0); + qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN); + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_PATTERN); + + /* check that writing junk to 0x9c before before negotiating is ignored */ + for (i = 0; i < 0xff; i++) { + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, i); + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0); + } + + /* enable SMRAM at SMBASE */ + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, 0xff); + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0x01); + /* lock SMRAM at 
SMBASE */ + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, 0x02); + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0x02); + + /* check that SMRAM at SMBASE is locked and can't be unlocked */ + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, 0xff); + for (i = 0; i <= 0xff; i++) { + /* make sure register is immutable */ + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, i); + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0x02); + + /* RAM access should go into black hole */ + qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN); + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, 0xff); + } + + /* reset */ + response = qtest_qmp(qts, "{'execute': 'system_reset', 'arguments': {} }"); + g_assert(response); + g_assert(!qdict_haskey(response, "error")); + qobject_unref(response); + + /* check RAM at SMBASE is available after reset */ + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_PATTERN); + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == 0); + qtest_writeb(qts, SMBASE, SMRAM_TEST_RESET_PATTERN); + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_RESET_PATTERN); + + g_free(pcidev); + qpci_free_pc(pcibus); + + qtest_quit(qts); +} + +static void test_without_smram_base(void) +{ + QPCIBus *pcibus; + QPCIDevice *pcidev; + QTestState *qts; + int i; + + qts = qtest_init("-M pc-q35-4.1"); + + pcibus = qpci_new_pc(qts, NULL); + g_assert(pcibus != NULL); + + pcidev = qpci_device_find(pcibus, 0); + g_assert(pcidev != NULL); + + /* check that RAM is accessible */ + qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN); + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, SMRAM_TEST_PATTERN); + + /* check that writing to 0x9c succeeds */ + for (i = 0; i <= 0xff; i++) { + qpci_config_writeb(pcidev, MCH_HOST_BRIDGE_F_SMBASE, i); + g_assert(qpci_config_readb(pcidev, MCH_HOST_BRIDGE_F_SMBASE) == i); + } + + /* check that RAM is still accessible */ + qtest_writeb(qts, SMBASE, SMRAM_TEST_PATTERN + 1); + g_assert_cmpint(qtest_readb(qts, SMBASE), ==, (SMRAM_TEST_PATTERN + 1)); + + g_free(pcidev); + qpci_free_pc(pcibus); + + qtest_quit(qts); +} + int main(int argc, char **argv) { g_test_init(&argc, &argv, NULL); @@ -197,5 +300,7 @@ int main(int argc, char **argv) qtest_add_data_func("/q35/tseg-size/8mb", &tseg_8mb, test_tseg_size); qtest_add_data_func("/q35/tseg-size/ext/16mb", &tseg_ext_16mb, test_tseg_size); + qtest_add_func("/q35/smram/smbase_lock", test_smram_smbase_lock); + qtest_add_func("/q35/smram/legacy_smbase", test_without_smram_base); return g_test_run(); } diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index 2324b964ad..9ee0f1e4fd 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -707,9 +707,9 @@ static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *s = arg; - TestServer *dest = test_server_new("dest"); - GString *dest_cmdline = g_string_new(qos_get_current_command_line()); - char *uri = g_strdup_printf("%s%s", "unix:", dest->mig_path); + TestServer *dest; + GString *dest_cmdline; + char *uri; QTestState *to; GSource *source; QDict *rsp; @@ -720,6 +720,10 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) return; } + dest = test_server_new("dest"); + dest_cmdline = g_string_new(qos_get_current_command_line()); + uri = g_strdup_printf("%s%s", "unix:", dest->mig_path); + size = get_log_size(s); g_assert_cmpint(size, ==, (256 * 1024 * 1024) / (VHOST_LOG_PAGE * 
8)); @@ -778,6 +782,7 @@ static void test_migrate(void *obj, void *arg, QGuestAllocator *alloc) qtest_quit(to); test_server_free(dest); g_free(uri); + g_string_free(dest_cmdline, true); } static void wait_for_rings_started(TestServer *s, size_t count) diff --git a/tests/tcg/aarch64/Makefile.softmmu-target b/tests/tcg/aarch64/Makefile.softmmu-target index 7b4eede3f0..f6b5121f5c 100644 --- a/tests/tcg/aarch64/Makefile.softmmu-target +++ b/tests/tcg/aarch64/Makefile.softmmu-target @@ -61,4 +61,7 @@ run-memory-replay: memory-replay run-memory-record $(QEMU_OPTS) memory, \ "$< on $(TARGET_NAME)") -EXTRA_TESTS+=memory-record memory-replay +run-pauth-3: pauth-3 +pauth-3: CFLAGS += -march=armv8.3-a + +EXTRA_TESTS+=memory-record memory-replay pauth-3 diff --git a/tests/tcg/aarch64/Makefile.target b/tests/tcg/aarch64/Makefile.target index df3fe8032c..efa67cf1e9 100644 --- a/tests/tcg/aarch64/Makefile.target +++ b/tests/tcg/aarch64/Makefile.target @@ -18,8 +18,9 @@ run-fcvt: fcvt $(call diff-out,$<,$(AARCH64_SRC)/fcvt.ref) # Pauth Tests -AARCH64_TESTS += pauth-1 pauth-2 +AARCH64_TESTS += pauth-1 pauth-2 pauth-4 run-pauth-%: QEMU_OPTS += -cpu max +pauth-%: CFLAGS += -march=armv8.3-a # Semihosting smoke test for linux-user AARCH64_TESTS += semihosting diff --git a/tests/tcg/aarch64/pauth-1.c b/tests/tcg/aarch64/pauth-1.c index a3c1443cd0..ea0984ea82 100644 --- a/tests/tcg/aarch64/pauth-1.c +++ b/tests/tcg/aarch64/pauth-1.c @@ -2,8 +2,6 @@ #include <sys/prctl.h> #include <stdio.h> -asm(".arch armv8.4-a"); - #ifndef PR_PAC_RESET_KEYS #define PR_PAC_RESET_KEYS 54 #define PR_PAC_APDAKEY (1 << 2) diff --git a/tests/tcg/aarch64/pauth-2.c b/tests/tcg/aarch64/pauth-2.c index 2fe030ba3d..9bba0beb63 100644 --- a/tests/tcg/aarch64/pauth-2.c +++ b/tests/tcg/aarch64/pauth-2.c @@ -1,8 +1,6 @@ #include <stdint.h> #include <assert.h> -asm(".arch armv8.4-a"); - void do_test(uint64_t value) { uint64_t salt1, salt2; diff --git a/tests/tcg/aarch64/pauth-4.c b/tests/tcg/aarch64/pauth-4.c new file mode 100644 index 0000000000..1040e92aec --- /dev/null +++ b/tests/tcg/aarch64/pauth-4.c @@ -0,0 +1,25 @@ +#include <stdint.h> +#include <assert.h> + +int main() +{ + uintptr_t x, y; + + asm("mov %0, lr\n\t" + "pacia %0, sp\n\t" /* sigill if pauth not supported */ + "eor %0, %0, #4\n\t" /* corrupt single bit */ + "mov %1, %0\n\t" + "autia %1, sp\n\t" /* validate corrupted pointer */ + "xpaci %0\n\t" /* strip pac from corrupted pointer */ + : "=r"(x), "=r"(y)); + + /* + * Once stripped, the corrupted pointer is of the form 0x0000...wxyz. + * We expect the autia to indicate failure, producing a pointer of the + * form 0x000e....wxyz. Use xpaci and != for the test, rather than + * extracting explicit bits from the top, because the location of the + * error code "e" depends on the configuration of virtual memory. + */ + assert(x != y); + return 0; +} diff --git a/tests/tcg/aarch64/system/pauth-3.c b/tests/tcg/aarch64/system/pauth-3.c new file mode 100644 index 0000000000..42eff4d5ea --- /dev/null +++ b/tests/tcg/aarch64/system/pauth-3.c @@ -0,0 +1,40 @@ +#include <inttypes.h> +#include <minilib.h> + +int main() +{ + /* + * Test vector from QARMA paper (https://eprint.iacr.org/2016/444.pdf) + * to verify one computation of the pauth_computepac() function, + * which uses sbox2. + * + * Use PACGA, because it returns the most bits from ComputePAC. + * We still only get the most significant 32-bits of the result. 
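 + * d[0] is the pointer (P), d[1] the modifier (T), d[2]/d[3] the generic key + * (APGAKeyHi/APGAKeyLo), and d[4] the expected result, masked to the upper + * 32 bits that PACGA actually returns.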
+ */ + + static const uint64_t d[5] = { + 0xfb623599da6e8127ull, + 0x477d469dec0b8762ull, + 0x84be85ce9804e94bull, + 0xec2802d4e0a488e9ull, + 0xc003b93999b33765ull & 0xffffffff00000000ull + }; + uint64_t r; + + asm("msr apgakeyhi_el1, %[w0]\n\t" + "msr apgakeylo_el1, %[k0]\n\t" + "pacga %[r], %[P], %[T]" + : [r] "=r"(r) + : [P] "r" (d[0]), + [T] "r" (d[1]), + [w0] "r" (d[2]), + [k0] "r" (d[3])); + + if (r == d[4]) { + ml_printf("OK\n"); + return 0; + } else { + ml_printf("FAIL: %lx != %lx\n", r, d[4]); + return 1; + } +} diff --git a/tests/test-vmstate.c b/tests/test-vmstate.c index 8f184f3556..cea363dd69 100644 --- a/tests/test-vmstate.c +++ b/tests/test-vmstate.c @@ -926,6 +926,28 @@ static const VMStateDescription vmstate_domain = { } }; +/* test QLIST Migration */ + +typedef struct TestQListElement { + uint32_t id; + QLIST_ENTRY(TestQListElement) next; +} TestQListElement; + +typedef struct TestQListContainer { + uint32_t id; + QLIST_HEAD(, TestQListElement) list; +} TestQListContainer; + +static const VMStateDescription vmstate_qlist_element = { + .name = "test/queue list", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(id, TestQListElement), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_iommu = { .name = "iommu", .version_id = 1, @@ -939,6 +961,18 @@ static const VMStateDescription vmstate_iommu = { } }; +static const VMStateDescription vmstate_container = { + .name = "test/container/qlist", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(id, TestQListContainer), + VMSTATE_QLIST_V(list, TestQListContainer, 1, vmstate_qlist_element, + TestQListElement, next), + VMSTATE_END_OF_LIST() + } +}; + uint8_t first_domain_dump[] = { /* id */ 0x00, 0x0, 0x0, 0x6, @@ -1229,6 +1263,140 @@ static void test_gtree_load_iommu(void) qemu_fclose(fload); } +static uint8_t qlist_dump[] = { + 0x00, 0x00, 0x00, 0x01, /* container id */ + 0x1, /* start of a */ + 0x00, 0x00, 0x00, 0x0a, + 0x1, /* start of b */ + 0x00, 0x00, 0x0b, 0x00, + 0x1, /* start of c */ + 0x00, 0x0c, 0x00, 0x00, + 0x1, /* start of d */ + 0x0d, 0x00, 0x00, 0x00, + 0x0, /* end of list */ + QEMU_VM_EOF, /* just to ensure we won't get EOF reported prematurely */ +}; + +static TestQListContainer *alloc_container(void) +{ + TestQListElement *a = g_malloc(sizeof(TestQListElement)); + TestQListElement *b = g_malloc(sizeof(TestQListElement)); + TestQListElement *c = g_malloc(sizeof(TestQListElement)); + TestQListElement *d = g_malloc(sizeof(TestQListElement)); + TestQListContainer *container = g_malloc(sizeof(TestQListContainer)); + + a->id = 0x0a; + b->id = 0x0b00; + c->id = 0xc0000; + d->id = 0xd000000; + container->id = 1; + + QLIST_INIT(&container->list); + QLIST_INSERT_HEAD(&container->list, d, next); + QLIST_INSERT_HEAD(&container->list, c, next); + QLIST_INSERT_HEAD(&container->list, b, next); + QLIST_INSERT_HEAD(&container->list, a, next); + return container; +} + +static void free_container(TestQListContainer *container) +{ + TestQListElement *iter, *tmp; + + QLIST_FOREACH_SAFE(iter, &container->list, next, tmp) { + QLIST_REMOVE(iter, next); + g_free(iter); + } + g_free(container); +} + +static void compare_containers(TestQListContainer *c1, TestQListContainer *c2) +{ + TestQListElement *first_item_c1, *first_item_c2; + + while (!QLIST_EMPTY(&c1->list)) { + first_item_c1 = QLIST_FIRST(&c1->list); + first_item_c2 = QLIST_FIRST(&c2->list); + assert(first_item_c2); + assert(first_item_c1->id == first_item_c2->id); + 
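 + /* pop and free the matched heads so both lists drain together; the final + assert below checks c2 has no leftover elements */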
QLIST_REMOVE(first_item_c1, next); + QLIST_REMOVE(first_item_c2, next); + g_free(first_item_c1); + g_free(first_item_c2); + } + assert(QLIST_EMPTY(&c2->list)); +} + +/* + * Check the prev & next fields are correct by doing list + * manipulations on the container. We will do that for both + * the source and the destination containers + */ +static void manipulate_container(TestQListContainer *c) +{ + TestQListElement *prev = NULL, *iter = QLIST_FIRST(&c->list); + TestQListElement *elem; + + elem = g_malloc(sizeof(TestQListElement)); + elem->id = 0x12; + QLIST_INSERT_AFTER(iter, elem, next); + + elem = g_malloc(sizeof(TestQListElement)); + elem->id = 0x13; + QLIST_INSERT_HEAD(&c->list, elem, next); + + while (iter) { + prev = iter; + iter = QLIST_NEXT(iter, next); + } + + elem = g_malloc(sizeof(TestQListElement)); + elem->id = 0x14; + QLIST_INSERT_BEFORE(prev, elem, next); + + elem = g_malloc(sizeof(TestQListElement)); + elem->id = 0x15; + QLIST_INSERT_AFTER(prev, elem, next); + + QLIST_REMOVE(prev, next); + g_free(prev); +} + +static void test_save_qlist(void) +{ + TestQListContainer *container = alloc_container(); + + save_vmstate(&vmstate_container, container); + compare_vmstate(qlist_dump, sizeof(qlist_dump)); + free_container(container); +} + +static void test_load_qlist(void) +{ + QEMUFile *fsave, *fload; + TestQListContainer *orig_container = alloc_container(); + TestQListContainer *dest_container = g_malloc0(sizeof(TestQListContainer)); + char eof; + + QLIST_INIT(&dest_container->list); + + fsave = open_test_file(true); + qemu_put_buffer(fsave, qlist_dump, sizeof(qlist_dump)); + g_assert(!qemu_file_get_error(fsave)); + qemu_fclose(fsave); + + fload = open_test_file(false); + vmstate_load_state(fload, &vmstate_container, dest_container, 1); + eof = qemu_get_byte(fload); + g_assert(!qemu_file_get_error(fload)); + g_assert_cmpint(eof, ==, QEMU_VM_EOF); + manipulate_container(orig_container); + manipulate_container(dest_container); + compare_containers(orig_container, dest_container); + free_container(orig_container); + free_container(dest_container); +} + typedef struct TmpTestStruct { TestStruct *parent; int64_t diff; @@ -1353,6 +1521,8 @@ int main(int argc, char **argv) g_test_add_func("/vmstate/gtree/load/loaddomain", test_gtree_load_domain); g_test_add_func("/vmstate/gtree/save/saveiommu", test_gtree_save_iommu); g_test_add_func("/vmstate/gtree/load/loadiommu", test_gtree_load_iommu); + g_test_add_func("/vmstate/qlist/save/saveqlist", test_save_qlist); + g_test_add_func("/vmstate/qlist/load/loadqlist", test_load_qlist); g_test_add_func("/vmstate/tmp_struct", test_tmp_struct); g_test_run(); diff --git a/tools/virtiofsd/50-qemu-virtiofsd.json.in b/tools/virtiofsd/50-qemu-virtiofsd.json.in new file mode 100644 index 0000000000..9bcd86f8dc --- /dev/null +++ b/tools/virtiofsd/50-qemu-virtiofsd.json.in @@ -0,0 +1,5 @@ +{ + "description": "QEMU virtiofsd vhost-user-fs", + "type": "fs", + "binary": "@libexecdir@/virtiofsd" +} diff --git a/tools/virtiofsd/Makefile.objs b/tools/virtiofsd/Makefile.objs new file mode 100644 index 0000000000..076f667e46 --- /dev/null +++ b/tools/virtiofsd/Makefile.objs @@ -0,0 +1,12 @@ +virtiofsd-obj-y = buffer.o \ + fuse_opt.o \ + fuse_log.o \ + fuse_lowlevel.o \ + fuse_signals.o \ + fuse_virtio.o \ + helper.o \ + passthrough_ll.o \ + seccomp.o + +seccomp.o-cflags := $(SECCOMP_CFLAGS) +seccomp.o-libs := $(SECCOMP_LIBS) diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c new file mode 100644 index 0000000000..27c1377f22 --- /dev/null +++ 
b/tools/virtiofsd/buffer.c @@ -0,0 +1,351 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2010 Miklos Szeredi <miklos@szeredi.hu> + * + * Functions for dealing with `struct fuse_buf` and `struct + * fuse_bufvec`. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "fuse_lowlevel.h" +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +size_t fuse_buf_size(const struct fuse_bufvec *bufv) +{ + size_t i; + size_t size = 0; + + for (i = 0; i < bufv->count; i++) { + if (bufv->buf[i].size == SIZE_MAX) { + size = SIZE_MAX; + } else { + size += bufv->buf[i].size; + } + } + + return size; +} + +static ssize_t fuse_buf_writev(struct fuse_buf *out_buf, + struct fuse_bufvec *in_buf) +{ + ssize_t res, i, j; + size_t iovcnt = in_buf->count; + struct iovec *iov; + int fd = out_buf->fd; + + iov = calloc(iovcnt, sizeof(struct iovec)); + if (!iov) { + return -ENOMEM; + } + + for (i = 0, j = 0; i < iovcnt; i++) { + /* Skip the buf with 0 size */ + if (in_buf->buf[i].size) { + iov[j].iov_base = in_buf->buf[i].mem; + iov[j].iov_len = in_buf->buf[i].size; + j++; + } + } + + if (out_buf->flags & FUSE_BUF_FD_SEEK) { + res = pwritev(fd, iov, iovcnt, out_buf->pos); + } else { + res = writev(fd, iov, iovcnt); + } + + if (res == -1) { + res = -errno; + } + + free(iov); + return res; +} + +static size_t min_size(size_t s1, size_t s2) +{ + return s1 < s2 ? s1 : s2; +} + +static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + ssize_t res = 0; + size_t copied = 0; + + while (len) { + if (dst->flags & FUSE_BUF_FD_SEEK) { + res = pwrite(dst->fd, (char *)src->mem + src_off, len, + dst->pos + dst_off); + } else { + res = write(dst->fd, (char *)src->mem + src_off, len); + } + if (res == -1) { + if (!copied) { + return -errno; + } + break; + } + if (res == 0) { + break; + } + + copied += res; + if (!(dst->flags & FUSE_BUF_FD_RETRY)) { + break; + } + + src_off += res; + dst_off += res; + len -= res; + } + + return copied; +} + +static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + ssize_t res = 0; + size_t copied = 0; + + while (len) { + if (src->flags & FUSE_BUF_FD_SEEK) { + res = pread(src->fd, (char *)dst->mem + dst_off, len, + src->pos + src_off); + } else { + res = read(src->fd, (char *)dst->mem + dst_off, len); + } + if (res == -1) { + if (!copied) { + return -errno; + } + break; + } + if (res == 0) { + break; + } + + copied += res; + if (!(src->flags & FUSE_BUF_FD_RETRY)) { + break; + } + + dst_off += res; + src_off += res; + len -= res; + } + + return copied; +} + +static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + char buf[4096]; + struct fuse_buf tmp = { + .size = sizeof(buf), + .flags = 0, + }; + ssize_t res; + size_t copied = 0; + + tmp.mem = buf; + + while (len) { + size_t this_len = min_size(tmp.size, len); + size_t read_len; + + res = fuse_buf_read(&tmp, 0, src, src_off, this_len); + if (res < 0) { + if (!copied) { + return res; + } + break; + } + if (res == 0) { + break; + } + + read_len = res; + res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); + if (res < 0) { + if (!copied) { + return res; + } + break; + } + if (res == 0) { + break; + } + + copied += res; + + if (res < 
this_len) { + break; + } + + dst_off += res; + src_off += res; + len -= res; + } + + return copied; +} + +static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + int src_is_fd = src->flags & FUSE_BUF_IS_FD; + int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; + + if (!src_is_fd && !dst_is_fd) { + char *dstmem = (char *)dst->mem + dst_off; + char *srcmem = (char *)src->mem + src_off; + + if (dstmem != srcmem) { + if (dstmem + len <= srcmem || srcmem + len <= dstmem) { + memcpy(dstmem, srcmem, len); + } else { + memmove(dstmem, srcmem, len); + } + } + + return len; + } else if (!src_is_fd) { + return fuse_buf_write(dst, dst_off, src, src_off, len); + } else if (!dst_is_fd) { + return fuse_buf_read(dst, dst_off, src, src_off, len); + } else { + return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); + } +} + +static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv) +{ + if (bufv->idx < bufv->count) { + return &bufv->buf[bufv->idx]; + } else { + return NULL; + } +} + +static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) +{ + const struct fuse_buf *buf = fuse_bufvec_current(bufv); + + bufv->off += len; + assert(bufv->off <= buf->size); + if (bufv->off == buf->size) { + assert(bufv->idx < bufv->count); + bufv->idx++; + if (bufv->idx == bufv->count) { + return 0; + } + bufv->off = 0; + } + return 1; +} + +ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) +{ + size_t copied = 0, i; + + if (dstv == srcv) { + return fuse_buf_size(dstv); + } + + /* + * use writev to improve bandwidth when all the + * src buffers already mapped by the daemon + * process + */ + for (i = 0; i < srcv->count; i++) { + if (srcv->buf[i].flags & FUSE_BUF_IS_FD) { + break; + } + } + if ((i == srcv->count) && (dstv->count == 1) && + (dstv->idx == 0) && + (dstv->buf[0].flags & FUSE_BUF_IS_FD)) { + dstv->buf[0].pos += dstv->off; + return fuse_buf_writev(&dstv->buf[0], srcv); + } + + for (;;) { + const struct fuse_buf *src = fuse_bufvec_current(srcv); + const struct fuse_buf *dst = fuse_bufvec_current(dstv); + size_t src_len; + size_t dst_len; + size_t len; + ssize_t res; + + if (src == NULL || dst == NULL) { + break; + } + + src_len = src->size - srcv->off; + dst_len = dst->size - dstv->off; + len = min_size(src_len, dst_len); + + res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len); + if (res < 0) { + if (!copied) { + return res; + } + break; + } + copied += res; + + if (!fuse_bufvec_advance(srcv, res) || + !fuse_bufvec_advance(dstv, res)) { + break; + } + + if (res < len) { + break; + } + } + + return copied; +} + +void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len) +{ + void *ptr; + + if (len > iter->size - iter->pos) { + return NULL; + } + + ptr = iter->mem + iter->pos; + iter->pos += len; + return ptr; +} + +const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter) +{ + const char *str = iter->mem + iter->pos; + size_t remaining = iter->size - iter->pos; + size_t i; + + for (i = 0; i < remaining; i++) { + if (str[i] == '\0') { + iter->pos += i + 1; + return str; + } + } + return NULL; +} diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h new file mode 100644 index 0000000000..7a4c713559 --- /dev/null +++ b/tools/virtiofsd/fuse.h @@ -0,0 +1,1249 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. 
+ * See the file COPYING.LIB. + */ + +#ifndef FUSE_H_ +#define FUSE_H_ + +/* + * + * This file defines the library interface of FUSE + * + * IMPORTANT: you should define FUSE_USE_VERSION before including this header. + */ + +#include "fuse_common.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <time.h> + +/* + * Basic FUSE API + */ + +/** Handle for a FUSE filesystem */ +struct fuse; + +/** + * Readdir flags, passed to ->readdir() + */ +enum fuse_readdir_flags { + /** + * "Plus" mode. + * + * The kernel wants to prefill the inode cache during readdir. The + * filesystem may honour this by filling in the attributes and setting + * FUSE_FILL_DIR_FLAGS for the filler function. The filesystem may also + * just ignore this flag completely. + */ + FUSE_READDIR_PLUS = (1 << 0), +}; + +enum fuse_fill_dir_flags { + /** + * "Plus" mode: all file attributes are valid + * + * The attributes are used by the kernel to prefill the inode cache + * during a readdir. + * + * It is okay to set FUSE_FILL_DIR_PLUS if FUSE_READDIR_PLUS is not set + * and vice versa. + */ + FUSE_FILL_DIR_PLUS = (1 << 1), +}; + +/** + * Function to add an entry in a readdir() operation + * + * The *off* parameter can be any non-zero value that enables the + * filesystem to identify the current point in the directory + * stream. It does not need to be the actual physical position. A + * value of zero is reserved to indicate that seeking in directories + * is not supported. + * + * @param buf the buffer passed to the readdir() operation + * @param name the file name of the directory entry + * @param stat file attributes, can be NULL + * @param off offset of the next entry or zero + * @param flags fill flags + * @return 1 if buffer is full, zero otherwise + */ +typedef int (*fuse_fill_dir_t)(void *buf, const char *name, + const struct stat *stbuf, off_t off, + enum fuse_fill_dir_flags flags); +/** + * Configuration of the high-level API + * + * This structure is initialized from the arguments passed to + * fuse_new(), and then passed to the file system's init() handler + * which should ensure that the configuration is compatible with the + * file system implementation. + */ +struct fuse_config { + /** + * If `set_gid` is non-zero, the st_gid attribute of each file + * is overwritten with the value of `gid`. + */ + int set_gid; + unsigned int gid; + + /** + * If `set_uid` is non-zero, the st_uid attribute of each file + * is overwritten with the value of `uid`. + */ + int set_uid; + unsigned int uid; + + /** + * If `set_mode` is non-zero, the any permissions bits set in + * `umask` are unset in the st_mode attribute of each file. + */ + int set_mode; + unsigned int umask; + + /** + * The timeout in seconds for which name lookups will be + * cached. + */ + double entry_timeout; + + /** + * The timeout in seconds for which a negative lookup will be + * cached. This means, that if file did not exist (lookup + * retuned ENOENT), the lookup will only be redone after the + * timeout, and the file/directory will be assumed to not + * exist until then. A value of zero means that negative + * lookups are not cached. + */ + double negative_timeout; + + /** + * The timeout in seconds for which file/directory attributes + * (as returned by e.g. the `getattr` handler) are cached. 
+ */ + double attr_timeout; + + /** + * Allow requests to be interrupted + */ + int intr; + + /** + * Specify which signal number to send to the filesystem when + * a request is interrupted. The default is hardcoded to + * USR1. + */ + int intr_signal; + + /** + * Normally, FUSE assigns inodes to paths only for as long as + * the kernel is aware of them. With this option inodes are + * instead remembered for at least this many seconds. This + * will require more memory, but may be necessary when using + * applications that make use of inode numbers. + * + * A number of -1 means that inodes will be remembered for the + * entire life-time of the file-system process. + */ + int remember; + + /** + * The default behavior is that if an open file is deleted, + * the file is renamed to a hidden file (.fuse_hiddenXXX), and + * only removed when the file is finally released. This + * relieves the filesystem implementation of having to deal + * with this problem. This option disables the hiding + * behavior, and files are removed immediately in an unlink + * operation (or in a rename operation which overwrites an + * existing file). + * + * It is recommended that you not use the hard_remove + * option. When hard_remove is set, the following libc + * functions fail on unlinked files (returning errno of + * ENOENT): read(2), write(2), fsync(2), close(2), f*xattr(2), + * ftruncate(2), fstat(2), fchmod(2), fchown(2) + */ + int hard_remove; + + /** + * Honor the st_ino field in the functions getattr() and + * fill_dir(). This value is used to fill in the st_ino field + * in the stat(2), lstat(2), fstat(2) functions and the d_ino + * field in the readdir(2) function. The filesystem does not + * have to guarantee uniqueness, however some applications + * rely on this value being unique for the whole filesystem. + * + * Note that this does *not* affect the inode that libfuse + * and the kernel use internally (also called the "nodeid"). + */ + int use_ino; + + /** + * If use_ino option is not given, still try to fill in the + * d_ino field in readdir(2). If the name was previously + * looked up, and is still in the cache, the inode number + * found there will be used. Otherwise it will be set to -1. + * If use_ino option is given, this option is ignored. + */ + int readdir_ino; + + /** + * This option disables the use of page cache (file content cache) + * in the kernel for this filesystem. This has several affects: + * + * 1. Each read(2) or write(2) system call will initiate one + * or more read or write operations, data will not be + * cached in the kernel. + * + * 2. The return value of the read() and write() system calls + * will correspond to the return values of the read and + * write operations. This is useful for example if the + * file size is not known in advance (before reading it). + * + * Internally, enabling this option causes fuse to set the + * `direct_io` field of `struct fuse_file_info` - overwriting + * any value that was put there by the file system. + */ + int direct_io; + + /** + * This option disables flushing the cache of the file + * contents on every open(2). This should only be enabled on + * filesystems where the file data is never changed + * externally (not through the mounted FUSE filesystem). Thus + * it is not suitable for network filesystems and other + * intermediate filesystems. + * + * NOTE: if this option is not specified (and neither + * direct_io) data is still cached after the open(2), so a + * read(2) system call will not always initiate a read + * operation. 
+ * + * Internally, enabling this option causes fuse to set the + * `keep_cache` field of `struct fuse_file_info` - overwriting + * any value that was put there by the file system. + */ + int kernel_cache; + + /** + * This option is an alternative to `kernel_cache`. Instead of + * unconditionally keeping cached data, the cached data is + * invalidated on open(2) if if the modification time or the + * size of the file has changed since it was last opened. + */ + int auto_cache; + + /** + * The timeout in seconds for which file attributes are cached + * for the purpose of checking if auto_cache should flush the + * file data on open. + */ + int ac_attr_timeout_set; + double ac_attr_timeout; + + /** + * If this option is given the file-system handlers for the + * following operations will not receive path information: + * read, write, flush, release, fsync, readdir, releasedir, + * fsyncdir, lock, ioctl and poll. + * + * For the truncate, getattr, chmod, chown and utimens + * operations the path will be provided only if the struct + * fuse_file_info argument is NULL. + */ + int nullpath_ok; + + /** + * The remaining options are used by libfuse internally and + * should not be touched. + */ + int show_help; + char *modules; + int debug; +}; + + +/** + * The file system operations: + * + * Most of these should work very similarly to the well known UNIX + * file system operations. A major exception is that instead of + * returning an error in 'errno', the operation should return the + * negated error value (-errno) directly. + * + * All methods are optional, but some are essential for a useful + * filesystem (e.g. getattr). Open, flush, release, fsync, opendir, + * releasedir, fsyncdir, access, create, truncate, lock, init and + * destroy are special purpose methods, without which a full featured + * filesystem can still be implemented. + * + * In general, all methods are expected to perform any necessary + * permission checking. However, a filesystem may delegate this task + * to the kernel by passing the `default_permissions` mount option to + * `fuse_new()`. In this case, methods will only be called if + * the kernel's permission check has succeeded. + * + * Almost all operations take a path which can be of any length. + */ +struct fuse_operations { + /** + * Get file attributes. + * + * Similar to stat(). The 'st_dev' and 'st_blksize' fields are + * ignored. The 'st_ino' field is ignored except if the 'use_ino' + * mount option is given. In that case it is passed to userspace, + * but libfuse and the kernel will still assign a different + * inode for internal use (called the "nodeid"). + * + * `fi` will always be NULL if the file is not currently open, but + * may also be NULL if the file is open. + */ + int (*getattr)(const char *, struct stat *, struct fuse_file_info *fi); + + /** + * Read the target of a symbolic link + * + * The buffer should be filled with a null terminated string. The + * buffer size argument includes the space for the terminating + * null character. If the linkname is too long to fit in the + * buffer, it should be truncated. The return value should be 0 + * for success. + */ + int (*readlink)(const char *, char *, size_t); + + /** + * Create a file node + * + * This is called for creation of all non-directory, non-symlink + * nodes. If the filesystem defines a create() method, then for + * regular files that will be called instead. 
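[Editorial aside] The struct fuse_config knobs described above are normally tuned from the file system's init() handler, which receives the parsed configuration before any request is served. A minimal hedged sketch follows; the handler name, the version define, and the particular values are illustrative, not part of this patch:

    #define FUSE_USE_VERSION 31          /* version number illustrative */
    #include <fuse.h>

    static void *example_init(struct fuse_conn_info *conn, struct fuse_config *cfg)
    {
        (void)conn;

        cfg->kernel_cache = 1;       /* file data never changes behind our back */
        cfg->entry_timeout = 5.0;    /* cache positive lookups for 5 seconds */
        cfg->negative_timeout = 1.0; /* cache ENOENT results briefly */
        cfg->attr_timeout = 5.0;     /* cache getattr results for 5 seconds */
        cfg->use_ino = 1;            /* report our own st_ino values */

        return NULL;                 /* becomes fuse_context.private_data */
    }

Returning a non-NULL pointer here would replace the private_data value originally handed to fuse_main() / fuse_new(), as noted in the init() documentation further down.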
+ */ + int (*mknod)(const char *, mode_t, dev_t); + + /** + * Create a directory + * + * Note that the mode argument may not have the type specification + * bits set, i.e. S_ISDIR(mode) can be false. To obtain the + * correct directory type bits use mode|S_IFDIR + */ + int (*mkdir)(const char *, mode_t); + + /** Remove a file */ + int (*unlink)(const char *); + + /** Remove a directory */ + int (*rmdir)(const char *); + + /** Create a symbolic link */ + int (*symlink)(const char *, const char *); + + /** + * Rename a file + * + * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If + * RENAME_NOREPLACE is specified, the filesystem must not + * overwrite *newname* if it exists and return an error + * instead. If `RENAME_EXCHANGE` is specified, the filesystem + * must atomically exchange the two files, i.e. both must + * exist and neither may be deleted. + */ + int (*rename)(const char *, const char *, unsigned int flags); + + /** Create a hard link to a file */ + int (*link)(const char *, const char *); + + /** + * Change the permission bits of a file + * + * `fi` will always be NULL if the file is not currenlty open, but + * may also be NULL if the file is open. + */ + int (*chmod)(const char *, mode_t, struct fuse_file_info *fi); + + /** + * Change the owner and group of a file + * + * `fi` will always be NULL if the file is not currenlty open, but + * may also be NULL if the file is open. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + */ + int (*chown)(const char *, uid_t, gid_t, struct fuse_file_info *fi); + + /** + * Change the size of a file + * + * `fi` will always be NULL if the file is not currenlty open, but + * may also be NULL if the file is open. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + */ + int (*truncate)(const char *, off_t, struct fuse_file_info *fi); + + /** + * Open a file + * + * Open flags are available in fi->flags. The following rules + * apply. + * + * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be + * filtered out / handled by the kernel. + * + * - Access modes (O_RDONLY, O_WRONLY, O_RDWR, O_EXEC, O_SEARCH) + * should be used by the filesystem to check if the operation is + * permitted. If the ``-o default_permissions`` mount option is + * given, this check is already done by the kernel before calling + * open() and may thus be omitted by the filesystem. + * + * - When writeback caching is enabled, the kernel may send + * read requests even for files opened with O_WRONLY. The + * filesystem should be prepared to handle this. + * + * - When writeback caching is disabled, the filesystem is + * expected to properly handle the O_APPEND flag and ensure + * that each write is appending to the end of the file. + * + * - When writeback caching is enabled, the kernel will + * handle O_APPEND. However, unless all changes to the file + * come through the kernel this will not work reliably. The + * filesystem should thus either ignore the O_APPEND flag + * (and let the kernel handle it), or return an error + * (indicating that reliably O_APPEND is not available). + * + * Filesystem may store an arbitrary file handle (pointer, + * index, etc) in fi->fh, and use this in other all other file + * operations (read, write, flush, release, fsync). + * + * Filesystem may also implement stateless file I/O and not store + * anything in fi->fh. 
+ * + * There are also some flags (direct_io, keep_cache) which the + * filesystem may set in fi, to change the way the file is opened. + * See fuse_file_info structure in <fuse_common.h> for more details. + * + * If this request is answered with an error code of ENOSYS + * and FUSE_CAP_NO_OPEN_SUPPORT is set in + * `fuse_conn_info.capable`, this is treated as success and + * future calls to open will also succeed without being send + * to the filesystem process. + * + */ + int (*open)(const char *, struct fuse_file_info *); + + /** + * Read data from an open file + * + * Read should return exactly the number of bytes requested except + * on EOF or error, otherwise the rest of the data will be + * substituted with zeroes. An exception to this is when the + * 'direct_io' mount option is specified, in which case the return + * value of the read system call will reflect the return value of + * this operation. + */ + int (*read)(const char *, char *, size_t, off_t, struct fuse_file_info *); + + /** + * Write data to an open file + * + * Write should return exactly the number of bytes requested + * except on error. An exception to this is when the 'direct_io' + * mount option is specified (see read operation). + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + */ + int (*write)(const char *, const char *, size_t, off_t, + struct fuse_file_info *); + + /** + * Get file system statistics + * + * The 'f_favail', 'f_fsid' and 'f_flag' fields are ignored + */ + int (*statfs)(const char *, struct statvfs *); + + /** + * Possibly flush cached data + * + * BIG NOTE: This is not equivalent to fsync(). It's not a + * request to sync dirty data. + * + * Flush is called on each close() of a file descriptor, as opposed to + * release which is called on the close of the last file descriptor for + * a file. Under Linux, errors returned by flush() will be passed to + * userspace as errors from close(), so flush() is a good place to write + * back any cached dirty data. However, many applications ignore errors + * on close(), and on non-Linux systems, close() may succeed even if flush() + * returns an error. For these reasons, filesystems should not assume + * that errors returned by flush will ever be noticed or even + * delivered. + * + * NOTE: The flush() method may be called more than once for each + * open(). This happens if more than one file descriptor refers to an + * open file handle, e.g. due to dup(), dup2() or fork() calls. It is + * not possible to determine if a flush is final, so each flush should + * be treated equally. Multiple write-flush sequences are relatively + * rare, so this shouldn't be a problem. + * + * Filesystems shouldn't assume that flush will be called at any + * particular point. It may be called more times than expected, or not + * at all. + * + * [close]: + * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html + */ + int (*flush)(const char *, struct fuse_file_info *); + + /** + * Release an open file + * + * Release is called when there are no more references to an open + * file: all file descriptors are closed and all memory mappings + * are unmapped. + * + * For every open() call there will be exactly one release() call + * with the same flags and file handle. It is possible to + * have a file opened more than once, in which case only the last + * release will mean, that no more reads/writes will happen on the + * file. The return value of release is ignored. 
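[Editorial aside] Taken together, the open()/read() contract above lets a passthrough-style implementation stash a real file descriptor in fi->fh and service reads with pread(). A hedged sketch, assuming the FUSE path maps directly onto a backing file (handler names and the version define are illustrative):

    #define FUSE_USE_VERSION 31
    #include <fuse.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int pass_open(const char *path, struct fuse_file_info *fi)
    {
        int fd = open(path, fi->flags);   /* assumes path names a real file */
        if (fd == -1) {
            return -errno;                /* negated errno, as required */
        }
        fi->fh = fd;                      /* handle echoed back to read/release */
        return 0;
    }

    static int pass_read(const char *path, char *buf, size_t size, off_t off,
                         struct fuse_file_info *fi)
    {
        ssize_t res;

        (void)path;
        res = pread(fi->fh, buf, size, off);
        return res == -1 ? -errno : res;  /* a short count here means EOF */
    }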
+ */ + int (*release)(const char *, struct fuse_file_info *); + + /* + * Synchronize file contents + * + * If the datasync parameter is non-zero, then only the user data + * should be flushed, not the meta data. + */ + int (*fsync)(const char *, int, struct fuse_file_info *); + + /** Set extended attributes */ + int (*setxattr)(const char *, const char *, const char *, size_t, int); + + /** Get extended attributes */ + int (*getxattr)(const char *, const char *, char *, size_t); + + /** List extended attributes */ + int (*listxattr)(const char *, char *, size_t); + + /** Remove extended attributes */ + int (*removexattr)(const char *, const char *); + + /* + * Open directory + * + * Unless the 'default_permissions' mount option is given, + * this method should check if opendir is permitted for this + * directory. Optionally opendir may also return an arbitrary + * filehandle in the fuse_file_info structure, which will be + * passed to readdir, releasedir and fsyncdir. + */ + int (*opendir)(const char *, struct fuse_file_info *); + + /* + * Read directory + * + * The filesystem may choose between two modes of operation: + * + * 1) The readdir implementation ignores the offset parameter, and + * passes zero to the filler function's offset. The filler + * function will not return '1' (unless an error happens), so the + * whole directory is read in a single readdir operation. + * + * 2) The readdir implementation keeps track of the offsets of the + * directory entries. It uses the offset parameter and always + * passes non-zero offset to the filler function. When the buffer + * is full (or an error happens) the filler function will return + * '1'. + */ + int (*readdir)(const char *, void *, fuse_fill_dir_t, off_t, + struct fuse_file_info *, enum fuse_readdir_flags); + + /** + * Release directory + */ + int (*releasedir)(const char *, struct fuse_file_info *); + + /** + * Synchronize directory contents + * + * If the datasync parameter is non-zero, then only the user data + * should be flushed, not the meta data + */ + int (*fsyncdir)(const char *, int, struct fuse_file_info *); + + /** + * Initialize filesystem + * + * The return value will passed in the `private_data` field of + * `struct fuse_context` to all file operations, and as a + * parameter to the destroy() method. It overrides the initial + * value provided to fuse_main() / fuse_new(). + */ + void *(*init)(struct fuse_conn_info *conn, struct fuse_config *cfg); + + /** + * Clean up filesystem + * + * Called on filesystem exit. + */ + void (*destroy)(void *private_data); + + /** + * Check file access permissions + * + * This will be called for the access() system call. If the + * 'default_permissions' mount option is given, this method is not + * called. + * + * This method is not called under Linux kernel versions 2.4.x + */ + int (*access)(const char *, int); + + /** + * Create and open a file + * + * If the file does not exist, first create it with the specified + * mode, and then open it. + * + * If this method is not implemented or under Linux kernel + * versions earlier than 2.6.15, the mknod() and open() methods + * will be called instead. + */ + int (*create)(const char *, mode_t, struct fuse_file_info *); + + /** + * Perform POSIX file locking operation + * + * The cmd argument will be either F_GETLK, F_SETLK or F_SETLKW. + * + * For the meaning of fields in 'struct flock' see the man page + * for fcntl(2). The l_whence field will always be set to + * SEEK_SET. 
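[Editorial aside] Of the two readdir() modes described above, the first (ignore the offset, emit everything in one pass) is the simpler. A minimal illustrative filler loop might look like this, with a hard-coded entry list standing in for real directory contents:

    #define FUSE_USE_VERSION 31
    #include <fuse.h>

    static int example_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
                               off_t offset, struct fuse_file_info *fi,
                               enum fuse_readdir_flags flags)
    {
        (void)path; (void)offset; (void)fi; (void)flags;

        /* offset 0 everywhere: the whole listing is produced in one call */
        filler(buf, ".", NULL, 0, 0);
        filler(buf, "..", NULL, 0, 0);
        filler(buf, "hello", NULL, 0, 0);  /* NULL stat: kernel issues getattr */
        return 0;
    }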
+ * + * For checking lock ownership, the 'fuse_file_info->owner' + * argument must be used. + * + * For F_GETLK operation, the library will first check currently + * held locks, and if a conflicting lock is found it will return + * information without calling this method. This ensures, that + * for local locks the l_pid field is correctly filled in. The + * results may not be accurate in case of race conditions and in + * the presence of hard links, but it's unlikely that an + * application would rely on accurate GETLK results in these + * cases. If a conflicting lock is not found, this method will be + * called, and the filesystem may fill out l_pid by a meaningful + * value, or it may leave this field zero. + * + * For F_SETLK and F_SETLKW the l_pid field will be set to the pid + * of the process performing the locking operation. + * + * Note: if this method is not implemented, the kernel will still + * allow file locking to work locally. Hence it is only + * interesting for network filesystems and similar. + */ + int (*lock)(const char *, struct fuse_file_info *, int cmd, struct flock *); + + /** + * Change the access and modification times of a file with + * nanosecond resolution + * + * This supersedes the old utime() interface. New applications + * should use this. + * + * `fi` will always be NULL if the file is not currenlty open, but + * may also be NULL if the file is open. + * + * See the utimensat(2) man page for details. + */ + int (*utimens)(const char *, const struct timespec tv[2], + struct fuse_file_info *fi); + + /** + * Map block index within file to block index within device + * + * Note: This makes sense only for block device backed filesystems + * mounted with the 'blkdev' option + */ + int (*bmap)(const char *, size_t blocksize, uint64_t *idx); + + /** + * Ioctl + * + * flags will have FUSE_IOCTL_COMPAT set for 32bit ioctls in + * 64bit environment. The size and direction of data is + * determined by _IOC_*() decoding of cmd. For _IOC_NONE, + * data will be NULL, for _IOC_WRITE data is out area, for + * _IOC_READ in area and if both are set in/out area. In all + * non-NULL cases, the area is of _IOC_SIZE(cmd) bytes. + * + * If flags has FUSE_IOCTL_DIR then the fuse_file_info refers to a + * directory file handle. + * + * Note : the unsigned long request submitted by the application + * is truncated to 32 bits. + */ + int (*ioctl)(const char *, unsigned int cmd, void *arg, + struct fuse_file_info *, unsigned int flags, void *data); + + /** + * Poll for IO readiness events + * + * Note: If ph is non-NULL, the client should notify + * when IO readiness events occur by calling + * fuse_notify_poll() with the specified ph. + * + * Regardless of the number of times poll with a non-NULL ph + * is received, single notification is enough to clear all. + * Notifying more times incurs overhead but doesn't harm + * correctness. + * + * The callee is responsible for destroying ph with + * fuse_pollhandle_destroy() when no longer in use. + */ + int (*poll)(const char *, struct fuse_file_info *, + struct fuse_pollhandle *ph, unsigned *reventsp); + + /* + * Write contents of buffer to an open file + * + * Similar to the write() method, but data is supplied in a + * generic buffer. Use fuse_buf_copy() to transfer data to + * the destination. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. 
+ */ + int (*write_buf)(const char *, struct fuse_bufvec *buf, off_t off, + struct fuse_file_info *); + + /* + * Store data from an open file in a buffer + * + * Similar to the read() method, but data is stored and + * returned in a generic buffer. + * + * No actual copying of data has to take place, the source + * file descriptor may simply be stored in the buffer for + * later data transfer. + * + * The buffer must be allocated dynamically and stored at the + * location pointed to by bufp. If the buffer contains memory + * regions, they too must be allocated using malloc(). The + * allocated memory will be freed by the caller. + */ + int (*read_buf)(const char *, struct fuse_bufvec **bufp, size_t size, + off_t off, struct fuse_file_info *); + /** + * Perform BSD file locking operation + * + * The op argument will be either LOCK_SH, LOCK_EX or LOCK_UN + * + * Nonblocking requests will be indicated by ORing LOCK_NB to + * the above operations + * + * For more information see the flock(2) manual page. + * + * Additionally fi->owner will be set to a value unique to + * this open file. This same value will be supplied to + * ->release() when the file is released. + * + * Note: if this method is not implemented, the kernel will still + * allow file locking to work locally. Hence it is only + * interesting for network filesystems and similar. + */ + int (*flock)(const char *, struct fuse_file_info *, int op); + + /** + * Allocates space for an open file + * + * This function ensures that required space is allocated for specified + * file. If this function returns success then any subsequent write + * request to specified range is guaranteed not to fail because of lack + * of space on the file system media. + */ + int (*fallocate)(const char *, int, off_t, off_t, struct fuse_file_info *); + + /** + * Copy a range of data from one file to another + * + * Performs an optimized copy between two file descriptors without the + * additional cost of transferring data through the FUSE kernel module + * to user space (glibc) and then back into the FUSE filesystem again. + * + * In case this method is not implemented, glibc falls back to reading + * data from the source and writing to the destination. Effectively + * doing an inefficient copy of the data. + */ + ssize_t (*copy_file_range)(const char *path_in, + struct fuse_file_info *fi_in, off_t offset_in, + const char *path_out, + struct fuse_file_info *fi_out, off_t offset_out, + size_t size, int flags); + + /** + * Find next data or hole after the specified offset + */ + off_t (*lseek)(const char *, off_t off, int whence, + struct fuse_file_info *); +}; + +/* + * Extra context that may be needed by some filesystems + * + * The uid, gid and pid fields are not filled in case of a writepage + * operation. + */ +struct fuse_context { + /** Pointer to the fuse object */ + struct fuse *fuse; + + /** User ID of the calling process */ + uid_t uid; + + /** Group ID of the calling process */ + gid_t gid; + + /** Process ID of the calling thread */ + pid_t pid; + + /** Private filesystem data */ + void *private_data; + + /** Umask of the calling process */ + mode_t umask; +}; + +/** + * Main function of FUSE. + * + * This is for the lazy. This is all that has to be called from the + * main() function. 
+ * + * This function does the following: + * - parses command line options, and handles --help and + * --version + * - installs signal handlers for INT, HUP, TERM and PIPE + * - registers an exit handler to unmount the filesystem on program exit + * - creates a fuse handle + * - registers the operations + * - calls either the single-threaded or the multi-threaded event loop + * + * Most file systems will have to parse some file-system specific + * arguments before calling this function. It is recommended to do + * this with fuse_opt_parse() and a processing function that passes + * through any unknown options (this can also be achieved by just + * passing NULL as the processing function). That way, the remaining + * options can be passed directly to fuse_main(). + * + * fuse_main() accepts all options that can be passed to + * fuse_parse_cmdline(), fuse_new(), or fuse_session_new(). + * + * Option parsing skips argv[0], which is assumed to contain the + * program name. This element must always be present and is used to + * construct a basic ``usage: `` message for the --help + * output. argv[0] may also be set to the empty string. In this case + * the usage message is suppressed. This can be used by file systems + * to print their own usage line first. See hello.c for an example of + * how to do this. + * + * Note: this is currently implemented as a macro. + * + * The following error codes may be returned from fuse_main(): + * 1: Invalid option arguments + * 2: No mount point specified + * 3: FUSE setup failed + * 4: Mounting failed + * 5: Failed to daemonize (detach from session) + * 6: Failed to set up signal handlers + * 7: An error occured during the life of the file system + * + * @param argc the argument counter passed to the main() function + * @param argv the argument vector passed to the main() function + * @param op the file system operation + * @param private_data Initial value for the `private_data` + * field of `struct fuse_context`. May be overridden by the + * `struct fuse_operations.init` handler. + * @return 0 on success, nonzero on failure + * + * Example usage, see hello.c + */ +/* + * int fuse_main(int argc, char *argv[], const struct fuse_operations *op, + * void *private_data); + */ +#define fuse_main(argc, argv, op, private_data) \ + fuse_main_real(argc, argv, op, sizeof(*(op)), private_data) + +/* + * More detailed API + */ + +/** + * Print available options (high- and low-level) to stdout. This is + * not an exhaustive list, but includes only those options that may be + * of interest to an end-user of a file system. + * + * The function looks at the argument vector only to determine if + * there are additional modules to be loaded (module=foo option), + * and attempts to call their help functions as well. + * + * @param args the argument vector. + */ +void fuse_lib_help(struct fuse_args *args); + +/** + * Create a new FUSE filesystem. + * + * This function accepts most file-system independent mount options + * (like context, nodev, ro - see mount(8)), as well as the + * FUSE-specific mount options from mount.fuse(8). + * + * If the --help option is specified, the function writes a help text + * to stdout and returns NULL. + * + * Option parsing skips argv[0], which is assumed to contain the + * program name. This element must always be present and is used to + * construct a basic ``usage: `` message for the --help output. If + * argv[0] is set to the empty string, no usage message is included in + * the --help output. 
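[Editorial aside] For a concrete picture of how little is needed to get from a fuse_operations table to a mounted filesystem via the fuse_main() macro defined above, here is a hedged hello-world style sketch; only getattr is implemented (just enough for the mount point to appear) and all names are illustrative:

    #define FUSE_USE_VERSION 31
    #include <fuse.h>
    #include <errno.h>
    #include <string.h>
    #include <sys/stat.h>

    static int hello_getattr(const char *path, struct stat *st,
                             struct fuse_file_info *fi)
    {
        (void)fi;
        memset(st, 0, sizeof(*st));
        if (strcmp(path, "/") == 0) {
            st->st_mode = S_IFDIR | 0755;
            st->st_nlink = 2;
            return 0;
        }
        return -ENOENT;
    }

    static const struct fuse_operations hello_ops = {
        .getattr = hello_getattr,
    };

    int main(int argc, char *argv[])
    {
        /* parses options, mounts, runs the event loop, unmounts on exit */
        return fuse_main(argc, argv, &hello_ops, NULL);
    }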
+ * + * If an unknown option is passed in, an error message is written to + * stderr and the function returns NULL. + * + * @param args argument vector + * @param op the filesystem operations + * @param op_size the size of the fuse_operations structure + * @param private_data Initial value for the `private_data` + * field of `struct fuse_context`. May be overridden by the + * `struct fuse_operations.init` handler. + * @return the created FUSE handle + */ +#if FUSE_USE_VERSION == 30 +struct fuse *fuse_new_30(struct fuse_args *args, + const struct fuse_operations *op, size_t op_size, + void *private_data); +#define fuse_new(args, op, size, data) fuse_new_30(args, op, size, data) +#else +struct fuse *fuse_new(struct fuse_args *args, const struct fuse_operations *op, + size_t op_size, void *private_data); +#endif + +/** + * Mount a FUSE file system. + * + * @param mountpoint the mount point path + * @param f the FUSE handle + * + * @return 0 on success, -1 on failure. + **/ +int fuse_mount(struct fuse *f, const char *mountpoint); + +/** + * Unmount a FUSE file system. + * + * See fuse_session_unmount() for additional information. + * + * @param f the FUSE handle + **/ +void fuse_unmount(struct fuse *f); + +/** + * Destroy the FUSE handle. + * + * NOTE: This function does not unmount the filesystem. If this is + * needed, call fuse_unmount() before calling this function. + * + * @param f the FUSE handle + */ +void fuse_destroy(struct fuse *f); + +/** + * FUSE event loop. + * + * Requests from the kernel are processed, and the appropriate + * operations are called. + * + * For a description of the return value and the conditions when the + * event loop exits, refer to the documentation of + * fuse_session_loop(). + * + * @param f the FUSE handle + * @return see fuse_session_loop() + * + * See also: fuse_loop_mt() + */ +int fuse_loop(struct fuse *f); + +/** + * Flag session as terminated + * + * This function will cause any running event loops to exit on + * the next opportunity. + * + * @param f the FUSE handle + */ +void fuse_exit(struct fuse *f); + +/** + * Get the current context + * + * The context is only valid for the duration of a filesystem + * operation, and thus must not be stored and used later. + * + * @return the context + */ +struct fuse_context *fuse_get_context(void); + +/** + * Get the current supplementary group IDs for the current request + * + * Similar to the getgroups(2) system call, except the return value is + * always the total number of group IDs, even if it is larger than the + * specified size. + * + * The current fuse kernel module in linux (as of 2.6.30) doesn't pass + * the group list to userspace, hence this function needs to parse + * "/proc/$TID/task/$TID/status" to get the group IDs. + * + * This feature may not be supported on all operating systems. In + * such a case this function will return -ENOSYS. + * + * @param size size of given array + * @param list array of group IDs to be filled in + * @return the total number of supplementary group IDs or -errno on failure + */ +int fuse_getgroups(int size, gid_t list[]); + +/** + * Check if the current request has already been interrupted + * + * @return 1 if the request has been interrupted, 0 otherwise + */ +int fuse_interrupted(void); + +/** + * Invalidates cache for the given path. + * + * This calls fuse_lowlevel_notify_inval_inode internally. + * + * @return 0 on successful invalidation, negative error value otherwise. 
+ * This routine may return -ENOENT to indicate that there was + * no entry to be invalidated, e.g., because the path has not + * been seen before or has been forgotten; this should not be + * considered to be an error. + */ +int fuse_invalidate_path(struct fuse *f, const char *path); + +/** + * The real main function + * + * Do not call this directly, use fuse_main() + */ +int fuse_main_real(int argc, char *argv[], const struct fuse_operations *op, + size_t op_size, void *private_data); + +/** + * Start the cleanup thread when using option "remember". + * + * This is done automatically by fuse_loop_mt() + * @param fuse struct fuse pointer for fuse instance + * @return 0 on success and -1 on error + */ +int fuse_start_cleanup_thread(struct fuse *fuse); + +/** + * Stop the cleanup thread when using option "remember". + * + * This is done automatically by fuse_loop_mt() + * @param fuse struct fuse pointer for fuse instance + */ +void fuse_stop_cleanup_thread(struct fuse *fuse); + +/** + * Iterate over cache removing stale entries + * use in conjunction with "-oremember" + * + * NOTE: This is already done for the standard sessions + * + * @param fuse struct fuse pointer for fuse instance + * @return the number of seconds until the next cleanup + */ +int fuse_clean_cache(struct fuse *fuse); + +/* + * Stacking API + */ + +/** + * Fuse filesystem object + * + * This is opaque object represents a filesystem layer + */ +struct fuse_fs; + +/* + * These functions call the relevant filesystem operation, and return + * the result. + * + * If the operation is not defined, they return -ENOSYS, with the + * exception of fuse_fs_open, fuse_fs_release, fuse_fs_opendir, + * fuse_fs_releasedir and fuse_fs_statfs, which return 0. + */ + +int fuse_fs_getattr(struct fuse_fs *fs, const char *path, struct stat *buf, + struct fuse_file_info *fi); +int fuse_fs_rename(struct fuse_fs *fs, const char *oldpath, const char *newpath, + unsigned int flags); +int fuse_fs_unlink(struct fuse_fs *fs, const char *path); +int fuse_fs_rmdir(struct fuse_fs *fs, const char *path); +int fuse_fs_symlink(struct fuse_fs *fs, const char *linkname, const char *path); +int fuse_fs_link(struct fuse_fs *fs, const char *oldpath, const char *newpath); +int fuse_fs_release(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi); +int fuse_fs_open(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi); +int fuse_fs_read(struct fuse_fs *fs, const char *path, char *buf, size_t size, + off_t off, struct fuse_file_info *fi); +int fuse_fs_read_buf(struct fuse_fs *fs, const char *path, + struct fuse_bufvec **bufp, size_t size, off_t off, + struct fuse_file_info *fi); +int fuse_fs_write(struct fuse_fs *fs, const char *path, const char *buf, + size_t size, off_t off, struct fuse_file_info *fi); +int fuse_fs_write_buf(struct fuse_fs *fs, const char *path, + struct fuse_bufvec *buf, off_t off, + struct fuse_file_info *fi); +int fuse_fs_fsync(struct fuse_fs *fs, const char *path, int datasync, + struct fuse_file_info *fi); +int fuse_fs_flush(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi); +int fuse_fs_statfs(struct fuse_fs *fs, const char *path, struct statvfs *buf); +int fuse_fs_opendir(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi); +int fuse_fs_readdir(struct fuse_fs *fs, const char *path, void *buf, + fuse_fill_dir_t filler, off_t off, + struct fuse_file_info *fi, enum fuse_readdir_flags flags); +int fuse_fs_fsyncdir(struct fuse_fs *fs, const char *path, int datasync, + struct 
fuse_file_info *fi); +int fuse_fs_releasedir(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi); +int fuse_fs_create(struct fuse_fs *fs, const char *path, mode_t mode, + struct fuse_file_info *fi); +int fuse_fs_lock(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi, int cmd, struct flock *lock); +int fuse_fs_flock(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi, int op); +int fuse_fs_chmod(struct fuse_fs *fs, const char *path, mode_t mode, + struct fuse_file_info *fi); +int fuse_fs_chown(struct fuse_fs *fs, const char *path, uid_t uid, gid_t gid, + struct fuse_file_info *fi); +int fuse_fs_truncate(struct fuse_fs *fs, const char *path, off_t size, + struct fuse_file_info *fi); +int fuse_fs_utimens(struct fuse_fs *fs, const char *path, + const struct timespec tv[2], struct fuse_file_info *fi); +int fuse_fs_access(struct fuse_fs *fs, const char *path, int mask); +int fuse_fs_readlink(struct fuse_fs *fs, const char *path, char *buf, + size_t len); +int fuse_fs_mknod(struct fuse_fs *fs, const char *path, mode_t mode, + dev_t rdev); +int fuse_fs_mkdir(struct fuse_fs *fs, const char *path, mode_t mode); +int fuse_fs_setxattr(struct fuse_fs *fs, const char *path, const char *name, + const char *value, size_t size, int flags); +int fuse_fs_getxattr(struct fuse_fs *fs, const char *path, const char *name, + char *value, size_t size); +int fuse_fs_listxattr(struct fuse_fs *fs, const char *path, char *list, + size_t size); +int fuse_fs_removexattr(struct fuse_fs *fs, const char *path, const char *name); +int fuse_fs_bmap(struct fuse_fs *fs, const char *path, size_t blocksize, + uint64_t *idx); +int fuse_fs_ioctl(struct fuse_fs *fs, const char *path, unsigned int cmd, + void *arg, struct fuse_file_info *fi, unsigned int flags, + void *data); +int fuse_fs_poll(struct fuse_fs *fs, const char *path, + struct fuse_file_info *fi, struct fuse_pollhandle *ph, + unsigned *reventsp); +int fuse_fs_fallocate(struct fuse_fs *fs, const char *path, int mode, + off_t offset, off_t length, struct fuse_file_info *fi); +ssize_t fuse_fs_copy_file_range(struct fuse_fs *fs, const char *path_in, + struct fuse_file_info *fi_in, off_t off_in, + const char *path_out, + struct fuse_file_info *fi_out, off_t off_out, + size_t len, int flags); +off_t fuse_fs_lseek(struct fuse_fs *fs, const char *path, off_t off, int whence, + struct fuse_file_info *fi); +void fuse_fs_init(struct fuse_fs *fs, struct fuse_conn_info *conn, + struct fuse_config *cfg); +void fuse_fs_destroy(struct fuse_fs *fs); + +int fuse_notify_poll(struct fuse_pollhandle *ph); + +/** + * Create a new fuse filesystem object + * + * This is usually called from the factory of a fuse module to create + * a new instance of a filesystem. + * + * @param op the filesystem operations + * @param op_size the size of the fuse_operations structure + * @param private_data Initial value for the `private_data` + * field of `struct fuse_context`. May be overridden by the + * `struct fuse_operations.init` handler. + * @return a new filesystem object + */ +struct fuse_fs *fuse_fs_new(const struct fuse_operations *op, size_t op_size, + void *private_data); + +/** + * Factory for creating filesystem objects + * + * The function may use and remove options from 'args' that belong + * to this module. + * + * For now the 'fs' vector always contains exactly one filesystem. + * This is the filesystem which will be below the newly created + * filesystem in the stack. 
+ * + * @param args the command line arguments + * @param fs NULL terminated filesystem object vector + * @return the new filesystem object + */ +typedef struct fuse_fs *(*fuse_module_factory_t)(struct fuse_args *args, + struct fuse_fs *fs[]); +/** + * Register filesystem module + * + * If the "-omodules=*name*_:..." option is present, filesystem + * objects are created and pushed onto the stack with the *factory_* + * function. + * + * @param name_ the name of this filesystem module + * @param factory_ the factory function for this filesystem module + */ +#define FUSE_REGISTER_MODULE(name_, factory_) \ + fuse_module_factory_t fuse_module_##name_##_factory = factory_ + +/** Get session from fuse object */ +struct fuse_session *fuse_get_session(struct fuse *f); + +/** + * Open a FUSE file descriptor and set up the mount for the given + * mountpoint and flags. + * + * @param mountpoint reference to the mount in the file system + * @param options mount options + * @return the FUSE file descriptor or -1 upon error + */ +int fuse_open_channel(const char *mountpoint, const char *options); + +#endif /* FUSE_H_ */ diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h new file mode 100644 index 0000000000..686c42c0a5 --- /dev/null +++ b/tools/virtiofsd/fuse_common.h @@ -0,0 +1,816 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +/** @file */ + +#if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_) +#error \ + "Never include <fuse_common.h> directly; use <fuse.h> or <fuse_lowlevel.h> instead." +#endif + +#ifndef FUSE_COMMON_H_ +#define FUSE_COMMON_H_ + +#include "fuse_log.h" +#include "fuse_opt.h" +#include <stdint.h> +#include <sys/types.h> + +/** Major version of FUSE library interface */ +#define FUSE_MAJOR_VERSION 3 + +/** Minor version of FUSE library interface */ +#define FUSE_MINOR_VERSION 2 + +#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) +#define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) + +/** + * Information about an open file. + * + * File Handles are created by the open, opendir, and create methods and closed + * by the release and releasedir methods. Multiple file handles may be + * concurrently open for the same file. Generally, a client will create one + * file handle per file descriptor, though in some cases multiple file + * descriptors can share a single file handle. + */ +struct fuse_file_info { + /** Open flags. Available in open() and release() */ + int flags; + + /* + * In case of a write operation indicates if this was caused + * by a delayed write from the page cache. If so, then the + * context's pid, uid, and gid fields will not be valid, and + * the *fh* value may not match the *fh* value that would + * have been sent with the corresponding individual write + * requests if write caching had been disabled. + */ + unsigned int writepage:1; + + /** Can be filled in by open, to use direct I/O on this file. */ + unsigned int direct_io:1; + + /* + * Can be filled in by open. It signals the kernel that any + * currently cached file data (ie., data that the filesystem + * provided the last time the file was open) need not be + * invalidated. Has no effect when set in other contexts (in + * particular it does nothing when set by opendir()). + */ + unsigned int keep_cache:1; + + /* + * Indicates a flush operation. 
Set in flush operation, also + * maybe set in highlevel lock operation and lowlevel release + * operation. + */ + unsigned int flush:1; + + /* + * Can be filled in by open, to indicate that the file is not + * seekable. + */ + unsigned int nonseekable:1; + + /* + * Indicates that flock locks for this file should be + * released. If set, lock_owner shall contain a valid value. + * May only be set in ->release(). + */ + unsigned int flock_release:1; + + /* + * Can be filled in by opendir. It signals the kernel to + * enable caching of entries returned by readdir(). Has no + * effect when set in other contexts (in particular it does + * nothing when set by open()). + */ + unsigned int cache_readdir:1; + + /* Indicates that suid/sgid bits should be removed upon write */ + unsigned int kill_priv:1; + + + /** Padding. Reserved for future use*/ + unsigned int padding:24; + unsigned int padding2:32; + + /* + * File handle id. May be filled in by filesystem in create, + * open, and opendir(). Available in most other file operations on the + * same file handle. + */ + uint64_t fh; + + /** Lock owner id. Available in locking operations and flush */ + uint64_t lock_owner; + + /* + * Requested poll events. Available in ->poll. Only set on kernels + * which support it. If unsupported, this field is set to zero. + */ + uint32_t poll_events; +}; + +/* + * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' + */ + +/** + * Indicates that the filesystem supports asynchronous read requests. + * + * If this capability is not requested/available, the kernel will + * ensure that there is at most one pending read request per + * file-handle at any time, and will attempt to order read requests by + * increasing offset. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_ASYNC_READ (1 << 0) + +/** + * Indicates that the filesystem supports "remote" locking. + * + * This feature is enabled by default when supported by the kernel, + * and if getlk() and setlk() handlers are implemented. + */ +#define FUSE_CAP_POSIX_LOCKS (1 << 1) + +/** + * Indicates that the filesystem supports the O_TRUNC open flag. If + * disabled, and an application specifies O_TRUNC, fuse first calls + * truncate() and then open() with O_TRUNC filtered out. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) + +/** + * Indicates that the filesystem supports lookups of "." and "..". + * + * This feature is disabled by default. + */ +#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) + +/** + * Indicates that the kernel should not apply the umask to the + * file mode on create operations. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_DONT_MASK (1 << 6) + +/** + * Indicates that libfuse should try to use splice() when writing to + * the fuse device. This may improve performance. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_SPLICE_WRITE (1 << 7) + +/** + * Indicates that libfuse should try to move pages instead of copying when + * writing to / reading from the fuse device. This may improve performance. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_SPLICE_MOVE (1 << 8) + +/** + * Indicates that libfuse should try to use splice() when reading from + * the fuse device. This may improve performance. + * + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a write_buf() handler. 
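[Editorial aside] The per-handle bit-fields above (keep_cache, cache_readdir and friends) are how open() and opendir() handlers influence kernel caching for one specific handle, independently of the global fuse_config options. An illustrative sketch, assuming a filesystem whose data never changes behind the kernel's back (names are examples only):

    #define FUSE_USE_VERSION 31
    #include <fuse.h>

    static int cachefs_open(const char *path, struct fuse_file_info *fi)
    {
        (void)path;
        fi->fh = 1;             /* any value; echoed back in later operations */
        fi->keep_cache = 1;     /* previously cached file data stays valid */
        fi->direct_io = 0;      /* let the page cache serve reads */
        return 0;
    }

    static int cachefs_opendir(const char *path, struct fuse_file_info *fi)
    {
        (void)path;
        fi->cache_readdir = 1;  /* allow the kernel to cache directory entries */
        return 0;
    }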
+ */ +#define FUSE_CAP_SPLICE_READ (1 << 9) + +/** + * If set, the calls to flock(2) will be emulated using POSIX locks and must + * then be handled by the filesystem's setlock() handler. + * + * If not set, flock(2) calls will be handled by the FUSE kernel module + * internally (so any access that does not go through the kernel cannot be taken + * into account). + * + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a flock() handler. + */ +#define FUSE_CAP_FLOCK_LOCKS (1 << 10) + +/** + * Indicates that the filesystem supports ioctl's on directories. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_IOCTL_DIR (1 << 11) + +/** + * Traditionally, while a file is open the FUSE kernel module only + * asks the filesystem for an update of the file's attributes when a + * client attempts to read beyond EOF. This is unsuitable for + * e.g. network filesystems, where the file contents may change + * without the kernel knowing about it. + * + * If this flag is set, FUSE will check the validity of the attributes + * on every read. If the attributes are no longer valid (i.e., if the + * *attr_timeout* passed to fuse_reply_attr() or set in `struct + * fuse_entry_param` has passed), it will first issue a `getattr` + * request. If the new mtime differs from the previous value, any + * cached file *contents* will be invalidated as well. + * + * This flag should always be set when available. If all file changes + * go through the kernel, *attr_timeout* should be set to a very large + * number to avoid unnecessary getattr() calls. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) + +/** + * Indicates that the filesystem supports readdirplus. + * + * This feature is enabled by default when supported by the kernel and if the + * filesystem implements a readdirplus() handler. + */ +#define FUSE_CAP_READDIRPLUS (1 << 13) + +/** + * Indicates that the filesystem supports adaptive readdirplus. + * + * If FUSE_CAP_READDIRPLUS is not set, this flag has no effect. + * + * If FUSE_CAP_READDIRPLUS is set and this flag is not set, the kernel + * will always issue readdirplus() requests to retrieve directory + * contents. + * + * If FUSE_CAP_READDIRPLUS is set and this flag is set, the kernel + * will issue both readdir() and readdirplus() requests, depending on + * how much information is expected to be required. + * + * As of Linux 4.20, the algorithm is as follows: when userspace + * starts to read directory entries, issue a READDIRPLUS request to + * the filesystem. If any entry attributes have been looked up by the + * time userspace requests the next batch of entries continue with + * READDIRPLUS, otherwise switch to plain READDIR. This will reasult + * in eg plain "ls" triggering READDIRPLUS first then READDIR after + * that because it doesn't do lookups. "ls -l" should result in all + * READDIRPLUS, except if dentries are already cached. + * + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements both a readdirplus() and a readdir() + * handler. + */ +#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) + +/** + * Indicates that the filesystem supports asynchronous direct I/O submission. + * + * If this capability is not requested/available, the kernel will ensure that + * there is at most one pending read and one pending write request per direct + * I/O file-handle at any time. 
+ * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_ASYNC_DIO (1 << 15) + +/** + * Indicates that writeback caching should be enabled. This means that + * individual write request may be buffered and merged in the kernel + * before they are send to the filesystem. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) + +/** + * Indicates support for zero-message opens. If this flag is set in + * the `capable` field of the `fuse_conn_info` structure, then the + * filesystem may return `ENOSYS` from the open() handler to indicate + * success. Further attempts to open files will be handled in the + * kernel. (If this flag is not set, returning ENOSYS will be treated + * as an error and signaled to the caller). + * + * Setting (or unsetting) this flag in the `want` field has *no + * effect*. + */ +#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) + +/** + * Indicates support for parallel directory operations. If this flag + * is unset, the FUSE kernel module will ensure that lookup() and + * readdir() requests are never issued concurrently for the same + * directory. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) + +/** + * Indicates support for POSIX ACLs. + * + * If this feature is enabled, the kernel will cache and have + * responsibility for enforcing ACLs. ACL will be stored as xattrs and + * passed to userspace, which is responsible for updating the ACLs in + * the filesystem, keeping the file mode in sync with the ACL, and + * ensuring inheritance of default ACLs when new filesystem nodes are + * created. Note that this requires that the file system is able to + * parse and interpret the xattr representation of ACLs. + * + * Enabling this feature implicitly turns on the + * ``default_permissions`` mount option (even if it was not passed to + * mount(2)). + * + * This feature is disabled by default. + */ +#define FUSE_CAP_POSIX_ACL (1 << 19) + +/** + * Indicates that the filesystem is responsible for unsetting + * setuid and setgid bits when a file is written, truncated, or + * its owner is changed. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) + +/** + * Indicates support for zero-message opendirs. If this flag is set in + * the `capable` field of the `fuse_conn_info` structure, then the filesystem + * may return `ENOSYS` from the opendir() handler to indicate success. Further + * opendir and releasedir messages will be handled in the kernel. (If this + * flag is not set, returning ENOSYS will be treated as an error and signalled + * to the caller.) + * + * Setting (or unsetting) this flag in the `want` field has *no effect*. + */ +#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) + +/** + * Ioctl flags + * + * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine + * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed + * FUSE_IOCTL_RETRY: retry with new iovecs + * FUSE_IOCTL_DIR: is a directory + * + * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +#define FUSE_IOCTL_COMPAT (1 << 0) +#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +#define FUSE_IOCTL_RETRY (1 << 2) +#define FUSE_IOCTL_DIR (1 << 4) + +#define FUSE_IOCTL_MAX_IOV 256 + +/** + * Connection information, passed to the ->init() method + * + * Some of the elements are read-write, these can be changed to + * indicate the value requested by the filesystem. 
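[Editorial aside] The capability flags above are negotiated in the init() handler: the kernel reports what it supports in conn->capable and the filesystem opts in or out through conn->want. A small hedged sketch (the particular choices and the handler name are examples only):

    #define FUSE_USE_VERSION 31
    #include <fuse.h>

    static void *negotiate_init(struct fuse_conn_info *conn, struct fuse_config *cfg)
    {
        (void)cfg;

        /* Only request features the kernel actually advertises. */
        if (conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
            conn->want |= FUSE_CAP_WRITEBACK_CACHE;  /* disabled by default */
        }
        conn->want &= ~FUSE_CAP_ASYNC_READ;          /* at most one read in flight */

        return NULL;
    }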
The requested + * value must usually be smaller than the indicated value. + */ +struct fuse_conn_info { + /** + * Major version of the protocol (read-only) + */ + unsigned proto_major; + + /** + * Minor version of the protocol (read-only) + */ + unsigned proto_minor; + + /** + * Maximum size of the write buffer + */ + unsigned max_write; + + /** + * Maximum size of read requests. A value of zero indicates no + * limit. However, even if the filesystem does not specify a + * limit, the maximum size of read requests will still be + * limited by the kernel. + * + * NOTE: For the time being, the maximum size of read requests + * must be set both here *and* passed to fuse_session_new() + * using the ``-o max_read=<n>`` mount option. At some point + * in the future, specifying the mount option will no longer + * be necessary. + */ + unsigned max_read; + + /** + * Maximum readahead + */ + unsigned max_readahead; + + /** + * Capability flags that the kernel supports (read-only) + */ + unsigned capable; + + /** + * Capability flags that the filesystem wants to enable. + * + * libfuse attempts to initialize this field with + * reasonable default values before calling the init() handler. + */ + unsigned want; + + /** + * Maximum number of pending "background" requests. A + * background request is any type of request for which the + * total number is not limited by other means. As of kernel + * 4.8, only two types of requests fall into this category: + * + * 1. Read-ahead requests + * 2. Asynchronous direct I/O requests + * + * Read-ahead requests are generated (if max_readahead is + * non-zero) by the kernel to preemptively fill its caches + * when it anticipates that userspace will soon read more + * data. + * + * Asynchronous direct I/O requests are generated if + * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large + * direct I/O request. In this case the kernel will internally + * split it up into multiple smaller requests and submit them + * to the filesystem concurrently. + * + * Note that the following requests are *not* background + * requests: writeback requests (limited by the kernel's + * flusher algorithm), regular (i.e., synchronous and + * buffered) userspace read/write requests (limited to one per + * thread), asynchronous read requests (Linux's io_submit(2) + * call actually blocks, so these are also limited to one per + * thread). + */ + unsigned max_background; + + /** + * Kernel congestion threshold parameter. If the number of pending + * background requests exceeds this number, the FUSE kernel module will + * mark the filesystem as "congested". This instructs the kernel to + * expect that queued requests will take some time to complete, and to + * adjust its algorithms accordingly (e.g. by putting a waiting thread + * to sleep instead of using a busy-loop). + */ + unsigned congestion_threshold; + + /** + * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible + * for updating mtime and ctime when write requests are received. The + * updated values are passed to the filesystem with setattr() requests. + * However, if the filesystem does not support the full resolution of + * the kernel timestamps (nanoseconds), the mtime and ctime values used + * by kernel and filesystem will differ (and result in an apparent + * change of times after a cache flush). + * + * To prevent this problem, this variable can be used to inform the + * kernel about the timestamp granularity supported by the file-system. + * The value should be power of 10. The default is 1, i.e. 
full
+ * nanosecond resolution. Filesystems supporting only second resolution
+ * should set this to 1000000000.
+ */
+    unsigned time_gran;
+
+    /**
+     * For future use.
+     */
+    unsigned reserved[22];
+};
+
+struct fuse_session;
+struct fuse_pollhandle;
+struct fuse_conn_info_opts;
+
+/**
+ * This function parses several command-line options that can be used
+ * to override elements of struct fuse_conn_info. The pointer returned
+ * by this function should be passed to the
+ * fuse_apply_conn_info_opts() method by the file system's init()
+ * handler.
+ *
+ * Before using this function, think twice if you really want these
+ * parameters to be adjustable from the command line. In most cases,
+ * they should be determined by the file system internally.
+ *
+ * The following options are recognized:
+ *
+ *   -o max_write=N             sets conn->max_write
+ *   -o max_readahead=N         sets conn->max_readahead
+ *   -o max_background=N        sets conn->max_background
+ *   -o congestion_threshold=N  sets conn->congestion_threshold
+ *   -o async_read              sets FUSE_CAP_ASYNC_READ in conn->want
+ *   -o sync_read               unsets FUSE_CAP_ASYNC_READ in conn->want
+ *   -o atomic_o_trunc          sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want
+ *   -o no_remote_lock          equivalent to -o no_remote_flock,no_remote_posix_lock
+ *   -o no_remote_flock         unsets FUSE_CAP_FLOCK_LOCKS in conn->want
+ *   -o no_remote_posix_lock    unsets FUSE_CAP_POSIX_LOCKS in conn->want
+ *   -o [no_]splice_write       (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want
+ *   -o [no_]splice_move        (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want
+ *   -o [no_]splice_read        (un-)sets FUSE_CAP_SPLICE_READ in conn->want
+ *   -o [no_]auto_inval_data    (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want
+ *   -o readdirplus=no          unsets FUSE_CAP_READDIRPLUS in conn->want
+ *   -o readdirplus=yes         sets FUSE_CAP_READDIRPLUS and unsets
+ *                              FUSE_CAP_READDIRPLUS_AUTO in conn->want
+ *   -o readdirplus=auto        sets FUSE_CAP_READDIRPLUS and
+ *                              FUSE_CAP_READDIRPLUS_AUTO in conn->want
+ *   -o [no_]async_dio          (un-)sets FUSE_CAP_ASYNC_DIO in conn->want
+ *   -o [no_]writeback_cache    (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want
+ *   -o time_gran=N             sets conn->time_gran
+ *
+ * Known options will be removed from *args*, unknown options will be
+ * passed through unchanged.
+ *
+ * @param args argument vector (input+output)
+ * @return parsed options
+ **/
+struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args);
+
+/**
+ * This function applies the (parsed) parameters in *opts* to the
+ * *conn* pointer. It may modify the following fields: want,
+ * max_write, max_readahead, congestion_threshold, max_background,
+ * time_gran. A field is only set (or unset) if the corresponding
+ * option has been explicitly set.
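+ *
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * libfuse documentation; my_init() and conn_opts are hypothetical names,
+ * and the FUSE_ARGS_INIT() initializer from fuse_opt.h is assumed):
+ *
+ *   static struct fuse_conn_info_opts *conn_opts;
+ *
+ *   static void my_init(void *userdata, struct fuse_conn_info *conn)
+ *   {
+ *       fuse_apply_conn_info_opts(conn_opts, conn);
+ *   }
+ *
+ *   int main(int argc, char *argv[])
+ *   {
+ *       struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ *
+ *       conn_opts = fuse_parse_conn_info_opts(&args);
+ *       ...
+ *   }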
+ */
+void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts,
+                               struct fuse_conn_info *conn);
+
+/**
+ * Go into the background
+ *
+ * @param foreground if true, stay in the foreground
+ * @return 0 on success, -1 on failure
+ */
+int fuse_daemonize(int foreground);
+
+/**
+ * Get the version of the library
+ *
+ * @return the version
+ */
+int fuse_version(void);
+
+/**
+ * Get the full package version string of the library
+ *
+ * @return the package version
+ */
+const char *fuse_pkgversion(void);
+
+/**
+ * Destroy poll handle
+ *
+ * @param ph the poll handle
+ */
+void fuse_pollhandle_destroy(struct fuse_pollhandle *ph);
+
+/*
+ * Data buffer
+ */
+
+/**
+ * Buffer flags
+ */
+enum fuse_buf_flags {
+    /**
+     * Buffer contains a file descriptor
+     *
+     * If this flag is set, the .fd field is valid, otherwise the
+     * .mem field is valid.
+     */
+    FUSE_BUF_IS_FD = (1 << 1),
+
+    /**
+     * Seek on the file descriptor
+     *
+     * If this flag is set then the .pos field is valid and is
+     * used to seek to the given offset before performing the
+     * operation on the file descriptor.
+     */
+    FUSE_BUF_FD_SEEK = (1 << 2),
+
+    /**
+     * Retry operation on file descriptor
+     *
+     * If this flag is set then the operation on the file descriptor is
+     * retried until .size bytes have been copied or an error or EOF is
+     * detected.
+     */
+    FUSE_BUF_FD_RETRY = (1 << 3),
+};
+
+/**
+ * Single data buffer
+ *
+ * Generic data buffer for I/O, extended attributes, etc... Data may
+ * be supplied as a memory pointer or as a file descriptor.
+ */
+struct fuse_buf {
+    /**
+     * Size of data in bytes
+     */
+    size_t size;
+
+    /**
+     * Buffer flags
+     */
+    enum fuse_buf_flags flags;
+
+    /**
+     * Memory pointer
+     *
+     * Used unless FUSE_BUF_IS_FD flag is set.
+     */
+    void *mem;
+
+    /**
+     * File descriptor
+     *
+     * Used if FUSE_BUF_IS_FD flag is set.
+     */
+    int fd;
+
+    /**
+     * File position
+     *
+     * Used if FUSE_BUF_FD_SEEK flag is set.
+     */
+    off_t pos;
+};
+
+/**
+ * Data buffer vector
+ *
+ * An array of data buffers, each containing a memory pointer or a
+ * file descriptor.
+ *
+ * Allocate dynamically to add more than one buffer.
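+ *
+ * Illustrative sketch (editorial addition, not part of the original
+ * comment): wrapping an already-open file descriptor in a single-element
+ * vector, for example as the source of a fuse_buf_copy(); fd, offset and
+ * len are assumed to be provided by the surrounding code:
+ *
+ *   struct fuse_bufvec src = FUSE_BUFVEC_INIT(len);
+ *
+ *   src.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+ *   src.buf[0].fd = fd;
+ *   src.buf[0].pos = offset;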
+ */ +struct fuse_bufvec { + /** + * Number of buffers in the array + */ + size_t count; + + /** + * Index of current buffer within the array + */ + size_t idx; + + /** + * Current offset within the current buffer + */ + size_t off; + + /** + * Array of buffers + */ + struct fuse_buf buf[1]; +}; + +/* Initialize bufvec with a single buffer of given size */ +#define FUSE_BUFVEC_INIT(size__) \ + ((struct fuse_bufvec){ /* .count= */ 1, \ + /* .idx = */ 0, \ + /* .off = */ 0, /* .buf = */ \ + { /* [0] = */ { \ + /* .size = */ (size__), \ + /* .flags = */ (enum fuse_buf_flags)0, \ + /* .mem = */ NULL, \ + /* .fd = */ -1, \ + /* .pos = */ 0, \ + } } }) + +/** + * Get total size of data in a fuse buffer vector + * + * @param bufv buffer vector + * @return size of data + */ +size_t fuse_buf_size(const struct fuse_bufvec *bufv); + +/** + * Copy data from one buffer vector to another + * + * @param dst destination buffer vector + * @param src source buffer vector + * @return actual number of bytes copied or -errno on error + */ +ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src); + +/** + * Memory buffer iterator + * + */ +struct fuse_mbuf_iter { + /** + * Data pointer + */ + void *mem; + + /** + * Total length, in bytes + */ + size_t size; + + /** + * Offset from start of buffer + */ + size_t pos; +}; + +/* Initialize memory buffer iterator from a fuse_buf */ +#define FUSE_MBUF_ITER_INIT(fbuf) \ + ((struct fuse_mbuf_iter){ \ + .mem = fbuf->mem, \ + .size = fbuf->size, \ + .pos = 0, \ + }) + +/** + * Consume bytes from a memory buffer iterator + * + * @param iter memory buffer iterator + * @param len number of bytes to consume + * @return pointer to start of consumed bytes or + * NULL if advancing beyond end of buffer + */ +void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len); + +/** + * Consume a NUL-terminated string from a memory buffer iterator + * + * @param iter memory buffer iterator + * @return pointer to the string or + * NULL if advancing beyond end of buffer or there is no NUL-terminator + */ +const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter); + +/* + * Signal handling + */ +/** + * Exit session on HUP, TERM and INT signals and ignore PIPE signal + * + * Stores session in a global variable. May only be called once per + * process until fuse_remove_signal_handlers() is called. + * + * Once either of the POSIX signals arrives, the signal handler calls + * fuse_session_exit(). + * + * @param se the session to exit + * @return 0 on success, -1 on failure + * + * See also: + * fuse_remove_signal_handlers() + */ +int fuse_set_signal_handlers(struct fuse_session *se); + +/** + * Restore default signal handlers + * + * Resets global session. After this fuse_set_signal_handlers() may + * be called again. + * + * @param se the same session as given in fuse_set_signal_handlers() + * + * See also: + * fuse_set_signal_handlers() + */ +void fuse_remove_signal_handlers(struct fuse_session *se); + +/* + * Compatibility stuff + */ + +#if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30 +#error only API version 30 or greater is supported +#endif + + +/* + * This interface uses 64 bit off_t. + * + * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags! 
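+ *
+ * For example (illustrative, editorial addition, not part of the original
+ * comment), the flag can be passed directly to the compiler or added to
+ * CFLAGS in the build system:
+ *
+ *   cc -D_FILE_OFFSET_BITS=64 -c my_filesystem.c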
+ */ + +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 6) && \ + !defined __cplusplus +_Static_assert(sizeof(off_t) == 8, "fuse: off_t must be 64bit"); +#else +struct _fuse_off_t_must_be_64bit_dummy_struct { + unsigned _fuse_off_t_must_be_64bit:((sizeof(off_t) == 8) ? 1 : -1); +}; +#endif + +#endif /* FUSE_COMMON_H_ */ diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h new file mode 100644 index 0000000000..4e47e5880d --- /dev/null +++ b/tools/virtiofsd/fuse_i.h @@ -0,0 +1,115 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#ifndef FUSE_I_H +#define FUSE_I_H + +#define FUSE_USE_VERSION 31 +#include "fuse.h" +#include "fuse_lowlevel.h" + +struct fv_VuDev; +struct fv_QueueInfo; + +struct fuse_req { + struct fuse_session *se; + uint64_t unique; + int ctr; + pthread_mutex_t lock; + struct fuse_ctx ctx; + struct fuse_chan *ch; + int interrupted; + unsigned int ioctl_64bit:1; + union { + struct { + uint64_t unique; + } i; + struct { + fuse_interrupt_func_t func; + void *data; + } ni; + } u; + struct fuse_req *next; + struct fuse_req *prev; +}; + +struct fuse_notify_req { + uint64_t unique; + void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, + const void *, const struct fuse_buf *); + struct fuse_notify_req *next; + struct fuse_notify_req *prev; +}; + +struct fuse_session { + char *mountpoint; + volatile int exited; + int fd; + int debug; + int deny_others; + struct fuse_lowlevel_ops op; + int got_init; + struct cuse_data *cuse_data; + void *userdata; + uid_t owner; + struct fuse_conn_info conn; + struct fuse_req list; + struct fuse_req interrupts; + pthread_mutex_t lock; + pthread_rwlock_t init_rwlock; + int got_destroy; + int broken_splice_nonblock; + uint64_t notify_ctr; + struct fuse_notify_req notify_list; + size_t bufsize; + int error; + char *vu_socket_path; + int vu_listen_fd; + int vu_socketfd; + struct fv_VuDev *virtio_dev; + int thread_pool_size; +}; + +struct fuse_chan { + pthread_mutex_t lock; + int ctr; + int fd; + struct fv_QueueInfo *qi; +}; + +/** + * Filesystem module + * + * Filesystem modules are registered with the FUSE_REGISTER_MODULE() + * macro. + * + */ +struct fuse_module { + char *name; + fuse_module_factory_t factory; + struct fuse_module *next; + struct fusemod_so *so; + int ctr; +}; + +int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count); +void fuse_free_req(fuse_req_t req); + +void fuse_session_process_buf_int(struct fuse_session *se, + struct fuse_bufvec *bufv, + struct fuse_chan *ch); + + +#define FUSE_MAX_MAX_PAGES 256 +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/* room needed in buffer to accommodate header */ +#define FUSE_BUFFER_HEADER_SIZE 0x1000 + +#endif diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c new file mode 100644 index 0000000000..c301ff6da1 --- /dev/null +++ b/tools/virtiofsd/fuse_log.c @@ -0,0 +1,41 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2019 Red Hat, Inc. + * + * Logging API. + * + * This program can be distributed under the terms of the GNU LGPLv2. 
+ * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_log.h" + +#include <stdarg.h> +#include <stdio.h> + +static void default_log_func(__attribute__((unused)) enum fuse_log_level level, + const char *fmt, va_list ap) +{ + vfprintf(stderr, fmt, ap); +} + +static fuse_log_func_t log_func = default_log_func; + +void fuse_set_log_func(fuse_log_func_t func) +{ + if (!func) { + func = default_log_func; + } + + log_func = func; +} + +void fuse_log(enum fuse_log_level level, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + log_func(level, fmt, ap); + va_end(ap); +} diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h new file mode 100644 index 0000000000..bf6c11ff11 --- /dev/null +++ b/tools/virtiofsd/fuse_log.h @@ -0,0 +1,74 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2019 Red Hat, Inc. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#ifndef FUSE_LOG_H_ +#define FUSE_LOG_H_ + +/** @file + * + * This file defines the logging interface of FUSE + */ + +#include <stdarg.h> + +/** + * Log severity level + * + * These levels correspond to syslog(2) log levels since they are widely used. + */ +enum fuse_log_level { + FUSE_LOG_EMERG, + FUSE_LOG_ALERT, + FUSE_LOG_CRIT, + FUSE_LOG_ERR, + FUSE_LOG_WARNING, + FUSE_LOG_NOTICE, + FUSE_LOG_INFO, + FUSE_LOG_DEBUG +}; + +/** + * Log message handler function. + * + * This function must be thread-safe. It may be called from any libfuse + * function, including fuse_parse_cmdline() and other functions invoked before + * a FUSE filesystem is created. + * + * Install a custom log message handler function using fuse_set_log_func(). + * + * @param level log severity level + * @param fmt sprintf-style format string including newline + * @param ap format string arguments + */ +typedef void (*fuse_log_func_t)(enum fuse_log_level level, const char *fmt, + va_list ap); + +/** + * Install a custom log handler function. + * + * Log messages are emitted by libfuse functions to report errors and debug + * information. Messages are printed to stderr by default but this can be + * overridden by installing a custom log message handler function. + * + * The log message handler function is global and affects all FUSE filesystems + * created within this process. + * + * @param func a custom log message handler function or NULL to revert to + * the default + */ +void fuse_set_log_func(fuse_log_func_t func); + +/** + * Emit a log message + * + * @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc) + * @param fmt sprintf-style format string including newline + */ +void fuse_log(enum fuse_log_level level, const char *fmt, ...); + +#endif /* FUSE_LOG_H_ */ diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c new file mode 100644 index 0000000000..de2e2e0c65 --- /dev/null +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -0,0 +1,2761 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Implementation of (most of) the low-level FUSE API. The session loop + * functions are implemented in separate files. + * + * This program can be distributed under the terms of the GNU LGPLv2. 
+ * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "standard-headers/linux/fuse.h" +#include "fuse_misc.h" +#include "fuse_opt.h" +#include "fuse_virtio.h" + +#include <assert.h> +#include <errno.h> +#include <glib.h> +#include <limits.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/file.h> +#include <unistd.h> + +#define THREAD_POOL_SIZE 64 + +#define OFFSET_MAX 0x7fffffffffffffffLL + +struct fuse_pollhandle { + uint64_t kh; + struct fuse_session *se; +}; + +static size_t pagesize; + +static __attribute__((constructor)) void fuse_ll_init_pagesize(void) +{ + pagesize = getpagesize(); +} + +static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) +{ + *attr = (struct fuse_attr){ + .ino = stbuf->st_ino, + .mode = stbuf->st_mode, + .nlink = stbuf->st_nlink, + .uid = stbuf->st_uid, + .gid = stbuf->st_gid, + .rdev = stbuf->st_rdev, + .size = stbuf->st_size, + .blksize = stbuf->st_blksize, + .blocks = stbuf->st_blocks, + .atime = stbuf->st_atime, + .mtime = stbuf->st_mtime, + .ctime = stbuf->st_ctime, + .atimensec = ST_ATIM_NSEC(stbuf), + .mtimensec = ST_MTIM_NSEC(stbuf), + .ctimensec = ST_CTIM_NSEC(stbuf), + }; +} + +static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) +{ + stbuf->st_mode = attr->mode; + stbuf->st_uid = attr->uid; + stbuf->st_gid = attr->gid; + stbuf->st_size = attr->size; + stbuf->st_atime = attr->atime; + stbuf->st_mtime = attr->mtime; + stbuf->st_ctime = attr->ctime; + ST_ATIM_NSEC_SET(stbuf, attr->atimensec); + ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); + ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); +} + +static size_t iov_length(const struct iovec *iov, size_t count) +{ + size_t seg; + size_t ret = 0; + + for (seg = 0; seg < count; seg++) { + ret += iov[seg].iov_len; + } + return ret; +} + +static void list_init_req(struct fuse_req *req) +{ + req->next = req; + req->prev = req; +} + +static void list_del_req(struct fuse_req *req) +{ + struct fuse_req *prev = req->prev; + struct fuse_req *next = req->next; + prev->next = next; + next->prev = prev; +} + +static void list_add_req(struct fuse_req *req, struct fuse_req *next) +{ + struct fuse_req *prev = next->prev; + req->next = next; + req->prev = prev; + prev->next = req; + next->prev = req; +} + +static void destroy_req(fuse_req_t req) +{ + pthread_mutex_destroy(&req->lock); + free(req); +} + +void fuse_free_req(fuse_req_t req) +{ + int ctr; + struct fuse_session *se = req->se; + + pthread_mutex_lock(&se->lock); + req->u.ni.func = NULL; + req->u.ni.data = NULL; + list_del_req(req); + ctr = --req->ctr; + req->ch = NULL; + pthread_mutex_unlock(&se->lock); + if (!ctr) { + destroy_req(req); + } +} + +static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se) +{ + struct fuse_req *req; + + req = (struct fuse_req *)calloc(1, sizeof(struct fuse_req)); + if (req == NULL) { + fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); + } else { + req->se = se; + req->ctr = 1; + list_init_req(req); + fuse_mutex_init(&req->lock); + } + + return req; +} + +/* Send data. 
If *ch* is NULL, send via session master fd */ +static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count) +{ + struct fuse_out_header *out = iov[0].iov_base; + + out->len = iov_length(iov, count); + if (out->unique == 0) { + fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, + out->len); + } else if (out->error) { + fuse_log(FUSE_LOG_DEBUG, + " unique: %llu, error: %i (%s), outsize: %i\n", + (unsigned long long)out->unique, out->error, + strerror(-out->error), out->len); + } else { + fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", + (unsigned long long)out->unique, out->len); + } + + if (fuse_lowlevel_is_virtio(se)) { + return virtio_send_msg(se, ch, iov, count); + } + + abort(); /* virtio should have taken it before here */ + return 0; +} + + +int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count) +{ + struct fuse_out_header out = { + .unique = req->unique, + .error = error, + }; + + if (error <= -1000 || error > 0) { + fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); + error = -ERANGE; + } + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + + return fuse_send_msg(req->se, req->ch, iov, count); +} + +static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov, + int count) +{ + int res; + + res = fuse_send_reply_iov_nofree(req, error, iov, count); + fuse_free_req(req); + return res; +} + +static int send_reply(fuse_req_t req, int error, const void *arg, + size_t argsize) +{ + struct iovec iov[2]; + int count = 1; + if (argsize) { + iov[1].iov_base = (void *)arg; + iov[1].iov_len = argsize; + count++; + } + return send_reply_iov(req, error, iov, count); +} + +int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count) +{ + int res; + struct iovec *padded_iov; + + padded_iov = malloc((count + 1) * sizeof(struct iovec)); + if (padded_iov == NULL) { + return fuse_reply_err(req, ENOMEM); + } + + memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); + count++; + + res = send_reply_iov(req, 0, padded_iov, count); + free(padded_iov); + + return res; +} + + +/* + * 'buf` is allowed to be empty so that the proper size may be + * allocated by the caller + */ +size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + const char *name, const struct stat *stbuf, off_t off) +{ + (void)req; + size_t namelen; + size_t entlen; + size_t entlen_padded; + struct fuse_dirent *dirent; + + namelen = strlen(name); + entlen = FUSE_NAME_OFFSET + namelen; + entlen_padded = FUSE_DIRENT_ALIGN(entlen); + + if ((buf == NULL) || (entlen_padded > bufsize)) { + return entlen_padded; + } + + dirent = (struct fuse_dirent *)buf; + dirent->ino = stbuf->st_ino; + dirent->off = off; + dirent->namelen = namelen; + dirent->type = (stbuf->st_mode & S_IFMT) >> 12; + memcpy(dirent->name, name, namelen); + memset(dirent->name + namelen, 0, entlen_padded - entlen); + + return entlen_padded; +} + +static void convert_statfs(const struct statvfs *stbuf, + struct fuse_kstatfs *kstatfs) +{ + *kstatfs = (struct fuse_kstatfs){ + .bsize = stbuf->f_bsize, + .frsize = stbuf->f_frsize, + .blocks = stbuf->f_blocks, + .bfree = stbuf->f_bfree, + .bavail = stbuf->f_bavail, + .files = stbuf->f_files, + .ffree = stbuf->f_ffree, + .namelen = stbuf->f_namemax, + }; +} + +static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) +{ + return send_reply(req, 0, arg, argsize); +} + +int fuse_reply_err(fuse_req_t req, int err) +{ + return 
send_reply(req, -err, NULL, 0); +} + +void fuse_reply_none(fuse_req_t req) +{ + fuse_free_req(req); +} + +static unsigned long calc_timeout_sec(double t) +{ + if (t > (double)ULONG_MAX) { + return ULONG_MAX; + } else if (t < 0.0) { + return 0; + } else { + return (unsigned long)t; + } +} + +static unsigned int calc_timeout_nsec(double t) +{ + double f = t - (double)calc_timeout_sec(t); + if (f < 0.0) { + return 0; + } else if (f >= 0.999999999) { + return 999999999; + } else { + return (unsigned int)(f * 1.0e9); + } +} + +static void fill_entry(struct fuse_entry_out *arg, + const struct fuse_entry_param *e) +{ + *arg = (struct fuse_entry_out){ + .nodeid = e->ino, + .generation = e->generation, + .entry_valid = calc_timeout_sec(e->entry_timeout), + .entry_valid_nsec = calc_timeout_nsec(e->entry_timeout), + .attr_valid = calc_timeout_sec(e->attr_timeout), + .attr_valid_nsec = calc_timeout_nsec(e->attr_timeout), + }; + convert_stat(&e->attr, &arg->attr); +} + +/* + * `buf` is allowed to be empty so that the proper size may be + * allocated by the caller + */ +size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + const char *name, + const struct fuse_entry_param *e, off_t off) +{ + (void)req; + size_t namelen; + size_t entlen; + size_t entlen_padded; + + namelen = strlen(name); + entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; + entlen_padded = FUSE_DIRENT_ALIGN(entlen); + if ((buf == NULL) || (entlen_padded > bufsize)) { + return entlen_padded; + } + + struct fuse_direntplus *dp = (struct fuse_direntplus *)buf; + memset(&dp->entry_out, 0, sizeof(dp->entry_out)); + fill_entry(&dp->entry_out, e); + + struct fuse_dirent *dirent = &dp->dirent; + *dirent = (struct fuse_dirent){ + .ino = e->attr.st_ino, + .off = off, + .namelen = namelen, + .type = (e->attr.st_mode & S_IFMT) >> 12, + }; + memcpy(dirent->name, name, namelen); + memset(dirent->name + namelen, 0, entlen_padded - entlen); + + return entlen_padded; +} + +static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f) +{ + arg->fh = f->fh; + if (f->direct_io) { + arg->open_flags |= FOPEN_DIRECT_IO; + } + if (f->keep_cache) { + arg->open_flags |= FOPEN_KEEP_CACHE; + } + if (f->cache_readdir) { + arg->open_flags |= FOPEN_CACHE_DIR; + } + if (f->nonseekable) { + arg->open_flags |= FOPEN_NONSEEKABLE; + } +} + +int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) +{ + struct fuse_entry_out arg; + size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + fill_entry(&arg, e); + return send_reply_ok(req, &arg, size); +} + +int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + const struct fuse_file_info *f) +{ + char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; + size_t entrysize = sizeof(struct fuse_entry_out); + struct fuse_entry_out *earg = (struct fuse_entry_out *)buf; + struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize); + + memset(buf, 0, sizeof(buf)); + fill_entry(earg, e); + fill_open(oarg, f); + return send_reply_ok(req, buf, entrysize + sizeof(struct fuse_open_out)); +} + +int fuse_reply_attr(fuse_req_t req, const struct stat *attr, + double attr_timeout) +{ + struct fuse_attr_out arg; + size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + arg.attr_valid = calc_timeout_sec(attr_timeout); + arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); + convert_stat(attr, &arg.attr); + + return send_reply_ok(req, &arg, size); +} + +int fuse_reply_readlink(fuse_req_t req, const char *linkname) +{ + return 
send_reply_ok(req, linkname, strlen(linkname)); +} + +int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f) +{ + struct fuse_open_out arg; + + memset(&arg, 0, sizeof(arg)); + fill_open(&arg, f); + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_write(fuse_req_t req, size_t count) +{ + struct fuse_write_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.size = count; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size) +{ + return send_reply_ok(req, buf, size); +} + +static int fuse_send_data_iov_fallback(struct fuse_session *se, + struct fuse_chan *ch, struct iovec *iov, + int iov_count, struct fuse_bufvec *buf, + size_t len) +{ + /* Optimize common case */ + if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && + !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { + /* + * FIXME: also avoid memory copy if there are multiple buffers + * but none of them contain an fd + */ + + iov[iov_count].iov_base = buf->buf[0].mem; + iov[iov_count].iov_len = len; + iov_count++; + return fuse_send_msg(se, ch, iov, iov_count); + } + + if (fuse_lowlevel_is_virtio(se) && buf->count == 1 && + buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) { + return virtio_send_data_iov(se, ch, iov, iov_count, buf, len); + } + + abort(); /* Will have taken vhost path */ + return 0; +} + +static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int iov_count, + struct fuse_bufvec *buf) +{ + size_t len = fuse_buf_size(buf); + + return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); +} + +int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) +{ + struct iovec iov[2]; + struct fuse_out_header out = { + .unique = req->unique, + }; + int res; + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + + res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv); + if (res <= 0) { + fuse_free_req(req); + return res; + } else { + return fuse_reply_err(req, res); + } +} + +int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) +{ + struct fuse_statfs_out arg; + size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + convert_statfs(stbuf, &arg.st); + + return send_reply_ok(req, &arg, size); +} + +int fuse_reply_xattr(fuse_req_t req, size_t count) +{ + struct fuse_getxattr_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.size = count; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_lock(fuse_req_t req, const struct flock *lock) +{ + struct fuse_lk_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.lk.type = lock->l_type; + if (lock->l_type != F_UNLCK) { + arg.lk.start = lock->l_start; + if (lock->l_len == 0) { + arg.lk.end = OFFSET_MAX; + } else { + arg.lk.end = lock->l_start + lock->l_len - 1; + } + } + arg.lk.pid = lock->l_pid; + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_bmap(fuse_req_t req, uint64_t idx) +{ + struct fuse_bmap_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.block = idx; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov, + size_t count) +{ + struct fuse_ioctl_iovec *fiov; + size_t i; + + fiov = malloc(sizeof(fiov[0]) * count); + if (!fiov) { + return NULL; + } + + for (i = 0; i < count; i++) { + fiov[i].base = (uintptr_t)iov[i].iov_base; + fiov[i].len = iov[i].iov_len; + } + + return fiov; +} + +int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, + size_t in_count, const 
struct iovec *out_iov, + size_t out_count) +{ + struct fuse_ioctl_out arg; + struct fuse_ioctl_iovec *in_fiov = NULL; + struct fuse_ioctl_iovec *out_fiov = NULL; + struct iovec iov[4]; + size_t count = 1; + int res; + + memset(&arg, 0, sizeof(arg)); + arg.flags |= FUSE_IOCTL_RETRY; + arg.in_iovs = in_count; + arg.out_iovs = out_count; + iov[count].iov_base = &arg; + iov[count].iov_len = sizeof(arg); + count++; + + /* Can't handle non-compat 64bit ioctls on 32bit */ + if (sizeof(void *) == 4 && req->ioctl_64bit) { + res = fuse_reply_err(req, EINVAL); + goto out; + } + + if (in_count) { + in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); + if (!in_fiov) { + goto enomem; + } + + iov[count].iov_base = (void *)in_fiov; + iov[count].iov_len = sizeof(in_fiov[0]) * in_count; + count++; + } + if (out_count) { + out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); + if (!out_fiov) { + goto enomem; + } + + iov[count].iov_base = (void *)out_fiov; + iov[count].iov_len = sizeof(out_fiov[0]) * out_count; + count++; + } + + res = send_reply_iov(req, 0, iov, count); +out: + free(in_fiov); + free(out_fiov); + + return res; + +enomem: + res = fuse_reply_err(req, ENOMEM); + goto out; +} + +int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size) +{ + struct fuse_ioctl_out arg; + struct iovec iov[3]; + size_t count = 1; + + memset(&arg, 0, sizeof(arg)); + arg.result = result; + iov[count].iov_base = &arg; + iov[count].iov_len = sizeof(arg); + count++; + + if (size) { + iov[count].iov_base = (char *)buf; + iov[count].iov_len = size; + count++; + } + + return send_reply_iov(req, 0, iov, count); +} + +int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, + int count) +{ + struct iovec *padded_iov; + struct fuse_ioctl_out arg; + int res; + + padded_iov = malloc((count + 2) * sizeof(struct iovec)); + if (padded_iov == NULL) { + return fuse_reply_err(req, ENOMEM); + } + + memset(&arg, 0, sizeof(arg)); + arg.result = result; + padded_iov[1].iov_base = &arg; + padded_iov[1].iov_len = sizeof(arg); + + memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); + + res = send_reply_iov(req, 0, padded_iov, count + 2); + free(padded_iov); + + return res; +} + +int fuse_reply_poll(fuse_req_t req, unsigned revents) +{ + struct fuse_poll_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.revents = revents; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_lseek(fuse_req_t req, off_t off) +{ + struct fuse_lseek_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.offset = off; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.lookup) { + req->se->op.lookup(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_forget(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_forget_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.forget) { + req->se->op.forget(req, nodeid, arg->nlookup); + } else { + fuse_reply_none(req); + } +} + +static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_batch_forget_in *arg; + struct fuse_forget_data *forgets; + size_t scount; + + (void)nodeid; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if 
(!arg) { + fuse_reply_none(req); + return; + } + + /* + * Prevent integer overflow. The compiler emits the following warning + * unless we use the scount local variable: + * + * error: comparison is always false due to limited range of data type + * [-Werror=type-limits] + * + * This may be true on 64-bit hosts but we need this check for 32-bit + * hosts. + */ + scount = arg->count; + if (scount > SIZE_MAX / sizeof(forgets[0])) { + fuse_reply_none(req); + return; + } + + forgets = fuse_mbuf_iter_advance(iter, arg->count * sizeof(forgets[0])); + if (!forgets) { + fuse_reply_none(req); + return; + } + + if (req->se->op.forget_multi) { + req->se->op.forget_multi(req, arg->count, forgets); + } else if (req->se->op.forget) { + unsigned int i; + + for (i = 0; i < arg->count; i++) { + struct fuse_req *dummy_req; + + dummy_req = fuse_ll_alloc_req(req->se); + if (dummy_req == NULL) { + break; + } + + dummy_req->unique = req->unique; + dummy_req->ctx = req->ctx; + dummy_req->ch = NULL; + + req->se->op.forget(dummy_req, forgets[i].ino, forgets[i].nlookup); + } + fuse_reply_none(req); + } else { + fuse_reply_none(req); + } +} + +static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_file_info *fip = NULL; + struct fuse_file_info fi; + + struct fuse_getattr_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (arg->getattr_flags & FUSE_GETATTR_FH) { + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fip = &fi; + } + + if (req->se->op.getattr) { + req->se->op.getattr(req, nodeid, fip); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.setattr) { + struct fuse_setattr_in *arg; + struct fuse_file_info *fi = NULL; + struct fuse_file_info fi_store; + struct stat stbuf; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&stbuf, 0, sizeof(stbuf)); + convert_attr(arg, &stbuf); + if (arg->valid & FATTR_FH) { + arg->valid &= ~FATTR_FH; + memset(&fi_store, 0, sizeof(fi_store)); + fi = &fi_store; + fi->fh = arg->fh; + } + arg->valid &= FUSE_SET_ATTR_MODE | FUSE_SET_ATTR_UID | + FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE | + FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME | + FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW | + FUSE_SET_ATTR_CTIME; + + req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_access(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_access_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.access) { + req->se->op.access(req, nodeid, arg->mask); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + (void)iter; + + if (req->se->op.readlink) { + req->se->op.readlink(req, nodeid); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_mknod_in *arg; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + req->ctx.umask = arg->umask; + + if (req->se->op.mknod) { + 
req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_mkdir_in *arg; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + req->ctx.umask = arg->umask; + + if (req->se->op.mkdir) { + req->se->op.mkdir(req, nodeid, name, arg->mode); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.unlink) { + req->se->op.unlink(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.rmdir) { + req->se->op.rmdir(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + const char *linkname = fuse_mbuf_iter_advance_str(iter); + + if (!name || !linkname) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.symlink) { + req->se->op.symlink(req, linkname, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_rename(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_rename_in *arg; + const char *oldname; + const char *newname; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + oldname = fuse_mbuf_iter_advance_str(iter); + newname = fuse_mbuf_iter_advance_str(iter); + if (!arg || !oldname || !newname) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_rename2_in *arg; + const char *oldname; + const char *newname; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + oldname = fuse_mbuf_iter_advance_str(iter); + newname = fuse_mbuf_iter_advance_str(iter); + if (!arg || !oldname || !newname) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, + arg->flags); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_link(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_link_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.link) { + req->se->op.link(req, arg->oldnodeid, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_create(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.create) { + struct fuse_create_in *arg; + struct fuse_file_info fi; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + 
memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + + req->ctx.umask = arg->umask; + + req->se->op.create(req, nodeid, name, arg->mode, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_open(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_open_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + + if (req->se->op.open) { + req->se->op.open(req, nodeid, &fi); + } else { + fuse_reply_open(req, &fi); + } +} + +static void do_read(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.read) { + struct fuse_read_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_write(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_write_in *arg; + struct fuse_file_info fi; + const char *param; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + param = fuse_mbuf_iter_advance(iter, arg->size); + if (!param) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; + fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + + if (req->se->op.write) { + req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter, struct fuse_bufvec *ibufv) +{ + struct fuse_session *se = req->se; + struct fuse_bufvec *pbufv = ibufv; + struct fuse_bufvec tmpbufv = { + .buf[0] = ibufv->buf[0], + .count = 1, + }; + struct fuse_write_in *arg; + size_t arg_size = sizeof(*arg); + struct fuse_file_info fi; + + memset(&fi, 0, sizeof(fi)); + + arg = fuse_mbuf_iter_advance(iter, arg_size); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + fi.fh = arg->fh; + fi.writepage = !!(arg->write_flags & FUSE_WRITE_CACHE); + fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + if (ibufv->count == 1) { + assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)); + tmpbufv.buf[0].mem = ((char *)arg) + arg_size; + tmpbufv.buf[0].size -= sizeof(struct fuse_in_header) + arg_size; + pbufv = &tmpbufv; + } else { + /* + * Input bufv contains the headers in the first element + * and the data in the rest, we need to skip that first element + */ + ibufv->buf[0].size = 0; + } + + if (fuse_buf_size(pbufv) != arg->size) { + fuse_log(FUSE_LOG_ERR, + "fuse: do_write_buf: buffer size doesn't match arg->size\n"); + fuse_reply_err(req, EIO); + return; + } + + se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); +} + +static void do_flush(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_flush_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.flush = 1; + fi.lock_owner = 
arg->lock_owner; + + if (req->se->op.flush) { + req->se->op.flush(req, nodeid, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_release(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_release_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; + fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; + fi.lock_owner = arg->lock_owner; + + if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { + fi.flock_release = 1; + } + + if (req->se->op.release) { + req->se->op.release(req, nodeid, &fi); + } else { + fuse_reply_err(req, 0); + } +} + +static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_fsync_in *arg; + struct fuse_file_info fi; + int datasync; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.fsync) { + if (fi.fh == (uint64_t)-1) { + req->se->op.fsync(req, nodeid, datasync, NULL); + } else { + req->se->op.fsync(req, nodeid, datasync, &fi); + } + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_open_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + + if (req->se->op.opendir) { + req->se->op.opendir(req, nodeid, &fi); + } else { + fuse_reply_open(req, &fi); + } +} + +static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_read_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.readdir) { + req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_read_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.readdirplus) { + req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_release_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; + + if (req->se->op.releasedir) { + req->se->op.releasedir(req, nodeid, &fi); + } else { + fuse_reply_err(req, 0); + } +} + +static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_fsync_in *arg; + struct fuse_file_info fi; + int datasync; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); 
+ fi.fh = arg->fh; + + if (req->se->op.fsyncdir) { + req->se->op.fsyncdir(req, nodeid, datasync, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + (void)nodeid; + (void)iter; + + if (req->se->op.statfs) { + req->se->op.statfs(req, nodeid); + } else { + struct statvfs buf = { + .f_namemax = 255, + .f_bsize = 512, + }; + fuse_reply_statfs(req, &buf); + } +} + +static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_setxattr_in *arg; + const char *name; + const char *value; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + value = fuse_mbuf_iter_advance(iter, arg->size); + if (!value) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.setxattr) { + req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_getxattr_in *arg; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.getxattr) { + req->se->op.getxattr(req, nodeid, name, arg->size); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_getxattr_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.listxattr) { + req->se->op.listxattr(req, nodeid, arg->size); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.removexattr) { + req->se->op.removexattr(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void convert_fuse_file_lock(struct fuse_file_lock *fl, + struct flock *flock) +{ + memset(flock, 0, sizeof(struct flock)); + flock->l_type = fl->type; + flock->l_whence = SEEK_SET; + flock->l_start = fl->start; + if (fl->end == OFFSET_MAX) { + flock->l_len = 0; + } else { + flock->l_len = fl->end - fl->start + 1; + } + flock->l_pid = fl->pid; +} + +static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; + + convert_fuse_file_lock(&arg->lk, &flock); + if (req->se->op.getlk) { + req->se->op.getlk(req, nodeid, &fi, &flock); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter, int sleep) +{ + struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; + + if (arg->lk_flags & FUSE_LK_FLOCK) { + int op = 0; 
+ + switch (arg->lk.type) { + case F_RDLCK: + op = LOCK_SH; + break; + case F_WRLCK: + op = LOCK_EX; + break; + case F_UNLCK: + op = LOCK_UN; + break; + } + if (!sleep) { + op |= LOCK_NB; + } + + if (req->se->op.flock) { + req->se->op.flock(req, nodeid, &fi, op); + } else { + fuse_reply_err(req, ENOSYS); + } + } else { + convert_fuse_file_lock(&arg->lk, &flock); + if (req->se->op.setlk) { + req->se->op.setlk(req, nodeid, &fi, &flock, sleep); + } else { + fuse_reply_err(req, ENOSYS); + } + } +} + +static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + do_setlk_common(req, nodeid, iter, 0); +} + +static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + do_setlk_common(req, nodeid, iter, 1); +} + +static int find_interrupted(struct fuse_session *se, struct fuse_req *req) +{ + struct fuse_req *curr; + + for (curr = se->list.next; curr != &se->list; curr = curr->next) { + if (curr->unique == req->u.i.unique) { + fuse_interrupt_func_t func; + void *data; + + curr->ctr++; + pthread_mutex_unlock(&se->lock); + + /* Ugh, ugly locking */ + pthread_mutex_lock(&curr->lock); + pthread_mutex_lock(&se->lock); + curr->interrupted = 1; + func = curr->u.ni.func; + data = curr->u.ni.data; + pthread_mutex_unlock(&se->lock); + if (func) { + func(curr, data); + } + pthread_mutex_unlock(&curr->lock); + + pthread_mutex_lock(&se->lock); + curr->ctr--; + if (!curr->ctr) { + destroy_req(curr); + } + + return 1; + } + } + for (curr = se->interrupts.next; curr != &se->interrupts; + curr = curr->next) { + if (curr->u.i.unique == req->u.i.unique) { + return 1; + } + } + return 0; +} + +static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_interrupt_in *arg; + struct fuse_session *se = req->se; + + (void)nodeid; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", + (unsigned long long)arg->unique); + + req->u.i.unique = arg->unique; + + pthread_mutex_lock(&se->lock); + if (find_interrupted(se, req)) { + destroy_req(req); + } else { + list_add_req(req, &se->interrupts); + } + pthread_mutex_unlock(&se->lock); +} + +static struct fuse_req *check_interrupt(struct fuse_session *se, + struct fuse_req *req) +{ + struct fuse_req *curr; + + for (curr = se->interrupts.next; curr != &se->interrupts; + curr = curr->next) { + if (curr->u.i.unique == req->unique) { + req->interrupted = 1; + list_del_req(curr); + free(curr); + return NULL; + } + } + curr = se->interrupts.next; + if (curr != &se->interrupts) { + list_del_req(curr); + list_init_req(curr); + return curr; + } else { + return NULL; + } +} + +static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_bmap_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.bmap) { + req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_ioctl_in *arg; + unsigned int flags; + void *in_buf = NULL; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + flags = arg->flags; + if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { + fuse_reply_err(req, ENOTTY); + return; + } 
+ + if (arg->in_size) { + in_buf = fuse_mbuf_iter_advance(iter, arg->in_size); + if (!in_buf) { + fuse_reply_err(req, EINVAL); + return; + } + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (sizeof(void *) == 4 && !(flags & FUSE_IOCTL_32BIT)) { + req->ioctl_64bit = 1; + } + + if (req->se->op.ioctl) { + req->se->op.ioctl(req, nodeid, arg->cmd, (void *)(uintptr_t)arg->arg, + &fi, flags, in_buf, arg->in_size, arg->out_size); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) +{ + free(ph); +} + +static void do_poll(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_poll_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.poll_events = arg->events; + + if (req->se->op.poll) { + struct fuse_pollhandle *ph = NULL; + + if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { + ph = malloc(sizeof(struct fuse_pollhandle)); + if (ph == NULL) { + fuse_reply_err(req, ENOMEM); + return; + } + ph->kh = arg->kh; + ph->se = req->se; + } + + req->se->op.poll(req, nodeid, &fi, ph); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_fallocate_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.fallocate) { + req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, + &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, + struct fuse_mbuf_iter *iter) +{ + struct fuse_copy_file_range_in *arg; + struct fuse_file_info fi_in, fi_out; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi_in, 0, sizeof(fi_in)); + fi_in.fh = arg->fh_in; + + memset(&fi_out, 0, sizeof(fi_out)); + fi_out.fh = arg->fh_out; + + + if (req->se->op.copy_file_range) { + req->se->op.copy_file_range(req, nodeid_in, arg->off_in, &fi_in, + arg->nodeid_out, arg->off_out, &fi_out, + arg->len, arg->flags); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_lseek_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.lseek) { + req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_init(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + size_t compat_size = offsetof(struct fuse_init_in, max_readahead); + struct fuse_init_in *arg; + struct fuse_init_out outarg; + struct fuse_session *se = req->se; + size_t bufsize = se->bufsize; + size_t outargsize = sizeof(outarg); + + (void)nodeid; + + /* First consume the old fields... */ + arg = fuse_mbuf_iter_advance(iter, compat_size); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + /* ...and now consume the new fields. 
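+     * (Editorial note for clarity, not part of the original comment:
+     * compat_size covers only the major/minor members of struct
+     * fuse_init_in; max_readahead and flags are only sent by clients
+     * speaking protocol 7.6 or newer, so they are consumed separately
+     * below.)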
*/ + if (arg->major == 7 && arg->minor >= 6) { + if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) { + fuse_reply_err(req, EINVAL); + return; + } + } + + fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); + if (arg->major == 7 && arg->minor >= 6) { + fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); + fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead); + } + se->conn.proto_major = arg->major; + se->conn.proto_minor = arg->minor; + se->conn.capable = 0; + se->conn.want = 0; + + memset(&outarg, 0, sizeof(outarg)); + outarg.major = FUSE_KERNEL_VERSION; + outarg.minor = FUSE_KERNEL_MINOR_VERSION; + + if (arg->major < 7 || (arg->major == 7 && arg->minor < 31)) { + fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", + arg->major, arg->minor); + fuse_reply_err(req, EPROTO); + return; + } + + if (arg->major > 7) { + /* Wait for a second INIT request with a 7.X version */ + send_reply_ok(req, &outarg, sizeof(outarg)); + return; + } + + if (arg->max_readahead < se->conn.max_readahead) { + se->conn.max_readahead = arg->max_readahead; + } + if (arg->flags & FUSE_ASYNC_READ) { + se->conn.capable |= FUSE_CAP_ASYNC_READ; + } + if (arg->flags & FUSE_POSIX_LOCKS) { + se->conn.capable |= FUSE_CAP_POSIX_LOCKS; + } + if (arg->flags & FUSE_ATOMIC_O_TRUNC) { + se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; + } + if (arg->flags & FUSE_EXPORT_SUPPORT) { + se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; + } + if (arg->flags & FUSE_DONT_MASK) { + se->conn.capable |= FUSE_CAP_DONT_MASK; + } + if (arg->flags & FUSE_FLOCK_LOCKS) { + se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; + } + if (arg->flags & FUSE_AUTO_INVAL_DATA) { + se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; + } + if (arg->flags & FUSE_DO_READDIRPLUS) { + se->conn.capable |= FUSE_CAP_READDIRPLUS; + } + if (arg->flags & FUSE_READDIRPLUS_AUTO) { + se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; + } + if (arg->flags & FUSE_ASYNC_DIO) { + se->conn.capable |= FUSE_CAP_ASYNC_DIO; + } + if (arg->flags & FUSE_WRITEBACK_CACHE) { + se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; + } + if (arg->flags & FUSE_NO_OPEN_SUPPORT) { + se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; + } + if (arg->flags & FUSE_PARALLEL_DIROPS) { + se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; + } + if (arg->flags & FUSE_POSIX_ACL) { + se->conn.capable |= FUSE_CAP_POSIX_ACL; + } + if (arg->flags & FUSE_HANDLE_KILLPRIV) { + se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; + } + if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { + se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; + } + if (!(arg->flags & FUSE_MAX_PAGES)) { + size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + + FUSE_BUFFER_HEADER_SIZE; + if (bufsize > max_bufsize) { + bufsize = max_bufsize; + } + } +#ifdef HAVE_SPLICE +#ifdef HAVE_VMSPLICE + se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; +#endif + se->conn.capable |= FUSE_CAP_SPLICE_READ; +#endif + se->conn.capable |= FUSE_CAP_IOCTL_DIR; + + /* + * Default settings for modern filesystems. + * + * Most of these capabilities were disabled by default in + * libfuse2 for backwards compatibility reasons. In libfuse3, + * we can finally enable them by default (as long as they're + * supported by the kernel). 
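+     *
+     * (Editorial note, not part of the original comment: a filesystem
+     * that does not want one of these defaults can clear the bit again
+     * in its init() handler, e.g.
+     *
+     *     conn->want &= ~FUSE_CAP_ASYNC_READ;
+     *
+     * because se->op.init() runs after these defaults are applied and
+     * before the want/capable consistency check.)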
+ */ +#define LL_SET_DEFAULT(cond, cap) \ + if ((cond) && (se->conn.capable & (cap))) \ + se->conn.want |= (cap) + LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); + LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); + LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); + LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); + LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); + LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); + LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); + LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); + LL_SET_DEFAULT(se->op.getlk && se->op.setlk, FUSE_CAP_POSIX_LOCKS); + LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); + LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); + LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, + FUSE_CAP_READDIRPLUS_AUTO); + se->conn.time_gran = 1; + + if (bufsize < FUSE_MIN_READ_BUFFER) { + fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", + bufsize); + bufsize = FUSE_MIN_READ_BUFFER; + } + se->bufsize = bufsize; + + if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) { + se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; + } + + se->got_init = 1; + se->got_destroy = 0; + if (se->op.init) { + se->op.init(se->userdata, &se->conn); + } + + if (se->conn.want & (~se->conn.capable)) { + fuse_log(FUSE_LOG_ERR, + "fuse: error: filesystem requested capabilities " + "0x%x that are not supported by kernel, aborting.\n", + se->conn.want & (~se->conn.capable)); + fuse_reply_err(req, EPROTO); + se->error = -EPROTO; + fuse_session_exit(se); + return; + } + + if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { + se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; + } + if (arg->flags & FUSE_MAX_PAGES) { + outarg.flags |= FUSE_MAX_PAGES; + outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; + } + + /* + * Always enable big writes, this is superseded + * by the max_write option + */ + outarg.flags |= FUSE_BIG_WRITES; + + if (se->conn.want & FUSE_CAP_ASYNC_READ) { + outarg.flags |= FUSE_ASYNC_READ; + } + if (se->conn.want & FUSE_CAP_PARALLEL_DIROPS) { + outarg.flags |= FUSE_PARALLEL_DIROPS; + } + if (se->conn.want & FUSE_CAP_POSIX_LOCKS) { + outarg.flags |= FUSE_POSIX_LOCKS; + } + if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) { + outarg.flags |= FUSE_ATOMIC_O_TRUNC; + } + if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) { + outarg.flags |= FUSE_EXPORT_SUPPORT; + } + if (se->conn.want & FUSE_CAP_DONT_MASK) { + outarg.flags |= FUSE_DONT_MASK; + } + if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) { + outarg.flags |= FUSE_FLOCK_LOCKS; + } + if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) { + outarg.flags |= FUSE_AUTO_INVAL_DATA; + } + if (se->conn.want & FUSE_CAP_READDIRPLUS) { + outarg.flags |= FUSE_DO_READDIRPLUS; + } + if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) { + outarg.flags |= FUSE_READDIRPLUS_AUTO; + } + if (se->conn.want & FUSE_CAP_ASYNC_DIO) { + outarg.flags |= FUSE_ASYNC_DIO; + } + if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) { + outarg.flags |= FUSE_WRITEBACK_CACHE; + } + if (se->conn.want & FUSE_CAP_POSIX_ACL) { + outarg.flags |= FUSE_POSIX_ACL; + } + outarg.max_readahead = se->conn.max_readahead; + outarg.max_write = se->conn.max_write; + if (se->conn.max_background >= (1 << 16)) { + se->conn.max_background = (1 << 16) - 1; + } + if (se->conn.congestion_threshold > se->conn.max_background) { + se->conn.congestion_threshold = se->conn.max_background; + } + if (!se->conn.congestion_threshold) { + se->conn.congestion_threshold = se->conn.max_background * 3 / 4; + } + + outarg.max_background = se->conn.max_background; + 
outarg.congestion_threshold = se->conn.congestion_threshold; + outarg.time_gran = se->conn.time_gran; + + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); + fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); + fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); + fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); + fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background); + fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", + outarg.congestion_threshold); + fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); + + send_reply_ok(req, &outarg, outargsize); +} + +static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_session *se = req->se; + + (void)nodeid; + (void)iter; + + se->got_destroy = 1; + se->got_init = 0; + if (se->op.destroy) { + se->op.destroy(se->userdata); + } + + send_reply_ok(req, NULL, 0); +} + +static int send_notify_iov(struct fuse_session *se, int notify_code, + struct iovec *iov, int count) +{ + struct fuse_out_header out = { + .error = notify_code, + }; + + if (!se->got_init) { + return -ENOTCONN; + } + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + + return fuse_send_msg(se, NULL, iov, count); +} + +int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph) +{ + if (ph != NULL) { + struct fuse_notify_poll_wakeup_out outarg = { + .kh = ph->kh, + }; + struct iovec iov[2]; + + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + + return send_notify_iov(ph->se, FUSE_NOTIFY_POLL, iov, 2); + } else { + return 0; + } +} + +int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + off_t off, off_t len) +{ + struct fuse_notify_inval_inode_out outarg = { + .ino = ino, + .off = off, + .len = len, + }; + struct iovec iov[2]; + + if (!se) { + return -EINVAL; + } + + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + + return send_notify_iov(se, FUSE_NOTIFY_INVAL_INODE, iov, 2); +} + +int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + const char *name, size_t namelen) +{ + struct fuse_notify_inval_entry_out outarg = { + .parent = parent, + .namelen = namelen, + }; + struct iovec iov[3]; + + if (!se) { + return -EINVAL; + } + + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + iov[2].iov_base = (void *)name; + iov[2].iov_len = namelen + 1; + + return send_notify_iov(se, FUSE_NOTIFY_INVAL_ENTRY, iov, 3); +} + +int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + fuse_ino_t child, const char *name, + size_t namelen) +{ + struct fuse_notify_delete_out outarg = { + .parent = parent, + .child = child, + .namelen = namelen, + }; + struct iovec iov[3]; + + if (!se) { + return -EINVAL; + } + + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + iov[2].iov_base = (void *)name; + iov[2].iov_len = namelen + 1; + + return send_notify_iov(se, FUSE_NOTIFY_DELETE, iov, 3); +} + +int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv) +{ + struct fuse_out_header out = { + .error = FUSE_NOTIFY_STORE, + }; + struct fuse_notify_store_out outarg = { + .nodeid = ino, + .offset = offset, + .size = fuse_buf_size(bufv), + }; + struct iovec iov[3]; + int res; + + if (!se) { + return -EINVAL; + } + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(out); + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + + res = 
fuse_send_data_iov(se, NULL, iov, 2, bufv); + if (res > 0) { + res = -res; + } + + return res; +} + +void *fuse_req_userdata(fuse_req_t req) +{ + return req->se->userdata; +} + +const struct fuse_ctx *fuse_req_ctx(fuse_req_t req) +{ + return &req->ctx; +} + +void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + void *data) +{ + pthread_mutex_lock(&req->lock); + pthread_mutex_lock(&req->se->lock); + req->u.ni.func = func; + req->u.ni.data = data; + pthread_mutex_unlock(&req->se->lock); + if (req->interrupted && func) { + func(req, data); + } + pthread_mutex_unlock(&req->lock); +} + +int fuse_req_interrupted(fuse_req_t req) +{ + int interrupted; + + pthread_mutex_lock(&req->se->lock); + interrupted = req->interrupted; + pthread_mutex_unlock(&req->se->lock); + + return interrupted; +} + +static struct { + void (*func)(fuse_req_t, fuse_ino_t, struct fuse_mbuf_iter *); + const char *name; +} fuse_ll_ops[] = { + [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, + [FUSE_FORGET] = { do_forget, "FORGET" }, + [FUSE_GETATTR] = { do_getattr, "GETATTR" }, + [FUSE_SETATTR] = { do_setattr, "SETATTR" }, + [FUSE_READLINK] = { do_readlink, "READLINK" }, + [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, + [FUSE_MKNOD] = { do_mknod, "MKNOD" }, + [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, + [FUSE_UNLINK] = { do_unlink, "UNLINK" }, + [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, + [FUSE_RENAME] = { do_rename, "RENAME" }, + [FUSE_LINK] = { do_link, "LINK" }, + [FUSE_OPEN] = { do_open, "OPEN" }, + [FUSE_READ] = { do_read, "READ" }, + [FUSE_WRITE] = { do_write, "WRITE" }, + [FUSE_STATFS] = { do_statfs, "STATFS" }, + [FUSE_RELEASE] = { do_release, "RELEASE" }, + [FUSE_FSYNC] = { do_fsync, "FSYNC" }, + [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, + [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, + [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, + [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, + [FUSE_FLUSH] = { do_flush, "FLUSH" }, + [FUSE_INIT] = { do_init, "INIT" }, + [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, + [FUSE_READDIR] = { do_readdir, "READDIR" }, + [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, + [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, + [FUSE_GETLK] = { do_getlk, "GETLK" }, + [FUSE_SETLK] = { do_setlk, "SETLK" }, + [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, + [FUSE_ACCESS] = { do_access, "ACCESS" }, + [FUSE_CREATE] = { do_create, "CREATE" }, + [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, + [FUSE_BMAP] = { do_bmap, "BMAP" }, + [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, + [FUSE_POLL] = { do_poll, "POLL" }, + [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, + [FUSE_DESTROY] = { do_destroy, "DESTROY" }, + [FUSE_NOTIFY_REPLY] = { NULL, "NOTIFY_REPLY" }, + [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, + [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" }, + [FUSE_RENAME2] = { do_rename2, "RENAME2" }, + [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, + [FUSE_LSEEK] = { do_lseek, "LSEEK" }, +}; + +#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) + +static const char *opname(enum fuse_opcode opcode) +{ + if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) { + return "???"; + } else { + return fuse_ll_ops[opcode].name; + } +} + +void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf) +{ + struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; + fuse_session_process_buf_int(se, &bufv, NULL); +} + +/* + * Restriction: + * bufv is normally a single entry buffer, except for a write + * where (if it's in memory) 
then the bufv may be multiple entries, + * where the first entry contains all headers and subsequent entries + * contain data + * bufv shall not use any offsets etc to make the data anything + * other than contiguous starting from 0. + */ +void fuse_session_process_buf_int(struct fuse_session *se, + struct fuse_bufvec *bufv, + struct fuse_chan *ch) +{ + const struct fuse_buf *buf = bufv->buf; + struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf); + struct fuse_in_header *in; + struct fuse_req *req; + int err; + + /* The first buffer must be a memory buffer */ + assert(!(buf->flags & FUSE_BUF_IS_FD)); + + in = fuse_mbuf_iter_advance(&iter, sizeof(*in)); + assert(in); /* caller guarantees the input buffer is large enough */ + + fuse_log( + FUSE_LOG_DEBUG, + "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", + (unsigned long long)in->unique, opname((enum fuse_opcode)in->opcode), + in->opcode, (unsigned long long)in->nodeid, buf->size, in->pid); + + req = fuse_ll_alloc_req(se); + if (req == NULL) { + struct fuse_out_header out = { + .unique = in->unique, + .error = -ENOMEM, + }; + struct iovec iov = { + .iov_base = &out, + .iov_len = sizeof(struct fuse_out_header), + }; + + fuse_send_msg(se, ch, &iov, 1); + return; + } + + req->unique = in->unique; + req->ctx.uid = in->uid; + req->ctx.gid = in->gid; + req->ctx.pid = in->pid; + req->ch = ch; + + /* + * INIT and DESTROY requests are serialized, all other request types + * run in parallel. This prevents races between FUSE_INIT and ordinary + * requests, FUSE_INIT and FUSE_INIT, FUSE_INIT and FUSE_DESTROY, and + * FUSE_DESTROY and FUSE_DESTROY. + */ + if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT || + in->opcode == FUSE_DESTROY) { + pthread_rwlock_wrlock(&se->init_rwlock); + } else { + pthread_rwlock_rdlock(&se->init_rwlock); + } + + err = EIO; + if (!se->got_init) { + enum fuse_opcode expected; + + expected = se->cuse_data ? 
CUSE_INIT : FUSE_INIT; + if (in->opcode != expected) { + goto reply_err; + } + } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) { + if (fuse_lowlevel_is_virtio(se)) { + /* + * TODO: This is after a hard reboot typically, we need to do + * a destroy, but we can't reply to this request yet so + * we can't use do_destroy + */ + fuse_log(FUSE_LOG_DEBUG, "%s: reinit\n", __func__); + se->got_destroy = 1; + se->got_init = 0; + if (se->op.destroy) { + se->op.destroy(se->userdata); + } + } else { + goto reply_err; + } + } + + err = EACCES; + /* Implement -o allow_root */ + if (se->deny_others && in->uid != se->owner && in->uid != 0 && + in->opcode != FUSE_INIT && in->opcode != FUSE_READ && + in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && + in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && + in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && + in->opcode != FUSE_NOTIFY_REPLY && in->opcode != FUSE_READDIRPLUS) { + goto reply_err; + } + + err = ENOSYS; + if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) { + goto reply_err; + } + if (in->opcode != FUSE_INTERRUPT) { + struct fuse_req *intr; + pthread_mutex_lock(&se->lock); + intr = check_interrupt(se, req); + list_add_req(req, &se->list); + pthread_mutex_unlock(&se->lock); + if (intr) { + fuse_reply_err(intr, EAGAIN); + } + } + + if (in->opcode == FUSE_WRITE && se->op.write_buf) { + do_write_buf(req, in->nodeid, &iter, bufv); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter); + } + + pthread_rwlock_unlock(&se->init_rwlock); + return; + +reply_err: + fuse_reply_err(req, err); + pthread_rwlock_unlock(&se->init_rwlock); +} + +#define LL_OPTION(n, o, v) \ + { \ + n, offsetof(struct fuse_session, o), v \ + } + +static const struct fuse_opt fuse_ll_opts[] = { + LL_OPTION("debug", debug, 1), + LL_OPTION("-d", debug, 1), + LL_OPTION("--debug", debug, 1), + LL_OPTION("allow_root", deny_others, 1), + LL_OPTION("--socket-path=%s", vu_socket_path, 0), + LL_OPTION("--fd=%d", vu_listen_fd, 0), + LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0), + FUSE_OPT_END +}; + +void fuse_lowlevel_version(void) +{ + printf("using FUSE kernel interface version %i.%i\n", FUSE_KERNEL_VERSION, + FUSE_KERNEL_MINOR_VERSION); +} + +void fuse_lowlevel_help(void) +{ + /* + * These are not all options, but the ones that are + * potentially of interest to an end-user + */ + printf( + " -o allow_root allow access by root\n" + " --socket-path=PATH path for the vhost-user socket\n" + " --fd=FDNUM fd number of vhost-user socket\n" + " --thread-pool-size=NUM thread pool size limit (default %d)\n", + THREAD_POOL_SIZE); +} + +void fuse_session_destroy(struct fuse_session *se) +{ + if (se->got_init && !se->got_destroy) { + if (se->op.destroy) { + se->op.destroy(se->userdata); + } + } + pthread_rwlock_destroy(&se->init_rwlock); + pthread_mutex_destroy(&se->lock); + free(se->cuse_data); + if (se->fd != -1) { + close(se->fd); + } + + if (fuse_lowlevel_is_virtio(se)) { + virtio_session_close(se); + } + + free(se->vu_socket_path); + se->vu_socket_path = NULL; + + free(se); +} + + +struct fuse_session *fuse_session_new(struct fuse_args *args, + const struct fuse_lowlevel_ops *op, + size_t op_size, void *userdata) +{ + struct fuse_session *se; + + if (sizeof(struct fuse_lowlevel_ops) < op_size) { + fuse_log( + FUSE_LOG_ERR, + "fuse: warning: library too old, some operations may not work\n"); + op_size = sizeof(struct fuse_lowlevel_ops); + } + + if (args->argc == 0) { + fuse_log(FUSE_LOG_ERR, + "fuse: empty argv passed to 
fuse_session_new().\n"); + return NULL; + } + + se = (struct fuse_session *)calloc(1, sizeof(struct fuse_session)); + if (se == NULL) { + fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); + goto out1; + } + se->fd = -1; + se->vu_listen_fd = -1; + se->thread_pool_size = THREAD_POOL_SIZE; + se->conn.max_write = UINT_MAX; + se->conn.max_readahead = UINT_MAX; + + /* Parse options */ + if (fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) { + goto out2; + } + if (args->argc == 1 && args->argv[0][0] == '-') { + fuse_log(FUSE_LOG_ERR, + "fuse: warning: argv[0] looks like an option, but " + "will be ignored\n"); + } else if (args->argc != 1) { + int i; + fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); + for (i = 1; i < args->argc - 1; i++) { + fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); + } + fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); + goto out4; + } + + if (!se->vu_socket_path && se->vu_listen_fd < 0) { + fuse_log(FUSE_LOG_ERR, "fuse: missing --socket-path or --fd option\n"); + goto out4; + } + if (se->vu_socket_path && se->vu_listen_fd >= 0) { + fuse_log(FUSE_LOG_ERR, + "fuse: --socket-path and --fd cannot be given together\n"); + goto out4; + } + + se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; + + list_init_req(&se->list); + list_init_req(&se->interrupts); + fuse_mutex_init(&se->lock); + pthread_rwlock_init(&se->init_rwlock, NULL); + + memcpy(&se->op, op, op_size); + se->owner = getuid(); + se->userdata = userdata; + + return se; + +out4: + fuse_opt_free_args(args); +out2: + free(se); +out1: + return NULL; +} + +int fuse_session_mount(struct fuse_session *se) +{ + return virtio_session_mount(se); +} + +int fuse_session_fd(struct fuse_session *se) +{ + return se->fd; +} + +void fuse_session_unmount(struct fuse_session *se) +{ +} + +int fuse_lowlevel_is_virtio(struct fuse_session *se) +{ + return !!se->virtio_dev; +} + +#ifdef linux +int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) +{ + char *buf; + size_t bufsize = 1024; + char path[128]; + int ret; + int fd; + unsigned long pid = req->ctx.pid; + char *s; + + sprintf(path, "/proc/%lu/task/%lu/status", pid, pid); + +retry: + buf = malloc(bufsize); + if (buf == NULL) { + return -ENOMEM; + } + + ret = -EIO; + fd = open(path, O_RDONLY); + if (fd == -1) { + goto out_free; + } + + ret = read(fd, buf, bufsize); + close(fd); + if (ret < 0) { + ret = -EIO; + goto out_free; + } + + if ((size_t)ret == bufsize) { + free(buf); + bufsize *= 4; + goto retry; + } + + ret = -EIO; + s = strstr(buf, "\nGroups:"); + if (s == NULL) { + goto out_free; + } + + s += 8; + ret = 0; + while (1) { + char *end; + unsigned long val = strtoul(s, &end, 0); + if (end == s) { + break; + } + + s = end; + if (ret < size) { + list[ret] = val; + } + ret++; + } + +out_free: + free(buf); + return ret; +} +#else /* linux */ +/* + * This is currently not implemented on other than Linux... 
+ */ +int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]) +{ + (void)req; + (void)size; + (void)list; + return -ENOSYS; +} +#endif + +void fuse_session_exit(struct fuse_session *se) +{ + se->exited = 1; +} + +void fuse_session_reset(struct fuse_session *se) +{ + se->exited = 0; + se->error = 0; +} + +int fuse_session_exited(struct fuse_session *se) +{ + return se->exited; +} diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h new file mode 100644 index 0000000000..138041e5f1 --- /dev/null +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -0,0 +1,1991 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#ifndef FUSE_LOWLEVEL_H_ +#define FUSE_LOWLEVEL_H_ + +/** + * @file + * + * Low level API + * + * IMPORTANT: you should define FUSE_USE_VERSION before including this + * header. To use the newest API define it to 31 (recommended for any + * new application). + */ + +#ifndef FUSE_USE_VERSION +#error FUSE_USE_VERSION not defined +#endif + +#include "fuse_common.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <utime.h> + +/* + * Miscellaneous definitions + */ + +/** The node ID of the root inode */ +#define FUSE_ROOT_ID 1 + +/** Inode number type */ +typedef uint64_t fuse_ino_t; + +/** Request pointer type */ +typedef struct fuse_req *fuse_req_t; + +/** + * Session + * + * This provides hooks for processing requests, and exiting + */ +struct fuse_session; + +/** Directory entry parameters supplied to fuse_reply_entry() */ +struct fuse_entry_param { + /** + * Unique inode number + * + * In lookup, zero means negative entry (from version 2.5) + * Returning ENOENT also means negative entry, but by setting zero + * ino the kernel may cache negative entries for entry_timeout + * seconds. + */ + fuse_ino_t ino; + + /** + * Generation number for this entry. + * + * If the file system will be exported over NFS, the + * ino/generation pairs need to be unique over the file + * system's lifetime (rather than just the mount time). So if + * the file system reuses an inode after it has been deleted, + * it must assign a new, previously unused generation number + * to the inode at the same time. + * + */ + uint64_t generation; + + /** + * Inode attributes. + * + * Even if attr_timeout == 0, attr must be correct. For example, + * for open(), FUSE uses attr.st_size from lookup() to determine + * how many bytes to request. If this value is not correct, + * incorrect data will be returned. + */ + struct stat attr; + + /** + * Validity timeout (in seconds) for inode attributes. If + * attributes only change as a result of requests that come + * through the kernel, this should be set to a very large + * value. + */ + double attr_timeout; + + /** + * Validity timeout (in seconds) for the name. If directory + * entries are changed/deleted only as a result of requests + * that come through the kernel, this should be set to a very + * large value. + */ + double entry_timeout; +}; + +/** + * Additional context associated with requests. + * + * Note that the reported client uid, gid and pid may be zero in some + * situations. For example, if the FUSE file system is running in a + * PID or user namespace but then accessed from outside the namespace, + * there is no valid uid/pid/gid that could be reported. 
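As an aside (this sketch is not part of the patch): the struct fuse_entry_param described above is what a lookup handler fills in before calling fuse_reply_entry(). The xmp_stat_at() helper below is hypothetical, standing in for whatever name resolution the filesystem actually performs.

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <string.h>

/* Hypothetical helper: resolve 'name' under 'parent' and fill *attr. */
extern int xmp_stat_at(fuse_ino_t parent, const char *name, struct stat *attr);

static void xmp_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
{
    struct fuse_entry_param e;

    memset(&e, 0, sizeof(e));
    if (xmp_stat_at(parent, name, &e.attr) == -1) {
        fuse_reply_err(req, errno);   /* e.g. ENOENT for a negative entry */
        return;
    }
    e.ino = e.attr.st_ino;            /* fuse_reply_entry() bumps this inode's lookup count */
    e.attr_timeout = 1.0;             /* let the kernel cache the attributes for 1s */
    e.entry_timeout = 1.0;            /* and the name itself for 1s */
    fuse_reply_entry(req, &e);
}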
+ */ +struct fuse_ctx { + /** User ID of the calling process */ + uid_t uid; + + /** Group ID of the calling process */ + gid_t gid; + + /** Thread ID of the calling process */ + pid_t pid; + + /** Umask of the calling process */ + mode_t umask; +}; + +struct fuse_forget_data { + fuse_ino_t ino; + uint64_t nlookup; +}; + +/* 'to_set' flags in setattr */ +#define FUSE_SET_ATTR_MODE (1 << 0) +#define FUSE_SET_ATTR_UID (1 << 1) +#define FUSE_SET_ATTR_GID (1 << 2) +#define FUSE_SET_ATTR_SIZE (1 << 3) +#define FUSE_SET_ATTR_ATIME (1 << 4) +#define FUSE_SET_ATTR_MTIME (1 << 5) +#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) +#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) +#define FUSE_SET_ATTR_CTIME (1 << 10) + +/* + * Request methods and replies + */ + +/** + * Low level filesystem operations + * + * Most of the methods (with the exception of init and destroy) + * receive a request handle (fuse_req_t) as their first argument. + * This handle must be passed to one of the specified reply functions. + * + * This may be done inside the method invocation, or after the call + * has returned. The request handle is valid until one of the reply + * functions is called. + * + * Other pointer arguments (name, fuse_file_info, etc) are not valid + * after the call has returned, so if they are needed later, their + * contents have to be copied. + * + * In general, all methods are expected to perform any necessary + * permission checking. However, a filesystem may delegate this task + * to the kernel by passing the `default_permissions` mount option to + * `fuse_session_new()`. In this case, methods will only be called if + * the kernel's permission check has succeeded. + * + * The filesystem sometimes needs to handle a return value of -ENOENT + * from the reply function, which means, that the request was + * interrupted, and the reply discarded. For example if + * fuse_reply_open() return -ENOENT means, that the release method for + * this file will not be called. + */ +struct fuse_lowlevel_ops { + /** + * Initialize filesystem + * + * This function is called when libfuse establishes + * communication with the FUSE kernel module. The file system + * should use this module to inspect and/or modify the + * connection parameters provided in the `conn` structure. + * + * Note that some parameters may be overwritten by options + * passed to fuse_session_new() which take precedence over the + * values set in this handler. + * + * There's no reply to this function + * + * @param userdata the user data passed to fuse_session_new() + */ + void (*init)(void *userdata, struct fuse_conn_info *conn); + + /** + * Clean up filesystem. + * + * Called on filesystem exit. When this method is called, the + * connection to the kernel may be gone already, so that eg. calls + * to fuse_lowlevel_notify_* will fail. + * + * There's no reply to this function + * + * @param userdata the user data passed to fuse_session_new() + */ + void (*destroy)(void *userdata); + + /** + * Look up a directory entry by name and get its attributes. + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name the name to look up + */ + void (*lookup)(fuse_req_t req, fuse_ino_t parent, const char *name); + + /** + * Forget about an inode + * + * This function is called when the kernel removes an inode + * from its internal caches. + * + * The inode's lookup count increases by one for every call to + * fuse_reply_entry and fuse_reply_create. 
The nlookup parameter + * indicates by how much the lookup count should be decreased. + * + * Inodes with a non-zero lookup count may receive request from + * the kernel even after calls to unlink, rmdir or (when + * overwriting an existing file) rename. Filesystems must handle + * such requests properly and it is recommended to defer removal + * of the inode until the lookup count reaches zero. Calls to + * unlink, rmdir or rename will be followed closely by forget + * unless the file or directory is open, in which case the + * kernel issues forget only after the release or releasedir + * calls. + * + * Note that if a file system will be exported over NFS the + * inodes lifetime must extend even beyond forget. See the + * generation field in struct fuse_entry_param above. + * + * On unmount the lookup count for all inodes implicitly drops + * to zero. It is not guaranteed that the file system will + * receive corresponding forget messages for the affected + * inodes. + * + * Valid replies: + * fuse_reply_none + * + * @param req request handle + * @param ino the inode number + * @param nlookup the number of lookups to forget + */ + void (*forget)(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); + + /** + * Get file attributes. + * + * If writeback caching is enabled, the kernel may have a + * better idea of a file's length than the FUSE file system + * (eg if there has been a write that extended the file size, + * but that has not yet been passed to the filesystem.n + * + * In this case, the st_size value provided by the file system + * will be ignored. + * + * Valid replies: + * fuse_reply_attr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi for future use, currently always NULL + */ + void (*getattr)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Set file attributes + * + * In the 'attr' argument only members indicated by the 'to_set' + * bitmask contain valid values. Other members contain undefined + * values. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits if the file + * size or owner is being changed. + * + * If the setattr was invoked from the ftruncate() system call + * under Linux kernel versions 2.6.15 or later, the fi->fh will + * contain the value set by the open method or will be undefined + * if the open method didn't set any value. Otherwise (not + * ftruncate call, or kernel version earlier than 2.6.15) the fi + * parameter will be NULL. + * + * Valid replies: + * fuse_reply_attr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param attr the attributes + * @param to_set bit mask of attributes which should be set + * @param fi file information, or NULL + */ + void (*setattr)(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int to_set, struct fuse_file_info *fi); + + /** + * Read symbolic link + * + * Valid replies: + * fuse_reply_readlink + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + */ + void (*readlink)(fuse_req_t req, fuse_ino_t ino); + + /** + * Create file node + * + * Create a regular file, character device, block device, fifo or + * socket node. 
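A sketch of how the to_set bitmask (see the FUSE_SET_ATTR_* flags above) is usually consumed by a setattr handler; this is illustration only, not part of the patch, and xmp_fd_for() is a hypothetical helper that maps ino/fi (fi may be NULL) to an open descriptor.

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>

extern int xmp_fd_for(fuse_ino_t ino, struct fuse_file_info *fi);  /* hypothetical */

static void xmp_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
                        int to_set, struct fuse_file_info *fi)
{
    int fd = xmp_fd_for(ino, fi);

    if ((to_set & FUSE_SET_ATTR_MODE) && fchmod(fd, attr->st_mode) == -1) {
        goto fail;
    }
    if ((to_set & FUSE_SET_ATTR_SIZE) && ftruncate(fd, attr->st_size) == -1) {
        goto fail;
    }
    if (to_set & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
        uid_t uid = (to_set & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
        gid_t gid = (to_set & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
        if (fchown(fd, uid, gid) == -1) {
            goto fail;
        }
    }
    if (fstat(fd, attr) == -1) {          /* re-read the now-current attributes */
        goto fail;
    }
    fuse_reply_attr(req, attr, 1.0);
    return;
fail:
    fuse_reply_err(req, errno);
}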
+ * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to create + * @param mode file type and mode with which to create the new file + * @param rdev the device number (only valid if created file is a device) + */ + void (*mknod)(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, dev_t rdev); + + /** + * Create a directory + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to create + * @param mode with which to create the new file + */ + void (*mkdir)(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode); + + /** + * Remove a file + * + * If the file's inode's lookup count is non-zero, the file + * system is expected to postpone any removal of the inode + * until the lookup count reaches zero (see description of the + * forget function). + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to remove + */ + void (*unlink)(fuse_req_t req, fuse_ino_t parent, const char *name); + + /** + * Remove a directory + * + * If the directory's inode's lookup count is non-zero, the + * file system is expected to postpone any removal of the + * inode until the lookup count reaches zero (see description + * of the forget function). + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to remove + */ + void (*rmdir)(fuse_req_t req, fuse_ino_t parent, const char *name); + + /** + * Create a symbolic link + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param link the contents of the symbolic link + * @param parent inode number of the parent directory + * @param name to create + */ + void (*symlink)(fuse_req_t req, const char *link, fuse_ino_t parent, + const char *name); + + /** + * Rename a file + * + * If the target exists it should be atomically replaced. If + * the target's inode's lookup count is non-zero, the file + * system is expected to postpone any removal of the inode + * until the lookup count reaches zero (see description of the + * forget function). + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EINVAL, i.e. all + * future bmap requests will fail with EINVAL without being + * send to the filesystem process. + * + * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If + * RENAME_NOREPLACE is specified, the filesystem must not + * overwrite *newname* if it exists and return an error + * instead. If `RENAME_EXCHANGE` is specified, the filesystem + * must atomically exchange the two files, i.e. both must + * exist and neither may be deleted. 
+ * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the old parent directory + * @param name old name + * @param newparent inode number of the new parent directory + * @param newname new name + */ + void (*rename)(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_ino_t newparent, const char *newname, + unsigned int flags); + + /** + * Create a hard link + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param ino the old inode number + * @param newparent inode number of the new parent directory + * @param newname new name to create + */ + void (*link)(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, + const char *newname); + + /** + * Open a file + * + * Open flags are available in fi->flags. The following rules + * apply. + * + * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be + * filtered out / handled by the kernel. + * + * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used + * by the filesystem to check if the operation is + * permitted. If the ``-o default_permissions`` mount + * option is given, this check is already done by the + * kernel before calling open() and may thus be omitted by + * the filesystem. + * + * - When writeback caching is enabled, the kernel may send + * read requests even for files opened with O_WRONLY. The + * filesystem should be prepared to handle this. + * + * - When writeback caching is disabled, the filesystem is + * expected to properly handle the O_APPEND flag and ensure + * that each write is appending to the end of the file. + * + * - When writeback caching is enabled, the kernel will + * handle O_APPEND. However, unless all changes to the file + * come through the kernel this will not work reliably. The + * filesystem should thus either ignore the O_APPEND flag + * (and let the kernel handle it), or return an error + * (indicating that reliably O_APPEND is not available). + * + * Filesystem may store an arbitrary file handle (pointer, + * index, etc) in fi->fh, and use this in other all other file + * operations (read, write, flush, release, fsync). + * + * Filesystem may also implement stateless file I/O and not store + * anything in fi->fh. + * + * There are also some flags (direct_io, keep_cache) which the + * filesystem may set in fi, to change the way the file is opened. + * See fuse_file_info structure in <fuse_common.h> for more details. + * + * If this request is answered with an error code of ENOSYS + * and FUSE_CAP_NO_OPEN_SUPPORT is set in + * `fuse_conn_info.capable`, this is treated as success and + * future calls to open and release will also succeed without being + * sent to the filesystem process. + * + * Valid replies: + * fuse_reply_open + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*open)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Read data + * + * Read should send exactly the number of bytes requested except + * on EOF or error, otherwise the rest of the data will be + * substituted with zeroes. An exception to this is when the file + * has been opened in 'direct_io' mode, in which case the return + * value of the read system call will reflect the return value of + * this operation. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. 
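Outside the patch, for illustration: the open contract above (the kernel filters creation flags, the filesystem checks the access mode and may stash a handle in fi->fh) often reduces to a few lines; xmp_path_of() is a hypothetical inode-to-path mapping.

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <fcntl.h>

extern const char *xmp_path_of(fuse_ino_t ino);   /* hypothetical */

static void xmp_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
    /* O_CREAT/O_EXCL/O_NOCTTY never reach us; the kernel filtered them out */
    int fd = open(xmp_path_of(ino), fi->flags);

    if (fd == -1) {
        fuse_reply_err(req, errno);
        return;
    }
    fi->fh = fd;            /* handed back verbatim in read/write/flush/release */
    fuse_reply_open(req, fi);
}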
+ * + * Valid replies: + * fuse_reply_buf + * fuse_reply_iov + * fuse_reply_data + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size number of bytes to read + * @param off offset to read from + * @param fi file information + */ + void (*read)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi); + + /** + * Write data + * + * Write should return exactly the number of bytes requested + * except on error. An exception to this is when the file has + * been opened in 'direct_io' mode, in which case the return value + * of the write system call will reflect the return value of this + * operation. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * + * Valid replies: + * fuse_reply_write + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param buf data to write + * @param size number of bytes to write + * @param off offset to write to + * @param fi file information + */ + void (*write)(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, + off_t off, struct fuse_file_info *fi); + + /** + * Flush method + * + * This is called on each close() of the opened file. + * + * Since file descriptors can be duplicated (dup, dup2, fork), for + * one open call there may be many flush calls. + * + * Filesystems shouldn't assume that flush will always be called + * after some writes, or that if will be called at all. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * + * NOTE: the name of the method is misleading, since (unlike + * fsync) the filesystem is not forced to flush pending writes. + * One reason to flush data is if the filesystem wants to return + * write errors during close. However, such use is non-portable + * because POSIX does not require [close] to wait for delayed I/O to + * complete. + * + * If the filesystem supports file locking operations (setlk, + * getlk) it should remove all locks belonging to 'fi->owner'. + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to flush() will + * succeed automatically without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * + * [close]: + * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html + */ + void (*flush)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Release an open file + * + * Release is called when there are no more references to an open + * file: all file descriptors are closed and all memory mappings + * are unmapped. + * + * For every open call there will be exactly one release call (unless + * the filesystem is force-unmounted). + * + * The filesystem may reply with an error, but error values are + * not returned to close() or munmap() which triggered the + * release. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * fi->flags will contain the same flags as for open. 
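Not part of the patch, a sketch only: the read contract above (exactly the requested number of bytes except on EOF or error) maps onto a plain pread() of the descriptor saved in fi->fh, with a short fuse_reply_buf() signalling end of file.

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>

static void xmp_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
                     struct fuse_file_info *fi)
{
    char *buf = malloc(size);
    ssize_t res;

    (void)ino;
    if (!buf) {
        fuse_reply_err(req, ENOMEM);
        return;
    }
    res = pread(fi->fh, buf, size, off);   /* regular files only go short at EOF */
    if (res == -1) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_buf(req, buf, res);
    }
    free(buf);
}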
+ * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*release)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Synchronize file contents + * + * If the datasync parameter is non-zero, then only the user data + * should be flushed, not the meta data. + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to fsync() will + * succeed automatically without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param datasync flag indicating if only data should be flushed + * @param fi file information + */ + void (*fsync)(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi); + + /** + * Open a directory + * + * Filesystem may store an arbitrary file handle (pointer, index, + * etc) in fi->fh, and use this in other all other directory + * stream operations (readdir, releasedir, fsyncdir). + * + * If this request is answered with an error code of ENOSYS and + * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, + * this is treated as success and future calls to opendir and + * releasedir will also succeed without being sent to the filesystem + * process. In addition, the kernel will cache readdir results + * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. + * + * Valid replies: + * fuse_reply_open + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*opendir)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Read directory + * + * Send a buffer filled using fuse_add_direntry(), with size not + * exceeding the requested size. Send an empty buffer on end of + * stream. + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * Returning a directory entry from readdir() does not affect + * its lookup count. + * + * If off_t is non-zero, then it will correspond to one of the off_t + * values that was previously returned by readdir() for the same + * directory handle. In this case, readdir() should skip over entries + * coming before the position defined by the off_t value. If entries + * are added or removed while the directory handle is open, they filesystem + * may still include the entries that have been removed, and may not + * report the entries that have been created. However, addition or + * removal of entries must never cause readdir() to skip over unrelated + * entries or to report them more than once. This means + * that off_t can not be a simple index that enumerates the entries + * that have been returned but must contain sufficient information to + * uniquely determine the next directory entry to return even when the + * set of entries is changing. + * + * The function does not have to report the '.' and '..' + * entries, but is allowed to do so. Note that, if readdir does + * not return '.' or '..', they will not be implicitly returned, + * and this behavior is observable by the caller. 
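Again as illustration rather than part of the patch: the datasync flag documented for fsync/fsyncdir above maps directly onto fdatasync(2) versus fsync(2) on the stored descriptor, with a zero error code reporting success.

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <unistd.h>

static void xmp_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
                      struct fuse_file_info *fi)
{
    int res = datasync ? fdatasync(fi->fh) : fsync(fi->fh);

    (void)ino;
    fuse_reply_err(req, res == -1 ? errno : 0);
}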
+ * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size maximum number of bytes to send + * @param off offset to continue reading the directory stream + * @param fi file information + */ + void (*readdir)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi); + + /** + * Release an open directory + * + * For every opendir call there will be exactly one releasedir + * call (unless the filesystem is force-unmounted). + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*releasedir)(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi); + + /** + * Synchronize directory contents + * + * If the datasync parameter is non-zero, then only the directory + * contents should be flushed, not the meta data. + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to fsyncdir() will + * succeed automatically without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param datasync flag indicating if only data should be flushed + * @param fi file information + */ + void (*fsyncdir)(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi); + + /** + * Get file system statistics + * + * Valid replies: + * fuse_reply_statfs + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number, zero means "undefined" + */ + void (*statfs)(fuse_req_t req, fuse_ino_t ino); + + /** + * Set an extended attribute + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future setxattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + */ + void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, int flags); + + /** + * Get an extended attribute + * + * If size is zero, the size of the value should be sent with + * fuse_reply_xattr. + * + * If the size is non-zero, and the value fits in the buffer, the + * value should be sent with fuse_reply_buf. + * + * If the size is too small for the value, the ERANGE error should + * be sent. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future getxattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_xattr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param name of the extended attribute + * @param size maximum size of the value to send + */ + void (*getxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, + size_t size); + + /** + * List extended attribute names + * + * If size is zero, the total size of the attribute list should be + * sent with fuse_reply_xattr. 
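A sketch (not in the patch) of the size-probe protocol used by getxattr/listxattr above: size == 0 asks only for the needed length (fuse_reply_xattr), otherwise the value itself is returned, and ERANGE flows back naturally when the caller's buffer was too small. xmp_path_of() is again a hypothetical inode-to-path mapping.

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <stdlib.h>
#include <sys/xattr.h>

extern const char *xmp_path_of(fuse_ino_t ino);   /* hypothetical */

static void xmp_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
                         size_t size)
{
    const char *path = xmp_path_of(ino);
    ssize_t len;

    if (size == 0) {
        len = getxattr(path, name, NULL, 0);
        if (len == -1) {
            fuse_reply_err(req, errno);
        } else {
            fuse_reply_xattr(req, len);        /* report the required size only */
        }
        return;
    }

    char *value = malloc(size);
    if (!value) {
        fuse_reply_err(req, ENOMEM);
        return;
    }
    len = getxattr(path, name, value, size);
    if (len == -1) {
        fuse_reply_err(req, errno);            /* includes ERANGE */
    } else {
        fuse_reply_buf(req, value, len);
    }
    free(value);
}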
+ * + * If the size is non-zero, and the null character separated + * attribute list fits in the buffer, the list should be sent with + * fuse_reply_buf. + * + * If the size is too small for the list, the ERANGE error should + * be sent. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future listxattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_xattr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size maximum size of the list to send + */ + void (*listxattr)(fuse_req_t req, fuse_ino_t ino, size_t size); + + /** + * Remove an extended attribute + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future removexattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param name of the extended attribute + */ + void (*removexattr)(fuse_req_t req, fuse_ino_t ino, const char *name); + + /** + * Check file access permissions + * + * This will be called for the access() and chdir() system + * calls. If the 'default_permissions' mount option is given, + * this method is not called. + * + * This method is not called under Linux kernel versions 2.4.x + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent success, i.e. this and all future access() + * requests will succeed without being send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param mask requested access mode + */ + void (*access)(fuse_req_t req, fuse_ino_t ino, int mask); + + /** + * Create and open a file + * + * If the file does not exist, first create it with the specified + * mode, and then open it. + * + * See the description of the open handler for more + * information. + * + * If this method is not implemented or under Linux kernel + * versions earlier than 2.6.15, the mknod() and open() methods + * will be called instead. + * + * If this request is answered with an error code of ENOSYS, the handler + * is treated as not implemented (i.e., for this and future requests the + * mknod() and open() handlers will be called instead). + * + * Valid replies: + * fuse_reply_create + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to create + * @param mode file type and mode with which to create the new file + * @param fi file information + */ + void (*create)(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi); + + /** + * Test for a POSIX file lock + * + * Valid replies: + * fuse_reply_lock + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param lock the region/type to test + */ + void (*getlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock); + + /** + * Acquire, modify or release a POSIX file lock + * + * For POSIX threads (NPTL) there's a 1-1 relation between pid and + * owner, but otherwise this is not always the case. For checking + * lock ownership, 'fi->owner' must be used. 
The l_pid field in + * 'struct flock' should only be used to fill in this field in + * getlk(). + * + * Note: if the locking methods are not implemented, the kernel + * will still allow file locking to work locally. Hence these are + * only interesting for network filesystems and similar. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param lock the region/type to set + * @param sleep locking operation may sleep + */ + void (*setlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock, int sleep); + + /** + * Map block index within file to block index within device + * + * Note: This makes sense only for block device backed filesystems + * mounted with the 'blkdev' option + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure, i.e. all future bmap() requests will + * fail with the same error code without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_bmap + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param blocksize unit of block index + * @param idx block index within file + */ + void (*bmap)(fuse_req_t req, fuse_ino_t ino, size_t blocksize, + uint64_t idx); + + /** + * Ioctl + * + * Note: For unrestricted ioctls (not allowed for FUSE + * servers), data in and out areas can be discovered by giving + * iovs and setting FUSE_IOCTL_RETRY in *flags*. For + * restricted ioctls, kernel prepares in/out data area + * according to the information encoded in cmd. + * + * Valid replies: + * fuse_reply_ioctl_retry + * fuse_reply_ioctl + * fuse_reply_ioctl_iov + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param cmd ioctl command + * @param arg ioctl argument + * @param fi file information + * @param flags for FUSE_IOCTL_* flags + * @param in_buf data fetched from the caller + * @param in_bufsz number of fetched bytes + * @param out_bufsz maximum size of output data + * + * Note : the unsigned long request submitted by the application + * is truncated to 32 bits. + */ + void (*ioctl)(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, const void *in_buf, + size_t in_bufsz, size_t out_bufsz); + + /** + * Poll for IO readiness + * + * Note: If ph is non-NULL, the client should notify + * when IO readiness events occur by calling + * fuse_lowlevel_notify_poll() with the specified ph. + * + * Regardless of the number of times poll with a non-NULL ph + * is received, single notification is enough to clear all. + * Notifying more times incurs overhead but doesn't harm + * correctness. + * + * The callee is responsible for destroying ph with + * fuse_pollhandle_destroy() when no longer in use. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as success (with a kernel-defined default poll-mask) and + * future calls to pull() will succeed the same way without being send + * to the filesystem process. + * + * Valid replies: + * fuse_reply_poll + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param ph poll handle to be used for notification + */ + void (*poll)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct fuse_pollhandle *ph); + + /** + * Write data made available in a buffer + * + * This is a more generic version of the ->write() method. 
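For orientation only (not part of the patch): the poll/notify scheme above usually splits into an immediate fuse_reply_poll() carrying the readiness bits known right now, plus a deferred fuse_lowlevel_notify_poll(ph) from the event loop once something changes. fuse_reply_poll() is assumed to be declared further down in this header; xmp_poll_state() is a hypothetical helper that computes the current POLLIN/POLLOUT mask and, when ph is non-NULL, queues ph for that later notification (and eventual fuse_pollhandle_destroy()).

#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"

extern unsigned xmp_poll_state(fuse_ino_t ino, struct fuse_pollhandle *ph);  /* hypothetical */

static void xmp_poll(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
                     struct fuse_pollhandle *ph)
{
    (void)fi;
    fuse_reply_poll(req, xmp_poll_state(ino, ph));
}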
If + * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the + * kernel supports splicing from the fuse device, then the + * data will be made available in pipe for supporting zero + * copy data transfer. + * + * buf->count is guaranteed to be one (and thus buf->idx is + * always zero). The write_buf handler must ensure that + * bufv->off is correctly updated (reflecting the number of + * bytes read from bufv->buf[0]). + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + * + * Valid replies: + * fuse_reply_write + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param bufv buffer containing the data + * @param off offset to write to + * @param fi file information + */ + void (*write_buf)(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, + off_t off, struct fuse_file_info *fi); + + /** + * Forget about multiple inodes + * + * See description of the forget function for more + * information. + * + * Valid replies: + * fuse_reply_none + * + * @param req request handle + */ + void (*forget_multi)(fuse_req_t req, size_t count, + struct fuse_forget_data *forgets); + + /** + * Acquire, modify or release a BSD file lock + * + * Note: if the locking methods are not implemented, the kernel + * will still allow file locking to work locally. Hence these are + * only interesting for network filesystems and similar. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param op the locking operation, see flock(2) + */ + void (*flock)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + int op); + + /** + * Allocate requested space. If this function returns success then + * subsequent writes to the specified range shall not fail due to the lack + * of free space on the file system storage media. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future fallocate() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param offset starting point for allocated region + * @param length size of allocated region + * @param mode determines the operation to be performed on the given range, + * see fallocate(2) + */ + void (*fallocate)(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + off_t length, struct fuse_file_info *fi); + + /** + * Read directory with attributes + * + * Send a buffer filled using fuse_add_direntry_plus(), with size not + * exceeding the requested size. Send an empty buffer on end of + * stream. + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * In contrast to readdir() (which does not affect the lookup counts), + * the lookup count of every entry returned by readdirplus(), except "." + * and "..", is incremented by one. 
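A sketch outside the patch: the fallocate handler documented above can defer straight to fallocate(2) on the descriptor in fi->fh; per the reply conventions in this header, success is reported as a zero error code.

#define _GNU_SOURCE
#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <fcntl.h>

static void xmp_fallocate(fuse_req_t req, fuse_ino_t ino, int mode,
                          off_t offset, off_t length,
                          struct fuse_file_info *fi)
{
    (void)ino;
    if (fallocate(fi->fh, mode, offset, length) == -1) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_err(req, 0);
    }
}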
+ * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size maximum number of bytes to send + * @param off offset to continue reading the directory stream + * @param fi file information + */ + void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi); + + /** + * Copy a range of data from one file to another + * + * Performs an optimized copy between two file descriptors without the + * additional cost of transferring data through the FUSE kernel module + * to user space (glibc) and then back into the FUSE filesystem again. + * + * In case this method is not implemented, glibc falls back to reading + * data from the source and writing to the destination. Effectively + * doing an inefficient copy of the data. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future copy_file_range() requests will fail with EOPNOTSUPP without + * being send to the filesystem process. + * + * Valid replies: + * fuse_reply_write + * fuse_reply_err + * + * @param req request handle + * @param ino_in the inode number or the source file + * @param off_in starting point from were the data should be read + * @param fi_in file information of the source file + * @param ino_out the inode number or the destination file + * @param off_out starting point where the data should be written + * @param fi_out file information of the destination file + * @param len maximum size of the data to copy + * @param flags passed along with the copy_file_range() syscall + */ + void (*copy_file_range)(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + struct fuse_file_info *fi_in, fuse_ino_t ino_out, + off_t off_out, struct fuse_file_info *fi_out, + size_t len, int flags); + + /** + * Find next data or hole after the specified offset + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure, i.e. all future lseek() requests will + * fail with the same error code without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_lseek + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param off offset to start search from + * @param whence either SEEK_DATA or SEEK_HOLE + * @param fi file information + */ + void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + struct fuse_file_info *fi); +}; + +/** + * Reply with an error code or success. + * + * Possible requests: + * all except forget + * + * Whereever possible, error codes should be chosen from the list of + * documented error conditions in the corresponding system calls + * manpage. + * + * An error code of ENOSYS is sometimes treated specially. This is + * indicated in the documentation of the affected handler functions. + * + * The following requests may be answered with a zero error code: + * unlink, rmdir, rename, flush, release, fsync, fsyncdir, setxattr, + * removexattr, setlk. 
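Another illustrative sketch, not part of the patch: a copy_file_range handler that hands the work to the host's copy_file_range(2) (glibc 2.27+, 64-bit off_t assumed) using the descriptors in fi_in->fh and fi_out->fh, and reports the byte count with fuse_reply_write().

#define _GNU_SOURCE
#define FUSE_USE_VERSION 31
#include "fuse_lowlevel.h"
#include <errno.h>
#include <unistd.h>

static void xmp_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                                struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                                off_t off_out, struct fuse_file_info *fi_out,
                                size_t len, int flags)
{
    ssize_t res;

    (void)ino_in;
    (void)ino_out;
    res = copy_file_range(fi_in->fh, &off_in, fi_out->fh, &off_out, len, flags);
    if (res == -1) {
        fuse_reply_err(req, errno);
    } else {
        fuse_reply_write(req, res);   /* number of bytes actually copied */
    }
}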
+ * + * @param req request handle + * @param err the positive error value, or zero for success + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_err(fuse_req_t req, int err); + +/** + * Don't send reply + * + * Possible requests: + * forget + * forget_multi + * retrieve_reply + * + * @param req request handle + */ +void fuse_reply_none(fuse_req_t req); + +/** + * Reply with a directory entry + * + * Possible requests: + * lookup, mknod, mkdir, symlink, link + * + * Side effects: + * increments the lookup count on success + * + * @param req request handle + * @param e the entry parameters + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e); + +/** + * Reply with a directory entry and open parameters + * + * currently the following members of 'fi' are used: + * fh, direct_io, keep_cache + * + * Possible requests: + * create + * + * Side effects: + * increments the lookup count on success + * + * @param req request handle + * @param e the entry parameters + * @param fi file information + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + const struct fuse_file_info *fi); + +/** + * Reply with attributes + * + * Possible requests: + * getattr, setattr + * + * @param req request handle + * @param attr the attributes + * @param attr_timeout validity timeout (in seconds) for the attributes + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_attr(fuse_req_t req, const struct stat *attr, + double attr_timeout); + +/** + * Reply with the contents of a symbolic link + * + * Possible requests: + * readlink + * + * @param req request handle + * @param link symbolic link contents + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_readlink(fuse_req_t req, const char *link); + +/** + * Reply with open parameters + * + * currently the following members of 'fi' are used: + * fh, direct_io, keep_cache + * + * Possible requests: + * open, opendir + * + * @param req request handle + * @param fi file information + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *fi); + +/** + * Reply with number of bytes written + * + * Possible requests: + * write + * + * @param req request handle + * @param count the number of bytes written + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_write(fuse_req_t req, size_t count); + +/** + * Reply with data + * + * Possible requests: + * read, readdir, getxattr, listxattr + * + * @param req request handle + * @param buf buffer containing data + * @param size the size of data in bytes + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + +/** + * Reply with data copied/moved from buffer(s) + * + * Possible requests: + * read, readdir, getxattr, listxattr + * + * Side effects: + * when used to return data from a readdirplus() (but not readdir()) + * call, increments the lookup count of each returned entry by one + * on success. 
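+ *
+ * Illustrative sketch, not part of this patch: a read handler that keeps
+ * a host file descriptor in fi->fh (set by its open handler) could
+ * satisfy the request with pread(2) and fuse_reply_buf():
+ *
+ *    static void xmp_read(fuse_req_t req, fuse_ino_t ino, size_t size,
+ *                         off_t off, struct fuse_file_info *fi)
+ *    {
+ *        char *buf = malloc(size);
+ *        ssize_t res = buf ? pread(fi->fh, buf, size, off) : -1;
+ *
+ *        if (res >= 0) {
+ *            fuse_reply_buf(req, buf, res);
+ *        } else {
+ *            fuse_reply_err(req, buf ? errno : ENOMEM);
+ *        }
+ *        free(buf);
+ *    }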
+ * + * @param req request handle + * @param bufv buffer vector + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv); + +/** + * Reply with data vector + * + * Possible requests: + * read, readdir, getxattr, listxattr + * + * @param req request handle + * @param iov the vector containing the data + * @param count the size of vector + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count); + +/** + * Reply with filesystem statistics + * + * Possible requests: + * statfs + * + * @param req request handle + * @param stbuf filesystem statistics + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf); + +/** + * Reply with needed buffer size + * + * Possible requests: + * getxattr, listxattr + * + * @param req request handle + * @param count the buffer size needed in bytes + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_xattr(fuse_req_t req, size_t count); + +/** + * Reply with file lock information + * + * Possible requests: + * getlk + * + * @param req request handle + * @param lock the lock information + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_lock(fuse_req_t req, const struct flock *lock); + +/** + * Reply with block index + * + * Possible requests: + * bmap + * + * @param req request handle + * @param idx block index within device + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_bmap(fuse_req_t req, uint64_t idx); + +/* + * Filling a buffer in readdir + */ + +/** + * Add a directory entry to the buffer + * + * Buffer needs to be large enough to hold the entry. If it's not, + * then the entry is not filled in but the size of the entry is still + * returned. The caller can check this by comparing the bufsize + * parameter with the returned entry size. If the entry size is + * larger than the buffer size, the operation failed. + * + * From the 'stbuf' argument the st_ino field and bits 12-15 of the + * st_mode field are used. The other fields are ignored. + * + * *off* should be any non-zero value that the filesystem can use to + * identify the current point in the directory stream. It does not + * need to be the actual physical position. A value of zero is + * reserved to mean "from the beginning", and should therefore never + * be used (the first call to fuse_add_direntry should be passed the + * offset of the second directory entry). + * + * @param req request handle + * @param buf the point where the new entry will be added to the buffer + * @param bufsize remaining size of the buffer + * @param name the name of the entry + * @param stbuf the file attributes + * @param off the offset of the next entry + * @return the space needed for the entry + */ +size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + const char *name, const struct stat *stbuf, off_t off); + +/** + * Add a directory entry to the buffer with the attributes + * + * See documentation of `fuse_add_direntry()` for more details. 
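+ *
+ * Rough sketch only (not part of this patch): a readdir handler fills
+ * its reply buffer one entry at a time and stops as soon as
+ * fuse_add_direntry() reports that an entry no longer fits; the
+ * next_entry() iterator and its output variables are hypothetical:
+ *
+ *    size_t used = 0;
+ *    while (next_entry(dir, &name, &st, &nextoff)) {
+ *        size_t ent = fuse_add_direntry(req, buf + used, bufsize - used,
+ *                                       name, &st, nextoff);
+ *        if (ent > bufsize - used) {
+ *            break;    // did not fit, will be sent on the next readdir call
+ *        }
+ *        used += ent;
+ *    }
+ *    fuse_reply_buf(req, buf, used);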
+ * + * @param req request handle + * @param buf the point where the new entry will be added to the buffer + * @param bufsize remaining size of the buffer + * @param name the name of the entry + * @param e the directory entry + * @param off the offset of the next entry + * @return the space needed for the entry + */ +size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + const char *name, + const struct fuse_entry_param *e, off_t off); + +/** + * Reply to ask for data fetch and output buffer preparation. ioctl + * will be retried with the specified input data fetched and output + * buffer prepared. + * + * Possible requests: + * ioctl + * + * @param req request handle + * @param in_iov iovec specifying data to fetch from the caller + * @param in_count number of entries in in_iov + * @param out_iov iovec specifying addresses to write output to + * @param out_count number of entries in out_iov + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, + size_t in_count, const struct iovec *out_iov, + size_t out_count); + +/** + * Reply to finish ioctl + * + * Possible requests: + * ioctl + * + * @param req request handle + * @param result result to be passed to the caller + * @param buf buffer containing output data + * @param size length of output data + */ +int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size); + +/** + * Reply to finish ioctl with iov buffer + * + * Possible requests: + * ioctl + * + * @param req request handle + * @param result result to be passed to the caller + * @param iov the vector containing the data + * @param count the size of vector + */ +int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, + int count); + +/** + * Reply with poll result event mask + * + * @param req request handle + * @param revents poll result event mask + */ +int fuse_reply_poll(fuse_req_t req, unsigned revents); + +/** + * Reply with offset + * + * Possible requests: + * lseek + * + * @param req request handle + * @param off offset of next data or hole + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_lseek(fuse_req_t req, off_t off); + +/* + * Notification + */ + +/** + * Notify IO readiness event + * + * For more information, please read comment for poll operation. + * + * @param ph poll handle to notify IO readiness event for + */ +int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph); + +/** + * Notify to invalidate cache for an inode. + * + * Added in FUSE protocol version 7.12. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * If the filesystem has writeback caching enabled, invalidating an + * inode will first trigger a writeback of all dirty pages. The call + * will block until all writeback requests have completed and the + * inode has been invalidated. It will, however, not wait for + * completion of pending writeback requests that have been issued + * before. + * + * If there are no dirty pages, this function will never block. 
+ * + * @param se the session object + * @param ino the inode number + * @param off the offset in the inode where to start invalidating + * or negative to invalidate attributes only + * @param len the amount of cache to invalidate or 0 for all + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + off_t off, off_t len); + +/** + * Notify to invalidate parent attributes and the dentry matching + * parent/name + * + * To avoid a deadlock this function must not be called in the + * execution path of a related filesystem operation or within any code + * that could hold a lock that could be needed to execute such an + * operation. As of kernel 4.18, a "related operation" is a lookup(), + * symlink(), mknod(), mkdir(), unlink(), rename(), link() or create() + * request for the parent, and a setattr(), unlink(), rmdir(), + * rename(), setxattr(), removexattr(), readdir() or readdirplus() + * request for the inode itself. + * + * When called correctly, this function will never block. + * + * Added in FUSE protocol version 7.12. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * @param se the session object + * @param parent inode number + * @param name file name + * @param namelen strlen() of file name + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + const char *name, size_t namelen); + +/** + * This function behaves like fuse_lowlevel_notify_inval_entry() with + * the following additional effect (at least as of Linux kernel 4.8): + * + * If the provided *child* inode matches the inode that is currently + * associated with the cached dentry, and if there are any inotify + * watches registered for the dentry, then the watchers are informed + * that the dentry has been deleted. + * + * To avoid a deadlock this function must not be called while + * executing a related filesystem operation or while holding a lock + * that could be needed to execute such an operation (see the + * description of fuse_lowlevel_notify_inval_entry() for more + * details). + * + * When called correctly, this function will never block. + * + * Added in FUSE protocol version 7.18. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * @param se the session object + * @param parent inode number + * @param child inode number + * @param name file name + * @param namelen strlen() of file name + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + fuse_ino_t child, const char *name, + size_t namelen); + +/** + * Store data to the kernel buffers + * + * Synchronously store data in the kernel buffers belonging to the + * given inode. The stored data is marked up-to-date (no read will be + * performed against it, unless it's invalidated or evicted from the + * cache). + * + * If the stored data overflows the current file size, then the size + * is extended, similarly to a write(2) on the filesystem. + * + * If this function returns an error, then the store wasn't fully + * completed, but it may have been partially completed. + * + * Added in FUSE protocol version 7.15. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing.
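+ *
+ * Illustrative sketch only (not from this patch): pushing freshly
+ * generated data into the kernel's cache at file offset 0 might look
+ * like the following; data and len are hypothetical, and the
+ * fuse_bufvec initialization assumes the layout from fuse_common.h:
+ *
+ *    struct fuse_bufvec bufv = {
+ *        .count = 1,
+ *        .buf[0] = { .mem = data, .size = len },
+ *    };
+ *
+ *    int err = fuse_lowlevel_notify_store(se, ino, 0, &bufv);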
+ * + * @param se the session object + * @param ino the inode number + * @param offset the starting offset into the file to store to + * @param bufv buffer vector + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv); + +/* + * Utility functions + */ + +/** + * Get the userdata from the request + * + * @param req request handle + * @return the user data passed to fuse_session_new() + */ +void *fuse_req_userdata(fuse_req_t req); + +/** + * Get the context from the request + * + * The pointer returned by this function will only be valid for the + * request's lifetime + * + * @param req request handle + * @return the context structure + */ +const struct fuse_ctx *fuse_req_ctx(fuse_req_t req); + +/** + * Get the current supplementary group IDs for the specified request + * + * Similar to the getgroups(2) system call, except the return value is + * always the total number of group IDs, even if it is larger than the + * specified size. + * + * The current fuse kernel module in linux (as of 2.6.30) doesn't pass + * the group list to userspace, hence this function needs to parse + * "/proc/$TID/task/$TID/status" to get the group IDs. + * + * This feature may not be supported on all operating systems. In + * such a case this function will return -ENOSYS. + * + * @param req request handle + * @param size size of given array + * @param list array of group IDs to be filled in + * @return the total number of supplementary group IDs or -errno on failure + */ +int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]); + +/** + * Callback function for an interrupt + * + * @param req interrupted request + * @param data user data + */ +typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data); + +/** + * Register/unregister callback for an interrupt + * + * If an interrupt has already happened, then the callback function is + * called from within this function, hence it's not possible for + * interrupts to be lost. + * + * @param req request handle + * @param func the callback function or NULL for unregister + * @param data user data passed to the callback function + */ +void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + void *data); + +/** + * Check if a request has already been interrupted + * + * @param req request handle + * @return 1 if the request has been interrupted, 0 otherwise + */ +int fuse_req_interrupted(fuse_req_t req); + +/** + * Check if the session is connected via virtio + * + * @param se session object + * @return 1 if the session is a virtio session + */ +int fuse_lowlevel_is_virtio(struct fuse_session *se); + +/* + * Inquiry functions + */ + +/** + * Print low-level version information to stdout. + */ +void fuse_lowlevel_version(void); + +/** + * Print available low-level options to stdout. This is not an + * exhaustive list, but includes only those options that may be of + * interest to an end-user of a file system. + */ +void fuse_lowlevel_help(void); + +/** + * Print available options for `fuse_parse_cmdline()`. + */ +void fuse_cmdline_help(void); + +/* + * Filesystem setup & teardown + */ + +struct fuse_cmdline_opts { + int foreground; + int debug; + int nodefault_subtype; + int show_version; + int show_help; + int print_capabilities; + int syslog; + int log_level; + unsigned int max_idle_threads; +}; + +/** + * Utility function to parse common options for simple file systems + * using the low-level API. 
A help text that describes the available + * options can be printed with `fuse_cmdline_help`. A single + * non-option argument is treated as the mountpoint. Multiple + * non-option arguments will result in an error. + * + * If neither the -o subtype= nor the -o fsname= option is given, a new + * subtype option will be added and set to the basename of the program + * (the fsname will remain unset, and then defaults to "fuse"). + * + * Known options will be removed from *args*, unknown options will + * remain. + * + * @param args argument vector (input+output) + * @param opts output argument for parsed options + * @return 0 on success, -1 on failure + */ +int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts); + +/** + * Create a low level session. + * + * Returns a session structure suitable for passing to + * fuse_session_mount() and fuse_session_loop(). + * + * This function accepts most file-system independent mount options + * (like context, nodev, ro - see mount(8)), as well as the general + * fuse mount options listed in mount.fuse(8) (e.g. -o allow_root and + * -o default_permissions, but not ``-o use_ino``). Instead of `-o + * debug`, debugging may also be enabled with `-d` or `--debug`. + * + * If not all options are known, an error message is written to stderr + * and the function returns NULL. + * + * Option parsing skips argv[0], which is assumed to contain the + * program name. To prevent accidentally passing an option in + * argv[0], this element must always be present (even if no options + * are specified). It may be set to the empty string ('\0') if no + * reasonable value can be provided. + * + * @param args argument vector + * @param op the (low-level) filesystem operations + * @param op_size sizeof(struct fuse_lowlevel_ops) + * @param userdata user data + * + * @return the fuse session on success, NULL on failure + **/ +struct fuse_session *fuse_session_new(struct fuse_args *args, + const struct fuse_lowlevel_ops *op, + size_t op_size, void *userdata); + +/** + * Mount a FUSE file system. + * + * @param se session object + * + * @return 0 on success, -1 on failure. + **/ +int fuse_session_mount(struct fuse_session *se); + +/** + * Enter a single threaded, blocking event loop. + * + * When the event loop terminates because the connection to the FUSE + * kernel module has been closed, this function returns zero. This + * happens when the filesystem is unmounted regularly (by the + * filesystem owner or root running the umount(8) or fusermount(1) + * command), or if the connection is explicitly severed by writing ``1`` + * to the ``abort`` file in ``/sys/fs/fuse/connections/NNN``. The only + * way to distinguish between these two conditions is to check if the + * filesystem is still mounted after the session loop returns. + * + * When some error occurs during request processing, the function + * returns a negated errno(3) value. + * + * If the loop has been terminated because of a signal handler + * installed by fuse_set_signal_handlers(), this function returns the + * (positive) signal value that triggered the exit. + * + * @param se the session + * @return 0, -errno, or a signal value + */ +int fuse_session_loop(struct fuse_session *se); + +/** + * Flag a session as terminated. + * + * This function is invoked by the POSIX signal handlers, when + * registered using fuse_set_signal_handlers(). It will cause any + * running event loops to terminate on the next opportunity.
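+ *
+ * Putting the setup functions documented above together, a minimal
+ * daemon could look like the sketch below (illustrative only, not part
+ * of this patch; example_ops is a hypothetical, fully populated
+ * struct fuse_lowlevel_ops):
+ *
+ *    int main(int argc, char *argv[])
+ *    {
+ *        struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ *        struct fuse_cmdline_opts opts;
+ *        struct fuse_session *se;
+ *        int ret = 1;
+ *
+ *        if (fuse_parse_cmdline(&args, &opts) != 0) {
+ *            return 1;
+ *        }
+ *        se = fuse_session_new(&args, &example_ops,
+ *                              sizeof(example_ops), NULL);
+ *        if (se && fuse_session_mount(se) == 0) {
+ *            ret = fuse_session_loop(se);
+ *            fuse_session_unmount(se);
+ *        }
+ *        if (se) {
+ *            fuse_session_destroy(se);
+ *        }
+ *        fuse_opt_free_args(&args);
+ *        return ret;
+ *    }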
+ * + * @param se the session + */ +void fuse_session_exit(struct fuse_session *se); + +/** + * Reset the terminated flag of a session + * + * @param se the session + */ +void fuse_session_reset(struct fuse_session *se); + +/** + * Query the terminated flag of a session + * + * @param se the session + * @return 1 if exited, 0 if not exited + */ +int fuse_session_exited(struct fuse_session *se); + +/** + * Ensure that file system is unmounted. + * + * In regular operation, the file system is typically unmounted by the + * user calling umount(8) or fusermount(1), which then terminates the + * FUSE session loop. However, the session loop may also terminate as + * a result of an explicit call to fuse_session_exit() (e.g. by a + * signal handler installed by fuse_set_signal_handler()). In this + * case the filesystem remains mounted, but any attempt to access it + * will block (while the filesystem process is still running) or give + * an ESHUTDOWN error (after the filesystem process has terminated). + * + * If the communication channel with the FUSE kernel module is still + * open (i.e., if the session loop was terminated by an explicit call + * to fuse_session_exit()), this function will close it and unmount + * the filesystem. If the communication channel has been closed by the + * kernel, this method will do (almost) nothing. + * + * NOTE: The above semantics mean that if the connection to the kernel + * is terminated via the ``/sys/fs/fuse/connections/NNN/abort`` file, + * this method will *not* unmount the filesystem. + * + * @param se the session + */ +void fuse_session_unmount(struct fuse_session *se); + +/** + * Destroy a session + * + * @param se the session + */ +void fuse_session_destroy(struct fuse_session *se); + +/* + * Custom event loop support + */ + +/** + * Return file descriptor for communication with kernel. + * + * The file selector can be used to integrate FUSE with a custom event + * loop. Whenever data is available for reading on the provided fd, + * the event loop should call `fuse_session_receive_buf` followed by + * `fuse_session_process_buf` to process the request. + * + * The returned file descriptor is valid until `fuse_session_unmount` + * is called. + * + * @param se the session + * @return a file descriptor + */ +int fuse_session_fd(struct fuse_session *se); + +/** + * Process a raw request supplied in a generic buffer + * + * The fuse_buf may contain a memory buffer or a pipe file descriptor. + * + * @param se the session + * @param buf the fuse_buf containing the request + */ +void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf); + +/** + * Read a raw request from the kernel into the supplied buffer. + * + * Depending on file system options, system capabilities, and request + * size the request is either read into a memory buffer or spliced + * into a temporary pipe. + * + * @param se the session + * @param buf the fuse_buf to store the request in + * @return the actual size of the raw request, or -errno on error + */ +int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf); + +#endif /* FUSE_LOWLEVEL_H_ */ diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h new file mode 100644 index 0000000000..5c618ce21f --- /dev/null +++ b/tools/virtiofsd/fuse_misc.h @@ -0,0 +1,60 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. 
+ * See the file COPYING.LIB + */ + +#include <pthread.h> +#include "config-host.h" + +/* + * Versioned symbols cannot be used in some cases because it + * - confuse the dynamic linker in uClibc + * - not supported on MacOSX (in MachO binary format) + */ +#if (!defined(__UCLIBC__) && !defined(__APPLE__)) +#define FUSE_SYMVER(x) __asm__(x) +#else +#define FUSE_SYMVER(x) +#endif + +#ifndef USE_UCLIBC +#define fuse_mutex_init(mut) pthread_mutex_init(mut, NULL) +#else +/* Is this hack still needed? */ +static inline void fuse_mutex_init(pthread_mutex_t *mut) +{ + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + pthread_mutex_init(mut, &attr); + pthread_mutexattr_destroy(&attr); +} +#endif + +#ifdef HAVE_STRUCT_STAT_ST_ATIM +/* Linux */ +#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec) +#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec) +#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec) +#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val) +#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val) +#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val) +#elif defined(HAVE_STRUCT_STAT_ST_ATIMESPEC) +/* FreeBSD */ +#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec) +#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec) +#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec) +#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val) +#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val) +#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val) +#else +#define ST_ATIM_NSEC(stbuf) 0 +#define ST_CTIM_NSEC(stbuf) 0 +#define ST_MTIM_NSEC(stbuf) 0 +#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0) +#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0) +#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0) +#endif diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c new file mode 100644 index 0000000000..28922361a2 --- /dev/null +++ b/tools/virtiofsd/fuse_opt.c @@ -0,0 +1,450 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Implementation of option parsing routines (dealing with `struct + * fuse_args`). + * + * This program can be distributed under the terms of the GNU LGPLv2. 
+ * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_opt.h" +#include "fuse_i.h" +#include "fuse_misc.h" + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +struct fuse_opt_context { + void *data; + const struct fuse_opt *opt; + fuse_opt_proc_t proc; + int argctr; + int argc; + char **argv; + struct fuse_args outargs; + char *opts; + int nonopt; +}; + +void fuse_opt_free_args(struct fuse_args *args) +{ + if (args) { + if (args->argv && args->allocated) { + int i; + for (i = 0; i < args->argc; i++) { + free(args->argv[i]); + } + free(args->argv); + } + args->argc = 0; + args->argv = NULL; + args->allocated = 0; + } +} + +static int alloc_failed(void) +{ + fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); + return -1; +} + +int fuse_opt_add_arg(struct fuse_args *args, const char *arg) +{ + char **newargv; + char *newarg; + + assert(!args->argv || args->allocated); + + newarg = strdup(arg); + if (!newarg) { + return alloc_failed(); + } + + newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); + if (!newargv) { + free(newarg); + return alloc_failed(); + } + + args->argv = newargv; + args->allocated = 1; + args->argv[args->argc++] = newarg; + args->argv[args->argc] = NULL; + return 0; +} + +static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos, + const char *arg) +{ + assert(pos <= args->argc); + if (fuse_opt_add_arg(args, arg) == -1) { + return -1; + } + + if (pos != args->argc - 1) { + char *newarg = args->argv[args->argc - 1]; + memmove(&args->argv[pos + 1], &args->argv[pos], + sizeof(char *) * (args->argc - pos - 1)); + args->argv[pos] = newarg; + } + return 0; +} + +int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg) +{ + return fuse_opt_insert_arg_common(args, pos, arg); +} + +static int next_arg(struct fuse_opt_context *ctx, const char *opt) +{ + if (ctx->argctr + 1 >= ctx->argc) { + fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); + return -1; + } + ctx->argctr++; + return 0; +} + +static int add_arg(struct fuse_opt_context *ctx, const char *arg) +{ + return fuse_opt_add_arg(&ctx->outargs, arg); +} + +static int add_opt_common(char **opts, const char *opt, int esc) +{ + unsigned oldlen = *opts ? strlen(*opts) : 0; + char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); + + if (!d) { + return alloc_failed(); + } + + *opts = d; + if (oldlen) { + d += oldlen; + *d++ = ','; + } + + for (; *opt; opt++) { + if (esc && (*opt == ',' || *opt == '\\')) { + *d++ = '\\'; + } + *d++ = *opt; + } + *d = '\0'; + + return 0; +} + +int fuse_opt_add_opt(char **opts, const char *opt) +{ + return add_opt_common(opts, opt, 0); +} + +int fuse_opt_add_opt_escaped(char **opts, const char *opt) +{ + return add_opt_common(opts, opt, 1); +} + +static int add_opt(struct fuse_opt_context *ctx, const char *opt) +{ + return add_opt_common(&ctx->opts, opt, 1); +} + +static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key, + int iso) +{ + if (key == FUSE_OPT_KEY_DISCARD) { + return 0; + } + + if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { + int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); + if (res == -1 || !res) { + return res; + } + } + if (iso) { + return add_opt(ctx, arg); + } else { + return add_arg(ctx, arg); + } +} + +static int match_template(const char *t, const char *arg, unsigned *sepp) +{ + int arglen = strlen(arg); + const char *sep = strchr(t, '='); + sep = sep ? 
sep : strchr(t, ' '); + if (sep && (!sep[1] || sep[1] == '%')) { + int tlen = sep - t; + if (sep[0] == '=') { + tlen++; + } + if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { + *sepp = sep - t; + return 1; + } + } + if (strcmp(t, arg) == 0) { + *sepp = 0; + return 1; + } + return 0; +} + +static const struct fuse_opt *find_opt(const struct fuse_opt *opt, + const char *arg, unsigned *sepp) +{ + for (; opt && opt->templ; opt++) { + if (match_template(opt->templ, arg, sepp)) { + return opt; + } + } + return NULL; +} + +int fuse_opt_match(const struct fuse_opt *opts, const char *opt) +{ + unsigned dummy; + return find_opt(opts, opt, &dummy) ? 1 : 0; +} + +static int process_opt_param(void *var, const char *format, const char *param, + const char *arg) +{ + assert(format[0] == '%'); + if (format[1] == 's') { + char **s = var; + char *copy = strdup(param); + if (!copy) { + return alloc_failed(); + } + + free(*s); + *s = copy; + } else { + if (sscanf(param, format, var) != 1) { + fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", + arg); + return -1; + } + } + return 0; +} + +static int process_opt(struct fuse_opt_context *ctx, const struct fuse_opt *opt, + unsigned sep, const char *arg, int iso) +{ + if (opt->offset == -1U) { + if (call_proc(ctx, arg, opt->value, iso) == -1) { + return -1; + } + } else { + void *var = (char *)ctx->data + opt->offset; + if (sep && opt->templ[sep + 1]) { + const char *param = arg + sep; + if (opt->templ[sep] == '=') { + param++; + } + if (process_opt_param(var, opt->templ + sep + 1, param, arg) == + -1) { + return -1; + } + } else { + *(int *)var = opt->value; + } + } + return 0; +} + +static int process_opt_sep_arg(struct fuse_opt_context *ctx, + const struct fuse_opt *opt, unsigned sep, + const char *arg, int iso) +{ + int res; + char *newarg; + char *param; + + if (next_arg(ctx, arg) == -1) { + return -1; + } + + param = ctx->argv[ctx->argctr]; + newarg = malloc(sep + strlen(param) + 1); + if (!newarg) { + return alloc_failed(); + } + + memcpy(newarg, arg, sep); + strcpy(newarg + sep, param); + res = process_opt(ctx, opt, sep, newarg, iso); + free(newarg); + + return res; +} + +static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso) +{ + unsigned sep; + const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); + if (opt) { + for (; opt; opt = find_opt(opt + 1, arg, &sep)) { + int res; + if (sep && opt->templ[sep] == ' ' && !arg[sep]) { + res = process_opt_sep_arg(ctx, opt, sep, arg, iso); + } else { + res = process_opt(ctx, opt, sep, arg, iso); + } + if (res == -1) { + return -1; + } + } + return 0; + } else { + return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); + } +} + +static int process_real_option_group(struct fuse_opt_context *ctx, char *opts) +{ + char *s = opts; + char *d = s; + int end = 0; + + while (!end) { + if (*s == '\0') { + end = 1; + } + if (*s == ',' || end) { + int res; + + *d = '\0'; + res = process_gopt(ctx, opts, 1); + if (res == -1) { + return -1; + } + d = opts; + } else { + if (s[0] == '\\' && s[1] != '\0') { + s++; + if (s[0] >= '0' && s[0] <= '3' && s[1] >= '0' && s[1] <= '7' && + s[2] >= '0' && s[2] <= '7') { + *d++ = (s[0] - '0') * 0100 + (s[1] - '0') * 0010 + + (s[2] - '0'); + s += 2; + } else { + *d++ = *s; + } + } else { + *d++ = *s; + } + } + s++; + } + + return 0; +} + +static int process_option_group(struct fuse_opt_context *ctx, const char *opts) +{ + int res; + char *copy = strdup(opts); + + if (!copy) { + fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); + return -1; + 
} + res = process_real_option_group(ctx, copy); + free(copy); + return res; +} + +static int process_one(struct fuse_opt_context *ctx, const char *arg) +{ + if (ctx->nonopt || arg[0] != '-') { + return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); + } else if (arg[1] == 'o') { + if (arg[2]) { + return process_option_group(ctx, arg + 2); + } else { + if (next_arg(ctx, arg) == -1) { + return -1; + } + + return process_option_group(ctx, ctx->argv[ctx->argctr]); + } + } else if (arg[1] == '-' && !arg[2]) { + if (add_arg(ctx, arg) == -1) { + return -1; + } + ctx->nonopt = ctx->outargs.argc; + return 0; + } else { + return process_gopt(ctx, arg, 0); + } +} + +static int opt_parse(struct fuse_opt_context *ctx) +{ + if (ctx->argc) { + if (add_arg(ctx, ctx->argv[0]) == -1) { + return -1; + } + } + + for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) { + if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) { + return -1; + } + } + + if (ctx->opts) { + if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || + fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) { + return -1; + } + } + + /* If option separator ("--") is the last argument, remove it */ + if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && + strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { + free(ctx->outargs.argv[ctx->outargs.argc - 1]); + ctx->outargs.argv[--ctx->outargs.argc] = NULL; + } + + return 0; +} + +int fuse_opt_parse(struct fuse_args *args, void *data, + const struct fuse_opt opts[], fuse_opt_proc_t proc) +{ + int res; + struct fuse_opt_context ctx = { + .data = data, + .opt = opts, + .proc = proc, + }; + + if (!args || !args->argv || !args->argc) { + return 0; + } + + ctx.argc = args->argc; + ctx.argv = args->argv; + + res = opt_parse(&ctx); + if (res != -1) { + struct fuse_args tmp = *args; + *args = ctx.outargs; + ctx.outargs = tmp; + } + free(ctx.opts); + fuse_opt_free_args(&ctx.outargs); + return res; +} diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h new file mode 100644 index 0000000000..8f59b4d301 --- /dev/null +++ b/tools/virtiofsd/fuse_opt.h @@ -0,0 +1,272 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#ifndef FUSE_OPT_H_ +#define FUSE_OPT_H_ + +/** @file + * + * This file defines the option parsing interface of FUSE + */ + +/** + * Option description + * + * This structure describes a single option, and action associated + * with it, in case it matches. + * + * More than one such match may occur, in which case the action for + * each match is executed. + * + * There are three possible actions in case of a match: + * + * i) An integer (int or unsigned) variable determined by 'offset' is + * set to 'value' + * + * ii) The processing function is called, with 'value' as the key + * + * iii) An integer (any) or string (char *) variable determined by + * 'offset' is set to the value of an option parameter + * + * 'offset' should normally be either set to + * + * - 'offsetof(struct foo, member)' actions i) and iii) + * + * - -1 action ii) + * + * The 'offsetof()' macro is defined in the <stddef.h> header. + * + * The template determines which options match, and also have an + * effect on the action. Normally the action is either i) or ii), but + * if a format is present in the template, then action iii) is + * performed. 
+ * + * The types of templates are: + * + * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only + * themselves. Invalid values are "--" and anything beginning + * with "-o" + * + * 2) "foo", "foo-bar", etc. These match "-ofoo", "-ofoo-bar" or + * the relevant option in a comma separated option list + * + * 3) "bar=", "--foo=", etc. These are variations of 1) and 2) + * which have a parameter + * + * 4) "bar=%s", "--foo=%lu", etc. Same matching as above but perform + * action iii). + * + * 5) "-x ", etc. Matches either "-xparam" or "-x param" as + * two separate arguments + * + * 6) "-x %s", etc. Combination of 4) and 5) + * + * If the format is "%s", memory is allocated for the string unlike with + * scanf(). The previous value (if non-NULL) stored at the this location is + * freed. + */ +struct fuse_opt { + /** Matching template and optional parameter formatting */ + const char *templ; + + /** + * Offset of variable within 'data' parameter of fuse_opt_parse() + * or -1 + */ + unsigned long offset; + + /** + * Value to set the variable to, or to be passed as 'key' to the + * processing function. Ignored if template has a format + */ + int value; +}; + +/** + * Key option. In case of a match, the processing function will be + * called with the specified key. + */ +#define FUSE_OPT_KEY(templ, key) \ + { \ + templ, -1U, key \ + } + +/** + * Last option. An array of 'struct fuse_opt' must end with a NULL + * template value + */ +#define FUSE_OPT_END \ + { \ + NULL, 0, 0 \ + } + +/** + * Argument list + */ +struct fuse_args { + /** Argument count */ + int argc; + + /** Argument vector. NULL terminated */ + char **argv; + + /** Is 'argv' allocated? */ + int allocated; +}; + +/** + * Initializer for 'struct fuse_args' + */ +#define FUSE_ARGS_INIT(argc, argv) \ + { \ + argc, argv, 0 \ + } + +/** + * Key value passed to the processing function if an option did not + * match any template + */ +#define FUSE_OPT_KEY_OPT -1 + +/** + * Key value passed to the processing function for all non-options + * + * Non-options are the arguments beginning with a character other than + * '-' or all arguments after the special '--' option + */ +#define FUSE_OPT_KEY_NONOPT -2 + +/** + * Special key value for options to keep + * + * Argument is not passed to processing function, but behave as if the + * processing function returned 1 + */ +#define FUSE_OPT_KEY_KEEP -3 + +/** + * Special key value for options to discard + * + * Argument is not passed to processing function, but behave as if the + * processing function returned zero + */ +#define FUSE_OPT_KEY_DISCARD -4 + +/** + * Processing function + * + * This function is called if + * - option did not match any 'struct fuse_opt' + * - argument is a non-option + * - option did match and offset was set to -1 + * + * The 'arg' parameter will always contain the whole argument or + * option including the parameter if exists. A two-argument option + * ("-x foo") is always converted to single argument option of the + * form "-xfoo" before this function is called. + * + * Options of the form '-ofoo' are passed to this function without the + * '-o' prefix. + * + * The return value of this function determines whether this argument + * is to be inserted into the output argument vector, or discarded. 
+ * + * @param data is the user data passed to the fuse_opt_parse() function + * @param arg is the whole argument or option + * @param key determines why the processing function was called + * @param outargs the current output argument list + * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept + */ +typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, + struct fuse_args *outargs); + +/** + * Option parsing function + * + * If 'args' was returned from a previous call to fuse_opt_parse() or + * it was constructed from + * + * A NULL 'args' is equivalent to an empty argument vector + * + * A NULL 'opts' is equivalent to an 'opts' array containing a single + * end marker + * + * A NULL 'proc' is equivalent to a processing function always + * returning '1' + * + * @param args is the input and output argument list + * @param data is the user data + * @param opts is the option description array + * @param proc is the processing function + * @return -1 on error, 0 on success + */ +int fuse_opt_parse(struct fuse_args *args, void *data, + const struct fuse_opt opts[], fuse_opt_proc_t proc); + +/** + * Add an option to a comma separated option list + * + * @param opts is a pointer to an option list, may point to a NULL value + * @param opt is the option to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_add_opt(char **opts, const char *opt); + +/** + * Add an option, escaping commas, to a comma separated option list + * + * @param opts is a pointer to an option list, may point to a NULL value + * @param opt is the option to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_add_opt_escaped(char **opts, const char *opt); + +/** + * Add an argument to a NULL terminated argument vector + * + * @param args is the structure containing the current argument list + * @param arg is the new argument to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_add_arg(struct fuse_args *args, const char *arg); + +/** + * Add an argument at the specified position in a NULL terminated + * argument vector + * + * Adds the argument to the N-th position. This is useful for adding + * options at the beginning of the array which must not come after the + * special '--' option. + * + * @param args is the structure containing the current argument list + * @param pos is the position at which to add the argument + * @param arg is the new argument to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg); + +/** + * Free the contents of argument list + * + * The structure itself is not freed + * + * @param args is the structure containing the argument list + */ +void fuse_opt_free_args(struct fuse_args *args); + + +/** + * Check if an option matches + * + * @param opts is the option description array + * @param opt is the option to match + * @return 1 if a match is found, 0 if not + */ +int fuse_opt_match(const struct fuse_opt opts[], const char *opt); + +#endif /* FUSE_OPT_H_ */ diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c new file mode 100644 index 0000000000..f18625b6e2 --- /dev/null +++ b/tools/virtiofsd/fuse_signals.c @@ -0,0 +1,98 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Utility functions for setting signal handlers. + * + * This program can be distributed under the terms of the GNU LGPLv2. 
+ * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "fuse_lowlevel.h" + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +static struct fuse_session *fuse_instance; + +static void exit_handler(int sig) +{ + if (fuse_instance) { + fuse_session_exit(fuse_instance); + if (sig <= 0) { + fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); + abort(); + } + fuse_instance->error = sig; + } +} + +static void do_nothing(int sig) +{ + (void)sig; +} + +static int set_one_signal_handler(int sig, void (*handler)(int), int remove) +{ + struct sigaction sa; + struct sigaction old_sa; + + memset(&sa, 0, sizeof(struct sigaction)); + sa.sa_handler = remove ? SIG_DFL : handler; + sigemptyset(&(sa.sa_mask)); + sa.sa_flags = 0; + + if (sigaction(sig, NULL, &old_sa) == -1) { + fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n", + strerror(errno)); + return -1; + } + + if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && + sigaction(sig, &sa, NULL) == -1) { + fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int fuse_set_signal_handlers(struct fuse_session *se) +{ + /* + * If we used SIG_IGN instead of the do_nothing function, + * then we would be unable to tell if we set SIG_IGN (and + * thus should reset to SIG_DFL in fuse_remove_signal_handlers) + * or if it was already set to SIG_IGN (and should be left + * untouched. + */ + if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || + set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || + set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || + set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) { + return -1; + } + + fuse_instance = se; + return 0; +} + +void fuse_remove_signal_handlers(struct fuse_session *se) +{ + if (fuse_instance != se) { + fuse_log(FUSE_LOG_ERR, + "fuse: fuse_remove_signal_handlers: unknown session\n"); + } else { + fuse_instance = NULL; + } + + set_one_signal_handler(SIGHUP, exit_handler, 1); + set_one_signal_handler(SIGINT, exit_handler, 1); + set_one_signal_handler(SIGTERM, exit_handler, 1); + set_one_signal_handler(SIGPIPE, do_nothing, 1); +} diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c new file mode 100644 index 0000000000..80a6e929df --- /dev/null +++ b/tools/virtiofsd/fuse_virtio.c @@ -0,0 +1,986 @@ +/* + * virtio-fs glue for FUSE + * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert <dgilbert@redhat.com> + * + * Implements the glue between libfuse and libvhost-user + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "qemu/iov.h" +#include "qapi/error.h" +#include "fuse_i.h" +#include "standard-headers/linux/fuse.h" +#include "fuse_misc.h" +#include "fuse_opt.h" +#include "fuse_virtio.h" + +#include <assert.h> +#include <errno.h> +#include <glib.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/eventfd.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> +#include <unistd.h> + +#include "contrib/libvhost-user/libvhost-user.h" + +struct fv_VuDev; +struct fv_QueueInfo { + pthread_t thread; + /* + * This lock protects the VuVirtq preventing races between + * fv_queue_thread() and fv_queue_worker(). 
+ */ + pthread_mutex_t vq_lock; + + struct fv_VuDev *virtio_dev; + + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; + int kill_fd; /* For killing the thread */ +}; + +/* A FUSE request */ +typedef struct { + VuVirtqElement elem; + struct fuse_chan ch; + + /* Used to complete requests that involve no reply */ + bool reply_sent; +} FVRequest; + +/* + * We pass the dev element into libvhost-user + * and then use it to get back to the outer + * container for other data. + */ +struct fv_VuDev { + VuDev dev; + struct fuse_session *se; + + /* + * Either handle virtqueues or vhost-user protocol messages. Don't do + * both at the same time since that could lead to race conditions if + * virtqueues or memory tables change while another thread is accessing + * them. + * + * The assumptions are: + * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev. + * 2. virtio_loop() reads/writes virtqueues and VuDev. + */ + pthread_rwlock_t vu_dispatch_rwlock; + + /* + * The following pair of fields are only accessed in the main + * virtio_loop + */ + size_t nqueues; + struct fv_QueueInfo **qi; +}; + +/* From spec */ +struct virtio_fs_config { + char tag[36]; + uint32_t num_queues; +}; + +/* Callback from libvhost-user */ +static uint64_t fv_get_features(VuDev *dev) +{ + return 1ULL << VIRTIO_F_VERSION_1; +} + +/* Callback from libvhost-user */ +static void fv_set_features(VuDev *dev, uint64_t features) +{ +} + +/* + * Callback from libvhost-user if there's a new fd we're supposed to listen + * to, typically a queue kick? + */ +static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb, + void *data) +{ + fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); +} + +/* + * Callback from libvhost-user if we're no longer supposed to listen on an fd + */ +static void fv_remove_watch(VuDev *dev, int fd) +{ + fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); +} + +/* Callback from libvhost-user to panic */ +static void fv_panic(VuDev *dev, const char *err) +{ + fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err); + /* TODO: Allow reconnects?? */ + exit(EXIT_FAILURE); +} + +/* + * Copy from an iovec into a fuse_buf (memory only) + * Caller must ensure there is space + */ +static void copy_from_iov(struct fuse_buf *buf, size_t out_num, + const struct iovec *out_sg) +{ + void *dest = buf->mem; + + while (out_num) { + size_t onelen = out_sg->iov_len; + memcpy(dest, out_sg->iov_base, onelen); + dest += onelen; + out_sg++; + out_num--; + } +} + +/* + * Copy from one iov to another, the given number of bytes + * The caller must have checked sizes. + */ +static void copy_iov(struct iovec *src_iov, int src_count, + struct iovec *dst_iov, int dst_count, size_t to_copy) +{ + size_t dst_offset = 0; + /* Outer loop copies 'src' elements */ + while (to_copy) { + assert(src_count); + size_t src_len = src_iov[0].iov_len; + size_t src_offset = 0; + + if (src_len > to_copy) { + src_len = to_copy; + } + /* Inner loop copies contents of one 'src' to maybe multiple dst. 
*/ + while (src_len) { + assert(dst_count); + size_t dst_len = dst_iov[0].iov_len - dst_offset; + if (dst_len > src_len) { + dst_len = src_len; + } + + memcpy(dst_iov[0].iov_base + dst_offset, + src_iov[0].iov_base + src_offset, dst_len); + src_len -= dst_len; + to_copy -= dst_len; + src_offset += dst_len; + dst_offset += dst_len; + + assert(dst_offset <= dst_iov[0].iov_len); + if (dst_offset == dst_iov[0].iov_len) { + dst_offset = 0; + dst_iov++; + dst_count--; + } + } + src_iov++; + src_count--; + } +} + +/* + * Called back by ll whenever it wants to send a reply/message back + * The 1st element of the iov starts with the fuse_out_header + * 'unique'==0 means it's a notify message. + */ +int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count) +{ + FVRequest *req = container_of(ch, FVRequest, ch); + struct fv_QueueInfo *qi = ch->qi; + VuDev *dev = &se->virtio_dev->dev; + VuVirtq *q = vu_get_queue(dev, qi->qidx); + VuVirtqElement *elem = &req->elem; + int ret = 0; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); + + struct fuse_out_header *out = iov[0].iov_base; + /* TODO: Endianness! */ + + size_t tosend_len = iov_size(iov, count); + + /* unique == 0 is notification, which we don't support */ + assert(out->unique); + assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; + struct iovec *in_sg = elem->in_sg; + size_t in_len = iov_size(in_sg, in_num); + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", + __func__, elem->index, in_num, in_len); + + /* + * The elem should have room for a 'fuse_out_header' (out from fuse) + * plus the data based on the len in the header. + */ + if (in_len < sizeof(struct fuse_out_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", + __func__, elem->index); + ret = -E2BIG; + goto err; + } + if (in_len < tosend_len) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", + __func__, elem->index, tosend_len); + ret = -E2BIG; + goto err; + } + + copy_iov(iov, count, in_sg, in_num, tosend_len); + + pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + pthread_mutex_lock(&qi->vq_lock); + vu_queue_push(dev, q, elem, tosend_len); + vu_queue_notify(dev, q); + pthread_mutex_unlock(&qi->vq_lock); + pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + + req->reply_sent = true; + +err: + return ret; +} + +/* + * Callback from fuse_send_data_iov_* when it's virtio and the buffer + * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK + * We need send the iov and then the buffer. + * Return 0 on success + */ +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count, struct fuse_bufvec *buf, + size_t len) +{ + FVRequest *req = container_of(ch, FVRequest, ch); + struct fv_QueueInfo *qi = ch->qi; + VuDev *dev = &se->virtio_dev->dev; + VuVirtq *q = vu_get_queue(dev, qi->qidx); + VuVirtqElement *elem = &req->elem; + int ret = 0; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); + + struct fuse_out_header *out = iov[0].iov_base; + /* TODO: Endianness! 
*/ + + size_t iov_len = iov_size(iov, count); + size_t tosend_len = iov_len + len; + + out->len = tosend_len; + + fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__, + count, len, iov_len); + + /* unique == 0 is notification which we don't support */ + assert(out->unique); + + assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; + struct iovec *in_sg = elem->in_sg; + size_t in_len = iov_size(in_sg, in_num); + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", + __func__, elem->index, in_num, in_len); + + /* + * The elem should have room for a 'fuse_out_header' (out from fuse) + * plus the data based on the len in the header. + */ + if (in_len < sizeof(struct fuse_out_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", + __func__, elem->index); + ret = E2BIG; + goto err; + } + if (in_len < tosend_len) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", + __func__, elem->index, tosend_len); + ret = E2BIG; + goto err; + } + + /* TODO: Limit to 'len' */ + + /* First copy the header data from iov->in_sg */ + copy_iov(iov, count, in_sg, in_num, iov_len); + + /* + * Build a copy of the the in_sg iov so we can skip bits in it, + * including changing the offsets + */ + struct iovec *in_sg_cpy = calloc(sizeof(struct iovec), in_num); + assert(in_sg_cpy); + memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num); + /* These get updated as we skip */ + struct iovec *in_sg_ptr = in_sg_cpy; + int in_sg_cpy_count = in_num; + + /* skip over parts of in_sg that contained the header iov */ + size_t skip_size = iov_len; + + size_t in_sg_left = 0; + do { + while (skip_size != 0 && in_sg_cpy_count) { + if (skip_size >= in_sg_ptr[0].iov_len) { + skip_size -= in_sg_ptr[0].iov_len; + in_sg_ptr++; + in_sg_cpy_count--; + } else { + in_sg_ptr[0].iov_len -= skip_size; + in_sg_ptr[0].iov_base += skip_size; + break; + } + } + + int i; + for (i = 0, in_sg_left = 0; i < in_sg_cpy_count; i++) { + in_sg_left += in_sg_ptr[i].iov_len; + } + fuse_log(FUSE_LOG_DEBUG, + "%s: after skip skip_size=%zd in_sg_cpy_count=%d " + "in_sg_left=%zd\n", + __func__, skip_size, in_sg_cpy_count, in_sg_left); + ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count, + buf->buf[0].pos); + + if (ret == -1) { + ret = errno; + fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n", + __func__, len); + free(in_sg_cpy); + goto err; + } + fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__, + ret, len); + if (ret < len && ret) { + fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__); + /* Skip over this much next time around */ + skip_size = ret; + buf->buf[0].pos += ret; + len -= ret; + + /* Lets do another read */ + continue; + } + if (!ret) { + /* EOF case? 
*/ + fuse_log(FUSE_LOG_DEBUG, "%s: !ret in_sg_left=%zd\n", __func__, + in_sg_left); + break; + } + if (ret != len) { + fuse_log(FUSE_LOG_DEBUG, "%s: ret!=len\n", __func__); + ret = EIO; + free(in_sg_cpy); + goto err; + } + in_sg_left -= ret; + len -= ret; + } while (in_sg_left); + free(in_sg_cpy); + + /* Need to fix out->len on EOF */ + if (len) { + struct fuse_out_header *out_sg = in_sg[0].iov_base; + + tosend_len -= len; + out_sg->len = tosend_len; + } + + ret = 0; + + pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + pthread_mutex_lock(&qi->vq_lock); + vu_queue_push(dev, q, elem, tosend_len); + vu_queue_notify(dev, q); + pthread_mutex_unlock(&qi->vq_lock); + pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + +err: + if (ret == 0) { + req->reply_sent = true; + } + + return ret; +} + +/* Process one FVRequest in a thread pool */ +static void fv_queue_worker(gpointer data, gpointer user_data) +{ + struct fv_QueueInfo *qi = user_data; + struct fuse_session *se = qi->virtio_dev->se; + struct VuDev *dev = &qi->virtio_dev->dev; + FVRequest *req = data; + VuVirtqElement *elem = &req->elem; + struct fuse_buf fbuf = {}; + bool allocated_bufv = false; + struct fuse_bufvec bufv; + struct fuse_bufvec *pbufv; + + assert(se->bufsize > sizeof(struct fuse_in_header)); + + /* + * An element contains one request and the space to send our response + * They're spread over multiple descriptors in a scatter/gather set + * and we can't trust the guest to keep them still; so copy in/out. + */ + fbuf.mem = malloc(se->bufsize); + assert(fbuf.mem); + + fuse_mutex_init(&req->ch.lock); + req->ch.fd = -1; + req->ch.qi = qi; + + /* The 'out' part of the elem is from qemu */ + unsigned int out_num = elem->out_num; + struct iovec *out_sg = elem->out_sg; + size_t out_len = iov_size(out_sg, out_num); + fuse_log(FUSE_LOG_DEBUG, + "%s: elem %d: with %d out desc of length %zd\n", + __func__, elem->index, out_num, out_len); + + /* + * The elem should contain a 'fuse_in_header' (in to fuse) + * plus the data based on the len in the header. + */ + if (out_len < sizeof(struct fuse_in_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", + __func__, elem->index); + assert(0); /* TODO */ + } + if (out_len > se->bufsize) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__, + elem->index); + assert(0); /* TODO */ + } + /* Copy just the first element and look at it */ + copy_from_iov(&fbuf, 1, out_sg); + + pbufv = NULL; /* Compiler thinks an unitialised path */ + if (out_num > 2 && + out_sg[0].iov_len == sizeof(struct fuse_in_header) && + ((struct fuse_in_header *)fbuf.mem)->opcode == FUSE_WRITE && + out_sg[1].iov_len == sizeof(struct fuse_write_in)) { + /* + * For a write we don't actually need to copy the + * data, we can just do it straight out of guest memory + * but we must still copy the headers in case the guest + * was nasty and changed them while we were using them. 
+ */ + fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); + + /* copy the fuse_write_in header after the fuse_in_header */ + fbuf.mem += out_sg->iov_len; + copy_from_iov(&fbuf, 1, out_sg + 1); + fbuf.mem -= out_sg->iov_len; + fbuf.size = out_sg[0].iov_len + out_sg[1].iov_len; + + /* Allocate the bufv, with space for the rest of the iov */ + pbufv = malloc(sizeof(struct fuse_bufvec) + + sizeof(struct fuse_buf) * (out_num - 2)); + if (!pbufv) { + fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", + __func__); + goto out; + } + + allocated_bufv = true; + pbufv->count = 1; + pbufv->buf[0] = fbuf; + + size_t iovindex, pbufvindex; + iovindex = 2; /* 2 headers, separate iovs */ + pbufvindex = 1; /* 2 headers, 1 fusebuf */ + + for (; iovindex < out_num; iovindex++, pbufvindex++) { + pbufv->count++; + pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ + pbufv->buf[pbufvindex].flags = 0; + pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; + pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; + } + } else { + /* Normal (non fast write) path */ + + /* Copy the rest of the buffer */ + fbuf.mem += out_sg->iov_len; + copy_from_iov(&fbuf, out_num - 1, out_sg + 1); + fbuf.mem -= out_sg->iov_len; + fbuf.size = out_len; + + /* TODO! Endianness of header */ + + /* TODO: Add checks for fuse_session_exited */ + bufv.buf[0] = fbuf; + bufv.count = 1; + pbufv = &bufv; + } + pbufv->idx = 0; + pbufv->off = 0; + fuse_session_process_buf_int(se, pbufv, &req->ch); + +out: + if (allocated_bufv) { + free(pbufv); + } + + /* If the request has no reply, still recycle the virtqueue element */ + if (!req->reply_sent) { + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); + + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__, + elem->index); + + pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + pthread_mutex_lock(&qi->vq_lock); + vu_queue_push(dev, q, elem, 0); + vu_queue_notify(dev, q); + pthread_mutex_unlock(&qi->vq_lock); + pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + } + + pthread_mutex_destroy(&req->ch.lock); + free(fbuf.mem); + free(req); +} + +/* Thread function for individual queues, created when a queue is 'started' */ +static void *fv_queue_thread(void *opaque) +{ + struct fv_QueueInfo *qi = opaque; + struct VuDev *dev = &qi->virtio_dev->dev; + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); + struct fuse_session *se = qi->virtio_dev->se; + GThreadPool *pool; + + pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, TRUE, + NULL); + if (!pool) { + fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); + return NULL; + } + + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { + struct pollfd pf[2]; + int ret; + + pf[0].fd = qi->kick_fd; + pf[0].events = POLLIN; + pf[0].revents = 0; + pf[1].fd = qi->kill_fd; + pf[1].events = POLLIN; + pf[1].revents = 0; + + fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__, + qi->qidx); + int poll_res = ppoll(pf, 2, NULL, NULL); + + if (poll_res == -1) { + if (errno == EINTR) { + fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", + __func__); + continue; + } + fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n"); + break; + } + assert(poll_res >= 1); + if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n", + __func__, pf[0].revents, qi->qidx); + break; + } + if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR,
+ "%s: Unexpected poll revents %x Queue %d killfd\n", + __func__, pf[1].revents, qi->qidx); + break; + } + if (pf[1].revents) { + fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n", + __func__, qi->qidx); + break; + } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__, + qi->qidx); + + eventfd_t evalue; + if (eventfd_read(qi->kick_fd, &evalue)) { + fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); + break; + } + /* Mutual exclusion with virtio_loop() */ + ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + assert(ret == 0); /* there is no possible error case */ + pthread_mutex_lock(&qi->vq_lock); + /* out is from guest, in is to guest */ + unsigned int in_bytes, out_bytes; + vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); + + fuse_log(FUSE_LOG_DEBUG, + "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + + while (1) { + FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest)); + if (!req) { + break; + } + + req->reply_sent = false; + + g_thread_pool_push(pool, req, NULL); + } + + pthread_mutex_unlock(&qi->vq_lock); + pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + } + + g_thread_pool_free(pool, FALSE, TRUE); + + return NULL; +} + +static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) +{ + int ret; + struct fv_QueueInfo *ourqi; + + assert(qidx < vud->nqueues); + ourqi = vud->qi[qidx]; + + /* Kill the thread */ + if (eventfd_write(ourqi->kill_fd, 1)) { + fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n", + qidx, strerror(errno)); + } + ret = pthread_join(ourqi->thread, NULL); + if (ret) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n", + __func__, qidx, ret); + } + pthread_mutex_destroy(&ourqi->vq_lock); + close(ourqi->kill_fd); + ourqi->kick_fd = -1; + free(vud->qi[qidx]); + vud->qi[qidx] = NULL; +} + +/* Callback from libvhost-user on start or stop of a queue */ +static void fv_queue_set_started(VuDev *dev, int qidx, bool started) +{ + struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev); + struct fv_QueueInfo *ourqi; + + fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx, + started); + assert(qidx >= 0); + + /* + * Ignore additional request queues for now. passthrough_ll.c must be + * audited for thread-safety issues first. It was written with a + * well-behaved client in mind and may not protect against all types of + * races yet.
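+     * (Queue index 0 is the high-priority queue and index 1 the first
+     * request queue, so anything above index 1 is rejected below.)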
+ */ + if (qidx > 1) { + fuse_log(FUSE_LOG_ERR, + "%s: multiple request queues not yet implemented, please only " + "configure 1 request queue\n", + __func__); + exit(EXIT_FAILURE); + } + + if (started) { + /* Fire up a thread to watch this queue */ + if (qidx >= vud->nqueues) { + vud->qi = realloc(vud->qi, (qidx + 1) * sizeof(vud->qi[0])); + assert(vud->qi); + memset(vud->qi + vud->nqueues, 0, + sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues))); + vud->nqueues = qidx + 1; + } + if (!vud->qi[qidx]) { + vud->qi[qidx] = calloc(sizeof(struct fv_QueueInfo), 1); + assert(vud->qi[qidx]); + vud->qi[qidx]->virtio_dev = vud; + vud->qi[qidx]->qidx = qidx; + } else { + /* Shouldn't have been started */ + assert(vud->qi[qidx]->kick_fd == -1); + } + ourqi = vud->qi[qidx]; + ourqi->kick_fd = dev->vq[qidx].kick_fd; + + ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE); + assert(ourqi->kill_fd != -1); + pthread_mutex_init(&ourqi->vq_lock, NULL); + + if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", + __func__, qidx); + assert(0); + } + } else { + fv_queue_cleanup_thread(vud, qidx); + } +} + +static bool fv_queue_order(VuDev *dev, int qidx) +{ + return false; +} + +static const VuDevIface fv_iface = { + .get_features = fv_get_features, + .set_features = fv_set_features, + + /* Don't need process message, we've not got any at vhost-user level */ + .queue_set_started = fv_queue_set_started, + + .queue_is_processed_in_order = fv_queue_order, +}; + +/* + * Main loop; this mostly deals with events on the vhost-user + * socket itself, and not actual fuse data. + */ +int virtio_loop(struct fuse_session *se) +{ + fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__); + + while (!fuse_session_exited(se)) { + struct pollfd pf[1]; + bool ok; + int ret; + pf[0].fd = se->vu_socketfd; + pf[0].events = POLLIN; + pf[0].revents = 0; + + fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__); + int poll_res = ppoll(pf, 1, NULL, NULL); + + if (poll_res == -1) { + if (errno == EINTR) { + fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", + __func__); + continue; + } + fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n"); + break; + } + assert(poll_res == 1); + if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__, + pf[0].revents); + break; + } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); + /* Mutual exclusion with fv_queue_thread() */ + ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock); + assert(ret == 0); /* there is no possible error case */ + + ok = vu_dispatch(&se->virtio_dev->dev); + + pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock); + + if (!ok) { + fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); + break; + } + } + + /* + * Make sure all fv_queue_thread()s quit on exit, as we're about to + * free virtio dev and fuse session, no one should access them anymore. 
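+     * fv_queue_cleanup_thread() signals each thread's kill_fd and joins it,
+     * so once this loop finishes no worker thread is left running.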
+ */ + for (int i = 0; i < se->virtio_dev->nqueues; i++) { + if (!se->virtio_dev->qi[i]) { + continue; + } + + fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i); + fv_queue_cleanup_thread(se->virtio_dev, i); + } + + fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); + + return 0; +} + +static void strreplace(char *s, char old, char new) +{ + for (; *s; ++s) { + if (*s == old) { + *s = new; + } + } +} + +static bool fv_socket_lock(struct fuse_session *se) +{ + g_autofree gchar *sk_name = NULL; + g_autofree gchar *pidfile = NULL; + g_autofree gchar *dir = NULL; + Error *local_err = NULL; + + dir = qemu_get_local_state_pathname("run/virtiofsd"); + + if (g_mkdir_with_parents(dir, S_IRWXU) < 0) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s", + __func__, dir, strerror(errno)); + return false; + } + + sk_name = g_strdup(se->vu_socket_path); + strreplace(sk_name, '/', '.'); + pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name); + + if (!qemu_write_pidfile(pidfile, &local_err)) { + error_report_err(local_err); + return false; + } + + return true; +} + +static int fv_create_listen_socket(struct fuse_session *se) +{ + struct sockaddr_un un; + mode_t old_umask; + + /* Nothing to do if fd is already initialized */ + if (se->vu_listen_fd >= 0) { + return 0; + } + + if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) { + fuse_log(FUSE_LOG_ERR, "Socket path too long\n"); + return -1; + } + + if (!strlen(se->vu_socket_path)) { + fuse_log(FUSE_LOG_ERR, "Socket path is empty\n"); + return -1; + } + + /* Check the vu_socket_path is already used */ + if (!fv_socket_lock(se)) { + return -1; + } + + /* + * Create the Unix socket to communicate with qemu + * based on QEMU's vhost-user-bridge + */ + unlink(se->vu_socket_path); + strcpy(un.sun_path, se->vu_socket_path); + size_t addr_len = sizeof(un); + + int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_sock == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n"); + return -1; + } + un.sun_family = AF_UNIX; + + /* + * Unfortunately bind doesn't let you set the mask on the socket, + * so set umask to 077 and restore it later. + */ + old_umask = umask(0077); + if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); + umask(old_umask); + return -1; + } + umask(old_umask); + + if (listen(listen_sock, 1) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n"); + return -1; + } + + se->vu_listen_fd = listen_sock; + return 0; +} + +int virtio_session_mount(struct fuse_session *se) +{ + int ret; + + ret = fv_create_listen_socket(se); + if (ret < 0) { + return ret; + } + + se->fd = -1; + + fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n", + __func__); + int data_sock = accept(se->vu_listen_fd, NULL, NULL); + if (data_sock == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n"); + close(se->vu_listen_fd); + return -1; + } + close(se->vu_listen_fd); + se->vu_listen_fd = -1; + fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n", + __func__); + + /* TODO: Some cleanup/deallocation! 
*/ + se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1); + if (!se->virtio_dev) { + fuse_log(FUSE_LOG_ERR, "%s: virtio_dev calloc failed\n", __func__); + close(data_sock); + return -1; + } + + se->vu_socketfd = data_sock; + se->virtio_dev->se = se; + pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL); + vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, fv_set_watch, + fv_remove_watch, &fv_iface); + + return 0; +} + +void virtio_session_close(struct fuse_session *se) +{ + close(se->vu_socketfd); + + if (!se->virtio_dev) { + return; + } + + free(se->virtio_dev->qi); + pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock); + free(se->virtio_dev); + se->virtio_dev = NULL; +} diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h new file mode 100644 index 0000000000..111684032c --- /dev/null +++ b/tools/virtiofsd/fuse_virtio.h @@ -0,0 +1,33 @@ +/* + * virtio-fs glue for FUSE + * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert <dgilbert@redhat.com> + * + * Implements the glue between libfuse and libvhost-user + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#ifndef FUSE_VIRTIO_H +#define FUSE_VIRTIO_H + +#include "fuse_i.h" + +struct fuse_session; + +int virtio_session_mount(struct fuse_session *se); +void virtio_session_close(struct fuse_session *se); +int virtio_loop(struct fuse_session *se); + + +int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count); + +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count, + struct fuse_bufvec *buf, size_t len); + +#endif diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c new file mode 100644 index 0000000000..0801cf752c --- /dev/null +++ b/tools/virtiofsd/helper.c @@ -0,0 +1,349 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Helper functions to create (simple) standalone programs. With the + * aid of these functions it should be possible to create full FUSE + * file system by implementing nothing but the request handlers. + + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. 
+ */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "fuse_lowlevel.h" +#include "fuse_misc.h" +#include "fuse_opt.h" + +#include <errno.h> +#include <limits.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/param.h> +#include <unistd.h> + +#define FUSE_HELPER_OPT(t, p) \ + { \ + t, offsetof(struct fuse_cmdline_opts, p), 1 \ + } +#define FUSE_HELPER_OPT_VALUE(t, p, v) \ + { \ + t, offsetof(struct fuse_cmdline_opts, p), v \ + } + +static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("-h", show_help), + FUSE_HELPER_OPT("--help", show_help), + FUSE_HELPER_OPT("-V", show_version), + FUSE_HELPER_OPT("--version", show_version), + FUSE_HELPER_OPT("--print-capabilities", print_capabilities), + FUSE_HELPER_OPT("-d", debug), + FUSE_HELPER_OPT("debug", debug), + FUSE_HELPER_OPT("-d", foreground), + FUSE_HELPER_OPT("debug", foreground), + FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), + FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("-f", foreground), + FUSE_HELPER_OPT_VALUE("--daemonize", foreground, 0), + FUSE_HELPER_OPT("fsname=", nodefault_subtype), + FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_HELPER_OPT("--syslog", syslog), + FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), + FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), + FUSE_HELPER_OPT_VALUE("log_level=warn", log_level, FUSE_LOG_WARNING), + FUSE_HELPER_OPT_VALUE("log_level=err", log_level, FUSE_LOG_ERR), + FUSE_OPT_END +}; + +struct fuse_conn_info_opts { + int atomic_o_trunc; + int no_remote_posix_lock; + int no_remote_flock; + int splice_write; + int splice_move; + int splice_read; + int no_splice_write; + int no_splice_move; + int no_splice_read; + int auto_inval_data; + int no_auto_inval_data; + int no_readdirplus; + int no_readdirplus_auto; + int async_dio; + int no_async_dio; + int writeback_cache; + int no_writeback_cache; + int async_read; + int sync_read; + unsigned max_write; + unsigned max_readahead; + unsigned max_background; + unsigned congestion_threshold; + unsigned time_gran; + int set_max_write; + int set_max_readahead; + int set_max_background; + int set_congestion_threshold; + int set_time_gran; +}; + +#define CONN_OPTION(t, p, v) \ + { \ + t, offsetof(struct fuse_conn_info_opts, p), v \ + } +static const struct fuse_opt conn_info_opt_spec[] = { + CONN_OPTION("max_write=%u", max_write, 0), + CONN_OPTION("max_write=", set_max_write, 1), + CONN_OPTION("max_readahead=%u", max_readahead, 0), + CONN_OPTION("max_readahead=", set_max_readahead, 1), + CONN_OPTION("max_background=%u", max_background, 0), + CONN_OPTION("max_background=", set_max_background, 1), + CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), + CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), + CONN_OPTION("sync_read", sync_read, 1), + CONN_OPTION("async_read", async_read, 1), + CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), + CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), + CONN_OPTION("no_remote_lock", no_remote_flock, 1), + CONN_OPTION("no_remote_flock", no_remote_flock, 1), + CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), + CONN_OPTION("splice_write", splice_write, 1), + CONN_OPTION("no_splice_write", no_splice_write, 1), + CONN_OPTION("splice_move", splice_move, 1), + CONN_OPTION("no_splice_move", no_splice_move, 1), + 
CONN_OPTION("splice_read", splice_read, 1), + CONN_OPTION("no_splice_read", no_splice_read, 1), + CONN_OPTION("auto_inval_data", auto_inval_data, 1), + CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), + CONN_OPTION("readdirplus=no", no_readdirplus, 1), + CONN_OPTION("readdirplus=yes", no_readdirplus, 0), + CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), + CONN_OPTION("readdirplus=auto", no_readdirplus, 0), + CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), + CONN_OPTION("async_dio", async_dio, 1), + CONN_OPTION("no_async_dio", no_async_dio, 1), + CONN_OPTION("writeback_cache", writeback_cache, 1), + CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), + CONN_OPTION("time_gran=%u", time_gran, 0), + CONN_OPTION("time_gran=", set_time_gran, 1), + FUSE_OPT_END +}; + + +void fuse_cmdline_help(void) +{ + printf(" -h --help print help\n" + " -V --version print version\n" + " --print-capabilities print vhost-user.json\n" + " -d -o debug enable debug output (implies -f)\n" + " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" + " -o cache=<mode> cache mode. could be one of \"auto, " + "always, none\"\n" + " default: auto\n" + " -o flock|no_flock enable/disable flock\n" + " default: no_flock\n" + " -o log_level=<level> log level, default to \"info\"\n" + " level could be one of \"debug, " + "info, warn, err\"\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" + " allowed (default: 10)\n" + " -o norace disable racy fallback\n" + " default: false\n" + " -o posix_lock|no_posix_lock\n" + " enable/disable remote posix lock\n" + " default: posix_lock\n" + " -o readdirplus|no_readdirplus\n" + " enable/disable readdirplus\n" + " default: readdirplus except with " + "cache=none\n" + " -o timeout=<number> I/O timeout (seconds)\n" + " default: depends on cache= option.\n" + " -o writeback|no_writeback enable/disable writeback cache\n" + " default: no_writeback\n" + " -o xattr|no_xattr enable/disable xattr\n" + " default: no_xattr\n" + ); +} + +static int fuse_helper_opt_proc(void *data, const char *arg, int key, + struct fuse_args *outargs) +{ + (void)data; + (void)outargs; + + switch (key) { + case FUSE_OPT_KEY_NONOPT: + fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); + return -1; + + default: + /* Pass through unknown options */ + return 1; + } +} + +int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) +{ + memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + + opts->max_idle_threads = 10; + opts->foreground = 1; + + if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == + -1) { + return -1; + } + + return 0; +} + + +int fuse_daemonize(int foreground) +{ + int ret = 0, rett; + if (!foreground) { + int nullfd; + int waiter[2]; + char completed; + + if (pipe(waiter)) { + fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n", + strerror(errno)); + return -1; + } + + /* + * daemonize the current process by forking it and killing the + * parent. This makes the current process a child of 'init'.
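+         * The child signals readiness back over the waiter pipe below, so
+         * the parent only exits once initialization has completed.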
+ */ + switch (fork()) { + case -1: + fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n", + strerror(errno)); + return -1; + case 0: + break; + default: + _exit(read(waiter[0], &completed, + sizeof(completed)) != sizeof(completed)); + } + + if (setsid() == -1) { + fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n", + strerror(errno)); + return -1; + } + + ret = chdir("/"); + + nullfd = open("/dev/null", O_RDWR, 0); + if (nullfd != -1) { + rett = dup2(nullfd, 0); + if (!ret) { + ret = rett; + } + rett = dup2(nullfd, 1); + if (!ret) { + ret = rett; + } + rett = dup2(nullfd, 2); + if (!ret) { + ret = rett; + } + if (nullfd > 2) { + close(nullfd); + } + } + + /* Propagate completion of daemon initialization */ + completed = 1; + rett = write(waiter[1], &completed, sizeof(completed)); + if (!ret) { + ret = rett; + } + close(waiter[0]); + close(waiter[1]); + } else { + ret = chdir("/"); + } + return ret; +} + +void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, + struct fuse_conn_info *conn) +{ + if (opts->set_max_write) { + conn->max_write = opts->max_write; + } + if (opts->set_max_background) { + conn->max_background = opts->max_background; + } + if (opts->set_congestion_threshold) { + conn->congestion_threshold = opts->congestion_threshold; + } + if (opts->set_time_gran) { + conn->time_gran = opts->time_gran; + } + if (opts->set_max_readahead) { + conn->max_readahead = opts->max_readahead; + } + +#define LL_ENABLE(cond, cap) \ + if (cond) \ + conn->want |= (cap) +#define LL_DISABLE(cond, cap) \ + if (cond) \ + conn->want &= ~(cap) + + LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); + LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); + + LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); + LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); + + LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); + LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); + + LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); + LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); + + LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); + LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); + + LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); + LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); + + LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); + LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); + + LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); + LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); + + LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); + LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); +} + +struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args) +{ + struct fuse_conn_info_opts *opts; + + opts = calloc(1, sizeof(struct fuse_conn_info_opts)); + if (opts == NULL) { + fuse_log(FUSE_LOG_ERR, "calloc failed\n"); + return NULL; + } + if (fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { + free(opts); + return NULL; + } + return opts; +} diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h new file mode 100644 index 0000000000..0b98275ed5 --- /dev/null +++ b/tools/virtiofsd/passthrough_helpers.h @@ -0,0 +1,51 @@ +/* + * FUSE: Filesystem in Userspace + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +/* + * Creates files on the underlying file system in response to a FUSE_MKNOD + * operation + */ +static int mknod_wrapper(int dirfd, const char *path, const char *link, + int mode, dev_t rdev) +{ + int res; + + if (S_ISREG(mode)) { + res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); + if (res >= 0) { + res = close(res); + } + } else if (S_ISDIR(mode)) { + res = mkdirat(dirfd, path, mode); + } else if (S_ISLNK(mode) && link != NULL) { + res = symlinkat(link, dirfd, path); + } else if (S_ISFIFO(mode)) { + res = mkfifoat(dirfd, path, mode); + } else { + res = mknodat(dirfd, path, mode, rdev); + } + + return res; +} diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c new file mode 100644 index 0000000000..e6f2399efc --- /dev/null +++ b/tools/virtiofsd/passthrough_ll.c @@ -0,0 +1,3006 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU GPLv2. + * See the file COPYING. + */ + +/* + * + * This file system mirrors the existing file system hierarchy of the + * system, starting at the root file system. This is implemented by + * just "passing through" all requests to the corresponding user-space + * libc functions. In contrast to passthrough.c and passthrough_fh.c, + * this implementation uses the low-level API. Its performance should + * be the least bad among the three, but many operations are not + * implemented. In particular, it is not possible to remove files (or + * directories) because the code necessary to defer actual removal + * until the file is not opened anymore would make the example much + * more complicated. + * + * When writeback caching is enabled (-o writeback mount option), it + * is only possible to write to files for which the mounting user has + * read permissions. This is because the writeback cache requires the + * kernel to be able to issue read requests for all files (which the + * passthrough filesystem cannot satisfy if it can't read the file in + * the underlying filesystem). 
+ * + * Compile with: + * + * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o + * passthrough_ll + * + * ## Source code ## + * \include passthrough_ll.c + */ + +#include "qemu/osdep.h" +#include "qemu/timer.h" +#include "fuse_virtio.h" +#include "fuse_log.h" +#include "fuse_lowlevel.h" +#include <assert.h> +#include <cap-ng.h> +#include <dirent.h> +#include <errno.h> +#include <glib.h> +#include <inttypes.h> +#include <limits.h> +#include <pthread.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/prctl.h> +#include <sys/resource.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/xattr.h> +#include <syslog.h> +#include <unistd.h> + +#include "passthrough_helpers.h" +#include "seccomp.h" + +/* Keep track of inode posix locks for each owner. */ +struct lo_inode_plock { + uint64_t lock_owner; + int fd; /* fd for OFD locks */ +}; + +struct lo_map_elem { + union { + struct lo_inode *inode; + struct lo_dirp *dirp; + int fd; + ssize_t freelist; + }; + bool in_use; +}; + +/* Maps FUSE fh or ino values to internal objects */ +struct lo_map { + struct lo_map_elem *elems; + size_t nelems; + ssize_t freelist; +}; + +struct lo_key { + ino_t ino; + dev_t dev; +}; + +struct lo_inode { + int fd; + + /* + * Atomic reference count for this object. The nlookup field holds a + * reference and release it when nlookup reaches 0. + */ + gint refcount; + + struct lo_key key; + + /* + * This counter keeps the inode alive during the FUSE session. + * Incremented when the FUSE inode number is sent in a reply + * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is + * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc. + * + * Note that this value is untrusted because the client can manipulate + * it arbitrarily using FUSE_FORGET requests. + * + * Protected by lo->mutex. 
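+     * For example, replying to FUSE_LOOKUP for a new inode sets nlookup to 1;
+     * a later FUSE_FORGET with nlookup=1 drops it back to 0 and the inode is
+     * removed from lo->inodes and the ino_map.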
+ */ + uint64_t nlookup; + + fuse_ino_t fuse_ino; + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ + + bool is_symlink; +}; + +struct lo_cred { + uid_t euid; + gid_t egid; +}; + +enum { + CACHE_NONE, + CACHE_AUTO, + CACHE_ALWAYS, +}; + +struct lo_data { + pthread_mutex_t mutex; + int debug; + int norace; + int writeback; + int flock; + int posix_lock; + int xattr; + char *source; + double timeout; + int cache; + int timeout_set; + int readdirplus_set; + int readdirplus_clear; + struct lo_inode root; + GHashTable *inodes; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ + struct lo_map fd_map; /* protected by lo->mutex */ + + /* An O_PATH file descriptor to /proc/self/fd/ */ + int proc_self_fd; +}; + +static const struct fuse_opt lo_opts[] = { + { "writeback", offsetof(struct lo_data, writeback), 1 }, + { "no_writeback", offsetof(struct lo_data, writeback), 0 }, + { "source=%s", offsetof(struct lo_data, source), 0 }, + { "flock", offsetof(struct lo_data, flock), 1 }, + { "no_flock", offsetof(struct lo_data, flock), 0 }, + { "posix_lock", offsetof(struct lo_data, posix_lock), 1 }, + { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 }, + { "xattr", offsetof(struct lo_data, xattr), 1 }, + { "no_xattr", offsetof(struct lo_data, xattr), 0 }, + { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, + { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, + { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE }, + { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, + { "norace", offsetof(struct lo_data, norace), 1 }, + { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, + { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, + FUSE_OPT_END +}; +static bool use_syslog = false; +static int current_log_level; +static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + uint64_t n); + +static struct { + pthread_mutex_t mutex; + void *saved; +} cap; +/* That we loaded cap-ng in the current thread from the saved */ +static __thread bool cap_loaded = 0; + +static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); + +static int is_dot_or_dotdot(const char *name) +{ + return name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); +} + +/* Is `path` a single path component that is not "." or ".."? */ +static int is_safe_path_component(const char *path) +{ + if (strchr(path, '/')) { + return 0; + } + + return !is_dot_or_dotdot(path); +} + +static struct lo_data *lo_data(fuse_req_t req) +{ + return (struct lo_data *)fuse_req_userdata(req); +} + +/* + * Load capng's state from our saved state if the current thread + * hadn't previously been loaded. + * returns 0 on success + */ +static int load_capng(void) +{ + if (!cap_loaded) { + pthread_mutex_lock(&cap.mutex); + capng_restore_state(&cap.saved); + /* + * restore_state free's the saved copy + * so make another. + */ + cap.saved = capng_save_state(); + if (!cap.saved) { + fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); + return -EINVAL; + } + pthread_mutex_unlock(&cap.mutex); + + /* + * We want to use the loaded state for our pid, + * not the original + */ + capng_setpid(syscall(SYS_gettid)); + cap_loaded = true; + } + return 0; +} + +/* + * Helpers for dropping and regaining effective capabilities. 
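+ * They adjust only this thread's effective set, using the per-thread
+ * capng state loaded by load_capng().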
Returns 0 + * on success, error otherwise + */ +static int drop_effective_cap(const char *cap_name, bool *cap_dropped) +{ + int cap, ret; + + cap = capng_name_to_capability(cap_name); + if (cap < 0) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", + cap_name, strerror(errno)); + goto out; + } + + if (load_capng()) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); + goto out; + } + + /* We dont have this capability in effective set already. */ + if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) { + ret = 0; + goto out; + } + + if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n"); + goto out; + } + + if (capng_apply(CAPNG_SELECT_CAPS)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n"); + goto out; + } + + ret = 0; + if (cap_dropped) { + *cap_dropped = true; + } + +out: + return ret; +} + +static int gain_effective_cap(const char *cap_name) +{ + int cap; + int ret = 0; + + cap = capng_name_to_capability(cap_name); + if (cap < 0) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", + cap_name, strerror(errno)); + goto out; + } + + if (load_capng()) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); + goto out; + } + + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n"); + goto out; + } + + if (capng_apply(CAPNG_SELECT_CAPS)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n"); + goto out; + } + ret = 0; + +out: + return ret; +} + +static void lo_map_init(struct lo_map *map) +{ + map->elems = NULL; + map->nelems = 0; + map->freelist = -1; +} + +static void lo_map_destroy(struct lo_map *map) +{ + free(map->elems); +} + +static int lo_map_grow(struct lo_map *map, size_t new_nelems) +{ + struct lo_map_elem *new_elems; + size_t i; + + if (new_nelems <= map->nelems) { + return 1; + } + + new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems); + if (!new_elems) { + return 0; + } + + for (i = map->nelems; i < new_nelems; i++) { + new_elems[i].freelist = i + 1; + new_elems[i].in_use = false; + } + new_elems[new_nelems - 1].freelist = -1; + + map->elems = new_elems; + map->freelist = map->nelems; + map->nelems = new_nelems; + return 1; +} + +static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map) +{ + struct lo_map_elem *elem; + + if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) { + return NULL; + } + + elem = &map->elems[map->freelist]; + map->freelist = elem->freelist; + + elem->in_use = true; + + return elem; +} + +static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key) +{ + ssize_t *prev; + + if (!lo_map_grow(map, key + 1)) { + return NULL; + } + + for (prev = &map->freelist; *prev != -1; + prev = &map->elems[*prev].freelist) { + if (*prev == key) { + struct lo_map_elem *elem = &map->elems[key]; + + *prev = elem->freelist; + elem->in_use = true; + return elem; + } + } + return NULL; +} + +static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key) +{ + if (key >= map->nelems) { + return NULL; + } + if (!map->elems[key].in_use) { + return NULL; + } + return &map->elems[key]; +} + +static void lo_map_remove(struct lo_map *map, size_t key) +{ + struct lo_map_elem *elem; + + if (key >= map->nelems) { + return; + } + + elem = &map->elems[key]; + if (!elem->in_use) { + return; + } + + elem->in_use = false; + + elem->freelist = map->freelist; + 
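+    /* The freed slot becomes the new head of the free list */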
map->freelist = key; +} + +/* Assumes lo->mutex is held */ +static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd) +{ + struct lo_map_elem *elem; + + elem = lo_map_alloc_elem(&lo_data(req)->fd_map); + if (!elem) { + return -1; + } + + elem->fd = fd; + return elem - lo_data(req)->fd_map.elems; +} + +/* Assumes lo->mutex is held */ +static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) +{ + struct lo_map_elem *elem; + + elem = lo_map_alloc_elem(&lo_data(req)->dirp_map); + if (!elem) { + return -1; + } + + elem->dirp = dirp; + return elem - lo_data(req)->dirp_map.elems; +} + +/* Assumes lo->mutex is held */ +static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) +{ + struct lo_map_elem *elem; + + elem = lo_map_alloc_elem(&lo_data(req)->ino_map); + if (!elem) { + return -1; + } + + elem->inode = inode; + return elem - lo_data(req)->ino_map.elems; +} + +static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep) +{ + struct lo_inode *inode = *inodep; + + if (!inode) { + return; + } + + *inodep = NULL; + + if (g_atomic_int_dec_and_test(&inode->refcount)) { + close(inode->fd); + free(inode); + } +} + +/* Caller must release refcount using lo_inode_put() */ +static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->ino_map, ino); + if (elem) { + g_atomic_int_inc(&elem->inode->refcount); + } + pthread_mutex_unlock(&lo->mutex); + + if (!elem) { + return NULL; + } + + return elem->inode; +} + +/* + * TODO Remove this helper and force callers to hold an inode refcount until + * they are done with the fd. This will be done in a later patch to make + * review easier. + */ +static int lo_fd(fuse_req_t req, fuse_ino_t ino) +{ + struct lo_inode *inode = lo_inode(req, ino); + int fd; + + if (!inode) { + return -1; + } + + fd = inode->fd; + lo_inode_put(lo_data(req), &inode); + return fd; +} + +static void lo_init(void *userdata, struct fuse_conn_info *conn) +{ + struct lo_data *lo = (struct lo_data *)userdata; + + if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) { + conn->want |= FUSE_CAP_EXPORT_SUPPORT; + } + + if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); + conn->want |= FUSE_CAP_WRITEBACK_CACHE; + } + if (conn->capable & FUSE_CAP_FLOCK_LOCKS) { + if (lo->flock) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n"); + conn->want &= ~FUSE_CAP_FLOCK_LOCKS; + } + } + + if (conn->capable & FUSE_CAP_POSIX_LOCKS) { + if (lo->posix_lock) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n"); + conn->want |= FUSE_CAP_POSIX_LOCKS; + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n"); + conn->want &= ~FUSE_CAP_POSIX_LOCKS; + } + } + + if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || + lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); + conn->want &= ~FUSE_CAP_READDIRPLUS; + } +} + +static void lo_getattr(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + int res; + struct stat buf; + struct lo_data *lo = lo_data(req); + + (void)fi; + + res = + fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + + fuse_reply_attr(req, &buf, lo->timeout); +} + +/* + * 
Increments parent->nlookup and caller must release refcount using + * lo_inode_put(&parent). + */ +static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode, + char path[PATH_MAX], struct lo_inode **parent) +{ + char procname[64]; + char *last; + struct stat stat; + struct lo_inode *p; + int retries = 2; + int res; + +retry: + sprintf(procname, "%i", inode->fd); + + res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX); + if (res < 0) { + fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__); + goto fail_noretry; + } + + if (res >= PATH_MAX) { + fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__); + goto fail_noretry; + } + path[res] = '\0'; + + last = strrchr(path, '/'); + if (last == NULL) { + /* Shouldn't happen */ + fuse_log( + FUSE_LOG_WARNING, + "%s: INTERNAL ERROR: bad path read from proc\n", __func__); + goto fail_noretry; + } + if (last == path) { + p = &lo->root; + pthread_mutex_lock(&lo->mutex); + p->nlookup++; + g_atomic_int_inc(&p->refcount); + pthread_mutex_unlock(&lo->mutex); + } else { + *last = '\0'; + res = fstatat(AT_FDCWD, last == path ? "/" : path, &stat, 0); + if (res == -1) { + if (!retries) { + fuse_log(FUSE_LOG_WARNING, + "%s: failed to stat parent: %m\n", __func__); + } + goto fail; + } + p = lo_find(lo, &stat); + if (p == NULL) { + if (!retries) { + fuse_log(FUSE_LOG_WARNING, + "%s: failed to find parent\n", __func__); + } + goto fail; + } + } + last++; + res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW); + if (res == -1) { + if (!retries) { + fuse_log(FUSE_LOG_WARNING, + "%s: failed to stat last\n", __func__); + } + goto fail_unref; + } + if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) { + if (!retries) { + fuse_log(FUSE_LOG_WARNING, + "%s: failed to match last\n", __func__); + } + goto fail_unref; + } + *parent = p; + memmove(path, last, strlen(last) + 1); + + return 0; + +fail_unref: + unref_inode_lolocked(lo, p, 1); + lo_inode_put(lo, &p); +fail: + if (retries) { + retries--; + goto retry; + } +fail_noretry: + errno = EIO; + return -1; +} + +static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode, + const struct timespec *tv) +{ + int res; + struct lo_inode *parent; + char path[PATH_MAX]; + + if (inode->is_symlink) { + res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH); + if (res == -1 && errno == EINVAL) { + /* Sorry, no race free way to set times on symlink. 
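+             * Fall back to resolving the parent directory and name via
+             * lo_parent_and_name() unless 'norace' was requested.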
*/ + if (lo->norace) { + errno = EPERM; + } else { + goto fallback; + } + } + return res; + } + sprintf(path, "%i", inode->fd); + + return utimensat(lo->proc_self_fd, path, tv, 0); + +fallback: + res = lo_parent_and_name(lo, inode, path, &parent); + if (res != -1) { + res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW); + unref_inode_lolocked(lo, parent, 1); + lo_inode_put(lo, &parent); + } + + return res; +} + +static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->fd_map, fi->fh); + pthread_mutex_unlock(&lo->mutex); + + if (!elem) { + return -1; + } + + return elem->fd; +} + +static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int valid, struct fuse_file_info *fi) +{ + int saverr; + char procname[64]; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + int ifd; + int res; + int fd; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + ifd = inode->fd; + + /* If fi->fh is invalid we'll report EBADF later */ + if (fi) { + fd = lo_fi_fd(req, fi); + } + + if (valid & FUSE_SET_ATTR_MODE) { + if (fi) { + res = fchmod(fd, attr->st_mode); + } else { + sprintf(procname, "%i", ifd); + res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); + } + if (res == -1) { + goto out_err; + } + } + if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { + uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1; + gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1; + + res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + goto out_err; + } + } + if (valid & FUSE_SET_ATTR_SIZE) { + int truncfd; + + if (fi) { + truncfd = fd; + } else { + sprintf(procname, "%i", ifd); + truncfd = openat(lo->proc_self_fd, procname, O_RDWR); + if (truncfd < 0) { + goto out_err; + } + } + + res = ftruncate(truncfd, attr->st_size); + if (!fi) { + saverr = errno; + close(truncfd); + errno = saverr; + } + if (res == -1) { + goto out_err; + } + } + if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { + struct timespec tv[2]; + + tv[0].tv_sec = 0; + tv[1].tv_sec = 0; + tv[0].tv_nsec = UTIME_OMIT; + tv[1].tv_nsec = UTIME_OMIT; + + if (valid & FUSE_SET_ATTR_ATIME_NOW) { + tv[0].tv_nsec = UTIME_NOW; + } else if (valid & FUSE_SET_ATTR_ATIME) { + tv[0] = attr->st_atim; + } + + if (valid & FUSE_SET_ATTR_MTIME_NOW) { + tv[1].tv_nsec = UTIME_NOW; + } else if (valid & FUSE_SET_ATTR_MTIME) { + tv[1] = attr->st_mtim; + } + + if (fi) { + res = futimens(fd, tv); + } else { + res = utimensat_empty(lo, inode, tv); + } + if (res == -1) { + goto out_err; + } + } + lo_inode_put(lo, &inode); + + return lo_getattr(req, ino, fi); + +out_err: + saverr = errno; + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); +} + +static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) +{ + struct lo_inode *p; + struct lo_key key = { + .ino = st->st_ino, + .dev = st->st_dev, + }; + + pthread_mutex_lock(&lo->mutex); + p = g_hash_table_lookup(lo->inodes, &key); + if (p) { + assert(p->nlookup > 0); + p->nlookup++; + g_atomic_int_inc(&p->refcount); + } + pthread_mutex_unlock(&lo->mutex); + + return p; +} + +/* value_destroy_func for posix_locks GHashTable */ +static void posix_locks_value_destroy(gpointer data) +{ + struct lo_inode_plock *plock = data; + + /* + * We had used open() for locks and had only one fd. 
So + * closing this fd should release all OFD locks. + */ + close(plock->fd); + free(plock); +} + +/* + * Increments nlookup and caller must release refcount using + * lo_inode_put(&parent). + */ +static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct fuse_entry_param *e) +{ + int newfd; + int res; + int saverr; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode = NULL; + struct lo_inode *dir = lo_inode(req, parent); + + /* + * name_to_handle_at() and open_by_handle_at() can reach here with fuse + * mount point in guest, but we don't have its inode info in the + * ino_map. + */ + if (!dir) { + return ENOENT; + } + + memset(e, 0, sizeof(*e)); + e->attr_timeout = lo->timeout; + e->entry_timeout = lo->timeout; + + /* Do not allow escaping root directory */ + if (dir == &lo->root && strcmp(name, "..") == 0) { + name = "."; + } + + newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); + if (newfd == -1) { + goto out_err; + } + + res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + goto out_err; + } + + inode = lo_find(lo, &e->attr); + if (inode) { + close(newfd); + newfd = -1; + } else { + inode = calloc(1, sizeof(struct lo_inode)); + if (!inode) { + goto out_err; + } + + inode->is_symlink = S_ISLNK(e->attr.st_mode); + + /* + * One for the caller and one for nlookup (released in + * unref_inode_lolocked()) + */ + g_atomic_int_set(&inode->refcount, 2); + + inode->nlookup = 1; + inode->fd = newfd; + newfd = -1; + inode->key.ino = e->attr.st_ino; + inode->key.dev = e->attr.st_dev; + pthread_mutex_init(&inode->plock_mutex, NULL); + inode->posix_locks = g_hash_table_new_full( + g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); + + pthread_mutex_lock(&lo->mutex); + inode->fuse_ino = lo_add_inode_mapping(req, inode); + g_hash_table_insert(lo->inodes, &inode->key, inode); + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; + lo_inode_put(lo, &inode); + lo_inode_put(lo, &dir); + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e->ino); + + return 0; + +out_err: + saverr = errno; + if (newfd != -1) { + close(newfd); + } + lo_inode_put(lo, &inode); + lo_inode_put(lo, &dir); + return saverr; +} + +static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + struct fuse_entry_param e; + int err; + + fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent, + name); + + /* + * Don't use is_safe_path_component(), allow "." and ".." for NFS export + * support. + */ + if (strchr(name, '/')) { + fuse_reply_err(req, EINVAL); + return; + } + + err = lo_do_lookup(req, parent, name, &e); + if (err) { + fuse_reply_err(req, err); + } else { + fuse_reply_entry(req, &e); + } +} + +/* + * On some archs, setres*id is limited to 2^16 but they + * provide setres*id32 variants that allow 2^32. + * Others just let setres*id do 2^32 anyway. + */ +#ifdef SYS_setresgid32 +#define OURSYS_setresgid SYS_setresgid32 +#else +#define OURSYS_setresgid SYS_setresgid +#endif + +#ifdef SYS_setresuid32 +#define OURSYS_setresuid SYS_setresuid32 +#else +#define OURSYS_setresuid SYS_setresuid +#endif + +/* + * Change to uid/gid of caller so that file is created with + * ownership of caller. + * TODO: What about selinux context? 
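+ * Note: the raw setres[ug]id syscalls only affect the calling thread,
+ * which is why they are used here instead of the glibc wrappers that
+ * change credentials for every thread in the process.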
+ */ +static int lo_change_cred(fuse_req_t req, struct lo_cred *old) +{ + int res; + + old->euid = geteuid(); + old->egid = getegid(); + + res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1); + if (res == -1) { + return errno; + } + + res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1); + if (res == -1) { + int errno_save = errno; + + syscall(OURSYS_setresgid, -1, old->egid, -1); + return errno_save; + } + + return 0; +} + +/* Regain Privileges */ +static void lo_restore_cred(struct lo_cred *old) +{ + int res; + + res = syscall(OURSYS_setresuid, -1, old->euid, -1); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid); + exit(1); + } + + res = syscall(OURSYS_setresgid, -1, old->egid, -1); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); + exit(1); + } +} + +static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + const char *name, mode_t mode, dev_t rdev, + const char *link) +{ + int res; + int saverr; + struct lo_data *lo = lo_data(req); + struct lo_inode *dir; + struct fuse_entry_param e; + struct lo_cred old = {}; + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + dir = lo_inode(req, parent); + if (!dir) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = ENOMEM; + + saverr = lo_change_cred(req, &old); + if (saverr) { + goto out; + } + + res = mknod_wrapper(dir->fd, name, link, mode, rdev); + + saverr = errno; + + lo_restore_cred(&old); + + if (res == -1) { + goto out; + } + + saverr = lo_do_lookup(req, parent, name, &e); + if (saverr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + lo_inode_put(lo, &dir); + return; + +out: + lo_inode_put(lo, &dir); + fuse_reply_err(req, saverr); +} + +static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, dev_t rdev) +{ + lo_mknod_symlink(req, parent, name, mode, rdev, NULL); +} + +static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode) +{ + lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); +} + +static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, + const char *name) +{ + lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); +} + +static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode, + int dfd, const char *name) +{ + int res; + struct lo_inode *parent; + char path[PATH_MAX]; + + if (inode->is_symlink) { + res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH); + if (res == -1 && (errno == ENOENT || errno == EINVAL)) { + /* Sorry, no race free way to hard-link a symlink. 
*/ + if (lo->norace) { + errno = EPERM; + } else { + goto fallback; + } + } + return res; + } + + sprintf(path, "%i", inode->fd); + + return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW); + +fallback: + res = lo_parent_and_name(lo, inode, path, &parent); + if (res != -1) { + res = linkat(parent->fd, path, dfd, name, 0); + unref_inode_lolocked(lo, parent, 1); + lo_inode_put(lo, &parent); + } + + return res; +} + +static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + const char *name) +{ + int res; + struct lo_data *lo = lo_data(req); + struct lo_inode *parent_inode; + struct lo_inode *inode; + struct fuse_entry_param e; + int saverr; + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + parent_inode = lo_inode(req, parent); + inode = lo_inode(req, ino); + if (!parent_inode || !inode) { + errno = EBADF; + goto out_err; + } + + memset(&e, 0, sizeof(struct fuse_entry_param)); + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; + + res = linkat_empty_nofollow(lo, inode, parent_inode->fd, name); + if (res == -1) { + goto out_err; + } + + res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + goto out_err; + } + + pthread_mutex_lock(&lo->mutex); + inode->nlookup++; + pthread_mutex_unlock(&lo->mutex); + e.ino = inode->fuse_ino; + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + lo_inode_put(lo, &parent_inode); + lo_inode_put(lo, &inode); + return; + +out_err: + saverr = errno; + lo_inode_put(lo, &parent_inode); + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); +} + +/* Increments nlookup and caller must release refcount using lo_inode_put() */ +static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, + const char *name) +{ + int res; + struct stat attr; + + res = fstatat(lo_fd(req, parent), name, &attr, + AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + return NULL; + } + + return lo_find(lo_data(req), &attr); +} + +static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + int res; + struct lo_inode *inode; + struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + inode = lookup_name(req, parent, name); + if (!inode) { + fuse_reply_err(req, EIO); + return; + } + + res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + + fuse_reply_err(req, res == -1 ? 
errno : 0); + unref_inode_lolocked(lo, inode, 1); + lo_inode_put(lo, &inode); +} + +static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_ino_t newparent, const char *newname, + unsigned int flags) +{ + int res; + struct lo_inode *parent_inode; + struct lo_inode *newparent_inode; + struct lo_inode *oldinode = NULL; + struct lo_inode *newinode = NULL; + struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { + fuse_reply_err(req, EINVAL); + return; + } + + parent_inode = lo_inode(req, parent); + newparent_inode = lo_inode(req, newparent); + if (!parent_inode || !newparent_inode) { + fuse_reply_err(req, EBADF); + goto out; + } + + oldinode = lookup_name(req, parent, name); + newinode = lookup_name(req, newparent, newname); + + if (!oldinode) { + fuse_reply_err(req, EIO); + goto out; + } + + if (flags) { +#ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); +#else + res = syscall(SYS_renameat2, parent_inode->fd, name, + newparent_inode->fd, newname, flags); + if (res == -1 && errno == ENOSYS) { + fuse_reply_err(req, EINVAL); + } else { + fuse_reply_err(req, res == -1 ? errno : 0); + } +#endif + goto out; + } + + res = renameat(parent_inode->fd, name, newparent_inode->fd, newname); + + fuse_reply_err(req, res == -1 ? errno : 0); +out: + unref_inode_lolocked(lo, oldinode, 1); + unref_inode_lolocked(lo, newinode, 1); + lo_inode_put(lo, &oldinode); + lo_inode_put(lo, &newinode); + lo_inode_put(lo, &parent_inode); + lo_inode_put(lo, &newparent_inode); +} + +static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + int res; + struct lo_inode *inode; + struct lo_data *lo = lo_data(req); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + inode = lookup_name(req, parent, name); + if (!inode) { + fuse_reply_err(req, EIO); + return; + } + + res = unlinkat(lo_fd(req, parent), name, 0); + + fuse_reply_err(req, res == -1 ? 
errno : 0); + unref_inode_lolocked(lo, inode, 1); + lo_inode_put(lo, &inode); +} + +/* To be called with lo->mutex held */ +static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) +{ + if (!inode) { + return; + } + + assert(inode->nlookup >= n); + inode->nlookup -= n; + if (!inode->nlookup) { + lo_map_remove(&lo->ino_map, inode->fuse_ino); + g_hash_table_remove(lo->inodes, &inode->key); + if (g_hash_table_size(inode->posix_locks)) { + fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n"); + } + g_hash_table_destroy(inode->posix_locks); + pthread_mutex_destroy(&inode->plock_mutex); + + /* Drop our refcount from lo_do_lookup() */ + lo_inode_put(lo, &inode); + } +} + +static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + uint64_t n) +{ + if (!inode) { + return; + } + + pthread_mutex_lock(&lo->mutex); + unref_inode(lo, inode, n); + pthread_mutex_unlock(&lo->mutex); +} + +static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + + inode = lo_inode(req, ino); + if (!inode) { + return; + } + + fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", + (unsigned long long)ino, (unsigned long long)inode->nlookup, + (unsigned long long)nlookup); + + unref_inode_lolocked(lo, inode, nlookup); + lo_inode_put(lo, &inode); +} + +static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +{ + lo_forget_one(req, ino, nlookup); + fuse_reply_none(req); +} + +static void lo_forget_multi(fuse_req_t req, size_t count, + struct fuse_forget_data *forgets) +{ + int i; + + for (i = 0; i < count; i++) { + lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); + } + fuse_reply_none(req); +} + +static void lo_readlink(fuse_req_t req, fuse_ino_t ino) +{ + char buf[PATH_MAX + 1]; + int res; + + res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + + if (res == sizeof(buf)) { + return (void)fuse_reply_err(req, ENAMETOOLONG); + } + + buf[res] = '\0'; + + fuse_reply_readlink(req, buf); +} + +struct lo_dirp { + gint refcount; + DIR *dp; + struct dirent *entry; + off_t offset; +}; + +static void lo_dirp_put(struct lo_dirp **dp) +{ + struct lo_dirp *d = *dp; + + if (!d) { + return; + } + *dp = NULL; + + if (g_atomic_int_dec_and_test(&d->refcount)) { + closedir(d->dp); + free(d); + } +} + +/* Call lo_dirp_put() on the return value when no longer needed */ +static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->dirp_map, fi->fh); + if (elem) { + g_atomic_int_inc(&elem->dirp->refcount); + } + pthread_mutex_unlock(&lo->mutex); + if (!elem) { + return NULL; + } + + return elem->dirp; +} + +static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + int error = ENOMEM; + struct lo_data *lo = lo_data(req); + struct lo_dirp *d; + int fd; + ssize_t fh; + + d = calloc(1, sizeof(struct lo_dirp)); + if (d == NULL) { + goto out_err; + } + + fd = openat(lo_fd(req, ino), ".", O_RDONLY); + if (fd == -1) { + goto out_errno; + } + + d->dp = fdopendir(fd); + if (d->dp == NULL) { + goto out_errno; + } + + d->offset = 0; + d->entry = NULL; + + g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */ + pthread_mutex_lock(&lo->mutex); + fh = lo_add_dirp_mapping(req, d); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + goto out_err; + } + + 
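+    /*
+     * Stash the slot index in fi->fh; lo_readdir(), lo_fsyncdir() and
+     * lo_releasedir() use it to look this lo_dirp up again in
+     * lo->dirp_map.
+     */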
fi->fh = fh; + if (lo->cache == CACHE_ALWAYS) { + fi->cache_readdir = 1; + } + fuse_reply_open(req, fi); + return; + +out_errno: + error = errno; +out_err: + if (d) { + if (d->dp) { + closedir(d->dp); + } + if (fd != -1) { + close(fd); + } + free(d); + } + fuse_reply_err(req, error); +} + +static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) +{ + struct lo_data *lo = lo_data(req); + struct lo_dirp *d = NULL; + struct lo_inode *dinode; + char *buf = NULL; + char *p; + size_t rem = size; + int err = EBADF; + + dinode = lo_inode(req, ino); + if (!dinode) { + goto error; + } + + d = lo_dirp(req, fi); + if (!d) { + goto error; + } + + err = ENOMEM; + buf = calloc(1, size); + if (!buf) { + goto error; + } + p = buf; + + if (offset != d->offset) { + seekdir(d->dp, offset); + d->entry = NULL; + d->offset = offset; + } + while (1) { + size_t entsize; + off_t nextoff; + const char *name; + + if (!d->entry) { + errno = 0; + d->entry = readdir(d->dp); + if (!d->entry) { + if (errno) { /* Error */ + err = errno; + goto error; + } else { /* End of stream */ + break; + } + } + } + nextoff = d->entry->d_off; + name = d->entry->d_name; + + fuse_ino_t entry_ino = 0; + struct fuse_entry_param e = (struct fuse_entry_param){ + .attr.st_ino = d->entry->d_ino, + .attr.st_mode = d->entry->d_type << 12, + }; + + /* Hide root's parent directory */ + if (dinode == &lo->root && strcmp(name, "..") == 0) { + e.attr.st_ino = lo->root.key.ino; + e.attr.st_mode = DT_DIR << 12; + } + + if (plus) { + if (!is_dot_or_dotdot(name)) { + err = lo_do_lookup(req, ino, name, &e); + if (err) { + goto error; + } + entry_ino = e.ino; + } + + entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); + } else { + entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff); + } + if (entsize > rem) { + if (entry_ino != 0) { + lo_forget_one(req, entry_ino, 1); + } + break; + } + + p += entsize; + rem -= entsize; + + d->entry = NULL; + d->offset = nextoff; + } + + err = 0; +error: + lo_dirp_put(&d); + lo_inode_put(lo, &dinode); + + /* + * If there's an error, we can only signal it if we haven't stored + * any entries yet - otherwise we'd end up with wrong lookup + * counts for the entries that are already in the buffer. So we + * return what we've collected until that point. 
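+     *
+     * (rem == size holds exactly when no entry has been added to the
+     * buffer yet, which is what the check below relies on.)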
+ */ + if (err && rem == size) { + fuse_reply_err(req, err); + } else { + fuse_reply_buf(req, buf, size - rem); + } + free(buf); +} + +static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + lo_do_readdir(req, ino, size, offset, fi, 0); +} + +static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + lo_do_readdir(req, ino, size, offset, fi, 1); +} + +static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + struct lo_dirp *d; + + (void)ino; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->dirp_map, fi->fh); + if (!elem) { + pthread_mutex_unlock(&lo->mutex); + fuse_reply_err(req, EBADF); + return; + } + + d = elem->dirp; + lo_map_remove(&lo->dirp_map, fi->fh); + pthread_mutex_unlock(&lo->mutex); + + lo_dirp_put(&d); /* paired with lo_opendir() */ + + fuse_reply_err(req, 0); +} + +static void update_open_flags(int writeback, struct fuse_file_info *fi) +{ + /* + * With writeback cache, kernel may send read requests even + * when userspace opened write-only + */ + if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { + fi->flags &= ~O_ACCMODE; + fi->flags |= O_RDWR; + } + + /* + * With writeback cache, O_APPEND is handled by the kernel. + * This breaks atomicity (since the file may change in the + * underlying filesystem, so that the kernel's idea of the + * end of the file isn't accurate anymore). In this example, + * we just accept that. A more rigorous filesystem may want + * to return an error here + */ + if (writeback && (fi->flags & O_APPEND)) { + fi->flags &= ~O_APPEND; + } + + /* + * O_DIRECT in guest should not necessarily mean bypassing page + * cache on host as well. If somebody needs that behavior, it + * probably should be a configuration knob in daemon. + */ + fi->flags &= ~O_DIRECT; +} + +static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) +{ + int fd; + struct lo_data *lo = lo_data(req); + struct lo_inode *parent_inode; + struct fuse_entry_param e; + int err; + struct lo_cred old = {}; + + fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent, + name); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + parent_inode = lo_inode(req, parent); + if (!parent_inode) { + fuse_reply_err(req, EBADF); + return; + } + + err = lo_change_cred(req, &old); + if (err) { + goto out; + } + + update_open_flags(lo->writeback, fi); + + fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW, + mode); + err = fd == -1 ? 
errno : 0; + lo_restore_cred(&old); + + if (!err) { + ssize_t fh; + + pthread_mutex_lock(&lo->mutex); + fh = lo_add_fd_mapping(req, fd); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); + err = ENOMEM; + goto out; + } + + fi->fh = fh; + err = lo_do_lookup(req, parent, name, &e); + } + if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } + +out: + lo_inode_put(lo, &parent_inode); + + if (err) { + fuse_reply_err(req, err); + } else { + fuse_reply_create(req, &e, fi); + } +} + +/* Should be called with inode->plock_mutex held */ +static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, + struct lo_inode *inode, + uint64_t lock_owner, + pid_t pid, int *err) +{ + struct lo_inode_plock *plock; + char procname[64]; + int fd; + + plock = + g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner)); + + if (plock) { + return plock; + } + + plock = malloc(sizeof(struct lo_inode_plock)); + if (!plock) { + *err = ENOMEM; + return NULL; + } + + /* Open another instance of file which can be used for ofd locks. */ + sprintf(procname, "%i", inode->fd); + + /* TODO: What if file is not writable? */ + fd = openat(lo->proc_self_fd, procname, O_RDWR); + if (fd == -1) { + *err = errno; + free(plock); + return NULL; + } + + plock->lock_owner = lock_owner; + plock->fd = fd; + g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner), + plock); + return plock; +} + +static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + struct lo_inode_plock *plock; + int ret, saverr = 0; + + fuse_log(FUSE_LOG_DEBUG, + "lo_getlk(ino=%" PRIu64 ", flags=%d)" + " owner=0x%lx, l_type=%d l_start=0x%lx" + " l_len=0x%lx\n", + ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start, + lock->l_len); + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + pthread_mutex_lock(&inode->plock_mutex); + plock = + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + if (!plock) { + saverr = ret; + goto out; + } + + ret = fcntl(plock->fd, F_OFD_GETLK, lock); + if (ret == -1) { + saverr = errno; + } + +out: + pthread_mutex_unlock(&inode->plock_mutex); + lo_inode_put(lo, &inode); + + if (saverr) { + fuse_reply_err(req, saverr); + } else { + fuse_reply_lock(req, lock); + } +} + +static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock, int sleep) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + struct lo_inode_plock *plock; + int ret, saverr = 0; + + fuse_log(FUSE_LOG_DEBUG, + "lo_setlk(ino=%" PRIu64 ", flags=%d)" + " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d" + " l_start=0x%lx l_len=0x%lx\n", + ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep, + lock->l_whence, lock->l_start, lock->l_len); + + if (sleep) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + pthread_mutex_lock(&inode->plock_mutex); + plock = + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + + if (!plock) { + saverr = ret; + goto out; + } + + /* TODO: Is it alright to modify flock? 
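(F_OFD_SETLK requires l_pid to be 0, per fcntl(2), hence it is cleared below.)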
*/ + lock->l_pid = 0; + ret = fcntl(plock->fd, F_OFD_SETLK, lock); + if (ret == -1) { + saverr = errno; + } + +out: + pthread_mutex_unlock(&inode->plock_mutex); + lo_inode_put(lo, &inode); + + fuse_reply_err(req, saverr); +} + +static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + int res; + struct lo_dirp *d; + int fd; + + (void)ino; + + d = lo_dirp(req, fi); + if (!d) { + fuse_reply_err(req, EBADF); + return; + } + + fd = dirfd(d->dp); + if (datasync) { + res = fdatasync(fd); + } else { + res = fsync(fd); + } + + lo_dirp_put(&d); + + fuse_reply_err(req, res == -1 ? errno : 0); +} + +static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + int fd; + ssize_t fh; + char buf[64]; + struct lo_data *lo = lo_data(req); + + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, + fi->flags); + + update_open_flags(lo->writeback, fi); + + sprintf(buf, "%i", lo_fd(req, ino)); + fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW); + if (fd == -1) { + return (void)fuse_reply_err(req, errno); + } + + pthread_mutex_lock(&lo->mutex); + fh = lo_add_fd_mapping(req, fd); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); + fuse_reply_err(req, ENOMEM); + return; + } + + fi->fh = fh; + if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } + fuse_reply_open(req, fi); +} + +static void lo_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + int fd = -1; + + (void)ino; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->fd_map, fi->fh); + if (elem) { + fd = elem->fd; + elem = NULL; + lo_map_remove(&lo->fd_map, fi->fh); + } + pthread_mutex_unlock(&lo->mutex); + + close(fd); + fuse_reply_err(req, 0); +} + +static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + int res; + (void)ino; + struct lo_inode *inode; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + /* An fd is going away. Cleanup associated posix locks */ + pthread_mutex_lock(&inode->plock_mutex); + g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner)); + pthread_mutex_unlock(&inode->plock_mutex); + + res = close(dup(lo_fi_fd(req, fi))); + lo_inode_put(lo_data(req), &inode); + fuse_reply_err(req, res == -1 ? errno : 0); +} + +static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + int res; + int fd; + char *buf; + + fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, + (void *)fi); + + if (!fi) { + struct lo_data *lo = lo_data(req); + + res = asprintf(&buf, "%i", lo_fd(req, ino)); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + + fd = openat(lo->proc_self_fd, buf, O_RDWR); + free(buf); + if (fd == -1) { + return (void)fuse_reply_err(req, errno); + } + } else { + fd = lo_fi_fd(req, fi); + } + + if (datasync) { + res = fdatasync(fd); + } else { + res = fsync(fd); + } + if (!fi) { + close(fd); + } + fuse_reply_err(req, res == -1 ? 
errno : 0); +} + +static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); + + fuse_log(FUSE_LOG_DEBUG, + "lo_read(ino=%" PRIu64 ", size=%zd, " + "off=%lu)\n", + ino, size, (unsigned long)offset); + + buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + buf.buf[0].fd = lo_fi_fd(req, fi); + buf.buf[0].pos = offset; + + fuse_reply_data(req, &buf); +} + +static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + struct fuse_bufvec *in_buf, off_t off, + struct fuse_file_info *fi) +{ + (void)ino; + ssize_t res; + struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); + bool cap_fsetid_dropped = false; + + out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + out_buf.buf[0].fd = lo_fi_fd(req, fi); + out_buf.buf[0].pos = off; + + fuse_log(FUSE_LOG_DEBUG, + "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, + out_buf.buf[0].size, (unsigned long)off); + + /* + * If kill_priv is set, drop CAP_FSETID which should lead to kernel + * clearing setuid/setgid on file. + */ + if (fi->kill_priv) { + res = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (res != 0) { + fuse_reply_err(req, res); + return; + } + } + + res = fuse_buf_copy(&out_buf, in_buf); + if (res < 0) { + fuse_reply_err(req, -res); + } else { + fuse_reply_write(req, (size_t)res); + } + + if (cap_fsetid_dropped) { + res = gain_effective_cap("FSETID"); + if (res) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } +} + +static void lo_statfs(fuse_req_t req, fuse_ino_t ino) +{ + int res; + struct statvfs stbuf; + + res = fstatvfs(lo_fd(req, ino), &stbuf); + if (res == -1) { + fuse_reply_err(req, errno); + } else { + fuse_reply_statfs(req, &stbuf); + } +} + +static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + off_t length, struct fuse_file_info *fi) +{ + int err = EOPNOTSUPP; + (void)ino; + +#ifdef CONFIG_FALLOCATE + err = fallocate(lo_fi_fd(req, fi), mode, offset, length); + if (err < 0) { + err = errno; + } + +#elif defined(CONFIG_POSIX_FALLOCATE) + if (mode) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + err = posix_fallocate(lo_fi_fd(req, fi), offset, length); +#endif + + fuse_reply_err(req, err); +} + +static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + int op) +{ + int res; + (void)ino; + + res = flock(lo_fi_fd(req, fi), op); + + fuse_reply_err(req, res == -1 ? errno : 0); +} + +static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + size_t size) +{ + struct lo_data *lo = lo_data(req); + char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", + ino, name, size); + + if (inode->is_symlink) { + /* Sorry, no race free way to getxattr on symlink. 
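(a symlink cannot be opened to obtain an fd for fgetxattr(), and using a path with lgetxattr() would be racy.)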
*/ + saverr = EPERM; + goto out; + } + + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + + if (size) { + value = malloc(size); + if (!value) { + goto out_err; + } + + ret = fgetxattr(fd, name, value, size); + if (ret == -1) { + goto out_err; + } + saverr = 0; + if (ret == 0) { + goto out; + } + + fuse_reply_buf(req, value, ret); + } else { + ret = fgetxattr(fd, name, NULL, 0); + if (ret == -1) { + goto out_err; + } + + fuse_reply_xattr(req, ret); + } +out_free: + free(value); + + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + return; + +out_err: + saverr = errno; +out: + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; +} + +static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) +{ + struct lo_data *lo = lo_data(req); + char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, + size); + + if (inode->is_symlink) { + /* Sorry, no race free way to listxattr on symlink. */ + saverr = EPERM; + goto out; + } + + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + + if (size) { + value = malloc(size); + if (!value) { + goto out_err; + } + + ret = flistxattr(fd, value, size); + if (ret == -1) { + goto out_err; + } + saverr = 0; + if (ret == 0) { + goto out; + } + + fuse_reply_buf(req, value, ret); + } else { + ret = flistxattr(fd, NULL, 0); + if (ret == -1) { + goto out_err; + } + + fuse_reply_xattr(req, ret); + } +out_free: + free(value); + + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + return; + +out_err: + saverr = errno; +out: + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); + goto out_free; +} + +static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, int flags) +{ + char procname[64]; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 + ", name=%s value=%s size=%zd)\n", ino, name, value, size); + + if (inode->is_symlink) { + /* Sorry, no race free way to setxattr on symlink. */ + saverr = EPERM; + goto out; + } + + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDWR); + if (fd < 0) { + saverr = errno; + goto out; + } + + ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? 
errno : 0; + +out: + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); +} + +static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) +{ + char procname[64]; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, + name); + + if (inode->is_symlink) { + /* Sorry, no race free way to setxattr on symlink. */ + saverr = EPERM; + goto out; + } + + sprintf(procname, "%i", inode->fd); + fd = openat(lo->proc_self_fd, procname, O_RDWR); + if (fd < 0) { + saverr = errno; + goto out; + } + + ret = fremovexattr(fd, name); + saverr = ret == -1 ? errno : 0; + +out: + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); +} + +#ifdef HAVE_COPY_FILE_RANGE +static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + struct fuse_file_info *fi_in, fuse_ino_t ino_out, + off_t off_out, struct fuse_file_info *fi_out, + size_t len, int flags) +{ + int in_fd, out_fd; + ssize_t res; + + in_fd = lo_fi_fd(req, fi_in); + out_fd = lo_fi_fd(req, fi_out); + + fuse_log(FUSE_LOG_DEBUG, + "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, " + "off=%lu, ino=%" PRIu64 "/fd=%d, " + "off=%lu, size=%zd, flags=0x%x)\n", + ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags); + + res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); + if (res < 0) { + fuse_reply_err(req, errno); + } else { + fuse_reply_write(req, res); + } +} +#endif + +static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + struct fuse_file_info *fi) +{ + off_t res; + + (void)ino; + res = lseek(lo_fi_fd(req, fi), off, whence); + if (res != -1) { + fuse_reply_lseek(req, res); + } else { + fuse_reply_err(req, errno); + } +} + +static void lo_destroy(void *userdata) +{ + struct lo_data *lo = (struct lo_data *)userdata; + + pthread_mutex_lock(&lo->mutex); + while (true) { + GHashTableIter iter; + gpointer key, value; + + g_hash_table_iter_init(&iter, lo->inodes); + if (!g_hash_table_iter_next(&iter, &key, &value)) { + break; + } + + struct lo_inode *inode = value; + unref_inode(lo, inode, inode->nlookup); + } + pthread_mutex_unlock(&lo->mutex); +} + +static struct fuse_lowlevel_ops lo_oper = { + .init = lo_init, + .lookup = lo_lookup, + .mkdir = lo_mkdir, + .mknod = lo_mknod, + .symlink = lo_symlink, + .link = lo_link, + .unlink = lo_unlink, + .rmdir = lo_rmdir, + .rename = lo_rename, + .forget = lo_forget, + .forget_multi = lo_forget_multi, + .getattr = lo_getattr, + .setattr = lo_setattr, + .readlink = lo_readlink, + .opendir = lo_opendir, + .readdir = lo_readdir, + .readdirplus = lo_readdirplus, + .releasedir = lo_releasedir, + .fsyncdir = lo_fsyncdir, + .create = lo_create, + .getlk = lo_getlk, + .setlk = lo_setlk, + .open = lo_open, + .release = lo_release, + .flush = lo_flush, + .fsync = lo_fsync, + .read = lo_read, + .write_buf = lo_write_buf, + .statfs = lo_statfs, + .fallocate = lo_fallocate, + .flock = lo_flock, + .getxattr = lo_getxattr, + .listxattr = lo_listxattr, + .setxattr = lo_setxattr, + .removexattr = lo_removexattr, +#ifdef HAVE_COPY_FILE_RANGE + .copy_file_range = lo_copy_file_range, +#endif + .lseek = lo_lseek, + .destroy = lo_destroy, +}; + +/* Print vhost-user.json backend program capabilities */ 
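+/*
+ * The capability JSON emitted below looks like:
+ *   {
+ *     "type": "fs"
+ *   }
+ */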
+static void print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"fs\"\n"); + printf("}\n"); +} + +/* + * Move to a new mount, net, and pid namespaces to isolate this process. + */ +static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) +{ + pid_t child; + + /* + * Create a new pid namespace for *child* processes. We'll have to + * fork in order to enter the new pid namespace. A new mount namespace + * is also needed so that we can remount /proc for the new pid + * namespace. + * + * Our UNIX domain sockets have been created. Now we can move to + * an empty network namespace to prevent TCP/IP and other network + * activity in case this process is compromised. + */ + if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) { + fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n"); + exit(1); + } + + child = fork(); + if (child < 0) { + fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n"); + exit(1); + } + if (child > 0) { + pid_t waited; + int wstatus; + + /* The parent waits for the child */ + do { + waited = waitpid(child, &wstatus, 0); + } while (waited < 0 && errno == EINTR && !se->exited); + + /* We were terminated by a signal, see fuse_signals.c */ + if (se->exited) { + exit(0); + } + + if (WIFEXITED(wstatus)) { + exit(WEXITSTATUS(wstatus)); + } + + exit(1); + } + + /* Send us SIGTERM when the parent thread terminates, see prctl(2) */ + prctl(PR_SET_PDEATHSIG, SIGTERM); + + /* + * If the mounts have shared propagation then we want to opt out so our + * mount changes don't affect the parent mount namespace. + */ + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n"); + exit(1); + } + + /* The child must remount /proc to use the new pid namespace */ + if (mount("proc", "/proc", "proc", + MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n"); + exit(1); + } + + /* Now we can get our /proc/self/fd directory file descriptor */ + lo->proc_self_fd = open("/proc/self/fd", O_PATH); + if (lo->proc_self_fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); + exit(1); + } +} + +/* + * Capture the capability state, we'll need to restore this for individual + * threads later; see load_capng. + */ +static void setup_capng(void) +{ + /* Note this accesses /proc so has to happen before the sandbox */ + if (capng_get_caps_process()) { + fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n"); + exit(1); + } + pthread_mutex_init(&cap.mutex, NULL); + pthread_mutex_lock(&cap.mutex); + cap.saved = capng_save_state(); + if (!cap.saved) { + fuse_log(FUSE_LOG_ERR, "capng_save_state\n"); + exit(1); + } + pthread_mutex_unlock(&cap.mutex); +} + +static void cleanup_capng(void) +{ + free(cap.saved); + cap.saved = NULL; + pthread_mutex_destroy(&cap.mutex); +} + + +/* + * Make the source directory our root so symlinks cannot escape and no other + * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. 
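+ *
+ * The sequence below bind-mounts the source onto itself, pivot_root()s
+ * into it (old and new root end up stacked on the same directory), and
+ * then detaches the old root with umount2(MNT_DETACH), following lxc's
+ * lxc_pivot_root() approach.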
+ */ +static void setup_mounts(const char *source) +{ + int oldroot; + int newroot; + + if (mount(source, source, NULL, MS_BIND, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); + exit(1); + } + + /* This magic is based on lxc's lxc_pivot_root() */ + oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (oldroot < 0) { + fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); + exit(1); + } + + newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (newroot < 0) { + fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source); + exit(1); + } + + if (fchdir(newroot) < 0) { + fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); + exit(1); + } + + if (syscall(__NR_pivot_root, ".", ".") < 0) { + fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n"); + exit(1); + } + + if (fchdir(oldroot) < 0) { + fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n"); + exit(1); + } + + if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n"); + exit(1); + } + + if (umount2(".", MNT_DETACH) < 0) { + fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n"); + exit(1); + } + + if (fchdir(newroot) < 0) { + fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); + exit(1); + } + + close(newroot); + close(oldroot); +} + +/* + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. + */ +static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, + bool enable_syslog) +{ + setup_namespaces(lo, se); + setup_mounts(lo->source); + setup_seccomp(enable_syslog); +} + +/* Raise the maximum number of open file descriptors */ +static void setup_nofile_rlimit(void) +{ + const rlim_t max_fds = 1000000; + struct rlimit rlim; + + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { + fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); + } + + if (rlim.rlim_cur >= max_fds) { + return; /* nothing to do */ + } + + rlim.rlim_cur = max_fds; + rlim.rlim_max = max_fds; + + if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { + /* Ignore SELinux denials */ + if (errno == EPERM) { + return; + } + + fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); + } +} + +static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) +{ + g_autofree char *localfmt = NULL; + + if (current_log_level < level) { + return; + } + + if (current_log_level == FUSE_LOG_DEBUG) { + if (!use_syslog) { + localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s", + get_clock(), syscall(__NR_gettid), fmt); + } else { + localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), + fmt); + } + fmt = localfmt; + } + + if (use_syslog) { + int priority = LOG_ERR; + switch (level) { + case FUSE_LOG_EMERG: + priority = LOG_EMERG; + break; + case FUSE_LOG_ALERT: + priority = LOG_ALERT; + break; + case FUSE_LOG_CRIT: + priority = LOG_CRIT; + break; + case FUSE_LOG_ERR: + priority = LOG_ERR; + break; + case FUSE_LOG_WARNING: + priority = LOG_WARNING; + break; + case FUSE_LOG_NOTICE: + priority = LOG_NOTICE; + break; + case FUSE_LOG_INFO: + priority = LOG_INFO; + break; + case FUSE_LOG_DEBUG: + priority = LOG_DEBUG; + break; + } + vsyslog(priority, fmt, ap); + } else { + vfprintf(stderr, fmt, ap); + } +} + +static void setup_root(struct lo_data *lo, struct lo_inode *root) +{ + int fd, res; + struct stat stat; + + fd = open("/", O_PATH); + if (fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source); + exit(1); + } + + res = fstatat(fd, "", &stat, 
AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); + exit(1); + } + + root->is_symlink = false; + root->fd = fd; + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; + root->nlookup = 2; + g_atomic_int_set(&root->refcount, 2); +} + +static guint lo_key_hash(gconstpointer key) +{ + const struct lo_key *lkey = key; + + return (guint)lkey->ino + (guint)lkey->dev; +} + +static gboolean lo_key_equal(gconstpointer a, gconstpointer b) +{ + const struct lo_key *la = a; + const struct lo_key *lb = b; + + return la->ino == lb->ino && la->dev == lb->dev; +} + +static void fuse_lo_data_cleanup(struct lo_data *lo) +{ + if (lo->inodes) { + g_hash_table_destroy(lo->inodes); + } + lo_map_destroy(&lo->fd_map); + lo_map_destroy(&lo->dirp_map); + lo_map_destroy(&lo->ino_map); + + if (lo->proc_self_fd >= 0) { + close(lo->proc_self_fd); + } + + if (lo->root.fd >= 0) { + close(lo->root.fd); + } + + free(lo->source); +} + +int main(int argc, char *argv[]) +{ + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); + struct fuse_session *se; + struct fuse_cmdline_opts opts; + struct lo_data lo = { + .debug = 0, + .writeback = 0, + .posix_lock = 1, + .proc_self_fd = -1, + }; + struct lo_map_elem *root_elem; + int ret = -1; + + /* Don't mask creation mode, kernel already did that */ + umask(0); + + pthread_mutex_init(&lo.mutex, NULL); + lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); + lo.root.fd = -1; + lo.root.fuse_ino = FUSE_ROOT_ID; + lo.cache = CACHE_AUTO; + + /* + * Set up the ino map like this: + * [0] Reserved (will not be used) + * [1] Root inode + */ + lo_map_init(&lo.ino_map); + lo_map_reserve(&lo.ino_map, 0)->in_use = false; + root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); + root_elem->inode = &lo.root; + + lo_map_init(&lo.dirp_map); + lo_map_init(&lo.fd_map); + + if (fuse_parse_cmdline(&args, &opts) != 0) { + goto err_out1; + } + fuse_set_log_func(log_func); + use_syslog = opts.syslog; + if (use_syslog) { + openlog("virtiofsd", LOG_PID, LOG_DAEMON); + } + + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); + printf(" -o source=PATH shared directory tree\n"); + fuse_lowlevel_help(); + ret = 0; + goto err_out1; + } else if (opts.show_version) { + fuse_lowlevel_version(); + ret = 0; + goto err_out1; + } else if (opts.print_capabilities) { + print_capabilities(); + ret = 0; + goto err_out1; + } + + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { + goto err_out1; + } + + /* + * log_level is 0 if not configured via cmd options (0 is LOG_EMERG, + * and we don't use this log level). 
+ */ + if (opts.log_level != 0) { + current_log_level = opts.log_level; + } + lo.debug = opts.debug; + if (lo.debug) { + current_log_level = FUSE_LOG_DEBUG; + } + if (lo.source) { + struct stat stat; + int res; + + res = lstat(lo.source, &stat); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", + lo.source); + exit(1); + } + if (!S_ISDIR(stat.st_mode)) { + fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); + exit(1); + } + } else { + lo.source = strdup("/"); + } + if (!lo.timeout_set) { + switch (lo.cache) { + case CACHE_NONE: + lo.timeout = 0.0; + break; + + case CACHE_AUTO: + lo.timeout = 1.0; + break; + + case CACHE_ALWAYS: + lo.timeout = 86400.0; + break; + } + } else if (lo.timeout < 0) { + fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout); + exit(1); + } + + se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); + if (se == NULL) { + goto err_out1; + } + + if (fuse_set_signal_handlers(se) != 0) { + goto err_out2; + } + + if (fuse_session_mount(se) != 0) { + goto err_out3; + } + + fuse_daemonize(opts.foreground); + + setup_nofile_rlimit(); + + /* Must be before sandbox since it wants /proc */ + setup_capng(); + + setup_sandbox(&lo, se, opts.syslog); + + setup_root(&lo, &lo.root); + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + + fuse_session_unmount(se); + cleanup_capng(); +err_out3: + fuse_remove_signal_handlers(se); +err_out2: + fuse_session_destroy(se); +err_out1: + fuse_opt_free_args(&args); + + fuse_lo_data_cleanup(&lo); + + return ret ? 1 : 0; +} diff --git a/tools/virtiofsd/seccomp.c b/tools/virtiofsd/seccomp.c new file mode 100644 index 0000000000..2d9d4a7ec0 --- /dev/null +++ b/tools/virtiofsd/seccomp.c @@ -0,0 +1,165 @@ +/* + * Seccomp sandboxing for virtiofsd + * + * Copyright (C) 2019 Red Hat, Inc. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "seccomp.h" +#include "fuse_i.h" +#include "fuse_log.h" +#include <errno.h> +#include <glib.h> +#include <seccomp.h> +#include <stdlib.h> + +/* Bodge for libseccomp 2.4.2 which broke ppoll */ +#if !defined(__SNR_ppoll) && defined(__SNR_brk) +#ifdef __NR_ppoll +#define __SNR_ppoll __NR_ppoll +#else +#define __SNR_ppoll __PNR_ppoll +#endif +#endif + +static const int syscall_whitelist[] = { + /* TODO ireg sem*() syscalls */ + SCMP_SYS(brk), + SCMP_SYS(capget), /* For CAP_FSETID */ + SCMP_SYS(capset), + SCMP_SYS(clock_gettime), + SCMP_SYS(clone), +#ifdef __NR_clone3 + SCMP_SYS(clone3), +#endif + SCMP_SYS(close), + SCMP_SYS(copy_file_range), + SCMP_SYS(dup), + SCMP_SYS(eventfd2), + SCMP_SYS(exit), + SCMP_SYS(exit_group), + SCMP_SYS(fallocate), + SCMP_SYS(fchmodat), + SCMP_SYS(fchownat), + SCMP_SYS(fcntl), + SCMP_SYS(fdatasync), + SCMP_SYS(fgetxattr), + SCMP_SYS(flistxattr), + SCMP_SYS(flock), + SCMP_SYS(fremovexattr), + SCMP_SYS(fsetxattr), + SCMP_SYS(fstat), + SCMP_SYS(fstatfs), + SCMP_SYS(fsync), + SCMP_SYS(ftruncate), + SCMP_SYS(futex), + SCMP_SYS(getdents), + SCMP_SYS(getdents64), + SCMP_SYS(getegid), + SCMP_SYS(geteuid), + SCMP_SYS(getpid), + SCMP_SYS(gettid), + SCMP_SYS(gettimeofday), + SCMP_SYS(linkat), + SCMP_SYS(lseek), + SCMP_SYS(madvise), + SCMP_SYS(mkdirat), + SCMP_SYS(mknodat), + SCMP_SYS(mmap), + SCMP_SYS(mprotect), + SCMP_SYS(mremap), + SCMP_SYS(munmap), + SCMP_SYS(newfstatat), + SCMP_SYS(open), + SCMP_SYS(openat), + SCMP_SYS(ppoll), + SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? 
*/ + SCMP_SYS(preadv), + SCMP_SYS(pread64), + SCMP_SYS(pwritev), + SCMP_SYS(pwrite64), + SCMP_SYS(read), + SCMP_SYS(readlinkat), + SCMP_SYS(recvmsg), + SCMP_SYS(renameat), + SCMP_SYS(renameat2), + SCMP_SYS(rt_sigaction), + SCMP_SYS(rt_sigprocmask), + SCMP_SYS(rt_sigreturn), + SCMP_SYS(sendmsg), + SCMP_SYS(setresgid), + SCMP_SYS(setresuid), +#ifdef __NR_setresgid32 + SCMP_SYS(setresgid32), +#endif +#ifdef __NR_setresuid32 + SCMP_SYS(setresuid32), +#endif + SCMP_SYS(set_robust_list), + SCMP_SYS(symlinkat), + SCMP_SYS(time), /* Rarely needed, except on static builds */ + SCMP_SYS(tgkill), + SCMP_SYS(unlinkat), + SCMP_SYS(utimensat), + SCMP_SYS(write), + SCMP_SYS(writev), +}; + +/* Syscalls used when --syslog is enabled */ +static const int syscall_whitelist_syslog[] = { + SCMP_SYS(sendto), +}; + +static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscalls[i], 0) != 0) { + fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d failed\n", + syscalls[i]); + exit(1); + } + } +} + +void setup_seccomp(bool enable_syslog) +{ + scmp_filter_ctx ctx; + +#ifdef SCMP_ACT_KILL_PROCESS + ctx = seccomp_init(SCMP_ACT_KILL_PROCESS); + /* Handle a newer libseccomp but an older kernel */ + if (!ctx && errno == EOPNOTSUPP) { + ctx = seccomp_init(SCMP_ACT_TRAP); + } +#else + ctx = seccomp_init(SCMP_ACT_TRAP); +#endif + if (!ctx) { + fuse_log(FUSE_LOG_ERR, "seccomp_init() failed\n"); + exit(1); + } + + add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist)); + if (enable_syslog) { + add_whitelist(ctx, syscall_whitelist_syslog, + G_N_ELEMENTS(syscall_whitelist_syslog)); + } + + /* libvhost-user calls this for post-copy migration, we don't need it */ + if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(userfaultfd), 0) != 0) { + fuse_log(FUSE_LOG_ERR, "seccomp_rule_add userfaultfd failed\n"); + exit(1); + } + + if (seccomp_load(ctx) < 0) { + fuse_log(FUSE_LOG_ERR, "seccomp_load() failed\n"); + exit(1); + } + + seccomp_release(ctx); +} diff --git a/tools/virtiofsd/seccomp.h b/tools/virtiofsd/seccomp.h new file mode 100644 index 0000000000..d47c8eade6 --- /dev/null +++ b/tools/virtiofsd/seccomp.h @@ -0,0 +1,16 @@ +/* + * Seccomp sandboxing for virtiofsd + * + * Copyright (C) 2019 Red Hat, Inc. 
+ * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef VIRTIOFSD_SECCOMP_H +#define VIRTIOFSD_SECCOMP_H + +#include <stdbool.h> + +void setup_seccomp(bool enable_syslog); + +#endif /* VIRTIOFSD_SECCOMP_H */ diff --git a/trace/mem-internal.h b/trace/mem-internal.h index 0a32aa22ca..8b72b678fa 100644 --- a/trace/mem-internal.h +++ b/trace/mem-internal.h @@ -47,21 +47,4 @@ static inline uint16_t trace_mem_get_info(MemOp op, mmu_idx); } -/* Used by the atomic helpers */ -static inline -uint16_t trace_mem_build_info_no_se_be(int size_shift, bool store, - TCGMemOpIdx oi) -{ - return trace_mem_build_info(size_shift, false, MO_BE, store, - get_mmuidx(oi)); -} - -static inline -uint16_t trace_mem_build_info_no_se_le(int size_shift, bool store, - TCGMemOpIdx oi) -{ - return trace_mem_build_info(size_shift, false, MO_LE, store, - get_mmuidx(oi)); -} - #endif /* TRACE__MEM_INTERNAL_H */ diff --git a/ui/console.c b/ui/console.c index 69339b028b..179901c35e 100644 --- a/ui/console.c +++ b/ui/console.c @@ -2338,6 +2338,7 @@ void qemu_display_help(void) int idx; printf("Available display backend types:\n"); + printf("none\n"); for (idx = DISPLAY_TYPE_NONE; idx < DISPLAY_TYPE__MAX; idx++) { if (!dpys[idx]) { ui_module_load_one(DisplayType_str(idx)); diff --git a/ui/vnc-enc-zrle.c b/ui/vnc-enc-zrle.c index 17fd28a2e2..b4f71e32cf 100644 --- a/ui/vnc-enc-zrle.c +++ b/ui/vnc-enc-zrle.c @@ -98,8 +98,8 @@ static int zrle_compress_data(VncState *vs, int level) /* set pointers */ zstream->next_in = vs->zrle->zrle.buffer; zstream->avail_in = vs->zrle->zrle.offset; - zstream->next_out = vs->zrle->zlib.buffer + vs->zrle->zlib.offset; - zstream->avail_out = vs->zrle->zlib.capacity - vs->zrle->zlib.offset; + zstream->next_out = vs->zrle->zlib.buffer; + zstream->avail_out = vs->zrle->zlib.capacity; zstream->data_type = Z_BINARY; /* start encoding */ @@ -898,8 +898,6 @@ int vnc_raw_send_framebuffer_update(VncState *vs, int x, int y, int w, int h) int vnc_send_framebuffer_update(VncState *vs, int x, int y, int w, int h) { int n = 0; - bool encode_raw = false; - size_t saved_offs = vs->output.offset; switch(vs->vnc_encoding) { case VNC_ENCODING_ZLIB: @@ -922,24 +920,10 @@ int vnc_send_framebuffer_update(VncState *vs, int x, int y, int w, int h) n = vnc_zywrle_send_framebuffer_update(vs, x, y, w, h); break; default: - encode_raw = true; + vnc_framebuffer_update(vs, x, y, w, h, VNC_ENCODING_RAW); + n = vnc_raw_send_framebuffer_update(vs, x, y, w, h); break; } - - /* If the client has the same pixel format as our internal buffer and - * a RAW encoding would need less space fall back to RAW encoding to - * save bandwidth and processing power in the client. */ - if (!encode_raw && vs->write_pixels == vnc_write_pixels_copy && - 12 + h * w * VNC_SERVER_FB_BYTES <= (vs->output.offset - saved_offs)) { - vs->output.offset = saved_offs; - encode_raw = true; - } - - if (encode_raw) { - vnc_framebuffer_update(vs, x, y, w, h, VNC_ENCODING_RAW); - n = vnc_raw_send_framebuffer_update(vs, x, y, w, h); - } - return n; } @@ -2087,8 +2071,15 @@ static void set_encodings(VncState *vs, int32_t *encodings, size_t n_encodings) break; #endif case VNC_ENCODING_ZLIB: - vs->features |= VNC_FEATURE_ZLIB_MASK; - vs->vnc_encoding = enc; + /* + * VNC_ENCODING_ZRLE compresses better than VNC_ENCODING_ZLIB. + * So prioritize ZRLE, even if the client hints that it prefers + * ZLIB. 
+ */ + if ((vs->features & VNC_FEATURE_ZRLE_MASK) == 0) { + vs->features |= VNC_FEATURE_ZLIB_MASK; + vs->vnc_encoding = enc; + } break; case VNC_ENCODING_ZRLE: vs->features |= VNC_FEATURE_ZRLE_MASK; diff --git a/util/cacheinfo.c b/util/cacheinfo.c index ea6f3e99bf..d94dc6adc8 100644 --- a/util/cacheinfo.c +++ b/util/cacheinfo.c @@ -93,10 +93,16 @@ static void sys_cache_info(int *isize, int *dsize) static void sys_cache_info(int *isize, int *dsize) { # ifdef _SC_LEVEL1_ICACHE_LINESIZE - *isize = sysconf(_SC_LEVEL1_ICACHE_LINESIZE); + int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE); + if (tmp_isize > 0) { + *isize = tmp_isize; + } # endif # ifdef _SC_LEVEL1_DCACHE_LINESIZE - *dsize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (tmp_dsize > 0) { + *dsize = tmp_dsize; + } # endif } #endif /* sys_cache_info */ @@ -1604,9 +1604,6 @@ static bool main_loop_should_exit(void) RunState r; ShutdownCause request; - if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { - return false; - } if (preconfig_exit_requested) { if (runstate_check(RUN_STATE_PRECONFIG)) { runstate_set(RUN_STATE_PRELAUNCH); @@ -1635,8 +1632,13 @@ static bool main_loop_should_exit(void) pause_all_vcpus(); qemu_system_reset(request); resume_all_vcpus(); + /* + * runstate can change in pause_all_vcpus() + * as iothread mutex is unlocked + */ if (!runstate_check(RUN_STATE_RUNNING) && - !runstate_check(RUN_STATE_INMIGRATE)) { + !runstate_check(RUN_STATE_INMIGRATE) && + !runstate_check(RUN_STATE_FINISH_MIGRATE)) { runstate_set(RUN_STATE_PRELAUNCH); } } @@ -2753,8 +2755,6 @@ static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp) static void configure_accelerators(const char *progname) { const char *accel; - char **accel_list, **tmp; - bool accel_initialised = false; bool init_failed = false; qemu_opts_foreach(qemu_find_opts("icount"), @@ -2762,26 +2762,33 @@ static void configure_accelerators(const char *progname) accel = qemu_opt_get(qemu_get_machine_opts(), "accel"); if (QTAILQ_EMPTY(&qemu_accel_opts.head)) { + char **accel_list, **tmp; + if (accel == NULL) { /* Select the default accelerator */ - if (!accel_find("tcg") && !accel_find("kvm")) { - error_report("No accelerator selected and" - " no default accelerator available"); - exit(1); - } else { - int pnlen = strlen(progname); - if (pnlen >= 3 && g_str_equal(&progname[pnlen - 3], "kvm")) { + bool have_tcg = accel_find("tcg"); + bool have_kvm = accel_find("kvm"); + + if (have_tcg && have_kvm) { + if (g_str_has_suffix(progname, "kvm")) { /* If the program name ends with "kvm", we prefer KVM */ accel = "kvm:tcg"; } else { accel = "tcg:kvm"; } + } else if (have_kvm) { + accel = "kvm"; + } else if (have_tcg) { + accel = "tcg"; + } else { + error_report("No accelerator selected and" + " no default accelerator available"); + exit(1); } } - accel_list = g_strsplit(accel, ":", 0); - for (tmp = accel_list; !accel_initialised && tmp && *tmp; tmp++) { + for (tmp = accel_list; *tmp; tmp++) { /* * Filter invalid accelerators here, to prevent obscenities * such as "-machine accel=tcg,,thread=single". |