summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.clang-format126
-rw-r--r--.github/workflows/build-kernel-module.yml113
-rw-r--r--.github/workflows/build-programs.yml66
-rw-r--r--.github/workflows/lint.yml34
-rw-r--r--.github/workflows/release.yml64
-rw-r--r--.gitignore2
-rw-r--r--CMakeLists.txt498
-rw-r--r--Kbuild.in2
-rw-r--r--LOCKS80
-rw-r--r--README.md456
-rwxr-xr-xbuild.sh6
-rw-r--r--cmake/Build.cmake27
-rw-r--r--cmake/CheckAFLCCompiler.cmake24
-rw-r--r--cmake/DockerImage.cmake25
-rw-r--r--cmake/FindCheckPatch.cmake31
-rw-r--r--cmake/FindClangFormat.cmake20
-rw-r--r--cmake/FindDocker.cmake20
-rw-r--r--cmake/FindFuse.cmake10
-rw-r--r--cmake/FindKernelHeaders.cmake61
-rw-r--r--cmake/FindLibatomic.cmake45
-rw-r--r--cmake/FindStdatomic.cmake14
-rw-r--r--cmake/GenerateBuild.cmake11
-rw-r--r--cmake/GenerateVersion.cmake20
-rw-r--r--cmake/InstallVersionFile.cmake.in8
-rw-r--r--cmake/Kernel.cmake58
-rw-r--r--cmake/Lint.cmake70
-rw-r--r--cmake/PostVersionPackaging.cmake9
-rw-r--r--cmake/PreVersionPackaging.cmake11
-rw-r--r--cmake/Version.cmake125
-rw-r--r--cmake/toolchain/Aarch64LinuxGnu.cmake24
-rw-r--r--cmake/toolchain/PowerpcLinuxGnu.cmake24
-rw-r--r--conf/README.server35
-rwxr-xr-xget-version.sh22
-rw-r--r--inc/dnbd3/build.h.in11
-rw-r--r--inc/dnbd3/config.h (renamed from src/config.h)2
-rw-r--r--inc/dnbd3/config/client.h52
-rw-r--r--inc/dnbd3/config/server.h (renamed from src/serverconfig.h)10
-rw-r--r--inc/dnbd3/shared/crc32.h (renamed from src/shared/crc32.h)0
-rw-r--r--inc/dnbd3/shared/fdsignal.h (renamed from src/shared/fdsignal.h)0
-rw-r--r--inc/dnbd3/shared/log.h (renamed from src/shared/log.h)17
-rw-r--r--inc/dnbd3/shared/protocol.h (renamed from src/shared/protocol.h)9
-rw-r--r--inc/dnbd3/shared/serialize.h (renamed from src/serialize.h)7
-rw-r--r--inc/dnbd3/shared/sockhelper.h (renamed from src/shared/sockhelper.h)2
-rw-r--r--inc/dnbd3/shared/timing.h (renamed from src/shared/timing.h)4
-rw-r--r--inc/dnbd3/types.h (renamed from src/types.h)38
-rw-r--r--inc/dnbd3/version.h.in12
-rwxr-xr-xpack.sh6
-rw-r--r--pkg/CMakeLists.txt8
-rw-r--r--pkg/config/CMakeLists.txt15
-rw-r--r--pkg/config/alt-servers (renamed from conf/alt-servers)2
-rw-r--r--pkg/config/rpc.acl (renamed from conf/rpc.acl)2
-rw-r--r--pkg/config/server.conf (renamed from conf/server.conf)31
-rw-r--r--pkg/docker/archlinux_dockerfile28
-rw-r--r--pkg/docker/ubuntu-20-04_dockerfile28
-rw-r--r--pkg/systemd/CMakeLists.txt13
-rw-r--r--pkg/systemd/dnbd3-server.service12
-rw-r--r--src/CMakeLists.txt24
-rw-r--r--src/bench/CMakeLists.txt22
-rw-r--r--src/bench/connection.c12
-rw-r--r--src/bench/connection.h2
-rw-r--r--src/bench/helper.h2
-rw-r--r--src/bench/main.c8
-rw-r--r--src/bench/serialize.c5
-rw-r--r--src/client/CMakeLists.txt18
-rw-r--r--src/client/client.c284
-rw-r--r--src/clientconfig.h36
-rw-r--r--src/fuse/CMakeLists.txt29
-rw-r--r--src/fuse/connection.c542
-rw-r--r--src/fuse/connection.h24
-rw-r--r--src/fuse/helper.c6
-rw-r--r--src/fuse/helper.h14
-rw-r--r--src/fuse/main.c427
-rw-r--r--src/fuse/serialize.c5
-rw-r--r--src/kernel/.clang-format552
-rw-r--r--src/kernel/CMakeLists.txt66
-rw-r--r--src/kernel/Kbuild5
-rw-r--r--src/kernel/blk.c740
-rw-r--r--src/kernel/blk.h18
-rw-r--r--src/kernel/core.c81
-rw-r--r--src/kernel/dnbd3.h84
-rw-r--r--src/kernel/dnbd3_main.c250
-rw-r--r--src/kernel/dnbd3_main.h148
-rw-r--r--src/kernel/net.c1929
-rw-r--r--src/kernel/net.h29
l---------src/kernel/serialize.c1
-rw-r--r--src/kernel/serialize_kmod.c5
-rw-r--r--src/kernel/sysfs.c177
-rw-r--r--src/kernel/sysfs.h20
-rw-r--r--src/kernel/utils.c41
-rw-r--r--src/kernel/utils.h29
-rw-r--r--src/server/CMakeLists.txt112
-rw-r--r--src/server/altservers.c79
-rw-r--r--src/server/altservers.h2
-rw-r--r--src/server/fileutil.c2
-rw-r--r--src/server/fuse.c661
-rw-r--r--src/server/fuse.h10
-rw-r--r--src/server/globals.c72
-rw-r--r--src/server/globals.h93
-rw-r--r--src/server/helper.h4
-rw-r--r--src/server/image.c685
-rw-r--r--src/server/image.h48
-rw-r--r--src/server/ini.c2
-rw-r--r--src/server/integrity.c20
-rw-r--r--src/server/locks.c4
-rw-r--r--src/server/locks.h20
-rw-r--r--src/server/net.c174
-rw-r--r--src/server/net.h4
-rw-r--r--src/server/picohttpparser/CMakeLists.txt11
-rw-r--r--src/server/reference.h5
-rw-r--r--src/server/rpc.c29
-rw-r--r--src/server/serialize.c5
-rw-r--r--src/server/server.c84
-rw-r--r--src/server/server.h4
-rw-r--r--src/server/threadpool.c19
-rw-r--r--src/server/threadpool.h5
-rw-r--r--src/server/uplink.c1127
-rw-r--r--src/server/uplink.h8
-rw-r--r--src/shared/CMakeLists.txt28
-rw-r--r--src/shared/crc32.c238
-rw-r--r--src/shared/fdsignal.c2
-rw-r--r--src/shared/log.c36
-rw-r--r--src/shared/serialize.c (renamed from src/serialize.c)43
-rw-r--r--src/shared/sockhelper.c36
-rw-r--r--src/shared/timing.c2
-rw-r--r--src/version.c.in3
-rw-r--r--src/version.h30
126 files changed, 8143 insertions, 3649 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..6adc436
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -3
+AlignAfterOpenBracket: DontAlign
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: false
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: true
+ AfterControlStatement: false
+ AfterEnum: true
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: true
+ AfterUnion: true
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 6
+ContinuationIndentWidth: 6
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: true
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+#IndentCaseBlocks: false
+IndentCaseLabels: false
+IndentGotoLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 3
+IndentWrappedFunctionNames: false
+#InsertTrailingCommas: Wrapped
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 60
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+#PenaltyIndentedWhitespace: 20
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+#SpaceAroundPointerQualifiers: Both
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInConditionalStatement: true
+SpacesInParentheses: true
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 3
+#UseTab: AlignWithSpaces
+UseTab: ForContinuationAndIndentation
+...
diff --git a/.github/workflows/build-kernel-module.yml b/.github/workflows/build-kernel-module.yml
new file mode 100644
index 0000000..526bf8e
--- /dev/null
+++ b/.github/workflows/build-kernel-module.yml
@@ -0,0 +1,113 @@
+name: Build dnbd3 kernel module
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ build:
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - name: CentOS 8.4 [4.18.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-rpm
+ kernel-source: https://vault.centos.org/8.4.2105/BaseOS/Source/SPackages/kernel-4.18.0-305.25.1.el8_4.src.rpm
+ - name: CentOS 8.5 [4.18.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-rpm
+ kernel-source: https://vault.centos.org/8.5.2111/BaseOS/Source/SPackages/kernel-4.18.0-348.2.1.el8_5.src.rpm
+ - name: CentOS 9.3 [5.14.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-tar
+ kernel-source: https://files.bwlp.ks.uni-freiburg.de/stuff/centos/linux-5.14.0-362.18.1.el9_3.tar.xz
+ - name: CentOS 9.4 [5.14.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-tar
+ kernel-source: https://files.bwlp.ks.uni-freiburg.de/stuff/centos/linux-5.14.0-427.el9.tar.xz
+ - name: Vanilla [4.19.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 4.19.y
+ - name: Vanilla [5.4.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 5.4.y
+ - name: Vanilla [5.10.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 5.10.y
+ - name: Vanilla [5.15.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 5.15.y
+ - name: Vanilla [6.1.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 6.1.y
+ - name: Vanilla [6.6.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 6.6.y
+ name: Build dnbd3 ${{ matrix.config.name }}
+ runs-on: ${{ matrix.config.build-os }}
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ libelf-dev \
+ rpm2cpio \
+ rpm
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Checkout Vanilla kernel version [git]
+ if: matrix.config.kernel-type == 'vanilla'
+ run: git clone --depth 1 --branch "linux-${{ matrix.config.kernel-version }}" "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git" "../kernel"
+ - name: Checkout CentOS kernel version [rpm]
+ if: matrix.config.kernel-type == 'centos-rpm'
+ run: |
+ mkdir "${{ github.workspace }}/../kernel"
+ mkdir "${{ github.workspace }}/../kernel-download"
+ wget -q -O "${{ github.workspace }}/../kernel-download/kernel.rpm" "${{ matrix.config.kernel-source }}"
+ cd "${{ github.workspace }}/../kernel-download"
+ rpm2cpio "${{ github.workspace }}/../kernel-download/kernel.rpm" | cpio --extract --make-directories
+ tar --strip 1 -a -x -f linux-*.el*.tar.xz -C "${{ github.workspace }}/../kernel"
+ cd
+ rm -rf -- "${{ github.workspace }}/../kernel-download"
+ - name: Checkout CentOS kernel version [tar]
+ if: matrix.config.kernel-type == 'centos-tar'
+ run: |
+ mkdir "${{ github.workspace }}/../kernel"
+ mkdir "${{ github.workspace }}/../kernel-download"
+ wget -q -O "${{ github.workspace }}/../kernel-download/kernel.tar.xz" "${{ matrix.config.kernel-source }}"
+ tar --strip 1 -a -x -f "${{ github.workspace }}/../kernel-download/kernel.tar.xz" -C "${{ github.workspace }}/../kernel"
+ rm -rf -- "${{ github.workspace }}/../kernel-download"
+ - name: Configure kernel version
+ working-directory: ${{ github.workspace }}/../kernel
+ run: |
+ make defconfig
+ make modules_prepare
+ - name: Configure dnbd3 build
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D CMAKE_BUILD_TYPE=Release \
+ -D DNBD3_KERNEL_MODULE=ON \
+ -D KERNEL_BUILD_DIR=${{ github.workspace }}/../kernel \
+ -D KERNEL_INSTALL_DIR=${{ github.workspace }}/../kernel/extra \
+ -D KERNEL_SCRIPTS_DIR=${{ github.workspace }}/../kernel/scripts \
+ -D DNBD3_BENCHMARK=OFF \
+ -D DNBD3_CLIENT_FUSE=OFF \
+ -D DNBD3_SERVER=OFF \
+ -D DNBD3_SERVER_FUSE=OFF \
+ -D DNBD3_RELEASE_HARDEN=OFF
+ - name: Build dnbd3 kernel module
+ working-directory: ${{ github.workspace }}/build
+ run: make
diff --git a/.github/workflows/build-programs.yml b/.github/workflows/build-programs.yml
new file mode 100644
index 0000000..06642b5
--- /dev/null
+++ b/.github/workflows/build-programs.yml
@@ -0,0 +1,66 @@
+name: Build dnbd3 programs
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ build:
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - name: debug [default] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Debug"
+ build-cfg-server-fuse: "OFF"
+ build-cfg-harden: "OFF"
+ - name: debug [server with fuse support] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Debug"
+ build-cfg-server-fuse: "ON"
+ build-cfg-harden: "OFF"
+ - name: release [default] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Release"
+ build-cfg-server-fuse: "OFF"
+ build-cfg-harden: "OFF"
+ - name: release [server with fuse support] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Release"
+ build-cfg-server-fuse: "ON"
+ build-cfg-harden: "OFF"
+ - name: release [default hardening] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Release"
+ build-cfg-server-fuse: "OFF"
+ build-cfg-harden: "ON"
+ name: Build dnbd3 ${{ matrix.config.name }}
+ runs-on: ${{ matrix.config.build-os }}
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Configure dnbd3 build
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D CMAKE_BUILD_TYPE=${{ matrix.config.build-type }} \
+ -D DNBD3_KERNEL_MODULE=OFF \
+ -D DNBD3_BENCHMARK=ON \
+ -D DNBD3_SERVER_FUSE=${{ matrix.config.build-cfg-server-fuse }} \
+ -D DNBD3_RELEASE_HARDEN=${{ matrix.config.build-cfg-harden }}
+ - name: Build dnbd3 artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: make
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..6d41378
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,34 @@
+name: Lint dnbd3
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ lint:
+ name: Lint dnbd3
+ runs-on: ubuntu-22.04
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ rpm
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Configure dnbd3 build
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D DNBD3_BENCHMARK=ON
+ - name: Lint dnbd3 artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: make lint
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..6a06173
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,64 @@
+name: Release dnbd3
+
+on:
+ release:
+ types: [published]
+
+jobs:
+ release:
+ name: Release dnbd3
+ # Use very old system (= libc) for building for best compatibility
+ runs-on: ubuntu-18.04
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ rpm
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Configure dnbd3 release
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D CMAKE_BUILD_TYPE=Release \
+ -D DNBD3_KERNEL_MODULE=OFF \
+ -D DNBD3_BENCHMARK=OFF \
+ -D DNBD3_SERVER_FUSE=OFF \
+ -D DNBD3_RELEASE_HARDEN=OFF
+ - name: Build dnbd3 artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: make package
+ - name: Create links to artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: |
+ ln -s dnbd3_*.deb dnbd3_linux_x86_64.deb
+ ln -s dnbd3_*.rpm dnbd3_linux_x86_64.rpm
+ ln -s dnbd3_*.tar.gz dnbd3_linux_x86_64.tar.gz
+ - name: Attach Debian artifacts to release
+ uses: svenstaro/upload-release-action@v2
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ${{ github.workspace }}/build/dnbd3_linux_x86_64.deb
+ asset_name: dnbd3_linux_x86_64.deb
+ tag: ${{ github.ref }}
+ - name: Attach RedHat artifacts to release
+ uses: svenstaro/upload-release-action@v2
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ${{ github.workspace }}/build/dnbd3_linux_x86_64.rpm
+ asset_name: dnbd3_linux_x86_64.rpm
+ tag: ${{ github.ref }}
+ - name: Attach generic artifacts to release
+ uses: svenstaro/upload-release-action@v2
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ${{ github.workspace }}/build/dnbd3_linux_x86_64.tar.gz
+ asset_name: dnbd3_linux_x86_64.tar.gz
+ tag: ${{ github.ref }}
diff --git a/.gitignore b/.gitignore
index 38ae262..6617c58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,3 @@ build/
*.swp
.autotools
.idea
-/version.txt
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc8bfb7..69459dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,252 +1,246 @@
-################################################################################
-# GENERAL #
-################################################################################
-
-PROJECT(dnbd3 C)
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12)
-IF (CMAKE_BUILD_TYPE STREQUAL "")
- SET(CMAKE_BUILD_TYPE Debug)
-ENDIF()
-
-SET(CMAKE_INSTALL_PREFIX "/usr/local" CACHE PATH "Path prefix for system installation")
-OPTION(BUILD_FUSE_CLIENT "Build dnbd3 fuse client" ON)
-OPTION(BUILD_SERVER "Build dnbd3 server" ON)
-OPTION(BUILD_STRESSTEST "Build dnbd3 stress testing tool" OFF)
-SET(EXTRA_C_FLAGS "" CACHE STRING "Additional options to pass to compiler")
-
-OPTION(SERVER_FOR_AFL "Build dnbd3-server for usage with afl-fuzz" OFF)
-
-# Is there a non-retarded way to check if build type is debug or release?
-# When specifying, it is case insensitive, so DeBuG would also enable debug builds,
-# but in cmake, we can only do case sensitive matches... :/
-string( TOLOWER "${CMAKE_BUILD_TYPE}" bt_lower )
-if (NOT bt_lower MATCHES "^(debug|release)$")
- message( FATAL_ERROR "Build type needs to be either Debug or Release" )
-endif()
-
-message( "Build Type selected: ${CMAKE_BUILD_TYPE}" )
-
-IF(CMAKE_SYSTEM_NAME MATCHES "BSD")
- message("Detected *BSD System: disable build of Kernel Module.")
- SET(BUILD_KERNEL_MODULE False)
-ELSE()
- OPTION(BUILD_KERNEL_MODULE "Build the dnbd3 Linux kernel module" ON)
-ENDIF()
-
-INCLUDE(CheckCCompilerFlag)
-macro (TRY_ADD_FLAG _FLAG)
- UNSET(TMP_TEST CACHE)
- CHECK_C_COMPILER_FLAG("${_FLAG}" TMP_TEST)
- if (TMP_TEST)
- message(":-) Compiler supports ${_FLAG}")
- SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${_FLAG}")
- else()
- message(":-( Compiler does not support ${_FLAG}")
- endif()
-
-endmacro()
-
-# Common for gcc and clang
-SET(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,relro,-z,now,-z,defs -pie")
-SET(CMAKE_C_FLAGS "-fPIE -std=c11 -Wno-multichar -fno-strict-aliasing -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 ${EXTRA_C_FLAGS}")
-SET(CMAKE_C_FLAGS_RELEASE " -O3 -Wno-unused-result -DNDEBUG")
-# Hardening. Try as much as is possible.
-TRY_ADD_FLAG("-mmitigate-rop")
-TRY_ADD_FLAG("-fstack-protector-strong")
-TRY_ADD_FLAG("-fstack-clash-protection")
-TRY_ADD_FLAG("-Wformat")
-TRY_ADD_FLAG("-Wformat-security")
-TRY_ADD_FLAG("-Werror=format-security")
-if(CMAKE_C_COMPILER MATCHES "clang")
- message( "Using clang flags." )
- SET(CMAKE_C_FLAGS_DEBUG " -O1 -fno-omit-frame-pointer -g -Wall -Wextra -Wpedantic -Wno-unused-result -D_DEBUG")
-elseif (CMAKE_C_COMPILER MATCHES "(cc-)|(cc$)")
- message( "Using (g)cc flags." )
- SET(CMAKE_C_FLAGS_DEBUG " -O0 -g -Wall -Wextra -Wpedantic -Wconversion -Wno-sign-conversion -D_DEBUG")
-else()
- message( FATAL_ERROR "Could not determine compiler type." )
-endif()
-
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
-
-ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
-ADD_DEFINITIONS(-DWITH_IPV6)
-
-FIND_PACKAGE(Threads)
-
-SET(DO_ABORT False)
-
-message( " *************************************************" )
-if(BUILD_FUSE_CLIENT)
- FIND_PACKAGE(Fuse)
- if(NOT FUSE_FOUND)
- message( " *** No fuse dev libs found, can't build dnbd3-fuse" )
- SET(DO_ABORT True)
- endif()
- if(NOT THREADS_FOUND)
- message( " *** No threads found, can't build dnbd3-fuse" )
- SET(DO_ABORT True)
- endif()
-endif()
-if(BUILD_SERVER)
- FIND_PACKAGE(Jansson)
- if(NOT THREADS_FOUND)
- message( " *** No threads found, can't build dnbd3-server" )
- SET(DO_ABORT True)
- endif()
- if(NOT JANSSON_FOUND)
- message( " *** No jansson lib found, can't build dnbd3-server" )
- SET(DO_ABORT True)
- endif()
-endif()
-if(BUILD_STRESSTEST)
- if(NOT THREADS_FOUND)
- message( " *** No threads found, can't build dnbd3-bench" )
- SET(DO_ABORT True)
- endif()
-endif()
-message( " *************************************************" )
-if(DO_ABORT)
- message( FATAL_ERROR "Aborting." )
-endif()
-
-#SET(FUSE_INCLUDE_DIR "")
-#SET(JANSSON_INCLUDE_DIR "")
-
-################################################################################
-# VERSION HEADER #
-################################################################################
-
-FILE(WRITE ${CMAKE_BINARY_DIR}/version.cmake
-"EXECUTE_PROCESS(
- COMMAND \${CMD}
- OUTPUT_VARIABLE VERSION
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- CONFIGURE_FILE(\${SRC} \${DST} @ONLY)
-")
-ADD_CUSTOM_TARGET(
- version
- ${CMAKE_COMMAND} -D SRC=${CMAKE_SOURCE_DIR}/src/version.c.in
- -D DST=${CMAKE_BINARY_DIR}/generated/version.c
- -D CMD=${CMAKE_SOURCE_DIR}/get-version.sh
- -P ${CMAKE_BINARY_DIR}/version.cmake
-)
-
-INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/generated )
-
-################################################################################
-# CLIENT #
-################################################################################
-
-if(BUILD_KERNEL_MODULE)
- FILE(GLOB_RECURSE CLIENT_SRCS src/client/*.c)
- ADD_EXECUTABLE(dnbd3-client ${CLIENT_SRCS})
- TARGET_LINK_LIBRARIES(dnbd3-client)
- ADD_DEPENDENCIES(dnbd3-client version)
- INSTALL(TARGETS dnbd3-client RUNTIME DESTINATION sbin)
-ENDIF()
-
-
-################################################################################
-# SERVER #
-################################################################################
-
-if(BUILD_SERVER)
- IF(SERVER_FOR_AFL)
- message(" ######################## Building server for AFL mode - will be useless otherwise!")
- ADD_DEFINITIONS(-DAFL_MODE)
- ENDIF()
- FILE(GLOB SERVER_SRCS src/server/*.c src/shared/*.c src/server/picohttpparser/*.c)
- ADD_EXECUTABLE(dnbd3-server ${SERVER_SRCS})
- TARGET_INCLUDE_DIRECTORIES(dnbd3-server PRIVATE ${JANSSON_INCLUDE_DIR})
- TARGET_LINK_LIBRARIES(dnbd3-server ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
- if(UNIX AND NOT APPLE)
- target_link_libraries(dnbd3-server rt)
- endif()
- ADD_DEPENDENCIES(dnbd3-server version)
- INSTALL(TARGETS dnbd3-server RUNTIME DESTINATION sbin)
-endif()
-
-
-
-################################################################################
-# FUSE #
-################################################################################
-
-if(BUILD_FUSE_CLIENT)
- FILE(GLOB FUSE_SRCS src/fuse/*.c src/shared/*.c)
- ADD_EXECUTABLE(dnbd3-fuse ${FUSE_SRCS})
- TARGET_INCLUDE_DIRECTORIES(dnbd3-fuse PRIVATE ${FUSE_INCLUDE_DIRS})
- TARGET_LINK_LIBRARIES(dnbd3-fuse ${CMAKE_THREAD_LIBS_INIT} ${FUSE_LIBRARIES})
- ADD_DEPENDENCIES(dnbd3-fuse version)
- INSTALL(TARGETS dnbd3-fuse RUNTIME DESTINATION bin)
-endif()
-
-################################################################################
-# STRESSTEST #
-################################################################################
-
-if(BUILD_STRESSTEST)
- FILE(GLOB BENCH_SRCS src/bench/*.c src/shared/*.c)
- ADD_EXECUTABLE(dnbd3-bench ${BENCH_SRCS})
- TARGET_LINK_LIBRARIES(dnbd3-bench ${CMAKE_THREAD_LIBS_INIT})
- ADD_DEPENDENCIES(dnbd3-bench version)
- INSTALL(TARGETS dnbd3-bench RUNTIME DESTINATION bin)
-endif()
-
-################################################################################
-# MODULE #
-################################################################################
-
-IF(BUILD_KERNEL_MODULE)
- SET(MODULE_NAME dnbd3)
- SET(MODULE_FILE ${MODULE_NAME}.ko)
- FILE(GLOB MODULE_SOURCE_FILES src/kernel/*.c src/serialize.c)
- FILE(GLOB MODULE_HEADER_FILES src/kernel/*.h)
-
- SET(KERNEL_DIR "" CACHE PATH "Path to kernel sources to compile against")
- IF(KERNEL_DIR STREQUAL "")
- SET(KERNEL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build")
- ENDIF()
-
- SET(KBUILD_COMMAND ${CMAKE_MAKE_PROGRAM} -C ${KERNEL_DIR}
- M=${CMAKE_BINARY_DIR} modules
- )
-
- CONFIGURE_FILE(Kbuild.in ${CMAKE_BINARY_DIR}/Kbuild)
-
- FOREACH(MODULE_SOURCE_FILE ${MODULE_SOURCE_FILES})
- CONFIGURE_FILE(${MODULE_SOURCE_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
- ENDFOREACH( MODULE_SOURCE_FILE )
-
- FOREACH(MODULE_HEADER_FILE ${MODULE_HEADER_FILES})
- CONFIGURE_FILE(${MODULE_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
- ENDFOREACH( MODULE_HEADER_FILE )
-
- ADD_CUSTOM_COMMAND(
- OUTPUT ${CMAKE_BINARY_DIR}/${MODULE_FILE}
- COMMAND ${KBUILD_COMMAND}
- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
- DEPENDS ${MODULE_SOURCE_FILES} Kbuild.in
- VERBATIM
- )
-
- ADD_CUSTOM_TARGET(${MODULE_NAME} ALL DEPENDS ${CMAKE_BINARY_DIR}/${MODULE_FILE})
-
- INSTALL(FILES ${CMAKE_BINARY_DIR}/${MODULE_NAME}.ko
- DESTINATION /lib/modules/${CMAKE_SYSTEM_VERSION}/kernel/drivers/block
- PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
- )
-
- INSTALL(CODE "EXECUTE_PROCESS(COMMAND depmod -a)")
-ENDIF()
-
-
-#
-# Other install files
-#
-
-FILE(GLOB conf_files "${CMAKE_CURRENT_SOURCE_DIR}/conf/*")
-INSTALL(FILES ${conf_files} DESTINATION /etc/dnbd3-server/sample/)
-
+cmake_minimum_required(VERSION 3.10)
+
+# include CMake macros
+set(PROJECT_MODULES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+ ${PROJECT_MODULES_DIR})
+
+# define root CMake project
+project(dnbd3
+ DESCRIPTION "dnbd3 Linux kernel module, server, clients and utilities"
+ LANGUAGES C)
+
+# define project options to define build configuration
+OPTION(DNBD3_KERNEL_MODULE "Build the dnbd3 Linux kernel module" ON)
+OPTION(DNBD3_BENCHMARK "Enable build of dnbd3-bench" OFF)
+OPTION(DNBD3_CLIENT_FUSE "Enable build of dnbd3-fuse" ON)
+OPTION(DNBD3_SERVER "Enable build of dnbd3-server" ON)
+OPTION(DNBD3_SERVER_FUSE "Enable FUSE-Integration for dnbd3-server" OFF)
+OPTION(DNBD3_SERVER_AFL "Build dnbd3-server for usage with afl-fuzz" OFF)
+OPTION(DNBD3_SERVER_DEBUG_LOCKS "Add lock debugging code to dnbd3-server" OFF)
+OPTION(DNBD3_SERVER_DEBUG_THREADS "Add thread debugging code to dnbd3-server" OFF)
+OPTION(DNBD3_RELEASE_HARDEN "Compile dnbd3 programs in Release build with code hardening options" OFF)
+OPTION(DNBD3_PACKAGE_DOCKER "Enable packaging of Docker image" OFF)
+
+# set supported build configurations
+set(CMAKE_CONFIGURATION_TYPES Debug Release)
+
+# set compilation in debug mode as default configuration
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Debug)
+ message(STATUS "Build type is not set. Defaulting to ${CMAKE_BUILD_TYPE} build!")
+endif(NOT CMAKE_BUILD_TYPE)
+
+# search for required packages
+find_package(Git REQUIRED)
+find_package(Threads REQUIRED)
+
+# include project version and build type related macros
+include(Version)
+include(Build)
+include(Lint)
+
+# check for system and enable or disable build of the Linux kernel module
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+ # disable build of the dnbd3 Linux kernel module on a system other than Linux, e.g. FreeBSD
+ message(STATUS "Detected non-Linux system: Disable build of the dnbd3 Linux kernel module")
+ set(DNBD3_KERNEL_MODULE OFF)
+endif(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+
+# enable or disable requirements for a build of the Linux kernel module
+if(DNBD3_KERNEL_MODULE)
+ # require Linux kernel headers
+ find_package(KernelHeaders REQUIRED)
+
+ # print configured settings
+ message(STATUS "Path to Linux kernel modules to compile against is " ${KERNEL_BUILD_DIR})
+ message(STATUS "Path to install Linux kernel modules is " ${KERNEL_INSTALL_DIR})
+endif(DNBD3_KERNEL_MODULE)
+
+# set include directories
+set(PROJECT_GEN_DIR ${CMAKE_BINARY_DIR}/generated)
+set(PROJECT_INCLUDE_DIR_PREFIX inc)
+set(PROJECT_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/${PROJECT_INCLUDE_DIR_PREFIX})
+set(PROJECT_INCLUDE_GEN_DIR ${PROJECT_GEN_DIR}/${PROJECT_INCLUDE_DIR_PREFIX})
+include_directories(${PROJECT_INCLUDE_DIR})
+
+# get all global header files for the linter
+set(DNBD3_HEADER_FILES ${PROJECT_INCLUDE_DIR}/dnbd3/build.h.in
+ ${PROJECT_INCLUDE_DIR}/dnbd3/config/client.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/config.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/config/server.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/crc32.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/fdsignal.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/log.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/protocol.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/serialize.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/sockhelper.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/timing.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/types.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/version.h.in)
+
+# add linter for header files
+add_linter(dnbd3-headers-lint "${DNBD3_HEADER_FILES}")
+add_linter_fix(dnbd3-headers-lint-fix "${DNBD3_HEADER_FILES}")
+
+# generate project version C header file from template
+# exposes dnbd3-generate-version and dnbd3-version target
+set(INCLUDE_VERSION_HEADER ${PROJECT_INCLUDE_DIR}/dnbd3/version.h)
+set(INCLUDE_VERSION_HEADER_TEMPLATE ${PROJECT_INCLUDE_DIR}/dnbd3/version.h.in)
+set(INCLUDE_VERSION_HEADER_GENERATE ${PROJECT_INCLUDE_GEN_DIR}/dnbd3/version.h)
+set(INCLUDE_VERSION_HEADER_GENERATE_PREFIX ${PROJECT_INCLUDE_DIR_PREFIX}/dnbd3)
+gen_project_version(${INCLUDE_VERSION_HEADER} ${INCLUDE_VERSION_HEADER_TEMPLATE} ${INCLUDE_VERSION_HEADER_GENERATE} ${GIT_EXECUTABLE} ${CMAKE_SOURCE_DIR})
+
+# generate project build type C header file from template
+# exposes dnbd3-generate-build and dnbd3-build target
+set(INCLUDE_BUILD_HEADER_TEMPLATE ${PROJECT_INCLUDE_DIR}/dnbd3/build.h.in)
+set(INCLUDE_BUILD_HEADER_GENERATE ${PROJECT_INCLUDE_GEN_DIR}/dnbd3/build.h)
+gen_build_type(${INCLUDE_BUILD_HEADER_TEMPLATE} ${INCLUDE_BUILD_HEADER_GENERATE})
+
+# add compile option to handle files greater than 2GB on a 32bit system
+add_definitions(-D_FILE_OFFSET_BITS=64)
+
+# define global C flags for compilation
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
+
+# enable all error warnings in Debug build configuration
+set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -Wpedantic -Wconversion -Wformat -Wformat-security -Werror=format-security -Wno-sign-conversion")
+set(CMAKE_C_FLAGS_RELEASE "-Wno-error")
+
+# set compilation optimization
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -Og -DDEBUG")
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -DNDEBUG")
+
+if(DNBD3_RELEASE_HARDEN AND CMAKE_BUILD_TYPE MATCHES "Release")
+ # harden builds with specific C flags
+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2 -fstack-protector-all -fstack-clash-protection")
+ # compile position independent code for all targets defined after this point,
+ # since linking with -pie requires objects built with matching -fPIE/-fPIC options
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ # set specific hardened linker flags
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,relro,-z,now,-z,defs -pie")
+ # print status message of configuration
+ message(STATUS "Set compilation of DNBD3 with hardened code options - done")
+else(DNBD3_RELEASE_HARDEN AND CMAKE_BUILD_TYPE MATCHES "Release")
+ # print status message of disabled configuration
+ message(STATUS "Disabled compilation of dnbd3 with hardened code options - done")
+endif(DNBD3_RELEASE_HARDEN AND CMAKE_BUILD_TYPE MATCHES "Release")
+
+# define packaging if Release build is enabled
+if(CMAKE_BUILD_TYPE MATCHES Release)
+ # get version source package or Git repository
+ get_repository_version(REPOSITORY_VERSION REPOSITORY_VERSION_SHORT REPOSITORY_BRANCH ${INCLUDE_VERSION_HEADER} ${CMAKE_BUILD_TYPE} ${GIT_EXECUTABLE} ${CMAKE_SOURCE_DIR})
+
+ # define project version
+ if(KernelHeaders_VERSION)
+ set(REPOSITORY_VERSION_FULL ${REPOSITORY_VERSION}-${KernelHeaders_VERSION})
+ else(KernelHeaders_VERSION)
+ set(REPOSITORY_VERSION_FULL ${REPOSITORY_VERSION})
+ endif(KernelHeaders_VERSION)
+
+ set(CPACK_GENERATOR "DEB;RPM;TGZ")
+ set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
+ set(CPACK_MONOLITHIC_INSTALL True)
+ set(CPACK_PACKAGE_VERSION ${REPOSITORY_VERSION})
+ set(CPACK_PACKAGE_VERSION_FULL ${REPOSITORY_VERSION_FULL})
+ set(CPACK_PACKAGE_SECTION admin)
+ set(CPACK_PACKAGE_VENDOR "University of Freiburg")
+ set(CPACK_PACKAGE_CONTACT "Christian Rößler <christian.roessler@rz.uni-freiburg.de>")
+ set(CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/bwLehrpool/dnbd3")
+ set(CPACK_PACKAGE_CHECKSUM SHA256)
+ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR})
+ set(CPACK_SOURCE_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION}_source)
+ set(CPACK_STRIP_FILES True)
+ set(CPACK_PACKAGE_RELOCATABLE False)
+ set(CPACK_SET_DESTDIR True)
+ set(CMAKE_INSTALL_PREFIX "/usr")
+ set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+ set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/COPYING)
+ set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_SOURCE_DIR}/README.md)
+
+ # set DEB generator specific packaging options
+ set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6, libfuse2, libjansson4, libatomic1")
+ if(DNBD3_KERNEL_MODULE)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postinst "depmod -a\n")
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postrm "depmod -a\n")
+ set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postinst
+ ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postrm)
+ endif(DNBD3_KERNEL_MODULE)
+
+ # set RPM generator specific packaging options
+ set(CPACK_RPM_PACKAGE_REQUIRES "glibc, fuse-libs, jansson, libatomic")
+ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/lib"
+ "/lib/modules"
+ "/lib/modules/${CMAKE_SYSTEM_VERSION}"
+ "/lib/modules/${CMAKE_SYSTEM_VERSION}/extra"
+ "/etc"
+ "/usr"
+ "/usr/lib"
+ "/usr/lib/systemd"
+ "/usr/lib/systemd/system")
+ if(DNBD3_KERNEL_MODULE)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/post "depmod -a\n")
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/postun "depmod -a\n")
+ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/post)
+ set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/postun)
+ endif(DNBD3_KERNEL_MODULE)
+
+ # configure source packaging
+ set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
+ set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR}" "/"
+ "${PROJECT_GEN_DIR}" "/")
+ set(CPACK_SOURCE_IGNORE_FILES "/build/"
+ "/.git/"
+ "/.github/"
+ ".gitignore"
+ "version.h.in")
+
+ # include CPack functionality
+ include(CPack)
+
+ # prepare source packaging
+ add_custom_command(OUTPUT ${INCLUDE_VERSION_HEADER}
+ COMMAND ${CMAKE_COMMAND} -D VERSION_HEADER_INPUT_FILE=${INCLUDE_VERSION_HEADER_GENERATE}
+ -D VERSION_HEADER_OUTPUT_FILE=${INCLUDE_VERSION_HEADER}
+ -P ${PROJECT_MODULES_DIR}/PreVersionPackaging.cmake
+ COMMENT "Prepare version.h"
+ DEPENDS dnbd3-generate-version)
+
+ # main source packaging
+ add_custom_target(package_source_main
+ COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target package_source
+ DEPENDS ${INCLUDE_VERSION_HEADER}
+ VERBATIM
+ USES_TERMINAL)
+
+ # post source packaging and exported target to build source packages
+ add_custom_target(source
+ COMMAND ${CMAKE_COMMAND} -D VERSION_HEADER_INPUT_FILE=${INCLUDE_VERSION_HEADER_GENERATE}
+ -D VERSION_HEADER_OUTPUT_FILE=${INCLUDE_VERSION_HEADER}
+ -P ${PROJECT_MODULES_DIR}/PostVersionPackaging.cmake
+ COMMENT "Cleanup version.h"
+ DEPENDS package_source_main)
+
+ # include target to make docker image
+ if(NOT DNBD3_KERNEL_MODULE AND DNBD3_SERVER AND DNBD3_PACKAGE_DOCKER)
+ find_package(Docker REQUIRED)
+ include(DockerImage)
+
+ set(DOCKER_TAG ${CPACK_PACKAGE_NAME}:${REPOSITORY_VERSION_SHORT})
+
+ # define Ubuntu docker image
+ set(DOCKER_FILE_UBUNTU ${CMAKE_SOURCE_DIR}/pkg/docker/ubuntu-20-04_dockerfile)
+ set(PACKAGE_FILE_UBUNTU ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}.deb)
+ set(DOCKER_IMAGE_UBUNTU ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}_ubuntu-20-04_docker.tar)
+ add_docker_image(docker-ubuntu-20-04 ${DOCKER_IMAGE_UBUNTU} ${DOCKER_FILE_UBUNTU} ${DOCKER_TAG} ${PACKAGE_FILE_UBUNTU} ${CMAKE_BINARY_DIR})
+
+ # define Archlinux docker image
+ set(DOCKER_FILE_ARCHLINUX ${CMAKE_SOURCE_DIR}/pkg/docker/archlinux_dockerfile)
+ set(PACKAGE_FILE_ARCHLINUX ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}.tar.gz)
+ set(DOCKER_IMAGE_ARCHLINUX ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}_archlinux_docker.tar)
+ add_docker_image(docker-archlinux ${DOCKER_IMAGE_ARCHLINUX} ${DOCKER_FILE_ARCHLINUX} ${DOCKER_TAG} ${PACKAGE_FILE_ARCHLINUX} ${CMAKE_BINARY_DIR})
+ endif(NOT DNBD3_KERNEL_MODULE AND DNBD3_SERVER AND DNBD3_PACKAGE_DOCKER)
+endif(CMAKE_BUILD_TYPE MATCHES Release)
+
+# add all dnbd3 related projects from the source code directory
+add_subdirectory(src)
+
+# add configuration and operational files for packaging purposes
+add_subdirectory(pkg)
diff --git a/Kbuild.in b/Kbuild.in
deleted file mode 100644
index 667cee0..0000000
--- a/Kbuild.in
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-m := ${MODULE_NAME}.o
-${MODULE_NAME}-objs += core.o blk.o net.o sysfs.o utils.o serialize_kmod.o \ No newline at end of file
diff --git a/LOCKS b/LOCKS
deleted file mode 100644
index 77e44a8..0000000
--- a/LOCKS
+++ /dev/null
@@ -1,80 +0,0 @@
-Some notes about locking in dnbd3
-
-The order of aquiring multiple locks is
-VERY IMPORTANT, as you'll produce a possible deadlock
-if you do it in the wrong order.
-Take very good care of locking order if you have lots
-of functions that call each other. You might lose
-track of what's going on. ;)
-
-===== FUSE =====
-mutexInit
-newAltLock
-altLock
-connection.sendMutex
-requests.lock
-
-===== SERVER =====
-This is a list of used locks, in the order they
-have to be aquired if you must hold multiple locks.
-Note this list might be out of date, take a look at the
-defines in lock.h for the effective order.
-reloadLock
-remoteCloneLock
-_clients_lock
-_clients[].lock
-integrityQueueLock
-_images_lock
-_images[].lock
-uplink.queueLock
-altServersLock
-client.sendMutex
-uplink.rttLock
-uplink.sendMutex
-aclLock
-
-If you need to lock multiple clients/images/... at once,
-lock the client with the lowest array index first.
-
-If the program logic would require to aquire the
-locks in a different order, you HAVE TO rework the
-code.
-For example, if you hold the lock for client 10 and
-you need to look up some other client. You MUST NOT
-simply fetch the _clients_lock now and then iterate
-over the clients until you find the one you need,
-as it violates the above order to first lock on the
-clients array and then the clients lock.
-Instead, you need to release client 10's lock,
-then lock on _clients_lock and iterate over the
-clients. Now you check if you either encounter
-the client you originally held the lock on, or
-the client you are looking for. You immediately
-lock on those two. You can then release the
-_clients_lock and work with both clients.
-pseudo code:
-
-// client10 is assumed to be a pointer to
-// a client, which happens to be at index 10
-lock (client10->lock);
-....
-// oh, i need another client
-unlock(client10->lock);
-lock(_clients_lock);
-client clientA = NULL, clientB = NULL;
-for (i = 0; i < _num_clients; ++i) {
- if (client[i] == client10) {
- clientA = client[i];
- lock(clientA.lock);
- } else if (client[i].something == <whatever>) {
- clientB = client[i];
- lock(clientB.lock);
- }
-}
-unlock(_clients_lock);
-if (clientA && clientB) { // Make sure we actually found both!
- // DO something important with both clients
-}
-if (clientA) unlock(clientA.lock);
-if (clientB) unlock(clientB.lock);
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ebf1aea
--- /dev/null
+++ b/README.md
@@ -0,0 +1,456 @@
+# dnbd3 - distributed network block device (version 3)
+
+The distributed network block device version 3 (dnbd3) is a network protocol similar to [nbd](https://github.com/NetworkBlockDevice/nbd) to implement a distributed block-based read-only storage system. Such a distributed block-based storage system consists of dnbd3 components, namely one or more servers and several clients. Servers are meant to expose virtual disk images as block devices to clients using dnbd3. Clients read data blocks from servers and implement failover and a load balancing mechanism to connect to the fastest server available for data exchange.
+
+This repository contains the source code for the following dnbd3 components:
+
+ - **dnbd3**: Linux kernel module client for dnbd3
+ - **dnbd3-bench**: Benchmark utility to test dnbd3
+ - **dnbd3-fuse**: Fuse client for dnbd3
+ - **dnbd3-server**: Server to serve virtual disk images for dnbd3
+
+The dnbd3 components have been built/tested on the following Linux kernel versions and Unix distributions:
+
+ - Generic/Vanilla Kernel **4.9** to **6.1**
+ - Archlinux with **Linux kernel 5.15.x** or **6.1.x**
+ - Raspberry Pi OS with **Linux kernel 5.4.x**
+ - Ubuntu 20.04 with **Linux kernel 5.4.x**
+ - Ubuntu 18.04 with **Linux kernel 4.19.x**
+ - CentOS 8 with **Linux kernel 4.18.x**
+ - CentOS 7 with **Linux kernel 3.10.x**
+ - AlmaLinux 8 with **Linux kernel 4.18.x**
+ - Rocky Linux 8 with **Linux kernel 4.18.x**
+ - FreeBSD 12.x and 13.x (only user space programs, eg. dnbd3-server)
+
+
+## Build
+
+### Preliminaries
+A build of the dnbd3 components requires the installation of the following build tools and libraries under your supported Unix distribution.
+
+#### Archlinux with Linux kernel 5.15.x or 5.10.x
+```shell
+# install linux-lts-headers instead of linux-headers if the LTS kernel is used
+pacman -S git \
+ make \
+ cmake \
+ gcc \
+ clang \
+ linux-headers \
+ fuse2 \
+ jansson \
+ afl \
+ dpkg \
+ rpm-tools
+```
+
+#### Raspberry Pi OS with Linux kernel 5.4.x
+```shell
+apt-get install git \
+ make \
+ cmake \
+ gcc \
+ clang-format \
+ raspberrypi-kernel-headers \
+ libfuse-dev \
+ libjansson-dev \
+ afl \
+ rpm
+```
+
+#### Ubuntu 20.04 with Linux kernel 5.4.x
+```shell
+apt-get install git \
+ make \
+ cmake \
+ gcc \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ rpm
+```
+
+Note that `afl` is not available on Ubuntu 20.04 and should be built from the [original sources](https://github.com/google/AFL).
+
+#### Ubuntu 18.04 with Linux kernel 4.19.x
+```shell
+apt-get install git \
+ make \
+ cmake \
+ gcc \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ afl \
+ rpm
+```
+
+#### {CentOS, AlmaLinux, Rocky Linux} 8 with Linux kernel 4.18.x
+```shell
+yum install git \
+ make \
+ cmake \
+ gcc \
+ clang-tools-extra \
+ kernel-devel \
+ elfutils-libelf-devel \
+ fuse-devel \
+ jansson-devel \
+ libatomic \
+ rpm-build
+```
+
+Note that `afl` is not available on CentOS 8 and the likes and should be built from the [original sources](https://github.com/google/AFL).
+
+#### CentOS 7 with Linux kernel 3.10.x
+Before any required preliminaries can be installed, enable the `epel` package repository with the following command line calls:
+
+```shell
+yum install epel-release
+yum repolist # refresh epel package repository
+```
+
+The `epel` package repository enables the installation of `cmake3` on CentOS 7 which is later required to build dnbd3 components.
+Then, install the required preliminaries with the following command line call as usual:
+
+```shell
+yum install git \
+ make \
+ cmake3 \
+ gcc \
+ kernel-devel \
+ elfutils-libelf-devel \
+ rpm-build
+```
+
+Note that `afl` is not available on CentOS 7 and should be built from the [original sources](https://github.com/google/AFL).
+
+> **Warning: All dnbd3 components can only be built if a GCC compiler with stdatomic support is used.
+> This feature is available with GCC 4.9 or later as part of the C11 language support.
+> Since CentOS 7 is shipped with GCC 4.8, you have to install a newer GCC version that is greater than or equal to GCC 4.9.**
+
+The installation of GCC 7.3 on CentOS requires some additional instructions as follows.
+First, install Software Collections on your system that allows you to build, install, and use multiple versions of GCC on the same system without affecting system-wide installed packages. Software Collections is part of the CentOS `extras` repository and can be installed by running the following command:
+
+```shell
+yum install centos-release-scl
+```
+
+After installation of Software Collections, install the Developer Toolset in version 7 and additional packages with the following command line call:
+
+```shell
+yum install devtoolset-7 \
+ devtoolset-7-libatomic-devel \
+ llvm-toolset-7-git-clang-format \
+ fuse-devel \
+ jansson-devel
+```
+
+To access GCC 7.3, you need to launch a new shell instance using the Software Collections `scl` tool:
+
+```shell
+scl enable devtoolset-7 llvm-toolset-7 bash
+```
+
+Now, GCC 7.3 is the default version in your current shell.
+This allows you to build all dnbd3 components on CentOS 7.
+
+
+#### FreeBSD 12.x and 13.x
+```shell
+pkg install git \
+ cmake \
+ gcc \
+ clang-devel \
+ pkgconf \
+ fusefs-libs \
+ jansson \
+ afl \
+ rpm4
+```
+
+
+### Preparation
+Before a build takes place, you should create a `build` directory inside the root folder of the repository. After that, change your working directory to that new directory as follows:
+
+```shell
+mkdir build
+cd build
+```
+
+
+### Configuration
+A build of the dnbd3 components can be configured and customized by the following configuration variables (CMake cache entries):
+
+| Variable | Type | Values | Default value | Description |
+|:-----------------------------|:-------|:----------------------------------------|:--------------------------------------|----------------------------------------------------------------------|
+| `CMAKE_BUILD_TYPE` | STRING | {`Debug`, `Release`} | `Debug` | Build configuration of the dnbd3 project. |
+| `KERNEL_BUILD_DIR` | PATH | {`a` .. `z`, `A` .. `Z`, `/`, `_`, `-`} | /lib/modules/`uname -r`/build | Path to Linux kernel modules to compile against. |
+| `KERNEL_INSTALL_DIR` | PATH | {`a` .. `z`, `A` .. `Z`, `/`, `_`, `-`} | /lib/modules/`uname -r`/extra | Path to install Linux kernel modules. |
+| `KERNEL_SCRIPTS_DIR` | PATH | {`a` .. `z`, `A` .. `Z`, `/`, `_`, `-`} | /lib/modules/`uname -r`/build/scripts | Path to Linux kernel scripts directory. |
+| `DNBD3_KERNEL_MODULE` | OPTION | {`ON`, `OFF`} | `ON` | Build the dnbd3 Linux kernel module. |
+| `DNBD3_BENCHMARK` | OPTION | {`ON`, `OFF`} | `OFF` | Enable build of dnbd3-bench. |
+| `DNBD3_CLIENT_FUSE` | OPTION | {`ON`, `OFF`} | `ON` | Enable build of dnbd3-fuse. |
+| `DNBD3_SERVER` | OPTION | {`ON`, `OFF`} | `ON` | Enable build of dnbd3-server. |
+| `DNBD3_SERVER_FUSE` | OPTION | {`ON`, `OFF`} | `OFF` | Enable FUSE-Integration for dnbd3-server. |
+| `DNBD3_SERVER_AFL` | OPTION | {`ON`, `OFF`} | `OFF` | Build dnbd3-server for usage with afl-fuzz. |
+| `DNBD3_SERVER_DEBUG_LOCKS` | OPTION | {`ON`, `OFF`} | `OFF` | Add lock debugging code to dnbd3-server. |
+| `DNBD3_SERVER_DEBUG_THREADS` | OPTION | {`ON`, `OFF`} | `OFF` | Add thread debugging code to dnbd3-server. |
+| `DNBD3_RELEASE_HARDEN` | OPTION | {`ON`, `OFF`} | `OFF` | Compile dnbd3 programs in Release build with code hardening options. |
+| `DNBD3_PACKAGE_DOCKER` | OPTION | {`ON`, `OFF`} | `OFF` | Enable packaging of Docker image. |
+
+A value from the range of appropriate values can be assigned to each configuration variable by executing CMake once with the following command pattern:
+
+```shell
+cmake -D<VARIABLE>=<VALUE> [-D ...] ../.
+```
+
+> **Note that the default compiler on FreeBSD 12.x and 13.x is clang/llvm and should be changed to gcc by appending the set CMake compiler configuration variable -DCMAKE_C_COMPILER=gcc to the CMake configuration command.**
+
+
+### Cross-Compiling
+With the help of CMake, it is also possible to cross-compile the dnbd3 components for a Linux target architecture other than the compiling Linux host architecture. This repository is shipped with two CMake toolchain files to cross-compile all components for the following two Linux target architectures if necessary.
+
+> **Note that all used header files (eg. Linux kernel headers) and libraries (eg. jansson, fuse) for the target architecture must be installed and set up properly, so that the cross-compiler can find and use them.**
+
+
+#### Cross-Compiling for _powerpc_ Target
+If you want to cross-compile all dnbd3 components for the _powerpc_ Linux target architecture (eg. for a Mac G5), make sure that the `powerpc-linux-gnu-gcc` cross-compiler is installed on your host system. Then, call CMake with the shipped toolchain file for this specific cross-compiler as follows.
+
+```shell
+cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/PowerpcLinuxGnu.cmake [-D ...] ../.
+```
+
+
+#### Cross-Compiling for _aarch64_ Target
+If you want to cross-compile all dnbd3 components for the _aarch64_ Linux target architecture (eg. for a Raspberry Pi 4), make sure that the `aarch64-linux-gnu-gcc` cross-compiler is installed on your host system. Then, call CMake with the shipped toolchain file for this specific cross-compiler as follows.
+
+```shell
+cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/Aarch64LinuxGnu.cmake [-D ...] ../.
+```
+
+
+### Debug
+In the `Debug` build configuration, all dnbd3 components can be built by calling `make`:
+
+```shell
+make
+```
+
+Optionally, the output files can be installed with superuser permissions on the local system using the Makefile target `install`:
+
+```shell
+sudo make install
+sudo depmod -a # only required if DNBD3_KERNEL_MODULE is enabled
+```
+
+
+### Packages
+In the `Release` build configuration, installation packages can be built by calling the make target `package`:
+
+```shell
+make package
+```
+
+This target creates a Debian installation package (\*.deb), an RPM installation package (\*.rpm) and a compressed archive (\*.tar.gz) containing the built dnbd3 components.
+
+
+### Sources
+In the `Release` build configuration, sources can be built by calling the make target `source`:
+
+```shell
+make source
+```
+
+This target creates compressed archives (\*_source.tar.gz and \*_source.zip) containing the source code of this repository for code distribution purposes.
+
+
+### Docker image
+A docker image of the built dnbd3 components can be created in the `Release` build configuration with the option `DNBD3_PACKAGE_DOCKER=ON`, `DNBD3_SERVER=ON` and `DNBD3_KERNEL_MODULE=OFF`. The image is based on Ubuntu 20.04 and a created docker container from it starts the embedded dnbd3-server automatically.
+
+Before the image is built, make sure that your docker daemon runs and you are a member of the `docker` group to access the docker daemon without any super user privileges. Then, build the docker image based on either Ubuntu 20.04 or Archlinux by calling one of the following Make targets:
+
+```
+make docker-ubuntu-20-04
+make docker-archlinux
+```
+
+The built docker image is saved as archive file (\*_ubuntu-20-04_docker.tar) and can be deployed to other machines. On each machine, the created image can be loaded with the following docker client call:
+
+```shell
+docker image load -i *_ubuntu-20-04_docker.tar
+```
+
+After the image is loaded, a docker network needs to be available so that each docker container based on this image can establish a network connection. Therefore, a docker network called `dnbd3` is created with the following docker client call:
+
+```shell
+docker network create --driver=bridge --subnet=192.168.100.0/24 dnbd3
+```
+
+If the network is present, docker containers with a name of form `dnbd3-server<NUMBER>` and an IPv4 address from the network's subnet can be created using docker client calls like the following ones:
+
+```
+docker container create --name dnbd3-server1 --ip 192.168.100.10 --network dnbd3 <IMAGE_TAG>
+docker container create --name dnbd3-server2 --ip 192.168.100.50 --network dnbd3 <IMAGE_TAG>
+docker container create --name dnbd3-server3 --ip 192.168.100.100 --network dnbd3 <IMAGE_TAG>
+docker container create --name dnbd3-server4 --ip 192.168.100.123 --network dnbd3 <IMAGE_TAG>
+```
+
+Note that the image is already tagged with an `IMAGE_TAG` which is set to the current dnbd3 package version number and follows the format `dnbd3:<DNBD3_VERSION>`. The `IMAGE_TAG` can be reused to create a docker container. Finally, each container based on the image can be started with the following docker client call:
+
+```
+docker container start -a dnbd3-server<NUMBER>
+```
+
+
+## Configuration of _dnbd3-server_
+The dnbd3-server is started according to the following command line call.
+
+```shell
+dnbd3-server -c <CONFIG_DIR>
+```
+
+An operation of the dnbd3-server requires a configuration directory to provide proper functionality. The configuration directory should contain two configuration files, namely the _alt-servers_ and the _server.conf_ file.
+
+
+### Configuration file _alt-servers_
+The _alt-servers_ configuration file specifies the list of known alt-servers for the dnbd3-server. The configuration in the file is specified in the INI file format as shown in the following.
+
+```ini
+[Address]
+comment=Whatever
+for=purpose # where purpose is either "client" or "replication"
+namespace=some/path/
+```
+
+All fields in an INI section are optional. If the `for` key is missing, the dnbd3-server will be used for replication and will be propagated to clients that request a list of alt servers. The `namespace` key can be specified multiple times per INI section. If this key is missing, the server will be used for all image names. Otherwise, it will only be used for images whose names start with one of the given strings.
+
+If the dnbd3-server is not running in proxy mode, this file won't do much.
+
+
+### Configuration file _server.conf_
+The _server.conf_ file is the main configuration file of the dnbd3-server. The configuration in the file is specified in the INI file format as shown in the following.
+
+```ini
+[dnbd3]
+basePath=/srv/openslx/dnbd3 # virtual root of image files
+serverPenalty=1234 # artificial acceptance delay for incoming server connections (µs)
+clientPenalty=2345 # artificial acceptance delay for incoming client connection (µs)
+isProxy=true # enable proxy mode - will try to replicate from alt-servers if a client requests unknown image
+uplinkTimeout=1250 # r/w timeout for connections to uplink servers
+```
+
+
+## Debugging
+Debugging of the Linux kernel modules and the user space utility requires this project to be built in the `Debug` configuration.
+
+### Linux kernel module
+The Linux kernel module **dnbd3** supports the Linux kernel's dynamic debug feature if the Linux kernel is built with the enabled kernel configuration `CONFIG_DYNAMIC_DEBUG`. The dynamic debug feature allows the printing of customizable debug messages into the Linux kernel's message buffer.
+
+Dynamic debug for the modules can be either enabled at module initialization or during operation. At module initialization, dynamic debug can be enabled by modprobe using the "fake" module parameter `dyndbg`:
+
+```shell
+modprobe dnbd3 dyndbg=+pflmt
+```
+
+The module parameter `dyndbg` customizes the debug messages written into the Linux kernel's message buffer. The specific value `+pflmt` enables all debug messages in the source code and includes function name (`f`), line number (`l`), module name (`m`) and thread ID (`t`) for each executed debug statement from the source code.
+
+During operation, debug messages from debug statements in the code can be customized and enabled dynamically as well using the debugfs control file `<DEBUG_FS>/dynamic_debug/control` where `DEBUG_FS` is the mount point of a mounted DebugFS, eg. `/sys/kernel/debug`:
+
+```shell
+echo "module dnbd3 +pflmt" > <DEBUG_FS>/dynamic_debug/control
+```
+
+More information regarding the Linux kernel's dynamic debug feature can be found in the [Linux kernel documentation](https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html).
+
+
+## Development notes
+
+### Code style of source code files
+The code style of all source code files can be checked by calling the make target `lint`:
+
+```shell
+make lint
+```
+
+If some source code files do not meet the project's code style, they can be fixed automatically by calling the make target `lint-fix`:
+
+```shell
+make lint-fix
+```
+
+
+### Resource locking in dnbd3
+The order of acquiring multiple locks is very important, as you'll produce a possible deadlock if you do it in the wrong order. Take very good care of locking order if you have lots of functions that call each other. You might lose track of what's going on.
+
+
+#### dnbd3-fuse
+This is a list of used locks, in the order they have to be acquired if you must hold multiple locks.
+
+```
+mutexInit
+newAltLock
+altLock
+connection.sendMutex
+requests.lock
+```
+
+
+#### dnbd3-server
+This is a list of used locks, in the order they have to be acquired if you must hold multiple locks. Take a look at the lock priority defines in _src/server/locks.h_ for the effective order.
+
+```
+reloadLock
+loadLock
+remoteCloneLock
+_clients_lock
+_clients[].lock
+integrityQueueLock
+imageListLock
+_images[].lock
+uplink.queueLock
+altServersLock
+client.sendMutex
+uplink.rttLock
+uplink.sendMutex
+aclLock
+initLock
+dirLock
+```
+
+If you need to lock multiple clients, images, etc. at once, lock the one with the lowest array index first.
+
+If the program logic would require acquiring the locks in a different order, you have to rework the code. For example, assume you hold the lock for client 10 and you need to look up some other client. You must not simply fetch the `_clients_lock` now and then iterate over the clients until you find the one you need, as it violates the above order to first lock on the clients array and then the clients lock. Instead, you need to release client 10's lock, then lock on `_clients_lock` and iterate over the clients. Now you check if you either encounter the client you originally held the lock on, or the client you are looking for. You immediately lock on those two. You can then release the `_clients_lock` and work with both clients.
+This described implementation advice is visualized in the following pseudo C code.
+
+```C
+/* client10 is assumed to be a pointer to a client, which happens to be at index 10 */
+lock (client10->lock);
+/* ... */
+/* we need another client */
+unlock(client10->lock);
+
+lock(_clients_lock);
+client clientA = NULL, clientB = NULL;
+for (i = 0; i < _num_clients; ++i) {
+ if (client[i] == client10) {
+ clientA = client[i];
+ lock(clientA.lock);
+ } else if (client[i].something == <whatever>) {
+ clientB = client[i];
+ lock(clientB.lock);
+ }
+}
+unlock(_clients_lock);
+
+if (clientA && clientB) {
+ /* make sure we actually found both */
+ /* do something important with both clients */
+}
+
+if (clientA)
+ unlock(clientA.lock);
+if (clientB)
+ unlock(clientB.lock);
+```
diff --git a/build.sh b/build.sh
deleted file mode 100755
index 6726a86..0000000
--- a/build.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-mkdir -p build
-cd build/
-cmake ..
-make
diff --git a/cmake/Build.cmake b/cmake/Build.cmake
new file mode 100644
index 0000000..a7f4c07
--- /dev/null
+++ b/cmake/Build.cmake
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# generate the project's build type header file from a template
+#
+# BUILD_INPUT_FILE_TEMPLATE: path of the template file passed to GenerateBuild.cmake
+# BUILD_OUTPUT_FILE: path of the header file produced by GenerateBuild.cmake
+#
+# The macro reads PROJECT_MODULES_DIR, PROJECT_INCLUDE_GEN_DIR and CMAKE_BUILD_TYPE
+# from the calling scope and defines the following targets:
+# dnbd3-generate-build: custom target that depends on BUILD_OUTPUT_FILE
+# dnbd3-build: INTERFACE library exposing PROJECT_INCLUDE_GEN_DIR as include directory
+macro(gen_build_type BUILD_INPUT_FILE_TEMPLATE BUILD_OUTPUT_FILE)
+ get_filename_component(BUILD_OUTPUT_FILENAME ${BUILD_OUTPUT_FILE} NAME)
+ # command that will trigger a rebuild of build.h every time
+ # (its declared output 'regenerate-build-file' is not produced by the command itself)
+ add_custom_command(OUTPUT regenerate-build-file
+ COMMAND ${CMAKE_COMMAND} -E sleep 0
+ COMMENT "Trigger generating ${BUILD_OUTPUT_FILENAME}")
+
+ # call the GenerateBuild.cmake file to generate the build.h file
+ add_custom_command(OUTPUT ${BUILD_OUTPUT_FILE}
+ COMMAND ${CMAKE_COMMAND} -D BUILD_INPUT_FILE_TEMPLATE=${BUILD_INPUT_FILE_TEMPLATE}
+ -D BUILD_OUTPUT_FILE=${BUILD_OUTPUT_FILE}
+ -D BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ -P ${PROJECT_MODULES_DIR}/GenerateBuild.cmake
+ COMMENT "Generating ${BUILD_OUTPUT_FILENAME}"
+ DEPENDS regenerate-build-file)
+ add_custom_target(dnbd3-generate-build DEPENDS ${BUILD_OUTPUT_FILE})
+
+ # create target to expose project build type
+ add_library(dnbd3-build INTERFACE)
+ target_include_directories(dnbd3-build INTERFACE ${PROJECT_INCLUDE_GEN_DIR})
+ add_dependencies(dnbd3-build dnbd3-generate-build)
+endmacro(gen_build_type BUILD_INPUT_FILE_TEMPLATE BUILD_OUTPUT_FILE)
diff --git a/cmake/CheckAFLCCompiler.cmake b/cmake/CheckAFLCCompiler.cmake
new file mode 100644
index 0000000..249248b
--- /dev/null
+++ b/cmake/CheckAFLCCompiler.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# check if the corresponding AFL C compiler for the original C compiler is available
+# if an AFL C compiler is available, then the path to the AFL C compiler is returned in AFL_C_COMPILER
+#
+# AFL_C_COMPILER: name of the variable that receives the find_program() result
+# AFL_C_COMPILER_NAME: name of the variable that receives the searched program name (afl-<suffix>)
+# C_COMPILER_PATH: path of the original C compiler (used as fallback to derive the suffix)
+# C_COMPILER_ID: identifier of the original C compiler (eg. GNU or Clang)
+macro(check_afl_c_compiler AFL_C_COMPILER AFL_C_COMPILER_NAME C_COMPILER_PATH C_COMPILER_ID)
+ # determine AFL C compiler suffix from original C compiler ID
+ # the ID is quoted so that an empty compiler ID does not result in a malformed if() expression
+ if("${C_COMPILER_ID}" MATCHES "GNU")
+ set(AFL_C_COMPILER_SUFFIX "gcc")
+ elseif("${C_COMPILER_ID}" MATCHES "Clang")
+ set(AFL_C_COMPILER_SUFFIX "clang")
+ else("${C_COMPILER_ID}" MATCHES "Clang")
+ # unknown compiler ID: fall back to the file name of the compiler executable
+ # the path is quoted so that a path containing spaces is passed as a single argument
+ get_filename_component(AFL_C_COMPILER_SUFFIX "${C_COMPILER_PATH}" NAME)
+ endif("${C_COMPILER_ID}" MATCHES "GNU")
+
+ # define search file name and search for AFL C compiler program
+ set(AFL_C_COMPILER_SEARCH_NAME "afl-${AFL_C_COMPILER_SUFFIX}")
+ find_program(${AFL_C_COMPILER} NAMES ${AFL_C_COMPILER_SEARCH_NAME})
+
+ # return the AFL C compiler name to the caller
+ set(${AFL_C_COMPILER_NAME} ${AFL_C_COMPILER_SEARCH_NAME})
+endmacro(check_afl_c_compiler) \ No newline at end of file
diff --git a/cmake/DockerImage.cmake b/cmake/DockerImage.cmake
new file mode 100644
index 0000000..83f4b9d
--- /dev/null
+++ b/cmake/DockerImage.cmake
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# create a pseudo target to do packaging before docker image is built
+add_custom_target(package_docker
+ COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target package
+ VERBATIM)
+
+# macro to build a docker image based on a provided Dockerfile and an installation package
+macro(add_docker_image TARGET_NAME DOCKER_IMAGE DOCKER_FILE DOCKER_TAG PACKAGE_FILE BUILD_DIR)
+ get_filename_component(PACKAGE_FILE_PATH ${PACKAGE_FILE} PATH)
+ get_filename_component(PACKAGE_FILE_NAME ${PACKAGE_FILE} NAME)
+
+ # commands and target to build docker image
+ add_custom_command(OUTPUT ${DOCKER_IMAGE}
+ COMMAND docker image build -t ${DOCKER_TAG} --file ${DOCKER_FILE} --build-arg DNBD3_PACKAGE_FILE_NAME=${PACKAGE_FILE_NAME} ${BUILD_DIR}
+ COMMAND docker image save -o ${DOCKER_IMAGE} ${DOCKER_TAG}
+ COMMAND docker image rm ${DOCKER_TAG}
+ DEPENDS ${DOCKER_FILE}
+ package_docker)
+ add_custom_target(${TARGET_NAME}
+ DEPENDS ${DOCKER_IMAGE})
+endmacro(add_docker_image TARGET_NAME DOCKER_IMAGE DOCKER_FILE PACKAGE_FILE)
diff --git a/cmake/FindCheckPatch.cmake b/cmake/FindCheckPatch.cmake
new file mode 100644
index 0000000..8454e6b
--- /dev/null
+++ b/cmake/FindCheckPatch.cmake
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# check if custom Linux kernel script directory was specified
+if(NOT KERNEL_SCRIPTS_DIR)
+ set(KERNEL_SCRIPTS_DIR "${KERNEL_BUILD_DIR}/scripts"
+ CACHE PATH "Path to Linux kernel scripts directory")
+endif(NOT KERNEL_SCRIPTS_DIR)
+
+# find the checkpatch.pl script in the given KERNEL_SCRIPTS_DIR
+find_program(CheckPatch_EXECUTABLE
+ NAMES checkpatch.pl
+ PATHS ${KERNEL_SCRIPTS_DIR})
+
+
+# get the checkpatch.pl version
+if(CheckPatch_EXECUTABLE)
+ execute_process(COMMAND ${CheckPatch_EXECUTABLE} --version
+ OUTPUT_VARIABLE CheckPatch_VERBOSE_VERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ string(REGEX REPLACE ".*Version:.([0-9]+\\.[0-9]+).*" "\\1" CheckPatch_VERSION "${CheckPatch_VERBOSE_VERSION}")
+endif(CheckPatch_EXECUTABLE)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CheckPatch
+ FOUND_VAR CheckPatch_FOUND
+ REQUIRED_VARS CheckPatch_EXECUTABLE
+ VERSION_VAR CheckPatch_VERSION
+ FAIL_MESSAGE "checkpatch.pl is not available! Please install checkpatch.pl to lint and format the source code!")
diff --git a/cmake/FindClangFormat.cmake b/cmake/FindClangFormat.cmake
new file mode 100644
index 0000000..a6c77d4
--- /dev/null
+++ b/cmake/FindClangFormat.cmake
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_program(ClangFormat_EXECUTABLE NAMES clang-format)
+# query the version from the executable that was actually found (it may not be the first one in PATH)
+if(ClangFormat_EXECUTABLE)
+ execute_process(COMMAND ${ClangFormat_EXECUTABLE} --version
+ OUTPUT_VARIABLE ClangFormat_VERBOSE_VERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" ClangFormat_VERSION "${ClangFormat_VERBOSE_VERSION}")
+endif(ClangFormat_EXECUTABLE)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ClangFormat
+ FOUND_VAR ClangFormat_FOUND
+ REQUIRED_VARS ClangFormat_EXECUTABLE
+ VERSION_VAR ClangFormat_VERSION
+ FAIL_MESSAGE "clang-format is not available! Please install clang-format to lint and format the source code!")
diff --git a/cmake/FindDocker.cmake b/cmake/FindDocker.cmake
new file mode 100644
index 0000000..ef3046d
--- /dev/null
+++ b/cmake/FindDocker.cmake
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_program(Docker_EXECUTABLE NAMES docker)
+# query the server version through the executable that was actually found
+if(Docker_EXECUTABLE)
+ execute_process(COMMAND ${Docker_EXECUTABLE} version --format "{{.Server.Version}}"
+ OUTPUT_VARIABLE Docker_VERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif(Docker_EXECUTABLE)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Docker
+ FOUND_VAR Docker_FOUND
+ REQUIRED_VARS Docker_EXECUTABLE
+ VERSION_VAR Docker_VERSION
+ FAIL_MESSAGE "Docker is not available! Please install docker to build and run containers!")
+
diff --git a/cmake/FindFuse.cmake b/cmake/FindFuse.cmake
index dd8a6c1..09e8ba0 100644
--- a/cmake/FindFuse.cmake
+++ b/cmake/FindFuse.cmake
@@ -38,7 +38,7 @@
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#=============================================================================
-cmake_minimum_required(VERSION 2.8.3)
+cmake_minimum_required(VERSION 2.8.12)
########## Private ##########
function(fusedebug _varname)
@@ -53,9 +53,9 @@ set(FUSE_LIBRARIES )
set(FUSE_DEFINITIONS )
set(FUSE_INCLUDE_DIRS )
-find_package(PkgConfig)
+find_package(PkgConfig QUIET)
-set(PC_FUSE_INCLUDE_DIRS )
+set(PC_FUSE_INCLUDE_DIRS "/usr/include/fuse")
set(PC_FUSE_LIBRARY_DIRS )
if(PKG_CONFIG_FOUND)
pkg_check_modules(PC_FUSE "fuse" QUIET)
@@ -133,9 +133,9 @@ endif(FUSE_FOUND)
if(FUSE_INCLUDE_DIRS)
include(FindPackageHandleStandardArgs)
if(FUSE_FIND_REQUIRED AND NOT FUSE_FIND_QUIETLY)
- find_package_handle_standard_args(FUSE REQUIRED_VARS FUSE_LIBRARIES FUSE_INCLUDE_DIRS VERSION_VAR FUSE_VERSION)
+ find_package_handle_standard_args(Fuse REQUIRED_VARS FUSE_LIBRARIES FUSE_INCLUDE_DIRS VERSION_VAR FUSE_VERSION)
else()
- find_package_handle_standard_args(FUSE "FUSE not found" FUSE_LIBRARIES FUSE_INCLUDE_DIRS)
+ find_package_handle_standard_args(Fuse "FUSE not found" FUSE_LIBRARIES FUSE_INCLUDE_DIRS)
endif()
else(FUSE_INCLUDE_DIRS)
if(FUSE_FIND_REQUIRED AND NOT FUSE_FIND_QUIETLY)
diff --git a/cmake/FindKernelHeaders.cmake b/cmake/FindKernelHeaders.cmake
new file mode 100644
index 0000000..c04243e
--- /dev/null
+++ b/cmake/FindKernelHeaders.cmake
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# check if custom Linux kernel build directory was specified
+if(NOT KERNEL_BUILD_DIR)
+ set(KERNEL_BUILD_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build"
+ CACHE PATH "Path to Linux kernel modules to compile against")
+endif(NOT KERNEL_BUILD_DIR)
+
+# check if custom Linux kernel output directory was specified
+if(NOT KERNEL_INSTALL_DIR)
+ set(KERNEL_INSTALL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/extra"
+ CACHE PATH "Path to install Linux kernel modules")
+endif(NOT KERNEL_INSTALL_DIR)
+
+if(NOT EXISTS "${KERNEL_BUILD_DIR}/Module.symvers")
+ message(WARNING "\n\nModule.symvers not found in ${KERNEL_BUILD_DIR}\n"
+ "Your kernel sources don't seem to belong to a built kernel,"
+ " expect missing symbols when building kernel module.\n\n")
+endif()
+
+# find the Linux kernel headers from given KERNEL_BUILD_DIR
+find_path(KernelHeaders_INCLUDE_DIR
+ NAMES linux/kernel.h
+ linux/module.h
+ generated/utsrelease.h
+ PATHS ${KERNEL_BUILD_DIR}/include
+ NO_DEFAULT_PATH)
+
+# get Linux kernel headers version: try utsrelease.h, then kernel.release, then the include path
+file(READ "${KERNEL_BUILD_DIR}/include/generated/utsrelease.h" tmpvar)
+string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${tmpvar}")
+if("${KernelHeaders_VERSION}" STREQUAL "" AND EXISTS "${KERNEL_BUILD_DIR}/include/config/kernel.release")
+ file(READ "${KERNEL_BUILD_DIR}/include/config/kernel.release" tmpvar)
+ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${tmpvar}")
+endif()
+if("${KernelHeaders_VERSION}" STREQUAL "")
+ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${KernelHeaders_INCLUDE_DIR}")
+endif()
+if("${KernelHeaders_VERSION}" STREQUAL "")
+ message(FATAL_ERROR "Cannot determine kernel version")
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(KernelHeaders
+ FOUND_VAR KernelHeaders_FOUND
+ REQUIRED_VARS KernelHeaders_INCLUDE_DIR
+ VERSION_VAR KernelHeaders_VERSION
+ FAIL_MESSAGE "Linux kernel headers are not available! Please install them to build kernel modules!")
+
+mark_as_advanced(KernelHeaders_INCLUDE_DIR KernelHeaders_MODULE_INSTALL_DIR)
+
+# print found information
+if(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
+ message(VERBOSE "KERNEL_BUILD_DIR: ${KERNEL_BUILD_DIR}")
+ message(VERBOSE "KERNEL_INSTALL_DIR: ${KERNEL_INSTALL_DIR}")
+ message(VERBOSE "KernelHeaders_FOUND: ${KernelHeaders_FOUND}")
+ message(VERBOSE "KernelHeaders_VERSION: ${KernelHeaders_VERSION}")
+endif(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
diff --git a/cmake/FindLibatomic.cmake b/cmake/FindLibatomic.cmake
new file mode 100644
index 0000000..e1c4915
--- /dev/null
+++ b/cmake/FindLibatomic.cmake
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# Use pkg-config to get the directories and then use these values
+# in the FIND_PATH() and FIND_LIBRARY() calls
+find_package(PkgConfig QUIET)
+pkg_check_modules(PKG_Libatomic QUIET libatomic)
+
+set(Libatomic_COMPILE_OPTIONS ${PKG_Libatomic_CFLAGS_OTHER})
+set(Libatomic_VERSION ${PKG_Libatomic_VERSION})
+
+find_library(Libatomic_LIBRARY
+ NAMES atomic
+ HINTS ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}
+ ${PKG_Libatomic_LIBRARY_DIRS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Libatomic
+ FOUND_VAR Libatomic_FOUND
+ REQUIRED_VARS Libatomic_LIBRARY
+ VERSION_VAR Libatomic_VERSION
+ FAIL_MESSAGE "Library 'atomic' is not available! Please install this required library!")
+
+if(Libatomic_FOUND AND NOT TARGET Libatomic::Libatomic)
+ add_library(Libatomic::Libatomic UNKNOWN IMPORTED)
+ set_target_properties(Libatomic::Libatomic PROPERTIES
+ IMPORTED_LOCATION "${Libatomic_LIBRARY}"
+ INTERFACE_COMPILE_OPTIONS "${Libatomic_COMPILE_OPTIONS}")
+endif(Libatomic_FOUND AND NOT TARGET Libatomic::Libatomic)
+
+mark_as_advanced(Libatomic_LIBRARY)
+
+if(Libatomic_FOUND)
+ set(Libatomic_LIBRARIES ${Libatomic_LIBRARY})
+endif(Libatomic_FOUND)
+
+# print found information
+if(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
+ message(VERBOSE "Libatomic_FOUND: ${Libatomic_FOUND}")
+ message(VERBOSE "Libatomic_VERSION: ${Libatomic_VERSION}")
+ message(VERBOSE "Libatomic_COMPILE_OPTIONS: ${Libatomic_COMPILE_OPTIONS}")
+ message(VERBOSE "Libatomic_LIBRARIES: ${Libatomic_LIBRARIES}")
+endif(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
diff --git a/cmake/FindStdatomic.cmake b/cmake/FindStdatomic.cmake
new file mode 100644
index 0000000..d7ee9b8
--- /dev/null
+++ b/cmake/FindStdatomic.cmake
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_file(Stdatomic_INCLUDE_FILE
+ NAMES stdatomic.h
+ HINTS ${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Stdatomic
+ FOUND_VAR Stdatomic_FOUND
+ REQUIRED_VARS Stdatomic_INCLUDE_FILE
+ FAIL_MESSAGE "Compiler does not support atomic operations!")
diff --git a/cmake/GenerateBuild.cmake b/cmake/GenerateBuild.cmake
new file mode 100644
index 0000000..96b2906
--- /dev/null
+++ b/cmake/GenerateBuild.cmake
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# set current build type of the project
+set(DNBD3_BUILD ${BUILD_TYPE})
+string(TIMESTAMP DNBD3_BUILD_DATE "%Y-%m-%d" UTC)
+
+# write dnbd3 build type into a new C source file based on the specified build file template
+configure_file(${BUILD_INPUT_FILE_TEMPLATE} ${BUILD_OUTPUT_FILE})
diff --git a/cmake/GenerateVersion.cmake b/cmake/GenerateVersion.cmake
new file mode 100644
index 0000000..b7579bc
--- /dev/null
+++ b/cmake/GenerateVersion.cmake
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# set CMake module path to include version macros
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+ ${VERSION_MODULE_PATH})
+
+# include version macros
+include(Version)
+
+# get Git version of Git repository
+get_repository_version(DNBD3_VERSION DNBD3_VERSION_SHORT DNBD3_BRANCH ${VERSION_INPUT_FILE} ${VERSION_BUILD_TYPE} ${GIT_EXECUTABLE} ${REPOSITORY_DIR})
+
+# generate version header if header does not exist
+if(NOT EXISTS ${VERSION_INPUT_FILE})
+ # write dnbd3 version into a new C source file based on the specified version template
+ configure_file(${VERSION_INPUT_FILE_TEMPLATE} ${VERSION_OUTPUT_FILE})
+endif(NOT EXISTS ${VERSION_INPUT_FILE})
diff --git a/cmake/InstallVersionFile.cmake.in b/cmake/InstallVersionFile.cmake.in
new file mode 100644
index 0000000..8121c25
--- /dev/null
+++ b/cmake/InstallVersionFile.cmake.in
@@ -0,0 +1,8 @@
+#
+# AUTOGENERATED: DO NOT EDIT THIS FILE
+#
+
+if(CPACK_SOURCE_INSTALLED_DIRECTORIES AND EXISTS "@INCLUDE_VERSION_HEADER_GENERATE@")
+ file(INSTALL "@INCLUDE_VERSION_HEADER_GENERATE@"
+ DESTINATION "@INCLUDE_VERSION_HEADER_GENERATE_PREFIX@")
+endif(CPACK_SOURCE_INSTALLED_DIRECTORIES AND EXISTS "@INCLUDE_VERSION_HEADER_GENERATE@")
diff --git a/cmake/Kernel.cmake b/cmake/Kernel.cmake
new file mode 100644
index 0000000..9ecbbba
--- /dev/null
+++ b/cmake/Kernel.cmake
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMake macros to build and install Linux kernel modules
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# macro to define kernel module targets
+macro(add_kernel_module MODULE_NAME KERNEL_BUILD_DIR KERNEL_INSTALL_DIR MODULE_MACRO MODULE_SOURCE_FILES MODULE_HEADER_FILES BUILD_SOURCE_FILE)
+ # create directory for kernel module
+ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME})
+ # copy build source file
+ get_filename_component(BUILD_SOURCE_FILENAME ${BUILD_SOURCE_FILE} NAME)
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${BUILD_SOURCE_FILENAME}
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${BUILD_SOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ DEPENDS ${BUILD_SOURCE_FILE})
+ set(BUILD_SOURCE_FILE_PREPARED ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${BUILD_SOURCE_FILENAME})
+ # copy source files
+ foreach(MODULE_SOURCE_FILE ${MODULE_SOURCE_FILES})
+ get_filename_component(MODULE_SOURCE_FILENAME ${MODULE_SOURCE_FILE} NAME)
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_SOURCE_FILENAME}
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${MODULE_SOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ DEPENDS ${MODULE_SOURCE_FILE})
+ set(MODULE_SOURCE_FILES_PREPARED ${MODULE_SOURCE_FILES_PREPARED}
+ ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_SOURCE_FILENAME})
+ endforeach()
+ # copy header files
+ foreach(MODULE_HEADER_FILE ${MODULE_HEADER_FILES})
+ get_filename_component(MODULE_HEADER_FILENAME ${MODULE_HEADER_FILE} NAME)
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_HEADER_FILENAME}
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${MODULE_HEADER_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ DEPENDS ${MODULE_HEADER_FILE})
+ set(MODULE_HEADER_FILES_PREPARED ${MODULE_HEADER_FILES_PREPARED}
+ ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_HEADER_FILENAME})
+ endforeach()
+ # check if module depends on another module
+ if(NOT ${ARGV7} STREQUAL "")
+ set(MODULE_EXTRA_SYMBOLS ${CMAKE_CURRENT_BINARY_DIR}/${ARGV7}/Module.symvers)
+ endif()
+ # define build command
+ set(MODULE_BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${MODULE_MACRO}
+ -C ${KERNEL_BUILD_DIR}
+ M=${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME} modules
+ EXTRA_CFLAGS=${KERNEL_C_FLAGS}
+ KBUILD_MODPOST_WARN=1
+ KBUILD_EXTRA_SYMBOLS=${MODULE_EXTRA_SYMBOLS})
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_NAME}.ko
+ COMMAND ${MODULE_BUILD_COMMAND}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ COMMENT "Build kernel module ${MODULE_NAME}"
+ DEPENDS ${BUILD_SOURCE_FILE_PREPARED} ${MODULE_HEADER_FILES_PREPARED} ${MODULE_SOURCE_FILES_PREPARED}
+ VERBATIM)
+ add_custom_target(${MODULE_NAME} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_NAME}.ko ${ARGV7})
+ # install kernel module
+ install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_NAME}.ko
+ DESTINATION ${KERNEL_INSTALL_DIR}
+ PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+ COMPONENT kernel)
+endmacro(add_kernel_module)
diff --git a/cmake/Lint.cmake b/cmake/Lint.cmake
new file mode 100644
index 0000000..4453fe3
--- /dev/null
+++ b/cmake/Lint.cmake
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMake macros to check style of source code files
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_package(ClangFormat)
+find_package(CheckPatch)
+
+if(ClangFormat_FOUND OR CheckPatch_FOUND)
+ # add target to trigger all linter targets
+ add_custom_target(lint)
+endif(ClangFormat_FOUND OR CheckPatch_FOUND)
+
+# macro to define lint targets
+macro(add_linter LINT_NAME LINT_SOURCE_FILES)
+ if(ClangFormat_FOUND)
+ add_custom_target(${LINT_NAME}
+ COMMAND ${ClangFormat_EXECUTABLE} --Werror --dry-run ${LINT_SOURCE_FILES} ${ARGN}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_SOURCE_FILES} ${ARGN})
+ add_dependencies(lint ${LINT_NAME})
+ endif(ClangFormat_FOUND)
+endmacro(add_linter)
+
+# macro to define lint targets for kernel source code files
+macro(add_kernel_linter LINT_NAME LINT_IGNORE_OPTIONS LINT_SOURCE_FILES LINT_HEADER_FILES)
+ if(CheckPatch_FOUND)
+ set(LINT_IGNORE_ARGS "")
+ foreach(IGNORE_OPTION ${LINT_IGNORE_OPTIONS})
+ list(APPEND LINT_IGNORE_ARGS "--ignore" "${IGNORE_OPTION}")
+ endforeach(IGNORE_OPTION ${LINT_IGNORE_OPTIONS})
+ add_custom_target(${LINT_NAME}
+ COMMAND ${CheckPatch_EXECUTABLE} --no-tree --max-line-length=120 ${LINT_IGNORE_ARGS} -f ${LINT_SOURCE_FILES} ${LINT_HEADER_FILES}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_SOURCE_FILES} ${LINT_HEADER_FILES})
+ add_dependencies(lint ${LINT_NAME})
+ endif(CheckPatch_FOUND)
+endmacro(add_kernel_linter)
+
+if(ClangFormat_FOUND OR CheckPatch_FOUND)
+ # add target to trigger all formatter targets
+ add_custom_target(lint-fix)
+endif(ClangFormat_FOUND OR CheckPatch_FOUND)
+
+# macro to define formatter targets
+macro(add_linter_fix LINT_FIX_NAME LINT_FIX_SOURCE_FILES)
+ if(ClangFormat_FOUND)
+ add_custom_target(${LINT_FIX_NAME}
+ COMMAND ${ClangFormat_EXECUTABLE} --Werror -i ${LINT_FIX_SOURCE_FILES} ${ARGN}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_FIX_SOURCE_FILES} ${ARGN})
+ add_dependencies(lint-fix ${LINT_FIX_NAME})
+ endif(ClangFormat_FOUND)
+endmacro(add_linter_fix)
+
+# macro to define formatter targets for kernel source code files
+macro(add_kernel_linter_fix LINT_FIX_NAME LINT_FIX_IGNORE_OPTIONS LINT_FIX_SOURCE_FILES LINT_FIX_HEADER_FILES)
+ if(CheckPatch_FOUND)
+ set(LINT_FIX_IGNORE_ARGS "")
+ foreach(IGNORE_OPTION ${LINT_FIX_IGNORE_OPTIONS})
+ list(APPEND LINT_FIX_IGNORE_ARGS "--ignore" "${IGNORE_OPTION}")
+ endforeach(IGNORE_OPTION ${LINT_FIX_IGNORE_OPTIONS})
+ add_custom_target(${LINT_FIX_NAME}
+ COMMAND ${CheckPatch_EXECUTABLE} --no-tree --max-line-length=120 ${LINT_FIX_IGNORE_ARGS} --fix-inplace -f ${LINT_FIX_SOURCE_FILES} ${LINT_FIX_HEADER_FILES}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_FIX_SOURCE_FILES} ${LINT_FIX_HEADER_FILES})
+ add_dependencies(lint-fix ${LINT_FIX_NAME})
+ endif(CheckPatch_FOUND)
+endmacro(add_kernel_linter_fix)
diff --git a/cmake/PostVersionPackaging.cmake b/cmake/PostVersionPackaging.cmake
new file mode 100644
index 0000000..877cd12
--- /dev/null
+++ b/cmake/PostVersionPackaging.cmake
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+if(EXISTS ${VERSION_HEADER_INPUT_FILE})
+ # remove version.h if generated version.h is available from a Git build
+ file(REMOVE ${VERSION_HEADER_OUTPUT_FILE})
+endif(EXISTS ${VERSION_HEADER_INPUT_FILE})
diff --git a/cmake/PreVersionPackaging.cmake b/cmake/PreVersionPackaging.cmake
new file mode 100644
index 0000000..e960155
--- /dev/null
+++ b/cmake/PreVersionPackaging.cmake
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+if(EXISTS ${VERSION_HEADER_INPUT_FILE})
+ # copy generated version.h into the source repository for packaging purposes
+ get_filename_component(VERSION_HEADER_OUTPUT_FILE_PATH ${VERSION_HEADER_OUTPUT_FILE} PATH)
+ file(COPY ${VERSION_HEADER_INPUT_FILE}
+ DESTINATION ${VERSION_HEADER_OUTPUT_FILE_PATH})
+endif(EXISTS ${VERSION_HEADER_INPUT_FILE})
diff --git a/cmake/Version.cmake b/cmake/Version.cmake
new file mode 100644
index 0000000..0f26944
--- /dev/null
+++ b/cmake/Version.cmake
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+macro(gen_project_version VERSION_INPUT_FILE VERSION_INPUT_FILE_TEMPLATE VERSION_OUTPUT_FILE GIT_EXECUTABLE REPOSITORY_DIR)
+ get_filename_component(VERSION_OUTPUT_FILENAME ${VERSION_OUTPUT_FILE} NAME)
+ # command that will trigger a rebuild of version.h every time
+ add_custom_command(OUTPUT regenerate-version-file
+ COMMAND ${CMAKE_COMMAND} -E sleep 0
+ COMMENT "Trigger generating ${VERSION_OUTPUT_FILENAME}")
+
+ # call the GenerateVersion.cmake file to generate the version.h file
+ add_custom_command(OUTPUT ${VERSION_OUTPUT_FILE}
+ COMMAND ${CMAKE_COMMAND} -D VERSION_MODULE_PATH=${PROJECT_MODULES_DIR}
+ -D VERSION_INPUT_FILE=${VERSION_INPUT_FILE}
+ -D VERSION_INPUT_FILE_TEMPLATE=${VERSION_INPUT_FILE_TEMPLATE}
+ -D VERSION_OUTPUT_FILE=${VERSION_OUTPUT_FILE}
+ -D VERSION_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ -D GIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -D REPOSITORY_DIR=${REPOSITORY_DIR}
+ -P ${PROJECT_MODULES_DIR}/GenerateVersion.cmake
+ COMMENT "Generating ${VERSION_OUTPUT_FILENAME}"
+ DEPENDS regenerate-version-file)
+ add_custom_target(dnbd3-generate-version DEPENDS ${VERSION_OUTPUT_FILE})
+
+ # create target to expose project version
+ add_library(dnbd3-version INTERFACE)
+ target_include_directories(dnbd3-version INTERFACE ${PROJECT_INCLUDE_GEN_DIR})
+ add_dependencies(dnbd3-version dnbd3-generate-version)
+endmacro(gen_project_version VERSION_INPUT_FILE VERSION_INPUT_FILE_TEMPLATE VERSION_OUTPUT_FILE)
+
+# macro to get Git version and branch (from a packaged version header if present, otherwise from the Git repository)
+macro(get_repository_version REPOSITORY_VERSION REPOSITORY_VERSION_SHORT REPOSITORY_BRANCH VERSION_HEADER_FILE VERSION_BUILD_TYPE GIT_EXECUTABLE REPOSITORY_DIR)
+ # set empty Git version information
+ set(GIT_VERSION "")
+ # set empty Git branch information
+ set(GIT_BRANCH "")
+
+ # check if generated version header from source package is available
+ if(EXISTS ${VERSION_HEADER_FILE})
+ # get version information from the generated version header of the source package
+ file(READ ${VERSION_HEADER_FILE} GIT_VERSION_VERBOSE)
+ string(REGEX MATCH "DNBD3_VERSION[ \t]+\"([0-9][A-Za-z0-9.+~-]*)\"" GIT_VERSION ${GIT_VERSION_VERBOSE})
+ set(GIT_VERSION "${CMAKE_MATCH_1}")
+
+ # get branch information from the generated version header (branch names need not start with a digit)
+ file(READ ${VERSION_HEADER_FILE} GIT_BRANCH_VERBOSE)
+ string(REGEX MATCH "DNBD3_BRANCH[ \t]+\"([^\"]*)\"" GIT_BRANCH ${GIT_BRANCH_VERBOSE})
+ set(GIT_BRANCH "${CMAKE_MATCH_1}")
+ else(EXISTS ${VERSION_HEADER_FILE})
+ # get detailed Git version information from Git repository
+ execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags HEAD
+ WORKING_DIRECTORY ${REPOSITORY_DIR}
+ OUTPUT_VARIABLE GIT_VERSION_VERBOSE
+ RESULT_VARIABLE GIT_RETURN_CODE
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # parse version information from repository if Git command succeeds
+ if(GIT_RETURN_CODE EQUAL 0)
+ # remove the first letter of the version to satisfy packaging rules
+ string(REGEX MATCH "([0-9]+:)?[0-9][A-Za-z0-9.+~-]*" GIT_VERSION ${GIT_VERSION_VERBOSE})
+ endif(GIT_RETURN_CODE EQUAL 0)
+
+ # overwrite version from Git if version is unknown
+ if(GIT_VERSION STREQUAL "")
+ # overwrite version information with unknown version '0.0'
+ set(GIT_VERSION "0.0")
+
+ # print a message in Release build configuration to warn about the unknown version
+ if(${VERSION_BUILD_TYPE} MATCHES "Release")
+ message(WARNING "The version information from Git tags in this dnbd3 Git repository is missing! Please fetch all Git tags of this repository for a ${VERSION_BUILD_TYPE} build!")
+ endif(${VERSION_BUILD_TYPE} MATCHES "Release")
+ endif(GIT_VERSION STREQUAL "")
+
+ set(${REPOSITORY_VERSION_SHORT} ${GIT_VERSION})
+
+ # get current branch of Git repository
+ execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+ WORKING_DIRECTORY ${REPOSITORY_DIR}
+ OUTPUT_VARIABLE GIT_BRANCH_VERBOSE
+ RESULT_VARIABLE GIT_RETURN_CODE
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # check output to get branch information
+ if(GIT_RETURN_CODE EQUAL 0)
+ set(GIT_BRANCH ${GIT_BRANCH_VERBOSE})
+ endif(GIT_RETURN_CODE EQUAL 0)
+
+ if(GIT_BRANCH STREQUAL "")
+ # overwrite branch information with 'unknown' branch
+ set(GIT_BRANCH "unknown")
+
+ # print a message in Release build configuration to warn about the unknown branch
+ if(${VERSION_BUILD_TYPE} MATCHES "Release")
+ message(WARNING "The current branch in the dnbd3 Git repository is unknown! Please check the branches of this repository for a ${VERSION_BUILD_TYPE} build!")
+ endif(${VERSION_BUILD_TYPE} MATCHES "Release")
+ endif(GIT_BRANCH STREQUAL "")
+
+ # get status of Git repository
+ execute_process(COMMAND ${GIT_EXECUTABLE} status --porcelain
+ WORKING_DIRECTORY ${REPOSITORY_DIR}
+ OUTPUT_VARIABLE GIT_STATUS
+ RESULT_VARIABLE GIT_RETURN_CODE
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # check if Git repository is dirty
+ if(GIT_RETURN_CODE EQUAL 0 AND NOT GIT_STATUS STREQUAL "")
+ # the Git repository is dirty, thus extend the version information
+ set(GIT_VERSION "${GIT_VERSION}+MOD")
+
+ # print a message in Release build configuration to warn about the dirty repository
+ if(${VERSION_BUILD_TYPE} MATCHES "Release")
+ message(WARNING "This dnbd3 Git repository is dirty! Please commit or revert all changes for a ${VERSION_BUILD_TYPE} build!")
+ endif(${VERSION_BUILD_TYPE} MATCHES "Release")
+ endif(GIT_RETURN_CODE EQUAL 0 AND NOT GIT_STATUS STREQUAL "")
+ endif(EXISTS ${VERSION_HEADER_FILE})
+
+ # return version and branch to caller
+ set(${REPOSITORY_VERSION} ${GIT_VERSION})
+ set(${REPOSITORY_BRANCH} ${GIT_BRANCH})
+endmacro(get_repository_version)
diff --git a/cmake/toolchain/Aarch64LinuxGnu.cmake b/cmake/toolchain/Aarch64LinuxGnu.cmake
new file mode 100644
index 0000000..59c5f00
--- /dev/null
+++ b/cmake/toolchain/Aarch64LinuxGnu.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMAKE toolchain file for cross compilation with aarch64-linux-gnu-gcc
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+set(CMAKE_LINKER aarch64-linux-gnu-ld)
+set(CMAKE_ASM_COMPILER aarch64-linux-gnu-as)
+set(CMAKE_OBJCOPY aarch64-linux-gnu-objcopy)
+set(CMAKE_STRIP aarch64-linux-gnu-strip)
+set(CMAKE_CPP aarch64-linux-gnu-cpp)
+
+# path of headers and libraries for aarch64-linux-gnu target
+set(CMAKE_FIND_ROOT_PATH "/usr/aarch64-linux-gnu")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/cmake/toolchain/PowerpcLinuxGnu.cmake b/cmake/toolchain/PowerpcLinuxGnu.cmake
new file mode 100644
index 0000000..d3c1ca5
--- /dev/null
+++ b/cmake/toolchain/PowerpcLinuxGnu.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMAKE toolchain file for cross compilation with powerpc-linux-gnu-gcc
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR ppc)
+
+set(CMAKE_C_COMPILER powerpc-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER powerpc-linux-gnu-g++)
+set(CMAKE_LINKER powerpc-linux-gnu-ld)
+set(CMAKE_ASM_COMPILER powerpc-linux-gnu-as)
+set(CMAKE_OBJCOPY powerpc-linux-gnu-objcopy)
+set(CMAKE_STRIP powerpc-linux-gnu-strip)
+set(CMAKE_CPP powerpc-linux-gnu-cpp)
+
+# path of headers and libraries for powerpc-linux-gnu target
+set(CMAKE_FIND_ROOT_PATH "/usr/powerpc-linux-gnu")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/conf/README.server b/conf/README.server
deleted file mode 100644
index 08be09f..0000000
--- a/conf/README.server
+++ /dev/null
@@ -1,35 +0,0 @@
-Configuration for dnbd3-server
-
-The server requires a config directory.
-Start it like so: ./dnbd3-server -c ./my-config/
-
-There are two files in that dir
-
-== alt-servers ==
-List of known alt-servers for this server.
-INI Format:
-[Address]
-comment=Whatever
-for=client | replication
-namespace=some/path/
-
-All fields in a section are optional. If the "for" key is missing, the server
-will be used for replication and will be sent to clients that request a list
-of alt servers.
-The namespace key can be specified multiple times per section. If it is missing,
-the server will be used for all image names; otherwise, it will only be used
-for images which's name starts with one of the given strings.
-
-If you're not running in proxy mode, this file won't do much for you
-
-== server.conf ==
-
-Main configuration file. Ini format.
-
-[dnbd3]
-basePath=/srv/openslx/dnbd3 # virtual root of image files
-serverPenalty=1234 # artificial acceptance delay for incoming server connections (µs)
-clientPenalty=2345 # artificial acceptance delay for incoming client connection (µs)
-isProxy=true # enable proxy mode - will try to replicate from alt-servers if a client requests unknown image
-uplinkTimeout=1250 # r/w timeout for connections to uplink servers
-
diff --git a/get-version.sh b/get-version.sh
deleted file mode 100755
index 1d4a8cb..0000000
--- a/get-version.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-
-# Always create version string for repository this script lies in,
-# not the cwd... Makes usage easier in cmake
-ARG0="$0"
-SELF="$(readlink -f "${ARG0}")"
-ROOT_DIR="$(dirname "${SELF}")"
-cd "$ROOT_DIR"
-
-if [ -d .git ]; then
- [ -n "$(git diff)" ] && MODDED='+MOD'
- echo $(git describe)$MODDED, branch $(git rev-parse --abbrev-ref HEAD), built "$(date +%Y-%m-%d)"
- exit 0
-fi
-
-if [ -f "version.txt" ]; then
- cat "version.txt"
- exit 0
-fi
-
-echo "-unknown-"
-
diff --git a/inc/dnbd3/build.h.in b/inc/dnbd3/build.h.in
new file mode 100644
index 0000000..062ed17
--- /dev/null
+++ b/inc/dnbd3/build.h.in
@@ -0,0 +1,11 @@
+/*
+ * AUTOGENERATED: DO NOT EDIT THIS FILE
+ */
+
+#ifndef BUILD_H_
+#define BUILD_H_
+
+#define DNBD3_BUILD "@DNBD3_BUILD@"
+#define DNBD3_BUILD_DATE "@DNBD3_BUILD_DATE@"
+
+#endif /* BUILD_H_ */
diff --git a/src/config.h b/inc/dnbd3/config.h
index 50336af..eb4b8b1 100644
--- a/src/config.h
+++ b/inc/dnbd3/config.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
diff --git a/inc/dnbd3/config/client.h b/inc/dnbd3/config/client.h
new file mode 100644
index 0000000..55cf8b3
--- /dev/null
+++ b/inc/dnbd3/config/client.h
@@ -0,0 +1,52 @@
+#ifndef _CLIENTCONFIG_H_
+#define _CLIENTCONFIG_H_
+
+// Which is the minimum protocol version the client expects from the server
+#define MIN_SUPPORTED_SERVER 2
+
+// Send keepalive every X seconds
+#define KEEPALIVE_INTERVAL 10
+
+// in seconds if not stated otherwise
+#define SOCKET_TIMEOUT_SEND 2
+
+// Socket receive timeout. Must be higher than keepalive interval, otherwise
+// the connection might be aborted when idle
+#define SOCKET_TIMEOUT_RECV 13
+
+// During discovery, we use very short minimum timeouts (unless in panic mode)
+#define SOCKET_TIMEOUT_DISCOVERY 1
+
+// IO timeout for block layer
+#define BLOCK_LAYER_TIMEOUT 10
+
+#define RTT_THRESHOLD_FACTOR(us) (((us) * 3) / 4) // 3/4 = current to best must be 25% worse
+#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
+#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
+// This must be a power of two:
+#define RTT_BLOCK_SIZE 4096
+
+// Interval of several repeating tasks (in seconds)
+#define TIMER_INTERVAL_PROBE_STARTUP 2
+#define TIMER_INTERVAL_PROBE_SWITCH 10
+#define TIMER_INTERVAL_PROBE_PANIC 2
+#define TIMER_INTERVAL_PROBE_MAX 45
+// How many discover runs after setting up a device should be considered the startup phase
+// during that phase, check all servers, before we start doing it selectively
+// and also don't increase the discover interval during this period
+#define DISCOVER_STARTUP_PHASE_COUNT 6
+// How many servers should be tested at maximum after above
+#define DISCOVER_REDUCED_SERVER_COUNT 3
+// Number of RTT probes to keep in history and average the value over
+#define DISCOVER_HISTORY_SIZE 4
+
+// Number of unsuccessful alt_server probes before read errors are reported to the block layer
+// (ALL servers will be probed this many times)
+// Set to 0 to disable
+#define PROBE_COUNT_TIMEOUT 0
+
+// ++ Kernel module ++
+#define DEFAULT_READ_AHEAD_KB 512
+#define NUMBER_DEVICES 8
+
+#endif
diff --git a/src/serverconfig.h b/inc/dnbd3/config/server.h
index 239f0a2..b6eee2c 100644
--- a/src/serverconfig.h
+++ b/inc/dnbd3/config/server.h
@@ -1,7 +1,7 @@
#ifndef _SERVERCONFIG_H_
#define _SERVERCONFIG_H_
-#include "config.h"
+#include <dnbd3/config.h>
// +++++ Performance/memory related
#define SERVER_MAX_CLIENTS 4000
@@ -13,11 +13,15 @@
#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
#define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
-#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_QUEUE 500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request
#define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients
#define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
-#define SERVER_CACHE_MAP_SAVE_INTERVAL 90
+// Wait a maximum of 5 minutes before saving cache map (if data was received at all)
+#define CACHE_MAP_MAX_SAVE_DELAY 300
+// If more than 500MB have been received from uplink without saving cache map, do so
+#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000)
// Time in ms to wait for a read/write call to complete on an uplink connection
#define SOCKET_TIMEOUT_UPLINK 5000
diff --git a/src/shared/crc32.h b/inc/dnbd3/shared/crc32.h
index 00b8bdd..00b8bdd 100644
--- a/src/shared/crc32.h
+++ b/inc/dnbd3/shared/crc32.h
diff --git a/src/shared/fdsignal.h b/inc/dnbd3/shared/fdsignal.h
index 960a2a9..960a2a9 100644
--- a/src/shared/fdsignal.h
+++ b/inc/dnbd3/shared/fdsignal.h
diff --git a/src/shared/log.h b/inc/dnbd3/shared/log.h
index 5b1e8f7..2a15f1d 100644
--- a/src/shared/log.h
+++ b/inc/dnbd3/shared/log.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Simon Rettberg
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -23,6 +23,7 @@
#include <stdbool.h>
#include <unistd.h>
+#include <stdio.h>
typedef unsigned int logmask_t;
#define LOG_ERROR ((logmask_t)1) // Fatal error, server will terminate
@@ -32,6 +33,10 @@ typedef unsigned int logmask_t;
#define LOG_DEBUG1 ((logmask_t)16) // Debug information, use this for non-spammy stuff
#define LOG_DEBUG2 ((logmask_t)32) // Use this for debug messages that will show up a lot
+/**
+ * Initialize the logging (constructor)
+ */
+void log_init(void);
/**
* Check if cansoleMask | fileMask has all of mask set.
@@ -45,6 +50,16 @@ void log_setConsoleMask(logmask_t mask);
void log_setConsoleTimestamps(bool on);
/**
+ * Set console output stream
+ * The output stream can be either stdout or stderr
+ *
+ * Note: A call of this function is optional and only required if the output
+ * stream should be changed from stdout to stderr since the log
+ * implementation defaults to the output stream stdout
+ */
+int log_setConsoleOutputStream(FILE *outputStream);
+
+/**
* Open or reopen the log file. If path is NULL and the
* function was called with a path before, the same path
* will be used again.
diff --git a/src/shared/protocol.h b/inc/dnbd3/shared/protocol.h
index 2b21c21..1dd47f8 100644
--- a/src/shared/protocol.h
+++ b/inc/dnbd3/shared/protocol.h
@@ -1,10 +1,9 @@
#ifndef _PROTOCOL_H_
#define _PROTOCOL_H_
-#include "sockhelper.h"
-
-#include "../types.h"
-#include "../serialize.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/serialize.h>
+#include <dnbd3/shared/sockhelper.h>
#include <errno.h>
#include <sys/types.h>
@@ -69,10 +68,8 @@ static inline bool dnbd3_select_image(int sock, const char *name, uint16_t rid,
request.magic = dnbd3_packet_magic;
request.cmd = CMD_SELECT_IMAGE;
request.size = (uint32_t)len;
-#ifdef _DEBUG
request.handle = 0;
request.offset = 0;
-#endif
fixup_request( request );
iov[0].iov_base = &request;
iov[0].iov_len = sizeof(request);
diff --git a/src/serialize.h b/inc/dnbd3/shared/serialize.h
index 1b73531..b808fd0 100644
--- a/src/serialize.h
+++ b/inc/dnbd3/shared/serialize.h
@@ -1,11 +1,10 @@
#ifndef SERIALIZER_H_
#define SERIALIZER_H_
-// Careful with includes - this is used in kernel module too
-#include "config.h"
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
-typedef struct
-{
+typedef struct {
char buffer[MAX_PAYLOAD]; // This MUST be the first member or send_reply() will blow up
char *buffer_end;
char *buffer_pointer;
diff --git a/src/shared/sockhelper.h b/inc/dnbd3/shared/sockhelper.h
index 8d70789..5c7d903 100644
--- a/src/shared/sockhelper.h
+++ b/inc/dnbd3/shared/sockhelper.h
@@ -6,7 +6,7 @@
* abstract from the IP version by using getaddrinfo() and thelike.
*/
-#include "../types.h"
+#include <dnbd3/types.h>
#include <stdint.h>
#include <sys/socket.h>
#include <string.h>
diff --git a/src/shared/timing.h b/inc/dnbd3/shared/timing.h
index f23bfeb..2530416 100644
--- a/src/shared/timing.h
+++ b/inc/dnbd3/shared/timing.h
@@ -1,10 +1,6 @@
#ifndef _D_TIMING_H
#define _D_TIMING_H
-#ifndef _POSIX_C_SOURCE
-#define _POSIX_C_SOURCE 199309L
-#endif
-
#include <time.h>
#include <stdint.h>
#include <stdbool.h>
diff --git a/src/types.h b/inc/dnbd3/types.h
index cb0ccfd..699fa68 100644
--- a/src/types.h
+++ b/inc/dnbd3/types.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,10 +21,15 @@
#ifndef TYPES_H_
#define TYPES_H_
-#include "config.h"
-#ifndef KERNEL_MODULE
+#include <dnbd3/config.h>
+#ifdef DNBD3_KERNEL_MODULE
+#include <linux/kernel.h>
+#include <linux/string.h>
+#else
#include <stdint.h>
#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
#endif
#ifndef MIN
@@ -34,6 +39,9 @@
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
#ifdef __GNUC__
#define UNUSED __attribute__ ((unused))
#else
@@ -62,7 +70,7 @@
#include <netinet/in.h>
#endif
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
#define send(a,b,c,d) write(a,b,c)
#define recv(a,b,c,d) read(a,b,c)
#endif
@@ -91,10 +99,8 @@
(a).cmd = net_order_16((a).cmd); \
(a).size = net_order_32((a).size); \
} while (0)
-#define ENDIAN_MODE "Big Endian"
-#ifndef BIG_ENDIAN
-#define BIG_ENDIAN
-#endif
+#define DNBD3_ENDIAN_MODE "Big Endian"
+#define DNBD3_BIG_ENDIAN
#elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64)
#define dnbd3_packet_magic ((uint16_t)( (0x73) | (0x72 << 8) ))
// Make little endian our network byte order as probably 99.999% of machines this will be used on are LE
@@ -103,10 +109,8 @@
#define net_order_16(a) (a)
#define fixup_request(a) while(0)
#define fixup_reply(a) while(0)
-#define ENDIAN_MODE "Little Endian"
-#ifndef LITTLE_ENDIAN
-#define LITTLE_ENDIAN
-#endif
+#define DNBD3_ENDIAN_MODE "Little Endian"
+#define DNBD3_LITTLE_ENDIAN
#else
#error "Unknown Endianness"
#endif
@@ -124,10 +128,14 @@ typedef struct __attribute__((packed)) dnbd3_host_t
dnbd3_af type; // 1byte (ip version. HOST_IP4 or HOST_IP6. 0 means this struct is empty and should be ignored)
} dnbd3_host_t;
+/* IOCTLs */
+#define MAX_HOSTS_PER_IOCTL NUMBER_SERVERS
+
typedef struct __attribute__((packed))
{
uint16_t len;
- dnbd3_host_t host;
+ dnbd3_host_t hosts[MAX_HOSTS_PER_IOCTL];
+ uint8_t hosts_num;
uint16_t imgnamelen;
char *imgname;
int rid;
@@ -153,10 +161,10 @@ typedef struct __attribute__((packed))
uint32_t size; // 4byte
union {
struct {
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
uint64_t offset_small:56; // 7byte
uint8_t hops; // 1byte
-#elif defined(BIG_ENDIAN)
+#elif defined(DNBD3_BIG_ENDIAN)
uint8_t hops; // 1byte
uint64_t offset_small:56; // 7byte
#endif
diff --git a/inc/dnbd3/version.h.in b/inc/dnbd3/version.h.in
new file mode 100644
index 0000000..727c8b8
--- /dev/null
+++ b/inc/dnbd3/version.h.in
@@ -0,0 +1,12 @@
+/*
+ * AUTOGENERATED: DO NOT EDIT THIS FILE
+ */
+
+#ifndef VERSION_H_
+#define VERSION_H_
+
+#define DNBD3_VERSION "@DNBD3_VERSION@"
+#define DNBD3_BRANCH "@DNBD3_BRANCH@"
+#define DNBD3_VERSION_LONG "@GIT_VERSION@, branch @DNBD3_BRANCH@"
+
+#endif /* VERSION_H_ */
diff --git a/pack.sh b/pack.sh
deleted file mode 100755
index 9cbe5c4..0000000
--- a/pack.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./get-version.sh > version.txt
-tar ckzf dnbd3.tar.gz src cmake CMakeLists.txt get-version.sh version.txt
-rm -- version.txt
-
diff --git a/pkg/CMakeLists.txt b/pkg/CMakeLists.txt
new file mode 100644
index 0000000..3060345
--- /dev/null
+++ b/pkg/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-pkg
+ LANGUAGES C)
+
+add_subdirectory(config)
+add_subdirectory(systemd)
diff --git a/pkg/config/CMakeLists.txt b/pkg/config/CMakeLists.txt
new file mode 100644
index 0000000..efbd2bf
--- /dev/null
+++ b/pkg/config/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-config
+ LANGUAGES C)
+
+# define all configuration files
+set(DNBD3_CONFIG_FILES ${CMAKE_CURRENT_SOURCE_DIR}/alt-servers
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.acl
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.conf)
+
+# install configuration files into sample directory
+install(FILES ${DNBD3_CONFIG_FILES}
+ DESTINATION /etc/dnbd3-server/sample
+ COMPONENT server)
diff --git a/conf/alt-servers b/pkg/config/alt-servers
index 1d5d39e..d9d2636 100644
--- a/conf/alt-servers
+++ b/pkg/config/alt-servers
@@ -6,7 +6,7 @@ comment=My first alt server that will not be used for replication
for=client
[192.168.100.50]
-comment=Super sectret alt server that will be used for replication, but clients don't know about it
+comment=Super secret alt server that will be used for replication, but clients don't know about it
for=replication
[192.168.100.123]
diff --git a/conf/rpc.acl b/pkg/config/rpc.acl
index 5167ae3..b3b4561 100644
--- a/conf/rpc.acl
+++ b/pkg/config/rpc.acl
@@ -1,5 +1,5 @@
# Everything from localhost
127.0.0.0/8 ALL
+
# Some info reading for another machine
132.230.8.113 STATS CLIENT_LIST IMAGE_LIST
-
diff --git a/conf/server.conf b/pkg/config/server.conf
index a15092f..22bd14a 100644
--- a/conf/server.conf
+++ b/pkg/config/server.conf
@@ -1,36 +1,51 @@
[dnbd3]
; port to listen on (default: 5003)
listenPort=5003
+
; relative root directory for images, ending in .r[1-9][0-9]*
-basePath=/mnt/storage/dnbd3
+basePath=/mnt/storage
+
; artificial connection delay for connecting servers
serverPenalty=100000
+
; artificial connection delay for connecting clients
clientPenalty=0
+
; is this server a proxy? if true, requests for non-existing images will be relayed to known alt-servers
isProxy=true
+
; if proxy is true and an image is incomplete, should idle bandwidth be used to replicate missing blocks?
backgroundReplication=true
+
; minimum amount of connected clients for background replication to kick in
bgrMinClients=0
+
; if another proxy requests and image that we don't have, should we ask our alt-servers for it?
lookupMissingForProxy=true
+
; create sparse files instead of preallocating; ignored if backgroundReplication=true
; -- only recommended if cache space is small
sparseFiles=false
+
; if true (which is the default), images will automatically be removed from the list if they can't be accessed
removeMissingImages=true
+
; timeout in ms for send/recv on connections to uplink servers (used for replication)
-uplinkTimeout=1250
+uplinkTimeout=5000
+
; timeout in ms for send/recv on connections to clients (using an image on this server)
clientTimeout=15000
+
; set this to true to close handles of unused images after some timeout
closeUnusedFd=false
+
; set this to true to load files without the .r[0-9]+ extension too, assuming RID=1
vmdkLegacyMode=false
+
; Don't set the server flag when connecting to alt-servers
; Intended for if the proxy is used for on-client caching
pretendClient=false
+
; When running in proxy mode and running out of space, automatically delete oldest image(s) to make
; the newly replicated image fit. In sparse mode, this will make sure at least 2GB of free space are
; available when replicating a new image. During normal operation, it will free at least 256MB whenever
@@ -49,16 +64,21 @@ maxImages=1000
maxPayload=9M
maxReplicationSize=150G
-; Log related config
+; Maximum number of bytes to prefetch when relaying client request to upstream server
+maxPrefetch=256k
+
[logging]
; log file path and name
; comment out to disable logging to file
; protip: use SIGUSR2 to reopen log file
file=./dnbd3.log
+
; which type of messages to log to file
fileMask=ERROR WARNING MINOR INFO DEBUG1
-; which to log to console (stdout)
+
+; which type of messages to log to console (stdout)
consoleMask=ERROR WARNING MINOR INFO
+
; Valid types (warning: specifying invalid types will not yield an error!)
; ERROR Fatal error, server will terminate
; WARNING Major issue, something is broken but keep running
@@ -66,7 +86,6 @@ consoleMask=ERROR WARNING MINOR INFO
; INFO Informational message
; DEBUG1 Debug information, used for medium verbosity
; DEBUG2 Used for debug messages that would show up a lot
-;
+
; Whether timestamps should be output to console too (or just to file if false)
consoleTimestamps=false
-
diff --git a/pkg/docker/archlinux_dockerfile b/pkg/docker/archlinux_dockerfile
new file mode 100644
index 0000000..ea6145b
--- /dev/null
+++ b/pkg/docker/archlinux_dockerfile
@@ -0,0 +1,28 @@
+# use Archlinux as base image
+FROM archlinux:latest
+
+# declare arguments that should be set by 'docker build --build-arg ...'
+ARG DNBD3_PACKAGE_FILE_NAME
+
+# copy built package file from host to docker image
+COPY ${DNBD3_PACKAGE_FILE_NAME} /tmp
+
+# install required dependencies
+RUN pacman --noconfirm -Sy
+RUN pacman --noconfirm -S fuse2 jansson
+
+# install installation package
+RUN tar -xf /tmp/${DNBD3_PACKAGE_FILE_NAME} --strip-components=1 -C /
+
+# use default config for dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/server.conf /etc/dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/alt-servers /etc/dnbd3-server
+
+# make default storage point for dnbd3-server
+RUN mkdir -p /mnt/storage
+
+# expose the port of the dnbd3-server to the host
+EXPOSE 5003
+
+# run dnbd3-server
+CMD [ "dnbd3-server", "-n" ]
diff --git a/pkg/docker/ubuntu-20-04_dockerfile b/pkg/docker/ubuntu-20-04_dockerfile
new file mode 100644
index 0000000..ad2adcb
--- /dev/null
+++ b/pkg/docker/ubuntu-20-04_dockerfile
@@ -0,0 +1,28 @@
+# use Ubuntu 20.04 as base image
+FROM ubuntu:focal
+
+# declare arguments that should be set by 'docker build --build-arg ...'
+ARG DNBD3_PACKAGE_FILE_NAME
+
+# copy built package file from host to docker image
+COPY ${DNBD3_PACKAGE_FILE_NAME} /tmp
+
+# install required dependencies
+RUN apt-get update
+RUN apt-get install -y libfuse2 libjansson4
+
+# install installation package
+RUN dpkg -i /tmp/${DNBD3_PACKAGE_FILE_NAME}
+
+# use default config for dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/server.conf /etc/dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/alt-servers /etc/dnbd3-server
+
+# make default storage point for dnbd3-server
+RUN mkdir -p /mnt/storage
+
+# expose the port of the dnbd3-server to the host
+EXPOSE 5003
+
+# run dnbd3-server
+CMD [ "dnbd3-server", "-n" ]
diff --git a/pkg/systemd/CMakeLists.txt b/pkg/systemd/CMakeLists.txt
new file mode 100644
index 0000000..b094b4b
--- /dev/null
+++ b/pkg/systemd/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-systemd
+ LANGUAGES C)
+
+# define all systemd related files
+set(DNBD3_SYSTEMD_FILES ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3-server.service)
+
+# install systemd service files
+install(FILES ${DNBD3_SYSTEMD_FILES}
+ DESTINATION /usr/lib/systemd/system
+ COMPONENT server)
diff --git a/pkg/systemd/dnbd3-server.service b/pkg/systemd/dnbd3-server.service
new file mode 100644
index 0000000..de800ce
--- /dev/null
+++ b/pkg/systemd/dnbd3-server.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=DNBD3 server
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/dnbd3-server -n -c /etc/dnbd3-server
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..96ffcae
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-src
+ LANGUAGES C)
+
+if(DNBD3_BENCHMARK)
+ add_subdirectory(bench)
+endif(DNBD3_BENCHMARK)
+
+if(DNBD3_KERNEL_MODULE)
+ add_subdirectory(client)
+ add_subdirectory(kernel)
+endif(DNBD3_KERNEL_MODULE)
+
+if(DNBD3_CLIENT_FUSE)
+ add_subdirectory(fuse)
+endif(DNBD3_CLIENT_FUSE)
+
+if(DNBD3_SERVER)
+ add_subdirectory(server)
+endif(DNBD3_SERVER)
+
+add_subdirectory(shared)
diff --git a/src/bench/CMakeLists.txt b/src/bench/CMakeLists.txt
new file mode 100644
index 0000000..24542a7
--- /dev/null
+++ b/src/bench/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-bench
+ LANGUAGES C)
+
+# add compile option to enable enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_BENCH_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+set(DNBD3_BENCH_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h)
+
+add_executable(dnbd3-bench ${DNBD3_BENCH_SOURCE_FILES})
+target_link_libraries(dnbd3-bench dnbd3-version dnbd3-shared ${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS dnbd3-bench RUNTIME DESTINATION bin
+ COMPONENT bench)
+
+add_linter(dnbd3-bench-lint "${DNBD3_BENCH_SOURCE_FILES}" "${DNBD3_BENCH_HEADER_FILES}")
+add_linter_fix(dnbd3-bench-lint-fix "${DNBD3_BENCH_SOURCE_FILES}" "${DNBD3_BENCH_HEADER_FILES}")
diff --git a/src/bench/connection.c b/src/bench/connection.c
index 26be440..974bc8a 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -1,10 +1,10 @@
#include "connection.h"
#include "helper.h"
-#include "../config.h"
-#include "../shared/protocol.h"
-#include "../shared/fdsignal.h"
-#include "../shared/sockhelper.h"
-#include "../shared/log.h"
+#include <dnbd3/config.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
#include <stdlib.h>
#include <pthread.h>
@@ -97,7 +97,7 @@ bool connection_init_n_times(
counters->fails++;
logadd( LOG_ERROR, "rid mismatch" );
//} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
- } else if ( !dnbd3_get_block( sock, (((uint64_t)rand()) << 16 + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
+ } else if ( !dnbd3_get_block( sock, (((uint64_t)rand() << 16) + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
counters->fails++;
logadd( LOG_ERROR, "send: get block failed" );
} else if ( !dnbd3_get_reply( sock, &reply ) ) {
diff --git a/src/bench/connection.h b/src/bench/connection.h
index 770bf0d..422c93e 100644
--- a/src/bench/connection.h
+++ b/src/bench/connection.h
@@ -1,7 +1,7 @@
#ifndef _CONNECTION_H_
#define _CONNECTION_H_
-#include "../shared/fdsignal.h"
+#include <dnbd3/shared/fdsignal.h>
#include <stdbool.h>
#include <stdint.h>
#include "helper.h"
diff --git a/src/bench/helper.h b/src/bench/helper.h
index e0c0262..53f32bf 100644
--- a/src/bench/helper.h
+++ b/src/bench/helper.h
@@ -1,7 +1,7 @@
#ifndef IMAGEHELPER_H
#define IMAGEHELPER_H
-#include "../types.h"
+#include <dnbd3/types.h>
#include <netdb.h>
#include <stdbool.h>
diff --git a/src/bench/main.c b/src/bench/main.c
index f8c55c3..37e2821 100644
--- a/src/bench/main.c
+++ b/src/bench/main.c
@@ -4,8 +4,9 @@
#include "connection.h"
#include "helper.h"
-#include "../shared/protocol.h"
-#include "../shared/log.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/version.h>
#include <stdio.h>
#include <stdlib.h>
@@ -19,6 +20,7 @@
static void printUsage(char *argv0, int exitCode)
{
+ printf( "Version: %s\n", DNBD3_VERSION_LONG );
printf( "Usage: %s [--debug] --host <serverAddress(es)> --image <imageName> [--rid revision]\n", argv0 );
printf( "Or: %s [-d] -h <serverAddress(es)> -i <imageName> [-r revision]\n", argv0 );
printf( " -h --host List of space separated hosts to use\n" );
@@ -74,6 +76,8 @@ int main(int argc, char *argv[])
int n_threads = 1;
int bs = 4096;
+ log_init();
+
if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
printUsage( argv[0], 0 );
}
diff --git a/src/bench/serialize.c b/src/bench/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/bench/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
new file mode 100644
index 0000000..41f182e
--- /dev/null
+++ b/src/client/CMakeLists.txt
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-client
+ LANGUAGES C)
+
+# add compile option to enable enhanced BSD netdb features
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_CLIENT_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/client.c)
+
+add_executable(dnbd3-client ${DNBD3_CLIENT_SOURCE_FILES})
+target_link_libraries(dnbd3-client dnbd3-version dnbd3-build dnbd3-shared)
+install(TARGETS dnbd3-client RUNTIME DESTINATION bin
+ COMPONENT kernel)
+
+add_linter(dnbd3-client-lint "${DNBD3_CLIENT_SOURCE_FILES}")
+add_linter_fix(dnbd3-client-lint-fix "${DNBD3_CLIENT_SOURCE_FILES}")
diff --git a/src/client/client.c b/src/client/client.c
index 37f0558..0cf222e 100644
--- a/src/client/client.c
+++ b/src/client/client.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,9 +18,10 @@
*
*/
-#include "../clientconfig.h"
-#include "../types.h"
-#include "../version.h"
+#include <dnbd3/config/client.h>
+#include <dnbd3/types.h>
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
#include <stdio.h>
#include <stdlib.h>
@@ -33,19 +34,19 @@
#include <arpa/inet.h>
#include <string.h>
#include <sys/stat.h>
+#include <sys/socket.h>
#include <sys/un.h>
#include <errno.h>
-#define SOCK_PATH "/var/run/dnbd3.socket"
+#define SOCK_PATH "/run/dnbd3.socket"
#define SOCK_BUFFER 1000
#define DEV_LEN 15
#define MAX_DEVS 50
-
+#define TMP_STR_LEN 100
static int openDevices[MAX_DEVS];
-static const char *optString = "f:h:i:r:d:a:cs:HV?k";
+static const char *optString = "h:i:r:d:a:cs:SA:R:HV?k";
static const struct option longOpts[] = {
- { "file", required_argument, NULL, 'f' },
{ "host", required_argument, NULL, 'h' },
{ "image", required_argument, NULL, 'i' },
{ "rid", required_argument, NULL, 'r' },
@@ -53,8 +54,9 @@ static const struct option longOpts[] = {
{ "ahead", required_argument, NULL, 'a' },
{ "close", no_argument, NULL, 'c' },
{ "switch", required_argument, NULL, 's' },
- { "add", required_argument, NULL, 'adds' },
- { "remove", required_argument, NULL, 'rems' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "add", required_argument, NULL, 'A' },
+ { "remove", required_argument, NULL, 'R' },
{ "help", no_argument, NULL, 'H' },
{ "version", no_argument, NULL, 'V' },
{ "daemon", no_argument, NULL, 'D' },
@@ -66,9 +68,9 @@ static const struct option longOpts[] = {
static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg);
static void dnbd3_client_daemon();
-static void dnbd3_daemon_action(int client, int argc, char **argv);
+static void dnbd3_daemon_action(int client, int uid, int argc, char **argv);
static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host);
-static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead);
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead, const bool doLearnNewServers);
static int dnbd3_daemon_send(int argc, char **argv);
static void dnbd3_print_help(char *argv_0);
static void dnbd3_print_version();
@@ -84,11 +86,11 @@ static char host_to_string(const dnbd3_host_t *host, char *target, size_t target
if ( targetlen < 10 ) return false;
if ( host->type == HOST_IP6 ) {
*target++ = '[';
- inet_ntop( AF_INET6, host->addr, target, targetlen - 10 );
+ inet_ntop( AF_INET6, host->addr, target, (socklen_t)targetlen - 10 );
target += strlen( target );
*target++ = ']';
} else if ( host->type == HOST_IP4 ) {
- inet_ntop( AF_INET, host->addr, target, targetlen - 8 );
+ inet_ntop( AF_INET, host->addr, target, (socklen_t)targetlen - 8 );
target += strlen( target );
} else {
snprintf( target, targetlen, "<?addrtype=%d>", (int)host->type );
@@ -135,8 +137,9 @@ static char parse_address(char *string, dnbd3_host_t *host)
// Scan for port
char *portpos = NULL, *ptr = string;
while ( *ptr ) {
- if ( *ptr == ':' )
- portpos = ptr;
+ if ( *ptr == ':' ) {
+ portpos = ptr;
+ }
++ptr;
}
if ( portpos == NULL ) return 0; // No port in string
@@ -192,33 +195,77 @@ static int dnbd3_get_ip(char *hostname, dnbd3_host_t *host)
return true;
}
+/* Parses hosts from a space-separated command-line string, resolves them and stores them in hosts. */
+static int dnbd3_get_resolved_hosts(char *hosts_str, dnbd3_host_t *hosts, const size_t hosts_len)
+{
+ char *hosts_current_token = hosts_str;
+ char *hosts_last_host;
+ int hosts_index = 0;
+ char host_str[TMP_STR_LEN];
+ size_t host_str_len = 0;
+
+ do {
+ /* get next host from string */
+ while ( *hosts_current_token == ' ' ) {
+ hosts_current_token++;
+ }
+
+ /* buffer substring of host to get ip from it */
+ hosts_last_host = strchr( hosts_current_token, ' ' );
+ host_str_len = (hosts_last_host == NULL ? TMP_STR_LEN : (size_t)(hosts_last_host - hosts_current_token) + 1);
+ if ( host_str_len > TMP_STR_LEN ) {
+ host_str_len = TMP_STR_LEN;
+ }
+
+ snprintf( host_str, host_str_len, "%s", hosts_current_token );
+
+ if ( !dnbd3_get_ip( host_str, &hosts[hosts_index] ) )
+ return false;
+
+ hosts_index++;
+
+ /* continue processing of hosts */
+ hosts_current_token = hosts_last_host + 1;
+
+ } while ( hosts_last_host != NULL && hosts_index < hosts_len );
+
+ return hosts_index;
+}
+
int main(int argc, char *argv[])
{
char *dev = NULL;
char host[50];
int action = -1;
+ bool learnNewServers = true;
+ int active_device_num = 0;
- dnbd3_ioctl_t msg;
- memset( &msg, 0, sizeof(dnbd3_ioctl_t) );
- msg.len = (uint16_t)sizeof(dnbd3_ioctl_t);
+ dnbd3_ioctl_t msg = { .len = (uint16_t)sizeof(msg) };
+ msg.hosts_num = 0;
msg.read_ahead_kb = DEFAULT_READ_AHEAD_KB;
- msg.host.port = htons( PORT );
- msg.host.type = 0;
msg.imgname = NULL;
- msg.use_server_provided_alts = true;
int opt = 0;
int longIndex = 0;
+ // In case the client was invoked as a suid binary, change uid back to original user
+ // and warn the user as this was legacy mode
+ if ( geteuid() == 0 && getuid() != 0 ) {
+ fprintf( stderr, "Warning! %s is a setuid binary. This is deprecated and not needed anymore.\n", argv[0] );
+ fprintf( stderr, "Switching back o user %d\n", (int)getuid() );
+ setgid( getgid() );
+ setuid( getuid() );
+ }
+
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
while ( opt != -1 ) {
switch ( opt ) {
- case 'f':
- break;
case 'h':
- if ( !dnbd3_get_ip( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+ msg.hosts_num = (uint8_t)dnbd3_get_resolved_hosts( optarg, msg.hosts, MAX_HOSTS_PER_IOCTL );
+ if ( !msg.hosts_num )
+ exit( EXIT_FAILURE );
break;
case 'i':
action = IOCTL_OPEN;
@@ -238,25 +285,34 @@ int main(int argc, char *argv[])
action = IOCTL_CLOSE;
break;
case 's':
- dnbd3_get_ip( optarg, &msg.host );
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_SWITCH;
break;
- case 'adds':
- dnbd3_get_ip( optarg, &msg.host );
+ case 'S':
+ learnNewServers = false;
+ break;
+ case 'A':
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_ADD_SRV;
break;
- case 'rems':
- dnbd3_get_ip( optarg, &msg.host );
+ case 'R':
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_REM_SRV;
break;
case 'H':
dnbd3_print_help( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case 'V':
- dnbd3_print_version();
+ dnbd3_print_version( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case '?':
dnbd3_print_help( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case 'D':
dnbd3_client_daemon();
@@ -265,6 +321,14 @@ int main(int argc, char *argv[])
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
}
+ /* abort if sticky mode is set and image will not be opened */
+ if ( !learnNewServers && action != IOCTL_OPEN ) {
+ printf( "ERROR: sticky mode can only be set if image will be opened.\n" );
+ exit( EXIT_FAILURE );
+ }
+
+ msg.use_server_provided_alts = learnNewServers;
+
// See if socket exists, if so, try to send to daemon
struct stat st;
if ( stat( SOCK_PATH, &st ) == 0 ) {
@@ -275,39 +339,37 @@ int main(int argc, char *argv[])
// Direct requests
- // In case the client was invoked as a suid binary, change uid back to original user
- // when being used for direct ioctl, so that the device's permissions are taken into account
- if ( geteuid() == 0 ) {
- setgid( getgid() );
- setuid( getuid() );
- }
-
- host_to_string( &msg.host, host, 50 );
-
// close device
- if ( action == IOCTL_CLOSE && msg.host.type == 0 && dev && (msg.imgname == NULL )) {
+ if ( action == IOCTL_CLOSE && msg.hosts_num == 0 && dev && (msg.imgname == NULL )) {
printf( "INFO: Closing device %s\n", dev );
- if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) ) exit( EXIT_SUCCESS );
+ if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) == 0 ) exit( EXIT_SUCCESS );
printf( "Couldn't close device.\n" );
exit( EXIT_FAILURE );
}
// switch host
- if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.host.type != 0 && dev && (msg.imgname == NULL )) {
+ if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.hosts_num == 1 && dev && (msg.imgname == NULL )) {
+ host_to_string( &msg.hosts[0], host, 50 );
if ( action == IOCTL_SWITCH ) printf( "INFO: Switching device %s to %s\n", dev, host );
if ( action == IOCTL_ADD_SRV ) printf( "INFO: %s: adding %s\n", dev, host );
if ( action == IOCTL_REM_SRV ) printf( "INFO: %s: removing %s\n", dev, host );
- if ( dnbd3_ioctl( dev, action, &msg ) ) exit( EXIT_SUCCESS );
+ if ( dnbd3_ioctl( dev, action, &msg ) == 0 ) exit( EXIT_SUCCESS );
printf( "Failed! Maybe the device is not connected?\n" );
exit( EXIT_FAILURE );
}
// connect
- if ( action == IOCTL_OPEN && msg.host.type != 0 && dev && (msg.imgname != NULL )) {
- printf( "INFO: Connecting device %s to %s for image %s\n", dev, host, msg.imgname );
- if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) exit( EXIT_SUCCESS );
- printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
- exit( EXIT_FAILURE );
+ if ( action == IOCTL_OPEN && msg.hosts_num > 0 && dev && (msg.imgname != NULL )) {
+ printf( "INFO: Connecting device %s for image %s\n", dev, msg.imgname );
+ active_device_num = dnbd3_ioctl( dev, IOCTL_OPEN, &msg );
+ if ( active_device_num >= 0 ) {
+ host_to_string( &msg.hosts[active_device_num], host, 50 );
+ printf( "INFO: Device %s for image %s is connected to server %s\n", dev, msg.imgname, host);
+ exit( EXIT_SUCCESS );
+ } else {
+ printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
+ exit( EXIT_FAILURE );
+ }
}
dnbd3_print_help( argv[0] );
@@ -317,17 +379,19 @@ int main(int argc, char *argv[])
static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg)
{
const int fd = open( dev, O_WRONLY );
- if ( fd < 0 ) {
- printf( "open() for %s failed.\n", dev );
- return false;
+ if ( fd == -1 ) {
+ perror( "open() failed" );
+ return -ENODEV;
+ }
+ if ( msg != NULL && msg->imgname != NULL ) {
+ msg->imgnamelen = (uint16_t)strlen( msg->imgname );
}
- if ( msg != NULL && msg->imgname != NULL ) msg->imgnamelen = (uint16_t)strlen( msg->imgname );
const int ret = ioctl( fd, command, msg );
if ( ret < 0 ) {
- printf( "ioctl() failed.\n" );
+ perror( "ioctl() failed" );
}
close( fd );
- return ret >= 0;
+ return ret;
}
static void dnbd3_client_daemon()
@@ -338,11 +402,8 @@ static void dnbd3_client_daemon()
struct timeval tv;
int done, ret, len;
socklen_t socklen;
-
- if ( geteuid() != 0 ) {
- printf( "Only root can run the dnbd3-client in daemon mode!\n" );
- exit( 1 );
- }
+ struct ucred ucred;
+ int fdTest;
if ( (listener = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) {
perror( "socket" );
@@ -356,12 +417,21 @@ static void dnbd3_client_daemon()
perror( "bind" );
exit( 1 );
}
- chmod( addrLocal.sun_path, 0600 );
+ fchmod( listener, 0666 );
+ chmod( SOCK_PATH, 0666 );
if ( listen( listener, 5 ) == -1 ) {
perror( "listen" );
+ unlink( addrLocal.sun_path );
exit( 1 );
}
+ fdTest = open( "/dev/dnbd0", O_RDWR );
+ if ( fdTest == -1 ) {
+ perror( "Opening /dev/dnbd0 failed. Daemon will probably not work" );
+ } else {
+ close( fdTest );
+ }
+
memset( openDevices, -1, sizeof(openDevices) );
for (;;) {
@@ -372,6 +442,14 @@ static void dnbd3_client_daemon()
continue;
}
+ socklen = sizeof(ucred);
+ if ( getsockopt( client, SOL_SOCKET, SO_PEERCRED, &ucred, &socklen ) == -1 ) {
+ perror( "Could not get credentials of connection" );
+ close( client );
+ continue;
+ }
+ printf("Call from user %d\n", (int)ucred.uid );
+
tv.tv_sec = 1;
tv.tv_usec = 0;
setsockopt( client, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) );
@@ -398,27 +476,28 @@ static void dnbd3_client_daemon()
}
if ( pos >= end ) break;
argv[argc++] = pos;
- printf("Arg %d: '%s'\n", argc, pos);
+ //printf("Arg %d: '%s'\n", argc, pos);
while ( *pos != '\0' ) { // This will always be in bounds because of -4 above
if ( ++pos >= end ) break;
}
}
- dnbd3_daemon_action( client, argc, argv );
+ dnbd3_daemon_action( client, (int)ucred.uid, argc, argv );
}
close( client );
}
}
-static void dnbd3_daemon_action(int client, int argc, char **argv)
+static void dnbd3_daemon_action(int client, int uid, int argc, char **argv)
{
int opt = 0;
int longIndex = 0;
char *host = NULL, *image = NULL, *device = NULL;
- int rid = 0, uid = 0, killMe = false, ahead = 512;
+ int rid = 0, killMe = false, ahead = 512;
int len;
int action = -1;
const char *actionName = NULL;
+ bool learnNewServers = true;
optind = 1;
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
@@ -439,18 +518,18 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
case 'r':
rid = atoi( optarg );
break;
- case 'U':
- uid = atoi( optarg );
- break;
case 'c':
action = IOCTL_CLOSE;
actionName = "Close";
break;
- case 'adds':
+ case 'S':
+ learnNewServers = false;
+ break;
+ case 'A':
action = IOCTL_ADD_SRV;
actionName = "Add Server";
break;
- case 'rems':
+ case 'R':
action = IOCTL_REM_SRV;
actionName = "Remove Server";
break;
@@ -465,14 +544,14 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
}
if ( killMe ) {
- if ( uid != 0 ) {
+ if ( uid != geteuid() ) {
printf( "Ignoring kill request by user %d\n", uid );
close( client );
return;
}
printf( "Received kill request; exiting.\n" );
- close( client );
unlink( SOCK_PATH );
+ close( client );
exit( 0 );
}
@@ -486,7 +565,7 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
return;
}
if ( action == IOCTL_OPEN && host != NULL && image != NULL && rid >= 0 ) {
- device = dnbd3_daemon_open( uid, host, image, rid, ahead );
+ device = dnbd3_daemon_open( uid, host, image, rid, ahead, learnNewServers);
if ( device != NULL ) {
len = strlen( device );
send( client, &len, sizeof(len), 0 );
@@ -509,11 +588,9 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
} else {
index = atoi( device );
}
- dnbd3_ioctl_t msg;
- memset( &msg, 0, sizeof(msg) );
- msg.len = (uint16_t)sizeof(msg);
+ dnbd3_ioctl_t msg = { .len = (uint16_t)sizeof(msg) };
if ( host != NULL ) {
- dnbd3_get_ip( host, &msg.host );
+ dnbd3_get_ip( host, &msg.hosts[0] );
}
if ( index < 0 || index >= MAX_DEVS ) {
printf( "%s request with invalid device id %d\n", actionName, index );
@@ -528,7 +605,7 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
printf( "%s: User %d cannot access %s owned by %d\n", actionName, uid, dev, openDevices[index] );
return false;
}
- if ( dnbd3_ioctl( dev, action, &msg ) ) {
+ if ( dnbd3_ioctl( dev, action, &msg ) == 0 ) {
printf( "%s request for device %s of user %d successful\n", actionName, dev, uid );
openDevices[index] = -1;
return true;
@@ -537,23 +614,26 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
return false;
}
-static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead)
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead, const bool doLearnNewServers)
{
int i, sameUser = 0;
struct stat st;
static char dev[DEV_LEN];
printf( "Opening a device for %s on %s\n", image, host );
// Check number of open devices
- for (i = 0; i < MAX_DEVS; ++i) {
- if ( openDevices[i] == uid ) sameUser++;
- }
- if ( sameUser > 1 ) {
- printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
- return NULL ;
+ if ( uid != 0 ) {
+ for ( i = 0; i < MAX_DEVS; ++i ) {
+ if ( openDevices[i] == uid ) sameUser++;
+ }
+ if ( sameUser > 1 ) {
+ printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
+ return NULL;
+ }
}
// Find free device
- for (i = 0; i < MAX_DEVS; ++i) {
- if ( openDevices[i] != -1 ) continue;
+ for ( i = 0; i < MAX_DEVS; ++i ) {
+ if ( openDevices[i] != -1 )
+ continue;
snprintf( dev, DEV_LEN, "/dev/dnbd%d", i );
if ( stat( dev, &st ) == -1 ) {
break;
@@ -561,16 +641,16 @@ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int re
// Open
dnbd3_ioctl_t msg;
msg.len = (uint16_t)sizeof(msg);
- if ( !dnbd3_get_ip( host, &msg.host ) ) {
+ if ( !dnbd3_get_ip( host, &msg.hosts[0] ) ) {
printf( "Cannot parse host address %s\n", host );
return NULL ;
}
msg.imgname = image;
msg.imgnamelen = strlen( image );
msg.rid = rid;
- msg.use_server_provided_alts = true;
+ msg.use_server_provided_alts = doLearnNewServers;
msg.read_ahead_kb = readAhead;
- if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) {
+ if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) >= 0 ) {
openDevices[i] = uid;
printf( "Device %s now occupied by %d\n", dev, uid );
return dev;
@@ -584,7 +664,6 @@ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int re
static int dnbd3_daemon_send(int argc, char **argv)
{
- const int uid = getuid();
int s, i, len;
struct sockaddr_un remote;
char buffer[SOCK_BUFFER];
@@ -604,7 +683,6 @@ static int dnbd3_daemon_send(int argc, char **argv)
// (Re)build argument string into a single one, arguments separated by null chars
char *pos = buffer;
char *end = buffer + SOCK_BUFFER;
- pos += snprintf( pos, end - pos, "--user%c%d", (int)'\0', uid ) + 1;
for (i = 1; i < argc && pos < end; ++i) {
pos += snprintf( pos, end - pos, "%s", argv[i] ) + 1;
}
@@ -643,28 +721,28 @@ static int dnbd3_daemon_send(int argc, char **argv)
static void dnbd3_print_help(char *argv_0)
{
- printf( "Version: %s\n\n", VERSION_STRING );
- printf( "\nUsage: %s\n"
- "\t-h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n", argv_0 );
- printf( "Start the DNBD3 client.\n" );
- //printf("-f or --file \t\t Configuration file (default /etc/dnbd3-client.conf)\n");
- printf( "-h or --host \t\t Host running dnbd3-server.\n" );
+ printf( "Usage: %s\n", argv_0 );
+ printf( " -h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n" );
+ printf( "Start the DNBD3 client.\n\n" );
+ printf( "-h or --host \t\t List of space separated hosts to use.\n" );
printf( "-i or --image \t\t Image name of exported image.\n" );
printf( "-r or --rid \t\t Release-ID of exported image (default 0, latest).\n" );
printf( "-d or --device \t\t DNBD3 device name.\n" );
printf( "-a or --ahead \t\t Read ahead in KByte (default %i).\n", DEFAULT_READ_AHEAD_KB );
printf( "-c or --close \t\t Disconnect and close device.\n" );
printf( "-s or --switch \t\t Switch dnbd3-server on device (DEBUG).\n" );
+ printf( "-S or --sticky \t\t Use only servers from command line (no learning from servers)\n" );
+ printf( "-A or --add \t\t Add given dnbd3-server on device.\n");
+ printf( "-R or --remove \t\t Remove given dnbd3-server on device.\n");
printf( "-H or --help \t\t Show this help text and quit.\n" );
printf( "-V or --version \t Show version and quit.\n\n" );
- printf( "\t--daemon \t Run as helper daemon\n" );
- printf( "\t--kill \t Kill running helper daemon\n" );
+ printf( " --daemon \t\t Run as helper daemon\n" );
+ printf( " --kill \t\t Kill running helper daemon\n\n" );
printf( "The helper daemon makes it possible for normal users to connect dnbd3 devices.\n" );
- printf( "The client binary needs to be a setuid program for this to work!\n\n" );
}
void dnbd3_print_version()
{
- printf( "Version: %s\n", VERSION_STRING );
- exit( EXIT_SUCCESS );
+ printf( "dnbd3-client version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
}
diff --git a/src/clientconfig.h b/src/clientconfig.h
deleted file mode 100644
index f35f673..0000000
--- a/src/clientconfig.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _CLIENTCONFIG_H_
-#define _CLIENTCONFIG_H_
-
-// Which is the minimum protocol version the client expects from the server
-#define MIN_SUPPORTED_SERVER 2
-
-// in seconds if not stated otherwise (MS = milliseconds)
-#define SOCKET_TIMEOUT_CLIENT_DATA 2
-#define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1
-
-#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
-#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
-#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
-// This must be a power of two:
-#define RTT_BLOCK_SIZE 4096
-
-#define STARTUP_MODE_DURATION 30
-// Interval of several repeating tasks (in seconds)
-#define TIMER_INTERVAL_PROBE_STARTUP 4
-#define TIMER_INTERVAL_PROBE_NORMAL 22
-#define TIMER_INTERVAL_PROBE_PANIC 2
-#define TIMER_INTERVAL_KEEPALIVE_PACKET 6
-
-// Expect a keepalive response every X seconds
-#define SOCKET_KEEPALIVE_TIMEOUT 8
-
-// Number of unsuccessful alt_server probes before read errors are reported to the block layer
-// (ALL servers will be probed this many times)
-// Set to 0 to disable
-#define PROBE_COUNT_TIMEOUT 0
-
-// ++ Kernel module ++
-#define DEFAULT_READ_AHEAD_KB 512
-#define NUMBER_DEVICES 8
-
-#endif
diff --git a/src/fuse/CMakeLists.txt b/src/fuse/CMakeLists.txt
new file mode 100644
index 0000000..be062f0
--- /dev/null
+++ b/src/fuse/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-fuse
+ LANGUAGES C)
+
+find_package(Fuse REQUIRED)
+
+# find atomic library required by dnbd3-fuse
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to enable enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_FUSE_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+set(DNBD3_FUSE_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h)
+
+add_executable(dnbd3-fuse ${DNBD3_FUSE_SOURCE_FILES})
+target_include_directories(dnbd3-fuse PRIVATE ${FUSE_INCLUDE_DIRS})
+target_link_libraries(dnbd3-fuse dnbd3-build dnbd3-version dnbd3-shared ${FUSE_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS dnbd3-fuse RUNTIME DESTINATION bin
+ COMPONENT fuse)
+
+add_linter(dnbd3-fuse-lint "${DNBD3_FUSE_SOURCE_FILES}" "${DNBD3_FUSE_HEADER_FILES}")
+add_linter_fix(dnbd3-fuse-lint-fix "${DNBD3_FUSE_SOURCE_FILES}" "${DNBD3_FUSE_HEADER_FILES}")
diff --git a/src/fuse/connection.c b/src/fuse/connection.c
index 98b1d36..e760d98 100644
--- a/src/fuse/connection.c
+++ b/src/fuse/connection.c
@@ -1,19 +1,21 @@
#include "connection.h"
#include "helper.h"
-#include "../clientconfig.h"
-#include "../shared/protocol.h"
-#include "../shared/fdsignal.h"
-#include "../shared/sockhelper.h"
-#include "../shared/log.h"
+#include <dnbd3/config/client.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <stdio.h>
+#include <stdatomic.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <inttypes.h>
+#include <signal.h>
/* Constants */
static const size_t SHORTBUF = 100;
@@ -30,9 +32,18 @@ static const int FAIL_BACKOFF_START_COUNT = 8;
static bool connectionInitDone = false;
static bool threadInitDone = false;
static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER;
-static bool keepRunning = true;
+// For multi-threaded concurrent connection during init
+static pthread_mutex_t mutexCondConn = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t condConn = PTHREAD_COND_INITIALIZER;
+static atomic_int pendingConnectionAttempts = 0;
+// Shutdown flag
+atomic_bool keepRunning = true;
+// Should we learn new alt-servers from servers we connect to?
static bool learnNewServers;
+static pthread_t tidReceiver;
+static pthread_t tidBackground;
+
// List of pending requests
static struct {
dnbd3_async_t *head;
@@ -55,15 +66,21 @@ static struct {
ticks startupTime;
} connection;
+struct conn_data {
+ char *lowerImage;
+ uint16_t rid;
+ int idx;
+};
+
// Known alt servers
typedef struct _alt_server {
dnbd3_host_t host;
- int consecutiveFails;
- int rtt;
+ atomic_int consecutiveFails;
+ atomic_int rtt;
int rtts[RTT_COUNT];
int rttIndex;
- int bestCount;
- int liveRtt;
+ atomic_int bestCount;
+ atomic_int liveRtt;
} alt_server_t;
static dnbd3_server_entry_t newservers[MAX_ALTS];
@@ -83,136 +100,232 @@ static pthread_rwlock_t altLock = PTHREAD_RWLOCK_INITIALIZER;
/* Static methods */
-static void* connection_receiveThreadMain(void *sock);
-static void* connection_backgroundThread(void *something);
+static void* connectThread(void * data);
+static void* connection_receiveThreadMain( void *sock );
+static void* connection_backgroundThread( void *something );
-static void addAltServers();
+static bool hasAltServer( dnbd3_host_t *host );
+static void addAltServers( void );
static void sortAltServers();
static void probeAltServers();
-static void switchConnection(int sockFd, alt_server_t *srv);
-static void requestAltServers();
-static bool throwDataAway(int sockFd, uint32_t amount);
+static void switchConnection( int sockFd, alt_server_t *srv );
+static void requestAltServers( void );
+static bool sendAltServerRequest( int sock );
+static bool throwDataAway( int sockFd, uint32_t amount );
+
+static void enqueueRequest( dnbd3_async_t *request );
+static dnbd3_async_t* removeRequest( dnbd3_async_t *request );
-static void enqueueRequest(dnbd3_async_t *request);
-static dnbd3_async_t* removeRequest(dnbd3_async_t *request);
+static void blockSignals();
-bool connection_init(const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew)
+bool connection_init( const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew )
{
- int sock = -1;
char host[SHORTBUF];
- size_t hlen;
- serialized_buffer_t buffer;
- uint16_t remoteVersion, remoteRid;
- char *remoteName;
- uint64_t remoteSize;
- struct sockaddr_storage sa;
- socklen_t salen;
- poll_list_t *cons = sock_newPollList();
+ dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+ const char *current, *end;
+ int altIndex = 0;
timing_setBase();
pthread_mutex_lock( &mutexInit );
- if ( !connectionInitDone && keepRunning ) {
- dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
- const char *current, *end;
- int altIndex = 0;
- learnNewServers = doLearnNew;
- memset( altservers, 0, sizeof altservers );
- connection.sockFd = -1;
- current = hosts;
- do {
- // Get next host from string
- while ( *current == ' ' ) current++;
- end = strchr( current, ' ' );
- size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
- if ( len > SHORTBUF ) len = SHORTBUF;
- snprintf( host, len, "%s", current );
- int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
- for ( int i = 0; i < newHosts; ++i ) {
- if ( altIndex >= MAX_ALTS )
- break;
- altservers[altIndex].host = tempHosts[i];
- altIndex += 1;
- }
- current = end + 1;
- } while ( end != NULL && altIndex < MAX_ALTS );
- logadd( LOG_INFO, "Got %d servers from init call", altIndex );
- // Connect
- for ( int i = 0; i < altIndex + 5; ++i ) {
- if ( i >= altIndex ) {
- // Additional iteration - no corresponding slot in altservers, this
- // is just so we can make a final calls with longer timeout
- sock = sock_multiConnect( cons, NULL, 400, 3000 );
- if ( sock == -2 ) {
- logadd( LOG_ERROR, "Could not connect to any host" );
- sock = -1;
- break;
- }
- } else {
- if ( altservers[i].host.type == 0 )
- continue;
- // Try to connect - 100ms timeout
- sock = sock_multiConnect( cons, &altservers[i].host, 100, 3000 );
- }
- if ( sock == -2 || sock == -1 )
- continue;
- salen = sizeof(sa);
- if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
- logadd( LOG_ERROR, "getpeername on successful connection failed!? (errno=%d)", errno );
- close( sock );
- sock = -1;
+ if ( connectionInitDone ) {
+ pthread_mutex_unlock( &mutexInit );
+ return false;
+ }
+ learnNewServers = doLearnNew;
+ memset( altservers, 0, sizeof altservers );
+ connection.sockFd = -1;
+ current = hosts;
+ pthread_attr_t threadAttrs;
+ pthread_attr_init( &threadAttrs );
+ pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
+ // Resolve all hosts and connect
+ pthread_mutex_lock( &mutexCondConn );
+ do {
+ // Get next host from string
+ while ( *current == ' ' || *current == '\t' || *current == '\n' ) {
+ current++;
+ }
+ end = current;
+ while ( *end != ' ' && *end != '\t' && *end != '\n' && *end != '\0' ) {
+ end++;
+ }
+ if ( end == current )
+ break;
+ size_t len = (size_t)( end - current ) + 1;
+ if ( len > SHORTBUF ) {
+ len = SHORTBUF;
+ }
+ snprintf( host, len, "%s", current );
+ int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+ for ( int i = 0; i < newHosts; ++i ) {
+ if ( altIndex >= MAX_ALTS )
+ break;
+ if ( hasAltServer( &tempHosts[i] ) )
continue;
- }
- hlen = sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
- logadd( LOG_INFO, "Connected to %.*s", (int)hlen, host );
- if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
- logadd( LOG_ERROR, "Could not send select image" );
- } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
- logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
- } else if ( rid != 0 && rid != remoteRid ) {
- logadd( LOG_ERROR, "rid mismatch (want: %d, got: %d)", (int)rid, (int)remoteRid );
- } else {
- logadd( LOG_INFO, "Requested: '%s:%d'", lowerImage, (int)rid );
- logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
- sock_setTimeout( sock, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
- image.name = strdup( remoteName );
- image.rid = remoteRid;
- image.size = remoteSize;
- if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &connection.currentServer ) ) {
- logadd( LOG_ERROR, "sockaddr to dnbd3_host_t failed!?" );
- connection.currentServer.type = 0;
+ altservers[altIndex].host = tempHosts[i];
+ // Start thread for async connect if not connected yet
+ atomic_thread_fence( memory_order_acquire );
+ if ( connection.sockFd == -1 ) {
+ pthread_t t;
+ struct conn_data *cd = malloc( sizeof(*cd) );
+ // We cannot be sure a thread is taking longer than this function runs, so better copy
+ cd->lowerImage = strdup( lowerImage );
+ cd->rid = rid;
+ cd->idx = altIndex;
+ pendingConnectionAttempts++;
+ if ( ( errno = pthread_create( &t, &threadAttrs, &connectThread, (void*)cd ) ) != 0 ) {
+ pendingConnectionAttempts--;
+ logadd( LOG_ERROR, "Could not create connect thread %d, errno=%d", cd->idx, errno );
+ free( cd->lowerImage );
+ free( cd );
+ continue;
}
- connection.panicSignal = signal_new();
- timing_get( &connection.startupTime );
- connection.sockFd = sock;
- requests.head = NULL;
- requests.tail = NULL;
- requestAltServers();
- break;
- }
- // Failed
- if ( sock != -1 ) {
- close( sock );
- sock = -1;
+ struct timespec timeout;
+ clock_gettime( CLOCK_REALTIME, &timeout );
+ timeout.tv_nsec += 200 * 1000 * 1000;
+ if ( timeout.tv_nsec >= 1000 * 1000 * 1000 ) {
+ timeout.tv_nsec -= 1000 * 1000 * 1000;
+ timeout.tv_sec += 1;
+ }
+ pthread_cond_timedwait( &condConn, &mutexCondConn, &timeout );
}
+ // End async connect
+ altIndex += 1;
}
- if ( sock != -1 ) {
- connectionInitDone = true;
- }
+ current = end + 1;
+ } while ( *end != '\0' && altIndex < MAX_ALTS );
+ logadd( LOG_INFO, "Got %d servers from init call", altIndex );
+ // Wait a maximum of five seconds if we're not connected yet
+ if ( connection.sockFd == -1 && pendingConnectionAttempts > 0 ) {
+ struct timespec end;
+ clock_gettime( CLOCK_REALTIME, &end );
+ end.tv_sec += 5;
+ pthread_cond_timedwait( &condConn, &mutexCondConn, &end );
+ }
+ pthread_mutex_unlock( &mutexCondConn );
+ pthread_attr_destroy( &threadAttrs );
+ if ( connection.sockFd != -1 ) {
+ connectionInitDone = true;
}
pthread_mutex_unlock( &mutexInit );
- sock_destroyPollList( cons );
- return sock != -1;
+ return connectionInitDone;
+}
+
+static void* connectThread(void * data)
+{
+ struct conn_data *cd = (struct conn_data*)data;
+ int idx = cd->idx;
+ int sock = -1;
+ serialized_buffer_t buffer;
+ uint16_t remoteVersion, remoteRid;
+ char *remoteName;
+ uint64_t remoteSize;
+ char host[SHORTBUF];
+ struct sockaddr_storage sa;
+ socklen_t salen = sizeof(sa);
+
+ if ( idx < 0 || idx >= MAX_ALTS || altservers[idx].host.type == 0 ) {
+ logadd( LOG_ERROR, "BUG: Index out of range, or empty server in connect thread (%d)", idx );
+ goto bailout;
+ }
+
+ sock_printHost( &altservers[idx].host, host, sizeof(host) );
+ logadd( LOG_INFO, "Trying to connect to %s", host );
+ sock = sock_connect( &altservers[idx].host, 1500, SOCKET_TIMEOUT_RECV * 1000 );
+ if ( sock == -1 ) {
+ logadd( LOG_INFO, "[%s] Connection failed", host );
+ goto bailout;
+ }
+
+ salen = sizeof( sa );
+ if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
+ logadd( LOG_ERROR, "[%s] getpeername on successful connection failed!? (errno=%d)", host, errno );
+ goto bailout;
+ }
+ atomic_thread_fence( memory_order_acquire );
+ if ( connection.sockFd != -1 )
+ goto bailout;
+
+ sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
+ logadd( LOG_INFO, "[%s] Connected", host );
+ if ( !dnbd3_select_image( sock, cd->lowerImage, cd->rid, 0 ) ) {
+ logadd( LOG_ERROR, "[%s] Could not send select image", host );
+ goto bailout;
+ }
+
+ if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+ logadd( LOG_ERROR, "[%s] Could not read select image reply (%d)", host, errno );
+ goto bailout;
+ }
+ atomic_thread_fence( memory_order_acquire );
+ if ( connection.sockFd != -1 )
+ goto bailout;
+
+ if ( cd->rid != 0 && cd->rid != remoteRid ) {
+ logadd( LOG_ERROR, "[%s] rid mismatch (want: %d, got: %d)",
+ host, (int)cd->rid, (int)remoteRid );
+ goto bailout;
+ }
+ // Seems we got a winner
+ pthread_mutex_lock( &mutexCondConn );
+ if ( connection.sockFd != -1 || connectionInitDone ) {
+ pthread_mutex_unlock( &mutexCondConn );
+ logadd( LOG_INFO, "[%s] Raced by other connection", host );
+ goto bailout;
+ }
+ logadd( LOG_INFO, "Requested: '%s:%d'", cd->lowerImage, (int)cd->rid );
+ logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
+ image.name = strdup( remoteName );
+ image.rid = remoteRid;
+ image.size = remoteSize;
+ connection.currentServer = altservers[idx].host;
+ connection.panicSignal = signal_new();
+ timing_get( &connection.startupTime );
+ requests.head = NULL;
+ requests.tail = NULL;
+ if ( learnNewServers && !sendAltServerRequest( sock ) )
+ goto bailout;
+ // Everything good, tell main connect function
+ connection.sockFd = sock;
+ atomic_thread_fence( memory_order_release );
+ pendingConnectionAttempts--;
+ if ( idx != 0 ) {
+ // Make server first in list - enough to swap host, other data has not changed yet
+ lock_write( &altLock );
+ dnbd3_host_t tmp = altservers[idx].host;
+ altservers[idx].host = altservers[0].host;
+ altservers[0].host = tmp;
+ unlock_rw( &altLock );
+ }
+ pthread_cond_signal( &condConn );
+ pthread_mutex_unlock( &mutexCondConn );
+ return NULL;
+
+bailout:
+ if ( sock != -1 ) {
+ close( sock );
+ }
+ free( cd->lowerImage );
+ free( cd );
+ // Last one has to wake up main thread, which is waiting for up to 5 seconds for
+ // any connect thread to succeed. If none succeeded, there is no point in waiting
+ // any longer.
+ if ( --pendingConnectionAttempts == 0 ) {
+ pthread_mutex_lock( &mutexCondConn );
+ pthread_cond_signal( &condConn );
+ pthread_mutex_unlock( &mutexCondConn );
+ }
+ return NULL;
}
bool connection_initThreads()
{
pthread_mutex_lock( &mutexInit );
- if ( !keepRunning || !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
+ if ( !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
pthread_mutex_unlock( &mutexInit );
return false;
}
bool success = true;
- pthread_t thread;
threadInitDone = true;
logadd( LOG_DEBUG1, "Initializing stuff" );
if ( pthread_mutex_init( &connection.sendMutex, NULL ) != 0
@@ -220,10 +333,10 @@ bool connection_initThreads()
logadd( LOG_ERROR, "Mutex or spinlock init failure" );
success = false;
} else {
- if ( pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)connection.sockFd ) != 0 ) {
+ if ( pthread_create( &tidReceiver, NULL, &connection_receiveThreadMain, ( void* )(size_t)connection.sockFd ) != 0 ) {
logadd( LOG_ERROR, "Could not create receive thread" );
success = false;
- } else if ( pthread_create( &thread, NULL, &connection_backgroundThread, NULL ) != 0 ) {
+ } else if ( pthread_create( &tidBackground, NULL, &connection_backgroundThread, NULL ) != 0 ) {
logadd( LOG_ERROR, "Could not create background thread" );
success = false;
}
@@ -241,7 +354,7 @@ uint64_t connection_getImageSize()
return image.size;
}
-bool connection_read(dnbd3_async_t *request)
+bool connection_read( dnbd3_async_t *request )
{
if ( !connectionInitDone ) return false;
pthread_mutex_lock( &connection.sendMutex );
@@ -250,9 +363,7 @@ bool connection_read(dnbd3_async_t *request)
if ( !dnbd3_get_block( connection.sockFd, request->offset, request->length, (uint64_t)request, 0 ) ) {
shutdown( connection.sockFd, SHUT_RDWR );
connection.sockFd = -1;
- pthread_mutex_unlock( &connection.sendMutex );
signal_call( connection.panicSignal );
- return true;
}
}
pthread_mutex_unlock( &connection.sendMutex );
@@ -261,24 +372,36 @@ bool connection_read(dnbd3_async_t *request)
void connection_close()
{
- if ( keepRunning ) {
- logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
- }
+ static bool signalled = false;
+ logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
pthread_mutex_lock( &mutexInit );
keepRunning = false;
+ if ( threadInitDone && !signalled ) {
+ signalled = true;
+ pthread_kill( tidReceiver, SIGHUP );
+ pthread_kill( tidBackground, SIGHUP );
+ }
+ pthread_mutex_unlock( &mutexInit );
if ( !connectionInitDone ) {
- pthread_mutex_unlock( &mutexInit );
return;
}
- pthread_mutex_unlock( &mutexInit );
pthread_mutex_lock( &connection.sendMutex );
if ( connection.sockFd != -1 ) {
+ logadd( LOG_DEBUG1, "Shutting down socket..." );
shutdown( connection.sockFd, SHUT_RDWR );
}
pthread_mutex_unlock( &connection.sendMutex );
}
-size_t connection_printStats(char *buffer, const size_t len)
+void connection_join()
+{
+ if ( !threadInitDone )
+ return;
+ pthread_join( tidReceiver, NULL );
+ pthread_join( tidBackground, NULL );
+}
+
+size_t connection_printStats( char *buffer, const size_t len )
{
int ret;
size_t remaining = len;
@@ -308,7 +431,7 @@ size_t connection_printStats(char *buffer, const size_t len)
*buffer++ = ' ';
}
const size_t addrlen = sock_printHost( &altservers[i].host, buffer, remaining );
- remaining -= (addrlen + 1); // For space or * above
+ remaining -= ( addrlen + 1 ); // For space or * above
buffer += addrlen;
if ( remaining < 3 )
break;
@@ -324,7 +447,7 @@ size_t connection_printStats(char *buffer, const size_t len)
width += 3;
}
ret = snprintf( buffer, remaining, "% *d %s Unreachable:% 5d BestCount:% 5d Live:% 5dµs\n",
- width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
+ width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
if ( ret < 0 ) {
ret = 0;
}
@@ -339,23 +462,23 @@ size_t connection_printStats(char *buffer, const size_t len)
return len - remaining;
}
-static void* connection_receiveThreadMain(void *sockPtr)
+static void* connection_receiveThreadMain( void *sockPtr )
{
int sockFd = (int)(size_t)sockPtr;
dnbd3_reply_t reply;
- pthread_detach( pthread_self() );
+ blockSignals();
while ( keepRunning ) {
int ret;
do {
ret = dnbd3_read_reply( sockFd, &reply, true );
+ if ( !keepRunning ) goto fail;
if ( ret == REPLY_OK ) break;
} while ( ret == REPLY_INTR || ret == REPLY_AGAIN );
if ( ret != REPLY_OK ) {
logadd( LOG_DEBUG1, "Error receiving reply on receiveThread (%d)", ret );
goto fail;
}
-
if ( reply.cmd == CMD_GET_BLOCK ) {
// Get block reply. find matching request
dnbd3_async_t *request = removeRequest( (dnbd3_async_t*)reply.handle );
@@ -390,10 +513,8 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
unlock_rw( &altLock );
}
- // Success, wake up caller
- request->success = true;
- request->finished = true;
- signal_call( request->signal );
+ fuse_reply_buf( request->fuse_req, request->buffer, request->length );
+ free( request );
}
} else if ( reply.cmd == CMD_GET_SERVERS ) {
// List of known alt servers
@@ -416,7 +537,6 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
}
}
- logadd( LOG_DEBUG1, "Aus der Schleife rausgeflogen! ARRRRRRRRRR" );
fail:;
// Make sure noone is trying to use the socket for sending by locking,
pthread_mutex_lock( &connection.sendMutex );
@@ -424,7 +544,9 @@ fail:;
// as someone could have established a new connection already
if ( connection.sockFd == sockFd ) {
connection.sockFd = -1;
- signal_call( connection.panicSignal );
+ if ( keepRunning ) {
+ signal_call( connection.panicSignal );
+ }
}
pthread_mutex_unlock( &connection.sendMutex );
// As we're the only reader, it's safe to close the socket now
@@ -432,11 +554,12 @@ fail:;
return NULL;
}
-static void* connection_backgroundThread(void *something UNUSED)
+static void* connection_backgroundThread( void *something UNUSED )
{
ticks nextKeepalive;
ticks nextRttCheck;
+ blockSignals();
timing_get( &nextKeepalive );
nextRttCheck = nextKeepalive;
while ( keepRunning ) {
@@ -446,6 +569,8 @@ static void* connection_backgroundThread(void *something UNUSED)
uint32_t wt2 = timing_diffMs( &now, &nextRttCheck );
if ( wt1 > 0 && wt2 > 0 ) {
int waitRes = signal_wait( connection.panicSignal, (int)MIN( wt1, wt2 ) + 1 );
+ if ( !keepRunning )
+ break;
if ( waitRes == SIGNAL_ERROR ) {
logadd( LOG_WARNING, "Error waiting on signal in background thread! Errno = %d", errno );
}
@@ -460,20 +585,20 @@ static void* connection_backgroundThread(void *something UNUSED)
}
sortAltServers();
probeAltServers();
- if ( panic || timing_diff( &connection.startupTime, &now ) <= STARTUP_MODE_DURATION ) {
+ if ( panic || timing_diff( &connection.startupTime, &now ) <= DISCOVER_STARTUP_PHASE_COUNT * TIMER_INTERVAL_PROBE_STARTUP ) {
timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_STARTUP );
} else {
- timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_NORMAL );
+ timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_MAX );
}
}
// Send keepalive packet
if ( timing_reachedPrecise( &nextKeepalive, &now ) ) {
pthread_mutex_lock( &connection.sendMutex );
if ( connection.sockFd != -1 ) {
- dnbd3_request_t request;
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_KEEPALIVE;
- request.handle = request.offset = request.size = 0;
+ dnbd3_request_t request = {
+ .magic = dnbd3_packet_magic,
+ .cmd = CMD_KEEPALIVE,
+ };
fixup_request( request );
ssize_t ret = sock_sendAll( connection.sockFd, &request, sizeof request, 2 );
if ( (size_t)ret != sizeof request ) {
@@ -483,7 +608,7 @@ static void* connection_backgroundThread(void *something UNUSED)
}
}
pthread_mutex_unlock( &connection.sendMutex );
- timing_addSeconds( &nextKeepalive, &now, TIMER_INTERVAL_KEEPALIVE_PACKET );
+ timing_addSeconds( &nextKeepalive, &now, KEEPALIVE_INTERVAL );
}
}
return NULL;
@@ -491,7 +616,20 @@ static void* connection_backgroundThread(void *something UNUSED)
// Private quick helpers
-static void addAltServers()
+/**
+ * Report whether 'host' has the same address as any entry of the altservers
+ * table (isSameAddress: port is not compared). Caller must hold 'altLock'.
+ */
+static bool hasAltServer( dnbd3_host_t *host )
+{
+	bool known = false;
+	for ( int slot = 0; !known && slot < MAX_ALTS; ++slot ) {
+		known = isSameAddress( host, &altservers[slot].host );
+	}
+	return known;
+}
+
+static void addAltServers( void )
{
pthread_mutex_lock( &newAltLock );
lock_write( &altLock );
@@ -499,11 +637,8 @@ static void addAltServers()
if ( newservers[nIdx].host.type == 0 )
continue;
// Got a new alt server, see if it's already known
- for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
- if ( isSameAddress( &newservers[nIdx].host, &altservers[eIdx].host ) ) {
- goto skip_server;
- }
- }
+ if ( hasAltServer( &newservers[nIdx].host ) )
+ continue;
// Not known yet, add - find free slot
int slot = -1;
for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
@@ -528,9 +663,8 @@ static void addAltServers()
altservers[slot].host = newservers[nIdx].host;
altservers[slot].liveRtt = 0;
}
-skip_server:;
}
- memset( newservers, 0, sizeof(newservers) );
+ memset( newservers, 0, sizeof( newservers ) );
unlock_rw( &altLock );
pthread_mutex_unlock( &newAltLock );
}
@@ -604,7 +738,7 @@ static void probeAltServers()
pthread_spin_lock( &requests.lock );
if ( requests.head != NULL ) {
if ( !panic && current != NULL ) {
- const int maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
+ const uint64_t maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
dnbd3_async_t *iterator;
for ( iterator = requests.head; iterator != NULL; iterator = iterator->next ) {
// A request with measurement tag is pending
@@ -626,7 +760,7 @@ static void probeAltServers()
}
lock_read( &altLock );
- for ( int altIndex = 0; altIndex < (panic ? MAX_ALTS : MAX_ALTS_ACTIVE); ++altIndex ) {
+ for ( int altIndex = 0; altIndex < ( panic ? MAX_ALTS : MAX_ALTS_ACTIVE ); ++altIndex ) {
alt_server_t * const srv = &altservers[altIndex];
if ( srv->host.type == 0 )
continue;
@@ -634,65 +768,65 @@ static void probeAltServers()
&& rand() % srv->consecutiveFails >= FAIL_BACKOFF_START_COUNT ) {
continue;
}
+ srv->rttIndex += 1;
if ( srv->rttIndex >= RTT_COUNT ) {
srv->rttIndex = 0;
- } else {
- srv->rttIndex += 1;
}
// Probe
+ char hstr[100];
+ sock_printHost( &srv->host, hstr, 100 );
ticks start;
timing_get( &start );
errno = 0;
int sock = sock_connect( &srv->host, panic ? 1000 : 333, 1000 );
if ( sock == -1 ) {
- logadd( LOG_DEBUG1, "Could not connect for probing. errno = %d", errno );
+ logadd( LOG_DEBUG1, "%s probe: Could not connect for probing. errno = %d", hstr, errno );
goto fail;
}
if ( !dnbd3_select_image( sock, image.name, image.rid, 0 ) ) {
- logadd( LOG_DEBUG1, "probe: select_image failed" );
+ logadd( LOG_DEBUG1, "%s probe: select_image failed (sock=%d, errno=%d)", hstr, sock, errno );
goto fail;
}
- if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize )) {
- logadd( LOG_DEBUG1, "probe: select image reply failed" );
+ if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize ) ) {
+ logadd( LOG_DEBUG1, "%s probe: select image reply failed", hstr );
goto fail;
}
if ( remoteProto < MIN_SUPPORTED_SERVER ) {
- logadd( LOG_WARNING, "Unsupported remote version (local: %d, remote: %d)", (int)PROTOCOL_VERSION, (int)remoteProto );
+ logadd( LOG_WARNING, "%s probe: Unsupported remote version (local: %d, remote: %d)", hstr, (int)PROTOCOL_VERSION, (int)remoteProto );
srv->consecutiveFails += 10;
goto fail;
}
if ( remoteRid != image.rid || strcmp( remoteName, image.name ) != 0 ) {
- logadd( LOG_WARNING, "Remote rid or name mismatch (got '%s')", remoteName );
+ logadd( LOG_WARNING, "%s probe: Remote rid or name mismatch (got '%s')", hstr, remoteName );
srv->consecutiveFails += 10;
goto fail;
}
if ( !dnbd3_get_block( sock, testOffset, testLength, 0, 0 ) ) {
- logadd( LOG_DEBUG1, "-> block request fail" );
+ logadd( LOG_DEBUG1, "%s probe: -> block request fail", hstr );
goto fail;
}
int a = 111;
- if ( !(a = dnbd3_get_reply( sock, &reply )) || reply.size != testLength ) {
- logadd( LOG_DEBUG1, "<- get block reply fail %d %d", a, (int)reply.size );
+ if ( !( a = dnbd3_get_reply( sock, &reply ) ) || reply.size != testLength ) {
+ logadd( LOG_DEBUG1, "%s probe: <- get block reply fail %d %d", hstr, a, (int)reply.size );
goto fail;
}
if ( request != NULL && removeRequest( request ) != NULL ) {
// Request successfully removed from queue
const ssize_t ret = sock_recv( sock, request->buffer, request->length );
if ( ret != (ssize_t)request->length ) {
- logadd( LOG_DEBUG1, "[RTT] receiving payload for a block reply failed" );
+ logadd( LOG_DEBUG1, "%s probe: receiving payload for a block reply failed", hstr );
// Failure, add to queue again
connection_read( request );
goto fail;
}
- // Success, wake up caller
- logadd( LOG_DEBUG1, "[RTT] Successful direct probe" );
- request->success = true;
- request->finished = true;
- signal_call( request->signal );
+ // Success, reply to fuse
+ fuse_reply_buf( request->fuse_req, request->buffer, request->length );
+ free( request );
+ logadd( LOG_DEBUG1, "%s probe: Successful direct probe", hstr );
} else {
// Wasn't a request that's in our request queue
if ( !throwDataAway( sock, testLength ) ) {
- logadd( LOG_DEBUG1, "<- get block reply payload fail" );
+ logadd( LOG_DEBUG1, "%s probe: <- get block reply payload fail", hstr );
goto fail;
}
}
@@ -701,7 +835,7 @@ static void probeAltServers()
// Panic mode? Just switch to server
if ( panic ) {
unlock_rw( &altLock );
- switchConnection( sock, srv );
+ if ( keepRunning ) switchConnection( sock, srv );
return;
}
// Non-panic mode:
@@ -733,7 +867,8 @@ static void probeAltServers()
close( sock );
}
continue;
-fail:;
+fail:
+ ;
if ( sock != -1 ) {
close( sock );
}
@@ -774,7 +909,7 @@ fail:;
// Regular logic: Apply threshold when considering switch
if ( !doSwitch && current != NULL ) {
doSwitch = current->rtt > best->rtt + RTT_ABSOLUTE_THRESHOLD
- || RTT_THRESHOLD_FACTOR(current->rtt) > best->rtt + 1000;
+ || RTT_THRESHOLD_FACTOR( current->rtt ) > best->rtt + 1000;
}
}
// Switch if a better server was found
@@ -796,11 +931,10 @@ fail:;
}
}
-static void switchConnection(int sockFd, alt_server_t *srv)
+static void switchConnection( int sockFd, alt_server_t *srv )
{
- pthread_t thread;
struct sockaddr_storage addr;
- socklen_t addrLen = sizeof(addr);
+ socklen_t addrLen = sizeof( addr );
char message[200] = "Connection switched to ";
const size_t len = strlen( message );
int ret;
@@ -829,9 +963,10 @@ static void switchConnection(int sockFd, alt_server_t *srv)
signal_call( connection.panicSignal );
return;
}
+ pthread_detach( tidReceiver );
timing_get( &connection.startupTime );
- pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)sockFd );
- sock_printable( (struct sockaddr*)&addr, sizeof(addr), message + len, sizeof(message) - len );
+ pthread_create( &tidReceiver, NULL, &connection_receiveThreadMain, ( void* )(size_t)sockFd );
+ sock_printable( (struct sockaddr*)&addr, sizeof( addr ), message + len, sizeof( message ) - len );
logadd( LOG_INFO, "%s", message );
// resend queue
if ( queue != NULL ) {
@@ -855,22 +990,28 @@ static void switchConnection(int sockFd, alt_server_t *srv)
/**
* Does not lock, so get the sendMutex first!
*/
-static void requestAltServers()
+static void requestAltServers( void ) // Ask the current server for its alt server list; no-op without socket or when learnNewServers is off
{
if ( connection.sockFd == -1 || !learnNewServers )
return;
- dnbd3_request_t request = { 0 };
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_GET_SERVERS;
- fixup_request( request );
- if ( sock_sendAll( connection.sockFd, &request, sizeof(request), 2 ) != (ssize_t)sizeof(request) ) {
- logadd( LOG_WARNING, "Connection failed while requesting alt server list" );
+ if ( !sendAltServerRequest( connection.sockFd ) ) { // request was not sent completely: give up on this socket
+ logadd( LOG_WARNING, "Main connection failed while requesting alt server list" );
shutdown( connection.sockFd, SHUT_RDWR );
connection.sockFd = -1;
}
}
-static bool throwDataAway(int sockFd, uint32_t amount)
+// Send a CMD_GET_SERVERS request on 'sock'; true only if the whole packet went out.
+static bool sendAltServerRequest( int sock )
+{
+	dnbd3_request_t packet = { .magic = dnbd3_packet_magic, .cmd = CMD_GET_SERVERS };
+	fixup_request( packet );
+	// Last argument (2) is handed to sock_sendAll exactly as before
+	const ssize_t sent = sock_sendAll( sock, &packet, sizeof( packet ), 2 );
+	return sent == (ssize_t)sizeof( packet );
+}
+
+static bool throwDataAway( int sockFd, uint32_t amount )
{
size_t done = 0;
char tempBuffer[SHORTBUF];
@@ -883,11 +1024,9 @@ static bool throwDataAway(int sockFd, uint32_t amount)
return true;
}
-static void enqueueRequest(dnbd3_async_t *request)
+static void enqueueRequest( dnbd3_async_t *request )
{
request->next = NULL;
- request->finished = false;
- request->success = false;
//logadd( LOG_DEBUG2, "Queue: %p @ %s : %d", request, file, line );
// Measure latency and add to switch formula
timing_get( &request->time );
@@ -901,7 +1040,7 @@ static void enqueueRequest(dnbd3_async_t *request)
pthread_spin_unlock( &requests.lock );
}
-static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
+static dnbd3_async_t* removeRequest( dnbd3_async_t *request )
{
pthread_spin_lock( &requests.lock );
//logadd( LOG_DEBUG2, "Remov: %p @ %s : %d", request, file, line );
@@ -925,3 +1064,20 @@ static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
return iterator;
}
+// Block SIGUSR1, SIGUSR2, SIGPIPE, SIGINT and SIGTERM for the calling thread; SIGHUP stays deliverable.
+static void blockSignals()
+{
+	static const int toBlock[] = { SIGUSR1, SIGUSR2, SIGPIPE, SIGINT, SIGTERM };
+	sigset_t sigmask;
+	// pthread_sigmask() reports failure as a non-zero error number, never as -1
+	if ( pthread_sigmask( 0, NULL, &sigmask ) != 0 ) {
+		logadd( LOG_WARNING, "Cannot get current sigmask of thread" );
+		sigemptyset( &sigmask );
+	}
+	for ( size_t i = 0; i < sizeof( toBlock ) / sizeof( toBlock[0] ); ++i )
+		sigaddset( &sigmask, toBlock[i] );
+	sigdelset( &sigmask, SIGHUP ); // shutdown path sends SIGHUP via pthread_kill() to the threads calling this
+	if ( pthread_sigmask( SIG_SETMASK, &sigmask, NULL ) != 0 ) {
+		logadd( LOG_WARNING, "Cannot set sigmask of thread" );
+	}
+}
diff --git a/src/fuse/connection.h b/src/fuse/connection.h
index cae554c..b869ac6 100644
--- a/src/fuse/connection.h
+++ b/src/fuse/connection.h
@@ -1,35 +1,41 @@
#ifndef _CONNECTION_H_
#define _CONNECTION_H_
-#include "../shared/fdsignal.h"
-#include "../shared/timing.h"
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/timing.h>
+#include <stdatomic.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
+#include <sys/socket.h>
+#define FUSE_USE_VERSION 30
+#include <fuse_lowlevel.h>
+
+extern atomic_bool keepRunning;
struct _dnbd3_async;
typedef struct _dnbd3_async {
struct _dnbd3_async *next; // Next in this linked list (provate field, not set by caller)
- dnbd3_signal_t* signal; // Used to signal the caller
- char* buffer; // Caller-provided buffer to be filled
ticks time; // When request was put on wire, 0 if not measuring
uint64_t offset;
uint32_t length;
- bool finished; // Will be set to true if the request has been handled
- bool success; // Will be set to true if the request succeeded
+ fuse_req_t fuse_req; // FUSE request that is answered with fuse_reply_buf() once the data has arrived
+ char buffer[]; // Must be last member! Payload area, allocated in one piece with the struct (sizeof + length)
} dnbd3_async_t;
-bool connection_init(const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers);
+bool connection_init( const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers );
bool connection_initThreads();
uint64_t connection_getImageSize();
-bool connection_read(dnbd3_async_t *request);
+bool connection_read( dnbd3_async_t *request );
void connection_close();
-size_t connection_printStats(char *buffer, const size_t len);
+void connection_join();
+
+size_t connection_printStats( char *buffer, const size_t len );
#endif /* CONNECTION_H_ */
diff --git a/src/fuse/helper.c b/src/fuse/helper.c
index d81b08f..f54073b 100644
--- a/src/fuse/helper.c
+++ b/src/fuse/helper.c
@@ -18,8 +18,8 @@ void printLog( log_info *info )
}
//rewind(file);
- fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", ( uint64_t )( info->imageSize/ ( 1024ll*1024ll ) ) );
- fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", ( uint64_t )( info->receivedBytes/ ( 1024ll*1024ll ) ) );
+ fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", (uint64_t)( info->imageSize/ ( 1024ll*1024ll ) ) );
+ fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", (uint64_t)( info->receivedBytes/ ( 1024ll*1024ll ) ) );
fprintf( logFile, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount );
fprintf( logFile, "Blocksize: 4KiB\n\n" );
fprintf( logFile, "Block access count:\n" );
@@ -29,7 +29,7 @@ void printLog( log_info *info )
if ( i % 50 == 0 ) {
fprintf( logFile, "\n" );
}
- fprintf( logFile, "%i ", ( int ) info->blockRequestCount[i] );
+ fprintf( logFile, "%i ", (int) info->blockRequestCount[i] );
}
fprintf( logFile, "\n" );
fclose( logFile );
diff --git a/src/fuse/helper.h b/src/fuse/helper.h
index 9e5d127..b1fa513 100644
--- a/src/fuse/helper.h
+++ b/src/fuse/helper.h
@@ -1,7 +1,7 @@
#ifndef IMAGEHELPER_H
#define IMAGEHELPER_H
-#include "../types.h"
+#include <dnbd3/types.h>
#include <netdb.h>
#include <stdbool.h>
@@ -18,18 +18,18 @@ typedef struct log_info {
-void printLog(log_info *info);
+void printLog( log_info *info );
-int connect_to_server(char *server_adress, int port);
+int connect_to_server( char *server_adress, int port );
-static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+static inline bool isSameAddressPort( const dnbd3_host_t * const a, const dnbd3_host_t * const b ) // true if type, port and address bytes all match
{
- return (a->type == b->type) && (a->port == b->port) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) ));
+ return ( a->type == b->type ) && ( a->port == b->port ) && ( 0 == memcmp( a->addr, b->addr, ( a->type == HOST_IP4 ? 4 : 16 ) ) ); // compares 4 address bytes for HOST_IP4, 16 for any other type
}
-static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+static inline bool isSameAddress( const dnbd3_host_t * const a, const dnbd3_host_t * const b ) // true if type and address bytes match; the port is not looked at
{
- return (a->type == b->type) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) ));
+ return ( a->type == b->type ) && ( 0 == memcmp( a->addr, b->addr, ( a->type == HOST_IP4 ? 4 : 16 ) ) ); // compares 4 address bytes for HOST_IP4, 16 for any other type
}
#endif
diff --git a/src/fuse/main.c b/src/fuse/main.c
index 1a5643c..e06f6e8 100644
--- a/src/fuse/main.c
+++ b/src/fuse/main.c
@@ -5,18 +5,26 @@
* See the file COPYING.
*
* Changed by Stephan Schwaer
+ * FUSE lowlevel by Alan Reichert
* */
#include "connection.h"
#include "helper.h"
-#include "../shared/protocol.h"
-#include "../shared/log.h"
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/log.h>
#define FUSE_USE_VERSION 30
-#include <fuse.h>
+#include <dnbd3/config.h>
+#include <fuse_lowlevel.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
/* for printing uint */
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
@@ -27,8 +35,14 @@
#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
-static const char * const IMAGE_PATH = "/img";
-static const char * const STATS_PATH = "/status";
+#define INO_ROOT (1)
+#define INO_STATS (2)
+#define INO_IMAGE (3)
+
+static const char *IMAGE_NAME = "img";
+static const char *STATS_NAME = "status";
+
+static struct fuse_session *_fuseSession = NULL;
static uint64_t imageSize;
/* Debug/Benchmark variables */
@@ -36,237 +50,243 @@ static bool useDebug = false;
static log_info logInfo;
static struct timespec startupTime;
static uid_t owner;
-static bool keepRunning = true;
-static void (*fuse_sigIntHandler)(int) = NULL;
-static void (*fuse_sigTermHandler)(int) = NULL;
-static struct fuse_operations dnbd3_fuse_no_operations;
-
-#define SIGPOOLSIZE 6
-static pthread_spinlock_t sigLock;
-static dnbd3_signal_t *signalPool[SIGPOOLSIZE];
-static dnbd3_signal_t **sigEnd = signalPool + SIGPOOLSIZE;
-static void signalInit()
-{
- pthread_spin_init( &sigLock, PTHREAD_PROCESS_PRIVATE );
- for ( size_t i = 0; i < SIGPOOLSIZE; ++i ) {
- signalPool[i] = NULL;
- }
-}
-static inline dnbd3_signal_t *signalGet()
-{
- pthread_spin_lock( &sigLock );
- for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) {
- if ( *it != NULL ) {
- dnbd3_signal_t *ret = *it;
- *it = NULL;
- pthread_spin_unlock( &sigLock );
- return ret;
- }
- }
- pthread_spin_unlock( &sigLock );
- return signal_newBlocking();
-}
-static inline void signalPut(dnbd3_signal_t *signal)
-{
- pthread_spin_lock( &sigLock );
- for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) {
- if ( *it == NULL ) {
- *it = signal;
- pthread_spin_unlock( &sigLock );
- return;
- }
- }
- pthread_spin_unlock( &sigLock );
- signal_close( signal );
-}
-static int image_getattr(const char *path, struct stat *stbuf)
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize );
+static void fillStatsFile( fuse_req_t req, size_t size, off_t offset );
+static void image_destroy( void *private_data );
+static void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+static void image_ll_init( void *userdata, struct fuse_conn_info *conn );
+static void image_ll_lookup( fuse_req_t req, fuse_ino_t parent, const char *name );
+static void image_ll_open( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+static void image_ll_readdir( fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi );
+static void image_ll_read( fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi );
+static int image_stat( fuse_ino_t ino, struct stat *stbuf );
+static void printUsage( char *argv0, int exitCode );
+static void printVersion();
+
+static int image_stat( fuse_ino_t ino, struct stat *stbuf ) // Fill stbuf for one of the three known inodes; returns 0, or -1 for any other inode. Does not clear stbuf first.
{
- int res = 0;
- memset( stbuf, 0, sizeof( struct stat ) );
- stbuf->st_ctim = stbuf->st_atim = stbuf->st_mtim = startupTime;
- stbuf->st_uid = owner;
- if ( strcmp( path, "/" ) == 0 ) {
+ switch ( ino ) {
+ case INO_ROOT: // the mount's root directory
stbuf->st_mode = S_IFDIR | 0550;
stbuf->st_nlink = 2;
- } else if ( strcmp( path, IMAGE_PATH ) == 0 ) {
+ stbuf->st_mtim = startupTime;
+ break;
+ case INO_IMAGE: // the image file, reported with the size stored in imageSize
stbuf->st_mode = S_IFREG | 0440;
stbuf->st_nlink = 1;
stbuf->st_size = imageSize;
- } else if ( strcmp( path, STATS_PATH ) == 0 ) {
+ stbuf->st_mtim = startupTime;
+ break;
+ case INO_STATS: // the status file: fixed nominal size, mtime is the current time on every call
stbuf->st_mode = S_IFREG | 0440;
stbuf->st_nlink = 1;
stbuf->st_size = 4096;
clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+ break;
+ default: // unknown inode: stbuf is left as it was passed in
+ return -1;
+ }
+ stbuf->st_ctim = stbuf->st_atim = startupTime; // fields shared by all three inodes
+ stbuf->st_uid = owner;
+ stbuf->st_ino = ino;
+ return 0;
+}
+
+static void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi ) // FUSE getattr handler: replies with image_stat() data, or ENOENT for unknown inodes
+{
+ struct stat stbuf = { 0 }; // zeroed here because image_stat() only sets selected fields
+ ( void ) fi;
+
+ if ( image_stat( ino, &stbuf ) == -1 ) {
+ fuse_reply_err( req, ENOENT );
} else {
- res = -ENOENT;
+ fuse_reply_attr( req, &stbuf, ino == INO_IMAGE ? 1200 : 1 ); // attribute validity timeout in seconds: 1200 for the image, 1 for everything else
}
- return res;
}
-static int image_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset UNUSED, struct fuse_file_info *fi UNUSED)
+static void image_ll_lookup( fuse_req_t req, fuse_ino_t parent, const char *name ) // FUSE lookup handler: resolves the two file names to their inodes, ENOENT otherwise
{
- if ( strcmp( path, "/" ) != 0 ) {
- return -ENOENT;
+ ( void )parent; // NOTE(review): parent is never checked — presumably fine as the root is the only directory; confirm
+
+ if ( strcmp( name, IMAGE_NAME ) == 0 || strcmp( name, STATS_NAME ) == 0 ) {
+ struct fuse_entry_param e = { 0 };
+ if ( strcmp( name, IMAGE_NAME ) == 0 ) {
+ e.ino = INO_IMAGE;
+ e.attr_timeout = e.entry_timeout = 1200; // same 1200 s validity that getattr reports for the image
+ } else {
+ e.ino = INO_STATS;
+ e.attr_timeout = e.entry_timeout = 0; // timeouts of 0 for the status file
+ }
+ if ( image_stat( e.ino, &e.attr ) == 0 ) {
+ fuse_reply_entry( req, &e );
+ return;
+ }
}
- filler( buf, ".", NULL, 0 );
- filler( buf, "..", NULL, 0 );
- filler( buf, IMAGE_PATH + 1, NULL, 0 );
- filler( buf, STATS_PATH + 1, NULL, 0 );
- return 0;
+ fuse_reply_err( req, ENOENT ); // reached for unknown names and if image_stat() fails
}
-static int image_open(const char *path, struct fuse_file_info *fi)
+struct dirbuf { // Growable byte buffer that dirbuf_add() fills with directory entries
+ char *p; // heap memory, (re)allocated with realloc() and released by the readdir handler
+ size_t size; // number of bytes of p in use
+};
+
+static void dirbuf_add( fuse_req_t req, struct dirbuf *b, const char *name, fuse_ino_t ino ) // Append one directory entry (name, ino) to b; on allocation failure b is left unchanged
{
- if ( strcmp( path, IMAGE_PATH ) != 0 && strcmp( path, STATS_PATH ) != 0 ) {
- return -ENOENT;
+ struct stat stbuf = { .st_ino = ino };
+ const size_t newsize = b->size + fuse_add_direntry( req, NULL, 0, name, NULL, 0 ); // NULL buffer: only query the entry's size
+ char *grown = realloc( b->p, newsize ); // result kept separate so a failure cannot lose the old buffer
+ if ( grown == NULL ) return; // out of memory: entry is dropped, b still describes the old, valid buffer
+ fuse_add_direntry( req, grown + b->size, newsize - b->size, name, &stbuf, newsize );
+ *b = ( struct dirbuf ){ .p = grown, .size = newsize };
+}
+
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize ) // Reply with at most maxsize bytes of buf starting at off; returns what fuse_reply_buf() returns
+{
+ if ( off >= 0 && off < (off_t)bufsize ) {
+ return fuse_reply_buf( req, buf + off, MIN( bufsize - off, maxsize ) );
}
- if ( ( fi->flags & 3 ) != O_RDONLY ) {
- return -EACCES;
+ return fuse_reply_buf( req, NULL, 0 ); // offset outside of buf: empty reply
+}
+
+static void image_ll_readdir( fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi ) // FUSE readdir handler: lists ".", "..", the image and the status file for the root inode
+{
+ ( void ) fi;
+
+ if ( ino != INO_ROOT ) {
+ fuse_reply_err( req, ENOTDIR );
+ } else {
+ struct dirbuf b;
+ memset( &b, 0, sizeof( b ) ); // start with p == NULL, size == 0
+ dirbuf_add( req, &b, ".", INO_ROOT );
+ dirbuf_add( req, &b, "..", INO_ROOT );
+ dirbuf_add( req, &b, IMAGE_NAME, INO_IMAGE );
+ dirbuf_add( req, &b, STATS_NAME, INO_STATS );
+ reply_buf_limited( req, b.p, b.size, off, size ); // the full listing is rebuilt on every call; off/size select the slice to send
+ free( b.p );
}
- return 0;
}
-static int fillStatsFile(char *buf, size_t size, off_t offset) {
- if ( offset == 0 ) {
- return (int)connection_printStats( buf, size );
+static void image_ll_open( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi ) // FUSE open handler: only read-only opens of the image or status file succeed
+{
+ if ( ino != INO_IMAGE && ino != INO_STATS ) {
+ fuse_reply_err( req, EISDIR );
+ } else if ( ( fi->flags & 3 ) != O_RDONLY ) { // low two bits of flags must equal O_RDONLY
+ fuse_reply_err( req, EACCES );
+ } else {
+ // Set keep_cache for both files. NOTE(review): this also covers the status file, whose content changes — confirm that is intended
+ fi->keep_cache = 1;
+ fuse_reply_open( req, fi );
}
+}
+
+static void fillStatsFile( fuse_req_t req, size_t size, off_t offset ) { // Answer a read on the status file from a freshly generated connection_printStats() text
char buffer[4096];
int ret = (int)connection_printStats( buffer, sizeof buffer );
int len = MIN( ret - (int)offset, (int)size );
- if ( len == 0 )
- return 0;
if ( len < 0 ) {
- return -EOF;
+ fuse_reply_err( req, 0 ); // offset lies behind the generated text: reply with error code 0
+ return;
}
- memcpy( buf, buffer + offset, len );
- return len;
+ fuse_reply_buf( req, buffer + offset, len ); // NOTE(review): offset is narrowed to int for the length check above but used at full width here — confirm offsets beyond INT_MAX cannot arrive
}
-static int image_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi UNUSED)
+static void image_ll_read( fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi ) // FUSE read handler: status file is answered directly, image reads are queued via connection_read()
{
- if ( size > __INT_MAX__ ) {
- // fuse docs say we MUST fill the buffer with exactly size bytes and return size,
- // otherwise the buffer will we padded with zeros. Since the return value is just
- // an int, we could not properly fulfill read requests > 2GB. Since there is no
- // mention of a guarantee that this will never happen, better add a safety check.
- // Way to go fuse.
- return -EIO;
- }
- if ( path[1] == STATS_PATH[1] ) {
- return fillStatsFile( buf, size, offset );
+ assert( ino == INO_STATS || ino == INO_IMAGE );
+ ( void )fi;
+ if ( ino == INO_STATS ) {
+ fillStatsFile( req, size, offset );
+ return;
}
if ( (uint64_t)offset >= imageSize ) {
- return 0;
+ fuse_reply_err( req, 0 );
+ return;
}
-
if ( offset + size > imageSize ) {
size = imageSize - offset;
}
+ if ( size == 0 || size > UINT32_MAX ) { // request length is stored as uint32_t below
+ fuse_reply_err( req, 0 );
+ return;
+ }
if ( useDebug ) {
- /* count the requested blocks */
uint64_t startBlock = offset / ( 4096 );
const uint64_t endBlock = ( offset + size - 1 ) / ( 4096 );
for ( ; startBlock <= endBlock; startBlock++ ) {
++logInfo.blockRequestCount[startBlock];
}
}
-
- dnbd3_async_t request;
- request.buffer = buf;
- request.length = (uint32_t)size;
- request.offset = offset;
- request.signal = signalGet();
-
- if ( !connection_read( &request ) ) {
- signalPut( request.signal );
- return -EINVAL;
- }
- while ( !request.finished ) {
- int ret = signal_wait( request.signal, 5000 );
- if ( !keepRunning ) {
- connection_close();
- break;
- }
- if ( ret < 0 ) {
- debugf( "fuse_read signal wait returned %d", ret );
- }
- }
- signalPut( request.signal );
- if ( request.success ) {
- return request.length;
- } else {
- return -EIO;
+ dnbd3_async_t *request = malloc( sizeof(dnbd3_async_t) + size ); // header plus 'size' bytes for the trailing buffer[]
+ if ( request == NULL ) { // out of memory: fail this one read instead of dereferencing NULL
+ fuse_reply_err( req, ENOMEM );
+ return;
+ }
+ request->length = (uint32_t)size;
+ request->offset = offset;
+ request->fuse_req = req;
+ if ( !connection_read( request ) ) { // on success the request is answered and freed by whoever completes it
+ fuse_reply_err( req, EIO );
+ free( request );
+ }
}
-static void image_sigHandler(int signum) {
- keepRunning = false;
- if ( signum == SIGINT && fuse_sigIntHandler != NULL ) {
- fuse_sigIntHandler(signum);
- }
- if ( signum == SIGTERM && fuse_sigTermHandler != NULL ) {
- fuse_sigTermHandler(signum);
- }
+static void noopSigHandler( int signum ) // Handler that does nothing; registered for SIGHUP further down in this file
+{
+ (void)signum; // NOTE(review): presumably installed only so that SIGHUP interrupts blocking calls instead of terminating the process — confirm
}
-static void* image_init(struct fuse_conn_info *conn UNUSED)
+static void image_ll_init( void *userdata, struct fuse_conn_info *conn ) // FUSE init handler: starts the connection threads
{
+ ( void ) userdata;
+ ( void ) conn;
if ( !connection_initThreads() ) {
logadd( LOG_ERROR, "Could not initialize threads for dnbd3 connection, exiting..." );
- exit( EXIT_FAILURE );
+ if ( _fuseSession != NULL ) { // ask the FUSE session to exit instead of calling exit() from inside the handler
+ fuse_session_exit( _fuseSession );
+ }
}
- // Prepare our handler
- struct sigaction newHandler;
- memset( &newHandler, 0, sizeof(newHandler) );
- newHandler.sa_handler = &image_sigHandler;
- sigemptyset( &newHandler.sa_mask );
- struct sigaction oldHandler;
- // Retrieve old handlers when setting
- sigaction( SIGINT, &newHandler, &oldHandler );
- fuse_sigIntHandler = oldHandler.sa_handler;
- logadd( LOG_DEBUG1, "Previous SIGINT handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
- sigaction( SIGTERM, &newHandler, &oldHandler );
- fuse_sigTermHandler = oldHandler.sa_handler;
- logadd( LOG_DEBUG1, "Previous SIGTERM handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
- return NULL;
}
/* close the connection */
-static void image_destroy(void *private_data UNUSED)
+static void image_destroy( void *private_data UNUSED ) // FUSE destroy handler: writes the debug log if useDebug is set, then calls connection_close()
{
if ( useDebug ) {
printLog( &logInfo );
}
connection_close();
- return;
}
/* map the implemented fuse operations */
-static struct fuse_operations image_oper = {
- .getattr = image_getattr,
- .readdir = image_readdir,
- .open = image_open,
- .read = image_read,
- .init = image_init,
+static struct fuse_lowlevel_ops image_oper = {
+ .lookup = image_ll_lookup,
+ .getattr = image_ll_getattr,
+ .readdir = image_ll_readdir,
+ .open = image_ll_open,
+ .read = image_ll_read,
+ .init = image_ll_init,
.destroy = image_destroy,
};
static void printVersion()
{
char *arg[] = { "foo", "-V" };
- printf( "DNBD3-Fuse Version 1.2.3.4, protocol version %d\n", (int)PROTOCOL_VERSION );
- fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ printf( "dnbd3-fuse version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
+ printf( "Protocol version: %d\n", (int)PROTOCOL_VERSION );
+ struct fuse_args args = FUSE_ARGS_INIT( 2, arg ); // fake command line consisting of a program name and "-V"
+ fuse_parse_cmdline( &args, NULL, NULL, NULL ); // NOTE(review): presumably makes libfuse print its own version for "-V" — confirm
exit( 0 );
}
-static void printUsage(char *argv0, int exitCode)
+static void printUsage( char *argv0, int exitCode )
{
char *arg[] = { argv0, "-h" };
- fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ struct fuse_args args = FUSE_ARGS_INIT( 2, arg );
+ fuse_parse_cmdline( &args, NULL, NULL, NULL );
printf( "\n" );
printf( "Usage: %s [--debug] [--option mountOpts] --host <serverAddress(es)> --image <imageName> [--rid revision] <mountPoint>\n", argv0 );
printf( "Or: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] <mountPoint>\n", argv0 );
@@ -284,19 +304,19 @@ static void printUsage(char *argv0, int exitCode)
static const char *optString = "dfHh:i:l:o:r:SsVv";
static const struct option longOpts[] = {
- { "debug", no_argument, NULL, 'd' },
- { "help", no_argument, NULL, 'H' },
- { "host", required_argument, NULL, 'h' },
- { "image", required_argument, NULL, 'i' },
- { "log", required_argument, NULL, 'l' },
- { "option", required_argument, NULL, 'o' },
- { "rid", required_argument, NULL, 'r' },
- { "sticky", no_argument, NULL, 'S' },
- { "version", no_argument, NULL, 'v' },
- { 0, 0, 0, 0 }
+ { "debug", no_argument, NULL, 'd' },
+ { "help", no_argument, NULL, 'H' },
+ { "host", required_argument, NULL, 'h' },
+ { "image", required_argument, NULL, 'i' },
+ { "log", required_argument, NULL, 'l' },
+ { "option", required_argument, NULL, 'o' },
+ { "rid", required_argument, NULL, 'r' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "version", no_argument, NULL, 'v' },
+ { 0, 0, 0, 0 }
};
-int main(int argc, char *argv[])
+int main( int argc, char *argv[] )
{
char *server_address = NULL;
char *image_Name = NULL;
@@ -306,6 +326,12 @@ int main(int argc, char *argv[])
int newArgc;
int opt, lidx;
bool learnNewServers = true;
+ bool single_thread = false;
+ struct fuse_chan *ch;
+ char *mountpoint;
+ int foreground = 0;
+
+ log_init();
if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
printUsage( argv[0], 0 );
@@ -316,9 +342,10 @@ int main(int argc, char *argv[])
log_setConsoleTimestamps( true );
log_setFileMask( 65535 );
- newArgv = calloc( argc + 10, sizeof(char*) );
+ newArgv = calloc( argc + 10, sizeof( char* ) );
newArgv[0] = argv[0];
newArgc = 1;
+
while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
switch ( opt ) {
case 'h':
@@ -328,7 +355,7 @@ int main(int argc, char *argv[])
image_Name = optarg;
break;
case 'r':
- rid = (uint16_t)atoi(optarg);
+ rid = (uint16_t)atoi( optarg );
break;
case 'o':
newArgv[newArgc++] = "-o";
@@ -357,15 +384,16 @@ int main(int argc, char *argv[])
case 'd':
useDebug = true;
newArgv[newArgc++] = "-d";
+ foreground = 1;
break;
case 's':
- newArgv[newArgc++] = "-s";
+ single_thread = true;
break;
case 'S':
learnNewServers = false;
break;
case 'f':
- newArgv[newArgc++] = "-f";
+ foreground = 1;
break;
default:
printUsage( argv[0], EXIT_FAILURE );
@@ -386,6 +414,17 @@ int main(int argc, char *argv[])
}
}
+ // Prepare our handler
+ struct sigaction newHandler;
+ memset( &newHandler, 0, sizeof( newHandler ) );
+ newHandler.sa_handler = &noopSigHandler;
+ sigemptyset( &newHandler.sa_mask );
+ sigaction( SIGHUP, &newHandler, NULL );
+ sigset_t sigmask;
+ sigemptyset( &sigmask );
+ sigaddset( &sigmask, SIGHUP );
+ pthread_sigmask( SIG_BLOCK, &sigmask, NULL );
+
if ( !connection_init( server_address, image_Name, rid, learnNewServers ) ) {
logadd( LOG_ERROR, "Could not connect to any server. Bye.\n" );
return EXIT_FAILURE;
@@ -404,17 +443,51 @@ int main(int argc, char *argv[])
// Since dnbd3 is always read only and the remote image will not change
newArgv[newArgc++] = "-o";
- newArgv[newArgc++] = "ro,auto_cache,default_permissions";
+ newArgv[newArgc++] = "ro,default_permissions";
// Mount point goes last
newArgv[newArgc++] = argv[optind];
- printf( "ImagePathName: %s\nFuseArgs:",IMAGE_PATH );
+ printf( "ImagePathName: /%s\nFuseArgs:", IMAGE_NAME );
for ( int i = 0; i < newArgc; ++i ) {
printf( " '%s'", newArgv[i] );
}
- putchar('\n');
+ putchar( '\n' );
clock_gettime( CLOCK_REALTIME, &startupTime );
owner = getuid();
- signalInit();
- return fuse_main( newArgc, newArgv, &image_oper, NULL );
+
+ // Fuse lowlevel loop
+ struct fuse_args args = FUSE_ARGS_INIT( newArgc, newArgv );
+ int fuse_err = 1;
+ if ( fuse_parse_cmdline( &args, &mountpoint, NULL, NULL ) == -1 ) {
+ logadd( LOG_ERROR, "FUSE: Parsing command line failed" );
+ } else if ( ( ch = fuse_mount( mountpoint, &args ) ) == NULL ) {
+ logadd( LOG_ERROR, "Mounting file system failed" );
+ } else {
+ _fuseSession = fuse_lowlevel_new( &args, &image_oper, sizeof( image_oper ), NULL );
+ if ( _fuseSession == NULL ) {
+ logadd( LOG_ERROR, "Could not initialize fuse session" );
+ } else {
+ if ( fuse_set_signal_handlers( _fuseSession ) == -1 ) {
+ logadd( LOG_ERROR, "Could not install fuse signal handlers" );
+ } else {
+ fuse_session_add_chan( _fuseSession, ch );
+ fuse_daemonize( foreground );
+ if ( single_thread ) {
+ fuse_err = fuse_session_loop( _fuseSession );
+ } else {
+ fuse_err = fuse_session_loop_mt( _fuseSession ); //MT produces errors (race conditions) in libfuse and didnt improve speed at all
+ }
+ fuse_remove_signal_handlers( _fuseSession );
+ fuse_session_remove_chan( ch );
+ }
+ fuse_session_destroy( _fuseSession );
+ _fuseSession = NULL;
+ }
+ fuse_unmount( mountpoint, ch );
+ }
+ fuse_opt_free_args( &args );
+ free( newArgv );
+ connection_join();
+ logadd( LOG_DEBUG1, "Terminating. FUSE REPLIED: %d\n", fuse_err );
+ return fuse_err;
}
diff --git a/src/fuse/serialize.c b/src/fuse/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/fuse/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/kernel/.clang-format b/src/kernel/.clang-format
new file mode 100644
index 0000000..c1fe2c6
--- /dev/null
+++ b/src/kernel/.clang-format
@@ -0,0 +1,552 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+# Taken from:
+# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
+# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
+# | sort | uniq
+ForEachMacros:
+ - 'apei_estatus_for_each_section'
+ - 'ata_for_each_dev'
+ - 'ata_for_each_link'
+ - '__ata_qc_for_each'
+ - 'ata_qc_for_each'
+ - 'ata_qc_for_each_raw'
+ - 'ata_qc_for_each_with_internal'
+ - 'ax25_for_each'
+ - 'ax25_uid_for_each'
+ - '__bio_for_each_bvec'
+ - 'bio_for_each_bvec'
+ - 'bio_for_each_bvec_all'
+ - 'bio_for_each_integrity_vec'
+ - '__bio_for_each_segment'
+ - 'bio_for_each_segment'
+ - 'bio_for_each_segment_all'
+ - 'bio_list_for_each'
+ - 'bip_for_each_vec'
+ - 'bitmap_for_each_clear_region'
+ - 'bitmap_for_each_set_region'
+ - 'blkg_for_each_descendant_post'
+ - 'blkg_for_each_descendant_pre'
+ - 'blk_queue_for_each_rl'
+ - 'bond_for_each_slave'
+ - 'bond_for_each_slave_rcu'
+ - 'bpf_for_each_spilled_reg'
+ - 'btree_for_each_safe128'
+ - 'btree_for_each_safe32'
+ - 'btree_for_each_safe64'
+ - 'btree_for_each_safel'
+ - 'card_for_each_dev'
+ - 'cgroup_taskset_for_each'
+ - 'cgroup_taskset_for_each_leader'
+ - 'cpufreq_for_each_entry'
+ - 'cpufreq_for_each_entry_idx'
+ - 'cpufreq_for_each_valid_entry'
+ - 'cpufreq_for_each_valid_entry_idx'
+ - 'css_for_each_child'
+ - 'css_for_each_descendant_post'
+ - 'css_for_each_descendant_pre'
+ - 'cxl_for_each_cmd'
+ - 'device_for_each_child_node'
+ - 'dma_fence_chain_for_each'
+ - 'do_for_each_ftrace_op'
+ - 'drm_atomic_crtc_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane_state'
+ - 'drm_atomic_for_each_plane_damage'
+ - 'drm_client_for_each_connector_iter'
+ - 'drm_client_for_each_modeset'
+ - 'drm_connector_for_each_possible_encoder'
+ - 'drm_for_each_bridge_in_chain'
+ - 'drm_for_each_connector_iter'
+ - 'drm_for_each_crtc'
+ - 'drm_for_each_crtc_reverse'
+ - 'drm_for_each_encoder'
+ - 'drm_for_each_encoder_mask'
+ - 'drm_for_each_fb'
+ - 'drm_for_each_legacy_plane'
+ - 'drm_for_each_plane'
+ - 'drm_for_each_plane_mask'
+ - 'drm_for_each_privobj'
+ - 'drm_mm_for_each_hole'
+ - 'drm_mm_for_each_node'
+ - 'drm_mm_for_each_node_in_range'
+ - 'drm_mm_for_each_node_safe'
+ - 'flow_action_for_each'
+ - 'for_each_active_dev_scope'
+ - 'for_each_active_drhd_unit'
+ - 'for_each_active_iommu'
+ - 'for_each_aggr_pgid'
+ - 'for_each_available_child_of_node'
+ - 'for_each_bio'
+ - 'for_each_board_func_rsrc'
+ - 'for_each_bvec'
+ - 'for_each_card_auxs'
+ - 'for_each_card_auxs_safe'
+ - 'for_each_card_components'
+ - 'for_each_card_dapms'
+ - 'for_each_card_pre_auxs'
+ - 'for_each_card_prelinks'
+ - 'for_each_card_rtds'
+ - 'for_each_card_rtds_safe'
+ - 'for_each_card_widgets'
+ - 'for_each_card_widgets_safe'
+ - 'for_each_cgroup_storage_type'
+ - 'for_each_child_of_node'
+ - 'for_each_clear_bit'
+ - 'for_each_clear_bit_from'
+ - 'for_each_cmsghdr'
+ - 'for_each_compatible_node'
+ - 'for_each_component_dais'
+ - 'for_each_component_dais_safe'
+ - 'for_each_comp_order'
+ - 'for_each_console'
+ - 'for_each_cpu'
+ - 'for_each_cpu_and'
+ - 'for_each_cpu_not'
+ - 'for_each_cpu_wrap'
+ - 'for_each_dapm_widgets'
+ - 'for_each_dev_addr'
+ - 'for_each_dev_scope'
+ - 'for_each_displayid_db'
+ - 'for_each_dma_cap_mask'
+ - 'for_each_dpcm_be'
+ - 'for_each_dpcm_be_rollback'
+ - 'for_each_dpcm_be_safe'
+ - 'for_each_dpcm_fe'
+ - 'for_each_drhd_unit'
+ - 'for_each_dss_dev'
+ - 'for_each_efi_memory_desc'
+ - 'for_each_efi_memory_desc_in_map'
+ - 'for_each_element'
+ - 'for_each_element_extid'
+ - 'for_each_element_id'
+ - 'for_each_endpoint_of_node'
+ - 'for_each_evictable_lru'
+ - 'for_each_fib6_node_rt_rcu'
+ - 'for_each_fib6_walker_rt'
+ - 'for_each_free_mem_pfn_range_in_zone'
+ - 'for_each_free_mem_pfn_range_in_zone_from'
+ - 'for_each_free_mem_range'
+ - 'for_each_free_mem_range_reverse'
+ - 'for_each_func_rsrc'
+ - 'for_each_hstate'
+ - 'for_each_if'
+ - 'for_each_iommu'
+ - 'for_each_ip_tunnel_rcu'
+ - 'for_each_irq_nr'
+ - 'for_each_link_codecs'
+ - 'for_each_link_cpus'
+ - 'for_each_link_platforms'
+ - 'for_each_lru'
+ - 'for_each_matching_node'
+ - 'for_each_matching_node_and_match'
+ - 'for_each_member'
+ - 'for_each_memcg_cache_index'
+ - 'for_each_mem_pfn_range'
+ - '__for_each_mem_range'
+ - 'for_each_mem_range'
+ - '__for_each_mem_range_rev'
+ - 'for_each_mem_range_rev'
+ - 'for_each_mem_region'
+ - 'for_each_migratetype_order'
+ - 'for_each_msi_entry'
+ - 'for_each_msi_entry_safe'
+ - 'for_each_net'
+ - 'for_each_net_continue_reverse'
+ - 'for_each_netdev'
+ - 'for_each_netdev_continue'
+ - 'for_each_netdev_continue_rcu'
+ - 'for_each_netdev_continue_reverse'
+ - 'for_each_netdev_feature'
+ - 'for_each_netdev_in_bond_rcu'
+ - 'for_each_netdev_rcu'
+ - 'for_each_netdev_reverse'
+ - 'for_each_netdev_safe'
+ - 'for_each_net_rcu'
+ - 'for_each_new_connector_in_state'
+ - 'for_each_new_crtc_in_state'
+ - 'for_each_new_mst_mgr_in_state'
+ - 'for_each_new_plane_in_state'
+ - 'for_each_new_private_obj_in_state'
+ - 'for_each_node'
+ - 'for_each_node_by_name'
+ - 'for_each_node_by_type'
+ - 'for_each_node_mask'
+ - 'for_each_node_state'
+ - 'for_each_node_with_cpus'
+ - 'for_each_node_with_property'
+ - 'for_each_nonreserved_multicast_dest_pgid'
+ - 'for_each_of_allnodes'
+ - 'for_each_of_allnodes_from'
+ - 'for_each_of_cpu_node'
+ - 'for_each_of_pci_range'
+ - 'for_each_old_connector_in_state'
+ - 'for_each_old_crtc_in_state'
+ - 'for_each_old_mst_mgr_in_state'
+ - 'for_each_oldnew_connector_in_state'
+ - 'for_each_oldnew_crtc_in_state'
+ - 'for_each_oldnew_mst_mgr_in_state'
+ - 'for_each_oldnew_plane_in_state'
+ - 'for_each_oldnew_plane_in_state_reverse'
+ - 'for_each_oldnew_private_obj_in_state'
+ - 'for_each_old_plane_in_state'
+ - 'for_each_old_private_obj_in_state'
+ - 'for_each_online_cpu'
+ - 'for_each_online_node'
+ - 'for_each_online_pgdat'
+ - 'for_each_pci_bridge'
+ - 'for_each_pci_dev'
+ - 'for_each_pci_msi_entry'
+ - 'for_each_pcm_streams'
+ - 'for_each_physmem_range'
+ - 'for_each_populated_zone'
+ - 'for_each_possible_cpu'
+ - 'for_each_present_cpu'
+ - 'for_each_prime_number'
+ - 'for_each_prime_number_from'
+ - 'for_each_process'
+ - 'for_each_process_thread'
+ - 'for_each_property_of_node'
+ - 'for_each_registered_fb'
+ - 'for_each_requested_gpio'
+ - 'for_each_requested_gpio_in_range'
+ - 'for_each_reserved_mem_range'
+ - 'for_each_reserved_mem_region'
+ - 'for_each_rtd_codec_dais'
+ - 'for_each_rtd_components'
+ - 'for_each_rtd_cpu_dais'
+ - 'for_each_rtd_dais'
+ - 'for_each_set_bit'
+ - 'for_each_set_bit_from'
+ - 'for_each_set_clump8'
+ - 'for_each_sg'
+ - 'for_each_sg_dma_page'
+ - 'for_each_sg_page'
+ - 'for_each_sgtable_dma_page'
+ - 'for_each_sgtable_dma_sg'
+ - 'for_each_sgtable_page'
+ - 'for_each_sgtable_sg'
+ - 'for_each_sibling_event'
+ - 'for_each_subelement'
+ - 'for_each_subelement_extid'
+ - 'for_each_subelement_id'
+ - '__for_each_thread'
+ - 'for_each_thread'
+ - 'for_each_unicast_dest_pgid'
+ - 'for_each_vsi'
+ - 'for_each_wakeup_source'
+ - 'for_each_zone'
+ - 'for_each_zone_zonelist'
+ - 'for_each_zone_zonelist_nodemask'
+ - 'fwnode_for_each_available_child_node'
+ - 'fwnode_for_each_child_node'
+ - 'fwnode_graph_for_each_endpoint'
+ - 'gadget_for_each_ep'
+ - 'genradix_for_each'
+ - 'genradix_for_each_from'
+ - 'hash_for_each'
+ - 'hash_for_each_possible'
+ - 'hash_for_each_possible_rcu'
+ - 'hash_for_each_possible_rcu_notrace'
+ - 'hash_for_each_possible_safe'
+ - 'hash_for_each_rcu'
+ - 'hash_for_each_safe'
+ - 'hctx_for_each_ctx'
+ - 'hlist_bl_for_each_entry'
+ - 'hlist_bl_for_each_entry_rcu'
+ - 'hlist_bl_for_each_entry_safe'
+ - 'hlist_for_each'
+ - 'hlist_for_each_entry'
+ - 'hlist_for_each_entry_continue'
+ - 'hlist_for_each_entry_continue_rcu'
+ - 'hlist_for_each_entry_continue_rcu_bh'
+ - 'hlist_for_each_entry_from'
+ - 'hlist_for_each_entry_from_rcu'
+ - 'hlist_for_each_entry_rcu'
+ - 'hlist_for_each_entry_rcu_bh'
+ - 'hlist_for_each_entry_rcu_notrace'
+ - 'hlist_for_each_entry_safe'
+ - 'hlist_for_each_entry_srcu'
+ - '__hlist_for_each_rcu'
+ - 'hlist_for_each_safe'
+ - 'hlist_nulls_for_each_entry'
+ - 'hlist_nulls_for_each_entry_from'
+ - 'hlist_nulls_for_each_entry_rcu'
+ - 'hlist_nulls_for_each_entry_safe'
+ - 'i3c_bus_for_each_i2cdev'
+ - 'i3c_bus_for_each_i3cdev'
+ - 'ide_host_for_each_port'
+ - 'ide_port_for_each_dev'
+ - 'ide_port_for_each_present_dev'
+ - 'idr_for_each_entry'
+ - 'idr_for_each_entry_continue'
+ - 'idr_for_each_entry_continue_ul'
+ - 'idr_for_each_entry_ul'
+ - 'in_dev_for_each_ifa_rcu'
+ - 'in_dev_for_each_ifa_rtnl'
+ - 'inet_bind_bucket_for_each'
+ - 'inet_lhash2_for_each_icsk_rcu'
+ - 'key_for_each'
+ - 'key_for_each_safe'
+ - 'klp_for_each_func'
+ - 'klp_for_each_func_safe'
+ - 'klp_for_each_func_static'
+ - 'klp_for_each_object'
+ - 'klp_for_each_object_safe'
+ - 'klp_for_each_object_static'
+ - 'kunit_suite_for_each_test_case'
+ - 'kvm_for_each_memslot'
+ - 'kvm_for_each_vcpu'
+ - 'list_for_each'
+ - 'list_for_each_codec'
+ - 'list_for_each_codec_safe'
+ - 'list_for_each_continue'
+ - 'list_for_each_entry'
+ - 'list_for_each_entry_continue'
+ - 'list_for_each_entry_continue_rcu'
+ - 'list_for_each_entry_continue_reverse'
+ - 'list_for_each_entry_from'
+ - 'list_for_each_entry_from_rcu'
+ - 'list_for_each_entry_from_reverse'
+ - 'list_for_each_entry_lockless'
+ - 'list_for_each_entry_rcu'
+ - 'list_for_each_entry_reverse'
+ - 'list_for_each_entry_safe'
+ - 'list_for_each_entry_safe_continue'
+ - 'list_for_each_entry_safe_from'
+ - 'list_for_each_entry_safe_reverse'
+ - 'list_for_each_entry_srcu'
+ - 'list_for_each_prev'
+ - 'list_for_each_prev_safe'
+ - 'list_for_each_safe'
+ - 'llist_for_each'
+ - 'llist_for_each_entry'
+ - 'llist_for_each_entry_safe'
+ - 'llist_for_each_safe'
+ - 'mci_for_each_dimm'
+ - 'media_device_for_each_entity'
+ - 'media_device_for_each_intf'
+ - 'media_device_for_each_link'
+ - 'media_device_for_each_pad'
+ - 'nanddev_io_for_each_page'
+ - 'netdev_for_each_lower_dev'
+ - 'netdev_for_each_lower_private'
+ - 'netdev_for_each_lower_private_rcu'
+ - 'netdev_for_each_mc_addr'
+ - 'netdev_for_each_uc_addr'
+ - 'netdev_for_each_upper_dev_rcu'
+ - 'netdev_hw_addr_list_for_each'
+ - 'nft_rule_for_each_expr'
+ - 'nla_for_each_attr'
+ - 'nla_for_each_nested'
+ - 'nlmsg_for_each_attr'
+ - 'nlmsg_for_each_msg'
+ - 'nr_neigh_for_each'
+ - 'nr_neigh_for_each_safe'
+ - 'nr_node_for_each'
+ - 'nr_node_for_each_safe'
+ - 'of_for_each_phandle'
+ - 'of_property_for_each_string'
+ - 'of_property_for_each_u32'
+ - 'pci_bus_for_each_resource'
+ - 'pcl_for_each_chunk'
+ - 'pcl_for_each_segment'
+ - 'pcm_for_each_format'
+ - 'ping_portaddr_for_each_entry'
+ - 'plist_for_each'
+ - 'plist_for_each_continue'
+ - 'plist_for_each_entry'
+ - 'plist_for_each_entry_continue'
+ - 'plist_for_each_entry_safe'
+ - 'plist_for_each_safe'
+ - 'pnp_for_each_card'
+ - 'pnp_for_each_dev'
+ - 'protocol_for_each_card'
+ - 'protocol_for_each_dev'
+ - 'queue_for_each_hw_ctx'
+ - 'radix_tree_for_each_slot'
+ - 'radix_tree_for_each_tagged'
+ - 'rbtree_postorder_for_each_entry_safe'
+ - 'rdma_for_each_block'
+ - 'rdma_for_each_port'
+ - 'rdma_umem_for_each_dma_block'
+ - 'resource_list_for_each_entry'
+ - 'resource_list_for_each_entry_safe'
+ - 'rhl_for_each_entry_rcu'
+ - 'rhl_for_each_rcu'
+ - 'rht_for_each'
+ - 'rht_for_each_entry'
+ - 'rht_for_each_entry_from'
+ - 'rht_for_each_entry_rcu'
+ - 'rht_for_each_entry_rcu_from'
+ - 'rht_for_each_entry_safe'
+ - 'rht_for_each_from'
+ - 'rht_for_each_rcu'
+ - 'rht_for_each_rcu_from'
+ - '__rq_for_each_bio'
+ - 'rq_for_each_bvec'
+ - 'rq_for_each_segment'
+ - 'scsi_for_each_prot_sg'
+ - 'scsi_for_each_sg'
+ - 'sctp_for_each_hentry'
+ - 'sctp_skb_for_each'
+ - 'shdma_for_each_chan'
+ - '__shost_for_each_device'
+ - 'shost_for_each_device'
+ - 'sk_for_each'
+ - 'sk_for_each_bound'
+ - 'sk_for_each_entry_offset_rcu'
+ - 'sk_for_each_from'
+ - 'sk_for_each_rcu'
+ - 'sk_for_each_safe'
+ - 'sk_nulls_for_each'
+ - 'sk_nulls_for_each_from'
+ - 'sk_nulls_for_each_rcu'
+ - 'snd_array_for_each'
+ - 'snd_pcm_group_for_each_entry'
+ - 'snd_soc_dapm_widget_for_each_path'
+ - 'snd_soc_dapm_widget_for_each_path_safe'
+ - 'snd_soc_dapm_widget_for_each_sink_path'
+ - 'snd_soc_dapm_widget_for_each_source_path'
+ - 'tb_property_for_each'
+ - 'tcf_exts_for_each_action'
+ - 'udp_portaddr_for_each_entry'
+ - 'udp_portaddr_for_each_entry_rcu'
+ - 'usb_hub_for_each_child'
+ - 'v4l2_device_for_each_subdev'
+ - 'v4l2_m2m_for_each_dst_buf'
+ - 'v4l2_m2m_for_each_dst_buf_safe'
+ - 'v4l2_m2m_for_each_src_buf'
+ - 'v4l2_m2m_for_each_src_buf_safe'
+ - 'virtio_device_for_each_vq'
+ - 'while_for_each_ftrace_op'
+ - 'xa_for_each'
+ - 'xa_for_each_marked'
+ - 'xa_for_each_range'
+ - 'xa_for_each_start'
+ - 'xas_for_each'
+ - 'xas_for_each_conflict'
+ - 'xas_for_each_marked'
+ - 'xbc_array_for_each_value'
+ - 'xbc_for_each_key_value'
+ - 'xbc_node_for_each_array_value'
+ - 'xbc_node_for_each_child'
+ - 'xbc_node_for_each_key_value'
+ - 'zorro_for_each_dev'
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt
new file mode 100644
index 0000000..6bc61ff
--- /dev/null
+++ b/src/kernel/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-kernel
+ LANGUAGES C)
+
+# include macros to define Linux kernel build targets
+include(Kernel)
+
+# set C flags for a Linux kernel module
+set(KERNEL_C_FLAGS "-DDNBD3_KERNEL_MODULE -I ${PROJECT_INCLUDE_GEN_DIR}"
+ CACHE STRING "C flags to be used for building the dnbd3 kernel module")
+# set C flags for the debug mode of a Linux kernel module
+set(KERNEL_C_FLAGS_DEBUG "-g -DDEBUG"
+ CACHE STRING "Additional C flags to be used for building the dnbd3 kernel module in debug mode")
+
+# append include directories to the C flags
+get_property(KERNEL_INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+foreach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} -I ${KERNEL_INCLUDE_DIR}")
+endforeach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+
+# append debug C flags if debug mode is enabled
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} ${KERNEL_C_FLAGS_DEBUG}")
+endif(CMAKE_BUILD_TYPE MATCHES Debug)
+
+# dnbd3 Linux kernel module
+set(KERNEL_MODULE_DNBD3_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.c)
+set(KERNEL_MODULE_DNBD3_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.h)
+
+add_kernel_module(dnbd3 "${KERNEL_BUILD_DIR}"
+ "${KERNEL_INSTALL_DIR}"
+ "CONFIG_BLK_DEV_DNBD3=m"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}"
+ ${CMAKE_CURRENT_SOURCE_DIR}/Kbuild)
+
+# add dependency to generate project version header before dnbd3.ko is built
+add_dependencies(dnbd3 dnbd3-generate-version)
+
+set(CHECKPATCH_IGNORE_WARNINGS "NEW_TYPEDEFS"
+ "MSLEEP"
+ "CONSTANT_COMPARISON"
+ "DEEP_INDENTATION"
+ "PREFER_PR_LEVEL"
+ "LINUX_VERSION_CODE"
+ "JIFFIES_COMPARISON"
+ "KREALLOC_ARG_REUSE")
+
+add_kernel_linter(dnbd3-lint "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+add_kernel_linter_fix(dnbd3-lint-fix "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+
+add_linter_fix(dnbd3-lint-fix-clang "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
diff --git a/src/kernel/Kbuild b/src/kernel/Kbuild
new file mode 100644
index 0000000..26afa98
--- /dev/null
+++ b/src/kernel/Kbuild
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Linux kernel module dnbd3
+obj-$(CONFIG_BLK_DEV_DNBD3) := dnbd3.o
+dnbd3-y += dnbd3_main.o blk.o net.o serialize.o sysfs.o
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index 889b988..69e4583 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,248 +19,259 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "blk.h"
#include "net.h"
#include "sysfs.h"
+#include "dnbd3_main.h"
#include <linux/pagemap.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#define dnbd3_req_read(req) \
- req_op(req) == REQ_OP_READ
-#define dnbd3_req_fs(req) \
- dnbd3_req_read(req) || req_op(req) == REQ_OP_WRITE
-#define dnbd3_req_special(req) \
- blk_rq_is_private(req)
-#else
-#define dnbd3_req_read(req) \
- rq_data_dir(req) == READ
-#define dnbd3_req_fs(req) \
- req->cmd_type == REQ_TYPE_FS
-#define dnbd3_req_special(req) \
- req->cmd_type == REQ_TYPE_SPECIAL
-#endif
-
-int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+static int dnbd3_close_device(dnbd3_device_t *dev)
{
- struct gendisk *disk;
- struct request_queue *blk_queue;
-
- init_waitqueue_head(&dev->process_queue_send);
- init_waitqueue_head(&dev->process_queue_receive);
- init_waitqueue_head(&dev->process_queue_discover);
- INIT_LIST_HEAD(&dev->request_queue_send);
- INIT_LIST_HEAD(&dev->request_queue_receive);
+ int result;
- memset(&dev->cur_server, 0, sizeof(dev->cur_server));
- memset(&dev->initial_server, 0, sizeof(dev->initial_server));
- dev->better_sock = NULL;
+ if (dev->imgname)
+ dev_info(dnbd3_device_to_dev(dev), "closing down device.\n");
+ dev->panic = false;
+ result = dnbd3_net_disconnect(dev);
+ kfree(dev->imgname);
dev->imgname = NULL;
- dev->rid = 0;
- dev->update_available = 0;
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- dev->thread_send = NULL;
- dev->thread_receive = NULL;
- dev->thread_discover = NULL;
- dev->discover = 0;
- dev->disconnecting = 0;
- dev->panic = 0;
- dev->panic_count = 0;
- dev->reported_size = 0;
-
- if (!(disk = alloc_disk(1)))
- {
- printk("ERROR: dnbd3 alloc_disk failed.\n");
- return -EIO;
- }
-
- disk->major = major;
- disk->first_minor = minor;
- sprintf(disk->disk_name, "dnbd%d", minor);
- set_capacity(disk, 0);
- set_disk_ro(disk, 1);
- disk->fops = &dnbd3_blk_ops;
-
- spin_lock_init(&dev->blk_lock);
- if ((blk_queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock)) == NULL)
- {
- printk("ERROR: dnbd3 blk_init_queue failed.\n");
- return -EIO;
- }
-
- blk_queue_logical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
- blk_queue_physical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
-
- disk->queue = blk_queue;
- disk->private_data = dev;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-#else
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
-#endif
-#define ONE_MEG (1048576)
- blk_queue_max_segment_size(disk->queue, ONE_MEG);
- blk_queue_max_segments(disk->queue, 0xffff);
- blk_queue_max_hw_sectors(disk->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
- disk->queue->limits.max_sectors = 256;
- dev->disk = disk;
-#undef ONE_MEG
- add_disk(disk);
- dnbd3_sysfs_init(dev);
- return 0;
+ /* new requests might have been queued up, */
+ /* but now that imgname is NULL no new ones can show up */
+ blk_mq_freeze_queue(dev->queue);
+ set_capacity(dev->disk, 0);
+ blk_mq_unfreeze_queue(dev->queue);
+ return result;
}
-int dnbd3_blk_del_device(dnbd3_device_t *dev)
-{
- dnbd3_sysfs_exit(dev);
- dnbd3_net_disconnect(dev);
- del_gendisk(dev->disk);
- put_disk(dev->disk);
- blk_cleanup_queue(dev->disk->queue);
- return 0;
-}
-
-struct block_device_operations dnbd3_blk_ops =
- { .owner = THIS_MODULE, .ioctl = dnbd3_blk_ioctl, };
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
int result = -100;
dnbd3_device_t *dev = bdev->bd_disk->private_data;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
struct request_queue *blk_queue = dev->disk->queue;
+#endif
char *imgname = NULL;
dnbd3_ioctl_t *msg = NULL;
- //unsigned long irqflags;
+ int i = 0, j;
+ u8 locked = 0;
- while (dev->disconnecting)
- {
- // do nothing
- }
-
- if (arg != 0)
- {
+ if (arg != 0) {
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
- if (msg == NULL) return -ENOMEM;
- if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg))
- {
+ if (msg == NULL)
+ return -ENOMEM;
+ if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) {
result = -ENOEXEC;
goto cleanup_return;
}
- if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0)
- {
+ if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) {
result = -ENOENT;
goto cleanup_return;
}
- if (msg->imgname != NULL && msg->imgnamelen > 0)
- {
+ if (msg->imgname != NULL && msg->imgnamelen > 0) {
imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
- if (imgname == NULL)
- {
+ if (imgname == NULL) {
result = -ENOMEM;
goto cleanup_return;
}
- if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0)
- {
+ if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) {
result = -ENOENT;
goto cleanup_return;
}
imgname[msg->imgnamelen] = '\0';
- //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname);
}
}
-
- switch (cmd)
- {
+ switch (cmd) {
case IOCTL_OPEN:
- if (dev->imgname != NULL)
- {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
+ break;
}
- else if (imgname == NULL)
- {
+ locked = 1;
+ if (dev->imgname != NULL) {
+ result = -EBUSY;
+ } else if (imgname == NULL) {
result = -EINVAL;
- }
- else if (msg == NULL)
- {
+ } else if (msg == NULL) {
result = -EINVAL;
- }
- else
- {
- if (sizeof(msg->host) != sizeof(dev->cur_server.host))
- printk("Odd size bug#1 triggered in IOCTL\n");
- memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host));
- dev->cur_server.failures = 0;
- memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server));
+ } else {
+ /* assert that at least one and not too many hosts are given */
+ if (msg->hosts_num < 1 || msg->hosts_num > NUMBER_SERVERS) {
+ result = -EINVAL;
+ break;
+ }
+
dev->imgname = imgname;
dev->rid = msg->rid;
dev->use_server_provided_alts = msg->use_server_provided_alts;
- // Forget all alt servers on explicit connect, set first al server to initial server
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0]));
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
- if (blk_queue->backing_dev_info != NULL) {
+
+ dev_info(dnbd3_device_to_dev(dev), "opening device.\n");
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set optimal request size for the queue to half the read-ahead
+ blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512));
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ // set readahead from optimal request size of the queue
+ // ra_pages is calculated by the following formula: queue_io_opt() * 2 / PAGE_SIZE
+ blk_queue_update_readahead(dev->queue);
+#endif
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ if (blk_queue->backing_dev_info != NULL)
blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
- }
#else
blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
#endif
- if (dnbd3_net_connect(dev) == 0)
- {
- result = 0;
- imgname = NULL; // Prevent kfree at the end
+
+ /* add specified servers to alt server list */
+ for (i = 0; i < NUMBER_SERVERS; i++)
+ dev->alt_servers[i].host.ss_family = 0;
+ for (i = 0; i < msg->hosts_num; i++) {
+ /* copy provided host into corresponding alt server slot */
+ if (dnbd3_add_server(dev, &msg->hosts[i]) == 0)
+ dev_dbg(dnbd3_device_to_dev(dev), "adding server %pISpc\n",
+ &dev->alt_servers[i].host);
+ else
+ dev_warn(dnbd3_device_to_dev(dev), "could not add server %pISpc\n",
+ &dev->alt_servers[i].host);
}
- else
- {
- result = -ENOENT;
+
+ /*
+ * probe added alt servers in specified order and
+ * choose first working server as initial server
+ */
+ result = -EPROTONOSUPPORT;
+ for (i = 0; i < NUMBER_SERVERS; i++) {
+ /* probe added alt server */
+ if (dev->alt_servers[i].host.ss_family == 0)
+ continue; // Empty slot
+
+ result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true);
+ if (result == 0) {
+ /* connection established, store index of server and exit loop */
+ result = i;
+ break;
+ }
+ }
+
+ if (result >= 0) {
+ /* connection was successful */
+ dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n",
+ &dev->cur_server.host);
+ imgname = NULL; // Prevent kfree at the end
+ } else {
+ /* probing failed */
dev->imgname = NULL;
}
}
break;
case IOCTL_CLOSE:
- dnbd3_blk_fail_all_requests(dev);
- result = dnbd3_net_disconnect(dev);
- dnbd3_blk_fail_all_requests(dev);
- set_capacity(dev->disk, 0);
- if (dev->imgname)
- {
- kfree(dev->imgname);
- dev->imgname = NULL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
}
+ locked = 1;
+ result = dnbd3_close_device(dev);
break;
case IOCTL_SWITCH:
- result = -EINVAL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
+ }
+ locked = 1;
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ } else if (msg == NULL) {
+ result = -EINVAL;
+ } else {
+ dnbd3_alt_server_t *alt_server;
+ struct sockaddr_storage new_addr;
+
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(&msg->hosts[0], dev);
+ if (alt_server == NULL) {
+ mutex_unlock(&dev->alt_servers_lock);
+ /* specified server is not known, so do not switch */
+ result = -ENOENT;
+ } else {
+ /* specified server is known, so try to switch to it */
+ new_addr = alt_server->host;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &new_addr)) {
+ /* specified server is current server, so do not switch */
+ result = 0;
+ } else {
+ dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
+ &new_addr);
+ result = dnbd3_new_connection(dev, &new_addr, false);
+ if (result != 0) {
+ /* switching didn't work */
+ result = -EAGAIN;
+ }
+ }
+ if (result == 0) {
+ /* fake RTT so we don't switch away again soon */
+ mutex_lock(&dev->alt_servers_lock);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ alt_server = &dev->alt_servers[i];
+ if (is_same_server(&alt_server->host, &new_addr)) {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ alt_server->rtts[j] = 1;
+ alt_server->best_count = 100;
+ } else {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ if (alt_server->rtts[j] < 500000)
+ alt_server->rtts[j] = 500000;
+ alt_server->best_count = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ }
+ }
+ }
break;
case IOCTL_ADD_SRV:
- case IOCTL_REM_SRV:
- if (dev->imgname == NULL)
- {
- result = -ENOENT;
+ case IOCTL_REM_SRV: {
+ struct sockaddr_storage addr;
+ dnbd3_host_t *host;
+
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ break;
}
- else if (dev->new_servers_num >= NUMBER_SERVERS)
- {
- result = -EAGAIN;
+ if (msg == NULL) {
+ result = -EINVAL;
+ break;
}
- else if (msg == NULL)
- {
+ host = &msg->hosts[0];
+ if (!dnbd3_host_to_sockaddr(host, &addr)) {
result = -EINVAL;
+ break;
}
- else
- {
- memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host));
- dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM
- ++dev->new_servers_num;
- result = 0;
+
+ if (cmd == IOCTL_ADD_SRV) {
+ result = dnbd3_add_server(dev, host);
+ if (result == -EEXIST)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr);
+ else if (result == -ENOSPC)
+ dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr);
+ } else { // IOCTL_REM_SRV
+ result = dnbd3_rem_server(dev, host);
+ if (result == -ENOENT)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr);
}
break;
-
+ }
case BLKFLSBUF:
result = 0;
break;
@@ -270,113 +282,325 @@ int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
}
cleanup_return:
- if (msg) kfree(msg);
- if (imgname) kfree(imgname);
+ kfree(msg);
+ kfree(imgname);
+ if (locked)
+ dnbd3_flag_reset(dev->connection_lock);
return result;
}
-/**
- * dev->blk_lock and q->queue_lock are being held
- * when this is called!
+static const struct block_device_operations dnbd3_blk_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = dnbd3_blk_ioctl,
+};
+
+static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add_tail(&rq->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+}
+
+/*
+ * Linux kernel blk-mq driver function (entry point) to handle block IO requests
*/
-void dnbd3_blk_request(struct request_queue *q)
+static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
- struct request *req;
- dnbd3_device_t *dev;
+ struct request *rq = bd->rq;
+ dnbd3_device_t *dev = rq->q->queuedata;
+ struct dnbd3_cmd *cmd;
- while ((req = blk_fetch_request(q)) != NULL)
- {
- dev = req->rq_disk->private_data;
+ if (dev->imgname == NULL || !device_active(dev))
+ return BLK_STS_IOERR;
- if (dev->imgname == NULL)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (req_op(rq) != REQ_OP_READ)
+ return BLK_STS_IOERR;
- if (!(dnbd3_req_fs(req)))
- {
- __blk_end_request_all(req, 0);
- continue;
- }
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
+ return BLK_STS_TIMEOUT;
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (rq_data_dir(rq) != READ)
+ return BLK_STS_NOTSUPP;
- if (!(dnbd3_req_read(req)))
- {
- __blk_end_request_all(req, -EACCES);
- continue;
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32);
+ blk_mq_start_request(rq);
+ dnbd3_add_queue(dev, rq);
+ return BLK_STS_OK;
+}
+
+static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ , bool reserved
+#endif
+ )
+{
+ unsigned long irqflags;
+ struct request *rq_iter;
+ bool found = false;
+ dnbd3_device_t *dev = req->q->queuedata;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->send_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // If still in send queue, do nothing
+ if (found)
+ return BLK_EH_RESET_TIMER;
+
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ list_del_init(&req->queuelist);
+ break;
}
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (!found) {
+ dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n");
+ // Assume it was finished concurrently
+ return BLK_EH_DONE;
+ }
+ // Add to send queue again and trigger work, reset timeout
+ dnbd3_add_queue(dev, req);
+ return BLK_EH_RESET_TIMER;
+}
+
+static
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+const
+#endif
+struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+ .timeout = dnbd3_rq_timeout,
+};
+
+int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+{
+ int ret;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->index = minor;
+ // lock for imgname, cur_server etc.
+ spin_lock_init(&dev->blk_lock);
+ spin_lock_init(&dev->send_queue_lock);
+ spin_lock_init(&dev->recv_queue_lock);
+ INIT_LIST_HEAD(&dev->send_queue);
+ INIT_LIST_HEAD(&dev->recv_queue);
+ dnbd3_flag_reset(dev->connection_lock);
+ dnbd3_flag_reset(dev->discover_running);
+ mutex_init(&dev->alt_servers_lock);
+ dnbd3_net_work_init(dev);
+
+ // memset has done this already but I like initial values to be explicit
+ dev->imgname = NULL;
+ dev->rid = 0;
+ dev->update_available = false;
+ dev->panic = false;
+ dev->panic_count = 0;
+ dev->reported_size = 0;
+
+ // set up tag_set for blk-mq
+ dev->tag_set.ops = &dnbd3_mq_ops;
+ dev->tag_set.nr_hw_queues = 1;
+ dev->tag_set.queue_depth = 128;
+ dev->tag_set.numa_node = NUMA_NO_NODE;
+ dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
+ dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tag_set.driver_data = dev;
+ dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ;
+
+ ret = blk_mq_alloc_tag_set(&dev->tag_set);
+ if (ret) {
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_tag_set failed\n");
+ goto out;
+ }
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set up blk-mq and disk
+ dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev);
+ if (IS_ERR(dev->disk)) {
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_disk failed\n");
+ ret = PTR_ERR(dev->disk);
+ goto out_cleanup_tags;
+ }
+ dev->queue = dev->disk->queue;
+#else
+ // set up blk-mq
+ dev->queue = blk_mq_init_queue(&dev->tag_set);
+ if (IS_ERR(dev->queue)) {
+ ret = PTR_ERR(dev->queue);
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_init_queue failed\n");
+ goto out_cleanup_tags;
+ }
+ dev->queue->queuedata = dev;
+#endif
+
+ blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+ blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue);
+#else
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue);
+#endif
+#define ONE_MEG (1048576)
+ blk_queue_max_segment_size(dev->queue, ONE_MEG);
+ blk_queue_max_segments(dev->queue, 0xffff);
+ blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
+ dev->queue->limits.max_sectors = 256;
+#undef ONE_MEG
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ // set up disk
+ dev->disk = alloc_disk(1);
+ if (!dev->disk) {
+ dev_err(dnbd3_device_to_dev(dev), "alloc_disk failed\n");
+ ret = -ENOMEM;
+ goto out_cleanup_queue;
+ }
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) \
+ || (LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 132)) \
+ || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ dev->disk->flags |= GENHD_FL_NO_PART;
+#else
+ dev->disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+ dev->disk->major = major;
+ dev->disk->first_minor = minor;
+ dev->disk->minors = 1;
+ dev->disk->fops = &dnbd3_blk_ops;
+ dev->disk->private_data = dev;
+ dev->disk->queue = dev->queue;
+ sprintf(dev->disk->disk_name, "dnbd%d", minor);
+ set_capacity(dev->disk, 0);
+ set_disk_ro(dev->disk, 1);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) \
+ || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ ret = add_disk(dev->disk);
+ if (ret != 0)
+ goto out_cleanup_queue;
+#else
+ add_disk(dev->disk);
+#endif
+
+ // set up sysfs
+ dnbd3_sysfs_init(dev);
+
+ return 0;
+
+out_cleanup_queue:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ blk_cleanup_queue(dev->queue);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ blk_cleanup_disk(dev->disk);
+#else
+ put_disk(dev->disk);
+#endif
+out_cleanup_tags:
+ blk_mq_free_tag_set(&dev->tag_set);
+out:
+ mutex_destroy(&dev->alt_servers_lock);
+ return ret;
+}
+
+int dnbd3_blk_del_device(dnbd3_device_t *dev)
+{
+ while (!dnbd3_flag_get(dev->connection_lock))
+ schedule();
+ dnbd3_close_device(dev);
+ dnbd3_sysfs_exit(dev);
+ del_gendisk(dev->disk);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ blk_cleanup_queue(dev->queue);
+ put_disk(dev->disk);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ blk_cleanup_disk(dev->disk);
+#else
+ put_disk(dev->disk);
+#endif
+ blk_mq_free_tag_set(&dev->tag_set);
+ mutex_destroy(&dev->alt_servers_lock);
+ return 0;
+}
+
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev)
+{
+ struct request *blk_request;
+ unsigned long flags;
+ struct list_head local_copy;
+ int count = 0;
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- spin_unlock_irq(q->queue_lock);
- wake_up(&dev->process_queue_send);
- spin_lock_irq(q->queue_lock);
+ INIT_LIST_HEAD(&local_copy);
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ list_add_tail(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
}
+ // Do this even if we didn't move anything from the recv list to the send
+ // list. It might have already contained something, which needs to be
+ // re-requested anyway if this was called because of a server switch.
+ spin_lock_irqsave(&dev->blk_lock, flags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, flags);
}
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
{
- struct request *blk_request, *tmp_request;
- struct request *blk_request2, *tmp_request2;
+ struct request *blk_request;
unsigned long flags;
struct list_head local_copy;
- int dup;
+ int count = 0;
+
INIT_LIST_HEAD(&local_copy);
- spin_lock_irqsave(&dev->blk_lock, flags);
- while (!list_empty(&dev->request_queue_receive))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- while (!list_empty(&dev->request_queue_send))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_send, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ while (!list_empty(&dev->send_queue)) {
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- list_for_each_entry_safe(blk_request, tmp_request, &local_copy, queuelist)
- {
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
list_del_init(&blk_request->queuelist);
- if (dnbd3_req_fs(blk_request))
- {
- spin_lock_irqsave(&dev->blk_lock, flags);
- __blk_end_request_all(blk_request, -EIO);
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- }
- else if (dnbd3_req_special(blk_request))
- {
- kfree(blk_request);
- }
+ blk_mq_end_request(blk_request, BLK_STS_IOERR);
}
}
diff --git a/src/kernel/blk.h b/src/kernel/blk.h
index 5091d19..c6dcb8d 100644
--- a/src/kernel/blk.h
+++ b/src/kernel/blk.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,22 +22,17 @@
#ifndef BLK_H_
#define BLK_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define REQ_TYPE_SPECIAL REQ_TYPE_DRV_PRIV
-#endif
-
-extern struct block_device_operations dnbd3_blk_ops;
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
-
-void dnbd3_blk_request(struct request_queue *q);
+// The device has been set up via IOCTL_OPEN and hasn't been closed yet
+#define device_active(dev) ((dev)->reported_size != 0)
int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor);
int dnbd3_blk_del_device(dnbd3_device_t *dev);
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev);
+
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev);
#endif /* BLK_H_ */
diff --git a/src/kernel/core.c b/src/kernel/core.c
deleted file mode 100644
index 69a2540..0000000
--- a/src/kernel/core.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include "clientconfig.h"
-#include "dnbd3.h"
-#include "blk.h"
-
-int major;
-static unsigned int max_devs = NUMBER_DEVICES;
-static dnbd3_device_t *dnbd3_device;
-
-static int __init dnbd3_init(void)
-{
- int i;
-
- dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL);
- if (!dnbd3_device)
- return -ENOMEM;
-
- // initialize block device
- if ((major = register_blkdev(0, "dnbd3")) == 0)
- {
- printk("ERROR: dnbd3 register_blkdev failed.\n");
- return -EIO;
- }
-
- printk("DNBD3 kernel module loaded. Machine type: " ENDIAN_MODE "\n");
-
- // add MAX_NUMBER_DEVICES devices
- for (i = 0; i < max_devs; i++)
- {
- if (dnbd3_blk_add_device(&dnbd3_device[i], i) != 0)
- {
- printk("ERROR: adding device failed.\n");
- return -EIO; // TODO: delete all devices added so far. it could happen that it's not the first one that fails. also call unregister_blkdev and free memory
- }
- }
-
- printk("INFO: dnbd3 init successful (%i devices).\n", max_devs);
- return 0;
-}
-
-static void __exit dnbd3_exit(void)
-{
- int i;
-
- for (i = 0; i < max_devs; i++)
- {
- dnbd3_blk_del_device(&dnbd3_device[i]);
- }
-
- unregister_blkdev(major, "dnbd3");
- kfree(dnbd3_device);
- printk("INFO: dnbd3 exit.\n");
-}
-
-module_init( dnbd3_init);
-module_exit( dnbd3_exit);
-
-MODULE_DESCRIPTION("Distributed Network Block Device 3");
-MODULE_LICENSE("GPL");
-
-module_param(max_devs, int, 0444);
-MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h
deleted file mode 100644
index f8af69f..0000000
--- a/src/kernel/dnbd3.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef DNBD_H_
-#define DNBD_H_
-
-#include <linux/version.h>
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <net/sock.h>
-
-#define KERNEL_MODULE
-#include "config.h"
-#include "types.h"
-#include "serialize.h"
-
-extern int major;
-
-typedef struct
-{
- dnbd3_host_t host;
- unsigned long rtts[4]; // Last four round trip time measurements in µs
- uint16_t protocol_version; // dnbd3 protocol version of this server
- uint8_t failures; // How many times the server was unreachable
-} dnbd3_server_t;
-
-typedef struct
-{
- // block
- struct gendisk *disk;
- spinlock_t blk_lock;
-
- // sysfs
- struct kobject kobj;
-
- // network
- char *imgname;
- struct socket *sock;
- dnbd3_server_t cur_server, initial_server;
- unsigned long cur_rtt;
- serialized_buffer_t payload_buffer;
- dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers
- int new_servers_num; // number of new alt servers that are waiting to be copied to above array
- dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers
- uint8_t discover, panic, disconnecting, update_available, panic_count;
- uint8_t use_server_provided_alts;
- uint16_t rid;
- uint32_t heartbeat_count;
- uint64_t reported_size;
- // server switch
- struct socket *better_sock;
-
- // process
- struct task_struct * thread_send;
- struct task_struct * thread_receive;
- struct task_struct *thread_discover;
- struct timer_list hb_timer;
- wait_queue_head_t process_queue_send;
- wait_queue_head_t process_queue_receive;
- wait_queue_head_t process_queue_discover;
- struct list_head request_queue_send;
- struct list_head request_queue_receive;
-
-} dnbd3_device_t;
-
-#endif /* DNBD_H_ */
diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c
new file mode 100644
index 0000000..cb42567
--- /dev/null
+++ b/src/kernel/dnbd3_main.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <dnbd3/config/client.h>
+#include <dnbd3/version.h>
+#include <net/ipv6.h>
+#include "dnbd3_main.h"
+#include "blk.h"
+
+int major;
+static unsigned int max_devs = NUMBER_DEVICES;
+static dnbd3_device_t *dnbd3_devices;
+
+struct device *dnbd3_device_to_dev(dnbd3_device_t *dev)
+{
+ return disk_to_dev(dev->disk);
+}
+
+int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest)
+{
+ struct sockaddr_in *sin4;
+ struct sockaddr_in6 *sin6;
+
+ memset(dest, 0, sizeof(*dest));
+ if (host->type == HOST_IP4) {
+ sin4 = (struct sockaddr_in *)dest;
+ sin4->sin_family = AF_INET;
+ memcpy(&(sin4->sin_addr), host->addr, 4);
+ sin4->sin_port = host->port;
+ } else if (host->type == HOST_IP6) {
+ sin6 = (struct sockaddr_in6 *)dest;
+ sin6->sin6_family = AF_INET6;
+ memcpy(&(sin6->sin6_addr), host->addr, 16);
+ sin6->sin6_port = host->port;
+ } else
+ return 0;
+ return 1;
+}
+
+int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y)
+{
+ if (x->ss_family != y->ss_family)
+ return 0;
+ switch (x->ss_family) {
+ case AF_INET: {
+ const struct sockaddr_in *sinx = (const struct sockaddr_in *)x;
+ const struct sockaddr_in *siny = (const struct sockaddr_in *)y;
+
+ if (sinx->sin_port != siny->sin_port)
+ return 0;
+ if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x;
+ const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y;
+
+ if (sinx->sin6_port != siny->sin6_port)
+ return 0;
+ if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+ return 0;
+ break;
+ }
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Get a free slot pointer from the alt_servers list. Tries to find an
+ * entirely empty slot first, then looks for a slot with a server that
+ * wasn't reachable recently, finally returns NULL if none of the
+ * conditions match.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+static dnbd3_alt_server_t *get_free_alt_server(dnbd3_device_t *const dev)
+{
+ int i;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
+ return &dev->alt_servers[i];
+ }
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].failures > 10)
+ return &dev->alt_servers[i];
+ }
+ return NULL;
+}
+
+dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev)
+{
+ int i;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (is_same_server(addr, &dev->alt_servers[i].host))
+ return &dev->alt_servers[i];
+ }
+ return NULL;
+}
+
+/**
+ * Returns pointer to existing entry in alt_servers that matches the given
+ * alt server, or NULL if not found.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev)
+{
+ struct sockaddr_storage addr;
+
+ if (!dnbd3_host_to_sockaddr(host, &addr))
+ return NULL;
+ return get_existing_alt_from_addr(&addr, dev);
+}
+
+int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+ int result;
+ dnbd3_alt_server_t *alt_server;
+
+ if (host->type != HOST_IP4 && host->type != HOST_IP6)
+ return -EINVAL;
+
+ /* protect access to 'alt_servers' */
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(host, dev);
+ // ADD
+ if (alt_server != NULL) {
+ // Exists
+ result = -EEXIST;
+ } else {
+ // OK add
+ alt_server = get_free_alt_server(dev);
+ if (alt_server == NULL) {
+ result = -ENOSPC;
+ } else {
+ dnbd3_host_to_sockaddr(host, &alt_server->host);
+ alt_server->protocol_version = 0;
+ alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2]
+ = alt_server->rtts[3] = RTT_UNREACHABLE;
+ alt_server->failures = 0;
+ alt_server->best_count = 0;
+ result = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ return result;
+}
+
+int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+ dnbd3_alt_server_t *alt_server;
+ int result;
+
+ /* protect access to 'alt_servers' */
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(host, dev);
+ // REMOVE
+ if (alt_server == NULL) {
+ // Not found
+ result = -ENOENT;
+ } else {
+ // Remove
+ alt_server->host.ss_family = 0;
+ result = 0;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ return result;
+}
+
+static int __init dnbd3_init(void)
+{
+ int i;
+
+ dnbd3_devices = kcalloc(max_devs, sizeof(*dnbd3_devices), GFP_KERNEL);
+ if (!dnbd3_devices)
+ return -ENOMEM;
+
+ // initialize block device
+ major = register_blkdev(0, "dnbd3");
+ if (major == 0) {
+ pr_err("register_blkdev failed\n");
+ return -EIO;
+ }
+
+ pr_info("kernel module in version %s loaded\n", DNBD3_VERSION);
+ pr_debug("machine type %s\n", DNBD3_ENDIAN_MODE);
+
+ // add max_devs devices (defaults to NUMBER_DEVICES)
+ for (i = 0; i < max_devs; i++) {
+ if (dnbd3_blk_add_device(&dnbd3_devices[i], i) != 0) {
+ pr_err("dnbd3_blk_add_device failed\n");
+ // TODO: delete all devices added so far.
+ // It could happen that it's not the first one that fails.
+ // Also call unregister_blkdev and free memory.
+ return -EIO;
+ }
+ }
+
+ pr_info("init successful (%i devices)\n", max_devs);
+
+ return 0;
+}
+
+static void __exit dnbd3_exit(void)
+{
+ int i;
+
+ pr_debug("exiting kernel module...\n");
+ for (i = 0; i < max_devs; i++)
+ dnbd3_blk_del_device(&dnbd3_devices[i]);
+
+ unregister_blkdev(major, "dnbd3");
+ kfree(dnbd3_devices);
+
+ pr_info("exit kernel module done\n");
+}
+
+module_init(dnbd3_init);
+module_exit(dnbd3_exit);
+
+MODULE_DESCRIPTION("Distributed Network Block Device 3");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DNBD3_VERSION);
+
+module_param(max_devs, int, 0444);
+MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h
new file mode 100644
index 0000000..a932ba2
--- /dev/null
+++ b/src/kernel/dnbd3_main.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef DNBD_H_
+#define DNBD_H_
+
+#include <dnbd3/config/client.h>
+
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/blk-mq.h>
+
+#if defined(RHEL_RELEASE_CODE) && defined(RHEL_RELEASE_VERSION) // only the existence of both macros is tested here
+#define RHEL_CHECK_VERSION(CONDITION) (CONDITION) // both macros exist: the condition is evaluated as written
+#else
+#define RHEL_CHECK_VERSION(CONDITION) (0) // macros missing: always false, CONDITION is never expanded
+#endif
+
+extern int major;
+
+typedef struct { // One entry of dnbd3_device_t.alt_servers
+ unsigned long rtts[DISCOVER_HISTORY_SIZE]; // Last X round trip time measurements in µs
+ uint16_t protocol_version; // dnbd3 protocol version of this server
+ uint8_t failures; // How many times the server was unreachable
+ uint8_t best_count; // Number of times server measured best
+ struct sockaddr_storage host; // Address of server; ss_family == 0 marks the entry as empty
+} dnbd3_alt_server_t;
+
+typedef struct { // State of one dnbd3 block device (one element of the 'dnbd3_devices' array)
+ // block
+ int index;
+ struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
+ struct request_queue *queue;
+ spinlock_t blk_lock;
+
+ // sysfs
+ struct kobject kobj;
+
+ char *imgname;
+ uint16_t rid;
+ struct socket *sock;
+ struct { // use blk_lock
+ unsigned long rtt;
+ struct sockaddr_storage host;
+ uint16_t protocol_version;
+ } cur_server;
+ serialized_buffer_t payload_buffer;
+ struct mutex alt_servers_lock; // protects access to 'alt_servers'
+ dnbd3_alt_server_t alt_servers[NUMBER_SERVERS];
+ bool use_server_provided_alts;
+ bool panic; // panic mode; while set, discovery switches to the first responding server
+ u8 panic_count;
+ bool update_available;
+ atomic_t connection_lock; // flag, used through the dnbd3_flag_* macros
+ // Size of image/device - this is 0 if the device is not in use,
+ // otherwise this is also the value we expect from alt servers.
+ uint64_t reported_size;
+ struct delayed_work keepalive_work; // handler sends CMD_KEEPALIVE and re-arms itself
+
+ // sending
+ struct workqueue_struct *send_wq;
+ spinlock_t send_queue_lock;
+ struct list_head send_queue;
+ struct mutex send_mutex;
+ struct work_struct send_work;
+ // receiving
+ struct workqueue_struct *recv_wq;
+ spinlock_t recv_queue_lock;
+ struct list_head recv_queue;
+ struct mutex recv_mutex;
+ struct work_struct recv_work;
+ // discover
+ atomic_t discover_running; // flag (dnbd3_flag_* macros): set while a discovery run is in progress
+ struct delayed_work discover_work;
+ u32 discover_interval; // delay until the next discovery run, in seconds (multiplied by HZ when queued)
+ u32 discover_count; // incremented after every discovery run
+
+} dnbd3_device_t;
+
+struct dnbd3_cmd { // NOTE(review): not used in this header - presumably per-request driver data; confirm in blk.c
+ u64 handle;
+};
+
+extern inline struct device *dnbd3_device_to_dev(dnbd3_device_t *dev); // yields the struct device the dev_dbg/dev_info/dev_err logging macros print for
+
+extern inline int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y); // used as a boolean by callers; NOTE(review): 'extern inline' without a body here - confirm the definition is visible to every user
+
+extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev); // NULL if no entry matches; dnbd3_rem_server() calls it with alt_servers_lock held
+
+extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev);
+
+extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
+extern int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host); // 0 on success, -ENOENT if 'host' is not in the alt server list
+
+#define dnbd3_flag_get(x) (atomic_cmpxchg(&(x), 0, 1) == 0) // try to take the flag: atomically 0 -> 1, true only if it was 0 before
+#define dnbd3_flag_reset(x) atomic_set(&(x), 0) // release the flag again
+#define dnbd3_flag_taken(x) (atomic_read(&(x)) != 0) // check the flag without changing it
+
+/*
+ * shims for making older kernels look like the current one, if possible, to avoid too
+ * much inline #ifdef which makes code harder to read.
+ *
+ * Each group is only defined when LINUX_VERSION_CODE is below the version
+ * named in its #if.
+ */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define BLK_EH_DONE BLK_EH_NOT_HANDLED
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)
+#define blk_status_t int
+#define BLK_STS_OK 0
+#define BLK_STS_IOERR (-EIO)
+#define BLK_STS_TIMEOUT (-ETIME) // NOTE(review): newer kernels map BLK_STS_TIMEOUT to -ETIMEDOUT - confirm -ETIME is intended
+#define BLK_STS_NOTSUPP (-ENOTSUPP) // NOTE(review): newer kernels map BLK_STS_NOTSUPP to -EOPNOTSUPP - confirm -ENOTSUPP is intended
+#endif
+
+#endif /* DNBD_H_ */
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 9e48b86..5ef4016 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,1106 +19,1112 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "net.h"
#include "blk.h"
-#include "utils.h"
+#include "dnbd3_main.h"
-#include "serialize.h"
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/random.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
+#define get_random_u32 prandom_u32
+#endif
#include <linux/time.h>
-#include <linux/signal.h>
+#include <linux/ktime.h>
+#include <linux/tcp.h>
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
-#else
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern((af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
+#ifndef ktime_to_s
+#define ktime_to_s(kt) ktime_divns(kt, NSEC_PER_SEC)
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-// cmd_flags and cmd_type are merged into cmd_flags now
-#if REQ_FLAG_BITS > 24
-#error "Fix CMD bitshift"
-#endif
-// Pack into cmd_flags field by shifting CMD_* into unused bits of cmd_flags
-#define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = REQ_OP_DRV_IN | ((cmd) << REQ_FLAG_BITS)
-#define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS)
-#define dnbd3_req_op(req) req_op(req)
-#define DNBD3_DEV_READ REQ_OP_READ
-#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN
+#ifdef DEBUG
+#define ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+ } while (0)
#else
-// Old way with type and flags separated
-#define dnbd3_cmd_to_priv(req, cmd) do { \
- (req)->cmd_type = REQ_TYPE_SPECIAL; \
- (req)->cmd_flags = (cmd); \
-} while (0)
-#define dnbd3_priv_to_cmd(req) (req)->cmd_flags
-#define dnbd3_req_op(req) (req)->cmd_type
-#define DNBD3_DEV_READ REQ_TYPE_FS
-#define DNBD3_REQ_OP_SPECIAL REQ_TYPE_SPECIAL
+#define ASSERT(x) \
+ do { \
+ } while (0)
#endif
-/**
- * Some macros for easier debug output. Location in source-code
- * as well as server IP:port info will be printed.
- * The error_* macros include a "goto error;" at the end
- */
-#if 1 // Change to 0 to disable debug messages
-#define debug_print_va_host(_host, _fmt, ...) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " _fmt " (%s, %pI4:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " _fmt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_va_host(_host, _fmt, ...) do { \
- debug_print_va_host(_host, _fmt, __VA_ARGS__); \
- goto error; \
-} while(0)
-#define debug_dev_va(_fmt, ...) debug_print_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define error_dev_va(_fmt, ...) debug_error_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define debug_alt_va(_fmt, ...) debug_print_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-#define error_alt_va(_fmt, ...) debug_error_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-
-#define debug_print_host(_host, txt) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " txt " (%s, %pI4:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " txt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_host(_host, txt) do { \
- debug_print_host(_host, txt); \
- goto error; \
-} while(0)
-#define debug_dev(txt) debug_print_host(dev->cur_server.host, txt)
-#define error_dev(txt) debug_error_host(dev->cur_server.host, txt)
-#define debug_alt(txt) debug_print_host(dev->alt_servers[i].host, txt)
-#define error_alt(txt) debug_error_host(dev->alt_servers[i].host, txt)
-
-#else // Silent
-#define debug_dev(x) do { } while(0)
-#define error_dev(x) goto error
-#define debug_dev_va(x, ...) do { } while(0)
-#define error_dev_va(x, ...) goto error
-#define debug_alt(x) do { } while(0)
-#define error_alt(x) goto error
-#define debug_alt_va(x, ...) do { } while(0)
-#define error_alt_va(x, ...) goto error
-#endif
+#define dnbd3_dev_dbg_host(dev, host, fmt, ...) \
+ dev_dbg(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__) // 'host' must be a pointer to a sockaddr; every message is prefixed with that address
+#define dnbd3_dev_info_host(dev, host, fmt, ...) \
+ dev_info(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_err_host(dev, host, fmt, ...) \
+ dev_err(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
-static inline int is_same_server(const dnbd3_server_t * const a, const dnbd3_server_t * const b)
-{
- return (a->host.type == b->host.type) && (a->host.port == b->host.port)
- && (0 == memcmp(a->host.addr, b->host.addr, (a->host.type == HOST_IP4 ? 4 : 16)));
-}
+#define dnbd3_dev_dbg_cur(dev, fmt, ...) \
+ dnbd3_dev_dbg_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__) // *_cur variants log with the address of the device's current server
+#define dnbd3_dev_info_cur(dev, fmt, ...) \
+ dnbd3_dev_info_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_err_cur(dev, fmt, ...) \
+ dnbd3_dev_err_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
-static inline dnbd3_server_t *get_existing_server(const dnbd3_server_entry_t * const newserver,
- dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if ((newserver->host.type == dev->alt_servers[i].host.type)
- && (newserver->host.port == dev->alt_servers[i].host.port)
- && (0
- == memcmp(newserver->host.addr, dev->alt_servers[i].host.addr, (newserver->host.type == HOST_IP4 ? 4 : 16))))
- {
- return &dev->alt_servers[i];
- break;
- }
- }
- return NULL ;
-}
-
-static inline dnbd3_server_t *get_free_alt_server(dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == 0)
- return &dev->alt_servers[i];
- }
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].failures > 10)
- return &dev->alt_servers[i];
- }
- return NULL ;
-}
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes);
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count);
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr);
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size);
-int dnbd3_net_connect(dnbd3_device_t *dev)
-{
- struct request *req1 = NULL;
- struct timeval timeout;
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, u16 protocol_version);
- if (dev->disconnecting) {
- debug_dev("CONNECT: Still disconnecting!!!\n");
- while (dev->disconnecting)
- schedule();
- }
- if (dev->thread_receive != NULL) {
- debug_dev("CONNECT: Still receiving!!!\n");
- while (dev->thread_receive != NULL)
- schedule();
- }
- if (dev->thread_send != NULL) {
- debug_dev("CONNECT: Still sending!!!\n");
- while (dev->thread_send != NULL)
- schedule();
- }
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket **sock_out);
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA;
- timeout.tv_usec = 0;
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_image_info);
- // do some checks before connecting
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket *sock);
- req1 = kmalloc(sizeof(*req1), GFP_ATOMIC );
- if (!req1)
- error_dev("FATAL: Kmalloc(1) failed.");
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd);
- if (dev->cur_server.host.port == 0 || dev->cur_server.host.type == 0 || dev->imgname == NULL )
- error_dev("FATAL: Host, port or image name not set.");
- if (dev->sock)
- error_dev("ERROR: Already connected.");
-
- if (dev->cur_server.host.type != HOST_IP4 && dev->cur_server.host.type != HOST_IP6)
- error_dev_va("ERROR: Unknown address type %d", (int)dev->cur_server.host.type);
-
- debug_dev("INFO: Connecting...");
-
- if (dev->better_sock == NULL )
- {
- // no established connection yet from discovery thread, start new one
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov[2];
- uint16_t rid;
- char *name;
- int mlen;
- init_msghdr(msg);
-
- if (dnbd3_sock_create(dev->cur_server.host.type, SOCK_STREAM, IPPROTO_TCP, &dev->sock) < 0)
- error_dev("ERROR: Couldn't create socket (v6).");
-
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- dev->sock->sk->sk_allocation = GFP_NOIO;
- if (dev->cur_server.host.type == HOST_IP4)
- {
- struct sockaddr_in sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- memcpy(&(sin.sin_addr), dev->cur_server.host.addr, 4);
- sin.sin_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v4)");
- }
- else
- {
- struct sockaddr_in6 sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin6_family = AF_INET6;
- memcpy(&(sin.sin6_addr), dev->cur_server.host.addr, 16);
- sin.sin6_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v6)");
- }
- // Request filesize
- dnbd3_request.magic = dnbd3_packet_magic;
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(&dev->payload_buffer);
- serializer_put_uint16(&dev->payload_buffer, PROTOCOL_VERSION);
- serializer_put_string(&dev->payload_buffer, dev->imgname);
- serializer_put_uint16(&dev->payload_buffer, dev->rid);
- serializer_put_uint8(&dev->payload_buffer, 0); // is_server = false
- iov[1].iov_base = &dev->payload_buffer;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&dev->payload_buffer);
- fixup_request(dnbd3_request);
- mlen = sizeof(dnbd3_request) + iov[1].iov_len;
- if (kernel_sendmsg(dev->sock, &msg, iov, 2, mlen) != mlen)
- error_dev("ERROR: Couldn't send CMD_SIZE_REQUEST.");
- // receive reply header
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_dev("FATAL: Received corrupted reply header after CMD_SIZE_REQUEST.");
- // check reply header
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 3 || dnbd3_reply.size > MAX_PAYLOAD
- || dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("FATAL: Received invalid reply to CMD_SIZE_REQUEST, image doesn't exist on server.");
- // receive reply payload
- iov[0].iov_base = &dev->payload_buffer;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_dev("FATAL: Cold not read CMD_SELECT_IMAGE payload on handshake.");
- // handle/check reply payload
- serializer_reset_read(&dev->payload_buffer, dnbd3_reply.size);
- dev->cur_server.protocol_version = serializer_get_uint16(&dev->payload_buffer);
- if (dev->cur_server.protocol_version < MIN_SUPPORTED_SERVER)
- error_dev("FATAL: Server version is lower than min supported version.");
- name = serializer_get_string(&dev->payload_buffer);
- if (dev->rid != 0 && strcmp(name, dev->imgname) != 0)
- error_dev_va("FATAL: Server offers image '%s', requested '%s'", name, dev->imgname);
- if (strlen(dev->imgname) < strlen(name))
- {
- dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_ATOMIC );
- if (dev->imgname == NULL )
- error_dev("FATAL: Reallocating buffer for new image name failed");
- }
- strcpy(dev->imgname, name);
- rid = serializer_get_uint16(&dev->payload_buffer);
- if (dev->rid != 0 && dev->rid != rid)
- error_dev_va("FATAL: Server provides rid %d, requested was %d.", (int)rid, (int)dev->rid);
- dev->rid = rid;
- dev->reported_size = serializer_get_uint64(&dev->payload_buffer);
- if (dev->reported_size < 4096)
- error_dev("ERROR: Reported size by server is < 4096");
- // store image information
- set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
- debug_dev_va("INFO: Filesize: %llu.", dev->reported_size);
- dev->update_available = 0;
- }
- else // Switching server, connection is already established and size request was executed
- {
- debug_dev("INFO: On-the-fly server change.");
- dev->sock = dev->better_sock;
- dev->better_sock = NULL;
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- }
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic);
- dev->panic = 0;
- dev->panic_count = 0;
+static void dnbd3_discover(dnbd3_device_t *dev);
- // Enqueue request to request_queue_send for a fresh list of alt servers
- dnbd3_cmd_to_priv(req1, CMD_GET_SERVERS);
- list_add(&req1->queuelist, &dev->request_queue_send);
+static void dnbd3_internal_discover(dnbd3_device_t *dev);
- // create required threads
- dev->thread_send = kthread_create(dnbd3_net_send, dev, dev->disk->disk_name);
- dev->thread_receive = kthread_create(dnbd3_net_receive, dev, dev->disk->disk_name);
- dev->thread_discover = kthread_create(dnbd3_net_discover, dev, dev->disk->disk_name);
- // start them up
- wake_up_process(dev->thread_send);
- wake_up_process(dev->thread_receive);
- wake_up_process(dev->thread_discover);
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms);
- wake_up(&dev->process_queue_send);
+// Use as write-only dump, don't care about race conditions etc.
+static u8 __garbage_mem[PAGE_SIZE];
- // add heartbeat timer
- dev->heartbeat_count = 0;
+/**
+ * Delayed work triggering sending of keepalive packet.
+ *
+ * Sends one CMD_KEEPALIVE request for the device that embeds 'work' and,
+ * if the device is still active, queues itself again to run after
+ * KEEPALIVE_INTERVAL seconds. The re-queue happens with blk_lock held.
+ */
+static void dnbd3_keepalive_workfn(struct work_struct *work)
+{
+ unsigned long irqflags;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, keepalive_work.work);
-// init_timer_key changed from kernel version 4.14 to 4.15, see and compare to 4.15:
-// https://elixir.bootlin.com/linux/v4.14.32/source/include/linux/timer.h#L98
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
- timer_setup(&dev->hb_timer, dnbd3_net_heartbeat, 0);
-#else
- // Old timer setup
- init_timer(&dev->hb_timer);
- dev->hb_timer.data = (unsigned long)dev;
- dev->hb_timer.function = dnbd3_net_heartbeat;
-#endif
- dev->hb_timer.expires = jiffies + HZ;
- add_timer(&dev->hb_timer);
- return 0;
- error: ;
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ dnbd3_send_empty_request(dev, CMD_KEEPALIVE); // NOTE(review): result is not checked - presumably best effort; confirm
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->keepalive_work, KEEPALIVE_INTERVAL * HZ);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
- if (req1)
- kfree(req1);
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-int dnbd3_net_disconnect(dnbd3_device_t *dev)
+/**
+ * Delayed work triggering discovery (alt server check)
+ *
+ * Thin work-queue adapter: recovers the device that embeds 'work' as its
+ * discover_work member and hands it to dnbd3_discover().
+ */
+static void dnbd3_discover_workfn(struct work_struct *work)
{
- if (dev->disconnecting)
- return 0;
-
- if (dev->cur_server.host.port)
- debug_dev("INFO: Disconnecting device.");
-
- dev->disconnecting = 1;
-
- // clear heartbeat timer
- del_timer(&dev->hb_timer);
-
- dev->discover = 0;
-
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
-
- // kill sending and receiving threads
- if (dev->thread_send)
- {
- kthread_stop(dev->thread_send);
- }
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, discover_work.work);
- if (dev->thread_receive)
- {
- kthread_stop(dev->thread_receive);
- }
+ dnbd3_discover(dev);
+}
- if (dev->thread_discover)
- {
- kthread_stop(dev->thread_discover);
- dev->thread_discover = NULL;
- }
+/**
+ * For manually triggering an immediate discovery
+ *
+ * Does nothing if the device is not active. If 'panic' is true and the
+ * connection_lock flag can be taken, the device is put into panic mode
+ * (unless it already is) and discover_interval is set to
+ * TIMER_INTERVAL_PROBE_PANIC. In every other case only the scheduling
+ * below happens: the discover work is (re)queued with a delay of 1 jiffy.
+ */
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic)
+{
+ unsigned long irqflags;
- // clear socket
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ if (!device_active(dev))
+ return;
+ if (panic && dnbd3_flag_get(dev->connection_lock)) {
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (!dev->panic) {
+ // Panic freshly turned on
+ dev->panic = true;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_PANIC;
+ }
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->connection_lock);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
-
- dev->disconnecting = 0;
-
- return 0;
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, 1);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg)
-{
- dnbd3_device_t *dev = (dnbd3_device_t *)container_of(arg, dnbd3_device_t, hb_timer);
-#else
-void dnbd3_net_heartbeat(unsigned long arg)
+/**
+ * Wrapper for the actual discover function below. Check run conditions
+ * here and re-schedule delayed task here.
+ *
+ * Returns without doing anything if the device is not active, if the
+ * connection_lock flag is taken, or if another discovery run already holds
+ * the discover_running flag. Otherwise any pending discover work is
+ * cancelled, dnbd3_internal_discover() is run and discover_count is
+ * incremented. If the device is still active afterwards, the work is queued
+ * again after discover_interval seconds. Once discover_count exceeds
+ * DISCOVER_STARTUP_PHASE_COUNT, the interval grows by 2 per run while it is
+ * below TIMER_INTERVAL_PROBE_MAX.
+ */
+static void dnbd3_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = (dnbd3_device_t *)arg;
-#endif
- // Because different events need different intervals, the timer is called once a second.
- // Other intervals can be derived using dev->heartbeat_count.
-#define timeout_seconds(x) (dev->heartbeat_count % (x) == 0)
-
- if (!dev->panic)
- {
- if (timeout_seconds(TIMER_INTERVAL_KEEPALIVE_PACKET))
- {
- struct request *req = kmalloc(sizeof(struct request), GFP_ATOMIC );
- // send keepalive
- if (req)
- {
- dnbd3_cmd_to_priv(req, CMD_KEEPALIVE);
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- wake_up(&dev->process_queue_send);
- }
- else
- {
- debug_dev("ERROR: Couldn't create keepalive request.");
- }
- }
- if ((dev->heartbeat_count > STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_NORMAL))
- || (dev->heartbeat_count <= STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_STARTUP)))
- {
- // Normal discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ unsigned long irqflags;
+
+ if (!device_active(dev) || dnbd3_flag_taken(dev->connection_lock))
+ return; // device not active anymore, or just about to switch
+ if (!dnbd3_flag_get(dev->discover_running))
+ return; // Already busy
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ cancel_delayed_work(&dev->discover_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_internal_discover(dev);
+ dev->discover_count++;
+ // Re-queueing logic
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, dev->discover_interval * HZ); // queued with the current interval; the increase below only affects the next run
+ if (dev->discover_interval < TIMER_INTERVAL_PROBE_MAX
+ && dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT) {
+ dev->discover_interval += 2;
}
}
- else if (timeout_seconds(TIMER_INTERVAL_PROBE_PANIC))
- {
- // Panic discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
-
- dev->hb_timer.expires = jiffies + HZ;
-
- ++dev->heartbeat_count;
- add_timer(&dev->hb_timer);
-#undef timeout_seconds
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->discover_running); // release the flag taken at the top so the next run can start
}
-int dnbd3_net_discover(void *data)
+/**
+ * Discovery. Probe all (or some) known alt servers,
+ * and initiate connection switch if appropriate
+ */
+static void dnbd3_internal_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = data;
- struct sockaddr_in sin4;
- struct sockaddr_in6 sin6;
struct socket *sock, *best_sock = NULL;
+ dnbd3_alt_server_t *alt;
+ struct sockaddr_storage host_compare, best_server;
+ uint16_t remote_version;
+ ktime_t start, end;
+ unsigned long rtt = 0, best_rtt = 0;
+ int i, j, k, isize, fails, rtt_threshold;
+ int do_change = 0;
+ u8 check_order[NUMBER_SERVERS];
+ const bool ready = dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT;
+ const u32 turn = dev->discover_count % DISCOVER_HISTORY_SIZE;
+
+ // Shuffle alt_servers
+ for (i = 0; i < NUMBER_SERVERS; ++i)
+ check_order[i] = i;
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- dnbd3_server_t *alt_server;
- struct msghdr msg;
- struct kvec iov[2];
-
- char *buf, *name;
- serialized_buffer_t *payload;
- uint64_t filesize;
- uint16_t rid;
-
- struct timeval start, end;
- unsigned long rtt, best_rtt = 0;
- unsigned long irqflags;
- int i, j, isize, best_server, current_server;
- int turn = 0;
- int ready = 0, do_change = 0;
- char check_order[NUMBER_SERVERS];
- int mlen;
-
- struct request *last_request = (struct request *)123, *cur_request = (struct request *)456;
-
- struct timeval timeout;
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DISCOVERY;
- timeout.tv_usec = 0;
-
- memset(&sin4, 0, sizeof(sin4));
- memset(&sin6, 0, sizeof(sin6));
-
- init_msghdr(msg);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ j = get_random_u32() % NUMBER_SERVERS;
+ if (j != i) {
+ int tmp = check_order[i];
- buf = kmalloc(4096, GFP_KERNEL);
- if (!buf)
- {
- debug_dev("FATAL: Kmalloc failed (discover)");
- return -1;
+ check_order[i] = check_order[j];
+ check_order[j] = tmp;
+ }
}
- payload = (serialized_buffer_t *)buf; // Reuse this buffer to save kernel mem
- dnbd3_request.magic = dnbd3_packet_magic;
+ best_server.ss_family = 0;
+ best_rtt = RTT_UNREACHABLE;
- for (i = 0; i < NUMBER_SERVERS; ++i) {
- check_order[i] = i;
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_discover,
- kthread_should_stop() || dev->discover || dev->thread_discover == NULL);
+ if (!ready || dev->panic)
+ isize = NUMBER_SERVERS;
+ else
+ isize = 3;
- if (kthread_should_stop() || dev->imgname == NULL || dev->thread_discover == NULL )
+ for (j = 0; j < NUMBER_SERVERS; ++j) {
+ if (!device_active(dev))
break;
+ i = check_order[j];
+ mutex_lock(&dev->alt_servers_lock);
+ host_compare = dev->alt_servers[i].host;
+ fails = dev->alt_servers[i].failures;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (host_compare.ss_family == 0)
+ continue; // Empty slot
+ // Reduced probability for hosts that have been unreachable
+ if (!dev->panic && fails > 50 && (get_random_u32() % 4) != 0)
+ continue; // If not in panic mode, skip server if it failed too many times
+ if (isize-- <= 0 && !is_same_server(&dev->cur_server.host, &host_compare))
+ continue; // Only test isize servers plus current server
+
+ // Initialize socket and connect
+ sock = NULL;
+ if (dnbd3_connect(dev, &host_compare, &sock) != 0)
+ goto error;
- if (!dev->discover)
- continue;
- dev->discover = 0;
-
- if (dev->reported_size < 4096)
- continue;
-
- // Check if the list of alt servers needs to be updated and do so if necessary
- if (dev->new_servers_num)
- {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- for (i = 0; i < dev->new_servers_num; ++i)
- {
- if (dev->new_servers[i].host.type != HOST_IP4 && dev->new_servers[i].host.type != HOST_IP6) // Invalid entry?
- continue;
- alt_server = get_existing_server(&dev->new_servers[i], dev);
- if (alt_server != NULL ) // Server already known
- {
- if (dev->new_servers[i].failures == 1)
- {
- // REMOVE request
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Removing alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Removing alt server %pI6", alt_server->host.addr);
- alt_server->host.type = 0;
- continue;
- }
- // ADD, so just reset fail counter
- alt_server->failures = 0;
- continue;
- }
- if (dev->new_servers[i].failures == 1) // REMOVE, but server is not in list anyways
- continue;
- alt_server = get_free_alt_server(dev);
- if (alt_server == NULL ) // All NUMBER_SERVERS slots are taken, ignore entry
- continue;
- // Add new server entry
- alt_server->host = dev->new_servers[i].host;
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Adding alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Adding alt server %pI6", alt_server->host.addr);
- alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = RTT_UNREACHABLE;
- alt_server->protocol_version = 0;
- alt_server->failures = 0;
- }
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
+ remote_version = 0;
+ if (!dnbd3_execute_handshake(dev, sock, &host_compare, &remote_version, false))
+ goto error;
- current_server = best_server = -1;
- best_rtt = 0xFFFFFFFul;
- if (dev->heartbeat_count < STARTUP_MODE_DURATION || dev->panic)
- {
- isize = NUMBER_SERVERS;
- }
- else
- {
- isize = 3;
- }
- if (NUMBER_SERVERS > isize) {
- for (i = 0; i < isize; ++i) {
- j = ((start.tv_sec >> i) ^ (start.tv_usec >> j)) % NUMBER_SERVERS;
- if (j != i) {
- mlen = check_order[i];
- check_order[i] = check_order[j];
- check_order[j] = mlen;
- }
+ // panic mode, take first responding server
+ if (dev->panic) {
+ dnbd3_dev_info_host(dev, &host_compare, "panic mode, changing to new server\n");
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_host(dev, &host_compare, "...raced, ignoring\n");
+ } else {
+ // Check global flag, a connect might have been in progress
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000 + 1000);
+ if (dnbd3_set_primary_connection(dev, sock, &host_compare, remote_version) != 0)
+ sock_release(sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
}
}
- for (j = 0; j < NUMBER_SERVERS; ++j)
- {
- i = check_order[j];
- if (dev->alt_servers[i].host.type == 0) // Empty slot
- continue;
- if (!dev->panic && dev->alt_servers[i].failures > 50 && (start.tv_usec & 7) != 0) // If not in panic mode, skip server if it failed too many times
- continue;
- if (isize-- <= 0 && !is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- continue;
-
- // Initialize socket and connect
- if (dnbd3_sock_create(dev->alt_servers[i].host.type, SOCK_STREAM, IPPROTO_TCP, &sock) < 0)
- {
- debug_alt("ERROR: Couldn't create socket (discover).");
- sock = NULL;
- continue;
- }
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- sock->sk->sk_allocation = GFP_NOIO;
- if (dev->alt_servers[i].host.type == HOST_IP4)
- {
- sin4.sin_family = AF_INET;
- memcpy(&sin4.sin_addr, dev->alt_servers[i].host.addr, 4);
- sin4.sin_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin4, sizeof(sin4), 0) < 0)
- goto error;
- }
- else
- {
- sin6.sin6_family = AF_INET6;
- memcpy(&sin6.sin6_addr, dev->alt_servers[i].host.addr, 16);
- sin6.sin6_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin6, sizeof(sin6), 0) < 0)
- goto error;
- }
+ // actual rtt measurement is just the first block requests and reply
+ start = ktime_get_real();
+ if (!dnbd3_request_test_block(dev, &host_compare, sock))
+ goto error;
+ end = ktime_get_real();
- // Request filesize
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(payload);
- serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
- serializer_put_string(payload, dev->imgname); // image name
- serializer_put_uint16(payload, dev->rid); // revision id
- serializer_put_uint8(payload, 0); // are we a server? (no!)
- iov[1].iov_base = payload;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(payload);
- fixup_request(dnbd3_request);
- mlen = iov[1].iov_len + sizeof(dnbd3_request);
- if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen)
- error_alt("ERROR: Requesting image size failed.");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving image size packet (header) failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4)
- error_alt("ERROR: Content of image size packet (header) mismatched (discover).");
-
- // receive data
- iov[0].iov_base = payload;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_alt("ERROR: Receiving image size packet (payload) failed (discover).");
- serializer_reset_read(payload, dnbd3_reply.size);
-
- dev->alt_servers[i].protocol_version = serializer_get_uint16(payload);
- if (dev->alt_servers[i].protocol_version < MIN_SUPPORTED_SERVER)
- error_alt_va("ERROR: Server version too old (client: %d, server: %d, min supported: %d).",
- (int)PROTOCOL_VERSION, (int)dev->alt_servers[i].protocol_version, (int)MIN_SUPPORTED_SERVER);
-
- name = serializer_get_string(payload);
- if (name == NULL )
- error_alt("ERROR: Server did not supply an image name (discover).");
-
- if (strcmp(name, dev->imgname) != 0)
- error_alt_va("ERROR: Image name does not match requested one (client: '%s', server: '%s') (discover).",
- dev->imgname, name);
-
- rid = serializer_get_uint16(payload);
- if (rid != dev->rid)
- error_alt_va("ERROR: Server supplied wrong rid (client: '%d', server: '%d') (discover).",
- (int)dev->rid, (int)rid);
-
- filesize = serializer_get_uint64(payload);
- if (filesize != dev->reported_size)
- error_alt_va("ERROR: Reported image size of %llu does not match expected value %llu.(discover).",
- (unsigned long long)filesize, (unsigned long long)dev->reported_size);
-
- // panic mode, take first responding server
- if (dev->panic)
- {
- dev->panic = 0;
- debug_alt("WARN: Panic mode, changing server:");
- if (best_sock != NULL )
- sock_release(best_sock);
- dev->better_sock = sock; // Pass over socket to take a shortcut in *_connect();
- kfree(buf);
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[i], sizeof(dev->cur_server));
- dnbd3_net_connect(dev);
- return 0;
- }
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
+ dev->alt_servers[i].protocol_version = remote_version;
+ dev->alt_servers[i].rtts[turn] =
+ (unsigned long)ktime_us_delta(end, start);
- // Request block
- dnbd3_request.cmd = CMD_GET_BLOCK;
- // Do *NOT* pick a random block as it has proven to cause severe
- // cache thrashing on the server
- dnbd3_request.offset = 0;
- dnbd3_request.size = RTT_BLOCK_SIZE;
- fixup_request(dnbd3_request);
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
-
- // start rtt measurement
- do_gettimeofday(&start);
-
- if (kernel_sendmsg(sock, &msg, iov, 1, sizeof(dnbd3_request)) <= 0)
- error_alt("ERROR: Requesting test block failed (discover).");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving test block header packet failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic
- != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE)
- error_alt_va("ERROR: Unexpected reply to block request: cmd=%d, size=%d (discover).",
- (int)dnbd3_reply.cmd, (int)dnbd3_reply.size);
-
- // receive data
- iov[0].iov_base = buf;
- iov[0].iov_len = RTT_BLOCK_SIZE;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE)
- error_alt("ERROR: Receiving test block payload failed (discover).");
-
- do_gettimeofday(&end); // end rtt measurement
-
- dev->alt_servers[i].rtts[turn] = (unsigned long)((end.tv_sec - start.tv_sec) * 1000000ull
- + (end.tv_usec - start.tv_usec));
-
- rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2]
- + dev->alt_servers[i].rtts[3]) / 4;
-
- if (best_rtt > rtt)
- {
- // This one is better, keep socket open in case we switch
- best_rtt = rtt;
- best_server = i;
- if (best_sock != NULL )
- sock_release(best_sock);
- best_sock = sock;
- sock = NULL;
- }
- else
- {
- // Not better, discard connection
- sock_release(sock);
- sock = NULL;
- }
+ rtt = 0;
- // update cur servers rtt
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = rtt;
- current_server = i;
- }
+ for (k = 0; k < DISCOVER_HISTORY_SIZE; ++k)
+ rtt += dev->alt_servers[i].rtts[k];
+ rtt /= DISCOVER_HISTORY_SIZE;
dev->alt_servers[i].failures = 0;
+ if (dev->alt_servers[i].best_count > 1)
+ dev->alt_servers[i].best_count -= 2;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
- continue;
-
- error: ;
- ++dev->alt_servers[i].failures;
+ if (best_rtt > rtt) {
+ // This one is better, keep socket open in case we switch
+ best_rtt = rtt;
+ best_server = host_compare;
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ best_sock = sock;
+ sock = NULL;
+ } else {
+ // Not better, discard connection
sock_release(sock);
sock = NULL;
- dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = RTT_UNREACHABLE;
- current_server = i;
- }
- continue;
}
- if (dev->panic)
- {
- // After 21 retries, bail out by reporting errors to block layer
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count < 255 && ++dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
- dnbd3_blk_fail_all_requests(dev);
- }
+ // update cur servers rtt
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = rtt;
- if (best_server == -1 || kthread_should_stop() || dev->thread_discover == NULL ) // No alt server could be reached at all or thread should stop
- {
- if (best_sock != NULL ) // Should never happen actually
- {
- sock_release(best_sock);
- best_sock = NULL;
- }
- continue;
- }
+ continue;
- do_change = ready && best_server != current_server && (start.tv_usec & 3) != 0
- && RTT_THRESHOLD_FACTOR(dev->cur_rtt) > best_rtt + 1500;
-
- if (ready && !do_change) {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (!list_empty(&dev->request_queue_send))
- {
- cur_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- do_change = (cur_request == last_request);
- if (do_change)
- printk("WARNING: Hung request on %s\n", dev->disk->disk_name);
- }
- else
- {
- cur_request = (struct request *)123;
- }
- last_request = cur_request;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+error:
+ if (sock != NULL) {
+ sock_release(sock);
+ sock = NULL;
}
-
- // take server with lowest rtt
- if (do_change)
- {
- printk("INFO: Server %d on %s is faster (%lluµs vs. %lluµs)\n", best_server, dev->disk->disk_name,
- (unsigned long long)best_rtt, (unsigned long long)dev->cur_rtt);
- kfree(buf);
- dev->better_sock = best_sock; // Take shortcut by continuing to use open connection
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[best_server], sizeof(dev->cur_server));
- dev->cur_rtt = best_rtt;
- dnbd3_net_connect(dev);
- return 0;
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
+ if (remote_version)
+ dev->alt_servers[i].protocol_version = remote_version;
+ ++dev->alt_servers[i].failures;
+ dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
+ if (dev->alt_servers[i].best_count > 2)
+ dev->alt_servers[i].best_count -= 3;
}
-
- // Clean up connection that was held open for quicker server switch
- if (best_sock != NULL )
- {
- sock_release(best_sock);
- best_sock = NULL;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = RTT_UNREACHABLE;
+ } // END - for loop over alt_servers
+
+ if (best_server.ss_family == 0) {
+ // No alt server could be reached
+ ASSERT(!best_sock);
+ if (dev->panic) {
+ if (dev->panic_count < 255)
+ dev->panic_count++;
+ // If probe timeout is set, report error to block layer
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
+ dnbd3_blk_fail_all_requests(dev);
}
+ return;
+ }
- if (!ready || (start.tv_usec & 15) != 0)
- turn = (turn + 1) % 4;
- if (turn == 2) // Set ready when we only have 2 of 4 measurements for quicker load balancing
- ready = 1;
-
+ // If best server was repeatedly measured best, lower the switching threshold more
+ mutex_lock(&dev->alt_servers_lock);
+ alt = get_existing_alt_from_addr(&best_server, dev);
+ if (alt != NULL) {
+ if (alt->best_count < 178)
+ alt->best_count += 3;
+ rtt_threshold = 1800 - (alt->best_count * 10);
+ remote_version = alt->protocol_version;
+ } else {
+ rtt_threshold = 1800;
+ remote_version = 0;
}
- kfree(buf);
- return 0;
+ mutex_unlock(&dev->alt_servers_lock);
+
+ do_change = ready && !is_same_server(&best_server, &dev->cur_server.host)
+ && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold;
+
+ // take server with lowest rtt
+ // if a (dis)connect is already in progress, we do nothing, this is not panic mode
+ if (do_change && device_active(dev) && dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_cur(dev, "server %pISpc is faster (%lluµs vs. %lluµs)\n",
+ &best_server,
+ (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt);
+ set_socket_timeout(best_sock, false, // recv
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_RECV * 1000) + 500);
+ set_socket_timeout(best_sock, true, // send
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_SEND * 1000) + 500);
+ if (dnbd3_set_primary_connection(dev, best_sock, &best_server, remote_version) != 0)
+ sock_release(best_sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
+ }
+
+ // Clean up connection that was held open for quicker server switch
+ if (best_sock != NULL)
+ sock_release(best_sock);
}
-int dnbd3_net_send(void *data)
+/**
+ * Worker for sending pending requests. This will be triggered whenever
+ * we get a new request from the block layer. The worker will then
+ * work through all the requests in the send queue, request them from
+ * the server, and return again.
+ *
+ * Each request is unlinked from dev->send_queue and appended to
+ * dev->recv_queue before it is sent as CMD_GET_BLOCK, using the handle
+ * stored in the request's dnbd3_cmd and the request position shifted
+ * left by 9 as the offset.
+ * The loop runs with dev->send_mutex held and ends when the send queue is
+ * empty, when dev->sock is NULL, when device_active(dev) is false, or when
+ * dnbd3_send_request() fails. On a failed send, an error is logged and
+ * dnbd3_start_discover(dev, true) is called, unless
+ * dnbd3_flag_taken(dev->connection_lock) reports true.
+ */
+static void dnbd3_send_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request;
-
- dnbd3_request_t dnbd3_request;
- struct msghdr msg;
- struct kvec iov;
-
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, send_work);
+ struct request *blk_request;
+ struct dnbd3_cmd *cmd;
unsigned long irqflags;
- init_msghdr(msg);
-
- dnbd3_request.magic = dnbd3_packet_magic;
-
- set_user_nice(current, -20);
-
- // move already sent requests to request_queue_send again
- while (!list_empty(&dev->request_queue_receive))
- {
- printk("WARN: Request queue was not empty on %s\n", dev->disk->disk_name);
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- list_add(&blk_request->queuelist, &dev->request_queue_send);
- }
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_send, kthread_should_stop() || !list_empty(&dev->request_queue_send));
-
- if (kthread_should_stop())
+ mutex_lock(&dev->send_mutex); // held for the whole send loop, released at the end of the function
+ while (dev->sock && device_active(dev)) {
+ // extract next block request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ if (list_empty(&dev->send_queue)) {
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
break;
-
- // extract block request
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (list_empty(&dev->request_queue_send))
- {
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
- blk_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- // what to do?
- switch (dnbd3_req_op(blk_request))
- {
- case DNBD3_DEV_READ:
- dnbd3_request.cmd = CMD_GET_BLOCK;
- dnbd3_request.offset = blk_rq_pos(blk_request) << 9; // *512
- dnbd3_request.size = blk_rq_bytes(blk_request); // bytes left to complete entire request
- // enqueue request to request_queue_receive
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- list_add_tail(&blk_request->queuelist, &dev->request_queue_receive);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- break;
- case DNBD3_REQ_OP_SPECIAL:
- dnbd3_request.cmd = dnbd3_priv_to_cmd(blk_request);
- dnbd3_request.size = 0;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // append to receive queue
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_add_tail(&blk_request->queuelist, &dev->recv_queue);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+
+ cmd = blk_mq_rq_to_pdu(blk_request);
+ if (!dnbd3_send_request(dev->sock, CMD_GET_BLOCK, cmd->handle,
+ blk_rq_pos(blk_request) << 9 /* sectors */, blk_rq_bytes(blk_request))) {
+ if (!dnbd3_flag_taken(dev->connection_lock)) {
+ dnbd3_dev_err_cur(dev, "connection to server lost (send)\n");
+ dnbd3_start_discover(dev, true);
+ }
break; // stop sending after a failed send; the request stays on recv_queue
-
- default:
- printk("ERROR: Unknown command (send %u %u)\n", (int)blk_request->cmd_flags, (int)dnbd3_req_op(blk_request));
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
-
- // send net request
- dnbd3_request.handle = (uint64_t)(uintptr_t)blk_request; // Double cast to prevent warning on 32bit
- fixup_request(dnbd3_request);
- iov.iov_base = &dnbd3_request;
- iov.iov_len = sizeof(dnbd3_request);
- if (kernel_sendmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_request)) != sizeof(dnbd3_request))
- {
- debug_dev("ERROR: Connection to server lost (send)");
- goto error;
- }
- wake_up(&dev->process_queue_receive);
}
-
- dev->thread_send = NULL;
- return 0;
-
- error: ;
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
- dev->thread_send = NULL;
- return -1;
+ mutex_unlock(&dev->send_mutex);
}
-int dnbd3_net_receive(void *data)
+/**
+ * The receive workfn stays active for as long as the connection to a server
+ * lasts, i.e. it only gets restarted when we switch to a new server.
+ *
+ * While dev->sock is set, the loop (run with dev->recv_mutex held) reads one
+ * reply header at a time via dnbd3_recv_reply() and dispatches on its cmd:
+ *  - CMD_GET_BLOCK: finds the request in dev->recv_queue whose handle equals
+ *    the reply's handle, unlinks it, receives the payload segment by segment
+ *    into the request's pages and completes it with BLK_STS_OK. If a segment
+ *    read does not return the full segment length, the request is added to
+ *    dev->send_queue again and the loop is left.
+ *  - CMD_GET_SERVERS: if dev->use_server_provided_alts is set, every received
+ *    entry is handed to dnbd3_add_server() when its failures field is 0 and
+ *    to dnbd3_rem_server() otherwise; the remaining payload is drained.
+ *  - CMD_LATEST_RID: sets dev->update_available from the received rid and
+ *    drains payload beyond the first two bytes.
+ *  - CMD_KEEPALIVE: drains an unexpected payload, if any.
+ *  - any other cmd: logged as unknown command and the loop is left.
+ * The loop is also left on a zero or negative return of dnbd3_recv_reply(),
+ * on a partial header and on a wrong packet magic. Whenever the loop is
+ * left, dnbd3_start_discover(dev, true) is called before dev->recv_mutex is
+ * released.
+ *
+ * NOTE(review): in the CMD_LATEST_RID case a reply with size < 2 continues
+ * without draining its payload, and a failed read of the rid also continues
+ * with the next header; the return value of dnbd3_drain_socket() is ignored
+ * there and in CMD_KEEPALIVE. Confirm that this cannot leave the stream out
+ * of sync with the header boundaries.
+ */
+static void dnbd3_recv_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request, *received_request;
-
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, recv_work);
+ struct request *blk_request;
+ struct request *rq_iter;
+ struct dnbd3_cmd *cmd;
+ dnbd3_reply_t reply_hdr;
struct req_iterator iter;
struct bio_vec bvec_inst;
struct bio_vec *bvec = &bvec_inst;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov;
void *kaddr;
unsigned long irqflags;
- sigset_t blocked, oldset;
uint16_t rid;
- unsigned long int recv_timeout = jiffies;
-
- int count, remaining, ret;
-
- init_msghdr(msg);
- set_user_nice(current, -20);
+ int remaining;
+ int ret;
- while (!kthread_should_stop())
- {
+ mutex_lock(&dev->recv_mutex);
+ while (dev->sock) {
// receive net reply
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = sizeof(dnbd3_reply);
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags);
- if (ret == -EAGAIN)
- {
- if (jiffies < recv_timeout) recv_timeout = jiffies; // Handle overflow
- if ((jiffies - recv_timeout) / HZ > SOCKET_KEEPALIVE_TIMEOUT)
- error_dev_va("ERROR: Receive timeout reached (%d of %d secs).", (int)((jiffies - recv_timeout) / HZ), (int)SOCKET_KEEPALIVE_TIMEOUT);
- continue;
+ ret = dnbd3_recv_reply(dev->sock, &reply_hdr);
+ if (ret == 0) {
+ /* have not received any data, but remote peer is shutdown properly */
+ dnbd3_dev_dbg_cur(dev, "remote peer has performed an orderly shutdown\n");
+ goto out_unlock;
+ } else if (ret < 0) {
+ if (ret == -EAGAIN) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "receive timeout reached\n");
+ } else {
+ /* for all errors other than -EAGAIN, print errno */
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "connection to server lost (receive, errno=%d)\n", ret);
+ }
+ goto out_unlock;
}
- if (ret <= 0)
- error_dev("ERROR: Connection to server lost (receive)");
- if (ret != sizeof(dnbd3_reply))
- error_dev("ERROR: Recv msg header.");
- fixup_reply(dnbd3_reply);
- // check error
- if (dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("ERROR: Wrong packet magic (Receive).");
- if (dnbd3_reply.cmd == 0)
- error_dev("ERROR: Command was 0 (Receive).");
+ /* check if arrived data is valid */
+ if (ret != sizeof(reply_hdr)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv partial msg header (%d/%d bytes)\n",
+ ret, (int)sizeof(reply_hdr));
+ goto out_unlock;
+ }
- // Update timeout
- recv_timeout = jiffies;
+ // check error
+ if (reply_hdr.magic != dnbd3_packet_magic) {
+ dnbd3_dev_err_cur(dev, "wrong packet magic (receive)\n");
+ goto out_unlock;
+ }
// what to do?
- switch (dnbd3_reply.cmd)
- {
+ switch (reply_hdr.cmd) {
case CMD_GET_BLOCK:
// search for replied request in queue
blk_request = NULL;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(received_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- if ((uint64_t)(uintptr_t)received_request == dnbd3_reply.handle) // Double cast to prevent warning on 32bit
- {
- blk_request = received_request;
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ cmd = blk_mq_rq_to_pdu(rq_iter);
+ if (cmd->handle == reply_hdr.handle) {
+ blk_request = rq_iter;
+ list_del_init(&blk_request->queuelist);
break;
}
}
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- if (blk_request == NULL )
- error_dev_va("ERROR: Received block data for unrequested handle (%llu: %llu).\n",
- (unsigned long long)dnbd3_reply.handle, (unsigned long long)dnbd3_reply.size);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (blk_request == NULL) {
+ dnbd3_dev_err_cur(dev, "received block data for unrequested handle (%llx: len=%llu)\n",
+ reply_hdr.handle,
+ (u64)reply_hdr.size);
+ goto out_unlock;
+ }
// receive data and answer to block layer
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
- rq_for_each_segment(bvec_inst, blk_request, iter)
+ rq_for_each_segment(bvec_inst, blk_request, iter) {
#else
- rq_for_each_segment(bvec, blk_request, iter)
+ rq_for_each_segment(bvec, blk_request, iter) {
#endif
- {
- siginitsetinv(&blocked, sigmask(SIGKILL));
- sigprocmask(SIG_SETMASK, &blocked, &oldset);
-
kaddr = kmap(bvec->bv_page) + bvec->bv_offset;
iov.iov_base = kaddr;
iov.iov_len = bvec->bv_len;
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags) != bvec->bv_len)
- {
- kunmap(bvec->bv_page);
- sigprocmask(SIG_SETMASK, &oldset, NULL );
- error_dev("ERROR: Receiving from net to block layer.");
- }
+ ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags);
kunmap(bvec->bv_page);
-
- sigprocmask(SIG_SETMASK, &oldset, NULL );
+ if (ret != bvec->bv_len) {
+ if (ret == 0) {
+ /* have not received any data, but remote peer is shutdown properly */
+ dnbd3_dev_dbg_cur(
+ dev, "remote peer has performed an orderly shutdown\n");
+ } else if (ret < 0) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "disconnect: receiving from net to block layer\n");
+ } else {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "receiving from net to block layer (%d bytes)\n", ret);
+ }
+ // Requeue request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add(&blk_request->queuelist, &dev->send_queue); // list_add() inserts at the head of send_queue
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ goto out_unlock;
+ }
}
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- __blk_end_request_all(blk_request, 0);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
+ blk_mq_end_request(blk_request, BLK_STS_OK);
+ break;
case CMD_GET_SERVERS:
- if (!dev->use_server_provided_alts)
- {
- remaining = dnbd3_reply.size;
- goto consume_payload;
- }
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- count = MIN(NUMBER_SERVERS, dnbd3_reply.size / sizeof(dnbd3_server_entry_t));
-
- if (count != 0)
- {
- iov.iov_base = dev->new_servers;
- iov.iov_len = count * sizeof(dnbd3_server_entry_t);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags)
- != (count * sizeof(dnbd3_server_entry_t)))
- error_dev("ERROR: Recv CMD_GET_SERVERS payload.");
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = count;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
- // If there were more servers than accepted, remove the remaining data from the socket buffer
- remaining = dnbd3_reply.size - (count * sizeof(dnbd3_server_entry_t));
- consume_payload: while (remaining > 0)
- {
- count = MIN(sizeof(dnbd3_reply), remaining); // Abuse the reply struct as the receive buffer
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = count;
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
- if (ret <= 0)
- error_dev("ERROR: Recv additional payload from CMD_GET_SERVERS.");
- remaining -= ret;
+ remaining = reply_hdr.size;
+ if (dev->use_server_provided_alts) {
+ dnbd3_server_entry_t new_server;
+
+ while (remaining >= sizeof(dnbd3_server_entry_t)) {
+ if (dnbd3_recv_bytes(dev->sock, &new_server, sizeof(new_server))
+ != sizeof(new_server)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv CMD_GET_SERVERS payload\n");
+ goto out_unlock;
+ }
+ // TODO: Log
+ if (new_server.failures == 0) { // ADD
+ dnbd3_add_server(dev, &new_server.host);
+ } else { // REM
+ dnbd3_rem_server(dev, &new_server.host);
+ }
+ remaining -= sizeof(new_server);
+ }
}
- continue;
+ if (!dnbd3_drain_socket(dev, dev->sock, remaining))
+ goto out_unlock;
+ break;
case CMD_LATEST_RID:
- if (dnbd3_reply.size != 2)
- {
- printk("ERROR: CMD_LATEST_RID.size != 2.\n");
+ if (reply_hdr.size < 2) {
+ dev_err(dnbd3_device_to_dev(dev), "CMD_LATEST_RID.size < 2\n");
continue;
}
- iov.iov_base = &rid;
- iov.iov_len = sizeof(rid);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags) <= 0)
- {
- printk("ERROR: Could not receive CMD_LATEST_RID payload.\n");
- }
- else
- {
+ if (dnbd3_recv_bytes(dev->sock, &rid, 2) != 2) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dev_err(dnbd3_device_to_dev(dev), "could not receive CMD_LATEST_RID payload\n");
+ } else {
rid = net_order_16(rid);
- printk("Latest rid of %s is %d (currently using %d)\n", dev->imgname, (int)rid, (int)dev->rid);
+ dnbd3_dev_info_cur(dev, "latest rid of %s is %d (currently using %d)\n",
+ dev->imgname, (int)rid, (int)dev->rid);
dev->update_available = (rid > dev->rid ? 1 : 0);
}
+ if (reply_hdr.size > 2)
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size - 2);
continue;
case CMD_KEEPALIVE:
- if (dnbd3_reply.size != 0)
- printk("ERROR: keep alive packet with payload.\n");
+ if (reply_hdr.size != 0) {
+ dev_dbg(dnbd3_device_to_dev(dev), "keep alive packet with payload\n");
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size);
+ }
continue;
default:
- printk("ERROR: Unknown command (Receive)\n");
- continue;
+ dev_err(dnbd3_device_to_dev(dev), "unknown command: %d (receive), aborting connection\n", (int)reply_hdr.cmd);
+ goto out_unlock;
+ }
+ }
+out_unlock:
+ // This will check if we actually still need a new connection
+ dnbd3_start_discover(dev, true);
+ mutex_unlock(&dev->recv_mutex);
+}
+/**
+ * Set send or receive timeout of given socket
+ *
+ * @sock: socket whose timeout option is set
+ * @set_send: true selects the send timeout option (SO_SNDTIMEO or
+ *            SO_SNDTIMEO_NEW), false the receive timeout option
+ *            (SO_RCVTIMEO or SO_RCVTIMEO_NEW)
+ * @timeout_ms: timeout in milliseconds; it is split into whole seconds and
+ *              the remainder in microseconds
+ *
+ * The option name and timeval type are chosen at compile time for kernels
+ * >= 5.1 versus older ones, and the pointer type handed to
+ * sock_setsockopt() for kernels >= 5.9 versus older ones. The return value
+ * of sock_setsockopt() is not checked.
+ */
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
+ int opt = set_send ? SO_SNDTIMEO_NEW : SO_RCVTIMEO_NEW;
+ struct __kernel_sock_timeval timeout;
+#else
+ int opt = set_send ? SO_SNDTIMEO : SO_RCVTIMEO;
+ struct timeval timeout;
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+ sockptr_t timeout_ptr = KERNEL_SOCKPTR(&timeout);
+#else
+ char *timeout_ptr = (char *)&timeout;
+#endif
+
+ timeout.tv_sec = timeout_ms / 1000;
+ timeout.tv_usec = (timeout_ms % 1000) * 1000;
+ sock_setsockopt(sock, SOL_SOCKET, opt, timeout_ptr, sizeof(timeout));
+}
+/**
+ * Create a TCP socket for the address family of 'addr' and connect it.
+ *
+ * @dev: device; its panic, panic_count and cur_server.rtt fields are read
+ *       to pick the timeout, and it is used for log output
+ * @addr: address to connect to; the sockaddr length passed to
+ *        kernel_connect() is sizeof(struct sockaddr_in) for AF_INET and
+ *        sizeof(struct sockaddr_in6) otherwise
+ * @sock_out: receives the connected socket on success; not written on failure
+ *
+ * The socket's send and receive timeouts are both set to the same value:
+ * panic_count * 1000 ms when dev->panic is set and panic_count > 1,
+ * otherwise cur_server.rtt * 2 / 1000; the result is raised to at least
+ * SOCKET_TIMEOUT_SEND * 1000 and capped at 60000.
+ *
+ * Returns 0 on success. On failure the socket is released and a negative
+ * value is returned (the error from sock_create_kern()/kernel_connect(), or
+ * -EIO if that value is not negative).
+ *
+ * NOTE(review): every path through the body of the 'while (--retries > 0)'
+ * loop either returns or jumps to 'error', so as written only a single
+ * connect attempt is made -- confirm whether retrying was intended.
+ */
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket **sock_out)
+{
+ ktime_t start;
+ int ret, connect_time_ms;
+ struct socket *sock;
+ int retries = 4;
+ const int addrlen = addr->ss_family == AF_INET ? sizeof(struct sockaddr_in)
+ : sizeof(struct sockaddr_in6);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+ ret = sock_create_kern(&init_net, addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#else
+ ret = sock_create_kern(addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#endif
+ if (ret < 0) {
+ dev_err(dnbd3_device_to_dev(dev), "couldn't create socket: %d\n", ret);
+ return ret;
+ }
+
+ /* Only one retry, TCP no delay */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+ tcp_sock_set_syncnt(sock->sk, 1);
+ tcp_sock_set_nodelay(sock->sk);
+ /* because of our aggressive timeouts, this is pointless */
+ sock_no_linger(sock->sk);
+#else
+ /* add legacy version of this, but ignore others as they're not that important */
+ ret = 1;
+ kernel_setsockopt(sock, IPPROTO_TCP, TCP_SYNCNT,
+ (char *)&ret, sizeof(ret));
+#endif
+ /* allow this socket to use reserved mem (vm.mem_free_kbytes) */
+ sk_set_memalloc(sock->sk);
+ sock->sk->sk_allocation = GFP_NOIO;
+
+ if (dev->panic && dev->panic_count > 1) {
+ /* in panic mode for some time, start increasing timeouts */
+ connect_time_ms = dev->panic_count * 1000;
+ } else {
+ /* otherwise, use 2*RTT of current server */
+ connect_time_ms = dev->cur_server.rtt * 2 / 1000;
+ }
+ /* but obey a minimal configurable value, and maximum sanity check */
+ if (connect_time_ms < SOCKET_TIMEOUT_SEND * 1000)
+ connect_time_ms = SOCKET_TIMEOUT_SEND * 1000;
+ else if (connect_time_ms > 60000)
+ connect_time_ms = 60000;
+ set_socket_timeout(sock, false, connect_time_ms); // recv
+ set_socket_timeout(sock, true, connect_time_ms); // send
+ start = ktime_get_real();
+ while (--retries > 0) {
+ ret = kernel_connect(sock, (struct sockaddr *)addr, addrlen, 0);
+ connect_time_ms = (int)ktime_ms_delta(ktime_get_real(), start); // from here on: elapsed time, no longer the timeout
+ if (connect_time_ms > 2 * SOCKET_TIMEOUT_SEND * 1000) {
+ /* Either I'm losing my mind or there was a specific build of kernel
+ * 5.x where SO_RCVTIMEO didn't affect the connect call above, so
+ * this function would hang for over a minute for unreachable hosts.
+ * Leave in this debug check for twice the configured timeout
+ */
+ dnbd3_dev_dbg_host(dev, addr, "connect: call took %dms\n",
+ connect_time_ms);
}
+ if (ret != 0) {
+ if (ret == -EINTR)
+ dnbd3_dev_dbg_host(dev, addr, "connect: interrupted system call (blocked %dms)\n",
+ connect_time_ms);
+ else
+ dnbd3_dev_dbg_host(dev, addr, "connect: failed (%d, blocked %dms)\n",
+ ret, connect_time_ms);
+ goto error;
+ }
+ *sock_out = sock;
+ return 0;
}
+error:
+ sock_release(sock);
+ return ret < 0 ? ret : -EIO;
+}
- printk("dnbd3_net_receive terminated normally.\n");
- dev->thread_receive = NULL;
- return 0;
+#define dnbd3_err_dbg_host(...) do { \
+ if (dev->panic || dev->sock == NULL) \
+ dnbd3_dev_err_host(__VA_ARGS__); \
+ else \
+ dnbd3_dev_dbg_host(__VA_ARGS__); \
+} while (0) /* Forwards its arguments to dnbd3_dev_err_host() when dev->panic is set or dev->sock is NULL, otherwise to dnbd3_dev_dbg_host(). Reads a variable named 'dev' from the scope of the call site, independent of the arguments passed. */
+
+/**
+ * Execute protocol handshake on a newly connected socket.
+ * If this is the initial connection to any server, ie. we're being called
+ * through the initial ioctl() to open a device, we'll store the rid, filesize
+ * etc. in the dev struct., otherwise, this is a potential switch to another
+ * server, so we validate the filesize, rid, name against what we expect.
+ * The server's protocol version is returned in 'remote_version'
+ *
+ * @dev: device the handshake is performed for; dev->imgname and dev->rid are
+ *       sent in the CMD_SELECT_IMAGE request
+ * @sock: connected socket to talk to
+ * @addr: address of the remote host, only used for log output
+ * @remote_version: receives the protocol version from the reply; it is
+ *       written as soon as the reply payload was received, also when a
+ *       later check fails
+ * @copy_data: true: store name, rid and size from the reply in 'dev' and
+ *       set the disk capacity; false: only compare them against 'dev'
+ *
+ * Returns true if the handshake succeeded, false otherwise.
+ *
+ * With copy_data set, the image name is copied into a freshly allocated
+ * buffer before dev->blk_lock is taken, because a GFP_KERNEL allocation may
+ * sleep and must not happen while the spinlock is held with interrupts
+ * disabled. The previous dev->imgname buffer is freed after the lock has been
+ * dropped; if the allocation fails, dev->imgname is left untouched.
+ */
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+		struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_data)
+{
+	unsigned long irqflags;
+	const char *name;
+	uint64_t filesize;
+	int mlen;
+	uint16_t rid;
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+	struct kvec iov[2];
+	serialized_buffer_t *payload;
+	dnbd3_reply_t reply_hdr;
+	dnbd3_request_t request_hdr = { .magic = dnbd3_packet_magic };
+
+	payload = kmalloc(sizeof(*payload), GFP_KERNEL);
+	if (payload == NULL)
+		goto error;
+
+	if (copy_data && device_active(dev))
+		dev_warn(dnbd3_device_to_dev(dev), "Called handshake function with copy_data enabled when reported_size is not zero\n");
+
+	// Request filesize
+	request_hdr.cmd = CMD_SELECT_IMAGE;
+	iov[0].iov_base = &request_hdr;
+	iov[0].iov_len = sizeof(request_hdr);
+	serializer_reset_write(payload);
+	serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
+	serializer_put_string(payload, dev->imgname); // image name
+	serializer_put_uint16(payload, dev->rid); // revision id
+	serializer_put_uint8(payload, 0); // are we a server? (no!)
+	iov[1].iov_base = payload;
+	request_hdr.size = iov[1].iov_len = serializer_get_written_length(payload);
+	fixup_request(request_hdr);
+	mlen = iov[0].iov_len + iov[1].iov_len;
+	if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen) {
+		dnbd3_err_dbg_host(dev, addr, "requesting image size failed\n");
+		goto error;
+	}
+
+	// receive net reply
+	if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+		dnbd3_err_dbg_host(dev, addr, "receiving image size packet (header) failed\n");
+		goto error;
+	}
+	if (reply_hdr.magic != dnbd3_packet_magic
+			|| reply_hdr.cmd != CMD_SELECT_IMAGE || reply_hdr.size < 4
+			|| reply_hdr.size > sizeof(*payload)) {
+		dnbd3_err_dbg_host(dev, addr,
+				"corrupt CMD_SELECT_IMAGE reply\n");
+		goto error;
+	}
+
+	// receive data
+	iov[0].iov_base = payload;
+	iov[0].iov_len = reply_hdr.size;
+	if (kernel_recvmsg(sock, &msg, iov, 1, reply_hdr.size, msg.msg_flags)
+			!= reply_hdr.size) {
+		dnbd3_err_dbg_host(dev, addr,
+				"receiving payload of CMD_SELECT_IMAGE reply failed\n");
+		goto error;
+	}
+	serializer_reset_read(payload, reply_hdr.size);
+
+	// 'name' points into 'payload', so it is only valid until payload is freed
+	*remote_version = serializer_get_uint16(payload);
+	name = serializer_get_string(payload);
+	rid = serializer_get_uint16(payload);
+	filesize = serializer_get_uint64(payload);
+
+	if (*remote_version < MIN_SUPPORTED_SERVER) {
+		dnbd3_err_dbg_host(dev, addr,
+				"server version too old (client: %d, server: %d, min supported: %d)\n",
+				(int)PROTOCOL_VERSION, (int)*remote_version,
+				(int)MIN_SUPPORTED_SERVER);
+		goto error;
+	}
+	if (name == NULL) {
+		dnbd3_err_dbg_host(dev, addr, "server did not supply an image name\n");
+		goto error;
+	}
+	if (rid == 0) {
+		dnbd3_err_dbg_host(dev, addr, "server did not supply a revision id\n");
+		goto error;
+	}
+
+	if (copy_data) {
+		char *new_name;
+		char *old_name;
+		size_t name_len;
+
+		if (filesize < DNBD3_BLOCK_SIZE) {
+			dnbd3_err_dbg_host(dev, addr, "reported size by server is < 4096\n");
+			goto error;
+		}
+		/*
+		 * Allocate and fill the new name buffer before taking blk_lock:
+		 * GFP_KERNEL may sleep, which is not allowed under a spinlock
+		 * taken with spin_lock_irqsave().
+		 */
+		name_len = strlen(name);
+		new_name = kmalloc(name_len + 1, GFP_KERNEL);
+		if (new_name == NULL) {
+			dnbd3_err_dbg_host(dev, addr, "reallocating buffer for new image name failed\n");
+			goto error;
+		}
+		memcpy(new_name, name, name_len + 1);
+
+		spin_lock_irqsave(&dev->blk_lock, irqflags);
+		old_name = dev->imgname;
+		dev->imgname = new_name;
+		dev->rid = rid;
+		// store image information
+		dev->reported_size = filesize;
+		dev->update_available = 0;
+		spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+		// release the replaced buffer outside of the locked section
+		kfree(old_name);
+		set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
+		dnbd3_dev_dbg_host(dev, addr, "image size: %llu\n", dev->reported_size);
+	} else {
+		/* switching connection, sanity checks */
+		if (rid != dev->rid) {
+			dnbd3_err_dbg_host(dev, addr,
+					"server supplied wrong rid (client: '%d', server: '%d')\n",
+					(int)dev->rid, (int)rid);
+			goto error;
+		}
+
+		if (strcmp(name, dev->imgname) != 0) {
+			dnbd3_err_dbg_host(dev, addr, "server offers image '%s', requested '%s'\n", name, dev->imgname);
+			goto error;
+		}
+
+		if (filesize != dev->reported_size) {
+			dnbd3_err_dbg_host(dev, addr,
+					"reported image size of %llu does not match expected value %llu\n",
+					(unsigned long long)filesize, (unsigned long long)dev->reported_size);
+			goto error;
+		}
+	}
+	kfree(payload);
+	return true;
+
+error:
+	// kfree(NULL) is a no-op, so this also covers the failed payload allocation
+	kfree(payload);
+	return false;
+}
+
+/**
+ * Send a single request header without payload on the given socket.
+ * The header is filled from the arguments and passed through
+ * fixup_request() before it is handed to kernel_sendmsg().
+ * Returns true only if the complete header was sent.
+ */
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size)
+{
+	dnbd3_request_t hdr = {
+		.magic = dnbd3_packet_magic,
+		.cmd = cmd,
+		.size = size,
+		.offset = offset,
+		.handle = handle,
+	};
+	struct kvec vec = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
+	struct msghdr mh = { .msg_flags = MSG_NOSIGNAL };
+	int sent;
+
+	fixup_request(hdr);
+	sent = kernel_sendmsg(sock, &mh, &vec, 1, sizeof(hdr));
+	// A short send or a negative error code both count as failure
+	return sent == sizeof(hdr);
+}
+
+/**
+ * Send a header-only request of type cmd on the device's current socket.
+ * dev->send_mutex is held while the socket is checked and used.
+ * Returns false if the device has no socket or if sending failed.
+ */
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd)
+{
+	bool sent = false;
+
+	mutex_lock(&dev->send_mutex);
+	if (dev->sock)
+		sent = dnbd3_send_request(dev->sock, cmd, 0, 0, 0);
+	mutex_unlock(&dev->send_mutex);
+	return sent;
+}
+
+/**
+ * Receive count bytes from sock into buffer, passing
+ * MSG_NOSIGNAL | MSG_WAITALL to kernel_recvmsg().
+ * Returns whatever kernel_recvmsg() returns, so callers have to
+ * compare the result against count themselves.
+ */
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count)
+{
+	struct kvec vec = { .iov_base = buffer, .iov_len = count };
+	struct msghdr mh = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+
+	return kernel_recvmsg(sock, &mh, &vec, 1, count, mh.msg_flags);
+}
+
+/**
+ * Receive one reply header from sock into *reply_hdr.
+ * fixup_reply() is applied unconditionally, even if the receive was
+ * short or failed, so the header contents may only be trusted if the
+ * return value equals sizeof(*reply_hdr).
+ * Returns the result of the underlying receive call.
+ */
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr)
+{
+	const int received = dnbd3_recv_bytes(sock, reply_hdr, sizeof(*reply_hdr));
+
+	fixup_reply(*reply_hdr);
+	return received;
+}
+
+/**
+ * Read bytes bytes from sock and discard them; everything is received
+ * into the shared __garbage_mem scratch buffer, chunk by chunk.
+ * Returns true once all bytes have been consumed, false (after logging)
+ * as soon as a receive call returns zero or an error.
+ */
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes)
+{
+	struct msghdr mh = { .msg_flags = MSG_NOSIGNAL };
+	struct kvec vec;
+	int got;
+
+	for (; bytes > 0; bytes -= got) {
+		// Set up the vector anew for every chunk, never asking for
+		// more than the scratch buffer can hold
+		vec.iov_base = __garbage_mem;
+		vec.iov_len = sizeof(__garbage_mem);
+		got = kernel_recvmsg(sock, &mh, &vec, 1, MIN(bytes, vec.iov_len), mh.msg_flags);
+		if (got <= 0) {
+			dnbd3_dev_err_cur(dev, "draining payload failed (ret=%d)\n", got);
+			return false;
+		}
+	}
+	return true;
+}
+
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket *sock)
+{
+ dnbd3_reply_t reply_hdr;
+
+ // Probe the server behind sock by fetching one test block. Request block: RTT_BLOCK_SIZE bytes at offset 0, handle 0
+ if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, 0, RTT_BLOCK_SIZE)) {
+ dnbd3_err_dbg_host(dev, addr, "requesting test block failed\n");
+ return false;
+ }
+
+ // receive net reply header; anything but a complete header counts as failure
+ if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+ dnbd3_err_dbg_host(dev, addr, "receiving test block header packet failed\n");
+ return false;
+ }
+ if (reply_hdr.magic != dnbd3_packet_magic || reply_hdr.cmd != CMD_GET_BLOCK // reply has to mirror the request sent above
+ || reply_hdr.size != RTT_BLOCK_SIZE || reply_hdr.handle != 0) {
+ dnbd3_err_dbg_host(dev, addr,
+ "unexpected reply to block request: cmd=%d, size=%d, handle=%llu (discover)\n",
+ (int)reply_hdr.cmd, (int)reply_hdr.size, reply_hdr.handle);
+ return false;
+ }
- error:
+ // receive data: the payload is read into a scratch buffer and discarded, only success of the transfer matters
+ return dnbd3_drain_socket(dev, sock, RTT_BLOCK_SIZE);
+}
+#undef dnbd3_err_dbg_host
+
+static void replace_main_socket(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ mutex_lock(&dev->send_mutex);
+ // Swap the device's main socket for sock. First, shutdown connection, so receive worker will leave its mainloop
if (dev->sock)
kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ mutex_lock(&dev->recv_mutex); // presumably blocks until the receive worker has let go of the old socket - TODO confirm
+ // Receive worker is done, get rid of socket and replace
+ if (dev->sock)
+ sock_release(dev->sock);
+ dev->sock = sock; // may be NULL, dnbd3_net_disconnect() passes NULL for both sock and addr
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (addr == NULL) { // no new server given: wipe the current-server record
+ memset(&dev->cur_server, 0, sizeof(dev->cur_server));
+ } else {
+ dev->cur_server.host = *addr;
+ dev->cur_server.rtt = 0;
+ dev->cur_server.protocol_version = protocol_version;
}
- dev->thread_receive = NULL;
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ mutex_unlock(&dev->recv_mutex);
+ mutex_unlock(&dev->send_mutex);
}
+/**
+ * Tear down the per-device send and receive workqueues (if they exist)
+ * and destroy the send and receive mutexes. The workqueue pointers are
+ * reset to NULL afterwards.
+ */
+static void dnbd3_release_resources(dnbd3_device_t *dev)
+{
+	if (dev->send_wq) {
+		destroy_workqueue(dev->send_wq);
+		dev->send_wq = NULL;
+	}
+	if (dev->recv_wq) {
+		destroy_workqueue(dev->recv_wq);
+		dev->recv_wq = NULL;
+	}
+	mutex_destroy(&dev->send_mutex);
+	mutex_destroy(&dev->recv_mutex);
+}
+
+/**
+ * Establish new connection on a dnbd3 device.
+ *
+ * Connects to addr, runs the "select image" handshake and, on success,
+ * makes the new socket the primary connection of the device.
+ * The caller must hold dev->connection_lock (asserted).
+ *
+ * @dev: device to connect
+ * @addr: address of the server to connect to
+ * @init: true for the initial setup of an inactive device; this allocates
+ *        the workqueues, initializes the mutexes and schedules the discover
+ *        and keepalive work. false to switch an active device to another
+ *        server.
+ *
+ * Return 0 on success, negative errno otherwise
+ */
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init)
+{
+	unsigned long irqflags;
+	struct socket *sock = NULL;
+	uint16_t proto_version;
+	int ret;
+
+	ASSERT(dnbd3_flag_taken(dev->connection_lock));
+	if (init && device_active(dev)) {
+		dnbd3_dev_err_cur(dev, "device already configured/connected\n");
+		return -EBUSY;
+	}
+	if (!init && !device_active(dev)) {
+		dev_warn(dnbd3_device_to_dev(dev), "connection switch called on unconfigured device\n");
+		return -ENOTCONN;
+	}
+
+	dnbd3_dev_dbg_host(dev, addr, "connecting...\n");
+	ret = dnbd3_connect(dev, addr, &sock);
+	if (ret != 0 || sock == NULL)
+		goto error;
+
+	/* execute the "select image" handshake */
+	// if init is true, reported_size will be set
+	if (!dnbd3_execute_handshake(dev, sock, addr, &proto_version, init)) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	if (init) {
+		// We're setting up the device for use - allocate resources
+		// Do not goto error_dealloc before this point, as it destroys
+		// the mutexes that only get initialized here
+		ASSERT(!dev->send_wq);
+		ASSERT(!dev->recv_wq);
+		mutex_init(&dev->send_mutex);
+		mutex_init(&dev->recv_mutex);
+		// a designated queue for sending, that allows one active task only
+		dev->send_wq = alloc_workqueue("dnbd%d-send",
+				WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI,
+				1, dev->index);
+		dev->recv_wq = alloc_workqueue("dnbd%d-recv",
+				WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+				1, dev->index);
+		if (!dev->send_wq || !dev->recv_wq) {
+			ret = -ENOMEM;
+			goto error_dealloc;
+		}
+	}
+
+	set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000); // recv
+	ret = dnbd3_set_primary_connection(dev, sock, addr, proto_version);
+	if (ret != 0) {
+		// The socket was rejected before being taken over, so it is
+		// still ours to release; do not report success to the caller
+		goto error_dealloc;
+	}
+	sock = NULL; // Owned by the device now. In case we ever goto error* after this point
+
+	spin_lock_irqsave(&dev->blk_lock, irqflags);
+	if (init) {
+		dev->discover_count = 0;
+		dev->discover_interval = TIMER_INTERVAL_PROBE_STARTUP;
+		// discovery and keepalive are not critical, use the power efficient queue
+		queue_delayed_work(system_power_efficient_wq, &dev->discover_work,
+				dev->discover_interval * HZ);
+		queue_delayed_work(system_power_efficient_wq, &dev->keepalive_work,
+				KEEPALIVE_INTERVAL * HZ);
+		// but the receiver is performance critical AND runs indefinitely, use
+		// the cpu intensive queue, as jobs submitted there will not count towards
+		// the concurrency limit of per-cpu worker threads. It still feels a little
+		// dirty to avoid managing our own thread, but nbd does it too.
+	}
+	spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+	return 0;
+
+error_dealloc:
+	if (init) {
+		// If anything fails during initialization, free resources again
+		dnbd3_release_resources(dev);
+	}
+error:
+	if (init)
+		dev->reported_size = 0;
+	if (sock)
+		sock_release(sock);
+	return ret < 0 ? ret : -EIO;
+}
+
+/**
+ * Bind the device's work items to their handler functions:
+ * discover and keepalive are delayed work, send and recv are plain work.
+ */
+void dnbd3_net_work_init(dnbd3_device_t *dev)
+{
+	/* periodic jobs */
+	INIT_DELAYED_WORK(&dev->discover_work, dnbd3_discover_workfn);
+	INIT_DELAYED_WORK(&dev->keepalive_work, dnbd3_keepalive_workfn);
+	/* data path */
+	INIT_WORK(&dev->send_work, dnbd3_send_workfn);
+	INIT_WORK(&dev->recv_work, dnbd3_recv_workfn);
+}
+
+/**
+ * Install sock as the main connection of the device and kick off the
+ * receive worker on it. The caller must hold dev->connection_lock
+ * (asserted).
+ * Returns -EINVAL without touching sock if the address family is unset,
+ * the device has no image name, or sock is NULL; 0 otherwise.
+ */
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+	unsigned long flags;
+	bool args_ok;
+
+	ASSERT(dnbd3_flag_taken(dev->connection_lock));
+	args_ok = addr->ss_family != 0 && dev->imgname != NULL && sock != NULL;
+	if (!args_ok) {
+		dnbd3_dev_err_cur(dev, "connect: host, image name or sock not set\n");
+		return -EINVAL;
+	}
+
+	replace_main_socket(dev, sock, addr, protocol_version);
+
+	// Leave panic mode and start the receiver for the new socket
+	spin_lock_irqsave(&dev->blk_lock, flags);
+	dev->panic = false;
+	dev->panic_count = 0;
+	dev->discover_interval = TIMER_INTERVAL_PROBE_SWITCH;
+	queue_work(dev->recv_wq, &dev->recv_work);
+	spin_unlock_irqrestore(&dev->blk_lock, flags);
+
+	// Ask the new server for its list of alt servers, if wanted
+	if (dev->use_server_provided_alts)
+		dnbd3_send_empty_request(dev, CMD_GET_SERVERS);
+
+	dnbd3_dev_info_cur(dev, "connection switched\n");
+	dnbd3_blk_requeue_all_requests(dev);
+	return 0;
+}
+
+/**
+ * Disconnect the device, shutting it down.
+ *
+ * The caller must hold dev->connection_lock (asserted).
+ * Pending requests are failed, the main socket is shut down and released,
+ * all work items are cancelled synchronously, and finally the workqueues
+ * and mutexes of the device are released.
+ *
+ * Returns 0 on success, -ENOTCONN if the device is not active.
+ */
+int dnbd3_net_disconnect(dnbd3_device_t *dev)
+{
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (!device_active(dev))
+ return -ENOTCONN;
+ dev_dbg(dnbd3_device_to_dev(dev), "disconnecting device ...\n");
+
+ dev->reported_size = 0;
+ /* quickly fail all requests */
+ dnbd3_blk_fail_all_requests(dev);
+ replace_main_socket(dev, NULL, NULL, 0); // NULL socket and address: drops the connection without installing a new one
+
+ cancel_delayed_work_sync(&dev->keepalive_work);
+ cancel_delayed_work_sync(&dev->discover_work);
+ cancel_work_sync(&dev->send_work);
+ cancel_work_sync(&dev->recv_work);
+
+ dnbd3_blk_fail_all_requests(dev); // second pass, presumably for requests that came in while the workers were being stopped - TODO confirm
+ dnbd3_release_resources(dev);
+ dev_dbg(dnbd3_device_to_dev(dev), "all workers shut down\n");
+ return 0;
+}
diff --git a/src/kernel/net.h b/src/kernel/net.h
index a06a20c..69fa523 100644
--- a/src/kernel/net.h
+++ b/src/kernel/net.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,30 +22,12 @@
#ifndef NET_H_
#define NET_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#define init_msghdr(h) do { \
- h.msg_name = NULL; \
- h.msg_namelen = 0; \
- h.msg_control = NULL; \
- h.msg_controllen = 0; \
- h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \
- } while (0)
+void dnbd3_net_work_init(dnbd3_device_t *dev);
-int dnbd3_net_connect(dnbd3_device_t *lo);
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init);
-int dnbd3_net_disconnect(dnbd3_device_t *lo);
-
-int dnbd3_net_send(void *data);
-
-int dnbd3_net_receive(void *data);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg);
-#else
-void dnbd3_net_heartbeat(unsigned long arg);
-#endif
-
-int dnbd3_net_discover(void *data);
+int dnbd3_net_disconnect(dnbd3_device_t *dev);
#endif /* NET_H_ */
diff --git a/src/kernel/serialize.c b/src/kernel/serialize.c
new file mode 120000
index 0000000..5a4e4ac
--- /dev/null
+++ b/src/kernel/serialize.c
@@ -0,0 +1 @@
+../shared/serialize.c \ No newline at end of file
diff --git a/src/kernel/serialize_kmod.c b/src/kernel/serialize_kmod.c
deleted file mode 100644
index 50746df..0000000
--- a/src/kernel/serialize_kmod.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-#define KERNEL_MODULE
-#include "serialize.c"
diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c
index 4406072..9deba96 100644
--- a/src/kernel/sysfs.c
+++ b/src/kernel/sysfs.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,156 +22,138 @@
#include <linux/kobject.h>
#include "sysfs.h"
-#include "utils.h"
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
+/**
+ * Print currently connected server IP:PORT
+ */
ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev)
{
- if (dev->cur_server.host.type == HOST_IP4)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI4,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- else if (dev->cur_server.host.type == HOST_IP6)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI6,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- *buf = '\0';
- return 0;
-}
-
-ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev)
-{
- return MIN(snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt), PAGE_SIZE);
-}
+ ssize_t ret;
-ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev)
-{
- int i, num = 0;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type) ++num;
- }
- return MIN(snprintf(buf, PAGE_SIZE, "%d\n", num), PAGE_SIZE);
+ spin_lock(&dev->blk_lock);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%pISpc\n", &dev->cur_server.host), PAGE_SIZE);
+ spin_unlock(&dev->blk_lock);
+ return ret;
}
+/**
+ * List alt servers. One line per server, format is:
+ * IP:PORT RTT consecutive_failures best_count
+ */
ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev)
{
- int i, size = PAGE_SIZE, ret;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == HOST_IP4)
- ret = MIN(snprintf(buf, size, "%pI4,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else if (dev->alt_servers[i].host.type == HOST_IP6)
- ret = MIN(snprintf(buf, size, "%pI6,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else
+ int i, size = PAGE_SIZE;
+ ssize_t ret;
+
+ if (mutex_lock_interruptible(&dev->alt_servers_lock) != 0)
+ return 0;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
continue;
+
+ ret = MIN(snprintf(buf, size, "%pISpc %llu %d %d\n", &dev->alt_servers[i].host,
+ (unsigned long long)((dev->alt_servers[i].rtts[0] +
+ dev->alt_servers[i].rtts[1] +
+ dev->alt_servers[i].rtts[2] +
+ dev->alt_servers[i].rtts[3]) / 4),
+ (int)dev->alt_servers[i].failures,
+ (int)dev->alt_servers[i].best_count),
+ size);
size -= ret;
buf += ret;
- if (size <= 0)
- {
+ if (size <= 0) {
size = 0;
break;
}
}
+ mutex_unlock(&dev->alt_servers_lock);
return PAGE_SIZE - size;
}
+/**
+ * Show name of image in use
+ */
ssize_t show_image_name(char *buf, dnbd3_device_t *dev)
{
- if (dev->imgname == NULL) return sprintf(buf, "(null)");
- return MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ ssize_t ret;
+
+ spin_lock(&dev->blk_lock);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ spin_unlock(&dev->blk_lock);
+ return ret;
}
+/**
+ * Show rid of image in use
+ */
ssize_t show_rid(char *buf, dnbd3_device_t *dev)
{
+ // No locking here, primitive type, no pointer to allocated memory
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->rid), PAGE_SIZE);
}
ssize_t show_update_available(char *buf, dnbd3_device_t *dev)
{
+ // Same story
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->update_available), PAGE_SIZE);
}
-device_attr_t cur_server_addr =
-{
- .attr = {.name = "cur_server_addr", .mode = 0444 },
- .show = show_cur_server_addr,
- .store = NULL,
-};
-
-device_attr_t cur_server_rtt =
-{
- .attr = {.name = "cur_server_rtt", .mode = 0444 },
- .show = show_cur_server_rtt,
- .store = NULL,
-};
-
-device_attr_t alt_server_num =
-{
- .attr = {.name = "alt_server_num", .mode = 0444 },
- .show = show_alt_server_num,
- .store = NULL,
+device_attr_t cur_server_addr = {
+ .attr = { .name = "cur_server_addr", .mode = 0444 },
+ .show = show_cur_server_addr,
+ .store = NULL,
};
-device_attr_t alt_servers =
-{
- .attr = {.name = "alt_servers", .mode = 0444 },
- .show = show_alt_servers,
- .store = NULL,
+device_attr_t alt_servers = {
+ .attr = { .name = "alt_servers", .mode = 0444 },
+ .show = show_alt_servers,
+ .store = NULL,
};
-device_attr_t image_name =
-{
- .attr = {.name = "image_name", .mode = 0444 },
- .show = show_image_name,
- .store = NULL,
+device_attr_t image_name = {
+ .attr = { .name = "image_name", .mode = 0444 },
+ .show = show_image_name,
+ .store = NULL,
};
-device_attr_t rid =
-{
- .attr = {.name = "rid", .mode = 0444 },
- .show = show_rid,
- .store = NULL,
+device_attr_t rid = {
+ .attr = { .name = "rid", .mode = 0444 },
+ .show = show_rid,
+ .store = NULL,
};
-device_attr_t update_available =
-{
- .attr = {.name = "update_available", .mode = 0444 },
- .show = show_update_available,
- .store = NULL,
+device_attr_t update_available = {
+ .attr = { .name = "update_available", .mode = 0444 },
+ .show = show_update_available,
+ .store = NULL,
};
ssize_t device_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
device_attr_t *device_attr = container_of(attr, device_attr_t, attr);
dnbd3_device_t *dev = container_of(kobj, dnbd3_device_t, kobj);
+
return device_attr->show(buf, dev);
}
-struct attribute *device_attrs[] =
-{
+struct attribute *device_attrs[] = {
&cur_server_addr.attr,
- &cur_server_rtt.attr,
- &alt_server_num.attr,
&alt_servers.attr,
- &image_name.attr,
- &rid.attr,
+ &image_name.attr, &rid.attr,
&update_available.attr,
NULL,
};
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+ATTRIBUTE_GROUPS(device);
+#endif
-struct sysfs_ops device_ops =
-{
+const struct sysfs_ops device_ops = {
.show = device_show,
};
@@ -179,14 +162,16 @@ void release(struct kobject *kobj)
kobj->state_initialized = 0;
}
-struct kobj_type device_ktype =
-{
+struct kobj_type device_ktype = {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
.default_attrs = device_attrs,
+#else
+ .default_groups = device_groups,
+#endif
.sysfs_ops = &device_ops,
.release = release,
};
-
void dnbd3_sysfs_init(dnbd3_device_t *dev)
{
int error;
@@ -196,7 +181,7 @@ void dnbd3_sysfs_init(dnbd3_device_t *dev)
error = kobject_init_and_add(kobj, ktype, parent, "%s", "net");
if (error)
- printk("Error initializing dnbd3 device!\n");
+ dev_err(dnbd3_device_to_dev(dev), "initializing sysfs for device failed!\n");
}
void dnbd3_sysfs_exit(dnbd3_device_t *dev)
diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h
index 0a747a5..1db4a07 100644
--- a/src/kernel/sysfs.h
+++ b/src/kernel/sysfs.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,25 +22,16 @@
#ifndef SYSFS_H_
#define SYSFS_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
void dnbd3_sysfs_init(dnbd3_device_t *dev);
void dnbd3_sysfs_exit(dnbd3_device_t *dev);
-typedef struct
-{
+typedef struct {
struct attribute attr;
- ssize_t (*show)(char *, dnbd3_device_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_device_t *);
+ ssize_t (*show)(char *buf, dnbd3_device_t *dev);
+ ssize_t (*store)(const char *buf, size_t len, dnbd3_device_t *dev);
} device_attr_t;
-typedef struct
-{
- struct attribute attr;
- ssize_t (*show)(char *, dnbd3_server_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_server_t *);
-} server_attr_t;
-
-
#endif /* SYSFS_H_ */
diff --git a/src/kernel/utils.c b/src/kernel/utils.c
deleted file mode 100644
index 902025f..0000000
--- a/src/kernel/utils.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include <linux/kernel.h>
-
-#include "utils.h"
-
-unsigned int inet_addr(char *str)
-{
- int a, b, c, d;
- char arr[4];
- sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d);
- arr[0] = a;
- arr[1] = b;
- arr[2] = c;
- arr[3] = d;
- return *(unsigned int *) arr;
-}
-
-void inet_ntoa(struct in_addr addr, char *str)
-{
- unsigned char *ptr = (unsigned char *) &addr;
- sprintf(str, "%d.%d.%d.%d", ptr[0] & 0xff, ptr[1] & 0xff, ptr[2] & 0xff, ptr[3] & 0xff);
-}
diff --git a/src/kernel/utils.h b/src/kernel/utils.h
deleted file mode 100644
index e54b3cf..0000000
--- a/src/kernel/utils.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef UTILS_H_
-#define UTILS_H_
-
-#include <linux/in.h>
-
-unsigned int inet_addr(char *str);
-void inet_ntoa(struct in_addr addr, char *str);
-
-#endif /* UTILS_H_ */
diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt
new file mode 100644
index 0000000..9a1e1c4
--- /dev/null
+++ b/src/server/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-server
+ LANGUAGES C)
+
+# find Jansson package required by the dnbd3-server
+find_package(Jansson)
+if(NOT JANSSON_FOUND)
+ message(FATAL_ERROR "*** No jansson lib found, can't build dnbd3-server!")
+endif(NOT JANSSON_FOUND)
+
+# find atomic library required by the dnbd3-server
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to enable enhanced POSIX features
+add_definitions(-D_GNU_SOURCE)
+
+if(DNBD3_SERVER_AFL)
+ # check if DNBD3_RELEASE_HARDEN is disabled
+ if(DNBD3_RELEASE_HARDEN)
+ message(FATAL_ERROR "DNBD3_SERVER_AFL can only be enabled if DNBD3_RELEASE_HARDEN is disabled")
+ endif(DNBD3_RELEASE_HARDEN)
+
+ # build dnbd3-server with AFL support
+ message(STATUS "Building dnbd3-server with AFL support")
+ add_definitions(-DDNBD3_SERVER_AFL)
+
+ # change compiler for dnbd3-server sources if AFL enabled
+ include(CheckAFLCCompiler)
+ check_afl_c_compiler(AFL_C_COMPILER AFL_C_COMPILER_NAME ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ID})
+ if(AFL_C_COMPILER)
+ message(STATUS "Check for working AFL C compiler: ${AFL_C_COMPILER} - done")
+ # change C compiler to a corresponding AFL C compiler
+ set(CMAKE_C_COMPILER "${AFL_C_COMPILER}")
+ else(AFL_C_COMPILER)
+ # no corresponding AFL C compiler found
+ message(STATUS "Check for working AFL C compiler: ${AFL_C_COMPILER_NAME} - failed")
+ message(FATAL_ERROR "No corresponding AFL C compiler ${AFL_C_COMPILER_NAME} was found for the C compiler ${CMAKE_C_COMPILER}!")
+ endif(AFL_C_COMPILER)
+endif(DNBD3_SERVER_AFL)
+
+set(DNBD3_SERVER_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/altservers.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fileutil.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fuse.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/globals.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/image.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/ini.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/integrity.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/locks.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/reference.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/uplink.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/urldecode.c)
+set(DNBD3_SERVER_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/altservers.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/fileutil.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/fuse.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/globals.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/image.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/ini.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/integrity.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/locks.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/reference.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/reftypes.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/uplink.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/urldecode.h)
+
+add_executable(dnbd3-server ${DNBD3_SERVER_SOURCE_FILES})
+target_include_directories(dnbd3-server PRIVATE ${JANSSON_INCLUDE_DIR})
+target_link_libraries(dnbd3-server dnbd3-version dnbd3-build dnbd3-shared picohttpparser Libatomic::Libatomic ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
+
+if(DNBD3_SERVER_FUSE)
+ find_package(Fuse REQUIRED)
+ # include Fuse headers and link with Fuse library
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_FUSE)
+ target_include_directories(dnbd3-server PRIVATE ${FUSE_INCLUDE_DIRS})
+ target_link_libraries(dnbd3-server ${FUSE_LIBRARIES})
+endif(DNBD3_SERVER_FUSE)
+
+if(UNIX AND NOT APPLE)
+ # link dnbd3-server with librt if server is compiled for a non-Apple Unix system
+ target_link_libraries(dnbd3-server rt)
+endif(UNIX AND NOT APPLE)
+
+if(DNBD3_SERVER_DEBUG_LOCKS)
+ # enable debugging of locks used in the dnbd3-server
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_DEBUG_LOCKS)
+endif(DNBD3_SERVER_DEBUG_LOCKS)
+
+if(DNBD3_SERVER_DEBUG_THREADS)
+ # enable debugging of threads used in the dnbd3-server
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_DEBUG_THREADS)
+endif(DNBD3_SERVER_DEBUG_THREADS)
+
+install(TARGETS dnbd3-server RUNTIME DESTINATION bin
+ COMPONENT server)
+
+add_linter(dnbd3-server-lint "${DNBD3_SERVER_SOURCE_FILES}" "${DNBD3_SERVER_HEADER_FILES}")
+add_linter_fix(dnbd3-server-lint-fix "${DNBD3_SERVER_SOURCE_FILES}" "${DNBD3_SERVER_HEADER_FILES}")
+
+# add external dependency (HTTP parser) for the dnbd3-server
+add_subdirectory(picohttpparser)
diff --git a/src/server/altservers.c b/src/server/altservers.c
index 943345c..4413ca6 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -5,16 +5,16 @@
#include "helper.h"
#include "image.h"
#include "fileutil.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../serverconfig.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/config/server.h>
#include "reference.h"
#include <assert.h>
#include <inttypes.h>
#include <jansson.h>
-#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid)
+#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, PIMG(image))
#define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
#define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
@@ -172,7 +172,7 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
if ( uplink->rttTestResult != RTT_INPROGRESS ) {
dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref );
if ( current == uplink ) {
- threadpool_run( &altservers_runCheck, uplink );
+ threadpool_run( &altservers_runCheck, uplink, "UPLINK" );
} else if ( current != NULL ) {
ref_put( &current->reference );
}
@@ -268,12 +268,32 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server
int idx[size];
int num = altservers_getListForUplink( NULL, image, idx, size, -1 );
for ( int i = 0; i < num; ++i ) {
- servers[i] = altServers[i].host;
+ servers[i] = altServers[idx[i]].host;
}
return num;
}
/**
+ * Returns true if there is at least one alt-server the
+ * given image name would be allowed to be cloned from.
+ */
+bool altservers_imageHasAltServers(const char *image)
+{
+	bool found = false;
+	mutex_lock( &altServersLock );
+	// Stop at the first server that qualifies
+	for ( int i = 0; i < numAltServers && !found; ++i ) {
+		// Client-only servers never qualify; non-private ones only
+		// if we're not restricted to private servers
+		const bool usable = !altServers[i].isClientOnly
+				&& ( altServers[i].isPrivate || !_proxyPrivateOnly );
+		found = usable && isImageAllowed( &altServers[i], image );
+	}
+	mutex_unlock( &altServersLock );
+	return found;
+}
+
+/**
* Get <size> alt servers. If there are more alt servers than
* requested, random servers will be picked.
* This function is suited for finding uplink servers as
@@ -450,6 +470,11 @@ static void *altservers_runCheck(void *data)
void altservers_findUplink(dnbd3_uplink_t *uplink)
{
altservers_findUplinkInternal( uplink );
+ // Above function is sync, which means normally when it
+ // returns, rttTestResult will not be RTT_INPROGRESS.
+ // But we might have an async call running in parallel, which would
+ // mean the above call returns immediately. Wait for that check
+ // to finish too.
while ( uplink->rttTestResult == RTT_INPROGRESS ) {
usleep( 5000 );
}
@@ -504,17 +529,29 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
return;
}
- LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid );
+ logadd( LOG_DEBUG2, "Running alt check for %s:%d", PIMG(image) );
assert( uplink->rttTestResult == RTT_INPROGRESS );
// Test them all
dnbd3_server_connection_t best = { .fd = -1 };
unsigned long bestRtt = RTT_UNREACHABLE;
unsigned long currentRtt = RTT_UNREACHABLE;
+ uint64_t offset = 0;
+ uint32_t length = DNBD3_BLOCK_SIZE;
+ // Try to use the range of the first request in the queue as RTT block.
+ // In case we have a cluster of servers where none of them has a complete
+ // copy, we at least make sure the one we're potentially switching to
+ // has the next block we're about to request.
+ mutex_lock( &uplink->queueLock );
+ if ( uplink->queue != NULL ) {
+ offset = uplink->queue->from;
+ length = (uint32_t)( uplink->queue->to - offset );
+ }
+ mutex_unlock( &uplink->queueLock );
for (itAlt = 0; itAlt < numAlts; ++itAlt) {
int server = servers[itAlt];
// Connect
clock_gettime( BEST_CLOCK_SOURCE, &start );
- int sock = sock_connect( &altServers[server].host, 750, 1000 );
+ int sock = sock_connect( &altServers[server].host, 750, _uplinkTimeout );
if ( sock == -1 ) { // Connection failed means global error
altservers_serverFailed( server );
continue;
@@ -524,7 +561,8 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
goto image_failed;
}
// See if selecting the image succeeded ++++++++++++++++++++++++++++++
- uint16_t protocolVersion, rid;
+ uint16_t protocolVersion = 0;
+ uint16_t rid;
uint64_t imageSize;
char *name;
serialized_buffer_t serialized;
@@ -543,9 +581,9 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
if ( imageSize != image->virtualFilesize ) {
ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
}
- // Request first block (NOT random!) ++++++++++++++++++++++++++++++
- if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
- LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server );
+ // Request block (NOT random! First or from queue) ++++++++++++
+ if ( !dnbd3_get_block( sock, offset, length, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+ LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request block", server );
}
// See if requesting the block succeeded ++++++++++++++++++++++
dnbd3_reply_t reply;
@@ -553,13 +591,18 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server );
}
// check reply header
- if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+ if ( reply.cmd != CMD_GET_BLOCK || reply.size != length ) {
// Sanity check failed; count this as global error (malicious/broken server)
ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
}
// flush payload to include this into measurement
char buffer[DNBD3_BLOCK_SIZE];
- if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+ uint32_t todo = length;
+ ssize_t ret;
+ while ( todo != 0 && ( ret = recv( sock, buffer, MIN( DNBD3_BLOCK_SIZE, todo ), MSG_WAITALL ) ) > 0 ) {
+ todo -= (uint32_t)ret;
+ }
+ if ( todo != 0 ) {
ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server );
}
clock_gettime( BEST_CLOCK_SOURCE, &end );
@@ -567,9 +610,6 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
mutex_lock( &uplink->rttLock );
const bool isCurrent = ( uplink->current.index == server );
mutex_unlock( &uplink->rttLock );
- // Penaltize rtt if this was a cycle; this will treat this server with lower priority
- // in the near future too, so we prevent alternating between two servers that are both
- // part of a cycle and have the lowest latency.
uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000
+ (end.tv_nsec - start.tv_nsec) / 1000); // µs
uint32_t avg = altservers_updateRtt( uplink, server, rtt );
@@ -614,7 +654,6 @@ failed:
} else {
LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
}
- sock_setTimeout( best.fd, _uplinkTimeout );
mutex_lock( &uplink->rttLock );
uplink->better = best;
uplink->rttTestResult = RTT_DOCHANGE;
@@ -628,10 +667,6 @@ failed:
if ( best.fd != -1 ) {
close( best.fd );
}
- if ( !image->working || uplink->cycleDetected ) {
- image->working = true;
- LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid );
- }
uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
mutex_lock( &uplink->rttLock );
uplink->rttTestResult = RTT_DONTCHANGE;
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 8e29aaa..78f6fcc 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou
int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
+bool altservers_imageHasAltServers(const char *image);
+
bool altservers_toString(int server, char *buffer, size_t len);
int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
diff --git a/src/server/fileutil.c b/src/server/fileutil.c
index 336ab68..9a9f066 100644
--- a/src/server/fileutil.c
+++ b/src/server/fileutil.c
@@ -68,7 +68,7 @@ bool file_setSize(int fd, uint64_t size)
// Try really hard... image loading logic relies on the file
// having the proper apparent size
uint8_t byte = 0;
- pread( fd, &byte, 1, size - 1 );
+ (void)!pread( fd, &byte, 1, size - 1 );
if ( pwrite( fd, &byte, 1, size - 1 ) == 1 ) return true;
return false;
}
diff --git a/src/server/fuse.c b/src/server/fuse.c
new file mode 100644
index 0000000..12913a6
--- /dev/null
+++ b/src/server/fuse.c
@@ -0,0 +1,661 @@
+#include "fuse.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/log.h>
+
+#ifndef DNBD3_SERVER_FUSE
+
+//
+/**
+ * Fallback used when the server was built without DNBD3_SERVER_FUSE.
+ * Both arguments are ignored; an error is logged and failure is reported.
+ * @return always false
+ */
+bool dfuse_init(const char *opts UNUSED, const char *dir UNUSED)
+{
+	const bool available = false;
+	logadd( LOG_ERROR, "FUSE: Not compiled in" );
+	return available;
+}
+
+/**
+ * Fallback used when the server was built without DNBD3_SERVER_FUSE.
+ * Nothing was set up by dfuse_init, so there is nothing to tear down.
+ */
+void dfuse_shutdown()
+{
+	// Intentionally a no-op
+	return;
+}
+
+#else
+
+#define PATHLEN (2000)
+static char nullbytes[DNBD3_BLOCK_SIZE];
+
+// FUSE ENABLED
+#define FUSE_USE_VERSION 30
+//
+#include <dnbd3/config.h>
+#include "locks.h"
+#include "threadpool.h"
+#include "image.h"
+#include "uplink.h"
+#include "reference.h"
+#include "helper.h"
+
+#include <fuse_lowlevel.h>
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include <signal.h>
+
+#define INO_ROOT (1)
+#define INO_CTRL (2)
+#define INO_DIR (3)
+static const char *NAME_CTRL = "control";
+static const char *NAME_DIR = "images";
+
+// Describes one image requested through the control file (filled in by ll_write)
+typedef struct {
+ fuse_req_t req; // FUSE request of the write that asked for this image
+ uint16_t rid; // Revision id; ll_write overwrites it with the rid of the image that was found
+ char name[PATHLEN]; // Image name as written to the control file, without the ":rid" suffix
+} lookup_t;
+
+// Next inode number handed out by addImage for a newly created entry
+static fuse_ino_t inoCounter = 10;
+// One node of the in-memory directory tree below the "images" directory
+typedef struct _dfuse_dir {
+ struct _dfuse_dir *next; // Next sibling in the same directory
+ struct _dfuse_dir *child; // First entry of this directory's content (list linked via ->next)
+ const char *name; // Entry name; "name:rid" for images, path component for directories
+ uint64_t size; // File size reported by stat; ll_write sets it to the image's virtualFilesize
+ fuse_ino_t ino; // Inode number of this entry
+ int refcount; // Incremented by ll_open, decremented by ll_release (under dirLock)
+ lookup_t *img; // Image description for files; NULL means this entry is a directory
+} dfuse_entry_t;
+
+// Per-open-file state; allocated in ll_open, stored in fi->fh, freed in ll_release
+typedef struct {
+ dfuse_entry_t *entry; // Directory entry whose refcount this handle holds
+ dnbd3_image_t *image; // Image reference obtained via image_get, released in ll_release
+} cmdopen_t;
+
+// Statically allocated entry for the "images" directory (inode INO_DIR);
+// `root` is the starting point for all inoRecursive() searches.
+static dfuse_entry_t sroot = {
+ .name = "images",
+ .ino = INO_DIR,
+ .refcount = 2,
+}, *root = &sroot;
+// Guards the directory tree hanging off `root` (entries, refcounts, lookup_t::rid)
+static pthread_mutex_t dirLock;
+
+// Values of initState
+#define INIT_NONE (0) // dfuse_init has not been called yet
+#define INIT_DONE (1) // Set by fuseMainLoop right before entering the session loop
+#define INIT_SHUTDOWN (2) // Set by dfuse_shutdown, or by dfuse_init when setup failed
+#define INIT_INPROGRESS (3) // Set by dfuse_init while it is setting things up
+
+// Module state; the session/channel/mount point are set up in dfuse_init and torn down in cleanupFuse
+static struct fuse_session *fuseSession = NULL;
+static struct fuse_chan *fuseChannel = NULL;
+static char *fuseMountPoint = NULL; // Filled in by fuse_parse_cmdline
+static pthread_t fuseThreadId; // Only valid if haveThread is true
+static bool haveThread = false;
+static _Atomic(int) initState = INIT_NONE;
+static pthread_mutex_t initLock; // Held around setup in dfuse_init and around cleanupFuse calls
+static struct timespec startupTime; // Taken in dfuse_init; used as timestamp in stat replies
+
+static dfuse_entry_t* dirLookup(dfuse_entry_t *dir, const char *name);
+static dfuse_entry_t* inoRecursive(dfuse_entry_t *dir, fuse_ino_t ino);
+
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start UNUSED, uint32_t length, const char *buffer);
+static void cleanupFuse();
+static void* fuseMainLoop(void *data);
+
+/**
+ * FUSE open handler.
+ * - The control file may only be opened write-only and is marked non-seekable.
+ * - The root directory yields EISDIR.
+ * - Everything else is looked up in the directory tree and must be an image
+ *   entry with a non-zero rid, opened read-only. On success a cmdopen_t is
+ *   stored in fi->fh; it holds a reference on the directory entry and on the
+ *   image and is released again in ll_release.
+ * Replies ENOMEM if the handle cannot be allocated (previously this would
+ * dereference NULL), dropping the references taken so far.
+ */
+static void ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+	fi->fh = 0;
+	if ( ino == INO_CTRL ) {
+		if ( ( fi->flags & O_ACCMODE ) != O_WRONLY ) {
+			fuse_reply_err( req, EINVAL );
+		} else {
+			fi->nonseekable = 1;
+			fuse_reply_open( req, fi );
+		}
+		return;
+	}
+	if ( ino == INO_ROOT ) {
+		fuse_reply_err( req, EISDIR );
+		return;
+	}
+	if ( ( fi->flags & O_ACCMODE ) != O_RDONLY ) {
+		fuse_reply_err( req, EINVAL );
+		return;
+	}
+	// Resolve the inode and pin the entry while holding the lock
+	int err = 0;
+	mutex_lock( &dirLock );
+	dfuse_entry_t *entry = inoRecursive( root, ino );
+	if ( entry == NULL ) {
+		err = ENOENT;
+	} else if ( entry->img == NULL ) {
+		err = EISDIR;
+	} else if ( entry->img->rid == 0 ) {
+		err = ENOENT;
+	} else {
+		entry->refcount++;
+	}
+	mutex_unlock( &dirLock );
+	if ( err != 0 ) {
+		fuse_reply_err( req, err );
+		return;
+	}
+	dnbd3_image_t *image = image_get( entry->img->name, entry->img->rid, true );
+	cmdopen_t *handle = NULL;
+	if ( image != NULL ) {
+		handle = malloc( sizeof(*handle) );
+	}
+	if ( handle == NULL ) {
+		// Either the image is gone or we ran out of memory; undo what we took
+		fuse_reply_err( req, image == NULL ? ENOENT : ENOMEM );
+		if ( image != NULL ) {
+			image_release( image );
+		}
+		mutex_lock( &dirLock );
+		entry->refcount--;
+		mutex_unlock( &dirLock );
+		return;
+	}
+	handle->entry = entry;
+	handle->image = image;
+	fi->fh = (uintptr_t)handle;
+	fi->keep_cache = 1;
+	fuse_reply_open( req, fi );
+}
+
+/**
+ * Insert an image into the directory tree, creating intermediate
+ * directories for every '/'-separated component of `name`.
+ * The final component becomes a file entry named "<component>:<rid>".
+ * Caller must hold dirLock.
+ * @param dir  head pointer of the sibling list to insert into
+ * @param name remaining image path relative to that directory
+ * @param img  image description; attached to the entry only if a new file entry is created
+ * @return the (new or already existing) file entry, or NULL if the
+ *         name is already taken by a directory
+ * Aborts the process on allocation failure, like for asprintf.
+ */
+static dfuse_entry_t* addImage(dfuse_entry_t **dir, const char *name, lookup_t *img)
+{
+	const char *slash = strchr( name, '/' );
+	const bool isLeaf = ( slash == NULL );
+	char *path = NULL;
+	int len;
+	if ( isLeaf ) {
+		// Name portion at the end
+		len = asprintf( &path, "%s:%d", name, (int)img->rid );
+	} else {
+		// Dirname
+		len = asprintf( &path, "%.*s", (int)( slash - name ), name );
+	}
+	if ( len == -1 )
+		abort();
+	dfuse_entry_t *entry = dirLookup( *dir, path );
+	if ( entry != NULL ) {
+		free( path );
+		if ( isLeaf && entry->img == NULL ) {
+			return NULL; // A directory with that name exists
+		}
+	} else {
+		entry = calloc( 1, sizeof( *entry ) );
+		if ( entry == NULL )
+			abort();
+		entry->next = *dir;
+		*dir = entry;
+		entry->name = path; // Entry takes ownership of the string
+		entry->ino = inoCounter++;
+		if ( isLeaf ) {
+			entry->img = img;
+		}
+	}
+	if ( isLeaf )
+		return entry;
+	return addImage( &entry->child, slash + 1, img );
+}
+
+/**
+ * FUSE write handler. Only the control file is writable; a write of
+ * "<image name>[:<rid>]" (terminated by NUL, newline or end of buffer)
+ * asks the server to load that image and expose it below "images".
+ * Replies:
+ *  EROFS  - write to anything but the control file
+ *  ESPIPE - non-zero offset
+ *  ENOSPC - request does not fit into PATHLEN
+ *  EINVAL - malformed/out of range rid, or name collides with a directory
+ *  ENOENT - image could not be found/loaded
+ *  ENOMEM - out of memory
+ * Changes over the previous version: allocation is checked, the lookup_t
+ * is freed if an entry for the image already existed (it was leaked),
+ * isdigit() gets an unsigned char, and rid parsing stops before the
+ * accumulator can overflow.
+ */
+static void ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t off, struct fuse_file_info *fi UNUSED)
+{
+	if ( ino != INO_CTRL ) {
+		fuse_reply_err( req, EROFS );
+		return;
+	}
+	if ( off != 0 ) {
+		fuse_reply_err( req, ESPIPE );
+		return;
+	}
+	if ( size >= PATHLEN ) {
+		fuse_reply_err( req, ENOSPC );
+		return;
+	}
+	size_t colon = 0;
+	int rid = 0;
+	for ( size_t i = 0; i < size; ++i ) {
+		if ( buf[i] == '\0' || buf[i] == '\n' ) {
+			if ( colon == 0 ) {
+				colon = i;
+			}
+			break;
+		}
+		if ( colon != 0 ) {
+			if ( !isdigit( (unsigned char)buf[i] ) ) {
+				logadd( LOG_WARNING, "FUSE: Malformed rid" );
+				fuse_reply_err( req, EINVAL );
+				return;
+			}
+			rid = rid * 10 + ( buf[i] - '0' );
+			if ( rid > 65535 )
+				break; // Rejected by the range check below; stop before int can overflow
+		} else if ( buf[i] == ':' ) {
+			colon = i; // Image name starting with ':' would be broken...
+		}
+	}
+	if ( rid < 0 || rid > 65535 ) {
+		logadd( LOG_WARNING, "FUSE: Invalid rid '%d'", rid );
+		fuse_reply_err( req, EINVAL );
+		return;
+	}
+	if ( colon == 0 ) {
+		colon = size;
+	}
+	lookup_t *lu = malloc( sizeof(*lu) );
+	if ( lu == NULL ) {
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	lu->rid = (uint16_t)rid;
+	lu->req = req;
+	const int nameLen = snprintf( lu->name, PATHLEN, "%.*s", (int)colon, buf );
+	if ( nameLen < 0 || nameLen >= PATHLEN ) {
+		free( lu );
+		fuse_reply_err( req, ENOSPC );
+		return;
+	}
+	logadd( LOG_DEBUG1, "FUSE: Request for '%s:%d'", lu->name, (int)lu->rid );
+	dnbd3_image_t *image = image_getOrLoad( lu->name, lu->rid );
+	if ( image == NULL ) {
+		fuse_reply_err( req, ENOENT );
+		free( lu );
+		return;
+	}
+	bool adopted = false; // Did the directory tree take ownership of lu?
+	mutex_lock( &dirLock );
+	dfuse_entry_t *entry = addImage( &root->child, lu->name, lu );
+	if ( entry != NULL ) {
+		entry->size = image->virtualFilesize;
+		adopted = ( entry->img == lu );
+	}
+	lu->rid = image->rid; // In case it was 0
+	mutex_unlock( &dirLock );
+	image_release( image );
+	if ( !adopted ) {
+		// Either the name collided with a directory, or the entry existed
+		// already and keeps its own lookup_t
+		free( lu );
+	}
+	if ( entry == NULL ) {
+		fuse_reply_err( req, EINVAL );
+	} else {
+		fuse_reply_write( req, size );
+	}
+}
+
+/**
+ * FUSE read handler for image files.
+ * Reads without a handle (fi->fh == 0) or starting at/after the end of the
+ * image are answered with error 0. Requests reaching past the end are
+ * shortened to the virtual image size.
+ * If the image has a cache map and the (block-aligned) range is not fully
+ * cached, the request is handed to uplink_request() and answered later from
+ * uplinkCallback(); such requests are capped at _maxPayload bytes.
+ * Otherwise the reply is built from the image file descriptor, followed by
+ * zero bytes for the part between realFilesize and virtualFilesize.
+ * Replies ENOMEM if the buffer vector cannot be allocated (previously this
+ * would dereference NULL).
+ */
+static void ll_read( fuse_req_t req, fuse_ino_t ino UNUSED, size_t size, off_t off, struct fuse_file_info *fi )
+{
+	if ( fi->fh == 0 ) {
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	cmdopen_t *handle = (cmdopen_t*)fi->fh;
+	dnbd3_image_t *image = handle->image;
+	if ( off < 0 || (uint64_t)off >= image->virtualFilesize ) {
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	if ( off + size > image->virtualFilesize ) {
+		size = image->virtualFilesize - off;
+	}
+
+	// Check if cached locally
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache != NULL ) {
+		// This is a proxied image, check if we need to relay the request...
+		const uint64_t start = (uint64_t)off & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		const uint64_t end = (off + size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		const bool cached = image_isRangeCachedUnsafe( cache, start, end );
+		ref_put( &cache->reference );
+		if ( !cached ) {
+			if ( size > (uint32_t)_maxPayload ) {
+				size = (uint32_t)_maxPayload;
+			}
+			if ( !uplink_request( image, req, &uplinkCallback, 0, off, (uint32_t)size ) ) {
+				logadd( LOG_DEBUG1, "FUSE: Could not relay uncached request to upstream proxy for image %s:%d",
+						image->name, image->rid );
+				fuse_reply_err( req, EIO );
+			}
+			return; // ASYNC
+		}
+	}
+
+	// Is cached
+	size_t readSize = size;
+	if ( off + readSize > image->realFilesize ) {
+		if ( (uint64_t)off >= image->realFilesize ) {
+			readSize = 0;
+		} else {
+			readSize = image->realFilesize - off;
+		}
+	}
+	// fuse_bufvec already contains one fuse_buf, add room for a second one
+	struct fuse_bufvec *vec = calloc( 1, sizeof(*vec) + sizeof(struct fuse_buf) );
+	if ( vec == NULL ) {
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	if ( readSize != 0 ) {
+		// Real data from file
+		vec->buf[vec->count++] = (struct fuse_buf){
+			.size = readSize,
+			.flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_RETRY | FUSE_BUF_FD_SEEK,
+			.fd = image->readFd,
+			.pos = off,
+		};
+	}
+	if ( readSize != size ) {
+		// Remainder lies past the end of the real file; serve zeros
+		vec->buf[vec->count++] = (struct fuse_buf){
+			.size = size - readSize,
+			.mem = nullbytes,
+			.fd = -1,
+		};
+	}
+	fuse_reply_data( req, vec, FUSE_BUF_SPLICE_MOVE );
+	free( vec );
+}
+
+/**
+ * Fill in stat data for one of the fixed inodes (root, "images"
+ * directory, control file).
+ * @return false, leaving stbuf untouched, if ino is not a fixed inode
+ */
+static bool statInternal(fuse_ino_t ino, struct stat *stbuf)
+{
+	if ( ino == INO_ROOT || ino == INO_DIR ) {
+		stbuf->st_mode = S_IFDIR | 0555;
+		stbuf->st_nlink = 2;
+		stbuf->st_mtim = startupTime;
+	} else if ( ino == INO_CTRL ) {
+		stbuf->st_mode = S_IFREG | 0222;
+		stbuf->st_nlink = 1;
+		stbuf->st_size = 0;
+		// Control file reports the current time as mtime
+		clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+	} else {
+		return false;
+	}
+	stbuf->st_atim = startupTime;
+	stbuf->st_ctim = stbuf->st_atim;
+	stbuf->st_uid = 0;
+	stbuf->st_ino = ino;
+	return true;
+}
+
+/**
+ * Find the entry called `name` in the sibling list starting at `dir`.
+ * Returns NULL if the list is empty or has no such entry.
+ * HOLD LOCK
+ */
+static dfuse_entry_t* dirLookup(dfuse_entry_t *dir, const char *name)
+{
+	dfuse_entry_t *cur = dir;
+	while ( cur != NULL && strcmp( cur->name, name ) != 0 ) {
+		cur = cur->next;
+	}
+	return cur;
+}
+
+/**
+ * Depth-first search for the entry with inode number `ino`, starting at
+ * `dir` and walking its siblings; children are only searched for
+ * entries that are directories (img == NULL). Every visited entry is logged.
+ * @return the matching entry, or NULL if there is none
+ */
+static dfuse_entry_t* inoRecursive(dfuse_entry_t *dir, fuse_ino_t ino)
+{
+	dfuse_entry_t *cur = dir;
+	while ( cur != NULL ) {
+		logadd( LOG_DEBUG1, "ino %d is %s", (int)cur->ino, cur->name );
+		if ( cur->ino == ino )
+			return cur;
+		if ( cur->img == NULL ) {
+			dfuse_entry_t *found = inoRecursive( cur->child, ino );
+			if ( found != NULL )
+				return found;
+		}
+		cur = cur->next;
+	}
+	return NULL;
+}
+
+/**
+ * Fill in stat data for an entry of the directory tree: directories
+ * (img == NULL) are reported as 0555 with two links, images as 0444
+ * regular files with the entry's size. All timestamps are startupTime.
+ * HOLD LOCK
+ */
+static void entryToStat(dfuse_entry_t *entry, struct stat *stbuf)
+{
+	const bool isDir = ( entry->img == NULL );
+	if ( !isDir ) {
+		stbuf->st_mode = S_IFREG | 0444;
+		stbuf->st_nlink = 1;
+		stbuf->st_size = entry->size;
+	} else {
+		stbuf->st_mode = S_IFDIR | 0555;
+		stbuf->st_nlink = 2;
+	}
+	stbuf->st_ino = entry->ino;
+	stbuf->st_uid = 0;
+	stbuf->st_mtim = startupTime;
+	stbuf->st_atim = stbuf->st_mtim;
+	stbuf->st_ctim = stbuf->st_atim;
+}
+
+/**
+ * FUSE lookup handler: resolve `name` inside directory inode `parent`.
+ * The root directory only knows the control file and the "images"
+ * directory; every other parent is resolved through the directory tree.
+ * Replies ENOTDIR if parent is an image file and ENOENT if nothing matches.
+ */
+static void ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+	logadd( LOG_DEBUG2, "Lookup at ino %d for '%s'", (int)parent, name );
+	if ( parent == INO_ROOT ) {
+		struct fuse_entry_param param = { 0 };
+		if ( strcmp( name, NAME_DIR ) == 0 ) {
+			param.ino = INO_DIR;
+		} else if ( strcmp( name, NAME_CTRL ) == 0 ) {
+			param.ino = INO_CTRL;
+			param.attr_timeout = param.entry_timeout = 3600;
+		}
+		if ( param.ino == 0 || !statInternal( param.ino, &param.attr ) ) {
+			fuse_reply_err( req, ENOENT );
+		} else {
+			fuse_reply_entry( req, &param );
+		}
+		return;
+	}
+	// Somewhere below "images"
+	mutex_lock( &dirLock );
+	dfuse_entry_t *parentDir = inoRecursive( root, parent );
+	if ( parentDir != NULL && parentDir->img != NULL ) {
+		mutex_unlock( &dirLock );
+		fuse_reply_err( req, ENOTDIR );
+		return;
+	}
+	dfuse_entry_t *hit = NULL;
+	if ( parentDir != NULL ) {
+		hit = dirLookup( parentDir->child, name );
+	}
+	if ( hit == NULL ) {
+		mutex_unlock( &dirLock );
+		fuse_reply_err( req, ENOENT );
+		return;
+	}
+	struct fuse_entry_param param = { .ino = hit->ino };
+	entryToStat( hit, &param.attr );
+	mutex_unlock( &dirLock );
+	fuse_reply_entry( req, &param );
+}
+
+// Growable buffer that collects the directory entries for a readdir reply
+struct dirbuf {
+ char *p; // Heap buffer holding the entries added so far, NULL while empty
+ size_t size; // Number of bytes used in p
+};
+
+/**
+ * Append one directory entry (name + inode number) to the buffer.
+ * The buffer is grown by exactly the size fuse_add_direntry() reports
+ * for the entry. If growing fails, the buffer is left as it was, an
+ * error is logged and the entry is omitted (previously the old buffer
+ * was leaked and a NULL pointer written to).
+ */
+static void dirbuf_add( fuse_req_t req, struct dirbuf *b, const char *name, fuse_ino_t ino )
+{
+	struct stat stbuf = { .st_ino = ino };
+	// First call only determines how much room the entry needs
+	const size_t entLen = fuse_add_direntry( req, NULL, 0, name, NULL, 0 );
+	char *grown = realloc( b->p, b->size + entLen );
+	if ( grown == NULL ) {
+		logadd( LOG_ERROR, "FUSE: Out of memory while building directory listing" );
+		return;
+	}
+	const size_t oldsize = b->size;
+	b->p = grown;
+	b->size = oldsize + entLen;
+	// Last argument is the offset of the next entry, i.e. the new end of buffer
+	fuse_add_direntry( req, b->p + oldsize, entLen, name, &stbuf, b->size );
+}
+
+/**
+ * Reply with the part of `buf` that starts at `off`, limited to
+ * `maxsize` bytes. An offset outside the buffer yields an empty reply.
+ * @return result of fuse_reply_buf()
+ */
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize )
+{
+	if ( off < 0 || off >= (off_t)bufsize ) {
+		return fuse_reply_buf( req, NULL, 0 );
+	}
+	const size_t remaining = bufsize - off;
+	return fuse_reply_buf( req, buf + off, MIN( remaining, maxsize ) );
+}
+
+/**
+ * FUSE readdir handler. Only the root directory can be listed; it
+ * contains ".", "..", the control file and the "images" directory.
+ * Any other inode is answered with EACCES.
+ */
+static void ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi UNUSED)
+{
+	if ( ino != INO_ROOT ) {
+		fuse_reply_err( req, EACCES );
+		return;
+	}
+	const struct {
+		const char *name;
+		fuse_ino_t ino;
+	} fixed[] = {
+		{ ".", INO_ROOT },
+		{ "..", INO_ROOT },
+		{ NAME_CTRL, INO_CTRL },
+		{ NAME_DIR, INO_DIR },
+	};
+	struct dirbuf listing;
+	memset( &listing, 0, sizeof( listing ) );
+	for ( size_t i = 0; i < sizeof( fixed ) / sizeof( fixed[0] ); ++i ) {
+		dirbuf_add( req, &listing, fixed[i].name, fixed[i].ino );
+	}
+	reply_buf_limited( req, listing.p, listing.size, off, size );
+	free( listing.p );
+}
+
+/**
+ * FUSE getattr handler. Fixed inodes are handled by statInternal(),
+ * everything else is looked up in the directory tree. Replies ENOENT
+ * if the inode is unknown.
+ */
+static void ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi UNUSED)
+{
+	struct stat stbuf = { .st_ino = 0 };
+	const bool isFixed = statInternal( ino, &stbuf );
+	if ( !isFixed ) {
+		mutex_lock( &dirLock );
+		dfuse_entry_t *found = inoRecursive( root, ino );
+		if ( found != NULL ) {
+			entryToStat( found, &stbuf );
+		}
+		mutex_unlock( &dirLock );
+	}
+	// st_ino is still 0 if neither source knew the inode
+	if ( stbuf.st_ino != 0 ) {
+		fuse_reply_attr( req, &stbuf, 0 );
+	} else {
+		fuse_reply_err( req, ENOENT );
+	}
+}
+
+/**
+ * FUSE setattr handler. Requested changes are ignored; the reply
+ * is whatever ll_getattr() reports for the inode.
+ */
+void ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr UNUSED, int to_set UNUSED, struct fuse_file_info *fi)
+{
+	// Nothing is applied, just answer with the current attributes
+	ll_getattr( req, ino, fi );
+	return;
+}
+
+/**
+ * FUSE release handler. For handles created by ll_open this drops the
+ * image reference and the directory entry refcount and frees the handle.
+ * Always replies with error 0.
+ */
+void ll_release(fuse_req_t req, fuse_ino_t ino UNUSED, struct fuse_file_info *fi)
+{
+	cmdopen_t *handle = (cmdopen_t*)fi->fh;
+	if ( fi->fh == 0 ) {
+		// No per-file state attached
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	image_release( handle->image );
+	mutex_lock( &dirLock );
+	handle->entry->refcount--;
+	mutex_unlock( &dirLock );
+	free( handle );
+	fuse_reply_err( req, 0 );
+}
+
+/**
+ * Callback passed to uplink_request() by ll_read. `data` is the
+ * fuse request to answer; a NULL buffer is reported as EIO, otherwise
+ * `length` bytes from `buffer` are sent as the reply.
+ */
+static void uplinkCallback(void *data, uint64_t handle UNUSED, uint64_t start UNUSED, uint32_t length, const char *buffer)
+{
+	fuse_req_t req = (fuse_req_t)data;
+	if ( buffer != NULL ) {
+		fuse_reply_buf( req, buffer, length );
+		return;
+	}
+	fuse_reply_err( req, EIO );
+}
+
+// Log one field of the fuse_conn_info named `conn`, using the given printf conversion
+#define DUMP(key,type) logadd( LOG_DEBUG1, "FUSE: " #key ": " type, conn->key )
+/**
+ * FUSE init handler: logs the connection parameters and asks for the
+ * splice capabilities. `userdata` is not used and marked as such, like
+ * all other unused parameters in this file.
+ */
+void ll_init(void *userdata UNUSED, struct fuse_conn_info *conn)
+{
+	DUMP( capable, "%u" );
+	DUMP( congestion_threshold, "%u" );
+	DUMP( max_background, "%u" );
+	//DUMP( max_read, "%u" );
+	DUMP( max_readahead, "%u" );
+	DUMP( max_write, "%u" );
+	DUMP( want, "%u" );
+	conn->want |= FUSE_CAP_SPLICE_READ | FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
+}
+#undef DUMP
+
+/* map the implemented fuse operations */
+// Handed to fuse_lowlevel_new() in dfuse_init; operations not listed here stay unset
+static struct fuse_lowlevel_ops fuseOps = {
+ .lookup = ll_lookup,
+ .getattr = ll_getattr,
+ .setattr = ll_setattr,
+ .readdir = ll_readdir,
+ .open = ll_open,
+ .release = ll_release,
+ .read = ll_read,
+ .write = ll_write,
+ .init = ll_init,
+ //.destroy = ll_destroy,
+};
+
+/**
+ * Mount the FUSE file system and start the thread running its main loop.
+ * May only be called once; a second call logs an error and exits the process.
+ * @param opts optional extra argument handed to the FUSE option parser, may be NULL
+ * @param dir mount point
+ * @return true if mounting succeeded and the thread was started, false otherwise
+ */
+bool dfuse_init(const char *opts, const char *dir)
+{
+ // NONE -> INPROGRESS; anything else means we were called before
+ int ex = INIT_NONE;
+ if ( !atomic_compare_exchange_strong( &initState, &ex, INIT_INPROGRESS ) ) {
+ logadd( LOG_ERROR, "Calling dfuse_init twice" );
+ exit( 1 );
+ }
+ mutex_init( &initLock, LOCK_FUSE_INIT );
+ // Held until setup is finished, so dfuse_shutdown blocks on it
+ mutex_lock( &initLock );
+ mutex_init( &dirLock, LOCK_FUSE_DIR );
+ clock_gettime( CLOCK_REALTIME, &startupTime );
+ // Build a fake command line: program name, optional options, mount point
+ struct fuse_args args = FUSE_ARGS_INIT( 0, NULL );
+ fuse_opt_add_arg( &args, "dnbd3fs" ); // argv[0]
+ if ( opts != NULL ) {
+ fuse_opt_add_arg( &args, opts );
+ }
+ fuse_opt_add_arg( &args, "-odefault_permissions" );
+ fuse_opt_add_arg( &args, dir ); // last param is mount point
+ //
+ if ( fuse_parse_cmdline( &args, &fuseMountPoint, NULL, NULL ) == -1 ) {
+ logadd( LOG_ERROR, "FUSE: Error parsing command line" );
+ goto fail;
+ }
+ fuseChannel = fuse_mount( fuseMountPoint, &args );
+ if ( fuseChannel == NULL ) {
+ logadd( LOG_ERROR, "FUSE: Cannot mount to %s", dir );
+ goto fail;
+ }
+ fuseSession = fuse_lowlevel_new( &args, &fuseOps, sizeof( fuseOps ), NULL );
+ if ( fuseSession == NULL ) {
+ logadd( LOG_ERROR, "FUSE: Error initializing fuse session" );
+ goto fail;
+ }
+ fuse_session_add_chan( fuseSession, fuseChannel );
+ // The new thread moves initState from INPROGRESS to DONE (see fuseMainLoop)
+ if ( 0 != thread_create( &fuseThreadId, NULL, &fuseMainLoop, (void *)NULL ) ) {
+ logadd( LOG_ERROR, "FUSE: Could not start thread" );
+ goto fail;
+ }
+ haveThread = true;
+ // Init OK
+ // NOTE(review): args is only freed on the failure path below - confirm whether that is intended
+ mutex_unlock( &initLock );
+ return true;
+fail:
+ // Tear down whatever was set up so far
+ cleanupFuse();
+ fuse_opt_free_args( &args );
+ initState = INIT_SHUTDOWN;
+ mutex_unlock( &initLock );
+ return false;
+}
+
+/**
+ * Stop the FUSE main loop and wait for its thread to finish.
+ * Returns silently if dfuse_init was never called, and with a warning
+ * if initState is neither INIT_DONE nor INIT_INPROGRESS.
+ */
+void dfuse_shutdown()
+{
+ if ( initState == INIT_NONE )
+ return;
+ for ( ;; ) {
+ // DONE -> SHUTDOWN; only one caller can win this
+ int ex = INIT_DONE;
+ if ( atomic_compare_exchange_strong( &initState, &ex, INIT_SHUTDOWN ) )
+ break; // OK, do the shutdown
+ // NOTE(review): this retries immediately, i.e. spins until the state leaves INPROGRESS
+ if ( ex == INIT_INPROGRESS )
+ continue; // dfuse_init in progress, wait for mutex
+ // Wrong state
+ logadd( LOG_WARNING, "Called dfuse_shutdown without dfuse_init first" );
+ return;
+ }
+ logadd( LOG_INFO, "Shutting down fuse mainloop..." );
+ mutex_lock( &initLock );
+ // Session may already be gone if the main loop ended and cleaned up by itself
+ if ( fuseSession != NULL ) {
+ fuse_session_exit( fuseSession );
+ }
+ // Without a thread nobody else will run the cleanup
+ if ( !haveThread ) {
+ cleanupFuse();
+ }
+ mutex_unlock( &initLock );
+ if ( haveThread ) {
+ // Signal the thread, then wait for it; fuseMainLoop cleans up when its loop returns
+ logadd( LOG_DEBUG1, "FUSE: Sending USR1 to mainloop thread" );
+ pthread_kill( fuseThreadId, SIGUSR1 );
+ pthread_join( fuseThreadId, NULL );
+ }
+}
+
+/**
+ * Thread entry point: marks initialization as done, runs the
+ * multi-threaded FUSE session loop and cleans up once the loop returns.
+ * Bails out with a warning if initState is not INIT_INPROGRESS on entry.
+ * @return always NULL
+ */
+static void* fuseMainLoop(void *data UNUSED)
+{
+	int expected = INIT_INPROGRESS;
+	const bool started = atomic_compare_exchange_strong( &initState, &expected, INIT_DONE );
+	if ( !started ) {
+		logadd( LOG_WARNING, "FUSE: Unexpected state in fuseMainLoop: %d", expected );
+		return NULL;
+	}
+	setThreadName( "fuse" );
+	logadd( LOG_INFO, "FUSE: Starting mainloop" );
+	fuse_session_loop_mt( fuseSession );
+	logadd( LOG_INFO, "FUSE: Left mainloop" );
+	// Loop is over, release session, channel and mount
+	mutex_lock( &initLock );
+	cleanupFuse();
+	mutex_unlock( &initLock );
+	return NULL;
+}
+
+/**
+ * Release session, channel, mount and mount point string, in that
+ * order. Safe to call with any subset of them set up, and safe to call
+ * again afterwards, since everything is reset to NULL. All callers in
+ * this file hold initLock.
+ * The mount point string returned by fuse_parse_cmdline() is now freed
+ * here; it used to be leaked.
+ */
+static void cleanupFuse()
+{
+	if ( fuseChannel != NULL ) {
+		fuse_session_remove_chan( fuseChannel );
+	}
+	if ( fuseSession != NULL ) {
+		fuse_session_destroy( fuseSession );
+		fuseSession = NULL;
+	}
+	if ( fuseMountPoint != NULL && fuseChannel != NULL ) {
+		fuse_unmount( fuseMountPoint, fuseChannel );
+	}
+	fuseChannel = NULL;
+	// Allocated by fuse_parse_cmdline in dfuse_init
+	free( fuseMountPoint );
+	fuseMountPoint = NULL;
+}
+
+#endif // DNBD3_SERVER_FUSE
diff --git a/src/server/fuse.h b/src/server/fuse.h
new file mode 100644
index 0000000..f01ad58
--- /dev/null
+++ b/src/server/fuse.h
@@ -0,0 +1,10 @@
+#ifndef _FUSE_H_
+#define _FUSE_H_
+
+#include <stdbool.h>
+
+/**
+ * Mount the FUSE file system at dir and start its main loop thread.
+ * opts is an optional extra argument for the FUSE option parser and may be NULL.
+ * Returns true on success, false on failure; always false if the
+ * server was built without FUSE support.
+ */
+bool dfuse_init(const char *opts, const char *dir);
+
+/**
+ * Stop the FUSE main loop started by dfuse_init and wait for its thread.
+ * Does nothing if the server was built without FUSE support.
+ */
+void dfuse_shutdown();
+
+#endif
diff --git a/src/server/globals.c b/src/server/globals.c
index f8c3f66..f6432cb 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -1,7 +1,7 @@
#include "globals.h"
#include "ini.h"
#include "locks.h"
-#include "../shared/log.h"
+#include <dnbd3/shared/log.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
@@ -19,22 +19,26 @@ atomic_int _clientPenalty = 0;
atomic_bool _isProxy = false;
atomic_int _backgroundReplication = BGR_FULL;
atomic_int _bgrMinClients = 0;
+atomic_int _bgrWindowSize = 1;
atomic_bool _lookupMissingForProxy = true;
atomic_bool _sparseFiles = false;
+atomic_bool _ignoreAllocErrors = false;
atomic_bool _removeMissingImages = true;
-atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
-atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT;
+atomic_uint _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
+atomic_uint _clientTimeout = SOCKET_TIMEOUT_CLIENT;
atomic_bool _closeUnusedFd = false;
atomic_bool _vmdkLegacyMode = false;
// Not really needed anymore since we have '+' and '-' in alt-servers
atomic_bool _proxyPrivateOnly = false;
+atomic_bool _pretendClient = false;
atomic_int _autoFreeDiskSpaceDelay = 3600 * 10;
// [limits]
atomic_int _maxClients = SERVER_MAX_CLIENTS;
atomic_int _maxImages = SERVER_MAX_IMAGES;
-atomic_int _maxPayload = 9000000; // 9MB
+atomic_uint _maxPayload = 9000000; // 9MB
atomic_uint_fast64_t _maxReplicationSize = (uint64_t)100000000000LL;
-atomic_bool _pretendClient = false;
+atomic_uint _maxPrefetch = 262144; // 256KB
+atomic_uint _minRequestSize = 0;
/**
* True when loading config the first time. Consecutive loads will
@@ -58,31 +62,35 @@ static const char* units = "KMGTPEZY";
static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname);
static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname);
-static bool parse32(const char *in, atomic_int *out, const char *optname) UNUSED;
-static bool parse32u(const char *in, atomic_int *out, const char *optname);
+static bool parse32(const char *in, atomic_int *out, const char *optname);
+static bool parse32u(const char *in, atomic_uint *out, const char *optname);
static int ini_handler(void *custom UNUSED, const char* section, const char* key, const char* value)
{
if ( initialLoad ) {
if ( _basePath == NULL ) SAVE_TO_VAR_STR( dnbd3, basePath );
SAVE_TO_VAR_BOOL( dnbd3, vmdkLegacyMode );
- SAVE_TO_VAR_UINT( dnbd3, listenPort );
- SAVE_TO_VAR_UINT( limits, maxClients );
- SAVE_TO_VAR_UINT( limits, maxImages );
+ SAVE_TO_VAR_INT( dnbd3, listenPort );
+ SAVE_TO_VAR_INT( limits, maxClients );
+ SAVE_TO_VAR_INT( limits, maxImages );
}
SAVE_TO_VAR_BOOL( dnbd3, isProxy );
SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+ SAVE_TO_VAR_INT( dnbd3, bgrWindowSize );
SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
+ SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors );
SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages );
SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd );
- SAVE_TO_VAR_UINT( dnbd3, serverPenalty );
- SAVE_TO_VAR_UINT( dnbd3, clientPenalty );
+ SAVE_TO_VAR_INT( dnbd3, serverPenalty );
+ SAVE_TO_VAR_INT( dnbd3, clientPenalty );
SAVE_TO_VAR_UINT( dnbd3, uplinkTimeout );
SAVE_TO_VAR_UINT( dnbd3, clientTimeout );
SAVE_TO_VAR_UINT( limits, maxPayload );
SAVE_TO_VAR_UINT64( limits, maxReplicationSize );
+ SAVE_TO_VAR_UINT( limits, maxPrefetch );
+ SAVE_TO_VAR_UINT( limits, minRequestSize );
SAVE_TO_VAR_BOOL( dnbd3, pretendClient );
SAVE_TO_VAR_INT( dnbd3, autoFreeDiskSpaceDelay );
if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) {
@@ -111,7 +119,10 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
void globals_loadConfig()
{
char *name = NULL;
- asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME );
+ if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) == -1 ) {
+ logadd( LOG_ERROR, "Memory allocation error for config filename" );
+ exit( 1 );
+ }
if ( name == NULL ) return;
if ( initialLoad ) {
mutex_init( &loadLock, LOCK_LOAD_CONFIG );
@@ -125,9 +136,30 @@ void globals_loadConfig()
if ( initialLoad ) {
sanitizeFixedConfig();
}
- if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
- logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
- _sparseFiles = false;
+ if ( _isProxy ) {
+ if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
+ logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
+ _sparseFiles = false;
+ }
+ if ( _bgrWindowSize < 1 ) {
+ _bgrWindowSize = 1;
+ } else if ( _bgrWindowSize > UPLINK_MAX_QUEUE - 10 ) {
+ _bgrWindowSize = UPLINK_MAX_QUEUE - 10;
+ logadd( LOG_MINOR, "Limiting bgrWindowSize to %d, because of UPLINK_MAX_QUEUE",
+ _bgrWindowSize );
+ }
+ if ( _maxPayload < 256 * 1024 ) {
+ logadd( LOG_WARNING, "maxPayload was increased to 256k" );
+ _maxPayload = 256 * 1024;
+ }
+ if ( _maxPrefetch > _maxPayload ) {
+ logadd( LOG_WARNING, "Reducing maxPrefetch to maxPayload" );
+ _maxPrefetch = _maxPayload;
+ }
+ if ( _minRequestSize > _maxPayload ) {
+ logadd( LOG_WARNING, "Reducing minRequestSize to maxPayload" );
+ _minRequestSize = _maxPayload;
+ }
}
// Dump config as interpreted
char buffer[2000];
@@ -281,7 +313,7 @@ static bool parse32(const char *in, atomic_int *out, const char *optname)
return true;
}
-static bool parse32u(const char *in, atomic_int *out, const char *optname)
+static bool parse32u(const char *in, atomic_uint *out, const char *optname)
{
atomic_int_fast64_t v;
if ( !parse64( in, &v, optname ) ) return false;
@@ -289,7 +321,7 @@ static bool parse32u(const char *in, atomic_int *out, const char *optname)
logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)0, (int)INT_MAX, in );
return false;
}
- *out = (int)v;
+ *out = (unsigned int)v;
return true;
}
@@ -320,8 +352,10 @@ size_t globals_dumpConfig(char *buffer, size_t size)
PBOOL(backgroundReplication);
}
PINT(bgrMinClients);
+ PINT(bgrWindowSize);
PBOOL(lookupMissingForProxy);
PBOOL(sparseFiles);
+ PBOOL(ignoreAllocErrors);
PBOOL(removeMissingImages);
PINT(uplinkTimeout);
PINT(clientTimeout);
@@ -335,6 +369,8 @@ size_t globals_dumpConfig(char *buffer, size_t size)
PINT(maxImages);
PINT(maxPayload);
PUINT64(maxReplicationSize);
+ PINT(maxPrefetch);
+ PINT(minRequestSize);
return size - rem;
}
diff --git a/src/server/globals.h b/src/server/globals.h
index df8c595..bde1184 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -1,9 +1,9 @@
#ifndef _GLOBALS_H_
#define _GLOBALS_H_
-#include "../types.h"
-#include "../shared/fdsignal.h"
-#include "../serverconfig.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/config/server.h>
#include <stdint.h>
#include <stdatomic.h>
#include <time.h>
@@ -18,18 +18,30 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t;
typedef struct _dnbd3_image dnbd3_image_t;
typedef struct _dnbd3_client dnbd3_client_t;
-typedef struct
+typedef void (*uplink_callback)(void *data, uint64_t handle, uint64_t start, uint32_t length, const char *buffer);
+
+typedef struct _dnbd3_queue_client
{
- uint64_t handle; // Client defined handle to pass back in reply
- uint64_t from; // First byte offset of requested block (ie. 4096)
- uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
- dnbd3_client_t * client; // Client to send reply to
- int status; // status of this entry: ULR_*
-#ifdef _DEBUG
- ticks entered; // When this request entered the queue (for debugging)
+ struct _dnbd3_queue_client *next;
+ void* data; // Passed back to callback
+ uint64_t handle; // Passed back to callback
+ uint64_t from, to; // Client range
+ uplink_callback callback; // Callback function
+} dnbd3_queue_client_t;
+
+typedef struct _dnbd3_queue_entry
+{
+ struct _dnbd3_queue_entry *next;
+ uint64_t handle; // Our handle for this entry
+ uint64_t from; // First byte offset of requested block (ie. 4096)
+ uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+ dnbd3_queue_client_t *clients;
+#ifdef DEBUG
+ ticks entered; // When this request entered the queue (for debugging)
#endif
- uint8_t hopCount; // How many hops this request has already taken across proxies
-} dnbd3_queued_request_t;
+ uint8_t hopCount; // How many hops this request has already taken across proxies
+ bool sent; // Already sent to uplink?
+} dnbd3_queue_entry_t;
typedef struct _ns
{
@@ -91,11 +103,12 @@ struct _dnbd3_uplink
bool cycleDetected; // connection cycle between proxies detected for current remote server
int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at
// If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
- uint64_t replicationHandle; // Handle of pending replication request
atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
- atomic_int queueLen; // length of queue
- uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives)
- dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+ atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
+ int queueLen; // length of queue
+ int idleTime; // How many seconds the uplink was idle (apart from keep-alives)
+ dnbd3_queue_entry_t *queue;
+ atomic_uint_fast32_t queueId;
dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
};
@@ -110,6 +123,8 @@ typedef struct
typedef struct
{
ref reference;
+ atomic_bool dirty; // Cache map has been modified outside uplink (only integrity checker for now)
+ bool unchanged; // How many times in a row a reloaded cache map went unchanged (NOTE(review): declared bool, not a counter - confirm intended type)
_Atomic uint8_t map[];
} dnbd3_cache_map_t;
@@ -128,7 +143,6 @@ struct _dnbd3_image
uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k)
uint64_t realFilesize; // actual file size on disk
ticks atime; // last access time
- ticks lastWorkCheck; // last time a non-working image has been checked
ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image
uint32_t masterCrc32; // CRC-32 of the crc-32 list
@@ -136,10 +150,18 @@ struct _dnbd3_image
atomic_int completenessEstimate; // Completeness estimate in percent
atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
- atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected
+ struct {
+ atomic_bool read; // Error reading from file
+ atomic_bool write; // Error writing to file
+ atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back
+ atomic_bool uplink; // No uplink connected
+ atomic_bool queue; // Too many requests waiting on uplink
+ } problem;
uint16_t rid; // revision of image
+ bool accessed; // image was accessed since .meta was written
pthread_mutex_t lock;
};
+#define PIMG(x) (x)->name, (int)(x)->rid
struct _dnbd3_client
{
@@ -147,6 +169,7 @@ struct _dnbd3_client
atomic_uint_fast64_t bytesSent; // Byte counter for this client.
dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake
int sock;
+ _Atomic uint8_t relayedCount; // How many requests are in-flight to the uplink server
bool isServer; // true if a server in proxy mode, false if real client
dnbd3_host_t host;
char hostName[HOSTNAMELEN]; // inet_ntop version of host
@@ -206,12 +229,12 @@ extern atomic_bool _removeMissingImages;
/**
* Read timeout when waiting for or sending data on an uplink
*/
-extern atomic_int _uplinkTimeout;
+extern atomic_uint _uplinkTimeout;
/**
* Read timeout when waiting for or sending data from/to client
*/
-extern atomic_int _clientTimeout;
+extern atomic_uint _clientTimeout;
/**
* If true, images with no active client will have their fd closed after some
@@ -234,6 +257,11 @@ extern atomic_int _backgroundReplication;
extern atomic_int _bgrMinClients;
/**
+ * How many in-flight replication requests we should target (per uplink)
+ */
+extern atomic_int _bgrWindowSize;
+
+/**
* (In proxy mode): If connecting client is a proxy, and the requested image
* is not known locally, should we ask our known alt servers for it?
* Otherwise the request is rejected.
@@ -255,6 +283,12 @@ extern atomic_bool _lookupMissingForProxy;
extern atomic_bool _sparseFiles;
/**
+ * If true, don't abort image replication if preallocating
+ * the image fails, but retry with sparse file.
+ */
+extern atomic_bool _ignoreAllocErrors;
+
+/**
* Port to listen on (default: #define PORT (5003))
*/
extern atomic_int _listenPort;
@@ -275,7 +309,7 @@ extern atomic_int _maxImages;
* Usually this isn't even a megabyte for "real" clients (blockdev
* or fuse).
*/
-extern atomic_int _maxPayload;
+extern atomic_uint _maxPayload;
/**
* If in proxy mode, don't replicate images that are
@@ -298,6 +332,21 @@ extern atomic_bool _pretendClient;
extern atomic_int _autoFreeDiskSpaceDelay;
/**
+ * When handling a client request, this sets the maximum amount
+ * of bytes we prefetch, starting right at the end of the client request.
+ * The prefetch size will be MIN( length * 3, _maxPrefetch ), if
+ * length <= _maxPrefetch, so effectively, setting this to 0 disables
+ * any prefetching.
+ */
+extern atomic_uint _maxPrefetch;
+
+/**
+ * Use with care. Can severely degrade performance.
+ * Set either 0 or very high.
+ */
+extern atomic_uint _minRequestSize;
+
+/**
* Load the server configuration.
*/
void globals_loadConfig();
diff --git a/src/server/helper.h b/src/server/helper.h
index 102cb36..3e1b661 100644
--- a/src/server/helper.h
+++ b/src/server/helper.h
@@ -2,8 +2,8 @@
#define HELPER_H_
#include "server.h"
-#include "../shared/log.h"
-#include "../types.h"
+#include <dnbd3/shared/log.h>
+#include <dnbd3/types.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
diff --git a/src/server/image.c b/src/server/image.c
index 16dae45..51fd5b6 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -5,9 +5,9 @@
#include "locks.h"
#include "integrity.h"
#include "altservers.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../shared/crc32.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/crc32.h>
#include "reference.h"
#include <assert.h>
@@ -46,16 +46,21 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image);
static dnbd3_image_t* image_free(dnbd3_image_t *image);
static bool image_load_all_internal(char *base, char *path);
static bool image_addToList(dnbd3_image_t *image);
-static bool image_load(char *base, char *path, int withUplink);
+static bool image_load(char *base, char *path, bool withUplink);
static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize);
static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
static bool image_ensureDiskSpace(uint64_t size, bool force);
static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count);
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
static void* closeUnusedFds(void*);
+static bool isImageFromUpstream(dnbd3_image_t *image);
+static void* saveLoadAllCacheMaps(void*);
+static void saveCacheMap(dnbd3_image_t *image);
static void allocCacheMap(dnbd3_image_t *image, bool complete);
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime);
+static void loadImageMeta(dnbd3_image_t *image);
static void cmfree(ref *ref)
{
@@ -73,6 +78,7 @@ void image_serverStartup()
mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
mutex_init( &reloadLock, LOCK_RELOAD );
server_addJob( &closeUnusedFds, NULL, 10, 900 );
+ server_addJob( &saveLoadAllCacheMaps, NULL, 9, 20 );
}
/**
@@ -118,39 +124,35 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
const uint64_t firstByteInMap = start >> 15;
const uint64_t lastByteInMap = (end - 1) >> 15;
uint64_t pos;
- // First byte
- uint8_t fb = 0, lb = 0;
- for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- fb |= bit_mask;
- }
- // Last byte
- if ( lastByteInMap != firstByteInMap ) {
- for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
- assert( lastByteInMap == (pos >> 15) );
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- lb |= bit_mask;
- }
- }
- atomic_thread_fence( memory_order_acquire );
- if ( set ) {
- uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
- uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
- setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] );
+ // First and last byte masks
+ const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+ const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
+ if ( firstByteInMap == lastByteInMap ) {
+ if ( set ) {
+ uint8_t o = atomic_fetch_or( &cache->map[firstByteInMap], (uint8_t)(fb & lb) );
+ setNewBlocks = o != ( o | (fb & lb) );
+ } else {
+ atomic_fetch_and( &cache->map[firstByteInMap], (uint8_t)~(fb & lb) );
+ }
} else {
- atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
- atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
- }
- const uint8_t nval = set ? 0xff : 0;
- // Everything in between
- for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
- if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
- setNewBlocks = true;
+ atomic_thread_fence( memory_order_acquire );
+ if ( set ) {
+ uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
+ uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
+ setNewBlocks = ( fo != ( fo | fb ) || lo != ( lo | lb ) );
+ } else {
+ atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
+ atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+ }
+ // Everything in between
+ const uint8_t nval = set ? 0xff : 0;
+ for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+ if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+ setNewBlocks = true;
+ }
}
+ atomic_thread_fence( memory_order_release );
}
- atomic_thread_fence( memory_order_release );
if ( setNewBlocks && image->crc32 != NULL ) {
// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
// for checking, even though this might lead to checking some hash block again, if it was
@@ -164,6 +166,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
integrity_check( image, block, false );
}
}
+ } else if ( !set ) {
+ cache->dirty = true;
}
ref_put( &cache->reference );
}
@@ -239,35 +243,74 @@ bool image_isComplete(dnbd3_image_t *image)
*/
bool image_ensureOpen(dnbd3_image_t *image)
{
- if ( image->readFd != -1 ) return image;
- int newFd = open( image->path, O_RDONLY );
+ bool sizeChanged = false;
+ if ( image->readFd != -1 && !image->problem.changed )
+ return true;
+ int newFd = image->readFd == -1 ? open( image->path, O_RDONLY ) : dup( image->readFd );
if ( newFd == -1 ) {
- logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "[access] Cannot open '%s' for reading (errno=%d)", image->path, errno );
+ image->problem.read = true;
+ }
} else {
- // Check size
+ // Check size + read access
+ char buffer[100];
const off_t flen = lseek( newFd, 0, SEEK_END );
if ( flen == -1 ) {
- logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "Could not seek to end of %s (errno=%d)", image->path, errno );
+ image->problem.read = true;
+ }
close( newFd );
newFd = -1;
} else if ( (uint64_t)flen != image->realFilesize ) {
- logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen );
+ if ( !image->problem.changed ) {
+ logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64,
+ image->realFilesize, (uint64_t)flen );
+ }
+ sizeChanged = true;
+ } else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) {
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)",
+ (int)sizeof(buffer), image->path, errno );
+ image->problem.read = true;
+ }
close( newFd );
newFd = -1;
}
}
if ( newFd == -1 ) {
- mutex_lock( &image->lock );
- image->working = false;
- mutex_unlock( &image->lock );
+ if ( sizeChanged ) {
+ image->problem.changed = true;
+ }
return false;
}
+
+ // Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32,
+ // but only if the size we just got above is correct.
+ if ( image->problem.changed && !sizeChanged ) {
+ if ( image->crc32 == NULL ) {
+ // Cannot verify further, hope for the best
+ image->problem.changed = false;
+ logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", PIMG(image) );
+ } else if ( image_checkRandomBlocks( image, 1, newFd ) ) {
+ // This should have checked the first block (if complete) -> All is well again
+ image->problem.changed = false;
+ logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", PIMG(image) );
+ }
+ } else {
+ image->problem.changed = sizeChanged;
+ }
+
mutex_lock( &image->lock );
if ( image->readFd == -1 ) {
image->readFd = newFd;
+ image->problem.read = false;
mutex_unlock( &image->lock );
} else {
- // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed
+ // There was a race while opening the file (possible because the image lock is not held during the blocking open),
+ // we lost the race so close new fd and proceed.
+ // *OR* we dup()'ed above for cheating when the image changed before.
mutex_unlock( &image->lock );
close( newFd );
}
@@ -296,10 +339,9 @@ dnbd3_image_t* image_byId(int imgId)
* point...
* Locks on: imageListLock, _images[].lock
*/
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
+dnbd3_image_t* image_get(const char *name, uint16_t revision, bool ensureFdOpen)
{
int i;
- const char *removingText = _removeMissingImages ? ", removing from list" : "";
dnbd3_image_t *candidate = NULL;
// Simple sanity check
const size_t slen = strlen( name );
@@ -326,84 +368,36 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
candidate->users++;
mutex_unlock( &imageListLock );
- // Found, see if it works
- // TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
- // TODO: But remember size-changed images forever
- if ( candidate->working || checkIfWorking ) {
- // Is marked working, but might not have an fd open
- if ( !image_ensureOpen( candidate ) ) {
- mutex_lock( &candidate->lock );
- timing_get( &candidate->lastWorkCheck );
- mutex_unlock( &candidate->lock );
- if ( _removeMissingImages ) {
- candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller
- }
- return candidate;
- }
- }
-
- if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state
-
- // ...not working...
-
- // Don't re-check too often
- mutex_lock( &candidate->lock );
- bool check;
- declare_now;
- check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS;
- if ( check ) {
- candidate->lastWorkCheck = now;
- }
- mutex_unlock( &candidate->lock );
- if ( !check ) {
+ if ( !ensureFdOpen ) // Don't want to re-check
return candidate;
- }
- // reaching this point means:
- // 1) We should check if the image is working, it might or might not be in working state right now
- // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.)
- // 3) We made sure not to re-check this image too often
-
- // Common for ro and rw images: Size check, read check
- const off_t len = lseek( candidate->readFd, 0, SEEK_END );
- bool reload = false;
- if ( len == -1 ) {
- logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
- reload = true;
- } else if ( (uint64_t)len != candidate->realFilesize ) {
- logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
- ". Try sending SIGHUP to server if you know what you're doing.",
- candidate->path, candidate->realFilesize, (uint64_t)len );
- } else {
- // Seek worked, file size is same, now see if we can read from file
- char buffer[100];
- if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
- logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.",
- (int)sizeof(buffer), candidate->path, errno, removingText );
- reload = true;
- } else if ( !candidate->working ) {
- // Seems everything is fine again \o/
- candidate->working = true;
- logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid );
- }
- }
+ if ( image_ensureOpen( candidate ) && !candidate->problem.read )
+ return candidate; // We have a read fd and no read or changed problems
- if ( reload ) {
+ // -- image could not be opened again, or is open but has problem --
+
+ if ( _removeMissingImages && !file_isReadable( candidate->path ) ) {
+ candidate = image_remove( candidate );
+ // No image_release here, the image is still returned and should be released by caller
+ } else if ( candidate->readFd != -1 ) {
+ // We cannot just close the fd as it might be in use. Make a copy and remove old entry.
+ candidate = image_remove( candidate );
// Could not access the image with exising fd - mark for reload which will re-open the file.
// make a copy of the image struct but keep the old one around. If/When it's not being used
// anymore, it will be freed automatically.
- logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path );
+ logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path );
dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
img->path = strdup( candidate->path );
img->name = strdup( candidate->name );
img->virtualFilesize = candidate->virtualFilesize;
img->realFilesize = candidate->realFilesize;
- img->atime = now;
+ timing_get( &img->atime );
img->masterCrc32 = candidate->masterCrc32;
img->readFd = -1;
img->rid = candidate->rid;
img->users = 1;
- img->working = false;
+ img->problem.read = true;
+ img->problem.changed = candidate->problem.changed;
img->ref_cacheMap = NULL;
mutex_init( &img->lock, LOCK_IMAGE );
if ( candidate->crc32 != NULL ) {
@@ -419,18 +413,17 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
if ( image_addToList( img ) ) {
image_release( candidate );
candidate = img;
+ // Check if image is incomplete, initialize uplink
+ if ( candidate->ref_cacheMap != NULL ) {
+ uplink_init( candidate, -1, NULL, -1 );
+ }
+ // Try again with new instance
+ image_ensureOpen( candidate );
} else {
img->users = 0;
image_free( img );
}
- // Check if image is incomplete, initialize uplink
- if ( candidate->ref_cacheMap != NULL ) {
- uplink_init( candidate, -1, NULL, -1 );
- }
- // readFd == -1 and working == FALSE at this point,
- // this function needs some splitting up for handling as we need to run most
- // of the above code again. for now we know that the next call for this
- // name:rid will get ne newly inserted "img" and try to re-open the file.
+ // readFd == -1 and problem.read == true
}
return candidate; // We did all we can, hopefully it's working
@@ -449,6 +442,7 @@ dnbd3_image_t* image_lock(dnbd3_image_t *image)
mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
if ( _images[i] == image ) {
+ assert( _images[i]->id == image->id );
image->users++;
mutex_unlock( &imageListLock );
return image;
@@ -479,6 +473,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
// responsible for freeing it
for (int i = 0; i < _num_images; ++i) {
if ( _images[i] == image ) { // Found, do nothing
+ assert( _images[i]->id == image->id );
mutex_unlock( &imageListLock );
return NULL;
}
@@ -518,6 +513,7 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image)
mutex_lock( &imageListLock );
for ( int i = _num_images - 1; i >= 0; --i ) {
if ( _images[i] == image ) {
+ assert( _images[i]->id == image->id );
_images[i] = NULL;
mustFree = ( image->users == 0 );
}
@@ -630,12 +626,18 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
{
assert( image != NULL );
assert( image->users == 0 );
- logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid );
+ logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", PIMG(image) );
// uplink_shutdown might return false to tell us
// that the shutdown is in progress. Bail out since
// this will get called again when the uplink is done.
if ( !uplink_shutdown( image ) )
return NULL;
+ if ( isImageFromUpstream( image ) ) {
+ saveMetaData( image, NULL, 0 );
+ if ( image->ref_cacheMap != NULL ) {
+ saveCacheMap( image );
+ }
+ }
mutex_lock( &image->lock );
ref_setref( &image->ref_cacheMap, NULL );
free( image->crc32 );
@@ -700,7 +702,8 @@ static bool image_load_all_internal(char *base, char *path)
while ( !_shutdown && (entryPtr = readdir( dir )) != NULL ) {
entry = *entryPtr;
- if ( strcmp( entry.d_name, "." ) == 0 || strcmp( entry.d_name, ".." ) == 0 ) continue;
+ if ( entry.d_name[0] == '.' )
+ continue; // No hidden files, no . or ..
if ( strlen( entry.d_name ) > SUBDIR_LEN ) {
logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry.d_name, (int)SUBDIR_LEN );
continue;
@@ -717,7 +720,7 @@ static bool image_load_all_internal(char *base, char *path)
if ( S_ISDIR( st.st_mode ) ) {
image_load_all_internal( base, subpath ); // Recurse
} else if ( !isForbiddenExtension( subpath ) ) {
- image_load( base, subpath, true ); // Load image if possible
+ image_load( base, subpath, false ); // Load image if possible
}
}
closedir( dir );
@@ -756,10 +759,9 @@ static bool image_addToList(dnbd3_image_t *image)
* Note that this is NOT THREAD SAFE so make sure its always
* called on one thread only.
*/
-static bool image_load(char *base, char *path, int withUplink)
+static bool image_load(char *base, char *path, bool withUplink)
{
int revision = -1;
- struct stat st;
dnbd3_cache_map_t *cache = NULL;
uint32_t *crc32list = NULL;
dnbd3_image_t *existing = NULL;
@@ -824,7 +826,9 @@ static bool image_load(char *base, char *path, int withUplink)
fdImage = open( path, O_RDONLY );
}
if ( fdImage == -1 ) {
- logadd( LOG_ERROR, "Could not open '%s' for reading...", path );
+ if ( errno != ENOENT ) {
+ logadd( LOG_ERROR, "[load] Cannot open '%s' for reading (errno=%d)", path, errno );
+ }
goto load_error;
}
// Determine file size
@@ -855,16 +859,16 @@ static bool image_load(char *base, char *path, int withUplink)
// Compare data just loaded to identical image we apparently already loaded
if ( existing != NULL ) {
if ( existing->realFilesize != realFilesize ) {
- logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", PIMG(existing) );
// Image will be replaced below
} else if ( existing->crc32 != NULL && crc32list != NULL
&& memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) {
- logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", PIMG(existing) );
logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." );
logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" );
// Image will be replaced below
} else if ( existing->crc32 == NULL && crc32list != NULL ) {
- logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid );
+ logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", PIMG(existing) );
existing->crc32 = crc32list;
existing->masterCrc32 = masterCrc;
crc32list = NULL;
@@ -872,7 +876,7 @@ static bool image_load(char *base, char *path, int withUplink)
goto load_error; // Keep existing
} else if ( existing->ref_cacheMap != NULL && cache == NULL ) {
// Just ignore that fact, if replication is really complete the cache map will be removed anyways
- logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
+ logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", PIMG(existing) );
function_return = true;
goto load_error; // Keep existing
} else {
@@ -900,19 +904,10 @@ static bool image_load(char *base, char *path, int withUplink)
image->rid = (uint16_t)revision;
image->users = 0;
image->readFd = -1;
- image->working = ( cache == NULL );
timing_get( &image->nextCompletenessEstimate );
image->completenessEstimate = -1;
mutex_init( &image->lock, LOCK_IMAGE );
- int32_t offset;
- if ( stat( path, &st ) == 0 ) {
- // Negatively offset atime by file modification time
- offset = (int32_t)( st.st_mtime - time( NULL ) );
- if ( offset > 0 ) offset = 0;
- } else {
- offset = 0;
- }
- timing_gets( &image->atime, offset );
+ loadImageMeta( image );
// Prevent freeing in cleanup
cache = NULL;
@@ -925,7 +920,7 @@ static bool image_load(char *base, char *path, int withUplink)
// Image is definitely incomplete, initialize uplink worker
if ( image->ref_cacheMap != NULL ) {
- image->working = false;
+ image->problem.uplink = true;
if ( withUplink ) {
uplink_init( image, -1, NULL, -1 );
}
@@ -937,14 +932,14 @@ static bool image_load(char *base, char *path, int withUplink)
// Keep fd for reading
fdImage = -1;
// Check CRC32
- image_checkRandomBlocks( image, 4 );
+ image_checkRandomBlocks( image, 4, -1 );
} else {
logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
image->readFd = -1; // Keep fdImage instead, will be closed below
image = image_free( image );
goto load_error;
}
- logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid );
+ logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", PIMG(image) );
function_return = true;
// Clean exit:
@@ -1027,10 +1022,19 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
return retval;
}
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
+/**
+ * Check up to count random blocks from given image. If fromFd is -1, the check will
+ * be run asynchronously using the integrity checker. Otherwise, the check will
+ * happen in the function and return the result of the check.
+ * @param image image to check
+ * @param count number of blocks to check (max)
+ * @param fromFd if not -1, check synchronously and use this fd for reading; -1 = async
+ * @return true = OK, false = error. Meaningless if fromFd == -1
+ */
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd)
{
if ( image->crc32 == NULL )
- return;
+ return true;
// This checks the first block and (up to) count - 1 random blocks for corruption
// via the known crc32 list. This is very sloppy and is merely supposed to detect
// accidental corruption due to broken dnbd3-proxy functionality or file system
@@ -1038,7 +1042,7 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
assert( count > 0 );
dnbd3_cache_map_t *cache = ref_get_cachemap( image );
const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize );
- int blocks[count];
+ int blocks[count+1]; // +1 for "-1" in sync case
int index = 0, j;
int block;
if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) {
@@ -1062,9 +1066,16 @@ while_end: ;
if ( cache != NULL ) {
ref_put( &cache->reference );
}
- for ( int i = 0; i < index; ++i ) {
- integrity_check( image, blocks[i], true );
+ if ( fromFd == -1 ) {
+ // Async
+ for ( int i = 0; i < index; ++i ) {
+ integrity_check( image, blocks[i], true );
+ }
+ return true;
}
+ // Sync
+ blocks[index] = -1;
+ return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize );
}
/**
@@ -1079,7 +1090,7 @@ bool image_create(char *image, int revision, uint64_t size)
logadd( LOG_ERROR, "revision id invalid: %d", revision );
return false;
}
- char path[PATHLEN], cache[PATHLEN];
+ char path[PATHLEN], cache[PATHLEN+4];
char *lastSlash = strrchr( image, '/' );
if ( lastSlash == NULL ) {
snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
@@ -1090,7 +1101,7 @@ bool image_create(char *image, int revision, uint64_t size)
*lastSlash = '/';
snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
}
- snprintf( cache, PATHLEN, "%s.map", path );
+ snprintf( cache, PATHLEN+4, "%s.map", path );
size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
const int mapsize = IMGSIZE_TO_MAPBYTES(size);
// Write files
@@ -1111,14 +1122,19 @@ bool image_create(char *image, int revision, uint64_t size)
logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err );
}
// Now write image
+ bool fallback = false;
if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) {
logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk"
" space without actually writing all zeroes to the block device." );
logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect"
" divine performance during replication." );
- goto failure_cleanup;
- } else if ( _sparseFiles && !file_setSize( fdImage, size ) ) {
+ if ( !_ignoreAllocErrors ) {
+ goto failure_cleanup;
+ }
+ fallback = true;
+ }
+ if ( ( _sparseFiles || fallback ) && !file_setSize( fdImage, size ) ) {
logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." );
goto failure_cleanup;
@@ -1162,14 +1178,18 @@ dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
// Sanity check
if ( len == 0 || name[len - 1] == '/' || name[0] == '/'
|| name[0] == '.' || strstr( name, "/." ) != NULL ) return NULL;
- // If in proxy mode, check with upstream server first
+ // Re-check latest local revision
+ image = loadImageServer( name, revision );
+ // If in proxy mode, check with upstream servers
if ( _isProxy ) {
+ // Forget the locally loaded one
+ image_release( image );
+ // Check with upstream - if unsuccessful, will return the same
+ // as loadImageServer did
image = loadImageProxy( name, revision, len );
- if ( image != NULL )
- return image;
}
// Lookup on local storage
- return loadImageServer( name, revision );
+ return image;
}
/**
@@ -1227,19 +1247,20 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
int uplinkSock = -1;
dnbd3_host_t uplinkServer;
const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV );
- uint16_t remoteProtocolVersion;
uint16_t remoteRid = revision;
- uint64_t remoteImageSize;
+ uint16_t acceptedRemoteRid = 0;
+ uint16_t remoteProtocolVersion = 0;
struct sockaddr_storage sa;
socklen_t salen;
poll_list_t *cons = sock_newPollList();
logadd( LOG_DEBUG2, "Trying to clone %s:%d from %d hosts", name, (int)revision, count );
for (int i = 0; i < count + 5; ++i) { // "i < count + 5" for 5 additional iterations, waiting on pending connects
- char *remoteName;
+ char *remoteName = NULL;
+ uint64_t remoteImageSize = 0;
bool ok = false;
int sock;
if ( i >= count ) {
- sock = sock_multiConnect( cons, NULL, 100, 1000 );
+ sock = sock_multiConnect( cons, NULL, 100, _uplinkTimeout );
if ( sock == -2 ) break;
} else {
if ( log_hasMask( LOG_DEBUG2 ) ) {
@@ -1248,7 +1269,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
host[len] = '\0';
logadd( LOG_DEBUG2, "Trying to replicate from %s", host );
}
- sock = sock_multiConnect( cons, &servers[i], 100, 1000 );
+ sock = sock_multiConnect( cons, &servers[i], 100, _uplinkTimeout );
}
if ( sock == -1 || sock == -2 ) continue;
salen = sizeof(sa);
@@ -1273,7 +1294,11 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
} else {
ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
}
- ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ if ( ok ) {
+ ok = image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ } else {
+ logadd( LOG_INFO, "Not enough space to replicate '%s:%d'", name, (int)revision );
+ }
mutex_unlock( &reloadLock );
if ( !ok ) goto server_fail;
@@ -1282,26 +1307,32 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &uplinkServer ) ) {
uplinkServer.type = 0;
}
- break;
+ acceptedRemoteRid = remoteRid;
+ break; // TODO: Maybe we should try the remaining servers if rid == 0, in case there's an even newer one
server_fail: ;
close( sock );
}
sock_destroyPollList( cons );
- // If we still have a pointer to a local image, release the reference
- if ( image != NULL ) image_release( image );
+ // If we still have a pointer to a local image, compare rid
+ if ( image != NULL ) {
+ if ( ( revision == 0 && image->rid >= acceptedRemoteRid ) || ( image->rid == revision ) ) {
+ return image;
+ }
+ // release the reference
+ image_release( image );
+ }
// If everything worked out, this call should now actually return the image
- image = image_get( name, remoteRid, false );
+ image = image_get( name, acceptedRemoteRid, false );
if ( image != NULL && uplinkSock != -1 ) {
// If so, init the uplink and pass it the socket
- sock_setTimeout( uplinkSock, _uplinkTimeout );
if ( !uplink_init( image, uplinkSock, &uplinkServer, remoteProtocolVersion ) ) {
close( uplinkSock );
} else {
// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
int i = 0;
- while ( !image->working && ++i < 100 )
+ while ( image->problem.uplink && ++i < 100 )
usleep( 2000 );
}
} else if ( uplinkSock != -1 ) {
@@ -1318,6 +1349,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
{
char imageFile[PATHLEN] = "";
uint16_t detectedRid = 0;
+ bool isLegacyFile = false;
if ( requestedRid != 0 ) {
snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)requestedRid );
@@ -1354,6 +1386,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
&& ( detectedRid == 0 || !file_isReadable( imageFile ) ) ) {
snprintf( imageFile, PATHLEN, "%s/%s", _basePath, name );
detectedRid = 1;
+ isLegacyFile = true;
}
logadd( LOG_DEBUG2, "Trying to load %s:%d ( -> %d) as %s", name, (int)requestedRid, (int)detectedRid, imageFile );
// No file was determined, or it doesn't seem to exist/be readable
@@ -1361,7 +1394,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
logadd( LOG_DEBUG2, "Not found, bailing out" );
return image_get( name, requestedRid, true );
}
- if ( !_vmdkLegacyMode && requestedRid == 0 ) {
+ if ( !isLegacyFile && requestedRid == 0 ) {
// rid 0 requested - check if detected rid is readable, decrease rid if not until we reach 0
while ( detectedRid != 0 ) {
dnbd3_image_t *image = image_get( name, detectedRid, true );
@@ -1429,9 +1462,13 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS
logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name );
} else {
int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 );
- write( fd, &masterCrc, sizeof(uint32_t) );
- write( fd, crc32list, crc32len );
+ ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+ ret += write( fd, crc32list, crc32len );
close( fd );
+ if ( (size_t)ret != crc32len + sizeof(masterCrc) ) {
+ logadd( LOG_WARNING, "Could not save freshly received crc32 list for %s:%d", name, (int)revision );
+ unlink( crcFile );
+ }
}
}
free( crc32list );
@@ -1564,14 +1601,23 @@ json_t* image_getListAsJson()
ref_put( &uplink->reference );
}
- jsonImage = json_pack( "{sisssisisisisI}",
+ int problems = 0;
+#define addproblem(name,val) if (image->problem.name) problems |= (1 << val)
+ addproblem(read, 0);
+ addproblem(write, 1);
+ addproblem(changed, 2);
+ addproblem(uplink, 3);
+ addproblem(queue, 4);
+
+ jsonImage = json_pack( "{sisssisisisisIsi}",
"id", image->id, // id, name, rid never change, so access them without locking
"name", image->name,
"rid", (int) image->rid,
"users", image->users,
"complete", completeness,
"idle", idleTime,
- "size", (json_int_t)image->virtualFilesize );
+ "size", (json_int_t)image->virtualFilesize,
+ "problems", problems );
if ( bytesReceived != 0 ) {
json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) );
}
@@ -1594,7 +1640,7 @@ int image_getCompletenessEstimate(dnbd3_image_t * const image)
assert( image != NULL );
dnbd3_cache_map_t *cache = ref_get_cachemap( image );
if ( cache == NULL )
- return image->working ? 100 : 0;
+ return 100;
const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
if ( unlikely( len == 0 ) ) {
ref_put( &cache->reference );
@@ -1705,46 +1751,51 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
/**
* Make sure at least size bytes are available in _basePath.
* Will delete old images to make room for new ones.
- * TODO: Store last access time of images. Currently the
- * last access time is reset to the file modification time
- * on server restart. Thus it will
- * currently only delete images if server uptime is > 24 hours.
+ * It will only delete images if a configurable uptime is
+ * reached.
* This can be overridden by setting force to true, in case
* free space is desperately needed.
* Return true iff enough space is available. false in random other cases
*/
static bool image_ensureDiskSpace(uint64_t size, bool force)
{
- for ( int maxtries = 0; maxtries < 20; ++maxtries ) {
+ for ( int maxtries = 0; maxtries < 50; ++maxtries ) {
uint64_t available;
if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
- logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", errno );
+ logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left.", errno );
return true;
}
if ( available > size )
return true; // Yay
- if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 )
+ if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 ) {
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but auto-freeing of disk space is disabled.",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)) );
return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
+ }
if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) {
- logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...", (int)(available / (1024ll * 1024ll)),
- (int)(size / (1024 * 1024)), _autoFreeDiskSpaceDelay / 60 );
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)), _autoFreeDiskSpaceDelay / 60 );
return false;
}
- logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
- (int)(size / (1024 * 1024)) );
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)) );
// Find least recently used image
dnbd3_image_t *oldest = NULL;
int i;
mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
dnbd3_image_t *current = _images[i];
- if ( current == NULL ) continue;
- if ( current->users == 0 ) { // Not in use :-)
- if ( oldest == NULL || timing_1le2( &current->atime, &oldest->atime ) ) {
- // Oldest access time so far
- oldest = current;
- }
- }
+ if ( current == NULL || current->users != 0 )
+ continue; // Empty slot or in use
+ if ( oldest != NULL && timing_1le2( &oldest->atime, &current->atime ) )
+ continue; // Already got a newer one
+ if ( !isImageFromUpstream( current ) )
+ continue; // Not replicated, don't touch
+ // Oldest access time so far
+ oldest = current;
}
if ( oldest != NULL ) {
oldest->users++;
@@ -1760,7 +1811,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
image_release( oldest ); // We did users++ above; image might have to be freed entirely
return false;
}
- logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid );
+ logadd( LOG_INFO, "'%s:%d' has to go!", PIMG(oldest) );
char *filename = strdup( oldest->path ); // Copy name as we remove the image first
oldest = image_remove( oldest ); // Remove from list first...
oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed
@@ -1790,15 +1841,14 @@ static void* closeUnusedFds(void* nix UNUSED)
timing_gets( &deadline, -UNUSED_FD_TIMEOUT );
int fds[FDCOUNT];
int fdindex = 0;
+ setThreadName( "unused-fd-close" );
mutex_lock( &imageListLock );
for ( int i = 0; i < _num_images; ++i ) {
dnbd3_image_t * const image = _images[i];
if ( image == NULL || image->readFd == -1 )
continue;
- // TODO: Also close for idle uplinks (uplink_connectionShouldShutdown)
- // TODO: And close writeFd for idle uplinks....
if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
- logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid );
+ logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", PIMG(image) );
fds[fdindex++] = image->readFd;
image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock
if ( fdindex == FDCOUNT )
@@ -1813,6 +1863,177 @@ static void* closeUnusedFds(void* nix UNUSED)
return NULL;
}
+static bool isImageFromUpstream(dnbd3_image_t *image) // true iff we run as proxy AND alt servers exist for image->name
+{
+ if ( !_isProxy )
+ return false; // Not running in proxy mode, so no image is replicated from an upstream
+ // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
+ // for which we have any upstream servers configured. If there's none, don't touch
+ // the cache map on disk.
+ if ( !altservers_imageHasAltServers( image->name ) )
+ return false; // altservers reports no alt servers for this image name; treat image as local-only
+ return true;
+}
+
+static void* saveLoadAllCacheMaps(void* nix UNUSED) // One pass over _images: save maps of replicated images, reload maps of the others
+{
+ static ticks nextSave; // static: deadline for the next "full" pass, kept across calls
+ declare_now;
+ bool full = timing_reached( &nextSave, &now ); // presumably true once the nextSave deadline has passed - see timing.h
+ time_t walltime = 0; // only set (and only used, via saveMetaData) on a full pass
+ setThreadName( "cache-mapper" );
+ if ( full ) {
+ walltime = time( NULL );
+ // Move the deadline forward before doing any work, to avoid concurrent full runs
+ timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY );
+ }
+ mutex_lock( &imageListLock );
+ for ( int i = 0; i < _num_images; ++i ) {
+ dnbd3_image_t * const image = _images[i];
+ if ( image == NULL )
+ continue;
+ image->users++; // Pin the image while imageListLock is dropped below
+ mutex_unlock( &imageListLock );
+ const bool fromUpstream = isImageFromUpstream( image );
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image ); // reference is dropped via ref_put() at "end reload cache map"
+ if ( cache != NULL ) {
+ if ( fromUpstream ) {
+ // Replicated image, we're responsible for updating the map, so save it
+ // Save right away if the dirty bit is set (blocks were invalidated)
+ bool save = cache->dirty;
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( !save ) {
+ // Otherwise, consider longer timeout and byte count limits of uplink
+ if ( uplink != NULL ) {
+ assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+ uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; // bytes received since the last save
+ if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
+ save = true;
+ }
+ }
+ }
+ if ( save ) {
+ cache->dirty = false; // cleared before saving
+ if ( uplink != NULL ) {
+ uplink->bytesReceivedLastSave = uplink->bytesReceived;
+ }
+ saveCacheMap( image );
+ }
+ if ( uplink != NULL ) {
+ ref_put( &uplink->reference );
+ }
+ } else {
+ // We're not replicating this image, if there's a cache map, reload
+ // it periodically, since we might read from a shared storage that
+ // another server instance is writing to.
+ if ( full || ( !cache->unchanged && !image->problem.read ) ) {
+ logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
+ dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
+ if ( onDisk == NULL ) {
+ // No map could be loaded from disk; treated here as "image is complete now"
+ logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
+ ref_setref( &image->ref_cacheMap, NULL );
+ } else {
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
+ // Unchanged: remember that, and discard the freshly loaded copy
+ cache->unchanged = true;
+ onDisk->reference.free( &onDisk->reference );
+ } else {
+ // Replace the image's map with the one just loaded from disk
+ ref_setref( &image->ref_cacheMap, &onDisk->reference );
+ logadd( LOG_DEBUG2, "Map changed" );
+ }
+ }
+ }
+ } // end reload cache map
+ ref_put( &cache->reference );
+ } // end has cache map
+ if ( full && fromUpstream ) {
+ saveMetaData( image, &now, walltime ); // .meta is only written on full passes, and only for replicated images
+ }
+ image_release( image ); // Always do this instead of users-- to handle freeing
+ mutex_lock( &imageListLock );
+ }
+ mutex_unlock( &imageListLock );
+ return NULL;
+}
+
+/**
+ * Saves the cache map of the given image to "<image path>.map".
+ * Returns nothing. If the image has no cache map (any more), or the
+ * map file cannot be opened, nothing is written. Before writing, the
+ * image file is fsync()ed; if that fails, the in-memory map is reduced
+ * to what the on-disk map confirms (or zeroed if that cannot be loaded).
+ * @param image the image
+ */
+static void saveCacheMap(dnbd3_image_t *image)
+{
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL )
+ return; // Race - wasn't NULL in function call above...
+
+ logadd( LOG_DEBUG2, "Saving cache map of %s:%d", PIMG(image) );
+ const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
+ char mapfile[strlen( image->path ) + 4 + 1]; // room for path + ".map" + terminating NUL
+ strcpy( mapfile, image->path );
+ strcat( mapfile, ".map" );
+
+ int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); // no O_TRUNC: an existing file is overwritten in place from offset 0
+ if ( fd == -1 ) {
+ const int err = errno; // saved before ref_put(), which runs before the log call
+ ref_put( &cache->reference );
+ logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+ return;
+ }
+
+ // On Linux we could use readFd, but in general it's not guaranteed to work
+ int imgFd = open( image->path, O_WRONLY );
+ if ( imgFd == -1 ) {
+ logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno ); // map is still written below, without the fsync
+ } else {
+ if ( fsync( imgFd ) == -1 ) {
+ logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno );
+ dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize);
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( old == NULL ) {
+ // Could not load old map. FS might be toast.
+ logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." );
+ memset( cache->map, 0, mapSize );
+ } else {
+ // AND the maps together to be safe: keep only bits set both in memory and in the map loaded from disk
+ for ( int i = 0; i < mapSize; ++i ) {
+ cache->map[i] &= old->map[i];
+ }
+ old->reference.free( &old->reference );
+ }
+ }
+ close( imgFd );
+ }
+
+ // Write current map to file, looping to handle short writes
+ size_t done = 0;
+ while ( done < size ) {
+ const ssize_t ret = write( fd, cache->map + done, size - done );
+ if ( ret == -1 ) {
+ if ( errno == EINTR ) continue; // interrupted, retry the same chunk
+ logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
+ break;
+ }
+ if ( ret <= 0 ) {
+ logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+ break;
+ }
+ done += (size_t)ret;
+ }
+ ref_put( &cache->reference );
+ if ( fsync( fd ) == -1 ) {
+ logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
+ }
+ close( fd );
+ // TODO fsync on parent directory
+}
+
static void allocCacheMap(dnbd3_image_t *image, bool complete)
{
const uint8_t val = complete ? 0xff : 0;
@@ -1822,7 +2043,7 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
memset( cache->map, val, byteSize );
mutex_lock( &image->lock );
if ( image->ref_cacheMap != NULL ) {
- logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid );
+ logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a map for %s:%d", PIMG(image) );
free( cache );
} else {
ref_setref( &image->ref_cacheMap, &cache->reference );
@@ -1830,3 +2051,77 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
mutex_unlock( &image->lock );
}
+/**
+ * Writes the image's last access time to "<image path>.meta" if image->accessed is set. It's assumed you hold a reference to the image
+ */
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime)
+{
+ if ( !image->accessed )
+ return; // flag not set, nothing to write
+ ticks tmp;
+ uint32_t diff;
+ char *fn;
+ if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+ logadd( LOG_WARNING, "Cannot asprintf meta" );
+ return;
+ }
+ if ( now == NULL ) {
+ // Caller supplied no timestamp: take both the ticks value and the wall-clock time here (walltime argument is ignored)
+ timing_get( &tmp );
+ now = &tmp;
+ walltime = time( NULL );
+ }
+ mutex_lock( &image->lock );
+ image->accessed = false; // cleared before the write; stays cleared even if fopen() below fails
+ diff = timing_diff( &image->atime, now ); // presumably the time elapsed between atime and now - confirm in timing.h
+ mutex_unlock( &image->lock );
+ FILE *f = fopen( fn, "w" );
+ if ( f == NULL ) {
+ logadd( LOG_WARNING, "Cannot open %s for writing", fn );
+ } else {
+ fprintf( f, "[main]\natime=%"PRIu64"\n", (uint64_t)( walltime - diff ) );
+ fclose( f );
+ }
+ free( fn );
+ // TODO: fsync() dir
+}
+
+static void loadImageMeta(dnbd3_image_t *image) // Initializes image->atime from "<path>.meta", falling back to the image file's mtime
+{
+ int32_t offset = 1; // 1 doubles as the "nothing loaded from .meta" sentinel, checked further down
+ char *fn;
+ if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+ logadd( LOG_WARNING, "asprintf load" );
+ } else {
+ int fh = open( fn, O_RDONLY );
+ free( fn );
+ if ( fh != -1 ) {
+ char buf[200];
+ ssize_t ret = read( fh, buf, sizeof(buf)-1 ); // single read; leaves room for the terminating NUL
+ close( fh );
+ if ( ret > 0 ) {
+ buf[ret] = '\0';
+ // Do it the cheap way until we actually store more stuff
+ char *pos = strstr( buf, "atime=" );
+ if ( pos != NULL ) {
+ offset = (int32_t)( atol( pos + 6 ) - time( NULL ) ); // stored atime relative to now; negative means in the past
+ }
+ }
+ }
+ }
+ if ( offset == 1 ) { // NOTE(review): a stored atime exactly one second ahead of now also lands here
+ // Nothing from .meta file, use old guesstimate
+ struct stat st;
+ if ( stat( image->path, &st ) == 0 ) {
+ // Negatively offset atime by file modification time
+ offset = (int32_t)( st.st_mtime - time( NULL ) );
+ } else {
+ offset = 0;
+ }
+ image->accessed = true; // saveMetaData() only writes when this flag is set
+ }
+ if ( offset > 0 ) {
+ offset = 0; // never let the access time lie in the future
+ }
+ timing_gets( &image->atime, offset ); // presumably sets atime to now + offset seconds - confirm in timing.h
+}
+
diff --git a/src/server/image.h b/src/server/image.h
index 89791fc..7b6583c 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -19,7 +19,7 @@ bool image_ensureOpen(dnbd3_image_t *image);
dnbd3_image_t* image_byId(int imgId);
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
+dnbd3_image_t* image_get(const char *name, uint16_t revision, bool checkIfWorking);
bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
@@ -49,6 +49,52 @@ void image_closeUnusedFd();
bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
+bool image_saveCacheMap(dnbd3_image_t *image);
+
+/**
+ * Check if given range [start, end) is fully cached. Be careful when using this function because:
+ * 1) you need to hold a reference to the cache map
+ * 2) start and end are assumed to be 4k aligned (and end > 0, since end - 1 is computed on a uint64_t)
+ * 3) start and end are not checked to be in bounds (we don't know the image in this context)
+ */
+static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end)
+{
+ const uint64_t firstByteInMap = start >> 15; // one map byte covers 32KiB (8 blocks of 4KiB)
+ const uint64_t lastByteInMap = (end - 1) >> 15; // end is exclusive, so look at the last byte actually covered
+ const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); // mask: bit of the first block and all higher bits
+ const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); // mask: bit of the last block and all lower bits
+ uint64_t pos;
+ uint8_t b;
+ bool isCached;
+ if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+ b = cache->map[firstByteInMap]; // plain read; this branch issues no fence
+ isCached = ( b & ( fb & lb ) ) == ( fb & lb ); // all bits between first and last block must be set
+ } else {
+ isCached = true;
+ atomic_thread_fence( memory_order_acquire );
+ // First byte
+ if ( isCached ) {
+ b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+ isCached = ( ( b & fb ) == fb );
+ }
+ // Last byte
+ if ( isCached ) {
+ b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+ isCached = ( ( b & lb ) == lb );
+ }
+ // Middle, must be all bits set (0xff)
+ if ( isCached ) {
+ for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+ if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+ isCached = false;
+ break; // first incomplete byte decides, no need to scan further
+ }
+ }
+ }
+ }
+ return isCached;
+}
+
// one byte in the map covers 8 4kib blocks, so 32kib per byte
// "+ (1 << 15) - 1" is required to account for the last bit of
// the image that is smaller than 32kib
diff --git a/src/server/ini.c b/src/server/ini.c
index c796d5c..37c44a3 100644
--- a/src/server/ini.c
+++ b/src/server/ini.c
@@ -52,7 +52,7 @@ static char* find_char_or_comment(const char* s, char c)
/* Version of strncpy that ensures dest (size bytes) is null-terminated. */
static char* strncpy0(char* dest, const char* src, size_t size)
{
- strncpy( dest, src, size );
+ strncpy( dest, src, size - 1 );
dest[size - 1] = '\0';
return dest;
}
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 4006dfc..91e53b8 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -195,9 +195,10 @@ static void* integrity_main(void * data UNUSED)
readFd = directFd;
}
}
- if ( readFd == -1 ) { // Try buffered; flush to disk for that
- image_ensureOpen( image );
- readFd = image->readFd;
+ if ( readFd == -1 ) { // Try buffered as fallback
+ if ( image_ensureOpen( image ) && !image->problem.read ) {
+ readFd = image->readFd;
+ }
}
if ( readFd == -1 ) {
logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path );
@@ -237,16 +238,6 @@ static void* integrity_main(void * data UNUSED)
// Done with this task as nothing left
checkQueue[i].image = NULL;
if ( i + 1 == queueLen ) queueLen--;
- // Mark as working again if applicable
- if ( !foundCorrupted ) {
- dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
- if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper?
- mutex_lock( &image->lock );
- image->working = uplink->current.fd != -1 && image->readFd != -1;
- mutex_unlock( &image->lock );
- ref_put( &uplink->reference );
- }
- }
} else {
// Still more blocks to go...
checkQueue[i].block = blocks[0];
@@ -254,9 +245,6 @@ static void* integrity_main(void * data UNUSED)
}
if ( foundCorrupted && !_shutdown ) {
// Something was fishy, make sure uplink exists
- mutex_lock( &image->lock );
- image->working = false;
- mutex_unlock( &image->lock );
uplink_init( image, -1, NULL, -1 );
}
// Release :-)
diff --git a/src/server/locks.c b/src/server/locks.c
index b39576b..3be73b3 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -7,9 +7,9 @@
#include "locks.h"
#include "helper.h"
-#include "../shared/timing.h"
+#include <dnbd3/shared/timing.h>
-#ifdef _DEBUG
+#ifdef DNBD3_SERVER_DEBUG_LOCKS
#define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
#define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
#define MAXLPT 20
diff --git a/src/server/locks.h b/src/server/locks.h
index e5c9801..3b04caa 100644
--- a/src/server/locks.h
+++ b/src/server/locks.h
@@ -23,10 +23,12 @@
#define LOCK_UPLINK_RTT 200
#define LOCK_UPLINK_SEND 210
#define LOCK_RPC_ACL 220
+#define LOCK_FUSE_INIT 300
+#define LOCK_FUSE_DIR 310
//
-#ifdef _DEBUG
+#ifdef DNBD3_SERVER_DEBUG_LOCKS
#define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio)
#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false)
@@ -55,10 +57,12 @@ void debug_dump_lock_stats();
#endif
-#ifdef DEBUG_THREADS
+#ifdef DNBD3_SERVER_DEBUG_THREADS
+
+#include <dnbd3/shared/log.h>
extern int debugThreadCount;
-#define thread_create(thread,attr,routine,arg) (logadd( LOG_THREAD CREATE, "%d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
+#define thread_create(thread,attr,routine,arg) (logadd( LOG_INFO, "THREAD_CREATE: %d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void*), void *arg)
{
int i;
@@ -68,26 +72,26 @@ static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_att
return pthread_create( thread, attr, start_routine, arg );
}
-#define thread_detach(thread) (logadd( LOG_THREAD DETACH, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
+#define thread_detach(thread) (logadd( LOG_INFO, "THREAD_DETACH: %d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
static inline int debug_thread_detach(pthread_t thread)
{
const int ret = pthread_detach(thread);
if (ret == 0) {
--debugThreadCount;
} else {
- logadd( LOG_THREAD DETACH, "Tried to detach invalid thread (error %d)\n", (int)errno);
+ logadd( LOG_INFO, "THREAD_DETACH: Tried to detach invalid thread (error %d)\n", (int)errno);
exit(1);
}
return ret;
}
-#define thread_join(thread,value) (logadd( LOG_THREAD JOIN, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
+#define thread_join(thread,value) (logadd( LOG_INFO, "THREAD_JOIN: %d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
static inline int debug_thread_join(pthread_t thread, void **value_ptr)
{
const int ret = pthread_join(thread, value_ptr);
if (ret == 0) {
--debugThreadCount;
} else {
- logadd( LOG_THREAD JOIN, "Tried to join invalid thread (error %d)\n", (int)errno);
+ logadd( LOG_INFO, "THREAD_JOIN: Tried to join invalid thread (error %d)\n", (int)errno);
exit(1);
}
return ret;
@@ -99,6 +103,6 @@ static inline int debug_thread_join(pthread_t thread, void **value_ptr)
#define thread_detach(thread) pthread_detach( thread )
#define thread_join(thread,value) pthread_join( thread, value )
-#endif
+#endif /* DNBD3_SERVER_DEBUG_THREADS */
#endif /* LOCKS_H_ */
diff --git a/src/server/net.c b/src/server/net.c
index aba4e7d..eb51d29 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -26,10 +26,10 @@
#include "altservers.h"
#include "reference.h"
-#include "../shared/sockhelper.h"
-#include "../shared/timing.h"
-#include "../shared/protocol.h"
-#include "../serialize.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/serialize.h>
#include <assert.h>
@@ -58,11 +58,12 @@ static atomic_uint_fast64_t totalBytesSent = 0;
static bool addToList(dnbd3_client_t *client);
static void removeFromList(dnbd3_client_t *client);
static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client);
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start, uint32_t length, const char *buffer);
static inline bool recv_request_header(int sock, dnbd3_request_t *request)
{
ssize_t ret, fails = 0;
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
// Read request header from socket
@@ -89,7 +90,7 @@ static inline bool recv_request_header(int sock, dnbd3_request_t *request)
static inline bool recv_request_payload(int sock, uint32_t size, serialized_buffer_t *payload)
{
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
if ( size == 0 ) {
@@ -113,7 +114,7 @@ static inline bool recv_request_payload(int sock, uint32_t size, serialized_buff
* Send reply with optional payload. payload can be null. The caller has to
* acquire the sendMutex first.
*/
-static inline bool send_reply(int sock, dnbd3_reply_t *reply, void *payload)
+static inline bool send_reply(int sock, dnbd3_reply_t *reply, const void *payload)
{
const uint32_t size = reply->size;
fixup_reply( *reply );
@@ -159,7 +160,7 @@ void* net_handleNewConnection(void *clientPtr)
// Await data from client. Since this is a fresh connection, we expect data right away
sock_setTimeout( client->sock, _clientTimeout );
do {
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
const int ret = (int)recv( 0, &request, sizeof(request), MSG_WAITALL );
#else
const int ret = (int)recv( client->sock, &request, sizeof(request), MSG_WAITALL );
@@ -197,6 +198,7 @@ void* net_handleNewConnection(void *clientPtr)
client->hostName[HOSTNAMELEN-1] = '\0';
mutex_unlock( &client->lock );
client->bytesSent = 0;
+ client->relayedCount = 0;
if ( !addToList( client ) ) {
freeClientStruct( client );
@@ -207,6 +209,7 @@ void* net_handleNewConnection(void *clientPtr)
dnbd3_reply_t reply;
dnbd3_image_t *image = NULL;
+ dnbd3_cache_map_t *cache = NULL;
int image_file = -1;
int num;
@@ -215,7 +218,6 @@ void* net_handleNewConnection(void *clientPtr)
serialized_buffer_t payload;
uint16_t rid, client_version;
- uint64_t start, end;
dnbd3_server_entry_t server_list[NUMBER_SERVERS];
@@ -262,22 +264,24 @@ void* net_handleNewConnection(void *clientPtr)
atomic_thread_fence( memory_order_release );
if ( unlikely( image == NULL ) ) {
//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
- } else if ( unlikely( !image->working ) ) {
+ } else if ( unlikely( image->problem.read || image->problem.changed ) ) {
logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
client->hostName, image_name, (int)rid );
} else {
// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
bOk = true;
if ( image->ref_cacheMap != NULL ) {
- dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
- if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
+ if ( image->problem.queue || image->problem.write ) {
bOk = ( rand() % 4 ) == 1;
}
- if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
- usleep( 100000 ); // server gets a penalty and is less likely to be selected
- }
- if ( uplink != NULL ) {
- ref_put( &uplink->reference );
+ if ( bOk ) {
+ if ( image->problem.write ) { // Wait 100ms if local caching is not working so this
+ usleep( 100000 ); // server gets a penalty and is less likely to be selected
+ }
+ if ( image->problem.uplink ) {
+ // Penalize depending on completeness, if no uplink is available
+ usleep( ( 100 - image->completenessEstimate ) * 100 );
+ }
}
}
if ( bOk ) {
@@ -286,6 +290,7 @@ void* net_handleNewConnection(void *clientPtr)
if ( !client->isServer ) {
// Only update immediately if this is a client. Servers are handled on disconnect.
timing_get( &image->atime );
+ image->accessed = true;
}
mutex_unlock( &image->lock );
serializer_reset_write( &payload );
@@ -313,9 +318,8 @@ void* net_handleNewConnection(void *clientPtr)
// client handling mainloop
while ( recv_request_header( client->sock, &request ) ) {
if ( _shutdown ) break;
- switch ( request.cmd ) {
+ if ( likely ( request.cmd == CMD_GET_BLOCK ) ) {
- case CMD_GET_BLOCK:;
const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
reply.handle = request.handle;
if ( unlikely( offset >= image->virtualFilesize ) ) {
@@ -324,7 +328,7 @@ void* net_handleNewConnection(void *clientPtr)
reply.size = 0;
reply.cmd = CMD_ERROR;
send_reply( client->sock, &reply, NULL );
- break;
+ continue;
}
if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
// Sanity check
@@ -332,63 +336,36 @@ void* net_handleNewConnection(void *clientPtr)
reply.size = 0;
reply.cmd = CMD_ERROR;
send_reply( client->sock, &reply, NULL );
- break;
+ continue;
}
- dnbd3_cache_map_t *cache;
- if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) {
+ if ( cache == NULL ) {
+ cache = ref_get_cachemap( image );
+ }
+
+ if ( request.size != 0 && cache != NULL ) {
// This is a proxyed image, check if we need to relay the request...
- start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- bool isCached = true;
- const uint64_t firstByteInMap = start >> 15;
- const uint64_t lastByteInMap = (end - 1) >> 15;
- uint64_t pos;
- uint8_t b;
- atomic_thread_fence( memory_order_acquire );
- // Middle - quick checking
- if ( isCached ) {
- for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
- if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
- isCached = false;
- break;
- }
- }
- }
- // First byte
- if ( isCached ) {
- b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
- for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- if ( (b & bit_mask) == 0 ) {
- isCached = false;
- break;
+ const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+ if ( unlikely( client->relayedCount > 250 ) ) {
+ logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" );
+ for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) {
+ usleep( 10000 );
}
- }
- }
- // Last byte - only check if request spans multiple bytes in cache map
- if ( isCached && firstByteInMap != lastByteInMap ) {
- b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
- for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
- assert( lastByteInMap == (pos >> 15) );
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- if ( (b & bit_mask) == 0 ) {
- isCached = false;
- break;
+ if ( client->relayedCount > 250 ) {
+ logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" );
+ goto exit_client_cleanup;
}
}
- }
- ref_put( &cache->reference );
- if ( !isCached ) {
- if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
- logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
+ client->relayedCount++;
+ if ( !uplink_requestClient( client, &uplinkCallback, request.handle, offset, request.size, request.hops ) ) {
+ client->relayedCount--;
+ logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
client->hostName, image->name, image->rid );
- image->working = false;
goto exit_client_cleanup;
}
- break; // DONE, exit request.cmd switch
+ continue; // Reply arrives on uplink some time later, handle next request now
}
}
@@ -419,7 +396,7 @@ void* net_handleNewConnection(void *clientPtr)
// TODO: Should we consider EOPNOTSUPP on BSD for sendfile and fallback to read/write?
// Linux would set EINVAL or ENOSYS instead, which it unfortunately also does for a couple of other failures :/
// read/write would kill performance anyways so a fallback would probably be of little use either way.
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
char buf[1000];
size_t cnt = realBytes - done;
if ( cnt > 1000 ) {
@@ -456,7 +433,7 @@ void* net_handleNewConnection(void *clientPtr)
}
if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) {
logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid );
- image->working = false;
+ image->problem.read = true;
}
}
goto exit_client_cleanup;
@@ -473,7 +450,16 @@ void* net_handleNewConnection(void *clientPtr)
if ( lock ) mutex_unlock( &client->sendMutex );
// Global per-client counter
client->bytesSent += request.size; // Increase counter for statistics.
- break;
+ continue;
+ }
+ // Any other command
+ // Release cache map every now and then, in case the image was replicated
+ // entirely. Will be re-grabbed on next CMD_GET_BLOCK otherwise.
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ cache = NULL;
+ }
+ switch ( request.cmd ) {
case CMD_GET_SERVERS:
// Build list of known working alt servers
@@ -522,9 +508,9 @@ set_name: ;
logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd );
break;
- }
- }
- }
+ } // end switch
+ } // end loop
+ } // end bOk
exit_client_cleanup: ;
// First remove from list, then add to counter to prevent race condition
removeFromList( client );
@@ -533,8 +519,12 @@ exit_client_cleanup: ;
if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
mutex_lock( &image->lock );
timing_get( &image->atime );
+ image->accessed = true;
mutex_unlock( &image->lock );
}
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ }
freeClientStruct( client ); // This will also call image_release on client->image
return NULL ;
fail_preadd: ;
@@ -695,9 +685,21 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
if ( client->image != NULL ) {
dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
if ( uplink != NULL ) {
- uplink_removeClient( uplink, client );
+ if ( client->relayedCount != 0 ) {
+ uplink_removeEntry( uplink, client, &uplinkCallback );
+ }
ref_put( &uplink->reference );
}
+ if ( client->relayedCount != 0 ) {
+ logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount );
+ int i;
+ for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) {
+ usleep( 10000 );
+ }
+ if ( client->relayedCount != 0 ) {
+ logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount );
+ }
+ }
}
mutex_lock( &client->sendMutex );
if ( client->sock != -1 ) {
@@ -739,15 +741,21 @@ static bool addToList(dnbd3_client_t *client)
return true;
}
-void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle)
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start UNUSED, uint32_t length, const char *buffer)
{
- dnbd3_reply_t reply;
- reply.magic = dnbd3_packet_magic;
- reply.cmd = cmd;
- reply.handle = handle;
- reply.size = 0;
+ dnbd3_client_t *client = (dnbd3_client_t*)data;
+ dnbd3_reply_t reply = {
+ .magic = dnbd3_packet_magic,
+ .cmd = buffer == NULL ? CMD_ERROR : CMD_GET_BLOCK,
+ .handle = handle,
+ .size = length,
+ };
mutex_lock( &client->sendMutex );
- send_reply( client->sock, &reply, NULL );
+ send_reply( client->sock, &reply, buffer );
+ if ( buffer == NULL ) {
+ shutdown( client->sock, SHUT_RDWR );
+ }
+ client->relayedCount--;
mutex_unlock( &client->sendMutex );
}
diff --git a/src/server/net.h b/src/server/net.h
index 7719aef..2d6e5e7 100644
--- a/src/server/net.h
+++ b/src/server/net.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -37,6 +37,4 @@ void net_disconnectAll();
void net_waitForAllDisconnected();
-void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle);
-
#endif /* NET_H_ */
diff --git a/src/server/picohttpparser/CMakeLists.txt b/src/server/picohttpparser/CMakeLists.txt
new file mode 100644
index 0000000..cc6ec96
--- /dev/null
+++ b/src/server/picohttpparser/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(picohttpparser
+ LANGUAGES C)
+
+set(PICOHTTPPARSER_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/picohttpparser.c)
+set(PICOHTTPPARSER_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/picohttpparser.h)
+
+add_library(picohttpparser STATIC ${PICOHTTPPARSER_SOURCE_FILES})
+target_include_directories(picohttpparser PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/src/server/reference.h b/src/server/reference.h
index 4eda546..75a681f 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref )
return ref;
}
+static inline void ref_inc( ref *ref )
+{
+ ++ref->count;
+}
+
static inline void ref_put( ref *ref )
{
if ( --ref->count == 0 ) {
diff --git a/src/server/rpc.c b/src/server/rpc.c
index a454d6d..119bbd5 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -5,7 +5,9 @@
#include "locks.h"
#include "image.h"
#include "altservers.h"
-#include "../shared/sockhelper.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
#include "fileutil.h"
#include "picohttpparser/picohttpparser.h"
#include "urldecode.h"
@@ -101,8 +103,8 @@ void rpc_init()
int fd = open( "/dev/urandom", O_RDONLY );
if ( fd != -1 ) {
uint32_t bla = 1;
- read( fd, &bla, 4 );
- randomRunId = (randomRunId << 32) | bla;
+ (void)!read( fd, &bla, 4 );
+ randomRunId = ((randomRunId & 0xffffffff) << 32) | bla;
}
close( fd );
}
@@ -144,7 +146,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
while ( !_shutdown ) {
// Read request from client
struct phr_header headers[100];
- size_t numHeaders, prevLen = 0, consumed;
+ size_t numHeaders, prevLen = 0, consumed = 0;
struct string method, path;
int minorVersion;
while ( !_shutdown ) {
@@ -174,7 +176,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
// Reaching here means partial request or parse error
if ( pret == -2 ) { // Partial, keep reading
prevLen = hoff;
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
ssize_t ret = recv( 0, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
#else
ssize_t ret = recv( sock, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
@@ -259,7 +261,7 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
{
bool ok;
bool stats = false, images = false, clients = false, space = false;
- bool logfile = false, config = false, altservers = false;
+ bool logfile = false, config = false, altservers = false, version = false;
#define SETVAR(var) if ( !var && STRCMP(fields[i].value, #var) ) var = true
for (size_t i = 0; i < fields_num; ++i) {
if ( !equals( &fields[i].name, &STR_Q ) ) continue;
@@ -270,9 +272,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
else SETVAR(logfile);
else SETVAR(config);
else SETVAR(altservers);
+ else SETVAR(version);
}
#undef SETVAR
- if ( ( stats || space ) && !(permissions & ACL_STATS) ) {
+ if ( ( stats || space || version ) && !(permissions & ACL_STATS) ) {
return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access statistics", -1, keepAlive );
}
if ( images && !(permissions & ACL_IMAGE_LIST) ) {
@@ -308,6 +311,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
statisticsJson = json_pack( "{sI}",
"runId", randomRunId );
}
+ if ( version ) {
+ json_object_set_new( statisticsJson, "version", json_string( DNBD3_VERSION_LONG ", built " DNBD3_BUILD_DATE ) );
+ json_object_set_new( statisticsJson, "build", json_string( DNBD3_BUILD ) );
+ }
if ( space ) {
uint64_t spaceTotal = 0, spaceAvail = 0;
file_freeDiskSpace( _basePath, &spaceTotal, &spaceAvail );
@@ -405,9 +412,11 @@ static bool sendReply(int sock, const char *status, const char *ctype, const cha
if ( keepAlive == HTTP_CLOSE ) {
// Wait for flush
shutdown( sock, SHUT_WR );
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
+ // Don't wait too long in case other side ignores the shutdown
+ sock_setTimeout( sock, 600 );
while ( read( sock, buffer, sizeof buffer ) > 0 );
return false;
}
@@ -451,7 +460,7 @@ static int getacl(dnbd3_host_t *host)
if ( aclRules[i].bitMask != 0 && aclRules[i].host[aclRules[i].bytes] != ( host->addr[aclRules[i].bytes] & aclRules[i].bitMask ) ) continue;
return aclRules[i].permissions;
}
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
return 0x7fffff;
#else
return 0;
@@ -487,7 +496,7 @@ static void addacl(int argc, char **argv, void *data UNUSED)
*slash++ = '\0';
}
if ( !parse_address( argv[0], &host ) ) goto unlock_end;
- long int bits;
+ long int bits = 0;
if ( slash != NULL ) {
char *last;
bits = strtol( slash, &last, 10 );
diff --git a/src/server/serialize.c b/src/server/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/server/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/server/server.c b/src/server/server.c
index 0dddea7..0f75935 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -29,10 +29,12 @@
#include "integrity.h"
#include "threadpool.h"
#include "rpc.h"
+#include "fuse.h"
-#include "../version.h"
-#include "../shared/sockhelper.h"
-#include "../shared/timing.h"
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/timing.h>
#include <signal.h>
#include <getopt.h>
@@ -104,10 +106,14 @@ static void queueJobInternal(job_t *job);
*/
void dnbd3_printHelp(char *argv_0)
{
- printf( "Version: %s\n\n", VERSION_STRING );
+ printf( "Version: %s\n\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
printf( "Usage: %s [OPTIONS]...\n", argv_0 );
printf( "Start the DNBD3 server\n" );
printf( "-c or --config Configuration directory (default /etc/dnbd3-server/)\n" );
+#ifdef DNBD3_SERVER_FUSE
+ printf( "-m or --mount FUSE mount point\n");
+#endif
printf( "-n or --nodaemon Start server in foreground\n" );
printf( "-b or --bind Local Address to bind to\n" );
printf( "-h or --help Show this help text and quit\n" );
@@ -126,7 +132,8 @@ void dnbd3_printHelp(char *argv_0)
*/
void dnbd3_printVersion()
{
- printf( "Version: %s\n", VERSION_STRING );
+ printf( "dnbd3-server version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
exit( 0 );
}
@@ -140,6 +147,8 @@ _Noreturn static void dnbd3_cleanup()
_shutdown = true;
logadd( LOG_INFO, "Cleanup..." );
+ dfuse_shutdown();
+
if ( hasTimerThread ) {
pthread_kill( timerThread, SIGINT );
thread_join( timerThread, NULL );
@@ -190,11 +199,13 @@ int main(int argc, char *argv[])
char *paramCreate = NULL;
char *bindAddress = NULL;
char *errorMsg = NULL;
+ char *mountDir = NULL;
int64_t paramSize = -1;
int paramRevision = -1;
- static const char *optString = "b:c:d:hnv?";
+ static const char *optString = "b:c:m:d:hnv?";
static const struct option longOpts[] = {
{ "config", required_argument, NULL, 'c' },
+ { "mount", required_argument, NULL, 'm' },
{ "nodaemon", no_argument, NULL, 'n' },
{ "reload", no_argument, NULL, 'r' },
{ "help", no_argument, NULL, 'h' },
@@ -209,6 +220,16 @@ int main(int argc, char *argv[])
{ 0, 0, 0, 0 }
};
+ log_init();
+
+ /* set proper output stream for AFL */
+#ifdef DNBD3_SERVER_AFL
+ if ( log_setConsoleOutputStream(stderr) < 0 ) {
+ logadd( LOG_ERROR, "Failed to set output stream for AFL to stderr" );
+ exit( EXIT_FAILURE );
+ }
+#endif
+
mainPid = getpid();
mainThread = pthread_self();
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
@@ -218,6 +239,13 @@ int main(int argc, char *argv[])
case 'c':
_configDir = strdup( optarg );
break;
+ case 'm':
+#ifndef DNBD3_SERVER_FUSE
+ fprintf( stderr, "FUSE support not enabled at build time.\n" );
+ return 8;
+#endif
+ mountDir = strdup( optarg );
+ break;
case 'n':
demonize = 0;
break;
@@ -263,6 +291,7 @@ int main(int argc, char *argv[])
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
}
+
// Load general config
if ( _configDir == NULL ) _configDir = strdup( "/etc/dnbd3-server" );
@@ -275,9 +304,7 @@ int main(int argc, char *argv[])
timing_setBase();
timing_get( &startupTime );
-#ifdef AFL_MODE
- // ###### AFL
- //
+#ifdef DNBD3_SERVER_AFL
image_serverStartup();
net_init();
uplink_globalsInit();
@@ -301,9 +328,7 @@ int main(int argc, char *argv[])
net_handleNewConnection( dnbd3_client );
exit( 0 );
}
- //
- // ###### AFL END
-#endif
+#endif /* DNBD3_SERVER_AFL */
// One-shots first:
@@ -315,7 +340,10 @@ int main(int argc, char *argv[])
// No one-shot detected, normal server operation or errormsg serving
if ( demonize ) {
logadd( LOG_INFO, "Forking into background, see log file for further information" );
- daemon( 1, 0 );
+ if ( daemon( 0, 0 ) == -1 ) {
+ logadd( LOG_ERROR, "Could not daemon(): errno=%d", errno );
+ exit( 1 );
+ }
}
if ( errorMsg != NULL ) {
setupNetwork( bindAddress );
@@ -339,7 +367,15 @@ int main(int argc, char *argv[])
net_init();
uplink_globalsInit();
rpc_init();
- logadd( LOG_INFO, "DNBD3 server starting.... Machine type: " ENDIAN_MODE );
+ if ( mountDir != NULL && !dfuse_init( "-oallow_other", mountDir ) ) {
+ logadd( LOG_ERROR, "Cannot mount fuse directory to %s", mountDir );
+ dnbd3_cleanup();
+ return EXIT_FAILURE;
+ }
+ logadd( LOG_INFO, "DNBD3 server starting...." );
+ logadd( LOG_INFO, "Machine type: " DNBD3_ENDIAN_MODE );
+ logadd( LOG_INFO, "Build Type: %s", DNBD3_BUILD );
+ logadd( LOG_INFO, "Version: %s, built %s", DNBD3_VERSION_LONG, DNBD3_BUILD_DATE );
if ( altservers_load() < 0 ) {
logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
@@ -379,10 +415,11 @@ int main(int argc, char *argv[])
// Initialize thread pool
if ( !threadpool_init( 8 ) ) {
logadd( LOG_ERROR, "Could not init thread pool!\n" );
+ dnbd3_cleanup();
exit( EXIT_FAILURE );
}
- logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING );
+ logadd( LOG_INFO, "Server is ready." );
if ( thread_create( &timerThread, NULL, &timerMainloop, NULL ) == 0 ) {
hasTimerThread = true;
@@ -398,7 +435,7 @@ int main(int argc, char *argv[])
if ( sigReload ) {
sigReload = false;
logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" );
- threadpool_run( &server_asyncImageListLoad, NULL );
+ threadpool_run( &server_asyncImageListLoad, NULL, "IMAGE_RELOAD" );
}
if ( sigLogCycle ) {
sigLogCycle = false;
@@ -425,7 +462,7 @@ int main(int argc, char *argv[])
continue;
}
- if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) {
+ if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client, "CLIENT" ) ) {
logadd( LOG_ERROR, "Could not start thread for new connection." );
free( dnbd3_client );
continue;
@@ -520,10 +557,11 @@ static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
if ( info->si_pid != 0 && !pthread_equal( pthread_self(), mainThread ) ) {
pthread_kill( mainThread, info->si_signo ); // And relay signal if we're not the main thread
}
- }
- if ( pthread_equal( pthread_self(), mainThread ) ) {
- // Signal received by main thread -- handle
- dnbd3_handleSignal( signum );
+ // Source is not this process -- only then do we honor signals
+ if ( pthread_equal( pthread_self(), mainThread ) ) {
+ // Signal received by main thread -- handle
+ dnbd3_handleSignal( signum );
+ }
}
}
@@ -568,7 +606,7 @@ static int handlePendingJobs(void)
jobHead = *temp; // Make it list head
*temp = NULL; // Split off part before that
while ( todo != NULL ) {
- threadpool_run( todo->startRoutine, todo->arg );
+ threadpool_run( todo->startRoutine, todo->arg, "TIMER_TASK" );
old = todo;
todo = todo->next;
if ( old->intervalSecs == 0 ) {
diff --git a/src/server/server.h b/src/server/server.h
index a026eb6..e93d8f5 100644
--- a/src/server/server.h
+++ b/src/server/server.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -22,7 +22,7 @@
#define SERVER_H_
#include "globals.h"
-#include "../types.h"
+#include <dnbd3/types.h>
uint32_t dnbd3_serverUptime();
void server_addJob(void *(*startRoutine)(void *), void *arg, int delaySecs, int intervalSecs);
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 0b46fd6..a21bd0d 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -8,6 +8,7 @@ typedef struct _entry_t {
dnbd3_signal_t* signal;
void *(*startRoutine)(void *);
void * arg;
+ const char *name;
} entry_t;
static void *threadpool_worker(void *entryPtr);
@@ -56,21 +57,22 @@ void threadpool_waitEmpty()
} while ( activeThreads != 0 );
}
-bool threadpool_run(void *(*startRoutine)(void *), void *arg)
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name)
{
if ( unlikely( _shutdown ) ) {
logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
return false;
}
+#ifdef DEBUG
if ( unlikely( startRoutine == NULL ) ) {
logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
return false; // Or bail out!?
}
+#endif
entry_t *entry = NULL;
for ( int i = 0; i < maxIdleThreads; ++i ) {
- entry_t *cur = pool[i];
- if ( cur != NULL && atomic_compare_exchange_weak( &pool[i], &cur, NULL ) ) {
- entry = cur;
+ entry = atomic_exchange( &pool[i], NULL );
+ if ( entry != NULL ) {
break;
}
}
@@ -87,7 +89,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
return false;
}
if ( 0 != thread_create( &(entry->thread), &threadAttrs, threadpool_worker, (void*)entry ) ) {
- logadd( LOG_WARNING, "Could not create new thread for thread pool\n" );
+ logadd( LOG_WARNING, "Could not create new thread for thread pool (%d active)\n", (int)activeThreads );
signal_close( entry->signal );
free( entry );
return false;
@@ -96,6 +98,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
}
entry->startRoutine = startRoutine;
entry->arg = arg;
+ entry->name = name;
atomic_thread_fence( memory_order_release );
signal_call( entry->signal );
return true;
@@ -120,10 +123,15 @@ keep_going:;
logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
continue;
}
+#ifdef DEBUG
if ( entry->startRoutine == NULL ) {
logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
exit( 1 );
}
+ if ( entry->name != NULL ) {
+ setThreadName( entry->name );
+ }
+#endif
// Start assigned work
(*entry->startRoutine)( entry->arg );
// Reset vars for safety
@@ -143,6 +151,7 @@ keep_going:;
// Reaching here means pool is full; just let the thread exit
break;
}
+ setThreadName( "[dead]" );
signal_close( entry->signal );
free( entry );
activeThreads--;
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
index ee0b3aa..c30d44f 100644
--- a/src/server/threadpool.h
+++ b/src/server/threadpool.h
@@ -1,7 +1,7 @@
#ifndef _THREADPOOL_H_
#define _THREADPOOL_H_
-#include "../types.h"
+#include <dnbd3/types.h>
/**
* Initialize the thread pool. This must be called before using
@@ -26,9 +26,10 @@ void threadpool_waitEmpty();
* Run a thread using the thread pool.
* @param startRoutine function to run in new thread
* @param arg argument to pass to thead
+ * @param name STRING CONSTANT (literal) for debugging purposes
* @return true if thread was started
*/
-bool threadpool_run(void *(*startRoutine)(void *), void *arg);
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name);
#endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index f39e633..8a83124 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -4,10 +4,11 @@
#include "image.h"
#include "altservers.h"
#include "net.h"
-#include "../shared/sockhelper.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../shared/crc32.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/crc32.h>
+#include "threadpool.h"
#include "reference.h"
#include <assert.h>
@@ -17,49 +18,35 @@
#include <unistd.h>
#include <stdatomic.h>
+static const uint8_t HOP_FLAG_BGR = 0x80;
+static const uint8_t HOP_FLAG_PREFETCH = 0x40;
#define FILE_BYTES_PER_MAP_BYTE ( DNBD3_BLOCK_SIZE * 8 )
#define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE )
#define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) )
-#define REP_NONE ( (uint64_t)0xffffffffffffffff )
-
-// Status of request in queue
-
-// Slot is free, can be used.
-// Must only be set in uplink_handle_receive() or uplink_remove_client()
-#define ULR_FREE 0
-// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
-// Must only be set in uplink_request()
-#define ULR_NEW 1
-// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
-// Must only be set in uplink_mainloop() or uplink_request()
-#define ULR_PENDING 2
-// Slot is being processed, do not consider for hop on.
-// Must only be set in uplink_handle_receive()
-#define ULR_PROCESSING 3
-
-static const char *const NAMES_ULR[4] = {
- [ULR_FREE] = "ULR_FREE",
- [ULR_NEW] = "ULR_NEW",
- [ULR_PENDING] = "ULR_PENDING",
- [ULR_PROCESSING] = "ULR_PROCESSING",
-};
-
static atomic_uint_fast64_t totalBytesReceived = 0;
+typedef struct {
+ uint64_t start, end, handle;
+} req_t;
+
static void cancelAllRequests(dnbd3_uplink_t *uplink);
-static void uplink_free(ref *ref);
+static void freeUplinkStruct(ref *ref);
static void* uplink_mainloop(void *data);
-static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
-static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
-static void uplink_handleReceive(dnbd3_uplink_t *uplink);
-static int uplink_sendKeepalive(const int fd);
-static void uplink_addCrc32(dnbd3_uplink_t *uplink);
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
-static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
-static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
-static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly);
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
+static void handleReceive(dnbd3_uplink_t *uplink);
+static bool sendKeepalive(dnbd3_uplink_t *uplink);
+static void requestCrc32List(dnbd3_uplink_t *uplink);
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink);
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink);
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle);
+static bool uplink_requestInternal(dnbd3_uplink_t *uplink, void *data, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
+
+#define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) )
// ############ uplink connection handling
@@ -81,6 +68,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
{
if ( !_isProxy || _shutdown ) return false;
assert( image != NULL );
+ if ( sock == -1 && !altservers_imageHasAltServers( image->name ) )
+ return false; // Nothing to do
mutex_lock( &image->lock );
dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
if ( uplink != NULL ) {
@@ -97,13 +86,15 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
}
uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
// Start with one reference for the uplink thread. We'll return it when the thread finishes
- ref_init( &uplink->reference, uplink_free, 1 );
+ ref_init( &uplink->reference, freeUplinkStruct, 1 );
mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
uplink->image = image;
uplink->bytesReceived = 0;
- uplink->idleTime = 0;
+ uplink->bytesReceivedLastSave = 0;
+ uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90;
+ uplink->queue = NULL;
uplink->queueLen = 0;
uplink->cacheFd = -1;
uplink->signal = signal_new();
@@ -111,12 +102,14 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." );
goto failure;
}
- uplink->replicationHandle = REP_NONE;
mutex_lock( &uplink->rttLock );
mutex_lock( &uplink->sendMutex );
uplink->current.fd = -1;
mutex_unlock( &uplink->sendMutex );
uplink->cycleDetected = false;
+ image->problem.uplink = true;
+ image->problem.write = true;
+ image->problem.queue = false;
if ( sock != -1 ) {
uplink->better.fd = sock;
int index = altservers_hostToIndex( host );
@@ -139,7 +132,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
return true;
failure: ;
if ( uplink != NULL ) {
- image->users++; // Expected by uplink_free()
+ image->users++; // Expected by freeUplinkStruct()
ref_put( &uplink->reference ); // The ref for the uplink thread that never was
}
mutex_unlock( &image->lock );
@@ -166,13 +159,13 @@ bool uplink_shutdown(dnbd3_image_t *image)
image->users++; // Prevent free while uplink shuts down
signal_call( uplink->signal );
} else {
- logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid );
+ logadd( LOG_ERROR, "This will never happen. '%s:%d'", PIMG(image) );
}
cancelAllRequests( uplink );
ref_setref( &image->uplinkref, NULL );
- ref_put( &uplink->reference );
mutex_unlock( &uplink->queueLock );
bool retval = ( exp && image->users == 0 );
+ ref_put( &uplink->reference );
mutex_unlock( &image->lock );
return retval;
}
@@ -183,19 +176,28 @@ bool uplink_shutdown(dnbd3_image_t *image)
*/
static void cancelAllRequests(dnbd3_uplink_t *uplink)
{
- for ( int i = 0; i < uplink->queueLen; ++i ) {
- if ( uplink->queue[i].status != ULR_FREE ) {
- net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle );
- uplink->queue[i].status = ULR_FREE;
+ dnbd3_queue_entry_t *it = uplink->queue;
+ while ( it != NULL ) {
+ dnbd3_queue_client_t *cit = it->clients;
+ while ( cit != NULL ) {
+ (*cit->callback)( cit->data, cit->handle, 0, 0, NULL );
+ dnbd3_queue_client_t *next = cit->next;
+ free( cit );
+ cit = next;
}
+ dnbd3_queue_entry_t *next = it->next;
+ free( it );
+ it = next;
}
+ uplink->queue = NULL;
uplink->queueLen = 0;
+ uplink->image->problem.queue = false;
}
-static void uplink_free(ref *ref)
+static void freeUplinkStruct(ref *ref)
{
dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
- logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid );
+ logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) );
assert( uplink->queueLen == 0 );
if ( uplink->signal != NULL ) {
signal_close( uplink->signal );
@@ -226,35 +228,36 @@ static void uplink_free(ref *ref)
* Remove given client from uplink request queue
* Locks on: uplink.queueLock
*/
-void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
+void uplink_removeEntry(dnbd3_uplink_t *uplink, void *data, uplink_callback callback)
{
mutex_lock( &uplink->queueLock );
- for (int i = uplink->queueLen - 1; i >= 0; --i) {
- if ( uplink->queue[i].client == client ) {
- // Make sure client doesn't get destroyed while we're sending it data
- mutex_lock( &client->sendMutex );
- mutex_unlock( &client->sendMutex );
- uplink->queue[i].client = NULL;
- uplink->queue[i].status = ULR_FREE;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; ) {
+ if ( (**cit).data == data && (**cit).callback == callback ) {
+ (*(**cit).callback)( (**cit).data, (**cit).handle, 0, 0, NULL );
+ dnbd3_queue_client_t *entry = *cit;
+ *cit = (**cit).next;
+ free( entry );
+ } else {
+ cit = &(**cit).next;
+ }
}
- if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--;
}
mutex_unlock( &uplink->queueLock );
}
/**
- * Request a chunk of data through an uplink server
- * Locks on: image.lock, uplink.queueLock
+ * Called from a client (proxy) connection to request a missing part of the image.
+ * The caller has made sure that the range is actually missing.
+ *
+ * Requests whose hop count (lower six bits of hops) exceeds 60 are refused.
+ * If the image has no uplink yet, one is initialized on demand via uplink_init().
+ * If hops > 1 and the client's address equals the address of the current uplink
+ * server, a proxy cycle is assumed: uplink->cycleDetected is set, the uplink
+ * thread is signalled and the request is rejected.
+ * Otherwise the request is handed to uplink_requestInternal(), with the client
+ * as the callback's data pointer. The uplink reference is released before returning.
+ *
+ * @return false if the request was refused or no uplink is available,
+ * otherwise the result of uplink_requestInternal()
*/
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+bool uplink_requestClient(dnbd3_client_t *client, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
{
- if ( client == NULL || client->image == NULL )
- return false;
- if ( length > (uint32_t)_maxPayload ) {
- logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
+ assert( client != NULL && callback != NULL );
+ if ( ( hops & 0x3f ) > 60 ) { // This is just silly
+ logadd( LOG_WARNING, "Refusing to relay a request that has > 60 hops" );
return false;
}
- dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref );
+ dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
if ( unlikely( uplink == NULL ) ) {
uplink_init( client->image, -1, NULL, -1 );
uplink = ref_get_uplink( &client->image->uplinkref );
@@ -263,160 +266,275 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
return false;
}
}
- if ( uplink->shutdown ) {
- logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
- goto fail_ref;
- }
// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
// This might be a false positive if there are multiple instances running on the same host (IP)
- if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
+ bool ret;
+ if ( hops > 1 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
uplink->cycleDetected = true;
signal_call( uplink->signal );
logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
- goto fail_ref;
+ ret = false;
+ } else {
+ ret = uplink_requestInternal( uplink, (void*)client, callback, handle, start, length, hops );
}
+ ref_put( &uplink->reference ); // Release the reference taken by ref_get_uplink() on both paths
+ return ret;
+}
- int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
- int existingType = -1; // ULR_* type of existing request
- int i;
- int freeSlot = -1;
- int firstUsedSlot = -1;
- bool requestLoop = false;
- const uint64_t end = start + length;
+/**
+ * Called by integrated fuse module
+ *
+ * Fetches the uplink of the given image, initializing one on demand via
+ * uplink_init() if there is none yet, and forwards the request to
+ * uplink_requestInternal() with a hop count of 0. data and callback are
+ * passed through unchanged. The uplink reference is released before returning.
+ *
+ * @return false if no uplink could be obtained for the image,
+ * otherwise the result of uplink_requestInternal()
+ */
+bool uplink_request(dnbd3_image_t *image, void *data, uplink_callback callback,
+ uint64_t handle, uint64_t start, uint32_t length)
+{
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( unlikely( uplink == NULL ) ) {
+ uplink_init( image, -1, NULL, -1 );
+ uplink = ref_get_uplink( &image->uplinkref ); // Retry after on-demand init
+ if ( uplink == NULL ) {
+ logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+ return false;
+ }
+ }
+ bool ret = uplink_requestInternal( uplink, data, callback, handle, start, length, 0 );
+ ref_put( &uplink->reference );
+ return ret;
+}
+/**
+ * Grow the range [start, *end) so that it spans at least 'wanted' bytes; if it
+ * already does, *end is left untouched.
+ * In BGR_HASHBLOCK mode, an extension that would end in a different hash block
+ * than the current *end is rounded down to a HASH_BLOCK_SIZE boundary (the mask
+ * assumes HASH_BLOCK_SIZE is a power of two).
+ * The result is capped at image->virtualFilesize and finally rounded up to a
+ * multiple of DNBD3_BLOCK_SIZE.
+ * NOTE(review): the rounding happens after the cap; presumably virtualFilesize
+ * is block-aligned, so the result cannot exceed it -- confirm.
+ */
+static void extendRequest(uint64_t start, uint64_t *end, const dnbd3_image_t *image, uint32_t wanted)
+{
+ uint32_t length = (uint32_t)( *end - start ); // Current size of the request
+ if ( length >= wanted )
+ return;
+ length = wanted;
+ if ( unlikely( _backgroundReplication == BGR_HASHBLOCK
+ && *end / HASH_BLOCK_SIZE != (start + length) / HASH_BLOCK_SIZE ) ) {
+ // Don't extend across hash-block border in this mode
+ *end = ( start + length ) & ~( HASH_BLOCK_SIZE - 1 );
+ } else {
+ *end = start + length;
+ }
+ if ( unlikely( *end > image->virtualFilesize ) ) {
+ *end = image->virtualFilesize;
+ }
+ *end = ( *end + DNBD3_BLOCK_SIZE - 1 ) & ~( DNBD3_BLOCK_SIZE - 1 );
+ //logadd( LOG_DEBUG2, "Extended %"PRIx64" from %"PRIx64" to %"PRIx64, start, end, req.end );
+}
+/**
+ * Send a block request for the range [req->start, req->end), tagged with
+ * req->handle, to the current uplink server via dnbd3_get_block().
+ * The hop count is passed through COND_HOPCOUNT together with the version of
+ * the current uplink server.
+ * The caller in this file invokes it with uplink->sendMutex held.
+ *
+ * @return false if there is no uplink socket (current.fd == -1),
+ * otherwise the result of dnbd3_get_block()
+ */
+static bool requestBlock(dnbd3_uplink_t *uplink, req_t *req, uint8_t hops)
+{
+ if ( uplink->current.fd == -1 )
+ return false;
+ return dnbd3_get_block( uplink->current.fd, req->start,
+ (uint32_t)( req->end - req->start ), req->handle,
+ COND_HOPCOUNT( uplink->current.version, hops ) );
+}
+
+/**
+ * Request a chunk of data through an uplink server. uplink must be non-NULL, and
+ * data may only be non-NULL if callback is non-NULL as well (both are asserted).
+ * If callback is NULL, this is assumed to be a background replication request.
+ * If a queue entry covering [start, start + length) already exists, the client
+ * is attached to it (a callback-less request does nothing in that case).
+ * Otherwise a new entry is appended to the queue and sent to the uplink server.
+ * For suitable client requests, a prefetch entry for the range directly
+ * following the request is queued and sent as well.
+ * Locks on: uplink.queueLock, uplink.sendMutex
+ *
+ * @return false if the request was rejected (BGR policy, uplink shutting down,
+ * length exceeding _maxPayload, queue full, request cycle, or too many clients
+ * on one queue entry); true otherwise -- also if sending failed, in which case
+ * the entry is marked unsent and the uplink thread is signalled.
+ */
+static bool uplink_requestInternal(dnbd3_uplink_t *uplink, void *data, uplink_callback callback,
+ uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+{
+ assert( uplink != NULL );
+ assert( data == NULL || callback != NULL );
+ if ( ( hops & HOP_FLAG_BGR ) // This is a background replication request
+ && _backgroundReplication != BGR_FULL ) { // Deny if we're not doing BGR
+ // TODO: Allow BGR_HASHBLOCK too, but only if hash block isn't completely empty
+ logadd( LOG_DEBUG2, "Dopping client because of BGR policy" );
+ return false;
+ }
+ if ( uplink->shutdown ) {
+ logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
+ return false;
+ }
+ if ( length > (uint32_t)_maxPayload ) {
+ logadd( LOG_WARNING, "UPLINK: Cannot relay request; length of %" PRIu32 " exceeds maximum payload",
+ length );
+ return false;
+ }
+ hops++;
+ if ( callback == NULL ) {
+ // Set upper-most bit for replication requests that we fire
+ // In client mode, at least set prefetch flag to prevent prefetch cascading
+ hops |= (uint8_t)( _pretendClient ? HOP_FLAG_PREFETCH : HOP_FLAG_BGR );
+ }
+
+ req_t req, preReq;
+ dnbd3_queue_entry_t *request = NULL, *last = NULL, *pre = NULL;
+ bool isNew;
+ const uint64_t end = start + length;
+ req.start = start & ~(DNBD3_BLOCK_SIZE - 1);
+ req.end = end;
+ /* Don't do this -- this breaks matching of prefetch jobs, since they'd
+ * be misaligned, and the next client request wouldn't match anything.
+ * To improve this, we need to be able to attach a queue_client to multiple queue_entries
+ * and then serve it once all the queue_entries are done (atomic_int in queue_client).
+ * But currently we directly send the receive buffer's content to the queue_client after
+ * receiving the payload, as this will also work when the local cache is borked (we just
+ * tunnel through the traffic). One could argue that this mode of operation is nonsense,
+ * and we should just drop all affected clients. Then as a next step, don't serve the
+ * clients from the receive buffer, but just issue a normal sendfile() call after writing
+ * the received data to the local cache.
+ */
+ if ( callback != NULL && _minRequestSize != 0 ) {
+ // Not background replication request, extend request size
+ extendRequest( req.start, &req.end, uplink->image, _minRequestSize );
+ }
+ req.end = (req.end + DNBD3_BLOCK_SIZE - 1) & ~(DNBD3_BLOCK_SIZE - 1);
+ // Critical section - work with the queue
mutex_lock( &uplink->queueLock );
if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
goto fail_lock;
}
- for (i = 0; i < uplink->queueLen; ++i) {
- // find free slot to place this request into
- if ( uplink->queue[i].status == ULR_FREE ) {
- if ( freeSlot == -1 || existingType != ULR_PROCESSING ) {
- freeSlot = i;
- }
- continue;
- }
- if ( firstUsedSlot == -1 ) {
- firstUsedSlot = i;
- }
- // find existing request to attach to
- if ( uplink->queue[i].from > start || uplink->queue[i].to < end )
- continue; // Range not suitable
- // Detect potential proxy cycle. New request hopcount is greater, range is same, old request has already been sent -> suspicious
- if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) {
- requestLoop = true;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->from <= start && it->to >= end ) {
+ // Matching range, attach
+ request = it;
break;
}
- if ( foundExisting == -1 || existingType == ULR_PROCESSING ) {
- foundExisting = i;
- existingType = uplink->queue[i].status;
+ if ( it->next == NULL ) {
+ // Not matching, last in list, remember
+ last = it;
+ break;
}
}
- if ( unlikely( requestLoop ) ) {
- uplink->cycleDetected = true;
- signal_call( uplink->signal );
- logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
- goto fail_lock;
- }
- if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
- freeSlot = -1; // Not attaching to existing request, make it use a higher slot
- }
- if ( freeSlot == -1 ) {
- if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
- logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
+ dnbd3_queue_client_t **c = NULL;
+ if ( request == NULL ) {
+ // No existing request to attach to
+ if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) {
+ logadd( LOG_WARNING,
+ "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." );
+ goto fail_lock;
+ }
+ uplink->queueLen++;
+ if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+ uplink->image->problem.queue = true;
+ }
+ request = malloc( sizeof(*request) ); // NOTE(review): malloc result is not checked
+ if ( last == NULL ) {
+ uplink->queue = request;
+ } else {
+ last->next = request;
+ }
+ request->next = NULL;
+ request->handle = ++uplink->queueId;
+ request->from = req.start;
+ request->to = req.end;
+#ifdef DEBUG
+ timing_get( &request->entered );
+#endif
+ request->hopCount = hops;
+ request->sent = true; // Optimistic; would be set to false on failure
+ if ( callback == NULL ) {
+ // BGR
+ request->clients = NULL;
+ } else {
+ c = &request->clients; // request->clients itself is only assigned further below
+ }
+ isNew = true;
+ } else if ( callback == NULL ) {
+ // Replication request that matches existing request. Do nothing
+ isNew = false;
+ } else {
+ // Existing request. Check if potential cycle
+ if ( hops > request->hopCount && request->from == start && request->to == end ) {
+ logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) );
goto fail_lock;
}
- freeSlot = uplink->queueLen++;
+ // Count number of clients, get tail of list
+ int count = 0;
+ c = &request->clients;
+ while ( *c != NULL ) {
+ c = &(**c).next;
+ if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) {
+ logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count );
+ goto fail_lock;
+ }
+ }
+ isNew = false;
}
- // Do not send request to uplink server if we have a matching pending request AND the request either has the
- // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
- // explicitly send this request to the uplink server. The second condition mentioned here is to prevent
- // a race condition where the reply for the outstanding request already arrived and the uplink thread
- // is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
- // already have passed the index of the free slot we determined, but not reached the existing request we just found above.
- if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) {
- foundExisting = -1; // -1 means "send request"
+ // Prefetch immediately, without unlocking the list - the old approach of
+ // async prefetching in another thread was sometimes so slow that we'd process
+ // another request from the same client before the prefetch job would execute.
+ if ( callback != NULL && ( isNew || request->clients == NULL || request->clients->data == data ) // isNew is tested first: a new entry's clients pointer is not set yet
+ && !( hops & (HOP_FLAG_BGR | HOP_FLAG_PREFETCH) ) // No cascading of prefetches
+ && end == request->to && length <= _maxPrefetch ) {
+ // Only if this is a client request, and the !! end boundary matches exactly !!
+ // (See above for reason why)
+ // - We neither check the local cache, nor other pending requests. Worth it?
+ // Complexity vs. probability
+ preReq.start = end;
+ preReq.end = end;
+ extendRequest( preReq.start, &preReq.end, uplink->image, MIN( length * 3, _maxPrefetch ) );
+ if ( preReq.start < preReq.end ) {
+ //logadd( LOG_DEBUG2, "Prefetching @ %"PRIx64" - %"PRIx64, preReq.start, preReq.end );
+ uplink->queueLen++; // NOTE(review): prefetch entry is added without checking UPLINK_MAX_QUEUE
+ pre = malloc( sizeof(*pre) ); // NOTE(review): malloc result is not checked
+ pre->next = request->next;
+ request->next = pre;
+ pre->handle = preReq.handle = ++uplink->queueId;
+ pre->from = preReq.start;
+ pre->to = preReq.end;
+ pre->hopCount = hops | HOP_FLAG_PREFETCH;
+ pre->sent = true; // Optimistic; would be set to false on failure
+ pre->clients = NULL;
+#ifdef DEBUG
+ timing_get( &pre->entered );
+#endif
+ }
}
-#ifdef _DEBUG
- if ( foundExisting != -1 ) {
- logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot );
- logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
- "New %" PRIu64 "-%" PRIu64 " (%p)\n",
- uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
- start, end, (void*)client );
+ // // // //
+ // Copy data - need this after unlocking
+ req.handle = request->handle;
+ if ( callback != NULL ) {
+ assert( c != NULL );
+ *c = malloc( sizeof( *request->clients ) ); // NOTE(review): malloc result is not checked
+ (**c).next = NULL;
+ (**c).handle = handle;
+ (**c).from = start;
+ (**c).to = end;
+ (**c).data = data;
+ (**c).callback = callback;
}
-#endif
- // Fill structure
- uplink->queue[freeSlot].from = start;
- uplink->queue[freeSlot].to = end;
- uplink->queue[freeSlot].handle = handle;
- uplink->queue[freeSlot].client = client;
- //int old = uplink->queue[freeSlot].status;
- uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW :
- ( existingType == ULR_NEW ? ULR_PENDING : existingType ) );
- uplink->queue[freeSlot].hopCount = hops;
-#ifdef _DEBUG
- timing_get( &uplink->queue[freeSlot].entered );
- //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
-#endif
mutex_unlock( &uplink->queueLock );
+ // End queue critical section
+ if ( pre == NULL && !isNew )
+ return true; // Nothing to do
- if ( foundExisting != -1 ) {
- ref_put( &uplink->reference );
- return true; // Attached to pending request, do nothing
+ // Fire away the request(s)
+ mutex_lock( &uplink->sendMutex );
+ bool ret1 = true;
+ bool ret2 = true;
+ if ( isNew ) {
+ ret1 = requestBlock( uplink, &req, hops );
}
-
- // See if we can fire away the request
- if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) {
- logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
- } else {
- if ( unlikely( uplink->current.fd == -1 ) ) {
- mutex_unlock( &uplink->sendMutex );
- logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
- } else {
- const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
- if ( hops < 200 ) ++hops;
- const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
- mutex_unlock( &uplink->sendMutex );
- if ( unlikely( !ret ) ) {
- logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
- } else {
- // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
- int state;
- mutex_lock( &uplink->queueLock );
- if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
- state = uplink->queue[freeSlot].status;
- if ( uplink->queue[freeSlot].status == ULR_NEW ) {
- uplink->queue[freeSlot].status = ULR_PENDING;
- }
- } else {
- state = -1;
- }
- mutex_unlock( &uplink->queueLock );
- if ( state == -1 ) {
- logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
- } else if ( state == ULR_NEW ) {
- //logadd( LOG_DEBUG2, "Direct uplink request" );
- } else {
- logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
- }
- ref_put( &uplink->reference );
- return true;
- }
- // Fall through to waking up sender thread
- }
+ if ( pre != NULL ) {
+ ret2 = requestBlock( uplink, &preReq, hops | HOP_FLAG_PREFETCH );
+ }
+ if ( !ret1 || !ret2 ) { // Set with send locked
+ uplink->image->problem.uplink = true;
+ }
+ mutex_unlock( &uplink->sendMutex );
+ // markRequestUnsent locks the queue, would violate locking order with send mutex
+ if ( !ret1 ) {
+ markRequestUnsent( uplink, req.handle );
+ logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle );
+ }
+ if ( !ret2 ) {
+ markRequestUnsent( uplink, preReq.handle );
}
- if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
+ if ( ( !ret1 || !ret2 ) && signal_call( uplink->signal ) == SIGNAL_ERROR ) {
logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
}
- ref_put( &uplink->reference );
return true;
+
fail_lock:
mutex_unlock( &uplink->queueLock );
-fail_ref:
- ref_put( &uplink->reference );
return false;
}
@@ -431,11 +549,10 @@ static void* uplink_mainloop(void *data)
#define EV_COUNT (2)
struct pollfd events[EV_COUNT];
dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
- int numSocks, i, waitTime;
+ int numSocks, waitTime;
int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
int rttTestResult;
uint32_t discoverFailCount = 0;
- uint32_t unsavedSeconds = 0;
ticks nextAltCheck, lastKeepalive;
char buffer[200];
memset( events, 0, sizeof(events) );
@@ -447,7 +564,7 @@ static void* uplink_mainloop(void *data)
thread_detach( uplink->thread );
blockNoncriticalSignals();
// Make sure file is open for writing
- if ( !uplink_reopenCacheFd( uplink, false ) ) {
+ if ( !reopenCacheFd( uplink, false ) ) {
// It might have failed - still offer proxy mode, we just can't cache
logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno );
}
@@ -460,14 +577,14 @@ static void* uplink_mainloop(void *data)
}
while ( !_shutdown && !uplink->shutdown ) {
// poll()
- waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
- if ( waitTime == 0 ) {
+ if ( uplink->rttTestResult == RTT_DOCHANGE ) {
// 0 means poll, since we're about to change the server
+ waitTime = 0;
} else {
declare_now;
waitTime = (int)timing_diffMs( &now, &nextAltCheck );
if ( waitTime < 100 ) waitTime = 100;
- if ( waitTime > 10000 ) waitTime = 10000;
+ else if ( waitTime > 10000 ) waitTime = 10000;
}
events[EV_SOCKET].fd = uplink->current.fd;
numSocks = poll( events, EV_COUNT, waitTime );
@@ -494,8 +611,7 @@ static void* uplink_mainloop(void *data)
mutex_unlock( &uplink->rttLock );
discoverFailCount = 0;
if ( fd != -1 ) close( fd );
- uplink->replicationHandle = REP_NONE;
- uplink->image->working = true;
+ uplink->image->problem.uplink = false;
uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
buffer[0] = '@';
if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
@@ -504,12 +620,17 @@ static void* uplink_mainloop(void *data)
}
// If we don't have a crc32 list yet, see if the new server has one
if ( uplink->image->crc32 == NULL ) {
- uplink_addCrc32( uplink );
+ requestCrc32List( uplink );
}
// Re-send all pending requests
- uplink_sendRequests( uplink, false );
- uplink_sendReplicationRequest( uplink );
+ sendQueuedRequests( uplink, false );
+ sendReplicationRequest( uplink );
events[EV_SOCKET].events = POLLIN | POLLRDHUP;
+ if ( uplink->image->problem.uplink ) {
+ // Some of the requests above must have failed again already :-(
+ logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" );
+ connectionFailed( uplink, true );
+ }
timing_gets( &nextAltCheck, altCheckInterval );
// The rtt worker already did the handshake for our image, so there's nothing
// more to do here
@@ -517,6 +638,7 @@ static void* uplink_mainloop(void *data)
// Check events
// Signal
if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+ uplink->image->problem.uplink = true;
logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
goto cleanup;
} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
@@ -526,46 +648,37 @@ static void* uplink_mainloop(void *data)
}
if ( uplink->current.fd != -1 ) {
// Uplink seems fine, relay requests to it...
- uplink_sendRequests( uplink, true );
+ sendQueuedRequests( uplink, true );
} else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long
uplink->idleTime = 0;
}
}
// Uplink socket
if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
- uplink_connectionFailed( uplink, true );
+ connectionFailed( uplink, true );
logadd( LOG_DEBUG1, "Uplink gone away, panic! (revents=%d)\n", (int)events[EV_SOCKET].revents );
setThreadName( "panic-uplink" );
} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
- uplink_handleReceive( uplink );
+ handleReceive( uplink );
if ( _shutdown || uplink->shutdown ) goto cleanup;
}
declare_now;
uint32_t timepassed = timing_diff( &lastKeepalive, &now );
- if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
+ if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL
+ || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
lastKeepalive = now;
uplink->idleTime += timepassed;
- unsavedSeconds += timepassed;
- if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) {
- // fsync/save every 4 minutes, or every 60 seconds if uplink is idle
- unsavedSeconds = 0;
- uplink_saveCacheMap( uplink );
- }
// Keep-alive
- if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
- // Send keep-alive if nothing is happening
- if ( uplink_sendKeepalive( uplink->current.fd ) ) {
- // Re-trigger periodically, in case it requires a minimum user count
- uplink_sendReplicationRequest( uplink );
- } else {
- uplink_connectionFailed( uplink, true );
- logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
- setThreadName( "panic-uplink" );
+ if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) {
+ // Send keep-alive if nothing is happening, and try to trigger background rep.
+ if ( !sendKeepalive( uplink ) || !sendReplicationRequest( uplink ) ) {
+ connectionFailed( uplink, true );
+ logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" );
}
}
// Don't keep uplink established if we're idle for too much
- if ( uplink_connectionShouldShutdown( uplink ) ) {
- logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid );
+ if ( connectionShouldShutdown( uplink ) ) {
+ logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) );
goto cleanup;
}
}
@@ -578,6 +691,7 @@ static void* uplink_mainloop(void *data)
// Quit work if image is complete
logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
setThreadName( "finished-uplink" );
+ uplink->image->problem.uplink = false;
goto cleanup;
} else {
// Not complete - do measurement
@@ -592,46 +706,44 @@ static void* uplink_mainloop(void *data)
} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
discoverFailCount++;
- if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
- logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
- uplink->image->working = false;
- }
if ( uplink->current.fd == -1 ) {
uplink->cycleDetected = false;
}
}
timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED );
}
-#ifdef _DEBUG
+#ifdef DEBUG
if ( uplink->current.fd != -1 && !uplink->shutdown ) {
bool resend = false;
ticks deadline;
timing_set( &deadline, &now, -10 );
mutex_lock( &uplink->queueLock );
- for (i = 0; i < uplink->queueLen; ++i) {
- if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
- snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
- "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
- uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
- uplink->queue[i].entered = now;
-#ifdef _DEBUG_RESEND_STARVING
- uplink->queue[i].status = ULR_NEW;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( timing_reached( &it->entered, &deadline ) ) {
+ logadd( LOG_WARNING, "Starving request detected:"
+ " (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d",
+ it->from, it->to, (int)it->sent, PIMG(uplink->image) );
+ it->entered = now;
+#ifdef DEBUG_RESEND_STARVING
+ it->sent = false;
resend = true;
#endif
- mutex_unlock( &uplink->queueLock );
- logadd( LOG_WARNING, "%s", buffer );
- mutex_lock( &uplink->queueLock );
}
}
mutex_unlock( &uplink->queueLock );
- if ( resend )
- uplink_sendRequests( uplink, true );
+ if ( resend ) {
+ sendQueuedRequests( uplink, true );
+ }
}
#endif
}
- cleanup: ;
- uplink_saveCacheMap( uplink );
+cleanup: ;
dnbd3_image_t *image = uplink->image;
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
+ cache->dirty = true; // Force writeout of cache map
+ ref_put( &cache->reference );
+ }
mutex_lock( &image->lock );
bool exp = false;
if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
@@ -653,37 +765,60 @@ static void* uplink_mainloop(void *data)
return NULL ;
}
-static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
+/**
+ * Only called from uplink thread.
+ *
+ * Walks the request queue and (re)sends entries to the uplink server.
+ * If newOnly is true, only entries not yet marked as sent are sent; otherwise
+ * every queued entry is sent again. Each selected entry is marked as sent.
+ * Request headers are collected in a local buffer. A completely filled buffer
+ * is sent while the queue lock is still held; the remainder is sent after the
+ * queue has been unlocked. image->problem.uplink is set if a send fails.
+ */
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly)
{
- // Scan for new requests
- int j;
+ assert_uplink_thread();
+ // Scan for new requests, or optionally, (re)send all
+ // Build a buffer, so if there aren't too many requests, we can send them after
+ // unlocking the queue again. Otherwise we need flushes during iteration, which
+ // is not ideal, but in that case the uplink is probably overwhelmed anyways.
+ // Try 125 as that's exactly 3000 bytes (assuming DNBD3_REQUEST_SIZE is 24 - TODO confirm), usually 2*MTU.
+#define MAX_RESEND_BATCH 125
+ dnbd3_request_t reqs[MAX_RESEND_BATCH];
+ int count = 0;
mutex_lock( &uplink->queueLock );
- for (j = 0; j < uplink->queueLen; ++j) {
- if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue;
- uplink->queue[j].status = ULR_PENDING;
- uint8_t hops = uplink->queue[j].hopCount;
- const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
- /*
- logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
- (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
- */
- mutex_unlock( &uplink->queueLock );
- if ( hops < 200 ) ++hops;
- mutex_lock( &uplink->sendMutex );
- const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
- mutex_unlock( &uplink->sendMutex );
- if ( !ret ) {
- // Non-critical - if the connection dropped or the server was changed
- // the thread will re-send this request as soon as the connection
- // is reestablished.
- logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
- altservers_serverFailed( uplink->current.index );
- return;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( newOnly && it->sent )
+ continue;
+ it->sent = true;
+ dnbd3_request_t *hdr = &reqs[count++];
+ hdr->magic = dnbd3_packet_magic;
+ hdr->cmd = CMD_GET_BLOCK;
+ hdr->size = (uint32_t)( it->to - it->from );
+ hdr->offset = it->from; // Offset first, then hops! (union)
+ hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount );
+ hdr->handle = it->handle;
+ fixup_request( *hdr );
+ if ( count == MAX_RESEND_BATCH ) {
+ // Buffer is full: flush it now, with the queue still locked
+ bool ok = false;
+ logadd( LOG_DEBUG2, "BLOCKING resend of %d", count );
+ count = 0;
+ mutex_lock( &uplink->sendMutex );
+ if ( uplink->current.fd != -1 ) {
+ ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 )
+ == DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH );
+ }
+ mutex_unlock( &uplink->sendMutex );
+ if ( !ok ) {
+ uplink->image->problem.uplink = true;
+ break;
+ }
}
- mutex_lock( &uplink->queueLock );
}
mutex_unlock( &uplink->queueLock );
+ if ( count != 0 ) {
+ // Send whatever is left in the buffer, now without holding the queue lock
+ mutex_lock( &uplink->sendMutex );
+ if ( uplink->current.fd != -1 ) { // NOTE(review): if the socket is gone, nothing is sent and the flag is left untouched, although the entries were marked sent
+ uplink->image->problem.uplink =
+ ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 )
+ != DNBD3_REQUEST_SIZE * count );
+ }
+ mutex_unlock( &uplink->sendMutex );
+ }
+#undef MAX_RESEND_BATCH
}
/**
@@ -695,73 +830,97 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
* server. This means we might request data we already have, but it makes
* the code simpler. Worst case would be only one bit is zero, which means
* 4kb are missing, but we will request 32kb.
+ *
+ * Only called from uplink thread, so current.fd is assumed to be valid.
+ *
+ * @return false if sending request failed, true otherwise (i.e. not necessary/disabled)
*/
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink)
{
- if ( uplink == NULL || uplink->current.fd == -1 ) return;
- if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
- if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
- return; // Already a replication request on the wire, or no more blocks to replicate
+ assert_uplink_thread();
+ if ( uplink->current.fd == -1 )
+ return false; // Should never be called in this state, consider send error
+ if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
+ return true; // Don't do background replication
+ if ( uplink->nextReplicationIndex == -1 )
+ return true; // No more blocks to replicate
dnbd3_image_t * const image = uplink->image;
- if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
- if ( image->users < _bgrMinClients ) return; // Not enough active users
+ if ( image->users < _bgrMinClients )
+ return true; // Not enough active users
+ const int numNewRequests = numWantedReplicationRequests( uplink );
+ if ( numNewRequests <= 0 )
+ return true; // Already sufficient amount of requests on the wire
dnbd3_cache_map_t *cache = ref_get_cachemap( image );
- if ( cache == NULL || image->users < _bgrMinClients ) {
+ if ( cache == NULL ) {
// No cache map (=image complete)
- ref_put( &cache->reference );
- return;
+ return true;
}
const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
const int lastBlockIndex = mapBytes - 1;
- int endByte;
- if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
- endByte = uplink->nextReplicationIndex + mapBytes;
- } else { // Hashblock based: Only look for match in current hash block
- endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
- if ( endByte > mapBytes ) {
- endByte = mapBytes;
+ for ( int bc = 0; bc < numNewRequests; ++bc ) {
+ int endByte;
+ if ( UPLINK_MAX_QUEUE - uplink->queueLen < 10 )
+ break; // Don't overload queue
+ if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
+ endByte = uplink->nextReplicationIndex + mapBytes;
+ } else { // Hashblock based: Only look for match in current hash block
+ endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+ if ( endByte > mapBytes ) {
+ endByte = mapBytes;
+ }
}
- }
- atomic_thread_fence( memory_order_acquire );
- int replicationIndex = -1;
- for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
- const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
- if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
- && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
- // Found incomplete one
- replicationIndex = i;
+ atomic_thread_fence( memory_order_acquire );
+ int replicationIndex = -1;
+ for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
+ const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
+ if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+ && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+ // Found incomplete one
+ replicationIndex = i;
+ break;
+ }
+ }
+ if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
+ // Nothing left in current block, find next one
+ replicationIndex = findNextIncompleteHashBlock( uplink, endByte );
+ }
+ if ( replicationIndex == -1 ) {
+ // Replication might be complete, uplink_mainloop should take care....
+ uplink->nextReplicationIndex = -1;
break;
}
+ const uint64_t handle = ++uplink->queueId;
+ const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
+ uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
+ // Extend the default 32k request size if _minRequestSize is > 32k
+ for ( size_t extra = 1; extra < ( _minRequestSize / FILE_BYTES_PER_MAP_BYTE )
+ && offset + size < image->virtualFilesize
+ && _backgroundReplication == BGR_FULL; ++extra ) {
+ if ( atomic_load_explicit( &cache->map[replicationIndex+1], memory_order_relaxed ) == 0xff )
+ break; // Hit complete 32k block, stop here
+ replicationIndex++;
+ size += (uint32_t)MIN( image->virtualFilesize - offset - size, FILE_BYTES_PER_MAP_BYTE );
+ }
+ if ( !uplink_requestInternal( uplink, NULL, NULL, handle, offset, size, 0 ) ) {
+ logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)",
+ PIMG(uplink->image) );
+ ref_put( &cache->reference );
+ return false;
+ }
+ if ( replicationIndex == lastBlockIndex ) {
+ uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+ }
+ uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+ if ( _backgroundReplication == BGR_HASHBLOCK
+ && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+ // Just crossed a hash block boundary, look for new candidate starting at this very index
+ uplink->nextReplicationIndex = findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
+ if ( uplink->nextReplicationIndex == -1 )
+ break;
+ }
}
ref_put( &cache->reference );
- if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
- // Nothing left in current block, find next one
- replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
- }
- if ( replicationIndex == -1 ) {
- // Replication might be complete, uplink_mainloop should take care....
- uplink->nextReplicationIndex = -1;
- return;
- }
- const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
- uplink->replicationHandle = offset;
- const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
- mutex_lock( &uplink->sendMutex );
- bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
- mutex_unlock( &uplink->sendMutex );
- if ( !sendOk ) {
- logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
- return;
- }
- if ( replicationIndex == lastBlockIndex ) {
- uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
- }
- uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
- if ( _backgroundReplication == BGR_HASHBLOCK
- && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
- // Just crossed a hash block boundary, look for new candidate starting at this very index
- uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
- }
+ return true;
}
/**
@@ -769,7 +928,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
* of a hash block which is neither completely empty nor completely
* replicated yet. Returns -1 if no match.
*/
-static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
{
int retval = -1;
dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image );
@@ -816,29 +975,32 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
/**
* Receive data from uplink server and process/dispatch
* Locks on: uplink.lock, images[].lock
+ * Only called from uplink thread, so current.fd is assumed to be valid.
*/
-static void uplink_handleReceive(dnbd3_uplink_t *uplink)
+static void handleReceive(dnbd3_uplink_t *uplink)
{
- dnbd3_reply_t inReply, outReply;
- int ret, i;
+ dnbd3_reply_t inReply;
+ int ret;
+ assert_uplink_thread();
+ assert( uplink->queueLen >= 0 );
for (;;) {
ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
if ( ret == REPLY_AGAIN ) break;
if ( unlikely( ret == REPLY_CLOSED ) ) {
- logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path );
+ logadd( LOG_INFO, "Uplink: Remote host hung up (%s:%d)", PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
- logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path );
+ logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s:%d)", PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( ret != REPLY_OK ) ) {
- logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path );
+ logadd( LOG_INFO, "Uplink: Connection error %d (%s:%d)", ret, PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
- logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path );
+ logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s:%d", inReply.size, PIMG(uplink->image) );
goto error_cleanup;
}
@@ -851,21 +1013,41 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
}
}
if ( unlikely( (uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
- logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path );
+ logadd( LOG_INFO, "Lost connection to uplink server of %s:%d (payload)", PIMG(uplink->image) );
goto error_cleanup;
}
// Payload read completely
// Bail out if we're not interested
- if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
+ if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) )
+ continue;
// Is a legit block reply
- struct iovec iov[2];
- const uint64_t start = inReply.handle;
- const uint64_t end = inReply.handle + inReply.size;
totalBytesReceived += inReply.size;
uplink->bytesReceived += inReply.size;
+ // Get entry from queue
+ dnbd3_queue_entry_t *entry;
+ mutex_lock( &uplink->queueLock );
+ for ( entry = uplink->queue; entry != NULL; entry = entry->next ) {
+ if ( entry->handle == inReply.handle )
+ break;
+ }
+ if ( entry == NULL ) {
+ mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+ logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)",
+ inReply.handle, PIMG(uplink->image) );
+ continue;
+ }
+ const uint64_t start = entry->from;
+ const uint64_t end = entry->to;
+ mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+ // We don't remove the entry from the list here yet, to slightly increase the chance of other
+ // clients attaching to this request while we write the data to disk
+ if ( end - start != inReply.size ) {
+ logadd( LOG_WARNING, "Received payload length does not match! (is: %"PRIu32", expect: %u, %s:%d)",
+ inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) );
+ }
// 1) Write to cache file
if ( unlikely( uplink->cacheFd == -1 ) ) {
- uplink_reopenCacheFd( uplink, false );
+ reopenCacheFd( uplink, false );
}
if ( likely( uplink->cacheFd != -1 ) ) {
int err = 0;
@@ -884,16 +1066,19 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
continue; // Success, retry write
}
if ( err == EBADF || err == EINVAL || err == EIO ) {
- if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) )
+ uplink->image->problem.write = true;
+ if ( !tryAgain || !reopenCacheFd( uplink, true ) )
break;
tryAgain = false;
continue; // Write handle to image successfully re-opened, try again
}
- logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err );
+ logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d",
+ PIMG(uplink->image), err );
break;
}
if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
- logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid );
+ logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d",
+ ret, PIMG(uplink->image) );
break;
}
done += (uint32_t)ret;
@@ -903,114 +1088,79 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
}
if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
- uplink->image->name, (int)uplink->image->rid, err );
+ PIMG(uplink->image), err );
}
}
- // 2) Figure out which clients are interested in it
- // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop
- // below; this prevents uplink_request() from attaching to this request
- // by populating a slot with index greater than the highest matching
- // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW
- // where it's fine if the index is greater)
+ bool found = false;
+ dnbd3_queue_entry_t **it;
mutex_lock( &uplink->queueLock );
- for (i = 0; i < uplink->queueLen; ++i) {
- dnbd3_queued_request_t * const req = &uplink->queue[i];
- assert( req->status != ULR_PROCESSING );
- if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
- assert( req->client != NULL );
- if ( req->from >= start && req->to <= end ) { // Match :-)
- req->status = ULR_PROCESSING;
+ for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) {
+ if ( *it == entry && entry->handle == inReply.handle ) { // ABA check
+ assert( found == false );
+ *it = (**it).next;
+ found = true;
+ uplink->queueLen--;
+ break;
}
}
- // 3) Send to interested clients - iterate backwards so request collaboration works, and
- // so we can decrease queueLen on the fly while iterating. Should you ever change this to start
- // from 0, you also need to change the "attach to existing request"-logic in uplink_request()
- outReply.magic = dnbd3_packet_magic;
- bool served = false;
- for ( i = uplink->queueLen - 1; i >= 0; --i ) {
- dnbd3_queued_request_t * const req = &uplink->queue[i];
- if ( req->status == ULR_PROCESSING ) {
- size_t bytesSent = 0;
- assert( req->from >= start && req->to <= end );
- dnbd3_client_t * const client = req->client;
- outReply.cmd = CMD_GET_BLOCK;
- outReply.handle = req->handle;
- outReply.size = (uint32_t)( req->to - req->from );
- iov[0].iov_base = &outReply;
- iov[0].iov_len = sizeof outReply;
- iov[1].iov_base = uplink->recvBuffer + (req->from - start);
- iov[1].iov_len = outReply.size;
- fixup_reply( outReply );
- req->status = ULR_FREE;
- req->client = NULL;
- served = true;
- mutex_lock( &client->sendMutex );
- mutex_unlock( &uplink->queueLock );
- if ( client->sock != -1 ) {
- ssize_t sent = writev( client->sock, iov, 2 );
- if ( sent > (ssize_t)sizeof outReply ) {
- bytesSent = (size_t)sent - sizeof outReply;
- }
- }
- if ( bytesSent != 0 ) {
- client->bytesSent += bytesSent;
- }
- mutex_unlock( &client->sendMutex );
- mutex_lock( &uplink->queueLock );
- if ( i > uplink->queueLen ) {
- i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests
- }
- }
- if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
+ if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
+ uplink->image->problem.queue = false;
}
mutex_unlock( &uplink->queueLock );
-#ifdef _DEBUG
- if ( !served && start != uplink->replicationHandle ) {
- logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
+ if ( !found ) {
+ logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)",
+ PIMG(uplink->image) );
+ continue;
}
-#endif
- if ( start == uplink->replicationHandle ) {
- // Was our background replication
- uplink->replicationHandle = REP_NONE;
- // Try to remove from fs cache if no client was interested in this data
- if ( !served && uplink->cacheFd != -1 ) {
- posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
- }
+ dnbd3_queue_client_t *next;
+ for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) {
+ assert( c->from >= start && c->to <= end );
+ (*c->callback)( c->data, c->handle, c->from, (uint32_t)( c->to - c->from ),
+ (const char*)( uplink->recvBuffer + (c->from - start) ) );
+ next = c->next;
+ free( c );
}
- if ( served ) {
+ if ( entry->clients != NULL ) {
// Was some client -- reset idle counter
uplink->idleTime = 0;
// Re-enable replication if disabled
if ( uplink->nextReplicationIndex == -1 ) {
uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
}
+ } else {
+ if ( uplink->cacheFd != -1 ) {
+ // Try to remove from fs cache if no client was interested in this data
+ posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+ }
}
+ free( entry );
+ } // main receive loop
+ // Trigger background replication if applicable
+ if ( !sendReplicationRequest( uplink ) ) {
+ goto error_cleanup;
}
- if ( uplink->replicationHandle == REP_NONE ) {
- mutex_lock( &uplink->queueLock );
- const bool rep = ( uplink->queueLen == 0 );
- mutex_unlock( &uplink->queueLock );
- if ( rep ) uplink_sendReplicationRequest( uplink );
- }
+ // Normal end
return;
// Error handling from failed receive or message parsing
- error_cleanup: ;
- uplink_connectionFailed( uplink, true );
+error_cleanup: ;
+ connectionFailed( uplink, true );
}
/**
* Only call from uplink thread
*/
-static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
{
+ assert_uplink_thread();
if ( uplink->current.fd == -1 )
return;
+ setThreadName( "panic-uplink" );
altservers_serverFailed( uplink->current.index );
mutex_lock( &uplink->sendMutex );
+ uplink->image->problem.uplink = true;
close( uplink->current.fd );
uplink->current.fd = -1;
mutex_unlock( &uplink->sendMutex );
- uplink->replicationHandle = REP_NONE;
if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
uplink->nextReplicationIndex = 0;
}
@@ -1025,15 +1175,26 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
}
/**
- * Send keep alive request to server
+ * Send keep alive request to server.
+ * Called from uplink thread, current.fd must be valid.
*/
-static int uplink_sendKeepalive(const int fd)
+static bool sendKeepalive(dnbd3_uplink_t *uplink)
{
static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
- return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+ assert_uplink_thread();
+ mutex_lock( &uplink->sendMutex );
+ bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+ mutex_unlock( &uplink->sendMutex );
+ return sendOk;
}
-static void uplink_addCrc32(dnbd3_uplink_t *uplink)
+/**
+ * Request crclist from uplink.
+ * Called from uplink thread, current.fd must be valid.
+ * FIXME This is broken as it could happen that another message arrives after sending
+ * the request. Refactor, split and move receive into general receive handler.
+ */
+static void requestCrc32List(dnbd3_uplink_t *uplink)
{
dnbd3_image_t *image = uplink->image;
if ( image == NULL || image->virtualFilesize == 0 ) return;
@@ -1042,6 +1203,9 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
uint32_t *buffer = malloc( bytes );
mutex_lock( &uplink->sendMutex );
bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes );
+ if ( !sendOk ) {
+ uplink->image->problem.uplink = true;
+ }
mutex_unlock( &uplink->sendMutex );
if ( !sendOk || bytes == 0 ) {
free( buffer );
@@ -1051,7 +1215,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes );
lists_crc = net_order_32( lists_crc );
if ( lists_crc != masterCrc ) {
- logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name );
+ logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", PIMG(uplink->image) );
free( buffer );
return;
}
@@ -1061,10 +1225,14 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
char path[len];
snprintf( path, len, "%s.crc", uplink->image->path );
const int fd = open( path, O_WRONLY | O_CREAT, 0644 );
- if ( fd >= 0 ) {
- write( fd, &masterCrc, sizeof(uint32_t) );
- write( fd, buffer, bytes );
+ if ( fd != -1 ) {
+ ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+ ret += write( fd, buffer, bytes );
close( fd );
+ if ( (size_t)ret != sizeof(masterCrc) + bytes ) {
+ unlink( path );
+ logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", PIMG(uplink->image) );
+ }
}
}
@@ -1076,80 +1244,24 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
* it will be closed first. Otherwise, nothing will happen and true will be returned
* immediately.
*/
-static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
{
if ( uplink->cacheFd != -1 ) {
if ( !force ) return true;
close( uplink->cacheFd );
}
uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 );
+ uplink->image->problem.write = uplink->cacheFd == -1;
return uplink->cacheFd != -1;
}
/**
- * Saves the cache map of the given image.
- * Return true on success.
- * Locks on: imageListLock, image.lock
+ * Returns true if the uplink has been idle for some time (apart from
+ * background replication, if it is set to hashblock, or if it has
+ * a minimum number of active clients configured that is not currently
+ * reached)
*/
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
-{
- dnbd3_image_t *image = uplink->image;
- assert( image != NULL );
-
- if ( uplink->cacheFd != -1 ) {
- if ( fsync( uplink->cacheFd ) == -1 ) {
- // A failing fsync means we have no guarantee that any data
- // since the last fsync (or open if none) has been saved. Apart
- // from keeping the cache map from the last successful fsync
- // around and restoring it there isn't much we can do to recover
- // a consistent state. Bail out.
- logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
- logadd( LOG_ERROR, "Bailing out immediately" );
- exit( 1 );
- }
- }
-
- dnbd3_cache_map_t *cache = ref_get_cachemap( image );
- if ( cache == NULL )
- return true;
- logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
- const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
- assert( image->path != NULL );
- char mapfile[strlen( image->path ) + 4 + 1];
- strcpy( mapfile, image->path );
- strcat( mapfile, ".map" );
-
- int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
- if ( fd == -1 ) {
- const int err = errno;
- ref_put( &cache->reference );
- logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
- return false;
- }
-
- size_t done = 0;
- while ( done < size ) {
- const ssize_t ret = write( fd, cache->map + done, size - done );
- if ( ret == -1 ) {
- if ( errno == EINTR ) continue;
- logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
- break;
- }
- if ( ret <= 0 ) {
- logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
- break;
- }
- done += (size_t)ret;
- }
- ref_put( &cache->reference );
- if ( fsync( fd ) == -1 ) {
- logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
- }
- close( fd );
- return true;
-}
-
-static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink)
{
return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
@@ -1165,3 +1277,44 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
return false;
return altservers_toString( current, buffer, len );
}
+
+/**
+ * Get number of replication requests that should be sent right now to
+ * meet the configured bgrWindowSize. Returns 0 if any client requests
+ * are pending.
+ * This applies a sort of "slow start" in case the uplink was recently
+ * dealing with actual client requests, in that the uplink's idle time
+ * (in seconds) is an upper bound for the number returned, so we don't
+ * saturate the uplink with loads of requests right away, in case that
+ * client triggers more requests to the uplink server.
+ */
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink)
+{
+ int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 );
+ if ( uplink->queueLen == 0 )
+ return ret;
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->clients == NULL ) {
+ ret--;
+ } else {
+ ret = 0; // Do not allow BGR if client requests are being handled
+ break;
+ }
+ }
+ mutex_unlock( &uplink->queueLock );
+ return ret;
+}
+
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle)
+{
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->handle == handle ) {
+ it->sent = false;
+ break;
+ }
+ }
+ mutex_unlock( &uplink->queueLock );
+}
+
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 49ff0b4..b6037d6 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -2,7 +2,7 @@
#define _UPLINK_H_
#include "globals.h"
-#include "../types.h"
+#include <dnbd3/types.h>
void uplink_globalsInit();
@@ -10,9 +10,11 @@ uint64_t uplink_getTotalBytesReceived();
bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version);
-void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
+void uplink_removeEntry(dnbd3_uplink_t *uplink, void *data, uplink_callback callback);
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
+bool uplink_requestClient(dnbd3_client_t *client, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
+
+bool uplink_request(dnbd3_image_t *image, void *data, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length);
bool uplink_shutdown(dnbd3_image_t *image);
diff --git a/src/shared/CMakeLists.txt b/src/shared/CMakeLists.txt
new file mode 100644
index 0000000..a1bd49a
--- /dev/null
+++ b/src/shared/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-shared
+ LANGUAGES C)
+
+# find atomic library required by dnbd3-shared
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to get POLLRDHUP support for signals
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_SHARED_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/crc32.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/log.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sockhelper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/timing.c)
+set(DNBD3_SHARED_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/eventfd.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/pipe64.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/pipe_malloc.c)
+
+add_library(dnbd3-shared STATIC ${DNBD3_SHARED_SOURCE_FILES})
+target_include_directories(dnbd3-shared PUBLIC ${PROJECT_INCLUDE_DIR})
+
+add_linter(dnbd3-shared-lint "${DNBD3_SHARED_SOURCE_FILES}" "${DNBD3_SHARED_HEADER_FILES}")
+add_linter_fix(dnbd3-shared-lint-fix "${DNBD3_SHARED_SOURCE_FILES}" "${DNBD3_SHARED_HEADER_FILES}")
diff --git a/src/shared/crc32.c b/src/shared/crc32.c
index db941d3..6cf9a18 100644
--- a/src/shared/crc32.c
+++ b/src/shared/crc32.c
@@ -38,24 +38,23 @@
*/
-#include "../types.h"
+#include <dnbd3/types.h>
#include <stddef.h>
-#define FAR
+#if defined(__x86_64__) || defined(__amd64__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <stdatomic.h>
+#define zalign(n) __attribute__((aligned(n)))
+#endif
+
#define OF(args) args
-#define local static
/* Definitions for doing the crc four data bytes at a time. */
-#if !defined(NOBYFOUR)
-# define BYFOUR
-#endif
-#ifdef BYFOUR
-# define TBLS 8
-#else
-# define TBLS 1
-#endif /* BYFOUR */
+#define TBLS 8
-local const uint32_t crc_table[TBLS][256] =
+static const uint32_t crc_table[TBLS][256] =
{
{
0x00000000U, 0x77073096U, 0xee0e612cU, 0x990951baU, 0x076dc419U,
@@ -110,7 +109,6 @@ local const uint32_t crc_table[TBLS][256] =
0xcdd70693U, 0x54de5729U, 0x23d967bfU, 0xb3667a2eU, 0xc4614ab8U,
0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
0x2d02ef8dU
-#ifdef BYFOUR
},
{
0x00000000U, 0x191b3141U, 0x32366282U, 0x2b2d53c3U, 0x646cc504U,
@@ -489,38 +487,159 @@ local const uint32_t crc_table[TBLS][256] =
0x95e6b8b1U, 0x7b490da3U, 0x1e2eb11bU, 0x483ed243U, 0x2d596efbU,
0xc3f6dbe9U, 0xa6916751U, 0x1fa9b0ccU, 0x7ace0c74U, 0x9461b966U,
0xf10605deU
-#endif
}
};
-#ifdef NO_ENDIAN
-// Currently not in use, always use the BYFOUR method with known endianness
-/* ========================================================================= */
-#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+#define PCLMUL_MIN_LEN 64
+#define PCLMUL_ALIGN 16
+#define PCLMUL_ALIGN_MASK 15
-/* ========================================================================= */
-uint32_t crc32(crc, buf, len)
- uint32_t crc;
- const uint8_t *buf;
- size_t len;
+#if defined(__x86_64__) || defined(__amd64__)
+/* crc32_simd.c
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ *
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+static uint32_t
+__attribute__((target("pclmul,sse4.1")))
+crc32pclmul(uint32_t crc, const uint8_t *buf, size_t len)
{
- if (buf == NULL) return 0;
+ /*
+ * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+ * the CRC32+Barrett polynomials given at the end of the paper.
+ */
+ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+ static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+ static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+ /*
+ * There's at least one block of 64.
+ */
+ x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+ x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+ x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+ x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+ x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+
+ x0 = _mm_load_si128((__m128i *)k1k2);
+
+ buf += 64;
+ len -= 64;
- crc = crc ^ 0xffffffffU;
- while (len >= 8) {
- DO8;
- len -= 8;
+ /*
+ * Parallel fold blocks of 64, if any.
+ */
+ while (len >= 64)
+ {
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+ x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+ x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+ x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+ x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+
+ y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+ y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+ y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+ y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+ x1 = _mm_xor_si128(x1, x5);
+ x2 = _mm_xor_si128(x2, x6);
+ x3 = _mm_xor_si128(x3, x7);
+ x4 = _mm_xor_si128(x4, x8);
+
+ x1 = _mm_xor_si128(x1, y5);
+ x2 = _mm_xor_si128(x2, y6);
+ x3 = _mm_xor_si128(x3, y7);
+ x4 = _mm_xor_si128(x4, y8);
+
+ buf += 64;
+ len -= 64;
}
- if (len) do {
- DO1;
- } while (--len);
- return crc ^ 0xffffffffU;
+
+ /*
+ * Fold into 128-bits.
+ */
+ x0 = _mm_load_si128((__m128i *)k3k4);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x2);
+ x1 = _mm_xor_si128(x1, x5);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x3);
+ x1 = _mm_xor_si128(x1, x5);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x4);
+ x1 = _mm_xor_si128(x1, x5);
+
+ /*
+ * Single fold blocks of 16, if any.
+ */
+ while (len >= 16)
+ {
+ x2 = _mm_loadu_si128((__m128i *)buf);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x2);
+ x1 = _mm_xor_si128(x1, x5);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Fold 128-bits to 64-bits.
+ */
+ x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+ x3 = _mm_setr_epi32(~0, 0, ~0, 0);
+ x1 = _mm_srli_si128(x1, 8);
+ x1 = _mm_xor_si128(x1, x2);
+
+ x0 = _mm_loadl_epi64((__m128i*)k5k0);
+
+ x2 = _mm_srli_si128(x1, 4);
+ x1 = _mm_and_si128(x1, x3);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_xor_si128(x1, x2);
+
+ /*
+ * Barrett reduce to 32-bits.
+ */
+ x0 = _mm_load_si128((__m128i*)poly);
+
+ x2 = _mm_and_si128(x1, x3);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+ x2 = _mm_and_si128(x2, x3);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+ x1 = _mm_xor_si128(x1, x2);
+
+ /*
+ * Return the crc32.
+ */
+ return _mm_extract_epi32(x1, 1);
}
#endif
-#ifdef BYFOUR
-
/*
This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
integer pointer type. This violates the strict aliasing rule, where a
@@ -533,7 +652,7 @@ uint32_t crc32(crc, buf, len)
writes to the buffer that is passed to these routines.
*/
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
/* ========================================================================= */
#define DOLIT4 c ^= *buf4++; \
c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
@@ -547,25 +666,36 @@ uint32_t crc32(crc, buf, len)
size_t len;
{
if (buf == NULL) return 0;
- register uint32_t c;
- register const uint32_t FAR *buf4;
+ uint32_t c;
c = ~crc;
- while (len && ((uintptr_t)buf & 3)) {
+ while (len && ((uintptr_t)buf & PCLMUL_ALIGN_MASK)) {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
len--;
}
-
- buf4 = (const uint32_t FAR *)(const void FAR *)buf;
- while (len >= 32) {
- DOLIT32;
- len -= 32;
- }
- while (len >= 4) {
- DOLIT4;
- len -= 4;
+#if defined(__x86_64__) || defined(__amd64__)
+ static atomic_int pclmul = -1;
+ if (pclmul == -1) {
+ pclmul = __builtin_cpu_supports("pclmul") && __builtin_cpu_supports("sse4.1");
}
- buf = (const uint8_t FAR *)buf4;
+ if (pclmul && len >= PCLMUL_MIN_LEN) {
+ c = crc32pclmul(c, buf, len & ~PCLMUL_ALIGN_MASK);
+ buf += len & ~PCLMUL_ALIGN_MASK;
+ len &= PCLMUL_ALIGN_MASK;
+ } else
+#endif
+ do {
+ const uint32_t *buf4 = (const uint32_t *)(const void *)buf;
+ while (len >= 32) {
+ DOLIT32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOLIT4;
+ len -= 4;
+ }
+ buf = (const uint8_t *)buf4;
+ } while (0);
if (len) do {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
@@ -575,7 +705,7 @@ uint32_t crc32(crc, buf, len)
}
#endif
-#ifdef BIG_ENDIAN
+#ifdef DNBD3_BIG_ENDIAN
/* ========================================================================= */
#define DOBIG4 c ^= *buf4++; \
c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
@@ -590,7 +720,7 @@ uint32_t crc32(crc, buf, len)
{
if (buf == NULL) return 0;
register uint32_t c;
- register const uint32_t FAR *buf4;
+ register const uint32_t *buf4;
c = ~net_order_32(crc);
while (len && ((uintptr_t)buf & 3)) {
@@ -598,7 +728,7 @@ uint32_t crc32(crc, buf, len)
len--;
}
- buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+ buf4 = (const uint32_t *)(const void *)buf;
while (len >= 32) {
DOBIG32;
len -= 32;
@@ -607,7 +737,7 @@ uint32_t crc32(crc, buf, len)
DOBIG4;
len -= 4;
}
- buf = (const uint8_t FAR *)buf4;
+ buf = (const uint8_t *)buf4;
if (len) do {
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
@@ -617,5 +747,3 @@ uint32_t crc32(crc, buf, len)
}
#endif
-#endif /* BYFOUR */
-
diff --git a/src/shared/fdsignal.c b/src/shared/fdsignal.c
index 087b6f1..1db59bd 100644
--- a/src/shared/fdsignal.c
+++ b/src/shared/fdsignal.c
@@ -1,4 +1,4 @@
-#include "fdsignal.h"
+#include <dnbd3/shared/fdsignal.h>
#if defined(__linux__)
//#warning "Using eventfd based signalling"
diff --git a/src/shared/log.c b/src/shared/log.c
index 055acb4..3a4739d 100644
--- a/src/shared/log.c
+++ b/src/shared/log.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Simon Rettberg
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,7 +18,7 @@
*
*/
-#include "log.h"
+#include <dnbd3/shared/log.h>
#include <stdarg.h>
#include <pthread.h>
#include <stdlib.h>
@@ -36,6 +36,7 @@ static _Atomic logmask_t maskCon = 15;
static char *logFile = NULL;
static int logFd = -1;
+static FILE *logOutStream;
static bool consoleTimestamps = false;
@@ -43,6 +44,10 @@ static bool consoleTimestamps = false;
static int writeLevel(char *buffer, logmask_t level);
+void log_init(void) {
+ logOutStream = stdout;
+}
+
bool log_hasMask(const logmask_t mask)
{
return ( ( maskFile | maskCon ) & mask ) == mask;
@@ -63,6 +68,15 @@ void log_setConsoleTimestamps(bool on)
consoleTimestamps = on;
}
+int log_setConsoleOutputStream(FILE *outputStream)
+{
+ if ( outputStream != stdout && outputStream != stderr )
+ return -EINVAL;
+
+ logOutStream = outputStream;
+ return 0;
+}
+
bool log_openLogFile(const char *path)
{
pthread_mutex_lock( &logLock );
@@ -93,10 +107,10 @@ void logadd(const logmask_t mask, const char *fmt, ...)
struct tm timeinfo;
char buffer[LINE_LEN];
bool toFile = maskFile & mask;
- bool toStdout = maskCon & mask;
+ bool toOutStream = maskCon & mask;
size_t offset;
- if ( toFile || ( toStdout && consoleTimestamps ) ) {
+ if ( toFile || ( toOutStream && consoleTimestamps ) ) {
time( &rawtime );
localtime_r( &rawtime, &timeinfo );
offset = strftime( buffer, LINE_LEN, "[%d.%m. %H:%M:%S] ", &timeinfo );
@@ -134,15 +148,11 @@ void logadd(const logmask_t mask, const char *fmt, ...)
}
pthread_mutex_unlock( &logLock );
}
- if ( toStdout ) {
- if ( consoleTimestamps ) stdoutLine = buffer;
-#ifdef AFL_MODE
- fputs( stdoutLine, stderr );
- fflush( stderr );
-#else
- fputs( stdoutLine, stdout );
- fflush( stdout );
-#endif
+ if ( toOutStream ) {
+ if ( consoleTimestamps )
+ stdoutLine = buffer;
+ fputs( stdoutLine, logOutStream );
+ fflush( logOutStream );
}
}
diff --git a/src/serialize.c b/src/shared/serialize.c
index 0bc0dcd..1f7cddd 100644
--- a/src/serialize.c
+++ b/src/shared/serialize.c
@@ -1,6 +1,6 @@
-#include "serialize.h"
-#include "types.h"
-
+// SPDX-License-Identifier: GPL-2.0
+#include <dnbd3/shared/serialize.h>
+#include <dnbd3/types.h>
void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len)
{
@@ -16,14 +16,17 @@ void serializer_reset_write(serialized_buffer_t *buffer)
uint8_t serializer_get_uint8(serialized_buffer_t *buffer)
{
- if (buffer->buffer_pointer + 1 > buffer->buffer_end) return 0;
+ if (buffer->buffer_pointer + 1 > buffer->buffer_end)
+ return 0;
return (uint8_t)*buffer->buffer_pointer++;
}
uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
{
uint16_t ret;
- if (buffer->buffer_pointer + 2 > buffer->buffer_end) return 0;
+
+ if (buffer->buffer_pointer + 2 > buffer->buffer_end)
+ return 0;
memcpy(&ret, buffer->buffer_pointer, 2);
buffer->buffer_pointer += 2;
return net_order_16(ret);
@@ -32,7 +35,9 @@ uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
{
uint64_t ret;
- if (buffer->buffer_pointer + 8 > buffer->buffer_end) return 0;
+
+ if (buffer->buffer_pointer + 8 > buffer->buffer_end)
+ return 0;
memcpy(&ret, buffer->buffer_pointer, 8);
buffer->buffer_pointer += 8;
return net_order_64(ret);
@@ -41,22 +46,29 @@ uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
char *serializer_get_string(serialized_buffer_t *buffer)
{
char *ptr = buffer->buffer_pointer, *start = buffer->buffer_pointer;
- if (ptr >= buffer->buffer_end) return NULL;
- while (ptr < buffer->buffer_end && *ptr) ++ptr;
- if (*ptr) return NULL; // String did not terminate within buffer (possibly corrupted/malicious packet)
+
+ if (ptr >= buffer->buffer_end)
+ return NULL;
+ while (ptr < buffer->buffer_end && *ptr)
+ ++ptr;
+ // String did not terminate within buffer (possibly corrupted/malicious packet)
+ if (*ptr)
+ return NULL;
buffer->buffer_pointer = ptr + 1;
return start;
}
void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value)
{
- if (buffer->buffer_pointer + 1 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 1 > buffer->buffer_end)
+ return;
*buffer->buffer_pointer++ = (char)value;
}
void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
{
- if (buffer->buffer_pointer + 2 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 2 > buffer->buffer_end)
+ return;
value = net_order_16(value);
memcpy(buffer->buffer_pointer, &value, 2);
buffer->buffer_pointer += 2;
@@ -64,7 +76,8 @@ void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
{
- if (buffer->buffer_pointer + 8 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 8 > buffer->buffer_end)
+ return;
value = net_order_64(value);
memcpy(buffer->buffer_pointer, &value, 8);
buffer->buffer_pointer += 8;
@@ -73,12 +86,14 @@ void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
void serializer_put_string(serialized_buffer_t *buffer, const char *value)
{
const size_t len = strlen(value) + 1;
- if (buffer->buffer_pointer + len > buffer->buffer_end) return;
+
+ if (buffer->buffer_pointer + len > buffer->buffer_end)
+ return;
memcpy(buffer->buffer_pointer, value, len);
buffer->buffer_pointer += len;
}
uint32_t serializer_get_written_length(serialized_buffer_t *buffer)
{
- return (uint32_t)( buffer->buffer_pointer - buffer->buffer );
+ return (uint32_t)(buffer->buffer_pointer - buffer->buffer);
}
diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c
index ec80659..5096320 100644
--- a/src/shared/sockhelper.c
+++ b/src/shared/sockhelper.c
@@ -1,6 +1,8 @@
-#include "sockhelper.h"
-#include "log.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/types.h>
#include <arpa/inet.h> // inet_ntop
+#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <unistd.h>
@@ -19,8 +21,7 @@ struct _poll_list {
int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms)
{
// TODO: Move out of here, this unit should contain general socket functions
- // TODO: Abstract away from sockaddr_in* like the rest of the functions here do,
- // so WITH_IPV6 can finally be removed as everything is transparent. b- but how?
+ // TODO: Abstract away from sockaddr_in* like the rest of the functions here do
struct sockaddr_storage ss;
int proto, addrlen;
memset( &ss, 0, sizeof ss );
@@ -32,9 +33,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
addr4->sin_port = addr->port;
proto = PF_INET;
addrlen = sizeof *addr4;
- }
-#ifdef WITH_IPV6
- else if ( addr->type == HOST_IP6 ) {
+ } else if ( addr->type == HOST_IP6 ) {
// Set host (IPv6)
struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)&ss;
addr6->sin6_family = AF_INET6;
@@ -42,9 +41,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
addr6->sin6_port = addr->port;
proto = PF_INET6;
addrlen = sizeof *addr6;
- }
-#endif
- else {
+ } else {
logadd( LOG_DEBUG1, "Unsupported address type: %d\n", (int)addr->type );
errno = EAFNOSUPPORT;
return -1;
@@ -57,11 +54,13 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
} else {
sock_setTimeout( client_sock, connect_ms );
}
- int e2;
+ // NODELAY makes sense for the client side, which should be all users in this code base
+ int e2 = 1;
+ setsockopt( client_sock, IPPROTO_TCP, TCP_NODELAY, (void *)&e2, sizeof(e2) );
for ( int i = 0; i < 5; ++i ) {
int ret = connect( client_sock, (struct sockaddr *)&ss, addrlen );
e2 = errno;
- if ( ret != -1 || errno == EINPROGRESS || errno == EISCONN ) break;
+ if ( ret != -1 || ( connect_ms == -1 && errno == EINPROGRESS ) || errno == EISCONN ) break;
if ( errno == EINTR ) {
// http://www.madore.org/~david/computers/connect-intr.html
#ifdef __linux__
@@ -77,7 +76,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
errno = e2;
return -1;
}
- sockaddr_storage junk;
+ struct sockaddr_storage junk;
socklen_t more_junk = sizeof(junk);
if ( getpeername( client_sock, (struct sockaddr*)&junk, &more_junk ) == -1 ) {
e2 = errno;
@@ -165,7 +164,7 @@ bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
memcpy( host->addr, &addr4->sin_addr, 4 );
return true;
}
-#ifdef WITH_IPV6
+
if ( sa->sa_family == AF_INET6 ) {
// Set host (IPv6)
struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)sa;
@@ -174,7 +173,7 @@ bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
memcpy( host->addr, &addr6->sin6_addr, 16 );
return true;
}
-#endif
+
return false;
}
@@ -242,7 +241,10 @@ size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLe
outlen = snprintf( output, len, "[%s]:%s", host, port );
}
}
- if ( outlen <= 0 ) return 0;
+ if ( outlen <= 0 ) {
+ output[0] = '\0';
+ return 0;
+ }
return MIN( (size_t)outlen, len-1 );
}
@@ -346,7 +348,7 @@ int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_m
if ( i != list->count ) list->entry[i] = list->entry[list->count];
if ( fd != -1 ) {
sock_set_block( fd );
- if ( rw_ms != -1 && rw_ms != connect_ms ) {
+ if ( rw_ms != -1 ) {
sock_setTimeout( fd, rw_ms );
}
return fd;
diff --git a/src/shared/timing.c b/src/shared/timing.c
index 4ca1002..bdb8388 100644
--- a/src/shared/timing.c
+++ b/src/shared/timing.c
@@ -1,4 +1,4 @@
-#include "timing.h"
+#include <dnbd3/shared/timing.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
diff --git a/src/version.c.in b/src/version.c.in
deleted file mode 100644
index ab937a2..0000000
--- a/src/version.c.in
+++ /dev/null
@@ -1,3 +0,0 @@
-
-const char * VERSION_STRING = "@VERSION@";
-
diff --git a/src/version.h b/src/version.h
deleted file mode 100644
index 0c4a66b..0000000
--- a/src/version.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef VERSION_H_
-#define VERSION_H_
-
-extern const char *VERSION_STRING;
-
-// This is done in a little weird way but otherwise eclipse complains about
-// unresolvable symbols etc...
-#include "version.c"
-
-#endif /* VERSION_H_ */