summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.clang-format126
-rw-r--r--.github/workflows/build-kernel-module.yml113
-rw-r--r--.github/workflows/build-programs.yml66
-rw-r--r--.github/workflows/lint.yml34
-rw-r--r--.github/workflows/release.yml64
-rw-r--r--.gitignore2
-rw-r--r--CMakeLists.txt486
-rw-r--r--Kbuild.in2
-rw-r--r--LOCKS81
-rw-r--r--README.md456
-rwxr-xr-xbuild.sh6
-rw-r--r--cmake/Build.cmake27
-rw-r--r--cmake/CheckAFLCCompiler.cmake24
-rw-r--r--cmake/DockerImage.cmake25
-rw-r--r--cmake/FindCheckPatch.cmake31
-rw-r--r--cmake/FindClangFormat.cmake20
-rw-r--r--cmake/FindDocker.cmake20
-rw-r--r--cmake/FindFuse.cmake181
-rw-r--r--cmake/FindKernelHeaders.cmake61
-rw-r--r--cmake/FindLibatomic.cmake45
-rw-r--r--cmake/FindStdatomic.cmake14
-rw-r--r--cmake/GenerateBuild.cmake11
-rw-r--r--cmake/GenerateVersion.cmake20
-rw-r--r--cmake/InstallVersionFile.cmake.in8
-rw-r--r--cmake/Kernel.cmake58
-rw-r--r--cmake/Lint.cmake70
-rw-r--r--cmake/PostVersionPackaging.cmake9
-rw-r--r--cmake/PreVersionPackaging.cmake11
-rw-r--r--cmake/Version.cmake125
-rw-r--r--cmake/toolchain/Aarch64LinuxGnu.cmake24
-rw-r--r--cmake/toolchain/PowerpcLinuxGnu.cmake24
-rw-r--r--conf/README.server30
-rw-r--r--conf/alt-servers4
-rwxr-xr-xget-version.sh22
-rw-r--r--inc/dnbd3/build.h.in11
-rw-r--r--inc/dnbd3/config.h (renamed from src/config.h)2
-rw-r--r--inc/dnbd3/config/client.h52
-rw-r--r--inc/dnbd3/config/server.h (renamed from src/serverconfig.h)20
-rw-r--r--inc/dnbd3/shared/crc32.h (renamed from src/shared/crc32.h)0
-rw-r--r--inc/dnbd3/shared/fdsignal.h (renamed from src/shared/fdsignal.h)0
-rw-r--r--inc/dnbd3/shared/log.h (renamed from src/shared/log.h)17
-rw-r--r--inc/dnbd3/shared/protocol.h (renamed from src/shared/protocol.h)11
-rw-r--r--inc/dnbd3/shared/serialize.h (renamed from src/serialize.h)7
-rw-r--r--inc/dnbd3/shared/sockhelper.h (renamed from src/shared/sockhelper.h)2
-rw-r--r--inc/dnbd3/shared/timing.h (renamed from src/shared/timing.h)6
-rw-r--r--inc/dnbd3/types.h (renamed from src/types.h)62
-rw-r--r--inc/dnbd3/version.h.in12
-rwxr-xr-xpack.sh6
-rw-r--r--pkg/CMakeLists.txt8
-rw-r--r--pkg/config/CMakeLists.txt15
-rw-r--r--pkg/config/alt-servers15
-rw-r--r--pkg/config/rpc.acl (renamed from conf/rpc.acl)2
-rw-r--r--pkg/config/server.conf (renamed from conf/server.conf)50
-rw-r--r--pkg/docker/archlinux_dockerfile28
-rw-r--r--pkg/docker/ubuntu-20-04_dockerfile28
-rw-r--r--pkg/systemd/CMakeLists.txt13
-rw-r--r--pkg/systemd/dnbd3-server.service12
-rw-r--r--src/CMakeLists.txt24
-rw-r--r--src/bench/CMakeLists.txt22
-rw-r--r--src/bench/connection.c149
-rw-r--r--src/bench/connection.h4
-rw-r--r--src/bench/helper.h3
-rw-r--r--src/bench/main.c41
-rw-r--r--src/bench/serialize.c5
-rw-r--r--src/client/CMakeLists.txt18
-rw-r--r--src/client/client.c284
-rw-r--r--src/clientconfig.h36
-rw-r--r--src/fuse/CMakeLists.txt29
-rw-r--r--src/fuse/connection.c542
-rw-r--r--src/fuse/connection.h24
-rw-r--r--src/fuse/helper.c6
-rw-r--r--src/fuse/helper.h14
-rw-r--r--src/fuse/main.c427
-rw-r--r--src/fuse/serialize.c5
-rw-r--r--src/kernel/.clang-format552
-rw-r--r--src/kernel/CMakeLists.txt66
-rw-r--r--src/kernel/Kbuild5
-rw-r--r--src/kernel/blk.c740
-rw-r--r--src/kernel/blk.h18
-rw-r--r--src/kernel/core.c81
-rw-r--r--src/kernel/dnbd3.h84
-rw-r--r--src/kernel/dnbd3_main.c250
-rw-r--r--src/kernel/dnbd3_main.h148
-rw-r--r--src/kernel/net.c1929
-rw-r--r--src/kernel/net.h29
l---------src/kernel/serialize.c1
-rw-r--r--src/kernel/serialize_kmod.c5
-rw-r--r--src/kernel/sysfs.c177
-rw-r--r--src/kernel/sysfs.h20
-rw-r--r--src/kernel/utils.c41
-rw-r--r--src/kernel/utils.h29
-rw-r--r--src/server/CMakeLists.txt112
-rw-r--r--src/server/altservers.c866
-rw-r--r--src/server/altservers.h22
-rw-r--r--src/server/fileutil.c2
-rw-r--r--src/server/fuse.c661
-rw-r--r--src/server/fuse.h10
-rw-r--r--src/server/globals.c86
-rw-r--r--src/server/globals.h192
-rw-r--r--src/server/helper.h4
-rw-r--r--src/server/image.c1087
-rw-r--r--src/server/image.h52
-rw-r--r--src/server/ini.c14
-rw-r--r--src/server/integrity.c231
-rw-r--r--src/server/integrity.h2
-rw-r--r--src/server/locks.c324
-rw-r--r--src/server/locks.h60
-rw-r--r--src/server/net.c266
-rw-r--r--src/server/net.h2
-rw-r--r--src/server/picohttpparser/CMakeLists.txt11
-rw-r--r--src/server/picohttpparser/picohttpparser.c67
-rw-r--r--src/server/reference.c33
-rw-r--r--src/server/reference.h64
-rw-r--r--src/server/reftypes.h25
-rw-r--r--src/server/rpc.c102
-rw-r--r--src/server/serialize.c5
-rw-r--r--src/server/server.c243
-rw-r--r--src/server/server.h6
-rw-r--r--src/server/threadpool.c150
-rw-r--r--src/server/threadpool.h10
-rw-r--r--src/server/uplink.c1529
-rw-r--r--src/server/uplink.h12
-rw-r--r--src/shared/CMakeLists.txt28
-rw-r--r--src/shared/crc32.c238
-rw-r--r--src/shared/fdsignal.c4
-rw-r--r--src/shared/log.c36
-rw-r--r--src/shared/serialize.c (renamed from src/serialize.c)43
-rw-r--r--src/shared/sockhelper.c42
-rw-r--r--src/shared/timing.c2
-rw-r--r--src/version.c.in4
-rw-r--r--src/version.h30
131 files changed, 10219 insertions, 4945 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..6adc436
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -3
+AlignAfterOpenBracket: DontAlign
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: false
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: true
+ AfterControlStatement: false
+ AfterEnum: true
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: true
+ AfterUnion: true
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 6
+ContinuationIndentWidth: 6
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: true
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+#IndentCaseBlocks: false
+IndentCaseLabels: false
+IndentGotoLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 3
+IndentWrappedFunctionNames: false
+#InsertTrailingCommas: Wrapped
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 60
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+#PenaltyIndentedWhitespace: 20
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+#SpaceAroundPointerQualifiers: Both
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInConditionalStatement: true
+SpacesInParentheses: true
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 3
+#UseTab: AlignWithSpaces
+UseTab: ForContinuationAndIndentation
+...
diff --git a/.github/workflows/build-kernel-module.yml b/.github/workflows/build-kernel-module.yml
new file mode 100644
index 0000000..526bf8e
--- /dev/null
+++ b/.github/workflows/build-kernel-module.yml
@@ -0,0 +1,113 @@
+name: Build dnbd3 kernel module
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ build:
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - name: CentOS 8.4 [4.18.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-rpm
+ kernel-source: https://vault.centos.org/8.4.2105/BaseOS/Source/SPackages/kernel-4.18.0-305.25.1.el8_4.src.rpm
+ - name: CentOS 8.5 [4.18.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-rpm
+ kernel-source: https://vault.centos.org/8.5.2111/BaseOS/Source/SPackages/kernel-4.18.0-348.2.1.el8_5.src.rpm
+ - name: CentOS 9.3 [5.14.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-tar
+ kernel-source: https://files.bwlp.ks.uni-freiburg.de/stuff/centos/linux-5.14.0-362.18.1.el9_3.tar.xz
+ - name: CentOS 9.4 [5.14.0]
+ build-os: ubuntu-22.04
+ kernel-type: centos-tar
+ kernel-source: https://files.bwlp.ks.uni-freiburg.de/stuff/centos/linux-5.14.0-427.el9.tar.xz
+ - name: Vanilla [4.19.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 4.19.y
+ - name: Vanilla [5.4.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 5.4.y
+ - name: Vanilla [5.10.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 5.10.y
+ - name: Vanilla [5.15.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 5.15.y
+ - name: Vanilla [6.1.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 6.1.y
+ - name: Vanilla [6.6.x]
+ build-os: ubuntu-22.04
+ kernel-type: vanilla
+ kernel-version: 6.6.y
+ name: Build dnbd3 ${{ matrix.config.name }}
+ runs-on: ${{ matrix.config.build-os }}
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ libelf-dev \
+ rpm2cpio \
+ rpm
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Checkout Vanilla kernel version [git]
+ if: matrix.config.kernel-type == 'vanilla'
+ run: git clone --depth 1 --branch "linux-${{ matrix.config.kernel-version }}" "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git" "../kernel"
+ - name: Checkout CentOS kernel version [rpm]
+ if: matrix.config.kernel-type == 'centos-rpm'
+ run: |
+ mkdir "${{ github.workspace }}/../kernel"
+ mkdir "${{ github.workspace }}/../kernel-download"
+ wget -q -O "${{ github.workspace }}/../kernel-download/kernel.rpm" "${{ matrix.config.kernel-source }}"
+ cd "${{ github.workspace }}/../kernel-download"
+ rpm2cpio "${{ github.workspace }}/../kernel-download/kernel.rpm" | cpio --extract --make-directories
+ tar --strip 1 -a -x -f linux-*.el*.tar.xz -C "${{ github.workspace }}/../kernel"
+ cd
+ rm -rf -- "${{ github.workspace }}/../kernel-download"
+ - name: Checkout CentOS kernel version [tar]
+ if: matrix.config.kernel-type == 'centos-tar'
+ run: |
+ mkdir "${{ github.workspace }}/../kernel"
+ mkdir "${{ github.workspace }}/../kernel-download"
+ wget -q -O "${{ github.workspace }}/../kernel-download/kernel.tar.xz" "${{ matrix.config.kernel-source }}"
+ tar --strip 1 -a -x -f "${{ github.workspace }}/../kernel-download/kernel.tar.xz" -C "${{ github.workspace }}/../kernel"
+ rm -rf -- "${{ github.workspace }}/../kernel-download"
+ - name: Configure kernel version
+ working-directory: ${{ github.workspace }}/../kernel
+ run: |
+ make defconfig
+ make modules_prepare
+ - name: Configure dnbd3 build
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D CMAKE_BUILD_TYPE=Release \
+ -D DNBD3_KERNEL_MODULE=ON \
+ -D KERNEL_BUILD_DIR=${{ github.workspace }}/../kernel \
+ -D KERNEL_INSTALL_DIR=${{ github.workspace }}/../kernel/extra \
+ -D KERNEL_SCRIPTS_DIR=${{ github.workspace }}/../kernel/scripts \
+ -D DNBD3_BENCHMARK=OFF \
+ -D DNBD3_CLIENT_FUSE=OFF \
+ -D DNBD3_SERVER=OFF \
+ -D DNBD3_SERVER_FUSE=OFF \
+ -D DNBD3_RELEASE_HARDEN=OFF
+ - name: Build dnbd3 kernel module
+ working-directory: ${{ github.workspace }}/build
+ run: make
diff --git a/.github/workflows/build-programs.yml b/.github/workflows/build-programs.yml
new file mode 100644
index 0000000..06642b5
--- /dev/null
+++ b/.github/workflows/build-programs.yml
@@ -0,0 +1,66 @@
+name: Build dnbd3 programs
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ build:
+ strategy:
+ fail-fast: false
+ matrix:
+ config:
+ - name: debug [default] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Debug"
+ build-cfg-server-fuse: "OFF"
+ build-cfg-harden: "OFF"
+ - name: debug [server with fuse support] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Debug"
+ build-cfg-server-fuse: "ON"
+ build-cfg-harden: "OFF"
+ - name: release [default] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Release"
+ build-cfg-server-fuse: "OFF"
+ build-cfg-harden: "OFF"
+ - name: release [server with fuse support] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Release"
+ build-cfg-server-fuse: "ON"
+ build-cfg-harden: "OFF"
+ - name: release [default hardening] on Ubuntu 22.04
+ build-os: ubuntu-22.04
+ build-type: "Release"
+ build-cfg-server-fuse: "OFF"
+ build-cfg-harden: "ON"
+ name: Build dnbd3 ${{ matrix.config.name }}
+ runs-on: ${{ matrix.config.build-os }}
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Configure dnbd3 build
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D CMAKE_BUILD_TYPE=${{ matrix.config.build-type }} \
+ -D DNBD3_KERNEL_MODULE=OFF \
+ -D DNBD3_BENCHMARK=ON \
+ -D DNBD3_SERVER_FUSE=${{ matrix.config.build-cfg-server-fuse }} \
+ -D DNBD3_RELEASE_HARDEN=${{ matrix.config.build-cfg-harden }}
+ - name: Build dnbd3 artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: make
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..6d41378
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,34 @@
+name: Lint dnbd3
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+
+jobs:
+ lint:
+ name: Lint dnbd3
+ runs-on: ubuntu-22.04
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ rpm
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Configure dnbd3 build
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D DNBD3_BENCHMARK=ON
+ - name: Lint dnbd3 artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: make lint
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..6a06173
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,64 @@
+name: Release dnbd3
+
+on:
+ release:
+ types: [published]
+
+jobs:
+ release:
+ name: Release dnbd3
+ # Use very old system (= libc) for building for best compatibility
+ runs-on: ubuntu-18.04
+ steps:
+ - name: Install dnbd3 dependencies
+ run: |
+ sudo apt-get update -y -qq
+ sudo apt-get install -y -qq make \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ rpm
+ - name: Checkout dnbd3 repository
+ uses: actions/checkout@v4
+ - name: Fetch dnbd3 repository tags
+ run: git fetch --prune --unshallow
+ - name: Configure dnbd3 release
+ run: |
+ cmake -B ${{ github.workspace }}/build \
+ -S ${{ github.workspace }} \
+ -D CMAKE_BUILD_TYPE=Release \
+ -D DNBD3_KERNEL_MODULE=OFF \
+ -D DNBD3_BENCHMARK=OFF \
+ -D DNBD3_SERVER_FUSE=OFF \
+ -D DNBD3_RELEASE_HARDEN=OFF
+ - name: Build dnbd3 artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: make package
+ - name: Create links to artifacts
+ working-directory: ${{ github.workspace }}/build
+ run: |
+ ln -s dnbd3_*.deb dnbd3_linux_x86_64.deb
+ ln -s dnbd3_*.rpm dnbd3_linux_x86_64.rpm
+ ln -s dnbd3_*.tar.gz dnbd3_linux_x86_64.tar.gz
+ - name: Attach Debian artifacts to release
+ uses: svenstaro/upload-release-action@v2
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ${{ github.workspace }}/build/dnbd3_linux_x86_64.deb
+ asset_name: dnbd3_linux_x86_64.deb
+ tag: ${{ github.ref }}
+ - name: Attach RedHat artifacts to release
+ uses: svenstaro/upload-release-action@v2
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ${{ github.workspace }}/build/dnbd3_linux_x86_64.rpm
+ asset_name: dnbd3_linux_x86_64.rpm
+ tag: ${{ github.ref }}
+ - name: Attach generic artifacts to release
+ uses: svenstaro/upload-release-action@v2
+ with:
+ repo_token: ${{ secrets.GITHUB_TOKEN }}
+ file: ${{ github.workspace }}/build/dnbd3_linux_x86_64.tar.gz
+ asset_name: dnbd3_linux_x86_64.tar.gz
+ tag: ${{ github.ref }}
diff --git a/.gitignore b/.gitignore
index 38ae262..6617c58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,3 @@ build/
*.swp
.autotools
.idea
-/version.txt
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0141b05..69459dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,240 +1,246 @@
-################################################################################
-# GENERAL #
-################################################################################
-
-PROJECT(dnbd3 C)
-CMAKE_MINIMUM_REQUIRED(VERSION 2.6.2)
-IF (CMAKE_BUILD_TYPE STREQUAL "")
- SET(CMAKE_BUILD_TYPE Debug)
-ENDIF()
-
-SET(CMAKE_INSTALL_PREFIX "/usr/local" CACHE PATH "Path prefix for system installation")
-OPTION(BUILD_FUSE_CLIENT "Build dnbd3 fuse client" ON)
-OPTION(BUILD_SERVER "Build dnbd3 server" ON)
-OPTION(BUILD_STRESSTEST "Build dnbd3 stress testing tool" OFF)
-SET(EXTRA_C_FLAGS "" CACHE STRING "Additional options to pass to compiler")
-
-OPTION(SERVER_FOR_AFL "Build dnbd3-server for usage with afl-fuzz" OFF)
-
-# Is there a non-retarded way to check if build type is debug or release?
-# When specifying, it is case insensitive, so DeBuG would also enable debug builds,
-# but in cmake, we can only do case sensitive matches... :/
-string( TOLOWER "${CMAKE_BUILD_TYPE}" bt_lower )
-if (NOT bt_lower MATCHES "^(debug|release)$")
- message( FATAL_ERROR "Build type needs to be either Debug or Release" )
-endif()
-
-message( "Build Type selected: ${CMAKE_BUILD_TYPE}" )
-
-IF(CMAKE_SYSTEM_NAME MATCHES "BSD")
- message("Detected *BSD System: disable build of Kernel Module.")
- SET(BUILD_KERNEL_MODULE False)
-ELSE()
- OPTION(BUILD_KERNEL_MODULE "Build the dnbd3 Linux kernel module" ON)
-ENDIF()
-
-if(CMAKE_C_COMPILER MATCHES "clang")
- message( "Using clang flags." )
- SET(CMAKE_C_FLAGS_DEBUG "-std=c11 -O1 -fno-omit-frame-pointer -g -Wall -Wextra -Wpedantic -Wno-unused-result -D_GNU_SOURCE -D_DEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
- SET(CMAKE_C_FLAGS_RELEASE "-std=c11 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
-elseif (CMAKE_C_COMPILER MATCHES "(cc-)|(cc$)")
- message( "Using (g)cc flags." )
- SET(CMAKE_C_FLAGS_DEBUG "-std=c11 -O0 -g -Wall -Wextra -Wpedantic -Wconversion -Wno-sign-conversion -D_GNU_SOURCE -D_DEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
- SET(CMAKE_C_FLAGS_RELEASE "-std=c11 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
-else()
- message( FATAL_ERROR "Could not determine compiler type." )
-endif()
-#SET(CMAKE_CXX_FLAGS_DEBUG "-std=c99 -O0 -g -Wall -Wno-unused-result -D_GNU_SOURCE -D_DEBUG")
-#SET(CMAKE_CXX_FLAGS_RELEASE "-std=c99 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG" )
-
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
-
-ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
-ADD_DEFINITIONS(-DWITH_IPV6)
-
-FIND_PACKAGE(Threads)
-
-SET(DO_ABORT False)
-
-message( " *************************************************" )
-if(BUILD_FUSE_CLIENT)
- FIND_PACKAGE(Fuse)
- if(NOT FUSE_FOUND)
- message( " *** No fuse dev libs found, can't build dnbd3-fuse" )
- SET(DO_ABORT True)
- endif()
- if(NOT THREADS_FOUND)
- message( " *** No threads found, can't build dnbd3-fuse" )
- SET(DO_ABORT True)
- endif()
-endif()
-if(BUILD_SERVER)
- FIND_PACKAGE(Jansson)
- if(NOT THREADS_FOUND)
- message( " *** No threads found, can't build dnbd3-server" )
- SET(DO_ABORT True)
- endif()
- if(NOT JANSSON_FOUND)
- message( " *** No jansson lib found, can't build dnbd3-server" )
- SET(DO_ABORT True)
- endif()
-endif()
-if(BUILD_STRESSTEST)
- if(NOT THREADS_FOUND)
- message( " *** No threads found, can't build dnbd3-bench" )
- SET(DO_ABORT True)
- endif()
-endif()
-message( " *************************************************" )
-if(DO_ABORT)
- message( FATAL_ERROR "Aborting." )
-endif()
-
-#SET(FUSE_INCLUDE_DIR "")
-#SET(JANSSON_INCLUDE_DIR "")
-
-################################################################################
-# VERSION HEADER #
-################################################################################
-
-FILE(WRITE ${CMAKE_BINARY_DIR}/version.cmake
-"EXECUTE_PROCESS(
- COMMAND \${CMD}
- OUTPUT_VARIABLE VERSION
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- CONFIGURE_FILE(\${SRC} \${DST} @ONLY)
-")
-ADD_CUSTOM_TARGET(
- version
- ${CMAKE_COMMAND} -D SRC=${CMAKE_SOURCE_DIR}/src/version.c.in
- -D DST=${CMAKE_BINARY_DIR}/version.c
- -D CMD=${CMAKE_SOURCE_DIR}/get-version.sh
- -P ${CMAKE_BINARY_DIR}/version.cmake
-)
-
-## This is required if you're not building the kernel module
-## TODO: Find a nicer way to avoid parent includes,
-## especially the ../version.h -> version.c -> version.h cycle
-FILE(GLOB COMMON_HEADER_FILES src/*.h)
-FOREACH(COMMON_HEADER_FILE ${COMMON_HEADER_FILES})
- CONFIGURE_FILE(${COMMON_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
-ENDFOREACH( COMMON_HEADER_FILE )
-
-
-################################################################################
-# CLIENT #
-################################################################################
-
-if(BUILD_KERNEL_MODULE)
- INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR})
- FILE(GLOB_RECURSE CLIENT_SRCS src/client/*.c)
- ADD_EXECUTABLE(dnbd3-client ${CLIENT_SRCS})
- TARGET_LINK_LIBRARIES(dnbd3-client)
- ADD_DEPENDENCIES(dnbd3-client version)
- INSTALL(TARGETS dnbd3-client RUNTIME DESTINATION sbin)
-ENDIF()
-
-
-################################################################################
-# SERVER #
-################################################################################
-
-if(BUILD_SERVER)
- IF(SERVER_FOR_AFL)
- message(" ######################## Building server for AFL mode - will be useless otherwise!")
- ADD_DEFINITIONS(-DAFL_MODE)
- ENDIF()
- INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${JANSSON_INCLUDE_DIR})
- FILE(GLOB SERVER_SRCS src/server/*.c src/shared/*.c src/server/picohttpparser/*.c)
- ADD_EXECUTABLE(dnbd3-server ${SERVER_SRCS})
- TARGET_LINK_LIBRARIES(dnbd3-server ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
- if(UNIX AND NOT APPLE)
- target_link_libraries(dnbd3-server rt)
- endif()
- ADD_DEPENDENCIES(dnbd3-server version)
- INSTALL(TARGETS dnbd3-server RUNTIME DESTINATION sbin)
-endif()
-
-
-
-################################################################################
-# FUSE #
-################################################################################
-
-if(BUILD_FUSE_CLIENT)
- INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${FUSE_INCLUDE_DIR})
- FILE(GLOB FUSE_SRCS src/fuse/*.c src/shared/*.c)
- ADD_EXECUTABLE(dnbd3-fuse ${FUSE_SRCS})
- TARGET_LINK_LIBRARIES(dnbd3-fuse ${CMAKE_THREAD_LIBS_INIT} ${FUSE_LIBRARIES})
- ADD_DEPENDENCIES(dnbd3-fuse version)
- INSTALL(TARGETS dnbd3-fuse RUNTIME DESTINATION bin)
-endif()
-
-################################################################################
-# STRESSTEST #
-################################################################################
-
-if(BUILD_STRESSTEST)
- FILE(GLOB BENCH_SRCS src/bench/*.c src/shared/*.c)
- ADD_EXECUTABLE(dnbd3-bench ${BENCH_SRCS})
- TARGET_LINK_LIBRARIES(dnbd3-bench ${CMAKE_THREAD_LIBS_INIT})
- ADD_DEPENDENCIES(dnbd3-bench version)
- INSTALL(TARGETS dnbd3-bench RUNTIME DESTINATION bin)
-endif()
-
-################################################################################
-# MODULE #
-################################################################################
-
-IF(BUILD_KERNEL_MODULE)
- SET(MODULE_NAME dnbd3)
- SET(MODULE_FILE ${MODULE_NAME}.ko)
- FILE(GLOB MODULE_SOURCE_FILES src/kernel/*.c src/serialize.c)
- FILE(GLOB MODULE_HEADER_FILES src/kernel/*.h)
-
- SET(KERNEL_DIR "" CACHE PATH "Path to kernel sources to compile against")
- IF(KERNEL_DIR STREQUAL "")
- SET(KERNEL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build")
- ENDIF()
-
- SET(KBUILD_COMMAND ${CMAKE_MAKE_PROGRAM} -C ${KERNEL_DIR}
- M=${CMAKE_BINARY_DIR} modules
- )
-
- CONFIGURE_FILE(Kbuild.in ${CMAKE_BINARY_DIR}/Kbuild)
-
- FOREACH(MODULE_SOURCE_FILE ${MODULE_SOURCE_FILES})
- CONFIGURE_FILE(${MODULE_SOURCE_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
- ENDFOREACH( MODULE_SOURCE_FILE )
-
- FOREACH(MODULE_HEADER_FILE ${MODULE_HEADER_FILES})
- CONFIGURE_FILE(${MODULE_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
- ENDFOREACH( MODULE_HEADER_FILE )
-
- ADD_CUSTOM_COMMAND(
- OUTPUT ${CMAKE_BINARY_DIR}/${MODULE_FILE}
- COMMAND ${KBUILD_COMMAND}
- WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
- DEPENDS ${MODULE_SOURCE_FILES} Kbuild.in
- VERBATIM
- )
-
- ADD_CUSTOM_TARGET(${MODULE_NAME} ALL DEPENDS ${CMAKE_BINARY_DIR}/${MODULE_FILE})
-
- INSTALL(FILES ${CMAKE_BINARY_DIR}/${MODULE_NAME}.ko
- DESTINATION /lib/modules/${CMAKE_SYSTEM_VERSION}/kernel/drivers/block
- PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
- )
-
- INSTALL(CODE "EXECUTE_PROCESS(COMMAND depmod -a)")
-ENDIF()
-
-
-#
-# Other install files
-#
-
-FILE(GLOB conf_files "${CMAKE_CURRENT_SOURCE_DIR}/conf/*")
-INSTALL(FILES ${conf_files} DESTINATION /etc/dnbd3-server/sample/)
-
+cmake_minimum_required(VERSION 3.10)
+
+# include CMake macros
+set(PROJECT_MODULES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+ ${PROJECT_MODULES_DIR})
+
+# define root CMake project
+project(dnbd3
+ DESCRIPTION "dnbd3 Linux kernel module, server, clients and utilities"
+ LANGUAGES C)
+
+# define project options to define build configuration
+OPTION(DNBD3_KERNEL_MODULE "Build the dnbd3 Linux kernel module" ON)
+OPTION(DNBD3_BENCHMARK "Enable build of dnbd3-bench" OFF)
+OPTION(DNBD3_CLIENT_FUSE "Enable build of dnbd3-fuse" ON)
+OPTION(DNBD3_SERVER "Enable build of dnbd3-server" ON)
+OPTION(DNBD3_SERVER_FUSE "Enable FUSE-Integration for dnbd3-server" OFF)
+OPTION(DNBD3_SERVER_AFL "Build dnbd3-server for usage with afl-fuzz" OFF)
+OPTION(DNBD3_SERVER_DEBUG_LOCKS "Add lock debugging code to dnbd3-server" OFF)
+OPTION(DNBD3_SERVER_DEBUG_THREADS "Add thread debugging code to dnbd3-server" OFF)
+OPTION(DNBD3_RELEASE_HARDEN "Compile dnbd3 programs in Release build with code hardening options" OFF)
+OPTION(DNBD3_PACKAGE_DOCKER "Enable packaging of Docker image" OFF)
+
+# set supported build configurations
+set(CMAKE_CONFIGURATION_TYPES Debug Release)
+
+# set compilation in debug mode as default configuration
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Debug)
+ message(STATUS "Build type is not set. Defaulting to ${CMAKE_BUILD_TYPE} build!")
+endif(NOT CMAKE_BUILD_TYPE)
+
+# search for required packages
+find_package(Git REQUIRED)
+find_package(Threads REQUIRED)
+
+# include project version and build type related macros
+include(Version)
+include(Build)
+include(Lint)
+
+# check for system and enable or disable build of Linux kernel module
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+ # disable build of the dnbd3 Linux kernel module on a system other than Linux, e.g. FreeBSD
+ message(STATUS "Detected non-Linux system: Disable build of the dnbd3 Linux kernel module")
+ set(DNBD3_KERNEL_MODULE OFF)
+endif(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+
+# enable or disable requirements for a build of the Linux kernel module
+if(DNBD3_KERNEL_MODULE)
+ # require Linux kernel headers
+ find_package(KernelHeaders REQUIRED)
+
+ # print configured settings
+ message(STATUS "Path to Linux kernel modules to compile against is " ${KERNEL_BUILD_DIR})
+ message(STATUS "Path to install Linux kernel modules is " ${KERNEL_INSTALL_DIR})
+endif(DNBD3_KERNEL_MODULE)
+
+# set include directories
+set(PROJECT_GEN_DIR ${CMAKE_BINARY_DIR}/generated)
+set(PROJECT_INCLUDE_DIR_PREFIX inc)
+set(PROJECT_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/${PROJECT_INCLUDE_DIR_PREFIX})
+set(PROJECT_INCLUDE_GEN_DIR ${PROJECT_GEN_DIR}/${PROJECT_INCLUDE_DIR_PREFIX})
+include_directories(${PROJECT_INCLUDE_DIR})
+
+# get all global header files for the linter
+set(DNBD3_HEADER_FILES ${PROJECT_INCLUDE_DIR}/dnbd3/build.h.in
+ ${PROJECT_INCLUDE_DIR}/dnbd3/config/client.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/config.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/config/server.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/crc32.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/fdsignal.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/log.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/protocol.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/serialize.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/sockhelper.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/shared/timing.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/types.h
+ ${PROJECT_INCLUDE_DIR}/dnbd3/version.h.in)
+
+# add linter for header files
+add_linter(dnbd3-headers-lint "${DNBD3_HEADER_FILES}")
+add_linter_fix(dnbd3-headers-lint-fix "${DNBD3_HEADER_FILES}")
+
+# generate project version C header file from template
+# exposes dnbd3-generate-version and dnbd3-version target
+set(INCLUDE_VERSION_HEADER ${PROJECT_INCLUDE_DIR}/dnbd3/version.h)
+set(INCLUDE_VERSION_HEADER_TEMPLATE ${PROJECT_INCLUDE_DIR}/dnbd3/version.h.in)
+set(INCLUDE_VERSION_HEADER_GENERATE ${PROJECT_INCLUDE_GEN_DIR}/dnbd3/version.h)
+set(INCLUDE_VERSION_HEADER_GENERATE_PREFIX ${PROJECT_INCLUDE_DIR_PREFIX}/dnbd3)
+gen_project_version(${INCLUDE_VERSION_HEADER} ${INCLUDE_VERSION_HEADER_TEMPLATE} ${INCLUDE_VERSION_HEADER_GENERATE} ${GIT_EXECUTABLE} ${CMAKE_SOURCE_DIR})
+
+# generate project build type C header file from template
+# exposes dnbd3-generate-build and dnbd3-build target
+set(INCLUDE_BUILD_HEADER_TEMPLATE ${PROJECT_INCLUDE_DIR}/dnbd3/build.h.in)
+set(INCLUDE_BUILD_HEADER_GENERATE ${PROJECT_INCLUDE_GEN_DIR}/dnbd3/build.h)
+gen_build_type(${INCLUDE_BUILD_HEADER_TEMPLATE} ${INCLUDE_BUILD_HEADER_GENERATE})
+
+# add compile option to handle files greater than 2GB on a 32bit system
+add_definitions(-D_FILE_OFFSET_BITS=64)
+
+# define global C flags for compilation
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
+
+# enable all error warnings in Debug build configuration
+set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -Wpedantic -Wconversion -Wformat -Wformat-security -Werror=format-security -Wno-sign-conversion")
+set(CMAKE_C_FLAGS_RELEASE "-Wno-error")
+
+# set compilation optimization
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -Og -DDEBUG")
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -DNDEBUG")
+
+# apply code hardening options only if DNBD3_RELEASE_HARDEN is enabled and the build type matches "Release"
+if(DNBD3_RELEASE_HARDEN AND CMAKE_BUILD_TYPE MATCHES "Release")
+ # harden builds with specific C flags
+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2 -fstack-protector-all -fstack-clash-protection")
+ # set specific hardened linker flags
+ # NOTE(review): -pie is only added to the linker flags, no -fPIE/-fpie is added to the C flags above;
+ # confirm that position independent code is enabled elsewhere or by the toolchain default
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,relro,-z,now,-z,defs -pie")
+ # print status message of configuration
+ message(STATUS "Set compilation of DNBD3 with hardened code options - done")
+else(DNBD3_RELEASE_HARDEN AND CMAKE_BUILD_TYPE MATCHES "Release")
+ # print status message of disabled configuration
+ message(STATUS "Disabled compilation of dnbd3 with hardened code options - done")
+endif(DNBD3_RELEASE_HARDEN AND CMAKE_BUILD_TYPE MATCHES "Release")
+
+# define packaging if Release build is enabled
+if(CMAKE_BUILD_TYPE MATCHES Release)
+ # get version source package or Git repository
+ get_repository_version(REPOSITORY_VERSION REPOSITORY_VERSION_SHORT REPOSITORY_BRANCH ${INCLUDE_VERSION_HEADER} ${CMAKE_BUILD_TYPE} ${GIT_EXECUTABLE} ${CMAKE_SOURCE_DIR})
+
+ # define project version
+ if(KernelHeaders_VERSION)
+ set(REPOSITORY_VERSION_FULL ${REPOSITORY_VERSION}-${KernelHeaders_VERSION})
+ else(KernelHeaders_VERSION)
+ set(REPOSITORY_VERSION_FULL ${REPOSITORY_VERSION})
+ endif(KernelHeaders_VERSION)
+
+ set(CPACK_GENERATOR "DEB;RPM;TGZ")
+ set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
+ set(CPACK_MONOLITHIC_INSTALL True)
+ set(CPACK_PACKAGE_VERSION ${REPOSITORY_VERSION})
+ set(CPACK_PACKAGE_VERSION_FULL ${REPOSITORY_VERSION_FULL})
+ set(CPACK_PACKAGE_SECTION admin)
+ set(CPACK_PACKAGE_VENDOR "University of Freiburg")
+ set(CPACK_PACKAGE_CONTACT "Christian Rößler <christian.roessler@rz.uni-freiburg.de>")
+ set(CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/bwLehrpool/dnbd3")
+ set(CPACK_PACKAGE_CHECKSUM SHA256)
+ set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR})
+ set(CPACK_SOURCE_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}_${CPACK_PACKAGE_VERSION}_source)
+ set(CPACK_STRIP_FILES True)
+ set(CPACK_PACKAGE_RELOCATABLE False)
+ set(CPACK_SET_DESTDIR True)
+ set(CMAKE_INSTALL_PREFIX "/usr")
+ set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+ set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/COPYING)
+ set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_SOURCE_DIR}/README.md)
+
+ # set DEB generator specific packaging options
+ set(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6, libfuse2, libjansson4, libatomic1")
+ if(DNBD3_KERNEL_MODULE)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postinst "depmod -a\n")
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postrm "depmod -a\n")
+ set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postinst
+ ${CMAKE_CURRENT_BINARY_DIR}/package/deb/postrm)
+ endif(DNBD3_KERNEL_MODULE)
+
+ # set RPM generator specific packaging options
+ set(CPACK_RPM_PACKAGE_REQUIRES "glibc, fuse-libs, jansson, libatomic")
+ set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/lib"
+ "/lib/modules"
+ "/lib/modules/${CMAKE_SYSTEM_VERSION}"
+ "/lib/modules/${CMAKE_SYSTEM_VERSION}/extra"
+ "/etc"
+ "/usr"
+ "/usr/lib"
+ "/usr/lib/systemd"
+ "/usr/lib/systemd/system")
+ if(DNBD3_KERNEL_MODULE)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/post "depmod -a\n")
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/postun "depmod -a\n")
+ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/post)
+ set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE ${CMAKE_CURRENT_BINARY_DIR}/package/rpm/postun)
+ endif(DNBD3_KERNEL_MODULE)
+
+ # configure source packaging
+ set(CPACK_SOURCE_GENERATOR "TGZ;ZIP")
+ set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR}" "/"
+ "${PROJECT_GEN_DIR}" "/")
+ set(CPACK_SOURCE_IGNORE_FILES "/build/"
+ "/.git/"
+ "/.github/"
+ ".gitignore"
+ "version.h.in")
+
+ # include CPack functionality
+ include(CPack)
+
+ # prepare source packaging
+ add_custom_command(OUTPUT ${INCLUDE_VERSION_HEADER}
+ COMMAND ${CMAKE_COMMAND} -D VERSION_HEADER_INPUT_FILE=${INCLUDE_VERSION_HEADER_GENERATE}
+ -D VERSION_HEADER_OUTPUT_FILE=${INCLUDE_VERSION_HEADER}
+ -P ${PROJECT_MODULES_DIR}/PreVersionPackaging.cmake
+ COMMENT "Prepare version.h"
+ DEPENDS dnbd3-generate-version)
+
+ # main source packaging
+ add_custom_target(package_source_main
+ COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target package_source
+ DEPENDS ${INCLUDE_VERSION_HEADER}
+ VERBATIM
+ USES_TERMINAL)
+
+ # post source packaging and exported target to build source packages
+ add_custom_target(source
+ COMMAND ${CMAKE_COMMAND} -D VERSION_HEADER_INPUT_FILE=${INCLUDE_VERSION_HEADER_GENERATE}
+ -D VERSION_HEADER_OUTPUT_FILE=${INCLUDE_VERSION_HEADER}
+ -P ${PROJECT_MODULES_DIR}/PostVersionPackaging.cmake
+ COMMENT "Cleanup version.h"
+ DEPENDS package_source_main)
+
+ # include target to make docker image
+ if(NOT DNBD3_KERNEL_MODULE AND DNBD3_SERVER AND DNBD3_PACKAGE_DOCKER)
+ find_package(Docker REQUIRED)
+ include(DockerImage)
+
+ set(DOCKER_TAG ${CPACK_PACKAGE_NAME}:${REPOSITORY_VERSION_SHORT})
+
+ # define Ubuntu docker image
+ set(DOCKER_FILE_UBUNTU ${CMAKE_SOURCE_DIR}/pkg/docker/ubuntu-20-04_dockerfile)
+ set(PACKAGE_FILE_UBUNTU ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}.deb)
+ set(DOCKER_IMAGE_UBUNTU ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}_ubuntu-20-04_docker.tar)
+ add_docker_image(docker-ubuntu-20-04 ${DOCKER_IMAGE_UBUNTU} ${DOCKER_FILE_UBUNTU} ${DOCKER_TAG} ${PACKAGE_FILE_UBUNTU} ${CMAKE_BINARY_DIR})
+
+ # define Archlinux docker image
+ set(DOCKER_FILE_ARCHLINUX ${CMAKE_SOURCE_DIR}/pkg/docker/archlinux_dockerfile)
+ set(PACKAGE_FILE_ARCHLINUX ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}.tar.gz)
+ set(DOCKER_IMAGE_ARCHLINUX ${CPACK_PACKAGE_NAME}_${REPOSITORY_VERSION_FULL}_${CMAKE_SYSTEM_PROCESSOR}_archlinux_docker.tar)
+ add_docker_image(docker-archlinux ${DOCKER_IMAGE_ARCHLINUX} ${DOCKER_FILE_ARCHLINUX} ${DOCKER_TAG} ${PACKAGE_FILE_ARCHLINUX} ${CMAKE_BINARY_DIR})
+ endif(NOT DNBD3_KERNEL_MODULE AND DNBD3_SERVER AND DNBD3_PACKAGE_DOCKER)
+endif(CMAKE_BUILD_TYPE MATCHES Release)
+
+# add all dnbd3 related projects from the source code directory
+add_subdirectory(src)
+
+# add configuration and operational files for packaging purposes
+add_subdirectory(pkg)
diff --git a/Kbuild.in b/Kbuild.in
deleted file mode 100644
index 667cee0..0000000
--- a/Kbuild.in
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-m := ${MODULE_NAME}.o
-${MODULE_NAME}-objs += core.o blk.o net.o sysfs.o utils.o serialize_kmod.o \ No newline at end of file
diff --git a/LOCKS b/LOCKS
deleted file mode 100644
index 4b5b07c..0000000
--- a/LOCKS
+++ /dev/null
@@ -1,81 +0,0 @@
-Some notes about locking in dnbd3
-
-The order of aquiring multiple locks is
-VERY IMPORTANT, as you'll produce a possible deadlock
-if you do it in the wrong order.
-Take very good care of locking order if you have lots
-of functions that call each other. You might lose
-track of what's going on. ;)
-
-===== FUSE =====
-mutexInit
-newAltLock
-altLock
-connection.sendMutex
-requests.lock
-
-===== SERVER =====
-This is a list of used locks, in the order they
-have to be aquired if you must hold multiple locks:
-remoteCloneLock | reloadLock
-_clients_lock
-_clients[].lock
-integrityQueueLock
-_images_lock
-_images[].lock
-pendingLockConsume
-pendingLockProduce
-uplink.queueLock
-altServersLock
-client.sendMutex
-client.statsLock
-statisticsSentLock
-statisticsReceivedLock
-uplink.rttLock
-uplink.sendMutex
-
-If you need to lock multiple clients/images/... at once,
-lock the client with the lowest array index first.
-
-If the program logic would require to aquire the
-locks in a different order, you HAVE TO rework the
-code.
-For example, if you hold the lock for client 10 and
-you need to look up some other client. You MUST NOT
-simply fetch the _clients_lock now and then iterate
-over the clients until you find the one you need,
-as it violates the above order to first lock on the
-clients array and then the clients lock.
-Instead, you need to release client 10's lock,
-then lock on _clients_lock and iterate over the
-clients. Now you check if you either encounter
-the client you originally held the lock on, or
-the client you are looking for. You immediately
-lock on those two. You can then release the
-_clients_lock and work with both clients.
-pseudo code:
-
-// client10 is assumed to be a pointer to
-// a client, which happens to be at index 10
-lock (client10->lock);
-....
-// oh, i need another client
-unlock(client10->lock);
-lock(_clients_lock);
-client clientA = NULL, clientB = NULL;
-for (i = 0; i < _num_clients; ++i) {
- if (client[i] == client10) {
- clientA = client[i];
- lock(clientA.lock);
- } else if (client[i].something == <whatever>) {
- clientB = client[i];
- lock(clientB.lock);
- }
-}
-unlock(_clients_lock);
-if (clientA && clientB) { // Make sure we actually found both!
- // DO something important with both clients
-}
-if (clientA) unlock(clientA.lock);
-if (clientB) unlock(clientB.lock);
-
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ebf1aea
--- /dev/null
+++ b/README.md
@@ -0,0 +1,456 @@
+# dnbd3 - distributed network block device (version 3)
+
+The distributed network block device version 3 (dnbd3) is a network protocol similar to [nbd](https://github.com/NetworkBlockDevice/nbd) to implement a distributed block-based read-only storage system. Such a distributed block-based storage system consists of dnbd3 components, namely one or more servers and several clients. Servers are meant to expose virtual disk images as block devices to clients using dnbd3. Clients read data blocks from servers and implement failover and a load balancing mechanism to connect to the fastest server available for data exchange.
+
+This repository contains the source code for the following dnbd3 components:
+
+ - **dnbd3**: Linux kernel module client for dnbd3
+ - **dnbd3-bench**: Benchmark utility to test dnbd3
+ - **dnbd3-fuse**: Fuse client for dnbd3
+ - **dnbd3-server**: Server to serve virtual disk images for dnbd3
+
+The dnbd3 components have been built/tested on the following Linux kernel versions and Unix distributions:
+
+ - Generic/Vanilla Kernel **4.9** to **6.1**
+ - Archlinux with **Linux kernel 5.15.x** or **6.1.x**
+ - Raspberry Pi OS with **Linux kernel 5.4.x**
+ - Ubuntu 20.04 with **Linux kernel 5.4.x**
+ - Ubuntu 18.04 with **Linux kernel 4.19.x**
+ - CentOS 8 with **Linux kernel 4.18.x**
+ - CentOS 7 with **Linux kernel 3.10.x**
+ - AlmaLinux 8 with **Linux kernel 4.18.x**
+ - Rocky Linux 8 with **Linux kernel 4.18.x**
+ - FreeBSD 12.x and 13.x (only user space programs, eg. dnbd3-server)
+
+
+## Build
+
+### Preliminaries
+A build of the dnbd3 components requires the installation of the following build tools and libraries under your supported Unix distribution.
+
+#### Archlinux with Linux kernel 5.15.x or 5.10.x
+```shell
+pacman -S git \
+ make \
+ cmake \
+ gcc \
+ clang \
+ linux-headers \
+ fuse2 \
+ jansson \
+ afl \
+ dpkg \
+ rpm-tools
+```
+
+Note that `linux-lts-headers` should be installed instead of `linux-headers` if the LTS kernel is used.
+
+#### Raspberry Pi OS with Linux kernel 5.4.x
+```shell
+apt-get install git \
+ make \
+ cmake \
+ gcc \
+ clang-format \
+ raspberrypi-kernel-headers \
+ libfuse-dev \
+ libjansson-dev \
+ afl \
+ rpm
+```
+
+#### Ubuntu 20.04 with Linux kernel 5.4.x
+```shell
+apt-get install git \
+ make \
+ cmake \
+ gcc \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ rpm
+```
+
+Note that `afl` is not available on Ubuntu 20.04 and should be built from the [original sources](https://github.com/google/AFL).
+
+#### Ubuntu 18.04 with Linux kernel 4.19.x
+```shell
+apt-get install git \
+ make \
+ cmake \
+ gcc \
+ clang-format \
+ linux-headers-generic \
+ libfuse-dev \
+ libjansson-dev \
+ afl \
+ rpm
+```
+
+#### {CentOS, AlmaLinux, Rocky Linux} 8 with Linux kernel 4.18.x
+```shell
+yum install git \
+ make \
+ cmake \
+ gcc \
+ clang-tools-extra \
+ kernel-devel \
+ elfutils-libelf-devel \
+ fuse-devel \
+ jansson-devel \
+ libatomic \
+ rpm-build
+```
+
+Note that `afl` is not available on CentOS 8 and the likes and should be built from the [original sources](https://github.com/google/AFL).
+
+#### CentOS 7 with Linux kernel 3.10.x
+Before any required preliminaries can be installed, enable the `epel` package repository with the following command line calls:
+
+```shell
+yum install epel-release
+yum repolist # refresh epel package repository
+```
+
+The `epel` package repository enables the installation of `cmake3` on CentOS 7 which is later required to build dnbd3 components.
+Then, install the required preliminaries with the following command line call as usual:
+
+```shell
+yum install git \
+ make \
+ cmake3 \
+ gcc \
+ kernel-devel \
+ elfutils-libelf-devel \
+ rpm-build
+```
+
+Note that `afl` is not available on CentOS 7 and should be built from the [original sources](https://github.com/google/AFL).
+
+> **Warning: All dnbd3 components can only be built if a GCC compiler with stdatomic support is used.
+> This feature is available with GCC 4.9 or later as part of the C11 language support.
+> Since CentOS 7 is shipped with GCC 4.8 you have to install a new GCC version greater or equal than GCC 4.9.**
+
+The installation of GCC 7.3 on CentOS requires some additional instructions as follows.
+First, install Software Collections on your system that allows you to build, install, and use multiple versions of GCC on the same system without affecting system-wide installed packages. Software Collections is part of the CentOS `extras` repository and can be installed by running the following command:
+
+```shell
+yum install centos-release-scl
+```
+
+After installation of Software Collections, install the Developer Toolset in version 7 and additional packages with the following command line call:
+
+```shell
+yum install devtoolset-7 \
+ devtoolset-7-libatomic-devel \
+ llvm-toolset-7-git-clang-format \
+ fuse-devel \
+ jansson-devel
+```
+
+To access GCC 7.3, you need to launch a new shell instance using the Software Collections `scl` tool:
+
+```shell
+scl enable devtoolset-7 llvm-toolset-7 bash
+```
+
+Now, GCC 7.3 is the default version in your current shell.
+This allows you to build all dnbd3 components on CentOS 7.
+
+
+#### FreeBSD 12.x and 13.x
+```shell
+pkg install git \
+ cmake \
+ gcc \
+ clang-devel \
+ pkgconf \
+ fusefs-libs \
+ jansson \
+ afl \
+ rpm4
+```
+
+
+### Preparation
+Before a build takes place, you should create a `build` directory inside the root folder of the repository. After that, change your working directory to that new directory as follows:
+
+```shell
+mkdir build
+cd build
+```
+
+
+### Configuration
+A build of the dnbd3 components can be configured and customized by the following configuration variables (CMake cache entries):
+
+| Variable | Type | Values | Default value | Description |
+|:-----------------------------|:-------|:----------------------------------------|:--------------------------------------|----------------------------------------------------------------------|
+| `CMAKE_BUILD_TYPE` | STRING | {`Debug`, `Release`} | `Debug` | Build configuration of the dnbd3 project. |
+| `KERNEL_BUILD_DIR` | PATH | {`a` .. `z`, `A` .. `Z`, `/`, `_`, `-`} | /lib/modules/`uname -r`/build | Path to Linux kernel modules to compile against. |
+| `KERNEL_INSTALL_DIR` | PATH | {`a` .. `z`, `A` .. `Z`, `/`, `_`, `-`} | /lib/modules/`uname -r`/extra | Path to install Linux kernel modules. |
+| `KERNEL_SCRIPTS_DIR` | PATH | {`a` .. `z`, `A` .. `Z`, `/`, `_`, `-`} | /lib/modules/`uname -r`/build/scripts | Path to Linux kernel scripts directory. |
+| `DNBD3_KERNEL_MODULE` | OPTION | {`ON`, `OFF`} | `ON` | Build the dnbd3 Linux kernel module. |
+| `DNBD3_BENCHMARK` | OPTION | {`ON`, `OFF`} | `OFF` | Enable build of dnbd3-bench. |
+| `DNBD3_CLIENT_FUSE` | OPTION | {`ON`, `OFF`} | `ON` | Enable build of dnbd3-fuse. |
+| `DNBD3_SERVER` | OPTION | {`ON`, `OFF`} | `ON` | Enable build of dnbd3-server. |
+| `DNBD3_SERVER_FUSE` | OPTION | {`ON`, `OFF`} | `OFF` | Enable FUSE-Integration for dnbd3-server. |
+| `DNBD3_SERVER_AFL` | OPTION | {`ON`, `OFF`} | `OFF` | Build dnbd3-server for usage with afl-fuzz. |
+| `DNBD3_SERVER_DEBUG_LOCKS` | OPTION | {`ON`, `OFF`} | `OFF` | Add lock debugging code to dnbd3-server. |
+| `DNBD3_SERVER_DEBUG_THREADS` | OPTION | {`ON`, `OFF`} | `OFF` | Add thread debugging code to dnbd3-server. |
+| `DNBD3_RELEASE_HARDEN` | OPTION | {`ON`, `OFF`} | `OFF` | Compile dnbd3 programs in Release build with code hardening options. |
+| `DNBD3_PACKAGE_DOCKER` | OPTION | {`ON`, `OFF`} | `OFF` | Enable packaging of Docker image. |
+
+A value from the range of appropriate values can be assigned to each configuration variable by executing CMake once with the following command pattern:
+
+```shell
+cmake -D<VARIABLE>=<VALUE> [-D ...] ../.
+```
+
+> **Note that the default compiler on FreeBSD 12.x and 13.x is clang/llvm and should be changed to gcc by appending the set CMake compiler configuration variable -DCMAKE_C_COMPILER=gcc to the CMake configuration command.**
+
+
+### Cross-Compiling
+With the help of CMake, it is also possible to cross-compile the dnbd3 components for a Linux target architecture other than the compiling Linux host architecture. This repository is shipped with two CMake toolchain files to cross-compile all components for the following two Linux target architectures if necessary.
+
+> **Note that all used header files (eg. Linux kernel headers) and libraries (eg. jansson, fuse) for the target architecture must be installed and set up properly, so that the cross-compiler can find and use them.**
+
+
+#### Cross-Compiling for _powerpc_ Target
+If you want to cross-compile all dnbd3 components for the _powerpc_ Linux target architecture (eg. for a Mac G5), make sure that the `powerpc-linux-gnu-gcc` cross-compiler is installed on your host system. Then, call CMake with the shipped toolchain file for this specific cross-compiler as follows.
+
+```shell
+cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/PowerpcLinuxGnu.cmake [-D ...] ../.
+```
+
+
+#### Cross-Compiling for _aarch64_ Target
+If you want to cross-compile all dnbd3 components for the _aarch64_ Linux target architecture (eg. for a Raspberry Pi 4), make sure that the `aarch64-linux-gnu-gcc` cross-compiler is installed on your host system. Then, call CMake with the shipped toolchain file for this specific cross-compiler as follows.
+
+```shell
+cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/Aarch64LinuxGnu.cmake [-D ...] ../.
+```
+
+
+### Debug
+In the `Debug` build configuration, all dnbd3 components can be built by calling `make`:
+
+```shell
+make
+```
+
+Optionally, the output files can be installed with superuser permissions on the local system using the Makefile target `install`:
+
+```shell
+sudo make install
+sudo depmod -a # only required if DNBD3_KERNEL_MODULE is enabled
+```
+
+
+### Packages
+In the `Release` build configuration, installation packages can be built by calling the make target `package`:
+
+```shell
+make package
+```
+
+This target creates a Debian installation package (\*.deb), a RPM installation package (\*.rpm) and a compressed archive (\*.tar.gz) containing the built dnbd3 components.
+
+
+### Sources
+In the `Release` build configuration, sources can be built by calling the make target `source`:
+
+```shell
+make source
+```
+
+This target creates compressed archives (\*_source.tar.gz and \*_source.zip) containing the source code of this repository for code distribution purposes.
+
+
+### Docker image
+A docker image of the built dnbd3 components can be created in the `Release` build configuration with the option `DNBD3_PACKAGE_DOCKER=ON`, `DNBD3_SERVER=ON` and `DNBD3_KERNEL_MODULE=OFF`. The image is based on Ubuntu 20.04 and a created docker container from it starts the embedded dnbd3-server automatically.
+
+Before the image is built, make sure that your docker daemon runs and you are a member of the `docker` group to access the docker daemon without any super user privileges. Then, build the docker image based on either Ubuntu 20.04 or Archlinux by calling one of the following Make targets:
+
+```
+make docker-ubuntu-20-04
+make docker-archlinux
+```
+
+The built docker image is saved as archive file (\*_ubuntu-20-04_docker.tar) and can be deployed to other machines. On each machine, the created image can be loaded with the following docker client call:
+
+```shell
+docker image load -i *_ubuntu-20-04_docker.tar
+```
+
+After the image is loaded, a docker network needs to be available so that each docker container based on this image can establish a network connection. Therefore, a docker network called `dnbd3` is created with the following docker client call:
+
+```shell
+docker network create --driver=bridge --subnet=192.168.100.0/24 dnbd3
+```
+
+If the network is present, docker containers with a name of form `dnbd3-server<NUMBER>` and an IPv4 address from the network's subnet can be created using docker client calls like the following ones:
+
+```
+docker container create --name dnbd3-server1 --ip 192.168.100.10 --network dnbd3 <IMAGE_TAG>
+docker container create --name dnbd3-server2 --ip 192.168.100.50 --network dnbd3 <IMAGE_TAG>
+docker container create --name dnbd3-server3 --ip 192.168.100.100 --network dnbd3 <IMAGE_TAG>
+docker container create --name dnbd3-server4 --ip 192.168.100.123 --network dnbd3 <IMAGE_TAG>
+```
+
+Note that the image is already tagged with an `IMAGE_TAG` which is set to the current dnbd3 package version number and follows the format `dnbd3:<DNBD3_VERSION>`. The `IMAGE_TAG` can be reused to create a docker container. Finally, each container based on the image can be started with the following docker client call:
+
+```
+docker container start -a dnbd3-server<NUMBER>
+```
+
+
+## Configuration of _dnbd3-server_
+The dnbd3-server is started according to the following command line call.
+
+```shell
+dnbd3-server -c <CONFIG_DIR>
+```
+
+An operation of the dnbd3-server requires a configuration directory to provide proper functionality. The configuration directory should contain two configuration files, namely the _alt-servers_ and the _server.conf_ file.
+
+
+### Configuration file _alt-servers_
+The _alt-servers_ configuration file specifies the list of known alt-servers for the dnbd3-server. The configuration in the file is specified in the INI file format as shown in the following.
+
+```ini
+[Address]
+comment=Whatever
+for=purpose # where purpose is either "client" or "replication"
+namespace=some/path/
+```
+
+All fields in an INI section are optional. If the `for` key is missing, the dnbd3-server will be used for replication and will be propagated to clients that request a list of alt servers. The `namespace` key can be specified multiple times per INI section. If this key is missing, the server will be used for all image names. Otherwise, it will only be used for images whose name starts with one of the given strings.
+
+If the dnbd3-server is not running in proxy mode, this file won't do much.
+
+
+### Configuration file _server.conf_
+The _server.conf_ file is the main configuration file of the dnbd3-server. The configuration in the file is specified in the INI file format as shown in the following.
+
+```ini
+[dnbd3]
+basePath=/srv/openslx/dnbd3 # virtual root of image files
+serverPenalty=1234 # artificial acceptance delay for incoming server connections (µs)
+clientPenalty=2345 # artificial acceptance delay for incoming client connection (µs)
+isProxy=true # enable proxy mode - will try to replicate from alt-servers if a client requests unknown image
+uplinkTimeout=1250 # r/w timeout for connections to uplink servers
+```
+
+
+## Debugging
+Debugging of the Linux kernel modules and the user space utility requires this project to be built in the `Debug` configuration.
+
+### Linux kernel module
+The Linux kernel module **dnbd3** supports the Linux kernel's dynamic debug feature if the Linux kernel is built with the enabled kernel configuration `CONFIG_DYNAMIC_DEBUG`. The dynamic debug feature allows the printing of customizable debug messages into the Linux kernel's message buffer.
+
+Dynamic debug for the modules can be either enabled at module initialization or during operation. At module initialization, dynamic debug can be enabled by modprobe using the "fake" module parameter `dyndbg`:
+
+```shell
+modprobe dnbd3 dyndbg=+pflmt
+```
+
+The module parameter `dyndbg` customizes the debug messages written into the Linux kernel's message buffer. The specific value `+pflmt` enables all debug messages in the source code and includes function name (`f`), line number (`l`), module name (`m`) and thread ID (`t`) for each executed debug statement from the source code.
+
+During operation, debug messages from debug statements in the code can be customized and enabled dynamically as well using the debugfs control file `<DEBUG_FS>/dynamic_debug/control` where `DEBUG_FS` is the mount point of a mounted DebugFS, eg. `/sys/kernel/debug`:
+
+```shell
+echo "module dnbd3 +pflmt" > <DEBUG_FS>/dynamic_debug/control
+```
+
+More information regarding the Linux kernel's dynamic debug feature can be found in the [Linux kernel documentation](https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html).
+
+
+## Development notes
+
+### Code style of source code files
+The code style of all source code files can be checked by calling the make target `lint`:
+
+```shell
+make lint
+```
+
+If some source code files do not meet the project's code style, they can be fixed automatically by calling the make target `lint-fix`:
+
+```shell
+make lint-fix
+```
+
+
+### Resource locking in dnbd3
+The order of acquiring multiple locks is very important, as you'll produce a possible deadlock if you do it in the wrong order. Take very good care of locking order if you have lots of functions that call each other. You might lose track of what's going on.
+
+
+#### dnbd3-fuse
+This is a list of used locks, in the order they have to be acquired if you must hold multiple locks.
+
+```
+mutexInit
+newAltLock
+altLock
+connection.sendMutex
+requests.lock
+```
+
+
+#### dnbd3-server
+This is a list of used locks, in the order they have to be acquired if you must hold multiple locks. Take a look at the lock priority defines in _src/server/locks.h_ for the effective order.
+
+```
+reloadLock
+loadLock
+remoteCloneLock
+_clients_lock
+_clients[].lock
+integrityQueueLock
+imageListLock
+_images[].lock
+uplink.queueLock
+altServersLock
+client.sendMutex
+uplink.rttLock
+uplink.sendMutex
+aclLock
+initLock
+dirLock
+```
+
+If you need to lock multiple clients, images, etc. at once, lock the client with the lowest array index first.
+
+If the program logic would require to acquire the locks in a different order, you have to rework the code. For example, suppose you hold the lock for client 10 and you need to look up some other client. You must not simply fetch the _clients_lock now and then iterate over the clients until you find the one you need, as it violates the above order to first lock on the clients array and then the clients lock. Instead, you need to release client 10's lock, then lock on _clients_lock and iterate over the clients. Now you check if you either encounter the client you originally held the lock on, or the client you are looking for. You immediately lock on those two. You can then release the _clients_lock and work with both clients.
+This described implementation advice is visualized in the following pseudo C code.
+
+```C
+/* client10 is assumed to be a pointer to a client, which happens to be at index 10 */
+lock (client10->lock);
+/* ... */
+/* we need another client */
+unlock(client10->lock);
+
+lock(_clients_lock);
+client clientA = NULL, clientB = NULL;
+for (i = 0; i < _num_clients; ++i) {
+ if (client[i] == client10) {
+ clientA = client[i];
+ lock(clientA.lock);
+ } else if (client[i].something == <whatever>) {
+ clientB = client[i];
+ lock(clientB.lock);
+ }
+}
+unlock(_clients_lock);
+
+if (clientA && clientB) {
+ /* make sure we actually found both */
+ /* do something important with both clients */
+}
+
+if (clientA)
+ unlock(clientA.lock);
+if (clientB)
+ unlock(clientB.lock);
+```
diff --git a/build.sh b/build.sh
deleted file mode 100755
index 6726a86..0000000
--- a/build.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-mkdir -p build
-cd build/
-cmake ..
-make
diff --git a/cmake/Build.cmake b/cmake/Build.cmake
new file mode 100644
index 0000000..a7f4c07
--- /dev/null
+++ b/cmake/Build.cmake
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# Generate the build type header BUILD_OUTPUT_FILE from the template BUILD_INPUT_FILE_TEMPLATE.
+#
+# The generation itself is delegated to ${PROJECT_MODULES_DIR}/GenerateBuild.cmake, which is run
+# in script mode and receives the template path, the output path and CMAKE_BUILD_TYPE.
+# Exposes the custom target dnbd3-generate-build and the interface library dnbd3-build.
+# Reads PROJECT_MODULES_DIR and PROJECT_INCLUDE_GEN_DIR, which are not set in this macro.
+macro(gen_build_type BUILD_INPUT_FILE_TEMPLATE BUILD_OUTPUT_FILE)
+ # only the file name of the output file is used in the status comments below
+ get_filename_component(BUILD_OUTPUT_FILENAME ${BUILD_OUTPUT_FILE} NAME)
+ # command that will trigger a rebuild of build.h every time
+ # (the command only sleeps for 0 seconds and does not create its declared OUTPUT regenerate-build-file)
+ add_custom_command(OUTPUT regenerate-build-file
+ COMMAND ${CMAKE_COMMAND} -E sleep 0
+ COMMENT "Trigger generating ${BUILD_OUTPUT_FILENAME}")
+
+ # call the GenerateBuild.cmake file to generate the build.h file
+ add_custom_command(OUTPUT ${BUILD_OUTPUT_FILE}
+ COMMAND ${CMAKE_COMMAND} -D BUILD_INPUT_FILE_TEMPLATE=${BUILD_INPUT_FILE_TEMPLATE}
+ -D BUILD_OUTPUT_FILE=${BUILD_OUTPUT_FILE}
+ -D BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ -P ${PROJECT_MODULES_DIR}/GenerateBuild.cmake
+ COMMENT "Generating ${BUILD_OUTPUT_FILENAME}"
+ DEPENDS regenerate-build-file)
+ add_custom_target(dnbd3-generate-build DEPENDS ${BUILD_OUTPUT_FILE})
+
+ # create target to expose project build type
+ # (the interface library carries PROJECT_INCLUDE_GEN_DIR as include directory and depends on the generation target)
+ add_library(dnbd3-build INTERFACE)
+ target_include_directories(dnbd3-build INTERFACE ${PROJECT_INCLUDE_GEN_DIR})
+ add_dependencies(dnbd3-build dnbd3-generate-build)
+endmacro(gen_build_type BUILD_INPUT_FILE_TEMPLATE BUILD_OUTPUT_FILE)
diff --git a/cmake/CheckAFLCCompiler.cmake b/cmake/CheckAFLCCompiler.cmake
new file mode 100644
index 0000000..249248b
--- /dev/null
+++ b/cmake/CheckAFLCCompiler.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# check if the corresponding AFL C compiler for the original C compiler is available
+# if an AFL C compiler is available, then the path to the AFL C compiler is returned in AFL_C_COMPILER
+#
+# AFL_C_COMPILER      - name of the variable that is handed to find_program to receive the search result
+# AFL_C_COMPILER_NAME - name of the variable that receives the searched program name (afl-<suffix>)
+# C_COMPILER_PATH     - path of the original C compiler, its file name is only used as suffix if
+#                       C_COMPILER_ID matches neither "GNU" nor "Clang"
+# C_COMPILER_ID       - compiler ID of the original C compiler
+macro(check_afl_c_compiler AFL_C_COMPILER AFL_C_COMPILER_NAME C_COMPILER_PATH C_COMPILER_ID)
+ # determine AFL C compiler suffix from original C compiler ID
+ if(${C_COMPILER_ID} MATCHES "GNU")
+ set(AFL_C_COMPILER_SUFFIX "gcc")
+ elseif(${C_COMPILER_ID} MATCHES "Clang")
+ set(AFL_C_COMPILER_SUFFIX "clang")
+ else(${C_COMPILER_ID} MATCHES "Clang")
+ # unknown compiler ID: fall back to the file name of the original C compiler as suffix
+ get_filename_component(AFL_C_COMPILER_SUFFIX ${C_COMPILER_PATH} NAME)
+ endif(${C_COMPILER_ID} MATCHES "GNU")
+
+ # define search file name and search for AFL C compiler program
+ set(AFL_C_COMPILER_SEARCH_NAME "afl-${AFL_C_COMPILER_SUFFIX}")
+ find_program(${AFL_C_COMPILER} NAMES ${AFL_C_COMPILER_SEARCH_NAME})
+
+ # return the AFL C compiler name to the caller
+ set(${AFL_C_COMPILER_NAME} ${AFL_C_COMPILER_SEARCH_NAME})
+endmacro(check_afl_c_compiler) \ No newline at end of file
diff --git a/cmake/DockerImage.cmake b/cmake/DockerImage.cmake
new file mode 100644
index 0000000..83f4b9d
--- /dev/null
+++ b/cmake/DockerImage.cmake
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# create a pseudo target to do packaging before docker image is built
+add_custom_target(package_docker
+ COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target package
+ VERBATIM)
+
+# macro to build a docker image based on a provided Dockerfile and an installation package
+macro(add_docker_image TARGET_NAME DOCKER_IMAGE DOCKER_FILE DOCKER_TAG PACKAGE_FILE BUILD_DIR)
+ get_filename_component(PACKAGE_FILE_PATH ${PACKAGE_FILE} PATH)
+ get_filename_component(PACKAGE_FILE_NAME ${PACKAGE_FILE} NAME)
+
+ # commands and target to build docker image
+ add_custom_command(OUTPUT ${DOCKER_IMAGE}
+ COMMAND docker image build -t ${DOCKER_TAG} --file ${DOCKER_FILE} --build-arg DNBD3_PACKAGE_FILE_NAME=${PACKAGE_FILE_NAME} ${BUILD_DIR}
+ COMMAND docker image save -o ${DOCKER_IMAGE} ${DOCKER_TAG}
+ COMMAND docker image rm ${DOCKER_TAG}
+ DEPENDS ${DOCKER_FILE}
+ package_docker)
+ add_custom_target(${TARGET_NAME}
+ DEPENDS ${DOCKER_IMAGE})
+endmacro(add_docker_image TARGET_NAME DOCKER_IMAGE DOCKER_FILE PACKAGE_FILE)
diff --git a/cmake/FindCheckPatch.cmake b/cmake/FindCheckPatch.cmake
new file mode 100644
index 0000000..8454e6b
--- /dev/null
+++ b/cmake/FindCheckPatch.cmake
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# check if custom Linux kernel script directory was specified
+if(NOT KERNEL_SCRIPTS_DIR)
+ set(KERNEL_SCRIPTS_DIR "${KERNEL_BUILD_DIR}/scripts"
+ CACHE PATH "Path to Linux kernel scripts directory")
+endif(NOT KERNEL_SCRIPTS_DIR)
+
+# find the checkpatch.pl script in the given KERNEL_SCRIPTS_DIR
+find_program(CheckPatch_EXECUTABLE
+ NAMES checkpatch.pl
+ PATHS ${KERNEL_SCRIPTS_DIR})
+
+
+# get the checkpatch.pl version (left empty if the output contains no 'Version:' entry)
+if(CheckPatch_EXECUTABLE)
+ execute_process(COMMAND ${CheckPatch_EXECUTABLE} --version
+ OUTPUT_VARIABLE CheckPatch_VERBOSE_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+ string(REGEX MATCH "Version:.([0-9]+\\.[0-9]+)" CheckPatch_VERSION_MATCH "${CheckPatch_VERBOSE_VERSION}")
+ set(CheckPatch_VERSION "${CMAKE_MATCH_1}")
+endif(CheckPatch_EXECUTABLE)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CheckPatch
+ FOUND_VAR CheckPatch_FOUND
+ REQUIRED_VARS CheckPatch_EXECUTABLE
+ VERSION_VAR CheckPatch_VERSION
+ FAIL_MESSAGE "checkpatch.pl is not available! Please install checkpatch.pl to lint and format the source code!")
diff --git a/cmake/FindClangFormat.cmake b/cmake/FindClangFormat.cmake
new file mode 100644
index 0000000..a6c77d4
--- /dev/null
+++ b/cmake/FindClangFormat.cmake
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_program(ClangFormat_EXECUTABLE NAMES clang-format)
+
+if(ClangFormat_EXECUTABLE)
+ # query the executable that was actually found; it need not be the first clang-format in PATH
+ execute_process(COMMAND ${ClangFormat_EXECUTABLE} --version
+ OUTPUT_VARIABLE ClangFormat_VERBOSE_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" ClangFormat_VERSION "${ClangFormat_VERBOSE_VERSION}")
+endif(ClangFormat_EXECUTABLE)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ClangFormat
+ FOUND_VAR ClangFormat_FOUND
+ REQUIRED_VARS ClangFormat_EXECUTABLE
+ VERSION_VAR ClangFormat_VERSION
+ FAIL_MESSAGE "clang-format is not available! Please install clang-format to lint and format the source code!")
diff --git a/cmake/FindDocker.cmake b/cmake/FindDocker.cmake
new file mode 100644
index 0000000..ef3046d
--- /dev/null
+++ b/cmake/FindDocker.cmake
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_program(Docker_EXECUTABLE NAMES docker)
+
+if(Docker_EXECUTABLE)
+ # query the executable that was actually found; it need not be the first docker in PATH
+ execute_process(COMMAND ${Docker_EXECUTABLE} version --format "{{.Server.Version}}"
+ OUTPUT_VARIABLE Docker_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif(Docker_EXECUTABLE)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Docker
+ FOUND_VAR Docker_FOUND
+ REQUIRED_VARS Docker_EXECUTABLE
+ VERSION_VAR Docker_VERSION
+ FAIL_MESSAGE "Docker is not available! Please install docker to build and run containers!")
+
diff --git a/cmake/FindFuse.cmake b/cmake/FindFuse.cmake
index b9c6f91..09e8ba0 100644
--- a/cmake/FindFuse.cmake
+++ b/cmake/FindFuse.cmake
@@ -1,30 +1,167 @@
-# - Find fuse
-# Find the native fuse includes and library
+# This module can find FUSE Library
#
-# FUSE_INCLUDE_DIR - where to find fuse/fuse.h.
-# FUSE_LIBRARIES - List of libraries when using fuse.
-# FUSE_FOUND - True if fuse found.
+# Requirements:
+# - CMake >= 2.8.3
+#
+# The following variables will be defined for your use:
+# - FUSE_FOUND : was FUSE found?
+# - FUSE_INCLUDE_DIRS : FUSE include directory
+# - FUSE_LIBRARIES : FUSE library
+# - FUSE_DEFINITIONS : FUSE cflags
+# - FUSE_VERSION : complete version of FUSE (major.minor)
+# - FUSE_MAJOR_VERSION : major version of FUSE
+# - FUSE_MINOR_VERSION : minor version of FUSE
+#
+# Example Usage:
+#
+# 1. Copy this file in the root of your project source directory
+# 2. Then, tell CMake to search this non-standard module in your project directory by adding to your CMakeLists.txt:
+# set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+# 3. Finally call find_package() once, here are some examples to pick from
+#
+# Require FUSE 2.6 or later
+# find_package(Fuse 2.6 REQUIRED)
+#
+# if(FUSE_FOUND)
+# add_definitions(${FUSE_DEFINITIONS})
+# include_directories(${FUSE_INCLUDE_DIRS})
+# add_executable(myapp myapp.c)
+# target_link_libraries(myapp ${FUSE_LIBRARIES})
+# endif()
+
+#=============================================================================
+# Copyright (c) 2012, julp
+#
+# Distributed under the OSI-approved BSD License
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#=============================================================================
+
+cmake_minimum_required(VERSION 2.8.12)
+
+########## Private ##########
+function(fusedebug _varname)
+ if(FUSE_DEBUG)
+ message("${_varname} = ${${_varname}}")
+ endif(FUSE_DEBUG)
+endfunction(fusedebug)
+
+########## Public ##########
+set(FUSE_FOUND TRUE)
+set(FUSE_LIBRARIES )
+set(FUSE_DEFINITIONS )
+set(FUSE_INCLUDE_DIRS )
+
+find_package(PkgConfig QUIET)
+
+set(PC_FUSE_INCLUDE_DIRS "/usr/include/fuse")
+set(PC_FUSE_LIBRARY_DIRS )
+if(PKG_CONFIG_FOUND)
+ pkg_check_modules(PC_FUSE "fuse" QUIET)
+ if(PC_FUSE_FOUND)
+# fusedebug(PC_FUSE_LIBRARIES)
+# fusedebug(PC_FUSE_LIBRARY_DIRS)
+# fusedebug(PC_FUSE_LDFLAGS)
+# fusedebug(PC_FUSE_LDFLAGS_OTHER)
+# fusedebug(PC_FUSE_INCLUDE_DIRS)
+# fusedebug(PC_FUSE_CFLAGS)
+# fusedebug(PC_FUSE_CFLAGS_OTHER)
+ set(FUSE_DEFINITIONS "${PC_FUSE_CFLAGS_OTHER}")
+ endif(PC_FUSE_FOUND)
+endif(PKG_CONFIG_FOUND)
+
+find_path(
+ FUSE_INCLUDE_DIRS
+ NAMES fuse_common.h fuse_lowlevel.h fuse.h
+ PATHS "${PC_FUSE_INCLUDE_DIRS}"
+ DOC "Include directories for FUSE"
+)
+
+if(NOT FUSE_INCLUDE_DIRS)
+ set(FUSE_FOUND FALSE)
+endif(NOT FUSE_INCLUDE_DIRS)
+
+find_library(
+ FUSE_LIBRARIES
+ NAMES "fuse"
+ PATHS "${PC_FUSE_LIBRARY_DIRS}"
+ DOC "Libraries for FUSE"
+)
+if(NOT FUSE_LIBRARIES)
+ set(FUSE_FOUND FALSE)
+endif(NOT FUSE_LIBRARIES)
-IF (FUSE_INCLUDE_DIR)
- # Already in cache, be silent
- SET(FUSE_FIND_QUIETLY TRUE)
-ENDIF (FUSE_INCLUDE_DIR)
+if(FUSE_FOUND)
+ if(EXISTS "${FUSE_INCLUDE_DIRS}/fuse_common.h")
+ file(READ "${FUSE_INCLUDE_DIRS}/fuse_common.h" _contents)
+ string(REGEX REPLACE ".*# *define *FUSE_MAJOR_VERSION *([0-9]+).*" "\\1" FUSE_MAJOR_VERSION "${_contents}")
+ string(REGEX REPLACE ".*# *define *FUSE_MINOR_VERSION *([0-9]+).*" "\\1" FUSE_MINOR_VERSION "${_contents}")
+ set(FUSE_VERSION "${FUSE_MAJOR_VERSION}.${FUSE_MINOR_VERSION}")
+ endif()
-FIND_PATH(FUSE_INCLUDE_DIR fuse/fuse.h)
+ include(CheckCSourceCompiles)
+ # Backup CMAKE_REQUIRED_*
+ set(OLD_CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}")
+ set(OLD_CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+ set(OLD_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
+ # Add FUSE compilation flags
+ set(CMAKE_REQUIRED_INCLUDES "${CMAKE_REQUIRED_INCLUDES}" "${FUSE_INCLUDE_DIRS}")
+ set(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}" "${FUSE_LIBRARIES}")
+ set(CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}" "${FUSE_DEFINITIONS}")
+ check_c_source_compiles("#include <stdlib.h>
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
-SET(FUSE_NAMES fuse)
-FIND_LIBRARY(FUSE_LIBRARY NAMES ${FUSE_NAMES} )
+int main(void) {
+return 0;
+}" FUSE_CFLAGS_CHECK)
+ if(NOT FUSE_CFLAGS_CHECK)
+ set(FUSE_DEFINITIONS "-D_FILE_OFFSET_BITS=64")
+ # Should we run again previous test to assume the failure was due to missing definition -D_FILE_OFFSET_BITS=64?
+ endif(NOT FUSE_CFLAGS_CHECK)
+ # Restore CMAKE_REQUIRED_*
+ set(CMAKE_REQUIRED_INCLUDES "${OLD_CMAKE_REQUIRED_INCLUDES}")
+ set(CMAKE_REQUIRED_LIBRARIES "${OLD_CMAKE_REQUIRED_LIBRARIES}")
+ set(CMAKE_REQUIRED_DEFINITIONS "${OLD_CMAKE_REQUIRED_DEFINITIONS}")
+endif(FUSE_FOUND)
-# handle the QUIETLY and REQUIRED arguments and set FUSE_FOUND to TRUE if
-# all listed variables are TRUE
-INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(FUSE REQUIRED FUSE_LIBRARY FUSE_INCLUDE_DIR)
+# find_package(Fuse) sets Fuse_FIND_REQUIRED/Fuse_FIND_QUIETLY; the upper-case names are kept for compatibility
+include(FindPackageHandleStandardArgs)
+if(FUSE_INCLUDE_DIRS)
+ if((Fuse_FIND_REQUIRED OR FUSE_FIND_REQUIRED) AND NOT (Fuse_FIND_QUIETLY OR FUSE_FIND_QUIETLY))
+ find_package_handle_standard_args(Fuse REQUIRED_VARS FUSE_LIBRARIES FUSE_INCLUDE_DIRS VERSION_VAR FUSE_VERSION)
+ else()
+ find_package_handle_standard_args(Fuse "FUSE not found" FUSE_LIBRARIES FUSE_INCLUDE_DIRS)
+ endif()
+elseif((Fuse_FIND_REQUIRED OR FUSE_FIND_REQUIRED) AND NOT (Fuse_FIND_QUIETLY OR FUSE_FIND_QUIETLY))
+ # headers are missing: fail hard only for a non-quiet REQUIRED lookup
+ message(FATAL_ERROR "Could not find FUSE include directory")
+endif()
-IF(FUSE_FOUND)
- SET( FUSE_LIBRARIES ${FUSE_LIBRARY} )
-ELSE(FUSE_FOUND)
- SET( FUSE_LIBRARIES )
-ENDIF(FUSE_FOUND)
+mark_as_advanced(
+ FUSE_INCLUDE_DIRS
+ FUSE_LIBRARIES
+)
-MARK_AS_ADVANCED( FUSE_LIBRARY FUSE_INCLUDE_DIR )
+# IN (args)
+fusedebug("FUSE_FIND_COMPONENTS")
+fusedebug("FUSE_FIND_REQUIRED")
+fusedebug("FUSE_FIND_QUIETLY")
+fusedebug("FUSE_FIND_VERSION")
+# OUT
+# Found
+fusedebug("FUSE_FOUND")
+# Definitions
+fusedebug("FUSE_DEFINITIONS")
+# Linking
+fusedebug("FUSE_INCLUDE_DIRS")
+fusedebug("FUSE_LIBRARIES")
+# Version
+fusedebug("FUSE_MAJOR_VERSION")
+fusedebug("FUSE_MINOR_VERSION")
+fusedebug("FUSE_VERSION")
diff --git a/cmake/FindKernelHeaders.cmake b/cmake/FindKernelHeaders.cmake
new file mode 100644
index 0000000..c04243e
--- /dev/null
+++ b/cmake/FindKernelHeaders.cmake
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# check if custom Linux kernel build directory was specified
+if(NOT KERNEL_BUILD_DIR)
+ set(KERNEL_BUILD_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build"
+ CACHE PATH "Path to Linux kernel modules to compile against")
+endif(NOT KERNEL_BUILD_DIR)
+
+# check if custom Linux kernel output directory was specified
+if(NOT KERNEL_INSTALL_DIR)
+ set(KERNEL_INSTALL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/extra"
+ CACHE PATH "Path to install Linux kernel modules")
+endif(NOT KERNEL_INSTALL_DIR)
+
+if(NOT EXISTS "${KERNEL_BUILD_DIR}/Module.symvers")
+ message(WARNING "\n\nModule.symvers not found in ${KERNEL_BUILD_DIR}\n"
+ "Your kernel sources don't seem to belong to a built kernel,"
+ " expect missing symbols when building kernel module.\n\n")
+endif()
+
+# find the Linux kernel headers from given KERNEL_BUILD_DIR
+find_path(KernelHeaders_INCLUDE_DIR
+ NAMES linux/kernel.h
+ linux/module.h
+ generated/utsrelease.h
+ PATHS ${KERNEL_BUILD_DIR}/include
+ NO_DEFAULT_PATH)
+
+# get Linux kernel headers version (empty results are detected with STREQUAL; EQUAL compares numbers only)
+file(READ "${KERNEL_BUILD_DIR}/include/generated/utsrelease.h" tmpvar)
+string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${tmpvar}")
+if("${KernelHeaders_VERSION}" STREQUAL "" AND EXISTS "${KERNEL_BUILD_DIR}/include/config/kernel.release")
+ file(READ "${KERNEL_BUILD_DIR}/include/config/kernel.release" tmpvar)
+ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${tmpvar}")
+endif()
+if("${KernelHeaders_VERSION}" STREQUAL "")
+ string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${KernelHeaders_INCLUDE_DIR}")
+endif()
+if("${KernelHeaders_VERSION}" STREQUAL "")
+ message(FATAL_ERROR "Cannot determine kernel version")
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(KernelHeaders
+ FOUND_VAR KernelHeaders_FOUND
+ REQUIRED_VARS KernelHeaders_INCLUDE_DIR
+ VERSION_VAR KernelHeaders_VERSION
+ FAIL_MESSAGE "Linux kernel headers are not available! Please install them to build kernel modules!")
+
+mark_as_advanced(KernelHeaders_INCLUDE_DIR KernelHeaders_MODULE_INSTALL_DIR)
+
+# print found information
+if(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
+ message(VERBOSE "KERNEL_BUILD_DIR: ${KERNEL_BUILD_DIR}")
+ message(VERBOSE "KERNEL_INSTALL_DIR: ${KERNEL_INSTALL_DIR}")
+ message(VERBOSE "KernelHeaders_FOUND: ${KernelHeaders_FOUND}")
+ message(VERBOSE "KernelHeaders_VERSION: ${KernelHeaders_VERSION}")
+endif(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
diff --git a/cmake/FindLibatomic.cmake b/cmake/FindLibatomic.cmake
new file mode 100644
index 0000000..e1c4915
--- /dev/null
+++ b/cmake/FindLibatomic.cmake
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# Use pkg-config to get the directories and then use these values
+# in the FIND_PATH() and FIND_LIBRARY() calls
+find_package(PkgConfig QUIET)
+pkg_check_modules(PKG_Libatomic QUIET libatomic)
+
+set(Libatomic_COMPILE_OPTIONS ${PKG_Libatomic_CFLAGS_OTHER})
+set(Libatomic_VERSION ${PKG_Libatomic_VERSION})
+
+find_library(Libatomic_LIBRARY
+ NAMES atomic
+ HINTS ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}
+ ${PKG_Libatomic_LIBRARY_DIRS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Libatomic
+ FOUND_VAR Libatomic_FOUND
+ REQUIRED_VARS Libatomic_LIBRARY
+ VERSION_VAR Libatomic_VERSION
+ FAIL_MESSAGE "Library 'atomic' is not available! Please install this required library!")
+
+if(Libatomic_FOUND AND NOT TARGET Libatomic::Libatomic)
+ add_library(Libatomic::Libatomic UNKNOWN IMPORTED)
+ set_target_properties(Libatomic::Libatomic PROPERTIES
+ IMPORTED_LOCATION "${Libatomic_LIBRARY}"
+ INTERFACE_COMPILE_OPTIONS "${Libatomic_COMPILE_OPTIONS}")
+endif(Libatomic_FOUND AND NOT TARGET Libatomic::Libatomic)
+
+mark_as_advanced(Libatomic_LIBRARY)
+
+if(Libatomic_FOUND)
+ set(Libatomic_LIBRARIES ${Libatomic_LIBRARY})
+endif(Libatomic_FOUND)
+
+# print found information
+if(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
+ message(VERBOSE "Libatomic_FOUND: ${Libatomic_FOUND}")
+ message(VERBOSE "Libatomic_VERSION: ${Libatomic_VERSION}")
+ message(VERBOSE "Libatomic_COMPILE_OPTIONS: ${Libatomic_COMPILE_OPTIONS}")
+ message(VERBOSE "Libatomic_LIBRARIES: ${Libatomic_LIBRARIES}")
+endif(${CMAKE_VERSION} VERSION_GREATER "3.15.0")
diff --git a/cmake/FindStdatomic.cmake b/cmake/FindStdatomic.cmake
new file mode 100644
index 0000000..d7ee9b8
--- /dev/null
+++ b/cmake/FindStdatomic.cmake
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_file(Stdatomic_INCLUDE_FILE
+ NAMES stdatomic.h
+ HINTS ${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Stdatomic
+ FOUND_VAR Stdatomic_FOUND
+ REQUIRED_VARS Stdatomic_INCLUDE_FILE
+ FAIL_MESSAGE "Compiler does not support atomic operations!")
diff --git a/cmake/GenerateBuild.cmake b/cmake/GenerateBuild.cmake
new file mode 100644
index 0000000..96b2906
--- /dev/null
+++ b/cmake/GenerateBuild.cmake
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# set current build type of the project
+set(DNBD3_BUILD ${BUILD_TYPE})
+string(TIMESTAMP DNBD3_BUILD_DATE "%Y-%m-%d" UTC)
+
+# write dnbd3 build type into a new C source file based on the specified build file template
+configure_file(${BUILD_INPUT_FILE_TEMPLATE} ${BUILD_OUTPUT_FILE})
diff --git a/cmake/GenerateVersion.cmake b/cmake/GenerateVersion.cmake
new file mode 100644
index 0000000..b7579bc
--- /dev/null
+++ b/cmake/GenerateVersion.cmake
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# set CMake module path to include version macros
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+ ${VERSION_MODULE_PATH})
+
+# include version macros
+include(Version)
+
+# get Git version of Git repository (arguments are quoted so that an empty value cannot shift the macro parameters)
+get_repository_version(DNBD3_VERSION DNBD3_VERSION_SHORT DNBD3_BRANCH "${VERSION_INPUT_FILE}" "${VERSION_BUILD_TYPE}" "${GIT_EXECUTABLE}" "${REPOSITORY_DIR}")
+
+# generate version header if header does not exist
+if(NOT EXISTS "${VERSION_INPUT_FILE}")
+ # write dnbd3 version into a new C source file based on the specified version template
+ configure_file("${VERSION_INPUT_FILE_TEMPLATE}" "${VERSION_OUTPUT_FILE}")
+endif()
diff --git a/cmake/InstallVersionFile.cmake.in b/cmake/InstallVersionFile.cmake.in
new file mode 100644
index 0000000..8121c25
--- /dev/null
+++ b/cmake/InstallVersionFile.cmake.in
@@ -0,0 +1,8 @@
+#
+# AUTOGENERATED: DO NOT EDIT THIS FILE
+#
+
+if(CPACK_SOURCE_INSTALLED_DIRECTORIES AND EXISTS "@INCLUDE_VERSION_HEADER_GENERATE@")
+ file(INSTALL "@INCLUDE_VERSION_HEADER_GENERATE@"
+ DESTINATION "@INCLUDE_VERSION_HEADER_GENERATE_PREFIX@")
+endif(CPACK_SOURCE_INSTALLED_DIRECTORIES AND EXISTS "@INCLUDE_VERSION_HEADER_GENERATE@")
diff --git a/cmake/Kernel.cmake b/cmake/Kernel.cmake
new file mode 100644
index 0000000..9ecbbba
--- /dev/null
+++ b/cmake/Kernel.cmake
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMake macros to build and install Linux kernel modules
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+# macro to define kernel module targets; an optional 8th argument names another module target whose Module.symvers is passed as KBUILD_EXTRA_SYMBOLS
+macro(add_kernel_module MODULE_NAME KERNEL_BUILD_DIR KERNEL_INSTALL_DIR MODULE_MACRO MODULE_SOURCE_FILES MODULE_HEADER_FILES BUILD_SOURCE_FILE)
+ # create directory for kernel module
+ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME})
+ # copy build source file
+ get_filename_component(BUILD_SOURCE_FILENAME ${BUILD_SOURCE_FILE} NAME)
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${BUILD_SOURCE_FILENAME}
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${BUILD_SOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ DEPENDS ${BUILD_SOURCE_FILE})
+ set(BUILD_SOURCE_FILE_PREPARED ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${BUILD_SOURCE_FILENAME})
+ # copy source files (the list is reset first because macro variables persist in the caller's scope)
+ set(MODULE_SOURCE_FILES_PREPARED "")
+ foreach(MODULE_SOURCE_FILE ${MODULE_SOURCE_FILES})
+ get_filename_component(MODULE_SOURCE_FILENAME ${MODULE_SOURCE_FILE} NAME)
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_SOURCE_FILENAME}
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${MODULE_SOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ DEPENDS ${MODULE_SOURCE_FILE})
+ list(APPEND MODULE_SOURCE_FILES_PREPARED ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_SOURCE_FILENAME})
+ endforeach()
+ # copy header files (list reset for the same reason as above)
+ set(MODULE_HEADER_FILES_PREPARED "")
+ foreach(MODULE_HEADER_FILE ${MODULE_HEADER_FILES})
+ get_filename_component(MODULE_HEADER_FILENAME ${MODULE_HEADER_FILE} NAME)
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_HEADER_FILENAME}
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${MODULE_HEADER_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ DEPENDS ${MODULE_HEADER_FILE})
+ list(APPEND MODULE_HEADER_FILES_PREPARED ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_HEADER_FILENAME})
+ endforeach()
+ # check if module depends on another module (optional 8th argument); reset so an earlier call cannot leak its value
+ set(MODULE_EXTRA_SYMBOLS "")
+ if(NOT "${ARGV7}" STREQUAL "")
+ set(MODULE_EXTRA_SYMBOLS ${CMAKE_CURRENT_BINARY_DIR}/${ARGV7}/Module.symvers)
+ endif()
+ # define build command
+ set(MODULE_BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${MODULE_MACRO}
+ -C ${KERNEL_BUILD_DIR}
+ M=${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME} modules
+ EXTRA_CFLAGS=${KERNEL_C_FLAGS}
+ KBUILD_MODPOST_WARN=1
+ KBUILD_EXTRA_SYMBOLS=${MODULE_EXTRA_SYMBOLS})
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_NAME}.ko
+ COMMAND ${MODULE_BUILD_COMMAND}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}
+ COMMENT "Build kernel module ${MODULE_NAME}"
+ DEPENDS ${BUILD_SOURCE_FILE_PREPARED} ${MODULE_HEADER_FILES_PREPARED} ${MODULE_SOURCE_FILES_PREPARED}
+ VERBATIM)
+ add_custom_target(${MODULE_NAME} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_NAME}.ko ${ARGV7})
+ # install kernel module
+ install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${MODULE_NAME}/${MODULE_NAME}.ko
+ DESTINATION ${KERNEL_INSTALL_DIR}
+ PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ COMPONENT kernel)
+endmacro(add_kernel_module)
diff --git a/cmake/Lint.cmake b/cmake/Lint.cmake
new file mode 100644
index 0000000..4453fe3
--- /dev/null
+++ b/cmake/Lint.cmake
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMake macros to check style of source code files
+# Copyright (C) 2021 Manuel Bentele <development@manuel-bentele.de>
+#
+
+find_package(ClangFormat)
+find_package(CheckPatch)
+
+if(ClangFormat_FOUND OR CheckPatch_FOUND)
+ # add target to trigger all linter targets
+ add_custom_target(lint)
+endif(ClangFormat_FOUND OR CheckPatch_FOUND)
+
+# macro to define lint targets
+macro(add_linter LINT_NAME LINT_SOURCE_FILES)
+ if(ClangFormat_FOUND)
+ add_custom_target(${LINT_NAME}
+ COMMAND ${ClangFormat_EXECUTABLE} --Werror --dry-run ${LINT_SOURCE_FILES} ${ARGN}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_SOURCE_FILES} ${ARGN})
+ add_dependencies(lint ${LINT_NAME})
+ endif(ClangFormat_FOUND)
+endmacro(add_linter)
+
+# macro to define lint targets for kernel source code files
+macro(add_kernel_linter LINT_NAME LINT_IGNORE_OPTIONS LINT_SOURCE_FILES LINT_HEADER_FILES)
+ if(CheckPatch_FOUND)
+ set(LINT_IGNORE_ARGS "")
+ foreach(IGNORE_OPTION ${LINT_IGNORE_OPTIONS})
+ list(APPEND LINT_IGNORE_ARGS "--ignore" "${IGNORE_OPTION}")
+ endforeach(IGNORE_OPTION ${LINT_IGNORE_OPTIONS})
+ add_custom_target(${LINT_NAME}
+ COMMAND ${CheckPatch_EXECUTABLE} --no-tree --max-line-length=120 ${LINT_IGNORE_ARGS} -f ${LINT_SOURCE_FILES} ${LINT_HEADER_FILES}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_SOURCE_FILES} ${LINT_HEADER_FILES})
+ add_dependencies(lint ${LINT_NAME})
+ endif(CheckPatch_FOUND)
+endmacro(add_kernel_linter)
+
+if(ClangFormat_FOUND OR CheckPatch_FOUND)
+ # add target to trigger all formatter targets
+ add_custom_target(lint-fix)
+endif(ClangFormat_FOUND OR CheckPatch_FOUND)
+
+# macro to define formatter targets
+macro(add_linter_fix LINT_FIX_NAME LINT_FIX_SOURCE_FILES)
+ if(ClangFormat_FOUND)
+ add_custom_target(${LINT_FIX_NAME}
+ COMMAND ${ClangFormat_EXECUTABLE} --Werror -i ${LINT_FIX_SOURCE_FILES} ${ARGN}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_FIX_SOURCE_FILES} ${ARGN})
+ add_dependencies(lint-fix ${LINT_FIX_NAME})
+ endif(ClangFormat_FOUND)
+endmacro(add_linter_fix)
+
+# macro to define formatter targets for kernel source code files
+macro(add_kernel_linter_fix LINT_FIX_NAME LINT_FIX_IGNORE_OPTIONS LINT_FIX_SOURCE_FILES LINT_FIX_HEADER_FILES)
+ if(CheckPatch_FOUND)
+ set(LINT_FIX_IGNORE_ARGS "")
+ foreach(IGNORE_OPTION ${LINT_FIX_IGNORE_OPTIONS})
+ list(APPEND LINT_FIX_IGNORE_ARGS "--ignore" "${IGNORE_OPTION}")
+ endforeach(IGNORE_OPTION ${LINT_FIX_IGNORE_OPTIONS})
+ add_custom_target(${LINT_FIX_NAME}
+ COMMAND ${CheckPatch_EXECUTABLE} --no-tree --max-line-length=120 ${LINT_FIX_IGNORE_ARGS} --fix-inplace -f ${LINT_FIX_SOURCE_FILES} ${LINT_FIX_HEADER_FILES}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ DEPENDS ${LINT_FIX_SOURCE_FILES} ${LINT_FIX_HEADER_FILES})
+ add_dependencies(lint-fix ${LINT_FIX_NAME})
+ endif(CheckPatch_FOUND)
+endmacro(add_kernel_linter_fix)
diff --git a/cmake/PostVersionPackaging.cmake b/cmake/PostVersionPackaging.cmake
new file mode 100644
index 0000000..877cd12
--- /dev/null
+++ b/cmake/PostVersionPackaging.cmake
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+if(EXISTS ${VERSION_HEADER_INPUT_FILE})
+ # remove version.h if generated version.h is available from a Git build
+ file(REMOVE ${VERSION_HEADER_OUTPUT_FILE})
+endif(EXISTS ${VERSION_HEADER_INPUT_FILE})
diff --git a/cmake/PreVersionPackaging.cmake b/cmake/PreVersionPackaging.cmake
new file mode 100644
index 0000000..e960155
--- /dev/null
+++ b/cmake/PreVersionPackaging.cmake
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+if(EXISTS ${VERSION_HEADER_INPUT_FILE})
+ # copy generated version.h into the source repository for packaging purposes
+ get_filename_component(VERSION_HEADER_OUTPUT_FILE_PATH ${VERSION_HEADER_OUTPUT_FILE} PATH)
+ file(COPY ${VERSION_HEADER_INPUT_FILE}
+ DESTINATION ${VERSION_HEADER_OUTPUT_FILE_PATH})
+endif(EXISTS ${VERSION_HEADER_INPUT_FILE})
diff --git a/cmake/Version.cmake b/cmake/Version.cmake
new file mode 100644
index 0000000..0f26944
--- /dev/null
+++ b/cmake/Version.cmake
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+macro(gen_project_version VERSION_INPUT_FILE VERSION_INPUT_FILE_TEMPLATE VERSION_OUTPUT_FILE GIT_EXECUTABLE REPOSITORY_DIR)
+ get_filename_component(VERSION_OUTPUT_FILENAME ${VERSION_OUTPUT_FILE} NAME)
+ # pseudo output the generation command depends on; 'sleep 0' never creates it, presumably so the version file is regenerated on every build
+ add_custom_command(OUTPUT regenerate-version-file
+ COMMAND ${CMAKE_COMMAND} -E sleep 0
+ COMMENT "Trigger generating ${VERSION_OUTPUT_FILENAME}")
+
+ # run GenerateVersion.cmake in script mode (-P) to generate the version output file
+ add_custom_command(OUTPUT ${VERSION_OUTPUT_FILE}
+ COMMAND ${CMAKE_COMMAND} -D VERSION_MODULE_PATH=${PROJECT_MODULES_DIR}
+ -D VERSION_INPUT_FILE=${VERSION_INPUT_FILE}
+ -D VERSION_INPUT_FILE_TEMPLATE=${VERSION_INPUT_FILE_TEMPLATE}
+ -D VERSION_OUTPUT_FILE=${VERSION_OUTPUT_FILE}
+ -D VERSION_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ -D GIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -D REPOSITORY_DIR=${REPOSITORY_DIR}
+ -P ${PROJECT_MODULES_DIR}/GenerateVersion.cmake
+ COMMENT "Generating ${VERSION_OUTPUT_FILENAME}"
+ DEPENDS regenerate-version-file)
+ add_custom_target(dnbd3-generate-version DEPENDS ${VERSION_OUTPUT_FILE})
+
+ # interface library that exposes the generated-header include directory and depends on the generation target
+ add_library(dnbd3-version INTERFACE)
+ target_include_directories(dnbd3-version INTERFACE ${PROJECT_INCLUDE_GEN_DIR})
+ add_dependencies(dnbd3-version dnbd3-generate-version)
+endmacro(gen_project_version VERSION_INPUT_FILE VERSION_INPUT_FILE_TEMPLATE VERSION_OUTPUT_FILE)
+
+# macro to get Git version information (from a packaged version header if present, otherwise from the Git repository)
+macro(get_repository_version REPOSITORY_VERSION REPOSITORY_VERSION_SHORT REPOSITORY_BRANCH VERSION_HEADER_FILE VERSION_BUILD_TYPE GIT_EXECUTABLE REPOSITORY_DIR)
+ # set empty Git version information
+ set(GIT_VERSION "")
+ # set empty Git branch information
+ set(GIT_BRANCH "")
+
+ # check if generated version header from source package is available
+ if(EXISTS "${VERSION_HEADER_FILE}")
+ # get version information from the generated version header of the source package
+ file(READ "${VERSION_HEADER_FILE}" GIT_VERSION_VERBOSE)
+ string(REGEX MATCH "DNBD3_VERSION[ \t]+\"([0-9][A-Za-z0-9.+~-]*)\"" GIT_VERSION "${GIT_VERSION_VERBOSE}")
+ set(GIT_VERSION "${CMAKE_MATCH_1}")
+
+ # get branch information from the generated version header (branch names need not start with a digit)
+ file(READ "${VERSION_HEADER_FILE}" GIT_BRANCH_VERBOSE)
+ string(REGEX MATCH "DNBD3_BRANCH[ \t]+\"([^\"]*)\"" GIT_BRANCH "${GIT_BRANCH_VERBOSE}")
+ set(GIT_BRANCH "${CMAKE_MATCH_1}")
+ else()
+ # get detailed Git version information from Git repository
+ execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags HEAD
+ WORKING_DIRECTORY ${REPOSITORY_DIR}
+ OUTPUT_VARIABLE GIT_VERSION_VERBOSE
+ RESULT_VARIABLE GIT_RETURN_CODE
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # parse version information from repository if Git command succeeds
+ if(GIT_RETURN_CODE EQUAL 0)
+ # skip any leading non-digit characters of the tag (e.g. 'v') to satisfy packaging rules
+ string(REGEX MATCH "([0-9]+:)?[0-9][A-Za-z0-9.+~-]*" GIT_VERSION "${GIT_VERSION_VERBOSE}")
+ endif()
+
+ # overwrite version from Git if version is unknown
+ if(GIT_VERSION STREQUAL "")
+ # overwrite version information with unknown version '0.0'
+ set(GIT_VERSION "0.0")
+
+ # print a message in Release build configuration to warn about the unknown version
+ if("${VERSION_BUILD_TYPE}" MATCHES "Release")
+ message(WARNING "The version information from Git tags in this dnbd3 Git repository is missing! Please fetch all Git tags of this repository for a ${VERSION_BUILD_TYPE} build!")
+ endif()
+ endif()
+
+ set(${REPOSITORY_VERSION_SHORT} ${GIT_VERSION})
+
+ # get current branch of Git repository
+ execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+ WORKING_DIRECTORY ${REPOSITORY_DIR}
+ OUTPUT_VARIABLE GIT_BRANCH_VERBOSE
+ RESULT_VARIABLE GIT_RETURN_CODE
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # check output to get branch information
+ if(GIT_RETURN_CODE EQUAL 0)
+ set(GIT_BRANCH ${GIT_BRANCH_VERBOSE})
+ endif()
+
+ if(GIT_BRANCH STREQUAL "")
+ # overwrite branch information with 'unknown' branch
+ set(GIT_BRANCH "unknown")
+
+ # print a message in Release build configuration to warn about the unknown branch
+ if("${VERSION_BUILD_TYPE}" MATCHES "Release")
+ message(WARNING "The current branch in the dnbd3 Git repository is unknown! Please check the branches of this repository for a ${VERSION_BUILD_TYPE} build!")
+ endif()
+ endif()
+
+ # get status of Git repository
+ execute_process(COMMAND ${GIT_EXECUTABLE} status --porcelain
+ WORKING_DIRECTORY ${REPOSITORY_DIR}
+ OUTPUT_VARIABLE GIT_STATUS
+ RESULT_VARIABLE GIT_RETURN_CODE
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+ # check if Git repository is dirty
+ if(GIT_RETURN_CODE EQUAL 0 AND NOT GIT_STATUS STREQUAL "")
+ # the Git repository is dirty, thus extend the version information
+ set(GIT_VERSION "${GIT_VERSION}+MOD")
+
+ # print a message in Release build configuration to warn about the dirty repository
+ if("${VERSION_BUILD_TYPE}" MATCHES "Release")
+ message(WARNING "This dnbd3 Git repository is dirty! Please commit or revert all changes for a ${VERSION_BUILD_TYPE} build!")
+ endif()
+ endif()
+ endif()
+
+ # return version and branch to caller
+ set(${REPOSITORY_VERSION} ${GIT_VERSION})
+ set(${REPOSITORY_BRANCH} ${GIT_BRANCH})
+endmacro(get_repository_version)
diff --git a/cmake/toolchain/Aarch64LinuxGnu.cmake b/cmake/toolchain/Aarch64LinuxGnu.cmake
new file mode 100644
index 0000000..59c5f00
--- /dev/null
+++ b/cmake/toolchain/Aarch64LinuxGnu.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMAKE toolchain file for cross compilation with aarch64-linux-gnu-gcc
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+set(CMAKE_LINKER aarch64-linux-gnu-ld)
+set(CMAKE_ASM_COMPILER aarch64-linux-gnu-as)
+set(CMAKE_OBJCOPY aarch64-linux-gnu-objcopy)
+set(CMAKE_STRIP aarch64-linux-gnu-strip)
+set(CMAKE_CPP aarch64-linux-gnu-cpp)
+
+# path of headers and libraries for aarch64-linux-gnu target
+set(CMAKE_FIND_ROOT_PATH "/usr/aarch64-linux-gnu")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/cmake/toolchain/PowerpcLinuxGnu.cmake b/cmake/toolchain/PowerpcLinuxGnu.cmake
new file mode 100644
index 0000000..d3c1ca5
--- /dev/null
+++ b/cmake/toolchain/PowerpcLinuxGnu.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# CMAKE toolchain file for cross compilation with powerpc-linux-gnu-gcc
+#
+# Copyright (C) 2020 Manuel Bentele <development@manuel-bentele.de>
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR ppc)
+
+set(CMAKE_C_COMPILER powerpc-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER powerpc-linux-gnu-g++)
+set(CMAKE_LINKER powerpc-linux-gnu-ld)
+set(CMAKE_ASM_COMPILER powerpc-linux-gnu-as)
+set(CMAKE_OBJCOPY powerpc-linux-gnu-objcopy)
+set(CMAKE_STRIP powerpc-linux-gnu-strip)
+set(CMAKE_CPP powerpc-linux-gnu-cpp)
+
+# path of headers and libraries for powerpc-linux-gnu target
+set(CMAKE_FIND_ROOT_PATH "/usr/powerpc-linux-gnu")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/conf/README.server b/conf/README.server
deleted file mode 100644
index 285758b..0000000
--- a/conf/README.server
+++ /dev/null
@@ -1,30 +0,0 @@
-Configuration for dnbd3-server
-
-The server requires a config directory.
-Start it like so: ./dnbd3-server -c ./my-config/
-
-There are two files in that dir
-
-== alt-servers ==
-List of known alt-servers for this server.
-Format:
-[PREFIX]<IP:PORT> [Comment]
-
-Prefix can be:
-+ - Only report server to clients as alt-server, but don't use for replication
-- - Only use server for replication, but don't advertise to clients
-No prefix means server will be advertised to clients and is used for replication
-
-If you're not running in proxy mode, this file won't do much for you
-
-== server.conf ==
-
-Main configuration file. Ini format.
-
-[dnbd3]
-basePath=/srv/openslx/dnbd3 # virtual root of image files
-serverPenalty=1234 # artificial acceptance delay for incoming server connections (µs)
-clientPenalty=2345 # artificial acceptance delay for incoming client connection (µs)
-isProxy=true # enable proxy mode - will try to replicate from alt-servers if a client requests unknown image
-uplinkTimeout=1250 # r/w timeout for connections to uplink servers
-
diff --git a/conf/alt-servers b/conf/alt-servers
deleted file mode 100644
index fd2f2ec..0000000
--- a/conf/alt-servers
+++ /dev/null
@@ -1,4 +0,0 @@
-192.168.100.10 Some alt server
-+192.168.100.100 My first alt server that will not be used for replication
--192.168.100.50 Super sectret alt server that will be used for replication, but clients don't know about it
-
diff --git a/get-version.sh b/get-version.sh
deleted file mode 100755
index 1d4a8cb..0000000
--- a/get-version.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/sh
-
-# Always create version string for repository this script lies in,
-# not the cwd... Makes usage easier in cmake
-ARG0="$0"
-SELF="$(readlink -f "${ARG0}")"
-ROOT_DIR="$(dirname "${SELF}")"
-cd "$ROOT_DIR"
-
-if [ -d .git ]; then
- [ -n "$(git diff)" ] && MODDED='+MOD'
- echo $(git describe)$MODDED, branch $(git rev-parse --abbrev-ref HEAD), built "$(date +%Y-%m-%d)"
- exit 0
-fi
-
-if [ -f "version.txt" ]; then
- cat "version.txt"
- exit 0
-fi
-
-echo "-unknown-"
-
diff --git a/inc/dnbd3/build.h.in b/inc/dnbd3/build.h.in
new file mode 100644
index 0000000..062ed17
--- /dev/null
+++ b/inc/dnbd3/build.h.in
@@ -0,0 +1,11 @@
+/*
+ * AUTOGENERATED: DO NOT EDIT THIS FILE
+ */
+
+#ifndef BUILD_H_
+#define BUILD_H_
+
+#define DNBD3_BUILD "@DNBD3_BUILD@"
+#define DNBD3_BUILD_DATE "@DNBD3_BUILD_DATE@"
+
+#endif /* BUILD_H_ */
diff --git a/src/config.h b/inc/dnbd3/config.h
index 50336af..eb4b8b1 100644
--- a/src/config.h
+++ b/inc/dnbd3/config.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
diff --git a/inc/dnbd3/config/client.h b/inc/dnbd3/config/client.h
new file mode 100644
index 0000000..55cf8b3
--- /dev/null
+++ b/inc/dnbd3/config/client.h
@@ -0,0 +1,52 @@
+#ifndef _CLIENTCONFIG_H_
+#define _CLIENTCONFIG_H_
+
+// Which is the minimum protocol version the client expects from the server
+#define MIN_SUPPORTED_SERVER 2
+
+// Send keepalive every X seconds
+#define KEEPALIVE_INTERVAL 10
+
+// in seconds if not stated otherwise
+#define SOCKET_TIMEOUT_SEND 2
+
+// Socket receive timeout. Must be higher than keepalive interval, otherwise
+// the connection might be aborted when idle
+#define SOCKET_TIMEOUT_RECV 13
+
+// During discovery, we use very short minimum timeouts (unless in panic mode)
+#define SOCKET_TIMEOUT_DISCOVERY 1
+
+// IO timeout for block layer
+#define BLOCK_LAYER_TIMEOUT 10
+
+#define RTT_THRESHOLD_FACTOR(us) (((us) * 3) / 4) // 3/4 = current to best must be 25% worse
+#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
+#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
+// This must be a power of two:
+#define RTT_BLOCK_SIZE 4096
+
+// Interval of several repeating tasks (in seconds)
+#define TIMER_INTERVAL_PROBE_STARTUP 2
+#define TIMER_INTERVAL_PROBE_SWITCH 10
+#define TIMER_INTERVAL_PROBE_PANIC 2
+#define TIMER_INTERVAL_PROBE_MAX 45
+// How many discover runs after setting up a device should be considered the startup phase
+// during that phase, check all servers, before we start doing it selectively
+// and also don't increase the discover interval during this period
+#define DISCOVER_STARTUP_PHASE_COUNT 6
+// How many servers should be tested at maximum after above
+#define DISCOVER_REDUCED_SERVER_COUNT 3
+// Number of RTT probes to keep in history and average the value over
+#define DISCOVER_HISTORY_SIZE 4
+
+// Number of unsuccessful alt_server probes before read errors are reported to the block layer
+// (ALL servers will be probed this many times)
+// Set to 0 to disable
+#define PROBE_COUNT_TIMEOUT 0
+
+// ++ Kernel module ++
+#define DEFAULT_READ_AHEAD_KB 512
+#define NUMBER_DEVICES 8
+
+#endif
diff --git a/src/serverconfig.h b/inc/dnbd3/config/server.h
index 0cbb320..b6eee2c 100644
--- a/src/serverconfig.h
+++ b/inc/dnbd3/config/server.h
@@ -1,21 +1,27 @@
#ifndef _SERVERCONFIG_H_
#define _SERVERCONFIG_H_
-#include "config.h"
+#include <dnbd3/config.h>
// +++++ Performance/memory related
#define SERVER_MAX_CLIENTS 4000
#define SERVER_MAX_IMAGES 5000
-#define SERVER_MAX_ALTS 100
+#define SERVER_MAX_ALTS 50
// +++++ Uplink handling (proxy mode)
-#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value
-#define SERVER_BAD_UPLINK_THRES 40 // Thresold for numFails at which we ignore a server for the time span below
+#define SERVER_GLOBAL_DUP_TIME 6 // How many seconds to wait before changing global fail counter again
+#define SERVER_BAD_UPLINK_MIN 10 // Threshold for fails at which we start ignoring the server occasionally
+#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
+#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
#define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
-#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_QUEUE 500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request
#define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients
#define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
-#define SERVER_CACHE_MAP_SAVE_INTERVAL 90
+// Wait a maximum of 5 minutes before saving cache map (if data was received at all)
+#define CACHE_MAP_MAX_SAVE_DELAY 300
+// If more than 500MB have been received from uplink without saving cache map, do so
+#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000)
// Time in ms to wait for a read/write call to complete on an uplink connection
#define SOCKET_TIMEOUT_UPLINK 5000
@@ -33,7 +39,7 @@
#define SERVER_RTT_PROBES 5 // How many probes to average over
#define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes
#define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes
-#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad
+#define SERVER_RTT_MAX_UNREACH 10 // If no server was reachable this many times, stop RTT measurements for a while
#define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times
#define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes
diff --git a/src/shared/crc32.h b/inc/dnbd3/shared/crc32.h
index 00b8bdd..00b8bdd 100644
--- a/src/shared/crc32.h
+++ b/inc/dnbd3/shared/crc32.h
diff --git a/src/shared/fdsignal.h b/inc/dnbd3/shared/fdsignal.h
index 960a2a9..960a2a9 100644
--- a/src/shared/fdsignal.h
+++ b/inc/dnbd3/shared/fdsignal.h
diff --git a/src/shared/log.h b/inc/dnbd3/shared/log.h
index 5b1e8f7..2a15f1d 100644
--- a/src/shared/log.h
+++ b/inc/dnbd3/shared/log.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Simon Rettberg
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -23,6 +23,7 @@
#include <stdbool.h>
#include <unistd.h>
+#include <stdio.h>
typedef unsigned int logmask_t;
#define LOG_ERROR ((logmask_t)1) // Fatal error, server will terminate
@@ -32,6 +33,10 @@ typedef unsigned int logmask_t;
#define LOG_DEBUG1 ((logmask_t)16) // Debug information, use this for non-spammy stuff
#define LOG_DEBUG2 ((logmask_t)32) // Use this for debug messages that will show up a lot
+/**
+ * Initialize the logging (constructor)
+ */
+void log_init(void);
/**
* Check if cansoleMask | fileMask has all of mask set.
@@ -45,6 +50,16 @@ void log_setConsoleMask(logmask_t mask);
void log_setConsoleTimestamps(bool on);
/**
+ * Set console output stream
+ * The output stream can be either stdout or stderr
+ *
+ * Note: A call of this function is optional and only required if the output
+ * stream should be changed from stdout to stderr since the log
+ * implementation defaults to the output stream stdout
+ */
+int log_setConsoleOutputStream(FILE *outputStream);
+
+/**
* Open or reopen the log file. If path is NULL and the
* function was called with a path before, the same path
* will be used again.
diff --git a/src/shared/protocol.h b/inc/dnbd3/shared/protocol.h
index 92dbe11..1dd47f8 100644
--- a/src/shared/protocol.h
+++ b/inc/dnbd3/shared/protocol.h
@@ -1,10 +1,9 @@
#ifndef _PROTOCOL_H_
#define _PROTOCOL_H_
-#include "sockhelper.h"
-
-#include "../types.h"
-#include "../serialize.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/serialize.h>
+#include <dnbd3/shared/sockhelper.h>
#include <errno.h>
#include <sys/types.h>
@@ -20,7 +19,7 @@
#define COND_HOPCOUNT(vers,hopcount) ( (vers) >= 3 ? (hopcount) : 0 )
// 2017-11-02: Macro to set flags in select image message properly if we're a server, as BG_REP depends on global var
-#define SI_SERVER_FLAGS ( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) )
+#define SI_SERVER_FLAGS ( (uint8_t)( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) ) )
#define REPLY_OK (0)
#define REPLY_ERRNO (-1)
@@ -69,10 +68,8 @@ static inline bool dnbd3_select_image(int sock, const char *name, uint16_t rid,
request.magic = dnbd3_packet_magic;
request.cmd = CMD_SELECT_IMAGE;
request.size = (uint32_t)len;
-#ifdef _DEBUG
request.handle = 0;
request.offset = 0;
-#endif
fixup_request( request );
iov[0].iov_base = &request;
iov[0].iov_len = sizeof(request);
diff --git a/src/serialize.h b/inc/dnbd3/shared/serialize.h
index 1b73531..b808fd0 100644
--- a/src/serialize.h
+++ b/inc/dnbd3/shared/serialize.h
@@ -1,11 +1,10 @@
#ifndef SERIALIZER_H_
#define SERIALIZER_H_
-// Careful with includes - this is used in kernel module too
-#include "config.h"
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
-typedef struct
-{
+typedef struct {
char buffer[MAX_PAYLOAD]; // This MUST be the first member or send_reply() will blow up
char *buffer_end;
char *buffer_pointer;
diff --git a/src/shared/sockhelper.h b/inc/dnbd3/shared/sockhelper.h
index 8d70789..5c7d903 100644
--- a/src/shared/sockhelper.h
+++ b/inc/dnbd3/shared/sockhelper.h
@@ -6,7 +6,7 @@
* abstract from the IP version by using getaddrinfo() and thelike.
*/
-#include "../types.h"
+#include <dnbd3/types.h>
#include <stdint.h>
#include <sys/socket.h>
#include <string.h>
diff --git a/src/shared/timing.h b/inc/dnbd3/shared/timing.h
index f3d8802..2530416 100644
--- a/src/shared/timing.h
+++ b/inc/dnbd3/shared/timing.h
@@ -1,10 +1,6 @@
#ifndef _D_TIMING_H
#define _D_TIMING_H
-#ifndef _POSIX_C_SOURCE
-#define _POSIX_C_SOURCE 199309L
-#endif
-
#include <time.h>
#include <stdint.h>
#include <stdbool.h>
@@ -22,7 +18,7 @@ extern struct timespec basetime;
/**
* Assign src to dst while adding secs seconds.
*/
-#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + secs; (dst)->tv_nsec = (src)->tv_nsec; } while (0)
+#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + (secs); (dst)->tv_nsec = (src)->tv_nsec; } while (0)
/**
* Define variable now, initialize to timing_get.
diff --git a/src/types.h b/inc/dnbd3/types.h
index ec37d9b..699fa68 100644
--- a/src/types.h
+++ b/inc/dnbd3/types.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,10 +21,15 @@
#ifndef TYPES_H_
#define TYPES_H_
-#include "config.h"
-#ifndef KERNEL_MODULE
+#include <dnbd3/config.h>
+#ifdef DNBD3_KERNEL_MODULE
+#include <linux/kernel.h>
+#include <linux/string.h>
+#else
#include <stdint.h>
#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
#endif
#ifndef MIN
@@ -34,6 +39,9 @@
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
#ifdef __GNUC__
#define UNUSED __attribute__ ((unused))
#else
@@ -62,7 +70,7 @@
#include <netinet/in.h>
#endif
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
#define send(a,b,c,d) write(a,b,c)
#define recv(a,b,c,d) read(a,b,c)
#endif
@@ -77,7 +85,7 @@
#define IOCTL_REM_SRV _IO(0xab, 5)
#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-static const uint16_t dnbd3_packet_magic = (0x73 << 8) | (0x72);
+#define dnbd3_packet_magic ((uint16_t)( (0x73 << 8) | (0x72) ))
// Flip bytes around on big endian when putting stuff on the net
#define net_order_64(a) ((uint64_t)((((a) & 0xFFull) << 56) | (((a) & 0xFF00ull) << 40) | (((a) & 0xFF0000ull) << 24) | (((a) & 0xFF000000ull) << 8) | (((a) & 0xFF00000000ull) >> 8) | (((a) & 0xFF0000000000ull) >> 24) | (((a) & 0xFF000000000000ull) >> 40) | (((a) & 0xFF00000000000000ull) >> 56)))
#define net_order_32(a) ((uint32_t)((((a) & (uint32_t)0xFF) << 24) | (((a) & (uint32_t)0xFF00) << 8) | (((a) & (uint32_t)0xFF0000) >> 8) | (((a) & (uint32_t)0xFF000000) >> 24)))
@@ -91,22 +99,18 @@ static const uint16_t dnbd3_packet_magic = (0x73 << 8) | (0x72);
(a).cmd = net_order_16((a).cmd); \
(a).size = net_order_32((a).size); \
} while (0)
-#define ENDIAN_MODE "Big Endian"
-#ifndef BIG_ENDIAN
-#define BIG_ENDIAN
-#endif
+#define DNBD3_ENDIAN_MODE "Big Endian"
+#define DNBD3_BIG_ENDIAN
#elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64)
-static const uint16_t dnbd3_packet_magic = (0x73) | (0x72 << 8);
+#define dnbd3_packet_magic ((uint16_t)( (0x73) | (0x72 << 8) ))
// Make little endian our network byte order as probably 99.999% of machines this will be used on are LE
#define net_order_64(a) (a)
#define net_order_32(a) (a)
#define net_order_16(a) (a)
#define fixup_request(a) while(0)
#define fixup_reply(a) while(0)
-#define ENDIAN_MODE "Little Endian"
-#ifndef LITTLE_ENDIAN
-#define LITTLE_ENDIAN
-#endif
+#define DNBD3_ENDIAN_MODE "Little Endian"
+#define DNBD3_LITTLE_ENDIAN
#else
#error "Unknown Endianness"
#endif
@@ -117,27 +121,27 @@ static const dnbd3_af HOST_NONE = (dnbd3_af)0;
static const dnbd3_af HOST_IP4 = (dnbd3_af)2;
static const dnbd3_af HOST_IP6 = (dnbd3_af)10;
-#pragma pack(1)
-typedef struct dnbd3_host_t
+typedef struct __attribute__((packed)) dnbd3_host_t
{
uint8_t addr[16]; // 16byte (network representation, so it can be directly passed to socket functions)
uint16_t port; // 2byte (network representation, so it can be directly passed to socket functions)
dnbd3_af type; // 1byte (ip version. HOST_IP4 or HOST_IP6. 0 means this struct is empty and should be ignored)
} dnbd3_host_t;
-#pragma pack(0)
-#pragma pack(1)
-typedef struct
+/* IOCTLs */
+#define MAX_HOSTS_PER_IOCTL NUMBER_SERVERS
+
+typedef struct __attribute__((packed))
{
uint16_t len;
- dnbd3_host_t host;
+ dnbd3_host_t hosts[MAX_HOSTS_PER_IOCTL];
+ uint8_t hosts_num;
uint16_t imgnamelen;
char *imgname;
int rid;
int read_ahead_kb;
uint8_t use_server_provided_alts;
} dnbd3_ioctl_t;
-#pragma pack(0)
// network
#define CMD_GET_BLOCK 1
@@ -150,18 +154,17 @@ typedef struct
#define CMD_GET_CRC32 8
#define DNBD3_REQUEST_SIZE 24
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
{
uint16_t magic; // 2byte
uint16_t cmd; // 2byte
uint32_t size; // 4byte
union {
struct {
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
uint64_t offset_small:56; // 7byte
uint8_t hops; // 1byte
-#elif defined(BIG_ENDIAN)
+#elif defined(DNBD3_BIG_ENDIAN)
uint8_t hops; // 1byte
uint64_t offset_small:56; // 7byte
#endif
@@ -170,27 +173,22 @@ typedef struct
};
uint64_t handle; // 8byte
} dnbd3_request_t;
-#pragma pack(0)
_Static_assert( sizeof(dnbd3_request_t) == DNBD3_REQUEST_SIZE, "dnbd3_request_t is messed up" );
#define DNBD3_REPLY_SIZE 16
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
{
uint16_t magic; // 2byte
uint16_t cmd; // 2byte
uint32_t size; // 4byte
uint64_t handle; // 8byte
} dnbd3_reply_t;
-#pragma pack(0)
_Static_assert( sizeof(dnbd3_reply_t) == DNBD3_REPLY_SIZE, "dnbd3_reply_t is messed up" );
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
{
dnbd3_host_t host;
uint8_t failures; // 1byte (number of times server has been consecutively unreachable)
} dnbd3_server_entry_t;
-#pragma pack(0)
#endif /* TYPES_H_ */
diff --git a/inc/dnbd3/version.h.in b/inc/dnbd3/version.h.in
new file mode 100644
index 0000000..727c8b8
--- /dev/null
+++ b/inc/dnbd3/version.h.in
@@ -0,0 +1,12 @@
+/*
+ * AUTOGENERATED: DO NOT EDIT THIS FILE
+ */
+
+#ifndef VERSION_H_
+#define VERSION_H_
+
+#define DNBD3_VERSION "@DNBD3_VERSION@"
+#define DNBD3_BRANCH "@DNBD3_BRANCH@"
+#define DNBD3_VERSION_LONG "@GIT_VERSION@, branch @DNBD3_BRANCH@"
+
+#endif /* VERSION_H_ */
diff --git a/pack.sh b/pack.sh
deleted file mode 100755
index 9cbe5c4..0000000
--- a/pack.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./get-version.sh > version.txt
-tar ckzf dnbd3.tar.gz src cmake CMakeLists.txt get-version.sh version.txt
-rm -- version.txt
-
diff --git a/pkg/CMakeLists.txt b/pkg/CMakeLists.txt
new file mode 100644
index 0000000..3060345
--- /dev/null
+++ b/pkg/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-pkg
+ LANGUAGES C)
+
+add_subdirectory(config)
+add_subdirectory(systemd)
diff --git a/pkg/config/CMakeLists.txt b/pkg/config/CMakeLists.txt
new file mode 100644
index 0000000..efbd2bf
--- /dev/null
+++ b/pkg/config/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-config
+ LANGUAGES C)
+
+# define all configuration files
+set(DNBD3_CONFIG_FILES ${CMAKE_CURRENT_SOURCE_DIR}/alt-servers
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.acl
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.conf)
+
+# install configuration files into sample directory
+install(FILES ${DNBD3_CONFIG_FILES}
+ DESTINATION /etc/dnbd3-server/sample
+ COMPONENT server)
diff --git a/pkg/config/alt-servers b/pkg/config/alt-servers
new file mode 100644
index 0000000..d9d2636
--- /dev/null
+++ b/pkg/config/alt-servers
@@ -0,0 +1,15 @@
+[192.168.100.10]
+comment=Some alt server
+
+[192.168.100.100]
+comment=My first alt server that will not be used for replication
+for=client
+
+[192.168.100.50]
+comment=Super secret alt server that will be used for replication, but clients don't know about it
+for=replication
+
+[192.168.100.123]
+comment=Also just for replication, and only for images starting with foobar/baz/
+namespace=foobar/baz/
+for=replication
diff --git a/conf/rpc.acl b/pkg/config/rpc.acl
index 5167ae3..b3b4561 100644
--- a/conf/rpc.acl
+++ b/pkg/config/rpc.acl
@@ -1,5 +1,5 @@
# Everything from localhost
127.0.0.0/8 ALL
+
# Some info reading for another machine
132.230.8.113 STATS CLIENT_LIST IMAGE_LIST
-
diff --git a/conf/server.conf b/pkg/config/server.conf
index 2f43247..22bd14a 100644
--- a/conf/server.conf
+++ b/pkg/config/server.conf
@@ -1,49 +1,84 @@
[dnbd3]
; port to listen on (default: 5003)
listenPort=5003
+
; relative root directory for images, ending in .r[1-9][0-9]*
-basePath=/mnt/storage/dnbd3
+basePath=/mnt/storage
+
; artificial connection delay for connecting servers
serverPenalty=100000
+
; artificial connection delay for connecting clients
clientPenalty=0
+
; is this server a proxy? if true, requests for non-existing images will be relayed to known alt-servers
isProxy=true
+
; if proxy is true and an image is incomplete, should idle bandwidth be used to replicate missing blocks?
backgroundReplication=true
+
; minimum amount of connected clients for background replication to kick in
bgrMinClients=0
-; if isProxy==true and another proxy requests and image that we don't have, should we ask our alt-servers for it?
+
+; if another proxy requests an image that we don't have, should we ask our alt-servers for it?
lookupMissingForProxy=true
-; create sparse files instead of preallocating; ignored if backgroundReplication=true -- only recommended if cache space is small
+
+; create sparse files instead of preallocating; ignored if backgroundReplication=true
+; -- only recommended if cache space is small
sparseFiles=false
+
; if true (which is the default), images will automatically be removed from the list if they can't be accessed
removeMissingImages=true
+
; timeout in ms for send/recv on connections to uplink servers (used for replication)
-uplinkTimeout=1250
+uplinkTimeout=5000
+
; timeout in ms for send/recv on connections to clients (using an image on this server)
clientTimeout=15000
+
; set this to true to close handles of unused images after some timeout
closeUnusedFd=false
+
; set this to true to load files without the .r[0-9]+ extension too, assuming RID=1
vmdkLegacyMode=false
+; Don't set the server flag when connecting to alt-servers
+; Intended for when the proxy is used for on-client caching
+pretendClient=false
+
+; When running in proxy mode and running out of space, automatically delete oldest image(s) to make
+; the newly replicated image fit. In sparse mode, this will make sure at least 2GB of free space are
+; available when replicating a new image. During normal operation, it will free at least 256MB whenever
+; an attempt to write more data to cache fails. In non-sparse mode, whenever a new image is replicated,
+; as much space as is required to store the entire image will be made available.
+; However, after startup the proxy will refuse to delete any images for the time span given below, to be
+; able to gather up to date usage information for the images available. If unitless, the value is
+; interpreted in seconds. Valid suffixes are m, h, d.
+; Setting this to -1 disables deletion of images. If the cache partition is full, no more images will
+; be replicated unless you manually free up more disk space.
+autoFreeDiskSpaceDelay=10h
+
[limits]
maxClients=2000
maxImages=1000
maxPayload=9M
maxReplicationSize=150G
-; Log related config
+; Maximum number of bytes to prefetch when relaying client request to upstream server
+maxPrefetch=256k
+
[logging]
; log file path and name
; comment out to disable logging to file
; protip: use SIGUSR2 to reopen log file
file=./dnbd3.log
+
; which type of messages to log to file
fileMask=ERROR WARNING MINOR INFO DEBUG1
-; which to log to console (stdout)
+
+; which type of messages to log to console (stdout)
consoleMask=ERROR WARNING MINOR INFO
+
; Valid types (warning: specifying invalid types will not yield an error!)
; ERROR Fatal error, server will terminate
; WARNING Major issue, something is broken but keep running
@@ -51,7 +86,6 @@ consoleMask=ERROR WARNING MINOR INFO
; INFO Informational message
; DEBUG1 Debug information, used for medium verbosity
; DEBUG2 Used for debug messages that would show up a lot
-;
+
; Whether timestamps should be output to console too (or just to file if false)
consoleTimestamps=false
-
diff --git a/pkg/docker/archlinux_dockerfile b/pkg/docker/archlinux_dockerfile
new file mode 100644
index 0000000..ea6145b
--- /dev/null
+++ b/pkg/docker/archlinux_dockerfile
@@ -0,0 +1,28 @@
+# use Archlinux as base image
+FROM archlinux:latest
+
+# declare arguments that should be set by 'docker build --build-arg ...'
+ARG DNBD3_PACKAGE_FILE_NAME
+
+# copy built package file from host to docker image
+COPY ${DNBD3_PACKAGE_FILE_NAME} /tmp
+
+# install required dependencies
+RUN pacman --noconfirm -Sy
+RUN pacman --noconfirm -S fuse2 jansson
+
+# install installation package
+RUN tar -xf /tmp/${DNBD3_PACKAGE_FILE_NAME} --strip-components=1 -C /
+
+# use default config for dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/server.conf /etc/dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/alt-servers /etc/dnbd3-server
+
+# make default storage point for dnbd3-server
+RUN mkdir -p /mnt/storage
+
+# expose the port of the dnbd3-server to the host
+EXPOSE 5003
+
+# run dnbd3-server
+CMD [ "dnbd3-server", "-n" ]
diff --git a/pkg/docker/ubuntu-20-04_dockerfile b/pkg/docker/ubuntu-20-04_dockerfile
new file mode 100644
index 0000000..ad2adcb
--- /dev/null
+++ b/pkg/docker/ubuntu-20-04_dockerfile
@@ -0,0 +1,28 @@
+# use Ubuntu 20.04 as base image
+FROM ubuntu:focal
+
+# declare arguments that should be set by 'docker build --build-arg ...'
+ARG DNBD3_PACKAGE_FILE_NAME
+
+# copy built package file from host to docker image
+COPY ${DNBD3_PACKAGE_FILE_NAME} /tmp
+
+# install required dependencies
+RUN apt-get update
+RUN apt-get install -y libfuse2 libjansson4
+
+# install installation package
+RUN dpkg -i /tmp/${DNBD3_PACKAGE_FILE_NAME}
+
+# use default config for dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/server.conf /etc/dnbd3-server
+RUN ln -s /etc/dnbd3-server/sample/alt-servers /etc/dnbd3-server
+
+# make default storage point for dnbd3-server
+RUN mkdir -p /mnt/storage
+
+# expose the port of the dnbd3-server to the host
+EXPOSE 5003
+
+# run dnbd3-server
+CMD [ "dnbd3-server", "-n" ]
diff --git a/pkg/systemd/CMakeLists.txt b/pkg/systemd/CMakeLists.txt
new file mode 100644
index 0000000..b094b4b
--- /dev/null
+++ b/pkg/systemd/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-systemd
+ LANGUAGES C)
+
+# define all systemd related files
+set(DNBD3_SYSTEMD_FILES ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3-server.service)
+
+# install systemd service files
+install(FILES ${DNBD3_SYSTEMD_FILES}
+ DESTINATION /usr/lib/systemd/system
+ COMPONENT server)
diff --git a/pkg/systemd/dnbd3-server.service b/pkg/systemd/dnbd3-server.service
new file mode 100644
index 0000000..de800ce
--- /dev/null
+++ b/pkg/systemd/dnbd3-server.service
@@ -0,0 +1,12 @@
+[Unit]
+Description=DNBD3 server
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/dnbd3-server -n -c /etc/dnbd3-server
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..96ffcae
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-src
+ LANGUAGES C)
+
+if(DNBD3_BENCHMARK)
+ add_subdirectory(bench)
+endif(DNBD3_BENCHMARK)
+
+if(DNBD3_KERNEL_MODULE)
+ add_subdirectory(client)
+ add_subdirectory(kernel)
+endif(DNBD3_KERNEL_MODULE)
+
+if(DNBD3_CLIENT_FUSE)
+ add_subdirectory(fuse)
+endif(DNBD3_CLIENT_FUSE)
+
+if(DNBD3_SERVER)
+ add_subdirectory(server)
+endif(DNBD3_SERVER)
+
+add_subdirectory(shared)
diff --git a/src/bench/CMakeLists.txt b/src/bench/CMakeLists.txt
new file mode 100644
index 0000000..24542a7
--- /dev/null
+++ b/src/bench/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-bench
+ LANGUAGES C)
+
+# add compile option to enable enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_BENCH_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+set(DNBD3_BENCH_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h)
+
+add_executable(dnbd3-bench ${DNBD3_BENCH_SOURCE_FILES})
+target_link_libraries(dnbd3-bench dnbd3-version dnbd3-shared ${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS dnbd3-bench RUNTIME DESTINATION bin
+ COMPONENT bench)
+
+add_linter(dnbd3-bench-lint "${DNBD3_BENCH_SOURCE_FILES}" "${DNBD3_BENCH_HEADER_FILES}")
+add_linter_fix(dnbd3-bench-lint-fix "${DNBD3_BENCH_SOURCE_FILES}" "${DNBD3_BENCH_HEADER_FILES}")
diff --git a/src/bench/connection.c b/src/bench/connection.c
index 129ae3c..974bc8a 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -1,10 +1,10 @@
#include "connection.h"
#include "helper.h"
-#include "../config.h"
-#include "../shared/protocol.h"
-#include "../shared/fdsignal.h"
-#include "../shared/sockhelper.h"
-#include "../shared/log.h"
+#include <dnbd3/config.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
#include <stdlib.h>
#include <pthread.h>
@@ -18,23 +18,10 @@ static const size_t SHORTBUF = 100;
#define SOCKET_KEEPALIVE_TIMEOUT (3)
#define MAX_ALTS (8)
#define MAX_HOSTS_PER_ADDRESS (2)
-// If a server wasn't reachable this many times, we slowly start skipping it on measurements
-static const int FAIL_BACKOFF_START_COUNT = 8;
#define RTT_COUNT (4)
/* Module variables */
-
-// Init guard
-static bool connectionInitDone = false;
-static bool keepRunning = true;
-
-static struct {
- int sockFd;
- pthread_mutex_t sendMutex;
- dnbd3_signal_t* panicSignal;
- dnbd3_host_t currentServer;
- uint64_t startupTime;
-} connection;
+static char trash[4096];
// Known alt servers
typedef struct _alt_server {
@@ -54,13 +41,14 @@ bool connection_init_n_times(
const char *lowerImage,
const uint16_t rid,
int ntimes,
- BenchCounters* counters,
- bool closeSockets
+ uint64_t blockSize,
+ BenchCounters* counters
) {
for (int run_i = 0; run_i < ntimes; ++run_i) {
counters->attempts++;
- printf(".");
+ putchar('.');
+ fflush(stdout);
int sock = -1;
char host[SHORTBUF];
serialized_buffer_t buffer;
@@ -68,66 +56,85 @@ bool connection_init_n_times(
char *remoteName;
uint64_t remoteSize;
- if ( !connectionInitDone && keepRunning ) {
- dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
- const char *current, *end;
- int altIndex = 0;
- memset( altservers, 0, sizeof altservers );
- connection.sockFd = -1;
- current = hosts;
- do {
- // Get next host from string
- while ( *current == ' ' ) current++;
- end = strchr( current, ' ' );
- size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
- if ( len > SHORTBUF ) len = SHORTBUF;
- snprintf( host, len, "%s", current );
- int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
- for ( int i = 0; i < newHosts; ++i ) {
- if ( altIndex >= MAX_ALTS )
+ dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+ const char *current, *end;
+ int altIndex = 0;
+ memset( altservers, 0, sizeof altservers );
+ current = hosts;
+ do {
+ // Get next host from string
+ while ( *current == ' ' ) current++;
+ end = strchr( current, ' ' );
+ size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
+ if ( len > SHORTBUF ) len = SHORTBUF;
+ snprintf( host, len, "%s", current );
+ int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+ for ( int i = 0; i < newHosts; ++i ) {
+ if ( altIndex >= MAX_ALTS )
+ break;
+ altservers[altIndex].host = tempHosts[i];
+ altIndex += 1;
+ }
+ current = end + 1;
+ } while ( end != NULL && altIndex < MAX_ALTS );
+ // Connect
+ for ( int i = 0; i < altIndex; ++i ) {
+ if ( altservers[i].host.type == 0 )
+ continue;
+ // Try to connect
+ dnbd3_reply_t reply;
+ sock = sock_connect( &altservers[i].host, 3500, 10000 );
+ if ( sock == -1 ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "Could not connect to host (errno=%d)", errno );
+ } else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "Could not send select image" );
+ } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
+ } else if ( rid != 0 && rid != remoteRid ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "rid mismatch" );
+ //} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
+ } else if ( !dnbd3_get_block( sock, (((uint64_t)rand() << 16) + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "send: get block failed" );
+ } else if ( !dnbd3_get_reply( sock, &reply ) ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "recv: get block header failed" );
+ } else if ( reply.cmd != CMD_GET_BLOCK ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "recv: get block reply is not CMD_GET_BLOCK" );
+ } else {
+ int rv, togo = blockSize;
+ do {
+ rv = recv( sock, trash, MIN( sizeof(trash), togo ), MSG_WAITALL|MSG_NOSIGNAL );
+ if ( rv == -1 && errno == EINTR )
+ continue;
+ if ( rv <= 0 )
break;
- altservers[altIndex].host = tempHosts[i];
- altIndex += 1;
- }
- current = end + 1;
- } while ( end != NULL && altIndex < MAX_ALTS );
- logadd( LOG_INFO, "Got %d servers from init call", altIndex );
- // Connect
- for ( int i = 0; i < altIndex; ++i ) {
- if ( altservers[i].host.type == 0 )
- continue;
- // Try to connect
- sock = sock_connect( &altservers[i].host, 500, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
- if ( sock == -1 ) {
- counters->fails++;
- logadd( LOG_ERROR, "Could not connect to host" );
- } else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
- counters->fails++;
- logadd( LOG_ERROR, "Could not send select image" );
- } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+ togo -= rv;
+ } while ( togo > 0 );
+ if ( togo != 0 ) {
counters->fails++;
- logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
- } else if ( rid != 0 && rid != remoteRid ) {
- counters->fails++;
- logadd( LOG_ERROR, "rid mismatch" );
+ logadd( LOG_ERROR, "recv: get block payload failed (remaining %d)", togo );
} else {
counters->success++;
- break;
- }
- // Failed
- logadd( LOG_DEBUG1, "Server does not offer requested image... " );
- if ( sock != -1 ) {
close( sock );
sock = -1;
+ continue;
}
}
+ // Failed
if ( sock != -1 ) {
- // connectionInitDone = true;
- if (closeSockets) {
- close( sock );
- }
+ close( sock );
+ sock = -1;
}
}
+ if ( sock != -1 ) {
+ close( sock );
+ }
}
return true;
}
diff --git a/src/bench/connection.h b/src/bench/connection.h
index 9cb59ef..422c93e 100644
--- a/src/bench/connection.h
+++ b/src/bench/connection.h
@@ -1,7 +1,7 @@
#ifndef _CONNECTION_H_
#define _CONNECTION_H_
-#include "../shared/fdsignal.h"
+#include <dnbd3/shared/fdsignal.h>
#include <stdbool.h>
#include <stdint.h>
#include "helper.h"
@@ -19,7 +19,7 @@ typedef struct _dnbd3_async {
} dnbd3_async_t;
-bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, BenchCounters* counters, bool closeSockets);
+bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, uint64_t blockSize, BenchCounters* counters);
bool connection_init(const char *hosts, const char *image, const uint16_t rid);
diff --git a/src/bench/helper.h b/src/bench/helper.h
index 8342a79..53f32bf 100644
--- a/src/bench/helper.h
+++ b/src/bench/helper.h
@@ -1,7 +1,7 @@
#ifndef IMAGEHELPER_H
#define IMAGEHELPER_H
-#include "../types.h"
+#include <dnbd3/types.h>
#include <netdb.h>
#include <stdbool.h>
@@ -29,6 +29,7 @@ typedef struct BenchThreadData {
char* server_address;
char * image_name;
int runs;
+ int bs;
int threadNumber;
bool closeSockets;
} BenchThreadData;
diff --git a/src/bench/main.c b/src/bench/main.c
index 2f32dbf..37e2821 100644
--- a/src/bench/main.c
+++ b/src/bench/main.c
@@ -4,8 +4,9 @@
#include "connection.h"
#include "helper.h"
-#include "../shared/protocol.h"
-#include "../shared/log.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/version.h>
#include <stdio.h>
#include <stdlib.h>
@@ -17,12 +18,9 @@
#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
-/* Debug/Benchmark variables */
-static bool useDebug = false;
-
-
static void printUsage(char *argv0, int exitCode)
{
+ printf( "Version: %s\n", DNBD3_VERSION_LONG );
printf( "Usage: %s [--debug] --host <serverAddress(es)> --image <imageName> [--rid revision]\n", argv0 );
printf( "Or: %s [-d] -h <serverAddress(es)> -i <imageName> [-r revision]\n", argv0 );
printf( " -h --host List of space separated hosts to use\n" );
@@ -30,19 +28,18 @@ static void printUsage(char *argv0, int exitCode)
printf( " -r --rid Revision to use (omit or pass 0 for latest)\n" );
printf( " -n --runs Number of connection attempts per thread\n" );
printf( " -t --threads number of threads\n" );
- printf( " -l --log Write log to given location\n" );
- printf( " -d --debug Don't fork and print debug output (fuse > stderr, dnbd3 > stdout)\n" );
- // // fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ printf( " -b --blocksize Size of blocks to request (def. 4096)\n" );
exit( exitCode );
}
-static const char *optString = "h:i:n:t:HvVd";
+static const char *optString = "b:h:i:n:t:Hv";
static const struct option longOpts[] = {
{ "host", required_argument, NULL, 'h' },
{ "image", required_argument, NULL, 'i' },
{ "nruns", optional_argument, NULL, 'n' },
- { "threads", optional_argument, NULL, 't' },
- { "help", optional_argument, NULL, 'H' },
+ { "threads", required_argument, NULL, 't' },
+ { "blocksize", required_argument, NULL, 'b' },
+ { "help", no_argument, NULL, 'H' },
{ "version", no_argument, NULL, 'v' },
{ 0, 0, 0, 0 }
};
@@ -59,11 +56,11 @@ void* runBenchThread(void* t) {
BenchThreadData* data = t;
connection_init_n_times(
data->server_address,
- data->server_address,
+ data->image_name,
0,
data->runs,
- data->counter,
- data->closeSockets);
+ data->bs,
+ data->counter);
printf("Thread #%d finished\n", data->threadNumber);
return NULL;
}
@@ -77,6 +74,9 @@ int main(int argc, char *argv[])
bool closeSockets = false;
int n_runs = 100;
int n_threads = 1;
+ int bs = 4096;
+
+ log_init();
if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
printUsage( argv[0], 0 );
@@ -85,10 +85,10 @@ int main(int argc, char *argv[])
while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
switch ( opt ) {
case 'h':
- server_address = optarg;
+ server_address = strdup(optarg);
break;
case 'i':
- image_Name = optarg;
+ image_Name = strdup(optarg);
break;
case 'n':
n_runs = atoi(optarg);
@@ -96,15 +96,15 @@ int main(int argc, char *argv[])
case 't':
n_threads = atoi(optarg);
break;
+ case 'b':
+ bs = atoi(optarg);
+ break;
case 'c':
closeSockets = true;
break;
case 'H':
printUsage( argv[0], 0 );
break;
- case 'd':
- useDebug = true;
- break;
default:
printUsage( argv[0], EXIT_FAILURE );
}
@@ -126,6 +126,7 @@ int main(int argc, char *argv[])
server_address,
image_Name,
n_runs,
+ bs,
i,
closeSockets};
threadData[i] = tmp2;
diff --git a/src/bench/serialize.c b/src/bench/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/bench/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
new file mode 100644
index 0000000..41f182e
--- /dev/null
+++ b/src/client/CMakeLists.txt
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.10)
+
+# declare the dnbd3-client project; only a C toolchain is required
+project(dnbd3-client
+ LANGUAGES C)
+
+# define _GNU_SOURCE for everything built in this directory to enable enhanced BSD netdb features
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_CLIENT_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/client.c) # single translation unit
+
+add_executable(dnbd3-client ${DNBD3_CLIENT_SOURCE_FILES})
+target_link_libraries(dnbd3-client dnbd3-version dnbd3-build dnbd3-shared)
+install(TARGETS dnbd3-client RUNTIME DESTINATION bin
+ COMPONENT kernel) # the client binary is installed as part of the "kernel" component
+
+add_linter(dnbd3-client-lint "${DNBD3_CLIENT_SOURCE_FILES}") # NOTE(review): add_linter/add_linter_fix are presumably provided by cmake/Lint.cmake - confirm
+add_linter_fix(dnbd3-client-lint-fix "${DNBD3_CLIENT_SOURCE_FILES}")
diff --git a/src/client/client.c b/src/client/client.c
index 37f0558..0cf222e 100644
--- a/src/client/client.c
+++ b/src/client/client.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,9 +18,10 @@
*
*/
-#include "../clientconfig.h"
-#include "../types.h"
-#include "../version.h"
+#include <dnbd3/config/client.h>
+#include <dnbd3/types.h>
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
#include <stdio.h>
#include <stdlib.h>
@@ -33,19 +34,19 @@
#include <arpa/inet.h>
#include <string.h>
#include <sys/stat.h>
+#include <sys/socket.h>
#include <sys/un.h>
#include <errno.h>
-#define SOCK_PATH "/var/run/dnbd3.socket"
+#define SOCK_PATH "/run/dnbd3.socket"
#define SOCK_BUFFER 1000
#define DEV_LEN 15
#define MAX_DEVS 50
-
+#define TMP_STR_LEN 100
static int openDevices[MAX_DEVS];
-static const char *optString = "f:h:i:r:d:a:cs:HV?k";
+static const char *optString = "h:i:r:d:a:cs:SA:R:HV?k";
static const struct option longOpts[] = {
- { "file", required_argument, NULL, 'f' },
{ "host", required_argument, NULL, 'h' },
{ "image", required_argument, NULL, 'i' },
{ "rid", required_argument, NULL, 'r' },
@@ -53,8 +54,9 @@ static const struct option longOpts[] = {
{ "ahead", required_argument, NULL, 'a' },
{ "close", no_argument, NULL, 'c' },
{ "switch", required_argument, NULL, 's' },
- { "add", required_argument, NULL, 'adds' },
- { "remove", required_argument, NULL, 'rems' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "add", required_argument, NULL, 'A' },
+ { "remove", required_argument, NULL, 'R' },
{ "help", no_argument, NULL, 'H' },
{ "version", no_argument, NULL, 'V' },
{ "daemon", no_argument, NULL, 'D' },
@@ -66,9 +68,9 @@ static const struct option longOpts[] = {
static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg);
static void dnbd3_client_daemon();
-static void dnbd3_daemon_action(int client, int argc, char **argv);
+static void dnbd3_daemon_action(int client, int uid, int argc, char **argv);
static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host);
-static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead);
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead, const bool doLearnNewServers);
static int dnbd3_daemon_send(int argc, char **argv);
static void dnbd3_print_help(char *argv_0);
static void dnbd3_print_version();
@@ -84,11 +86,11 @@ static char host_to_string(const dnbd3_host_t *host, char *target, size_t target
if ( targetlen < 10 ) return false;
if ( host->type == HOST_IP6 ) {
*target++ = '[';
- inet_ntop( AF_INET6, host->addr, target, targetlen - 10 );
+ inet_ntop( AF_INET6, host->addr, target, (socklen_t)targetlen - 10 );
target += strlen( target );
*target++ = ']';
} else if ( host->type == HOST_IP4 ) {
- inet_ntop( AF_INET, host->addr, target, targetlen - 8 );
+ inet_ntop( AF_INET, host->addr, target, (socklen_t)targetlen - 8 );
target += strlen( target );
} else {
snprintf( target, targetlen, "<?addrtype=%d>", (int)host->type );
@@ -135,8 +137,9 @@ static char parse_address(char *string, dnbd3_host_t *host)
// Scan for port
char *portpos = NULL, *ptr = string;
while ( *ptr ) {
- if ( *ptr == ':' )
- portpos = ptr;
+ if ( *ptr == ':' ) {
+ portpos = ptr;
+ }
++ptr;
}
if ( portpos == NULL ) return 0; // No port in string
@@ -192,33 +195,77 @@ static int dnbd3_get_ip(char *hostname, dnbd3_host_t *host)
return true;
}
+/*
+ * Parses the space separated host list in hosts_str, resolves every entry via
+ * dnbd3_get_ip() and stores the results in hosts (at most hosts_len entries;
+ * surplus entries are ignored). Returns the number of hosts stored, or 0
+ * (false) if the list is empty or any entry is too long or cannot be resolved.
+ */
+static int dnbd3_get_resolved_hosts(char *hosts_str, dnbd3_host_t *hosts, const size_t hosts_len)
+{
+	const char *token = hosts_str;
+	char host_str[TMP_STR_LEN];
+	size_t count = 0;
+
+	while ( count < hosts_len ) {
+		/* skip separators; stop at the end of the list (this also tolerates trailing spaces) */
+		token += strspn( token, " " );
+		if ( *token == '\0' )
+			break;
+
+		/* copy the current token into a NUL-terminated buffer; never resolve a truncated name */
+		const size_t token_len = strcspn( token, " " );
+		if ( token_len >= sizeof(host_str) ) {
+			fprintf( stderr, "Host name too long: %.*s\n", (int)token_len, token );
+			return false;
+		}
+		memcpy( host_str, token, token_len );
+		host_str[token_len] = '\0';
+
+		if ( !dnbd3_get_ip( host_str, &hosts[count] ) )
+			return false;
+
+		count++;
+		token += token_len;
+	}
+
+	return (int)count;
+}
+
int main(int argc, char *argv[])
{
char *dev = NULL;
char host[50];
int action = -1;
+ bool learnNewServers = true;
+ int active_device_num = 0;
- dnbd3_ioctl_t msg;
- memset( &msg, 0, sizeof(dnbd3_ioctl_t) );
- msg.len = (uint16_t)sizeof(dnbd3_ioctl_t);
+ dnbd3_ioctl_t msg = { .len = (uint16_t)sizeof(msg) };
+ msg.hosts_num = 0;
msg.read_ahead_kb = DEFAULT_READ_AHEAD_KB;
- msg.host.port = htons( PORT );
- msg.host.type = 0;
msg.imgname = NULL;
- msg.use_server_provided_alts = true;
int opt = 0;
int longIndex = 0;
+	if ( geteuid() == 0 && getuid() != 0 ) { // legacy setuid-root invocation: warn, then drop back to the real user
+		fprintf( stderr, "Warning! %s is a setuid binary. This is deprecated and not needed anymore.\n", argv[0] );
+		fprintf( stderr, "Switching back to user %d\n", (int)getuid() );
+		if ( setgid( getgid() ) != 0 || setuid( getuid() ) != 0 ) { // group first: it cannot be changed once root is gone
+			perror( "Dropping privileges failed" ); // never continue as root if the drop did not take effect
+			exit( EXIT_FAILURE );
+		}
+	}
+
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
while ( opt != -1 ) {
switch ( opt ) {
- case 'f':
- break;
case 'h':
- if ( !dnbd3_get_ip( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+ msg.hosts_num = (uint8_t)dnbd3_get_resolved_hosts( optarg, msg.hosts, MAX_HOSTS_PER_IOCTL );
+ if ( !msg.hosts_num )
+ exit( EXIT_FAILURE );
break;
case 'i':
action = IOCTL_OPEN;
@@ -238,25 +285,34 @@ int main(int argc, char *argv[])
action = IOCTL_CLOSE;
break;
case 's':
- dnbd3_get_ip( optarg, &msg.host );
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_SWITCH;
break;
- case 'adds':
- dnbd3_get_ip( optarg, &msg.host );
+ case 'S':
+ learnNewServers = false;
+ break;
+ case 'A':
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_ADD_SRV;
break;
- case 'rems':
- dnbd3_get_ip( optarg, &msg.host );
+ case 'R':
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_REM_SRV;
break;
case 'H':
dnbd3_print_help( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case 'V':
- dnbd3_print_version();
+ dnbd3_print_version( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case '?':
dnbd3_print_help( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case 'D':
dnbd3_client_daemon();
@@ -265,6 +321,14 @@ int main(int argc, char *argv[])
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
}
+ /* abort if sticky mode is set and image will not be opened */
+ if ( !learnNewServers && action != IOCTL_OPEN ) {
+ printf( "ERROR: sticky mode can only be set if image will be opened.\n" );
+ exit( EXIT_FAILURE );
+ }
+
+ msg.use_server_provided_alts = learnNewServers;
+
// See if socket exists, if so, try to send to daemon
struct stat st;
if ( stat( SOCK_PATH, &st ) == 0 ) {
@@ -275,39 +339,37 @@ int main(int argc, char *argv[])
// Direct requests
- // In case the client was invoked as a suid binary, change uid back to original user
- // when being used for direct ioctl, so that the device's permissions are taken into account
- if ( geteuid() == 0 ) {
- setgid( getgid() );
- setuid( getuid() );
- }
-
- host_to_string( &msg.host, host, 50 );
-
// close device
- if ( action == IOCTL_CLOSE && msg.host.type == 0 && dev && (msg.imgname == NULL )) {
+ if ( action == IOCTL_CLOSE && msg.hosts_num == 0 && dev && (msg.imgname == NULL )) {
printf( "INFO: Closing device %s\n", dev );
- if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) ) exit( EXIT_SUCCESS );
+ if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) == 0 ) exit( EXIT_SUCCESS );
printf( "Couldn't close device.\n" );
exit( EXIT_FAILURE );
}
// switch host
- if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.host.type != 0 && dev && (msg.imgname == NULL )) {
+ if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.hosts_num == 1 && dev && (msg.imgname == NULL )) {
+ host_to_string( &msg.hosts[0], host, 50 );
if ( action == IOCTL_SWITCH ) printf( "INFO: Switching device %s to %s\n", dev, host );
if ( action == IOCTL_ADD_SRV ) printf( "INFO: %s: adding %s\n", dev, host );
if ( action == IOCTL_REM_SRV ) printf( "INFO: %s: removing %s\n", dev, host );
- if ( dnbd3_ioctl( dev, action, &msg ) ) exit( EXIT_SUCCESS );
+ if ( dnbd3_ioctl( dev, action, &msg ) == 0 ) exit( EXIT_SUCCESS );
printf( "Failed! Maybe the device is not connected?\n" );
exit( EXIT_FAILURE );
}
// connect
- if ( action == IOCTL_OPEN && msg.host.type != 0 && dev && (msg.imgname != NULL )) {
- printf( "INFO: Connecting device %s to %s for image %s\n", dev, host, msg.imgname );
- if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) exit( EXIT_SUCCESS );
- printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
- exit( EXIT_FAILURE );
+ if ( action == IOCTL_OPEN && msg.hosts_num > 0 && dev && (msg.imgname != NULL )) {
+ printf( "INFO: Connecting device %s for image %s\n", dev, msg.imgname );
+ active_device_num = dnbd3_ioctl( dev, IOCTL_OPEN, &msg );
+ if ( active_device_num >= 0 ) {
+ host_to_string( &msg.hosts[active_device_num], host, 50 );
+ printf( "INFO: Device %s for image %s is connected to server %s\n", dev, msg.imgname, host);
+ exit( EXIT_SUCCESS );
+ } else {
+ printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
+ exit( EXIT_FAILURE );
+ }
}
dnbd3_print_help( argv[0] );
@@ -317,17 +379,19 @@ int main(int argc, char *argv[])
static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg)
{
const int fd = open( dev, O_WRONLY );
- if ( fd < 0 ) {
- printf( "open() for %s failed.\n", dev );
- return false;
+ if ( fd == -1 ) {
+ perror( "open() failed" );
+ return -ENODEV;
+ }
+ if ( msg != NULL && msg->imgname != NULL ) {
+ msg->imgnamelen = (uint16_t)strlen( msg->imgname );
}
- if ( msg != NULL && msg->imgname != NULL ) msg->imgnamelen = (uint16_t)strlen( msg->imgname );
const int ret = ioctl( fd, command, msg );
if ( ret < 0 ) {
- printf( "ioctl() failed.\n" );
+ perror( "ioctl() failed" );
}
close( fd );
- return ret >= 0;
+ return ret;
}
static void dnbd3_client_daemon()
@@ -338,11 +402,8 @@ static void dnbd3_client_daemon()
struct timeval tv;
int done, ret, len;
socklen_t socklen;
-
- if ( geteuid() != 0 ) {
- printf( "Only root can run the dnbd3-client in daemon mode!\n" );
- exit( 1 );
- }
+ struct ucred ucred;
+ int fdTest;
if ( (listener = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) {
perror( "socket" );
@@ -356,12 +417,21 @@ static void dnbd3_client_daemon()
perror( "bind" );
exit( 1 );
}
- chmod( addrLocal.sun_path, 0600 );
+ fchmod( listener, 0666 );
+ chmod( SOCK_PATH, 0666 );
if ( listen( listener, 5 ) == -1 ) {
perror( "listen" );
+ unlink( addrLocal.sun_path );
exit( 1 );
}
+ fdTest = open( "/dev/dnbd0", O_RDWR );
+ if ( fdTest == -1 ) {
+ perror( "Opening /dev/dnbd0 failed. Daemon will probably not work" );
+ } else {
+ close( fdTest );
+ }
+
memset( openDevices, -1, sizeof(openDevices) );
for (;;) {
@@ -372,6 +442,14 @@ static void dnbd3_client_daemon()
continue;
}
+ socklen = sizeof(ucred);
+ if ( getsockopt( client, SOL_SOCKET, SO_PEERCRED, &ucred, &socklen ) == -1 ) {
+ perror( "Could not get credentials of connection" );
+ close( client );
+ continue;
+ }
+ printf("Call from user %d\n", (int)ucred.uid );
+
tv.tv_sec = 1;
tv.tv_usec = 0;
setsockopt( client, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) );
@@ -398,27 +476,28 @@ static void dnbd3_client_daemon()
}
if ( pos >= end ) break;
argv[argc++] = pos;
- printf("Arg %d: '%s'\n", argc, pos);
+ //printf("Arg %d: '%s'\n", argc, pos);
while ( *pos != '\0' ) { // This will always be in bounds because of -4 above
if ( ++pos >= end ) break;
}
}
- dnbd3_daemon_action( client, argc, argv );
+ dnbd3_daemon_action( client, (int)ucred.uid, argc, argv );
}
close( client );
}
}
-static void dnbd3_daemon_action(int client, int argc, char **argv)
+static void dnbd3_daemon_action(int client, int uid, int argc, char **argv)
{
int opt = 0;
int longIndex = 0;
char *host = NULL, *image = NULL, *device = NULL;
- int rid = 0, uid = 0, killMe = false, ahead = 512;
+ int rid = 0, killMe = false, ahead = 512;
int len;
int action = -1;
const char *actionName = NULL;
+ bool learnNewServers = true;
optind = 1;
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
@@ -439,18 +518,18 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
case 'r':
rid = atoi( optarg );
break;
- case 'U':
- uid = atoi( optarg );
- break;
case 'c':
action = IOCTL_CLOSE;
actionName = "Close";
break;
- case 'adds':
+ case 'S':
+ learnNewServers = false;
+ break;
+ case 'A':
action = IOCTL_ADD_SRV;
actionName = "Add Server";
break;
- case 'rems':
+ case 'R':
action = IOCTL_REM_SRV;
actionName = "Remove Server";
break;
@@ -465,14 +544,14 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
}
if ( killMe ) {
- if ( uid != 0 ) {
+ if ( uid != geteuid() ) {
printf( "Ignoring kill request by user %d\n", uid );
close( client );
return;
}
printf( "Received kill request; exiting.\n" );
- close( client );
unlink( SOCK_PATH );
+ close( client );
exit( 0 );
}
@@ -486,7 +565,7 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
return;
}
if ( action == IOCTL_OPEN && host != NULL && image != NULL && rid >= 0 ) {
- device = dnbd3_daemon_open( uid, host, image, rid, ahead );
+ device = dnbd3_daemon_open( uid, host, image, rid, ahead, learnNewServers);
if ( device != NULL ) {
len = strlen( device );
send( client, &len, sizeof(len), 0 );
@@ -509,11 +588,9 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
} else {
index = atoi( device );
}
- dnbd3_ioctl_t msg;
- memset( &msg, 0, sizeof(msg) );
- msg.len = (uint16_t)sizeof(msg);
+ dnbd3_ioctl_t msg = { .len = (uint16_t)sizeof(msg) };
if ( host != NULL ) {
- dnbd3_get_ip( host, &msg.host );
+ dnbd3_get_ip( host, &msg.hosts[0] );
}
if ( index < 0 || index >= MAX_DEVS ) {
printf( "%s request with invalid device id %d\n", actionName, index );
@@ -528,7 +605,7 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
printf( "%s: User %d cannot access %s owned by %d\n", actionName, uid, dev, openDevices[index] );
return false;
}
- if ( dnbd3_ioctl( dev, action, &msg ) ) {
+ if ( dnbd3_ioctl( dev, action, &msg ) == 0 ) {
printf( "%s request for device %s of user %d successful\n", actionName, dev, uid );
openDevices[index] = -1;
return true;
@@ -537,23 +614,26 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
return false;
}
-static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead)
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead, const bool doLearnNewServers)
{
int i, sameUser = 0;
struct stat st;
static char dev[DEV_LEN];
printf( "Opening a device for %s on %s\n", image, host );
// Check number of open devices
- for (i = 0; i < MAX_DEVS; ++i) {
- if ( openDevices[i] == uid ) sameUser++;
- }
- if ( sameUser > 1 ) {
- printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
- return NULL ;
+ if ( uid != 0 ) {
+ for ( i = 0; i < MAX_DEVS; ++i ) {
+ if ( openDevices[i] == uid ) sameUser++;
+ }
+ if ( sameUser > 1 ) {
+ printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
+ return NULL;
+ }
}
// Find free device
- for (i = 0; i < MAX_DEVS; ++i) {
- if ( openDevices[i] != -1 ) continue;
+ for ( i = 0; i < MAX_DEVS; ++i ) {
+ if ( openDevices[i] != -1 )
+ continue;
snprintf( dev, DEV_LEN, "/dev/dnbd%d", i );
if ( stat( dev, &st ) == -1 ) {
break;
@@ -561,16 +641,16 @@ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int re
// Open
dnbd3_ioctl_t msg;
msg.len = (uint16_t)sizeof(msg);
- if ( !dnbd3_get_ip( host, &msg.host ) ) {
+ if ( !dnbd3_get_ip( host, &msg.hosts[0] ) ) {
printf( "Cannot parse host address %s\n", host );
return NULL ;
}
msg.imgname = image;
msg.imgnamelen = strlen( image );
msg.rid = rid;
- msg.use_server_provided_alts = true;
+ msg.use_server_provided_alts = doLearnNewServers;
msg.read_ahead_kb = readAhead;
- if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) {
+ if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) >= 0 ) {
openDevices[i] = uid;
printf( "Device %s now occupied by %d\n", dev, uid );
return dev;
@@ -584,7 +664,6 @@ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int re
static int dnbd3_daemon_send(int argc, char **argv)
{
- const int uid = getuid();
int s, i, len;
struct sockaddr_un remote;
char buffer[SOCK_BUFFER];
@@ -604,7 +683,6 @@ static int dnbd3_daemon_send(int argc, char **argv)
// (Re)build argument string into a single one, arguments separated by null chars
char *pos = buffer;
char *end = buffer + SOCK_BUFFER;
- pos += snprintf( pos, end - pos, "--user%c%d", (int)'\0', uid ) + 1;
for (i = 1; i < argc && pos < end; ++i) {
pos += snprintf( pos, end - pos, "%s", argv[i] ) + 1;
}
@@ -643,28 +721,28 @@ static int dnbd3_daemon_send(int argc, char **argv)
static void dnbd3_print_help(char *argv_0)
{
- printf( "Version: %s\n\n", VERSION_STRING );
- printf( "\nUsage: %s\n"
- "\t-h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n", argv_0 );
- printf( "Start the DNBD3 client.\n" );
- //printf("-f or --file \t\t Configuration file (default /etc/dnbd3-client.conf)\n");
- printf( "-h or --host \t\t Host running dnbd3-server.\n" );
+ printf( "Usage: %s\n", argv_0 );
+ printf( " -h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n" );
+ printf( "Start the DNBD3 client.\n\n" );
+ printf( "-h or --host \t\t List of space separated hosts to use.\n" );
printf( "-i or --image \t\t Image name of exported image.\n" );
printf( "-r or --rid \t\t Release-ID of exported image (default 0, latest).\n" );
printf( "-d or --device \t\t DNBD3 device name.\n" );
printf( "-a or --ahead \t\t Read ahead in KByte (default %i).\n", DEFAULT_READ_AHEAD_KB );
printf( "-c or --close \t\t Disconnect and close device.\n" );
printf( "-s or --switch \t\t Switch dnbd3-server on device (DEBUG).\n" );
+ printf( "-S or --sticky \t\t Use only servers from command line (no learning from servers)\n" );
+ printf( "-A or --add \t\t Add given dnbd3-server on device.\n");
+ printf( "-R or --remove \t\t Remove given dnbd3-server on device.\n");
printf( "-H or --help \t\t Show this help text and quit.\n" );
printf( "-V or --version \t Show version and quit.\n\n" );
- printf( "\t--daemon \t Run as helper daemon\n" );
- printf( "\t--kill \t Kill running helper daemon\n" );
+ printf( " --daemon \t\t Run as helper daemon\n" );
+ printf( " --kill \t\t Kill running helper daemon\n\n" );
printf( "The helper daemon makes it possible for normal users to connect dnbd3 devices.\n" );
- printf( "The client binary needs to be a setuid program for this to work!\n\n" );
}
void dnbd3_print_version()
{
- printf( "Version: %s\n", VERSION_STRING );
- exit( EXIT_SUCCESS );
+ printf( "dnbd3-client version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
}
diff --git a/src/clientconfig.h b/src/clientconfig.h
deleted file mode 100644
index f35f673..0000000
--- a/src/clientconfig.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _CLIENTCONFIG_H_
-#define _CLIENTCONFIG_H_
-
-// Which is the minimum protocol version the client expects from the server
-#define MIN_SUPPORTED_SERVER 2
-
-// in seconds if not stated otherwise (MS = milliseconds)
-#define SOCKET_TIMEOUT_CLIENT_DATA 2
-#define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1
-
-#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
-#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
-#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
-// This must be a power of two:
-#define RTT_BLOCK_SIZE 4096
-
-#define STARTUP_MODE_DURATION 30
-// Interval of several repeating tasks (in seconds)
-#define TIMER_INTERVAL_PROBE_STARTUP 4
-#define TIMER_INTERVAL_PROBE_NORMAL 22
-#define TIMER_INTERVAL_PROBE_PANIC 2
-#define TIMER_INTERVAL_KEEPALIVE_PACKET 6
-
-// Expect a keepalive response every X seconds
-#define SOCKET_KEEPALIVE_TIMEOUT 8
-
-// Number of unsuccessful alt_server probes before read errors are reported to the block layer
-// (ALL servers will be probed this many times)
-// Set to 0 to disable
-#define PROBE_COUNT_TIMEOUT 0
-
-// ++ Kernel module ++
-#define DEFAULT_READ_AHEAD_KB 512
-#define NUMBER_DEVICES 8
-
-#endif
diff --git a/src/fuse/CMakeLists.txt b/src/fuse/CMakeLists.txt
new file mode 100644
index 0000000..be062f0
--- /dev/null
+++ b/src/fuse/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.10)
+
+# declare the dnbd3-fuse project; only a C toolchain is required
+project(dnbd3-fuse
+ LANGUAGES C)
+
+find_package(Fuse REQUIRED) # supplies FUSE_INCLUDE_DIRS and FUSE_LIBRARIES used below
+
+# find the C11 atomics header and the atomic library required by dnbd3-fuse
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# define _GNU_SOURCE for everything built in this directory to enable enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_FUSE_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+set(DNBD3_FUSE_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/connection.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h) # headers are only passed to the lint targets, not to add_executable
+
+add_executable(dnbd3-fuse ${DNBD3_FUSE_SOURCE_FILES})
+target_include_directories(dnbd3-fuse PRIVATE ${FUSE_INCLUDE_DIRS})
+target_link_libraries(dnbd3-fuse dnbd3-build dnbd3-version dnbd3-shared ${FUSE_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS dnbd3-fuse RUNTIME DESTINATION bin
+ COMPONENT fuse) # installed as part of the "fuse" component
+
+add_linter(dnbd3-fuse-lint "${DNBD3_FUSE_SOURCE_FILES}" "${DNBD3_FUSE_HEADER_FILES}") # NOTE(review): add_linter/add_linter_fix are presumably provided by cmake/Lint.cmake - confirm
+add_linter_fix(dnbd3-fuse-lint-fix "${DNBD3_FUSE_SOURCE_FILES}" "${DNBD3_FUSE_HEADER_FILES}")
diff --git a/src/fuse/connection.c b/src/fuse/connection.c
index fc9f05b..e760d98 100644
--- a/src/fuse/connection.c
+++ b/src/fuse/connection.c
@@ -1,19 +1,21 @@
#include "connection.h"
#include "helper.h"
-#include "../clientconfig.h"
-#include "../shared/protocol.h"
-#include "../shared/fdsignal.h"
-#include "../shared/sockhelper.h"
-#include "../shared/log.h"
+#include <dnbd3/config/client.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <stdio.h>
+#include <stdatomic.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <inttypes.h>
+#include <signal.h>
/* Constants */
static const size_t SHORTBUF = 100;
@@ -30,9 +32,18 @@ static const int FAIL_BACKOFF_START_COUNT = 8;
static bool connectionInitDone = false;
static bool threadInitDone = false;
static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER;
-static bool keepRunning = true;
+// For multi-threaded concurrent connection during init
+static pthread_mutex_t mutexCondConn = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t condConn = PTHREAD_COND_INITIALIZER;
+static atomic_int pendingConnectionAttempts = 0;
+// Shutdown flag
+atomic_bool keepRunning = true;
+// Should we learn new alt-servers from servers we connect to?
static bool learnNewServers;
+static pthread_t tidReceiver;
+static pthread_t tidBackground;
+
// List of pending requests
static struct {
dnbd3_async_t *head;
@@ -55,15 +66,21 @@ static struct {
ticks startupTime;
} connection;
+// Arguments handed to one connectThread() worker; allocated by connection_init()
+struct conn_data {
+ // strdup()ed copy of the image name that was passed to connection_init()
+ char *lowerImage;
+ // Requested revision id; 0 means any rid returned by the server is accepted
+ uint16_t rid;
+ // Index into altservers[] of the server this worker should connect to
+ int idx;
+};
+
// Known alt servers
typedef struct _alt_server {
dnbd3_host_t host;
- int consecutiveFails;
- int rtt;
+ atomic_int consecutiveFails;
+ atomic_int rtt;
int rtts[RTT_COUNT];
int rttIndex;
- int bestCount;
- int liveRtt;
+ atomic_int bestCount;
+ atomic_int liveRtt;
} alt_server_t;
static dnbd3_server_entry_t newservers[MAX_ALTS];
@@ -83,136 +100,232 @@ static pthread_rwlock_t altLock = PTHREAD_RWLOCK_INITIALIZER;
/* Static methods */
-static void* connection_receiveThreadMain(void *sock);
-static void* connection_backgroundThread(void *something);
+static void* connectThread(void * data);
+static void* connection_receiveThreadMain( void *sock );
+static void* connection_backgroundThread( void *something );
-static void addAltServers();
+static bool hasAltServer( dnbd3_host_t *host );
+static void addAltServers( void );
static void sortAltServers();
static void probeAltServers();
-static void switchConnection(int sockFd, alt_server_t *srv);
-static void requestAltServers();
-static bool throwDataAway(int sockFd, uint32_t amount);
+static void switchConnection( int sockFd, alt_server_t *srv );
+static void requestAltServers( void );
+static bool sendAltServerRequest( int sock );
+static bool throwDataAway( int sockFd, uint32_t amount );
+
+static void enqueueRequest( dnbd3_async_t *request );
+static dnbd3_async_t* removeRequest( dnbd3_async_t *request );
-static void enqueueRequest(dnbd3_async_t *request);
-static dnbd3_async_t* removeRequest(dnbd3_async_t *request);
+static void blockSignals();
-bool connection_init(const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew)
+bool connection_init( const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew )
{
- int sock = -1;
char host[SHORTBUF];
- size_t hlen;
- serialized_buffer_t buffer;
- uint16_t remoteVersion, remoteRid;
- char *remoteName;
- uint64_t remoteSize;
- struct sockaddr_storage sa;
- socklen_t salen;
- poll_list_t *cons = sock_newPollList();
+ dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+ const char *current, *end;
+ int altIndex = 0;
timing_setBase();
pthread_mutex_lock( &mutexInit );
- if ( !connectionInitDone && keepRunning ) {
- dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
- const char *current, *end;
- int altIndex = 0;
- learnNewServers = doLearnNew;
- memset( altservers, 0, sizeof altservers );
- connection.sockFd = -1;
- current = hosts;
- do {
- // Get next host from string
- while ( *current == ' ' ) current++;
- end = strchr( current, ' ' );
- size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
- if ( len > SHORTBUF ) len = SHORTBUF;
- snprintf( host, len, "%s", current );
- int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
- for ( int i = 0; i < newHosts; ++i ) {
- if ( altIndex >= MAX_ALTS )
- break;
- altservers[altIndex].host = tempHosts[i];
- altIndex += 1;
- }
- current = end + 1;
- } while ( end != NULL && altIndex < MAX_ALTS );
- logadd( LOG_INFO, "Got %d servers from init call", altIndex );
- // Connect
- for ( int i = 0; i < altIndex + 5; ++i ) {
- if ( i >= altIndex ) {
- // Additional iteration - no corresponding slot in altservers, this
- // is just so we can make a final calls with longer timeout
- sock = sock_multiConnect( cons, NULL, 400, 1000 );
- if ( sock == -2 ) {
- logadd( LOG_ERROR, "Could not connect to any host" );
- sock = -1;
- break;
- }
- } else {
- if ( altservers[i].host.type == 0 )
- continue;
- // Try to connect - 100ms timeout
- sock = sock_multiConnect( cons, &altservers[i].host, 100, 1000 );
- }
- if ( sock == -2 || sock == -1 )
- continue;
- salen = sizeof(sa);
- if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
- logadd( LOG_ERROR, "getpeername on successful connection failed!? (errno=%d)", errno );
- close( sock );
- sock = -1;
+ if ( connectionInitDone ) {
+ pthread_mutex_unlock( &mutexInit );
+ return false;
+ }
+ learnNewServers = doLearnNew;
+ memset( altservers, 0, sizeof altservers );
+ connection.sockFd = -1;
+ current = hosts;
+ pthread_attr_t threadAttrs;
+ pthread_attr_init( &threadAttrs );
+ pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
+ // Resolve all hosts and connect
+ pthread_mutex_lock( &mutexCondConn );
+ do {
+ // Get next host from string
+ while ( *current == ' ' || *current == '\t' || *current == '\n' ) {
+ current++;
+ }
+ end = current;
+ while ( *end != ' ' && *end != '\t' && *end != '\n' && *end != '\0' ) {
+ end++;
+ }
+ if ( end == current )
+ break;
+ size_t len = (size_t)( end - current ) + 1;
+ if ( len > SHORTBUF ) {
+ len = SHORTBUF;
+ }
+ snprintf( host, len, "%s", current );
+ int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+ for ( int i = 0; i < newHosts; ++i ) {
+ if ( altIndex >= MAX_ALTS )
+ break;
+ if ( hasAltServer( &tempHosts[i] ) )
continue;
- }
- hlen = sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
- logadd( LOG_INFO, "Connected to %.*s", (int)hlen, host );
- if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
- logadd( LOG_ERROR, "Could not send select image" );
- } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
- logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
- } else if ( rid != 0 && rid != remoteRid ) {
- logadd( LOG_ERROR, "rid mismatch (want: %d, got: %d)", (int)rid, (int)remoteRid );
- } else {
- logadd( LOG_INFO, "Requested: '%s:%d'", lowerImage, (int)rid );
- logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
- sock_setTimeout( sock, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
- image.name = strdup( remoteName );
- image.rid = remoteRid;
- image.size = remoteSize;
- if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &connection.currentServer ) ) {
- logadd( LOG_ERROR, "sockaddr to dnbd3_host_t failed!?" );
- connection.currentServer.type = 0;
+ altservers[altIndex].host = tempHosts[i];
+ // Start thread for async connect if not connected yet
+ atomic_thread_fence( memory_order_acquire );
+ if ( connection.sockFd == -1 ) {
+ pthread_t t;
+ struct conn_data *cd = malloc( sizeof(*cd) );
+ // We cannot be sure a thread is taking longer than this function runs, so better copy
+ cd->lowerImage = strdup( lowerImage );
+ cd->rid = rid;
+ cd->idx = altIndex;
+ pendingConnectionAttempts++;
+ if ( ( errno = pthread_create( &t, &threadAttrs, &connectThread, (void*)cd ) ) != 0 ) {
+ pendingConnectionAttempts--;
+ logadd( LOG_ERROR, "Could not create connect thread %d, errno=%d", cd->idx, errno );
+ free( cd->lowerImage );
+ free( cd );
+ continue;
}
- connection.panicSignal = signal_new();
- timing_get( &connection.startupTime );
- connection.sockFd = sock;
- requests.head = NULL;
- requests.tail = NULL;
- requestAltServers();
- break;
- }
- // Failed
- if ( sock != -1 ) {
- close( sock );
- sock = -1;
+ struct timespec timeout;
+ clock_gettime( CLOCK_REALTIME, &timeout );
+ timeout.tv_nsec += 200 * 1000 * 1000;
+ if ( timeout.tv_nsec >= 1000 * 1000 * 1000 ) {
+ timeout.tv_nsec -= 1000 * 1000 * 1000;
+ timeout.tv_sec += 1;
+ }
+ pthread_cond_timedwait( &condConn, &mutexCondConn, &timeout );
}
+ // End async connect
+ altIndex += 1;
}
- if ( sock != -1 ) {
- connectionInitDone = true;
- }
+ current = end + 1;
+ } while ( *end != '\0' && altIndex < MAX_ALTS );
+ logadd( LOG_INFO, "Got %d servers from init call", altIndex );
+ // Wait a maximum of five seconds if we're not connected yet
+ if ( connection.sockFd == -1 && pendingConnectionAttempts > 0 ) {
+ struct timespec end;
+ clock_gettime( CLOCK_REALTIME, &end );
+ end.tv_sec += 5;
+ pthread_cond_timedwait( &condConn, &mutexCondConn, &end );
+ }
+ pthread_mutex_unlock( &mutexCondConn );
+ pthread_attr_destroy( &threadAttrs );
+ if ( connection.sockFd != -1 ) {
+ connectionInitDone = true;
}
pthread_mutex_unlock( &mutexInit );
- sock_destroyPollList( cons );
- return sock != -1;
+ return connectionInitDone;
+}
+
+/**
+ * Worker for the parallel connection attempts started by connection_init().
+ * Connects to altservers[cd->idx], performs the select-image handshake and,
+ * if no other worker has published a connection in the meantime, stores the
+ * socket and image metadata in the global 'image'/'connection' state and
+ * signals condConn so connection_init() can continue.
+ *
+ * @param data heap-allocated struct conn_data; this function takes ownership
+ *             and frees it (including lowerImage) on every path
+ * @return always NULL
+ */
+static void* connectThread(void * data)
+{
+	struct conn_data *cd = (struct conn_data*)data;
+	int idx = cd->idx;
+	int sock = -1;
+	serialized_buffer_t buffer;
+	uint16_t remoteVersion, remoteRid;
+	char *remoteName;
+	uint64_t remoteSize;
+	char host[SHORTBUF];
+	struct sockaddr_storage sa;
+	socklen_t salen = sizeof(sa);
+
+	if ( idx < 0 || idx >= MAX_ALTS || altservers[idx].host.type == 0 ) {
+		logadd( LOG_ERROR, "BUG: Index out of range, or empty server in connect thread (%d)", idx );
+		goto bailout;
+	}
+
+	sock_printHost( &altservers[idx].host, host, sizeof(host) );
+	logadd( LOG_INFO, "Trying to connect to %s", host );
+	sock = sock_connect( &altservers[idx].host, 1500, SOCKET_TIMEOUT_RECV * 1000 );
+	if ( sock == -1 ) {
+		logadd( LOG_INFO, "[%s] Connection failed", host );
+		goto bailout;
+	}
+
+	salen = sizeof( sa );
+	if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
+		logadd( LOG_ERROR, "[%s] getpeername on successful connection failed!? (errno=%d)", host, errno );
+		goto bailout;
+	}
+	// Cheap unlocked check: give up early if another worker already won
+	atomic_thread_fence( memory_order_acquire );
+	if ( connection.sockFd != -1 )
+		goto bailout;
+
+	sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
+	logadd( LOG_INFO, "[%s] Connected", host );
+	if ( !dnbd3_select_image( sock, cd->lowerImage, cd->rid, 0 ) ) {
+		logadd( LOG_ERROR, "[%s] Could not send select image", host );
+		goto bailout;
+	}
+
+	if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+		logadd( LOG_ERROR, "[%s] Could not read select image reply (%d)", host, errno );
+		goto bailout;
+	}
+	atomic_thread_fence( memory_order_acquire );
+	if ( connection.sockFd != -1 )
+		goto bailout;
+
+	if ( cd->rid != 0 && cd->rid != remoteRid ) {
+		logadd( LOG_ERROR, "[%s] rid mismatch (want: %d, got: %d)",
+				host, (int)cd->rid, (int)remoteRid );
+		goto bailout;
+	}
+	// Seems we got a winner
+	pthread_mutex_lock( &mutexCondConn );
+	if ( connection.sockFd != -1 || connectionInitDone ) {
+		pthread_mutex_unlock( &mutexCondConn );
+		logadd( LOG_INFO, "[%s] Raced by other connection", host );
+		goto bailout;
+	}
+	// Request the alt server list BEFORE touching any global state. If sending fails we
+	// have to release mutexCondConn before jumping to bailout (bailout may lock it again,
+	// which would self-deadlock), and we must not leave half-initialized image/connection
+	// data behind for the next worker to overwrite.
+	if ( learnNewServers && !sendAltServerRequest( sock ) ) {
+		pthread_mutex_unlock( &mutexCondConn );
+		logadd( LOG_ERROR, "[%s] Could not send alt server request", host );
+		goto bailout;
+	}
+	logadd( LOG_INFO, "Requested: '%s:%d'", cd->lowerImage, (int)cd->rid );
+	logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
+	image.name = strdup( remoteName );
+	image.rid = remoteRid;
+	image.size = remoteSize;
+	connection.currentServer = altservers[idx].host;
+	connection.panicSignal = signal_new();
+	timing_get( &connection.startupTime );
+	requests.head = NULL;
+	requests.tail = NULL;
+	// Everything good, tell main connect function
+	connection.sockFd = sock;
+	atomic_thread_fence( memory_order_release );
+	pendingConnectionAttempts--;
+	if ( idx != 0 ) {
+		// Make server first in list - enough to swap host, other data has not changed yet
+		lock_write( &altLock );
+		dnbd3_host_t tmp = altservers[idx].host;
+		altservers[idx].host = altservers[0].host;
+		altservers[0].host = tmp;
+		unlock_rw( &altLock );
+	}
+	pthread_cond_signal( &condConn );
+	pthread_mutex_unlock( &mutexCondConn );
+	// The socket now belongs to 'connection'; only the argument block is ours to release
+	free( cd->lowerImage );
+	free( cd );
+	return NULL;
+
+bailout:
+	if ( sock != -1 ) {
+		close( sock );
+	}
+	free( cd->lowerImage );
+	free( cd );
+	// Last one has to wake up main thread, which is waiting for up to 5 seconds for
+	// any connect thread to succeed. If none succeeded, there is no point in waiting
+	// any longer.
+	if ( --pendingConnectionAttempts == 0 ) {
+		pthread_mutex_lock( &mutexCondConn );
+		pthread_cond_signal( &condConn );
+		pthread_mutex_unlock( &mutexCondConn );
+	}
+	return NULL;
+}
bool connection_initThreads()
{
pthread_mutex_lock( &mutexInit );
- if ( !keepRunning || !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
+ if ( !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
pthread_mutex_unlock( &mutexInit );
return false;
}
bool success = true;
- pthread_t thread;
threadInitDone = true;
logadd( LOG_DEBUG1, "Initializing stuff" );
if ( pthread_mutex_init( &connection.sendMutex, NULL ) != 0
@@ -220,10 +333,10 @@ bool connection_initThreads()
logadd( LOG_ERROR, "Mutex or spinlock init failure" );
success = false;
} else {
- if ( pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)connection.sockFd ) != 0 ) {
+ if ( pthread_create( &tidReceiver, NULL, &connection_receiveThreadMain, ( void* )(size_t)connection.sockFd ) != 0 ) {
logadd( LOG_ERROR, "Could not create receive thread" );
success = false;
- } else if ( pthread_create( &thread, NULL, &connection_backgroundThread, NULL ) != 0 ) {
+ } else if ( pthread_create( &tidBackground, NULL, &connection_backgroundThread, NULL ) != 0 ) {
logadd( LOG_ERROR, "Could not create background thread" );
success = false;
}
@@ -241,7 +354,7 @@ uint64_t connection_getImageSize()
return image.size;
}
-bool connection_read(dnbd3_async_t *request)
+bool connection_read( dnbd3_async_t *request )
{
if ( !connectionInitDone ) return false;
pthread_mutex_lock( &connection.sendMutex );
@@ -250,9 +363,7 @@ bool connection_read(dnbd3_async_t *request)
if ( !dnbd3_get_block( connection.sockFd, request->offset, request->length, (uint64_t)request, 0 ) ) {
shutdown( connection.sockFd, SHUT_RDWR );
connection.sockFd = -1;
- pthread_mutex_unlock( &connection.sendMutex );
signal_call( connection.panicSignal );
- return true;
}
}
pthread_mutex_unlock( &connection.sendMutex );
@@ -261,24 +372,36 @@ bool connection_read(dnbd3_async_t *request)
void connection_close()
{
- if ( keepRunning ) {
- logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
- }
+ static bool signalled = false;
+ logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
pthread_mutex_lock( &mutexInit );
keepRunning = false;
+ if ( threadInitDone && !signalled ) {
+ signalled = true;
+ pthread_kill( tidReceiver, SIGHUP );
+ pthread_kill( tidBackground, SIGHUP );
+ }
+ pthread_mutex_unlock( &mutexInit );
if ( !connectionInitDone ) {
- pthread_mutex_unlock( &mutexInit );
return;
}
- pthread_mutex_unlock( &mutexInit );
pthread_mutex_lock( &connection.sendMutex );
if ( connection.sockFd != -1 ) {
+ logadd( LOG_DEBUG1, "Shutting down socket..." );
shutdown( connection.sockFd, SHUT_RDWR );
}
pthread_mutex_unlock( &connection.sendMutex );
}
-size_t connection_printStats(char *buffer, const size_t len)
+/**
+ * Wait for the receiver and the background thread to terminate.
+ * Does nothing if threadInitDone is not set.
+ */
+void connection_join()
+{
+	if ( threadInitDone ) {
+		pthread_join( tidReceiver, NULL );
+		pthread_join( tidBackground, NULL );
+	}
+}
+
+
+size_t connection_printStats( char *buffer, const size_t len )
{
int ret;
size_t remaining = len;
@@ -308,7 +431,7 @@ size_t connection_printStats(char *buffer, const size_t len)
*buffer++ = ' ';
}
const size_t addrlen = sock_printHost( &altservers[i].host, buffer, remaining );
- remaining -= (addrlen + 1); // For space or * above
+ remaining -= ( addrlen + 1 ); // For space or * above
buffer += addrlen;
if ( remaining < 3 )
break;
@@ -324,7 +447,7 @@ size_t connection_printStats(char *buffer, const size_t len)
width += 3;
}
ret = snprintf( buffer, remaining, "% *d %s Unreachable:% 5d BestCount:% 5d Live:% 5dµs\n",
- width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
+ width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
if ( ret < 0 ) {
ret = 0;
}
@@ -339,23 +462,23 @@ size_t connection_printStats(char *buffer, const size_t len)
return len - remaining;
}
-static void* connection_receiveThreadMain(void *sockPtr)
+static void* connection_receiveThreadMain( void *sockPtr )
{
int sockFd = (int)(size_t)sockPtr;
dnbd3_reply_t reply;
- pthread_detach( pthread_self() );
+ blockSignals();
while ( keepRunning ) {
int ret;
do {
ret = dnbd3_read_reply( sockFd, &reply, true );
+ if ( !keepRunning ) goto fail;
if ( ret == REPLY_OK ) break;
} while ( ret == REPLY_INTR || ret == REPLY_AGAIN );
if ( ret != REPLY_OK ) {
logadd( LOG_DEBUG1, "Error receiving reply on receiveThread (%d)", ret );
goto fail;
}
-
if ( reply.cmd == CMD_GET_BLOCK ) {
// Get block reply. find matching request
dnbd3_async_t *request = removeRequest( (dnbd3_async_t*)reply.handle );
@@ -390,10 +513,8 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
unlock_rw( &altLock );
}
- // Success, wake up caller
- request->success = true;
- request->finished = true;
- signal_call( request->signal );
+ fuse_reply_buf( request->fuse_req, request->buffer, request->length );
+ free( request );
}
} else if ( reply.cmd == CMD_GET_SERVERS ) {
// List of known alt servers
@@ -416,7 +537,6 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
}
}
- logadd( LOG_DEBUG1, "Aus der Schleife rausgeflogen! ARRRRRRRRRR" );
fail:;
// Make sure noone is trying to use the socket for sending by locking,
pthread_mutex_lock( &connection.sendMutex );
@@ -424,7 +544,9 @@ fail:;
// as someone could have established a new connection already
if ( connection.sockFd == sockFd ) {
connection.sockFd = -1;
- signal_call( connection.panicSignal );
+ if ( keepRunning ) {
+ signal_call( connection.panicSignal );
+ }
}
pthread_mutex_unlock( &connection.sendMutex );
// As we're the only reader, it's safe to close the socket now
@@ -432,11 +554,12 @@ fail:;
return NULL;
}
-static void* connection_backgroundThread(void *something UNUSED)
+static void* connection_backgroundThread( void *something UNUSED )
{
ticks nextKeepalive;
ticks nextRttCheck;
+ blockSignals();
timing_get( &nextKeepalive );
nextRttCheck = nextKeepalive;
while ( keepRunning ) {
@@ -446,6 +569,8 @@ static void* connection_backgroundThread(void *something UNUSED)
uint32_t wt2 = timing_diffMs( &now, &nextRttCheck );
if ( wt1 > 0 && wt2 > 0 ) {
int waitRes = signal_wait( connection.panicSignal, (int)MIN( wt1, wt2 ) + 1 );
+ if ( !keepRunning )
+ break;
if ( waitRes == SIGNAL_ERROR ) {
logadd( LOG_WARNING, "Error waiting on signal in background thread! Errno = %d", errno );
}
@@ -460,20 +585,20 @@ static void* connection_backgroundThread(void *something UNUSED)
}
sortAltServers();
probeAltServers();
- if ( panic || timing_diff( &connection.startupTime, &now ) <= STARTUP_MODE_DURATION ) {
+ if ( panic || timing_diff( &connection.startupTime, &now ) <= DISCOVER_STARTUP_PHASE_COUNT * TIMER_INTERVAL_PROBE_STARTUP ) {
timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_STARTUP );
} else {
- timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_NORMAL );
+ timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_MAX );
}
}
// Send keepalive packet
if ( timing_reachedPrecise( &nextKeepalive, &now ) ) {
pthread_mutex_lock( &connection.sendMutex );
if ( connection.sockFd != -1 ) {
- dnbd3_request_t request;
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_KEEPALIVE;
- request.handle = request.offset = request.size = 0;
+ dnbd3_request_t request = {
+ .magic = dnbd3_packet_magic,
+ .cmd = CMD_KEEPALIVE,
+ };
fixup_request( request );
ssize_t ret = sock_sendAll( connection.sockFd, &request, sizeof request, 2 );
if ( (size_t)ret != sizeof request ) {
@@ -483,7 +608,7 @@ static void* connection_backgroundThread(void *something UNUSED)
}
}
pthread_mutex_unlock( &connection.sendMutex );
- timing_addSeconds( &nextKeepalive, &now, TIMER_INTERVAL_KEEPALIVE_PACKET );
+ timing_addSeconds( &nextKeepalive, &now, KEEPALIVE_INTERVAL );
}
}
return NULL;
@@ -491,7 +616,20 @@ static void* connection_backgroundThread(void *something UNUSED)
// Private quick helpers
-static void addAltServers()
+/**
+ * Tell whether the given host is already contained in the list of altservers.
+ * This function does not take 'altLock' itself; the caller has to hold it.
+ */
+static bool hasAltServer( dnbd3_host_t *host )
+{
+	bool known = false;
+	// Stop scanning as soon as one slot matches
+	for ( int slot = 0; !known && slot < MAX_ALTS; ++slot ) {
+		known = isSameAddress( host, &altservers[slot].host );
+	}
+	return known;
+}
+
+static void addAltServers( void )
{
pthread_mutex_lock( &newAltLock );
lock_write( &altLock );
@@ -499,11 +637,8 @@ static void addAltServers()
if ( newservers[nIdx].host.type == 0 )
continue;
// Got a new alt server, see if it's already known
- for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
- if ( isSameAddress( &newservers[nIdx].host, &altservers[eIdx].host ) ) {
- goto skip_server;
- }
- }
+ if ( hasAltServer( &newservers[nIdx].host ) )
+ continue;
// Not known yet, add - find free slot
int slot = -1;
for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
@@ -528,9 +663,8 @@ static void addAltServers()
altservers[slot].host = newservers[nIdx].host;
altservers[slot].liveRtt = 0;
}
-skip_server:;
}
- memset( newservers, 0, sizeof(newservers) );
+ memset( newservers, 0, sizeof( newservers ) );
unlock_rw( &altLock );
pthread_mutex_unlock( &newAltLock );
}
@@ -604,7 +738,7 @@ static void probeAltServers()
pthread_spin_lock( &requests.lock );
if ( requests.head != NULL ) {
if ( !panic && current != NULL ) {
- const int maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
+ const uint64_t maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
dnbd3_async_t *iterator;
for ( iterator = requests.head; iterator != NULL; iterator = iterator->next ) {
// A request with measurement tag is pending
@@ -626,7 +760,7 @@ static void probeAltServers()
}
lock_read( &altLock );
- for ( int altIndex = 0; altIndex < (panic ? MAX_ALTS : MAX_ALTS_ACTIVE); ++altIndex ) {
+ for ( int altIndex = 0; altIndex < ( panic ? MAX_ALTS : MAX_ALTS_ACTIVE ); ++altIndex ) {
alt_server_t * const srv = &altservers[altIndex];
if ( srv->host.type == 0 )
continue;
@@ -634,65 +768,65 @@ static void probeAltServers()
&& rand() % srv->consecutiveFails >= FAIL_BACKOFF_START_COUNT ) {
continue;
}
+ srv->rttIndex += 1;
if ( srv->rttIndex >= RTT_COUNT ) {
srv->rttIndex = 0;
- } else {
- srv->rttIndex += 1;
}
// Probe
+ char hstr[100];
+ sock_printHost( &srv->host, hstr, 100 );
ticks start;
timing_get( &start );
errno = 0;
int sock = sock_connect( &srv->host, panic ? 1000 : 333, 1000 );
if ( sock == -1 ) {
- logadd( LOG_DEBUG1, "Could not connect for probing. errno = %d", errno );
+ logadd( LOG_DEBUG1, "%s probe: Could not connect for probing. errno = %d", hstr, errno );
goto fail;
}
if ( !dnbd3_select_image( sock, image.name, image.rid, 0 ) ) {
- logadd( LOG_DEBUG1, "probe: select_image failed" );
+ logadd( LOG_DEBUG1, "%s probe: select_image failed (sock=%d, errno=%d)", hstr, sock, errno );
goto fail;
}
- if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize )) {
- logadd( LOG_DEBUG1, "probe: select image reply failed" );
+ if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize ) ) {
+ logadd( LOG_DEBUG1, "%s probe: select image reply failed", hstr );
goto fail;
}
if ( remoteProto < MIN_SUPPORTED_SERVER ) {
- logadd( LOG_WARNING, "Unsupported remote version (local: %d, remote: %d)", (int)PROTOCOL_VERSION, (int)remoteProto );
+ logadd( LOG_WARNING, "%s probe: Unsupported remote version (local: %d, remote: %d)", hstr, (int)PROTOCOL_VERSION, (int)remoteProto );
srv->consecutiveFails += 10;
goto fail;
}
if ( remoteRid != image.rid || strcmp( remoteName, image.name ) != 0 ) {
- logadd( LOG_WARNING, "Remote rid or name mismatch (got '%s')", remoteName );
+ logadd( LOG_WARNING, "%s probe: Remote rid or name mismatch (got '%s')", hstr, remoteName );
srv->consecutiveFails += 10;
goto fail;
}
if ( !dnbd3_get_block( sock, testOffset, testLength, 0, 0 ) ) {
- logadd( LOG_DEBUG1, "-> block request fail" );
+ logadd( LOG_DEBUG1, "%s probe: -> block request fail", hstr );
goto fail;
}
int a = 111;
- if ( !(a = dnbd3_get_reply( sock, &reply )) || reply.size != testLength ) {
- logadd( LOG_DEBUG1, "<- get block reply fail %d %d", a, (int)reply.size );
+ if ( !( a = dnbd3_get_reply( sock, &reply ) ) || reply.size != testLength ) {
+ logadd( LOG_DEBUG1, "%s probe: <- get block reply fail %d %d", hstr, a, (int)reply.size );
goto fail;
}
if ( request != NULL && removeRequest( request ) != NULL ) {
// Request successfully removed from queue
const ssize_t ret = sock_recv( sock, request->buffer, request->length );
if ( ret != (ssize_t)request->length ) {
- logadd( LOG_DEBUG1, "[RTT] receiving payload for a block reply failed" );
+ logadd( LOG_DEBUG1, "%s probe: receiving payload for a block reply failed", hstr );
// Failure, add to queue again
connection_read( request );
goto fail;
}
- // Success, wake up caller
- logadd( LOG_DEBUG1, "[RTT] Successful direct probe" );
- request->success = true;
- request->finished = true;
- signal_call( request->signal );
+ // Success, reply to fuse
+ fuse_reply_buf( request->fuse_req, request->buffer, request->length );
+ free( request );
+ logadd( LOG_DEBUG1, "%s probe: Successful direct probe", hstr );
} else {
// Wasn't a request that's in our request queue
if ( !throwDataAway( sock, testLength ) ) {
- logadd( LOG_DEBUG1, "<- get block reply payload fail" );
+ logadd( LOG_DEBUG1, "%s probe: <- get block reply payload fail", hstr );
goto fail;
}
}
@@ -701,7 +835,7 @@ static void probeAltServers()
// Panic mode? Just switch to server
if ( panic ) {
unlock_rw( &altLock );
- switchConnection( sock, srv );
+ if ( keepRunning ) switchConnection( sock, srv );
return;
}
// Non-panic mode:
@@ -733,7 +867,8 @@ static void probeAltServers()
close( sock );
}
continue;
-fail:;
+fail:
+ ;
if ( sock != -1 ) {
close( sock );
}
@@ -774,7 +909,7 @@ fail:;
// Regular logic: Apply threshold when considering switch
if ( !doSwitch && current != NULL ) {
doSwitch = current->rtt > best->rtt + RTT_ABSOLUTE_THRESHOLD
- || RTT_THRESHOLD_FACTOR(current->rtt) > best->rtt + 1000;
+ || RTT_THRESHOLD_FACTOR( current->rtt ) > best->rtt + 1000;
}
}
// Switch if a better server was found
@@ -796,11 +931,10 @@ fail:;
}
}
-static void switchConnection(int sockFd, alt_server_t *srv)
+static void switchConnection( int sockFd, alt_server_t *srv )
{
- pthread_t thread;
struct sockaddr_storage addr;
- socklen_t addrLen = sizeof(addr);
+ socklen_t addrLen = sizeof( addr );
char message[200] = "Connection switched to ";
const size_t len = strlen( message );
int ret;
@@ -829,9 +963,10 @@ static void switchConnection(int sockFd, alt_server_t *srv)
signal_call( connection.panicSignal );
return;
}
+ pthread_detach( tidReceiver );
timing_get( &connection.startupTime );
- pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)sockFd );
- sock_printable( (struct sockaddr*)&addr, sizeof(addr), message + len, sizeof(message) - len );
+ pthread_create( &tidReceiver, NULL, &connection_receiveThreadMain, ( void* )(size_t)sockFd );
+ sock_printable( (struct sockaddr*)&addr, sizeof( addr ), message + len, sizeof( message ) - len );
logadd( LOG_INFO, "%s", message );
// resend queue
if ( queue != NULL ) {
@@ -855,22 +990,28 @@ static void switchConnection(int sockFd, alt_server_t *srv)
/**
* Does not lock, so get the sendMutex first!
*/
-static void requestAltServers()
+static void requestAltServers( void )
{
if ( connection.sockFd == -1 || !learnNewServers )
return;
- dnbd3_request_t request = { 0 };
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_GET_SERVERS;
- fixup_request( request );
- if ( sock_sendAll( connection.sockFd, &request, sizeof(request), 2 ) != (ssize_t)sizeof(request) ) {
- logadd( LOG_WARNING, "Connection failed while requesting alt server list" );
+ if ( !sendAltServerRequest( connection.sockFd ) ) {
+ logadd( LOG_WARNING, "Main connection failed while requesting alt server list" );
shutdown( connection.sockFd, SHUT_RDWR );
connection.sockFd = -1;
}
}
-static bool throwDataAway(int sockFd, uint32_t amount)
+/**
+ * Send a CMD_GET_SERVERS request on the given socket.
+ * Returns true only if the complete request structure was sent.
+ */
+static bool sendAltServerRequest( int sock )
+{
+	dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = CMD_GET_SERVERS };
+	fixup_request( request );
+	const ssize_t sent = sock_sendAll( sock, &request, sizeof( request ), 2 );
+	return sent == (ssize_t)sizeof( request );
+}
+
+
+static bool throwDataAway( int sockFd, uint32_t amount )
{
size_t done = 0;
char tempBuffer[SHORTBUF];
@@ -883,11 +1024,9 @@ static bool throwDataAway(int sockFd, uint32_t amount)
return true;
}
-static void enqueueRequest(dnbd3_async_t *request)
+static void enqueueRequest( dnbd3_async_t *request )
{
request->next = NULL;
- request->finished = false;
- request->success = false;
//logadd( LOG_DEBUG2, "Queue: %p @ %s : %d", request, file, line );
// Measure latency and add to switch formula
timing_get( &request->time );
@@ -901,7 +1040,7 @@ static void enqueueRequest(dnbd3_async_t *request)
pthread_spin_unlock( &requests.lock );
}
-static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
+static dnbd3_async_t* removeRequest( dnbd3_async_t *request )
{
pthread_spin_lock( &requests.lock );
//logadd( LOG_DEBUG2, "Remov: %p @ %s : %d", request, file, line );
@@ -925,3 +1064,20 @@ static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
return iterator;
}
+/**
+ * Adjust the signal mask of the calling thread: block SIGUSR1, SIGUSR2,
+ * SIGPIPE, SIGINT and SIGTERM, and make sure SIGHUP is not blocked
+ * (connection_close() sends SIGHUP to the receiver and background threads).
+ * Failures are logged as warnings only.
+ */
+static void blockSignals()
+{
+	sigset_t sigmask;
+	// pthread_sigmask() returns 0 on success and an error number on failure - never -1.
+	// With a NULL new set the 'how' argument is ignored; this only queries the current mask.
+	if ( pthread_sigmask( 0, NULL, &sigmask ) != 0 ) {
+		logadd( LOG_WARNING, "Cannot get current sigmask of thread" );
+		sigemptyset( &sigmask );
+	}
+	sigaddset( &sigmask, SIGUSR1 );
+	sigaddset( &sigmask, SIGUSR2 );
+	sigaddset( &sigmask, SIGPIPE );
+	sigaddset( &sigmask, SIGINT );
+	sigaddset( &sigmask, SIGTERM );
+	sigdelset( &sigmask, SIGHUP );
+	if ( pthread_sigmask( SIG_SETMASK, &sigmask, NULL ) != 0 ) {
+		logadd( LOG_WARNING, "Cannot set sigmask of thread" );
+	}
+}
diff --git a/src/fuse/connection.h b/src/fuse/connection.h
index cae554c..b869ac6 100644
--- a/src/fuse/connection.h
+++ b/src/fuse/connection.h
@@ -1,35 +1,41 @@
#ifndef _CONNECTION_H_
#define _CONNECTION_H_
-#include "../shared/fdsignal.h"
-#include "../shared/timing.h"
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/timing.h>
+#include <stdatomic.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
+#include <sys/socket.h>
+#define FUSE_USE_VERSION 30
+#include <fuse_lowlevel.h>
+
+extern atomic_bool keepRunning;
struct _dnbd3_async;
typedef struct _dnbd3_async {
struct _dnbd3_async *next; // Next in this linked list (provate field, not set by caller)
- dnbd3_signal_t* signal; // Used to signal the caller
- char* buffer; // Caller-provided buffer to be filled
ticks time; // When request was put on wire, 0 if not measuring
uint64_t offset;
uint32_t length;
- bool finished; // Will be set to true if the request has been handled
- bool success; // Will be set to true if the request succeeded
+ fuse_req_t fuse_req;
+ char buffer[]; // Must be last member!
} dnbd3_async_t;
-bool connection_init(const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers);
+bool connection_init( const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers );
bool connection_initThreads();
uint64_t connection_getImageSize();
-bool connection_read(dnbd3_async_t *request);
+bool connection_read( dnbd3_async_t *request );
void connection_close();
-size_t connection_printStats(char *buffer, const size_t len);
+void connection_join();
+
+size_t connection_printStats( char *buffer, const size_t len );
#endif /* CONNECTION_H_ */
diff --git a/src/fuse/helper.c b/src/fuse/helper.c
index d81b08f..f54073b 100644
--- a/src/fuse/helper.c
+++ b/src/fuse/helper.c
@@ -18,8 +18,8 @@ void printLog( log_info *info )
}
//rewind(file);
- fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", ( uint64_t )( info->imageSize/ ( 1024ll*1024ll ) ) );
- fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", ( uint64_t )( info->receivedBytes/ ( 1024ll*1024ll ) ) );
+ fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", (uint64_t)( info->imageSize/ ( 1024ll*1024ll ) ) );
+ fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", (uint64_t)( info->receivedBytes/ ( 1024ll*1024ll ) ) );
fprintf( logFile, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount );
fprintf( logFile, "Blocksize: 4KiB\n\n" );
fprintf( logFile, "Block access count:\n" );
@@ -29,7 +29,7 @@ void printLog( log_info *info )
if ( i % 50 == 0 ) {
fprintf( logFile, "\n" );
}
- fprintf( logFile, "%i ", ( int ) info->blockRequestCount[i] );
+ fprintf( logFile, "%i ", (int) info->blockRequestCount[i] );
}
fprintf( logFile, "\n" );
fclose( logFile );
diff --git a/src/fuse/helper.h b/src/fuse/helper.h
index 9e5d127..b1fa513 100644
--- a/src/fuse/helper.h
+++ b/src/fuse/helper.h
@@ -1,7 +1,7 @@
#ifndef IMAGEHELPER_H
#define IMAGEHELPER_H
-#include "../types.h"
+#include <dnbd3/types.h>
#include <netdb.h>
#include <stdbool.h>
@@ -18,18 +18,18 @@ typedef struct log_info {
-void printLog(log_info *info);
+void printLog( log_info *info );
-int connect_to_server(char *server_adress, int port);
+int connect_to_server( char *server_adress, int port );
-static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+static inline bool isSameAddressPort( const dnbd3_host_t * const a, const dnbd3_host_t * const b )
{
- return (a->type == b->type) && (a->port == b->port) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) ));
+ return ( a->type == b->type ) && ( a->port == b->port ) && ( 0 == memcmp( a->addr, b->addr, ( a->type == HOST_IP4 ? 4 : 16 ) ) );
}
-static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+static inline bool isSameAddress( const dnbd3_host_t * const a, const dnbd3_host_t * const b )
{
- return (a->type == b->type) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) ));
+ return ( a->type == b->type ) && ( 0 == memcmp( a->addr, b->addr, ( a->type == HOST_IP4 ? 4 : 16 ) ) );
}
#endif
diff --git a/src/fuse/main.c b/src/fuse/main.c
index 1a5643c..e06f6e8 100644
--- a/src/fuse/main.c
+++ b/src/fuse/main.c
@@ -5,18 +5,26 @@
* See the file COPYING.
*
* Changed by Stephan Schwaer
+ * FUSE lowlevel by Alan Reichert
* */
#include "connection.h"
#include "helper.h"
-#include "../shared/protocol.h"
-#include "../shared/log.h"
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/log.h>
#define FUSE_USE_VERSION 30
-#include <fuse.h>
+#include <dnbd3/config.h>
+#include <fuse_lowlevel.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
/* for printing uint */
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
@@ -27,8 +35,14 @@
#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
-static const char * const IMAGE_PATH = "/img";
-static const char * const STATS_PATH = "/status";
+#define INO_ROOT (1)
+#define INO_STATS (2)
+#define INO_IMAGE (3)
+
+static const char *IMAGE_NAME = "img";
+static const char *STATS_NAME = "status";
+
+static struct fuse_session *_fuseSession = NULL;
static uint64_t imageSize;
/* Debug/Benchmark variables */
@@ -36,237 +50,243 @@ static bool useDebug = false;
static log_info logInfo;
static struct timespec startupTime;
static uid_t owner;
-static bool keepRunning = true;
-static void (*fuse_sigIntHandler)(int) = NULL;
-static void (*fuse_sigTermHandler)(int) = NULL;
-static struct fuse_operations dnbd3_fuse_no_operations;
-
-#define SIGPOOLSIZE 6
-static pthread_spinlock_t sigLock;
-static dnbd3_signal_t *signalPool[SIGPOOLSIZE];
-static dnbd3_signal_t **sigEnd = signalPool + SIGPOOLSIZE;
-static void signalInit()
-{
- pthread_spin_init( &sigLock, PTHREAD_PROCESS_PRIVATE );
- for ( size_t i = 0; i < SIGPOOLSIZE; ++i ) {
- signalPool[i] = NULL;
- }
-}
-static inline dnbd3_signal_t *signalGet()
-{
- pthread_spin_lock( &sigLock );
- for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) {
- if ( *it != NULL ) {
- dnbd3_signal_t *ret = *it;
- *it = NULL;
- pthread_spin_unlock( &sigLock );
- return ret;
- }
- }
- pthread_spin_unlock( &sigLock );
- return signal_newBlocking();
-}
-static inline void signalPut(dnbd3_signal_t *signal)
-{
- pthread_spin_lock( &sigLock );
- for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) {
- if ( *it == NULL ) {
- *it = signal;
- pthread_spin_unlock( &sigLock );
- return;
- }
- }
- pthread_spin_unlock( &sigLock );
- signal_close( signal );
-}
-static int image_getattr(const char *path, struct stat *stbuf)
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize );
+static void fillStatsFile( fuse_req_t req, size_t size, off_t offset );
+static void image_destroy( void *private_data );
+static void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+static void image_ll_init( void *userdata, struct fuse_conn_info *conn );
+static void image_ll_lookup( fuse_req_t req, fuse_ino_t parent, const char *name );
+static void image_ll_open( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+static void image_ll_readdir( fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi );
+static void image_ll_read( fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi );
+static int image_stat( fuse_ino_t ino, struct stat *stbuf );
+static void printUsage( char *argv0, int exitCode );
+static void printVersion();
+
+static int image_stat( fuse_ino_t ino, struct stat *stbuf )
{
- int res = 0;
- memset( stbuf, 0, sizeof( struct stat ) );
- stbuf->st_ctim = stbuf->st_atim = stbuf->st_mtim = startupTime;
- stbuf->st_uid = owner;
- if ( strcmp( path, "/" ) == 0 ) {
+ switch ( ino ) {
+ case INO_ROOT:
stbuf->st_mode = S_IFDIR | 0550;
stbuf->st_nlink = 2;
- } else if ( strcmp( path, IMAGE_PATH ) == 0 ) {
+ stbuf->st_mtim = startupTime;
+ break;
+ case INO_IMAGE:
stbuf->st_mode = S_IFREG | 0440;
stbuf->st_nlink = 1;
stbuf->st_size = imageSize;
- } else if ( strcmp( path, STATS_PATH ) == 0 ) {
+ stbuf->st_mtim = startupTime;
+ break;
+ case INO_STATS:
stbuf->st_mode = S_IFREG | 0440;
stbuf->st_nlink = 1;
stbuf->st_size = 4096;
clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+ break;
+ default:
+ return -1;
+ }
+ stbuf->st_ctim = stbuf->st_atim = startupTime;
+ stbuf->st_uid = owner;
+ stbuf->st_ino = ino;
+ return 0;
+}
+
+static void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi )
+{
+ struct stat stbuf = { 0 };
+ ( void ) fi;
+
+ if ( image_stat( ino, &stbuf ) == -1 ) {
+ fuse_reply_err( req, ENOENT );
} else {
- res = -ENOENT;
+ fuse_reply_attr( req, &stbuf, ino == INO_IMAGE ? 1200 : 1 ); // seconds validity timeout
}
- return res;
}
-static int image_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset UNUSED, struct fuse_file_info *fi UNUSED)
+static void image_ll_lookup( fuse_req_t req, fuse_ino_t parent, const char *name )
{
- if ( strcmp( path, "/" ) != 0 ) {
- return -ENOENT;
+ ( void )parent;
+
+ if ( strcmp( name, IMAGE_NAME ) == 0 || strcmp( name, STATS_NAME ) == 0 ) {
+ struct fuse_entry_param e = { 0 };
+ if ( strcmp( name, IMAGE_NAME ) == 0 ) {
+ e.ino = INO_IMAGE;
+ e.attr_timeout = e.entry_timeout = 1200;
+ } else {
+ e.ino = INO_STATS;
+ e.attr_timeout = e.entry_timeout = 0;
+ }
+ if ( image_stat( e.ino, &e.attr ) == 0 ) {
+ fuse_reply_entry( req, &e );
+ return;
+ }
}
- filler( buf, ".", NULL, 0 );
- filler( buf, "..", NULL, 0 );
- filler( buf, IMAGE_PATH + 1, NULL, 0 );
- filler( buf, STATS_PATH + 1, NULL, 0 );
- return 0;
+ fuse_reply_err( req, ENOENT );
}
-static int image_open(const char *path, struct fuse_file_info *fi)
+struct dirbuf {
+ char *p;
+ size_t size;
+};
+
+static void dirbuf_add( fuse_req_t req, struct dirbuf *b, const char *name, fuse_ino_t ino )
{
- if ( strcmp( path, IMAGE_PATH ) != 0 && strcmp( path, STATS_PATH ) != 0 ) {
- return -ENOENT;
+ struct stat stbuf = { .st_ino = ino };
+ size_t oldsize = b->size;
+ b->size += fuse_add_direntry( req, NULL, 0, name, NULL, 0 );
+ b->p = ( char * ) realloc( b->p, b->size );
+ fuse_add_direntry( req, b->p + oldsize, b->size - oldsize, name, &stbuf, b->size );
+ return;
+}
+
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize )
+{
+ if ( off >= 0 && off < (off_t)bufsize ) {
+ return fuse_reply_buf( req, buf + off, MIN( bufsize - off, maxsize ) );
}
- if ( ( fi->flags & 3 ) != O_RDONLY ) {
- return -EACCES;
+ return fuse_reply_buf( req, NULL, 0 );
+}
+
+static void image_ll_readdir( fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi )
+{
+ ( void ) fi;
+
+ if ( ino != INO_ROOT ) {
+ fuse_reply_err( req, ENOTDIR );
+ } else {
+ struct dirbuf b;
+ memset( &b, 0, sizeof( b ) );
+ dirbuf_add( req, &b, ".", INO_ROOT );
+ dirbuf_add( req, &b, "..", INO_ROOT );
+ dirbuf_add( req, &b, IMAGE_NAME, INO_IMAGE );
+ dirbuf_add( req, &b, STATS_NAME, INO_STATS );
+ reply_buf_limited( req, b.p, b.size, off, size );
+ free( b.p );
}
- return 0;
}
-static int fillStatsFile(char *buf, size_t size, off_t offset) {
- if ( offset == 0 ) {
- return (int)connection_printStats( buf, size );
+static void image_ll_open( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi )
+{
+ if ( ino != INO_IMAGE && ino != INO_STATS ) {
+ fuse_reply_err( req, EISDIR );
+ } else if ( ( fi->flags & 3 ) != O_RDONLY ) {
+ fuse_reply_err( req, EACCES );
+ } else {
+ // auto caching
+ fi->keep_cache = 1;
+ fuse_reply_open( req, fi );
}
+}
+
+static void fillStatsFile( fuse_req_t req, size_t size, off_t offset ) {
char buffer[4096];
int ret = (int)connection_printStats( buffer, sizeof buffer );
int len = MIN( ret - (int)offset, (int)size );
- if ( len == 0 )
- return 0;
if ( len < 0 ) {
- return -EOF;
+ fuse_reply_err( req, 0 );
+ return;
}
- memcpy( buf, buffer + offset, len );
- return len;
+ fuse_reply_buf( req, buffer + offset, len );
}
-static int image_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi UNUSED)
+static void image_ll_read( fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi )
{
- if ( size > __INT_MAX__ ) {
- // fuse docs say we MUST fill the buffer with exactly size bytes and return size,
- // otherwise the buffer will we padded with zeros. Since the return value is just
- // an int, we could not properly fulfill read requests > 2GB. Since there is no
- // mention of a guarantee that this will never happen, better add a safety check.
- // Way to go fuse.
- return -EIO;
- }
- if ( path[1] == STATS_PATH[1] ) {
- return fillStatsFile( buf, size, offset );
+ assert( ino == INO_STATS || ino == INO_IMAGE );
+
+ ( void )fi;
+
+ if ( ino == INO_STATS ) {
+ fillStatsFile( req, size, offset );
+ return;
}
if ( (uint64_t)offset >= imageSize ) {
- return 0;
+ fuse_reply_err( req, 0 );
+ return;
}
-
if ( offset + size > imageSize ) {
size = imageSize - offset;
}
+ if ( size == 0 || size > UINT32_MAX ) {
+ fuse_reply_err( req, 0 );
+ return;
+ }
if ( useDebug ) {
- /* count the requested blocks */
uint64_t startBlock = offset / ( 4096 );
const uint64_t endBlock = ( offset + size - 1 ) / ( 4096 );
- for ( ; startBlock <= endBlock; startBlock++ ) {
+ for ( ; startBlock <= endBlock; startBlock++ )
+ {
++logInfo.blockRequestCount[startBlock];
}
}
-
- dnbd3_async_t request;
- request.buffer = buf;
- request.length = (uint32_t)size;
- request.offset = offset;
- request.signal = signalGet();
-
- if ( !connection_read( &request ) ) {
- signalPut( request.signal );
- return -EINVAL;
- }
- while ( !request.finished ) {
- int ret = signal_wait( request.signal, 5000 );
- if ( !keepRunning ) {
- connection_close();
- break;
- }
- if ( ret < 0 ) {
- debugf( "fuse_read signal wait returned %d", ret );
- }
- }
- signalPut( request.signal );
- if ( request.success ) {
- return request.length;
- } else {
- return -EIO;
+ dnbd3_async_t *request = malloc( sizeof(dnbd3_async_t) + size );
+ request->length = (uint32_t)size;
+ request->offset = offset;
+ request->fuse_req = req;
+
+ if ( !connection_read( request ) ) {
+ fuse_reply_err( req, EIO );
+ free( request );
}
}
-static void image_sigHandler(int signum) {
- keepRunning = false;
- if ( signum == SIGINT && fuse_sigIntHandler != NULL ) {
- fuse_sigIntHandler(signum);
- }
- if ( signum == SIGTERM && fuse_sigTermHandler != NULL ) {
- fuse_sigTermHandler(signum);
- }
+static void noopSigHandler( int signum )
+{
+ (void)signum;
}
-static void* image_init(struct fuse_conn_info *conn UNUSED)
+static void image_ll_init( void *userdata, struct fuse_conn_info *conn )
{
+ ( void ) userdata;
+ ( void ) conn;
if ( !connection_initThreads() ) {
logadd( LOG_ERROR, "Could not initialize threads for dnbd3 connection, exiting..." );
- exit( EXIT_FAILURE );
+ if ( _fuseSession != NULL ) {
+ fuse_session_exit( _fuseSession );
+ }
}
- // Prepare our handler
- struct sigaction newHandler;
- memset( &newHandler, 0, sizeof(newHandler) );
- newHandler.sa_handler = &image_sigHandler;
- sigemptyset( &newHandler.sa_mask );
- struct sigaction oldHandler;
- // Retrieve old handlers when setting
- sigaction( SIGINT, &newHandler, &oldHandler );
- fuse_sigIntHandler = oldHandler.sa_handler;
- logadd( LOG_DEBUG1, "Previous SIGINT handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
- sigaction( SIGTERM, &newHandler, &oldHandler );
- fuse_sigTermHandler = oldHandler.sa_handler;
- logadd( LOG_DEBUG1, "Previous SIGTERM handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
- return NULL;
}
/* close the connection */
-static void image_destroy(void *private_data UNUSED)
+static void image_destroy( void *private_data UNUSED )
{
if ( useDebug ) {
printLog( &logInfo );
}
connection_close();
- return;
}
/* map the implemented fuse operations */
-static struct fuse_operations image_oper = {
- .getattr = image_getattr,
- .readdir = image_readdir,
- .open = image_open,
- .read = image_read,
- .init = image_init,
+static struct fuse_lowlevel_ops image_oper = {
+ .lookup = image_ll_lookup,
+ .getattr = image_ll_getattr,
+ .readdir = image_ll_readdir,
+ .open = image_ll_open,
+ .read = image_ll_read,
+ .init = image_ll_init,
.destroy = image_destroy,
};
static void printVersion()
{
char *arg[] = { "foo", "-V" };
- printf( "DNBD3-Fuse Version 1.2.3.4, protocol version %d\n", (int)PROTOCOL_VERSION );
- fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ printf( "dnbd3-fuse version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
+ printf( "Protocol version: %d\n", (int)PROTOCOL_VERSION );
+ struct fuse_args args = FUSE_ARGS_INIT( 2, arg );
+ fuse_parse_cmdline( &args, NULL, NULL, NULL );
exit( 0 );
}
-static void printUsage(char *argv0, int exitCode)
+static void printUsage( char *argv0, int exitCode )
{
char *arg[] = { argv0, "-h" };
- fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ struct fuse_args args = FUSE_ARGS_INIT( 2, arg );
+ fuse_parse_cmdline( &args, NULL, NULL, NULL );
printf( "\n" );
printf( "Usage: %s [--debug] [--option mountOpts] --host <serverAddress(es)> --image <imageName> [--rid revision] <mountPoint>\n", argv0 );
printf( "Or: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] <mountPoint>\n", argv0 );
@@ -284,19 +304,19 @@ static void printUsage(char *argv0, int exitCode)
static const char *optString = "dfHh:i:l:o:r:SsVv";
static const struct option longOpts[] = {
- { "debug", no_argument, NULL, 'd' },
- { "help", no_argument, NULL, 'H' },
- { "host", required_argument, NULL, 'h' },
- { "image", required_argument, NULL, 'i' },
- { "log", required_argument, NULL, 'l' },
- { "option", required_argument, NULL, 'o' },
- { "rid", required_argument, NULL, 'r' },
- { "sticky", no_argument, NULL, 'S' },
- { "version", no_argument, NULL, 'v' },
- { 0, 0, 0, 0 }
+ { "debug", no_argument, NULL, 'd' },
+ { "help", no_argument, NULL, 'H' },
+ { "host", required_argument, NULL, 'h' },
+ { "image", required_argument, NULL, 'i' },
+ { "log", required_argument, NULL, 'l' },
+ { "option", required_argument, NULL, 'o' },
+ { "rid", required_argument, NULL, 'r' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "version", no_argument, NULL, 'v' },
+ { 0, 0, 0, 0 }
};
-int main(int argc, char *argv[])
+int main( int argc, char *argv[] )
{
char *server_address = NULL;
char *image_Name = NULL;
@@ -306,6 +326,12 @@ int main(int argc, char *argv[])
int newArgc;
int opt, lidx;
bool learnNewServers = true;
+ bool single_thread = false;
+ struct fuse_chan *ch;
+ char *mountpoint;
+ int foreground = 0;
+
+ log_init();
if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
printUsage( argv[0], 0 );
@@ -316,9 +342,10 @@ int main(int argc, char *argv[])
log_setConsoleTimestamps( true );
log_setFileMask( 65535 );
- newArgv = calloc( argc + 10, sizeof(char*) );
+ newArgv = calloc( argc + 10, sizeof( char* ) );
newArgv[0] = argv[0];
newArgc = 1;
+
while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
switch ( opt ) {
case 'h':
@@ -328,7 +355,7 @@ int main(int argc, char *argv[])
image_Name = optarg;
break;
case 'r':
- rid = (uint16_t)atoi(optarg);
+ rid = (uint16_t)atoi( optarg );
break;
case 'o':
newArgv[newArgc++] = "-o";
@@ -357,15 +384,16 @@ int main(int argc, char *argv[])
case 'd':
useDebug = true;
newArgv[newArgc++] = "-d";
+ foreground = 1;
break;
case 's':
- newArgv[newArgc++] = "-s";
+ single_thread = true;
break;
case 'S':
learnNewServers = false;
break;
case 'f':
- newArgv[newArgc++] = "-f";
+ foreground = 1;
break;
default:
printUsage( argv[0], EXIT_FAILURE );
@@ -386,6 +414,17 @@ int main(int argc, char *argv[])
}
}
+ // Prepare our handler
+ struct sigaction newHandler;
+ memset( &newHandler, 0, sizeof( newHandler ) );
+ newHandler.sa_handler = &noopSigHandler;
+ sigemptyset( &newHandler.sa_mask );
+ sigaction( SIGHUP, &newHandler, NULL );
+ sigset_t sigmask;
+ sigemptyset( &sigmask );
+ sigaddset( &sigmask, SIGHUP );
+ pthread_sigmask( SIG_BLOCK, &sigmask, NULL );
+
if ( !connection_init( server_address, image_Name, rid, learnNewServers ) ) {
logadd( LOG_ERROR, "Could not connect to any server. Bye.\n" );
return EXIT_FAILURE;
@@ -404,17 +443,51 @@ int main(int argc, char *argv[])
// Since dnbd3 is always read only and the remote image will not change
newArgv[newArgc++] = "-o";
- newArgv[newArgc++] = "ro,auto_cache,default_permissions";
+ newArgv[newArgc++] = "ro,default_permissions";
// Mount point goes last
newArgv[newArgc++] = argv[optind];
- printf( "ImagePathName: %s\nFuseArgs:",IMAGE_PATH );
+ printf( "ImagePathName: /%s\nFuseArgs:", IMAGE_NAME );
for ( int i = 0; i < newArgc; ++i ) {
printf( " '%s'", newArgv[i] );
}
- putchar('\n');
+ putchar( '\n' );
clock_gettime( CLOCK_REALTIME, &startupTime );
owner = getuid();
- signalInit();
- return fuse_main( newArgc, newArgv, &image_oper, NULL );
+
+ // Fuse lowlevel loop
+ struct fuse_args args = FUSE_ARGS_INIT( newArgc, newArgv );
+ int fuse_err = 1;
+ if ( fuse_parse_cmdline( &args, &mountpoint, NULL, NULL ) == -1 ) {
+ logadd( LOG_ERROR, "FUSE: Parsing command line failed" );
+ } else if ( ( ch = fuse_mount( mountpoint, &args ) ) == NULL ) {
+ logadd( LOG_ERROR, "Mounting file system failed" );
+ } else {
+ _fuseSession = fuse_lowlevel_new( &args, &image_oper, sizeof( image_oper ), NULL );
+ if ( _fuseSession == NULL ) {
+ logadd( LOG_ERROR, "Could not initialize fuse session" );
+ } else {
+ if ( fuse_set_signal_handlers( _fuseSession ) == -1 ) {
+ logadd( LOG_ERROR, "Could not install fuse signal handlers" );
+ } else {
+ fuse_session_add_chan( _fuseSession, ch );
+ fuse_daemonize( foreground );
+ if ( single_thread ) {
+ fuse_err = fuse_session_loop( _fuseSession );
+ } else {
+ fuse_err = fuse_session_loop_mt( _fuseSession ); //MT produces errors (race conditions) in libfuse and didnt improve speed at all
+ }
+ fuse_remove_signal_handlers( _fuseSession );
+ fuse_session_remove_chan( ch );
+ }
+ fuse_session_destroy( _fuseSession );
+ _fuseSession = NULL;
+ }
+ fuse_unmount( mountpoint, ch );
+ }
+ fuse_opt_free_args( &args );
+ free( newArgv );
+ connection_join();
+ logadd( LOG_DEBUG1, "Terminating. FUSE REPLIED: %d\n", fuse_err );
+ return fuse_err;
}
diff --git a/src/fuse/serialize.c b/src/fuse/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/fuse/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/kernel/.clang-format b/src/kernel/.clang-format
new file mode 100644
index 0000000..c1fe2c6
--- /dev/null
+++ b/src/kernel/.clang-format
@@ -0,0 +1,552 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+# Taken from:
+# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
+# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
+# | sort | uniq
+ForEachMacros:
+ - 'apei_estatus_for_each_section'
+ - 'ata_for_each_dev'
+ - 'ata_for_each_link'
+ - '__ata_qc_for_each'
+ - 'ata_qc_for_each'
+ - 'ata_qc_for_each_raw'
+ - 'ata_qc_for_each_with_internal'
+ - 'ax25_for_each'
+ - 'ax25_uid_for_each'
+ - '__bio_for_each_bvec'
+ - 'bio_for_each_bvec'
+ - 'bio_for_each_bvec_all'
+ - 'bio_for_each_integrity_vec'
+ - '__bio_for_each_segment'
+ - 'bio_for_each_segment'
+ - 'bio_for_each_segment_all'
+ - 'bio_list_for_each'
+ - 'bip_for_each_vec'
+ - 'bitmap_for_each_clear_region'
+ - 'bitmap_for_each_set_region'
+ - 'blkg_for_each_descendant_post'
+ - 'blkg_for_each_descendant_pre'
+ - 'blk_queue_for_each_rl'
+ - 'bond_for_each_slave'
+ - 'bond_for_each_slave_rcu'
+ - 'bpf_for_each_spilled_reg'
+ - 'btree_for_each_safe128'
+ - 'btree_for_each_safe32'
+ - 'btree_for_each_safe64'
+ - 'btree_for_each_safel'
+ - 'card_for_each_dev'
+ - 'cgroup_taskset_for_each'
+ - 'cgroup_taskset_for_each_leader'
+ - 'cpufreq_for_each_entry'
+ - 'cpufreq_for_each_entry_idx'
+ - 'cpufreq_for_each_valid_entry'
+ - 'cpufreq_for_each_valid_entry_idx'
+ - 'css_for_each_child'
+ - 'css_for_each_descendant_post'
+ - 'css_for_each_descendant_pre'
+ - 'cxl_for_each_cmd'
+ - 'device_for_each_child_node'
+ - 'dma_fence_chain_for_each'
+ - 'do_for_each_ftrace_op'
+ - 'drm_atomic_crtc_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane_state'
+ - 'drm_atomic_for_each_plane_damage'
+ - 'drm_client_for_each_connector_iter'
+ - 'drm_client_for_each_modeset'
+ - 'drm_connector_for_each_possible_encoder'
+ - 'drm_for_each_bridge_in_chain'
+ - 'drm_for_each_connector_iter'
+ - 'drm_for_each_crtc'
+ - 'drm_for_each_crtc_reverse'
+ - 'drm_for_each_encoder'
+ - 'drm_for_each_encoder_mask'
+ - 'drm_for_each_fb'
+ - 'drm_for_each_legacy_plane'
+ - 'drm_for_each_plane'
+ - 'drm_for_each_plane_mask'
+ - 'drm_for_each_privobj'
+ - 'drm_mm_for_each_hole'
+ - 'drm_mm_for_each_node'
+ - 'drm_mm_for_each_node_in_range'
+ - 'drm_mm_for_each_node_safe'
+ - 'flow_action_for_each'
+ - 'for_each_active_dev_scope'
+ - 'for_each_active_drhd_unit'
+ - 'for_each_active_iommu'
+ - 'for_each_aggr_pgid'
+ - 'for_each_available_child_of_node'
+ - 'for_each_bio'
+ - 'for_each_board_func_rsrc'
+ - 'for_each_bvec'
+ - 'for_each_card_auxs'
+ - 'for_each_card_auxs_safe'
+ - 'for_each_card_components'
+ - 'for_each_card_dapms'
+ - 'for_each_card_pre_auxs'
+ - 'for_each_card_prelinks'
+ - 'for_each_card_rtds'
+ - 'for_each_card_rtds_safe'
+ - 'for_each_card_widgets'
+ - 'for_each_card_widgets_safe'
+ - 'for_each_cgroup_storage_type'
+ - 'for_each_child_of_node'
+ - 'for_each_clear_bit'
+ - 'for_each_clear_bit_from'
+ - 'for_each_cmsghdr'
+ - 'for_each_compatible_node'
+ - 'for_each_component_dais'
+ - 'for_each_component_dais_safe'
+ - 'for_each_comp_order'
+ - 'for_each_console'
+ - 'for_each_cpu'
+ - 'for_each_cpu_and'
+ - 'for_each_cpu_not'
+ - 'for_each_cpu_wrap'
+ - 'for_each_dapm_widgets'
+ - 'for_each_dev_addr'
+ - 'for_each_dev_scope'
+ - 'for_each_displayid_db'
+ - 'for_each_dma_cap_mask'
+ - 'for_each_dpcm_be'
+ - 'for_each_dpcm_be_rollback'
+ - 'for_each_dpcm_be_safe'
+ - 'for_each_dpcm_fe'
+ - 'for_each_drhd_unit'
+ - 'for_each_dss_dev'
+ - 'for_each_efi_memory_desc'
+ - 'for_each_efi_memory_desc_in_map'
+ - 'for_each_element'
+ - 'for_each_element_extid'
+ - 'for_each_element_id'
+ - 'for_each_endpoint_of_node'
+ - 'for_each_evictable_lru'
+ - 'for_each_fib6_node_rt_rcu'
+ - 'for_each_fib6_walker_rt'
+ - 'for_each_free_mem_pfn_range_in_zone'
+ - 'for_each_free_mem_pfn_range_in_zone_from'
+ - 'for_each_free_mem_range'
+ - 'for_each_free_mem_range_reverse'
+ - 'for_each_func_rsrc'
+ - 'for_each_hstate'
+ - 'for_each_if'
+ - 'for_each_iommu'
+ - 'for_each_ip_tunnel_rcu'
+ - 'for_each_irq_nr'
+ - 'for_each_link_codecs'
+ - 'for_each_link_cpus'
+ - 'for_each_link_platforms'
+ - 'for_each_lru'
+ - 'for_each_matching_node'
+ - 'for_each_matching_node_and_match'
+ - 'for_each_member'
+ - 'for_each_memcg_cache_index'
+ - 'for_each_mem_pfn_range'
+ - '__for_each_mem_range'
+ - 'for_each_mem_range'
+ - '__for_each_mem_range_rev'
+ - 'for_each_mem_range_rev'
+ - 'for_each_mem_region'
+ - 'for_each_migratetype_order'
+ - 'for_each_msi_entry'
+ - 'for_each_msi_entry_safe'
+ - 'for_each_net'
+ - 'for_each_net_continue_reverse'
+ - 'for_each_netdev'
+ - 'for_each_netdev_continue'
+ - 'for_each_netdev_continue_rcu'
+ - 'for_each_netdev_continue_reverse'
+ - 'for_each_netdev_feature'
+ - 'for_each_netdev_in_bond_rcu'
+ - 'for_each_netdev_rcu'
+ - 'for_each_netdev_reverse'
+ - 'for_each_netdev_safe'
+ - 'for_each_net_rcu'
+ - 'for_each_new_connector_in_state'
+ - 'for_each_new_crtc_in_state'
+ - 'for_each_new_mst_mgr_in_state'
+ - 'for_each_new_plane_in_state'
+ - 'for_each_new_private_obj_in_state'
+ - 'for_each_node'
+ - 'for_each_node_by_name'
+ - 'for_each_node_by_type'
+ - 'for_each_node_mask'
+ - 'for_each_node_state'
+ - 'for_each_node_with_cpus'
+ - 'for_each_node_with_property'
+ - 'for_each_nonreserved_multicast_dest_pgid'
+ - 'for_each_of_allnodes'
+ - 'for_each_of_allnodes_from'
+ - 'for_each_of_cpu_node'
+ - 'for_each_of_pci_range'
+ - 'for_each_old_connector_in_state'
+ - 'for_each_old_crtc_in_state'
+ - 'for_each_old_mst_mgr_in_state'
+ - 'for_each_oldnew_connector_in_state'
+ - 'for_each_oldnew_crtc_in_state'
+ - 'for_each_oldnew_mst_mgr_in_state'
+ - 'for_each_oldnew_plane_in_state'
+ - 'for_each_oldnew_plane_in_state_reverse'
+ - 'for_each_oldnew_private_obj_in_state'
+ - 'for_each_old_plane_in_state'
+ - 'for_each_old_private_obj_in_state'
+ - 'for_each_online_cpu'
+ - 'for_each_online_node'
+ - 'for_each_online_pgdat'
+ - 'for_each_pci_bridge'
+ - 'for_each_pci_dev'
+ - 'for_each_pci_msi_entry'
+ - 'for_each_pcm_streams'
+ - 'for_each_physmem_range'
+ - 'for_each_populated_zone'
+ - 'for_each_possible_cpu'
+ - 'for_each_present_cpu'
+ - 'for_each_prime_number'
+ - 'for_each_prime_number_from'
+ - 'for_each_process'
+ - 'for_each_process_thread'
+ - 'for_each_property_of_node'
+ - 'for_each_registered_fb'
+ - 'for_each_requested_gpio'
+ - 'for_each_requested_gpio_in_range'
+ - 'for_each_reserved_mem_range'
+ - 'for_each_reserved_mem_region'
+ - 'for_each_rtd_codec_dais'
+ - 'for_each_rtd_components'
+ - 'for_each_rtd_cpu_dais'
+ - 'for_each_rtd_dais'
+ - 'for_each_set_bit'
+ - 'for_each_set_bit_from'
+ - 'for_each_set_clump8'
+ - 'for_each_sg'
+ - 'for_each_sg_dma_page'
+ - 'for_each_sg_page'
+ - 'for_each_sgtable_dma_page'
+ - 'for_each_sgtable_dma_sg'
+ - 'for_each_sgtable_page'
+ - 'for_each_sgtable_sg'
+ - 'for_each_sibling_event'
+ - 'for_each_subelement'
+ - 'for_each_subelement_extid'
+ - 'for_each_subelement_id'
+ - '__for_each_thread'
+ - 'for_each_thread'
+ - 'for_each_unicast_dest_pgid'
+ - 'for_each_vsi'
+ - 'for_each_wakeup_source'
+ - 'for_each_zone'
+ - 'for_each_zone_zonelist'
+ - 'for_each_zone_zonelist_nodemask'
+ - 'fwnode_for_each_available_child_node'
+ - 'fwnode_for_each_child_node'
+ - 'fwnode_graph_for_each_endpoint'
+ - 'gadget_for_each_ep'
+ - 'genradix_for_each'
+ - 'genradix_for_each_from'
+ - 'hash_for_each'
+ - 'hash_for_each_possible'
+ - 'hash_for_each_possible_rcu'
+ - 'hash_for_each_possible_rcu_notrace'
+ - 'hash_for_each_possible_safe'
+ - 'hash_for_each_rcu'
+ - 'hash_for_each_safe'
+ - 'hctx_for_each_ctx'
+ - 'hlist_bl_for_each_entry'
+ - 'hlist_bl_for_each_entry_rcu'
+ - 'hlist_bl_for_each_entry_safe'
+ - 'hlist_for_each'
+ - 'hlist_for_each_entry'
+ - 'hlist_for_each_entry_continue'
+ - 'hlist_for_each_entry_continue_rcu'
+ - 'hlist_for_each_entry_continue_rcu_bh'
+ - 'hlist_for_each_entry_from'
+ - 'hlist_for_each_entry_from_rcu'
+ - 'hlist_for_each_entry_rcu'
+ - 'hlist_for_each_entry_rcu_bh'
+ - 'hlist_for_each_entry_rcu_notrace'
+ - 'hlist_for_each_entry_safe'
+ - 'hlist_for_each_entry_srcu'
+ - '__hlist_for_each_rcu'
+ - 'hlist_for_each_safe'
+ - 'hlist_nulls_for_each_entry'
+ - 'hlist_nulls_for_each_entry_from'
+ - 'hlist_nulls_for_each_entry_rcu'
+ - 'hlist_nulls_for_each_entry_safe'
+ - 'i3c_bus_for_each_i2cdev'
+ - 'i3c_bus_for_each_i3cdev'
+ - 'ide_host_for_each_port'
+ - 'ide_port_for_each_dev'
+ - 'ide_port_for_each_present_dev'
+ - 'idr_for_each_entry'
+ - 'idr_for_each_entry_continue'
+ - 'idr_for_each_entry_continue_ul'
+ - 'idr_for_each_entry_ul'
+ - 'in_dev_for_each_ifa_rcu'
+ - 'in_dev_for_each_ifa_rtnl'
+ - 'inet_bind_bucket_for_each'
+ - 'inet_lhash2_for_each_icsk_rcu'
+ - 'key_for_each'
+ - 'key_for_each_safe'
+ - 'klp_for_each_func'
+ - 'klp_for_each_func_safe'
+ - 'klp_for_each_func_static'
+ - 'klp_for_each_object'
+ - 'klp_for_each_object_safe'
+ - 'klp_for_each_object_static'
+ - 'kunit_suite_for_each_test_case'
+ - 'kvm_for_each_memslot'
+ - 'kvm_for_each_vcpu'
+ - 'list_for_each'
+ - 'list_for_each_codec'
+ - 'list_for_each_codec_safe'
+ - 'list_for_each_continue'
+ - 'list_for_each_entry'
+ - 'list_for_each_entry_continue'
+ - 'list_for_each_entry_continue_rcu'
+ - 'list_for_each_entry_continue_reverse'
+ - 'list_for_each_entry_from'
+ - 'list_for_each_entry_from_rcu'
+ - 'list_for_each_entry_from_reverse'
+ - 'list_for_each_entry_lockless'
+ - 'list_for_each_entry_rcu'
+ - 'list_for_each_entry_reverse'
+ - 'list_for_each_entry_safe'
+ - 'list_for_each_entry_safe_continue'
+ - 'list_for_each_entry_safe_from'
+ - 'list_for_each_entry_safe_reverse'
+ - 'list_for_each_entry_srcu'
+ - 'list_for_each_prev'
+ - 'list_for_each_prev_safe'
+ - 'list_for_each_safe'
+ - 'llist_for_each'
+ - 'llist_for_each_entry'
+ - 'llist_for_each_entry_safe'
+ - 'llist_for_each_safe'
+ - 'mci_for_each_dimm'
+ - 'media_device_for_each_entity'
+ - 'media_device_for_each_intf'
+ - 'media_device_for_each_link'
+ - 'media_device_for_each_pad'
+ - 'nanddev_io_for_each_page'
+ - 'netdev_for_each_lower_dev'
+ - 'netdev_for_each_lower_private'
+ - 'netdev_for_each_lower_private_rcu'
+ - 'netdev_for_each_mc_addr'
+ - 'netdev_for_each_uc_addr'
+ - 'netdev_for_each_upper_dev_rcu'
+ - 'netdev_hw_addr_list_for_each'
+ - 'nft_rule_for_each_expr'
+ - 'nla_for_each_attr'
+ - 'nla_for_each_nested'
+ - 'nlmsg_for_each_attr'
+ - 'nlmsg_for_each_msg'
+ - 'nr_neigh_for_each'
+ - 'nr_neigh_for_each_safe'
+ - 'nr_node_for_each'
+ - 'nr_node_for_each_safe'
+ - 'of_for_each_phandle'
+ - 'of_property_for_each_string'
+ - 'of_property_for_each_u32'
+ - 'pci_bus_for_each_resource'
+ - 'pcl_for_each_chunk'
+ - 'pcl_for_each_segment'
+ - 'pcm_for_each_format'
+ - 'ping_portaddr_for_each_entry'
+ - 'plist_for_each'
+ - 'plist_for_each_continue'
+ - 'plist_for_each_entry'
+ - 'plist_for_each_entry_continue'
+ - 'plist_for_each_entry_safe'
+ - 'plist_for_each_safe'
+ - 'pnp_for_each_card'
+ - 'pnp_for_each_dev'
+ - 'protocol_for_each_card'
+ - 'protocol_for_each_dev'
+ - 'queue_for_each_hw_ctx'
+ - 'radix_tree_for_each_slot'
+ - 'radix_tree_for_each_tagged'
+ - 'rbtree_postorder_for_each_entry_safe'
+ - 'rdma_for_each_block'
+ - 'rdma_for_each_port'
+ - 'rdma_umem_for_each_dma_block'
+ - 'resource_list_for_each_entry'
+ - 'resource_list_for_each_entry_safe'
+ - 'rhl_for_each_entry_rcu'
+ - 'rhl_for_each_rcu'
+ - 'rht_for_each'
+ - 'rht_for_each_entry'
+ - 'rht_for_each_entry_from'
+ - 'rht_for_each_entry_rcu'
+ - 'rht_for_each_entry_rcu_from'
+ - 'rht_for_each_entry_safe'
+ - 'rht_for_each_from'
+ - 'rht_for_each_rcu'
+ - 'rht_for_each_rcu_from'
+ - '__rq_for_each_bio'
+ - 'rq_for_each_bvec'
+ - 'rq_for_each_segment'
+ - 'scsi_for_each_prot_sg'
+ - 'scsi_for_each_sg'
+ - 'sctp_for_each_hentry'
+ - 'sctp_skb_for_each'
+ - 'shdma_for_each_chan'
+ - '__shost_for_each_device'
+ - 'shost_for_each_device'
+ - 'sk_for_each'
+ - 'sk_for_each_bound'
+ - 'sk_for_each_entry_offset_rcu'
+ - 'sk_for_each_from'
+ - 'sk_for_each_rcu'
+ - 'sk_for_each_safe'
+ - 'sk_nulls_for_each'
+ - 'sk_nulls_for_each_from'
+ - 'sk_nulls_for_each_rcu'
+ - 'snd_array_for_each'
+ - 'snd_pcm_group_for_each_entry'
+ - 'snd_soc_dapm_widget_for_each_path'
+ - 'snd_soc_dapm_widget_for_each_path_safe'
+ - 'snd_soc_dapm_widget_for_each_sink_path'
+ - 'snd_soc_dapm_widget_for_each_source_path'
+ - 'tb_property_for_each'
+ - 'tcf_exts_for_each_action'
+ - 'udp_portaddr_for_each_entry'
+ - 'udp_portaddr_for_each_entry_rcu'
+ - 'usb_hub_for_each_child'
+ - 'v4l2_device_for_each_subdev'
+ - 'v4l2_m2m_for_each_dst_buf'
+ - 'v4l2_m2m_for_each_dst_buf_safe'
+ - 'v4l2_m2m_for_each_src_buf'
+ - 'v4l2_m2m_for_each_src_buf_safe'
+ - 'virtio_device_for_each_vq'
+ - 'while_for_each_ftrace_op'
+ - 'xa_for_each'
+ - 'xa_for_each_marked'
+ - 'xa_for_each_range'
+ - 'xa_for_each_start'
+ - 'xas_for_each'
+ - 'xas_for_each_conflict'
+ - 'xas_for_each_marked'
+ - 'xbc_array_for_each_value'
+ - 'xbc_for_each_key_value'
+ - 'xbc_node_for_each_array_value'
+ - 'xbc_node_for_each_child'
+ - 'xbc_node_for_each_key_value'
+ - 'zorro_for_each_dev'
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt
new file mode 100644
index 0000000..6bc61ff
--- /dev/null
+++ b/src/kernel/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-kernel
+ LANGUAGES C)
+
+# include macros to define Linux kernel build targets
+include(Kernel)
+
+# set C flags for a Linux kernel module
+set(KERNEL_C_FLAGS "-DDNBD3_KERNEL_MODULE -I ${PROJECT_INCLUDE_GEN_DIR}"
+ CACHE STRING "C flags to be used for building the dnbd3 kernel module")
+# set C flags for the debug mode of a Linux kernel module
+set(KERNEL_C_FLAGS_DEBUG "-g -DDEBUG"
+ CACHE STRING "Additional C flags to be used for building the dnbd3 kernel module in debug mode")
+
+# append include directories to the C flags
+get_property(KERNEL_INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+foreach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} -I ${KERNEL_INCLUDE_DIR}")
+endforeach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+
+# append debug C flags if debug mode is enabled
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} ${KERNEL_C_FLAGS_DEBUG}")
+endif(CMAKE_BUILD_TYPE MATCHES Debug)
+
+# dnbd3 Linux kernel module
+set(KERNEL_MODULE_DNBD3_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.c)
+set(KERNEL_MODULE_DNBD3_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.h)
+
+add_kernel_module(dnbd3 "${KERNEL_BUILD_DIR}"
+ "${KERNEL_INSTALL_DIR}"
+ "CONFIG_BLK_DEV_DNBD3=m"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}"
+ ${CMAKE_CURRENT_SOURCE_DIR}/Kbuild)
+
+# add dependency to generate project version header before dnbd3.ko is built
+add_dependencies(dnbd3 dnbd3-generate-version)
+
+set(CHECKPATCH_IGNORE_WARNINGS "NEW_TYPEDEFS"
+ "MSLEEP"
+ "CONSTANT_COMPARISON"
+ "DEEP_INDENTATION"
+ "PREFER_PR_LEVEL"
+ "LINUX_VERSION_CODE"
+ "JIFFIES_COMPARISON"
+ "KREALLOC_ARG_REUSE")
+
+add_kernel_linter(dnbd3-lint "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+add_kernel_linter_fix(dnbd3-lint-fix "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+
+add_linter_fix(dnbd3-lint-fix-clang "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
diff --git a/src/kernel/Kbuild b/src/kernel/Kbuild
new file mode 100644
index 0000000..26afa98
--- /dev/null
+++ b/src/kernel/Kbuild
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Linux kernel module dnbd3
+obj-$(CONFIG_BLK_DEV_DNBD3) := dnbd3.o
+dnbd3-y += dnbd3_main.o blk.o net.o serialize.o sysfs.o
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index 889b988..69e4583 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,248 +19,259 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "blk.h"
#include "net.h"
#include "sysfs.h"
+#include "dnbd3_main.h"
#include <linux/pagemap.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#define dnbd3_req_read(req) \
- req_op(req) == REQ_OP_READ
-#define dnbd3_req_fs(req) \
- dnbd3_req_read(req) || req_op(req) == REQ_OP_WRITE
-#define dnbd3_req_special(req) \
- blk_rq_is_private(req)
-#else
-#define dnbd3_req_read(req) \
- rq_data_dir(req) == READ
-#define dnbd3_req_fs(req) \
- req->cmd_type == REQ_TYPE_FS
-#define dnbd3_req_special(req) \
- req->cmd_type == REQ_TYPE_SPECIAL
-#endif
-
-int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+static int dnbd3_close_device(dnbd3_device_t *dev)
{
- struct gendisk *disk;
- struct request_queue *blk_queue;
-
- init_waitqueue_head(&dev->process_queue_send);
- init_waitqueue_head(&dev->process_queue_receive);
- init_waitqueue_head(&dev->process_queue_discover);
- INIT_LIST_HEAD(&dev->request_queue_send);
- INIT_LIST_HEAD(&dev->request_queue_receive);
+ int result;
- memset(&dev->cur_server, 0, sizeof(dev->cur_server));
- memset(&dev->initial_server, 0, sizeof(dev->initial_server));
- dev->better_sock = NULL;
+ if (dev->imgname)
+ dev_info(dnbd3_device_to_dev(dev), "closing down device.\n");
+ dev->panic = false;
+ result = dnbd3_net_disconnect(dev);
+ kfree(dev->imgname);
dev->imgname = NULL;
- dev->rid = 0;
- dev->update_available = 0;
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- dev->thread_send = NULL;
- dev->thread_receive = NULL;
- dev->thread_discover = NULL;
- dev->discover = 0;
- dev->disconnecting = 0;
- dev->panic = 0;
- dev->panic_count = 0;
- dev->reported_size = 0;
-
- if (!(disk = alloc_disk(1)))
- {
- printk("ERROR: dnbd3 alloc_disk failed.\n");
- return -EIO;
- }
-
- disk->major = major;
- disk->first_minor = minor;
- sprintf(disk->disk_name, "dnbd%d", minor);
- set_capacity(disk, 0);
- set_disk_ro(disk, 1);
- disk->fops = &dnbd3_blk_ops;
-
- spin_lock_init(&dev->blk_lock);
- if ((blk_queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock)) == NULL)
- {
- printk("ERROR: dnbd3 blk_init_queue failed.\n");
- return -EIO;
- }
-
- blk_queue_logical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
- blk_queue_physical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
-
- disk->queue = blk_queue;
- disk->private_data = dev;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-#else
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
-#endif
-#define ONE_MEG (1048576)
- blk_queue_max_segment_size(disk->queue, ONE_MEG);
- blk_queue_max_segments(disk->queue, 0xffff);
- blk_queue_max_hw_sectors(disk->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
- disk->queue->limits.max_sectors = 256;
- dev->disk = disk;
-#undef ONE_MEG
- add_disk(disk);
- dnbd3_sysfs_init(dev);
- return 0;
+ /* new requests might have been queued up, */
+ /* but now that imgname is NULL no new ones can show up */
+ blk_mq_freeze_queue(dev->queue);
+ set_capacity(dev->disk, 0);
+ blk_mq_unfreeze_queue(dev->queue);
+ return result;
}
-int dnbd3_blk_del_device(dnbd3_device_t *dev)
-{
- dnbd3_sysfs_exit(dev);
- dnbd3_net_disconnect(dev);
- del_gendisk(dev->disk);
- put_disk(dev->disk);
- blk_cleanup_queue(dev->disk->queue);
- return 0;
-}
-
-struct block_device_operations dnbd3_blk_ops =
- { .owner = THIS_MODULE, .ioctl = dnbd3_blk_ioctl, };
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
int result = -100;
dnbd3_device_t *dev = bdev->bd_disk->private_data;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
struct request_queue *blk_queue = dev->disk->queue;
+#endif
char *imgname = NULL;
dnbd3_ioctl_t *msg = NULL;
- //unsigned long irqflags;
+ int i = 0, j;
+ u8 locked = 0;
- while (dev->disconnecting)
- {
- // do nothing
- }
-
- if (arg != 0)
- {
+ if (arg != 0) {
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
- if (msg == NULL) return -ENOMEM;
- if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg))
- {
+ if (msg == NULL)
+ return -ENOMEM;
+ if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) {
result = -ENOEXEC;
goto cleanup_return;
}
- if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0)
- {
+ if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) {
result = -ENOENT;
goto cleanup_return;
}
- if (msg->imgname != NULL && msg->imgnamelen > 0)
- {
+ if (msg->imgname != NULL && msg->imgnamelen > 0) {
imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
- if (imgname == NULL)
- {
+ if (imgname == NULL) {
result = -ENOMEM;
goto cleanup_return;
}
- if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0)
- {
+ if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) {
result = -ENOENT;
goto cleanup_return;
}
imgname[msg->imgnamelen] = '\0';
- //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname);
}
}
-
- switch (cmd)
- {
+ switch (cmd) {
case IOCTL_OPEN:
- if (dev->imgname != NULL)
- {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
+ break;
}
- else if (imgname == NULL)
- {
+ locked = 1;
+ if (dev->imgname != NULL) {
+ result = -EBUSY;
+ } else if (imgname == NULL) {
result = -EINVAL;
- }
- else if (msg == NULL)
- {
+ } else if (msg == NULL) {
result = -EINVAL;
- }
- else
- {
- if (sizeof(msg->host) != sizeof(dev->cur_server.host))
- printk("Odd size bug#1 triggered in IOCTL\n");
- memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host));
- dev->cur_server.failures = 0;
- memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server));
+ } else {
+ /* assert that at least one and not too many hosts are given */
+ if (msg->hosts_num < 1 || msg->hosts_num > NUMBER_SERVERS) {
+ result = -EINVAL;
+ break;
+ }
+
dev->imgname = imgname;
dev->rid = msg->rid;
dev->use_server_provided_alts = msg->use_server_provided_alts;
- // Forget all alt servers on explicit connect, set first al server to initial server
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0]));
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
- if (blk_queue->backing_dev_info != NULL) {
+
+ dev_info(dnbd3_device_to_dev(dev), "opening device.\n");
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set optimal request size for the queue to half the read-ahead
+ blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512));
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ // set readahead from optimal request size of the queue
+ // ra_pages are calculated by the following formula: queue_io_opt() * 2 / PAGE_SIZE
+ blk_queue_update_readahead(dev->queue);
+#endif
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ if (blk_queue->backing_dev_info != NULL)
blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
- }
#else
blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
#endif
- if (dnbd3_net_connect(dev) == 0)
- {
- result = 0;
- imgname = NULL; // Prevent kfree at the end
+
+ /* add specified servers to alt server list */
+ for (i = 0; i < NUMBER_SERVERS; i++)
+ dev->alt_servers[i].host.ss_family = 0;
+ for (i = 0; i < msg->hosts_num; i++) {
+ /* copy provided host into corresponding alt server slot */
+ if (dnbd3_add_server(dev, &msg->hosts[i]) == 0)
+ dev_dbg(dnbd3_device_to_dev(dev), "adding server %pISpc\n",
+ &dev->alt_servers[i].host);
+ else
+ dev_warn(dnbd3_device_to_dev(dev), "could not add server %pISpc\n",
+ &dev->alt_servers[i].host);
}
- else
- {
- result = -ENOENT;
+
+ /*
+ * probe added alt servers in specified order and
+ * choose first working server as initial server
+ */
+ result = -EPROTONOSUPPORT;
+ for (i = 0; i < NUMBER_SERVERS; i++) {
+ /* probe added alt server */
+ if (dev->alt_servers[i].host.ss_family == 0)
+ continue; // Empty slot
+
+ result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true);
+ if (result == 0) {
+ /* connection established, store index of server and exit loop */
+ result = i;
+ break;
+ }
+ }
+
+ if (result >= 0) {
+ /* connection was successful */
+ dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n",
+ &dev->cur_server.host);
+ imgname = NULL; // Prevent kfree at the end
+ } else {
+ /* probing failed */
dev->imgname = NULL;
}
}
break;
case IOCTL_CLOSE:
- dnbd3_blk_fail_all_requests(dev);
- result = dnbd3_net_disconnect(dev);
- dnbd3_blk_fail_all_requests(dev);
- set_capacity(dev->disk, 0);
- if (dev->imgname)
- {
- kfree(dev->imgname);
- dev->imgname = NULL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
}
+ locked = 1;
+ result = dnbd3_close_device(dev);
break;
case IOCTL_SWITCH:
- result = -EINVAL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
+ }
+ locked = 1;
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ } else if (msg == NULL) {
+ result = -EINVAL;
+ } else {
+ dnbd3_alt_server_t *alt_server;
+ struct sockaddr_storage new_addr;
+
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(&msg->hosts[0], dev);
+ if (alt_server == NULL) {
+ mutex_unlock(&dev->alt_servers_lock);
+ /* specified server is not known, so do not switch */
+ result = -ENOENT;
+ } else {
+ /* specified server is known, so try to switch to it */
+ new_addr = alt_server->host;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &new_addr)) {
+ /* specified server is current server, so do not switch */
+ result = 0;
+ } else {
+ dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
+ &new_addr);
+ result = dnbd3_new_connection(dev, &new_addr, false);
+ if (result != 0) {
+ /* switching didn't work */
+ result = -EAGAIN;
+ }
+ }
+ if (result == 0) {
+ /* fake RTT so we don't switch away again soon */
+ mutex_lock(&dev->alt_servers_lock);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ alt_server = &dev->alt_servers[i];
+ if (is_same_server(&alt_server->host, &new_addr)) {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ alt_server->rtts[j] = 1;
+ alt_server->best_count = 100;
+ } else {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ if (alt_server->rtts[j] < 500000)
+ alt_server->rtts[j] = 500000;
+ alt_server->best_count = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ }
+ }
+ }
break;
case IOCTL_ADD_SRV:
- case IOCTL_REM_SRV:
- if (dev->imgname == NULL)
- {
- result = -ENOENT;
+ case IOCTL_REM_SRV: {
+ struct sockaddr_storage addr;
+ dnbd3_host_t *host;
+
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ break;
}
- else if (dev->new_servers_num >= NUMBER_SERVERS)
- {
- result = -EAGAIN;
+ if (msg == NULL) {
+ result = -EINVAL;
+ break;
}
- else if (msg == NULL)
- {
+ host = &msg->hosts[0];
+ if (!dnbd3_host_to_sockaddr(host, &addr)) {
result = -EINVAL;
+ break;
}
- else
- {
- memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host));
- dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM
- ++dev->new_servers_num;
- result = 0;
+
+ if (cmd == IOCTL_ADD_SRV) {
+ result = dnbd3_add_server(dev, host);
+ if (result == -EEXIST)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr);
+ else if (result == -ENOSPC)
+ dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr);
+ } else { // IOCTL_REM_SRV
+ result = dnbd3_rem_server(dev, host);
+ if (result == -ENOENT)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr);
}
break;
-
+ }
case BLKFLSBUF:
result = 0;
break;
@@ -270,113 +282,325 @@ int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
}
cleanup_return:
- if (msg) kfree(msg);
- if (imgname) kfree(imgname);
+ kfree(msg);
+ kfree(imgname);
+ if (locked)
+ dnbd3_flag_reset(dev->connection_lock);
return result;
}
-/**
- * dev->blk_lock and q->queue_lock are being held
- * when this is called!
+static const struct block_device_operations dnbd3_blk_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = dnbd3_blk_ioctl,
+};
+
+static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add_tail(&rq->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+}
+
+/*
+ * Linux kernel blk-mq driver function (entry point) to handle block IO requests
*/
-void dnbd3_blk_request(struct request_queue *q)
+static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
- struct request *req;
- dnbd3_device_t *dev;
+ struct request *rq = bd->rq;
+ dnbd3_device_t *dev = rq->q->queuedata;
+ struct dnbd3_cmd *cmd;
- while ((req = blk_fetch_request(q)) != NULL)
- {
- dev = req->rq_disk->private_data;
+ if (dev->imgname == NULL || !device_active(dev))
+ return BLK_STS_IOERR;
- if (dev->imgname == NULL)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (req_op(rq) != REQ_OP_READ)
+ return BLK_STS_IOERR;
- if (!(dnbd3_req_fs(req)))
- {
- __blk_end_request_all(req, 0);
- continue;
- }
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
+ return BLK_STS_TIMEOUT;
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (rq_data_dir(rq) != READ)
+ return BLK_STS_NOTSUPP;
- if (!(dnbd3_req_read(req)))
- {
- __blk_end_request_all(req, -EACCES);
- continue;
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32);
+ blk_mq_start_request(rq);
+ dnbd3_add_queue(dev, rq);
+ return BLK_STS_OK;
+}
+
+static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ , bool reserved
+#endif
+ )
+{
+ unsigned long irqflags;
+ struct request *rq_iter;
+ bool found = false;
+ dnbd3_device_t *dev = req->q->queuedata;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->send_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // If still in send queue, do nothing
+ if (found)
+ return BLK_EH_RESET_TIMER;
+
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ list_del_init(&req->queuelist);
+ break;
}
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (!found) {
+ dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n");
+ // Assume it was finished concurrently
+ return BLK_EH_DONE;
+ }
+ // Add to send queue again and trigger work, reset timeout
+ dnbd3_add_queue(dev, req);
+ return BLK_EH_RESET_TIMER;
+}
+
+static
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+const
+#endif
+struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+ .timeout = dnbd3_rq_timeout,
+};
+
+int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+{
+ int ret;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->index = minor;
+ // lock for imgname, cur_server etc.
+ spin_lock_init(&dev->blk_lock);
+ spin_lock_init(&dev->send_queue_lock);
+ spin_lock_init(&dev->recv_queue_lock);
+ INIT_LIST_HEAD(&dev->send_queue);
+ INIT_LIST_HEAD(&dev->recv_queue);
+ dnbd3_flag_reset(dev->connection_lock);
+ dnbd3_flag_reset(dev->discover_running);
+ mutex_init(&dev->alt_servers_lock);
+ dnbd3_net_work_init(dev);
+
+ // memset has done this already but I like initial values to be explicit
+ dev->imgname = NULL;
+ dev->rid = 0;
+ dev->update_available = false;
+ dev->panic = false;
+ dev->panic_count = 0;
+ dev->reported_size = 0;
+
+ // set up tag_set for blk-mq
+ dev->tag_set.ops = &dnbd3_mq_ops;
+ dev->tag_set.nr_hw_queues = 1;
+ dev->tag_set.queue_depth = 128;
+ dev->tag_set.numa_node = NUMA_NO_NODE;
+ dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
+ dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tag_set.driver_data = dev;
+ dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ;
+
+ ret = blk_mq_alloc_tag_set(&dev->tag_set);
+ if (ret) {
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_tag_set failed\n");
+ goto out;
+ }
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set up blk-mq and disk
+ dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev);
+ if (IS_ERR(dev->disk)) {
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_disk failed\n");
+ ret = PTR_ERR(dev->disk);
+ goto out_cleanup_tags;
+ }
+ dev->queue = dev->disk->queue;
+#else
+ // set up blk-mq
+ dev->queue = blk_mq_init_queue(&dev->tag_set);
+ if (IS_ERR(dev->queue)) {
+ ret = PTR_ERR(dev->queue);
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_init_queue failed\n");
+ goto out_cleanup_tags;
+ }
+ dev->queue->queuedata = dev;
+#endif
+
+ blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+ blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue);
+#else
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue);
+#endif
+#define ONE_MEG (1048576)
+ blk_queue_max_segment_size(dev->queue, ONE_MEG);
+ blk_queue_max_segments(dev->queue, 0xffff);
+ blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
+ dev->queue->limits.max_sectors = 256;
+#undef ONE_MEG
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ // set up disk
+ dev->disk = alloc_disk(1);
+ if (!dev->disk) {
+ dev_err(dnbd3_device_to_dev(dev), "alloc_disk failed\n");
+ ret = -ENOMEM;
+ goto out_cleanup_queue;
+ }
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) \
+ || (LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 132)) \
+ || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ dev->disk->flags |= GENHD_FL_NO_PART;
+#else
+ dev->disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+ dev->disk->major = major;
+ dev->disk->first_minor = minor;
+ dev->disk->minors = 1;
+ dev->disk->fops = &dnbd3_blk_ops;
+ dev->disk->private_data = dev;
+ dev->disk->queue = dev->queue;
+ sprintf(dev->disk->disk_name, "dnbd%d", minor);
+ set_capacity(dev->disk, 0);
+ set_disk_ro(dev->disk, 1);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) \
+ || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ ret = add_disk(dev->disk);
+ if (ret != 0)
+ goto out_cleanup_queue;
+#else
+ add_disk(dev->disk);
+#endif
+
+ // set up sysfs
+ dnbd3_sysfs_init(dev);
+
+ return 0;
+
+out_cleanup_queue:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ blk_cleanup_queue(dev->queue);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ blk_cleanup_disk(dev->disk);
+#else
+ put_disk(dev->disk);
+#endif
+out_cleanup_tags:
+ blk_mq_free_tag_set(&dev->tag_set);
+out:
+ mutex_destroy(&dev->alt_servers_lock);
+ return ret;
+}
+
+int dnbd3_blk_del_device(dnbd3_device_t *dev)
+{
+ while (!dnbd3_flag_get(dev->connection_lock))
+ schedule();
+ dnbd3_close_device(dev);
+ dnbd3_sysfs_exit(dev);
+ del_gendisk(dev->disk);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ blk_cleanup_queue(dev->queue);
+ put_disk(dev->disk);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ blk_cleanup_disk(dev->disk);
+#else
+ put_disk(dev->disk);
+#endif
+ blk_mq_free_tag_set(&dev->tag_set);
+ mutex_destroy(&dev->alt_servers_lock);
+ return 0;
+}
+
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev)
+{
+ struct request *blk_request;
+ unsigned long flags;
+ struct list_head local_copy;
+ int count = 0;
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- spin_unlock_irq(q->queue_lock);
- wake_up(&dev->process_queue_send);
- spin_lock_irq(q->queue_lock);
+ INIT_LIST_HEAD(&local_copy);
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ list_add_tail(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
}
+ // Do this even if we didn't move anything from the recv list to the send
+ // list. It might have already contained something, which needs to be
+ // re-requested anyways if this was called because of a server switch.
+ spin_lock_irqsave(&dev->blk_lock, flags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, flags);
}
+/*
+ * Fail every request that is still queued on the device.
+ * All requests on dev->recv_queue and dev->send_queue are collected into a
+ * private list (each queue under its own lock) and are then completed with
+ * BLK_STS_IOERR after the locks have been released.
+ */
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
{
- struct request *blk_request, *tmp_request;
- struct request *blk_request2, *tmp_request2;
+ struct request *blk_request;
 unsigned long flags;
 struct list_head local_copy;
- int dup;
+ int count = 0;
+
 INIT_LIST_HEAD(&local_copy);
- spin_lock_irqsave(&dev->blk_lock, flags);
- while (!list_empty(&dev->request_queue_receive))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ // Collect everything from the recv list ...
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
 }
- while (!list_empty(&dev->request_queue_send))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_send, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ // ... and from the send list
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ while (!list_empty(&dev->send_queue)) {
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
 }
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- list_for_each_entry_safe(blk_request, tmp_request, &local_copy, queuelist)
- {
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count);
+ // Complete the collected requests with an I/O error; no queue lock is held here
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
 list_del_init(&blk_request->queuelist);
- if (dnbd3_req_fs(blk_request))
- {
- spin_lock_irqsave(&dev->blk_lock, flags);
- __blk_end_request_all(blk_request, -EIO);
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- }
- else if (dnbd3_req_special(blk_request))
- {
- kfree(blk_request);
- }
+ blk_mq_end_request(blk_request, BLK_STS_IOERR);
 }
}
diff --git a/src/kernel/blk.h b/src/kernel/blk.h
index 5091d19..c6dcb8d 100644
--- a/src/kernel/blk.h
+++ b/src/kernel/blk.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,22 +22,17 @@
#ifndef BLK_H_
#define BLK_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define REQ_TYPE_SPECIAL REQ_TYPE_DRV_PRIV
-#endif
-
-extern struct block_device_operations dnbd3_blk_ops;
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
-
-void dnbd3_blk_request(struct request_queue *q);
+// The device has been set up via IOCTL_OPEN and hasn't been closed yet
+// (evaluates to true while dev->reported_size is non-zero)
+#define device_active(dev) ((dev)->reported_size != 0)
int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor);
int dnbd3_blk_del_device(dnbd3_device_t *dev);
+// Move all requests from the recv queue back to the send queue and queue the send work
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev);
+
+// Complete all requests on the recv and send queues with BLK_STS_IOERR
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev);
#endif /* BLK_H_ */
diff --git a/src/kernel/core.c b/src/kernel/core.c
deleted file mode 100644
index 69a2540..0000000
--- a/src/kernel/core.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include "clientconfig.h"
-#include "dnbd3.h"
-#include "blk.h"
-
-int major;
-static unsigned int max_devs = NUMBER_DEVICES;
-static dnbd3_device_t *dnbd3_device;
-
-static int __init dnbd3_init(void)
-{
- int i;
-
- dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL);
- if (!dnbd3_device)
- return -ENOMEM;
-
- // initialize block device
- if ((major = register_blkdev(0, "dnbd3")) == 0)
- {
- printk("ERROR: dnbd3 register_blkdev failed.\n");
- return -EIO;
- }
-
- printk("DNBD3 kernel module loaded. Machine type: " ENDIAN_MODE "\n");
-
- // add MAX_NUMBER_DEVICES devices
- for (i = 0; i < max_devs; i++)
- {
- if (dnbd3_blk_add_device(&dnbd3_device[i], i) != 0)
- {
- printk("ERROR: adding device failed.\n");
- return -EIO; // TODO: delete all devices added so far. it could happen that it's not the first one that fails. also call unregister_blkdev and free memory
- }
- }
-
- printk("INFO: dnbd3 init successful (%i devices).\n", max_devs);
- return 0;
-}
-
-static void __exit dnbd3_exit(void)
-{
- int i;
-
- for (i = 0; i < max_devs; i++)
- {
- dnbd3_blk_del_device(&dnbd3_device[i]);
- }
-
- unregister_blkdev(major, "dnbd3");
- kfree(dnbd3_device);
- printk("INFO: dnbd3 exit.\n");
-}
-
-module_init( dnbd3_init);
-module_exit( dnbd3_exit);
-
-MODULE_DESCRIPTION("Distributed Network Block Device 3");
-MODULE_LICENSE("GPL");
-
-module_param(max_devs, int, 0444);
-MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h
deleted file mode 100644
index f8af69f..0000000
--- a/src/kernel/dnbd3.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef DNBD_H_
-#define DNBD_H_
-
-#include <linux/version.h>
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <net/sock.h>
-
-#define KERNEL_MODULE
-#include "config.h"
-#include "types.h"
-#include "serialize.h"
-
-extern int major;
-
-typedef struct
-{
- dnbd3_host_t host;
- unsigned long rtts[4]; // Last four round trip time measurements in µs
- uint16_t protocol_version; // dnbd3 protocol version of this server
- uint8_t failures; // How many times the server was unreachable
-} dnbd3_server_t;
-
-typedef struct
-{
- // block
- struct gendisk *disk;
- spinlock_t blk_lock;
-
- // sysfs
- struct kobject kobj;
-
- // network
- char *imgname;
- struct socket *sock;
- dnbd3_server_t cur_server, initial_server;
- unsigned long cur_rtt;
- serialized_buffer_t payload_buffer;
- dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers
- int new_servers_num; // number of new alt servers that are waiting to be copied to above array
- dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers
- uint8_t discover, panic, disconnecting, update_available, panic_count;
- uint8_t use_server_provided_alts;
- uint16_t rid;
- uint32_t heartbeat_count;
- uint64_t reported_size;
- // server switch
- struct socket *better_sock;
-
- // process
- struct task_struct * thread_send;
- struct task_struct * thread_receive;
- struct task_struct *thread_discover;
- struct timer_list hb_timer;
- wait_queue_head_t process_queue_send;
- wait_queue_head_t process_queue_receive;
- wait_queue_head_t process_queue_discover;
- struct list_head request_queue_send;
- struct list_head request_queue_receive;
-
-} dnbd3_device_t;
-
-#endif /* DNBD_H_ */
diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c
new file mode 100644
index 0000000..cb42567
--- /dev/null
+++ b/src/kernel/dnbd3_main.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <dnbd3/config/client.h>
+#include <dnbd3/version.h>
+#include <net/ipv6.h>
+#include "dnbd3_main.h"
+#include "blk.h"
+
+// Block device major number, assigned from register_blkdev() in dnbd3_init()
+int major;
+// Number of devices created at module load time (module parameter "max_devs")
+static unsigned int max_devs = NUMBER_DEVICES;
+// Array of max_devs devices, allocated in dnbd3_init() and freed in dnbd3_exit()
+static dnbd3_device_t *dnbd3_devices;
+
+/* Return the struct device that belongs to the gendisk of @dev. */
+struct device *dnbd3_device_to_dev(dnbd3_device_t *dev)
+{
+	struct gendisk *disk = dev->disk;
+
+	return disk_to_dev(disk);
+}
+
+/**
+ * Convert a dnbd3_host_t into a socket address.
+ * @dest is zeroed first, then family, address bytes and port are filled in.
+ * host->port is copied unchanged (presumably it already is in network byte
+ * order - confirm).
+ * Returns 1 on success, or 0 if host->type is neither HOST_IP4 nor HOST_IP6,
+ * in which case @dest stays zeroed.
+ */
+int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest)
+{
+	memset(dest, 0, sizeof(*dest));
+
+	if (host->type == HOST_IP4) {
+		struct sockaddr_in *v4 = (struct sockaddr_in *)dest;
+
+		v4->sin_family = AF_INET;
+		memcpy(&(v4->sin_addr), host->addr, 4);
+		v4->sin_port = host->port;
+		return 1;
+	}
+
+	if (host->type == HOST_IP6) {
+		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)dest;
+
+		v6->sin6_family = AF_INET6;
+		memcpy(&(v6->sin6_addr), host->addr, 16);
+		v6->sin6_port = host->port;
+		return 1;
+	}
+
+	/* Unknown address type */
+	return 0;
+}
+
+/**
+ * Compare two socket addresses.
+ * Returns 1 if both are AF_INET or both are AF_INET6 and port as well as
+ * IP address are equal, 0 otherwise. Any other address family (including
+ * 0) never compares equal.
+ */
+int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y)
+{
+	if (x->ss_family != y->ss_family)
+		return 0;
+
+	if (x->ss_family == AF_INET) {
+		const struct sockaddr_in *a = (const struct sockaddr_in *)x;
+		const struct sockaddr_in *b = (const struct sockaddr_in *)y;
+
+		return a->sin_port == b->sin_port
+			&& a->sin_addr.s_addr == b->sin_addr.s_addr;
+	}
+
+	if (x->ss_family == AF_INET6) {
+		const struct sockaddr_in6 *a = (const struct sockaddr_in6 *)x;
+		const struct sockaddr_in6 *b = (const struct sockaddr_in6 *)y;
+
+		return a->sin6_port == b->sin6_port
+			&& ipv6_addr_equal(&a->sin6_addr, &b->sin6_addr);
+	}
+
+	return 0;
+}
+
+/**
+ * Pick a slot of dev->alt_servers that may be (re)used for a new server.
+ * An unused slot (address family 0) always wins; if there is none, the
+ * first slot with more than 10 recorded failures is handed out instead.
+ * Returns NULL if no slot qualifies.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+static dnbd3_alt_server_t *get_free_alt_server(dnbd3_device_t *const dev)
+{
+	dnbd3_alt_server_t *const end = dev->alt_servers + NUMBER_SERVERS;
+	dnbd3_alt_server_t *fallback = NULL;
+	dnbd3_alt_server_t *slot;
+
+	for (slot = dev->alt_servers; slot != end; ++slot) {
+		if (slot->host.ss_family == 0)
+			return slot;
+		/* Remember only the first failing slot; empty slots still take precedence */
+		if (fallback == NULL && slot->failures > 10)
+			fallback = slot;
+	}
+	return fallback;
+}
+
+/**
+ * Returns pointer to the first entry in dev->alt_servers whose address is
+ * the same server as @addr according to is_same_server(), or NULL if there
+ * is no such entry.
+ * NOTE(review): presumably the caller has to hold dev->alt_servers_lock - confirm.
+ */
+dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+		dnbd3_device_t *const dev)
+{
+	dnbd3_alt_server_t *entry = dev->alt_servers;
+	dnbd3_alt_server_t *const end = entry + NUMBER_SERVERS;
+
+	for (; entry != end; ++entry) {
+		if (is_same_server(addr, &entry->host))
+			return entry;
+	}
+	return NULL;
+}
+
+/**
+ * Find the entry in dev->alt_servers that refers to the same server as @host.
+ * @host is converted into a socket address first; NULL is returned if that
+ * conversion fails or if no entry matches.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev)
+{
+	struct sockaddr_storage converted;
+
+	if (dnbd3_host_to_sockaddr(host, &converted))
+		return get_existing_alt_from_addr(&converted, dev);
+	return NULL;
+}
+
+/**
+ * Add @host to the list of alternative servers of @dev.
+ *
+ * Returns 0 on success, -EINVAL if host->type is neither HOST_IP4 nor
+ * HOST_IP6, -EEXIST if the server is already part of the list and -ENOSPC
+ * if no slot is free or reusable.
+ */
+int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+	int result;
+	int i;
+	dnbd3_alt_server_t *alt_server;
+
+	if (host->type != HOST_IP4 && host->type != HOST_IP6)
+		return -EINVAL;
+
+	/* protect access to 'alt_servers' */
+	mutex_lock(&dev->alt_servers_lock);
+	if (get_existing_alt_from_host(host, dev) != NULL) {
+		result = -EEXIST;
+		goto out_unlock;
+	}
+	alt_server = get_free_alt_server(dev);
+	if (alt_server == NULL) {
+		result = -ENOSPC;
+		goto out_unlock;
+	}
+	/* Cannot fail, host->type has been validated above */
+	dnbd3_host_to_sockaddr(host, &alt_server->host);
+	alt_server->protocol_version = 0;
+	/*
+	 * Reset the whole RTT history. The array is declared with
+	 * DISCOVER_HISTORY_SIZE entries, so don't hard-code four of them.
+	 */
+	for (i = 0; i < DISCOVER_HISTORY_SIZE; ++i)
+		alt_server->rtts[i] = RTT_UNREACHABLE;
+	alt_server->failures = 0;
+	alt_server->best_count = 0;
+	result = 0;
+out_unlock:
+	mutex_unlock(&dev->alt_servers_lock);
+	return result;
+}
+
+/**
+ * Remove @host from the list of alternative servers of @dev.
+ * Returns 0 if the server was found and removed, -ENOENT otherwise.
+ */
+int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+	dnbd3_alt_server_t *entry;
+	int result = -ENOENT;
+
+	/* protect access to 'alt_servers' */
+	mutex_lock(&dev->alt_servers_lock);
+	entry = get_existing_alt_from_host(host, dev);
+	if (entry != NULL) {
+		/* Clearing the address family makes the slot count as empty again */
+		entry->host.ss_family = 0;
+		result = 0;
+	}
+	mutex_unlock(&dev->alt_servers_lock);
+	return result;
+}
+
+/**
+ * Module entry point: allocates the device array, registers the "dnbd3"
+ * block major and adds max_devs block devices.
+ *
+ * Returns 0 on success or a negative errno. On failure, everything that
+ * was set up before the failing step is released again.
+ */
+static int __init dnbd3_init(void)
+{
+	int i;
+	int ret;
+
+	dnbd3_devices = kcalloc(max_devs, sizeof(*dnbd3_devices), GFP_KERNEL);
+	if (!dnbd3_devices)
+		return -ENOMEM;
+
+	// initialize block device
+	// With major 0 requested, register_blkdev() returns the allocated major
+	// number (> 0) on success and a negative errno on failure.
+	major = register_blkdev(0, "dnbd3");
+	if (major <= 0) {
+		pr_err("register_blkdev failed\n");
+		ret = major < 0 ? major : -EIO;
+		goto err_free_devices;
+	}
+
+	pr_info("kernel module in version %s loaded\n", DNBD3_VERSION);
+	pr_debug("machine type %s\n", DNBD3_ENDIAN_MODE);
+
+	// add max_devs devices
+	for (i = 0; i < max_devs; i++) {
+		if (dnbd3_blk_add_device(&dnbd3_devices[i], i) != 0) {
+			pr_err("dnbd3_blk_add_device failed\n");
+			ret = -EIO;
+			goto err_del_devices;
+		}
+	}
+
+	pr_info("init successful (%u devices)\n", max_devs);
+
+	return 0;
+
+err_del_devices:
+	// Remove the devices [0, i) that were added before the failing one.
+	// NOTE(review): assumes dnbd3_blk_add_device() undoes its own partial
+	// setup when it fails - confirm.
+	while (i-- > 0)
+		dnbd3_blk_del_device(&dnbd3_devices[i]);
+	unregister_blkdev(major, "dnbd3");
+err_free_devices:
+	kfree(dnbd3_devices);
+	dnbd3_devices = NULL;
+	return ret;
+}
+
+/* Module exit: delete all devices, release the block major, free the device array. */
+static void __exit dnbd3_exit(void)
+{
+	dnbd3_device_t *dev;
+
+	pr_debug("exiting kernel module...\n");
+
+	for (dev = dnbd3_devices; dev != dnbd3_devices + max_devs; dev++)
+		dnbd3_blk_del_device(dev);
+	unregister_blkdev(major, "dnbd3");
+	kfree(dnbd3_devices);
+
+	pr_info("exit kernel module done\n");
+}
+
+module_init(dnbd3_init);
+module_exit(dnbd3_exit);
+
+MODULE_DESCRIPTION("Distributed Network Block Device 3");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DNBD3_VERSION);
+
+// max_devs is an unsigned int, so it has to be parsed as "uint". Parsed as
+// "int", a negative value would be accepted and end up as a huge device count.
+module_param(max_devs, uint, 0444);
+MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h
new file mode 100644
index 0000000..a932ba2
--- /dev/null
+++ b/src/kernel/dnbd3_main.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef DNBD_H_
+#define DNBD_H_
+
+#include <dnbd3/config/client.h>
+
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/blk-mq.h>
+
+// RHEL_CHECK_VERSION(CONDITION) expands to CONDITION only if both
+// RHEL_RELEASE_CODE and RHEL_RELEASE_VERSION are defined (presumably only with
+// RHEL kernel headers). Otherwise it expands to 0 and CONDITION is dropped, so
+// the RHEL macros can be used inside #if expressions on every kernel.
+#if defined(RHEL_RELEASE_CODE) && defined(RHEL_RELEASE_VERSION)
+#define RHEL_CHECK_VERSION(CONDITION) (CONDITION)
+#else
+#define RHEL_CHECK_VERSION(CONDITION) (0)
+#endif
+
+extern int major;
+
+// One entry of a device's list of alternative servers (dev->alt_servers).
+// An entry whose host.ss_family is 0 is treated as an empty slot.
+typedef struct {
+ unsigned long rtts[DISCOVER_HISTORY_SIZE]; // Last X round trip time measurements in µs
+ uint16_t protocol_version; // dnbd3 protocol version of this server
+ uint8_t failures; // How many times the server was unreachable
+ uint8_t best_count; // Number of times server measured best
+ struct sockaddr_storage host; // Address of server
+} dnbd3_alt_server_t;
+
+// Complete per-device state of one dnbd3 block device
+typedef struct {
+ // block
+ int index;
+ struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
+ struct request_queue *queue;
+ spinlock_t blk_lock;
+
+ // sysfs
+ struct kobject kobj;
+
+ char *imgname;
+ uint16_t rid;
+ struct socket *sock;
+ struct { // use blk_lock
+ unsigned long rtt;
+ struct sockaddr_storage host;
+ uint16_t protocol_version;
+ } cur_server;
+ serialized_buffer_t payload_buffer;
+ struct mutex alt_servers_lock; // protects alt_servers
+ dnbd3_alt_server_t alt_servers[NUMBER_SERVERS];
+ bool use_server_provided_alts;
+ bool panic;
+ u8 panic_count;
+ bool update_available;
+ atomic_t connection_lock;
+ // Size of image/device - this is 0 if the device is not in use,
+ // otherwise this is also the value we expect from alt servers.
+ uint64_t reported_size;
+ struct delayed_work keepalive_work;
+
+ // sending
+ struct workqueue_struct *send_wq;
+ spinlock_t send_queue_lock; // protects send_queue
+ struct list_head send_queue;
+ struct mutex send_mutex;
+ struct work_struct send_work;
+ // receiving
+ struct workqueue_struct *recv_wq;
+ spinlock_t recv_queue_lock; // protects recv_queue
+ struct list_head recv_queue;
+ struct mutex recv_mutex;
+ struct work_struct recv_work;
+ // discover
+ atomic_t discover_running;
+ struct delayed_work discover_work;
+ u32 discover_interval;
+ u32 discover_count;
+
+} dnbd3_device_t;
+
+// NOTE(review): presumably the per-request driver data of the blk-mq tag set,
+// holding the handle a request is identified by - confirm against blk.c/net.c
+struct dnbd3_cmd {
+ u64 handle;
+};
+
+/*
+ * Both functions are defined as ordinary (non-inline) functions in
+ * dnbd3_main.c, so they must not be declared 'inline' here: an inline
+ * function with external linkage has to be defined in every translation
+ * unit that declares it, which is not the case for the other users of
+ * this header.
+ */
+extern struct device *dnbd3_device_to_dev(dnbd3_device_t *dev);
+
+extern int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y);
+
+extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev);
+
+extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
+extern int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
+/*
+ * Try-lock style flags on top of an atomic_t:
+ * dnbd3_flag_get() atomically switches the flag from 0 to 1 and is true only
+ * for the caller that performed the switch, dnbd3_flag_reset() sets the flag
+ * back to 0 and dnbd3_flag_taken() tells whether it is currently non-zero.
+ */
+#define dnbd3_flag_get(x) (atomic_cmpxchg(&(x), 0, 1) == 0)
+#define dnbd3_flag_reset(x) atomic_set(&(x), 0)
+#define dnbd3_flag_taken(x) (atomic_read(&(x)) != 0)
+
+/*
+ * shims for making older kernels look like the current one, if possible, to avoid too
+ * much inline #ifdef which makes code harder to read.
+ */
+
+// Kernels older than 4.18: map BLK_EH_DONE to BLK_EH_NOT_HANDLED
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define BLK_EH_DONE BLK_EH_NOT_HANDLED
+#endif
+
+// Kernels older than 4.13 have no blk_status_t; requests are completed with a
+// plain negative errno instead. Use the same errno values the block layer of
+// newer kernels maps the BLK_STS_* codes to (timeout -> -ETIMEDOUT,
+// not supported -> -EOPNOTSUPP); ENOTSUPP is a kernel-internal code that must
+// not leak to user space.
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)
+#define blk_status_t int
+#define BLK_STS_OK 0
+#define BLK_STS_IOERR (-EIO)
+#define BLK_STS_TIMEOUT (-ETIMEDOUT)
+#define BLK_STS_NOTSUPP (-EOPNOTSUPP)
+#endif
+
+#endif /* DNBD_H_ */
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 9e48b86..5ef4016 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,1106 +19,1112 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "net.h"
#include "blk.h"
-#include "utils.h"
+#include "dnbd3_main.h"
-#include "serialize.h"
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/random.h>
+// Kernels older than 5.15: route calls to get_random_u32() to prandom_u32()
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
+#define get_random_u32 prandom_u32
+#endif
#include <linux/time.h>
-#include <linux/signal.h>
+#include <linux/ktime.h>
+#include <linux/tcp.h>
+// Fallback definition of MIN(); note that the selected argument is evaluated
+// twice, so do not pass expressions with side effects.
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
-#else
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern((af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
+// Convert a ktime_t to whole seconds (only defined if not already provided as a macro)
+#ifndef ktime_to_s
+#define ktime_to_s(kt) ktime_divns(kt, NSEC_PER_SEC)
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-// cmd_flags and cmd_type are merged into cmd_flags now
-#if REQ_FLAG_BITS > 24
-#error "Fix CMD bitshift"
-#endif
-// Pack into cmd_flags field by shifting CMD_* into unused bits of cmd_flags
-#define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = REQ_OP_DRV_IN | ((cmd) << REQ_FLAG_BITS)
-#define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS)
-#define dnbd3_req_op(req) req_op(req)
-#define DNBD3_DEV_READ REQ_OP_READ
-#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN
+// ASSERT(x): with DEBUG defined, a failing condition is logged with file, line
+// and expression text at KERN_EMERG level, followed by BUG(). Without DEBUG
+// the macro expands to an empty statement and x is not evaluated at all.
+#ifdef DEBUG
+#define ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+ } while (0)
#else
-// Old way with type and flags separated
-#define dnbd3_cmd_to_priv(req, cmd) do { \
- (req)->cmd_type = REQ_TYPE_SPECIAL; \
- (req)->cmd_flags = (cmd); \
-} while (0)
-#define dnbd3_priv_to_cmd(req) (req)->cmd_flags
-#define dnbd3_req_op(req) (req)->cmd_type
-#define DNBD3_DEV_READ REQ_TYPE_FS
-#define DNBD3_REQ_OP_SPECIAL REQ_TYPE_SPECIAL
+#define ASSERT(x) \
+ do { \
+ } while (0)
#endif
-/**
- * Some macros for easier debug output. Location in source-code
- * as well as server IP:port info will be printed.
- * The error_* macros include a "goto error;" at the end
- */
-#if 1 // Change to 0 to disable debug messages
-#define debug_print_va_host(_host, _fmt, ...) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " _fmt " (%s, %pI4:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " _fmt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_va_host(_host, _fmt, ...) do { \
- debug_print_va_host(_host, _fmt, __VA_ARGS__); \
- goto error; \
-} while(0)
-#define debug_dev_va(_fmt, ...) debug_print_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define error_dev_va(_fmt, ...) debug_error_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define debug_alt_va(_fmt, ...) debug_print_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-#define error_alt_va(_fmt, ...) debug_error_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-
-#define debug_print_host(_host, txt) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " txt " (%s, %pI4:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " txt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_host(_host, txt) do { \
- debug_print_host(_host, txt); \
- goto error; \
-} while(0)
-#define debug_dev(txt) debug_print_host(dev->cur_server.host, txt)
-#define error_dev(txt) debug_error_host(dev->cur_server.host, txt)
-#define debug_alt(txt) debug_print_host(dev->alt_servers[i].host, txt)
-#define error_alt(txt) debug_error_host(dev->alt_servers[i].host, txt)
-
-#else // Silent
-#define debug_dev(x) do { } while(0)
-#define error_dev(x) goto error
-#define debug_dev_va(x, ...) do { } while(0)
-#define error_dev_va(x, ...) goto error
-#define debug_alt(x) do { } while(0)
-#define error_alt(x) goto error
-#define debug_alt_va(x, ...) do { } while(0)
-#define error_alt_va(x, ...) goto error
-#endif
+// Device log helpers (debug/info/error) that prefix the message with a server
+// address. 'host' has to be a pointer to a socket address, it is printed via
+// the %pISpc format.
+#define dnbd3_dev_dbg_host(dev, host, fmt, ...) \
+ dev_dbg(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_info_host(dev, host, fmt, ...) \
+ dev_info(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_err_host(dev, host, fmt, ...) \
+ dev_err(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
-static inline int is_same_server(const dnbd3_server_t * const a, const dnbd3_server_t * const b)
-{
- return (a->host.type == b->host.type) && (a->host.port == b->host.port)
- && (0 == memcmp(a->host.addr, b->host.addr, (a->host.type == HOST_IP4 ? 4 : 16)));
-}
+// Same helpers, always using the address stored in dev->cur_server.host
+#define dnbd3_dev_dbg_cur(dev, fmt, ...) \
+ dnbd3_dev_dbg_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_info_cur(dev, fmt, ...) \
+ dnbd3_dev_info_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_err_cur(dev, fmt, ...) \
+ dnbd3_dev_err_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
-static inline dnbd3_server_t *get_existing_server(const dnbd3_server_entry_t * const newserver,
- dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if ((newserver->host.type == dev->alt_servers[i].host.type)
- && (newserver->host.port == dev->alt_servers[i].host.port)
- && (0
- == memcmp(newserver->host.addr, dev->alt_servers[i].host.addr, (newserver->host.type == HOST_IP4 ? 4 : 16))))
- {
- return &dev->alt_servers[i];
- break;
- }
- }
- return NULL ;
-}
-
-static inline dnbd3_server_t *get_free_alt_server(dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == 0)
- return &dev->alt_servers[i];
- }
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].failures > 10)
- return &dev->alt_servers[i];
- }
- return NULL ;
-}
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes);
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count);
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr);
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size);
-int dnbd3_net_connect(dnbd3_device_t *dev)
-{
- struct request *req1 = NULL;
- struct timeval timeout;
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, u16 protocol_version);
- if (dev->disconnecting) {
- debug_dev("CONNECT: Still disconnecting!!!\n");
- while (dev->disconnecting)
- schedule();
- }
- if (dev->thread_receive != NULL) {
- debug_dev("CONNECT: Still receiving!!!\n");
- while (dev->thread_receive != NULL)
- schedule();
- }
- if (dev->thread_send != NULL) {
- debug_dev("CONNECT: Still sending!!!\n");
- while (dev->thread_send != NULL)
- schedule();
- }
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket **sock_out);
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA;
- timeout.tv_usec = 0;
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_image_info);
- // do some checks before connecting
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket *sock);
- req1 = kmalloc(sizeof(*req1), GFP_ATOMIC );
- if (!req1)
- error_dev("FATAL: Kmalloc(1) failed.");
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd);
- if (dev->cur_server.host.port == 0 || dev->cur_server.host.type == 0 || dev->imgname == NULL )
- error_dev("FATAL: Host, port or image name not set.");
- if (dev->sock)
- error_dev("ERROR: Already connected.");
-
- if (dev->cur_server.host.type != HOST_IP4 && dev->cur_server.host.type != HOST_IP6)
- error_dev_va("ERROR: Unknown address type %d", (int)dev->cur_server.host.type);
-
- debug_dev("INFO: Connecting...");
-
- if (dev->better_sock == NULL )
- {
- // no established connection yet from discovery thread, start new one
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov[2];
- uint16_t rid;
- char *name;
- int mlen;
- init_msghdr(msg);
-
- if (dnbd3_sock_create(dev->cur_server.host.type, SOCK_STREAM, IPPROTO_TCP, &dev->sock) < 0)
- error_dev("ERROR: Couldn't create socket (v6).");
-
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- dev->sock->sk->sk_allocation = GFP_NOIO;
- if (dev->cur_server.host.type == HOST_IP4)
- {
- struct sockaddr_in sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- memcpy(&(sin.sin_addr), dev->cur_server.host.addr, 4);
- sin.sin_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v4)");
- }
- else
- {
- struct sockaddr_in6 sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin6_family = AF_INET6;
- memcpy(&(sin.sin6_addr), dev->cur_server.host.addr, 16);
- sin.sin6_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v6)");
- }
- // Request filesize
- dnbd3_request.magic = dnbd3_packet_magic;
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(&dev->payload_buffer);
- serializer_put_uint16(&dev->payload_buffer, PROTOCOL_VERSION);
- serializer_put_string(&dev->payload_buffer, dev->imgname);
- serializer_put_uint16(&dev->payload_buffer, dev->rid);
- serializer_put_uint8(&dev->payload_buffer, 0); // is_server = false
- iov[1].iov_base = &dev->payload_buffer;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&dev->payload_buffer);
- fixup_request(dnbd3_request);
- mlen = sizeof(dnbd3_request) + iov[1].iov_len;
- if (kernel_sendmsg(dev->sock, &msg, iov, 2, mlen) != mlen)
- error_dev("ERROR: Couldn't send CMD_SIZE_REQUEST.");
- // receive reply header
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_dev("FATAL: Received corrupted reply header after CMD_SIZE_REQUEST.");
- // check reply header
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 3 || dnbd3_reply.size > MAX_PAYLOAD
- || dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("FATAL: Received invalid reply to CMD_SIZE_REQUEST, image doesn't exist on server.");
- // receive reply payload
- iov[0].iov_base = &dev->payload_buffer;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_dev("FATAL: Cold not read CMD_SELECT_IMAGE payload on handshake.");
- // handle/check reply payload
- serializer_reset_read(&dev->payload_buffer, dnbd3_reply.size);
- dev->cur_server.protocol_version = serializer_get_uint16(&dev->payload_buffer);
- if (dev->cur_server.protocol_version < MIN_SUPPORTED_SERVER)
- error_dev("FATAL: Server version is lower than min supported version.");
- name = serializer_get_string(&dev->payload_buffer);
- if (dev->rid != 0 && strcmp(name, dev->imgname) != 0)
- error_dev_va("FATAL: Server offers image '%s', requested '%s'", name, dev->imgname);
- if (strlen(dev->imgname) < strlen(name))
- {
- dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_ATOMIC );
- if (dev->imgname == NULL )
- error_dev("FATAL: Reallocating buffer for new image name failed");
- }
- strcpy(dev->imgname, name);
- rid = serializer_get_uint16(&dev->payload_buffer);
- if (dev->rid != 0 && dev->rid != rid)
- error_dev_va("FATAL: Server provides rid %d, requested was %d.", (int)rid, (int)dev->rid);
- dev->rid = rid;
- dev->reported_size = serializer_get_uint64(&dev->payload_buffer);
- if (dev->reported_size < 4096)
- error_dev("ERROR: Reported size by server is < 4096");
- // store image information
- set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
- debug_dev_va("INFO: Filesize: %llu.", dev->reported_size);
- dev->update_available = 0;
- }
- else // Switching server, connection is already established and size request was executed
- {
- debug_dev("INFO: On-the-fly server change.");
- dev->sock = dev->better_sock;
- dev->better_sock = NULL;
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- }
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic); // forward declaration; defined further down
- dev->panic = 0;
- dev->panic_count = 0;
+static void dnbd3_discover(dnbd3_device_t *dev); // forward declaration; defined further down
- // Enqueue request to request_queue_send for a fresh list of alt servers
- dnbd3_cmd_to_priv(req1, CMD_GET_SERVERS);
- list_add(&req1->queuelist, &dev->request_queue_send);
+static void dnbd3_internal_discover(dnbd3_device_t *dev); // forward declaration; defined further down
- // create required threads
- dev->thread_send = kthread_create(dnbd3_net_send, dev, dev->disk->disk_name);
- dev->thread_receive = kthread_create(dnbd3_net_receive, dev, dev->disk->disk_name);
- dev->thread_discover = kthread_create(dnbd3_net_discover, dev, dev->disk->disk_name);
- // start them up
- wake_up_process(dev->thread_send);
- wake_up_process(dev->thread_receive);
- wake_up_process(dev->thread_discover);
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms); // forward declaration; callers below pass set_send=false for the receive and true for the send timeout
- wake_up(&dev->process_queue_send);
+// One page of scratch memory used purely as a write-only dump: its contents are never meant to be read back, so concurrent writers racing on it are harmless.
+static u8 __garbage_mem[PAGE_SIZE];
- // add heartbeat timer
- dev->heartbeat_count = 0;
+/**
+ * Delayed work triggering sending of keepalive packet.
+ *
+ * @work: the work_struct embedded in the device's keepalive_work; the
+ *        owning dnbd3_device_t is recovered from it via container_of().
+ *
+ * Sends an empty CMD_KEEPALIVE request and then, while holding blk_lock,
+ * re-arms itself to run again after KEEPALIVE_INTERVAL * HZ jiffies,
+ * but only if the device is still active.
+ */
+static void dnbd3_keepalive_workfn(struct work_struct *work)
+{
+ unsigned long irqflags;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, keepalive_work.work); // delayed_work -> owning device
-// init_timer_key changed from kernel version 4.14 to 4.15, see and compare to 4.15:
-// https://elixir.bootlin.com/linux/v4.14.32/source/include/linux/timer.h#L98
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
- timer_setup(&dev->hb_timer, dnbd3_net_heartbeat, 0);
-#else
- // Old timer setup
- init_timer(&dev->hb_timer);
- dev->hb_timer.data = (unsigned long)dev;
- dev->hb_timer.function = dnbd3_net_heartbeat;
-#endif
- dev->hb_timer.expires = jiffies + HZ;
- add_timer(&dev->hb_timer);
- return 0;
- error: ;
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ dnbd3_send_empty_request(dev, CMD_KEEPALIVE); // return value (if any) is not checked here
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) { // do not re-arm once the device has been deactivated
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->keepalive_work, KEEPALIVE_INTERVAL * HZ);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
- if (req1)
- kfree(req1);
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-int dnbd3_net_disconnect(dnbd3_device_t *dev)
+/**
+ * Delayed work triggering discovery (alt server check)
+ *
+ * @work: the work_struct embedded in the device's discover_work.
+ *
+ * Thin workqueue entry point: recovers the owning device from @work and
+ * hands over to dnbd3_discover(), which checks the run conditions and
+ * takes care of re-scheduling.
+ */
+static void dnbd3_discover_workfn(struct work_struct *work)
{
- if (dev->disconnecting)
- return 0;
-
- if (dev->cur_server.host.port)
- debug_dev("INFO: Disconnecting device.");
-
- dev->disconnecting = 1;
-
- // clear heartbeat timer
- del_timer(&dev->hb_timer);
-
- dev->discover = 0;
-
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
-
- // kill sending and receiving threads
- if (dev->thread_send)
- {
- kthread_stop(dev->thread_send);
- }
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, discover_work.work); // delayed_work -> owning device
- if (dev->thread_receive)
- {
- kthread_stop(dev->thread_receive);
- }
+ dnbd3_discover(dev);
+}
- if (dev->thread_discover)
- {
- kthread_stop(dev->thread_discover);
- dev->thread_discover = NULL;
- }
+/**
+ * For manually triggering an immediate discovery
+ *
+ * @dev:   device whose discover_work should be (re)scheduled.
+ * @panic: if true, additionally try to put the device into panic mode.
+ *
+ * Does nothing when the device is not active. When @panic is set and
+ * dnbd3_flag_get() on connection_lock succeeds (it is used here like a
+ * try-acquire), dev->panic is switched on under blk_lock - if it was not
+ * already set - and discover_interval is set to
+ * TIMER_INTERVAL_PROBE_PANIC; the flag is released again right after.
+ * In every remaining case discover_work is re-armed with a delay of one
+ * jiffy, under blk_lock.
+ */
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic)
+{
+ unsigned long irqflags;
- // clear socket
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ if (!device_active(dev))
+ return;
+ if (panic && dnbd3_flag_get(dev->connection_lock)) { // skipped if the flag is currently held elsewhere
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (!dev->panic) {
+ // Panic freshly turned on
+ dev->panic = true;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_PANIC;
+ }
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->connection_lock); // flag was only needed while flipping the panic state
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
-
- dev->disconnecting = 0;
-
- return 0;
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, 1);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg)
-{
- dnbd3_device_t *dev = (dnbd3_device_t *)container_of(arg, dnbd3_device_t, hb_timer);
-#else
-void dnbd3_net_heartbeat(unsigned long arg)
+/**
+ * Wrapper for the actual discover function below. Check run conditions
+ * here and re-schedule delayed task here.
+ *
+ * Returns early, without re-scheduling, when the device is not active,
+ * when connection_lock is currently taken, or when the discover_running
+ * flag cannot be obtained (another discovery is in progress).
+ * Otherwise any pending discover_work is cancelled, one discovery pass
+ * is run via dnbd3_internal_discover() and discover_count is
+ * incremented. If the device is still active afterwards, discover_work
+ * is re-armed after discover_interval seconds; once discover_count
+ * exceeds DISCOVER_STARTUP_PHASE_COUNT the interval then grows by 2 per
+ * pass for as long as it is below TIMER_INTERVAL_PROBE_MAX.
+ * discover_running is released on the way out.
+ */
+static void dnbd3_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = (dnbd3_device_t *)arg;
-#endif
- // Because different events need different intervals, the timer is called once a second.
- // Other intervals can be derived using dev->heartbeat_count.
-#define timeout_seconds(x) (dev->heartbeat_count % (x) == 0)
-
- if (!dev->panic)
- {
- if (timeout_seconds(TIMER_INTERVAL_KEEPALIVE_PACKET))
- {
- struct request *req = kmalloc(sizeof(struct request), GFP_ATOMIC );
- // send keepalive
- if (req)
- {
- dnbd3_cmd_to_priv(req, CMD_KEEPALIVE);
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- wake_up(&dev->process_queue_send);
- }
- else
- {
- debug_dev("ERROR: Couldn't create keepalive request.");
- }
- }
- if ((dev->heartbeat_count > STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_NORMAL))
- || (dev->heartbeat_count <= STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_STARTUP)))
- {
- // Normal discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ unsigned long irqflags;
+
+ if (!device_active(dev) || dnbd3_flag_taken(dev->connection_lock))
+ return; // device not active anymore, or just about to switch
+ if (!dnbd3_flag_get(dev->discover_running))
+ return; // Already busy
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ cancel_delayed_work(&dev->discover_work); // drop a pending run; we re-arm ourselves below
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_internal_discover(dev);
+ dev->discover_count++;
+ // Re-queueing logic
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, dev->discover_interval * HZ);
+ if (dev->discover_interval < TIMER_INTERVAL_PROBE_MAX
+ && dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT) {
+ dev->discover_interval += 2; // back off gradually; takes effect from the next re-arm
}
}
- else if (timeout_seconds(TIMER_INTERVAL_PROBE_PANIC))
- {
- // Panic discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
-
- dev->hb_timer.expires = jiffies + HZ;
-
- ++dev->heartbeat_count;
- add_timer(&dev->hb_timer);
-#undef timeout_seconds
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->discover_running);
}
-int dnbd3_net_discover(void *data)
+/**
+ * Discovery. Probe all (or some) known alt servers,
+ * and initiate connection switch if appropriate
+ *
+ * One discovery pass over dev->alt_servers in randomized order:
+ *  - During the startup phase (discover_count not yet above
+ *    DISCOVER_STARTUP_PHASE_COUNT) or in panic mode up to NUMBER_SERVERS
+ *    servers are probed, otherwise 3 plus the currently used server.
+ *    Outside panic mode, servers with more than 50 failures are only
+ *    probed with reduced probability.
+ *  - Each probe connects, performs the handshake and, unless panic mode
+ *    takes over the connection right away, times one test block request.
+ *    The result goes into the server's rtts[] history slot for this turn
+ *    and the history average decides which server is "best".
+ *  - Failed probes bump the server's failure counter and record
+ *    RTT_UNREACHABLE for this turn.
+ *  - After the loop, if no server was reachable, panic_count is advanced
+ *    (panic mode only) and all requests are failed once it reaches
+ *    PROBE_COUNT_TIMEOUT + 1. Otherwise the connection is switched to the
+ *    best server when we are past the startup phase, it differs from the
+ *    current server, and it beats the current RTT by the computed
+ *    threshold.
+ *
+ * The socket of the best server is kept open during the pass so a switch
+ * can reuse it; every other socket is released before returning.
+ */
+static void dnbd3_internal_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = data;
- struct sockaddr_in sin4;
- struct sockaddr_in6 sin6;
struct socket *sock, *best_sock = NULL;
+ dnbd3_alt_server_t *alt;
+ struct sockaddr_storage host_compare, best_server;
+ uint16_t remote_version = 0;
+ ktime_t start, end;
+ unsigned long rtt = 0, best_rtt = 0;
+ int i, j, k, isize, fails, rtt_threshold;
+ int do_change = 0;
+ u8 check_order[NUMBER_SERVERS];
+ const bool ready = dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT;
+ const u32 turn = dev->discover_count % DISCOVER_HISTORY_SIZE; // history slot written in this pass
+
+ // Shuffle alt_servers
+ for (i = 0; i < NUMBER_SERVERS; ++i)
+ check_order[i] = i;
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- dnbd3_server_t *alt_server;
- struct msghdr msg;
- struct kvec iov[2];
-
- char *buf, *name;
- serialized_buffer_t *payload;
- uint64_t filesize;
- uint16_t rid;
-
- struct timeval start, end;
- unsigned long rtt, best_rtt = 0;
- unsigned long irqflags;
- int i, j, isize, best_server, current_server;
- int turn = 0;
- int ready = 0, do_change = 0;
- char check_order[NUMBER_SERVERS];
- int mlen;
-
- struct request *last_request = (struct request *)123, *cur_request = (struct request *)456;
-
- struct timeval timeout;
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DISCOVERY;
- timeout.tv_usec = 0;
-
- memset(&sin4, 0, sizeof(sin4));
- memset(&sin6, 0, sizeof(sin6));
-
- init_msghdr(msg);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ j = get_random_u32() % NUMBER_SERVERS;
+ if (j != i) {
+ int tmp = check_order[i];
- buf = kmalloc(4096, GFP_KERNEL);
- if (!buf)
- {
- debug_dev("FATAL: Kmalloc failed (discover)");
- return -1;
+ check_order[i] = check_order[j];
+ check_order[j] = tmp;
+ }
}
- payload = (serialized_buffer_t *)buf; // Reuse this buffer to save kernel mem
- dnbd3_request.magic = dnbd3_packet_magic;
+ best_server.ss_family = 0; // 0 == no reachable server found yet
+ best_rtt = RTT_UNREACHABLE;
- for (i = 0; i < NUMBER_SERVERS; ++i) {
- check_order[i] = i;
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_discover,
- kthread_should_stop() || dev->discover || dev->thread_discover == NULL);
+ if (!ready || dev->panic)
+ isize = NUMBER_SERVERS;
+ else
+ isize = 3;
- if (kthread_should_stop() || dev->imgname == NULL || dev->thread_discover == NULL )
+ for (j = 0; j < NUMBER_SERVERS; ++j) {
+ if (!device_active(dev))
break;
+ i = check_order[j];
+ // Work on a snapshot so the lock is not held across network I/O
+ mutex_lock(&dev->alt_servers_lock);
+ host_compare = dev->alt_servers[i].host;
+ fails = dev->alt_servers[i].failures;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (host_compare.ss_family == 0)
+ continue; // Empty slot
+ // Reduced probability for hosts that have been unreachable
+ if (!dev->panic && fails > 50 && (get_random_u32() % 4) != 0)
+ continue; // If not in panic mode, skip server if it failed too many times
+ if (isize-- <= 0 && !is_same_server(&dev->cur_server.host, &host_compare))
+ continue; // Only test isize servers plus current server
+
+ // Initialize socket and connect
+ sock = NULL;
+ // Reset per-server state before anything can jump to the error label:
+ // a failed connect must not report an uninitialized version or the
+ // version of the previously probed server for this slot.
+ remote_version = 0;
+ if (dnbd3_connect(dev, &host_compare, &sock) != 0)
+ goto error;
- if (!dev->discover)
- continue;
- dev->discover = 0;
-
- if (dev->reported_size < 4096)
- continue;
-
- // Check if the list of alt servers needs to be updated and do so if necessary
- if (dev->new_servers_num)
- {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- for (i = 0; i < dev->new_servers_num; ++i)
- {
- if (dev->new_servers[i].host.type != HOST_IP4 && dev->new_servers[i].host.type != HOST_IP6) // Invalid entry?
- continue;
- alt_server = get_existing_server(&dev->new_servers[i], dev);
- if (alt_server != NULL ) // Server already known
- {
- if (dev->new_servers[i].failures == 1)
- {
- // REMOVE request
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Removing alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Removing alt server %pI6", alt_server->host.addr);
- alt_server->host.type = 0;
- continue;
- }
- // ADD, so just reset fail counter
- alt_server->failures = 0;
- continue;
- }
- if (dev->new_servers[i].failures == 1) // REMOVE, but server is not in list anyways
- continue;
- alt_server = get_free_alt_server(dev);
- if (alt_server == NULL ) // All NUMBER_SERVERS slots are taken, ignore entry
- continue;
- // Add new server entry
- alt_server->host = dev->new_servers[i].host;
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Adding alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Adding alt server %pI6", alt_server->host.addr);
- alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = RTT_UNREACHABLE;
- alt_server->protocol_version = 0;
- alt_server->failures = 0;
- }
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
+ if (!dnbd3_execute_handshake(dev, sock, &host_compare, &remote_version, false))
+ goto error;
- current_server = best_server = -1;
- best_rtt = 0xFFFFFFFul;
- if (dev->heartbeat_count < STARTUP_MODE_DURATION || dev->panic)
- {
- isize = NUMBER_SERVERS;
- }
- else
- {
- isize = 3;
- }
- if (NUMBER_SERVERS > isize) {
- for (i = 0; i < isize; ++i) {
- j = ((start.tv_sec >> i) ^ (start.tv_usec >> j)) % NUMBER_SERVERS;
- if (j != i) {
- mlen = check_order[i];
- check_order[i] = check_order[j];
- check_order[j] = mlen;
- }
+ // panic mode, take first responding server
+ if (dev->panic) {
+ dnbd3_dev_info_host(dev, &host_compare, "panic mode, changing to new server\n");
+ // Check global flag, a connect might have been in progress
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_host(dev, &host_compare, "...raced, ignoring\n");
+ } else {
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000 + 1000);
+ if (dnbd3_set_primary_connection(dev, sock, &host_compare, remote_version) != 0)
+ sock_release(sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
}
}
- for (j = 0; j < NUMBER_SERVERS; ++j)
- {
- i = check_order[j];
- if (dev->alt_servers[i].host.type == 0) // Empty slot
- continue;
- if (!dev->panic && dev->alt_servers[i].failures > 50 && (start.tv_usec & 7) != 0) // If not in panic mode, skip server if it failed too many times
- continue;
- if (isize-- <= 0 && !is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- continue;
-
- // Initialize socket and connect
- if (dnbd3_sock_create(dev->alt_servers[i].host.type, SOCK_STREAM, IPPROTO_TCP, &sock) < 0)
- {
- debug_alt("ERROR: Couldn't create socket (discover).");
- sock = NULL;
- continue;
- }
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- sock->sk->sk_allocation = GFP_NOIO;
- if (dev->alt_servers[i].host.type == HOST_IP4)
- {
- sin4.sin_family = AF_INET;
- memcpy(&sin4.sin_addr, dev->alt_servers[i].host.addr, 4);
- sin4.sin_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin4, sizeof(sin4), 0) < 0)
- goto error;
- }
- else
- {
- sin6.sin6_family = AF_INET6;
- memcpy(&sin6.sin6_addr, dev->alt_servers[i].host.addr, 16);
- sin6.sin6_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin6, sizeof(sin6), 0) < 0)
- goto error;
- }
+ // actual rtt measurement is just the first block request and its reply
+ start = ktime_get_real();
+ if (!dnbd3_request_test_block(dev, &host_compare, sock))
+ goto error;
+ end = ktime_get_real();
+ // Seed rtt with this probe's own measurement (in µs). If the slot was
+ // changed while we were probing, the history average below is skipped
+ // and we must not fall back to a stale value from an earlier iteration.
+ rtt = (unsigned long)ktime_us_delta(end, start);
- // Request filesize
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(payload);
- serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
- serializer_put_string(payload, dev->imgname); // image name
- serializer_put_uint16(payload, dev->rid); // revision id
- serializer_put_uint8(payload, 0); // are we a server? (no!)
- iov[1].iov_base = payload;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(payload);
- fixup_request(dnbd3_request);
- mlen = iov[1].iov_len + sizeof(dnbd3_request);
- if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen)
- error_alt("ERROR: Requesting image size failed.");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving image size packet (header) failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4)
- error_alt("ERROR: Content of image size packet (header) mismatched (discover).");
-
- // receive data
- iov[0].iov_base = payload;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_alt("ERROR: Receiving image size packet (payload) failed (discover).");
- serializer_reset_read(payload, dnbd3_reply.size);
-
- dev->alt_servers[i].protocol_version = serializer_get_uint16(payload);
- if (dev->alt_servers[i].protocol_version < MIN_SUPPORTED_SERVER)
- error_alt_va("ERROR: Server version too old (client: %d, server: %d, min supported: %d).",
- (int)PROTOCOL_VERSION, (int)dev->alt_servers[i].protocol_version, (int)MIN_SUPPORTED_SERVER);
-
- name = serializer_get_string(payload);
- if (name == NULL )
- error_alt("ERROR: Server did not supply an image name (discover).");
-
- if (strcmp(name, dev->imgname) != 0)
- error_alt_va("ERROR: Image name does not match requested one (client: '%s', server: '%s') (discover).",
- dev->imgname, name);
-
- rid = serializer_get_uint16(payload);
- if (rid != dev->rid)
- error_alt_va("ERROR: Server supplied wrong rid (client: '%d', server: '%d') (discover).",
- (int)dev->rid, (int)rid);
-
- filesize = serializer_get_uint64(payload);
- if (filesize != dev->reported_size)
- error_alt_va("ERROR: Reported image size of %llu does not match expected value %llu.(discover).",
- (unsigned long long)filesize, (unsigned long long)dev->reported_size);
-
- // panic mode, take first responding server
- if (dev->panic)
- {
- dev->panic = 0;
- debug_alt("WARN: Panic mode, changing server:");
- if (best_sock != NULL )
- sock_release(best_sock);
- dev->better_sock = sock; // Pass over socket to take a shortcut in *_connect();
- kfree(buf);
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[i], sizeof(dev->cur_server));
- dnbd3_net_connect(dev);
- return 0;
- }
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) { // slot may have been replaced meanwhile
+ dev->alt_servers[i].protocol_version = remote_version;
+ dev->alt_servers[i].rtts[turn] = rtt;
- // Request block
- dnbd3_request.cmd = CMD_GET_BLOCK;
- // Do *NOT* pick a random block as it has proven to cause severe
- // cache thrashing on the server
- dnbd3_request.offset = 0;
- dnbd3_request.size = RTT_BLOCK_SIZE;
- fixup_request(dnbd3_request);
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
-
- // start rtt measurement
- do_gettimeofday(&start);
-
- if (kernel_sendmsg(sock, &msg, iov, 1, sizeof(dnbd3_request)) <= 0)
- error_alt("ERROR: Requesting test block failed (discover).");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving test block header packet failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic
- != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE)
- error_alt_va("ERROR: Unexpected reply to block request: cmd=%d, size=%d (discover).",
- (int)dnbd3_reply.cmd, (int)dnbd3_reply.size);
-
- // receive data
- iov[0].iov_base = buf;
- iov[0].iov_len = RTT_BLOCK_SIZE;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE)
- error_alt("ERROR: Receiving test block payload failed (discover).");
-
- do_gettimeofday(&end); // end rtt measurement
-
- dev->alt_servers[i].rtts[turn] = (unsigned long)((end.tv_sec - start.tv_sec) * 1000000ull
- + (end.tv_usec - start.tv_usec));
-
- rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2]
- + dev->alt_servers[i].rtts[3]) / 4;
-
- if (best_rtt > rtt)
- {
- // This one is better, keep socket open in case we switch
- best_rtt = rtt;
- best_server = i;
- if (best_sock != NULL )
- sock_release(best_sock);
- best_sock = sock;
- sock = NULL;
- }
- else
- {
- // Not better, discard connection
- sock_release(sock);
- sock = NULL;
- }
+ // Replace the single sample by the average over the history
+ rtt = 0;
- // update cur servers rtt
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = rtt;
- current_server = i;
- }
+ for (k = 0; k < DISCOVER_HISTORY_SIZE; ++k)
+ rtt += dev->alt_servers[i].rtts[k];
+ rtt /= DISCOVER_HISTORY_SIZE;
dev->alt_servers[i].failures = 0;
+ if (dev->alt_servers[i].best_count > 1)
+ dev->alt_servers[i].best_count -= 2;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
- continue;
-
- error: ;
- ++dev->alt_servers[i].failures;
+ if (best_rtt > rtt) {
+ // This one is better, keep socket open in case we switch
+ best_rtt = rtt;
+ best_server = host_compare;
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ best_sock = sock;
+ sock = NULL;
+ } else {
+ // Not better, discard connection
sock_release(sock);
sock = NULL;
- dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = RTT_UNREACHABLE;
- current_server = i;
- }
- continue;
}
- if (dev->panic)
- {
- // After 21 retries, bail out by reporting errors to block layer
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count < 255 && ++dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
- dnbd3_blk_fail_all_requests(dev);
- }
+ // update cur servers rtt
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = rtt;
- if (best_server == -1 || kthread_should_stop() || dev->thread_discover == NULL ) // No alt server could be reached at all or thread should stop
- {
- if (best_sock != NULL ) // Should never happen actually
- {
- sock_release(best_sock);
- best_sock = NULL;
- }
- continue;
- }
+ continue;
- do_change = ready && best_server != current_server && (start.tv_usec & 3) != 0
- && RTT_THRESHOLD_FACTOR(dev->cur_rtt) > best_rtt + 1500;
-
- if (ready && !do_change) {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (!list_empty(&dev->request_queue_send))
- {
- cur_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- do_change = (cur_request == last_request);
- if (do_change)
- printk("WARNING: Hung request on %s\n", dev->disk->disk_name);
- }
- else
- {
- cur_request = (struct request *)123;
- }
- last_request = cur_request;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+error:
+ if (sock != NULL) {
+ sock_release(sock);
+ sock = NULL;
}
-
- // take server with lowest rtt
- if (do_change)
- {
- printk("INFO: Server %d on %s is faster (%lluµs vs. %lluµs)\n", best_server, dev->disk->disk_name,
- (unsigned long long)best_rtt, (unsigned long long)dev->cur_rtt);
- kfree(buf);
- dev->better_sock = best_sock; // Take shortcut by continuing to use open connection
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[best_server], sizeof(dev->cur_server));
- dev->cur_rtt = best_rtt;
- dnbd3_net_connect(dev);
- return 0;
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
+ // remote_version is non-zero only if the handshake got far enough to learn it
+ if (remote_version)
+ dev->alt_servers[i].protocol_version = remote_version;
+ ++dev->alt_servers[i].failures;
+ dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
+ if (dev->alt_servers[i].best_count > 2)
+ dev->alt_servers[i].best_count -= 3;
}
-
- // Clean up connection that was held open for quicker server switch
- if (best_sock != NULL )
- {
- sock_release(best_sock);
- best_sock = NULL;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = RTT_UNREACHABLE;
+ } // END - for loop over alt_servers
+
+ if (best_server.ss_family == 0) {
+ // No alt server could be reached
+ ASSERT(!best_sock);
+ if (dev->panic) {
+ if (dev->panic_count < 255)
+ dev->panic_count++;
+ // If probe timeout is set, report error to block layer
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
+ dnbd3_blk_fail_all_requests(dev);
}
+ return;
+ }
- if (!ready || (start.tv_usec & 15) != 0)
- turn = (turn + 1) % 4;
- if (turn == 2) // Set ready when we only have 2 of 4 measurements for quicker load balancing
- ready = 1;
-
+ // If best server was repeatedly measured best, lower the switching threshold more
+ mutex_lock(&dev->alt_servers_lock);
+ alt = get_existing_alt_from_addr(&best_server, dev);
+ if (alt != NULL) {
+ if (alt->best_count < 178)
+ alt->best_count += 3;
+ rtt_threshold = 1800 - (alt->best_count * 10);
+ remote_version = alt->protocol_version;
+ } else {
+ rtt_threshold = 1800;
+ remote_version = 0;
}
- kfree(buf);
- return 0;
+ mutex_unlock(&dev->alt_servers_lock);
+
+ do_change = ready && !is_same_server(&best_server, &dev->cur_server.host)
+ && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold;
+
+ // take server with lowest rtt
+ // if a (dis)connect is already in progress, we do nothing, this is not panic mode
+ if (do_change && device_active(dev) && dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_cur(dev, "server %pISpc is faster (%lluµs vs. %lluµs)\n",
+ &best_server,
+ (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt);
+ // best_rtt is in µs, the socket timeouts are given in ms
+ set_socket_timeout(best_sock, false, // recv
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_RECV * 1000) + 500);
+ set_socket_timeout(best_sock, true, // send
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_SEND * 1000) + 500);
+ if (dnbd3_set_primary_connection(dev, best_sock, &best_server, remote_version) != 0)
+ sock_release(best_sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
+ }
+
+ // Clean up connection that was held open for quicker server switch
+ if (best_sock != NULL)
+ sock_release(best_sock);
}
-int dnbd3_net_send(void *data)
+/**
+ * Worker for sending pending requests. This will be triggered whenever
+ * we get a new request from the block layer. The worker will then
+ * work through all the requests in the send queue, request them from
+ * the server, and return again.
+ *
+ * @work: the send_work member embedded in the owning dnbd3_device_t.
+ *
+ * Runs under send_mutex. For as long as there is a socket and the device
+ * is active, the head of send_queue is moved to the tail of recv_queue
+ * (each queue guarded by its own spinlock) and a CMD_GET_BLOCK request
+ * for it is sent. The move happens before the request goes out on the
+ * wire. If sending fails the loop stops; unless connection_lock is
+ * currently taken, the failure is logged and a panic discovery is
+ * started.
+ */
+static void dnbd3_send_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request;
-
- dnbd3_request_t dnbd3_request;
- struct msghdr msg;
- struct kvec iov;
-
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, send_work); // work_struct -> owning device
+ struct request *blk_request;
+ struct dnbd3_cmd *cmd;
unsigned long irqflags;
- init_msghdr(msg);
-
- dnbd3_request.magic = dnbd3_packet_magic;
-
- set_user_nice(current, -20);
-
- // move already sent requests to request_queue_send again
- while (!list_empty(&dev->request_queue_receive))
- {
- printk("WARN: Request queue was not empty on %s\n", dev->disk->disk_name);
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- list_add(&blk_request->queuelist, &dev->request_queue_send);
- }
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_send, kthread_should_stop() || !list_empty(&dev->request_queue_send));
-
- if (kthread_should_stop())
+ mutex_lock(&dev->send_mutex);
+ while (dev->sock && device_active(dev)) {
+ // extract next block request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ if (list_empty(&dev->send_queue)) {
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
break;
-
- // extract block request
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (list_empty(&dev->request_queue_send))
- {
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
- blk_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- // what to do?
- switch (dnbd3_req_op(blk_request))
- {
- case DNBD3_DEV_READ:
- dnbd3_request.cmd = CMD_GET_BLOCK;
- dnbd3_request.offset = blk_rq_pos(blk_request) << 9; // *512
- dnbd3_request.size = blk_rq_bytes(blk_request); // bytes left to complete entire request
- // enqueue request to request_queue_receive
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- list_add_tail(&blk_request->queuelist, &dev->request_queue_receive);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- break;
- case DNBD3_REQ_OP_SPECIAL:
- dnbd3_request.cmd = dnbd3_priv_to_cmd(blk_request);
- dnbd3_request.size = 0;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // append to receive queue
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_add_tail(&blk_request->queuelist, &dev->recv_queue);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+
+ cmd = blk_mq_rq_to_pdu(blk_request); // per-request driver data; supplies the handle sent with the request
+ if (!dnbd3_send_request(dev->sock, CMD_GET_BLOCK, cmd->handle,
+ blk_rq_pos(blk_request) << 9 /* sectors -> byte offset */, blk_rq_bytes(blk_request))) {
+ if (!dnbd3_flag_taken(dev->connection_lock)) { // stay quiet while connection_lock is taken
+ dnbd3_dev_err_cur(dev, "connection to server lost (send)\n");
+ dnbd3_start_discover(dev, true);
+ }
break;
-
- default:
- printk("ERROR: Unknown command (send %u %u)\n", (int)blk_request->cmd_flags, (int)dnbd3_req_op(blk_request));
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
-
- // send net request
- dnbd3_request.handle = (uint64_t)(uintptr_t)blk_request; // Double cast to prevent warning on 32bit
- fixup_request(dnbd3_request);
- iov.iov_base = &dnbd3_request;
- iov.iov_len = sizeof(dnbd3_request);
- if (kernel_sendmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_request)) != sizeof(dnbd3_request))
- {
- debug_dev("ERROR: Connection to server lost (send)");
- goto error;
- }
- wake_up(&dev->process_queue_receive);
}
-
- dev->thread_send = NULL;
- return 0;
-
- error: ;
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
- dev->thread_send = NULL;
- return -1;
+ mutex_unlock(&dev->send_mutex);
}
-int dnbd3_net_receive(void *data)
+/**
+ * The receive workfn stays active for as long as the connection to a server
+ * lasts, i.e. it only gets restarted when we switch to a new server.
+ */
+static void dnbd3_recv_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request, *received_request;
-
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, recv_work);
+ struct request *blk_request;
+ struct request *rq_iter;
+ struct dnbd3_cmd *cmd;
+ dnbd3_reply_t reply_hdr;
struct req_iterator iter;
struct bio_vec bvec_inst;
struct bio_vec *bvec = &bvec_inst;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov;
void *kaddr;
unsigned long irqflags;
- sigset_t blocked, oldset;
uint16_t rid;
- unsigned long int recv_timeout = jiffies;
-
- int count, remaining, ret;
-
- init_msghdr(msg);
- set_user_nice(current, -20);
+ int remaining;
+ int ret;
- while (!kthread_should_stop())
- {
+ mutex_lock(&dev->recv_mutex);
+ while (dev->sock) {
// receive net reply
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = sizeof(dnbd3_reply);
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags);
- if (ret == -EAGAIN)
- {
- if (jiffies < recv_timeout) recv_timeout = jiffies; // Handle overflow
- if ((jiffies - recv_timeout) / HZ > SOCKET_KEEPALIVE_TIMEOUT)
- error_dev_va("ERROR: Receive timeout reached (%d of %d secs).", (int)((jiffies - recv_timeout) / HZ), (int)SOCKET_KEEPALIVE_TIMEOUT);
- continue;
+ ret = dnbd3_recv_reply(dev->sock, &reply_hdr);
+ if (ret == 0) {
+ /* no data received; the remote peer has shut down the connection properly */
+ dnbd3_dev_dbg_cur(dev, "remote peer has performed an orderly shutdown\n");
+ goto out_unlock;
+ } else if (ret < 0) {
+ if (ret == -EAGAIN) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "receive timeout reached\n");
+ } else {
+ /* for all errors other than -EAGAIN, print errno */
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "connection to server lost (receive, errno=%d)\n", ret);
+ }
+ goto out_unlock;
}
- if (ret <= 0)
- error_dev("ERROR: Connection to server lost (receive)");
- if (ret != sizeof(dnbd3_reply))
- error_dev("ERROR: Recv msg header.");
- fixup_reply(dnbd3_reply);
- // check error
- if (dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("ERROR: Wrong packet magic (Receive).");
- if (dnbd3_reply.cmd == 0)
- error_dev("ERROR: Command was 0 (Receive).");
+ /* check if arrived data is valid */
+ if (ret != sizeof(reply_hdr)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv partial msg header (%d/%d bytes)\n",
+ ret, (int)sizeof(reply_hdr));
+ goto out_unlock;
+ }
- // Update timeout
- recv_timeout = jiffies;
+ // check error
+ if (reply_hdr.magic != dnbd3_packet_magic) {
+ dnbd3_dev_err_cur(dev, "wrong packet magic (receive)\n");
+ goto out_unlock;
+ }
// what to do?
- switch (dnbd3_reply.cmd)
- {
+ switch (reply_hdr.cmd) {
case CMD_GET_BLOCK:
// search for replied request in queue
blk_request = NULL;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(received_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- if ((uint64_t)(uintptr_t)received_request == dnbd3_reply.handle) // Double cast to prevent warning on 32bit
- {
- blk_request = received_request;
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ cmd = blk_mq_rq_to_pdu(rq_iter);
+ if (cmd->handle == reply_hdr.handle) {
+ blk_request = rq_iter;
+ list_del_init(&blk_request->queuelist);
break;
}
}
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- if (blk_request == NULL )
- error_dev_va("ERROR: Received block data for unrequested handle (%llu: %llu).\n",
- (unsigned long long)dnbd3_reply.handle, (unsigned long long)dnbd3_reply.size);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (blk_request == NULL) {
+ dnbd3_dev_err_cur(dev, "received block data for unrequested handle (%llx: len=%llu)\n",
+ reply_hdr.handle,
+ (u64)reply_hdr.size);
+ goto out_unlock;
+ }
// receive data and answer to block layer
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
- rq_for_each_segment(bvec_inst, blk_request, iter)
+ rq_for_each_segment(bvec_inst, blk_request, iter) {
#else
- rq_for_each_segment(bvec, blk_request, iter)
+ rq_for_each_segment(bvec, blk_request, iter) {
#endif
- {
- siginitsetinv(&blocked, sigmask(SIGKILL));
- sigprocmask(SIG_SETMASK, &blocked, &oldset);
-
kaddr = kmap(bvec->bv_page) + bvec->bv_offset;
iov.iov_base = kaddr;
iov.iov_len = bvec->bv_len;
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags) != bvec->bv_len)
- {
- kunmap(bvec->bv_page);
- sigprocmask(SIG_SETMASK, &oldset, NULL );
- error_dev("ERROR: Receiving from net to block layer.");
- }
+ ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags);
kunmap(bvec->bv_page);
-
- sigprocmask(SIG_SETMASK, &oldset, NULL );
+ if (ret != bvec->bv_len) {
+ if (ret == 0) {
+ /* no data received; the remote peer has shut down the connection properly */
+ dnbd3_dev_dbg_cur(
+ dev, "remote peer has performed an orderly shutdown\n");
+ } else if (ret < 0) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "disconnect: receiving from net to block layer\n");
+ } else {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "receiving from net to block layer (%d bytes)\n", ret);
+ }
+ // Requeue request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ goto out_unlock;
+ }
}
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- __blk_end_request_all(blk_request, 0);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
+ blk_mq_end_request(blk_request, BLK_STS_OK);
+ break;
case CMD_GET_SERVERS:
- if (!dev->use_server_provided_alts)
- {
- remaining = dnbd3_reply.size;
- goto consume_payload;
- }
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- count = MIN(NUMBER_SERVERS, dnbd3_reply.size / sizeof(dnbd3_server_entry_t));
-
- if (count != 0)
- {
- iov.iov_base = dev->new_servers;
- iov.iov_len = count * sizeof(dnbd3_server_entry_t);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags)
- != (count * sizeof(dnbd3_server_entry_t)))
- error_dev("ERROR: Recv CMD_GET_SERVERS payload.");
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = count;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
- // If there were more servers than accepted, remove the remaining data from the socket buffer
- remaining = dnbd3_reply.size - (count * sizeof(dnbd3_server_entry_t));
- consume_payload: while (remaining > 0)
- {
- count = MIN(sizeof(dnbd3_reply), remaining); // Abuse the reply struct as the receive buffer
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = count;
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
- if (ret <= 0)
- error_dev("ERROR: Recv additional payload from CMD_GET_SERVERS.");
- remaining -= ret;
+ remaining = reply_hdr.size;
+ if (dev->use_server_provided_alts) {
+ dnbd3_server_entry_t new_server;
+
+ while (remaining >= sizeof(dnbd3_server_entry_t)) {
+ if (dnbd3_recv_bytes(dev->sock, &new_server, sizeof(new_server))
+ != sizeof(new_server)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv CMD_GET_SERVERS payload\n");
+ goto out_unlock;
+ }
+ // TODO: Log
+ if (new_server.failures == 0) { // ADD
+ dnbd3_add_server(dev, &new_server.host);
+ } else { // REM
+ dnbd3_rem_server(dev, &new_server.host);
+ }
+ remaining -= sizeof(new_server);
+ }
}
- continue;
+ if (!dnbd3_drain_socket(dev, dev->sock, remaining))
+ goto out_unlock;
+ break;
case CMD_LATEST_RID:
- if (dnbd3_reply.size != 2)
- {
- printk("ERROR: CMD_LATEST_RID.size != 2.\n");
+ if (reply_hdr.size < 2) {
+ dev_err(dnbd3_device_to_dev(dev), "CMD_LATEST_RID.size < 2\n");
continue;
}
- iov.iov_base = &rid;
- iov.iov_len = sizeof(rid);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags) <= 0)
- {
- printk("ERROR: Could not receive CMD_LATEST_RID payload.\n");
- }
- else
- {
+ if (dnbd3_recv_bytes(dev->sock, &rid, 2) != 2) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dev_err(dnbd3_device_to_dev(dev), "could not receive CMD_LATEST_RID payload\n");
+ } else {
rid = net_order_16(rid);
- printk("Latest rid of %s is %d (currently using %d)\n", dev->imgname, (int)rid, (int)dev->rid);
+ dnbd3_dev_info_cur(dev, "latest rid of %s is %d (currently using %d)\n",
+ dev->imgname, (int)rid, (int)dev->rid);
dev->update_available = (rid > dev->rid ? 1 : 0);
}
+ if (reply_hdr.size > 2)
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size - 2);
continue;
case CMD_KEEPALIVE:
- if (dnbd3_reply.size != 0)
- printk("ERROR: keep alive packet with payload.\n");
+ if (reply_hdr.size != 0) {
+ dev_dbg(dnbd3_device_to_dev(dev), "keep alive packet with payload\n");
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size);
+ }
continue;
default:
- printk("ERROR: Unknown command (Receive)\n");
- continue;
+ dev_err(dnbd3_device_to_dev(dev), "unknown command: %d (receive), aborting connection\n", (int)reply_hdr.cmd);
+ goto out_unlock;
+ }
+ }
+out_unlock:
+ // This will check if we actually still need a new connection
+ dnbd3_start_discover(dev, true);
+ mutex_unlock(&dev->recv_mutex);
+}
+/**
+ * Set send or receive timeout of given socket
+ */
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
+ int opt = set_send ? SO_SNDTIMEO_NEW : SO_RCVTIMEO_NEW;
+ struct __kernel_sock_timeval timeout;
+#else
+ int opt = set_send ? SO_SNDTIMEO : SO_RCVTIMEO;
+ struct timeval timeout;
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+ sockptr_t timeout_ptr = KERNEL_SOCKPTR(&timeout);
+#else
+ char *timeout_ptr = (char *)&timeout;
+#endif
+
+ timeout.tv_sec = timeout_ms / 1000;
+ timeout.tv_usec = (timeout_ms % 1000) * 1000;
+ sock_setsockopt(sock, SOL_SOCKET, opt, timeout_ptr, sizeof(timeout));
+}
+
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket **sock_out)
+{
+ ktime_t start;
+ int ret, connect_time_ms;
+ struct socket *sock;
+ int retries = 4;
+ const int addrlen = addr->ss_family == AF_INET ? sizeof(struct sockaddr_in)
+ : sizeof(struct sockaddr_in6);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+ ret = sock_create_kern(&init_net, addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#else
+ ret = sock_create_kern(addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#endif
+ if (ret < 0) {
+ dev_err(dnbd3_device_to_dev(dev), "couldn't create socket: %d\n", ret);
+ return ret;
+ }
+
+ /* Only one retry, TCP no delay */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+ tcp_sock_set_syncnt(sock->sk, 1);
+ tcp_sock_set_nodelay(sock->sk);
+ /* lingering on close is pointless given our aggressive timeouts */
+ sock_no_linger(sock->sk);
+#else
+ /* add legacy version of this, but ignore others as they're not that important */
+ ret = 1;
+ kernel_setsockopt(sock, IPPROTO_TCP, TCP_SYNCNT,
+ (char *)&ret, sizeof(ret));
+#endif
+ /* allow this socket to use reserved mem (vm.min_free_kbytes) */
+ sk_set_memalloc(sock->sk);
+ sock->sk->sk_allocation = GFP_NOIO;
+
+ if (dev->panic && dev->panic_count > 1) {
+ /* in panic mode for some time, start increasing timeouts */
+ connect_time_ms = dev->panic_count * 1000;
+ } else {
+ /* otherwise, use 2*RTT of current server */
+ connect_time_ms = dev->cur_server.rtt * 2 / 1000;
+ }
+ /* but obey a minimal configurable value, and maximum sanity check */
+ if (connect_time_ms < SOCKET_TIMEOUT_SEND * 1000)
+ connect_time_ms = SOCKET_TIMEOUT_SEND * 1000;
+ else if (connect_time_ms > 60000)
+ connect_time_ms = 60000;
+ set_socket_timeout(sock, false, connect_time_ms); // recv
+ set_socket_timeout(sock, true, connect_time_ms); // send
+ start = ktime_get_real();
+ while (--retries > 0) {
+ ret = kernel_connect(sock, (struct sockaddr *)addr, addrlen, 0);
+ connect_time_ms = (int)ktime_ms_delta(ktime_get_real(), start);
+ if (connect_time_ms > 2 * SOCKET_TIMEOUT_SEND * 1000) {
+ /* Either I'm losing my mind or there was a specific build of kernel
+ * 5.x where SO_RCVTIMEO didn't affect the connect call above, so
+ * this function would hang for over a minute for unreachable hosts.
+ * Leave in this debug check for twice the configured timeout
+ */
+ dnbd3_dev_dbg_host(dev, addr, "connect: call took %dms\n",
+ connect_time_ms);
}
+ if (ret != 0) {
+ if (ret == -EINTR)
+ dnbd3_dev_dbg_host(dev, addr, "connect: interrupted system call (blocked %dms)\n",
+ connect_time_ms);
+ else
+ dnbd3_dev_dbg_host(dev, addr, "connect: failed (%d, blocked %dms)\n",
+ ret, connect_time_ms);
+ goto error;
+ }
+ *sock_out = sock;
+ return 0;
}
+error:
+ sock_release(sock);
+ return ret < 0 ? ret : -EIO;
+}
- printk("dnbd3_net_receive terminated normally.\n");
- dev->thread_receive = NULL;
- return 0;
+#define dnbd3_err_dbg_host(...) do { \
+ if (dev->panic || dev->sock == NULL) \
+ dnbd3_dev_err_host(__VA_ARGS__); \
+ else \
+ dnbd3_dev_dbg_host(__VA_ARGS__); \
+} while (0)
+
+/**
+ * Execute protocol handshake on a newly connected socket.
+ * If this is the initial connection to any server, i.e. we're being called
+ * through the initial ioctl() to open a device, we'll store the rid, filesize
+ * etc. in the dev struct; otherwise, this is a potential switch to another
+ * server, so we validate the filesize, rid, name against what we expect.
+ * The server's protocol version is returned in 'remote_version'
+ */
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_data)
+{
+ unsigned long irqflags;
+ const char *name;
+ uint64_t filesize;
+ int mlen;
+ uint16_t rid;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov[2];
+ serialized_buffer_t *payload;
+ dnbd3_reply_t reply_hdr;
+ dnbd3_request_t request_hdr = { .magic = dnbd3_packet_magic };
+
+ payload = kmalloc(sizeof(*payload), GFP_KERNEL);
+ if (payload == NULL)
+ goto error;
+
+ if (copy_data && device_active(dev))
+ dev_warn(dnbd3_device_to_dev(dev), "Called handshake function with copy_data enabled when reported_size is not zero\n");
+
+ // Request filesize
+ request_hdr.cmd = CMD_SELECT_IMAGE;
+ iov[0].iov_base = &request_hdr;
+ iov[0].iov_len = sizeof(request_hdr);
+ serializer_reset_write(payload);
+ serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
+ serializer_put_string(payload, dev->imgname); // image name
+ serializer_put_uint16(payload, dev->rid); // revision id
+ serializer_put_uint8(payload, 0); // are we a server? (no!)
+ iov[1].iov_base = payload;
+ request_hdr.size = iov[1].iov_len = serializer_get_written_length(payload);
+ fixup_request(request_hdr);
+ mlen = iov[0].iov_len + iov[1].iov_len;
+ if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen) {
+ dnbd3_err_dbg_host(dev, addr, "requesting image size failed\n");
+ goto error;
+ }
+
+ // receive net reply
+ if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+ dnbd3_err_dbg_host(dev, addr, "receiving image size packet (header) failed\n");
+ goto error;
+ }
+ if (reply_hdr.magic != dnbd3_packet_magic
+ || reply_hdr.cmd != CMD_SELECT_IMAGE || reply_hdr.size < 4
+ || reply_hdr.size > sizeof(*payload)) {
+ dnbd3_err_dbg_host(dev, addr,
+ "corrupt CMD_SELECT_IMAGE reply\n");
+ goto error;
+ }
+
+ // receive data
+ iov[0].iov_base = payload;
+ iov[0].iov_len = reply_hdr.size;
+ if (kernel_recvmsg(sock, &msg, iov, 1, reply_hdr.size, msg.msg_flags)
+ != reply_hdr.size) {
+ dnbd3_err_dbg_host(dev, addr,
+ "receiving payload of CMD_SELECT_IMAGE reply failed\n");
+ goto error;
+ }
+ serializer_reset_read(payload, reply_hdr.size);
+
+ *remote_version = serializer_get_uint16(payload);
+ name = serializer_get_string(payload);
+ rid = serializer_get_uint16(payload);
+ filesize = serializer_get_uint64(payload);
+
+ if (*remote_version < MIN_SUPPORTED_SERVER) {
+ dnbd3_err_dbg_host(dev, addr,
+ "server version too old (client: %d, server: %d, min supported: %d)\n",
+ (int)PROTOCOL_VERSION, (int)*remote_version,
+ (int)MIN_SUPPORTED_SERVER);
+ goto error;
+ }
+ if (name == NULL) {
+ dnbd3_err_dbg_host(dev, addr, "server did not supply an image name\n");
+ goto error;
+ }
+ if (rid == 0) {
+ dnbd3_err_dbg_host(dev, addr, "server did not supply a revision id\n");
+ goto error;
+ }
+
+ if (copy_data) {
+ if (filesize < DNBD3_BLOCK_SIZE) {
+ dnbd3_err_dbg_host(dev, addr, "reported size by server is < 4096\n");
+ goto error;
+ }
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (strlen(dev->imgname) < strlen(name)) {
+ dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_KERNEL);
+ if (dev->imgname == NULL) {
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_err_dbg_host(dev, addr, "reallocating buffer for new image name failed\n");
+ goto error;
+ }
+ }
+ strcpy(dev->imgname, name);
+ dev->rid = rid;
+ // store image information
+ dev->reported_size = filesize;
+ dev->update_available = 0;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
+ dnbd3_dev_dbg_host(dev, addr, "image size: %llu\n", dev->reported_size);
+ } else {
+ /* switching connection, sanity checks */
+ if (rid != dev->rid) {
+ dnbd3_err_dbg_host(dev, addr,
+ "server supplied wrong rid (client: '%d', server: '%d')\n",
+ (int)dev->rid, (int)rid);
+ goto error;
+ }
+
+ if (strcmp(name, dev->imgname) != 0) {
+ dnbd3_err_dbg_host(dev, addr, "server offers image '%s', requested '%s'\n", name, dev->imgname);
+ goto error;
+ }
+
+ if (filesize != dev->reported_size) {
+ dnbd3_err_dbg_host(dev, addr,
+ "reported image size of %llu does not match expected value %llu\n",
+ (unsigned long long)filesize, (unsigned long long)dev->reported_size);
+ goto error;
+ }
+ }
+ kfree(payload);
+ return true;
+
+error:
+ kfree(payload);
+ return false;
+}
+
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ dnbd3_request_t request_hdr = {
+ .magic = dnbd3_packet_magic,
+ .cmd = cmd,
+ .size = size,
+ .offset = offset,
+ .handle = handle,
+ };
+ struct kvec iov = { .iov_base = &request_hdr, .iov_len = sizeof(request_hdr) };
+
+ fixup_request(request_hdr);
+ return kernel_sendmsg(sock, &msg, &iov, 1, sizeof(request_hdr)) == sizeof(request_hdr);
+}
+
+/**
+ * Send a request with given cmd type and empty payload.
+ */
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd)
+{
+ int ret;
+
+ mutex_lock(&dev->send_mutex);
+ ret = dev->sock
+ && dnbd3_send_request(dev->sock, cmd, 0, 0, 0);
+ mutex_unlock(&dev->send_mutex);
+ return ret;
+}
+
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov = { .iov_base = buffer, .iov_len = count };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, count, msg.msg_flags);
+}
+
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr)
+{
+ int ret = dnbd3_recv_bytes(sock, reply_hdr, sizeof(*reply_hdr));
+
+ fixup_reply(*reply_hdr);
+ return ret;
+}
+
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes)
+{
+ int ret;
+ struct kvec iov;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+
+ while (bytes > 0) {
+ iov.iov_base = __garbage_mem;
+ iov.iov_len = sizeof(__garbage_mem);
+ ret = kernel_recvmsg(sock, &msg, &iov, 1, MIN(bytes, iov.iov_len), msg.msg_flags);
+ if (ret <= 0) {
+ dnbd3_dev_err_cur(dev, "draining payload failed (ret=%d)\n", ret);
+ return false;
+ }
+ bytes -= ret;
+ }
+ return true;
+}
+
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket *sock)
+{
+ dnbd3_reply_t reply_hdr;
+
+ // Request block
+ if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, 0, RTT_BLOCK_SIZE)) {
+ dnbd3_err_dbg_host(dev, addr, "requesting test block failed\n");
+ return false;
+ }
+
+ // receive net reply
+ if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+ dnbd3_err_dbg_host(dev, addr, "receiving test block header packet failed\n");
+ return false;
+ }
+ if (reply_hdr.magic != dnbd3_packet_magic || reply_hdr.cmd != CMD_GET_BLOCK
+ || reply_hdr.size != RTT_BLOCK_SIZE || reply_hdr.handle != 0) {
+ dnbd3_err_dbg_host(dev, addr,
+ "unexpected reply to block request: cmd=%d, size=%d, handle=%llu (discover)\n",
+ (int)reply_hdr.cmd, (int)reply_hdr.size, reply_hdr.handle);
+ return false;
+ }
- error:
+ // receive data
+ return dnbd3_drain_socket(dev, sock, RTT_BLOCK_SIZE);
+}
+#undef dnbd3_err_dbg_host
+
+static void replace_main_socket(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ mutex_lock(&dev->send_mutex);
+ // First, shutdown connection, so receive worker will leave its mainloop
if (dev->sock)
kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ mutex_lock(&dev->recv_mutex);
+ // Receive worker is done, get rid of socket and replace
+ if (dev->sock)
+ sock_release(dev->sock);
+ dev->sock = sock;
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (addr == NULL) {
+ memset(&dev->cur_server, 0, sizeof(dev->cur_server));
+ } else {
+ dev->cur_server.host = *addr;
+ dev->cur_server.rtt = 0;
+ dev->cur_server.protocol_version = protocol_version;
}
- dev->thread_receive = NULL;
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ mutex_unlock(&dev->recv_mutex);
+ mutex_unlock(&dev->send_mutex);
}
+static void dnbd3_release_resources(dnbd3_device_t *dev)
+{
+ if (dev->send_wq)
+ destroy_workqueue(dev->send_wq);
+ dev->send_wq = NULL;
+ if (dev->recv_wq)
+ destroy_workqueue(dev->recv_wq);
+ dev->recv_wq = NULL;
+ mutex_destroy(&dev->send_mutex);
+ mutex_destroy(&dev->recv_mutex);
+}
+
+/**
+ * Establish new connection on a dnbd3 device.
+ * Return 0 on success, negative errno otherwise
+ */
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init)
+{
+ unsigned long irqflags;
+ struct socket *sock = NULL;
+ uint16_t proto_version;
+ int ret;
+
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (init && device_active(dev)) {
+ dnbd3_dev_err_cur(dev, "device already configured/connected\n");
+ return -EBUSY;
+ }
+ if (!init && !device_active(dev)) {
+ dev_warn(dnbd3_device_to_dev(dev), "connection switch called on unconfigured device\n");
+ return -ENOTCONN;
+ }
+
+ dnbd3_dev_dbg_host(dev, addr, "connecting...\n");
+ ret = dnbd3_connect(dev, addr, &sock);
+ if (ret != 0 || sock == NULL)
+ goto error;
+
+ /* execute the "select image" handshake */
+ // if init is true, reported_size will be set
+ if (!dnbd3_execute_handshake(dev, sock, addr, &proto_version, init)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (init) {
+ // We're setting up the device for use - allocate resources
+ // Do not goto error before this
+ ASSERT(!dev->send_wq);
+ ASSERT(!dev->recv_wq);
+ mutex_init(&dev->send_mutex);
+ mutex_init(&dev->recv_mutex);
+ // a designated queue for sending, that allows one active task only
+ dev->send_wq = alloc_workqueue("dnbd%d-send",
+ WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ 1, dev->index);
+ dev->recv_wq = alloc_workqueue("dnbd%d-recv",
+ WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ 1, dev->index);
+ if (!dev->send_wq || !dev->recv_wq) {
+ ret = -ENOMEM;
+ goto error_dealloc;
+ }
+ }
+
+ set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000); // recv
+ dnbd3_set_primary_connection(dev, sock, addr, proto_version);
+ sock = NULL; // In case we ever goto error* after this point
+
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (init) {
+ dev->discover_count = 0;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_STARTUP;
+ // discovery and keepalive are not critical, use the power efficient queue
+ queue_delayed_work(system_power_efficient_wq, &dev->discover_work,
+ dev->discover_interval * HZ);
+ queue_delayed_work(system_power_efficient_wq, &dev->keepalive_work,
+ KEEPALIVE_INTERVAL * HZ);
+ // but the receiver is performance critical AND runs indefinitely, use the
+ // cpu intensive queue, as jobs submitted there will not count towards
+ // the concurrency limit of per-cpu worker threads. It still feels a little
+ // dirty to avoid managing our own thread, but nbd does it too.
+ }
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ return 0;
+
+error_dealloc:
+ if (init) {
+ // If anything fails during initialization, free resources again
+ dnbd3_release_resources(dev);
+ }
+error:
+ if (init)
+ dev->reported_size = 0;
+ if (sock)
+ sock_release(sock);
+ return ret < 0 ? ret : -EIO;
+}
+
+void dnbd3_net_work_init(dnbd3_device_t *dev)
+{
+ INIT_WORK(&dev->send_work, dnbd3_send_workfn);
+ INIT_WORK(&dev->recv_work, dnbd3_recv_workfn);
+ INIT_DELAYED_WORK(&dev->discover_work, dnbd3_discover_workfn);
+ INIT_DELAYED_WORK(&dev->keepalive_work, dnbd3_keepalive_workfn);
+}
+
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (addr->ss_family == 0 || dev->imgname == NULL || sock == NULL) {
+ dnbd3_dev_err_cur(dev, "connect: host, image name or sock not set\n");
+ return -EINVAL;
+ }
+
+ replace_main_socket(dev, sock, addr, protocol_version);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ dev->panic = false;
+ dev->panic_count = 0;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_SWITCH;
+ queue_work(dev->recv_wq, &dev->recv_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+
+ if (dev->use_server_provided_alts)
+ dnbd3_send_empty_request(dev, CMD_GET_SERVERS);
+
+ dnbd3_dev_info_cur(dev, "connection switched\n");
+ dnbd3_blk_requeue_all_requests(dev);
+ return 0;
+}
+
+/**
+ * Disconnect the device, shutting it down.
+ */
+int dnbd3_net_disconnect(dnbd3_device_t *dev)
+{
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (!device_active(dev))
+ return -ENOTCONN;
+ dev_dbg(dnbd3_device_to_dev(dev), "disconnecting device ...\n");
+
+ dev->reported_size = 0;
+ /* quickly fail all requests */
+ dnbd3_blk_fail_all_requests(dev);
+ replace_main_socket(dev, NULL, NULL, 0);
+
+ cancel_delayed_work_sync(&dev->keepalive_work);
+ cancel_delayed_work_sync(&dev->discover_work);
+ cancel_work_sync(&dev->send_work);
+ cancel_work_sync(&dev->recv_work);
+
+ dnbd3_blk_fail_all_requests(dev);
+ dnbd3_release_resources(dev);
+ dev_dbg(dnbd3_device_to_dev(dev), "all workers shut down\n");
+ return 0;
+}
diff --git a/src/kernel/net.h b/src/kernel/net.h
index a06a20c..69fa523 100644
--- a/src/kernel/net.h
+++ b/src/kernel/net.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,30 +22,12 @@
#ifndef NET_H_
#define NET_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#define init_msghdr(h) do { \
- h.msg_name = NULL; \
- h.msg_namelen = 0; \
- h.msg_control = NULL; \
- h.msg_controllen = 0; \
- h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \
- } while (0)
+void dnbd3_net_work_init(dnbd3_device_t *dev);
-int dnbd3_net_connect(dnbd3_device_t *lo);
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init);
-int dnbd3_net_disconnect(dnbd3_device_t *lo);
-
-int dnbd3_net_send(void *data);
-
-int dnbd3_net_receive(void *data);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg);
-#else
-void dnbd3_net_heartbeat(unsigned long arg);
-#endif
-
-int dnbd3_net_discover(void *data);
+int dnbd3_net_disconnect(dnbd3_device_t *dev);
#endif /* NET_H_ */
diff --git a/src/kernel/serialize.c b/src/kernel/serialize.c
new file mode 120000
index 0000000..5a4e4ac
--- /dev/null
+++ b/src/kernel/serialize.c
@@ -0,0 +1 @@
+../shared/serialize.c \ No newline at end of file
diff --git a/src/kernel/serialize_kmod.c b/src/kernel/serialize_kmod.c
deleted file mode 100644
index 50746df..0000000
--- a/src/kernel/serialize_kmod.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-#define KERNEL_MODULE
-#include "serialize.c"
diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c
index 4406072..9deba96 100644
--- a/src/kernel/sysfs.c
+++ b/src/kernel/sysfs.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,156 +22,138 @@
#include <linux/kobject.h>
#include "sysfs.h"
-#include "utils.h"
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
+/**
+ * Print currently connected server IP:PORT
+ */
ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev)
{
- if (dev->cur_server.host.type == HOST_IP4)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI4,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- else if (dev->cur_server.host.type == HOST_IP6)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI6,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- *buf = '\0';
- return 0;
-}
-
-ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev)
-{
- return MIN(snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt), PAGE_SIZE);
-}
+ ssize_t ret;
-ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev)
-{
- int i, num = 0;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type) ++num;
- }
- return MIN(snprintf(buf, PAGE_SIZE, "%d\n", num), PAGE_SIZE);
+ spin_lock(&dev->blk_lock);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%pISpc\n", &dev->cur_server.host), PAGE_SIZE);
+ spin_unlock(&dev->blk_lock);
+ return ret;
}
+/**
+ * List alt servers. One line per server, format is:
+ * IP:PORT RTT consecutive_failures best_count
+ */
ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev)
{
- int i, size = PAGE_SIZE, ret;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == HOST_IP4)
- ret = MIN(snprintf(buf, size, "%pI4,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else if (dev->alt_servers[i].host.type == HOST_IP6)
- ret = MIN(snprintf(buf, size, "%pI6,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else
+ int i, size = PAGE_SIZE;
+ ssize_t ret;
+
+ if (mutex_lock_interruptible(&dev->alt_servers_lock) != 0)
+ return 0;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
continue;
+
+ ret = MIN(snprintf(buf, size, "%pISpc %llu %d %d\n", &dev->alt_servers[i].host,
+ (unsigned long long)((dev->alt_servers[i].rtts[0] +
+ dev->alt_servers[i].rtts[1] +
+ dev->alt_servers[i].rtts[2] +
+ dev->alt_servers[i].rtts[3]) / 4),
+ (int)dev->alt_servers[i].failures,
+ (int)dev->alt_servers[i].best_count),
+ size);
size -= ret;
buf += ret;
- if (size <= 0)
- {
+ if (size <= 0) {
size = 0;
break;
}
}
+ mutex_unlock(&dev->alt_servers_lock);
return PAGE_SIZE - size;
}
+/**
+ * Show name of image in use
+ */
ssize_t show_image_name(char *buf, dnbd3_device_t *dev)
{
- if (dev->imgname == NULL) return sprintf(buf, "(null)");
- return MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ ssize_t ret;
+
+ spin_lock(&dev->blk_lock);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ spin_unlock(&dev->blk_lock);
+ return ret;
}
+/**
+ * Show rid of image in use
+ */
ssize_t show_rid(char *buf, dnbd3_device_t *dev)
{
+ // No locking here, primitive type, no pointer to allocated memory
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->rid), PAGE_SIZE);
}
ssize_t show_update_available(char *buf, dnbd3_device_t *dev)
{
+ // Same story
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->update_available), PAGE_SIZE);
}
-device_attr_t cur_server_addr =
-{
- .attr = {.name = "cur_server_addr", .mode = 0444 },
- .show = show_cur_server_addr,
- .store = NULL,
-};
-
-device_attr_t cur_server_rtt =
-{
- .attr = {.name = "cur_server_rtt", .mode = 0444 },
- .show = show_cur_server_rtt,
- .store = NULL,
-};
-
-device_attr_t alt_server_num =
-{
- .attr = {.name = "alt_server_num", .mode = 0444 },
- .show = show_alt_server_num,
- .store = NULL,
+device_attr_t cur_server_addr = {
+ .attr = { .name = "cur_server_addr", .mode = 0444 },
+ .show = show_cur_server_addr,
+ .store = NULL,
};
-device_attr_t alt_servers =
-{
- .attr = {.name = "alt_servers", .mode = 0444 },
- .show = show_alt_servers,
- .store = NULL,
+device_attr_t alt_servers = {
+ .attr = { .name = "alt_servers", .mode = 0444 },
+ .show = show_alt_servers,
+ .store = NULL,
};
-device_attr_t image_name =
-{
- .attr = {.name = "image_name", .mode = 0444 },
- .show = show_image_name,
- .store = NULL,
+device_attr_t image_name = {
+ .attr = { .name = "image_name", .mode = 0444 },
+ .show = show_image_name,
+ .store = NULL,
};
-device_attr_t rid =
-{
- .attr = {.name = "rid", .mode = 0444 },
- .show = show_rid,
- .store = NULL,
+device_attr_t rid = {
+ .attr = { .name = "rid", .mode = 0444 },
+ .show = show_rid,
+ .store = NULL,
};
-device_attr_t update_available =
-{
- .attr = {.name = "update_available", .mode = 0444 },
- .show = show_update_available,
- .store = NULL,
+device_attr_t update_available = {
+ .attr = { .name = "update_available", .mode = 0444 },
+ .show = show_update_available,
+ .store = NULL,
};
ssize_t device_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
device_attr_t *device_attr = container_of(attr, device_attr_t, attr);
dnbd3_device_t *dev = container_of(kobj, dnbd3_device_t, kobj);
+
return device_attr->show(buf, dev);
}
-struct attribute *device_attrs[] =
-{
+struct attribute *device_attrs[] = {
&cur_server_addr.attr,
- &cur_server_rtt.attr,
- &alt_server_num.attr,
&alt_servers.attr,
- &image_name.attr,
- &rid.attr,
+ &image_name.attr, &rid.attr,
&update_available.attr,
NULL,
};
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+ATTRIBUTE_GROUPS(device);
+#endif
-struct sysfs_ops device_ops =
-{
+const struct sysfs_ops device_ops = {
.show = device_show,
};
@@ -179,14 +162,16 @@ void release(struct kobject *kobj)
kobj->state_initialized = 0;
}
-struct kobj_type device_ktype =
-{
+struct kobj_type device_ktype = {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
.default_attrs = device_attrs,
+#else
+ .default_groups = device_groups,
+#endif
.sysfs_ops = &device_ops,
.release = release,
};
-
void dnbd3_sysfs_init(dnbd3_device_t *dev)
{
int error;
@@ -196,7 +181,7 @@ void dnbd3_sysfs_init(dnbd3_device_t *dev)
error = kobject_init_and_add(kobj, ktype, parent, "%s", "net");
if (error)
- printk("Error initializing dnbd3 device!\n");
+ dev_err(dnbd3_device_to_dev(dev), "initializing sysfs for device failed!\n");
}
void dnbd3_sysfs_exit(dnbd3_device_t *dev)
diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h
index 0a747a5..1db4a07 100644
--- a/src/kernel/sysfs.h
+++ b/src/kernel/sysfs.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,25 +22,16 @@
#ifndef SYSFS_H_
#define SYSFS_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
void dnbd3_sysfs_init(dnbd3_device_t *dev);
void dnbd3_sysfs_exit(dnbd3_device_t *dev);
-typedef struct
-{
+typedef struct {
struct attribute attr;
- ssize_t (*show)(char *, dnbd3_device_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_device_t *);
+ ssize_t (*show)(char *buf, dnbd3_device_t *dev);
+ ssize_t (*store)(const char *buf, size_t len, dnbd3_device_t *dev);
} device_attr_t;
-typedef struct
-{
- struct attribute attr;
- ssize_t (*show)(char *, dnbd3_server_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_server_t *);
-} server_attr_t;
-
-
#endif /* SYSFS_H_ */
diff --git a/src/kernel/utils.c b/src/kernel/utils.c
deleted file mode 100644
index 902025f..0000000
--- a/src/kernel/utils.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include <linux/kernel.h>
-
-#include "utils.h"
-
-unsigned int inet_addr(char *str)
-{
- int a, b, c, d;
- char arr[4];
- sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d);
- arr[0] = a;
- arr[1] = b;
- arr[2] = c;
- arr[3] = d;
- return *(unsigned int *) arr;
-}
-
-void inet_ntoa(struct in_addr addr, char *str)
-{
- unsigned char *ptr = (unsigned char *) &addr;
- sprintf(str, "%d.%d.%d.%d", ptr[0] & 0xff, ptr[1] & 0xff, ptr[2] & 0xff, ptr[3] & 0xff);
-}
diff --git a/src/kernel/utils.h b/src/kernel/utils.h
deleted file mode 100644
index e54b3cf..0000000
--- a/src/kernel/utils.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef UTILS_H_
-#define UTILS_H_
-
-#include <linux/in.h>
-
-unsigned int inet_addr(char *str);
-void inet_ntoa(struct in_addr addr, char *str);
-
-#endif /* UTILS_H_ */
diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt
new file mode 100644
index 0000000..9a1e1c4
--- /dev/null
+++ b/src/server/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-server
+ LANGUAGES C)
+
+# find Jansson package required by the dnbd3-server
+find_package(Jansson)
+if(NOT JANSSON_FOUND)
+ message(FATAL_ERROR "*** No jansson lib found, can't build dnbd3-server!")
+endif(NOT JANSSON_FOUND)
+
+# find atomic library required by the dnbd3-server
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to enable enhanced POSIX features
+add_definitions(-D_GNU_SOURCE)
+
+if(DNBD3_SERVER_AFL)
+ # check if DNBD3_RELEASE_HARDEN is disabled
+ if(DNBD3_RELEASE_HARDEN)
+ message(FATAL_ERROR "DNBD3_SERVER_AFL can only be enabled if DNBD3_RELEASE_HARDEN is disabled")
+ endif(DNBD3_RELEASE_HARDEN)
+
+ # build dnbd3-server with AFL support
+ message(STATUS "Building dnbd3-server with AFL support")
+ add_definitions(-DDNBD3_SERVER_AFL)
+
+ # change compiler for dnbd3-server sources if AFL enabled
+ include(CheckAFLCCompiler)
+ check_afl_c_compiler(AFL_C_COMPILER AFL_C_COMPILER_NAME ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ID})
+ if(AFL_C_COMPILER)
+ message(STATUS "Check for working AFL C compiler: ${AFL_C_COMPILER} - done")
+ # change C compiler to a corresponding AFL C compiler
+ set(CMAKE_C_COMPILER "${AFL_C_COMPILER}")
+ else(AFL_C_COMPILER)
+ # no corresponding AFL C compiler found
+ message(STATUS "Check for working AFL C compiler: ${AFL_C_COMPILER_NAME} - failed")
+ message(FATAL_ERROR "No corresponding AFL C compiler ${AFL_C_COMPILER_NAME} was found for the C compiler ${CMAKE_C_COMPILER}!")
+ endif(AFL_C_COMPILER)
+endif(DNBD3_SERVER_AFL)
+
+set(DNBD3_SERVER_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/altservers.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fileutil.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fuse.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/globals.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/image.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/ini.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/integrity.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/locks.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/reference.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/uplink.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/urldecode.c)
+set(DNBD3_SERVER_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/altservers.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/fileutil.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/fuse.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/globals.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/image.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/ini.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/integrity.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/locks.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/reference.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/reftypes.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/uplink.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/urldecode.h)
+
+add_executable(dnbd3-server ${DNBD3_SERVER_SOURCE_FILES})
+target_include_directories(dnbd3-server PRIVATE ${JANSSON_INCLUDE_DIR})
+target_link_libraries(dnbd3-server dnbd3-version dnbd3-build dnbd3-shared picohttpparser Libatomic::Libatomic ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
+
+if(DNBD3_SERVER_FUSE)
+ find_package(Fuse REQUIRED)
+ # include Fuse headers and link with Fuse library
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_FUSE)
+ target_include_directories(dnbd3-server PRIVATE ${FUSE_INCLUDE_DIRS})
+ target_link_libraries(dnbd3-server ${FUSE_LIBRARIES})
+endif(DNBD3_SERVER_FUSE)
+
+if(UNIX AND NOT APPLE)
+ # link dnbd3-server with librt if server is compiled for a Unix system
+ target_link_libraries(dnbd3-server rt)
+endif(UNIX AND NOT APPLE)
+
+if(DNBD3_SERVER_DEBUG_LOCKS)
+ # enable debugging of locks used in the dnbd3-server
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_DEBUG_LOCKS)
+endif(DNBD3_SERVER_DEBUG_LOCKS)
+
+if(DNBD3_SERVER_DEBUG_THREADS)
+ # enable debugging of threads used in the dnbd3-server
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_DEBUG_THREADS)
+endif(DNBD3_SERVER_DEBUG_THREADS)
+
+install(TARGETS dnbd3-server RUNTIME DESTINATION bin
+ COMPONENT server)
+
+add_linter(dnbd3-server-lint "${DNBD3_SERVER_SOURCE_FILES}" "${DNBD3_SERVER_HEADER_FILES}")
+add_linter_fix(dnbd3-server-lint-fix "${DNBD3_SERVER_SOURCE_FILES}" "${DNBD3_SERVER_HEADER_FILES}")
+
+# add external dependency (HTTP parser) for the dnbd3-server
+add_subdirectory(picohttpparser)
diff --git a/src/server/altservers.c b/src/server/altservers.c
index bbbc584..4413ca6 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -1,69 +1,41 @@
+#include "ini.h"
#include "altservers.h"
#include "locks.h"
+#include "threadpool.h"
#include "helper.h"
#include "image.h"
#include "fileutil.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../serverconfig.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/config/server.h>
+#include "reference.h"
+
#include <assert.h>
#include <inttypes.h>
#include <jansson.h>
-#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid)
+#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, PIMG(image))
#define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
#define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
-static dnbd3_connection_t *pending[SERVER_MAX_PENDING_ALT_CHECKS];
-static pthread_mutex_t pendingLockWrite; // Lock for adding something to pending. (NULL -> nonNULL)
-static pthread_mutex_t pendingLockConsume; // Lock for removing something (nonNULL -> NULL)
-static dnbd3_signal_t* runSignal = NULL;
-
static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
-static int numAltServers = 0;
+static atomic_int numAltServers = 0;
static pthread_mutex_t altServersLock;
-static pthread_t altThread;
-
-static void *altservers_main(void *data);
-static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt);
+static void *altservers_runCheck(void *data);
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current);
+static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink);
+static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt);
+static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server);
void altservers_init()
{
srand( (unsigned int)time( NULL ) );
- // Init spinlock
- mutex_init( &pendingLockWrite );
- mutex_init( &pendingLockConsume );
- mutex_init( &altServersLock );
- // Init signal
- runSignal = signal_new();
- if ( runSignal == NULL ) {
- logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." );
- exit( EXIT_FAILURE );
- }
- memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) );
- if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) {
- logadd( LOG_ERROR, "Could not start altservers connector thread" );
- exit( EXIT_FAILURE );
- }
- // Init waiting links queue -- this is currently a global static array so
- // it will already be zero, but in case we refactor later do it explicitly
- // while also holding the write lock so thread sanitizer is happy
- mutex_lock( &pendingLockWrite );
- for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
- pending[i] = NULL;
- }
- mutex_unlock( &pendingLockWrite );
-}
-
-void altservers_shutdown()
-{
- if ( runSignal == NULL ) return;
- signal_call( runSignal ); // Wake altservers thread up
- thread_join( altThread, NULL );
+ // Init lock
+ mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST );
}
-static void addalt(int argc, char **argv, void *data)
+static void addAltFromLegacy(int argc, char **argv, void *data)
{
char *shost;
dnbd3_host_t host;
@@ -81,29 +53,82 @@ static void addalt(int argc, char **argv, void *data)
return;
}
if ( argc == 1 ) argv[1] = "";
- if ( altservers_add( &host, argv[1], isPrivate, isClientOnly ) ) {
+ if ( altservers_add( &host, argv[1], isPrivate, isClientOnly, NULL ) ) {
(*(int*)data)++;
}
}
+static int addAltFromIni(void *countptr, const char* section, const char* key, const char* value)
+{
+ dnbd3_host_t host;
+ char *strhost = strdup( section );
+ if ( !parse_address( strhost, &host ) ) {
+ free( strhost );
+ logadd( LOG_WARNING, "Invalid host section in alt-servers file ignored: '%s'", section );
+ return 1;
+ }
+ free( strhost );
+ int index;
+ if ( altservers_add( &host, "", false, false, &index ) ) {
+ (*(int*)countptr)++;
+ }
+ if ( index == -1 )
+ return 1;
+ if ( strcmp( key, "for" ) == 0 ) {
+ if ( strncmp( value, "client", 6 ) == 0 ) {
+ altServers[index].isClientOnly = true;
+ altServers[index].isPrivate = false;
+ } else if ( strcmp( value, "replication" ) == 0 ) {
+ altServers[index].isClientOnly = false;
+ altServers[index].isPrivate = true;
+ } else {
+ logadd( LOG_WARNING, "Invalid value in alt-servers section %s for key %s: '%s'", section, key, value );
+ }
+ } else if ( strcmp( key, "comment" ) == 0 ) {
+ snprintf( altServers[index].comment, COMMENT_LENGTH, "%s", value );
+ } else if ( strcmp( key, "namespace" ) == 0 ) {
+ dnbd3_ns_t *elem = malloc( sizeof(*elem) );
+ elem->name = strdup( value );
+ elem->len = strlen( value );
+ do {
+ elem->next = altServers[index].nameSpaces;
+ } while ( !atomic_compare_exchange_weak( &altServers[index].nameSpaces, &elem->next, elem ) );
+ } else {
+ logadd( LOG_DEBUG1, "Unknown key in alt-servers section: '%s'", key );
+ }
+ return 1;
+}
+
int altservers_load()
{
int count = 0;
char *name;
if ( asprintf( &name, "%s/%s", _configDir, "alt-servers" ) == -1 ) return -1;
- file_loadLineBased( name, 1, 2, &addalt, (void*)&count );
+ if ( !file_isReadable( name ) ) {
+ free( name );
+ return 0;
+ }
+ ini_parse( name, &addAltFromIni, &count );
+ if ( numAltServers == 0 ) {
+ logadd( LOG_INFO, "Could not parse %s as .ini file, trying to load as legacy format.", name );
+ file_loadLineBased( name, 1, 2, &addAltFromLegacy, (void*)&count );
+ }
free( name );
logadd( LOG_DEBUG1, "Added %d alt servers\n", count );
return count;
}
-bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly)
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly, int *index)
{
int i, freeSlot = -1;
+ if ( index == NULL ) {
+ index = &freeSlot;
+ }
mutex_lock( &altServersLock );
for (i = 0; i < numAltServers; ++i) {
if ( isSameAddressPort( &altServers[i].host, host ) ) {
mutex_unlock( &altServersLock );
+ *index = i;
return false;
} else if ( freeSlot == -1 && altServers[i].host.type == 0 ) {
freeSlot = i;
@@ -113,6 +138,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
if ( numAltServers >= SERVER_MAX_ALTS ) {
logadd( LOG_WARNING, "Cannot add another alt server, maximum of %d already reached.", (int)SERVER_MAX_ALTS );
mutex_unlock( &altServersLock );
+ *index = -1;
return false;
}
freeSlot = numAltServers++;
@@ -120,62 +146,48 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
altServers[freeSlot].host = *host;
altServers[freeSlot].isPrivate = isPrivate;
altServers[freeSlot].isClientOnly = isClientOnly;
+ altServers[freeSlot].nameSpaces = NULL;
if ( comment != NULL ) snprintf( altServers[freeSlot].comment, COMMENT_LENGTH, "%s", comment );
mutex_unlock( &altServersLock );
+ *index = freeSlot;
return true;
}
/**
* ONLY called from the passed uplink's main thread
*/
-void altservers_findUplink(dnbd3_connection_t *uplink)
+void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
{
- int i;
+ if ( uplink->shutdown )
+ return;
+ if ( uplink->current.fd != -1 && numAltServers <= 1 )
+ return;
// if betterFd != -1 it means the uplink is supposed to switch to another
// server. As this function here is called by the uplink thread, it can
// never be that the uplink is supposed to switch, but instead calls
// this function.
- assert( uplink->betterFd == -1 );
- mutex_lock( &pendingLockWrite );
+ assert( uplink->better.fd == -1 );
// it is however possible that an RTT measurement is currently in progress,
// so check for that case and do nothing if one is in progress
- if ( uplink->rttTestResult == RTT_INPROGRESS ) {
- for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
- if ( pending[i] != uplink ) continue;
- // Yep, measuring right now
- mutex_unlock( &pendingLockWrite );
- return;
+ if ( uplink->rttTestResult != RTT_INPROGRESS ) {
+ dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref );
+ if ( current == uplink ) {
+ threadpool_run( &altservers_runCheck, uplink, "UPLINK" );
+ } else if ( current != NULL ) {
+ ref_put( &current->reference );
}
}
- // Find free slot for measurement
- for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
- if ( pending[i] != NULL ) continue;
- pending[i] = uplink;
- uplink->rttTestResult = RTT_INPROGRESS;
- mutex_unlock( &pendingLockWrite );
- signal_call( runSignal ); // Wake altservers thread up
- return;
- }
- // End of loop - no free slot
- mutex_unlock( &pendingLockWrite );
- logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." );
}
-/**
- * The given uplink is about to disappear, so remove it from any queues
- */
-void altservers_removeUplink(dnbd3_connection_t *uplink)
+static bool isImageAllowed(dnbd3_alt_server_t *alt, const char *image)
{
- mutex_lock( &pendingLockConsume );
- mutex_lock( &pendingLockWrite );
- for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
- if ( pending[i] == uplink ) {
- uplink->rttTestResult = RTT_NOT_REACHABLE;
- pending[i] = NULL;
- }
+ if ( alt->nameSpaces == NULL )
+ return true;
+ for ( dnbd3_ns_t *it = alt->nameSpaces; it != NULL; it = it->next ) {
+ if ( strncmp( it->name, image, it->len ) == 0 )
+ return true;
}
- mutex_unlock( &pendingLockWrite );
- mutex_unlock( &pendingLockConsume );
+ return false;
}
/**
@@ -184,95 +196,154 @@ void altservers_removeUplink(dnbd3_connection_t *uplink)
* Private servers are excluded, so this is what you want to call to
* get a list of servers you can tell a client about
*/
-int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size)
+int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size)
{
- if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0;
+ dnbd3_host_t *host = &client->host;
+ if ( host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 )
+ return 0;
int i, j;
int count = 0;
- int scores[size];
- int score;
- mutex_lock( &altServersLock );
+ uint16_t scores[SERVER_MAX_ALTS] = { 0 };
if ( size > numAltServers ) size = numAltServers;
- for (i = 0; i < numAltServers; ++i) {
- if ( altServers[i].host.type == 0 ) continue; // Slot is empty
- if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers
- if ( host->type == altServers[i].host.type ) {
- score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails;
- } else {
- score = -( altServers[i].numFails + 128 ); // Wrong address family
- }
- if ( count == 0 ) {
- // Trivial - this is the first entry
- output[0].host = altServers[i].host;
- output[0].failures = 0;
- scores[0] = score;
- count++;
- } else {
- // Other entries already exist, insert in proper position
- for (j = 0; j < size; ++j) {
- if ( j < count && score <= scores[j] ) continue;
- if ( j > count ) break; // Should never happen but just in case...
- if ( j < count && j + 1 < size ) {
- // Check if we're in the middle and need to move other entries...
- memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) );
- memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) );
- }
- if ( count < size ) {
- count++;
- }
- output[j].host = altServers[i].host;
- output[j].failures = 0;
- scores[j] = score;
- break;
+ mutex_lock( &altServersLock );
+ for ( i = 0; i < numAltServers; ++i ) {
+ if ( altServers[i].host.type == 0 || altServers[i].isPrivate )
+ continue; // Slot is empty or uplink is for replication only
+ if ( !isImageAllowed( &altServers[i], client->image->name ) )
+ continue;
+ scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) );
+ }
+ while ( count < size ) {
+ i = -1;
+ for ( j = 0; j < numAltServers; ++j ) {
+ if ( scores[j] == 0 )
+ continue;
+ if ( i == -1 || scores[j] > scores[i] ) {
+ i = j;
}
}
+ if ( i == -1 )
+ break;
+ scores[i] = 0;
+ output[count].host = altServers[i].host;
+ output[count].failures = 0;
+ count++;
}
mutex_unlock( &altServersLock );
return count;
}
+bool altservers_toString(int server, char *buffer, size_t len)
+{
+ return host_to_string( &altServers[server].host, buffer, len );
+}
+
+static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now )
+{
+ dnbd3_alt_local_t *local = ( uplink == NULL ? NULL : &uplink->altData[server] );
+ dnbd3_alt_server_t *global = &altServers[server];
+ if ( global->isClientOnly || ( !global->isPrivate && _proxyPrivateOnly ) )
+ return false;
+ // Blocked locally (image not found on server...)
+ if ( local != NULL && local->blocked ) {
+ if ( --local->fails > 0 )
+ return false;
+ local->blocked = false;
+ }
+ if ( global->blocked ) {
+ if ( timing_diff( &global->lastFail, now ) < SERVER_GLOBAL_DUP_TIME )
+ return false;
+ global->lastFail = *now;
+ if ( --global->fails > 0 )
+ return false;
+ global->blocked = false;
+ }
+ // Not blocked, depend on both fail counters
+ int fails = ( local == NULL ? 0 : local->fails ) + global->fails;
+ return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN;
+}
+
+int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size)
+{
+ int idx[size];
+ int num = altservers_getListForUplink( NULL, image, idx, size, -1 );
+ for ( int i = 0; i < num; ++i ) {
+ servers[i] = altServers[idx[i]].host;
+ }
+ return num;
+}
+
+/**
+ * Returns true if there is at least one alt-server the
+ * given image name would be allowed to be cloned from.
+ */
+bool altservers_imageHasAltServers(const char *image)
+{
+ bool ret = false;
+ mutex_lock( &altServersLock );
+ for ( int i = 0; i < numAltServers; ++i ) {
+ if ( altServers[i].isClientOnly || ( !altServers[i].isPrivate && _proxyPrivateOnly ) )
+ continue;
+ if ( !isImageAllowed( &altServers[i], image ) )
+ continue;
+ ret = true;
+ break;
+ }
+ mutex_unlock( &altServersLock );
+ return ret;
+}
+
/**
* Get <size> alt servers. If there are more alt servers than
* requested, random servers will be picked.
* This function is suited for finding uplink servers as
* it includes private servers and ignores any "client only" servers
+ * @param current index of server for current connection, or -1 in panic mode
*/
-int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency)
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current)
{
- if ( size <= 0 ) return 0;
- int count = 0, i;
- ticks now;
- timing_get( &now );
+ if ( size <= 0 )
+ return 0;
+ int count = 0;
+ declare_now;
mutex_lock( &altServersLock );
- // Flip first server in list with a random one every time this is called
- if ( numAltServers > 1 ) {
- const dnbd3_alt_server_t tmp = altServers[0];
- do {
- i = rand() % numAltServers;
- } while ( i == 0 );
- altServers[0] = altServers[i];
- altServers[i] = tmp;
- }
- // We iterate over the list twice. First run adds servers with 0 failures only,
- // second one also considers those that failed (not too many times)
- if ( size > numAltServers ) size = numAltServers;
- for (i = 0; i < numAltServers * 2; ++i) {
- dnbd3_alt_server_t *srv = &altServers[i % numAltServers];
- if ( srv->host.type == 0 ) continue; // Slot is empty
- if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore!
- if ( srv->isClientOnly ) continue;
- bool first = ( i < numAltServers );
- if ( first ) {
- if ( srv->numFails > 0 ) continue;
- } else {
- if ( srv->numFails == 0 ) continue; // Already added in first iteration
- if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row
- && timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore!
- if ( !emergency ) srv->numFails--;
+ // If we don't have enough servers to randomize, take a shortcut
+ if ( numAltServers <= size ) {
+ for ( int i = 0; i < numAltServers; ++i ) {
+ if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) {
+ if ( isImageAllowed( &altServers[i], image ) ) {
+ servers[count++] = i;
+ }
+ }
+ }
+ } else {
+ // Plenty of alt servers; randomize
+ uint8_t state[SERVER_MAX_ALTS] = { 0 };
+ if ( current != -1 ) { // Make sure we also test the current server
+ servers[count++] = current;
+ state[current] = 2;
+ }
+ for ( int tr = size * 10; tr > 0 && count < size; --tr ) {
+ int idx = rand() % numAltServers;
+ if ( state[idx] != 0 )
+ continue;
+ if ( !isImageAllowed( &altServers[idx], image ) ) {
+ state[idx] = 2; // Mark as used without adding, so it will be ignored in panic loop
+ } else if ( isUsableForUplink( uplink, idx, &now ) ) {
+ servers[count++] = idx;
+ state[idx] = 2; // Used
+ } else {
+ state[idx] = 1; // Potential
+ }
+ }
+ // If panic mode, consider others too
+ for ( int tr = size * 10; current == -1 && tr > 0 && count < size; --tr ) {
+ int idx = rand() % numAltServers;
+ if ( state[idx] == 2 )
+ continue;
+ servers[count++] = idx;
+ state[idx] = 2; // Used
}
- // server seems ok, include in output and decrease its fail counter
- output[count++] = srv->host;
- if ( count >= size ) break;
}
mutex_unlock( &altServersLock );
return count;
@@ -300,7 +371,7 @@ json_t* altservers_toJson()
"rtt", rtts,
"isPrivate", (int)src[i].isPrivate,
"isClientOnly", (int)src[i].isClientOnly,
- "numFails", src[i].numFails
+ "numFails", src[i].fails
);
json_array_append_new( list, server );
}
@@ -308,33 +379,27 @@ json_t* altservers_toJson()
}
/**
- * Update rtt history of given server - returns the new average for that server
+ * Update rtt history of given server - returns the new average for that server.
*/
-static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt)
+static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt)
{
- unsigned int avg = rtt;
- int i;
+ uint32_t avg = 0, j;
+ dnbd3_alt_local_t *local = &uplink->altData[index];
mutex_lock( &altServersLock );
- for (i = 0; i < numAltServers; ++i) {
- if ( !isSameAddressPort( host, &altServers[i].host ) ) continue;
- altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt;
-#if SERVER_RTT_PROBES == 5
- avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2]
- + altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES;
-#else
-#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES
- avg = 0;
- for (int j = 0; j < SERVER_RTT_PROBES; ++j) {
- avg += altServers[i].rtt[j];
+ if ( likely( local->initDone ) ) {
+ local->rtt[++local->rttIndex % SERVER_RTT_PROBES] = rtt;
+ for ( j = 0; j < SERVER_RTT_PROBES; ++j ) {
+ avg += local->rtt[j];
}
avg /= SERVER_RTT_PROBES;
-#endif
- // If we got a new rtt value, server must be working
- if ( altServers[i].numFails > 0 ) {
- altServers[i].numFails--;
+ } else { // First rtt measurement -- copy to every slot
+ for ( j = 0; j < SERVER_RTT_PROBES; ++j ) {
+ local->rtt[j] = rtt;
}
- break;
+ avg = rtt;
+ local->initDone = true;
}
+ altServers[index].rtt[++altServers[index].rttIndex % SERVER_RTT_PROBES] = avg;
mutex_unlock( &altServersLock );
return avg;
}
@@ -364,250 +429,249 @@ int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2)
* track of how often servers fail, and consider them disabled for some time if they
* fail too many times.
*/
-void altservers_serverFailed(const dnbd3_host_t * const host)
+void altservers_serverFailed(int server)
{
- int i;
- int foundIndex = -1, lastOk = -1;
- ticks now;
- timing_get( &now );
+ declare_now;
mutex_lock( &altServersLock );
- for (i = 0; i < numAltServers; ++i) {
- if ( foundIndex == -1 ) {
- // Looking for the failed server in list
- if ( isSameAddressPort( host, &altServers[i].host ) ) {
- foundIndex = i;
- }
- } else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) {
- lastOk = i;
- }
- }
- // Do only increase counter if last fail was not too recent. This is
- // to prevent the counter from increasing rapidly if many images use the
- // same uplink. If there's a network hickup, all uplinks will call this
- // function and would increase the counter too quickly, disabling the server.
- if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) {
- altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE;
- altServers[foundIndex].lastFail = now;
- if ( lastOk != -1 ) {
- // Make sure non-working servers are put at the end of the list, so they're less likely
- // to get picked when testing servers for uplink connections.
- const dnbd3_alt_server_t tmp = altServers[foundIndex];
- altServers[foundIndex] = altServers[lastOk];
- altServers[lastOk] = tmp;
+ if ( timing_diff( &altServers[server].lastFail, &now ) > SERVER_GLOBAL_DUP_TIME ) {
+ altServers[server].lastFail = now;
+ if ( altServers[server].fails++ >= SERVER_BAD_UPLINK_MAX ) {
+ altServers[server].blocked = true;
}
}
mutex_unlock( &altServersLock );
}
+
/**
- * Mainloop of this module. It will wait for requests by uplinks to find a
- * suitable uplink server for them. If found, it will tell the uplink about
- * the best server found. Currently the RTT history is kept per server and
- * not per uplink, so if many images use the same uplink server, the history
- * will update quite quickly. Needs to be improved some time, ie. by only
- * updating the rtt if the last update was at least X seconds ago.
+ * Called from RTT checker if connecting to a server succeeded but
+ * subsequently selecting the given image failed. Handle this within
+ * the uplink and don't increase the global fail counter.
*/
-static void *altservers_main(void *data UNUSED)
+static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server)
+{
+	mutex_lock( &altServersLock );
+	dnbd3_alt_local_t * const local = &uplink->altData[server];
+	// Compare against the counter value from before this failure is recorded
+	const bool limitReached = ( local->fails >= SERVER_BAD_UPLINK_MAX );
+	local->fails++;
+	if ( limitReached ) {
+		// Too many image-level failures: block this server for this uplink only
+		local->blocked = true;
+	}
+	mutex_unlock( &altServersLock );
+}
+
+/**
+ * Worker entry point taking the uplink as an opaque pointer: names the
+ * thread, runs one alt server check for the uplink and then drops the
+ * uplink reference that was taken on behalf of this worker.
+ */
+static void *altservers_runCheck(void *data)
+{
+	dnbd3_uplink_t * const target = data;
+
+	assert( target != NULL );
+	setThreadName( "altserver-check" );
+	altservers_findUplinkInternal( target );
+	// Acquired in findUplinkAsync
+	ref_put( &target->reference );
+	return NULL;
+}
+
+/**
+ * Synchronously run an alt server check for the given uplink.
+ * Returns only once uplink->rttTestResult is no longer RTT_INPROGRESS.
+ */
+void altservers_findUplink(dnbd3_uplink_t *uplink)
+{
+	altservers_findUplinkInternal( uplink );
+	// Above function is sync, which means normally when it
+	// returns, rttTestResult will not be RTT_INPROGRESS.
+	// But we might have an async call running in parallel, which would
+	// mean the above call returns immediately. Wait for that check
+	// to finish too.
+	// NOTE(review): rttTestResult is polled here without holding rttLock;
+	// confirm the field is safe to read concurrently.
+	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
+		usleep( 5000 ); // poll every 5ms
+	}
+}
+
+/**
+ * Translate a host into its index in the altServers array.
+ *
+ * @param host address/port to look for
+ * @return index of the first entry with matching address and port,
+ *         or -1 if there is none
+ */
+int altservers_hostToIndex(dnbd3_host_t *host)
+{
+	int found = -1;
+	for ( int idx = 0; found == -1 && idx < numAltServers; ++idx ) {
+		if ( isSameAddressPort( host, &altServers[idx].host ) ) {
+			found = idx;
+		}
+	}
+	return found;
+}
+
+/**
+ * Return a pointer to the host of the alt server at the given index.
+ * The pointer refers directly into the altServers array.
+ * NOTE(review): the index is not range-checked here; callers presumably
+ * pass a value obtained from altservers_hostToIndex or similar - confirm.
+ */
+const dnbd3_host_t* altservers_indexToHost(int server)
+{
+	return &altServers[server].host;
+}
+
+// XXX Sync call above must block until async worker has finished XXX
+static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
{
const int ALTS = 4;
- int ret, itLink, itAlt, numAlts;
- bool found;
- char buffer[DNBD3_BLOCK_SIZE ];
- dnbd3_reply_t reply;
- dnbd3_host_t servers[ALTS + 1];
- serialized_buffer_t serialized;
+ int itAlt, numAlts, current;
+ bool panic;
+ int servers[ALTS + 1];
struct timespec start, end;
- ticks nextCloseUnusedFd;
- setThreadName( "altserver-check" );
- blockNoncriticalSignals();
- timing_gets( &nextCloseUnusedFd, 900 );
- // LOOP
- while ( !_shutdown ) {
- // Wait 5 seconds max.
- ret = signal_wait( runSignal, 5000 );
- if ( _shutdown ) goto cleanup;
- if ( ret == SIGNAL_ERROR ) {
- if ( errno == EAGAIN || errno == EINTR ) continue;
- logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno );
- usleep( 100000 );
+ if ( _shutdown )
+ return;
+ mutex_lock( &uplink->rttLock );
+ // Maybe we already have a result, or check is currently running
+ if ( uplink->better.fd != -1 || uplink->rttTestResult == RTT_INPROGRESS ) {
+ mutex_unlock( &uplink->rttLock );
+ return;
+ }
+ assert( uplink->rttTestResult != RTT_DOCHANGE );
+ uplink->rttTestResult = RTT_INPROGRESS;
+ panic = ( uplink->current.fd == -1 );
+ current = uplink->current.index; // Current server index (or last one in panic mode)
+ mutex_unlock( &uplink->rttLock );
+ // First, get 4 alt servers
+ numAlts = altservers_getListForUplink( uplink, uplink->image->name, servers, ALTS, panic ? -1 : current );
+ // If we're already connected and only got one server anyways, there isn't much to do
+ if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) {
+ uplink->rttTestResult = RTT_DONTCHANGE;
+ return;
+ }
+ dnbd3_image_t * const image = image_lock( uplink->image );
+ if ( image == NULL ) { // Check again after locking
+ uplink->rttTestResult = RTT_NOT_REACHABLE;
+ logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
+ return;
+ }
+ logadd( LOG_DEBUG2, "Running alt check for %s:%d", PIMG(image) );
+ assert( uplink->rttTestResult == RTT_INPROGRESS );
+ // Test them all
+ dnbd3_server_connection_t best = { .fd = -1 };
+ unsigned long bestRtt = RTT_UNREACHABLE;
+ unsigned long currentRtt = RTT_UNREACHABLE;
+ uint64_t offset = 0;
+ uint32_t length = DNBD3_BLOCK_SIZE;
+ // Try to use the range of the first request in the queue as RTT block.
+ // In case we have a cluster of servers where none of them has a complete
+ // copy, we at least make sure the one we're potentially switching to
+ // has the next block we're about to request.
+ mutex_lock( &uplink->queueLock );
+ if ( uplink->queue != NULL ) {
+ offset = uplink->queue->from;
+ length = (uint32_t)( uplink->queue->to - offset );
+ }
+ mutex_unlock( &uplink->queueLock );
+ for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+ int server = servers[itAlt];
+ // Connect
+ clock_gettime( BEST_CLOCK_SOURCE, &start );
+ int sock = sock_connect( &altServers[server].host, 750, _uplinkTimeout );
+ if ( sock == -1 ) { // Connection failed means global error
+ altservers_serverFailed( server );
+ continue;
}
- // Work your way through the queue
- for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
- mutex_lock( &pendingLockWrite );
- if ( pending[itLink] == NULL ) {
- mutex_unlock( &pendingLockWrite );
- continue; // Check once before locking, as a mutex is expensive
- }
- mutex_unlock( &pendingLockWrite );
- mutex_lock( &pendingLockConsume );
- mutex_lock( &pendingLockWrite );
- dnbd3_connection_t * const uplink = pending[itLink];
- mutex_unlock( &pendingLockWrite );
- if ( uplink == NULL ) { // Check again after locking
- mutex_unlock( &pendingLockConsume );
- continue;
- }
- dnbd3_image_t * const image = image_lock( uplink->image );
- if ( image == NULL ) { // Check again after locking
- uplink->rttTestResult = RTT_NOT_REACHABLE;
- mutex_lock( &pendingLockWrite );
- pending[itLink] = NULL;
- mutex_unlock( &pendingLockWrite );
- mutex_unlock( &pendingLockConsume );
- logadd( LOG_DEBUG1, "Image has gone away that was queued for RTT measurement" );
- continue;
- }
- LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
- assert( uplink->rttTestResult == RTT_INPROGRESS );
- // Now get 4 alt servers
- numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 );
- if ( uplink->fd != -1 ) {
- // Add current server if not already in list
- found = false;
- for (itAlt = 0; itAlt < numAlts; ++itAlt) {
- if ( !isSameAddressPort( &uplink->currentServer, &servers[itAlt] ) ) continue;
- found = true;
- break;
- }
- if ( !found ) servers[numAlts++] = uplink->currentServer;
- }
- // Test them all
- int bestSock = -1;
- int bestIndex = -1;
- int bestProtocolVersion = -1;
- unsigned long bestRtt = RTT_UNREACHABLE;
- unsigned long currentRtt = RTT_UNREACHABLE;
- for (itAlt = 0; itAlt < numAlts; ++itAlt) {
- usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...)
- // Connect
- clock_gettime( BEST_CLOCK_SOURCE, &start );
- int sock = sock_connect( &servers[itAlt], 750, 1000 );
- if ( sock < 0 ) continue;
- // Select image ++++++++++++++++++++++++++++++
- if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
- goto server_failed;
- }
- // See if selecting the image succeeded ++++++++++++++++++++++++++++++
- uint16_t protocolVersion, rid;
- uint64_t imageSize;
- char *name;
- if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
- goto server_image_not_available;
- }
- if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed;
- if ( name == NULL || strcmp( name, image->name ) != 0 ) {
- ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name );
- }
- if ( rid != image->rid ) {
- ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid );
- }
- if ( imageSize != image->virtualFilesize ) {
- ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
- }
- // Request first block (NOT random!) ++++++++++++++++++++++++++++++
- if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
- LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink );
- }
- // See if requesting the block succeeded ++++++++++++++++++++++
- if ( !dnbd3_get_reply( sock, &reply ) ) {
- LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink );
- }
- // check reply header
- if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
- ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
- }
- if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
- ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink );
- }
- clock_gettime( BEST_CLOCK_SOURCE, &end );
- // Measurement done - everything fine so far
- mutex_lock( &uplink->rttLock );
- const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
- // Penaltize rtt if this was a cycle; this will treat this server with lower priority
- // in the near future too, so we prevent alternating between two servers that are both
- // part of a cycle and have the lowest latency.
- const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000
- + (end.tv_nsec - start.tv_nsec) / 1000
- + ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
- unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
- // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
- if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
- mutex_unlock( &uplink->rttLock );
- if ( uplink->fd != -1 && isCurrent ) {
- // Was measuring current server
- currentRtt = avg;
- close( sock );
- } else if ( avg < bestRtt ) {
- // Was another server, update "best"
- if ( bestSock != -1 ) close( bestSock );
- bestSock = sock;
- bestRtt = avg;
- bestIndex = itAlt;
- bestProtocolVersion = protocolVersion;
- } else {
- // Was too slow, ignore
- close( sock );
- }
- // We're done, call continue
- continue;
- // Jump here if anything went wrong
- // This will cleanup and continue
- server_failed: ;
- altservers_serverFailed( &servers[itAlt] );
- server_image_not_available: ;
- close( sock );
- }
- // Done testing all servers. See if we should switch
- if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
- // yep
- if ( currentRtt > 10000000 || uplink->fd == -1 ) {
- LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
- } else {
- LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
- }
- sock_setTimeout( bestSock, _uplinkTimeout );
- mutex_lock( &uplink->rttLock );
- uplink->betterFd = bestSock;
- uplink->betterServer = servers[bestIndex];
- uplink->betterVersion = bestProtocolVersion;
- uplink->rttTestResult = RTT_DOCHANGE;
- mutex_unlock( &uplink->rttLock );
- signal_call( uplink->signal );
- } else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) {
- // No server was reachable
- mutex_lock( &uplink->rttLock );
- uplink->rttTestResult = RTT_NOT_REACHABLE;
- mutex_unlock( &uplink->rttLock );
- } else {
- // nope
- if ( bestSock != -1 ) close( bestSock );
- mutex_lock( &uplink->rttLock );
- uplink->rttTestResult = RTT_DONTCHANGE;
- uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
- mutex_unlock( &uplink->rttLock );
- if ( !image->working ) {
- image->working = true;
- LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink );
- }
+ // Select image ++++++++++++++++++++++++++++++
+ if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
+ goto image_failed;
+ }
+ // See if selecting the image succeeded ++++++++++++++++++++++++++++++
+ uint16_t protocolVersion = 0;
+ uint16_t rid;
+ uint64_t imageSize;
+ char *name;
+ serialized_buffer_t serialized;
+ if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
+ goto image_failed;
+ }
+ if ( protocolVersion < MIN_SUPPORTED_SERVER ) { // Server version unsupported; global fail
+ goto server_failed;
+ }
+ if ( name == NULL || strcmp( name, image->name ) != 0 ) {
+ ERROR_GOTO( image_failed, "[RTT] Server offers image '%s' instead of '%s'", name, image->name );
+ }
+ if ( rid != image->rid ) {
+ ERROR_GOTO( image_failed, "[RTT] Server provides rid %d instead of %d", (int)rid, (int)image->rid );
+ }
+ if ( imageSize != image->virtualFilesize ) {
+ ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
+ }
+ // Request block (NOT random! First or from queue) ++++++++++++
+ if ( !dnbd3_get_block( sock, offset, length, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+ LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request block", server );
+ }
+ // See if requesting the block succeeded ++++++++++++++++++++++
+ dnbd3_reply_t reply;
+ if ( !dnbd3_get_reply( sock, &reply ) ) {
+ LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server );
+ }
+ // check reply header
+ if ( reply.cmd != CMD_GET_BLOCK || reply.size != length ) {
+ // Sanity check failed; count this as global error (malicious/broken server)
+ ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
+ }
+ // flush payload to include this into measurement
+ char buffer[DNBD3_BLOCK_SIZE];
+ uint32_t todo = length;
+ ssize_t ret;
+ while ( todo != 0 && ( ret = recv( sock, buffer, MIN( DNBD3_BLOCK_SIZE, todo ), MSG_WAITALL ) ) > 0 ) {
+ todo -= (uint32_t)ret;
+ }
+ if ( todo != 0 ) {
+ ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server );
+ }
+ clock_gettime( BEST_CLOCK_SOURCE, &end );
+ // Measurement done - everything fine so far
+ mutex_lock( &uplink->rttLock );
+ const bool isCurrent = ( uplink->current.index == server );
+ mutex_unlock( &uplink->rttLock );
+ uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000
+ + (end.tv_nsec - start.tv_nsec) / 1000); // µs
+ uint32_t avg = altservers_updateRtt( uplink, server, rtt );
+ // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
+ if ( ( uplink->cycleDetected || panic ) && isCurrent ) {
+ avg = (avg * 2) + 50000;
+ }
+ if ( !panic && isCurrent ) {
+ // Was measuring current server
+ currentRtt = avg;
+ close( sock );
+ } else if ( avg < bestRtt ) {
+ // Was another server, update "best"
+ if ( best.fd != -1 ) {
+ close( best.fd );
}
- image_release( image );
- // end of loop over all pending uplinks
- mutex_lock( &pendingLockWrite );
- pending[itLink] = NULL;
- mutex_unlock( &pendingLockWrite );
- mutex_unlock( &pendingLockConsume );
+ best.fd = sock;
+ bestRtt = avg;
+ best.index = server;
+ best.version = protocolVersion;
+ } else {
+ // Was too slow, ignore
+ close( sock );
+ }
+ // We're done, call continue
+ continue;
+ // Jump here if anything went wrong
+ // This will cleanup and continue
+image_failed:
+ altservers_imageFailed( uplink, server );
+ goto failed;
+server_failed:
+ altservers_serverFailed( server );
+failed:
+ close( sock );
+ }
+ // Done testing all servers. See if we should switch
+ if ( best.fd != -1 && (panic || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
+ // yep
+ if ( currentRtt > 10000000 || panic ) {
+ LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
+ } else {
+ LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
}
- // Save cache maps of all images if applicable
- declare_now;
- // TODO: Has nothing to do with alt servers really, maybe move somewhere else?
- if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
- timing_gets( &nextCloseUnusedFd, 900 );
- image_closeUnusedFd();
+ mutex_lock( &uplink->rttLock );
+ uplink->better = best;
+ uplink->rttTestResult = RTT_DOCHANGE;
+ mutex_unlock( &uplink->rttLock );
+ signal_call( uplink->signal );
+ } else if ( best.fd == -1 && currentRtt == RTT_UNREACHABLE ) {
+ // No server was reachable, including current
+ uplink->rttTestResult = RTT_NOT_REACHABLE;
+ } else {
+ // nope
+ if ( best.fd != -1 ) {
+ close( best.fd );
}
+ uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
+ mutex_lock( &uplink->rttLock );
+ uplink->rttTestResult = RTT_DONTCHANGE;
+ mutex_unlock( &uplink->rttLock );
}
- cleanup: ;
- if ( runSignal != NULL ) signal_close( runSignal );
- runSignal = NULL;
- return NULL ;
+ image_release( image );
}
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 7b7b46d..78f6fcc 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -7,23 +7,29 @@ struct json_t;
void altservers_init();
-void altservers_shutdown();
-
int altservers_load();
-bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly, int *index);
+
+void altservers_findUplinkAsync(dnbd3_uplink_t *uplink);
+
+void altservers_findUplink(dnbd3_uplink_t *uplink);
-void altservers_findUplink(dnbd3_connection_t *uplink);
+int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size);
-void altservers_removeUplink(dnbd3_connection_t *uplink);
+int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
-int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
+bool altservers_imageHasAltServers(const char *image);
-int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency);
+bool altservers_toString(int server, char *buffer, size_t len);
int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
-void altservers_serverFailed(const dnbd3_host_t * const host);
+void altservers_serverFailed(int server);
+
+int altservers_hostToIndex(dnbd3_host_t *host);
+
+const dnbd3_host_t* altservers_indexToHost(int server);
struct json_t* altservers_toJson();
diff --git a/src/server/fileutil.c b/src/server/fileutil.c
index 336ab68..9a9f066 100644
--- a/src/server/fileutil.c
+++ b/src/server/fileutil.c
@@ -68,7 +68,7 @@ bool file_setSize(int fd, uint64_t size)
// Try really hard... image loading logic relies on the file
// having the proper apparent size
uint8_t byte = 0;
- pread( fd, &byte, 1, size - 1 );
+ (void)!pread( fd, &byte, 1, size - 1 );
if ( pwrite( fd, &byte, 1, size - 1 ) == 1 ) return true;
return false;
}
diff --git a/src/server/fuse.c b/src/server/fuse.c
new file mode 100644
index 0000000..12913a6
--- /dev/null
+++ b/src/server/fuse.c
@@ -0,0 +1,661 @@
+#include "fuse.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/log.h>
+
+#ifndef DNBD3_SERVER_FUSE
+
+//
+/**
+ * Stub used when DNBD3_SERVER_FUSE is not defined at build time.
+ * Logs an error and reports failure; both arguments are ignored.
+ *
+ * @return always false
+ */
+bool dfuse_init(const char *opts UNUSED, const char *dir UNUSED)
+{
+	logadd( LOG_ERROR, "FUSE: Not compiled in" );
+	return false;
+}
+
+/**
+ * Stub used when DNBD3_SERVER_FUSE is not defined at build time; does nothing.
+ */
+void dfuse_shutdown()
+{
+}
+
+#else
+
+#define PATHLEN (2000)
+static char nullbytes[DNBD3_BLOCK_SIZE];
+
+// FUSE ENABLED
+#define FUSE_USE_VERSION 30
+//
+#include <dnbd3/config.h>
+#include "locks.h"
+#include "threadpool.h"
+#include "image.h"
+#include "uplink.h"
+#include "reference.h"
+#include "helper.h"
+
+#include <fuse_lowlevel.h>
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include <signal.h>
+
+#define INO_ROOT (1)
+#define INO_CTRL (2)
+#define INO_DIR (3)
+static const char *NAME_CTRL = "control";
+static const char *NAME_DIR = "images";
+
+/**
+ * Identifies an image that was requested through the control file.
+ * Allocated in ll_write and attached to the directory entry of the image.
+ */
+typedef struct {
+	fuse_req_t req;     // FUSE request of the write that asked for the image
+	uint16_t rid;       // Revision id; replaced by the image's actual rid after loading
+	char name[PATHLEN]; // Image name (part before the ':' of the written string)
+} lookup_t;
+
+// Next inode number handed out to a newly created directory entry
+static fuse_ino_t inoCounter = 10;
+/**
+ * Node of the in-memory directory tree below "images".
+ * Entries of one directory form a singly linked list via 'next'.
+ */
+typedef struct _dfuse_dir {
+	struct _dfuse_dir *next;  // Next entry in the same directory
+	struct _dfuse_dir *child; // First entry inside this directory
+	const char *name;         // Name of this entry within its directory
+	uint64_t size;            // File size reported by stat; set from the image's virtualFilesize
+	fuse_ino_t ino;           // Inode number of this entry
+	int refcount;             // Incremented for every successful ll_open on this entry
+	lookup_t *img;            // Image behind this entry; NULL means the entry is a directory
+} dfuse_entry_t;
+
+/**
+ * Per-open state of an image file. Allocated in ll_open and stored in
+ * fuse_file_info.fh, from where ll_read picks it up again.
+ */
+typedef struct {
+	dfuse_entry_t *entry; // Directory entry that was opened
+	dnbd3_image_t *image; // Image obtained via image_get for this open
+} cmdopen_t;
+
+// Statically allocated top node of the image tree ("images", inode INO_DIR).
+// 'root' is the pointer all lookups start from; images get added below root->child.
+// NOTE(review): the reason for the initial refcount of 2 is not visible here.
+static dfuse_entry_t sroot = {
+	.name = "images",
+	.ino = INO_DIR,
+	.refcount = 2,
+}, *root = &sroot;
+static pthread_mutex_t dirLock;
+
+#define INIT_NONE (0)
+#define INIT_DONE (1)
+#define INIT_SHUTDOWN (2)
+#define INIT_INPROGRESS (3)
+
+static struct fuse_session *fuseSession = NULL;
+static struct fuse_chan *fuseChannel = NULL;
+static char *fuseMountPoint = NULL;
+static pthread_t fuseThreadId;
+static bool haveThread = false;
+static _Atomic(int) initState = INIT_NONE;
+static pthread_mutex_t initLock;
+static struct timespec startupTime;
+
+static dfuse_entry_t* dirLookup(dfuse_entry_t *dir, const char *name);
+static dfuse_entry_t* inoRecursive(dfuse_entry_t *dir, fuse_ino_t ino);
+
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start UNUSED, uint32_t length, const char *buffer);
+static void cleanupFuse();
+static void* fuseMainLoop(void *data);
+
+/**
+ * FUSE open handler.
+ *
+ * - The control file can only be opened write-only and is marked non-seekable.
+ * - The mount root is a directory, so opening it fails with EISDIR.
+ * - Everything else is looked up in the image tree and must be opened
+ *   read-only. On success a cmdopen_t holding the directory entry and the
+ *   image is stored in fi->fh and the entry's refcount stays incremented.
+ *
+ * Errors replied: EINVAL (wrong access mode), EISDIR (entry is a directory),
+ * ENOENT (unknown inode, rid still 0, or image not available),
+ * ENOMEM (handle could not be allocated).
+ */
+static void ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+	const int accessMode = fi->flags & O_ACCMODE;
+	fi->fh = 0;
+	if ( ino == INO_CTRL ) {
+		if ( accessMode != O_WRONLY ) {
+			fuse_reply_err( req, EINVAL );
+		} else {
+			fi->nonseekable = 1;
+			fuse_reply_open( req, fi );
+		}
+		return;
+	}
+	if ( ino == INO_ROOT ) {
+		fuse_reply_err( req, EISDIR );
+		return;
+	}
+	if ( accessMode != O_RDONLY ) {
+		fuse_reply_err( req, EINVAL );
+		return;
+	}
+	mutex_lock( &dirLock );
+	dfuse_entry_t *entry = inoRecursive( root, ino );
+	int err = 0;
+	if ( entry == NULL ) {
+		err = ENOENT;
+	} else if ( entry->img == NULL ) {
+		err = EISDIR;
+	} else if ( entry->img->rid == 0 ) {
+		err = ENOENT;
+	} else {
+		// Pin the entry while we work on it without holding the lock
+		entry->refcount++;
+	}
+	mutex_unlock( &dirLock );
+	if ( err != 0 ) {
+		fuse_reply_err( req, err );
+		return;
+	}
+	cmdopen_t *handle = NULL;
+	dnbd3_image_t *image = image_get( entry->img->name, entry->img->rid, true );
+	if ( image == NULL ) {
+		err = ENOENT;
+	} else if ( ( handle = malloc( sizeof(*handle) ) ) == NULL ) {
+		// Out of memory: give back the image again instead of dereferencing NULL
+		image_release( image );
+		err = ENOMEM;
+	}
+	if ( err != 0 ) {
+		// Undo the pin from above
+		mutex_lock( &dirLock );
+		entry->refcount--;
+		mutex_unlock( &dirLock );
+		fuse_reply_err( req, err );
+		return;
+	}
+	handle->entry = entry;
+	handle->image = image;
+	fi->fh = (uintptr_t)handle;
+	fi->keep_cache = 1;
+	fuse_reply_open( req, fi );
+}
+
+/**
+ * Insert an image into the directory tree, creating intermediate
+ * directories for every '/'-separated component of 'name' as needed.
+ * The final component is stored as "<name>:<rid>".
+ * Caller must hold dirLock.
+ *
+ * @param dir  head pointer of the directory list to insert into
+ * @param name remaining path of the image, relative to 'dir'
+ * @param img  lookup data; stored in the entry only if a new image entry
+ *             gets created, otherwise ownership stays with the caller
+ * @return the (new or already existing) entry of the image, or NULL if the
+ *         path collides with an existing entry of the other kind
+ *         (directory vs. image). Aborts the process on out-of-memory.
+ */
+static dfuse_entry_t* addImage(dfuse_entry_t **dir, const char *name, lookup_t *img)
+{
+	const char *slash = strchr( name, '/' );
+	const bool isLeaf = ( slash == NULL );
+	char *path = NULL;
+	int ret;
+	if ( isLeaf ) {
+		// Name portion at the end
+		ret = asprintf( &path, "%s:%d", name, (int)img->rid );
+	} else {
+		// Dirname
+		ret = asprintf( &path, "%.*s", (int)( slash - name ), name );
+	}
+	if ( ret == -1 )
+		abort();
+	dfuse_entry_t *entry = dirLookup( *dir, path );
+	if ( entry == NULL ) {
+		entry = calloc( 1, sizeof( *entry ) );
+		if ( entry == NULL )
+			abort(); // Same policy as for a failing asprintf above
+		entry->next = *dir;
+		*dir = entry;
+		entry->name = path; // Entry takes ownership of the string
+		entry->ino = inoCounter++;
+		if ( isLeaf ) {
+			entry->img = img;
+		}
+	} else {
+		free( path );
+		if ( isLeaf && entry->img == NULL ) {
+			return NULL; // A directory already uses the name of the image
+		}
+		if ( !isLeaf && entry->img != NULL ) {
+			return NULL; // An image already uses the name of the directory
+		}
+	}
+	if ( isLeaf )
+		return entry;
+	return addImage( &entry->child, slash + 1, img );
+}
+
+/**
+ * FUSE write handler. Only the control file accepts writes; the payload is
+ * "<image name>[:<rid>]", optionally terminated by a newline or NUL byte.
+ * The image is looked up or loaded and, on success, published in the image
+ * tree.
+ *
+ * Errors replied: EROFS (not the control file), ESPIPE (offset != 0),
+ * ENOSPC (payload too long), EINVAL (malformed or out-of-range rid, or name
+ * collides with an existing directory), ENOMEM (allocation failed),
+ * ENOENT (image not found).
+ */
+static void ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t off, struct fuse_file_info *fi UNUSED)
+{
+	if ( ino != INO_CTRL ) {
+		fuse_reply_err( req, EROFS );
+		return;
+	}
+	if ( off != 0 ) {
+		fuse_reply_err( req, ESPIPE );
+		return;
+	}
+	if ( size >= PATHLEN ) {
+		fuse_reply_err( req, ENOSPC );
+		return;
+	}
+	size_t colon = 0;
+	int rid = 0;
+	for ( size_t i = 0; i < size; ++i ) {
+		// ctype functions require a value representable as unsigned char
+		const unsigned char ch = (unsigned char)buf[i];
+		if ( ch == '\0' || ch == '\n' ) {
+			if ( colon == 0 ) {
+				colon = i;
+			}
+			break;
+		}
+		if ( colon != 0 ) {
+			if ( !isdigit( ch ) ) {
+				logadd( LOG_WARNING, "FUSE: Malformed rid" );
+				fuse_reply_err( req, EINVAL );
+				return;
+			}
+			rid = rid * 10 + ( ch - '0' );
+			// Bail out as soon as the value leaves the uint16_t range, so
+			// that a long digit string cannot overflow the int
+			if ( rid > 65535 ) {
+				logadd( LOG_WARNING, "FUSE: Invalid rid '%d'", rid );
+				fuse_reply_err( req, EINVAL );
+				return;
+			}
+		} else if ( ch == ':' ) {
+			colon = i; // Image name starting with ':' would be broken...
+		}
+	}
+	if ( colon == 0 ) {
+		colon = size;
+	}
+	lookup_t *lu = malloc( sizeof(*lu) );
+	if ( lu == NULL ) {
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	lu->rid = (uint16_t)rid;
+	lu->req = req;
+	if ( snprintf( lu->name, PATHLEN, "%.*s", (int)colon, buf ) < 0 ) {
+		free( lu );
+		fuse_reply_err( req, ENOSPC );
+		return;
+	}
+	logadd( LOG_DEBUG1, "FUSE: Request for '%s:%d'", lu->name, (int)lu->rid );
+	dnbd3_image_t *image = image_getOrLoad( lu->name, lu->rid );
+	if ( image == NULL ) {
+		fuse_reply_err( req, ENOENT );
+		free( lu );
+		return;
+	}
+	mutex_lock( &dirLock );
+	dfuse_entry_t *entry = addImage( &root->child, lu->name, lu );
+	bool luStored = false;
+	if ( entry != NULL ) {
+		entry->size = image->virtualFilesize;
+		// addImage keeps 'lu' only if it had to create a new entry
+		luStored = ( entry->img == lu );
+	}
+	lu->rid = image->rid; // In case it was 0
+	mutex_unlock( &dirLock );
+	image_release( image );
+	if ( entry == NULL ) {
+		fuse_reply_err( req, EINVAL );
+	} else {
+		fuse_reply_write( req, size );
+	}
+	if ( !luStored ) {
+		// Entry already existed (or could not be created): nobody references
+		// 'lu', so release it here instead of leaking it
+		free( lu );
+	}
+}
+
+/**
+ * FUSE read handler for image files.
+ *
+ * The requested range is clamped to the image's virtual size. If the image
+ * has a cache map and the (block-aligned) range is not fully cached, the
+ * request is handed to the uplink and answered asynchronously via
+ * uplinkCallback. Otherwise the reply is assembled from the image's file
+ * descriptor, with zero bytes appended for the part beyond the real file
+ * size.
+ *
+ * Replies EIO if the uplink refuses the request and ENOMEM if the reply
+ * vector cannot be allocated.
+ */
+static void ll_read( fuse_req_t req, fuse_ino_t ino UNUSED, size_t size, off_t off, struct fuse_file_info *fi )
+{
+	if ( fi->fh == 0 ) {
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	cmdopen_t *handle = (cmdopen_t*)fi->fh;
+	dnbd3_image_t *image = handle->image;
+	if ( off < 0 || (uint64_t)off >= image->virtualFilesize ) {
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	if ( off + size > image->virtualFilesize ) {
+		size = image->virtualFilesize - off;
+	}
+
+	// Check if cached locally
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache != NULL ) {
+		// This is a proxied image, check if we need to relay the request...
+		const uint64_t start = (uint64_t)off & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		const uint64_t end = (off + size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+			ref_put( &cache->reference );
+			// A single uplink request must not exceed the maximum payload size
+			if ( size > (uint32_t)_maxPayload ) {
+				size = (uint32_t)_maxPayload;
+			}
+			if ( !uplink_request( image, req, &uplinkCallback, 0, off, (uint32_t)size ) ) {
+				logadd( LOG_DEBUG1, "FUSE: Could not relay uncached request to upstream proxy for image %s:%d",
+						image->name, image->rid );
+				fuse_reply_err( req, EIO );
+			}
+			return; // ASYNC
+		}
+		ref_put( &cache->reference );
+	}
+
+	// Is cached
+	size_t readSize = size;
+	if ( off + readSize > image->realFilesize ) {
+		if ( (uint64_t)off >= image->realFilesize ) {
+			readSize = 0;
+		} else {
+			readSize = image->realFilesize - off;
+		}
+	}
+	// struct fuse_bufvec already contains one fuse_buf; make room for a second
+	struct fuse_bufvec *vec = calloc( 1, sizeof(*vec) + sizeof(struct fuse_buf) );
+	if ( vec == NULL ) {
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	if ( readSize != 0 ) {
+		// Real data from file
+		vec->buf[vec->count++] = (struct fuse_buf){
+			.size = readSize,
+			.flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_RETRY | FUSE_BUF_FD_SEEK,
+			.fd = image->readFd,
+			.pos = off,
+		};
+	}
+	if ( readSize != size ) {
+		// Remainder lies beyond the real file size: pad with zeros
+		vec->buf[vec->count++] = (struct fuse_buf){
+			.size = size - readSize,
+			.mem = nullbytes,
+			.fd = -1,
+		};
+	}
+	fuse_reply_data( req, vec, FUSE_BUF_SPLICE_MOVE );
+	free( vec );
+}
+
+/**
+ * Fill in stat data for one of the fixed inodes (mount root, image
+ * directory, control file).
+ *
+ * @param ino inode number to describe
+ * @param stbuf receives the attributes
+ * @return true if 'ino' is one of the fixed inodes, false otherwise
+ *         (stbuf is left untouched in that case)
+ */
+static bool statInternal(fuse_ino_t ino, struct stat *stbuf)
+{
+	if ( ino == INO_ROOT || ino == INO_DIR ) {
+		// Read-only directories
+		stbuf->st_mode = S_IFDIR | 0555;
+		stbuf->st_nlink = 2;
+		stbuf->st_mtim = startupTime;
+	} else if ( ino == INO_CTRL ) {
+		// Write-only control file; modification time is always "now"
+		stbuf->st_mode = S_IFREG | 0222;
+		stbuf->st_nlink = 1;
+		stbuf->st_size = 0;
+		clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+	} else {
+		return false;
+	}
+	stbuf->st_atim = startupTime;
+	stbuf->st_ctim = startupTime;
+	stbuf->st_uid = 0;
+	stbuf->st_ino = ino;
+	return true;
+}
+
+/**
+ * Find the entry called 'name' in the list starting at 'dir'.
+ * Only this one level is searched. Returns NULL if there is no match or
+ * the list is empty.
+ * HOLD LOCK
+ */
+static dfuse_entry_t* dirLookup(dfuse_entry_t *dir, const char *name)
+{
+	dfuse_entry_t *cur = dir;
+	while ( cur != NULL && strcmp( cur->name, name ) != 0 ) {
+		cur = cur->next;
+	}
+	return cur;
+}
+
+/**
+ * Depth-first search for the entry with the given inode number, starting
+ * at list 'dir' and descending into every directory entry.
+ * Returns the matching entry, or NULL if none exists.
+ */
+static dfuse_entry_t* inoRecursive(dfuse_entry_t *dir, fuse_ino_t ino)
+{
+	dfuse_entry_t *cur = dir;
+	while ( cur != NULL ) {
+		logadd( LOG_DEBUG1, "ino %d is %s", (int)cur->ino, cur->name );
+		if ( cur->ino == ino )
+			return cur;
+		if ( cur->img == NULL ) {
+			// Directory: look through its children before moving on
+			dfuse_entry_t *hit = inoRecursive( cur->child, ino );
+			if ( hit != NULL )
+				return hit;
+		}
+		cur = cur->next;
+	}
+	return NULL;
+}
+
+/**
+ * Translate a directory tree entry into a struct stat.
+ * Entries without an image are reported as read-only directories,
+ * entries with an image as read-only regular files of entry->size bytes.
+ * All timestamps are set to the startup time.
+ * HOLD LOCK
+ */
+static void entryToStat(dfuse_entry_t *entry, struct stat *stbuf)
+{
+	const bool isDirectory = ( entry->img == NULL );
+	stbuf->st_mode = isDirectory ? ( S_IFDIR | 0555 ) : ( S_IFREG | 0444 );
+	stbuf->st_nlink = isDirectory ? 2 : 1;
+	if ( !isDirectory ) {
+		stbuf->st_size = entry->size;
+	}
+	stbuf->st_ino = entry->ino;
+	stbuf->st_uid = 0;
+	stbuf->st_mtim = startupTime;
+	stbuf->st_atim = startupTime;
+	stbuf->st_ctim = startupTime;
+}
+
+/**
+ * FUSE lookup handler: resolve `name` inside directory inode `parent`.
+ * Below INO_ROOT only the two fixed names NAME_DIR and NAME_CTRL exist;
+ * every other parent is searched in the directory tree while holding dirLock.
+ * Replies with the entry on success, ENOTDIR if `parent` refers to an
+ * image (file), and ENOENT in all other cases.
+ */
+static void ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+	struct fuse_entry_param result = { 0 };
+	int err = ENOENT;
+
+	logadd( LOG_DEBUG2, "Lookup at ino %d for '%s'", (int)parent, name );
+	if ( parent == INO_ROOT ) {
+		if ( strcmp( name, NAME_DIR ) == 0 ) {
+			result.ino = INO_DIR;
+		} else if ( strcmp( name, NAME_CTRL ) == 0 ) {
+			result.ino = INO_CTRL;
+			result.attr_timeout = result.entry_timeout = 3600;
+		}
+		if ( result.ino != 0 && statInternal( result.ino, &result.attr ) ) {
+			err = 0;
+		}
+	} else {
+		mutex_lock( &dirLock );
+		dfuse_entry_t *dir = inoRecursive( root, parent );
+		if ( dir != NULL && dir->img != NULL ) {
+			// Parent is a file, cannot contain anything
+			err = ENOTDIR;
+		} else if ( dir != NULL ) {
+			dfuse_entry_t *entry = dirLookup( dir->child, name );
+			if ( entry != NULL ) {
+				result.ino = entry->ino;
+				entryToStat( entry, &result.attr );
+				err = 0;
+			}
+		}
+		mutex_unlock( &dirLock );
+	}
+	// Reply only after the lock has been dropped
+	if ( err == 0 ) {
+		fuse_reply_entry( req, &result );
+	} else {
+		fuse_reply_err( req, err );
+	}
+}
+
+/**
+ * Growable byte buffer that accumulates serialized directory entries
+ * for a readdir reply (filled by dirbuf_add, released with free()).
+ */
+struct dirbuf {
+ char *p; // Start of the heap buffer, NULL while empty
+ size_t size; // Number of bytes currently used in p
+};
+
+/**
+ * Append one directory entry to the buffer `b`.
+ * The required space is determined by a dry run of fuse_add_direntry(),
+ * then the buffer is grown by that amount and the entry is written at
+ * the old end of the buffer.
+ * If growing the buffer fails, the entry is dropped and `b` is left
+ * unchanged (still valid and still owned by the caller).
+ * @param req request the listing is built for
+ * @param b buffer to append to
+ * @param name name of the entry
+ * @param ino inode number of the entry
+ */
+static void dirbuf_add( fuse_req_t req, struct dirbuf *b, const char *name, fuse_ino_t ino )
+{
+	struct stat stbuf = { .st_ino = ino };
+	const size_t oldsize = b->size;
+	// Dry run with a NULL buffer only returns the size this entry needs
+	const size_t entsize = fuse_add_direntry( req, NULL, 0, name, NULL, 0 );
+	// Never assign realloc's result directly: on failure the old block
+	// would be leaked and we'd write through a NULL pointer below
+	char *grown = realloc( b->p, oldsize + entsize );
+	if ( grown == NULL ) {
+		logadd( LOG_ERROR, "FUSE: Out of memory while building directory listing" );
+		return;
+	}
+	b->p = grown;
+	b->size = oldsize + entsize;
+	// Last argument is the offset of the *next* entry, i.e. the new end
+	fuse_add_direntry( req, b->p + oldsize, entsize, name, &stbuf, b->size );
+}
+
+/**
+ * Reply to `req` with the part of `buf` that starts at offset `off`,
+ * limited to at most `maxsize` bytes. An offset outside of
+ * [0, bufsize) yields an empty reply.
+ * @return whatever fuse_reply_buf() returns
+ */
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize )
+{
+	if ( off < 0 || off >= (off_t)bufsize ) {
+		return fuse_reply_buf( req, NULL, 0 );
+	}
+	const size_t remaining = bufsize - off;
+	const size_t len = remaining < maxsize ? remaining : maxsize;
+	return fuse_reply_buf( req, buf + off, len );
+}
+
+/**
+ * FUSE readdir handler. Only the root inode can be listed; it contains
+ * ".", "..", the control file and the image directory. Any other inode
+ * is answered with EACCES.
+ */
+static void ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi UNUSED)
+{
+	if ( ino != INO_ROOT ) {
+		fuse_reply_err( req, EACCES );
+		return;
+	}
+	// Fixed content of the root directory, in listing order
+	const struct {
+		const char *name;
+		fuse_ino_t ino;
+	} rootEntries[] = {
+		{ ".", INO_ROOT },
+		{ "..", INO_ROOT },
+		{ NAME_CTRL, INO_CTRL },
+		{ NAME_DIR, INO_DIR },
+	};
+	struct dirbuf listing = { .p = NULL, .size = 0 };
+	for ( size_t i = 0; i < sizeof( rootEntries ) / sizeof( rootEntries[0] ); ++i ) {
+		dirbuf_add( req, &listing, rootEntries[i].name, rootEntries[i].ino );
+	}
+	reply_buf_limited( req, listing.p, listing.size, off, size );
+	free( listing.p );
+}
+
+/**
+ * FUSE getattr handler. Built-in inodes are answered via statInternal(),
+ * everything else is looked up in the directory tree under dirLock.
+ * Replies ENOENT if the inode is unknown; attributes are sent with a
+ * cache timeout of 0.
+ */
+static void ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi UNUSED)
+{
+	// st_ino stays 0 unless one of the two sources below fills the struct
+	struct stat attr = { .st_ino = 0 };
+	const bool builtin = statInternal( ino, &attr );
+	if ( !builtin ) {
+		mutex_lock( &dirLock );
+		dfuse_entry_t *found = inoRecursive( root, ino );
+		if ( found != NULL ) {
+			entryToStat( found, &attr );
+		}
+		mutex_unlock( &dirLock );
+	}
+	if ( attr.st_ino != 0 ) {
+		fuse_reply_attr( req, &attr, 0 );
+		return;
+	}
+	fuse_reply_err( req, ENOENT );
+}
+
+/**
+ * FUSE setattr handler. The requested changes (`attr`, `to_set`) are
+ * ignored; the request is answered exactly like a getattr on `ino`.
+ */
+void ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr UNUSED, int to_set UNUSED, struct fuse_file_info *fi)
+{
+ ll_getattr( req, ino, fi );
+}
+
+/**
+ * FUSE release handler. If the file handle carries a cmdopen_t, the
+ * image it references is released, the refcount of the associated
+ * directory entry is decremented under dirLock, and the handle is freed.
+ * Always replies with success.
+ */
+void ll_release(fuse_req_t req, fuse_ino_t ino UNUSED, struct fuse_file_info *fi)
+{
+	if ( fi->fh == 0 ) {
+		// Nothing attached to this open file
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	cmdopen_t *openFile = (cmdopen_t*)fi->fh;
+	image_release( openFile->image );
+	mutex_lock( &dirLock );
+	openFile->entry->refcount--;
+	mutex_unlock( &dirLock );
+	free( openFile );
+	fuse_reply_err( req, 0 );
+}
+
+/**
+ * Completion callback handed to uplink_request() for relayed reads.
+ * `data` is the fuse request to answer. A NULL `buffer` is answered
+ * with EIO, otherwise `length` bytes from `buffer` are sent as reply.
+ */
+static void uplinkCallback(void *data, uint64_t handle UNUSED, uint64_t start UNUSED, uint32_t length, const char *buffer)
+{
+	fuse_req_t req = (fuse_req_t)data;
+	if ( buffer != NULL ) {
+		fuse_reply_buf( req, buffer, length );
+		return;
+	}
+	fuse_reply_err( req, EIO );
+}
+
+// Log a single member of the connection info struct `conn` at debug level.
+// `key` is the member name, `type` the printf conversion to use for it.
+#define DUMP(key,type) logadd( LOG_DEBUG1, "FUSE: " #key ": " type, conn->key )
+/**
+ * FUSE init handler. Dumps the connection parameters to the debug log
+ * and additionally requests splice support (read, write, move) by
+ * setting the corresponding bits in conn->want.
+ * @param userdata unused
+ * @param conn connection info as passed in by libfuse
+ */
+void ll_init(void *userdata UNUSED, struct fuse_conn_info *conn)
+{
+	DUMP( capable, "%u" );
+	DUMP( congestion_threshold, "%u" );
+	DUMP( max_background, "%u" );
+	//DUMP( max_read, "%u" );
+	DUMP( max_readahead, "%u" );
+	DUMP( max_write, "%u" );
+	DUMP( want, "%u" );
+	conn->want |= FUSE_CAP_SPLICE_READ | FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
+}
+#undef DUMP
+
+/*
+ * Map the implemented fuse operations.
+ * Operations not listed here stay NULL; the table is handed to
+ * fuse_lowlevel_new() in dfuse_init().
+ */
+static struct fuse_lowlevel_ops fuseOps = {
+ .lookup = ll_lookup,
+ .getattr = ll_getattr,
+ .setattr = ll_setattr, // Ignores the requested changes, answers like getattr
+ .readdir = ll_readdir,
+ .open = ll_open,
+ .release = ll_release,
+ .read = ll_read,
+ .write = ll_write,
+ .init = ll_init,
+ //.destroy = ll_destroy,
+};
+
+/**
+ * Mount the FUSE file system and start its main loop in a new thread.
+ * Must be called at most once; a second call logs an error and
+ * terminates the process.
+ * initLock is held for the whole duration of the setup.
+ * @param opts optional extra argument passed on to the FUSE option
+ *        parser (e.g. "-o..."), may be NULL
+ * @param dir mount point
+ * @return true if the file system is mounted and the main loop thread
+ *         was started, false on any error (everything set up so far is
+ *         torn down again and the state is set to INIT_SHUTDOWN)
+ */
+bool dfuse_init(const char *opts, const char *dir)
+{
+	int ex = INIT_NONE;
+	if ( !atomic_compare_exchange_strong( &initState, &ex, INIT_INPROGRESS ) ) {
+		logadd( LOG_ERROR, "Calling dfuse_init twice" );
+		exit( 1 );
+	}
+	mutex_init( &initLock, LOCK_FUSE_INIT );
+	mutex_lock( &initLock );
+	mutex_init( &dirLock, LOCK_FUSE_DIR );
+	clock_gettime( CLOCK_REALTIME, &startupTime );
+	// Build a fake command line for libfuse's option parser
+	struct fuse_args args = FUSE_ARGS_INIT( 0, NULL );
+	fuse_opt_add_arg( &args, "dnbd3fs" ); // argv[0]
+	if ( opts != NULL ) {
+		fuse_opt_add_arg( &args, opts );
+	}
+	fuse_opt_add_arg( &args, "-odefault_permissions" );
+	fuse_opt_add_arg( &args, dir ); // last param is mount point
+	//
+	if ( fuse_parse_cmdline( &args, &fuseMountPoint, NULL, NULL ) == -1 ) {
+		logadd( LOG_ERROR, "FUSE: Error parsing command line" );
+		goto fail;
+	}
+	fuseChannel = fuse_mount( fuseMountPoint, &args );
+	if ( fuseChannel == NULL ) {
+		logadd( LOG_ERROR, "FUSE: Cannot mount to %s", dir );
+		goto fail;
+	}
+	fuseSession = fuse_lowlevel_new( &args, &fuseOps, sizeof( fuseOps ), NULL );
+	if ( fuseSession == NULL ) {
+		logadd( LOG_ERROR, "FUSE: Error initializing fuse session" );
+		goto fail;
+	}
+	fuse_session_add_chan( fuseSession, fuseChannel );
+	if ( 0 != thread_create( &fuseThreadId, NULL, &fuseMainLoop, (void *)NULL ) ) {
+		logadd( LOG_ERROR, "FUSE: Could not start thread" );
+		goto fail;
+	}
+	haveThread = true;
+	// Init OK. The argument vector has been fully consumed by the parse,
+	// mount and session setup calls above, so release it here too
+	// instead of only on the error path.
+	fuse_opt_free_args( &args );
+	mutex_unlock( &initLock );
+	return true;
+fail:
+	cleanupFuse();
+	fuse_opt_free_args( &args );
+	initState = INIT_SHUTDOWN;
+	mutex_unlock( &initLock );
+	return false;
+}
+
+/**
+ * Stop the FUSE main loop and wait for its thread to finish.
+ * Returns immediately if dfuse_init() was never called. Only proceeds
+ * if the state can be switched from INIT_DONE to INIT_SHUTDOWN; any
+ * other state except INIT_INPROGRESS (e.g. INIT_SHUTDOWN after a failed
+ * dfuse_init() or a previous shutdown) results in a warning and return.
+ * If no main loop thread exists, the cleanup is done right here,
+ * otherwise the thread is signalled with SIGUSR1 and joined.
+ */
+void dfuse_shutdown()
+{
+ if ( initState == INIT_NONE )
+ return;
+ for ( ;; ) {
+ int ex = INIT_DONE;
+ if ( atomic_compare_exchange_strong( &initState, &ex, INIT_SHUTDOWN ) )
+ break; // OK, do the shutdown
+ if ( ex == INIT_INPROGRESS )
+ continue; // dfuse_init in progress; NOTE(review): this retries in a tight loop, it does not block on initLock
+ // Wrong state
+ logadd( LOG_WARNING, "Called dfuse_shutdown without dfuse_init first" );
+ return;
+ }
+ logadd( LOG_INFO, "Shutting down fuse mainloop..." );
+ mutex_lock( &initLock );
+ if ( fuseSession != NULL ) {
+ // Flag the session as exited so the main loop terminates
+ fuse_session_exit( fuseSession );
+ }
+ if ( !haveThread ) {
+ // No main loop thread that would do the cleanup for us
+ cleanupFuse();
+ }
+ mutex_unlock( &initLock );
+ if ( haveThread ) {
+ // presumably the signal interrupts a blocking read in the main loop so it notices the exit flag - confirm
+ logadd( LOG_DEBUG1, "FUSE: Sending USR1 to mainloop thread" );
+ pthread_kill( fuseThreadId, SIGUSR1 );
+ pthread_join( fuseThreadId, NULL );
+ }
+}
+
+/**
+ * Thread entry point running the multi-threaded FUSE session loop.
+ * Switches the state from INIT_INPROGRESS to INIT_DONE before entering
+ * the loop; if the state is anything else, a warning is logged and the
+ * thread exits without running the loop or cleaning up.
+ * After the loop has been left, the FUSE resources are torn down under
+ * initLock.
+ * @param data unused
+ * @return always NULL
+ */
+static void* fuseMainLoop(void *data UNUSED)
+{
+ int ex = INIT_INPROGRESS;
+ if ( !atomic_compare_exchange_strong( &initState, &ex, INIT_DONE ) ) {
+ logadd( LOG_WARNING, "FUSE: Unexpected state in fuseMainLoop: %d", ex );
+ return NULL;
+ }
+ setThreadName( "fuse" );
+ logadd( LOG_INFO, "FUSE: Starting mainloop" );
+ fuse_session_loop_mt( fuseSession );
+ logadd( LOG_INFO, "FUSE: Left mainloop" );
+ // dfuse_init() holds initLock during setup, dfuse_shutdown() while flagging the exit
+ mutex_lock( &initLock );
+ cleanupFuse();
+ mutex_unlock( &initLock );
+ return NULL;
+}
+
+/**
+ * Tear down everything dfuse_init() has set up so far: detach the
+ * channel from the session, destroy the session, unmount the file
+ * system and release the mount point string.
+ * Every step is skipped if the according resource doesn't exist, and
+ * all handles are reset afterwards, so calling this repeatedly or after
+ * a partially failed init is fine.
+ * Callers in this file hold initLock while calling this.
+ */
+static void cleanupFuse()
+{
+	if ( fuseChannel != NULL ) {
+		fuse_session_remove_chan( fuseChannel );
+	}
+	if ( fuseSession != NULL ) {
+		fuse_session_destroy( fuseSession );
+		fuseSession = NULL;
+	}
+	if ( fuseMountPoint != NULL && fuseChannel != NULL ) {
+		fuse_unmount( fuseMountPoint, fuseChannel );
+	}
+	fuseChannel = NULL;
+	// The mount point was allocated by fuse_parse_cmdline() and is ours to free
+	free( fuseMountPoint );
+	fuseMountPoint = NULL;
+}
+
+#endif // DNBD3_SERVER_FUSE
diff --git a/src/server/fuse.h b/src/server/fuse.h
new file mode 100644
index 0000000..f01ad58
--- /dev/null
+++ b/src/server/fuse.h
@@ -0,0 +1,10 @@
+#ifndef DNBD3_SERVER_FUSE_H_
+#define DNBD3_SERVER_FUSE_H_
+// Interface of the server's FUSE front end. Guard avoids reserved _X names, which could also clash with libfuse's own fuse.h guard.
+#include <stdbool.h>
+// Mount the FUSE file system at `dir` and start its main loop; `opts` is an optional extra FUSE argument (may be NULL). Returns true on success.
+bool dfuse_init(const char *opts, const char *dir);
+// Stop the FUSE main loop and unmount; returns immediately if dfuse_init was never called.
+void dfuse_shutdown(void);
+
+#endif
diff --git a/src/server/globals.c b/src/server/globals.c
index 69e8a6e..f6432cb 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -1,7 +1,7 @@
#include "globals.h"
#include "ini.h"
#include "locks.h"
-#include "../shared/log.h"
+#include <dnbd3/shared/log.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
@@ -19,21 +19,26 @@ atomic_int _clientPenalty = 0;
atomic_bool _isProxy = false;
atomic_int _backgroundReplication = BGR_FULL;
atomic_int _bgrMinClients = 0;
+atomic_int _bgrWindowSize = 1;
atomic_bool _lookupMissingForProxy = true;
atomic_bool _sparseFiles = false;
+atomic_bool _ignoreAllocErrors = false;
atomic_bool _removeMissingImages = true;
-atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
-atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT;
+atomic_uint _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
+atomic_uint _clientTimeout = SOCKET_TIMEOUT_CLIENT;
atomic_bool _closeUnusedFd = false;
atomic_bool _vmdkLegacyMode = false;
// Not really needed anymore since we have '+' and '-' in alt-servers
atomic_bool _proxyPrivateOnly = false;
+atomic_bool _pretendClient = false;
+atomic_int _autoFreeDiskSpaceDelay = 3600 * 10;
// [limits]
atomic_int _maxClients = SERVER_MAX_CLIENTS;
atomic_int _maxImages = SERVER_MAX_IMAGES;
-atomic_int _maxPayload = 9000000; // 9MB
+atomic_uint _maxPayload = 9000000; // 9MB
atomic_uint_fast64_t _maxReplicationSize = (uint64_t)100000000000LL;
-atomic_bool _pretendClient = false;
+atomic_uint _maxPrefetch = 262144; // 256KB
+atomic_uint _minRequestSize = 0;
/**
* True when loading config the first time. Consecutive loads will
@@ -57,32 +62,37 @@ static const char* units = "KMGTPEZY";
static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname);
static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname);
-static bool parse32(const char *in, atomic_int *out, const char *optname) UNUSED;
-static bool parse32u(const char *in, atomic_int *out, const char *optname);
+static bool parse32(const char *in, atomic_int *out, const char *optname);
+static bool parse32u(const char *in, atomic_uint *out, const char *optname);
static int ini_handler(void *custom UNUSED, const char* section, const char* key, const char* value)
{
if ( initialLoad ) {
if ( _basePath == NULL ) SAVE_TO_VAR_STR( dnbd3, basePath );
SAVE_TO_VAR_BOOL( dnbd3, vmdkLegacyMode );
- SAVE_TO_VAR_UINT( dnbd3, listenPort );
- SAVE_TO_VAR_UINT( limits, maxClients );
- SAVE_TO_VAR_UINT( limits, maxImages );
+ SAVE_TO_VAR_INT( dnbd3, listenPort );
+ SAVE_TO_VAR_INT( limits, maxClients );
+ SAVE_TO_VAR_INT( limits, maxImages );
}
SAVE_TO_VAR_BOOL( dnbd3, isProxy );
SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+ SAVE_TO_VAR_INT( dnbd3, bgrWindowSize );
SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
+ SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors );
SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages );
SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd );
- SAVE_TO_VAR_UINT( dnbd3, serverPenalty );
- SAVE_TO_VAR_UINT( dnbd3, clientPenalty );
+ SAVE_TO_VAR_INT( dnbd3, serverPenalty );
+ SAVE_TO_VAR_INT( dnbd3, clientPenalty );
SAVE_TO_VAR_UINT( dnbd3, uplinkTimeout );
SAVE_TO_VAR_UINT( dnbd3, clientTimeout );
SAVE_TO_VAR_UINT( limits, maxPayload );
SAVE_TO_VAR_UINT64( limits, maxReplicationSize );
+ SAVE_TO_VAR_UINT( limits, maxPrefetch );
+ SAVE_TO_VAR_UINT( limits, minRequestSize );
SAVE_TO_VAR_BOOL( dnbd3, pretendClient );
+ SAVE_TO_VAR_INT( dnbd3, autoFreeDiskSpaceDelay );
if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) {
if ( strcmp( value, "hashblock" ) == 0 ) {
_backgroundReplication = BGR_HASHBLOCK;
@@ -109,10 +119,13 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
void globals_loadConfig()
{
char *name = NULL;
- asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME );
+ if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) == -1 ) {
+ logadd( LOG_ERROR, "Memory allocation error for config filename" );
+ exit( 1 );
+ }
if ( name == NULL ) return;
if ( initialLoad ) {
- mutex_init( &loadLock );
+ mutex_init( &loadLock, LOCK_LOAD_CONFIG );
}
if ( mutex_trylock( &loadLock ) != 0 ) {
logadd( LOG_INFO, "Ignoring config reload request due to already running reload" );
@@ -123,9 +136,30 @@ void globals_loadConfig()
if ( initialLoad ) {
sanitizeFixedConfig();
}
- if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
- logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
- _sparseFiles = false;
+ if ( _isProxy ) {
+ if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
+ logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
+ _sparseFiles = false;
+ }
+ if ( _bgrWindowSize < 1 ) {
+ _bgrWindowSize = 1;
+ } else if ( _bgrWindowSize > UPLINK_MAX_QUEUE - 10 ) {
+ _bgrWindowSize = UPLINK_MAX_QUEUE - 10;
+ logadd( LOG_MINOR, "Limiting bgrWindowSize to %d, because of UPLINK_MAX_QUEUE",
+ _bgrWindowSize );
+ }
+ if ( _maxPayload < 256 * 1024 ) {
+ logadd( LOG_WARNING, "maxPayload was increased to 256k" );
+ _maxPayload = 256 * 1024;
+ }
+ if ( _maxPrefetch > _maxPayload ) {
+ logadd( LOG_WARNING, "Reducing maxPrefetch to maxPayload" );
+ _maxPrefetch = _maxPayload;
+ }
+ if ( _minRequestSize > _maxPayload ) {
+ logadd( LOG_WARNING, "Reducing minRequestSize to maxPayload" );
+ _minRequestSize = _maxPayload;
+ }
}
// Dump config as interpreted
char buffer[2000];
@@ -229,6 +263,15 @@ static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optnam
while ( *end == ' ' ) end++;
if ( *end == '\0' ) {
exp = 0;
+ } else if ( *end == 'm' ) {
+ exp = 1;
+ base = 60;
+ } else if ( *end == 'h' ) {
+ exp = 1;
+ base = 3600;
+ } else if ( *end == 'd' ) {
+ exp = 1;
+ base = 24 * 3600;
} else {
char *pos = strchr( units, *end > 'Z' ? (*end - 32) : *end );
if ( pos == NULL ) {
@@ -270,7 +313,7 @@ static bool parse32(const char *in, atomic_int *out, const char *optname)
return true;
}
-static bool parse32u(const char *in, atomic_int *out, const char *optname)
+static bool parse32u(const char *in, atomic_uint *out, const char *optname)
{
atomic_int_fast64_t v;
if ( !parse64( in, &v, optname ) ) return false;
@@ -278,7 +321,7 @@ static bool parse32u(const char *in, atomic_int *out, const char *optname)
logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)0, (int)INT_MAX, in );
return false;
}
- *out = (int)v;
+ *out = (unsigned int)v;
return true;
}
@@ -309,8 +352,10 @@ size_t globals_dumpConfig(char *buffer, size_t size)
PBOOL(backgroundReplication);
}
PINT(bgrMinClients);
+ PINT(bgrWindowSize);
PBOOL(lookupMissingForProxy);
PBOOL(sparseFiles);
+ PBOOL(ignoreAllocErrors);
PBOOL(removeMissingImages);
PINT(uplinkTimeout);
PINT(clientTimeout);
@@ -318,11 +363,14 @@ size_t globals_dumpConfig(char *buffer, size_t size)
PBOOL(vmdkLegacyMode);
PBOOL(proxyPrivateOnly);
PBOOL(pretendClient);
+ PINT(autoFreeDiskSpaceDelay);
P_ARG("[limits]\n");
PINT(maxClients);
PINT(maxImages);
PINT(maxPayload);
PUINT64(maxReplicationSize);
+ PINT(maxPrefetch);
+ PINT(minRequestSize);
return size - rem;
}
diff --git a/src/server/globals.h b/src/server/globals.h
index b248800..bde1184 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -1,101 +1,133 @@
#ifndef _GLOBALS_H_
#define _GLOBALS_H_
-#include "../types.h"
-#include "../shared/fdsignal.h"
-#include "../serverconfig.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/config/server.h>
#include <stdint.h>
#include <stdatomic.h>
#include <time.h>
#include <pthread.h>
+#include "reftypes.h"
typedef struct timespec ticks;
// ######### All structs/types used by the server ########
-typedef struct _dnbd3_connection dnbd3_connection_t;
+typedef struct _dnbd3_uplink dnbd3_uplink_t;
typedef struct _dnbd3_image dnbd3_image_t;
typedef struct _dnbd3_client dnbd3_client_t;
-// Slot is free, can be used.
-// Must only be set in uplink_handle_receive() or uplink_remove_client()
-#define ULR_FREE 0
-// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
-// Must only be set in uplink_request()
-#define ULR_NEW 1
-// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
-// Must only be set in uplink_mainloop() or uplink_request()
-#define ULR_PENDING 2
-// Slot is being processed, do not consider for hop on.
-// Must only be set in uplink_handle_receive()
-#define ULR_PROCESSING 3
-typedef struct
+typedef void (*uplink_callback)(void *data, uint64_t handle, uint64_t start, uint32_t length, const char *buffer);
+
+typedef struct _dnbd3_queue_client
+{
+ struct _dnbd3_queue_client *next;
+ void* data; // Passed back to callback
+ uint64_t handle; // Passed back to callback
+ uint64_t from, to; // Client range
+ uplink_callback callback; // Callback function
+} dnbd3_queue_client_t;
+
+typedef struct _dnbd3_queue_entry
{
- uint64_t handle; // Client defined handle to pass back in reply
- uint64_t from; // First byte offset of requested block (ie. 4096)
- uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
- dnbd3_client_t * client; // Client to send reply to
- int status; // status of this entry: ULR_*
-#ifdef _DEBUG
- ticks entered; // When this request entered the queue (for debugging)
+ struct _dnbd3_queue_entry *next;
+ uint64_t handle; // Our handle for this entry
+ uint64_t from; // First byte offset of requested block (ie. 4096)
+ uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+ dnbd3_queue_client_t *clients;
+#ifdef DEBUG
+ ticks entered; // When this request entered the queue (for debugging)
#endif
- uint8_t hopCount; // How many hops this request has already taken across proxies
-} dnbd3_queued_request_t;
+ uint8_t hopCount; // How many hops this request has already taken across proxies
+ bool sent; // Already sent to uplink?
+} dnbd3_queue_entry_t;
+
+typedef struct _ns
+{
+ struct _ns *next;
+ char *name;
+ size_t len;
+} dnbd3_ns_t;
+
+typedef struct
+{
+ int fails; // Hard fail: Connection failed
+ int rttIndex;
+ uint32_t rtt[SERVER_RTT_PROBES];
+ bool isPrivate, isClientOnly;
+ bool blocked; // If true count down fails until 0 to enable again
+ ticks lastFail; // Last hard fail
+ dnbd3_host_t host;
+ char comment[COMMENT_LENGTH];
+ _Atomic(dnbd3_ns_t *) nameSpaces; // Linked list of name spaces
+} dnbd3_alt_server_t;
+
+typedef struct
+{
+ int fails; // Soft fail: Image not found
+ int rttIndex;
+ uint32_t rtt[SERVER_RTT_PROBES];
+ bool blocked; // True if server is to be ignored and fails should be counted down
+ bool initDone;
+} dnbd3_alt_local_t;
+
+typedef struct {
+ int fd; // Socket fd for this connection
+ int version; // Protocol version of remote server
+ int index; // Entry in uplinks list
+} dnbd3_server_connection_t;
#define RTT_IDLE 0 // Not in progress
#define RTT_INPROGRESS 1 // In progess, not finished
#define RTT_DONTCHANGE 2 // Finished, but no better alternative found
#define RTT_DOCHANGE 3 // Finished, better alternative written to .betterServer + .betterFd
#define RTT_NOT_REACHABLE 4 // No uplink was reachable
-struct _dnbd3_connection
+struct _dnbd3_uplink
{
- int fd; // socket fd to remote server
- int version; // remote server protocol version
+ ref reference;
+ dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected
+ dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none
dnbd3_signal_t* signal; // used to wake up the process
pthread_t thread; // thread holding the connection
pthread_mutex_t sendMutex; // For locking socket while sending
pthread_mutex_t queueLock; // lock for synchronization on request queue etc.
dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer
- dnbd3_host_t currentServer; // Current server we're connected to
pthread_mutex_t rttLock; // When accessing rttTestResult, betterFd or betterServer
- int rttTestResult; // RTT_*
+ atomic_int rttTestResult; // RTT_*
int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD!
- int betterVersion; // protocol version of better server
- int betterFd; // Active connection to better server, ready to use
- dnbd3_host_t betterServer; // The better server
uint8_t *recvBuffer; // Buffer for receiving payload
uint32_t recvBufferLen; // Len of ^^
- volatile bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
+ atomic_bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
bool replicatedLastBlock; // bool telling if the last block has been replicated yet
bool cycleDetected; // connection cycle between proxies detected for current remote server
int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at
// If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
- uint64_t replicationHandle; // Handle of pending replication request
atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
+ atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
int queueLen; // length of queue
- uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives)
- dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+ int idleTime; // How many seconds the uplink was idle (apart from keep-alives)
+ dnbd3_queue_entry_t *queue;
+ atomic_uint_fast32_t queueId;
+ dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
};
typedef struct
{
- char comment[COMMENT_LENGTH];
- dnbd3_host_t host;
- unsigned int rtt[SERVER_RTT_PROBES];
- unsigned int rttIndex;
- bool isPrivate, isClientOnly;
- ticks lastFail;
- int numFails;
-} dnbd3_alt_server_t;
-
-typedef struct
-{
uint8_t host[16];
int bytes;
int bitMask;
int permissions;
} dnbd3_access_rule_t;
+typedef struct // Reference counted cache map of an image
+{
+ ref reference;
+ atomic_bool dirty; // Cache map has been modified outside uplink (only integrity checker for now)
+ bool unchanged; // NOTE(review): was described as "how many times in a row a reloaded cache map went unchanged", but a bool can only flag this, not count it - confirm intent
+ _Atomic uint8_t map[]; // Bitmap; image_updateCachemap() addresses byte (offset >> 15), bit ((offset >> 12) & 7), i.e. one bit per 4k block
+} dnbd3_cache_map_t;
+
/**
* Image struct. An image path could be something like
* /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1
@@ -106,35 +138,44 @@ struct _dnbd3_image
{
char *path; // absolute path of the image
char *name; // public name of the image (usually relative path minus revision ID)
- dnbd3_connection_t *uplink; // pointer to a server connection
- uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete
+ weakref uplinkref; // pointer to a server connection
+ weakref ref_cacheMap; // cache map telling which parts are locally cached, NULL if complete
uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k)
uint64_t realFilesize; // actual file size on disk
ticks atime; // last access time
- ticks lastWorkCheck; // last time a non-working image has been checked
ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image
uint32_t masterCrc32; // CRC-32 of the crc-32 list
int readFd; // used to read the image. Used from multiple threads, so use atomic operations (pread et al)
- int completenessEstimate; // Completeness estimate in percent
- int users; // clients currently using this image
+ atomic_int completenessEstimate; // Completeness estimate in percent
+ atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
- bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected
+ struct {
+ atomic_bool read; // Error reading from file
+ atomic_bool write; // Error writing to file
+ atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back
+ atomic_bool uplink; // No uplink connected
+ atomic_bool queue; // Too many requests waiting on uplink
+ } problem;
uint16_t rid; // revision of image
+ bool accessed; // image was accessed since .meta was written
pthread_mutex_t lock;
};
+#define PIMG(x) (x)->name, (int)(x)->rid
struct _dnbd3_client
{
#define HOSTNAMELEN (48)
atomic_uint_fast64_t bytesSent; // Byte counter for this client.
- dnbd3_image_t *image; // Image in use by this client, or NULL during handshake
+ dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake
int sock;
+ _Atomic uint8_t relayedCount; // How many requests are in-flight to the uplink server
bool isServer; // true if a server in proxy mode, false if real client
dnbd3_host_t host;
char hostName[HOSTNAMELEN]; // inet_ntop version of host
pthread_mutex_t sendMutex; // Held while writing to sock if image is incomplete (since uplink uses socket too)
pthread_mutex_t lock;
+ pthread_t thread;
};
// #######################################################
@@ -188,12 +229,12 @@ extern atomic_bool _removeMissingImages;
/**
* Read timeout when waiting for or sending data on an uplink
*/
-extern atomic_int _uplinkTimeout;
+extern atomic_uint _uplinkTimeout;
/**
* Read timeout when waiting for or sending data from/to client
*/
-extern atomic_int _clientTimeout;
+extern atomic_uint _clientTimeout;
/**
* If true, images with no active client will have their fd closed after some
@@ -216,6 +257,11 @@ extern atomic_int _backgroundReplication;
extern atomic_int _bgrMinClients;
/**
+ * How many in-flight replication requests we should target (per uplink)
+ */
+extern atomic_int _bgrWindowSize;
+
+/**
* (In proxy mode): If connecting client is a proxy, and the requested image
* is not known locally, should we ask our known alt servers for it?
* Otherwise the request is rejected.
@@ -237,6 +283,12 @@ extern atomic_bool _lookupMissingForProxy;
extern atomic_bool _sparseFiles;
/**
+ * If true, don't abort image replication if preallocating
+ * the image fails, but retry with sparse file.
+ */
+extern atomic_bool _ignoreAllocErrors;
+
+/**
* Port to listen on (default: #define PORT (5003))
*/
extern atomic_int _listenPort;
@@ -257,7 +309,7 @@ extern atomic_int _maxImages;
* Usually this isn't even a megabyte for "real" clients (blockdev
* or fuse).
*/
-extern atomic_int _maxPayload;
+extern atomic_uint _maxPayload;
/**
* If in proxy mode, don't replicate images that are
@@ -273,6 +325,28 @@ extern atomic_uint_fast64_t _maxReplicationSize;
extern atomic_bool _pretendClient;
/**
+ * Minimum uptime in seconds before proxy starts deleting old
+ * images if running out of space. -1 disables automatic deletion.
+ * Only relevant in proxy mode.
+ */
+extern atomic_int _autoFreeDiskSpaceDelay;
+
+/**
+ * When handling a client request, this sets the maximum amount
+ * of bytes we prefetch, starting right at the end of the client request.
+ * The prefetch size will be MIN( length * 3, _maxPrefetch ), if
+ * length <= _maxPrefetch, so effectively, setting this to 0 disables
+ * any prefetching.
+ */
+extern atomic_uint _maxPrefetch;
+
+/**
+ * Use with care. Can severely degrade performance.
+ * Set either 0 or very high.
+ */
+extern atomic_uint _minRequestSize;
+
+/**
* Load the server configuration.
*/
void globals_loadConfig();
diff --git a/src/server/helper.h b/src/server/helper.h
index 102cb36..3e1b661 100644
--- a/src/server/helper.h
+++ b/src/server/helper.h
@@ -2,8 +2,8 @@
#define HELPER_H_
#include "server.h"
-#include "../shared/log.h"
-#include "../types.h"
+#include <dnbd3/shared/log.h>
+#include <dnbd3/types.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
diff --git a/src/server/image.c b/src/server/image.c
index bfba6cb..51fd5b6 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -5,9 +5,10 @@
#include "locks.h"
#include "integrity.h"
#include "altservers.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../shared/crc32.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/crc32.h>
+#include "reference.h"
#include <assert.h>
#include <fcntl.h>
@@ -45,29 +46,44 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image);
static dnbd3_image_t* image_free(dnbd3_image_t *image);
static bool image_load_all_internal(char *base, char *path);
static bool image_addToList(dnbd3_image_t *image);
-static bool image_load(char *base, char *path, int withUplink);
+static bool image_load(char *base, char *path, bool withUplink);
static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize);
static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
static bool image_ensureDiskSpace(uint64_t size, bool force);
-static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
+static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map);
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
+static void* closeUnusedFds(void*);
+static bool isImageFromUpstream(dnbd3_image_t *image);
+static void* saveLoadAllCacheMaps(void*);
+static void saveCacheMap(dnbd3_image_t *image);
+static void allocCacheMap(dnbd3_image_t *image, bool complete);
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime);
+static void loadImageMeta(dnbd3_image_t *image);
+
+static void cmfree(ref *ref) // Frees the dnbd3_cache_map_t that embeds `ref`; presumably the ref's release callback - confirm where it is registered
+{
+ dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference); // `ref` points at the map's `reference` member, step back to the containing struct
+ logadd( LOG_DEBUG2, "Freeing a cache map" );
+ free( cache );
+}
// ##########################################
void image_serverStartup()
{
srand( (unsigned int)time( NULL ) );
- mutex_init( &imageListLock );
- mutex_init( &remoteCloneLock );
- mutex_init( &reloadLock );
+ mutex_init( &imageListLock, LOCK_IMAGE_LIST );
+ mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
+ mutex_init( &reloadLock, LOCK_RELOAD );
+ server_addJob( &closeUnusedFds, NULL, 10, 900 );
+ server_addJob( &saveLoadAllCacheMaps, NULL, 9, 20 );
}
/**
* Update cache-map of given image for the given byte range
* start (inclusive) - end (exclusive)
- * Locks on: images[].lock
*/
void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set)
{
@@ -88,32 +104,54 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
if ( start >= end )
return;
bool setNewBlocks = false;
- uint64_t pos = start;
- mutex_lock( &image->lock );
- if ( image->cache_map == NULL ) {
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL ) {
// Image seems already complete
if ( set ) {
// This makes no sense
- mutex_unlock( &image->lock );
- logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path );
+ logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path );
return;
}
// Recreate a cache map, set it to all 1 initially as we assume the image was complete
- const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
- image->cache_map = malloc( byteSize );
- memset( image->cache_map, 0xff, byteSize );
- }
- while ( pos < end ) {
- const size_t map_y = (int)( pos >> 15 );
- const int map_x = (int)( (pos >> 12) & 7 ); // mod 8
- const int bit_mask = 1 << map_x;
+ allocCacheMap( image, true );
+ cache = ref_get_cachemap( image );
+ if ( cache == NULL ) {
+ logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! %s", image->path );
+ return;
+ }
+ }
+ // Set/unset
+ const uint64_t firstByteInMap = start >> 15;
+ const uint64_t lastByteInMap = (end - 1) >> 15;
+ uint64_t pos;
+ // First and last byte masks
+ const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+ const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
+ if ( firstByteInMap == lastByteInMap ) {
+ if ( set ) {
+ uint8_t o = atomic_fetch_or( &cache->map[firstByteInMap], (uint8_t)(fb & lb) );
+ setNewBlocks = o != ( o | (fb & lb) );
+ } else {
+ atomic_fetch_and( &cache->map[firstByteInMap], (uint8_t)~(fb & lb) );
+ }
+ } else {
+ atomic_thread_fence( memory_order_acquire );
if ( set ) {
- if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true;
- image->cache_map[map_y] |= (uint8_t)bit_mask;
+ uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
+ uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
+ setNewBlocks = ( fo != ( fo | fb ) || lo != ( lo | lb ) );
} else {
- image->cache_map[map_y] &= (uint8_t)~bit_mask;
+ atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
+ atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+ }
+ // Everything in between
+ const uint8_t nval = set ? 0xff : 0;
+ for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+ if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+ setNewBlocks = true;
+ }
}
- pos += DNBD3_BLOCK_SIZE;
+ atomic_thread_fence( memory_order_release );
}
if ( setNewBlocks && image->crc32 != NULL ) {
// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
@@ -122,19 +160,16 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
// First set start and end to borders of hash blocks
start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1);
end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1);
- pos = start;
- while ( pos < end ) {
- if ( image->cache_map == NULL ) break;
+ for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) {
const int block = (int)( pos / HASH_BLOCK_SIZE );
- if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) {
- mutex_unlock( &image->lock );
- integrity_check( image, block );
- mutex_lock( &image->lock );
+ if ( image_isHashBlockComplete( cache, block, image->realFilesize ) ) {
+ integrity_check( image, block, false );
}
- pos += HASH_BLOCK_SIZE;
}
+ } else if ( !set ) {
+ cache->dirty = true;
}
- mutex_unlock( &image->lock );
+ ref_put( &cache->reference );
}
/**
@@ -146,20 +181,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
bool image_isComplete(dnbd3_image_t *image)
{
assert( image != NULL );
- mutex_lock( &image->lock );
if ( image->virtualFilesize == 0 ) {
- mutex_unlock( &image->lock );
return false;
}
- if ( image->cache_map == NULL ) {
- mutex_unlock( &image->lock );
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL ) {
return true;
}
bool complete = true;
int j;
const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
for (j = 0; j < map_len_bytes - 1; ++j) {
- if ( image->cache_map[j] != 0xFF ) {
+ if ( cache->map[j] != 0xFF ) {
complete = false;
break;
}
@@ -174,18 +207,27 @@ bool image_isComplete(dnbd3_image_t *image)
for (j = 0; j < blocks_in_last_byte; ++j)
last_byte |= (uint8_t)(1 << j);
}
- complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte);
+ complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte);
}
- if ( !complete ) {
- mutex_unlock( &image->lock );
+ ref_put( &cache->reference );
+ if ( !complete )
return false;
+ mutex_lock( &image->lock );
+ // Lock and make sure current cache map is still the one we saw complete
+ dnbd3_cache_map_t *current = ref_get_cachemap( image );
+ if ( current == cache ) {
+ // Set cache map NULL as it's complete
+ ref_setref( &image->ref_cacheMap, NULL );
+ }
+ if ( current != NULL ) {
+ ref_put( &current->reference );
}
- char mapfile[PATHLEN] = "";
- free( image->cache_map );
- image->cache_map = NULL;
- snprintf( mapfile, PATHLEN, "%s.map", image->path );
mutex_unlock( &image->lock );
- unlink( mapfile );
+ if ( current == cache ) { // Successfully set cache map to NULL above
+ char mapfile[PATHLEN] = "";
+ snprintf( mapfile, PATHLEN, "%s.map", image->path );
+ unlink( mapfile );
+ }
return true;
}
@@ -201,49 +243,105 @@ bool image_isComplete(dnbd3_image_t *image)
*/
bool image_ensureOpen(dnbd3_image_t *image)
{
- if ( image->readFd != -1 ) return image;
- int newFd = open( image->path, O_RDONLY );
- if ( newFd != -1 ) {
- // Check size
+ bool sizeChanged = false;
+ if ( image->readFd != -1 && !image->problem.changed )
+ return true;
+ int newFd = image->readFd == -1 ? open( image->path, O_RDONLY ) : dup( image->readFd );
+ if ( newFd == -1 ) {
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "[access] Cannot open '%s' for reading (errno=%d)", image->path, errno );
+ image->problem.read = true;
+ }
+ } else {
+ // Check size + read access
+ char buffer[100];
const off_t flen = lseek( newFd, 0, SEEK_END );
if ( flen == -1 ) {
- logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "Could not seek to end of %s (errno=%d)", image->path, errno );
+ image->problem.read = true;
+ }
close( newFd );
newFd = -1;
} else if ( (uint64_t)flen != image->realFilesize ) {
- logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen );
+ if ( !image->problem.changed ) {
+ logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64,
+ image->realFilesize, (uint64_t)flen );
+ }
+ sizeChanged = true;
+ } else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) {
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)",
+ (int)sizeof(buffer), image->path, errno );
+ image->problem.read = true;
+ }
close( newFd );
newFd = -1;
}
}
if ( newFd == -1 ) {
- mutex_lock( &image->lock );
- image->working = false;
- mutex_unlock( &image->lock );
+ if ( sizeChanged ) {
+ image->problem.changed = true;
+ }
return false;
}
+
+ // Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32,
+ // but only if the size we just got above is correct.
+ if ( image->problem.changed && !sizeChanged ) {
+ if ( image->crc32 == NULL ) {
+ // Cannot verify further, hope for the best
+ image->problem.changed = false;
+ logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", PIMG(image) );
+ } else if ( image_checkRandomBlocks( image, 1, newFd ) ) {
+ // This should have checked the first block (if complete) -> All is well again
+ image->problem.changed = false;
+ logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", PIMG(image) );
+ }
+ } else {
+ image->problem.changed = sizeChanged;
+ }
+
mutex_lock( &image->lock );
if ( image->readFd == -1 ) {
image->readFd = newFd;
+ image->problem.read = false;
mutex_unlock( &image->lock );
} else {
- // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed
+ // There was a race while opening the file (happens cause not locked cause blocking),
+ // we lost the race so close new fd and proceed.
+ // *OR* we dup()'ed above for cheating when the image changed before.
mutex_unlock( &image->lock );
close( newFd );
}
return image->readFd != -1;
}
+dnbd3_image_t* image_byId(int imgId)
+{
+ int i;
+ mutex_lock( &imageListLock );
+ for (i = 0; i < _num_images; ++i) {
+ dnbd3_image_t * const image = _images[i];
+ if ( image != NULL && image->id == imgId ) {
+ image->users++;
+ mutex_unlock( &imageListLock );
+ return image;
+ }
+ }
+ mutex_unlock( &imageListLock );
+ return NULL;
+}
+
/**
* Get an image by name+rid. This function increases a reference counter,
* so you HAVE TO CALL image_release for every image_get() call at some
* point...
* Locks on: imageListLock, _images[].lock
*/
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
+dnbd3_image_t* image_get(const char *name, uint16_t revision, bool ensureFdOpen)
{
int i;
- const char *removingText = _removeMissingImages ? ", removing from list" : "";
dnbd3_image_t *candidate = NULL;
// Simple sanity check
const size_t slen = strlen( name );
@@ -267,119 +365,65 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
return NULL ;
}
- mutex_lock( &candidate->lock );
- mutex_unlock( &imageListLock );
candidate->users++;
- mutex_unlock( &candidate->lock );
-
- // Found, see if it works
-// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
-// TODO: But remember size-changed images forever
- if ( candidate->working || checkIfWorking ) {
- // Is marked working, but might not have an fd open
- if ( !image_ensureOpen( candidate ) ) {
- mutex_lock( &candidate->lock );
- timing_get( &candidate->lastWorkCheck );
- mutex_unlock( &candidate->lock );
- if ( _removeMissingImages ) {
- candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller
- }
- return candidate;
- }
- }
-
- if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state
-
- // ...not working...
+ mutex_unlock( &imageListLock );
- // Don't re-check too often
- mutex_lock( &candidate->lock );
- bool check;
- declare_now;
- check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS;
- if ( check ) {
- candidate->lastWorkCheck = now;
- }
- mutex_unlock( &candidate->lock );
- if ( !check ) {
+ if ( !ensureFdOpen ) // Don't want to re-check
return candidate;
- }
- // reaching this point means:
- // 1) We should check if the image is working, it might or might not be in working state right now
- // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.)
- // 3) We made sure not to re-check this image too often
-
- // Common for ro and rw images: Size check, read check
- const off_t len = lseek( candidate->readFd, 0, SEEK_END );
- bool reload = false;
- if ( len == -1 ) {
- logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
- reload = true;
- } else if ( (uint64_t)len != candidate->realFilesize ) {
- logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
- ". Try sending SIGHUP to server if you know what you're doing.",
- candidate->path, candidate->realFilesize, (uint64_t)len );
- } else {
- // Seek worked, file size is same, now see if we can read from file
- char buffer[100];
- if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
- logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.",
- (int)sizeof(buffer), candidate->path, errno, removingText );
- reload = true;
- } else if ( !candidate->working ) {
- // Seems everything is fine again \o/
- candidate->working = true;
- logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid );
- }
- }
+ if ( image_ensureOpen( candidate ) && !candidate->problem.read )
+ return candidate; // We have a read fd and no read or changed problems
- if ( reload ) {
+ // -- image could not be opened again, or is open but has problem --
+
+ if ( _removeMissingImages && !file_isReadable( candidate->path ) ) {
+ candidate = image_remove( candidate );
+ // No image_release here, the image is still returned and should be released by caller
+ } else if ( candidate->readFd != -1 ) {
+ // We cannot just close the fd as it might be in use. Make a copy and remove old entry.
+ candidate = image_remove( candidate );
// Could not access the image with exising fd - mark for reload which will re-open the file.
// make a copy of the image struct but keep the old one around. If/When it's not being used
// anymore, it will be freed automatically.
+ logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path );
dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
img->path = strdup( candidate->path );
img->name = strdup( candidate->name );
img->virtualFilesize = candidate->virtualFilesize;
img->realFilesize = candidate->realFilesize;
- img->atime = now;
+ timing_get( &img->atime );
img->masterCrc32 = candidate->masterCrc32;
img->readFd = -1;
img->rid = candidate->rid;
img->users = 1;
- img->working = false;
- mutex_init( &img->lock );
+ img->problem.read = true;
+ img->problem.changed = candidate->problem.changed;
+ img->ref_cacheMap = NULL;
+ mutex_init( &img->lock, LOCK_IMAGE );
if ( candidate->crc32 != NULL ) {
const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t);
img->crc32 = malloc( mb );
memcpy( img->crc32, candidate->crc32, mb );
}
- mutex_lock( &candidate->lock );
- if ( candidate->cache_map != NULL ) {
- const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize );
- img->cache_map = malloc( mb );
- memcpy( img->cache_map, candidate->cache_map, mb );
+ dnbd3_cache_map_t *cache = ref_get_cachemap( candidate );
+ if ( cache != NULL ) {
+ ref_setref( &img->ref_cacheMap, &cache->reference );
+ ref_put( &cache->reference );
}
- mutex_unlock( &candidate->lock );
if ( image_addToList( img ) ) {
image_release( candidate );
candidate = img;
+ // Check if image is incomplete, initialize uplink
+ if ( candidate->ref_cacheMap != NULL ) {
+ uplink_init( candidate, -1, NULL, -1 );
+ }
+ // Try again with new instance
+ image_ensureOpen( candidate );
} else {
img->users = 0;
image_free( img );
}
- // readFd == -1 and working == FALSE at this point,
- // this function needs some splitting up for handling as we need to run most
- // of the above code again. for now we know that the next call for this
- // name:rid will get ne newly inserted "img" and try to re-open the file.
- }
-
- // Check if image is incomplete, handle
- if ( candidate->cache_map != NULL ) {
- if ( candidate->uplink == NULL ) {
- uplink_init( candidate, -1, NULL, -1 );
- }
+ // readFd == -1 and problem.read == true
}
return candidate; // We did all we can, hopefully it's working
@@ -391,17 +435,16 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
* Every call to image_lock() needs to be followed by a call to image_release() at some point.
* Locks on: imageListLock, _images[].lock
*/
-dnbd3_image_t* image_lock(dnbd3_image_t *image) // TODO: get rid, fix places that do image->users--
+dnbd3_image_t* image_lock(dnbd3_image_t *image)
{
if ( image == NULL ) return NULL ;
int i;
mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
if ( _images[i] == image ) {
- mutex_lock( &image->lock );
- mutex_unlock( &imageListLock );
+ assert( _images[i]->id == image->id );
image->users++;
- mutex_unlock( &image->lock );
+ mutex_unlock( &imageListLock );
return image;
}
}
@@ -419,12 +462,9 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
{
if ( image == NULL ) return NULL;
mutex_lock( &imageListLock );
- mutex_lock( &image->lock );
assert( image->users > 0 );
- image->users--;
- bool inUse = image->users != 0;
- mutex_unlock( &image->lock );
- if ( inUse ) { // Still in use, do nothing
+ // Decrement and check for 0
+ if ( --image->users != 0 ) { // Still in use, do nothing
mutex_unlock( &imageListLock );
return NULL;
}
@@ -433,13 +473,14 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
// responsible for freeing it
for (int i = 0; i < _num_images; ++i) {
if ( _images[i] == image ) { // Found, do nothing
+ assert( _images[i]->id == image->id );
mutex_unlock( &imageListLock );
return NULL;
}
}
mutex_unlock( &imageListLock );
// So it wasn't in the images list anymore either, get rid of it
- if ( !inUse ) image = image_free( image );
+ image = image_free( image );
return NULL;
}
@@ -470,15 +511,14 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image)
{
bool mustFree = false;
mutex_lock( &imageListLock );
- mutex_lock( &image->lock );
for ( int i = _num_images - 1; i >= 0; --i ) {
if ( _images[i] == image ) {
+ assert( _images[i]->id == image->id );
_images[i] = NULL;
mustFree = ( image->users == 0 );
}
if ( _images[i] == NULL && i + 1 == _num_images ) _num_images--;
}
- mutex_unlock( &image->lock );
mutex_unlock( &imageListLock );
if ( mustFree ) image = image_free( image );
return image;
@@ -493,17 +533,7 @@ void image_killUplinks()
mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
if ( _images[i] == NULL ) continue;
- mutex_lock( &_images[i]->lock );
- if ( _images[i]->uplink != NULL ) {
- mutex_lock( &_images[i]->uplink->queueLock );
- if ( !_images[i]->uplink->shutdown ) {
- thread_detach( _images[i]->uplink->thread );
- _images[i]->uplink->shutdown = true;
- }
- mutex_unlock( &_images[i]->uplink->queueLock );
- signal_call( _images[i]->uplink->signal );
- }
- mutex_unlock( &_images[i]->lock );
+ uplink_shutdown( _images[i] );
}
mutex_unlock( &imageListLock );
}
@@ -542,18 +572,14 @@ bool image_loadAll(char *path)
// Lock again, see if image is still there, free if required
mutex_lock( &imageListLock );
if ( ret || i >= _num_images || _images[i] == NULL || _images[i]->id != imgId ) continue;
- // Image needs to be removed
+ // File not readable but still in list -- needs to be removed
imgHandle = _images[i];
_images[i] = NULL;
if ( i + 1 == _num_images ) _num_images--;
- mutex_lock( &imgHandle->lock );
- const bool freeImg = ( imgHandle->users == 0 );
- mutex_unlock( &imgHandle->lock );
- // We unlocked, but the image has been removed from the list already, so
- // there's no way the users-counter can increase at this point.
- if ( freeImg ) {
+ if ( imgHandle->users == 0 ) {
// Image is not in use anymore, free the dangling entry immediately
- mutex_unlock( &imageListLock ); // image_free might do several fs operations; unlock
+ mutex_unlock( &imageListLock ); // image_free locks on this, and
+ // might do several fs operations; unlock
image_free( imgHandle );
mutex_lock( &imageListLock );
}
@@ -581,12 +607,10 @@ bool image_tryFreeAll()
{
mutex_lock( &imageListLock );
for (int i = _num_images - 1; i >= 0; --i) {
- if ( _images[i] != NULL && _images[i]->users == 0 ) { // XXX Data race...
+ if ( _images[i] != NULL && _images[i]->users == 0 ) {
dnbd3_image_t *image = _images[i];
_images[i] = NULL;
- mutex_unlock( &imageListLock );
image = image_free( image );
- mutex_lock( &imageListLock );
}
if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--;
}
@@ -596,37 +620,44 @@ bool image_tryFreeAll()
/**
* Free image. DOES NOT check if it's in use.
- * Indirectly locks on imageListLock, image.lock, uplink.queueLock
+ * (Indirectly) locks on image.lock, uplink.queueLock
*/
static dnbd3_image_t* image_free(dnbd3_image_t *image)
{
assert( image != NULL );
- if ( !_shutdown ) {
- logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid );
+ assert( image->users == 0 );
+ logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", PIMG(image) );
+ // uplink_shutdown might return false to tell us
+ // that the shutdown is in progress. Bail out since
+ // this will get called again when the uplink is done.
+ if ( !uplink_shutdown( image ) )
+ return NULL;
+ if ( isImageFromUpstream( image ) ) {
+ saveMetaData( image, NULL, 0 );
+ if ( image->ref_cacheMap != NULL ) {
+ saveCacheMap( image );
+ }
}
- //
- uplink_shutdown( image );
mutex_lock( &image->lock );
- free( image->cache_map );
+ ref_setref( &image->ref_cacheMap, NULL );
free( image->crc32 );
free( image->path );
free( image->name );
- image->cache_map = NULL;
image->crc32 = NULL;
image->path = NULL;
image->name = NULL;
mutex_unlock( &image->lock );
if ( image->readFd != -1 ) close( image->readFd );
mutex_destroy( &image->lock );
- //
- memset( image, 0, sizeof(*image) );
free( image );
return NULL ;
}
-bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
+bool image_isHashBlockComplete(dnbd3_cache_map_t * const cache, const uint64_t block, const uint64_t realFilesize)
{
- if ( cacheMap == NULL ) return true;
+ if ( cache == NULL )
+ return true;
+ const atomic_uint_least8_t *cacheMap = cache->map;
const uint64_t end = (block + 1) * HASH_BLOCK_SIZE;
if ( end <= realFilesize ) {
// Trivial case: block in question is not the last block (well, or image size is multiple of HASH_BLOCK_SIZE)
@@ -671,7 +702,8 @@ static bool image_load_all_internal(char *base, char *path)
while ( !_shutdown && (entryPtr = readdir( dir )) != NULL ) {
entry = *entryPtr;
- if ( strcmp( entry.d_name, "." ) == 0 || strcmp( entry.d_name, ".." ) == 0 ) continue;
+ if ( entry.d_name[0] == '.' )
+ continue; // No hidden files, no . or ..
if ( strlen( entry.d_name ) > SUBDIR_LEN ) {
logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry.d_name, (int)SUBDIR_LEN );
continue;
@@ -688,7 +720,7 @@ static bool image_load_all_internal(char *base, char *path)
if ( S_ISDIR( st.st_mode ) ) {
image_load_all_internal( base, subpath ); // Recurse
} else if ( !isForbiddenExtension( subpath ) ) {
- image_load( base, subpath, true ); // Load image if possible
+ image_load( base, subpath, false ); // Load image if possible
}
}
closedir( dir );
@@ -727,11 +759,10 @@ static bool image_addToList(dnbd3_image_t *image)
* Note that this is NOT THREAD SAFE so make sure its always
* called on one thread only.
*/
-static bool image_load(char *base, char *path, int withUplink)
+static bool image_load(char *base, char *path, bool withUplink)
{
int revision = -1;
- struct stat st;
- uint8_t *cache_map = NULL;
+ dnbd3_cache_map_t *cache = NULL;
uint32_t *crc32list = NULL;
dnbd3_image_t *existing = NULL;
int fdImage = -1;
@@ -795,7 +826,9 @@ static bool image_load(char *base, char *path, int withUplink)
fdImage = open( path, O_RDONLY );
}
if ( fdImage == -1 ) {
- logadd( LOG_ERROR, "Could not open '%s' for reading...", path );
+ if ( errno != ENOENT ) {
+ logadd( LOG_ERROR, "[load] Cannot open '%s' for reading (errno=%d)", path, errno );
+ }
goto load_error;
}
// Determine file size
@@ -814,45 +847,36 @@ static bool image_load(char *base, char *path, int withUplink)
}
// 1. Allocate memory for the cache map if the image is incomplete
- cache_map = image_loadCacheMap( path, virtualFilesize );
+ cache = image_loadCacheMap( path, virtualFilesize );
// XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented)
// 2. Load CRC-32 list of image
- bool doFullCheck = false;
uint32_t masterCrc = 0;
const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( virtualFilesize );
crc32list = image_loadCrcList( path, virtualFilesize, &masterCrc );
- // Check CRC32
- if ( crc32list != NULL ) {
- if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) {
- logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path );
- doFullCheck = true;
- }
- }
-
// Compare data just loaded to identical image we apparently already loaded
if ( existing != NULL ) {
if ( existing->realFilesize != realFilesize ) {
- logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", PIMG(existing) );
// Image will be replaced below
} else if ( existing->crc32 != NULL && crc32list != NULL
&& memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) {
- logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", PIMG(existing) );
logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." );
logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" );
// Image will be replaced below
} else if ( existing->crc32 == NULL && crc32list != NULL ) {
- logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid );
+ logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", PIMG(existing) );
existing->crc32 = crc32list;
existing->masterCrc32 = masterCrc;
crc32list = NULL;
function_return = true;
goto load_error; // Keep existing
- } else if ( existing->cache_map != NULL && cache_map == NULL ) {
+ } else if ( existing->ref_cacheMap != NULL && cache == NULL ) {
// Just ignore that fact, if replication is really complete the cache map will be removed anyways
- logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
+ logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", PIMG(existing) );
function_return = true;
goto load_error; // Keep existing
} else {
@@ -870,41 +894,33 @@ static bool image_load(char *base, char *path, int withUplink)
dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) );
image->path = strdup( path );
image->name = strdup( imgName );
- image->cache_map = cache_map;
+ image->ref_cacheMap = NULL;
+ ref_setref( &image->ref_cacheMap, &cache->reference );
image->crc32 = crc32list;
image->masterCrc32 = masterCrc;
- image->uplink = NULL;
+ image->uplinkref = NULL;
image->realFilesize = realFilesize;
image->virtualFilesize = virtualFilesize;
image->rid = (uint16_t)revision;
image->users = 0;
image->readFd = -1;
- image->working = (image->cache_map == NULL );
timing_get( &image->nextCompletenessEstimate );
image->completenessEstimate = -1;
- mutex_init( &image->lock );
- int32_t offset;
- if ( stat( path, &st ) == 0 ) {
- // Negatively offset atime by file modification time
- offset = (int32_t)( st.st_mtime - time( NULL ) );
- if ( offset > 0 ) offset = 0;
- } else {
- offset = 0;
- }
- timing_gets( &image->atime, offset );
+ mutex_init( &image->lock, LOCK_IMAGE );
+ loadImageMeta( image );
// Prevent freeing in cleanup
- cache_map = NULL;
+ cache = NULL;
crc32list = NULL;
// Get rid of cache map if image is complete
- if ( image->cache_map != NULL ) {
+ if ( image->ref_cacheMap != NULL ) {
image_isComplete( image );
}
// Image is definitely incomplete, initialize uplink worker
- if ( image->cache_map != NULL ) {
- image->working = false;
+ if ( image->ref_cacheMap != NULL ) {
+ image->problem.uplink = true;
if ( withUplink ) {
uplink_init( image, -1, NULL, -1 );
}
@@ -915,40 +931,37 @@ static bool image_load(char *base, char *path, int withUplink)
if ( image_addToList( image ) ) {
// Keep fd for reading
fdImage = -1;
+ // Check CRC32
+ image_checkRandomBlocks( image, 4, -1 );
} else {
logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
image->readFd = -1; // Keep fdImage instead, will be closed below
image = image_free( image );
goto load_error;
}
- logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid );
- // CRC errors found...
- if ( doFullCheck ) {
- logadd( LOG_INFO, "Queueing full CRC32 check for '%s:%d'\n", image->name, (int)image->rid );
- integrity_check( image, -1 );
- }
-
+ logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", PIMG(image) );
function_return = true;
// Clean exit:
load_error: ;
if ( existing != NULL ) existing = image_release( existing );
if ( crc32list != NULL ) free( crc32list );
- if ( cache_map != NULL ) free( cache_map );
+ if ( cache != NULL ) free( cache );
if ( fdImage != -1 ) close( fdImage );
return function_return;
}
-static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
+static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
{
- uint8_t *retval = NULL;
+ dnbd3_cache_map_t *retval = NULL;
char mapFile[strlen( imagePath ) + 10 + 1];
sprintf( mapFile, "%s.map", imagePath );
int fdMap = open( mapFile, O_RDONLY );
- if ( fdMap >= 0 ) {
+ if ( fdMap != -1 ) {
const int map_size = IMGSIZE_TO_MAPBYTES( fileSize );
- retval = calloc( 1, map_size );
- const ssize_t rd = read( fdMap, retval, map_size );
+ retval = calloc( 1, sizeof(*retval) + map_size );
+ ref_init( &retval->reference, cmfree, 0 );
+ const ssize_t rd = read( fdMap, retval->map, map_size );
if ( map_size != rd ) {
logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath );
// Could not read complete map, that means the rest of the image file will be considered incomplete
@@ -1009,18 +1022,35 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
return retval;
}
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map)
+/**
+ * Check up to count random blocks from given image. If fromFd is -1, the check will
+ * be run asynchronously using the integrity checker. Otherwise, the check will
+ * happen in the function and return the result of the check.
+ * @param image image to check
+ * @param count number of blocks to check (max)
+ * @param fromFd, check synchronously and use this fd for reading, -1 = async
+ * @return true = OK, false = error. Meaningless if fromFd == -1
+ */
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd)
{
+ if ( image->crc32 == NULL )
+ return true;
// This checks the first block and (up to) count - 1 random blocks for corruption
// via the known crc32 list. This is very sloppy and is merely supposed to detect
// accidental corruption due to broken dnbd3-proxy functionality or file system
- // corruption.
+ // corruption, or people replacing/updating images which is a very stupid thing.
assert( count > 0 );
- const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( realFilesize );
- int blocks[count + 1];
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize );
+ int blocks[count+1]; // +1 for "-1" in sync case
int index = 0, j;
int block;
- if ( image_isHashBlockComplete( cache_map, 0, realFilesize ) ) blocks[index++] = 0;
+ if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) {
+ blocks[index++] = 0;
+ }
+ if ( hashBlocks > 1 && image_isHashBlockComplete( cache, hashBlocks - 1, image->virtualFilesize ) ) {
+ blocks[index++] = hashBlocks - 1;
+ }
int tries = count * 5; // Try only so many times to find a non-duplicate complete block
while ( index + 1 < count && --tries > 0 ) {
block = rand() % hashBlocks; // Random block
@@ -1028,11 +1058,24 @@ static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t
if ( blocks[j] == block ) goto while_end;
}
// Block complete? If yes, add to list
- if ( image_isHashBlockComplete( cache_map, block, realFilesize ) ) blocks[index++] = block;
+ if ( image_isHashBlockComplete( cache, block, image->virtualFilesize ) ) {
+ blocks[index++] = block;
+ }
while_end: ;
}
- blocks[MIN(index, count)] = -1; // End of array has to be marked by a -1
- return image_checkBlocksCrc32( fdImage, crc32list, blocks, realFilesize ); // Return result of check
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ }
+ if ( fromFd == -1 ) {
+ // Async
+ for ( int i = 0; i < index; ++i ) {
+ integrity_check( image, blocks[i], true );
+ }
+ return true;
+ }
+ // Sync
+ blocks[index] = -1;
+ return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize );
}
/**
@@ -1047,7 +1090,7 @@ bool image_create(char *image, int revision, uint64_t size)
logadd( LOG_ERROR, "revision id invalid: %d", revision );
return false;
}
- char path[PATHLEN], cache[PATHLEN];
+ char path[PATHLEN], cache[PATHLEN+4];
char *lastSlash = strrchr( image, '/' );
if ( lastSlash == NULL ) {
snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
@@ -1058,7 +1101,7 @@ bool image_create(char *image, int revision, uint64_t size)
*lastSlash = '/';
snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
}
- snprintf( cache, PATHLEN, "%s.map", path );
+ snprintf( cache, PATHLEN+4, "%s.map", path );
size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
const int mapsize = IMGSIZE_TO_MAPBYTES(size);
// Write files
@@ -1079,14 +1122,19 @@ bool image_create(char *image, int revision, uint64_t size)
logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err );
}
// Now write image
+ bool fallback = false;
if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) {
logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk"
" space without actually writing all zeroes to the block device." );
logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect"
" divine performance during replication." );
- goto failure_cleanup;
- } else if ( _sparseFiles && !file_setSize( fdImage, size ) ) {
+ if ( !_ignoreAllocErrors ) {
+ goto failure_cleanup;
+ }
+ fallback = true;
+ }
+ if ( ( _sparseFiles || fallback ) && !file_setSize( fdImage, size ) ) {
logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." );
goto failure_cleanup;
@@ -1111,8 +1159,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
* revision 0 is requested, it will:
* a) Try to clone it from an authoritative dnbd3 server, if
* the server is running in proxy mode.
- * b) Try to load it from disk by constructing the appropriate file name, if not
- * running in proxy mode.
+ * b) Try to load it from disk by constructing the appropriate file name.
*
* If the return value is not NULL,
* image_release needs to be called on the image at some point.
@@ -1120,21 +1167,29 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
*/
dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
{
+ dnbd3_image_t *image;
// specific revision - try shortcut
if ( revision != 0 ) {
- dnbd3_image_t *image = image_get( name, revision, true );
- if ( image != NULL ) return image;
+ image = image_get( name, revision, true );
+ if ( image != NULL )
+ return image;
}
const size_t len = strlen( name );
// Sanity check
if ( len == 0 || name[len - 1] == '/' || name[0] == '/'
|| name[0] == '.' || strstr( name, "/." ) != NULL ) return NULL;
- // Call specific function depending on whether this is a proxy or not
+ // Re-check latest local revision
+ image = loadImageServer( name, revision );
+ // If in proxy mode, check with upstream servers
if ( _isProxy ) {
- return loadImageProxy( name, revision, len );
- } else {
- return loadImageServer( name, revision );
+ // Forget the locally loaded one
+ image_release( image );
+ // Check with upstream - if unsuccessful, will return the same
+ // as loadImageServer did
+ image = loadImageProxy( name, revision, len );
}
+ // Result of the local lookup, or of the upstream lookup when in proxy mode
+ return image;
}
/**
@@ -1191,20 +1246,21 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
dnbd3_host_t servers[REP_NUM_SRV];
int uplinkSock = -1;
dnbd3_host_t uplinkServer;
- const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false );
- uint16_t remoteProtocolVersion;
+ const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV );
uint16_t remoteRid = revision;
- uint64_t remoteImageSize;
+ uint16_t acceptedRemoteRid = 0;
+ uint16_t remoteProtocolVersion = 0;
struct sockaddr_storage sa;
socklen_t salen;
poll_list_t *cons = sock_newPollList();
logadd( LOG_DEBUG2, "Trying to clone %s:%d from %d hosts", name, (int)revision, count );
for (int i = 0; i < count + 5; ++i) { // "i < count + 5" for 5 additional iterations, waiting on pending connects
- char *remoteName;
+ char *remoteName = NULL;
+ uint64_t remoteImageSize = 0;
bool ok = false;
int sock;
if ( i >= count ) {
- sock = sock_multiConnect( cons, NULL, 100, 1000 );
+ sock = sock_multiConnect( cons, NULL, 100, _uplinkTimeout );
if ( sock == -2 ) break;
} else {
if ( log_hasMask( LOG_DEBUG2 ) ) {
@@ -1213,7 +1269,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
host[len] = '\0';
logadd( LOG_DEBUG2, "Trying to replicate from %s", host );
}
- sock = sock_multiConnect( cons, &servers[i], 100, 1000 );
+ sock = sock_multiConnect( cons, &servers[i], 100, _uplinkTimeout );
}
if ( sock == -1 || sock == -2 ) continue;
salen = sizeof(sa);
@@ -1238,7 +1294,11 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
} else {
ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
}
- ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ if ( ok ) {
+ ok = image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ } else {
+ logadd( LOG_INFO, "Not enough space to replicate '%s:%d'", name, (int)revision );
+ }
mutex_unlock( &reloadLock );
if ( !ok ) goto server_fail;
@@ -1247,26 +1307,32 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &uplinkServer ) ) {
uplinkServer.type = 0;
}
- break;
+ acceptedRemoteRid = remoteRid;
+ break; // TODO: Maybe we should try the remaining servers if rid == 0, in case there's an even newer one
server_fail: ;
close( sock );
}
sock_destroyPollList( cons );
- // If we still have a pointer to a local image, release the reference
- if ( image != NULL ) image_release( image );
+ // If we still have a pointer to a local image, compare rid
+ if ( image != NULL ) {
+ if ( ( revision == 0 && image->rid >= acceptedRemoteRid ) || ( image->rid == revision ) ) {
+ return image;
+ }
+ // release the reference
+ image_release( image );
+ }
// If everything worked out, this call should now actually return the image
- image = image_get( name, remoteRid, false );
+ image = image_get( name, acceptedRemoteRid, false );
if ( image != NULL && uplinkSock != -1 ) {
// If so, init the uplink and pass it the socket
- sock_setTimeout( uplinkSock, _uplinkTimeout );
if ( !uplink_init( image, uplinkSock, &uplinkServer, remoteProtocolVersion ) ) {
close( uplinkSock );
} else {
// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
int i = 0;
- while ( !image->working && ++i < 100 )
+ while ( image->problem.uplink && ++i < 100 )
usleep( 2000 );
}
} else if ( uplinkSock != -1 ) {
@@ -1283,6 +1349,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
{
char imageFile[PATHLEN] = "";
uint16_t detectedRid = 0;
+ bool isLegacyFile = false;
if ( requestedRid != 0 ) {
snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)requestedRid );
@@ -1319,6 +1386,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
&& ( detectedRid == 0 || !file_isReadable( imageFile ) ) ) {
snprintf( imageFile, PATHLEN, "%s/%s", _basePath, name );
detectedRid = 1;
+ isLegacyFile = true;
}
logadd( LOG_DEBUG2, "Trying to load %s:%d ( -> %d) as %s", name, (int)requestedRid, (int)detectedRid, imageFile );
// No file was determined, or it doesn't seem to exist/be readable
@@ -1326,7 +1394,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
logadd( LOG_DEBUG2, "Not found, bailing out" );
return image_get( name, requestedRid, true );
}
- if ( !_vmdkLegacyMode && requestedRid == 0 ) {
+ if ( !isLegacyFile && requestedRid == 0 ) {
// rid 0 requested - check if detected rid is readable, decrease rid if not until we reach 0
while ( detectedRid != 0 ) {
dnbd3_image_t *image = image_get( name, detectedRid, true );
@@ -1394,9 +1462,13 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS
logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name );
} else {
int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 );
- write( fd, &masterCrc, sizeof(uint32_t) );
- write( fd, crc32list, crc32len );
+ ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+ ret += write( fd, crc32list, crc32len );
close( fd );
+ if ( (size_t)ret != crc32len + sizeof(masterCrc) ) {
+ logadd( LOG_WARNING, "Could not save freshly received crc32 list for %s:%d", name, (int)revision );
+ unlink( crcFile );
+ }
}
}
free( crc32list );
@@ -1504,9 +1576,9 @@ json_t* image_getListAsJson()
json_t *imagesJson = json_array();
json_t *jsonImage;
int i;
- char uplinkName[100] = { 0 };
+ char uplinkName[100];
uint64_t bytesReceived;
- int users, completeness, idleTime;
+ int completeness, idleTime;
declare_now;
mutex_lock( &imageListLock );
@@ -1514,30 +1586,38 @@ json_t* image_getListAsJson()
if ( _images[i] == NULL ) continue;
dnbd3_image_t *image = _images[i];
mutex_lock( &image->lock );
- mutex_unlock( &imageListLock );
- users = image->users;
idleTime = (int)timing_diff( &image->atime, &now );
completeness = image_getCompletenessEstimate( image );
- if ( image->uplink == NULL ) {
+ mutex_unlock( &image->lock );
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( uplink == NULL ) {
bytesReceived = 0;
uplinkName[0] = '\0';
} else {
- bytesReceived = image->uplink->bytesReceived;
- if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) {
+ bytesReceived = uplink->bytesReceived;
+ if ( !uplink_getHostString( uplink, uplinkName, sizeof(uplinkName) ) ) {
uplinkName[0] = '\0';
}
+ ref_put( &uplink->reference );
}
- image->users++; // Prevent freeing after we unlock
- mutex_unlock( &image->lock );
- jsonImage = json_pack( "{sisssisisisisI}",
+ int problems = 0;
+#define addproblem(name,val) if (image->problem.name) problems |= (1 << val)
+ addproblem(read, 0);
+ addproblem(write, 1);
+ addproblem(changed, 2);
+ addproblem(uplink, 3);
+ addproblem(queue, 4);
+
+ jsonImage = json_pack( "{sisssisisisisIsi}",
"id", image->id, // id, name, rid never change, so access them without locking
"name", image->name,
"rid", (int) image->rid,
- "users", users,
+ "users", image->users,
"complete", completeness,
"idle", idleTime,
- "size", (json_int_t)image->virtualFilesize );
+ "size", (json_int_t)image->virtualFilesize,
+ "problems", problems );
if ( bytesReceived != 0 ) {
json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) );
}
@@ -1546,8 +1626,6 @@ json_t* image_getListAsJson()
}
json_array_append_new( imagesJson, jsonImage );
- image = image_release( image ); // Since we did image->users++;
- mutex_lock( &imageListLock );
}
mutex_unlock( &imageListLock );
return imagesJson;
@@ -1556,30 +1634,37 @@ json_t* image_getListAsJson()
/**
* Get completeness of an image in percent. Only estimated, not exact.
* Returns: 0-100
- * DOES NOT LOCK, so make sure to do so before calling
*/
int image_getCompletenessEstimate(dnbd3_image_t * const image)
{
assert( image != NULL );
- if ( image->cache_map == NULL ) return image->working ? 100 : 0;
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL )
+ return 100;
+ const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( unlikely( len == 0 ) ) {
+ ref_put( &cache->reference );
+ return 0;
+ }
declare_now;
if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) {
// Since this operation is relatively expensive, we cache the result for a while
+ ref_put( &cache->reference );
return image->completenessEstimate;
}
int i;
int percent = 0;
- const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
- if ( len == 0 ) return 0;
for ( i = 0; i < len; ++i ) {
- if ( image->cache_map[i] == 0xff ) {
+ const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed );
+ if ( v == 0xff ) {
percent += 100;
- } else if ( image->cache_map[i] != 0 ) {
+ } else if ( v != 0 ) {
percent += 50;
}
}
+ ref_put( &cache->reference );
image->completenessEstimate = percent / len;
- timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 );
+ timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 );
return image->completenessEstimate;
}
@@ -1611,7 +1696,7 @@ bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks
static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc)
{
// Make buffer 4k aligned in case fd has O_DIRECT set
-#define BSIZE 262144
+#define BSIZE (512*1024)
char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE];
char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) );
// How many bytes to read from the input file
@@ -1666,61 +1751,73 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
/**
* Make sure at least size bytes are available in _basePath.
* Will delete old images to make room for new ones.
- * TODO: Store last access time of images. Currently the
- * last access time is reset to the file modification time
- * on server restart. Thus it will
- * currently only delete images if server uptime is > 10 hours.
+ * It will only delete images if a configurable uptime is
+ * reached.
* This can be overridden by setting force to true, in case
* free space is desperately needed.
* Return true iff enough space is available. false in random other cases
*/
static bool image_ensureDiskSpace(uint64_t size, bool force)
{
- for ( int maxtries = 0; maxtries < 20; ++maxtries ) {
+ for ( int maxtries = 0; maxtries < 50; ++maxtries ) {
uint64_t available;
if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
- const int e = errno;
- logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", e );
+ logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left.", errno );
return true;
}
- if ( available > size ) return true;
- if ( !force && dnbd3_serverUptime() < 10 * 3600 ) {
- logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)),
- (int)(size / (1024 * 1024)) );
+ if ( available > size )
+ return true; // Yay
+ if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 ) {
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but auto-freeing of disk space is disabled.",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)) );
+ return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
+ }
+ if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) {
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)), _autoFreeDiskSpaceDelay / 60 );
return false;
}
- logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
- (int)(size / (1024 * 1024)) );
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)) );
// Find least recently used image
dnbd3_image_t *oldest = NULL;
- int i; // XXX improve locking
+ int i;
+ mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
- if ( _images[i] == NULL ) continue;
- dnbd3_image_t *current = image_lock( _images[i] );
- if ( current == NULL ) continue;
- if ( current->users == 1 ) { // Just from the lock above
- if ( oldest == NULL || timing_1le2( &current->atime, &oldest->atime ) ) {
- // Oldest access time so far
- oldest = current;
- }
- }
- current = image_release( current );
+ dnbd3_image_t *current = _images[i];
+ if ( current == NULL || current->users != 0 )
+ continue; // Empty slot or in use
+ if ( oldest != NULL && timing_1le2( &oldest->atime, &current->atime ) )
+ continue; // Already got a newer one
+ if ( !isImageFromUpstream( current ) )
+ continue; // Not replicated, don't touch
+ // Oldest access time so far
+ oldest = current;
+ }
+ if ( oldest != NULL ) {
+ oldest->users++;
+ }
+ mutex_unlock( &imageListLock );
+ if ( oldest == NULL ) {
+ logadd( LOG_INFO, "All images are currently in use :-(" );
+ return false;
}
declare_now;
- if ( oldest == NULL || ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) ) {
- if ( oldest == NULL ) {
- logadd( LOG_INFO, "All images are currently in use :-(" );
- } else {
- logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" );
- }
+ if ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) {
+ logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" );
+ image_release( oldest ); // We did users++ above; image might have to be freed entirely
return false;
}
- oldest = image_lock( oldest );
- if ( oldest == NULL ) continue; // Image freed in the meantime? Try again
- logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid );
- char *filename = strdup( oldest->path );
- oldest = image_remove( oldest );
- oldest = image_release( oldest );
+ logadd( LOG_INFO, "'%s:%d' has to go!", PIMG(oldest) );
+ char *filename = strdup( oldest->path ); // Copy name as we remove the image first
+ oldest = image_remove( oldest ); // Remove from list first...
+ oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed
+ // Technically the image might have been grabbed again, but chances for
+ // this should be close to zero anyways since the image went unused for more than 24 hours..
+ // Proper fix would be a "delete" flag in the image struct that will be checked in image_free
unlink( filename );
size_t len = strlen( filename ) + 10;
char buffer[len];
@@ -1735,62 +1832,296 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
return false;
}
-void image_closeUnusedFd()
+#define FDCOUNT (400)
+static void* closeUnusedFds(void* nix UNUSED)
{
- int fd, i;
+ if ( !_closeUnusedFd )
+ return NULL;
ticks deadline;
timing_gets( &deadline, -UNUSED_FD_TIMEOUT );
- char imgstr[300];
+ int fds[FDCOUNT];
+ int fdindex = 0;
+ setThreadName( "unused-fd-close" );
mutex_lock( &imageListLock );
- for (i = 0; i < _num_images; ++i) {
+ for ( int i = 0; i < _num_images; ++i ) {
+ dnbd3_image_t * const image = _images[i];
+ if ( image == NULL || image->readFd == -1 )
+ continue;
+ if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
+ logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", PIMG(image) );
+ fds[fdindex++] = image->readFd;
+ image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock
+ if ( fdindex == FDCOUNT )
+ break;
+ }
+ }
+ mutex_unlock( &imageListLock );
+ // Do this after unlock since close might block
+ for ( int i = 0; i < fdindex; ++i ) {
+ close( fds[i] );
+ }
+ return NULL;
+}
+
+static bool isImageFromUpstream(dnbd3_image_t *image)
+{
+ if ( !_isProxy )
+ return false; // Nothing to do
+ // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
+ // for which we have any upstream servers configured. If there's none, don't touch
+ // the cache map on disk.
+ if ( !altservers_imageHasAltServers( image->name ) )
+ return false; // Nothing to do
+ return true;
+}
+
+static void* saveLoadAllCacheMaps(void* nix UNUSED)
+{
+ static ticks nextSave;
+ declare_now;
+ bool full = timing_reached( &nextSave, &now );
+ time_t walltime = 0;
+ setThreadName( "cache-mapper" );
+ if ( full ) {
+ walltime = time( NULL );
+ // Update at start to avoid concurrent runs
+ timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY );
+ }
+ mutex_lock( &imageListLock );
+ for ( int i = 0; i < _num_images; ++i ) {
dnbd3_image_t * const image = _images[i];
if ( image == NULL )
continue;
- mutex_lock( &image->lock );
+ image->users++;
mutex_unlock( &imageListLock );
- if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) {
- snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid );
- fd = image->readFd;
- image->readFd = -1;
- } else {
- fd = -1;
- }
- mutex_unlock( &image->lock );
- if ( fd != -1 ) {
- close( fd );
- logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr );
+ const bool fromUpstream = isImageFromUpstream( image );
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
+ if ( fromUpstream ) {
+ // Replicated image, we're responsible for updating the map, so save it
+ // Save if dirty bit is set, blocks were invalidated
+ bool save = cache->dirty;
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( !save ) {
+ // Otherwise, consider longer timeout and byte count limits of uplink
+ if ( uplink != NULL ) {
+ assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+ uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
+ if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
+ save = true;
+ }
+ }
+ }
+ if ( save ) {
+ cache->dirty = false;
+ if ( uplink != NULL ) {
+ uplink->bytesReceivedLastSave = uplink->bytesReceived;
+ }
+ saveCacheMap( image );
+ }
+ if ( uplink != NULL ) {
+ ref_put( &uplink->reference );
+ }
+ } else {
+ // We're not replicating this image, if there's a cache map, reload
+ // it periodically, since we might read from a shared storage that
+ // another server instance is writing to.
+ if ( full || ( !cache->unchanged && !image->problem.read ) ) {
+ logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
+ dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
+ if ( onDisk == NULL ) {
+ // Should be complete now
+ logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
+ ref_setref( &image->ref_cacheMap, NULL );
+ } else {
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
+ // Unchanged
+ cache->unchanged = true;
+ onDisk->reference.free( &onDisk->reference );
+ } else {
+ // Replace
+ ref_setref( &image->ref_cacheMap, &onDisk->reference );
+ logadd( LOG_DEBUG2, "Map changed" );
+ }
+ }
+ }
+ } // end reload cache map
+ ref_put( &cache->reference );
+ } // end has cache map
+ if ( full && fromUpstream ) {
+ saveMetaData( image, &now, walltime );
}
+ image_release( image ); // Always do this instead of users-- to handle freeing
mutex_lock( &imageListLock );
}
mutex_unlock( &imageListLock );
+ return NULL;
}
-/*
- void image_find_latest()
- {
- // Not in array or most recent rid is requested, try file system
- if (revision != 0) {
- // Easy case - specific RID
- char
- } else {
- // Determine base directory where the image in question has to reside.
- // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04"
- // Then searchPath has to be set to "/srv/rz/ubuntu"
- char searchPath[strlen(_basePath) + len + 1];
- char *lastSlash = strrchr(name, '/');
- char *baseName; // Name of the image. In the example above, it will be "default-13.04"
- if ( lastSlash == NULL ) {
- *searchPath = '\0';
- baseName = name;
- } else {
- char *from = name, *to = searchPath;
- while (from < lastSlash) *to++ = *from++;
- *to = '\0';
- baseName = lastSlash + 1;
- }
- // Now we have the search path in our real file system and the expected image name.
- // The revision naming sceme is <IMAGENAME>.r<RID>, so if we're looking for revision 13,
- // our example image has to be named default-13.04.r13
- }
- }
+/**
+ * Saves the cache map of the given image to "<image path>.map".
+ * Does nothing if the image currently has no cache map. Failures to
+ * open or write the map file are only logged, not reported to the caller.
+ * If fsync() on the image file fails, the in-memory map is first reduced
+ * to the bits also set in the map on disk, or zeroed if that cannot be loaded.
+ * @param image the image
*/
+static void saveCacheMap(dnbd3_image_t *image)
+{
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL )
+ return; // Race - wasn't NULL in function call above...
+
+ logadd( LOG_DEBUG2, "Saving cache map of %s:%d", PIMG(image) );
+ const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
+ char mapfile[strlen( image->path ) + 4 + 1];
+ strcpy( mapfile, image->path );
+ strcat( mapfile, ".map" );
+
+ int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
+ if ( fd == -1 ) {
+ const int err = errno;
+ ref_put( &cache->reference );
+ logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+ return;
+ }
+
+ // On Linux we could use readFd, but in general it's not guaranteed to work
+ int imgFd = open( image->path, O_WRONLY );
+ if ( imgFd == -1 ) {
+ logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno );
+ } else {
+ if ( fsync( imgFd ) == -1 ) {
+ logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno );
+ dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize);
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( old == NULL ) {
+ // Could not load old map. FS might be toast.
+ logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." );
+ memset( cache->map, 0, mapSize );
+ } else {
+ // AND the maps together to be safe
+ for ( int i = 0; i < mapSize; ++i ) {
+ cache->map[i] &= old->map[i];
+ }
+ old->reference.free( &old->reference );
+ }
+ }
+ close( imgFd );
+ }
+
+ // Write current map to file
+ size_t done = 0;
+ while ( done < size ) {
+ const ssize_t ret = write( fd, cache->map + done, size - done );
+ if ( ret == -1 ) {
+ if ( errno == EINTR ) continue;
+ logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
+ break;
+ }
+ if ( ret <= 0 ) {
+ logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+ break;
+ }
+ done += (size_t)ret;
+ }
+ ref_put( &cache->reference );
+ if ( fsync( fd ) == -1 ) {
+ logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
+ }
+ close( fd );
+ // TODO fsync on parent directory
+}
+
+static void allocCacheMap(dnbd3_image_t *image, bool complete)
+{
+ const uint8_t val = complete ? 0xff : 0;
+ const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize );
+ ref_init( &cache->reference, cmfree, 0 );
+ memset( cache->map, val, byteSize );
+ mutex_lock( &image->lock );
+ if ( image->ref_cacheMap != NULL ) {
+ logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a map for %s:%d", PIMG(image) );
+ free( cache );
+ } else {
+ ref_setref( &image->ref_cacheMap, &cache->reference );
+ }
+ mutex_unlock( &image->lock );
+}
+
+/**
+ * It's assumed you hold a reference to the image
+ */
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime)
+{
+ if ( !image->accessed )
+ return;
+ ticks tmp;
+ uint32_t diff;
+ char *fn;
+ if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+ logadd( LOG_WARNING, "Cannot asprintf meta" );
+ return;
+ }
+ if ( now == NULL ) {
+ timing_get( &tmp );
+ now = &tmp;
+ walltime = time( NULL );
+ }
+ mutex_lock( &image->lock );
+ image->accessed = false;
+ diff = timing_diff( &image->atime, now );
+ mutex_unlock( &image->lock );
+ FILE *f = fopen( fn, "w" );
+ if ( f == NULL ) {
+ logadd( LOG_WARNING, "Cannot open %s for writing", fn );
+ } else {
+ fprintf( f, "[main]\natime=%"PRIu64"\n", (uint64_t)( walltime - diff ) );
+ fclose( f );
+ }
+ free( fn );
+ // TODO: fsync() dir
+}
+
+static void loadImageMeta(dnbd3_image_t *image)
+{
+ int32_t offset = 1;
+ char *fn;
+ if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+ logadd( LOG_WARNING, "asprintf load" );
+ } else {
+ int fh = open( fn, O_RDONLY );
+ free( fn );
+ if ( fh != -1 ) {
+ char buf[200];
+ ssize_t ret = read( fh, buf, sizeof(buf)-1 );
+ close( fh );
+ if ( ret > 0 ) {
+ buf[ret] = '\0';
+ // Do it the cheap way until we actually store more stuff
+ char *pos = strstr( buf, "atime=" );
+ if ( pos != NULL ) {
+ offset = (int32_t)( atol( pos + 6 ) - time( NULL ) );
+ }
+ }
+ }
+ }
+ if ( offset == 1 ) {
+ // Nothing from .meta file, use old guesstimate
+ struct stat st;
+ if ( stat( image->path, &st ) == 0 ) {
+ // Negatively offset atime by file modification time
+ offset = (int32_t)( st.st_mtime - time( NULL ) );
+ } else {
+ offset = 0;
+ }
+ image->accessed = true;
+ }
+ if ( offset > 0 ) {
+ offset = 0;
+ }
+ timing_gets( &image->atime, offset );
+}
+
diff --git a/src/server/image.h b/src/server/image.h
index 4668eff..7b6583c 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -9,7 +9,7 @@ void image_serverStartup();
bool image_isComplete(dnbd3_image_t *image);
-bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
+bool image_isHashBlockComplete(dnbd3_cache_map_t * const cache, const uint64_t block, const uint64_t fileSize);
void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set);
@@ -17,7 +17,9 @@ void image_markComplete(dnbd3_image_t *image);
bool image_ensureOpen(dnbd3_image_t *image);
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
+dnbd3_image_t* image_byId(int imgId);
+
+dnbd3_image_t* image_get(const char *name, uint16_t revision, bool checkIfWorking);
bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
@@ -47,6 +49,52 @@ void image_closeUnusedFd();
bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
+bool image_saveCacheMap(dnbd3_image_t *image);
+
+/**
+ * Check if given range is cached. Be careful when using this function because:
+ * 1) you need to hold a reference to the cache map
+ * 2) start and end are assumed to be 4k aligned
+ * 3) start and end are not checked to be in bounds (we don't know the image in this context)
+ */
+static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end)
+{
+ const uint64_t firstByteInMap = start >> 15;
+ const uint64_t lastByteInMap = (end - 1) >> 15;
+ const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+ const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
+ uint64_t pos;
+ uint8_t b;
+ bool isCached;
+ if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+ b = cache->map[firstByteInMap];
+ isCached = ( b & ( fb & lb ) ) == ( fb & lb );
+ } else {
+ isCached = true;
+ atomic_thread_fence( memory_order_acquire );
+ // First byte
+ if ( isCached ) {
+ b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+ isCached = ( ( b & fb ) == fb );
+ }
+ // Last byte
+ if ( isCached ) {
+ b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+ isCached = ( ( b & lb ) == lb );
+ }
+ // Middle, must be all bits set (0xff)
+ if ( isCached ) {
+ for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+ if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+ isCached = false;
+ break;
+ }
+ }
+ }
+ }
+ return isCached;
+}
+
// one byte in the map covers 8 4kib blocks, so 32kib per byte
// "+ (1 << 15) - 1" is required to account for the last bit of
// the image that is smaller than 32kib
diff --git a/src/server/ini.c b/src/server/ini.c
index 216543b..37c44a3 100644
--- a/src/server/ini.c
+++ b/src/server/ini.c
@@ -52,7 +52,7 @@ static char* find_char_or_comment(const char* s, char c)
/* Version of strncpy that ensures dest (size bytes) is null-terminated. */
static char* strncpy0(char* dest, const char* src, size_t size)
{
- strncpy( dest, src, size );
+ strncpy( dest, src, size - 1 );
dest[size - 1] = '\0';
return dest;
}
@@ -110,7 +110,17 @@ int ini_parse_file(FILE* file, int (*handler)(void*, const char*, const char*, c
#endif
else if ( *start == '[' ) {
/* A "[section]" line */
- end = find_char_or_comment( start + 1, ']' );
+ int cnt = 0;
+ char *f = start, *sstart = start;
+ while ( *++f ) {
+ if ( *f == '[' ) cnt++;
+ if ( *f == ']' ) cnt--;
+ if ( cnt < 0 ) {
+ sstart = f - 1;
+ break;
+ }
+ }
+ end = find_char_or_comment( sstart + 1, ']' );
if ( *end == ']' ) {
*end = '\0';
strncpy0( section, start + 1, sizeof(section) );
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 8f17855..91e53b8 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -4,6 +4,7 @@
#include "locks.h"
#include "image.h"
#include "uplink.h"
+#include "reference.h"
#include <assert.h>
#include <sys/syscall.h>
@@ -12,6 +13,8 @@
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
#define CHECK_QUEUE_SIZE 200
@@ -29,9 +32,10 @@ static queue_entry checkQueue[CHECK_QUEUE_SIZE];
static pthread_mutex_t integrityQueueLock;
static pthread_cond_t queueSignal;
static int queueLen = -1;
-static volatile bool bRunning = false;
+static atomic_bool bRunning = false;
static void* integrity_main(void *data);
+static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end);
/**
* Initialize the integrity check thread
@@ -39,7 +43,7 @@ static void* integrity_main(void *data);
void integrity_init()
{
assert( queueLen == -1 );
- mutex_init( &integrityQueueLock );
+ mutex_init( &integrityQueueLock, LOCK_INTEGRITY_QUEUE );
pthread_cond_init( &queueSignal, NULL );
mutex_lock( &integrityQueueLock );
queueLen = 0;
@@ -55,13 +59,14 @@ void integrity_init()
void integrity_shutdown()
{
assert( queueLen != -1 );
+ if ( !bRunning )
+ return;
logadd( LOG_DEBUG1, "Shutting down integrity checker...\n" );
+ pthread_kill( thread, SIGINT );
mutex_lock( &integrityQueueLock );
pthread_cond_signal( &queueSignal );
mutex_unlock( &integrityQueueLock );
thread_join( thread, NULL );
- while ( bRunning )
- usleep( 10000 );
mutex_destroy( &integrityQueueLock );
pthread_cond_destroy( &queueSignal );
logadd( LOG_DEBUG1, "Integrity checker exited normally.\n" );
@@ -73,32 +78,42 @@ void integrity_shutdown()
* make sure it is before calling, otherwise it will result in falsely
* detected corruption.
*/
-void integrity_check(dnbd3_image_t *image, int block)
+void integrity_check(dnbd3_image_t *image, int block, bool blocking)
{
+ int freeSlot;
if ( !bRunning ) {
logadd( LOG_MINOR, "Ignoring check request; thread not running..." );
return;
}
- int i, freeSlot = -1;
+start_over:
+ freeSlot = -1;
mutex_lock( &integrityQueueLock );
- for (i = 0; i < queueLen; ++i) {
+ for (int i = 0; i < queueLen; ++i) {
if ( freeSlot == -1 && checkQueue[i].image == NULL ) {
freeSlot = i;
- } else if ( checkQueue[i].image == image
- && checkQueue[i].block <= block && checkQueue[i].block + checkQueue[i].count >= block ) {
- // Already queued check dominates this one, or at least lies directly before this block
- if ( checkQueue[i].block + checkQueue[i].count == block ) {
- // It's directly before this one; expand range
+ } else if ( checkQueue[i].image == image && checkQueue[i].block <= block ) {
+ if ( checkQueue[i].count == CHECK_ALL ) {
+ logadd( LOG_DEBUG2, "Dominated by full image scan request (%d/%d) (at %d)", i, queueLen, checkQueue[i].block );
+ } else if ( checkQueue[i].block + checkQueue[i].count == block ) {
checkQueue[i].count += 1;
+ logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (at %d, %d to go)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
+ } else if ( checkQueue[i].block + checkQueue[i].count > block ) {
+ logadd( LOG_DEBUG2, "Dominated by existing check request (%d/%d) (at %d, %d to go)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
+ } else {
+ continue;
}
- logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (%d +%d)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
mutex_unlock( &integrityQueueLock );
return;
}
}
if ( freeSlot == -1 ) {
- if ( queueLen >= CHECK_QUEUE_SIZE ) {
+ if ( unlikely( queueLen >= CHECK_QUEUE_SIZE ) ) {
mutex_unlock( &integrityQueueLock );
+ if ( blocking ) {
+ logadd( LOG_INFO, "Check queue full, waiting a couple seconds...\n" );
+ sleep( 3 );
+ goto start_over;
+ }
logadd( LOG_INFO, "Check queue full, discarding check request...\n" );
return;
}
@@ -119,11 +134,9 @@ void integrity_check(dnbd3_image_t *image, int block)
static void* integrity_main(void * data UNUSED)
{
int i;
- uint8_t *buffer = NULL;
- size_t bufferSize = 0;
setThreadName( "image-check" );
blockNoncriticalSignals();
-#if defined(linux) || defined(__linux)
+#if defined(__linux__)
// Setting nice of this thread - this is not POSIX conforming, so check if other platforms support this.
// POSIX says that setpriority() should set the nice value of all threads belonging to the current process,
// but on linux you can do this per thread.
@@ -146,79 +159,71 @@ static void* integrity_main(void * data UNUSED)
// We have the image. Call image_release() some time
const int qCount = checkQueue[i].count;
bool foundCorrupted = false;
- mutex_lock( &image->lock );
if ( image->crc32 != NULL && image->realFilesize != 0 ) {
int blocks[2] = { checkQueue[i].block, -1 };
mutex_unlock( &integrityQueueLock );
- // Make copy of crc32 list as it might go away
const uint64_t fileSize = image->realFilesize;
const int numHashBlocks = IMGSIZE_TO_HASHBLOCKS(fileSize);
- const size_t required = numHashBlocks * sizeof(uint32_t);
- if ( buffer == NULL || required > bufferSize ) {
- bufferSize = required;
- if ( buffer != NULL ) free( buffer );
- buffer = malloc( bufferSize );
- }
- memcpy( buffer, image->crc32, required );
- mutex_unlock( &image->lock );
- // Open for direct I/O if possible; this prevents polluting the fs cache
- int fd = open( image->path, O_RDONLY | O_DIRECT );
- bool direct = fd != -1;
- if ( unlikely( !direct ) ) {
- // Try unbuffered; flush to disk for that
- logadd( LOG_DEBUG1, "O_DIRECT failed for %s", image->path );
- image_ensureOpen( image );
- fd = image->readFd;
- }
int checkCount = MIN( qCount, 5 );
- if ( fd != -1 ) {
- while ( blocks[0] < numHashBlocks && !_shutdown ) {
- const uint64_t start = blocks[0] * HASH_BLOCK_SIZE;
- const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
- bool complete = true;
- if ( qCount == CHECK_ALL ) {
+ int readFd = -1, directFd = -1;
+ while ( blocks[0] < numHashBlocks && !_shutdown ) {
+ const uint64_t start = blocks[0] * HASH_BLOCK_SIZE;
+ const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
+ bool complete = true;
+ if ( qCount == CHECK_ALL ) {
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
// When checking full image, skip incomplete blocks, otherwise assume block is complete
- mutex_lock( &image->lock );
- complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize );
- mutex_unlock( &image->lock );
- }
-#if defined(linux) || defined(__linux)
- if ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) {
-#else
- if ( fsync( fd ) == -1 ) {
-#endif
- logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path );
- exit( 1 );
+ complete = image_isHashBlockComplete( cache, blocks[0], fileSize );
+ ref_put( &cache->reference );
}
+ }
+ // Flush to disk if there's an uplink, as that means the block might have been written recently
+ if ( image->uplinkref != NULL ) {
+ flushFileRange( image, start, end );
+ }
+ if ( _shutdown )
+ break;
+ // Open for direct I/O if possible; this prevents polluting the fs cache
+ if ( directFd == -1 && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
// Use direct I/O only if read length is multiple of 4096 to be on the safe side
- int tfd;
- if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
- // Suitable for direct io
- tfd = fd;
- } else if ( !image_ensureOpen( image ) ) {
- logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
- break;
+ directFd = open( image->path, O_RDONLY | O_DIRECT );
+ if ( directFd == -1 ) {
+ logadd( LOG_DEBUG2, "O_DIRECT failed for %s (errno=%d)", image->path, errno );
+ directFd = -2;
} else {
- tfd = image->readFd;
- // Evict from cache so we have to re-read, making sure data was properly stored
- posix_fadvise( fd, start, end - start, POSIX_FADV_DONTNEED );
+ readFd = directFd;
}
- if ( complete && !image_checkBlocksCrc32( tfd, (uint32_t*)buffer, blocks, fileSize ) ) {
- logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name );
- image_updateCachemap( image, start, end, false );
- // If this is not a full check, queue one
- if ( qCount != CHECK_ALL ) {
- logadd( LOG_INFO, "Queueing full check for %s", image->name );
- integrity_check( image, -1 );
- }
- foundCorrupted = true;
+ }
+ if ( readFd == -1 ) { // Try buffered as fallback
+ if ( image_ensureOpen( image ) && !image->problem.read ) {
+ readFd = image->readFd;
}
- blocks[0]++; // Increase before break, so it always points to the next block to check after loop
- if ( complete && --checkCount == 0 ) break;
}
- if ( direct ) {
- close( fd );
+ if ( readFd == -1 ) {
+ logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path );
+ } else if ( complete && !image_checkBlocksCrc32( readFd, image->crc32, blocks, fileSize ) ) {
+ bool iscomplete = true;
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
+ iscomplete = image_isHashBlockComplete( cache, blocks[0], fileSize );
+ ref_put( &cache->reference );
+ }
+ logadd( LOG_WARNING, "Hash check for block %d of %s failed (complete: was: %d, is: %d)", blocks[0], image->name, (int)complete, (int)iscomplete );
+ image_updateCachemap( image, start, end, false );
+ // If this is not a full check, queue one
+ if ( qCount != CHECK_ALL ) {
+ logadd( LOG_INFO, "Queueing full check for %s", image->name );
+ integrity_check( image, -1, false );
+ }
+ foundCorrupted = true;
}
+ blocks[0]++; // Increase before break, so it always points to the next block to check after loop
+ if ( complete && --checkCount == 0 )
+ break;
+ }
+ if ( directFd != -1 && directFd != -2 ) {
+ close( directFd );
}
mutex_lock( &integrityQueueLock );
assert( checkQueue[i].image == image );
@@ -229,46 +234,70 @@ static void* integrity_main(void * data UNUSED)
logadd( LOG_WARNING, "BUG! checkQueue counter ran negative" );
}
}
- if ( checkCount > 0 || checkQueue[i].count <= 0 || fd == -1 ) {
- // Done with this task as nothing left, OR we don't have an fd to read from
- if ( fd == -1 ) {
- logadd( LOG_WARNING, "Cannot hash check %s: bad fd", image->path );
- }
+ if ( checkCount > 0 || checkQueue[i].count <= 0 ) {
+ // Done with this task as nothing left
checkQueue[i].image = NULL;
if ( i + 1 == queueLen ) queueLen--;
- // Mark as working again if applicable
- if ( !foundCorrupted ) {
- mutex_lock( &image->lock );
- if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper?
- image->working = image->uplink->fd != -1 && image->readFd != -1;
- }
- mutex_unlock( &image->lock );
- }
} else {
// Still more blocks to go...
checkQueue[i].block = blocks[0];
}
- } else {
- mutex_unlock( &image->lock );
}
- if ( foundCorrupted ) {
+ if ( foundCorrupted && !_shutdown ) {
// Something was fishy, make sure uplink exists
- mutex_lock( &image->lock );
- image->working = false;
- bool restart = image->uplink == NULL || image->uplink->shutdown;
- mutex_unlock( &image->lock );
- if ( restart ) {
- uplink_shutdown( image );
- uplink_init( image, -1, NULL, -1 );
- }
+ uplink_init( image, -1, NULL, -1 );
}
// Release :-)
image_release( image );
}
}
mutex_unlock( &integrityQueueLock );
- if ( buffer != NULL ) free( buffer );
bRunning = false;
return NULL;
}
+/**
+ * Write the byte range [start, end) of the image's backing file out to disk
+ * and afterwards advise the kernel to drop that range from the page cache, so
+ * a following read has to fetch the data from the storage device again.
+ * The fd used for flushing is, in order of preference: a dup() of the uplink's
+ * cacheFd, a freshly opened O_WRONLY fd on image->path, or image->readFd.
+ * EINTR is retried; EIO terminates the process; any other error is logged
+ * and flushing is abandoned for this call.
+ */
+static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end)
+{
+	int flushFd;
+	int writableFd = -1;
+	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+	if ( uplink != NULL ) { // Try to steal uplink's writable fd
+		if ( uplink->cacheFd != -1 ) {
+			// dup() it, so closing our copy below leaves the uplink's fd alone
+			writableFd = dup( uplink->cacheFd );
+		}
+		ref_put( &uplink->reference );
+	}
+	if ( writableFd == -1 ) { // Open file as writable
+		writableFd = open( image->path, O_WRONLY );
+	}
+	if ( writableFd == -1 ) { // Fallback to readFd (should work on Linux and BSD...)
+		logadd( LOG_WARNING, "flushFileRange: Cannot open %s for writing. Trying readFd.", image->path );
+		image_ensureOpen( image );
+		flushFd = image->readFd;
+	} else {
+		flushFd = writableFd;
+	}
+	if ( flushFd == -1 )
+		return;
+#if defined(__linux__)
+	while ( sync_file_range( flushFd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
+#else
+	while ( fsync( flushFd ) == -1 ) // TODO: fdatasync() should be available since FreeBSD 12.0 ... Might be a tad bit faster
+#endif
+	{
+		const int e = errno; // Save right away, before any other call can clobber it
+		if ( _shutdown )
+			break;
+		if ( e == EINTR )
+			continue;
+		logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, e );
+		if ( e == EIO ) {
+			exit( 1 );
+		}
+		// Any other error (EBADF, EINVAL, ESPIPE, ...) will not go away by retrying
+		// immediately; looping here would spin forever and flood the log, so give up.
+		break;
+	}
+	// Evict from cache too so we have to re-read, making sure data was properly stored
+	posix_fadvise( flushFd, start, end - start, POSIX_FADV_DONTNEED );
+	if ( writableFd != -1 ) {
+		close( writableFd );
+	}
+}
diff --git a/src/server/integrity.h b/src/server/integrity.h
index c3c2b44..09d3785 100644
--- a/src/server/integrity.h
+++ b/src/server/integrity.h
@@ -7,6 +7,6 @@ void integrity_init();
void integrity_shutdown();
-void integrity_check(dnbd3_image_t *image, int block);
+void integrity_check(dnbd3_image_t *image, int block, bool blocking);
#endif /* INTEGRITY_H_ */
diff --git a/src/server/locks.c b/src/server/locks.c
index a5b7c76..3be73b3 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -7,52 +7,50 @@
#include "locks.h"
#include "helper.h"
-#include "../shared/timing.h"
+#include <dnbd3/shared/timing.h>
-#ifdef _DEBUG
+#ifdef DNBD3_SERVER_DEBUG_LOCKS
#define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
#define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
+#define MAXLPT 20
#define LOCKLEN 60
typedef struct
{
- void *lock;
+ void * _Atomic lock;
ticks locktime;
- char locked;
- pthread_t thread;
+ bool _Atomic locked;
+ pthread_t _Atomic thread;
int lockId;
+ int prio;
char name[LOCKLEN];
char where[LOCKLEN];
} debug_lock_t;
typedef struct
{
- pthread_t tid;
+ pthread_t _Atomic tid;
ticks time;
char name[LOCKLEN];
char where[LOCKLEN];
-
+ debug_lock_t *locks[MAXLPT];
} debug_thread_t;
int debugThreadCount = 0;
static debug_lock_t locks[MAXLOCKS];
static debug_thread_t threads[MAXTHREADS];
-static int init_done = 0;
-static pthread_mutex_t initdestory;
+static pthread_mutex_t initdestory = PTHREAD_MUTEX_INITIALIZER;
static int lockId = 0;
-static pthread_t watchdog = 0;
-static dnbd3_signal_t* watchdogSignal = NULL;
-static void *debug_thread_watchdog(void *something);
+// Fatal-error exit for the lock debugging functions: releases initdestory
+// (so the caller must be holding it when using this macro), logs the given
+// printf-style message as LOG_ERROR, dumps the lock statistics and terminates
+// the process with exit code 4.
+#define ULDE(...) do { \
+ pthread_mutex_unlock( &initdestory ); \
+ logadd( LOG_ERROR, __VA_ARGS__ ); \
+ debug_dump_lock_stats(); \
+ exit( 4 ); \
+} while(0)
-int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock)
+int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority)
{
- if ( !init_done ) {
- memset( locks, 0, MAXLOCKS * sizeof(debug_lock_t) );
- memset( threads, 0, MAXTHREADS * sizeof(debug_thread_t) );
- pthread_mutex_init( &initdestory, NULL );
- init_done = 1;
- }
int first = -1;
pthread_mutex_lock( &initdestory );
for (int i = 0; i < MAXLOCKS; ++i) {
@@ -63,20 +61,18 @@ int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex
if ( first == -1 && locks[i].lock == NULL ) first = i;
}
if ( first == -1 ) {
- logadd( LOG_ERROR, "No more free debug locks (%s:%d)\n", file, line );
- pthread_mutex_unlock( &initdestory );
- debug_dump_lock_stats();
- exit( 4 );
+ ULDE( "No more free debug locks (%s:%d)\n", file, line );
}
locks[first].lock = (void*)lock;
- locks[first].locked = 0;
+ locks[first].locked = false;
+ locks[first].prio = priority;
snprintf( locks[first].name, LOCKLEN, "%s", name );
snprintf( locks[first].where, LOCKLEN, "I %s:%d", file, line );
pthread_mutex_unlock( &initdestory );
return pthread_mutex_init( lock, NULL );
}
-int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock)
+int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try)
{
debug_lock_t *l = NULL;
pthread_mutex_lock( &initdestory );
@@ -86,163 +82,180 @@ int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex
break;
}
}
- pthread_mutex_unlock( &initdestory );
if ( l == NULL ) {
- logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- debug_dump_lock_stats();
- exit( 4 );
+ ULDE( "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
}
debug_thread_t *t = NULL;
- pthread_mutex_lock( &initdestory );
+ int first = -1;
+ const pthread_t self = pthread_self();
for (int i = 0; i < MAXTHREADS; ++i) {
- if ( threads[i].tid != 0 ) continue;
- threads[i].tid = pthread_self();
- timing_get( &threads[i].time );
- snprintf( threads[i].name, LOCKLEN, "%s", name );
- snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
- t = &threads[i];
- break;
- }
- pthread_mutex_unlock( &initdestory );
- if ( t == NULL ) {
- logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
- }
- const int retval = pthread_mutex_lock( lock );
- pthread_mutex_lock( &initdestory );
- t->tid = 0;
- pthread_mutex_unlock( &initdestory );
- if ( l->locked ) {
- logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
- }
- l->locked = 1;
- timing_get( &l->locktime );
- l->thread = pthread_self();
- snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
- pthread_mutex_lock( &initdestory );
- l->lockId = ++lockId;
- pthread_mutex_unlock( &initdestory );
- return retval;
-}
-
-int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock)
-{
- debug_lock_t *l = NULL;
- pthread_mutex_lock( &initdestory );
- for (int i = 0; i < MAXLOCKS; ++i) {
- if ( locks[i].lock == lock ) {
- l = &locks[i];
+ if ( threads[i].tid == self ) {
+ t = &threads[i];
break;
}
+ if ( first == -1 && threads[i].tid == 0 ) {
+ first = i;
+ }
}
- pthread_mutex_unlock( &initdestory );
- if ( l == NULL ) {
- logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- debug_dump_lock_stats();
- exit( 4 );
- }
- debug_thread_t *t = NULL;
- pthread_mutex_lock( &initdestory );
- for (int i = 0; i < MAXTHREADS; ++i) {
- if ( threads[i].tid != 0 ) continue;
- threads[i].tid = pthread_self();
- timing_get( &threads[i].time );
- snprintf( threads[i].name, LOCKLEN, "%s", name );
- snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
- t = &threads[i];
- break;
- }
- pthread_mutex_unlock( &initdestory );
+ int idx;
if ( t == NULL ) {
- logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
+ if ( first == -1 ) {
+ ULDE( "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+ }
+ t = &threads[first];
+ timing_get( &t->time );
+ t->tid = self;
+ snprintf( t->name, LOCKLEN, "%s", name );
+ snprintf( t->where, LOCKLEN, "%s:%d", file, line );
+ memset( t->locks, 0, sizeof(t->locks) );
+ idx = 0;
+ } else {
+ // Thread already has locks, check for order violation
+ idx = -1;
+ for (int i = 0; i < MAXLPT; ++i) {
+ if ( t->locks[i] == NULL ) {
+ if ( idx == -1 ) {
+ idx = i;
+ }
+ continue;
+ }
+ // Check for recursion first: the very same lock trivially has an equal priority, so
+ // testing the priority first would always report it as a priority violation instead
+ if ( t->locks[i] == l ) {
+ ULDE( "Tried to recusively lock %s in the same thread. Tried at %s:%d, when already locked at %s",
+ name, file, line, t->locks[i]->where );
+ }
+ // Locks must be taken in strictly ascending priority order
+ if ( t->locks[i]->prio >= l->prio ) {
+ ULDE( "Lock priority violation: %s at %s:%d (%d) when already holding %s at %s (%d)",
+ name, file, line, l->prio,
+ t->locks[i]->name, t->locks[i]->where, t->locks[i]->prio );
+ }
+ }
+ if ( idx == -1 ) {
+ ULDE( "Thread %d tried to lock more than %d locks.", (int)self, (int)MAXLPT );
+ }
}
- const int retval = pthread_mutex_trylock( lock );
- pthread_mutex_lock( &initdestory );
- t->tid = 0;
pthread_mutex_unlock( &initdestory );
+ const int retval = try ? pthread_mutex_trylock( lock ) : pthread_mutex_lock( lock );
if ( retval == 0 ) {
+ timing_get( &l->locktime );
+ l->thread = self;
+ snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
+ pthread_mutex_lock( &initdestory );
if ( l->locked ) {
logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
exit( 4 );
}
- l->locked = 1;
- timing_get( &l->locktime );
- l->thread = pthread_self();
- snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
- pthread_mutex_lock( &initdestory );
+ l->locked = true;
+ t->locks[idx] = l;
l->lockId = ++lockId;
pthread_mutex_unlock( &initdestory );
+ } else if ( !try || retval != EBUSY ) {
+ logadd( LOG_ERROR, "Acquiring lock %s at %s:%d failed with error code %d", name, file, line, retval );
+ debug_dump_lock_stats();
+ exit( 4 );
}
return retval;
}
int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock)
{
- debug_lock_t *l = NULL;
+ debug_thread_t *t = NULL;
+ pthread_t self = pthread_self();
pthread_mutex_lock( &initdestory );
- for (int i = 0; i < MAXLOCKS; ++i) {
- if ( locks[i].lock == lock ) {
- l = &locks[i];
+ for (int i = 0; i < MAXTHREADS; ++i) {
+ if ( threads[i].tid == self ) {
+ t = &threads[i];
break;
}
}
- pthread_mutex_unlock( &initdestory );
- if ( l == NULL ) {
- logadd( LOG_ERROR, "Tried to unlock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
+ if ( t == NULL ) {
+ ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line );
}
- if ( !l->locked ) {
- logadd( LOG_ERROR, "Unlock sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
+ int idx = -1;
+ int cnt = 0;
+ for (int i = 0; i < MAXLPT; ++i) {
+ if ( t->locks[i] == NULL )
+ continue;
+ cnt++;
+ if ( t->locks[i]->lock == lock ) {
+ idx = i;
+ }
}
- l->locked = 0;
+ if ( idx == -1 ) {
+ ULDE( "Unlock: Calling thread doesn't hold lock %s at %s:%d", name, file, line );
+ }
+ debug_lock_t *l = t->locks[idx];
+ if ( l->thread != self || !l->locked ) {
+ ULDE( "Unlock sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line );
+ }
+ l->locked = false;
l->thread = 0;
+ t->locks[idx] = NULL;
+ if ( cnt == 1 ) {
+ t->tid = 0; // No more locks held, free up slot
+ }
snprintf( l->where, LOCKLEN, "U %s:%d", file, line );
- int retval = pthread_mutex_unlock( lock );
+ pthread_mutex_unlock( &initdestory );
+ const int retval = pthread_mutex_unlock( lock );
+ if ( retval != 0 ) {
+ logadd( LOG_ERROR, "pthread_mutex_unlock returned %d for %s at %s:%d", retval, name, file, line );
+ exit( 4 );
+ }
return retval;
}
int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock)
{
debug_lock_t *l = NULL;
+ debug_thread_t *t = NULL;
+ pthread_t self = pthread_self();
pthread_mutex_lock( &initdestory );
- for (int i = 0; i < MAXLOCKS; ++i) {
- if ( locks[i].lock == lock ) {
- l = &locks[i];
+ for (int i = 0; i < MAXTHREADS; ++i) {
+ if ( threads[i].tid == self ) {
+ t = &threads[i];
break;
}
}
- pthread_mutex_unlock( &initdestory );
+ if ( t == NULL ) { // Calling thread has no slot, so it cannot be holding the lock to wait on
+ ULDE( "cond_wait called from unknown thread for %s at %s:%d", name, file, line );
+ }
+ int mp = 0, mpi = -1;
+ for (int i = 0; i < MAXLPT; ++i) {
+ if ( t->locks[i] == NULL )
+ continue;
+ if ( t->locks[i]->lock == lock ) {
+ l = t->locks[i];
+ } else if ( t->locks[i]->prio > mp ) {
+ mp = t->locks[i]->prio;
+ mpi = i;
+ }
+ }
if ( l == NULL ) {
- logadd( LOG_ERROR, "Tried to cond_wait on uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
+ ULDE( "cond_wait: Calling thread doesn't hold lock %s at %s:%d", name, file, line );
}
- if ( !l->locked ) {
- logadd( LOG_ERROR, "Cond_wait sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
+ if ( l->thread != self || !l->locked ) {
+ ULDE( "cond_wait: Sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line );
}
- pthread_t self = pthread_self();
- if ( l->thread != self ) {
- logadd( LOG_ERROR, "Cond_wait called from non-owning thread for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
- exit( 4 );
+ if ( mp >= l->prio ) {
+ ULDE( "cond_wait: Yielding a mutex while holding another one with higher prio: %s at %s:%d (%d) while also holding %s at %s (%d)",
+ name, file, line, l->prio,
+ t->locks[mpi]->name, t->locks[mpi]->where, mp );
}
- l->locked = 0;
+ l->locked = false;
l->thread = 0;
- snprintf( l->where, LOCKLEN, "CW %s:%d", file, line );
+ snprintf( l->where, LOCKLEN, "CWU %s:%d", file, line );
+ pthread_mutex_unlock( &initdestory );
int retval = pthread_cond_wait( cond, lock );
if ( retval != 0 ) {
logadd( LOG_ERROR, "pthread_cond_wait returned %d for lock %p (%s) at %s:%d\n", retval, (void*)lock, name, file, line );
exit( 4 );
}
- if ( l->locked != 0 || l->thread != 0 ) {
+ if ( l->locked || l->thread != 0 ) {
logadd( LOG_ERROR, "Lock is not free after returning from pthread_cond_wait for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
exit( 4 );
}
- l->locked = 1;
l->thread = self;
timing_get( &l->locktime );
+ l->locked = true;
pthread_mutex_lock( &initdestory );
l->lockId = ++lockId;
pthread_mutex_unlock( &initdestory );
@@ -256,6 +269,7 @@ int debug_mutex_destroy(const char *name, const char *file, int line, pthread_mu
if ( locks[i].lock == lock ) {
if ( locks[i].locked ) {
logadd( LOG_ERROR, "Tried to destroy lock %p (%s) at %s:%d when it is still locked\n", (void*)lock, name, file, line );
+ logadd( LOG_ERROR, "Currently locked by: %s", locks[i].where );
exit( 4 );
}
locks[i].lock = NULL;
@@ -289,63 +303,21 @@ void debug_dump_lock_stats()
"* Locked: %d\n", locks[i].name, locks[i].where, (int)locks[i].locked );
}
}
- printf( "\n **** WAITING THREADS ****\n\n" );
+ printf( "\n **** ACTIVE THREADS ****\n\n" );
for (int i = 0; i < MAXTHREADS; ++i) {
- if ( threads[i].tid == 0 ) continue;
+ if ( threads[i].tid == 0 )
+ continue;
printf( "* *** Thread %d ***\n"
"* Lock: %s\n"
"* Where: %s\n"
"* How long: %d secs\n", (int)threads[i].tid, threads[i].name, threads[i].where, (int)timing_diff( &threads[i].time, &now ) );
- }
- pthread_mutex_unlock( &initdestory );
-}
-
-static void *debug_thread_watchdog(void *something UNUSED)
-{
- setThreadName( "debug-watchdog" );
- while ( !_shutdown ) {
- if ( init_done ) {
- declare_now;
- pthread_mutex_lock( &initdestory );
- for (int i = 0; i < MAXTHREADS; ++i) {
- if ( threads[i].tid == 0 ) continue;
- const uint32_t diff = timing_diff( &threads[i].time, &now );
- if ( diff > 6 && diff < 100000 ) {
- printf( "\n\n +++++++++ DEADLOCK ++++++++++++\n\n" );
- pthread_mutex_unlock( &initdestory );
- debug_dump_lock_stats();
- exit( 99 );
- }
- }
- pthread_mutex_unlock( &initdestory );
+ for (int j = 0; j < MAXLPT; ++j) {
+ if ( threads[i].locks[j] == NULL )
+ continue;
+ printf( " * Lock %s @ %s\n", threads[i].locks[j]->name, threads[i].locks[j]->where );
}
- if ( watchdogSignal == NULL || signal_wait( watchdogSignal, 5000 ) == SIGNAL_ERROR ) sleep( 5 );
- }
- return NULL ;
-}
-
-#endif
-
-void debug_locks_start_watchdog()
-{
-#ifdef _DEBUG
- watchdogSignal = signal_new();
- if ( 0 != thread_create( &watchdog, NULL, &debug_thread_watchdog, (void *)NULL ) ) {
- logadd( LOG_ERROR, "Could not start debug-lock watchdog." );
- return;
}
-#endif
+ pthread_mutex_unlock( &initdestory );
}
-void debug_locks_stop_watchdog()
-{
-#ifdef _DEBUG
- _shutdown = true;
- printf( "Killing debug watchdog...\n" );
- pthread_mutex_lock( &initdestory );
- signal_call( watchdogSignal );
- pthread_mutex_unlock( &initdestory );
- thread_join( watchdog, NULL );
- signal_close( watchdogSignal );
#endif
-}
diff --git a/src/server/locks.h b/src/server/locks.h
index 7f72722..3b04caa 100644
--- a/src/server/locks.h
+++ b/src/server/locks.h
@@ -5,19 +5,40 @@
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
-
-#ifdef _DEBUG
-
-#define mutex_init( lock ) debug_mutex_init( #lock, __FILE__, __LINE__, lock)
-#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock)
-#define mutex_trylock( lock ) debug_mutex_trylock( #lock, __FILE__, __LINE__, lock)
+#include <stdbool.h>
+
+// Lock priority. When DNBD3_SERVER_DEBUG_LOCKS is defined, debug_mutex_lock()
+// aborts if a thread tries to take a lock whose priority is not strictly
+// greater than the priority of every lock it already holds, i.e. locks have
+// to be acquired in ascending order of the values below.
+
+#define LOCK_RELOAD 90
+#define LOCK_LOAD_CONFIG 100
+#define LOCK_REMOTE_CLONE 110
+#define LOCK_CLIENT_LIST 120
+#define LOCK_CLIENT 130
+#define LOCK_INTEGRITY_QUEUE 140
+#define LOCK_IMAGE_LIST 150
+#define LOCK_IMAGE 160
+#define LOCK_UPLINK_QUEUE 170
+#define LOCK_ALT_SERVER_LIST 180
+#define LOCK_CLIENT_SEND 190
+#define LOCK_UPLINK_RTT 200
+#define LOCK_UPLINK_SEND 210
+#define LOCK_RPC_ACL 220
+#define LOCK_FUSE_INIT 300
+#define LOCK_FUSE_DIR 310
+
+//
+
+#ifdef DNBD3_SERVER_DEBUG_LOCKS
+
+#define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio)
+#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false)
+#define mutex_trylock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, true)
#define mutex_unlock( lock ) debug_mutex_unlock( #lock, __FILE__, __LINE__, lock)
#define mutex_cond_wait( cond, lock ) debug_mutex_cond_wait( #lock, __FILE__, __LINE__, cond, lock)
#define mutex_destroy( lock ) debug_mutex_destroy( #lock, __FILE__, __LINE__, lock)
-int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock);
-int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock);
-int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock);
+int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority);
+int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try);
int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock);
int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock);
int debug_mutex_destroy(const char *name, const char *file, int line, pthread_mutex_t *lock);
@@ -27,7 +48,7 @@ void debug_dump_lock_stats();
#else
-#define mutex_init( lock ) pthread_mutex_init(lock, NULL)
+#define mutex_init( lock, prio ) pthread_mutex_init(lock, NULL)
#define mutex_lock( lock ) pthread_mutex_lock(lock)
#define mutex_trylock( lock ) pthread_mutex_trylock(lock)
#define mutex_unlock( lock ) pthread_mutex_unlock(lock)
@@ -36,10 +57,12 @@ void debug_dump_lock_stats();
#endif
-#ifdef DEBUG_THREADS
+#ifdef DNBD3_SERVER_DEBUG_THREADS
+
+#include <dnbd3/shared/log.h>
extern int debugThreadCount;
-#define thread_create(thread,attr,routine,arg) (logadd( LOG_THREAD CREATE, "%d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
+#define thread_create(thread,attr,routine,arg) (logadd( LOG_INFO, "THREAD_CREATE: %d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void*), void *arg)
{
int i;
@@ -49,26 +72,26 @@ static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_att
return pthread_create( thread, attr, start_routine, arg );
}
-#define thread_detach(thread) (logadd( LOG_THREAD DETACH, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
+#define thread_detach(thread) (logadd( LOG_INFO, "THREAD_DETACH: %d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
+/*
+ * Detach `thread`. On success debugThreadCount is decremented and 0 is
+ * returned; on failure the error is logged and the process exits with status 1.
+ */
static inline int debug_thread_detach(pthread_t thread)
{
const int ret = pthread_detach(thread);
if (ret == 0) {
--debugThreadCount;
} else {
- logadd( LOG_THREAD DETACH, "Tried to detach invalid thread (error %d)\n", (int)errno);
+ // pthread_detach() returns the error number directly and does not set errno
+ logadd( LOG_INFO, "THREAD_DETACH: Tried to detach invalid thread (error %d)\n", ret);
exit(1);
}
return ret;
}
-#define thread_join(thread,value) (logadd( LOG_THREAD JOIN, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
+#define thread_join(thread,value) (logadd( LOG_INFO, "THREAD_JOIN: %d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
+/*
+ * Join `thread`, storing its result in `value_ptr`. On success
+ * debugThreadCount is decremented and 0 is returned; on failure the error
+ * is logged and the process exits with status 1.
+ */
static inline int debug_thread_join(pthread_t thread, void **value_ptr)
{
const int ret = pthread_join(thread, value_ptr);
if (ret == 0) {
--debugThreadCount;
} else {
- logadd( LOG_THREAD JOIN, "Tried to join invalid thread (error %d)\n", (int)errno);
+ // pthread_join() returns the error number directly and does not set errno
+ logadd( LOG_INFO, "THREAD_JOIN: Tried to join invalid thread (error %d)\n", ret);
exit(1);
}
return ret;
@@ -80,9 +103,6 @@ static inline int debug_thread_join(pthread_t thread, void **value_ptr)
#define thread_detach(thread) pthread_detach( thread )
#define thread_join(thread,value) pthread_join( thread, value )
-#endif
-
-void debug_locks_start_watchdog();
-void debug_locks_stop_watchdog();
+#endif /* DNBD3_SERVER_DEBUG_THREADS */
#endif /* LOCKS_H_ */
diff --git a/src/server/net.c b/src/server/net.c
index 9abe221..eb51d29 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -24,11 +24,12 @@
#include "locks.h"
#include "rpc.h"
#include "altservers.h"
+#include "reference.h"
-#include "../shared/sockhelper.h"
-#include "../shared/timing.h"
-#include "../shared/protocol.h"
-#include "../serialize.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/serialize.h>
#include <assert.h>
@@ -43,6 +44,7 @@
#include <jansson.h>
#include <inttypes.h>
#include <stdatomic.h>
+#include <signal.h>
static dnbd3_client_t *_clients[SERVER_MAX_CLIENTS];
static int _num_clients = 0;
@@ -56,11 +58,12 @@ static atomic_uint_fast64_t totalBytesSent = 0;
static bool addToList(dnbd3_client_t *client);
static void removeFromList(dnbd3_client_t *client);
static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client);
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start, uint32_t length, const char *buffer);
static inline bool recv_request_header(int sock, dnbd3_request_t *request)
{
ssize_t ret, fails = 0;
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
// Read request header from socket
@@ -87,7 +90,7 @@ static inline bool recv_request_header(int sock, dnbd3_request_t *request)
static inline bool recv_request_payload(int sock, uint32_t size, serialized_buffer_t *payload)
{
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
if ( size == 0 ) {
@@ -111,7 +114,7 @@ static inline bool recv_request_payload(int sock, uint32_t size, serialized_buff
* Send reply with optional payload. payload can be null. The caller has to
* acquire the sendMutex first.
*/
-static inline bool send_reply(int sock, dnbd3_reply_t *reply, void *payload)
+static inline bool send_reply(int sock, dnbd3_reply_t *reply, const void *payload)
{
const uint32_t size = reply->size;
fixup_reply( *reply );
@@ -145,18 +148,19 @@ static inline bool sendPadding( const int fd, uint32_t bytes )
void net_init()
{
- mutex_init( &_clients_lock );
+ mutex_init( &_clients_lock, LOCK_CLIENT_LIST );
}
void* net_handleNewConnection(void *clientPtr)
{
dnbd3_client_t * const client = (dnbd3_client_t *)clientPtr;
dnbd3_request_t request;
+ client->thread = pthread_self();
// Await data from client. Since this is a fresh connection, we expect data right away
sock_setTimeout( client->sock, _clientTimeout );
do {
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
const int ret = (int)recv( 0, &request, sizeof(request), MSG_WAITALL );
#else
const int ret = (int)recv( client->sock, &request, sizeof(request), MSG_WAITALL );
@@ -186,14 +190,15 @@ void* net_handleNewConnection(void *clientPtr)
}
} while (0);
// Fully init client struct
- mutex_init( &client->lock );
- mutex_init( &client->sendMutex );
+ mutex_init( &client->lock, LOCK_CLIENT );
+ mutex_init( &client->sendMutex, LOCK_CLIENT_SEND );
mutex_lock( &client->lock );
host_to_string( &client->host, client->hostName, HOSTNAMELEN );
client->hostName[HOSTNAMELEN-1] = '\0';
mutex_unlock( &client->lock );
client->bytesSent = 0;
+ client->relayedCount = 0;
if ( !addToList( client ) ) {
freeClientStruct( client );
@@ -204,6 +209,7 @@ void* net_handleNewConnection(void *clientPtr)
dnbd3_reply_t reply;
dnbd3_image_t *image = NULL;
+ dnbd3_cache_map_t *cache = NULL;
int image_file = -1;
int num;
@@ -212,7 +218,6 @@ void* net_handleNewConnection(void *clientPtr)
serialized_buffer_t payload;
uint16_t rid, client_version;
- uint64_t start, end;
dnbd3_server_entry_t server_list[NUMBER_SERVERS];
@@ -229,7 +234,7 @@ void* net_handleNewConnection(void *clientPtr)
rid = serializer_get_uint16( &payload );
const uint8_t flags = serializer_get_uint8( &payload );
client->isServer = ( flags & FLAGS8_SERVER );
- if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) {
+ if ( unlikely( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) ) {
if ( client_version < MIN_SUPPORTED_CLIENT ) {
logadd( LOG_DEBUG1, "Client %s too old", client->hostName );
} else {
@@ -243,7 +248,7 @@ void* net_handleNewConnection(void *clientPtr)
// We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does...
// Reject, as this would basically force this proxy to do BGR too.
image = image_get( image_name, rid, true );
- if ( image != NULL && image->cache_map != NULL ) {
+ if ( image != NULL && image->ref_cacheMap != NULL ) {
// Only exception is if the image is complete locally
image = image_release( image );
}
@@ -255,27 +260,28 @@ void* net_handleNewConnection(void *clientPtr)
// No BGR mismatch, but don't lookup if image is unknown locally
image = image_get( image_name, rid, true );
}
- mutex_lock( &client->lock );
client->image = image;
- mutex_unlock( &client->lock );
- if ( image == NULL ) {
+ atomic_thread_fence( memory_order_release );
+ if ( unlikely( image == NULL ) ) {
//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
- } else if ( !image->working ) {
+ } else if ( unlikely( image->problem.read || image->problem.changed ) ) {
logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
client->hostName, image_name, (int)rid );
} else {
- bool penalty;
// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
bOk = true;
- if ( image->cache_map != NULL ) {
- mutex_lock( &image->lock );
- if ( image->uplink == NULL || image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+ if ( image->ref_cacheMap != NULL ) {
+ if ( image->problem.queue || image->problem.write ) {
bOk = ( rand() % 4 ) == 1;
}
- penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1;
- mutex_unlock( &image->lock );
- if ( penalty ) { // Wait 100ms if local caching is not working so this
- usleep( 100000 ); // server gets a penalty and is less likely to be selected
+ if ( bOk ) {
+ if ( image->problem.write ) { // Wait 100ms if local caching is not working so this
+ usleep( 100000 ); // server gets a penalty and is less likely to be selected
+ }
+ if ( image->problem.uplink ) {
+ // Penalize depending on completeness, if no uplink is available
+ usleep( ( 100 - image->completenessEstimate ) * 100 );
+ }
}
}
if ( bOk ) {
@@ -284,6 +290,7 @@ void* net_handleNewConnection(void *clientPtr)
if ( !client->isServer ) {
// Only update immediately if this is a client. Servers are handled on disconnect.
timing_get( &image->atime );
+ image->accessed = true;
}
mutex_unlock( &image->lock );
serializer_reset_write( &payload );
@@ -301,7 +308,7 @@ void* net_handleNewConnection(void *clientPtr)
}
}
- if ( bOk ) {
+ if ( likely( bOk ) ) {
// add artificial delay if applicable
if ( client->isServer && _serverPenalty != 0 ) {
usleep( _serverPenalty );
@@ -311,95 +318,62 @@ void* net_handleNewConnection(void *clientPtr)
// client handling mainloop
while ( recv_request_header( client->sock, &request ) ) {
if ( _shutdown ) break;
- switch ( request.cmd ) {
+ if ( likely ( request.cmd == CMD_GET_BLOCK ) ) {
- case CMD_GET_BLOCK:;
const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
- if ( offset >= image->virtualFilesize ) {
+ reply.handle = request.handle;
+ if ( unlikely( offset >= image->virtualFilesize ) ) {
// Sanity check
logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName );
reply.size = 0;
reply.cmd = CMD_ERROR;
send_reply( client->sock, &reply, NULL );
- break;
+ continue;
}
- if ( offset + request.size > image->virtualFilesize ) {
+ if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
// Sanity check
logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName );
reply.size = 0;
reply.cmd = CMD_ERROR;
send_reply( client->sock, &reply, NULL );
- break;
+ continue;
+ }
+
+ if ( cache == NULL ) {
+ cache = ref_get_cachemap( image );
}
- if ( request.size != 0 && image->cache_map != NULL ) {
+ if ( request.size != 0 && cache != NULL ) {
// This is a proxyed image, check if we need to relay the request...
- start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- bool isCached = true;
- mutex_lock( &image->lock );
- // Check again as we only aquired the lock just now
- if ( image->cache_map != NULL ) {
- const uint64_t firstByteInMap = start >> 15;
- const uint64_t lastByteInMap = (end - 1) >> 15;
- uint64_t pos;
- // Middle - quick checking
- if ( isCached ) {
- pos = firstByteInMap + 1;
- while ( pos < lastByteInMap ) {
- if ( image->cache_map[pos] != 0xff ) {
- isCached = false;
- break;
- }
- ++pos;
+ const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+ if ( unlikely( client->relayedCount > 250 ) ) {
+ logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" );
+ for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) {
+ usleep( 10000 );
}
- }
- // First byte
- if ( isCached ) {
- pos = start;
- do {
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) {
- isCached = false;
- break;
- }
- pos += DNBD3_BLOCK_SIZE;
- } while ( firstByteInMap == (pos >> 15) && pos < end );
- }
- // Last byte - only check if request spans multiple bytes in cache map
- if ( isCached && firstByteInMap != lastByteInMap ) {
- pos = lastByteInMap << 15;
- while ( pos < end ) {
- assert( lastByteInMap == (pos >> 15) );
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) {
- isCached = false;
- break;
- }
- pos += DNBD3_BLOCK_SIZE;
+ if ( client->relayedCount > 250 ) {
+ logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" );
+ goto exit_client_cleanup;
}
}
- }
- mutex_unlock( &image->lock );
- if ( !isCached ) {
- if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
- logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
+ client->relayedCount++;
+ if ( !uplink_requestClient( client, &uplinkCallback, request.handle, offset, request.size, request.hops ) ) {
+ client->relayedCount--;
+ logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
client->hostName, image->name, image->rid );
- image->working = false;
goto exit_client_cleanup;
}
- break; // DONE, exit request.cmd switch
+ continue; // Reply arrives on uplink some time later, handle next request now
}
}
reply.cmd = CMD_GET_BLOCK;
reply.size = request.size;
- reply.handle = request.handle;
fixup_reply( reply );
- const bool lock = image->uplink != NULL;
+ const bool lock = image->uplinkref != NULL;
if ( lock ) mutex_lock( &client->sendMutex );
// Send reply header
if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) {
@@ -422,7 +396,7 @@ void* net_handleNewConnection(void *clientPtr)
// TODO: Should we consider EOPNOTSUPP on BSD for sendfile and fallback to read/write?
// Linux would set EINVAL or ENOSYS instead, which it unfortunately also does for a couple of other failures :/
// read/write would kill performance anyways so a fallback would probably be of little use either way.
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
char buf[1000];
size_t cnt = realBytes - done;
if ( cnt > 1000 ) {
@@ -459,7 +433,7 @@ void* net_handleNewConnection(void *clientPtr)
}
if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) {
logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid );
- image->working = false;
+ image->problem.read = true;
}
}
goto exit_client_cleanup;
@@ -476,11 +450,20 @@ void* net_handleNewConnection(void *clientPtr)
if ( lock ) mutex_unlock( &client->sendMutex );
// Global per-client counter
client->bytesSent += request.size; // Increase counter for statistics.
- break;
+ continue;
+ }
+ // Any other command
+ // Release cache map every now and then, in case the image was replicated
+ // entirely. Will be re-grabbed on next CMD_GET_BLOCK otherwise.
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ cache = NULL;
+ }
+ switch ( request.cmd ) {
case CMD_GET_SERVERS:
// Build list of known working alt servers
- num = altservers_getListForClient( &client->host, server_list, NUMBER_SERVERS );
+ num = altservers_getListForClient( client, server_list, NUMBER_SERVERS );
reply.cmd = CMD_GET_SERVERS;
reply.size = (uint32_t)( num * sizeof(dnbd3_server_entry_t) );
mutex_lock( &client->sendMutex );
@@ -525,24 +508,27 @@ set_name: ;
logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd );
break;
- }
- }
- }
+ } // end switch
+ } // end loop
+ } // end bOk
exit_client_cleanup: ;
// First remove from list, then add to counter to prevent race condition
removeFromList( client );
totalBytesSent += client->bytesSent;
// Access time, but only if client didn't just probe
- if ( image != NULL ) {
+ if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
mutex_lock( &image->lock );
- if ( client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
- timing_get( &image->atime );
- }
+ timing_get( &image->atime );
+ image->accessed = true;
mutex_unlock( &image->lock );
}
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ }
freeClientStruct( client ); // This will also call image_release on client->image
return NULL ;
fail_preadd: ;
+ // This is before we even initialized any mutex
close( client->sock );
free( client );
return NULL;
@@ -609,6 +595,12 @@ void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
}
bs += client->bytesSent;
}
+ // Do this before unlocking the list, otherwise we might
+ // account for a client twice if it would disconnect after
+ // unlocking but before we add the count here.
+ if ( bytesSent != NULL ) {
+ *bytesSent = totalBytesSent + bs;
+ }
mutex_unlock( &_clients_lock );
if ( clientCount != NULL ) {
*clientCount = cc;
@@ -616,9 +608,6 @@ void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
if ( serverCount != NULL ) {
*serverCount = sc;
}
- if ( bytesSent != NULL ) {
- *bytesSent = totalBytesSent + bs;
- }
}
void net_disconnectAll()
@@ -626,11 +615,10 @@ void net_disconnectAll()
int i;
mutex_lock( &_clients_lock );
for (i = 0; i < _num_clients; ++i) {
- if ( _clients[i] == NULL ) continue;
- dnbd3_client_t * const client = _clients[i];
- mutex_lock( &client->lock );
- if ( client->sock >= 0 ) shutdown( client->sock, SHUT_RDWR );
- mutex_unlock( &client->lock );
+ if ( _clients[i] == NULL )
+ continue;
+ shutdown( _clients[i]->sock, SHUT_RDWR );
+ pthread_kill( _clients[i]->thread, SIGINT );
}
mutex_unlock( &_clients_lock );
}
@@ -668,11 +656,19 @@ static void removeFromList(dnbd3_client_t *client)
{
int i;
mutex_lock( &_clients_lock );
- for ( i = _num_clients - 1; i >= 0; --i ) {
- if ( _clients[i] == client ) {
- _clients[i] = NULL;
+ if ( _num_clients != 0 ) {
+ for ( i = _num_clients - 1; i >= 0; --i ) {
+ if ( _clients[i] == client ) {
+ _clients[i] = NULL;
+ break;
+ }
+ }
+ if ( i != 0 && i + 1 == _num_clients ) {
+ do {
+ i--;
+ } while ( _clients[i] == NULL && i > 0 );
+ _num_clients = i + 1;
}
- if ( _clients[i] == NULL && i + 1 == _num_clients ) --_num_clients;
}
mutex_unlock( &_clients_lock );
}
@@ -686,17 +682,33 @@ static void removeFromList(dnbd3_client_t *client)
static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
{
mutex_lock( &client->lock );
+ if ( client->image != NULL ) {
+ dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
+ if ( uplink != NULL ) {
+ if ( client->relayedCount != 0 ) {
+ uplink_removeEntry( uplink, client, &uplinkCallback );
+ }
+ ref_put( &uplink->reference );
+ }
+ if ( client->relayedCount != 0 ) {
+ logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount );
+ int i;
+ for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) {
+ usleep( 10000 );
+ }
+ if ( client->relayedCount != 0 ) {
+ logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount );
+ }
+ }
+ }
mutex_lock( &client->sendMutex );
- if ( client->sock != -1 ) close( client->sock );
+ if ( client->sock != -1 ) {
+ close( client->sock );
+ }
client->sock = -1;
mutex_unlock( &client->sendMutex );
- if ( client->image != NULL ) {
- mutex_lock( &client->image->lock );
- if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
- mutex_unlock( &client->image->lock );
- client->image = image_release( client->image );
- }
mutex_unlock( &client->lock );
+ client->image = image_release( client->image );
mutex_destroy( &client->lock );
mutex_destroy( &client->sendMutex );
free( client );
@@ -729,3 +741,21 @@ static bool addToList(dnbd3_client_t *client)
return true;
}
+/**
+ * Callback handed to uplink_requestClient() for relayed block requests.
+ * Sends the result of one relayed request to the client and decrements
+ * the client's relayedCount.
+ *
+ * @param data the dnbd3_client_t the request was relayed for
+ * @param handle request handle, copied into the reply
+ * @param start unused
+ * @param length value placed in the reply's size field
+ * @param buffer payload to send; NULL turns the reply into CMD_ERROR and
+ *               the client socket is shut down afterwards
+ */
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start UNUSED, uint32_t length, const char *buffer)
+{
+ dnbd3_client_t *client = (dnbd3_client_t*)data;
+ dnbd3_reply_t reply = {
+ .magic = dnbd3_packet_magic,
+ .cmd = buffer == NULL ? CMD_ERROR : CMD_GET_BLOCK,
+ .handle = handle,
+ // NOTE(review): size stays `length` even for the CMD_ERROR reply, which has no payload - confirm the client tolerates this
+ .size = length,
+ };
+ // The send mutex serializes this reply with other writers on the client socket
+ mutex_lock( &client->sendMutex );
+ send_reply( client->sock, &reply, buffer );
+ if ( buffer == NULL ) {
+ shutdown( client->sock, SHUT_RDWR );
+ }
+ client->relayedCount--;
+ mutex_unlock( &client->sendMutex );
+}
+
diff --git a/src/server/net.h b/src/server/net.h
index 6813b49..2d6e5e7 100644
--- a/src/server/net.h
+++ b/src/server/net.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
diff --git a/src/server/picohttpparser/CMakeLists.txt b/src/server/picohttpparser/CMakeLists.txt
new file mode 100644
index 0000000..cc6ec96
--- /dev/null
+++ b/src/server/picohttpparser/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Build description for the bundled picohttpparser sources.
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(picohttpparser
+ LANGUAGES C)
+
+# Source and header file of the parser, located next to this CMakeLists.txt
+set(PICOHTTPPARSER_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/picohttpparser.c)
+set(PICOHTTPPARSER_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/picohttpparser.h)
+
+# Static library; this directory is exported as a public include path
+add_library(picohttpparser STATIC ${PICOHTTPPARSER_SOURCE_FILES})
+target_include_directories(picohttpparser PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/src/server/picohttpparser/picohttpparser.c b/src/server/picohttpparser/picohttpparser.c
index cfa05ef..f077016 100644
--- a/src/server/picohttpparser/picohttpparser.c
+++ b/src/server/picohttpparser/picohttpparser.c
@@ -36,8 +36,6 @@
#endif
#include "picohttpparser.h"
-/* $Id$ */
-
#if __GNUC__ >= 3
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
@@ -73,9 +71,9 @@
#define ADVANCE_TOKEN(tok, toklen) \
do { \
const char *tok_start = buf; \
- static const char ALIGNED(16) ranges2[] = "\000\040\177\177"; \
+ static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
int found2; \
- buf = findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, &found2); \
+ buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
if (!found2) { \
CHECK_EOF(); \
} \
@@ -138,15 +136,11 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, struct
const char *token_start = buf;
#ifdef __SSE4_2__
- static const char ranges1[] = "\0\010"
- /* allow HT */
- "\012\037"
- /* allow SP and up to but not including DEL */
- "\177\177"
- /* allow chars w. MSB set */
- ;
+ static const char ALIGNED(16) ranges1[16] = "\0\010" /* allow HT */
+ "\012\037" /* allow SP and up to but not including DEL */
+ "\177\177"; /* allow chars w. MSB set */
int found;
- buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
+ buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
if (found)
goto FOUND_CTL;
#else
@@ -325,9 +319,21 @@ static const char *parse_headers(const char *buf, const char *buf_end, struct ph
headers[*num_headers].name.s = NULL;
headers[*num_headers].name.l = 0;
}
- if ((buf = get_token_to_eol(buf, buf_end, &headers[*num_headers].value, ret)) == NULL) {
+ struct string value;
+ // DELETE
+ if ((buf = get_token_to_eol(buf, buf_end, &value, ret)) == NULL) {
return NULL;
}
+ /* remove trailing SPs and HTABs */
+ const char *value_end = value.s + value.l;
+ for (; value_end != value.s; --value_end) {
+ const char c = *(value_end - 1);
+ if (!(c == ' ' || c == '\t')) {
+ break;
+ }
+ }
+ headers[*num_headers].value.s = value.s;
+ headers[*num_headers].value.l = value_end - value.s;
}
return buf;
}
@@ -347,9 +353,17 @@ static const char *parse_request(const char *buf, const char *buf_end, struct st
/* parse request line */
ADVANCE_TOKEN(method->s, method->l);
- ++buf;
+ /* skip one or more spaces; CHECK_EOF() (the macro ADVANCE_TOKEN already
+ * uses) keeps the loop from dereferencing buf once it reaches buf_end */
+ do {
+ ++buf;
+ CHECK_EOF();
+ } while (*buf == ' ');
ADVANCE_TOKEN(path->s, path->l);
- ++buf;
+ do {
+ ++buf;
+ CHECK_EOF();
+ } while (*buf == ' ');
+ /* an empty method or path is a malformed request line */
+ if (method->l == 0 || path->l == 0) {
+ *ret = -1;
+ return NULL;
+ }
if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
return NULL;
}
@@ -402,10 +416,13 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
return NULL;
}
/* skip space */
- if (*buf++ != ' ') {
+ if (*buf != ' ') {
*ret = -1;
return NULL;
}
+ /* skip any further spaces; CHECK_EOF() (the file's existing macro) stops
+ * the loop before it dereferences buf at buf_end */
+ do {
+ ++buf;
+ CHECK_EOF();
+ } while (*buf == ' ');
/* parse status code, we want at least [:digit:][:digit:][:digit:]<other char> to try to parse */
if (buf_end - buf < 4) {
*ret = -2;
@@ -413,13 +430,21 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
}
PARSE_INT_3(status);
- /* skip space */
- if (*buf++ != ' ') {
- *ret = -1;
+ /* get message including preceding space */
+ if ((buf = get_token_to_eol(buf, buf_end, msg, ret)) == NULL) {
return NULL;
}
- /* get message */
- if ((buf = get_token_to_eol(buf, buf_end, msg, ret)) == NULL) {
+ if (msg->l == 0) {
+ /* ok */
+ } else if (*msg->s == ' ') {
+ /* remove preceding space */
+ do {
+ ++msg->s;
+ --msg->l;
+ } while (*msg->s == ' ');
+ } else {
+ /* garbage found after status code */
+ *ret = -1;
return NULL;
}
diff --git a/src/server/reference.c b/src/server/reference.c
new file mode 100644
index 0000000..64109ca
--- /dev/null
+++ b/src/server/reference.c
@@ -0,0 +1,33 @@
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+#include "reference.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Initialize a reference object.
+ *
+ * reference: object to initialize
+ * freefun:   stored as the object's free function; ref_put() calls it when
+ *            the counter reaches zero
+ * count:     initial value of the reference counter
+ */
+void ref_init( ref *reference, void ( *freefun )( ref * ), long count )
+{
+ reference->count = count;
+ reference->free = freefun;
+}
+
+/*
+ * Fatal error handler for the reference code: prints `message` followed by
+ * a newline to stderr and aborts the process. Never returns.
+ */
+_Noreturn void _ref_error( const char *message )
+{
+ fprintf( stderr, "%s\n", message );
+ abort();
+}
+
+/*
+ * Atomically point `weakref` at `ref` (or clear it when `ref` is NULL) and
+ * release whatever the weak reference pointed to before.
+ *
+ * The value stored in the weak reference is the address of an aligned slot
+ * inside the ref object; ref_get() temporarily adds to the low bits of that
+ * address while it is in flight.
+ */
+void ref_setref( weakref *weakref, ref *ref )
+{
+ union _aligned_ref_ *new_weakref = 0;
+ if ( ref ) {
+ // Locate the aligned slot inside the object and store a back pointer to the object in it
+ ( new_weakref = aligned_ref( ref->_aligned_ref ) )->ref = ref;
+ // +1 for the weak reference itself, plus a bias of one slot size that is taken back when the weak reference is replaced
+ ref->count += sizeof( union _aligned_ref_ ) + 1;
+ }
+ char *old_weakref = (char *)atomic_exchange( weakref, new_weakref );
+ if ( !old_weakref )
+ return;
+ struct _ref_ *old_ref = aligned_ref( old_weakref )->ref;
+ // Add the offset left in the old pointer's low bits (ref_get() calls still in flight) and remove the bias added above
+ old_ref->count += old_weakref - (char *)aligned_ref( old_weakref ) - sizeof( union _aligned_ref_ );
+ // Drop the reference that was held by the weak reference itself
+ ref_put( old_ref );
+}
diff --git a/src/server/reference.h b/src/server/reference.h
new file mode 100644
index 0000000..75a681f
--- /dev/null
+++ b/src/server/reference.h
@@ -0,0 +1,64 @@
+#ifndef _REFERENCE_H_
+#define _REFERENCE_H_
+
+#include "reftypes.h"
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Given `ptr`, a pointer to the field `member` of an object of type `type`,
+ * yield a pointer to the enclosing object. Uses the standard offsetof()
+ * instead of arithmetic on a member of a null pointer.
+ */
+#define container_of(ptr, type, member) \
+ ((type *)((char *)(ptr) - offsetof(type, member)))
+
+/* Set the initial counter value and the free function of `reference`. */
+void ref_init( ref *reference, void ( *freefun )( ref * ), long count );
+
+/* Atomically make `weakref` point to `ref` (NULL clears it) and release the previous target. */
+void ref_setref( weakref *weakref, ref *ref );
+
+/* Print `message` to stderr and abort; does not return. */
+_Noreturn void _ref_error( const char *message );
+
+/*
+ * Obtain a counted reference to the object `weakref` currently points to.
+ *
+ * Returns NULL if the weak reference is empty. Otherwise the object's
+ * counter has been incremented and the caller must release it with ref_put().
+ *
+ * Protocol: the low bits of the weak pointer (its offset inside the aligned
+ * slot) mark ref_get() calls that are in flight. We first bump the pointer
+ * by one, then increment the real counter, then try to take the bump back.
+ * If the weak reference was replaced in between, ref_setref() has already
+ * folded our bump into the counter, so we decrement the counter instead.
+ */
+static inline ref *ref_get( weakref *weakref )
+{
+	char *old_weakref = (char *)*weakref;
+	/*
+	 * A plain loop is used on purpose: `continue` inside a do-while jumps
+	 * straight to the loop condition, which would attempt the exchange on the
+	 * freshly reloaded value without re-running the checks below.
+	 */
+	for ( ;; ) {
+		if ( old_weakref == NULL )
+			return NULL;
+		if ( aligned_ref( old_weakref ) != aligned_ref( old_weakref + 1 ) ) {
+			// One more bump would leave the aligned slot; reload and check again
+			old_weakref = (char *)*weakref;
+			continue;
+		}
+		// On failure old_weakref is updated to the current value and re-checked
+		if ( atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) )
+			break;
+	}
+	struct _ref_ *ref = aligned_ref( old_weakref )->ref;
+	if ( unlikely( ++ref->count == -1 ) ) {
+		_ref_error( "Reference counter overflow. Aborting." );
+	}
+	char *cur_weakref = ( char * )*weakref;
+	do {
+		if ( aligned_ref( cur_weakref ) != aligned_ref( old_weakref ) ) {
+			// Weak reference now points elsewhere; undo via the counter instead
+			ref->count--;
+			break;
+		}
+	} while ( !atomic_compare_exchange_weak( weakref, (void **)&cur_weakref, cur_weakref - 1 ) );
+	return ref;
+}
+
+/* Take one additional reference on an object the caller already holds. */
+static inline void ref_inc( ref *ref )
+{
+	(void)atomic_fetch_add( &ref->count, 1 );
+}
+
+/* Release one reference; the object's free function runs when the last one is gone. */
+static inline void ref_put( ref *ref )
+{
+	/* fetch_sub yields the value before the decrement: 1 means this was the last reference */
+	const long previous = atomic_fetch_sub( &ref->count, 1 );
+	if ( previous != 1 )
+		return;
+	ref->free( ref );
+}
+
+/*
+ * Typed wrappers around ref_get(), written as statement expressions
+ * (hence __extension__). Each evaluates to NULL when ref_get() returns NULL,
+ * otherwise to the structure that embeds the returned ref as its
+ * `reference` member. A non-NULL result must be released with ref_put().
+ */
+
+/* wr: weakref* to an uplink; yields dnbd3_uplink_t* or NULL */
+#define ref_get_uplink(wr) __extension__({ \
+ ref* ref = ref_get( wr ); \
+ ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \
+})
+
+/* image: pointer to a structure with a ref_cacheMap weak reference; yields dnbd3_cache_map_t* or NULL */
+#define ref_get_cachemap(image) __extension__({ \
+ ref* ref = ref_get( &(image)->ref_cacheMap ); \
+ ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \
+})
+
+#endif
diff --git a/src/server/reftypes.h b/src/server/reftypes.h
new file mode 100644
index 0000000..45c0c20
--- /dev/null
+++ b/src/server/reftypes.h
@@ -0,0 +1,25 @@
+/* Types shared by the weak/strong reference helpers. */
+#ifndef REFTYPES_H_
+#define REFTYPES_H_
+
+#include <stdatomic.h>
+#include <stdint.h> /* uintptr_t, used by aligned_ref() */
+
+_Static_assert( sizeof( void * ) == sizeof( _Atomic( void * ) ), "Atomic pointer bad" );
+
+/*
+ * A weak reference: an atomic pointer that is either NULL or points into
+ * the aligned slot of a ref object (see aligned_ref below).
+ */
+typedef _Atomic( void * ) weakref;
+
+/*
+ * Round the byte pointer `ptr` down to a multiple of
+ * sizeof(union _aligned_ref_) and yield it as a slot pointer.
+ * `ptr` must be a char pointer so the subtraction counts bytes.
+ */
+#define aligned_ref(ptr) \
+ ((union _aligned_ref_ *)((ptr) - (uintptr_t)(ptr) % sizeof(union _aligned_ref_)))
+
+/*
+ * Slot holding the back pointer to the owning ref object. The padding
+ * makes the slot at least 32 bytes large.
+ */
+union _aligned_ref_ {
+ struct _ref_ *ref;
+ void *_padding[( 32 - 1 ) / sizeof( void * ) + 1];
+};
+
+/*
+ * Reference-counted object header.
+ * count:        reference counter
+ * free:         called by ref_put() when the counter drops to zero
+ * _padding:     one slot size of room in front of _aligned_ref, so rounding
+ *               the address of _aligned_ref down stays inside this struct
+ * _aligned_ref: aligned_ref() is applied to this address to find the slot
+ */
+typedef struct _ref_ {
+ _Atomic long count;
+ void ( *free )( struct _ref_ * );
+ char _padding[sizeof( union _aligned_ref_ )];
+ char _aligned_ref[sizeof( union _aligned_ref_ )];
+} ref;
+
+#endif /* REFTYPES_H_ */
diff --git a/src/server/rpc.c b/src/server/rpc.c
index 5dbcafe..119bbd5 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -5,10 +5,13 @@
#include "locks.h"
#include "image.h"
#include "altservers.h"
-#include "../shared/sockhelper.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
#include "fileutil.h"
#include "picohttpparser/picohttpparser.h"
#include "urldecode.h"
+#include "reference.h"
#include <jansson.h>
#include <sys/types.h>
@@ -43,7 +46,9 @@ _Static_assert( sizeof("test") == 5 && sizeof("test2") == 6, "Stringsize messup
DEFSTR(STR_CONNECTION, "connection")
DEFSTR(STR_CLOSE, "close")
DEFSTR(STR_QUERY, "/query")
+DEFSTR(STR_CACHEMAP, "/cachemap")
DEFSTR(STR_Q, "q")
+DEFSTR(STR_ID, "id")
static inline bool equals(struct string *s1,struct string *s2)
{
@@ -75,13 +80,13 @@ static json_int_t randomRunId;
static pthread_mutex_t aclLock;
#define MAX_CLIENTS 50
#define CUTOFF_START 40
-static pthread_mutex_t statusLock;
static struct {
- int count;
- bool overloaded;
+ atomic_int count;
+ atomic_bool overloaded;
} status;
static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
+static bool handleCacheMap(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive);
static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc);
static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value);
@@ -91,16 +96,15 @@ static void loadAcl();
void rpc_init()
{
- mutex_init( &aclLock );
- mutex_init( &statusLock );
+ mutex_init( &aclLock, LOCK_RPC_ACL );
randomRunId = (((json_int_t)getpid()) << 16) | (json_int_t)time(NULL);
// </guard>
if ( sizeof(randomRunId) > 4 ) {
int fd = open( "/dev/urandom", O_RDONLY );
if ( fd != -1 ) {
uint32_t bla = 1;
- read( fd, &bla, 4 );
- randomRunId = (randomRunId << 32) | bla;
+ (void)!read( fd, &bla, 4 );
+ randomRunId = ((randomRunId & 0xffffffff) << 32) | bla;
}
close( fd );
}
@@ -123,10 +127,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
return;
}
do {
- mutex_lock( &statusLock );
const int curCount = ++status.count;
UPDATE_LOADSTATE( curCount );
- mutex_unlock( &statusLock );
if ( curCount > MAX_CLIENTS ) {
sendReply( sock, "503 Service Temporarily Unavailable", "text/plain", "Too many HTTP clients", -1, HTTP_CLOSE );
goto func_return;
@@ -141,13 +143,13 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
bool hasName = false;
bool ok;
int keepAlive = HTTP_KEEPALIVE;
- do {
+ while ( !_shutdown ) {
// Read request from client
struct phr_header headers[100];
- size_t numHeaders, prevLen = 0, consumed;
+ size_t numHeaders, prevLen = 0, consumed = 0;
struct string method, path;
int minorVersion;
- do {
+ while ( !_shutdown ) {
// Parse before calling recv, there might be a complete pipelined request in the buffer already
// If the request is incomplete, we allow exactly one additional recv() to complete it.
// This should suffice for real world scenarios as I don't know of any
@@ -174,7 +176,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
// Reaching here means partial request or parse error
if ( pret == -2 ) { // Partial, keep reading
prevLen = hoff;
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
ssize_t ret = recv( 0, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
#else
ssize_t ret = recv( sock, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
@@ -192,15 +194,15 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
sendReply( sock, "400 Bad Request", "text/plain", "Server cannot understand what you're trying to say", -1, HTTP_CLOSE );
goto func_return;
}
- } while ( true );
+ } // Loop while request header incomplete
+ if ( _shutdown )
+ break;
if ( keepAlive == HTTP_KEEPALIVE ) {
// Only keep the connection alive (and indicate so) if the client seems to support this
if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) {
keepAlive = HTTP_CLOSE;
} else { // And if there aren't too many active HTTP sessions
- mutex_lock( &statusLock );
if ( status.overloaded ) keepAlive = HTTP_CLOSE;
- mutex_unlock( &statusLock );
}
}
if ( method.s != NULL && path.s != NULL ) {
@@ -216,10 +218,13 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
// Don't care if GET or POST
if ( equals( &file, &STR_QUERY ) ) {
ok = handleStatus( sock, permissions, getv, getc, keepAlive );
+ } else if ( equals( &file, &STR_CACHEMAP ) ) {
+ ok = handleCacheMap( sock, permissions, getv, getc, keepAlive );
} else {
ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive );
}
- if ( !ok ) break;
+ if ( !ok )
+ break;
}
// hoff might be beyond end if the client sent another request (burst)
const ssize_t extra = hoff - consumed;
@@ -231,13 +236,11 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
hasName = true;
setThreadName( "HTTP" );
}
- } while (true);
+ } // Loop while more requests
func_return:;
do {
- mutex_lock( &statusLock );
const int curCount = --status.count;
UPDATE_LOADSTATE( curCount );
- mutex_unlock( &statusLock );
} while (0);
}
@@ -258,7 +261,7 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
{
bool ok;
bool stats = false, images = false, clients = false, space = false;
- bool logfile = false, config = false, altservers = false;
+ bool logfile = false, config = false, altservers = false, version = false;
#define SETVAR(var) if ( !var && STRCMP(fields[i].value, #var) ) var = true
for (size_t i = 0; i < fields_num; ++i) {
if ( !equals( &fields[i].name, &STR_Q ) ) continue;
@@ -269,9 +272,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
else SETVAR(logfile);
else SETVAR(config);
else SETVAR(altservers);
+ else SETVAR(version);
}
#undef SETVAR
- if ( ( stats || space ) && !(permissions & ACL_STATS) ) {
+ if ( ( stats || space || version ) && !(permissions & ACL_STATS) ) {
return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access statistics", -1, keepAlive );
}
if ( images && !(permissions & ACL_IMAGE_LIST) ) {
@@ -307,6 +311,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
statisticsJson = json_pack( "{sI}",
"runId", randomRunId );
}
+ if ( version ) {
+ json_object_set_new( statisticsJson, "version", json_string( DNBD3_VERSION_LONG ", built " DNBD3_BUILD_DATE ) );
+ json_object_set_new( statisticsJson, "build", json_string( DNBD3_BUILD ) );
+ }
if ( space ) {
uint64_t spaceTotal = 0, spaceAvail = 0;
file_freeDiskSpace( _basePath, &spaceTotal, &spaceAvail );
@@ -347,6 +355,46 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
return ok;
}
+/**
+ * Reply with the raw cache map of the image selected by request field 'id'.
+ * Requires ACL_IMAGE_LIST in permissions (403 otherwise). Sends 400 if no
+ * usable 'id' field is present and 404 if image_byId() finds no such image.
+ * If the image has no cache map, a single 0xff byte is sent as payload.
+ * @param sock socket the reply is written to
+ * @param permissions ACL bit mask of the requesting client
+ * @param fields parsed request fields
+ * @param fields_num number of entries in fields
+ * @param keepAlive passed through to sendReply()
+ * @return return value of sendReply() for the reply that was sent
+ */
+static bool handleCacheMap(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive)
+{
+	if ( !(permissions & ACL_IMAGE_LIST) ) {
+		return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access image list", -1, keepAlive );
+	}
+	int imgId = -1;
+	static const char one = (char)0xff;
+	for (size_t i = 0; i < fields_num; ++i) {
+		if ( equals( &fields[i].name, &STR_ID ) ) {
+			char *broken;
+			imgId = (int)strtol( fields[i].value.s, &broken, 10 );
+			if ( broken != fields[i].value.s )
+				break; // strtol() consumed something, use this value
+			imgId = -1;
+		}
+	}
+	if ( imgId == -1 )
+		return sendReply( sock, "400 Bad Request", "text/plain", "Missing parameter 'id'", -1, keepAlive );
+	dnbd3_image_t *image = image_byId( imgId );
+	if ( image == NULL )
+		return sendReply( sock, "404 Not found", "text/plain", "Image not found", -1, keepAlive );
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	// Fetch the map size while we still hold the image; image must not be
+	// dereferenced anymore once image_release() has been called
+	const int mapLen = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	image_release( image );
+	int len;
+	const char *map;
+	if ( cache == NULL ) {
+		map = &one;
+		len = 1;
+	} else {
+		_Static_assert( sizeof(const char) == sizeof(_Atomic uint8_t), "Atomic assumption exploded" );
+		map = (const char*)cache->map;
+		len = mapLen;
+	}
+	bool ok = sendReply( sock, "200 OK", "application/octet-stream", map, len, keepAlive );
+	if ( cache != NULL ) {
+		// Drop the cache map reference obtained via ref_get_cachemap()
+		ref_put( &cache->reference );
+	}
+	return ok;
+}
+
static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive)
{
if ( plen == -1 ) plen = strlen( payload );
@@ -364,9 +412,11 @@ static bool sendReply(int sock, const char *status, const char *ctype, const cha
if ( keepAlive == HTTP_CLOSE ) {
// Wait for flush
shutdown( sock, SHUT_WR );
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
+ // Don't wait too long in case other side ignores the shutdown
+ sock_setTimeout( sock, 600 );
while ( read( sock, buffer, sizeof buffer ) > 0 );
return false;
}
@@ -410,7 +460,7 @@ static int getacl(dnbd3_host_t *host)
if ( aclRules[i].bitMask != 0 && aclRules[i].host[aclRules[i].bytes] != ( host->addr[aclRules[i].bytes] & aclRules[i].bitMask ) ) continue;
return aclRules[i].permissions;
}
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
return 0x7fffff;
#else
return 0;
@@ -446,7 +496,7 @@ static void addacl(int argc, char **argv, void *data UNUSED)
*slash++ = '\0';
}
if ( !parse_address( argv[0], &host ) ) goto unlock_end;
- long int bits;
+ long int bits = 0;
if ( slash != NULL ) {
char *last;
bits = strtol( slash, &last, 10 );
diff --git a/src/server/serialize.c b/src/server/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/server/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/server/server.c b/src/server/server.c
index 10ab208..0f75935 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -29,14 +29,18 @@
#include "integrity.h"
#include "threadpool.h"
#include "rpc.h"
+#include "fuse.h"
-#include "../version.h"
-#include "../shared/sockhelper.h"
-#include "../shared/timing.h"
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/timing.h>
#include <signal.h>
#include <getopt.h>
#include <assert.h>
+#include <sys/types.h>
+#include <unistd.h>
#define LONGOPT_CRC4 1000
#define LONGOPT_ASSERT 1001
@@ -45,6 +49,26 @@
#define LONGOPT_SIZE 1004
#define LONGOPT_ERRORMSG 1005
+// Forward typedef so that struct _job can refer to itself via job_t
+typedef struct _job job_t;
+
+// One entry in the timer thread's job list
+struct _job {
+ job_t *next; // Next job in the singly linked list
+ void *(*startRoutine)(void *); // Function handed to threadpool_run() when the job is due
+ void *arg; // Argument handed to threadpool_run() along with startRoutine
+ ticks dueDate; // Point in time at which the job is due next
+ int intervalSecs; // 0 = one-shot (freed after it was run), otherwise job is re-queued using this value
+};
+
+// Head of the job list; read and modified by handlePendingJobs() and queueJobInternal()
+static job_t *jobHead;
+// Freshly added jobs: server_addJob() pushes via compare-exchange, handlePendingJobs() takes the list over
+static _Atomic(job_t *) newJob;
+// Set once the timer thread was created successfully; checked by dnbd3_cleanup()
+static bool hasTimerThread = false;
+static pthread_t timerThread;
+
+// PID and thread id recorded at the start of main(); used by dnbd3_handleSignal2()
+static pid_t mainPid;
+static pthread_t mainThread;
+
+// Upper bound for the timer thread's sleep(); also returned by handlePendingJobs() when no job is queued
+#define DEFAULT_TIMER_TIMEOUT (60)
+
static poll_list_t *listeners = NULL;
/**
@@ -71,15 +95,25 @@ static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data);
static void* server_asyncImageListLoad(void *data);
+static void* timerMainloop(void*);
+
+static int handlePendingJobs(void);
+
+static void queueJobInternal(job_t *job);
+
/**
* Print help text for usage instructions
*/
void dnbd3_printHelp(char *argv_0)
{
- printf( "Version: %s\n\n", VERSION_STRING );
+ printf( "Version: %s\n\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
printf( "Usage: %s [OPTIONS]...\n", argv_0 );
printf( "Start the DNBD3 server\n" );
printf( "-c or --config Configuration directory (default /etc/dnbd3-server/)\n" );
+#ifdef DNBD3_SERVER_FUSE
+ printf( "-m or --mount FUSE mount point\n");
+#endif
printf( "-n or --nodaemon Start server in foreground\n" );
printf( "-b or --bind Local Address to bind to\n" );
printf( "-h or --help Show this help text and quit\n" );
@@ -98,21 +132,31 @@ void dnbd3_printHelp(char *argv_0)
*/
void dnbd3_printVersion()
{
- printf( "Version: %s\n", VERSION_STRING );
+ // Print version and build date to stdout
+ printf( "dnbd3-server version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
+ // Does not return: terminates the process with exit status 0
exit( 0 );
}
/**
* Clean up structs, connections, write out data, then exit
*/
-void dnbd3_cleanup()
+_Noreturn static void dnbd3_cleanup()
{
int retries;
_shutdown = true;
logadd( LOG_INFO, "Cleanup..." );
- if ( listeners != NULL ) sock_destroyPollList( listeners );
+ dfuse_shutdown();
+
+ if ( hasTimerThread ) {
+ pthread_kill( timerThread, SIGINT );
+ thread_join( timerThread, NULL );
+ }
+
+ if ( listeners != NULL ) {
+ sock_destroyPollList( listeners );
+ }
listeners = NULL;
// Kill connection to all clients
@@ -121,9 +165,6 @@ void dnbd3_cleanup()
// Disable threadpool
threadpool_close();
- // Terminate the altserver checking thread
- altservers_shutdown();
-
// Terminate all uplinks
image_killUplinks();
@@ -133,8 +174,7 @@ void dnbd3_cleanup()
// Wait for clients to disconnect
net_waitForAllDisconnected();
- // Watchdog not needed anymore
- debug_locks_stop_watchdog();
+ threadpool_waitEmpty();
// Clean up images
retries = 5;
@@ -159,11 +199,13 @@ int main(int argc, char *argv[])
char *paramCreate = NULL;
char *bindAddress = NULL;
char *errorMsg = NULL;
+ char *mountDir = NULL;
int64_t paramSize = -1;
int paramRevision = -1;
- static const char *optString = "b:c:d:hnv?";
+ static const char *optString = "b:c:m:d:hnv?";
static const struct option longOpts[] = {
{ "config", required_argument, NULL, 'c' },
+ { "mount", required_argument, NULL, 'm' },
{ "nodaemon", no_argument, NULL, 'n' },
{ "reload", no_argument, NULL, 'r' },
{ "help", no_argument, NULL, 'h' },
@@ -178,6 +220,18 @@ int main(int argc, char *argv[])
{ 0, 0, 0, 0 }
};
+ log_init();
+
+ /* set proper output stream for AFL */
+#ifdef DNBD3_SERVER_AFL
+ if ( log_setConsoleOutputStream(stderr) < 0 ) {
+ logadd( LOG_ERROR, "Failed to set output stream for AFL to stderr" );
+ exit( EXIT_FAILURE );
+ }
+#endif
+
+ mainPid = getpid();
+ mainThread = pthread_self();
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
while ( opt != -1 ) {
@@ -185,6 +239,13 @@ int main(int argc, char *argv[])
case 'c':
_configDir = strdup( optarg );
break;
+ case 'm':
+#ifndef DNBD3_SERVER_FUSE
+ fprintf( stderr, "FUSE support not enabled at build time.\n" );
+ return 8;
+#endif
+ mountDir = strdup( optarg );
+ break;
case 'n':
demonize = 0;
break;
@@ -201,6 +262,15 @@ int main(int argc, char *argv[])
case LONGOPT_CRC4:
return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE;
case LONGOPT_ASSERT:
+ printf( "Now leaking memory:\n" );
+ char *bla = malloc( 10 );
+ bla[2] = 3;
+ bla = NULL;
+ printf( "Testing use after free:\n" );
+ char *test = malloc( 10 );
+ test[0] = 1;
+ free( (void*)test );
+ test[1] = 2;
printf( "Testing a failing assertion:\n" );
assert( 4 == 5 );
printf( "Assertion 4 == 5 seems to hold. ;-)\n" );
@@ -221,6 +291,7 @@ int main(int argc, char *argv[])
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
}
+
// Load general config
if ( _configDir == NULL ) _configDir = strdup( "/etc/dnbd3-server" );
@@ -233,9 +304,7 @@ int main(int argc, char *argv[])
timing_setBase();
timing_get( &startupTime );
-#ifdef AFL_MODE
- // ###### AFL
- //
+#ifdef DNBD3_SERVER_AFL
image_serverStartup();
net_init();
uplink_globalsInit();
@@ -259,9 +328,7 @@ int main(int argc, char *argv[])
net_handleNewConnection( dnbd3_client );
exit( 0 );
}
- //
- // ###### AFL END
-#endif
+#endif /* DNBD3_SERVER_AFL */
// One-shots first:
@@ -273,7 +340,10 @@ int main(int argc, char *argv[])
// No one-shot detected, normal server operation or errormsg serving
if ( demonize ) {
logadd( LOG_INFO, "Forking into background, see log file for further information" );
- daemon( 1, 0 );
+ if ( daemon( 0, 0 ) == -1 ) {
+ logadd( LOG_ERROR, "Could not daemon(): errno=%d", errno );
+ exit( 1 );
+ }
}
if ( errorMsg != NULL ) {
setupNetwork( bindAddress );
@@ -297,22 +367,25 @@ int main(int argc, char *argv[])
net_init();
uplink_globalsInit();
rpc_init();
- logadd( LOG_INFO, "DNBD3 server starting.... Machine type: " ENDIAN_MODE );
+ if ( mountDir != NULL && !dfuse_init( "-oallow_other", mountDir ) ) {
+ logadd( LOG_ERROR, "Cannot mount fuse directory to %s", mountDir );
+ dnbd3_cleanup();
+ return EXIT_FAILURE;
+ }
+ logadd( LOG_INFO, "DNBD3 server starting...." );
+ logadd( LOG_INFO, "Machine type: " DNBD3_ENDIAN_MODE );
+ logadd( LOG_INFO, "Build Type: %s", DNBD3_BUILD );
+ logadd( LOG_INFO, "Version: %s, built %s", DNBD3_VERSION_LONG, DNBD3_BUILD_DATE );
if ( altservers_load() < 0 ) {
logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
}
-#ifdef _DEBUG
- debug_locks_start_watchdog();
-#endif
-
// setup signal handler
- struct sigaction sa;
- memset( &sa, 0, sizeof(sa) );
- sa.sa_sigaction = dnbd3_handleSignal2;
- sa.sa_flags = SA_SIGINFO;
- //sa.sa_mask = ;
+ struct sigaction sa = {
+ .sa_sigaction = dnbd3_handleSignal2,
+ .sa_flags = SA_SIGINFO,
+ };
sigaction( SIGTERM, &sa, NULL );
sigaction( SIGINT, &sa, NULL );
sigaction( SIGUSR1, &sa, NULL );
@@ -342,10 +415,15 @@ int main(int argc, char *argv[])
// Initialize thread pool
if ( !threadpool_init( 8 ) ) {
logadd( LOG_ERROR, "Could not init thread pool!\n" );
+ dnbd3_cleanup();
exit( EXIT_FAILURE );
}
- logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING );
+ logadd( LOG_INFO, "Server is ready." );
+
+ if ( thread_create( &timerThread, NULL, &timerMainloop, NULL ) == 0 ) {
+ hasTimerThread = true;
+ }
// +++++++++++++++++++++++++++++++++++++++++++++++++++ main loop
struct sockaddr_storage client;
@@ -357,7 +435,7 @@ int main(int argc, char *argv[])
if ( sigReload ) {
sigReload = false;
logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" );
- threadpool_run( &server_asyncImageListLoad, NULL );
+ threadpool_run( &server_asyncImageListLoad, NULL, "IMAGE_RELOAD" );
}
if ( sigLogCycle ) {
sigLogCycle = false;
@@ -370,7 +448,7 @@ int main(int argc, char *argv[])
//
len = sizeof(client);
fd = sock_accept( listeners, &client, &len );
- if ( fd < 0 ) {
+ if ( fd == -1 ) {
const int err = errno;
if ( err == EINTR || err == EAGAIN ) continue;
logadd( LOG_ERROR, "Client accept failure (err=%d)", err );
@@ -384,7 +462,7 @@ int main(int argc, char *argv[])
continue;
}
- if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) {
+ if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client, "CLIENT" ) ) {
logadd( LOG_ERROR, "Could not start thread for new connection." );
free( dnbd3_client );
continue;
@@ -474,8 +552,17 @@ static void dnbd3_handleSignal(int signum)
+/**
+ * sigaction handler (SA_SIGINFO). Signals whose si_pid equals mainPid are ignored.
+ * For all others the siginfo is copied to lastSignal; if the handler runs on a
+ * thread other than mainThread (and si_pid is not 0), the signal is re-sent to
+ * mainThread. dnbd3_handleSignal() is only called when running on mainThread.
+ * NOTE(review): mainPid is recorded in main() before daemon() is called -- confirm
+ * that the si_pid comparison still behaves as intended after daemonizing.
+ */
static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
{
- memcpy( &lastSignal, info, sizeof(siginfo_t) );
- dnbd3_handleSignal( signum );
+ if ( info->si_pid != mainPid ) { // Source is not this process
+ memcpy( &lastSignal, info, sizeof(siginfo_t) ); // Copy signal info
+ if ( info->si_pid != 0 && !pthread_equal( pthread_self(), mainThread ) ) {
+ pthread_kill( mainThread, info->si_signo ); // And relay signal if we're not the main thread
+ }
+ // Source is not this process -- only then do we honor signals
+ if ( pthread_equal( pthread_self(), mainThread ) ) {
+ // Signal received by main thread -- handle
+ dnbd3_handleSignal( signum );
+ }
+ }
}
uint32_t dnbd3_serverUptime()
@@ -493,3 +580,85 @@ static void* server_asyncImageListLoad(void *data UNUSED)
return NULL;
}
+/**
+ * Entry point of the timer thread. Until _shutdown is set, it alternates
+ * between handling pending jobs and sleeping for the timeout returned by
+ * handlePendingJobs(), clamped to the range 1..DEFAULT_TIMER_TIMEOUT seconds.
+ */
+static void* timerMainloop(void* stuff UNUSED)
+{
+	setThreadName( "timer" );
+	for ( ;; ) {
+		if ( _shutdown )
+			break;
+		// Handle jobs/timer events; result is the timeout until the next event
+		int secs = handlePendingJobs();
+		if ( secs < 1 ) {
+			secs = 1;
+		}
+		if ( secs > DEFAULT_TIMER_TIMEOUT ) {
+			secs = DEFAULT_TIMER_TIMEOUT;
+		}
+		sleep( secs );
+	}
+	logadd( LOG_DEBUG1, "Timer thread done" );
+	return NULL;
+}
+
+/**
+ * Hand all jobs that are due to the thread pool, re-queue repeating jobs,
+ * and sort jobs added via server_addJob() into the job list.
+ * @return DEFAULT_TIMER_TIMEOUT if the job list is empty afterwards, otherwise
+ * timing_diff() between now and the dueDate of the first job in the list
+ */
+static int handlePendingJobs(void)
+{
+ declare_now;
+ job_t *todo, **temp, *old;
+ int diff;
+ todo = jobHead;
+ // Walk the list until the first job that is not due yet (or the end of the list)
+ for ( temp = &todo; *temp != NULL; temp = &(*temp)->next ) {
+ diff = (int)timing_diff( &now, &(*temp)->dueDate );
+ if ( diff > 0 ) // Found one that's in the future
+ break;
+ }
+ jobHead = *temp; // Make it list head
+ *temp = NULL; // Split off part before that
+ // todo now holds the due jobs (possibly none), jobHead the remaining ones
+ while ( todo != NULL ) {
+ // NOTE(review): the return value of threadpool_run() is not checked here
+ threadpool_run( todo->startRoutine, todo->arg, "TIMER_TASK" );
+ old = todo;
+ todo = todo->next; // Advance first, queueJobInternal() below overwrites old->next
+ if ( old->intervalSecs == 0 ) {
+ free( old ); // oneshot
+ } else {
+ timing_set( &old->dueDate, &now, old->intervalSecs );
+ queueJobInternal( old ); // repeated
+ }
+ }
+ // See if any new jobs have been queued
+ while ( newJob != NULL ) {
+ todo = newJob;
+ // NULL should never happen since we're the only consumer
+ assert( todo != NULL );
+ // Take over the whole list by swapping in NULL; retry if the head changed in between
+ if ( !atomic_compare_exchange_weak( &newJob, &todo, NULL ) )
+ continue;
+ // Sort every taken-over job into the list at jobHead
+ do {
+ old = todo;
+ todo = todo->next;
+ queueJobInternal( old );
+ } while ( todo != NULL );
+ }
+ // Return new timeout
+ if ( jobHead == NULL )
+ return DEFAULT_TIMER_TIMEOUT;
+ return (int)timing_diff( &now, &jobHead->dueDate );
+}
+
+/**
+ * Link job into the list at jobHead. It is placed in front of the first
+ * entry for which timing_1le2( job's dueDate, entry's dueDate ) holds, or
+ * at the end of the list if there is no such entry.
+ */
+static void queueJobInternal(job_t *job)
+{
+	assert( job != NULL );
+	job_t **slot = &jobHead;
+	while ( *slot != NULL && !timing_1le2( &job->dueDate, &(*slot)->dueDate ) ) {
+		slot = &(*slot)->next;
+	}
+	// slot now points at the link that has to refer to the new job
+	job->next = *slot;
+	*slot = job;
+}
+
+/**
+ * Queue a job for the timer thread. The job is pushed onto the newJob list,
+ * from where handlePendingJobs() sorts it into the job list.
+ * If no memory can be allocated, an error is logged and the job is not queued.
+ * @param startRoutine function handed to the thread pool when the job is due
+ * @param arg argument handed over along with startRoutine
+ * @param delaySecs passed to timing_set() together with the current time to
+ *        determine the first due date
+ * @param intervalSecs 0 for a one-shot job, otherwise the job is re-queued
+ *        after each run using this value
+ */
+void server_addJob(void *(*startRoutine)(void *), void *arg, int delaySecs, int intervalSecs)
+{
+	declare_now;
+	job_t *new = malloc( sizeof(*new) );
+	if ( new == NULL ) {
+		logadd( LOG_ERROR, "Could not allocate memory for new job, not queueing it" );
+		return;
+	}
+	new->startRoutine = startRoutine;
+	new->arg = arg;
+	new->intervalSecs = intervalSecs;
+	timing_set( &new->dueDate, &now, delaySecs );
+	// Push onto the head of newJob; retry until no other thread changed the head in between
+	for ( ;; ) {
+		new->next = newJob;
+		if ( atomic_compare_exchange_weak( &newJob, &new->next, new ) )
+			break;
+	}
+}
+
diff --git a/src/server/server.h b/src/server/server.h
index bab8421..e93d8f5 100644
--- a/src/server/server.h
+++ b/src/server/server.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -22,10 +22,10 @@
#define SERVER_H_
#include "globals.h"
-#include "../types.h"
+#include <dnbd3/types.h>
-void dnbd3_cleanup();
uint32_t dnbd3_serverUptime();
+void server_addJob(void *(*startRoutine)(void *), void *arg, int delaySecs, int intervalSecs);
#if !defined(_FILE_OFFSET_BITS) || _FILE_OFFSET_BITS != 64
#error Please set _FILE_OFFSET_BITS to 64 in your makefile/configuration
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index dac0980..a21bd0d 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -4,26 +4,31 @@
#include "locks.h"
typedef struct _entry_t {
- struct _entry_t *next;
pthread_t thread;
dnbd3_signal_t* signal;
void *(*startRoutine)(void *);
void * arg;
+ const char *name;
} entry_t;
static void *threadpool_worker(void *entryPtr);
static pthread_attr_t threadAttrs;
-
-static int maxIdleThreads = -1;
-static entry_t *pool = NULL;
-static pthread_mutex_t poolLock;
+static atomic_int maxIdleThreads = -1;
+static _Atomic(entry_t *) *pool = NULL;
+static atomic_int activeThreads = 0;
bool threadpool_init(int maxIdle)
{
- if ( maxIdle < 0 || maxIdleThreads >= 0 ) return false;
- mutex_init( &poolLock );
- maxIdleThreads = maxIdle;
+ if ( maxIdle < 0 )
+ return false;
+ int exp = -1;
+ if ( !atomic_compare_exchange_strong( &maxIdleThreads, &exp, maxIdle ) )
+ return false;
+ pool = malloc( maxIdle * sizeof(*pool) );
+ for ( int i = 0; i < maxIdle; ++i ) {
+ atomic_init( &pool[i], NULL );
+ }
pthread_attr_init( &threadAttrs );
pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
return true;
@@ -31,28 +36,48 @@ bool threadpool_init(int maxIdle)
void threadpool_close()
{
- _shutdown = true;
- if ( maxIdleThreads < 0 ) return;
- mutex_lock( &poolLock );
- maxIdleThreads = -1;
- entry_t *ptr = pool;
- while ( ptr != NULL ) {
- entry_t *current = ptr;
- ptr = ptr->next;
- signal_call( current->signal );
+ // Set the idle limit to -1 and fetch its previous value in one atomic step
+ int max = atomic_exchange( &maxIdleThreads, -1 );
+ if ( max <= 0 )
+ return;
+ // Take every idle worker out of its slot and signal it
+ for ( int i = 0; i < max; ++i ) {
+ entry_t *cur = pool[i];
+ // Only signal the worker if we are the one who emptied the slot
+ if ( cur != NULL && atomic_compare_exchange_strong( &pool[i], &cur, NULL ) ) {
+ signal_call( cur->signal );
+ }
}
- mutex_unlock( &poolLock );
- mutex_destroy( &poolLock );
}
-bool threadpool_run(void *(*startRoutine)(void *), void *arg)
+void threadpool_waitEmpty()
{
- mutex_lock( &poolLock );
- entry_t *entry = pool;
- if ( entry != NULL ) pool = entry->next;
- mutex_unlock( &poolLock );
- if ( entry == NULL ) {
- entry = (entry_t*)malloc( sizeof(entry_t) );
+ if ( activeThreads == 0 )
+ return;
+ do {
+ sleep( 1 );
+ logadd( LOG_INFO, "Threadpool: %d threads still active", (int)activeThreads );
+ } while ( activeThreads != 0 );
+}
+
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name)
+{
+ if ( unlikely( _shutdown ) ) {
+ logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
+ return false;
+ }
+#ifdef DEBUG
+ if ( unlikely( startRoutine == NULL ) ) {
+ logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
+ return false; // Or bail out!?
+ }
+#endif
+ entry_t *entry = NULL;
+ for ( int i = 0; i < maxIdleThreads; ++i ) {
+ entry = atomic_exchange( &pool[i], NULL );
+ if ( entry != NULL ) {
+ break;
+ }
+ }
+ if ( unlikely( entry == NULL ) ) {
+ entry = malloc( sizeof(entry_t) );
if ( entry == NULL ) {
logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" );
return false;
@@ -64,15 +89,17 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
return false;
}
if ( 0 != thread_create( &(entry->thread), &threadAttrs, threadpool_worker, (void*)entry ) ) {
- logadd( LOG_WARNING, "Could not create new thread for thread pool\n" );
+ logadd( LOG_WARNING, "Could not create new thread for thread pool (%d active)\n", (int)activeThreads );
signal_close( entry->signal );
free( entry );
return false;
}
+ activeThreads++;
}
- entry->next = NULL;
entry->startRoutine = startRoutine;
entry->arg = arg;
+ entry->name = name;
+ atomic_thread_fence( memory_order_release );
signal_call( entry->signal );
return true;
}
@@ -84,43 +111,50 @@ static void *threadpool_worker(void *entryPtr)
{
blockNoncriticalSignals();
entry_t *entry = (entry_t*)entryPtr;
+ int ret;
for ( ;; ) {
+keep_going:;
// Wait for signal from outside that we have work to do
- int ret = signal_clear( entry->signal );
- if ( _shutdown ) break;
- if ( ret > 0 ) {
- if ( entry->startRoutine == NULL ) {
- logadd( LOG_DEBUG1, "Worker woke up but has no work to do!" );
- continue;
- }
- // Start assigned work
- (*entry->startRoutine)( entry->arg );
- // Reset vars for safety
- entry->startRoutine = NULL;
- entry->arg = NULL;
- if ( _shutdown ) break;
- // Put thread back into pool if there are less than maxIdleThreds threads, just die otherwise
- int threadCount = 0;
- mutex_lock( &poolLock );
- entry_t *ptr = pool;
- while ( ptr != NULL ) {
- threadCount++;
- ptr = ptr->next;
- }
- if ( threadCount >= maxIdleThreads ) {
- mutex_unlock( &poolLock );
- break;
- }
- entry->next = pool;
- pool = entry;
- mutex_unlock( &poolLock );
- setThreadName( "[pool]" );
- } else {
+ ret = signal_clear( entry->signal );
+ atomic_thread_fence( memory_order_acquire );
+ if ( _shutdown )
+ break;
+ if ( ret <= 0 ) {
logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
+ continue;
+ }
+#ifdef DEBUG
+ if ( entry->startRoutine == NULL ) {
+ logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
+ exit( 1 );
+ }
+ if ( entry->name != NULL ) {
+ setThreadName( entry->name );
+ }
+#endif
+ // Start assigned work
+ (*entry->startRoutine)( entry->arg );
+ // Reset vars for safety
+ entry->startRoutine = NULL;
+ entry->arg = NULL;
+ atomic_thread_fence( memory_order_release );
+ if ( _shutdown )
+ break;
+ // Put thread back into pool
+ setThreadName( "[pool]" );
+ for ( int i = 0; i < maxIdleThreads; ++i ) {
+ entry_t *exp = NULL;
+ if ( atomic_compare_exchange_weak( &pool[i], &exp, entry ) ) {
+ goto keep_going;
+ }
}
+ // Reaching here means pool is full; just let the thread exit
+ break;
}
+ setThreadName( "[dead]" );
signal_close( entry->signal );
free( entry );
+ activeThreads--;
return NULL;
}
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
index 15dd151..c30d44f 100644
--- a/src/server/threadpool.h
+++ b/src/server/threadpool.h
@@ -1,7 +1,7 @@
#ifndef _THREADPOOL_H_
#define _THREADPOOL_H_
-#include "../types.h"
+#include <dnbd3/types.h>
/**
* Initialize the thread pool. This must be called before using
@@ -18,12 +18,18 @@ bool threadpool_init(int maxIdleThreadCount);
void threadpool_close();
/**
+ * Block until all threads spawned have exited
+ */
+void threadpool_waitEmpty();
+
+/**
* Run a thread using the thread pool.
* @param startRoutine function to run in new thread
* @param arg argument to pass to thread
+ * @param name STRING CONSTANT (literal) for debugging purposes
* @return true if thread was started
*/
-bool threadpool_run(void *(*startRoutine)(void *), void *arg);
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name);
#endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 682b986..8a83124 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -3,10 +3,13 @@
#include "locks.h"
#include "image.h"
#include "altservers.h"
-#include "../shared/sockhelper.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../shared/crc32.h"
+#include "net.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/crc32.h>
+#include "threadpool.h"
+#include "reference.h"
#include <assert.h>
#include <inttypes.h>
@@ -15,25 +18,35 @@
#include <unistd.h>
#include <stdatomic.h>
+static const uint8_t HOP_FLAG_BGR = 0x80;
+static const uint8_t HOP_FLAG_PREFETCH = 0x40;
#define FILE_BYTES_PER_MAP_BYTE ( DNBD3_BLOCK_SIZE * 8 )
#define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE )
#define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) )
-#define REP_NONE ( (uint64_t)0xffffffffffffffff )
-
static atomic_uint_fast64_t totalBytesReceived = 0;
+typedef struct {
+ uint64_t start, end, handle;
+} req_t;
+
+static void cancelAllRequests(dnbd3_uplink_t *uplink);
+static void freeUplinkStruct(ref *ref);
static void* uplink_mainloop(void *data);
-static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly);
-static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int lastBlockIndex);
-static void uplink_handleReceive(dnbd3_connection_t *link);
-static int uplink_sendKeepalive(const int fd);
-static void uplink_addCrc32(dnbd3_connection_t *uplink);
-static void uplink_sendReplicationRequest(dnbd3_connection_t *link);
-static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force);
-static bool uplink_saveCacheMap(dnbd3_connection_t *link);
-static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link);
-static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew);
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly);
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
+static void handleReceive(dnbd3_uplink_t *uplink);
+static bool sendKeepalive(dnbd3_uplink_t *uplink);
+static void requestCrc32List(dnbd3_uplink_t *uplink);
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink);
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink);
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle);
+static bool uplink_requestInternal(dnbd3_uplink_t *uplink, void *data, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
+
+#define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) )
// ############ uplink connection handling
@@ -54,56 +67,73 @@ uint64_t uplink_getTotalBytesReceived()
bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version)
{
if ( !_isProxy || _shutdown ) return false;
- dnbd3_connection_t *link = NULL;
assert( image != NULL );
+ if ( sock == -1 && !altservers_imageHasAltServers( image->name ) )
+ return false; // Nothing to do
mutex_lock( &image->lock );
- if ( image->uplink != NULL && !image->uplink->shutdown ) {
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( uplink != NULL ) {
mutex_unlock( &image->lock );
- if ( sock >= 0 ) close( sock );
- return true; // There's already an uplink, so should we consider this success or failure?
+ if ( sock != -1 ) {
+ close( sock );
+ }
+ ref_put( &uplink->reference );
+ return true; // There's already an uplink
}
- if ( image->cache_map == NULL ) {
+ if ( image->ref_cacheMap == NULL ) {
logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
goto failure;
}
- link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) );
- mutex_init( &link->queueLock );
- mutex_init( &link->rttLock );
- mutex_init( &link->sendMutex );
- link->image = image;
- link->bytesReceived = 0;
- link->idleTime = 0;
- link->queueLen = 0;
- mutex_lock( &link->sendMutex );
- link->fd = -1;
- mutex_unlock( &link->sendMutex );
- link->cacheFd = -1;
- link->signal = NULL;
- link->replicationHandle = REP_NONE;
- mutex_lock( &link->rttLock );
- link->cycleDetected = false;
- if ( sock >= 0 ) {
- link->betterFd = sock;
- link->betterServer = *host;
- link->rttTestResult = RTT_DOCHANGE;
- link->betterVersion = version;
+ uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
+ // Start with one reference for the uplink thread. We'll return it when the thread finishes
+ ref_init( &uplink->reference, freeUplinkStruct, 1 );
+ mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
+ mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
+ mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
+ uplink->image = image;
+ uplink->bytesReceived = 0;
+ uplink->bytesReceivedLastSave = 0;
+ uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90;
+ uplink->queue = NULL;
+ uplink->queueLen = 0;
+ uplink->cacheFd = -1;
+ uplink->signal = signal_new();
+ if ( uplink->signal == NULL ) {
+ logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." );
+ goto failure;
+ }
+ mutex_lock( &uplink->rttLock );
+ mutex_lock( &uplink->sendMutex );
+ uplink->current.fd = -1;
+ mutex_unlock( &uplink->sendMutex );
+ uplink->cycleDetected = false;
+ image->problem.uplink = true;
+ image->problem.write = true;
+ image->problem.queue = false;
+ if ( sock != -1 ) {
+ uplink->better.fd = sock;
+ int index = altservers_hostToIndex( host );
+ uplink->better.index = index == -1 ? 0 : index; // Prevent invalid array access
+ uplink->rttTestResult = RTT_DOCHANGE;
+ uplink->better.version = version;
} else {
- link->betterFd = -1;
- link->rttTestResult = RTT_IDLE;
+ uplink->better.fd = -1;
+ uplink->rttTestResult = RTT_IDLE;
}
- mutex_unlock( &link->rttLock );
- link->recvBufferLen = 0;
- link->shutdown = false;
- if ( 0 != thread_create( &(link->thread), NULL, &uplink_mainloop, (void *)link ) ) {
+ mutex_unlock( &uplink->rttLock );
+ uplink->recvBufferLen = 0;
+ uplink->shutdown = false;
+ if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) {
logadd( LOG_ERROR, "Could not start thread for new uplink." );
goto failure;
}
+ ref_setref( &image->uplinkref, &uplink->reference );
mutex_unlock( &image->lock );
return true;
failure: ;
- if ( link != NULL ) {
- free( link );
- link = image->uplink = NULL;
+ if ( uplink != NULL ) {
+ image->users++; // Expected by freeUplinkStruct()
+ ref_put( &uplink->reference ); // The ref for the uplink thread that never was
}
mutex_unlock( &image->lock );
return false;
@@ -114,201 +144,398 @@ failure: ;
* Calling it multiple times, even concurrently, will
* not break anything.
*/
-void uplink_shutdown(dnbd3_image_t *image)
+bool uplink_shutdown(dnbd3_image_t *image)
{
- bool join = false;
- pthread_t thread;
assert( image != NULL );
mutex_lock( &image->lock );
- if ( image->uplink == NULL ) {
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( uplink == NULL ) {
mutex_unlock( &image->lock );
- return;
+ return true;
}
- dnbd3_connection_t * const uplink = image->uplink;
mutex_lock( &uplink->queueLock );
- if ( !uplink->shutdown ) {
- uplink->shutdown = true;
+ bool exp = false;
+ if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
+ image->users++; // Prevent free while uplink shuts down
signal_call( uplink->signal );
- thread = uplink->thread;
- join = true;
+ } else {
+ logadd( LOG_ERROR, "This will never happen. '%s:%d'", PIMG(image) );
}
+ cancelAllRequests( uplink );
+ ref_setref( &image->uplinkref, NULL );
mutex_unlock( &uplink->queueLock );
- bool wait = image->uplink != NULL;
+ bool retval = ( exp && image->users == 0 );
+ ref_put( &uplink->reference );
mutex_unlock( &image->lock );
- if ( join ) thread_join( thread, NULL );
- while ( wait ) {
- usleep( 5000 );
- mutex_lock( &image->lock );
- wait = image->uplink != NULL && image->uplink->shutdown;
- mutex_unlock( &image->lock );
+ return retval;
+}
+
+/**
+ * Cancel all requests of this uplink.
+ * Walks the whole request queue; for every client attached to an entry the
+ * client's callback is invoked with ( data, handle, 0, 0, NULL ), then the
+ * client node and the queue entry itself are freed. Afterwards the queue
+ * pointer is NULL, queueLen is 0 and the image's problem.queue flag is cleared.
+ * HOLD QUEUE LOCK WHILE CALLING
+ */
+static void cancelAllRequests(dnbd3_uplink_t *uplink)
+{
+ dnbd3_queue_entry_t *it = uplink->queue;
+ while ( it != NULL ) {
+ dnbd3_queue_client_t *cit = it->clients;
+ while ( cit != NULL ) {
+ (*cit->callback)( cit->data, cit->handle, 0, 0, NULL ); // zero/NULL arguments presumably tell the client that no data will arrive - confirm against the uplink_callback contract
+ dnbd3_queue_client_t *next = cit->next; // read the successor before the node is freed
+ free( cit );
+ cit = next;
+ }
+ dnbd3_queue_entry_t *next = it->next; // same pattern for the outer list
+ free( it );
+ it = next;
}
+ uplink->queue = NULL;
+ uplink->queueLen = 0;
+ uplink->image->problem.queue = false; // an empty queue cannot be over the length threshold
+}
+
+static void freeUplinkStruct(ref *ref) // Tears down and frees the uplink that embeds the given ref; presumably the release callback run when the last reference is dropped - confirm where it is registered
+{
+ dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference); // ref is the 'reference' member inside the uplink struct
+ logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) );
+ assert( uplink->queueLen == 0 ); // pending requests are expected to have been cancelled before this point
+ if ( uplink->signal != NULL ) {
+ signal_close( uplink->signal );
+ }
+ if ( uplink->current.fd != -1 ) { // -1 means "no socket"
+ close( uplink->current.fd );
+ uplink->current.fd = -1;
+ }
+ if ( uplink->better.fd != -1 ) { // candidate connection that was never switched to
+ close( uplink->better.fd );
+ uplink->better.fd = -1;
+ }
+ mutex_destroy( &uplink->queueLock );
+ mutex_destroy( &uplink->rttLock );
+ mutex_destroy( &uplink->sendMutex );
+ free( uplink->recvBuffer );
+ uplink->recvBuffer = NULL;
+ if ( uplink->cacheFd != -1 ) {
+ close( uplink->cacheFd );
+ }
+ // Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code
+ // of the uplink thread, depending on who set the uplink->shutdown flag. (Or uplink_init if that failed)
+ image_release( uplink->image );
+ free( uplink ); // !!! uplink must not be touched after this line
}
/**
* Remove given client from uplink request queue
* Locks on: uplink.queueLock
*/
-void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client)
+void uplink_removeEntry(dnbd3_uplink_t *uplink, void *data, uplink_callback callback)
{
mutex_lock( &uplink->queueLock );
- for (int i = uplink->queueLen - 1; i >= 0; --i) {
- if ( uplink->queue[i].client == client ) {
- uplink->queue[i].client = NULL;
- uplink->queue[i].status = ULR_FREE;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; ) {
+ if ( (**cit).data == data && (**cit).callback == callback ) {
+ (*(**cit).callback)( (**cit).data, (**cit).handle, 0, 0, NULL );
+ dnbd3_queue_client_t *entry = *cit;
+ *cit = (**cit).next;
+ free( entry );
+ } else {
+ cit = &(**cit).next;
+ }
}
- if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--;
}
mutex_unlock( &uplink->queueLock );
}
/**
- * Request a chunk of data through an uplink server
- * Locks on: image.lock, uplink.queueLock
+ * Called from a client (proxy) connection to request a missing part of the image.
+ * The caller has made sure that the range is actually missing.
*/
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+bool uplink_requestClient(dnbd3_client_t *client, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
{
- if ( client == NULL || client->image == NULL ) return false;
- if ( length > (uint32_t)_maxPayload ) {
- logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
+ assert( client != NULL && callback != NULL );
+ if ( ( hops & 0x3f ) > 60 ) { // This is just silly
+ logadd( LOG_WARNING, "Refusing to relay a request that has > 60 hops" );
return false;
}
- mutex_lock( &client->image->lock );
- if ( client->image->uplink == NULL ) {
- mutex_unlock( &client->image->lock );
- logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+ dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
+ if ( unlikely( uplink == NULL ) ) {
+ uplink_init( client->image, -1, NULL, -1 );
+ uplink = ref_get_uplink( &client->image->uplinkref );
+ if ( uplink == NULL ) {
+ logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+ return false;
+ }
+ }
+ // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
+ // This might be a false positive if there are multiple instances running on the same host (IP)
+ bool ret;
+ if ( hops > 1 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
+ uplink->cycleDetected = true;
+ signal_call( uplink->signal );
+ logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
+ ret = false;
+ } else {
+ ret = uplink_requestInternal( uplink, (void*)client, callback, handle, start, length, hops );
+ }
+ ref_put( &uplink->reference );
+ return ret;
+}
+
+/**
+ * Called by integrated fuse module
+ * Looks up the image's uplink through image->uplinkref. If there is none,
+ * uplink_init( image, -1, NULL, -1 ) is called and the lookup is retried once.
+ * The request is then passed to uplink_requestInternal with a hop count of 0.
+ * Returns false if no uplink could be obtained, otherwise whatever
+ * uplink_requestInternal returned.
+ */
+bool uplink_request(dnbd3_image_t *image, void *data, uplink_callback callback,
+ uint64_t handle, uint64_t start, uint32_t length)
+{
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( unlikely( uplink == NULL ) ) {
+ uplink_init( image, -1, NULL, -1 ); // return value ignored; success is detected by the lookup below
+ uplink = ref_get_uplink( &image->uplinkref );
+ if ( uplink == NULL ) {
+ logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+ return false;
+ }
+ }
+ bool ret = uplink_requestInternal( uplink, data, callback, handle, start, length, 0 );
+ ref_put( &uplink->reference ); // pairs with the successful ref_get_uplink above
+ return ret;
+}
+
+static void extendRequest(uint64_t start, uint64_t *end, const dnbd3_image_t *image, uint32_t wanted) // Grows the range [start, *end) to 'wanted' bytes if it is shorter; only *end is modified
+{
+ uint32_t length = (uint32_t)( *end - start );
+ if ( length >= wanted )
+ return; // already large enough, *end stays untouched (no rounding applied on this path)
+ length = wanted;
+ if ( unlikely( _backgroundReplication == BGR_HASHBLOCK
+ && *end / HASH_BLOCK_SIZE != (start + length) / HASH_BLOCK_SIZE ) ) {
+ // Don't extend across hash-block border in this mode
+ *end = ( start + length ) & ~( HASH_BLOCK_SIZE - 1 ); // round the new end down to a hash-block boundary; mask assumes HASH_BLOCK_SIZE is a power of two - confirm
+ } else {
+ *end = start + length;
+ }
+ if ( unlikely( *end > image->virtualFilesize ) ) {
+ *end = image->virtualFilesize; // never extend past the end of the image
+ }
+ *end = ( *end + DNBD3_BLOCK_SIZE - 1 ) & ~( DNBD3_BLOCK_SIZE - 1 ); // round up to a multiple of DNBD3_BLOCK_SIZE (assumed power of two - confirm)
+ //logadd( LOG_DEBUG2, "Extended %"PRIx64" from %"PRIx64" to %"PRIx64, start, end, req.end );
+}
+
+static bool requestBlock(dnbd3_uplink_t *uplink, req_t *req, uint8_t hops) // Sends one block request for [req->start, req->end) on the current uplink socket; the caller in uplink_requestInternal holds uplink->sendMutex around this call
+{
+ if ( uplink->current.fd == -1 )
+ return false; // no socket open, nothing can be sent
+ return dnbd3_get_block( uplink->current.fd, req->start,
+ (uint32_t)( req->end - req->start ), req->handle,
+ COND_HOPCOUNT( uplink->current.version, hops ) ); // hop count is passed through COND_HOPCOUNT together with the server's version
+}
+
+/**
+ * Request a chunk of data through an uplink server. uplink must be non-NULL; if data is non-NULL, callback must be non-NULL too.
+ * If callback is NULL, this is assumed to be a background replication request.
+ * Locks on: uplink.queueLock, uplink.sendMutex
+ */
+static bool uplink_requestInternal(dnbd3_uplink_t *uplink, void *data, uplink_callback callback,
+ uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+{
+ assert( uplink != NULL );
+ assert( data == NULL || callback != NULL );
+ if ( ( hops & HOP_FLAG_BGR ) // This is a background replication request
+ && _backgroundReplication != BGR_FULL ) { // Deny if we're not doing BGR
+ // TODO: Allow BGR_HASHBLOCK too, but only if hash block isn't completely empty
+ logadd( LOG_DEBUG2, "Dopping client because of BGR policy" );
return false;
}
- dnbd3_connection_t * const uplink = client->image->uplink;
if ( uplink->shutdown ) {
- mutex_unlock( &client->image->lock );
logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
return false;
}
- // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
- // This might be a false positive if there are multiple instances running on the same host (IP)
- if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) {
- mutex_unlock( &client->image->lock );
- logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
- mutex_lock( &uplink->rttLock );
- uplink->cycleDetected = true;
- mutex_unlock( &uplink->rttLock );
- signal_call( uplink->signal );
+ if ( length > (uint32_t)_maxPayload ) {
+ logadd( LOG_WARNING, "UPLINK: Cannot relay request; length of %" PRIu32 " exceeds maximum payload",
+ length );
return false;
}
- int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
- int existingType = -1; // ULR_* type of existing request
- int i;
- int freeSlot = -1;
- bool requestLoop = false;
- const uint64_t end = start + length;
+ hops++;
+ if ( callback == NULL ) {
+ // Set upper-most bit for replication requests that we fire
+ // In client mode, at least set prefetch flag to prevent prefetch cascading
+ hops |= (uint8_t)( _pretendClient ? HOP_FLAG_PREFETCH : HOP_FLAG_BGR );
+ }
+ req_t req, preReq;
+ dnbd3_queue_entry_t *request = NULL, *last = NULL, *pre = NULL;
+ bool isNew;
+ const uint64_t end = start + length;
+ req.start = start & ~(DNBD3_BLOCK_SIZE - 1);
+ req.end = end;
+ /* Don't do this -- this breaks matching of prefetch jobs, since they'd
+ * be misaligned, and the next client request wouldn't match anything.
+ * To improve this, we need to be able to attach a queue_client to multiple queue_entries
+ * and then serve it once all the queue_entries are done (atomic_int in queue_client).
+ * But currently we directly send the receive buffer's content to the queue_client after
+ * receiving the payload, as this will also work when the local cache is borked (we just
+ * tunnel the traffic through). One could argue that this mode of operation is nonsense,
+ * and we should just drop all affected clients. Then as a next step, don't serve the
+ * clients from the receive buffer, but just issue a normal sendfile() call after writing
+ * the received data to the local cache.
+ */
+ if ( callback != NULL && _minRequestSize != 0 ) {
+ // Not background replication request, extend request size
+ extendRequest( req.start, &req.end, uplink->image, _minRequestSize );
+ }
+ req.end = (req.end + DNBD3_BLOCK_SIZE - 1) & ~(DNBD3_BLOCK_SIZE - 1);
+ // Critical section - work with the queue
mutex_lock( &uplink->queueLock );
- mutex_unlock( &client->image->lock );
- for (i = 0; i < uplink->queueLen; ++i) {
- if ( freeSlot == -1 && uplink->queue[i].status == ULR_FREE ) {
- freeSlot = i;
- continue;
+ if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
+ goto fail_lock;
+ }
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->from <= start && it->to >= end ) {
+ // Matching range, attach
+ request = it;
+ break;
}
- if ( uplink->queue[i].status != ULR_PENDING && uplink->queue[i].status != ULR_NEW ) continue;
- if ( uplink->queue[i].from <= start && uplink->queue[i].to >= end ) {
- if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end ) {
- requestLoop = true;
- break;
- }
- if ( foundExisting == -1 || existingType == ULR_PENDING ) {
- foundExisting = i;
- existingType = uplink->queue[i].status;
- if ( freeSlot != -1 ) break;
- }
+ if ( it->next == NULL ) {
+ // Not matching, last in list, remember
+ last = it;
+ break;
}
}
- if ( requestLoop ) {
- mutex_unlock( &uplink->queueLock );
- logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
- mutex_lock( &uplink->rttLock );
- uplink->cycleDetected = true;
- mutex_unlock( &uplink->rttLock );
- signal_call( uplink->signal );
- return false;
+ dnbd3_queue_client_t **c = NULL;
+ if ( request == NULL ) {
+ // No existing request to attach to
+ if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) {
+ logadd( LOG_WARNING,
+ "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." );
+ goto fail_lock;
+ }
+ uplink->queueLen++;
+ if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+ uplink->image->problem.queue = true;
+ }
+ request = malloc( sizeof(*request) );
+ if ( last == NULL ) {
+ uplink->queue = request;
+ } else {
+ last->next = request;
+ }
+ request->next = NULL;
+ request->handle = ++uplink->queueId;
+ request->from = req.start;
+ request->to = req.end;
+#ifdef DEBUG
+ timing_get( &request->entered );
+#endif
+ request->hopCount = hops;
+ request->sent = true; // Optimistic; would be set to false on failure
+ if ( callback == NULL ) {
+ // BGR
+ request->clients = NULL;
+ } else {
+ c = &request->clients;
+ }
+ isNew = true;
+ } else if ( callback == NULL ) {
+ // Replication request that matches existing request. Do nothing
+ isNew = false;
+ } else {
+ // Existing request. Check if potential cycle
+ if ( hops > request->hopCount && request->from == start && request->to == end ) {
+ logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) );
+ goto fail_lock;
+ }
+ // Count number of clients, get tail of list
+ int count = 0;
+ c = &request->clients;
+ while ( *c != NULL ) {
+ c = &(**c).next;
+ if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) {
+ logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count );
+ goto fail_lock;
+ }
+ }
+ isNew = false;
}
- if ( freeSlot == -1 ) {
- if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
- mutex_unlock( &uplink->queueLock );
- logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
- return false;
+ // Prefetch immediately, without unlocking the list - the old approach of
+ // async prefetching in another thread was sometimes so slow that we'd process
+ // another request from the same client before the prefetch job would execute.
+ if ( callback != NULL && ( isNew || request->clients == NULL || request->clients->data == data )
+ && !( hops & (HOP_FLAG_BGR | HOP_FLAG_PREFETCH) ) // No cascading of prefetches
+ && end == request->to && length <= _maxPrefetch ) {
+ // Only if this is a client request, and the !! end boundary matches exactly !!
+ // (See above for reason why)
+ // - We neither check the local cache, nor other pending requests. Worth it?
+ // Complexity vs. probability
+ preReq.start = end;
+ preReq.end = end;
+ extendRequest( preReq.start, &preReq.end, uplink->image, MIN( length * 3, _maxPrefetch ) );
+ if ( preReq.start < preReq.end ) {
+ //logadd( LOG_DEBUG2, "Prefetching @ %"PRIx64" - %"PRIx64, preReq.start, preReq.end );
+ uplink->queueLen++;
+ pre = malloc( sizeof(*pre) );
+ pre->next = request->next;
+ request->next = pre;
+ pre->handle = preReq.handle = ++uplink->queueId;
+ pre->from = preReq.start;
+ pre->to = preReq.end;
+ pre->hopCount = hops | HOP_FLAG_PREFETCH;
+ pre->sent = true; // Optimistic; would be set to false on failure
+ pre->clients = NULL;
+#ifdef DEBUG
+ timing_get( &pre->entered );
+#endif
}
- freeSlot = uplink->queueLen++;
}
- // Do not send request to uplink server if we have a matching pending request AND the request either has the
- // status ULR_NEW OR we found a free slot with LOWER index than the one we attach to. Otherwise
- // explicitly send this request to the uplink server. The second condition mentioned here is to prevent
- // a race condition where the reply for the outstanding request already arrived and the uplink thread
- // is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
- // already have passed the index of the free slot we determined, but not reached the existing request we just found above.
- if ( foundExisting != -1 && existingType != ULR_NEW && freeSlot > foundExisting ) foundExisting = -1; // -1 means "send request"
-#ifdef _DEBUG
- if ( foundExisting != -1 ) {
- logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, existingType == ULR_NEW ? "ULR_NEW" : "ULR_PENDING", foundExisting, freeSlot );
- logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
- "New %" PRIu64 "-%" PRIu64 " (%p)\n",
- uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
- start, end, (void*)client );
+ // // // //
+ // Copy data - need this after unlocking
+ req.handle = request->handle;
+ if ( callback != NULL ) {
+ assert( c != NULL );
+ *c = malloc( sizeof( *request->clients ) );
+ (**c).next = NULL;
+ (**c).handle = handle;
+ (**c).from = start;
+ (**c).to = end;
+ (**c).data = data;
+ (**c).callback = callback;
}
-#endif
- // Fill structure
- uplink->queue[freeSlot].from = start;
- uplink->queue[freeSlot].to = end;
- uplink->queue[freeSlot].handle = handle;
- uplink->queue[freeSlot].client = client;
- //int old = uplink->queue[freeSlot].status;
- uplink->queue[freeSlot].status = (foundExisting == -1 ? ULR_NEW : ULR_PENDING);
- uplink->queue[freeSlot].hopCount = hops;
-#ifdef _DEBUG
- timing_get( &uplink->queue[freeSlot].entered );
- //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
-#endif
mutex_unlock( &uplink->queueLock );
+ // End queue critical section
+ if ( pre == NULL && !isNew )
+ return true; // Nothing to do
- if ( foundExisting != -1 )
- return true; // Attached to pending request, do nothing
-
- // See if we can fire away the request
- if ( mutex_trylock( &uplink->sendMutex ) != 0 ) {
- logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
- } else {
- if ( uplink->fd == -1 ) {
- mutex_unlock( &uplink->sendMutex );
- logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
- } else {
- const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
- if ( hops < 200 ) ++hops;
- const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) );
- mutex_unlock( &uplink->sendMutex );
- if ( !ret ) {
- logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
- } else {
- mutex_lock( &uplink->queueLock );
- if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client && uplink->queue[freeSlot].status == ULR_NEW ) {
- uplink->queue[freeSlot].status = ULR_PENDING;
- logadd( LOG_DEBUG2, "Succesful direct uplink request" );
- } else {
- logadd( LOG_DEBUG2, "Weird queue update fail for direct uplink request" );
- }
- mutex_unlock( &uplink->queueLock );
- return true;
- }
- // Fall through to waking up sender thread
- }
+ // Fire away the request(s)
+ mutex_lock( &uplink->sendMutex );
+ bool ret1 = true;
+ bool ret2 = true;
+ if ( isNew ) {
+ ret1 = requestBlock( uplink, &req, hops );
+ }
+ if ( pre != NULL ) {
+ ret2 = requestBlock( uplink, &preReq, hops | HOP_FLAG_PREFETCH );
+ }
+ if ( !ret1 || !ret2 ) { // Set with send locked
+ uplink->image->problem.uplink = true;
+ }
+ mutex_unlock( &uplink->sendMutex );
+ // markRequestUnsent locks the queue, would violate locking order with send mutex
+ if ( !ret1 ) {
+ markRequestUnsent( uplink, req.handle );
+ logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle );
+ }
+ if ( !ret2 ) {
+ markRequestUnsent( uplink, preReq.handle );
}
- if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed
- if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
- logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
- }
+ if ( ( !ret1 || !ret2 ) && signal_call( uplink->signal ) == SIGNAL_ERROR ) {
+ logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
}
return true;
+
+fail_lock:
+ mutex_unlock( &uplink->queueLock );
+ return false;
}
/**
@@ -321,52 +548,47 @@ static void* uplink_mainloop(void *data)
#define EV_SOCKET (1)
#define EV_COUNT (2)
struct pollfd events[EV_COUNT];
- dnbd3_connection_t * const link = (dnbd3_connection_t*)data;
- int numSocks, i, waitTime;
+ dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
+ int numSocks, waitTime;
int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
+ int rttTestResult;
uint32_t discoverFailCount = 0;
- uint32_t unsavedSeconds = 0;
ticks nextAltCheck, lastKeepalive;
char buffer[200];
memset( events, 0, sizeof(events) );
timing_get( &nextAltCheck );
lastKeepalive = nextAltCheck;
//
- assert( link != NULL );
+ assert( uplink != NULL );
setThreadName( "idle-uplink" );
+ thread_detach( uplink->thread );
blockNoncriticalSignals();
// Make sure file is open for writing
- if ( !uplink_reopenCacheFd( link, false ) ) {
+ if ( !reopenCacheFd( uplink, false ) ) {
// It might have failed - still offer proxy mode, we just can't cache
- logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", link->image->path, errno );
+ logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno );
}
//
- link->signal = signal_new();
- if ( link->signal == NULL ) {
- logadd( LOG_WARNING, "error creating signal. Uplink unavailable." );
- goto cleanup;
- }
events[EV_SIGNAL].events = POLLIN;
- events[EV_SIGNAL].fd = signal_getWaitFd( link->signal );
+ events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal );
events[EV_SOCKET].fd = -1;
- while ( !_shutdown && !link->shutdown ) {
+ if ( uplink->rttTestResult != RTT_DOCHANGE ) {
+ altservers_findUplink( uplink ); // In case we didn't kickstart
+ }
+ while ( !_shutdown && !uplink->shutdown ) {
// poll()
- mutex_lock( &link->rttLock );
- waitTime = link->rttTestResult == RTT_DOCHANGE ? 0 : -1;
- mutex_unlock( &link->rttLock );
- if ( waitTime == 0 ) {
- // Nothing
- } else if ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) {
- waitTime = 1000;
+ if ( uplink->rttTestResult == RTT_DOCHANGE ) {
+ // 0 means poll, since we're about to change the server
+ waitTime = 0;
} else {
declare_now;
waitTime = (int)timing_diffMs( &now, &nextAltCheck );
if ( waitTime < 100 ) waitTime = 100;
- if ( waitTime > 5000 ) waitTime = 5000;
+ else if ( waitTime > 10000 ) waitTime = 10000;
}
- events[EV_SOCKET].fd = link->fd;
+ events[EV_SOCKET].fd = uplink->current.fd;
numSocks = poll( events, EV_COUNT, waitTime );
- if ( _shutdown || link->shutdown ) goto cleanup;
+ if ( _shutdown || uplink->shutdown ) goto cleanup;
if ( numSocks == -1 ) { // Error?
if ( errno == EINTR ) continue;
logadd( LOG_DEBUG1, "poll() error %d", (int)errno );
@@ -374,40 +596,41 @@ static void* uplink_mainloop(void *data)
continue;
}
// Check if server switch is in order
- mutex_lock( &link->rttLock );
- if ( link->rttTestResult != RTT_DOCHANGE ) {
- mutex_unlock( &link->rttLock );
- } else {
- link->rttTestResult = RTT_IDLE;
+ if ( unlikely( uplink->rttTestResult == RTT_DOCHANGE ) ) {
+ mutex_lock( &uplink->rttLock );
+ assert( uplink->rttTestResult == RTT_DOCHANGE );
+ uplink->rttTestResult = RTT_IDLE;
// The rttTest worker thread has finished our request.
// And says it's better to switch to another server
- const int fd = link->fd;
- mutex_lock( &link->sendMutex );
- link->fd = link->betterFd;
- mutex_unlock( &link->sendMutex );
- link->betterFd = -1;
- link->currentServer = link->betterServer;
- link->version = link->betterVersion;
- link->cycleDetected = false;
- mutex_unlock( &link->rttLock );
+ const int fd = uplink->current.fd;
+ mutex_lock( &uplink->sendMutex );
+ uplink->current = uplink->better;
+ mutex_unlock( &uplink->sendMutex );
+ uplink->better.fd = -1;
+ uplink->cycleDetected = false;
+ mutex_unlock( &uplink->rttLock );
discoverFailCount = 0;
if ( fd != -1 ) close( fd );
- link->replicationHandle = REP_NONE;
- link->image->working = true;
- link->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
+ uplink->image->problem.uplink = false;
+ uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
buffer[0] = '@';
- if ( host_to_string( &link->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) {
- logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", link->image->name, buffer + 1 );
+ if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
+ logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 );
setThreadName( buffer );
}
// If we don't have a crc32 list yet, see if the new server has one
- if ( link->image->crc32 == NULL ) {
- uplink_addCrc32( link );
+ if ( uplink->image->crc32 == NULL ) {
+ requestCrc32List( uplink );
}
// Re-send all pending requests
- uplink_sendRequests( link, false );
- uplink_sendReplicationRequest( link );
+ sendQueuedRequests( uplink, false );
+ sendReplicationRequest( uplink );
events[EV_SOCKET].events = POLLIN | POLLRDHUP;
+ if ( uplink->image->problem.uplink ) {
+ // Some of the requests above must have failed again already :-(
+ logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" );
+ connectionFailed( uplink, true );
+ }
timing_gets( &nextAltCheck, altCheckInterval );
// The rtt worker already did the handshake for our image, so there's nothing
// more to do here
@@ -415,206 +638,187 @@ static void* uplink_mainloop(void *data)
// Check events
// Signal
if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+ uplink->image->problem.uplink = true;
logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
goto cleanup;
} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
// signal triggered -> pending requests
- if ( signal_clear( link->signal ) == SIGNAL_ERROR ) {
- logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", link->image->name );
+ if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) {
+ logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name );
}
- if ( link->fd != -1 ) {
+ if ( uplink->current.fd != -1 ) {
// Uplink seems fine, relay requests to it...
- uplink_sendRequests( link, true );
- } else { // No uplink; maybe it was shutdown since it was idle for too long
- link->idleTime = 0;
+ sendQueuedRequests( uplink, true );
+ } else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long
+ uplink->idleTime = 0;
}
}
// Uplink socket
if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
- uplink_connectionFailed( link, true );
- logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" );
+ connectionFailed( uplink, true );
+ logadd( LOG_DEBUG1, "Uplink gone away, panic! (revents=%d)\n", (int)events[EV_SOCKET].revents );
setThreadName( "panic-uplink" );
} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
- uplink_handleReceive( link );
- if ( _shutdown || link->shutdown ) goto cleanup;
+ handleReceive( uplink );
+ if ( _shutdown || uplink->shutdown ) goto cleanup;
}
declare_now;
uint32_t timepassed = timing_diff( &lastKeepalive, &now );
- if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
+ if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL
+ || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
lastKeepalive = now;
- link->idleTime += timepassed;
- unsavedSeconds += timepassed;
- if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && link->idleTime >= 20 && link->idleTime <= 70 ) ) {
- // fsync/save every 4 minutes, or every 60 seconds if link is idle
- unsavedSeconds = 0;
- uplink_saveCacheMap( link );
- }
+ uplink->idleTime += timepassed;
// Keep-alive
- if ( link->fd != -1 && link->replicationHandle == REP_NONE ) {
- // Send keep-alive if nothing is happening
- if ( uplink_sendKeepalive( link->fd ) ) {
- // Re-trigger periodically, in case it requires a minimum user count
- uplink_sendReplicationRequest( link );
- } else {
- uplink_connectionFailed( link, true );
- logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
- setThreadName( "panic-uplink" );
+ if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) {
+ // Send keep-alive if nothing is happening, and try to trigger background rep.
+ if ( !sendKeepalive( uplink ) || !sendReplicationRequest( uplink ) ) {
+ connectionFailed( uplink, true );
+ logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" );
}
}
- // Don't keep link established if we're idle for too much
- if ( link->fd != -1 && uplink_connectionShouldShutdown( link ) ) {
- mutex_lock( &link->sendMutex );
- close( link->fd );
- link->fd = events[EV_SOCKET].fd = -1;
- mutex_unlock( &link->sendMutex );
- link->cycleDetected = false;
- if ( link->recvBufferLen != 0 ) {
- link->recvBufferLen = 0;
- free( link->recvBuffer );
- link->recvBuffer = NULL;
- }
- logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", link->image->name, (int)link->image->rid );
- setThreadName( "idle-uplink" );
+ // Don't keep uplink established if we're idle for too much
+ if ( connectionShouldShutdown( uplink ) ) {
+ logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) );
+ goto cleanup;
}
}
// See if we should trigger an RTT measurement
- mutex_lock( &link->rttLock );
- const int rttTestResult = link->rttTestResult;
- mutex_unlock( &link->rttLock );
+ rttTestResult = uplink->rttTestResult;
if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
- if ( timing_reached( &nextAltCheck, &now ) || ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) || link->cycleDetected ) {
+ if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && discoverFailCount == 0 ) || uplink->cycleDetected ) {
// It seems it's time for a check
- if ( image_isComplete( link->image ) ) {
+ if ( image_isComplete( uplink->image ) ) {
// Quit work if image is complete
- logadd( LOG_INFO, "Replication of %s complete.", link->image->name );
+ logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
setThreadName( "finished-uplink" );
+ uplink->image->problem.uplink = false;
goto cleanup;
- } else if ( !uplink_connectionShouldShutdown( link ) ) {
+ } else {
// Not complete - do measurement
- altservers_findUplink( link ); // This will set RTT_INPROGRESS (synchronous)
- if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) {
- link->nextReplicationIndex = 0;
+ altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous)
+ if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
+ uplink->nextReplicationIndex = 0;
}
}
altCheckInterval = MIN(altCheckInterval + 1, SERVER_RTT_INTERVAL_MAX);
timing_set( &nextAltCheck, &now, altCheckInterval );
}
} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
- mutex_lock( &link->rttLock );
- link->rttTestResult = RTT_IDLE;
- mutex_unlock( &link->rttLock );
- discoverFailCount++;
- timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
+ if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
+ discoverFailCount++;
+ if ( uplink->current.fd == -1 ) {
+ uplink->cycleDetected = false;
+ }
+ }
+ timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED );
}
-#ifdef _DEBUG
- if ( link->fd != -1 && !link->shutdown ) {
+#ifdef DEBUG
+ if ( uplink->current.fd != -1 && !uplink->shutdown ) {
bool resend = false;
ticks deadline;
timing_set( &deadline, &now, -10 );
- mutex_lock( &link->queueLock );
- for (i = 0; i < link->queueLen; ++i) {
- if ( link->queue[i].status != ULR_FREE && timing_reached( &link->queue[i].entered, &deadline ) ) {
- snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
- "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name,
- link->queue[i].from, link->queue[i].to, link->queue[i].status );
- link->queue[i].entered = now;
-#ifdef _DEBUG_RESEND_STARVING
- link->queue[i].status = ULR_NEW;
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( timing_reached( &it->entered, &deadline ) ) {
+ logadd( LOG_WARNING, "Starving request detected:"
+ " (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d",
+ it->from, it->to, (int)it->sent, PIMG(uplink->image) );
+ it->entered = now;
+#ifdef DEBUG_RESEND_STARVING
+ it->sent = false;
resend = true;
#endif
- mutex_unlock( &link->queueLock );
- logadd( LOG_WARNING, "%s", buffer );
- mutex_lock( &link->queueLock );
}
}
- mutex_unlock( &link->queueLock );
- if ( resend )
- uplink_sendRequests( link, true );
+ mutex_unlock( &uplink->queueLock );
+ if ( resend ) {
+ sendQueuedRequests( uplink, true );
+ }
}
#endif
}
- cleanup: ;
- altservers_removeUplink( link );
- uplink_saveCacheMap( link );
- mutex_lock( &link->image->lock );
- if ( link->image->uplink == link ) {
- link->image->uplink = NULL;
- }
- mutex_lock( &link->queueLock );
- const int fd = link->fd;
- const dnbd3_signal_t* signal = link->signal;
- mutex_lock( &link->sendMutex );
- link->fd = -1;
- mutex_unlock( &link->sendMutex );
- link->signal = NULL;
- if ( !link->shutdown ) {
- link->shutdown = true;
- thread_detach( link->thread );
+cleanup: ;
+ dnbd3_image_t *image = uplink->image;
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
+ cache->dirty = true; // Force writeout of cache map
+ ref_put( &cache->reference );
}
- // Do not access link->image after unlocking, since we set
- // image->uplink to NULL. Acquire with image_lock first,
- // like done below when checking whether to re-init uplink
- mutex_unlock( &link->image->lock );
- mutex_unlock( &link->queueLock );
- if ( fd != -1 ) close( fd );
- if ( signal != NULL ) signal_close( signal );
- // Wait for the RTT check to finish/fail if it's in progress
- while ( link->rttTestResult == RTT_INPROGRESS )
- usleep( 10000 );
- if ( link->betterFd != -1 ) {
- close( link->betterFd );
+ mutex_lock( &image->lock );
+ bool exp = false;
+ if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
+ image->users++; // We set the flag - hold onto image
}
- mutex_destroy( &link->queueLock );
- mutex_destroy( &link->rttLock );
- mutex_destroy( &link->sendMutex );
- free( link->recvBuffer );
- link->recvBuffer = NULL;
- if ( link->cacheFd != -1 ) {
- close( link->cacheFd );
+ dnbd3_uplink_t *current = ref_get_uplink( &image->uplinkref );
+ if ( current == uplink ) { // Set NULL if it's still us...
+ mutex_lock( &uplink->queueLock );
+ cancelAllRequests( uplink );
+ mutex_unlock( &uplink->queueLock );
+ ref_setref( &image->uplinkref, NULL );
}
- dnbd3_image_t *image = image_lock( link->image );
- free( link ); // !!!
- if ( image != NULL ) {
- if ( !_shutdown && image->cache_map != NULL ) {
- // Ingegrity checker must have found something in the meantime
- uplink_init( image, -1, NULL, 0 );
- }
- image_release( image );
+ if ( current != NULL ) { // Decrease ref in any case
+ ref_put( &current->reference );
}
+ mutex_unlock( &image->lock );
+ // Finally as the thread is done, decrease our own ref that we initialized with
+ ref_put( &uplink->reference );
return NULL ;
}
-static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly)
+/**
+ * Only called from uplink thread.
+ */
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly)
{
- // Scan for new requests
- int j;
- mutex_lock( &link->queueLock );
- for (j = 0; j < link->queueLen; ++j) {
- if ( link->queue[j].status != ULR_NEW && (newOnly || link->queue[j].status != ULR_PENDING) ) continue;
- link->queue[j].status = ULR_PENDING;
- uint8_t hops = link->queue[j].hopCount;
- const uint64_t reqStart = link->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- const uint32_t reqSize = (uint32_t)(((link->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
- /*
- logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
- (void*)link, j, link->queue[j].status, link->queue[j].handle, link->queue[j].from, link->queue[j].to, reqStart, reqStart+reqSize );
- */
- mutex_unlock( &link->queueLock );
- if ( hops < 200 ) ++hops;
- mutex_lock( &link->sendMutex );
- const bool ret = dnbd3_get_block( link->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( link->version, hops ) );
- mutex_unlock( &link->sendMutex );
- if ( !ret ) {
- // Non-critical - if the connection dropped or the server was changed
- // the thread will re-send this request as soon as the connection
- // is reestablished.
- logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
- altservers_serverFailed( &link->currentServer );
- return;
+ assert_uplink_thread();
+ // Scan for new requests, or optionally, (re)send all
+ // Build a buffer, so if there aren't too many requests, we can send them after
+ // unlocking the queue again. Otherwise we need flushes during iteration, which
+ // is not ideal, but in that case the uplink is probably overwhelmed anyways.
+ // Try 125 as that's exactly 3000 bytes, usually 2*MTU.
+#define MAX_RESEND_BATCH 125
+ dnbd3_request_t reqs[MAX_RESEND_BATCH];
+ int count = 0;
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( newOnly && it->sent )
+ continue;
+ it->sent = true;
+ dnbd3_request_t *hdr = &reqs[count++];
+ hdr->magic = dnbd3_packet_magic;
+ hdr->cmd = CMD_GET_BLOCK;
+ hdr->size = (uint32_t)( it->to - it->from );
+ hdr->offset = it->from; // Offset first, then hops! (union)
+ hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount );
+ hdr->handle = it->handle;
+ fixup_request( *hdr );
+ if ( count == MAX_RESEND_BATCH ) {
+ bool ok = false;
+ logadd( LOG_DEBUG2, "BLOCKING resend of %d", count );
+ count = 0;
+ mutex_lock( &uplink->sendMutex );
+ if ( uplink->current.fd != -1 ) {
+ ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 )
+ == DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH );
+ }
+ mutex_unlock( &uplink->sendMutex );
+ if ( !ok ) {
+ uplink->image->problem.uplink = true;
+ break;
+ }
}
- mutex_lock( &link->queueLock );
}
- mutex_unlock( &link->queueLock );
+ mutex_unlock( &uplink->queueLock );
+ if ( count != 0 ) {
+ mutex_lock( &uplink->sendMutex );
+ if ( uplink->current.fd != -1 ) {
+ uplink->image->problem.uplink =
+ ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 )
+ != DNBD3_REQUEST_SIZE * count );
+ }
+ mutex_unlock( &uplink->sendMutex );
+ }
+#undef MAX_RESEND_BATCH
}
/**
@@ -626,90 +830,118 @@ static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly)
* server. This means we might request data we already have, but it makes
* the code simpler. Worst case would be only one bit is zero, which means
* 4kb are missing, but we will request 32kb.
+ *
+ * Only called from uplink thread, so current.fd is assumed to be valid.
+ *
+ * @return false if sending request failed, true otherwise (i.e. not necessary/disabled)
*/
-static void uplink_sendReplicationRequest(dnbd3_connection_t *link)
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink)
{
- if ( link == NULL || link->fd == -1 ) return;
- if ( _backgroundReplication == BGR_DISABLED || link->cacheFd == -1 ) return; // Don't do background replication
- if ( link->nextReplicationIndex == -1 || link->replicationHandle != REP_NONE )
- return;
- dnbd3_image_t * const image = link->image;
- if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
- mutex_lock( &image->lock );
- if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) {
- // No cache map (=image complete), or replication pending, or not enough users, do nothing
- mutex_unlock( &image->lock );
- return;
+ assert_uplink_thread();
+ if ( uplink->current.fd == -1 )
+ return false; // Should never be called in this state, consider send error
+ if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
+ return true; // Don't do background replication
+ if ( uplink->nextReplicationIndex == -1 )
+ return true; // No more blocks to replicate
+ dnbd3_image_t * const image = uplink->image;
+ if ( image->users < _bgrMinClients )
+ return true; // Not enough active users
+ const int numNewRequests = numWantedReplicationRequests( uplink );
+ if ( numNewRequests <= 0 )
+ return true; // Already sufficient amount of requests on the wire
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL ) {
+ // No cache map (=image complete)
+ return true;
}
const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
const int lastBlockIndex = mapBytes - 1;
- int endByte;
- if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
- endByte = link->nextReplicationIndex + mapBytes;
- } else { // Hashblock based: Only look for match in current hash block
- endByte = ( link->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
- if ( endByte > mapBytes ) {
- endByte = mapBytes;
+ for ( int bc = 0; bc < numNewRequests; ++bc ) {
+ int endByte;
+ if ( UPLINK_MAX_QUEUE - uplink->queueLen < 10 )
+ break; // Don't overload queue
+ if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
+ endByte = uplink->nextReplicationIndex + mapBytes;
+ } else { // Hashblock based: Only look for match in current hash block
+ endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+ if ( endByte > mapBytes ) {
+ endByte = mapBytes;
+ }
}
- }
- int replicationIndex = -1;
- for ( int j = link->nextReplicationIndex; j < endByte; ++j ) {
- const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
- if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !link->replicatedLastBlock ) ) {
- // Found incomplete one
- replicationIndex = i;
+ atomic_thread_fence( memory_order_acquire );
+ int replicationIndex = -1;
+ for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
+ const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
+ if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+ && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+ // Found incomplete one
+ replicationIndex = i;
+ break;
+ }
+ }
+ if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
+ // Nothing left in current block, find next one
+ replicationIndex = findNextIncompleteHashBlock( uplink, endByte );
+ }
+ if ( replicationIndex == -1 ) {
+ // Replication might be complete, uplink_mainloop should take care....
+ uplink->nextReplicationIndex = -1;
break;
}
+ const uint64_t handle = ++uplink->queueId;
+ const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
+ uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
+ // Extend the default 32k request size if _minRequestSize is > 32k
+ for ( size_t extra = 1; extra < ( _minRequestSize / FILE_BYTES_PER_MAP_BYTE )
+ && offset + size < image->virtualFilesize
+ && _backgroundReplication == BGR_FULL; ++extra ) {
+ if ( atomic_load_explicit( &cache->map[replicationIndex+1], memory_order_relaxed ) == 0xff )
+ break; // Hit complete 32k block, stop here
+ replicationIndex++;
+ size += (uint32_t)MIN( image->virtualFilesize - offset - size, FILE_BYTES_PER_MAP_BYTE );
+ }
+ if ( !uplink_requestInternal( uplink, NULL, NULL, handle, offset, size, 0 ) ) {
+ logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)",
+ PIMG(uplink->image) );
+ ref_put( &cache->reference );
+ return false;
+ }
+ if ( replicationIndex == lastBlockIndex ) {
+ uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+ }
+ uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+ if ( _backgroundReplication == BGR_HASHBLOCK
+ && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+ // Just crossed a hash block boundary, look for new candidate starting at this very index
+ uplink->nextReplicationIndex = findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
+ if ( uplink->nextReplicationIndex == -1 )
+ break;
+ }
}
- mutex_unlock( &image->lock );
- if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
- // Nothing left in current block, find next one
- replicationIndex = uplink_findNextIncompleteHashBlock( link, endByte );
- }
- if ( replicationIndex == -1 ) {
- // Replication might be complete, uplink_mainloop should take care....
- link->nextReplicationIndex = -1;
- return;
- }
- const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
- link->replicationHandle = offset;
- const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
- mutex_lock( &link->sendMutex );
- bool sendOk = dnbd3_get_block( link->fd, offset, size, link->replicationHandle, COND_HOPCOUNT( link->version, 1 ) );
- mutex_unlock( &link->sendMutex );
- if ( !sendOk ) {
- logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
- return;
- }
- if ( replicationIndex == lastBlockIndex ) {
- link->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
- }
- link->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
- if ( _backgroundReplication == BGR_HASHBLOCK
- && link->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
- // Just crossed a hash block boundary, look for new candidate starting at this very index
- link->nextReplicationIndex = uplink_findNextIncompleteHashBlock( link, link->nextReplicationIndex );
- }
+ ref_put( &cache->reference );
+ return true;
}
/**
- * find next index into cache_map that corresponds to the beginning
+ * find next index into cache map that corresponds to the beginning
* of a hash block which is neither completely empty nor completely
* replicated yet. Returns -1 if no match.
*/
-static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int startMapIndex)
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
{
int retval = -1;
- mutex_lock( &link->image->lock );
- const int mapBytes = IMGSIZE_TO_MAPBYTES( link->image->virtualFilesize );
- const uint8_t *cache_map = link->image->cache_map;
- if ( cache_map != NULL ) {
- int j;
+ dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image );
+ if ( cache != NULL ) {
+ const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize );
const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK );
+ atomic_thread_fence( memory_order_acquire );
+ int j;
for (j = 0; j < mapBytes; ++j) {
const int i = ( start + j ) % mapBytes;
- const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && link->replicatedLastBlock );
- const bool isEmpty = cache_map[i] == 0;
+ const uint8_t b = atomic_load_explicit( &cache->map[i], memory_order_relaxed );
+ const bool isFull = b == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock );
+ const bool isEmpty = b == 0;
if ( !isEmpty && !isFull ) {
// Neither full nor empty, replicate
if ( retval == -1 ) {
@@ -736,74 +968,97 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in
retval = -1;
}
}
- mutex_unlock( &link->image->lock );
+ ref_put( &cache->reference );
return retval;
}
/**
* Receive data from uplink server and process/dispatch
- * Locks on: link.lock, images[].lock
+ * Locks on: uplink.lock, images[].lock
+ * Only called from uplink thread, so current.fd is assumed to be valid.
*/
-static void uplink_handleReceive(dnbd3_connection_t *link)
+static void handleReceive(dnbd3_uplink_t *uplink)
{
- dnbd3_reply_t inReply, outReply;
- int ret, i;
+ dnbd3_reply_t inReply;
+ int ret;
+ assert_uplink_thread();
+ assert( uplink->queueLen >= 0 );
for (;;) {
- ret = dnbd3_read_reply( link->fd, &inReply, false );
- if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue;
+ ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
+ if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
if ( ret == REPLY_AGAIN ) break;
if ( unlikely( ret == REPLY_CLOSED ) ) {
- logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path );
+ logadd( LOG_INFO, "Uplink: Remote host hung up (%s:%d)", PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
- logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", link->image->path );
+ logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s:%d)", PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( ret != REPLY_OK ) ) {
- logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, link->image->path );
+ logadd( LOG_INFO, "Uplink: Connection error %d (%s:%d)", ret, PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
- logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, link->image->path );
+ logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s:%d", inReply.size, PIMG(uplink->image) );
goto error_cleanup;
}
- if ( unlikely( link->recvBufferLen < inReply.size ) ) {
- link->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536);
- link->recvBuffer = realloc( link->recvBuffer, link->recvBufferLen );
- if ( link->recvBuffer == NULL ) {
+ if ( unlikely( uplink->recvBufferLen < inReply.size ) ) {
+ uplink->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536);
+ uplink->recvBuffer = realloc( uplink->recvBuffer, uplink->recvBufferLen );
+ if ( uplink->recvBuffer == NULL ) {
logadd( LOG_ERROR, "Out of memory when trying to allocate receive buffer for uplink" );
exit( 1 );
}
}
- if ( unlikely( (uint32_t)sock_recv( link->fd, link->recvBuffer, inReply.size ) != inReply.size ) ) {
- logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", link->image->path );
+ if ( unlikely( (uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
+ logadd( LOG_INFO, "Lost connection to uplink server of %s:%d (payload)", PIMG(uplink->image) );
goto error_cleanup;
}
// Payload read completely
// Bail out if we're not interested
- if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
+ if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) )
+ continue;
// Is a legit block reply
- struct iovec iov[2];
- const uint64_t start = inReply.handle;
- const uint64_t end = inReply.handle + inReply.size;
totalBytesReceived += inReply.size;
- link->bytesReceived += inReply.size;
+ uplink->bytesReceived += inReply.size;
+ // Get entry from queue
+ dnbd3_queue_entry_t *entry;
+ mutex_lock( &uplink->queueLock );
+ for ( entry = uplink->queue; entry != NULL; entry = entry->next ) {
+ if ( entry->handle == inReply.handle )
+ break;
+ }
+ if ( entry == NULL ) {
+ mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+ logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)",
+ inReply.handle, PIMG(uplink->image) );
+ continue;
+ }
+ const uint64_t start = entry->from;
+ const uint64_t end = entry->to;
+ mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+ // We don't remove the entry from the list here yet, to slightly increase the chance of other
+ // clients attaching to this request while we write the data to disk
+ if ( end - start != inReply.size ) {
+ logadd( LOG_WARNING, "Received payload length does not match! (is: %"PRIu32", expect: %u, %s:%d)",
+ inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) );
+ }
// 1) Write to cache file
- if ( unlikely( link->cacheFd == -1 ) ) {
- uplink_reopenCacheFd( link, false );
+ if ( unlikely( uplink->cacheFd == -1 ) ) {
+ reopenCacheFd( uplink, false );
}
- if ( likely( link->cacheFd != -1 ) ) {
+ if ( likely( uplink->cacheFd != -1 ) ) {
int err = 0;
bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid
uint32_t done = 0;
ret = 0;
while ( done < inReply.size ) {
- ret = (int)pwrite( link->cacheFd, link->recvBuffer + done, inReply.size - done, start + done );
+ ret = (int)pwrite( uplink->cacheFd, uplink->recvBuffer + done, inReply.size - done, start + done );
if ( unlikely( ret == -1 ) ) {
err = errno;
- if ( err == EINTR ) continue;
+ if ( err == EINTR && !_shutdown ) continue;
if ( err == ENOSPC || err == EDQUOT ) {
// try to free 256MiB
if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
@@ -811,150 +1066,135 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
continue; // Success, retry write
}
if ( err == EBADF || err == EINVAL || err == EIO ) {
- if ( !tryAgain || !uplink_reopenCacheFd( link, true ) )
+ uplink->image->problem.write = true;
+ if ( !tryAgain || !reopenCacheFd( uplink, true ) )
break;
tryAgain = false;
continue; // Write handle to image successfully re-opened, try again
}
- logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err );
+ logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d",
+ PIMG(uplink->image), err );
break;
}
if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
- logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, link->image->name, (int)link->image->rid );
+ logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d",
+ ret, PIMG(uplink->image) );
break;
}
done += (uint32_t)ret;
}
if ( likely( done > 0 ) ) {
- image_updateCachemap( link->image, start, start + done, true );
+ image_updateCachemap( uplink->image, start, start + done, true );
}
if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
- link->image->name, (int)link->image->rid, err );
+ PIMG(uplink->image), err );
}
}
- // 2) Figure out which clients are interested in it
- mutex_lock( &link->queueLock );
- for (i = 0; i < link->queueLen; ++i) {
- dnbd3_queued_request_t * const req = &link->queue[i];
- assert( req->status != ULR_PROCESSING );
- if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
- assert( req->client != NULL );
- if ( req->from >= start && req->to <= end ) { // Match :-)
- req->status = ULR_PROCESSING;
+ bool found = false;
+ dnbd3_queue_entry_t **it;
+ mutex_lock( &uplink->queueLock );
+ for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) {
+ if ( *it == entry && entry->handle == inReply.handle ) { // ABA check
+ assert( found == false );
+ *it = (**it).next;
+ found = true;
+ uplink->queueLen--;
+ break;
}
}
- // 3) Send to interested clients - iterate backwards so request collaboration works, and
- // so we can decrease queueLen on the fly while iterating. Should you ever change this to start
- // from 0, you also need to change the "attach to existing request"-logic in uplink_request()
- outReply.magic = dnbd3_packet_magic;
- bool served = false;
- for ( i = link->queueLen - 1; i >= 0; --i ) {
- dnbd3_queued_request_t * const req = &link->queue[i];
- if ( req->status == ULR_PROCESSING ) {
- size_t bytesSent = 0;
- assert( req->from >= start && req->to <= end );
- dnbd3_client_t * const client = req->client;
- outReply.cmd = CMD_GET_BLOCK;
- outReply.handle = req->handle;
- outReply.size = (uint32_t)( req->to - req->from );
- iov[0].iov_base = &outReply;
- iov[0].iov_len = sizeof outReply;
- iov[1].iov_base = link->recvBuffer + (req->from - start);
- iov[1].iov_len = outReply.size;
- fixup_reply( outReply );
- req->status = ULR_FREE;
- req->client = NULL;
- served = true;
- mutex_lock( &client->sendMutex );
- mutex_unlock( &link->queueLock );
- if ( client->sock != -1 ) {
- ssize_t sent = writev( client->sock, iov, 2 );
- if ( sent > (ssize_t)sizeof outReply ) {
- bytesSent = (size_t)sent - sizeof outReply;
- }
- }
- mutex_unlock( &client->sendMutex );
- if ( bytesSent != 0 ) {
- client->bytesSent += bytesSent;
- }
- mutex_lock( &link->queueLock );
- }
- if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--;
+ if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
+ uplink->image->problem.queue = false;
}
- mutex_unlock( &link->queueLock );
-#ifdef _DEBUG
- if ( !served && start != link->replicationHandle ) {
- logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, link->image->name, start, end );
+ mutex_unlock( &uplink->queueLock );
+ if ( !found ) {
+ logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)",
+ PIMG(uplink->image) );
+ continue;
}
-#endif
- if ( start == link->replicationHandle ) {
- // Was our background replication
- link->replicationHandle = REP_NONE;
- // Try to remove from fs cache if no client was interested in this data
- if ( !served && link->cacheFd != -1 ) {
- posix_fadvise( link->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
- }
+ dnbd3_queue_client_t *next;
+ for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) {
+ assert( c->from >= start && c->to <= end );
+ (*c->callback)( c->data, c->handle, c->from, (uint32_t)( c->to - c->from ),
+ (const char*)( uplink->recvBuffer + (c->from - start) ) );
+ next = c->next;
+ free( c );
}
- if ( served ) {
+ if ( entry->clients != NULL ) {
// Was some client -- reset idle counter
- link->idleTime = 0;
+ uplink->idleTime = 0;
// Re-enable replication if disabled
- if ( link->nextReplicationIndex == -1 ) {
- link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
+ if ( uplink->nextReplicationIndex == -1 ) {
+ uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
+ }
+ } else {
+ if ( uplink->cacheFd != -1 ) {
+ // Try to remove from fs cache if no client was interested in this data
+ posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
}
}
+ free( entry );
+ } // main receive loop
+ // Trigger background replication if applicable
+ if ( !sendReplicationRequest( uplink ) ) {
+ goto error_cleanup;
}
- if ( link->replicationHandle == REP_NONE ) {
- mutex_lock( &link->queueLock );
- const bool rep = ( link->queueLen == 0 );
- mutex_unlock( &link->queueLock );
- if ( rep ) uplink_sendReplicationRequest( link );
- }
+ // Normal end
return;
// Error handling from failed receive or message parsing
- error_cleanup: ;
- uplink_connectionFailed( link, true );
+error_cleanup: ;
+ connectionFailed( uplink, true );
}
-static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew)
+/**
+ * Only call from uplink thread
+ */
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
{
- if ( link->fd == -1 )
+ assert_uplink_thread();
+ if ( uplink->current.fd == -1 )
return;
- altservers_serverFailed( &link->currentServer );
- mutex_lock( &link->sendMutex );
- close( link->fd );
- link->fd = -1;
- mutex_unlock( &link->sendMutex );
- link->replicationHandle = REP_NONE;
- if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) {
- link->nextReplicationIndex = 0;
+ setThreadName( "panic-uplink" );
+ altservers_serverFailed( uplink->current.index );
+ mutex_lock( &uplink->sendMutex );
+ uplink->image->problem.uplink = true;
+ close( uplink->current.fd );
+ uplink->current.fd = -1;
+ mutex_unlock( &uplink->sendMutex );
+ if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
+ uplink->nextReplicationIndex = 0;
}
if ( !findNew )
return;
- mutex_lock( &link->rttLock );
- bool bail = link->rttTestResult == RTT_INPROGRESS || link->betterFd != -1;
- mutex_unlock( &link->rttLock );
+ mutex_lock( &uplink->rttLock );
+ bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->better.fd != -1;
+ mutex_unlock( &uplink->rttLock );
if ( bail )
return;
- altservers_findUplink( link );
+ altservers_findUplinkAsync( uplink );
}
/**
- * Send keep alive request to server
+ * Send keep alive request to server.
+ * Called from uplink thread, current.fd must be valid.
*/
-static int uplink_sendKeepalive(const int fd)
+static bool sendKeepalive(dnbd3_uplink_t *uplink)
{
- static dnbd3_request_t request = { 0 };
- if ( request.magic == 0 ) {
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_KEEPALIVE;
- fixup_request( request );
- }
- return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+ static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
+ assert_uplink_thread();
+ mutex_lock( &uplink->sendMutex );
+ bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+ mutex_unlock( &uplink->sendMutex );
+ return sendOk;
}
-static void uplink_addCrc32(dnbd3_connection_t *uplink)
+/**
+ * Request crclist from uplink.
+ * Called from uplink thread, current.fd must be valid.
+ * FIXME This is broken as it could happen that another message arrives after sending
+ * the request. Refactor, split and move receive into general receive handler.
+ */
+static void requestCrc32List(dnbd3_uplink_t *uplink)
{
dnbd3_image_t *image = uplink->image;
if ( image == NULL || image->virtualFilesize == 0 ) return;
@@ -962,7 +1202,10 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink)
uint32_t masterCrc;
uint32_t *buffer = malloc( bytes );
mutex_lock( &uplink->sendMutex );
- bool sendOk = dnbd3_get_crc32( uplink->fd, &masterCrc, buffer, &bytes );
+ bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes );
+ if ( !sendOk ) {
+ uplink->image->problem.uplink = true;
+ }
mutex_unlock( &uplink->sendMutex );
if ( !sendOk || bytes == 0 ) {
free( buffer );
@@ -972,7 +1215,7 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink)
lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes );
lists_crc = net_order_32( lists_crc );
if ( lists_crc != masterCrc ) {
- logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name );
+ logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", PIMG(uplink->image) );
free( buffer );
return;
}
@@ -982,10 +1225,14 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink)
char path[len];
snprintf( path, len, "%s.crc", uplink->image->path );
const int fd = open( path, O_WRONLY | O_CREAT, 0644 );
- if ( fd >= 0 ) {
- write( fd, &masterCrc, sizeof(uint32_t) );
- write( fd, buffer, bytes );
+ if ( fd != -1 ) {
+ ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+ ret += write( fd, buffer, bytes );
close( fd );
+ if ( (size_t)ret != sizeof(masterCrc) + bytes ) {
+ unlink( path );
+ logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", PIMG(uplink->image) );
+ }
}
}
@@ -997,91 +1244,77 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink)
* it will be closed first. Otherwise, nothing will happen and true will be returned
* immediately.
*/
-static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force)
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
{
- if ( link->cacheFd != -1 ) {
+ if ( uplink->cacheFd != -1 ) {
if ( !force ) return true;
- close( link->cacheFd );
+ close( uplink->cacheFd );
}
- link->cacheFd = open( link->image->path, O_WRONLY | O_CREAT, 0644 );
- return link->cacheFd != -1;
+ uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 );
+ uplink->image->problem.write = uplink->cacheFd == -1;
+ return uplink->cacheFd != -1;
}
/**
- * Saves the cache map of the given image.
- * Return true on success.
- * Locks on: imageListLock, image.lock
+ * Returns true if the uplink has been idle for longer than
+ * SERVER_UPLINK_IDLE_TIMEOUT and background replication is either not
+ * set to full (e.g. hashblock mode) or the configured minimum number
+ * of active clients is not currently reached.
*/
-static bool uplink_saveCacheMap(dnbd3_connection_t *link)
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink)
{
- dnbd3_image_t *image = link->image;
- assert( image != NULL );
-
- if ( link->cacheFd != -1 ) {
- if ( fsync( link->cacheFd ) == -1 ) {
- // A failing fsync means we have no guarantee that any data
- // since the last fsync (or open if none) has been saved. Apart
- // from keeping the cache_map from the last successful fsync
- // around and restoring it there isn't much we can do to recover
- // a consistent state. Bail out.
- logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
- logadd( LOG_ERROR, "Bailing out immediately" );
- exit( 1 );
- }
- }
-
- if ( image->cache_map == NULL ) return true;
- logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
- mutex_lock( &image->lock );
- // Lock and get a copy of the cache map, as it could be freed by another thread that is just about to
- // figure out that this image's cache copy is complete
- if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) {
- mutex_unlock( &image->lock );
- return true;
- }
- const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
- uint8_t *map = malloc( size );
- memcpy( map, image->cache_map, size );
- // Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
- // cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O
- mutex_unlock( &image->lock );
- assert( image->path != NULL );
- char mapfile[strlen( image->path ) + 4 + 1];
- strcpy( mapfile, image->path );
- strcat( mapfile, ".map" );
+ return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
+ && ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
+}
- int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
- if ( fd == -1 ) {
- const int err = errno;
- free( map );
- logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
+{
+ int current;
+ mutex_lock( &uplink->rttLock );
+ current = uplink->current.fd == -1 ? -1 : uplink->current.index;
+ mutex_unlock( &uplink->rttLock );
+ if ( current == -1 )
return false;
- }
+ return altservers_toString( current, buffer, len );
+}
- size_t done = 0;
- while ( done < size ) {
- const ssize_t ret = write( fd, map, size - done );
- if ( ret == -1 ) {
- if ( errno == EINTR ) continue;
- logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
- break;
- }
- if ( ret <= 0 ) {
- logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+/**
+ * Get number of replication requests that should be sent right now to
+ * meet the configured bgrWindowSize. Returns 0 if any client requests
+ * are pending.
+ * This applies a sort of "slow start" in case the uplink was recently
+ * dealing with actual client requests, in that the uplink's idle time
+ * (in seconds) is an upper bound for the number returned, so we don't
+ * saturate the uplink with loads of requests right away, in case that
+ * client triggers more requests to the uplink server.
+ */
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink)
+{
+ int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 );
+ if ( uplink->queueLen == 0 )
+ return ret;
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->clients == NULL ) {
+ ret--;
+ } else {
+ ret = 0; // Do not allow BGR if client requests are being handled
break;
}
- done += (size_t)ret;
- }
- if ( fsync( fd ) == -1 ) {
- logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
}
- close( fd );
- free( map );
- return true;
+ mutex_unlock( &uplink->queueLock );
+ return ret;
}
-static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link)
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle)
{
- return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT && _backgroundReplication != BGR_FULL );
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->handle == handle ) {
+ it->sent = false;
+ break;
+ }
+ }
+ mutex_unlock( &uplink->queueLock );
}
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 2b41dfc..b6037d6 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -2,7 +2,7 @@
#define _UPLINK_H_
#include "globals.h"
-#include "../types.h"
+#include <dnbd3/types.h>
void uplink_globalsInit();
@@ -10,10 +10,14 @@ uint64_t uplink_getTotalBytesReceived();
bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version);
-void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client);
+void uplink_removeEntry(dnbd3_uplink_t *uplink, void *data, uplink_callback callback);
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
+bool uplink_requestClient(dnbd3_client_t *client, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
-void uplink_shutdown(dnbd3_image_t *image);
+bool uplink_request(dnbd3_image_t *image, void *data, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length);
+
+bool uplink_shutdown(dnbd3_image_t *image);
+
+bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len);
#endif /* UPLINK_H_ */
diff --git a/src/shared/CMakeLists.txt b/src/shared/CMakeLists.txt
new file mode 100644
index 0000000..a1bd49a
--- /dev/null
+++ b/src/shared/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-shared
+ LANGUAGES C)
+
+# find atomic library required by dnbd3-shared
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to get POLLRDHUP support for signals
+add_definitions(-D_GNU_SOURCE)
+
+set(DNBD3_SHARED_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/crc32.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/log.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sockhelper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/timing.c)
+set(DNBD3_SHARED_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/eventfd.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/pipe64.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/pipe_malloc.c)
+
+add_library(dnbd3-shared STATIC ${DNBD3_SHARED_SOURCE_FILES})
+target_include_directories(dnbd3-shared PUBLIC ${PROJECT_INCLUDE_DIR})
+
+add_linter(dnbd3-shared-lint "${DNBD3_SHARED_SOURCE_FILES}" "${DNBD3_SHARED_HEADER_FILES}")
+add_linter_fix(dnbd3-shared-lint-fix "${DNBD3_SHARED_SOURCE_FILES}" "${DNBD3_SHARED_HEADER_FILES}")
diff --git a/src/shared/crc32.c b/src/shared/crc32.c
index db941d3..6cf9a18 100644
--- a/src/shared/crc32.c
+++ b/src/shared/crc32.c
@@ -38,24 +38,23 @@
*/
-#include "../types.h"
+#include <dnbd3/types.h>
#include <stddef.h>
-#define FAR
+#if defined(__x86_64__) || defined(__amd64__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <stdatomic.h>
+#define zalign(n) __attribute__((aligned(n)))
+#endif
+
#define OF(args) args
-#define local static
/* Definitions for doing the crc four data bytes at a time. */
-#if !defined(NOBYFOUR)
-# define BYFOUR
-#endif
-#ifdef BYFOUR
-# define TBLS 8
-#else
-# define TBLS 1
-#endif /* BYFOUR */
+#define TBLS 8
-local const uint32_t crc_table[TBLS][256] =
+static const uint32_t crc_table[TBLS][256] =
{
{
0x00000000U, 0x77073096U, 0xee0e612cU, 0x990951baU, 0x076dc419U,
@@ -110,7 +109,6 @@ local const uint32_t crc_table[TBLS][256] =
0xcdd70693U, 0x54de5729U, 0x23d967bfU, 0xb3667a2eU, 0xc4614ab8U,
0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
0x2d02ef8dU
-#ifdef BYFOUR
},
{
0x00000000U, 0x191b3141U, 0x32366282U, 0x2b2d53c3U, 0x646cc504U,
@@ -489,38 +487,159 @@ local const uint32_t crc_table[TBLS][256] =
0x95e6b8b1U, 0x7b490da3U, 0x1e2eb11bU, 0x483ed243U, 0x2d596efbU,
0xc3f6dbe9U, 0xa6916751U, 0x1fa9b0ccU, 0x7ace0c74U, 0x9461b966U,
0xf10605deU
-#endif
}
};
-#ifdef NO_ENDIAN
-// Currently not in use, always use the BYFOUR method with known endianness
-/* ========================================================================= */
-#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+#define PCLMUL_MIN_LEN 64
+#define PCLMUL_ALIGN 16
+#define PCLMUL_ALIGN_MASK 15
-/* ========================================================================= */
-uint32_t crc32(crc, buf, len)
- uint32_t crc;
- const uint8_t *buf;
- size_t len;
+#if defined(__x86_64__) || defined(__amd64__)
+/* crc32_simd.c
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ *
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+static uint32_t
+__attribute__((target("pclmul,sse4.1")))
+crc32pclmul(uint32_t crc, const uint8_t *buf, size_t len)
{
- if (buf == NULL) return 0;
+ /*
+ * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+ * the CRC32+Barrett polynomials given at the end of the paper.
+ */
+ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+ static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+ static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+ /*
+ * There's at least one block of 64.
+ */
+ x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+ x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+ x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+ x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+ x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+
+ x0 = _mm_load_si128((__m128i *)k1k2);
+
+ buf += 64;
+ len -= 64;
- crc = crc ^ 0xffffffffU;
- while (len >= 8) {
- DO8;
- len -= 8;
+ /*
+ * Parallel fold blocks of 64, if any.
+ */
+ while (len >= 64)
+ {
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+ x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+ x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+ x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+ x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+
+ y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+ y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+ y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+ y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+ x1 = _mm_xor_si128(x1, x5);
+ x2 = _mm_xor_si128(x2, x6);
+ x3 = _mm_xor_si128(x3, x7);
+ x4 = _mm_xor_si128(x4, x8);
+
+ x1 = _mm_xor_si128(x1, y5);
+ x2 = _mm_xor_si128(x2, y6);
+ x3 = _mm_xor_si128(x3, y7);
+ x4 = _mm_xor_si128(x4, y8);
+
+ buf += 64;
+ len -= 64;
}
- if (len) do {
- DO1;
- } while (--len);
- return crc ^ 0xffffffffU;
+
+ /*
+ * Fold into 128-bits.
+ */
+ x0 = _mm_load_si128((__m128i *)k3k4);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x2);
+ x1 = _mm_xor_si128(x1, x5);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x3);
+ x1 = _mm_xor_si128(x1, x5);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x4);
+ x1 = _mm_xor_si128(x1, x5);
+
+ /*
+ * Single fold blocks of 16, if any.
+ */
+ while (len >= 16)
+ {
+ x2 = _mm_loadu_si128((__m128i *)buf);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x2);
+ x1 = _mm_xor_si128(x1, x5);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Fold 128-bits to 64-bits.
+ */
+ x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+ x3 = _mm_setr_epi32(~0, 0, ~0, 0);
+ x1 = _mm_srli_si128(x1, 8);
+ x1 = _mm_xor_si128(x1, x2);
+
+ x0 = _mm_loadl_epi64((__m128i*)k5k0);
+
+ x2 = _mm_srli_si128(x1, 4);
+ x1 = _mm_and_si128(x1, x3);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_xor_si128(x1, x2);
+
+ /*
+ * Barrett reduce to 32-bits.
+ */
+ x0 = _mm_load_si128((__m128i*)poly);
+
+ x2 = _mm_and_si128(x1, x3);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+ x2 = _mm_and_si128(x2, x3);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+ x1 = _mm_xor_si128(x1, x2);
+
+ /*
+ * Return the crc32.
+ */
+ return _mm_extract_epi32(x1, 1);
}
#endif
-#ifdef BYFOUR
-
/*
This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
integer pointer type. This violates the strict aliasing rule, where a
@@ -533,7 +652,7 @@ uint32_t crc32(crc, buf, len)
writes to the buffer that is passed to these routines.
*/
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
/* ========================================================================= */
#define DOLIT4 c ^= *buf4++; \
c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
@@ -547,25 +666,36 @@ uint32_t crc32(crc, buf, len)
size_t len;
{
if (buf == NULL) return 0;
- register uint32_t c;
- register const uint32_t FAR *buf4;
+ uint32_t c;
c = ~crc;
- while (len && ((uintptr_t)buf & 3)) {
+ while (len && ((uintptr_t)buf & PCLMUL_ALIGN_MASK)) {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
len--;
}
-
- buf4 = (const uint32_t FAR *)(const void FAR *)buf;
- while (len >= 32) {
- DOLIT32;
- len -= 32;
- }
- while (len >= 4) {
- DOLIT4;
- len -= 4;
+#if defined(__x86_64__) || defined(__amd64__)
+ static atomic_int pclmul = -1;
+ if (pclmul == -1) {
+ pclmul = __builtin_cpu_supports("pclmul") && __builtin_cpu_supports("sse4.1");
}
- buf = (const uint8_t FAR *)buf4;
+ if (pclmul && len >= PCLMUL_MIN_LEN) {
+ c = crc32pclmul(c, buf, len & ~PCLMUL_ALIGN_MASK);
+ buf += len & ~PCLMUL_ALIGN_MASK;
+ len &= PCLMUL_ALIGN_MASK;
+ } else
+#endif
+ do {
+ const uint32_t *buf4 = (const uint32_t *)(const void *)buf;
+ while (len >= 32) {
+ DOLIT32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOLIT4;
+ len -= 4;
+ }
+ buf = (const uint8_t *)buf4;
+ } while (0);
if (len) do {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
@@ -575,7 +705,7 @@ uint32_t crc32(crc, buf, len)
}
#endif
-#ifdef BIG_ENDIAN
+#ifdef DNBD3_BIG_ENDIAN
/* ========================================================================= */
#define DOBIG4 c ^= *buf4++; \
c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
@@ -590,7 +720,7 @@ uint32_t crc32(crc, buf, len)
{
if (buf == NULL) return 0;
register uint32_t c;
- register const uint32_t FAR *buf4;
+ register const uint32_t *buf4;
c = ~net_order_32(crc);
while (len && ((uintptr_t)buf & 3)) {
@@ -598,7 +728,7 @@ uint32_t crc32(crc, buf, len)
len--;
}
- buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+ buf4 = (const uint32_t *)(const void *)buf;
while (len >= 32) {
DOBIG32;
len -= 32;
@@ -607,7 +737,7 @@ uint32_t crc32(crc, buf, len)
DOBIG4;
len -= 4;
}
- buf = (const uint8_t FAR *)buf4;
+ buf = (const uint8_t *)buf4;
if (len) do {
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
@@ -617,5 +747,3 @@ uint32_t crc32(crc, buf, len)
}
#endif
-#endif /* BYFOUR */
-
diff --git a/src/shared/fdsignal.c b/src/shared/fdsignal.c
index 5e5cf7f..1db59bd 100644
--- a/src/shared/fdsignal.c
+++ b/src/shared/fdsignal.c
@@ -1,6 +1,6 @@
-#include "fdsignal.h"
+#include <dnbd3/shared/fdsignal.h>
-#if defined(linux) || defined(__linux) || defined(__linux__)
+#if defined(__linux__)
//#warning "Using eventfd based signalling"
#include "fdsignal.inc/eventfd.c"
#elif __SIZEOF_INT__ == 4 && __SIZEOF_POINTER__ == 8
diff --git a/src/shared/log.c b/src/shared/log.c
index 055acb4..3a4739d 100644
--- a/src/shared/log.c
+++ b/src/shared/log.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Simon Rettberg
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,7 +18,7 @@
*
*/
-#include "log.h"
+#include <dnbd3/shared/log.h>
#include <stdarg.h>
#include <pthread.h>
#include <stdlib.h>
@@ -36,6 +36,7 @@ static _Atomic logmask_t maskCon = 15;
static char *logFile = NULL;
static int logFd = -1;
+static FILE *logOutStream;
static bool consoleTimestamps = false;
@@ -43,6 +44,10 @@ static bool consoleTimestamps = false;
static int writeLevel(char *buffer, logmask_t level);
+void log_init(void) {
+ logOutStream = stdout;
+}
+
bool log_hasMask(const logmask_t mask)
{
return ( ( maskFile | maskCon ) & mask ) == mask;
@@ -63,6 +68,15 @@ void log_setConsoleTimestamps(bool on)
consoleTimestamps = on;
}
+int log_setConsoleOutputStream(FILE *outputStream)
+{
+ if ( outputStream != stdout && outputStream != stderr )
+ return -EINVAL;
+
+ logOutStream = outputStream;
+ return 0;
+}
+
bool log_openLogFile(const char *path)
{
pthread_mutex_lock( &logLock );
@@ -93,10 +107,10 @@ void logadd(const logmask_t mask, const char *fmt, ...)
struct tm timeinfo;
char buffer[LINE_LEN];
bool toFile = maskFile & mask;
- bool toStdout = maskCon & mask;
+ bool toOutStream = maskCon & mask;
size_t offset;
- if ( toFile || ( toStdout && consoleTimestamps ) ) {
+ if ( toFile || ( toOutStream && consoleTimestamps ) ) {
time( &rawtime );
localtime_r( &rawtime, &timeinfo );
offset = strftime( buffer, LINE_LEN, "[%d.%m. %H:%M:%S] ", &timeinfo );
@@ -134,15 +148,11 @@ void logadd(const logmask_t mask, const char *fmt, ...)
}
pthread_mutex_unlock( &logLock );
}
- if ( toStdout ) {
- if ( consoleTimestamps ) stdoutLine = buffer;
-#ifdef AFL_MODE
- fputs( stdoutLine, stderr );
- fflush( stderr );
-#else
- fputs( stdoutLine, stdout );
- fflush( stdout );
-#endif
+ if ( toOutStream ) {
+ if ( consoleTimestamps )
+ stdoutLine = buffer;
+ fputs( stdoutLine, logOutStream );
+ fflush( logOutStream );
}
}
diff --git a/src/serialize.c b/src/shared/serialize.c
index 0bc0dcd..1f7cddd 100644
--- a/src/serialize.c
+++ b/src/shared/serialize.c
@@ -1,6 +1,6 @@
-#include "serialize.h"
-#include "types.h"
-
+// SPDX-License-Identifier: GPL-2.0
+#include <dnbd3/shared/serialize.h>
+#include <dnbd3/types.h>
void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len)
{
@@ -16,14 +16,17 @@ void serializer_reset_write(serialized_buffer_t *buffer)
uint8_t serializer_get_uint8(serialized_buffer_t *buffer)
{
- if (buffer->buffer_pointer + 1 > buffer->buffer_end) return 0;
+ if (buffer->buffer_pointer + 1 > buffer->buffer_end)
+ return 0;
return (uint8_t)*buffer->buffer_pointer++;
}
uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
{
uint16_t ret;
- if (buffer->buffer_pointer + 2 > buffer->buffer_end) return 0;
+
+ if (buffer->buffer_pointer + 2 > buffer->buffer_end)
+ return 0;
memcpy(&ret, buffer->buffer_pointer, 2);
buffer->buffer_pointer += 2;
return net_order_16(ret);
@@ -32,7 +35,9 @@ uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
{
uint64_t ret;
- if (buffer->buffer_pointer + 8 > buffer->buffer_end) return 0;
+
+ if (buffer->buffer_pointer + 8 > buffer->buffer_end)
+ return 0;
memcpy(&ret, buffer->buffer_pointer, 8);
buffer->buffer_pointer += 8;
return net_order_64(ret);
@@ -41,22 +46,29 @@ uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
char *serializer_get_string(serialized_buffer_t *buffer)
{
char *ptr = buffer->buffer_pointer, *start = buffer->buffer_pointer;
- if (ptr >= buffer->buffer_end) return NULL;
- while (ptr < buffer->buffer_end && *ptr) ++ptr;
- if (*ptr) return NULL; // String did not terminate within buffer (possibly corrupted/malicious packet)
+
+ if (ptr >= buffer->buffer_end)
+ return NULL;
+ while (ptr < buffer->buffer_end && *ptr)
+ ++ptr;
+ // String did not terminate within buffer (possibly corrupted/malicious packet)
+ if (*ptr)
+ return NULL;
buffer->buffer_pointer = ptr + 1;
return start;
}
void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value)
{
- if (buffer->buffer_pointer + 1 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 1 > buffer->buffer_end)
+ return;
*buffer->buffer_pointer++ = (char)value;
}
void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
{
- if (buffer->buffer_pointer + 2 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 2 > buffer->buffer_end)
+ return;
value = net_order_16(value);
memcpy(buffer->buffer_pointer, &value, 2);
buffer->buffer_pointer += 2;
@@ -64,7 +76,8 @@ void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
{
- if (buffer->buffer_pointer + 8 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 8 > buffer->buffer_end)
+ return;
value = net_order_64(value);
memcpy(buffer->buffer_pointer, &value, 8);
buffer->buffer_pointer += 8;
@@ -73,12 +86,14 @@ void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
void serializer_put_string(serialized_buffer_t *buffer, const char *value)
{
const size_t len = strlen(value) + 1;
- if (buffer->buffer_pointer + len > buffer->buffer_end) return;
+
+ if (buffer->buffer_pointer + len > buffer->buffer_end)
+ return;
memcpy(buffer->buffer_pointer, value, len);
buffer->buffer_pointer += len;
}
uint32_t serializer_get_written_length(serialized_buffer_t *buffer)
{
- return (uint32_t)( buffer->buffer_pointer - buffer->buffer );
+ return (uint32_t)(buffer->buffer_pointer - buffer->buffer);
}
diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c
index ab34aa1..5096320 100644
--- a/src/shared/sockhelper.c
+++ b/src/shared/sockhelper.c
@@ -1,6 +1,8 @@
-#include "sockhelper.h"
-#include "log.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/types.h>
#include <arpa/inet.h> // inet_ntop
+#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <unistd.h>
@@ -19,8 +21,7 @@ struct _poll_list {
int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms)
{
// TODO: Move out of here, this unit should contain general socket functions
- // TODO: Abstract away from sockaddr_in* like the rest of the functions here do,
- // so WITH_IPV6 can finally be removed as everything is transparent. b- but how?
+ // TODO: Abstract away from sockaddr_in* like the rest of the functions here do
struct sockaddr_storage ss;
int proto, addrlen;
memset( &ss, 0, sizeof ss );
@@ -32,9 +33,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
addr4->sin_port = addr->port;
proto = PF_INET;
addrlen = sizeof *addr4;
- }
-#ifdef WITH_IPV6
- else if ( addr->type == HOST_IP6 ) {
+ } else if ( addr->type == HOST_IP6 ) {
// Set host (IPv6)
struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)&ss;
addr6->sin6_family = AF_INET6;
@@ -42,10 +41,9 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
addr6->sin6_port = addr->port;
proto = PF_INET6;
addrlen = sizeof *addr6;
- }
-#endif
- else {
+ } else {
logadd( LOG_DEBUG1, "Unsupported address type: %d\n", (int)addr->type );
+ errno = EAFNOSUPPORT;
return -1;
}
int client_sock = socket( proto, SOCK_STREAM, IPPROTO_TCP );
@@ -56,9 +54,13 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
} else {
sock_setTimeout( client_sock, connect_ms );
}
+ // NODELAY makes sense for the client side, which should be all users in this code base
+ int e2 = 1;
+ setsockopt( client_sock, IPPROTO_TCP, TCP_NODELAY, (void *)&e2, sizeof(e2) );
for ( int i = 0; i < 5; ++i ) {
int ret = connect( client_sock, (struct sockaddr *)&ss, addrlen );
- if ( ret != -1 || errno == EINPROGRESS || errno == EISCONN ) break;
+ e2 = errno;
+ if ( ret != -1 || ( connect_ms == -1 && errno == EINPROGRESS ) || errno == EISCONN ) break;
if ( errno == EINTR ) {
// http://www.madore.org/~david/computers/connect-intr.html
#ifdef __linux__
@@ -67,21 +69,26 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
struct pollfd unix_really_sucks = { .fd = client_sock, .events = POLLOUT | POLLIN };
while ( i-- > 0 ) {
int pr = poll( &unix_really_sucks, 1, connect_ms == 0 ? -1 : connect_ms );
+ e2 = errno;
if ( pr == 1 && ( unix_really_sucks.revents & POLLOUT ) ) break;
if ( pr == -1 && errno == EINTR ) continue;
close( client_sock );
+ errno = e2;
return -1;
}
- sockaddr_storage junk;
+ struct sockaddr_storage junk;
socklen_t more_junk = sizeof(junk);
if ( getpeername( client_sock, (struct sockaddr*)&junk, &more_junk ) == -1 ) {
+ e2 = errno;
close( client_sock );
+ errno = e2;
return -1;
}
break;
#endif
} // EINTR
close( client_sock );
+ errno = e2;
return -1;
}
if ( connect_ms != -1 && connect_ms != rw_ms ) {
@@ -157,7 +164,7 @@ bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
memcpy( host->addr, &addr4->sin_addr, 4 );
return true;
}
-#ifdef WITH_IPV6
+
if ( sa->sa_family == AF_INET6 ) {
// Set host (IPv6)
struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)sa;
@@ -166,7 +173,7 @@ bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
memcpy( host->addr, &addr6->sin6_addr, 16 );
return true;
}
-#endif
+
return false;
}
@@ -234,7 +241,10 @@ size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLe
outlen = snprintf( output, len, "[%s]:%s", host, port );
}
}
- if ( outlen <= 0 ) return 0;
+ if ( outlen <= 0 ) {
+ output[0] = '\0';
+ return 0;
+ }
return MIN( (size_t)outlen, len-1 );
}
@@ -338,7 +348,7 @@ int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_m
if ( i != list->count ) list->entry[i] = list->entry[list->count];
if ( fd != -1 ) {
sock_set_block( fd );
- if ( rw_ms != -1 && rw_ms != connect_ms ) {
+ if ( rw_ms != -1 ) {
sock_setTimeout( fd, rw_ms );
}
return fd;
diff --git a/src/shared/timing.c b/src/shared/timing.c
index 4ca1002..bdb8388 100644
--- a/src/shared/timing.c
+++ b/src/shared/timing.c
@@ -1,4 +1,4 @@
-#include "timing.h"
+#include <dnbd3/shared/timing.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
diff --git a/src/version.c.in b/src/version.c.in
deleted file mode 100644
index 54854c9..0000000
--- a/src/version.c.in
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "version.h"
-
-const char * VERSION_STRING = "@VERSION@";
-
diff --git a/src/version.h b/src/version.h
deleted file mode 100644
index 0c4a66b..0000000
--- a/src/version.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef VERSION_H_
-#define VERSION_H_
-
-extern const char *VERSION_STRING;
-
-// This is done in a little weird way but otherwise eclipse complains about
-// unresolvable symbols etc...
-#include "version.c"
-
-#endif /* VERSION_H_ */