summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFrederic Robra2019-06-25 17:03:28 +0200
committerFrederic Robra2019-06-25 17:03:28 +0200
commit43e57ce5e11e9052f5a7db66f2e8613f1784f919 (patch)
treec5e1372a160b2601f61b18d617b71799b06b02ae
downloaddnbd3-ng-43e57ce5e11e9052f5a7db66f2e8613f1784f919.tar.gz
dnbd3-ng-43e57ce5e11e9052f5a7db66f2e8613f1784f919.tar.xz
dnbd3-ng-43e57ce5e11e9052f5a7db66f2e8613f1784f919.zip
first version of dnbd3-ng
-rw-r--r--.gitignore9
-rw-r--r--CMakeLists.txt240
-rw-r--r--COPYING340
-rw-r--r--Kbuild.in2
-rw-r--r--LOCKS80
-rwxr-xr-xbuild.sh6
-rw-r--r--cmake/FindFuse.cmake30
-rw-r--r--cmake/FindJansson.cmake59
-rw-r--r--conf/README.server30
-rw-r--r--conf/alt-servers4
-rw-r--r--conf/rpc.acl5
-rw-r--r--conf/server.conf57
-rwxr-xr-xget-version.sh22
-rwxr-xr-xpack.sh6
-rw-r--r--src/bench/connection.c133
-rw-r--r--src/bench/connection.h26
-rw-r--r--src/bench/helper.c37
-rw-r--r--src/bench/helper.h38
-rw-r--r--src/bench/main.c154
-rw-r--r--src/bench/serialize.c5
-rw-r--r--src/client/client.c670
-rw-r--r--src/clientconfig.h36
-rw-r--r--src/config.h43
-rw-r--r--src/fuse/connection.c927
-rw-r--r--src/fuse/connection.h35
-rw-r--r--src/fuse/helper.c36
-rw-r--r--src/fuse/helper.h35
-rw-r--r--src/fuse/main.c420
-rw-r--r--src/fuse/serialize.c5
-rw-r--r--src/kernel/core.c484
-rw-r--r--src/kernel/dnbd3.h86
-rw-r--r--src/kernel/sysfs.c205
-rw-r--r--src/kernel/sysfs.h45
-rw-r--r--src/kernel/utils.c41
-rw-r--r--src/kernel/utils.h29
-rw-r--r--src/serialize.c84
-rw-r--r--src/serialize.h40
-rw-r--r--src/server/altservers.c612
-rw-r--r--src/server/altservers.h30
-rw-r--r--src/server/fileutil.c128
-rw-r--r--src/server/fileutil.h17
-rw-r--r--src/server/globals.c321
-rw-r--r--src/server/globals.h277
-rw-r--r--src/server/helper.c146
-rw-r--r--src/server/helper.h42
-rw-r--r--src/server/image.c1794
-rw-r--r--src/server/image.h63
-rw-r--r--src/server/ini.c164
-rw-r--r--src/server/ini.h66
-rw-r--r--src/server/integrity.c274
-rw-r--r--src/server/integrity.h12
-rw-r--r--src/server/locks.c306
-rw-r--r--src/server/locks.h85
-rw-r--r--src/server/net.c731
-rw-r--r--src/server/net.h40
-rw-r--r--src/server/picohttpparser/README.md116
-rw-r--r--src/server/picohttpparser/picohttpparser.c620
-rw-r--r--src/server/picohttpparser/picohttpparser.h92
-rw-r--r--src/server/rpc.c504
-rw-r--r--src/server/rpc.h10
-rw-r--r--src/server/serialize.c5
-rw-r--r--src/server/server.c495
-rw-r--r--src/server/server.h34
-rw-r--r--src/server/threadpool.c126
-rw-r--r--src/server/threadpool.h29
-rw-r--r--src/server/uplink.c1034
-rw-r--r--src/server/uplink.h19
-rw-r--r--src/server/urldecode.c61
-rw-r--r--src/server/urldecode.h19
-rw-r--r--src/serverconfig.h56
-rw-r--r--src/shared/crc32.c621
-rw-r--r--src/shared/crc32.h9
-rw-r--r--src/shared/fdsignal.c14
-rw-r--r--src/shared/fdsignal.h57
-rw-r--r--src/shared/fdsignal.inc/eventfd.c74
-rw-r--r--src/shared/fdsignal.inc/pipe64.c88
-rw-r--r--src/shared/fdsignal.inc/pipe_malloc.c89
-rw-r--r--src/shared/log.c204
-rw-r--r--src/shared/log.h65
-rw-r--r--src/shared/protocol.h159
-rw-r--r--src/shared/sockhelper.c430
-rw-r--r--src/shared/sockhelper.h120
-rw-r--r--src/shared/timing.c21
-rw-r--r--src/shared/timing.h162
-rw-r--r--src/types.h196
-rw-r--r--src/version.c.in4
-rw-r--r--src/version.h30
87 files changed, 15175 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ef0f43e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+build/
+.cproject
+.project
+*.swp
+.autotools
+.idea
+/version.txt
+.settings/
+.gdbinit
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..0141b05
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,240 @@
+################################################################################
+# GENERAL #
+################################################################################
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6.2) # must come before PROJECT() so policy defaults are in effect
+PROJECT(dnbd3 C)
+IF (CMAKE_BUILD_TYPE STREQUAL "")
+ SET(CMAKE_BUILD_TYPE Debug)
+ENDIF()
+
+SET(CMAKE_INSTALL_PREFIX "/usr/local" CACHE PATH "Path prefix for system installation")
+OPTION(BUILD_FUSE_CLIENT "Build dnbd3 fuse client" ON)
+OPTION(BUILD_SERVER "Build dnbd3 server" ON)
+OPTION(BUILD_STRESSTEST "Build dnbd3 stress testing tool" OFF)
+SET(EXTRA_C_FLAGS "" CACHE STRING "Additional options to pass to compiler")
+
+OPTION(SERVER_FOR_AFL "Build dnbd3-server for usage with afl-fuzz" OFF)
+
+# Is there a saner way to check if build type is debug or release?
+# When specifying, it is case insensitive, so DeBuG would also enable debug builds,
+# but in cmake, we can only do case sensitive matches... :/
+string( TOLOWER "${CMAKE_BUILD_TYPE}" bt_lower )
+if (NOT bt_lower MATCHES "^(debug|release)$")
+ message( FATAL_ERROR "Build type needs to be either Debug or Release" )
+endif()
+
+message( "Build Type selected: ${CMAKE_BUILD_TYPE}" )
+
+IF(CMAKE_SYSTEM_NAME MATCHES "BSD")
+ message("Detected *BSD System: disable build of Kernel Module.")
+ SET(BUILD_KERNEL_MODULE False)
+ELSE()
+ OPTION(BUILD_KERNEL_MODULE "Build the dnbd3 Linux kernel module" ON)
+ENDIF()
+
+if(CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER MATCHES "clang") # trust the detected compiler ID first; the path match stays as a fallback
+ message( "Using clang flags." )
+ SET(CMAKE_C_FLAGS_DEBUG "-std=c11 -O1 -fno-omit-frame-pointer -g -Wall -Wextra -Wpedantic -Wno-unused-result -D_GNU_SOURCE -D_DEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
+ SET(CMAKE_C_FLAGS_RELEASE "-std=c11 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
+elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER MATCHES "(cc-)|(cc$)") # path fallback still accepts binaries named e.g. "cc" or "gcc-9"
+ message( "Using (g)cc flags." )
+ SET(CMAKE_C_FLAGS_DEBUG "-std=c11 -O0 -g -Wall -Wextra -Wpedantic -Wconversion -Wno-sign-conversion -D_GNU_SOURCE -D_DEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
+ SET(CMAKE_C_FLAGS_RELEASE "-std=c11 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}")
+else()
+ message( FATAL_ERROR "Could not determine compiler type." )
+endif()
+#SET(CMAKE_CXX_FLAGS_DEBUG "-std=c99 -O0 -g -Wall -Wno-unused-result -D_GNU_SOURCE -D_DEBUG")
+#SET(CMAKE_CXX_FLAGS_RELEASE "-std=c99 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG" )
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
+
+ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
+ADD_DEFINITIONS(-DWITH_IPV6)
+
+FIND_PACKAGE(Threads)
+
+SET(DO_ABORT False)
+
+message( " *************************************************" )
+if(BUILD_FUSE_CLIENT)
+ FIND_PACKAGE(Fuse)
+ if(NOT FUSE_FOUND)
+ message( " *** No fuse dev libs found, can't build dnbd3-fuse" )
+ SET(DO_ABORT True)
+ endif()
+ if(NOT THREADS_FOUND)
+ message( " *** No threads found, can't build dnbd3-fuse" )
+ SET(DO_ABORT True)
+ endif()
+endif()
+if(BUILD_SERVER)
+ FIND_PACKAGE(Jansson)
+ if(NOT THREADS_FOUND)
+ message( " *** No threads found, can't build dnbd3-server" )
+ SET(DO_ABORT True)
+ endif()
+ if(NOT JANSSON_FOUND)
+ message( " *** No jansson lib found, can't build dnbd3-server" )
+ SET(DO_ABORT True)
+ endif()
+endif()
+if(BUILD_STRESSTEST)
+ if(NOT THREADS_FOUND)
+ message( " *** No threads found, can't build dnbd3-bench" )
+ SET(DO_ABORT True)
+ endif()
+endif()
+message( " *************************************************" )
+if(DO_ABORT)
+ message( FATAL_ERROR "Aborting." )
+endif()
+
+#SET(FUSE_INCLUDE_DIR "")
+#SET(JANSSON_INCLUDE_DIR "")
+
+################################################################################
+# VERSION HEADER #
+################################################################################
+
+FILE(WRITE ${CMAKE_BINARY_DIR}/version.cmake
+"EXECUTE_PROCESS(
+ COMMAND \${CMD}
+ OUTPUT_VARIABLE VERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ CONFIGURE_FILE(\${SRC} \${DST} @ONLY)
+")
+ADD_CUSTOM_TARGET(
+ version
+ ${CMAKE_COMMAND} -D SRC=${CMAKE_SOURCE_DIR}/src/version.c.in
+ -D DST=${CMAKE_BINARY_DIR}/version.c
+ -D CMD=${CMAKE_SOURCE_DIR}/get-version.sh
+ -P ${CMAKE_BINARY_DIR}/version.cmake
+)
+
+## This is required if you're not building the kernel module
+## TODO: Find a nicer way to avoid parent includes,
+## especially the ../version.h -> version.c -> version.h cycle
+FILE(GLOB COMMON_HEADER_FILES src/*.h)
+FOREACH(COMMON_HEADER_FILE ${COMMON_HEADER_FILES})
+ CONFIGURE_FILE(${COMMON_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
+ENDFOREACH( COMMON_HEADER_FILE )
+
+
+################################################################################
+# CLIENT #
+################################################################################
+
+if(BUILD_KERNEL_MODULE)
+ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR})
+ FILE(GLOB_RECURSE CLIENT_SRCS src/client/*.c)
+ ADD_EXECUTABLE(dnbd3-client ${CLIENT_SRCS})
+ TARGET_LINK_LIBRARIES(dnbd3-client)
+ ADD_DEPENDENCIES(dnbd3-client version)
+ INSTALL(TARGETS dnbd3-client RUNTIME DESTINATION sbin)
+ENDIF()
+
+
+################################################################################
+# SERVER #
+################################################################################
+
+if(BUILD_SERVER)
+ IF(SERVER_FOR_AFL)
+ message(" ######################## Building server for AFL mode - will be useless otherwise!")
+ ADD_DEFINITIONS(-DAFL_MODE)
+ ENDIF()
+ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${JANSSON_INCLUDE_DIR})
+ FILE(GLOB SERVER_SRCS src/server/*.c src/shared/*.c src/server/picohttpparser/*.c)
+ ADD_EXECUTABLE(dnbd3-server ${SERVER_SRCS})
+ TARGET_LINK_LIBRARIES(dnbd3-server ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
+ if(UNIX AND NOT APPLE)
+ target_link_libraries(dnbd3-server rt)
+ endif()
+ ADD_DEPENDENCIES(dnbd3-server version)
+ INSTALL(TARGETS dnbd3-server RUNTIME DESTINATION sbin)
+endif()
+
+
+
+################################################################################
+# FUSE #
+################################################################################
+
+if(BUILD_FUSE_CLIENT)
+ INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${FUSE_INCLUDE_DIR})
+ FILE(GLOB FUSE_SRCS src/fuse/*.c src/shared/*.c)
+ ADD_EXECUTABLE(dnbd3-fuse ${FUSE_SRCS})
+ TARGET_LINK_LIBRARIES(dnbd3-fuse ${CMAKE_THREAD_LIBS_INIT} ${FUSE_LIBRARIES})
+ ADD_DEPENDENCIES(dnbd3-fuse version)
+ INSTALL(TARGETS dnbd3-fuse RUNTIME DESTINATION bin)
+endif()
+
+################################################################################
+# STRESSTEST #
+################################################################################
+
+if(BUILD_STRESSTEST)
+ FILE(GLOB BENCH_SRCS src/bench/*.c src/shared/*.c)
+ ADD_EXECUTABLE(dnbd3-bench ${BENCH_SRCS})
+ TARGET_LINK_LIBRARIES(dnbd3-bench ${CMAKE_THREAD_LIBS_INIT})
+ ADD_DEPENDENCIES(dnbd3-bench version)
+ INSTALL(TARGETS dnbd3-bench RUNTIME DESTINATION bin)
+endif()
+
+################################################################################
+# MODULE #
+################################################################################
+
+IF(BUILD_KERNEL_MODULE)
+ SET(MODULE_NAME dnbd3)
+ SET(MODULE_FILE ${MODULE_NAME}.ko)
+ FILE(GLOB MODULE_SOURCE_FILES src/kernel/*.c src/serialize.c)
+ FILE(GLOB MODULE_HEADER_FILES src/kernel/*.h)
+
+ SET(KERNEL_DIR "" CACHE PATH "Path to kernel sources to compile against")
+ IF(KERNEL_DIR STREQUAL "")
+ SET(KERNEL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build")
+ ENDIF()
+
+ SET(KBUILD_COMMAND ${CMAKE_MAKE_PROGRAM} -C ${KERNEL_DIR}
+ M=${CMAKE_BINARY_DIR} modules
+ )
+
+ CONFIGURE_FILE(Kbuild.in ${CMAKE_BINARY_DIR}/Kbuild)
+
+ FOREACH(MODULE_SOURCE_FILE ${MODULE_SOURCE_FILES})
+ CONFIGURE_FILE(${MODULE_SOURCE_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
+ ENDFOREACH( MODULE_SOURCE_FILE )
+
+ FOREACH(MODULE_HEADER_FILE ${MODULE_HEADER_FILES})
+ CONFIGURE_FILE(${MODULE_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY)
+ ENDFOREACH( MODULE_HEADER_FILE )
+
+ ADD_CUSTOM_COMMAND(
+ OUTPUT ${CMAKE_BINARY_DIR}/${MODULE_FILE}
+ COMMAND ${KBUILD_COMMAND}
+ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+ DEPENDS ${MODULE_SOURCE_FILES} ${MODULE_HEADER_FILES} Kbuild.in # headers too, so a header-only change rebuilds the module
+ VERBATIM
+ )
+
+ ADD_CUSTOM_TARGET(${MODULE_NAME} ALL DEPENDS ${CMAKE_BINARY_DIR}/${MODULE_FILE})
+
+ INSTALL(FILES ${CMAKE_BINARY_DIR}/${MODULE_NAME}.ko
+ DESTINATION /lib/modules/${CMAKE_SYSTEM_VERSION}/kernel/drivers/block
+ PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+ )
+
+ INSTALL(CODE "EXECUTE_PROCESS(COMMAND depmod -a)")
+ENDIF()
+
+
+#
+# Other install files
+#
+
+FILE(GLOB conf_files "${CMAKE_CURRENT_SOURCE_DIR}/conf/*")
+INSTALL(FILES ${conf_files} DESTINATION /etc/dnbd3-server/sample/)
+
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d60c31a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Kbuild.in b/Kbuild.in
new file mode 100644
index 0000000..ec8b830
--- /dev/null
+++ b/Kbuild.in
@@ -0,0 +1,2 @@
+obj-m := ${MODULE_NAME}.o
+${MODULE_NAME}-objs += core.o sysfs.o utils.o
diff --git a/LOCKS b/LOCKS
new file mode 100644
index 0000000..935dadb
--- /dev/null
+++ b/LOCKS
@@ -0,0 +1,80 @@
+Some notes about locking in dnbd3
+
+The order of acquiring multiple locks is
+VERY IMPORTANT, as you'll produce a possible deadlock
+if you do it in the wrong order.
+Take very good care of locking order if you have lots
+of functions that call each other. You might lose
+track of what's going on. ;)
+
+===== FUSE =====
+mutexInit
+newAltLock
+altLock
+connection.sendMutex
+requests.lock
+
+===== SERVER =====
+This is a list of used locks, in the order they
+have to be acquired if you must hold multiple locks:
+remoteCloneLock | reloadLock
+_clients_lock
+_clients[].lock
+integrityQueueLock
+_images_lock
+_images[].lock
+pendingLockConsume
+pendingLockProduce
+uplink.queueLock
+altServersLock
+client.sendMutex
+client.statsLock
+statisticsSentLock
+statisticsReceivedLock
+uplink.rttLock
+
+If you need to lock multiple clients/images/... at once,
+lock the client with the lowest array index first.
+
+If the program logic would require to acquire the
+locks in a different order, you HAVE TO rework the
+code.
+For example, if you hold the lock for client 10 and
+you need to look up some other client. You MUST NOT
+simply fetch the _clients_lock now and then iterate
+over the clients until you find the one you need,
+as it violates the above order to first lock on the
+clients array and then the clients lock.
+Instead, you need to release client 10's lock,
+then lock on _clients_lock and iterate over the
+clients. Now you check if you either encounter
+the client you originally held the lock on, or
+the client you are looking for. You immediately
+lock on those two. You can then release the
+_clients_lock and work with both clients.
+pseudo code:
+
+// client10 is assumed to be a pointer to
+// a client, which happens to be at index 10
+lock (client10->lock);
+....
+// oh, i need another client
+unlock(client10->lock);
+lock(_clients_lock);
+client clientA = NULL, clientB = NULL;
+for (i = 0; i < _num_clients; ++i) {
+ if (client[i] == client10) {
+ clientA = client[i];
+ lock(clientA.lock);
+ } else if (client[i].something == <whatever>) {
+ clientB = client[i];
+ lock(clientB.lock);
+ }
+}
+unlock(_clients_lock);
+if (clientA && clientB) { // Make sure we actually found both!
+ // DO something important with both clients
+}
+if (clientA) unlock(clientA.lock);
+if (clientB) unlock(clientB.lock);
+
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..6726a86
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Configure and build out-of-tree in ./build.
+# Stop at the first failing step so that cmake and make never run in the
+# wrong directory or on top of a failed configure.
+mkdir -p build || exit 1
+cd build/ || exit 1
+cmake .. || exit 1
+make
diff --git a/cmake/FindFuse.cmake b/cmake/FindFuse.cmake
new file mode 100644
index 0000000..b9c6f91
--- /dev/null
+++ b/cmake/FindFuse.cmake
@@ -0,0 +1,30 @@
+# - Find fuse
+# Find the native fuse includes and library
+#
+# FUSE_INCLUDE_DIR - where to find fuse/fuse.h.
+# FUSE_LIBRARIES - List of libraries when using fuse.
+# FUSE_FOUND - True if fuse found.
+
+
+IF (FUSE_INCLUDE_DIR)
+  # Already in cache, be silent
+  SET(FUSE_FIND_QUIETLY TRUE)
+ENDIF (FUSE_INCLUDE_DIR)
+
+# The header is searched as fuse/fuse.h, so FUSE_INCLUDE_DIR is the
+# directory that contains the fuse/ subdirectory.
+FIND_PATH(FUSE_INCLUDE_DIR fuse/fuse.h)
+
+SET(FUSE_NAMES fuse)
+FIND_LIBRARY(FUSE_LIBRARY NAMES ${FUSE_NAMES} )
+
+# handle the QUIETLY and REQUIRED arguments and set FUSE_FOUND to TRUE if
+# all listed variables are TRUE.
+# The second argument is the failure message, not a flag: DEFAULT_MSG selects
+# the standard "Could NOT find ..." text. A literal REQUIRED in this position
+# would be printed verbatim as the error message.
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(FUSE DEFAULT_MSG FUSE_LIBRARY FUSE_INCLUDE_DIR)
+
+IF(FUSE_FOUND)
+  SET( FUSE_LIBRARIES ${FUSE_LIBRARY} )
+ELSE(FUSE_FOUND)
+  SET( FUSE_LIBRARIES )
+ENDIF(FUSE_FOUND)
+
+MARK_AS_ADVANCED( FUSE_LIBRARY FUSE_INCLUDE_DIR )
diff --git a/cmake/FindJansson.cmake b/cmake/FindJansson.cmake
new file mode 100644
index 0000000..3225923
--- /dev/null
+++ b/cmake/FindJansson.cmake
@@ -0,0 +1,59 @@
+# - Try to find Jansson
+# Once done this will define
+#
+# JANSSON_FOUND - system has Jansson
+# JANSSON_INCLUDE_DIRS - the Jansson include directory
+# JANSSON_LIBRARIES - Link these to use Jansson
+#
+# Copyright (c) 2011 Lee Hambley <lee.hambley@gmail.com>
+#
+# Redistribution and use is allowed according to the terms of the New
+# BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+#
+
+if (JANSSON_LIBRARIES AND JANSSON_INCLUDE_DIRS)
+  # results already provided by the caller or an earlier run
+  set(JANSSON_FOUND TRUE)
+else (JANSSON_LIBRARIES AND JANSSON_INCLUDE_DIRS)
+  # The listed PATHS are searched in addition to CMake's default locations.
+  find_path(JANSSON_INCLUDE_DIR
+    NAMES
+      jansson.h
+    PATHS
+      /usr/include
+      /usr/local/include
+      /opt/local/include
+      /sw/include
+  )
+
+  find_library(JANSSON_LIBRARY
+    NAMES
+      jansson
+    PATHS
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib
+  )
+
+  set(JANSSON_INCLUDE_DIRS
+    ${JANSSON_INCLUDE_DIR}
+  )
+
+  if (JANSSON_LIBRARY)
+    set(JANSSON_LIBRARIES
+      ${JANSSON_LIBRARIES}
+      ${JANSSON_LIBRARY}
+    )
+  endif (JANSSON_LIBRARY)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Jansson DEFAULT_MSG
+    JANSSON_LIBRARIES JANSSON_INCLUDE_DIRS)
+
+  # Hide the cache entries created by find_path()/find_library() from the
+  # default GUI view. mark_as_advanced() only applies to cache variables;
+  # JANSSON_INCLUDE_DIRS and JANSSON_LIBRARIES are plain variables.
+  mark_as_advanced(JANSSON_INCLUDE_DIR JANSSON_LIBRARY)
+
+endif (JANSSON_LIBRARIES AND JANSSON_INCLUDE_DIRS)
+
+
diff --git a/conf/README.server b/conf/README.server
new file mode 100644
index 0000000..285758b
--- /dev/null
+++ b/conf/README.server
@@ -0,0 +1,30 @@
+Configuration for dnbd3-server
+
+The server requires a config directory.
+Start it like so: ./dnbd3-server -c ./my-config/
+
+There are two files in that dir
+
+== alt-servers ==
+List of known alt-servers for this server.
+Format:
+[PREFIX]<IP:PORT> [Comment]
+
+Prefix can be:
++ - Only report server to clients as alt-server, but don't use for replication
+- - Only use server for replication, but don't advertise to clients
+No prefix means server will be advertised to clients and is used for replication
+
+If you're not running in proxy mode, this file won't do much for you
+
+== server.conf ==
+
+Main configuration file. Ini format.
+
+[dnbd3]
+basePath=/srv/openslx/dnbd3 # virtual root of image files
+serverPenalty=1234 # artificial acceptance delay for incoming server connections (µs)
+clientPenalty=2345 # artificial acceptance delay for incoming client connection (µs)
+isProxy=true # enable proxy mode - will try to replicate from alt-servers if a client requests unknown image
+uplinkTimeout=1250 # r/w timeout for connections to uplink servers (ms)
+
diff --git a/conf/alt-servers b/conf/alt-servers
new file mode 100644
index 0000000..fd2f2ec
--- /dev/null
+++ b/conf/alt-servers
@@ -0,0 +1,4 @@
+192.168.100.10 Some alt server
++192.168.100.100 My first alt server that will not be used for replication
+-192.168.100.50 Super secret alt server that will be used for replication, but clients don't know about it
+
diff --git a/conf/rpc.acl b/conf/rpc.acl
new file mode 100644
index 0000000..5167ae3
--- /dev/null
+++ b/conf/rpc.acl
@@ -0,0 +1,5 @@
+# Everything from localhost
+127.0.0.0/8 ALL
+# Some info reading for another machine
+132.230.8.113 STATS CLIENT_LIST IMAGE_LIST
+
diff --git a/conf/server.conf b/conf/server.conf
new file mode 100644
index 0000000..2f43247
--- /dev/null
+++ b/conf/server.conf
@@ -0,0 +1,57 @@
+[dnbd3]
+; port to listen on (default: 5003)
+listenPort=5003
+; relative root directory for images, ending in .r[1-9][0-9]*
+basePath=/mnt/storage/dnbd3
+; artificial connection delay for connecting servers
+serverPenalty=100000
+; artificial connection delay for connecting clients
+clientPenalty=0
+; is this server a proxy? if true, requests for non-existing images will be relayed to known alt-servers
+isProxy=true
+; if proxy is true and an image is incomplete, should idle bandwidth be used to replicate missing blocks?
+backgroundReplication=true
+; minimum amount of connected clients for background replication to kick in
+bgrMinClients=0
+; if isProxy==true and another proxy requests an image that we don't have, should we ask our alt-servers for it?
+lookupMissingForProxy=true
+; create sparse files instead of preallocating; ignored if backgroundReplication=true -- only recommended if cache space is small
+sparseFiles=false
+; if true (which is the default), images will automatically be removed from the list if they can't be accessed
+removeMissingImages=true
+; timeout in ms for send/recv on connections to uplink servers (used for replication)
+uplinkTimeout=1250
+; timeout in ms for send/recv on connections to clients (using an image on this server)
+clientTimeout=15000
+; set this to true to close handles of unused images after some timeout
+closeUnusedFd=false
+; set this to true to load files without the .r[0-9]+ extension too, assuming RID=1
+vmdkLegacyMode=false
+
+[limits]
+maxClients=2000
+maxImages=1000
+maxPayload=9M
+maxReplicationSize=150G
+
+; Log related config
+[logging]
+; log file path and name
+; comment out to disable logging to file
+; protip: use SIGUSR2 to reopen log file
+file=./dnbd3.log
+; which type of messages to log to file
+fileMask=ERROR WARNING MINOR INFO DEBUG1
+; which to log to console (stdout)
+consoleMask=ERROR WARNING MINOR INFO
+; Valid types (warning: specifying invalid types will not yield an error!)
+; ERROR Fatal error, server will terminate
+; WARNING Major issue, something is broken but keep running
+; MINOR Minor issue, more of a hiccup than serious problem
+; INFO Informational message
+; DEBUG1 Debug information, used for medium verbosity
+; DEBUG2 Used for debug messages that would show up a lot
+;
+; Whether timestamps should be output to console too (or just to file if false)
+consoleTimestamps=false
+
diff --git a/get-version.sh b/get-version.sh
new file mode 100755
index 0000000..1d4a8cb
--- /dev/null
+++ b/get-version.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# Always create version string for repository this script lies in,
+# not the cwd... Makes usage easier in cmake
+ARG0="$0"
+SELF="$(readlink -f "${ARG0}")"
+ROOT_DIR="$(dirname "${SELF}")"
+cd "$ROOT_DIR"
+
+if [ -d .git ]; then
+ [ -n "$(git diff)" ] && MODDED='+MOD'
+ echo $(git describe)$MODDED, branch $(git rev-parse --abbrev-ref HEAD), built "$(date +%Y-%m-%d)"
+ exit 0
+fi
+
+if [ -f "version.txt" ]; then
+ cat "version.txt"
+ exit 0
+fi
+
+echo "-unknown-"
+
diff --git a/pack.sh b/pack.sh
new file mode 100755
index 0000000..9cbe5c4
--- /dev/null
+++ b/pack.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+./get-version.sh > version.txt
+tar ckzf dnbd3.tar.gz src cmake CMakeLists.txt get-version.sh version.txt
+rm -- version.txt
+
diff --git a/src/bench/connection.c b/src/bench/connection.c
new file mode 100644
index 0000000..129ae3c
--- /dev/null
+++ b/src/bench/connection.c
@@ -0,0 +1,133 @@
+#include "connection.h"
+#include "helper.h"
+#include "../config.h"
+#include "../shared/protocol.h"
+#include "../shared/fdsignal.h"
+#include "../shared/sockhelper.h"
+#include "../shared/log.h"
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+
+/* Constants */
+static const size_t SHORTBUF = 100;
+#define SOCKET_KEEPALIVE_TIMEOUT (3)
+#define MAX_ALTS (8)
+#define MAX_HOSTS_PER_ADDRESS (2)
+// If a server wasn't reachable this many times, we slowly start skipping it on measurements
+static const int FAIL_BACKOFF_START_COUNT = 8;
+#define RTT_COUNT (4)
+
+/* Module variables */
+
+// Init guard
+static bool connectionInitDone = false;
+static bool keepRunning = true;
+
+static struct {
+ int sockFd;
+ pthread_mutex_t sendMutex;
+ dnbd3_signal_t* panicSignal;
+ dnbd3_host_t currentServer;
+ uint64_t startupTime;
+} connection;
+
+// Known alt servers
+typedef struct _alt_server {
+ dnbd3_host_t host;
+ int consecutiveFails;
+ int rtt;
+ int rtts[RTT_COUNT];
+ int rttIndex;
+ int bestCount;
+} alt_server_t;
+alt_server_t altservers[MAX_ALTS];
+dnbd3_server_entry_t newservers[MAX_ALTS];
+pthread_spinlock_t altLock;
+
+bool connection_init_n_times( // Runs ntimes connect + select-image attempts against hosts and tallies the results in counters; always returns true
+ const char *hosts, // space separated list of server addresses
+ const char *lowerImage, // image name sent in the select-image request
+ const uint16_t rid, // requested revision; 0 accepts whatever revision the server reports
+ int ntimes, // number of runs
+ BenchCounters* counters, // attempts is bumped once per run, success/fails per connection outcome
+ bool closeSockets // close the socket again after a successful run
+ ) {
+ for (int run_i = 0; run_i < ntimes; ++run_i) {
+ counters->attempts++;
+
+ printf(".");
+ int sock = -1;
+ char host[SHORTBUF];
+ serialized_buffer_t buffer;
+ uint16_t remoteVersion, remoteRid;
+ char *remoteName;
+ uint64_t remoteSize;
+
+ if ( !connectionInitDone && keepRunning ) { // neither flag is modified anywhere in this file (the assignment below is commented out), so this branch is always taken
+ dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+ const char *current, *end;
+ int altIndex = 0;
+ memset( altservers, 0, sizeof altservers ); // NOTE(review): altservers and connection are file-level state; concurrent calls from several bench threads race on them
+ connection.sockFd = -1;
+ current = hosts;
+ do {
+ // Get next host from string
+ while ( *current == ' ' ) current++;
+ end = strchr( current, ' ' );
+ size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1); // +1 is the room snprintf needs for the terminating NUL
+ if ( len > SHORTBUF ) len = SHORTBUF;
+ snprintf( host, len, "%s", current );
+ int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+ for ( int i = 0; i < newHosts; ++i ) {
+ if ( altIndex >= MAX_ALTS )
+ break;
+ altservers[altIndex].host = tempHosts[i];
+ altIndex += 1;
+ }
+ current = end + 1; // NOTE(review): also evaluated when end == NULL; the loop condition then exits before current is used again
+ } while ( end != NULL && altIndex < MAX_ALTS );
+ logadd( LOG_INFO, "Got %d servers from init call", altIndex );
+ // Connect
+ for ( int i = 0; i < altIndex; ++i ) {
+ if ( altservers[i].host.type == 0 )
+ continue;
+ // Try to connect
+ sock = sock_connect( &altservers[i].host, 500, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
+ if ( sock == -1 ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "Could not connect to host" );
+ } else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "Could not send select image" );
+ } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
+ } else if ( rid != 0 && rid != remoteRid ) {
+ counters->fails++;
+ logadd( LOG_ERROR, "rid mismatch" );
+ } else {
+ counters->success++;
+ break; // leaves sock open for the closeSockets check after the loop
+ }
+ // Failed
+ logadd( LOG_DEBUG1, "Server does not offer requested image... " );
+ if ( sock != -1 ) {
+ close( sock );
+ sock = -1;
+ }
+ }
+ if ( sock != -1 ) {
+ // connectionInitDone = true;
+ if (closeSockets) { // NOTE(review): when false the connected fd is dropped without close(); presumably intended to keep connections open on the server - confirm
+ close( sock );
+ }
+ }
+ }
+ }
+ return true;
+}
diff --git a/src/bench/connection.h b/src/bench/connection.h
new file mode 100644
index 0000000..9cb59ef
--- /dev/null
+++ b/src/bench/connection.h
@@ -0,0 +1,26 @@
+#ifndef _CONNECTION_H_
+#define _CONNECTION_H_
+
+#include "../shared/fdsignal.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include "helper.h"
+
+struct _dnbd3_async;
+
+typedef struct _dnbd3_async {
+ struct _dnbd3_async *next; // Next in this linked list (private field, not set by caller)
+ char* buffer; // Caller-provided buffer to be filled
+ uint64_t offset;
+ uint32_t length;
+ dnbd3_signal_t* signal; // Used to signal the caller
+ bool finished; // Will be set to true if the request has been handled
+ bool success; // Will be set to true if the request succeeded
+} dnbd3_async_t;
+
+
+bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, BenchCounters* counters, bool closeSockets);
+
+bool connection_init(const char *hosts, const char *image, const uint16_t rid);
+
+#endif /* CONNECTION_H_ */
diff --git a/src/bench/helper.c b/src/bench/helper.c
new file mode 100644
index 0000000..c89b614
--- /dev/null
+++ b/src/bench/helper.c
@@ -0,0 +1,37 @@
+#include "helper.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+//BenchCounters benchC = { .attempts = 0, .success = 0, .fails = 0};
+
+/**
+ * Write the collected image statistics to "log.txt" in the current
+ * directory, replacing any previous content. The per-block request
+ * counters are printed 50 per line. Prints an error to stdout and
+ * returns without writing if the file cannot be opened.
+ */
+void printLog( log_info *info )
+{
+    FILE *out = fopen( "log.txt", "w" );
+    if ( out == NULL ) {
+        printf( "Error creating/opening log.txt\n" );
+        return;
+    }
+
+    // Header: sizes are stored in bytes and reported in MiB
+    const uint64_t imageMiB = ( uint64_t )( info->imageSize / ( 1024ll * 1024ll ) );
+    const uint64_t receivedMiB = ( uint64_t )( info->receivedBytes / ( 1024ll * 1024ll ) );
+    fprintf( out, "ImageSize: %"PRIu64" MiB\n", imageMiB );
+    fprintf( out, "ReceivedMiB: %"PRIu64" MiB\n", receivedMiB );
+    fprintf( out, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount );
+    fprintf( out, "Blocksize: 4KiB\n\n" );
+    fprintf( out, "Block access count:\n" );
+
+    // Body: one counter per block, line break before every 50th entry
+    // (including the very first one)
+    uint64_t block = 0;
+    while ( block < info->imageBlockCount ) {
+        if ( block % 50 == 0 ) {
+            fprintf( out, "\n" );
+        }
+        fprintf( out, "%i ", ( int ) info->blockRequestCount[block] );
+        ++block;
+    }
+    fprintf( out, "\n" );
+    fclose( out );
+}
diff --git a/src/bench/helper.h b/src/bench/helper.h
new file mode 100644
index 0000000..8342a79
--- /dev/null
+++ b/src/bench/helper.h
@@ -0,0 +1,38 @@
+#ifndef IMAGEHELPER_H
+#define IMAGEHELPER_H
+
+#include "../types.h"
+
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/socket.h>
+
+typedef struct log_info { // Statistics dumped to log.txt by printLog()
+ uint64_t imageSize; // in bytes; printLog() reports it in MiB
+ uint64_t receivedBytes; // in bytes; printLog() reports it in MiB
+ uint64_t imageBlockCount; // number of entries in blockRequestCount
+ uint8_t *blockRequestCount; // one request counter per block
+} log_info;
+
+
+typedef struct BenchCounters { // Result tally of one benchmark thread
+ int attempts; // runs started
+ int success; // connections that completed the select-image handshake
+ int fails; // failed connection or handshake steps
+} BenchCounters;
+
+
+typedef struct BenchThreadData { // Per-thread parameters handed to the benchmark thread function
+ BenchCounters* counter; // where this thread records its results
+ char* server_address; // space separated list of hosts to connect to
+ char * image_name; // remote image name to request
+ int runs; // number of connection attempts for this thread
+ int threadNumber; // index used in log output
+ bool closeSockets; // close each socket after a successful connection
+} BenchThreadData;
+
+
+
+#endif
diff --git a/src/bench/main.c b/src/bench/main.c
new file mode 100644
index 0000000..2f32dbf
--- /dev/null
+++ b/src/bench/main.c
@@ -0,0 +1,154 @@
+/*
+* Butchered from the dnbd3-fuse by C.K.
+**/
+
+#include "connection.h"
+#include "helper.h"
+#include "../shared/protocol.h"
+#include "../shared/log.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <getopt.h>
+#include <pthread.h>
+
+#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
+
+
+/* Debug/Benchmark variables */
+static bool useDebug = false;
+
+
+/**
+ * Print the command line help to stdout and terminate the process.
+ * @argv0 program name as invoked, shown in the usage lines
+ * @exitCode exit status handed to exit()
+ * Only options that are actually accepted by optString/longOpts below
+ * are listed here.
+ */
+static void printUsage(char *argv0, int exitCode)
+{
+    printf( "Usage: %s [--debug] --host <serverAddress(es)> --image <imageName> [--runs <n>] [--threads <n>] [--close]\n", argv0 );
+    printf( "Or: %s [-d] -h <serverAddress(es)> -i <imageName> [-n <n>] [-t <n>] [-c]\n", argv0 );
+    printf( " -h --host List of space separated hosts to use\n" );
+    printf( " -i --image Remote image name to request\n" );
+    printf( " -n --runs Number of connection attempts per thread\n" );
+    printf( " -t --threads number of threads\n" );
+    printf( " -c --close Close each socket again after a successful connection\n" );
+    printf( " -d --debug Enable debug mode\n" );
+    printf( " -H --help Show this help\n" );
+    exit( exitCode );
+}
+
+// Short options. Every letter handled by the switch in main() has to be
+// listed here; 'c' was missing, which made the close-sockets case unreachable.
+static const char *optString = "h:i:n:t:cHvVd";
+static const struct option longOpts[] = {
+    { "host", required_argument, NULL, 'h' },
+    { "image", required_argument, NULL, 'i' },
+    // required_argument: with optional_argument, "--runs 5" leaves optarg
+    // NULL and main() would call atoi( NULL )
+    { "runs", required_argument, NULL, 'n' },
+    { "nruns", required_argument, NULL, 'n' }, // previous spelling, kept for compatibility
+    { "threads", required_argument, NULL, 't' },
+    { "close", no_argument, NULL, 'c' },
+    { "debug", no_argument, NULL, 'd' },
+    { "help", no_argument, NULL, 'H' },
+    { "version", no_argument, NULL, 'v' },
+    { 0, 0, 0, 0 }
+};
+
+
+/** Print the three counters of @c to stdout, one per line. */
+static void printBenchCounters(BenchCounters* c) {
+    const int attempts = c->attempts;
+    const int success = c->success;
+    const int fails = c->fails;
+
+    printf ("Attempts:\t%d\n", attempts);
+    printf ("Success :\t%d\n", success);
+    printf ("Fails :\t%d\n", fails);
+}
+
+
+/**
+ * Thread entry point: run the configured number of connection attempts.
+ * @t pointer to this thread's BenchThreadData
+ * Always returns NULL; results are reported through data->counter.
+ */
+void* runBenchThread(void* t) {
+    BenchThreadData* data = t;
+    connection_init_n_times(
+        data->server_address,
+        data->image_name, // previously server_address was passed here, so the host list was requested as image name
+        0, // rid 0: no specific revision requested
+        data->runs,
+        data->counter,
+        data->closeSockets);
+    printf("Thread #%d finished\n", data->threadNumber);
+    return NULL;
+}
+
+/**
+ * Benchmark entry point: parse the command line, start n_threads threads
+ * that each perform n_runs connection attempts, wait for them and print
+ * the per-thread and total counters.
+ * Returns 0 after a completed run; invalid arguments end in printUsage(),
+ * which exits the process.
+ */
+int main(int argc, char *argv[])
+{
+    char *server_address = NULL;
+    char *image_Name = NULL;
+    int opt, lidx;
+
+    bool closeSockets = false;
+    int n_runs = 100;
+    int n_threads = 1;
+
+    if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
+        printUsage( argv[0], 0 );
+    }
+
+    while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
+        switch ( opt ) {
+        case 'h':
+            server_address = optarg;
+            break;
+        case 'i':
+            image_Name = optarg;
+            break;
+        case 'n':
+            // optarg is NULL when a long option with optional argument is given without "=value"
+            if ( optarg == NULL ) printUsage( argv[0], EXIT_FAILURE );
+            n_runs = atoi(optarg);
+            break;
+        case 't':
+            if ( optarg == NULL ) printUsage( argv[0], EXIT_FAILURE );
+            n_threads = atoi(optarg);
+            break;
+        case 'c':
+            closeSockets = true;
+            break;
+        case 'H':
+            printUsage( argv[0], 0 );
+            break;
+        case 'd':
+            useDebug = true;
+            break;
+        default:
+            printUsage( argv[0], EXIT_FAILURE );
+        }
+    }
+
+    // Both values are dereferenced unconditionally by the worker threads
+    if ( server_address == NULL || image_Name == NULL ) {
+        printf( "Both a host and an image name are required\n" );
+        printUsage( argv[0], EXIT_FAILURE );
+    }
+    // n_threads sizes the arrays below, so it must be positive
+    if ( n_runs < 1 || n_threads < 1 ) {
+        printf( "Number of runs and number of threads must be positive\n" );
+        printUsage( argv[0], EXIT_FAILURE );
+    }
+
+    printf("Welcome to dnbd3 benchmark tool\n");
+
+    /* all counters */
+    BenchCounters counters[n_threads];
+    BenchThreadData threadData[n_threads];
+    pthread_t threads[n_threads];
+
+    /* create all threads */
+    int started = 0;
+    for (int i = 0; i < n_threads; i++) {
+        BenchCounters tmp1 = {0,0,0};
+        counters[i] = tmp1;
+        BenchThreadData tmp2 = {
+            &(counters[i]),
+            server_address,
+            image_Name,
+            n_runs,
+            i,
+            closeSockets};
+        threadData[i] = tmp2;
+        if ( pthread_create(&(threads[i]), NULL, runBenchThread, &(threadData[i])) != 0 ) {
+            // Only join and report the threads that really exist
+            printf("Could not create thread #%d, continuing with %d thread(s)\n", i, started);
+            break;
+        }
+        started++;
+    }
+
+
+    /* join all threads*/
+    for (int i = 0; i < started; ++i) {
+        pthread_join(threads[i], NULL);
+    }
+
+    /* print out all counters & sum up */
+    BenchCounters total = {0,0,0};
+    for (int i = 0; i < started; ++i) {
+        printf("#### Thread %d\n", i);
+        printBenchCounters(&counters[i]);
+        total.attempts += counters[i].attempts;
+        total.success += counters[i].success;
+        total.fails += counters[i].fails;
+    }
+    /* print out summary */
+    printf("\n\n#### SUMMARY\n");
+    printBenchCounters(&total);
+    printf("\n-- End of program");
+    return 0;
+}
diff --git a/src/bench/serialize.c b/src/bench/serialize.c
new file mode 100644
index 0000000..4934132
--- /dev/null
+++ b/src/bench/serialize.c
@@ -0,0 +1,5 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../serialize.c"
diff --git a/src/client/client.c b/src/client/client.c
new file mode 100644
index 0000000..37f0558
--- /dev/null
+++ b/src/client/client.c
@@ -0,0 +1,670 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "../clientconfig.h"
+#include "../types.h"
+#include "../version.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+#include <errno.h>
+
+#define SOCK_PATH "/var/run/dnbd3.socket"
+#define SOCK_BUFFER 1000
+#define DEV_LEN 15
+#define MAX_DEVS 50
+
+
+static int openDevices[MAX_DEVS];
+static const char *optString = "f:h:i:r:d:a:cs:HV?k"; // short options; a trailing ':' means the option takes an argument
+static const struct option longOpts[] = {
+ { "file", required_argument, NULL, 'f' },
+ { "host", required_argument, NULL, 'h' },
+ { "image", required_argument, NULL, 'i' },
+ { "rid", required_argument, NULL, 'r' },
+ { "device", required_argument, NULL, 'd' },
+ { "ahead", required_argument, NULL, 'a' },
+ { "close", no_argument, NULL, 'c' },
+ { "switch", required_argument, NULL, 's' },
+ { "add", required_argument, NULL, 'adds' }, // multi-character constant: implementation-defined int value, used only as a unique id in the switch statements
+ { "remove", required_argument, NULL, 'rems' }, // multi-character constant, see "add"
+ { "help", no_argument, NULL, 'H' },
+ { "version", no_argument, NULL, 'V' },
+ { "daemon", no_argument, NULL, 'D' }, // not in optString: available as long option only
+ { "nofork", no_argument, NULL, 'N' }, // not in optString: available as long option only
+ { "kill", no_argument, NULL, 'k' },
+ { "user", required_argument, NULL, 'U' }, // Only used in daemon mode
+ { 0, 0, 0, 0 }
+};
+
+static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg);
+static void dnbd3_client_daemon();
+static void dnbd3_daemon_action(int client, int argc, char **argv);
+static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host);
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead);
+static int dnbd3_daemon_send(int argc, char **argv);
+static void dnbd3_print_help(char *argv_0);
+static void dnbd3_print_version();
+
+/**
+ * Convert a host and port (network byte order) to printable representation.
+ * Worst case required buffer len is 48, eg. [1234:1234:1234:1234:1234:1234:1234:1234]:12345 (+ \0)
+ * Returns true on success, false on error. On error target holds either an
+ * empty string (buffer too small for the address) or a "<?addrtype=N>"
+ * placeholder (unknown address type); it is left untouched if targetlen < 10.
+ */
+static char host_to_string(const dnbd3_host_t *host, char *target, size_t targetlen)
+{
+    // Worst case: Port 5 chars, ':' to separate ip and port 1 char, terminating null 1 char = 7, [] for IPv6
+    if ( targetlen < 10 ) return false;
+    if ( host->type == HOST_IP6 ) {
+        *target++ = '[';
+        if ( inet_ntop( AF_INET6, host->addr, target, targetlen - 10 ) == NULL ) {
+            // Buffer content is unspecified after a failed inet_ntop, so
+            // strlen() below would be unsafe. Hand back an empty string.
+            *--target = '\0';
+            return false;
+        }
+        target += strlen( target );
+        *target++ = ']';
+    } else if ( host->type == HOST_IP4 ) {
+        if ( inet_ntop( AF_INET, host->addr, target, targetlen - 8 ) == NULL ) {
+            *target = '\0';
+            return false;
+        }
+        target += strlen( target );
+    } else {
+        snprintf( target, targetlen, "<?addrtype=%d>", (int)host->type );
+        return false;
+    }
+    *target = '\0';
+    if ( host->port != 0 ) {
+        // There are still at least 7 bytes left in the buffer, port is at most 5 bytes + ':' + '\0' = 7
+        snprintf( target, 7, ":%d", (int)ntohs( host->port ) );
+    }
+    return true;
+}
+
+
+/**
+ * Parse IPv4 or IPv6 address in string representation to a suitable format usable by the BSD socket library
+ * @string eg. "1.2.3.4" or "2a01::10:5", optionally with port appended, eg "1.2.3.4:6666" or "[2a01::10:5]:6666"
+ * @host->type will contain either HOST_IP4 or HOST_IP6
+ * @host->addr will contain the address in network representation
+ * @host->port will contain the port in network representation, defaulting to #define PORT if none was given
+ * returns 1 on success, 0 on failure. contents of host are undefined in the latter case
+ * !! Contents of @string might be modified by this function !!
+ */
+static char parse_address(char *string, dnbd3_host_t *host)
+{
+ struct in_addr v4;
+ struct in6_addr v6;
+
+ // Try IPv4 without port
+ if ( 1 == inet_pton( AF_INET, string, &v4 ) ) {
+ host->type = HOST_IP4;
+ memcpy( host->addr, &v4, 4 );
+ host->port = htons( PORT );
+ return 1;
+ }
+ // Try IPv6 without port
+ if ( 1 == inet_pton( AF_INET6, string, &v6 ) ) {
+ host->type = HOST_IP6;
+ memcpy( host->addr, &v6, 16 );
+ host->port = htons( PORT );
+ return 1;
+ }
+
+ // Scan for port
+ char *portpos = NULL, *ptr = string;
+ while ( *ptr ) {
+ if ( *ptr == ':' ) // keeps the LAST ':' so that the colons inside an IPv6 address are skipped
+ portpos = ptr;
+ ++ptr;
+ }
+ if ( portpos == NULL ) return 0; // No port in string
+ // Consider IP being surrounded by [ ]
+ if ( *string == '[' && *(portpos - 1) == ']' ) { // portpos > string here because *string is '[' and not ':', so portpos - 1 is in bounds
+ ++string;
+ *(portpos - 1) = '\0';
+ }
+ *portpos++ = '\0'; // cuts the caller's string at the ':'; portpos now points at the port digits
+ int p = atoi( portpos ); // NOTE(review): atoi ignores trailing garbage, so "80x" is accepted as port 80
+ if ( p < 1 || p > 65535 )
+ return 0; // Invalid port
+ host->port = htons( (uint16_t)p ); // written before the address is validated, hence "undefined on failure"
+
+ // Try IPv4 with port
+ if ( 1 == inet_pton( AF_INET, string, &v4 ) ) {
+ host->type = HOST_IP4;
+ memcpy( host->addr, &v4, 4 );
+ return 1;
+ }
+ // Try IPv6 with port
+ if ( 1 == inet_pton( AF_INET6, string, &v6 ) ) {
+ host->type = HOST_IP6;
+ memcpy( host->addr, &v6, 16 );
+ return 1;
+ }
+
+ // FAIL
+ return 0;
+}
+
+/**
+ * Fill @host from @hostname, which is either a literal IPv4/IPv6 address
+ * (optionally with port) or a host name to be resolved.
+ * Resolved host names always get the default PORT.
+ * Returns true on success, false if the name is unknown or resolves to
+ * an unsupported address family. @hostname may be modified.
+ */
+static int dnbd3_get_ip(char *hostname, dnbd3_host_t *host)
+{
+    // Literal addresses are handled completely by parse_address()
+    if ( parse_address( hostname, host ) ) return true;
+
+    // TODO: Parse port too for host names
+    struct hostent *resolved = gethostbyname( hostname );
+    if ( resolved == NULL ) {
+        printf( "Unknown host '%s'\n", hostname );
+        return false;
+    }
+
+    switch ( resolved->h_addrtype ) {
+    case AF_INET:
+        host->type = HOST_IP4;
+        memcpy( host->addr, resolved->h_addr, 4);
+        break;
+    case AF_INET6:
+        host->type = HOST_IP6;
+        memcpy(host->addr, resolved->h_addr, 16);
+        break;
+    default:
+        printf("FATAL: Unknown address type: %d\n", resolved->h_addrtype);
+        return false;
+    }
+    host->port = htons( PORT );
+    return true;
+}
+
+/**
+ * Resolve a host given on the command line without touching the argument.
+ * dnbd3_get_ip() may cut the port off the string it is handed, and argv is
+ * handed to dnbd3_daemon_send() later on, so the lookup works on a copy.
+ * Returns true on success, false if the host could not be resolved.
+ */
+static int dnbd3_resolve_arg(const char *arg, dnbd3_host_t *host)
+{
+    char *copy = strdup( arg );
+    if ( copy == NULL ) return false;
+    const int ok = dnbd3_get_ip( copy, host );
+    free( copy );
+    return ok;
+}
+
+/**
+ * Entry point of the dnbd3 client.
+ * Parses the command line. If the daemon socket exists, the request is
+ * forwarded to the daemon. Otherwise the requested action (open, close,
+ * switch/add/remove server) is carried out directly via ioctl on the
+ * given device. Exits with EXIT_SUCCESS/0 on success, non-zero otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    char *dev = NULL;
+    char host[50];
+
+    int action = -1;
+
+    dnbd3_ioctl_t msg;
+    memset( &msg, 0, sizeof(dnbd3_ioctl_t) );
+    msg.len = (uint16_t)sizeof(dnbd3_ioctl_t);
+    msg.read_ahead_kb = DEFAULT_READ_AHEAD_KB;
+    msg.host.port = htons( PORT );
+    msg.host.type = 0;
+    msg.imgname = NULL;
+    msg.use_server_provided_alts = true;
+
+    int opt = 0;
+    int longIndex = 0;
+
+    opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
+
+    while ( opt != -1 ) {
+        switch ( opt ) {
+        case 'f':
+            break;
+        case 'h':
+            if ( !dnbd3_resolve_arg( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+            break;
+        case 'i':
+            action = IOCTL_OPEN;
+            msg.imgname = strdup( optarg );
+            break;
+        case 'r':
+            msg.rid = atoi( optarg );
+            break;
+        case 'd':
+            dev = strdup( optarg );
+            printf( "Device is %s\n", dev );
+            break;
+        case 'a':
+            msg.read_ahead_kb = atoi( optarg );
+            break;
+        case 'c':
+            action = IOCTL_CLOSE;
+            break;
+        // An unresolvable host is fatal for these three, just like for -h;
+        // otherwise the ioctl would be issued with a half-filled host.
+        case 's':
+            if ( !dnbd3_resolve_arg( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+            action = IOCTL_SWITCH;
+            break;
+        case 'adds':
+            if ( !dnbd3_resolve_arg( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+            action = IOCTL_ADD_SRV;
+            break;
+        case 'rems':
+            if ( !dnbd3_resolve_arg( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+            action = IOCTL_REM_SRV;
+            break;
+        case 'H':
+            dnbd3_print_help( argv[0] );
+            break;
+        case 'V':
+            dnbd3_print_version();
+            break;
+        case '?':
+            dnbd3_print_help( argv[0] );
+            break;
+        case 'D':
+            dnbd3_client_daemon();
+            break;
+        }
+        opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
+    }
+
+    // See if socket exists, if so, try to send to daemon
+    struct stat st;
+    if ( stat( SOCK_PATH, &st ) == 0 ) {
+        if ( dnbd3_daemon_send( argc, argv ) ) exit( 0 );
+        printf( "\nFailed.\n" );
+        exit( 1 );
+    }
+
+    // Direct requests
+
+    // In case the client was invoked as a suid binary, change uid back to original user
+    // when being used for direct ioctl, so that the device's permissions are taken into account
+    if ( geteuid() == 0 ) {
+        // Group first: after setuid() the process may no longer be allowed to change its gid.
+        // Never continue with elevated privileges if either call fails.
+        if ( setgid( getgid() ) != 0 || setuid( getuid() ) != 0 ) {
+            perror( "Could not drop privileges" );
+            exit( EXIT_FAILURE );
+        }
+    }
+
+    host_to_string( &msg.host, host, 50 );
+
+    // close device
+    if ( action == IOCTL_CLOSE && msg.host.type == 0 && dev && (msg.imgname == NULL )) {
+        printf( "INFO: Closing device %s\n", dev );
+        if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) ) exit( EXIT_SUCCESS );
+        printf( "Couldn't close device.\n" );
+        exit( EXIT_FAILURE );
+    }
+
+    // switch host
+    if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.host.type != 0 && dev && (msg.imgname == NULL )) {
+        if ( action == IOCTL_SWITCH ) printf( "INFO: Switching device %s to %s\n", dev, host );
+        if ( action == IOCTL_ADD_SRV ) printf( "INFO: %s: adding %s\n", dev, host );
+        if ( action == IOCTL_REM_SRV ) printf( "INFO: %s: removing %s\n", dev, host );
+        if ( dnbd3_ioctl( dev, action, &msg ) ) exit( EXIT_SUCCESS );
+        printf( "Failed! Maybe the device is not connected?\n" );
+        exit( EXIT_FAILURE );
+    }
+
+    // connect
+    if ( action == IOCTL_OPEN && msg.host.type != 0 && dev && (msg.imgname != NULL )) {
+        printf( "INFO: Connecting device %s to %s for image %s\n", dev, host, msg.imgname );
+        if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) exit( EXIT_SUCCESS );
+        printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
+        exit( EXIT_FAILURE );
+    }
+
+    dnbd3_print_help( argv[0] );
+    exit( EXIT_FAILURE );
+}
+
+/**
+ * Open device node @dev write-only, issue ioctl @command with @msg and
+ * close the device again. If @msg carries an image name, its imgnamelen
+ * field is filled in first.
+ * Returns true if the ioctl succeeded, false if open() or ioctl() failed.
+ */
+static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg)
+{
+    const int fd = open( dev, O_WRONLY );
+    if ( fd < 0 ) {
+        printf( "open() for %s failed.\n", dev );
+        return false;
+    }
+
+    if ( msg != NULL && msg->imgname != NULL ) {
+        msg->imgnamelen = (uint16_t)strlen( msg->imgname );
+    }
+
+    int succeeded = true;
+    if ( ioctl( fd, command, msg ) < 0 ) {
+        printf( "ioctl() failed.\n" );
+        succeeded = false;
+    }
+    close( fd );
+    return succeeded;
+}
+
+static void dnbd3_client_daemon() // Daemon main loop: listens on SOCK_PATH and handles one request per accepted connection; never returns to the caller
+{
+ int listener, client;
+ struct sockaddr_un addrLocal, addrRemote;
+ char buffer[SOCK_BUFFER];
+ struct timeval tv;
+ int done, ret, len;
+ socklen_t socklen;
+
+ if ( geteuid() != 0 ) {
+ printf( "Only root can run the dnbd3-client in daemon mode!\n" );
+ exit( 1 );
+ }
+
+ if ( (listener = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) {
+ perror( "socket" );
+ exit( 1 );
+ }
+
+ addrLocal.sun_family = AF_UNIX;
+ snprintf( addrLocal.sun_path, sizeof(addrLocal.sun_path), "%s", SOCK_PATH );
+ unlink( addrLocal.sun_path ); // remove any existing file at the socket path so that bind() can create it
+ if ( bind( listener, (struct sockaddr *)&addrLocal, sizeof(addrLocal) ) < 0 ) {
+ perror( "bind" );
+ exit( 1 );
+ }
+ chmod( addrLocal.sun_path, 0600 ); // socket file accessible by its owner only
+ if ( listen( listener, 5 ) == -1 ) {
+ perror( "listen" );
+ exit( 1 );
+ }
+
+ memset( openDevices, -1, sizeof(openDevices) ); // sets every byte to 0xFF, i.e. every int entry to -1, which dnbd3_daemon_ioctl treats as "device closed"
+
+ for (;;) {
+ socklen = sizeof(addrRemote);
+ if ( (client = accept( listener, (struct sockaddr *)&addrRemote, &socklen )) == -1 ) {
+ printf( "accept error %d\n", (int)errno);
+ sleep( 1 );
+ continue;
+ }
+
+ tv.tv_sec = 1; // one second timeout for both receiving and sending on this client socket
+ tv.tv_usec = 0;
+ setsockopt( client, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) );
+ setsockopt( client, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv) );
+
+ ret = recv( client, &len, sizeof(len), MSG_WAITALL ); // request format: an int length in host byte order, followed by that many payload bytes
+ if ( ret != sizeof(len) || len <= 0 || len + 4 > SOCK_BUFFER ) { // Leave a little room (at least one byte for the appended nullchar)
+ printf( "Error reading length field (ret: %d, len: %d)\n", ret, len );
+ close( client );
+ continue;
+ }
+ done = recv( client, buffer, len, MSG_WAITALL );
+
+ if ( done != len ) {
+ printf( "receiving payload from client failed (%d/%d)\n", done, len );
+ } else {
+ buffer[len] = '\0';
+ char *pos = buffer, *end = buffer + len; // payload is split at NUL bytes into separate arguments
+ int argc = 1;
+ char *argv[20] = { "dnbd3-client" }; // argv[0] is a fixed placeholder; at most 19 arguments are taken from the payload
+ while ( pos < end && argc < 20 ) {
+ while ( *pos == '\0' ) { // skip empty arguments / consecutive separators
+ if ( ++pos >= end ) break;
+ }
+ if ( pos >= end ) break;
+ argv[argc++] = pos;
+ printf("Arg %d: '%s'\n", argc, pos);
+ while ( *pos != '\0' ) { // This will always be in bounds because of -4 above
+ if ( ++pos >= end ) break;
+ }
+ }
+ dnbd3_daemon_action( client, argc, argv );
+ }
+
+ close( client ); // the connection is always closed here, whatever dnbd3_daemon_action did
+ }
+}
+
+/**
+ * Handle one request received by the daemon.
+ * @client connected client socket; the reply (if any) is sent on it.
+ *         The socket stays open on return, the caller closes it.
+ * @argc, @argv the client's command line as split by the daemon loop
+ *
+ * Replies: close/add/remove send an int 0 on success or -1 on failure;
+ * open sends the length of the device name followed by the name, or -1.
+ * A kill request from uid 0 removes the socket file and exits the process.
+ */
+static void dnbd3_daemon_action(int client, int argc, char **argv)
+{
+    int opt = 0;
+    int longIndex = 0;
+    char *host = NULL, *image = NULL, *device = NULL;
+    int rid = 0, uid = 0, killMe = false, ahead = 512;
+    int len;
+    int action = -1;
+    const char *actionName = NULL;
+
+    // getopt keeps global state; restart parsing for every request
+    optind = 1;
+    opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
+
+    while ( opt != -1 ) {
+        switch ( opt ) {
+        case 'd':
+            device = optarg;
+            break;
+        case 'h':
+            host = optarg;
+            break;
+        case 'i':
+            image = optarg;
+            action = IOCTL_OPEN;
+            actionName = "Open";
+            break;
+        case 'r':
+            rid = atoi( optarg );
+            break;
+        case 'U':
+            uid = atoi( optarg );
+            break;
+        case 'c':
+            action = IOCTL_CLOSE;
+            actionName = "Close";
+            break;
+        case 'adds':
+            action = IOCTL_ADD_SRV;
+            actionName = "Add Server";
+            break;
+        case 'rems':
+            action = IOCTL_REM_SRV;
+            actionName = "Remove Server";
+            break;
+        case 'a':
+            ahead = atoi( optarg );
+            break;
+        case 'k':
+            killMe = true;
+            break;
+        }
+        opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
+    }
+
+    if ( killMe ) {
+        if ( uid != 0 ) {
+            // The caller closes the client socket after we return; closing it
+            // here as well would close the same descriptor twice.
+            printf( "Ignoring kill request by user %d\n", uid );
+            return;
+        }
+        printf( "Received kill request; exiting.\n" );
+        close( client );
+        unlink( SOCK_PATH );
+        exit( 0 );
+    }
+
+    if ( (action == IOCTL_CLOSE || ((action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && host != NULL)) && device != NULL ) {
+        if ( dnbd3_daemon_ioctl( uid, device, action, actionName, host ) ) {
+            len = 0;
+        } else {
+            len = -1;
+        }
+        send( client, &len, sizeof(len), 0 );
+        return;
+    }
+    if ( action == IOCTL_OPEN && host != NULL && image != NULL && rid >= 0 ) {
+        device = dnbd3_daemon_open( uid, host, image, rid, ahead );
+        if ( device != NULL ) {
+            len = strlen( device );
+            send( client, &len, sizeof(len), 0 );
+            send( client, device, len, 0 );
+        } else {
+            len = -1;
+            send( client, &len, sizeof(len), 0 );
+        }
+        return;
+    }
+    printf( "Received a client request I cannot understand.\n" );
+}
+
+ /**
+  * Perform a close / add-server / remove-server ioctl on behalf of a user.
+  *
+  * @param uid user id of the requester; must match the owner recorded in openDevices
+  * @param device device given either as "/dev/dnbd<N>" or as a bare index "<N>"
+  * @param action ioctl request code handed to dnbd3_ioctl()
+  * @param actionName human readable name of the action, used in log output only
+  * @param host server address for add/remove requests, may be NULL
+  * @return true on success (also for a device that is not open), false otherwise
+  */
+ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host)
+ {
+     int index = -1;
+     char dev[DEV_LEN];
+     if ( strncmp( device, "/dev/dnbd", 9 ) == 0 ) {
+         index = atoi( device + 9 );
+     } else {
+         index = atoi( device );
+     }
+     if ( index < 0 || index >= MAX_DEVS ) {
+         printf( "%s request with invalid device id %d\n", actionName, index );
+         return false;
+     }
+     dnbd3_ioctl_t msg;
+     memset( &msg, 0, sizeof(msg) );
+     msg.len = (uint16_t)sizeof(msg);
+     // The host is only meaningful for add/remove; a close request must not
+     // fail just because an irrelevant host argument does not parse.
+     if ( host != NULL && !dnbd3_get_ip( host, &msg.host ) && action != IOCTL_CLOSE ) {
+         printf( "Cannot parse host address %s\n", host );
+         return false;
+     }
+     snprintf( dev, DEV_LEN, "/dev/dnbd%d", index );
+     if ( openDevices[index] == -1 ) {
+         printf( "%s request by %d for closed device %s\n", actionName, uid, dev );
+         return true;
+     }
+     if ( openDevices[index] != uid ) {
+         printf( "%s: User %d cannot access %s owned by %d\n", actionName, uid, dev, openDevices[index] );
+         return false;
+     }
+     if ( dnbd3_ioctl( dev, action, &msg ) ) {
+         printf( "%s request for device %s of user %d successful\n", actionName, dev, uid );
+         // Only a close releases the device; adding or removing a server must
+         // leave the ownership record intact.
+         if ( action == IOCTL_CLOSE ) {
+             openDevices[index] = -1;
+         }
+         return true;
+     }
+     printf( "%s: Error on device %s, requested by %d\n", actionName, dev, uid );
+     return false;
+ }
+
+ /**
+  * Open an image on the first free dnbd device on behalf of a user.
+  *
+  * A user that already owns more than one device is refused. Devices are tried
+  * in ascending order until the open ioctl succeeds or a device node is missing.
+  *
+  * @param uid user id that will be recorded as owner of the device
+  * @param host address of the server to connect to
+  * @param image name of the image to open
+  * @param rid revision id of the image
+  * @param readAhead read-ahead value passed to the ioctl as read_ahead_kb
+  * @return path of the opened device (static buffer, overwritten by the next
+  *         call), or NULL on failure
+  */
+ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead)
+ {
+     int i, sameUser = 0;
+     struct stat st;
+     static char dev[DEV_LEN];
+     printf( "Opening a device for %s on %s\n", image, host );
+     // Check number of open devices
+     for (i = 0; i < MAX_DEVS; ++i) {
+         if ( openDevices[i] == uid ) sameUser++;
+     }
+     if ( sameUser > 1 ) {
+         printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
+         return NULL ;
+     }
+     // Find free device
+     for (i = 0; i < MAX_DEVS; ++i) {
+         if ( openDevices[i] != -1 ) continue;
+         snprintf( dev, DEV_LEN, "/dev/dnbd%d", i );
+         if ( stat( dev, &st ) == -1 ) {
+             // Device node does not exist, no point in trying higher indexes
+             break;
+         }
+         // Open
+         dnbd3_ioctl_t msg;
+         // Zero the whole request so no uninitialized stack bytes are handed
+         // to the ioctl (same as dnbd3_daemon_ioctl does)
+         memset( &msg, 0, sizeof(msg) );
+         msg.len = (uint16_t)sizeof(msg);
+         if ( !dnbd3_get_ip( host, &msg.host ) ) {
+             printf( "Cannot parse host address %s\n", host );
+             return NULL ;
+         }
+         msg.imgname = image;
+         msg.imgnamelen = strlen( image );
+         msg.rid = rid;
+         msg.use_server_provided_alts = true;
+         msg.read_ahead_kb = readAhead;
+         if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) {
+             openDevices[i] = uid;
+             printf( "Device %s now occupied by %d\n", dev, uid );
+             return dev;
+         }
+         printf( "ioctl to open device %s failed, trying next...\n", dev );
+     }
+     // All devices in use
+     printf( "No more free devices. All %d are in use :-(\n", i );
+     return NULL ;
+ }
+
+ /**
+  * Forward the command line of this process to the helper daemon and print
+  * the daemon's reply.
+  *
+  * The request is "--user\0<uid>\0" followed by argv[1..], each argument
+  * terminated by a null char, prefixed on the wire by its length as an int.
+  * If the daemon answers with a length <= 0 the process exits with -length.
+  *
+  * @param argc number of entries in argv
+  * @param argv command line arguments; argv[0] is not transmitted
+  * @return true if a reply payload was received and printed, false on error
+  */
+ static int dnbd3_daemon_send(int argc, char **argv)
+ {
+     const int uid = getuid();
+     int s, i, len;
+     struct sockaddr_un remote;
+     char buffer[SOCK_BUFFER];
+
+     if ( (s = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) {
+         perror( "socket" );
+         return false;
+     }
+
+     memset( &remote, 0, sizeof(remote) );
+     remote.sun_family = AF_UNIX;
+     snprintf( remote.sun_path, sizeof(remote.sun_path), "%s", SOCK_PATH );
+     if ( connect( s, (struct sockaddr *)&remote, sizeof(remote) ) == -1 ) {
+         perror( "connect" );
+         close( s );
+         return false;
+     }
+     // (Re)build argument string into a single one, arguments separated by null chars
+     char *pos = buffer;
+     char * const end = buffer + SOCK_BUFFER;
+     bool tooLong = false;
+     // snprintf returns the length it *would* have written, so every result has
+     // to be checked against the remaining space before advancing pos.
+     int written = snprintf( pos, end - pos, "--user%c%d", (int)'\0', uid );
+     if ( written < 0 || written >= end - pos ) {
+         tooLong = true;
+     } else {
+         pos += written + 1;
+     }
+     for (i = 1; i < argc && !tooLong; ++i) {
+         if ( pos >= end ) {
+             tooLong = true;
+             break;
+         }
+         written = snprintf( pos, end - pos, "%s", argv[i] );
+         if ( written < 0 || written >= end - pos ) {
+             tooLong = true;
+         } else {
+             pos += written + 1;
+         }
+     }
+     len = (int)(pos - buffer);
+     // The daemon refuses requests with len + 4 > SOCK_BUFFER, so fail early
+     if ( tooLong || len + 4 > SOCK_BUFFER ) {
+         printf( "Arguments too long to be passed to the daemon\n" );
+         close( s );
+         return false;
+     }
+     // Send
+     if ( send( s, &len, sizeof(len), 0 ) != sizeof(len) || send( s, buffer, len, 0 ) != len ) {
+         perror( "Sending request to daemon failed" );
+         close( s );
+         return false;
+     }
+     // Read reply
+     if ( recv( s, &len, sizeof(len), MSG_WAITALL ) != sizeof(len) ) {
+         perror( "Reading length-field from daemon failed" );
+         close( s );
+         return false;
+     }
+     if ( len <= 0 ) {
+         // No payload: the negated length is the exit code
+         printf( "Daemon returned exit code %d\n", -len );
+         close( s );
+         exit( -len );
+     }
+     if ( len + 4 > SOCK_BUFFER ) {
+         printf( "Reply too long (is %d bytes)\n", len );
+         close( s );
+         return false;
+     }
+     if ( recv( s, buffer, len, MSG_WAITALL ) != len ) {
+         perror( "Reading reply payload from daemon failed" );
+         close( s );
+         return false;
+     }
+     buffer[len] = '\0';
+     printf( "%s", buffer );
+     close( s );
+     return true;
+ }
+
+ /**
+  * Write the version and the usage/option overview to stdout.
+  *
+  * @param argv_0 program name shown in the usage line
+  */
+ static void dnbd3_print_help(char *argv_0)
+ {
+     printf( "Version: %s\n\n", VERSION_STRING );
+     printf( "\nUsage: %s\n"
+             "\t-h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n", argv_0 );
+     fputs( "Start the DNBD3 client.\n", stdout );
+     //printf("-f or --file \t\t Configuration file (default /etc/dnbd3-client.conf)\n");
+     fputs( "-h or --host \t\t Host running dnbd3-server.\n", stdout );
+     fputs( "-i or --image \t\t Image name of exported image.\n", stdout );
+     fputs( "-r or --rid \t\t Release-ID of exported image (default 0, latest).\n", stdout );
+     fputs( "-d or --device \t\t DNBD3 device name.\n", stdout );
+     printf( "-a or --ahead \t\t Read ahead in KByte (default %i).\n", DEFAULT_READ_AHEAD_KB );
+     fputs( "-c or --close \t\t Disconnect and close device.\n", stdout );
+     fputs( "-s or --switch \t\t Switch dnbd3-server on device (DEBUG).\n", stdout );
+     fputs( "-H or --help \t\t Show this help text and quit.\n", stdout );
+     fputs( "-V or --version \t Show version and quit.\n\n", stdout );
+     fputs( "\t--daemon \t Run as helper daemon\n", stdout );
+     fputs( "\t--kill \t Kill running helper daemon\n", stdout );
+     fputs( "The helper daemon makes it possible for normal users to connect dnbd3 devices.\n", stdout );
+     fputs( "The client binary needs to be a setuid program for this to work!\n\n", stdout );
+ }
+
+ /**
+  * Write the version string to stdout and terminate the process with
+  * EXIT_SUCCESS. This function does not return.
+  */
+ void dnbd3_print_version()
+ {
+     fprintf( stdout, "Version: %s\n", VERSION_STRING );
+     exit( EXIT_SUCCESS );
+ }
diff --git a/src/clientconfig.h b/src/clientconfig.h
new file mode 100644
index 0000000..f35f673
--- /dev/null
+++ b/src/clientconfig.h
@@ -0,0 +1,36 @@
+ // Compile-time tunables for the dnbd3 client side.
+ // Included by src/fuse/connection.c, which uses the RTT, timer and keepalive values below.
+ #ifndef _CLIENTCONFIG_H_
+ #define _CLIENTCONFIG_H_
+
+ // Which is the minimum protocol version the client expects from the server
+ // (alt servers announcing a lower version are counted as failed when probed)
+ #define MIN_SUPPORTED_SERVER 2
+
+ // in seconds if not stated otherwise (MS = milliseconds)
+ #define SOCKET_TIMEOUT_CLIENT_DATA 2
+ #define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1
+
+ #define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
+ #define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
+ #define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
+ // Default length of the block requested from a server when probing its RTT.
+ // This must be a power of two:
+ #define RTT_BLOCK_SIZE 4096
+
+ // For this long after connecting, the shorter TIMER_INTERVAL_PROBE_STARTUP is used between probes
+ #define STARTUP_MODE_DURATION 30
+ // Interval of several repeating tasks (in seconds)
+ #define TIMER_INTERVAL_PROBE_STARTUP 4
+ #define TIMER_INTERVAL_PROBE_NORMAL 22
+ #define TIMER_INTERVAL_PROBE_PANIC 2
+ #define TIMER_INTERVAL_KEEPALIVE_PACKET 6
+
+ // Expect a keepalive response every X seconds
+ // (the fuse client multiplies this by 1000 and uses it as socket timeout)
+ #define SOCKET_KEEPALIVE_TIMEOUT 8
+
+ // Number of unsuccessful alt_server probes before read errors are reported to the block layer
+ // (ALL servers will be probed this many times)
+ // Set to 0 to disable
+ #define PROBE_COUNT_TIMEOUT 0
+
+ // ++ Kernel module ++
+ // Default read-ahead in KB, also shown in the client's help text
+ #define DEFAULT_READ_AHEAD_KB 512
+ #define NUMBER_DEVICES 8
+
+ #endif
diff --git a/src/config.h b/src/config.h
new file mode 100644
index 0000000..50336af
--- /dev/null
+++ b/src/config.h
@@ -0,0 +1,43 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+ // Settings shared by several dnbd3 components: network defaults,
+ // protocol version and block size.
+ #ifndef CONFIG_H_
+ #define CONFIG_H_
+
+ // +++++ Network +++++
+ // Default port
+ #define PORT 5003
+ // RPC port is always the one following the default port
+ #define RPC_PORT (PORT+1)
+
+ // No serialized payload allowed exceeding this many bytes (so actual data from client->server is not affected by this limit!)
+ #define MAX_PAYLOAD 1000
+
+ // Protocol version should be increased whenever new features/messages are added,
+ // so either the client or server can run in compatibility mode, or they can
+ // cancel the connection right away if the protocol has changed too much
+ #define PROTOCOL_VERSION 3
+ // 2017-10-16: Update to v3: Change header to support request hop-counting
+
+ #define NUMBER_SERVERS 8 // Number of alt servers per image/device
+
+ // +++++ Block Device +++++
+ // NOTE: expands to a uint64_t cast and this header includes nothing itself,
+ // so uint64_t has to be declared wherever the macro is used.
+ #define DNBD3_BLOCK_SIZE ((uint64_t)4096) // NEVER CHANGE THIS OR THE WORLD WILL END!
+
+ #endif /* CONFIG_H_ */
diff --git a/src/fuse/connection.c b/src/fuse/connection.c
new file mode 100644
index 0000000..fc9f05b
--- /dev/null
+++ b/src/fuse/connection.c
@@ -0,0 +1,927 @@
+#include "connection.h"
+#include "helper.h"
+#include "../clientconfig.h"
+#include "../shared/protocol.h"
+#include "../shared/fdsignal.h"
+#include "../shared/sockhelper.h"
+#include "../shared/log.h"
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <time.h>
+#include <inttypes.h>
+
+ /* Constants */
+ // Size of the text buffer used for host names/addresses in connection_init
+ static const size_t SHORTBUF = 100;
+ // Capacity of the altservers and newservers arrays
+ #define MAX_ALTS (16)
+ // Only the first MAX_ALTS_ACTIVE slots of altservers are probed while not in panic mode
+ #define MAX_ALTS_ACTIVE (5)
+ // Maximum number of addresses taken per host name passed to connection_init
+ #define MAX_HOSTS_PER_ADDRESS (2)
+ // If a server wasn't reachable this many times, we slowly start skipping it on measurements
+ static const int FAIL_BACKOFF_START_COUNT = 8;
+ // Number of RTT samples kept per alt server (size of alt_server_t.rtts)
+ #define RTT_COUNT (4)
+
+ /* Module variables */
+
+ // Init guard
+ // connectionInitDone: set by connection_init once a server accepted the image
+ // threadInitDone: set by connection_initThreads; both are guarded by mutexInit
+ static bool connectionInitDone = false;
+ static bool threadInitDone = false;
+ static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER;
+ // Cleared by connection_close; the worker threads loop while this is true
+ static bool keepRunning = true;
+ // Whether alt servers announced by the server get added (see addAltServers)
+ static bool learnNewServers;
+
+ // List of pending requests
+ // (lock is initialized in connection_initThreads)
+ static struct {
+ dnbd3_async_t *head;
+ dnbd3_async_t *tail;
+ pthread_spinlock_t lock;
+ } requests;
+
+ // Connection for the image
+ // Filled in connection_init from the server's select-image reply
+ static struct {
+ char *name;
+ uint16_t rid;
+ uint64_t size;
+ } image;
+
+ // State of the current server connection.
+ // sockFd is -1 while there is no usable connection; it is changed with
+ // sendMutex held. panicSignal is signalled when the connection is lost.
+ static struct {
+ int sockFd;
+ pthread_mutex_t sendMutex;
+ dnbd3_signal_t* panicSignal;
+ dnbd3_host_t currentServer;
+ ticks startupTime;
+ } connection;
+
+ // Known alt servers
+ typedef struct _alt_server {
+ dnbd3_host_t host; // host.type == 0 marks an unused slot
+ int consecutiveFails; // increased when a probe fails, reset to 0 on a successful probe
+ int rtt; // value derived from rtts (and liveRtt if non-zero) after a probe
+ int rtts[RTT_COUNT]; // most recent probe results, RTT_UNREACHABLE for failed probes
+ int rttIndex; // index into rtts used for the latest probe result
+ int bestCount; // raised while this server is the best one in a probe round, lowered otherwise
+ int liveRtt; // moving average over round trip times of real block requests (µs)
+ } alt_server_t;
+
+ // Alt servers received via CMD_GET_SERVERS, waiting to be merged by addAltServers
+ static dnbd3_server_entry_t newservers[MAX_ALTS];
+ // Guards newservers
+ static pthread_mutex_t newAltLock = PTHREAD_MUTEX_INITIALIZER;
+ static alt_server_t altservers[MAX_ALTS];
+ // WR: Use when re-assigning or sorting altservers, i.e. an index in altservers
+ // changes its meaning (host). Also used for newservers.
+ // RD: Use when reading the list or modifying individual entries data, like RTT
+ // and fail count. Isn't super clean as we still might have races here, but mostly
+ // the code is clean in this regard, so we should only have stale data somewhere
+ // but nothing nonsensical.
+ static pthread_rwlock_t altLock = PTHREAD_RWLOCK_INITIALIZER;
+ #define lock_read pthread_rwlock_rdlock
+ #define lock_write pthread_rwlock_wrlock
+ #define unlock_rw pthread_rwlock_unlock
+
+ /* Static methods */
+
+
+ // Thread entry points, started by connection_initThreads
+ static void* connection_receiveThreadMain(void *sock);
+ static void* connection_backgroundThread(void *something);
+
+ // Alt server maintenance, called from the background thread
+ static void addAltServers();
+ static void sortAltServers();
+ static void probeAltServers();
+ static void switchConnection(int sockFd, alt_server_t *srv);
+ static void requestAltServers();
+ // Reads and discards the given number of payload bytes from a socket
+ static bool throwDataAway(int sockFd, uint32_t amount);
+
+ // Helpers for the list of pending requests
+ static void enqueueRequest(dnbd3_async_t *request);
+ static dnbd3_async_t* removeRequest(dnbd3_async_t *request);
+
+ /**
+  * Resolve the given hosts, connect to the first one that serves the requested
+  * image and remember it as the current server.
+  *
+  * Does nothing (and returns false) if a connection was already initialized or
+  * if connection_close() has been called.
+  *
+  * @param hosts space separated list of server host names/addresses
+  * @param lowerImage name of the image to request
+  * @param rid revision to request; 0 accepts whatever revision the server returns
+  * @param doLearnNew whether alt servers announced by the server should be added later on
+  * @return true if a connection was established by this call
+  */
+ bool connection_init(const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew)
+ {
+     int sock = -1;
+     char host[SHORTBUF];
+     size_t hlen;
+     serialized_buffer_t buffer;
+     uint16_t remoteVersion, remoteRid;
+     char *remoteName;
+     uint64_t remoteSize;
+     struct sockaddr_storage sa;
+     socklen_t salen;
+     poll_list_t *cons = sock_newPollList();
+
+     timing_setBase();
+     pthread_mutex_lock( &mutexInit );
+     if ( !connectionInitDone && keepRunning ) {
+         dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+         const char *current, *end;
+         int altIndex = 0;
+         learnNewServers = doLearnNew;
+         memset( altservers, 0, sizeof altservers );
+         connection.sockFd = -1;
+         current = hosts;
+         do {
+             // Get next host from string
+             while ( *current == ' ' ) current++;
+             end = strchr( current, ' ' );
+             // snprintf copies at most len - 1 chars, hence the + 1
+             size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
+             if ( len > SHORTBUF ) len = SHORTBUF;
+             snprintf( host, len, "%s", current );
+             int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+             for ( int i = 0; i < newHosts; ++i ) {
+                 if ( altIndex >= MAX_ALTS )
+                     break;
+                 altservers[altIndex].host = tempHosts[i];
+                 altIndex += 1;
+             }
+             // Only advance if there is another host; arithmetic on a NULL
+             // pointer is undefined
+             if ( end != NULL ) {
+                 current = end + 1;
+             }
+         } while ( end != NULL && altIndex < MAX_ALTS );
+         logadd( LOG_INFO, "Got %d servers from init call", altIndex );
+         // Connect
+         for ( int i = 0; i < altIndex + 5; ++i ) {
+             if ( i >= altIndex ) {
+                 // Additional iteration - no corresponding slot in altservers, this
+                 // is just so we can make a final calls with longer timeout
+                 sock = sock_multiConnect( cons, NULL, 400, 1000 );
+                 if ( sock == -2 ) {
+                     logadd( LOG_ERROR, "Could not connect to any host" );
+                     sock = -1;
+                     break;
+                 }
+             } else {
+                 if ( altservers[i].host.type == 0 )
+                     continue;
+                 // Try to connect - 100ms timeout
+                 sock = sock_multiConnect( cons, &altservers[i].host, 100, 1000 );
+             }
+             if ( sock == -2 || sock == -1 )
+                 continue;
+             salen = sizeof(sa);
+             if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
+                 logadd( LOG_ERROR, "getpeername on successful connection failed!? (errno=%d)", errno );
+                 close( sock );
+                 sock = -1;
+                 continue;
+             }
+             hlen = sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
+             logadd( LOG_INFO, "Connected to %.*s", (int)hlen, host );
+             if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
+                 logadd( LOG_ERROR, "Could not send select image" );
+             } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+                 logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
+             } else if ( rid != 0 && rid != remoteRid ) {
+                 logadd( LOG_ERROR, "rid mismatch (want: %d, got: %d)", (int)rid, (int)remoteRid );
+             } else {
+                 logadd( LOG_INFO, "Requested: '%s:%d'", lowerImage, (int)rid );
+                 logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
+                 // image.name is compared with strcmp() when probing servers
+                 // later, so it must never end up being NULL
+                 char *nameCopy = strdup( remoteName );
+                 if ( nameCopy == NULL ) {
+                     logadd( LOG_ERROR, "Out of memory while storing image name" );
+                 } else {
+                     sock_setTimeout( sock, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
+                     image.name = nameCopy;
+                     image.rid = remoteRid;
+                     image.size = remoteSize;
+                     if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &connection.currentServer ) ) {
+                         logadd( LOG_ERROR, "sockaddr to dnbd3_host_t failed!?" );
+                         connection.currentServer.type = 0;
+                     }
+                     connection.panicSignal = signal_new();
+                     timing_get( &connection.startupTime );
+                     connection.sockFd = sock;
+                     requests.head = NULL;
+                     requests.tail = NULL;
+                     requestAltServers();
+                     break;
+                 }
+             }
+             // Failed
+             if ( sock != -1 ) {
+                 close( sock );
+                 sock = -1;
+             }
+         }
+         if ( sock != -1 ) {
+             connectionInitDone = true;
+         }
+     }
+     pthread_mutex_unlock( &mutexInit );
+     sock_destroyPollList( cons );
+     return sock != -1;
+ }
+
+ /**
+  * Initialize the send mutex and request list lock and start the receive and
+  * background threads. May only succeed once, after connection_init() has
+  * established a connection.
+  *
+  * @return true if both threads were started, false otherwise
+  */
+ bool connection_initThreads()
+ {
+     pthread_mutex_lock( &mutexInit );
+     if ( !keepRunning || !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
+         pthread_mutex_unlock( &mutexInit );
+         return false;
+     }
+     bool success = true;
+     bool receiverRunning = false;
+     pthread_t thread;
+     threadInitDone = true;
+     logadd( LOG_DEBUG1, "Initializing stuff" );
+     if ( pthread_mutex_init( &connection.sendMutex, NULL ) != 0
+             || pthread_spin_init( &requests.lock, PTHREAD_PROCESS_PRIVATE ) != 0 ) {
+         logadd( LOG_ERROR, "Mutex or spinlock init failure" );
+         success = false;
+     } else if ( pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)connection.sockFd ) != 0 ) {
+         logadd( LOG_ERROR, "Could not create receive thread" );
+         success = false;
+     } else {
+         receiverRunning = true;
+         if ( pthread_create( &thread, NULL, &connection_backgroundThread, NULL ) != 0 ) {
+             logadd( LOG_ERROR, "Could not create background thread" );
+             success = false;
+         }
+     }
+     if ( !success ) {
+         if ( receiverRunning ) {
+             // The receive thread is already using the socket and closes it
+             // itself once reading fails. Closing it here as well would close
+             // the descriptor twice, so just shut it down to wake the thread.
+             pthread_mutex_lock( &connection.sendMutex );
+             if ( connection.sockFd != -1 ) {
+                 shutdown( connection.sockFd, SHUT_RDWR );
+                 connection.sockFd = -1;
+             }
+             pthread_mutex_unlock( &connection.sendMutex );
+         } else {
+             close( connection.sockFd );
+             connection.sockFd = -1;
+         }
+     }
+     pthread_mutex_unlock( &mutexInit );
+     return success;
+ }
+
+ /**
+  * Report the size of the image this module is connected to.
+  *
+  * @return the size stored by connection_init() from the server's select-image reply
+  */
+ uint64_t connection_getImageSize()
+ {
+     const uint64_t size = image.size;
+     return size;
+ }
+
+ /**
+  * Queue a block request and, if a connection exists, send it to the server.
+  *
+  * If sending fails the socket is shut down, the connection is marked as gone
+  * and the panic signal is raised; the request stays queued in that case.
+  *
+  * @param request request to enqueue; its address is used as the request handle
+  * @return false if the connection was never initialized, true otherwise
+  */
+ bool connection_read(dnbd3_async_t *request)
+ {
+     if ( !connectionInitDone ) return false;
+     bool sendFailed = false;
+     pthread_mutex_lock( &connection.sendMutex );
+     enqueueRequest( request );
+     if ( connection.sockFd != -1
+             && !dnbd3_get_block( connection.sockFd, request->offset, request->length, (uint64_t)request, 0 ) ) {
+         shutdown( connection.sockFd, SHUT_RDWR );
+         connection.sockFd = -1;
+         sendFailed = true;
+     }
+     pthread_mutex_unlock( &connection.sendMutex );
+     if ( sendFailed ) {
+         // Raised only after the mutex has been released
+         signal_call( connection.panicSignal );
+     }
+     return true;
+ }
+
+ /**
+  * Ask the worker threads to stop and shut down the server socket, if any.
+  * The socket is only shut down here, not closed.
+  */
+ void connection_close()
+ {
+     if ( keepRunning ) {
+         logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
+     }
+     pthread_mutex_lock( &mutexInit );
+     keepRunning = false;
+     const bool wasInitialized = connectionInitDone;
+     pthread_mutex_unlock( &mutexInit );
+     if ( !wasInitialized )
+         return;
+     pthread_mutex_lock( &connection.sendMutex );
+     const int fd = connection.sockFd;
+     if ( fd != -1 ) {
+         shutdown( fd, SHUT_RDWR );
+     }
+     pthread_mutex_unlock( &connection.sendMutex );
+ }
+
+ /**
+  * Write a textual overview of the image, the connection time and all known
+  * alt servers into the given buffer.
+  *
+  * Each server line starts with a marker: '*' for the current server, '-' for
+  * servers outside the active range, ' ' otherwise.
+  *
+  * @param buffer destination buffer
+  * @param len size of the destination buffer in bytes
+  * @return number of bytes of the buffer that were used; len if the output was truncated
+  */
+ size_t connection_printStats(char *buffer, const size_t len)
+ {
+     int ret;
+     size_t remaining = len;
+     declare_now;
+     if ( remaining > 0 ) {
+         ret = snprintf( buffer, remaining, "Image: %s\nRevision: %d\n\nCurrent connection time: %" PRIu32 "s\n\n",
+                 image.name, (int)image.rid, timing_diff( &connection.startupTime, &now ) );
+         if ( ret < 0 ) {
+             ret = 0;
+         }
+         if ( (size_t)ret >= remaining ) {
+             return len;
+         }
+         remaining -= ret;
+         buffer += ret;
+     }
+     int i = -1;
+     lock_read( &altLock );
+     while ( remaining > 3 && ++i < MAX_ALTS ) {
+         if ( altservers[i].host.type == 0 )
+             continue;
+         if ( isSameAddressPort( &connection.currentServer, &altservers[i].host ) ) {
+             *buffer++ = '*';
+         } else if ( i >= MAX_ALTS_ACTIVE ) {
+             *buffer++ = '-';
+         } else {
+             *buffer++ = ' ';
+         }
+         // Account for the marker before handing the remaining space on, so the
+         // callee is never told about one more byte than is actually left
+         remaining -= 1;
+         const size_t addrlen = sock_printHost( &altservers[i].host, buffer, remaining );
+         if ( addrlen >= remaining ) {
+             // No room left; bail out instead of letting remaining wrap around
+             remaining = 0;
+             break;
+         }
+         remaining -= addrlen;
+         buffer += addrlen;
+         if ( remaining < 3 )
+             break;
+         // Pad so that the RTT column lines up after the address
+         int width = addrlen >= 35 ? 0 : 35 - (int)addrlen;
+         const char *unit;
+         int value;
+         if ( altservers[i].rtt > 5000 ) {
+             unit = "ms ";
+             value = altservers[i].rtt / 1000;
+         } else {
+             unit = "µs";
+             value = altservers[i].rtt;
+             width += 3;
+         }
+         ret = snprintf( buffer, remaining, "% *d %s Unreachable:% 5d BestCount:% 5d Live:% 5dµs\n",
+                 width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
+         if ( ret < 0 ) {
+             ret = 0;
+         }
+         if ( (size_t)ret >= remaining ) {
+             remaining = 0;
+             break;
+         }
+         remaining -= ret;
+         buffer += ret;
+     }
+     unlock_rw( &altLock );
+     return len - remaining;
+ }
+
+ /**
+  * Thread main of the receiver: reads replies from one server socket until
+  * an error occurs or keepRunning is cleared, and dispatches them by command.
+  * On exit the socket is closed; if it still is the current connection,
+  * connection.sockFd is set to -1 and the panic signal is raised.
+  *
+  * @param sockPtr the socket fd, cast to a pointer by the creator of the thread
+  * @return always NULL
+  */
+ static void* connection_receiveThreadMain(void *sockPtr)
+ {
+ int sockFd = (int)(size_t)sockPtr;
+ dnbd3_reply_t reply;
+ // Nobody joins this thread
+ pthread_detach( pthread_self() );
+
+ while ( keepRunning ) {
+ int ret;
+ // Retry as long as the read reports an interruption or asks to try again
+ do {
+ ret = dnbd3_read_reply( sockFd, &reply, true );
+ if ( ret == REPLY_OK ) break;
+ } while ( ret == REPLY_INTR || ret == REPLY_AGAIN );
+ if ( ret != REPLY_OK ) {
+ logadd( LOG_DEBUG1, "Error receiving reply on receiveThread (%d)", ret );
+ goto fail;
+ }
+
+ if ( reply.cmd == CMD_GET_BLOCK ) {
+ // Get block reply. find matching request
+ // (the handle is the address of the request, see connection_read)
+ dnbd3_async_t *request = removeRequest( (dnbd3_async_t*)reply.handle );
+ if ( request == NULL ) {
+ // This happens if the alt server probing thread tears down our connection
+ // and did a direct RTT probe to satisfy this very request.
+ logadd( LOG_DEBUG1, "Got block reply with no matching request" );
+ if ( reply.size != 0 && !throwDataAway( sockFd, reply.size ) ) {
+ logadd( LOG_DEBUG1, "....and choked on reply payload" );
+ goto fail;
+ }
+ } else {
+ // Found a match
+ const ssize_t ret = sock_recv( sockFd, request->buffer, request->length );
+ if ( ret != (ssize_t)request->length ) {
+ logadd( LOG_DEBUG1, "receiving payload for a block reply failed" );
+ // Put the request back into the queue before giving up on this socket
+ connection_read( request );
+ goto fail;
+ }
+ // Check RTT
+ declare_now;
+ uint64_t diff = timing_diffUs( &request->time, &now );
+ if ( diff < 30ull * 1000 * 1000 ) { // Sanity check - ignore if > 30s
+ lock_read( &altLock );
+ for ( int i = 0; i < MAX_ALTS; ++i ) {
+ if ( altservers[i].host.type == 0 )
+ continue;
+ if ( isSameAddressPort( &connection.currentServer, &altservers[i].host ) ) {
+ // Moving average: three parts old value, one part new measurement
+ altservers[i].liveRtt = ( altservers[i].liveRtt * 3 + (int)diff ) / 4;
+ break;
+ }
+ }
+ unlock_rw( &altLock );
+ }
+ // Success, wake up caller
+ request->success = true;
+ request->finished = true;
+ signal_call( request->signal );
+ }
+ } else if ( reply.cmd == CMD_GET_SERVERS ) {
+ // List of known alt servers
+ dnbd3_server_entry_t entries[MAX_ALTS];
+ // Keep at most MAX_ALTS entries; anything beyond that is read and discarded
+ const int count = MIN( reply.size / sizeof(dnbd3_server_entry_t), MAX_ALTS );
+ const size_t relevantSize = sizeof(dnbd3_server_entry_t) * count;
+ if ( sock_recv( sockFd, entries, relevantSize ) != (ssize_t)relevantSize
+ || !throwDataAway( sockFd, reply.size - (uint32_t)relevantSize ) ) {
+ logadd( LOG_DEBUG1, "Error receiving list of alt servers." );
+ goto fail;
+ }
+ // Hand the list over to the background thread (consumed by addAltServers)
+ pthread_mutex_lock( &newAltLock );
+ memcpy( newservers, entries, relevantSize );
+ pthread_mutex_unlock( &newAltLock );
+ } else {
+ // TODO: Handle the others?
+ if ( reply.size != 0 && !throwDataAway( sockFd, reply.size ) ) {
+ logadd( LOG_DEBUG1, "Could not throw %d bytes away on CMD %d", (int)reply.size, (int)reply.cmd );
+ goto fail;
+ }
+ }
+ }
+ // Only reached when keepRunning was cleared, i.e. the loop ended without an error
+ logadd( LOG_DEBUG1, "Aus der Schleife rausgeflogen! ARRRRRRRRRR" );
+ fail:;
+ // Make sure noone is trying to use the socket for sending by locking,
+ pthread_mutex_lock( &connection.sendMutex );
+ // then just set the fd to -1, but only if it's the same fd as ours,
+ // as someone could have established a new connection already
+ if ( connection.sockFd == sockFd ) {
+ connection.sockFd = -1;
+ signal_call( connection.panicSignal );
+ }
+ pthread_mutex_unlock( &connection.sendMutex );
+ // As we're the only reader, it's safe to close the socket now
+ close( sockFd );
+ return NULL;
+ }
+
+ /**
+  * Thread main of the background worker: periodically probes the alt servers
+  * and sends keepalive packets until keepRunning is cleared. The wait between
+  * rounds is cut short when the panic signal is raised.
+  *
+  * @param something unused
+  * @return always NULL
+  */
+ static void* connection_backgroundThread(void *something UNUSED)
+ {
+ ticks nextKeepalive;
+ ticks nextRttCheck;
+
+ // Both tasks are due right away on the first iteration
+ timing_get( &nextKeepalive );
+ nextRttCheck = nextKeepalive;
+ while ( keepRunning ) {
+ ticks now;
+ timing_get( &now );
+ // Milliseconds from now until each deadline, as reported by timing_diffMs
+ uint32_t wt1 = timing_diffMs( &now, &nextKeepalive );
+ uint32_t wt2 = timing_diffMs( &now, &nextRttCheck );
+ if ( wt1 > 0 && wt2 > 0 ) {
+ // Nothing due yet: sleep until the earlier deadline or until the panic signal fires
+ int waitRes = signal_wait( connection.panicSignal, (int)MIN( wt1, wt2 ) + 1 );
+ if ( waitRes == SIGNAL_ERROR ) {
+ logadd( LOG_WARNING, "Error waiting on signal in background thread! Errno = %d", errno );
+ }
+ timing_get( &now );
+ }
+ // Woken up, see what we have to do
+ // (no socket means the connection to the current server is gone)
+ const bool panic = connection.sockFd == -1;
+ // Check alt servers
+ if ( panic || timing_reachedPrecise( &nextRttCheck, &now ) ) {
+ if ( learnNewServers ) {
+ addAltServers();
+ }
+ sortAltServers();
+ probeAltServers();
+ // Probe more often while in panic mode or shortly after startup
+ if ( panic || timing_diff( &connection.startupTime, &now ) <= STARTUP_MODE_DURATION ) {
+ timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_STARTUP );
+ } else {
+ timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_NORMAL );
+ }
+ }
+ // Send keepalive packet
+ if ( timing_reachedPrecise( &nextKeepalive, &now ) ) {
+ pthread_mutex_lock( &connection.sendMutex );
+ if ( connection.sockFd != -1 ) {
+ dnbd3_request_t request;
+ request.magic = dnbd3_packet_magic;
+ request.cmd = CMD_KEEPALIVE;
+ request.handle = request.offset = request.size = 0;
+ fixup_request( request );
+ ssize_t ret = sock_sendAll( connection.sockFd, &request, sizeof request, 2 );
+ if ( (size_t)ret != sizeof request ) {
+ // Sending failed: drop the connection and make the alt server check due immediately
+ shutdown( connection.sockFd, SHUT_RDWR );
+ connection.sockFd = -1;
+ nextRttCheck = now;
+ }
+ }
+ pthread_mutex_unlock( &connection.sendMutex );
+ timing_addSeconds( &nextKeepalive, &now, TIMER_INTERVAL_KEEPALIVE_PACKET );
+ }
+ }
+ return NULL;
+ }
+
+// Private quick helpers
+
+ /**
+  * Merge the servers received via CMD_GET_SERVERS (newservers) into the
+  * list of known alt servers, then clear newservers.
+  *
+  * A new server goes into a free slot if there is one. Otherwise it replaces
+  * the known server with the most consecutive failures, provided that server
+  * failed more than FAIL_BACKOFF_START_COUNT times and is not the server we
+  * are currently connected to. If no such slot exists the new server is dropped.
+  */
+ static void addAltServers()
+ {
+     pthread_mutex_lock( &newAltLock );
+     lock_write( &altLock );
+     for ( int nIdx = 0; nIdx < MAX_ALTS; ++nIdx ) {
+         if ( newservers[nIdx].host.type == 0 )
+             continue;
+         // Got a new alt server, see if it's already known
+         for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
+             if ( isSameAddress( &newservers[nIdx].host, &altservers[eIdx].host ) ) {
+                 goto skip_server;
+             }
+         }
+         // Not known yet, add - find free slot
+         int slot = -1;
+         for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
+             if ( altservers[eIdx].host.type == 0 ) {
+                 slot = eIdx; // free - bail out and use this one
+                 break;
+             }
+             // Replace an existing alt-server that failed recently if we got no more slots.
+             // slot == -1 means no candidate yet; without accepting that case the
+             // first candidate could never be picked and nothing was ever replaced.
+             if ( altservers[eIdx].consecutiveFails > FAIL_BACKOFF_START_COUNT
+                     && ( slot == -1 || altservers[slot].consecutiveFails < altservers[eIdx].consecutiveFails )
+                     && !isSameAddressPort( &altservers[eIdx].host, &connection.currentServer ) ) {
+                 slot = eIdx;
+             }
+         }
+         if ( slot != -1 ) {
+             char txt[200];
+             sock_printHost( &newservers[nIdx].host, txt, 200 );
+             logadd( LOG_DEBUG1, "new server %s in slot %d", txt, slot );
+             altservers[slot].consecutiveFails = 0;
+             altservers[slot].bestCount = 0;
+             altservers[slot].rtts[0] = RTT_UNREACHABLE;
+             altservers[slot].rttIndex = 1;
+             altservers[slot].host = newservers[nIdx].host;
+             altservers[slot].liveRtt = 0;
+         }
+ skip_server:;
+     }
+     memset( newservers, 0, sizeof(newservers) );
+     unlock_rw( &altLock );
+     pthread_mutex_unlock( &newAltLock );
+ }
+
+/**
+ * Find a server at index >= MAX_ALTS_ACTIVE (one that isn't considered for switching over)
+ * that has been inactive for a while, then look if there's an active server that's failed
+ * a couple of times recently. Swap both if found.
+ */
+static void sortAltServers()
+{
+ int ac = 0;
+ lock_write( &altLock );
+ for ( int ia = MAX_ALTS_ACTIVE; ia < MAX_ALTS; ++ia ) {
+ alt_server_t * const inactive = &altservers[ia];
+ if ( inactive->host.type == 0 || inactive->consecutiveFails > 0 )
+ continue;
+ while ( ac < MAX_ALTS_ACTIVE ) {
+ if ( altservers[ac].host.type == 0 || altservers[ac].consecutiveFails > FAIL_BACKOFF_START_COUNT )
+ break;
+ ac++;
+ }
+ if ( ac == MAX_ALTS_ACTIVE )
+ break;
+ // Switch!
+ alt_server_t * const active = &altservers[ac];
+ dnbd3_host_t tmp = inactive->host;
+ inactive->host = active->host;
+ inactive->consecutiveFails = FAIL_BACKOFF_START_COUNT * 4;
+ inactive->bestCount = 0;
+ inactive->rtts[0] = RTT_UNREACHABLE;
+ inactive->rttIndex = 1;
+ inactive->liveRtt = 0;
+ active->host = tmp;
+ active->consecutiveFails = 0;
+ active->bestCount = 0;
+ active->rtts[0] = RTT_UNREACHABLE;
+ active->rttIndex = 1;
+ active->liveRtt = 0;
+ }
+ unlock_rw( &altLock );
+}
+
+static void probeAltServers()
+{
+ serialized_buffer_t buffer;
+ dnbd3_reply_t reply;
+ int bestSock = -1;
+ uint16_t remoteRid, remoteProto;
+ uint64_t remoteSize;
+ char *remoteName;
+ bool doSwitch;
+ bool panic = connection.sockFd == -1;
+ uint64_t testOffset = 0;
+ uint32_t testLength = RTT_BLOCK_SIZE;
+ dnbd3_async_t *request = NULL;
+ alt_server_t *current = NULL, *best = NULL;
+
+ if ( !panic ) {
+ lock_read( &altLock );
+ for ( int altIndex = 0; altIndex < MAX_ALTS; ++altIndex ) {
+ if ( altservers[altIndex].host.type != 0
+ && isSameAddressPort( &altservers[altIndex].host, &connection.currentServer ) ) {
+ current = &altservers[altIndex];
+ break;
+ }
+ }
+ unlock_rw( &altLock );
+ }
+ declare_now;
+ pthread_spin_lock( &requests.lock );
+ if ( requests.head != NULL ) {
+ if ( !panic && current != NULL ) {
+ const int maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
+ dnbd3_async_t *iterator;
+ for ( iterator = requests.head; iterator != NULL; iterator = iterator->next ) {
+ // A request with measurement tag is pending
+ if ( timing_diffUs( &iterator->time, &now ) > maxDelay ) {
+ panic = true;
+ break;
+ }
+ }
+ }
+ if ( panic ) {
+ request = requests.head;
+ testOffset = requests.head->offset;
+ testLength = requests.head->length;
+ }
+ }
+ pthread_spin_unlock( &requests.lock );
+ if ( testOffset != 0 ) {
+ logadd( LOG_DEBUG1, "Panic with pending %" PRIu64 ":%" PRIu32, testOffset, testLength );
+ }
+
+ lock_read( &altLock );
+ for ( int altIndex = 0; altIndex < (panic ? MAX_ALTS : MAX_ALTS_ACTIVE); ++altIndex ) {
+ alt_server_t * const srv = &altservers[altIndex];
+ if ( srv->host.type == 0 )
+ continue;
+ if ( !panic && srv->consecutiveFails > FAIL_BACKOFF_START_COUNT
+ && rand() % srv->consecutiveFails >= FAIL_BACKOFF_START_COUNT ) {
+ continue;
+ }
+ if ( srv->rttIndex >= RTT_COUNT ) {
+ srv->rttIndex = 0;
+ } else {
+ srv->rttIndex += 1;
+ }
+ // Probe
+ ticks start;
+ timing_get( &start );
+ errno = 0;
+ int sock = sock_connect( &srv->host, panic ? 1000 : 333, 1000 );
+ if ( sock == -1 ) {
+ logadd( LOG_DEBUG1, "Could not connect for probing. errno = %d", errno );
+ goto fail;
+ }
+ if ( !dnbd3_select_image( sock, image.name, image.rid, 0 ) ) {
+ logadd( LOG_DEBUG1, "probe: select_image failed" );
+ goto fail;
+ }
+ if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize )) {
+ logadd( LOG_DEBUG1, "probe: select image reply failed" );
+ goto fail;
+ }
+ if ( remoteProto < MIN_SUPPORTED_SERVER ) {
+ logadd( LOG_WARNING, "Unsupported remote version (local: %d, remote: %d)", (int)PROTOCOL_VERSION, (int)remoteProto );
+ srv->consecutiveFails += 10;
+ goto fail;
+ }
+ if ( remoteRid != image.rid || strcmp( remoteName, image.name ) != 0 ) {
+ logadd( LOG_WARNING, "Remote rid or name mismatch (got '%s')", remoteName );
+ srv->consecutiveFails += 10;
+ goto fail;
+ }
+ if ( !dnbd3_get_block( sock, testOffset, testLength, 0, 0 ) ) {
+ logadd( LOG_DEBUG1, "-> block request fail" );
+ goto fail;
+ }
+ int a = 111;
+ if ( !(a = dnbd3_get_reply( sock, &reply )) || reply.size != testLength ) {
+ logadd( LOG_DEBUG1, "<- get block reply fail %d %d", a, (int)reply.size );
+ goto fail;
+ }
+ if ( request != NULL && removeRequest( request ) != NULL ) {
+ // Request successfully removed from queue
+ const ssize_t ret = sock_recv( sock, request->buffer, request->length );
+ if ( ret != (ssize_t)request->length ) {
+ logadd( LOG_DEBUG1, "[RTT] receiving payload for a block reply failed" );
+ // Failure, add to queue again
+ connection_read( request );
+ goto fail;
+ }
+ // Success, wake up caller
+ logadd( LOG_DEBUG1, "[RTT] Successful direct probe" );
+ request->success = true;
+ request->finished = true;
+ signal_call( request->signal );
+ } else {
+ // Wasn't a request that's in our request queue
+ if ( !throwDataAway( sock, testLength ) ) {
+ logadd( LOG_DEBUG1, "<- get block reply payload fail" );
+ goto fail;
+ }
+ }
+
+ // Yay, success
+ // Panic mode? Just switch to server
+ if ( panic ) {
+ unlock_rw( &altLock );
+ switchConnection( sock, srv );
+ return;
+ }
+ // Non-panic mode:
+ // Update stats of server
+ ticks end;
+ timing_get( &end );
+ srv->consecutiveFails = 0;
+ srv->rtts[srv->rttIndex] = (int)timing_diffUs( &start, &end );
+ int newRtt = 0;
+ for ( int i = 0; i < RTT_COUNT; ++i ) {
+ newRtt += srv->rtts[i];
+ }
+ if ( srv->liveRtt != 0 ) {
+ // Make live rtt measurement influence result
+ newRtt = ( newRtt + srv->liveRtt ) / ( RTT_COUNT + 1 );
+ } else {
+ newRtt /= RTT_COUNT;
+ }
+ srv->rtt = newRtt;
+
+ // Keep socket open if this is currently the best one
+ if ( best == NULL || best->rtt > srv->rtt ) {
+ best = srv;
+ if ( bestSock != -1 ) {
+ close( bestSock );
+ }
+ bestSock = sock;
+ } else {
+ close( sock );
+ }
+ continue;
+fail:;
+ if ( sock != -1 ) {
+ close( sock );
+ }
+ srv->rtts[srv->rttIndex] = RTT_UNREACHABLE;
+ srv->consecutiveFails += 1;
+ }
+ doSwitch = false;
+ if ( best != NULL ) {
+ // Time-sensitive switch decision: If a server was best for some consecutive measurements,
+ // we switch no matter how small the difference to the current server is
+ for ( int altIndex = 0; altIndex < MAX_ALTS_ACTIVE; ++altIndex ) {
+ alt_server_t * const srv = &altservers[altIndex];
+ // Decay liveRtt slowly...
+ if ( srv->liveRtt > current->liveRtt && srv->liveRtt > srv->rtt ) {
+ srv->liveRtt -= ( ( srv->liveRtt / 100 ) + 1 );
+ }
+ if ( srv == best ) {
+ if ( srv->bestCount < 50 ) {
+ srv->bestCount += 2;
+ }
+ // Switch with increasing probability the higher the bestCount is
+ if ( srv->bestCount > 12 && ( current == NULL || srv->rtt < current->rtt ) && srv->bestCount > rand() % 50 ) {
+ doSwitch = true;
+ }
+ } else if ( srv->bestCount > 0 ) {
+ srv->bestCount--;
+ }
+ }
+ for ( int i = MAX_ALTS_ACTIVE; i < MAX_ALTS; ++i ) {
+ if ( altservers[i].consecutiveFails > 0 ) {
+ altservers[i].consecutiveFails--;
+ }
+ }
+ // This takes care of the situation where two servers alternate being the best server all the time
+ if ( doSwitch && current != NULL && best->bestCount - current->bestCount < 8 ) {
+ doSwitch = false;
+ }
+ // Regular logic: Apply threshold when considering switch
+ if ( !doSwitch && current != NULL ) {
+ doSwitch = current->rtt > best->rtt + RTT_ABSOLUTE_THRESHOLD
+ || RTT_THRESHOLD_FACTOR(current->rtt) > best->rtt + 1000;
+ }
+ }
+ // Switch if a better server was found
+ if ( doSwitch ) {
+ logadd( LOG_INFO, "Current: %dµs, best: %dµs. Will switch!", current == NULL ? 0 : current->rtt, best->rtt );
+ for ( int i = 0; i < MAX_ALTS; ++i ) {
+ if ( &altservers[i] != best ) {
+ altservers[i].bestCount = 0;
+ }
+ }
+ unlock_rw( &altLock );
+ switchConnection( bestSock, best );
+ return;
+ }
+ // No switch
+ unlock_rw( &altLock );
+ if ( best != NULL ) {
+ close( bestSock );
+ }
+}
+
+/**
+ * Make sockFd (an already connected socket to srv) the active connection.
+ *
+ * The previously active socket, if any, is shut down. All requests that were
+ * pending are taken off the request queue and, once the new receive thread has
+ * been started, queued again and re-sent over the new socket.
+ * If getpeername() fails on sockFd, the socket is closed, the connection is
+ * marked as down and the panic signal is raised instead.
+ *
+ * Takes connection.sendMutex and requests.lock itself.
+ */
+static void switchConnection(int sockFd, alt_server_t *srv)
+{
+	pthread_t thread;
+	struct sockaddr_storage addr;
+	socklen_t addrLen = sizeof(addr);
+	char message[200] = "Connection switched to ";
+	const size_t len = strlen( message );
+	int ret;
+	int peerErrno = 0;
+	// Only assigned when the switch succeeds; initialised so it is never read uninitialised
+	dnbd3_async_t *queue = NULL, *it;
+
+	pthread_mutex_lock( &connection.sendMutex );
+	if ( connection.sockFd != -1 ) {
+		shutdown( connection.sockFd, SHUT_RDWR );
+	}
+	ret = getpeername( sockFd, (struct sockaddr*)&addr, &addrLen );
+	if ( ret == 0 ) {
+		connection.currentServer = srv->host;
+		connection.sockFd = sockFd;
+		// Detach the whole pending list; it is re-sent below
+		pthread_spin_lock( &requests.lock );
+		queue = requests.head;
+		requests.head = requests.tail = NULL;
+		pthread_spin_unlock( &requests.lock );
+	} else {
+		// Remember the cause now, the calls below may overwrite errno
+		peerErrno = errno;
+		connection.sockFd = -1;
+	}
+	requestAltServers();
+	pthread_mutex_unlock( &connection.sendMutex );
+	if ( ret != 0 ) {
+		close( sockFd );
+		logadd( LOG_WARNING, "Could not getpeername after connection switch, assuming connection already dead again. (Errno=%d)", peerErrno );
+		signal_call( connection.panicSignal );
+		return;
+	}
+	timing_get( &connection.startupTime );
+	// NOTE(review): return value of pthread_create is not checked and the thread
+	// handle is discarded here - confirm the receive thread detaches itself
+	pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)sockFd );
+	sock_printable( (struct sockaddr*)&addr, sizeof(addr), message + len, sizeof(message) - len );
+	logadd( LOG_INFO, "%s", message );
+	// resend queue
+	if ( queue != NULL ) {
+		pthread_mutex_lock( &connection.sendMutex );
+		dnbd3_async_t *next = NULL;
+		for ( it = queue; it != NULL; it = next ) {
+			logadd( LOG_DEBUG1, "Requeue after server change" );
+			// enqueueRequest() resets it->next, so fetch the successor first
+			next = it->next;
+			enqueueRequest( it );
+			if ( connection.sockFd != -1 && !dnbd3_get_block( connection.sockFd, it->offset, it->length, (uint64_t)it, 0 ) ) {
+				logadd( LOG_WARNING, "Resending pending request failed, re-entering panic mode" );
+				shutdown( connection.sockFd, SHUT_RDWR );
+				connection.sockFd = -1;
+				signal_call( connection.panicSignal );
+			}
+		}
+		pthread_mutex_unlock( &connection.sendMutex );
+	}
+}
+
+/**
+ * Ask the currently connected server for its list of alt servers by
+ * sending a CMD_GET_SERVERS request. Nothing is sent if there is no active
+ * connection or learning of new servers is disabled. If sending fails, the
+ * socket is shut down and the connection is marked as down.
+ *
+ * Does not lock, so get the sendMutex first!
+ */
+static void requestAltServers()
+{
+	const bool canAsk = ( connection.sockFd != -1 ) && learnNewServers;
+	if ( !canAsk )
+		return;
+	dnbd3_request_t req = { 0 };
+	req.cmd = CMD_GET_SERVERS;
+	req.magic = dnbd3_packet_magic;
+	fixup_request( req );
+	const ssize_t sent = sock_sendAll( connection.sockFd, &req, sizeof(req), 2 );
+	if ( sent == (ssize_t)sizeof(req) )
+		return;
+	// Short or failed write: consider the connection dead
+	logadd( LOG_WARNING, "Connection failed while requesting alt server list" );
+	shutdown( connection.sockFd, SHUT_RDWR );
+	connection.sockFd = -1;
+}
+
+/**
+ * Read and discard exactly amount bytes from sockFd, using a small
+ * scratch buffer. Returns true once everything has been consumed, false as
+ * soon as sock_recv reports an error or end of stream (<= 0).
+ */
+static bool throwDataAway(int sockFd, uint32_t amount)
+{
+	char scratch[SHORTBUF];
+	for ( size_t received = 0; received < amount; ) {
+		const ssize_t chunk = sock_recv( sockFd, scratch, MIN( amount - received, SHORTBUF ) );
+		if ( chunk <= 0 )
+			return false;
+		received += (size_t)chunk;
+	}
+	return true;
+}
+
+/**
+ * Reset the bookkeeping fields of request (next, finished, success), stamp
+ * it with the current time and append it to the tail of the global request
+ * queue. The queue is protected by requests.lock, which is taken here.
+ */
+static void enqueueRequest(dnbd3_async_t *request)
+{
+	// Timestamp is used to measure latency, which feeds the switch formula
+	timing_get( &request->time );
+	request->success = false;
+	request->finished = false;
+	request->next = NULL;
+	pthread_spin_lock( &requests.lock );
+	if ( requests.head != NULL ) {
+		requests.tail->next = request;
+	} else {
+		requests.head = request;
+	}
+	requests.tail = request;
+	pthread_spin_unlock( &requests.lock );
+}
+
+/**
+ * Unlink request from the global request queue, if it is in there.
+ * Returns request if it was found and removed, NULL otherwise.
+ * Takes requests.lock while walking the list.
+ */
+static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
+{
+	dnbd3_async_t *before = NULL;
+	dnbd3_async_t *cur;
+
+	pthread_spin_lock( &requests.lock );
+	// Walk the list until we hit the request or fall off the end
+	cur = requests.head;
+	while ( cur != NULL && cur != request ) {
+		before = cur;
+		cur = cur->next;
+	}
+	if ( cur != NULL ) {
+		if ( before == NULL ) {
+			requests.head = cur->next;
+		} else {
+			before->next = cur->next;
+		}
+		if ( requests.tail == cur ) {
+			requests.tail = before;
+		}
+	}
+	pthread_spin_unlock( &requests.lock );
+	return cur;
+}
+
diff --git a/src/fuse/connection.h b/src/fuse/connection.h
new file mode 100644
index 0000000..cae554c
--- /dev/null
+++ b/src/fuse/connection.h
@@ -0,0 +1,35 @@
+#ifndef _CONNECTION_H_
+#define _CONNECTION_H_
+
+#include "../shared/fdsignal.h"
+#include "../shared/timing.h"
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+struct _dnbd3_async;
+
+/*
+ * One asynchronous read request handed to connection_read().
+ * The caller fills in signal, buffer, offset and length; the connection code
+ * links requests through next and reports the outcome via finished/success.
+ */
+typedef struct _dnbd3_async {
+ struct _dnbd3_async *next; // Next in this linked list (private field, not set by caller)
+ dnbd3_signal_t* signal; // Used to signal the caller
+ char* buffer; // Caller-provided buffer to be filled
+ ticks time; // When request was put on wire, 0 if not measuring
+ uint64_t offset; // Offset of the requested data within the image
+ uint32_t length; // Number of bytes requested; buffer must hold at least this many
+ bool finished; // Will be set to true if the request has been handled
+ bool success; // Will be set to true if the request succeeded
+} dnbd3_async_t;
+
+bool connection_init(const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers);
+
+bool connection_initThreads();
+
+uint64_t connection_getImageSize();
+
+bool connection_read(dnbd3_async_t *request);
+
+void connection_close();
+
+size_t connection_printStats(char *buffer, const size_t len);
+
+#endif /* _CONNECTION_H_ */
diff --git a/src/fuse/helper.c b/src/fuse/helper.c
new file mode 100644
index 0000000..d81b08f
--- /dev/null
+++ b/src/fuse/helper.c
@@ -0,0 +1,36 @@
+#include "helper.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+
+/**
+ * Write the collected debug/benchmark statistics to "log.txt" in the current
+ * working directory, replacing any previous content.
+ *
+ * The file contains the image size and received byte count in MiB, the number
+ * of blocks, and one access counter per block (50 counters per line).
+ * The per-block list is skipped if info->blockRequestCount is NULL, and
+ * nothing is written at all if info is NULL.
+ */
+void printLog( log_info *info )
+{
+	FILE *logFile;
+
+	if ( info == NULL ) {
+		return;
+	}
+
+	// Create logfile
+	logFile = fopen( "log.txt", "w" );
+	if ( logFile == NULL ) {
+		printf( "Error creating/opening log.txt\n" );
+		return;
+	}
+
+	fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", ( uint64_t )( info->imageSize/ ( 1024ll*1024ll ) ) );
+	fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", ( uint64_t )( info->receivedBytes/ ( 1024ll*1024ll ) ) );
+	fprintf( logFile, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount );
+	fprintf( logFile, "Blocksize: 4KiB\n\n" );
+	fprintf( logFile, "Block access count:\n" );
+
+	// The counter array is allocated by the caller and may be missing
+	if ( info->blockRequestCount != NULL ) {
+		for ( uint64_t i = 0; i < info->imageBlockCount; i++ ) {
+			// Line break before every 50th counter (including the first)
+			if ( i % 50 == 0 ) {
+				fprintf( logFile, "\n" );
+			}
+			fprintf( logFile, "%i ", ( int ) info->blockRequestCount[i] );
+		}
+	}
+	fprintf( logFile, "\n" );
+	fclose( logFile );
+}
diff --git a/src/fuse/helper.h b/src/fuse/helper.h
new file mode 100644
index 0000000..9e5d127
--- /dev/null
+++ b/src/fuse/helper.h
@@ -0,0 +1,35 @@
+#ifndef IMAGEHELPER_H
+#define IMAGEHELPER_H
+
+#include "../types.h"
+
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/socket.h>
+
+/* Debug/benchmark statistics of the fuse client, written out by printLog(). */
+typedef struct log_info {
+ uint64_t imageSize; // Size of the image in bytes (printed in MiB)
+ uint64_t receivedBytes; // Number of bytes received (printed in MiB)
+ uint64_t imageBlockCount; // Number of entries in blockRequestCount
+ uint8_t *blockRequestCount; // One access counter per block; may be NULL
+} log_info;
+
+
+
+void printLog(log_info *info);
+
+int connect_to_server(char *server_adress, int port);
+
+/*
+ * True if a and b have the same address type, the same port and the same
+ * address bytes (4 bytes are compared for HOST_IP4, 16 otherwise).
+ */
+static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+{
+	if ( a->type != b->type )
+		return false;
+	if ( a->port != b->port )
+		return false;
+	const size_t addrLen = ( a->type == HOST_IP4 ) ? 4 : 16;
+	return memcmp( a->addr, b->addr, addrLen ) == 0;
+}
+
+/*
+ * True if a and b have the same address type and the same address bytes
+ * (4 bytes are compared for HOST_IP4, 16 otherwise). The port is ignored.
+ */
+static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+{
+	if ( a->type != b->type )
+		return false;
+	const size_t addrLen = ( a->type == HOST_IP4 ) ? 4 : 16;
+	return memcmp( a->addr, b->addr, addrLen ) == 0;
+}
+
+#endif
diff --git a/src/fuse/main.c b/src/fuse/main.c
new file mode 100644
index 0000000..1a5643c
--- /dev/null
+++ b/src/fuse/main.c
@@ -0,0 +1,420 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ * This program can be distributed under the terms of the GNU GPL.
+ * See the file COPYING.
+ *
+ * Changed by Stephan Schwaer
+ * */
+
+#include "connection.h"
+#include "helper.h"
+#include "../shared/protocol.h"
+#include "../shared/log.h"
+
+#define FUSE_USE_VERSION 30
+#include <fuse.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+/* for printing uint */
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <getopt.h>
+#include <time.h>
+#include <signal.h>
+#include <pthread.h>
+
+/* Shorthand for logging a message at LOG_DEBUG1 level */
+#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
+
+/* Paths of the two files exposed in the root of the mounted file system */
+static const char * const IMAGE_PATH = "/img";
+static const char * const STATS_PATH = "/status";
+
+/* Size of the remote image in bytes, as returned by connection_getImageSize() */
+static uint64_t imageSize;
+/* Debug/Benchmark variables */
+static bool useDebug = false;
+static log_info logInfo;
+/* Time stamp taken in main(), reported as ctime/atime/mtime of all entries */
+static struct timespec startupTime;
+/* User id of the process, reported as owner of all entries */
+static uid_t owner;
+/* Cleared by image_sigHandler; polled by image_read while waiting for a reply */
+/* NOTE(review): written from a signal handler - consider volatile sig_atomic_t */
+static bool keepRunning = true;
+/* Signal handlers that were installed before ours; image_sigHandler chains to them */
+static void (*fuse_sigIntHandler)(int) = NULL;
+static void (*fuse_sigTermHandler)(int) = NULL;
+/* Empty operations table, used when calling fuse_main only for -V / -h output */
+static struct fuse_operations dnbd3_fuse_no_operations;
+
+/*
+ * Small pool of reusable signal objects, so a new one does not have to be
+ * created for every read request. Protected by sigLock.
+ */
+#define SIGPOOLSIZE 6
+static pthread_spinlock_t sigLock;
+static dnbd3_signal_t *signalPool[SIGPOOLSIZE];
+static dnbd3_signal_t **sigEnd = signalPool + SIGPOOLSIZE;
+/* Initialise the pool lock and mark every pool slot as empty */
+static void signalInit()
+{
+	pthread_spin_init( &sigLock, PTHREAD_PROCESS_PRIVATE );
+	for ( dnbd3_signal_t **slot = signalPool; slot < sigEnd; ++slot ) {
+		*slot = NULL;
+	}
+}
+/*
+ * Take a signal out of the pool. If the pool is empty, a fresh blocking
+ * signal is created (outside of the lock) and returned instead.
+ */
+static inline dnbd3_signal_t *signalGet()
+{
+	dnbd3_signal_t *taken = NULL;
+
+	pthread_spin_lock( &sigLock );
+	for ( size_t i = 0; i < SIGPOOLSIZE; ++i ) {
+		if ( signalPool[i] == NULL )
+			continue;
+		taken = signalPool[i];
+		signalPool[i] = NULL;
+		break;
+	}
+	pthread_spin_unlock( &sigLock );
+	if ( taken != NULL )
+		return taken;
+	return signal_newBlocking();
+}
+/*
+ * Hand a signal back to the pool. If no slot is free, the signal is
+ * closed (outside of the lock) instead of being kept.
+ */
+static inline void signalPut(dnbd3_signal_t *signal)
+{
+	bool stored = false;
+
+	pthread_spin_lock( &sigLock );
+	for ( size_t i = 0; i < SIGPOOLSIZE; ++i ) {
+		if ( signalPool[i] == NULL ) {
+			signalPool[i] = signal;
+			stored = true;
+			break;
+		}
+	}
+	pthread_spin_unlock( &sigLock );
+	if ( !stored ) {
+		signal_close( signal );
+	}
+}
+
+/*
+ * fuse getattr: describe the root directory, the image file or the status
+ * file. All entries are owned by the process owner and carry the startup time
+ * as time stamps; the status file reports the current time as mtime.
+ * Returns 0 on success, -ENOENT for any other path.
+ */
+static int image_getattr(const char *path, struct stat *stbuf)
+{
+	memset( stbuf, 0, sizeof( struct stat ) );
+	stbuf->st_ctim = stbuf->st_atim = stbuf->st_mtim = startupTime;
+	stbuf->st_uid = owner;
+
+	if ( strcmp( path, "/" ) == 0 ) {
+		stbuf->st_mode = S_IFDIR | 0550;
+		stbuf->st_nlink = 2;
+		return 0;
+	}
+
+	const bool isImage = ( strcmp( path, IMAGE_PATH ) == 0 );
+	if ( !isImage && strcmp( path, STATS_PATH ) != 0 ) {
+		return -ENOENT;
+	}
+
+	// Both regular files are read-only for owner and group
+	stbuf->st_mode = S_IFREG | 0440;
+	stbuf->st_nlink = 1;
+	if ( isImage ) {
+		stbuf->st_size = imageSize;
+	} else {
+		stbuf->st_size = 4096;
+		clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+	}
+	return 0;
+}
+
+/*
+ * fuse readdir: only the root directory exists; it lists ".", "..", the
+ * image file and the status file. Returns -ENOENT for any other path.
+ */
+static int image_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset UNUSED, struct fuse_file_info *fi UNUSED)
+{
+	// Skip the leading '/' of the two path constants to get the entry names
+	const char * const entries[] = { ".", "..", IMAGE_PATH + 1, STATS_PATH + 1 };
+
+	if ( strcmp( path, "/" ) != 0 ) {
+		return -ENOENT;
+	}
+	for ( size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); ++i ) {
+		filler( buf, entries[i], NULL, 0 );
+	}
+	return 0;
+}
+
+/*
+ * fuse open: only the image and status files can be opened, and only
+ * read-only. Returns 0 on success, -ENOENT for unknown paths and -EACCES if
+ * the access mode is anything but O_RDONLY.
+ */
+static int image_open(const char *path, struct fuse_file_info *fi)
+{
+	const bool known = strcmp( path, IMAGE_PATH ) == 0 || strcmp( path, STATS_PATH ) == 0;
+	if ( !known ) {
+		return -ENOENT;
+	}
+	// Lowest two bits of the flags hold the access mode
+	return ( ( fi->flags & 3 ) == O_RDONLY ) ? 0 : -EACCES;
+}
+
+/*
+ * Serve a read on the status file.
+ *
+ * For offset 0 the stats are printed straight into buf. For any other offset
+ * they are rendered into a temporary buffer first and the requested window is
+ * copied out.
+ * Returns the number of bytes placed in buf, 0 if offset is at or beyond the
+ * end of the stats text, or -EINVAL for a negative offset.
+ */
+static int fillStatsFile(char *buf, size_t size, off_t offset) {
+	if ( offset == 0 ) {
+		return (int)connection_printStats( buf, size );
+	}
+	if ( offset < 0 ) {
+		return -EINVAL;
+	}
+	char buffer[4096];
+	const int total = (int)connection_printStats( buffer, sizeof buffer );
+	// Compare as off_t before narrowing, so huge offsets cannot wrap around;
+	// reading at or past the end is a plain EOF (0 bytes), not an error
+	if ( offset >= (off_t)total ) {
+		return 0;
+	}
+	const int len = MIN( total - (int)offset, (int)size );
+	if ( len <= 0 ) {
+		return 0;
+	}
+	memcpy( buf, buffer + offset, len );
+	return len;
+}
+
+/*
+ * fuse read: serve reads on the status file locally and reads on the image by
+ * sending a request over the dnbd3 connection and waiting for its completion.
+ *
+ * Reads starting at or beyond the end of the image return 0; reads crossing
+ * the end are shortened. Returns the number of bytes read, -EIO if the request
+ * failed or size exceeds INT_MAX, and -EINVAL if the request could not be
+ * submitted.
+ */
+static int image_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi UNUSED)
+{
+	if ( size > __INT_MAX__ ) {
+		// fuse docs say we MUST fill the buffer with exactly size bytes and return size,
+		// otherwise the buffer will be padded with zeros. Since the return value is just
+		// an int, we could not properly fulfill read requests > 2GB. Since there is no
+		// mention of a guarantee that this will never happen, better add a safety check.
+		return -EIO;
+	}
+	// Second character is enough to tell "/status" from "/img"
+	if ( path[1] == STATS_PATH[1] ) {
+		return fillStatsFile( buf, size, offset );
+	}
+
+	if ( (uint64_t)offset >= imageSize ) {
+		return 0;
+	}
+
+	if ( offset + size > imageSize ) {
+		size = imageSize - offset;
+	}
+
+	// Nothing to fetch; also keeps the "size - 1" below from wrapping around
+	if ( size == 0 ) {
+		return 0;
+	}
+
+	if ( useDebug && logInfo.blockRequestCount != NULL ) {
+		/* count the requested blocks */
+		uint64_t block = offset / ( 4096 );
+		const uint64_t endBlock = ( offset + size - 1 ) / ( 4096 );
+
+		for ( ; block <= endBlock; block++ ) {
+			// Counters are 8 bit wide: saturate instead of wrapping back to 0
+			if ( logInfo.blockRequestCount[block] != UINT8_MAX ) {
+				++logInfo.blockRequestCount[block];
+			}
+		}
+	}
+
+	dnbd3_async_t request;
+	request.buffer = buf;
+	request.length = (uint32_t)size;
+	request.offset = offset;
+	request.signal = signalGet();
+
+	if ( !connection_read( &request ) ) {
+		signalPut( request.signal );
+		return -EINVAL;
+	}
+	// Wait for completion, waking up regularly to notice a shutdown request
+	while ( !request.finished ) {
+		int ret = signal_wait( request.signal, 5000 );
+		if ( !keepRunning ) {
+			connection_close();
+			break;
+		}
+		if ( ret < 0 ) {
+			debugf( "fuse_read signal wait returned %d", ret );
+		}
+	}
+	signalPut( request.signal );
+	if ( request.success ) {
+		return request.length;
+	} else {
+		return -EIO;
+	}
+}
+
+/*
+ * Handler for SIGINT and SIGTERM: clears keepRunning, then chains to the
+ * handler that was installed for that signal before ours, if one was saved.
+ */
+static void image_sigHandler(int signum) {
+	void (*previous)(int) = NULL;
+
+	keepRunning = false;
+	if ( signum == SIGINT ) {
+		previous = fuse_sigIntHandler;
+	} else if ( signum == SIGTERM ) {
+		previous = fuse_sigTermHandler;
+	}
+	if ( previous != NULL ) {
+		previous( signum );
+	}
+}
+
+/*
+ * fuse init: start the dnbd3 connection threads and install our own SIGINT /
+ * SIGTERM handler, remembering the previously installed handlers so that
+ * image_sigHandler can chain to them.
+ * Exits the process if the connection threads cannot be started.
+ */
+static void* image_init(struct fuse_conn_info *conn UNUSED)
+{
+	if ( !connection_initThreads() ) {
+		logadd( LOG_ERROR, "Could not initialize threads for dnbd3 connection, exiting..." );
+		exit( EXIT_FAILURE );
+	}
+	// Prepare our handler
+	struct sigaction newHandler;
+	memset( &newHandler, 0, sizeof(newHandler) );
+	newHandler.sa_handler = &image_sigHandler;
+	sigemptyset( &newHandler.sa_mask );
+	struct sigaction oldHandler;
+	// Retrieve old handlers when setting
+	sigaction( SIGINT, &newHandler, &oldHandler );
+	fuse_sigIntHandler = oldHandler.sa_handler;
+	logadd( LOG_DEBUG1, "Previous SIGINT handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
+	sigaction( SIGTERM, &newHandler, &oldHandler );
+	fuse_sigTermHandler = oldHandler.sa_handler;
+	// Log the SIGTERM handler here, not the SIGINT one
+	logadd( LOG_DEBUG1, "Previous SIGTERM handler was %p", (void*)(uintptr_t)fuse_sigTermHandler );
+	return NULL;
+}
+
+/*
+ * fuse destroy: write the debug statistics (debug mode only), then close the
+ * dnbd3 connection.
+ */
+static void image_destroy(void *private_data UNUSED)
+{
+	if ( useDebug )
+		printLog( &logInfo );
+	connection_close();
+}
+
+/*
+ * Map the implemented fuse operations.
+ * Only read-only operations are provided; all other callbacks stay unset.
+ */
+static struct fuse_operations image_oper = {
+ .getattr = image_getattr,
+ .readdir = image_readdir,
+ .open = image_open,
+ .read = image_read,
+ .init = image_init,
+ .destroy = image_destroy,
+};
+
+/*
+ * Print our own version line, let libfuse print its version information by
+ * invoking fuse_main with "-V", then exit with status 0.
+ */
+static void printVersion()
+{
+	char *fuseArgs[2];
+
+	fuseArgs[0] = "foo";
+	fuseArgs[1] = "-V";
+	printf( "DNBD3-Fuse Version 1.2.3.4, protocol version %d\n", (int)PROTOCOL_VERSION );
+	fuse_main( 2, fuseArgs, &dnbd3_fuse_no_operations, NULL );
+	exit( 0 );
+}
+
+/*
+ * Let libfuse print its generic help (by invoking fuse_main with "-h"),
+ * append the dnbd3 specific usage text and exit with exitCode.
+ */
+static void printUsage(char *argv0, int exitCode)
+{
+	// Option descriptions, printed verbatim after the two synopsis lines
+	static const char * const optionHelp[] = {
+		" -d --debug Don't fork, write stats file, and print debug output (fuse -> stderr, dnbd3 -> stdout)\n",
+		" -f Don't fork (dnbd3 -> stdout)\n",
+		" -h --host List of space separated hosts to use\n",
+		" -i --image Remote image name to request\n",
+		" -l --log Write log to given location\n",
+		" -o --option Mount options to pass to libfuse\n",
+		" -r --rid Revision to use (omit or pass 0 for latest)\n",
+		" -S --sticky Use only servers from command line (no learning from servers)\n",
+		" -s Single threaded mode\n",
+	};
+	char *fuseArgs[] = { argv0, "-h" };
+
+	fuse_main( 2, fuseArgs, &dnbd3_fuse_no_operations, NULL );
+	printf( "\n" );
+	printf( "Usage: %s [--debug] [--option mountOpts] --host <serverAddress(es)> --image <imageName> [--rid revision] <mountPoint>\n", argv0 );
+	printf( "Or: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] <mountPoint>\n", argv0 );
+	for ( size_t i = 0; i < sizeof(optionHelp) / sizeof(optionHelp[0]); ++i ) {
+		fputs( optionHelp[i], stdout );
+	}
+	exit( exitCode );
+}
+
+/*
+ * Command line options for getopt_long. Note that -h selects the host(s);
+ * help is -H / --help. Both -v and -V print the version.
+ */
+static const char *optString = "dfHh:i:l:o:r:SsVv";
+static const struct option longOpts[] = {
+ { "debug", no_argument, NULL, 'd' },
+ { "help", no_argument, NULL, 'H' },
+ { "host", required_argument, NULL, 'h' },
+ { "image", required_argument, NULL, 'i' },
+ { "log", required_argument, NULL, 'l' },
+ { "option", required_argument, NULL, 'o' },
+ { "rid", required_argument, NULL, 'r' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "version", no_argument, NULL, 'v' },
+ { 0, 0, 0, 0 }
+};
+
+/*
+ * Entry point of the dnbd3 fuse client.
+ *
+ * Parses the dnbd3 specific options, builds a new argument vector for libfuse
+ * (forwarding -o, -d, -s and -f, forcing read-only mount options and putting
+ * the mount point last), connects to the given server(s) and finally hands
+ * control to fuse_main.
+ * Returns EXIT_FAILURE if memory cannot be allocated or no server can be
+ * reached, otherwise the return value of fuse_main.
+ */
+int main(int argc, char *argv[])
+{
+	char *server_address = NULL;
+	char *image_Name = NULL;
+	char *log_file = NULL;
+	uint16_t rid = 0;
+	char **newArgv;
+	int newArgc;
+	int opt, lidx;
+	bool learnNewServers = true;
+
+	if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
+		printUsage( argv[0], 0 );
+	}
+
+	// TODO Make log mask configurable
+	log_setConsoleMask( 65535 );
+	log_setConsoleTimestamps( true );
+	log_setFileMask( 65535 );
+
+	// Every option adds at most two entries; the extra 10 leave room for the
+	// entries appended below (-o, its value, mount point)
+	newArgv = calloc( argc + 10, sizeof(char*) );
+	if ( newArgv == NULL ) {
+		logadd( LOG_ERROR, "Out of memory while preparing fuse arguments" );
+		return EXIT_FAILURE;
+	}
+	newArgv[0] = argv[0];
+	newArgc = 1;
+	while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
+		switch ( opt ) {
+		case 'h':
+			server_address = optarg;
+			break;
+		case 'i':
+			image_Name = optarg;
+			break;
+		case 'r':
+			rid = (uint16_t)atoi(optarg);
+			break;
+		case 'o':
+			newArgv[newArgc++] = "-o";
+			newArgv[newArgc++] = optarg;
+			if ( strstr( optarg, "use_ino" ) != NULL ) {
+				logadd( LOG_WARNING, "************************" );
+				logadd( LOG_WARNING, "* WARNING: use_ino mount option is unsupported, use at your own risk!" );
+				logadd( LOG_WARNING, "************************" );
+			}
+			if ( strstr( optarg, "intr" ) != NULL ) {
+				logadd( LOG_WARNING, "************************" );
+				logadd( LOG_WARNING, "* WARNING: intr mount option is unsupported, use at your own risk!" );
+				logadd( LOG_WARNING, "************************" );
+			}
+			break;
+		case 'l':
+			log_file = optarg;
+			break;
+		case 'H':
+			printUsage( argv[0], 0 );
+			break;
+		case 'v':
+		case 'V':
+			printVersion();
+			break;
+		case 'd':
+			useDebug = true;
+			newArgv[newArgc++] = "-d";
+			break;
+		case 's':
+			newArgv[newArgc++] = "-s";
+			break;
+		case 'S':
+			learnNewServers = false;
+			break;
+		case 'f':
+			newArgv[newArgc++] = "-f";
+			break;
+		default:
+			// Does not return
+			printUsage( argv[0], EXIT_FAILURE );
+		}
+	}
+
+	if ( optind >= argc ) { // Missing mount point
+		printUsage( argv[0], EXIT_FAILURE );
+	}
+
+	if ( server_address == NULL || image_Name == NULL ) {
+		printUsage( argv[0], EXIT_FAILURE );
+	}
+
+	if ( log_file != NULL ) {
+		if ( !log_openLogFile( log_file ) ) {
+			logadd( LOG_WARNING, "Could not open log file at '%s'", log_file );
+		}
+	}
+
+	if ( !connection_init( server_address, image_Name, rid, learnNewServers ) ) {
+		logadd( LOG_ERROR, "Could not connect to any server. Bye.\n" );
+		return EXIT_FAILURE;
+	}
+	imageSize = connection_getImageSize();
+
+	/* initialize benchmark variables */
+	logInfo.receivedBytes = 0;
+	logInfo.imageSize = imageSize;
+	logInfo.imageBlockCount = ( imageSize + 4095 ) / 4096;
+	if ( useDebug ) {
+		// One 8 bit access counter per 4KiB block
+		logInfo.blockRequestCount = calloc( logInfo.imageBlockCount, sizeof(uint8_t) );
+		if ( logInfo.blockRequestCount == NULL && logInfo.imageBlockCount != 0 ) {
+			logadd( LOG_ERROR, "Out of memory while allocating block access counters" );
+			return EXIT_FAILURE;
+		}
+	} else {
+		logInfo.blockRequestCount = NULL;
+	}
+
+	// Since dnbd3 is always read only and the remote image will not change
+	newArgv[newArgc++] = "-o";
+	newArgv[newArgc++] = "ro,auto_cache,default_permissions";
+	// Mount point goes last
+	newArgv[newArgc++] = argv[optind];
+
+	printf( "ImagePathName: %s\nFuseArgs:",IMAGE_PATH );
+	for ( int i = 0; i < newArgc; ++i ) {
+		printf( " '%s'", newArgv[i] );
+	}
+	putchar('\n');
+	clock_gettime( CLOCK_REALTIME, &startupTime );
+	owner = getuid();
+	signalInit();
+	return fuse_main( newArgc, newArgv, &image_oper, NULL );
+}
diff --git a/src/fuse/serialize.c b/src/fuse/serialize.c
new file mode 100644
index 0000000..4934132
--- /dev/null
+++ b/src/fuse/serialize.c
@@ -0,0 +1,5 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../serialize.c"
diff --git a/src/kernel/core.c b/src/kernel/core.c
new file mode 100644
index 0000000..bfa8d22
--- /dev/null
+++ b/src/kernel/core.c
@@ -0,0 +1,484 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2019 Frederic Robra <frederic@robra.org>
+ * Parts copyright 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include <linux/major.h>
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/ioctl.h>
+#include <linux/mutex.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/debugfs.h>
+#include <linux/blk-mq.h>
+
+#include <linux/uaccess.h>
+#include <asm/types.h>
+
+#include "dnbd3.h"
+#include "clientconfig.h"
+#include "sysfs.h"
+
+/* Maps minor numbers to devices; dnbd3_index_mutex is held around accesses in init/exit/put */
+static DEFINE_IDR(dnbd3_index_idr);
+static DEFINE_MUTEX(dnbd3_index_mutex);
+
+/* Number of devices to create on module load (module parameter) */
+/* NOTE(review): declared unsigned here but registered with module_param(..., int, ...) - confirm intended type */
+static unsigned int max_devs = NUMBER_DEVICES;
+/* Array of max_devs devices, allocated in dnbd3_init and freed in dnbd3_exit */
+static dnbd3_device_t *dnbd3_device;
+/* Block device major number returned by register_blkdev */
+int major;
+
+
+/* Block device open callback: currently only logs the call and reports success. */
+static int dnbd3_open(struct block_device *bdev, fmode_t mode)
+{
+ printk(KERN_DEBUG "dnbd3: dnbd3_open");
+
+ return 0;
+}
+
+/* Block device release callback: currently only logs the call. */
+static void dnbd3_release(struct gendisk *disk, fmode_t mode)
+{
+ printk(KERN_DEBUG "dnbd3: dnbd3_release");
+
+}
+
+
+/* Placeholder: only logs the call, no requests are failed yet. */
+void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
+{
+ printk(KERN_DEBUG "dnbd3: dnbd3_blk_fail_all_requests");
+}
+
+
+/* Placeholder: no connection is established yet, always returns 0 (success). */
+int dnbd3_net_connect(dnbd3_device_t *dev)
+{
+ return 0;
+}
+
+
+/* Placeholder: nothing is torn down yet, always returns 0 (success). */
+int dnbd3_net_disconnect(dnbd3_device_t *dev)
+{
+ return 0;
+}
+
+/*
+ * ioctl handler of the dnbd3 block device.
+ *
+ * If arg is non-zero it is treated as a user space pointer to a dnbd3_ioctl_t,
+ * which is copied into kernel memory together with the image name it refers to.
+ * Handled commands: IOCTL_OPEN, IOCTL_CLOSE, IOCTL_SWITCH (always -EINVAL),
+ * IOCTL_ADD_SRV / IOCTL_REM_SRV and BLKFLSBUF; anything else yields -EIO.
+ * Returns 0 on success or a negative errno value.
+ */
+static int dnbd3_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+{
+ // NOTE(review): this statement precedes the declarations below - confirm the
+ // build does not warn about declarations after statements
+ printk(KERN_DEBUG "dnbd3: dnbd3_ioctl");
+
+ int result = -100;
+ dnbd3_device_t *dev = bdev->bd_disk->private_data;
+ char *imgname = NULL;
+ dnbd3_ioctl_t *msg = NULL;
+ //unsigned long irqflags;
+
+ // NOTE(review): busy-waits without yielding while a disconnect is in progress
+ while (dev->disconnecting)
+ {
+ // do nothing
+ }
+
+ if (arg != 0)
+ {
+ msg = kmalloc(sizeof(*msg), GFP_KERNEL);
+ if (msg == NULL) return -ENOMEM;
+ // Fetch only the first two bytes to validate the struct size the caller
+ // was built with (presumably len is the first, 16 bit field - TODO confirm)
+ if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg))
+ {
+ result = -ENOEXEC;
+ goto cleanup_return;
+ }
+ // Size matches, now copy the whole message
+ if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0)
+ {
+ result = -ENOENT;
+ goto cleanup_return;
+ }
+ // Copy the image name (if any) into a NUL-terminated kernel buffer
+ if (msg->imgname != NULL && msg->imgnamelen > 0)
+ {
+ imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
+ if (imgname == NULL)
+ {
+ result = -ENOMEM;
+ goto cleanup_return;
+ }
+ if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0)
+ {
+ result = -ENOENT;
+ goto cleanup_return;
+ }
+ imgname[msg->imgnamelen] = '\0';
+ //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname);
+ }
+ }
+
+
+ switch (cmd)
+ {
+ // Bind the device to an image on the given server
+ case IOCTL_OPEN:
+ if (dev->imgname != NULL)
+ {
+ result = -EBUSY;
+ }
+ else if (imgname == NULL)
+ {
+ result = -EINVAL;
+ }
+ else if (msg == NULL)
+ {
+ result = -EINVAL;
+ }
+ else
+ {
+ if (sizeof(msg->host) != sizeof(dev->cur_server.host))
+ printk("Odd size bug#1 triggered in IOCTL\n");
+ memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host));
+ dev->cur_server.failures = 0;
+ memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server));
+ dev->imgname = imgname;
+ dev->rid = msg->rid;
+ dev->use_server_provided_alts = msg->use_server_provided_alts;
+ // Forget all alt servers on explicit connect, set first alt server to initial server
+ memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
+ memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0]));
+//#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+// if (blk_queue->backing_dev_info != NULL) {
+// blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
+// }
+//#else
+// blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
+//#endif
+ if (dnbd3_net_connect(dev) == 0)
+ {
+ result = 0;
+ imgname = NULL; // Prevent kfree at the end
+ }
+ else
+ {
+ // Connect failed: the device no longer references the name, which
+ // is then freed at cleanup_return
+ result = -ENOENT;
+ dev->imgname = NULL;
+ }
+ }
+ break;
+
+ // Disconnect and release the image name
+ case IOCTL_CLOSE:
+ dnbd3_blk_fail_all_requests(dev);
+ result = dnbd3_net_disconnect(dev);
+ dnbd3_blk_fail_all_requests(dev);
+ set_capacity(dev->disk, 0);
+ if (dev->imgname)
+ {
+ kfree(dev->imgname);
+ dev->imgname = NULL;
+ }
+ break;
+
+ // Explicit server switch is not supported here
+ case IOCTL_SWITCH:
+ result = -EINVAL;
+ break;
+
+ // Queue an alt server for addition or removal
+ case IOCTL_ADD_SRV:
+ case IOCTL_REM_SRV:
+ if (dev->imgname == NULL)
+ {
+ result = -ENOENT;
+ }
+ else if (dev->new_servers_num >= NUMBER_SERVERS)
+ {
+ result = -EAGAIN;
+ }
+ else if (msg == NULL)
+ {
+ result = -EINVAL;
+ }
+ else
+ {
+ memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host));
+ dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM
+ ++dev->new_servers_num;
+ result = 0;
+ }
+ break;
+
+ // Nothing to flush, report success
+ case BLKFLSBUF:
+ result = 0;
+ break;
+
+ default:
+ result = -EIO;
+ break;
+ }
+
+cleanup_return:
+ if (msg) kfree(msg);
+ if (imgname) kfree(imgname);
+ return result;
+
+}
+
+/* Block device operations of a dnbd3 disk; the same handler serves ioctl and compat_ioctl. */
+static const struct block_device_operations dnbd3_fops =
+{
+ .owner = THIS_MODULE,
+ .open = dnbd3_open,
+ .release = dnbd3_release,
+ .ioctl = dnbd3_ioctl,
+ .compat_ioctl = dnbd3_ioctl,
+};
+
+/* blk-mq queue_rq callback: currently only logs the call and reports success. */
+static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
+{
+	printk(KERN_DEBUG "dnbd3: dnbd3_queue_rq");
+	return BLK_STS_OK;
+}
+
+/* blk-mq complete callback: currently only logs the call. */
+static void dnbd3_complete_rq(struct request *req)
+{
+ printk(KERN_DEBUG "dnbd3: dnbd3_complete_rq");
+
+}
+
+/*
+ * blk-mq init_request callback: initialise the per-request driver data
+ * (back pointer to the tag set's driver data, cleared flags, fresh mutex).
+ * Always returns 0.
+ */
+static int dnbd3_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct dnbd3_cmd *pdu = blk_mq_rq_to_pdu(rq);
+
+	mutex_init(&pdu->lock);
+	pdu->flags = 0;
+	pdu->dnbd3 = set->driver_data;
+	return 0;
+}
+/* blk-mq timeout callback: currently only logs the call and returns BLK_EH_DONE. */
+static enum blk_eh_timer_return dnbd3_xmit_timeout(struct request *req, bool reserved)
+{
+ printk(KERN_DEBUG "dnbd3: dnbd3_xmit_timeout");
+ return BLK_EH_DONE;
+}
+
+
+/* blk-mq operations used by every dnbd3 tag set (see dnbd3_blk_add_device). */
+static const struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+ .complete = dnbd3_complete_rq,
+ .init_request = dnbd3_init_request,
+ .timeout = dnbd3_xmit_timeout,
+};
+
+
+/*
+ * Set up the block device for dev with the given minor number: allocate the
+ * gendisk, register dev in the index idr, create the blk-mq tag set and
+ * request queue, configure the queue limits and finally add the disk and its
+ * sysfs entries.
+ *
+ * Returns minor on success or a negative errno value. On failure everything
+ * allocated here is released again; dev itself is NOT freed, since it is
+ * owned by the caller (it is an element of the dnbd3_device array).
+ */
+static int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int err = -ENOMEM;
+	printk(KERN_DEBUG "dnbd3: adding device %i", minor);
+
+
+	disk = alloc_disk(1);
+	if (!disk) {
+		printk(KERN_DEBUG "dnbd3: alloc_disc failed, device %i", minor);
+		goto out_free_nbd;
+	}
+
+	// Request exactly this minor as idr id
+	err = idr_alloc(&dnbd3_index_idr, dev, minor, minor + 1, GFP_KERNEL);
+	if (err == -ENOSPC) {
+		printk(KERN_DEBUG "dnbd3: idr_alloc failed, device %i", minor);
+		err = -EEXIST;
+	}
+
+	if (err < 0)
+		goto out_free_disk;
+
+	dev->minor = minor;
+	dev->disk = disk;
+	dev->tag_set.ops = &dnbd3_mq_ops;
+	dev->tag_set.nr_hw_queues = 1;
+	dev->tag_set.queue_depth = 128;
+	dev->tag_set.numa_node = NUMA_NO_NODE;
+	dev->tag_set.cmd_size = sizeof(dnbd3_cmd);
+	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
+		BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+	dev->tag_set.driver_data = dev;
+
+	err = blk_mq_alloc_tag_set(&dev->tag_set);
+	if (err)
+		goto out_free_idr;
+
+	q = blk_mq_init_queue(&dev->tag_set);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_free_tags;
+	}
+	disk->queue = q;
+
+	/*
+	 * Tell the block layer that we are not a rotational device
+	 */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+	disk->queue->limits.discard_granularity = 0;
+	disk->queue->limits.discard_alignment = 0;
+	blk_queue_max_discard_sectors(disk->queue, 0);
+	blk_queue_max_segment_size(disk->queue, UINT_MAX);
+	blk_queue_max_segments(disk->queue, USHRT_MAX);
+	blk_queue_max_hw_sectors(disk->queue, 65536);
+	disk->queue->limits.max_sectors = 256;
+
+	mutex_init(&dev->config_lock);
+	refcount_set(&dev->config_refs, 0);
+	refcount_set(&dev->refs, 1);
+	INIT_LIST_HEAD(&dev->list);
+	disk->major = major;
+	disk->first_minor = minor;
+	disk->fops = &dnbd3_fops;
+	disk->private_data = dev;
+	sprintf(disk->disk_name, "dnbd%d", minor);
+	printk(KERN_DEBUG "dnbd3: add disk, device %s", disk->disk_name);
+	add_disk(disk);
+	dnbd3_sysfs_init(dev);
+	return minor;
+
+out_free_tags:
+	blk_mq_free_tag_set(&dev->tag_set);
+out_free_idr:
+	idr_remove(&dnbd3_index_idr, minor);
+out_free_disk:
+	put_disk(disk);
+	// Do not keep a pointer to the released disk around
+	dev->disk = NULL;
+out_free_nbd:
+	// dev is an element of the kcalloc'ed device array and must not be
+	// passed to kfree on its own; the array is released as a whole on exit
+	printk(KERN_DEBUG "dnbd3: destroy device %i", minor);
+	return err;
+}
+
+
+
+/*
+ * Module entry point: allocate the device array, register the block device
+ * major and create max_devs devices.
+ * Returns 0 on success, -EINVAL for a negative max_devs, -ENOMEM if the device
+ * array cannot be allocated and -EIO if no major number could be registered.
+ */
+static int __init dnbd3_init(void)
+{
+	int i;
+	int err;
+	printk(KERN_DEBUG "dnbd3: starting kernel module");
+
+	// max_devs is stored unsigned but set through an int module parameter,
+	// so a negative value shows up as a huge number; check it as signed
+	if ((int)max_devs < 0) {
+		printk(KERN_ERR "dnbd3: max_devs must be >= 0");
+		return -EINVAL;
+	}
+
+
+	dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL);
+	if (!dnbd3_device) {
+		printk(KERN_ERR "dnbd3: failed to create dnbd3 device");
+		return -ENOMEM;
+	}
+
+	// initialize block device
+	// With major 0 a free major is picked; failure is reported as a negative value
+	major = register_blkdev(0, "dnbd3");
+	if (major <= 0) {
+		printk(KERN_ERR "dnbd3: register_blkdev failed");
+		kfree(dnbd3_device);
+		dnbd3_device = NULL;
+		return -EIO;
+	}
+
+	printk(KERN_DEBUG "dnbd3: kernel module loaded. Machine type: " ENDIAN_MODE);
+
+	// add max_devs devices
+	mutex_lock(&dnbd3_index_mutex);
+	for (i = 0; i < (int)max_devs; i++) {
+		err = dnbd3_blk_add_device(&dnbd3_device[i], i);
+		if (err < 0) {
+			printk(KERN_ERR "dnbd3: adding device %i failed (error %i)", i, err);
+		}
+	}
+	mutex_unlock(&dnbd3_index_mutex);
+
+	printk(KERN_INFO "dnbd3: init successful (%i devices).\n", max_devs);
+
+	return 0;
+}
+
+
+/*
+ * idr_for_each callback used on module exit: append the device stored under
+ * id to the list passed in data. Always returns 0 so iteration continues.
+ */
+static int dnbd3_exit_cb(int id, void *ptr, void *data)
+{
+	struct dnbd3_device_t *entry = ptr;
+	struct list_head *collected = data;
+
+	list_add_tail(&entry->list, collected);
+	return 0;
+}
+
+/*
+ * Tear down the block device of dnbd3: remove the disk, clean up its
+ * queue, free the tag set and drop the disk reference. Does nothing if the
+ * device has no disk.
+ */
+static void dnbd3_dev_remove(struct dnbd3_device_t *dnbd3)
+{
+	struct gendisk *disk = dnbd3->disk;
+	struct request_queue *queue;
+
+	if (!disk)
+		return;
+
+	// Grab the queue pointer before the disk is removed
+	queue = disk->queue;
+	del_gendisk(disk);
+	blk_cleanup_queue(queue);
+	blk_mq_free_tag_set(&dnbd3->tag_set);
+	disk->private_data = NULL;
+	put_disk(disk);
+}
+
+/*
+ * Drop one reference to dnbd3. When the last reference is gone, the
+ * device is removed from the index idr (under dnbd3_index_mutex) and its
+ * block device is torn down.
+ */
+static void dnbd3_put(struct dnbd3_device_t *dnbd3)
+{
+	// Returns with the mutex held only if the count dropped to zero
+	if (!refcount_dec_and_mutex_lock(&dnbd3->refs, &dnbd3_index_mutex))
+		return;
+
+	idr_remove(&dnbd3_index_idr, dnbd3->minor);
+	mutex_unlock(&dnbd3_index_mutex);
+	dnbd3_dev_remove(dnbd3);
+}
+
+
+/*
+ * Module exit point: collect all registered devices, remove their sysfs
+ * entries and drop their references, then destroy the idr, unregister the
+ * block device major and free the device array.
+ */
+static void __exit dnbd3_exit(void)
+{
+	LIST_HEAD(pending);
+	dnbd3_device_t *victim;
+
+	printk(KERN_DEBUG "dnbd3: stopping kernel module");
+
+	// Gather every device while holding the index lock ...
+	mutex_lock(&dnbd3_index_mutex);
+	idr_for_each(&dnbd3_index_idr, &dnbd3_exit_cb, &pending);
+	mutex_unlock(&dnbd3_index_mutex);
+
+	// ... and release them one by one afterwards
+	for (;;) {
+		if (list_empty(&pending))
+			break;
+		victim = list_first_entry(&pending, struct dnbd3_device_t, list);
+		dnbd3_sysfs_exit(victim);
+		list_del_init(&victim->list);
+		if (refcount_read(&victim->refs) != 1) {
+			printk(KERN_ERR "dnbd3: possibly leaking a device\n");
+		}
+		dnbd3_put(victim);
+	}
+
+	idr_destroy(&dnbd3_index_idr);
+	unregister_blkdev(major, "dnbd3");
+
+	kfree(dnbd3_device);
+
+	printk(KERN_INFO "dnbd3: stopped kernel module");
+}
+
+
+module_init(dnbd3_init);
+module_exit(dnbd3_exit);
+
+MODULE_DESCRIPTION("Distributed Network Block Device 3");
+MODULE_LICENSE("GPL");
+
+module_param(max_devs, int, 0444);
+MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h
new file mode 100644
index 0000000..2575cd8
--- /dev/null
+++ b/src/kernel/dnbd3.h
@@ -0,0 +1,86 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2019 Frederic Robra <frederic@robra.org>
+ * Parts copyright 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+#ifndef DNBD_H_
+#define DNBD_H_
+
+#include <linux/blk-mq.h>
+#include <linux/types.h>
+#include <linux/refcount.h>
+#include <linux/blkdev.h>
+
+#define KERNEL_MODULE
+#include "types.h"
+
+// Bookkeeping for one dnbd3 server known to a device
+typedef struct
+{
+ dnbd3_host_t host; // Address, port and address type of the server
+ uint64_t rtts[4]; // Last four round trip time measurements in microsecond
+ uint16_t protocol_version; // dnbd3 protocol version of this server
+ uint8_t failures; // How many times the server was unreachable
+} dnbd3_server_t;
+
+
+// State of one dnbd3 block device
+typedef struct dnbd3_device_t {
+ int minor; // id under which the device is stored in dnbd3_index_idr
+ struct blk_mq_tag_set tag_set; // blk-mq tag set, freed in dnbd3_dev_remove()
+ struct request_queue queue; // NOTE(review): embedded by value; teardown uses disk->queue instead - confirm this member is needed
+ struct mutex config_lock;
+ refcount_t config_refs;
+ refcount_t refs; // device reference count; dnbd3_put() removes the device when it drops to zero
+ struct list_head list; // links the device into the temporary removal list built at module exit
+
+ // block
+ struct gendisk *disk;
+
+ // sysfs
+ struct kobject kobj; // registered as "net" below the disk's device kobject
+
+ // network
+ char *imgname; // image name shown via sysfs; may be NULL
+ struct socket *sock;
+ dnbd3_server_t cur_server, initial_server;
+ unsigned long cur_rtt; // shown as cur_server_rtt in sysfs
+// serialized_buffer_t payload_buffer;
+ dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers
+ int new_servers_num; // number of new alt servers that are waiting to be copied to above array
+ dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers
+ uint8_t discover, panic, disconnecting, update_available, panic_count;
+ uint8_t use_server_provided_alts;
+ uint16_t rid; // shown as rid in sysfs; presumably the image revision id - TODO confirm
+ uint32_t heartbeat_count;
+ uint64_t reported_size;
+ // server switch
+ struct socket *better_sock;
+
+} dnbd3_device_t;
+
+
+// Per-command state; presumably the blk-mq per-request payload - TODO confirm against core.c
+typedef struct dnbd3_cmd {
+ struct dnbd3_device_t *dnbd3; // device this command belongs to
+ struct mutex lock;
+ int index;
+ int cookie;
+ blk_status_t status;
+ unsigned long flags;
+ u32 cmd_cookie;
+} dnbd3_cmd;
+
+#endif /* DNBD_H_ */
diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c
new file mode 100644
index 0000000..4406072
--- /dev/null
+++ b/src/kernel/sysfs.c
@@ -0,0 +1,205 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include <linux/kobject.h>
+
+#include "sysfs.h"
+#include "utils.h"
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+/*
+ * sysfs show handler for "cur_server_addr": writes "<address>,<port>\n" for
+ * the current server into @buf (one page). Writes an empty string and
+ * returns 0 if the host type is neither IPv4 nor IPv6.
+ * Returns the number of bytes stored in @buf, excluding the terminating NUL.
+ */
+ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev)
+{
+	const dnbd3_host_t *host = &dev->cur_server.host;
+
+	/*
+	 * scnprintf() already returns the number of bytes actually stored, so
+	 * no MIN() clamp is needed (the MIN() macro evaluated snprintf() twice).
+	 */
+	if (host->type == HOST_IP4)
+		return scnprintf(buf, PAGE_SIZE, "%pI4,%d\n", host->addr, (int)ntohs(host->port));
+	if (host->type == HOST_IP6)
+		return scnprintf(buf, PAGE_SIZE, "%pI6,%d\n", host->addr, (int)ntohs(host->port));
+	*buf = '\0';
+	return 0;
+}
+
+/*
+ * sysfs show handler for "cur_server_rtt": prints dev->cur_rtt as a decimal
+ * number followed by a newline. Returns the number of bytes stored in @buf.
+ */
+ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev)
+{
+	/* scnprintf() returns the bytes actually written; the former
+	 * MIN(snprintf(...)) form ran the formatting twice */
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt);
+}
+
+/*
+ * sysfs show handler for "alt_server_num": prints how many entries of
+ * dev->alt_servers have a non-zero host type. Returns the number of bytes
+ * stored in @buf.
+ */
+ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev)
+{
+	int i, num = 0;
+
+	for (i = 0; i < NUMBER_SERVERS; ++i)
+	{
+		if (dev->alt_servers[i].host.type) ++num;
+	}
+	/* scnprintf() returns the bytes actually written; the former
+	 * MIN(snprintf(...)) form ran the formatting twice */
+	return scnprintf(buf, PAGE_SIZE, "%d\n", num);
+}
+
+/*
+ * sysfs show handler for "alt_servers": prints one line per IPv4/IPv6 entry
+ * of dev->alt_servers in the form "<address>,<port>,<avg rtt>,<failures>\n",
+ * where <avg rtt> is the mean of the four stored rtt samples. Entries with
+ * any other host type are skipped. Output stops once the page is full.
+ * Returns the number of bytes stored in @buf, excluding the terminating NUL.
+ */
+ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev)
+{
+	int i;
+	ssize_t len = 0;
+
+	for (i = 0; i < NUMBER_SERVERS; ++i)
+	{
+		const dnbd3_server_t *srv = &dev->alt_servers[i];
+		const unsigned long long avg_rtt =
+			(srv->rtts[0] + srv->rtts[1] + srv->rtts[2] + srv->rtts[3]) / 4;
+
+		/*
+		 * scnprintf() returns what was really stored, so len never runs
+		 * past the page; the former MIN(snprintf(...)) form formatted
+		 * each line twice and over-counted on truncation.
+		 */
+		if (srv->host.type == HOST_IP4)
+			len += scnprintf(buf + len, PAGE_SIZE - len, "%pI4,%d,%llu,%d\n",
+					srv->host.addr,
+					(int)ntohs(srv->host.port),
+					avg_rtt,
+					(int)srv->failures);
+		else if (srv->host.type == HOST_IP6)
+			len += scnprintf(buf + len, PAGE_SIZE - len, "%pI6,%d,%llu,%d\n",
+					srv->host.addr,
+					(int)ntohs(srv->host.port),
+					avg_rtt,
+					(int)srv->failures);
+		else
+			continue;
+		/* Only room for the terminating NUL left: page is full */
+		if ((size_t)len >= PAGE_SIZE - 1)
+			break;
+	}
+	return len;
+}
+
+/*
+ * sysfs show handler for "image_name": prints dev->imgname followed by a
+ * newline, or the literal "(null)" (without newline) if no name is set.
+ * Returns the number of bytes stored in @buf.
+ */
+ssize_t show_image_name(char *buf, dnbd3_device_t *dev)
+{
+	if (dev->imgname == NULL) return sprintf(buf, "(null)");
+	/* scnprintf() returns the bytes actually written; the former
+	 * MIN(snprintf(...)) form ran the formatting twice */
+	return scnprintf(buf, PAGE_SIZE, "%s\n", dev->imgname);
+}
+
+/*
+ * sysfs show handler for "rid": prints dev->rid as a decimal number followed
+ * by a newline. Returns the number of bytes stored in @buf.
+ */
+ssize_t show_rid(char *buf, dnbd3_device_t *dev)
+{
+	/* scnprintf() returns the bytes actually written; the former
+	 * MIN(snprintf(...)) form ran the formatting twice */
+	return scnprintf(buf, PAGE_SIZE, "%d\n", dev->rid);
+}
+
+/*
+ * sysfs show handler for "update_available": prints dev->update_available
+ * as a decimal number followed by a newline. Returns the number of bytes
+ * stored in @buf.
+ */
+ssize_t show_update_available(char *buf, dnbd3_device_t *dev)
+{
+	/* scnprintf() returns the bytes actually written; the former
+	 * MIN(snprintf(...)) form ran the formatting twice */
+	return scnprintf(buf, PAGE_SIZE, "%d\n", dev->update_available);
+}
+
+// Attribute descriptors, one per sysfs file. All are world-readable (0444)
+// and read-only: each has a show handler and no store handler.
+device_attr_t cur_server_addr =
+{
+ .attr = {.name = "cur_server_addr", .mode = 0444 },
+ .show = show_cur_server_addr,
+ .store = NULL,
+};
+
+device_attr_t cur_server_rtt =
+{
+ .attr = {.name = "cur_server_rtt", .mode = 0444 },
+ .show = show_cur_server_rtt,
+ .store = NULL,
+};
+
+device_attr_t alt_server_num =
+{
+ .attr = {.name = "alt_server_num", .mode = 0444 },
+ .show = show_alt_server_num,
+ .store = NULL,
+};
+
+device_attr_t alt_servers =
+{
+ .attr = {.name = "alt_servers", .mode = 0444 },
+ .show = show_alt_servers,
+ .store = NULL,
+};
+
+device_attr_t image_name =
+{
+ .attr = {.name = "image_name", .mode = 0444 },
+ .show = show_image_name,
+ .store = NULL,
+};
+
+device_attr_t rid =
+{
+ .attr = {.name = "rid", .mode = 0444 },
+ .show = show_rid,
+ .store = NULL,
+};
+
+device_attr_t update_available =
+{
+ .attr = {.name = "update_available", .mode = 0444 },
+ .show = show_update_available,
+ .store = NULL,
+};
+
+/*
+ * Generic sysfs show dispatcher: recovers the dnbd3 device that embeds @kobj
+ * and the device_attr_t that embeds @attr, then forwards to that attribute's
+ * show handler.
+ */
+ssize_t device_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	dnbd3_device_t *dnbd3 = container_of(kobj, dnbd3_device_t, kobj);
+	device_attr_t *entry = container_of(attr, device_attr_t, attr);
+
+	return entry->show(buf, dnbd3);
+}
+
+// NULL-terminated list of all attributes created for the device kobject
+struct attribute *device_attrs[] =
+{
+ &cur_server_addr.attr,
+ &cur_server_rtt.attr,
+ &alt_server_num.attr,
+ &alt_servers.attr,
+ &image_name.attr,
+ &rid.attr,
+ &update_available.attr,
+ NULL,
+};
+
+
+// Only reading is supported; no store operation is provided
+struct sysfs_ops device_ops =
+{
+ .show = device_show,
+};
+
+// kobject release callback of device_ktype. Frees nothing; it only clears
+// the kobject's state_initialized flag.
+// NOTE(review): presumably this allows the embedded kobject to be
+// initialized again later - confirm against the callers of dnbd3_sysfs_init().
+void release(struct kobject *kobj)
+{
+ kobj->state_initialized = 0;
+}
+
+// kobject type of the per-device sysfs directory: ties together the
+// attribute list, the show dispatcher and the release callback
+struct kobj_type device_ktype =
+{
+ .default_attrs = device_attrs,
+ .sysfs_ops = &device_ops,
+ .release = release,
+};
+
+
+/*
+ * Create the sysfs directory "net" below the device's disk and populate it
+ * with the attributes of device_ktype. Failures are only logged; the caller
+ * gets no error indication.
+ */
+void dnbd3_sysfs_init(dnbd3_device_t *dev)
+{
+	struct kobject *parent = &disk_to_dev(dev->disk)->kobj;
+	int error;
+
+	error = kobject_init_and_add(&dev->kobj, &device_ktype, parent, "%s", "net");
+	/*
+	 * NOTE(review): the kobject documentation asks for kobject_put() after
+	 * a failed kobject_init_and_add(); not added here because
+	 * dnbd3_sysfs_exit() puts the kobject unconditionally - confirm.
+	 */
+	if (error)
+		printk(KERN_ERR "dnbd3: Error initializing dnbd3 device! (error %d)\n", error);
+}
+
+/*
+ * Drop the reference on the device's sysfs kobject.
+ */
+void dnbd3_sysfs_exit(dnbd3_device_t *dev)
+{
+	struct kobject *net_kobj = &dev->kobj;
+
+	kobject_put(net_kobj);
+}
diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h
new file mode 100644
index 0000000..0a747a5
--- /dev/null
+++ b/src/kernel/sysfs.h
@@ -0,0 +1,45 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef SYSFS_H_
+#define SYSFS_H_
+
+#include "dnbd3.h"
+
+// Create the "net" sysfs directory for the given device
+void dnbd3_sysfs_init(dnbd3_device_t *dev);
+
+// Drop the reference on the device's sysfs kobject
+void dnbd3_sysfs_exit(dnbd3_device_t *dev);
+
+// A sysfs attribute whose handlers operate on a dnbd3 device
+typedef struct
+{
+ struct attribute attr; // name and mode of the sysfs file
+ ssize_t (*show)(char *, dnbd3_device_t *); // fills the buffer, returns bytes written
+ ssize_t (*store)(const char *, size_t, dnbd3_device_t *); // write handler, may be NULL
+} device_attr_t;
+
+// Same layout as device_attr_t, but the handlers take a dnbd3_server_t
+typedef struct
+{
+ struct attribute attr;
+ ssize_t (*show)(char *, dnbd3_server_t *);
+ ssize_t (*store)(const char *, size_t, dnbd3_server_t *);
+} server_attr_t;
+
+
+#endif /* SYSFS_H_ */
diff --git a/src/kernel/utils.c b/src/kernel/utils.c
new file mode 100644
index 0000000..902025f
--- /dev/null
+++ b/src/kernel/utils.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include <linux/kernel.h>
+
+#include "utils.h"
+
+/*
+ * Parse a dotted-quad IPv4 string ("a.b.c.d") and return the four octets
+ * packed in memory order (first octet in the lowest-addressed byte).
+ * Octets that could not be parsed are treated as 0; each value is truncated
+ * to eight bits.
+ */
+unsigned int inet_addr(char *str)
+{
+	/* Zero-initialised so a short or malformed string yields defined output */
+	int a = 0, b = 0, c = 0, d = 0;
+	/* The union keeps the bytes correctly aligned for reading as an integer,
+	 * instead of casting a char array to unsigned int * */
+	union {
+		unsigned char bytes[4];
+		unsigned int value;
+	} addr;
+
+	addr.value = 0;
+	sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d);
+	addr.bytes[0] = a;
+	addr.bytes[1] = b;
+	addr.bytes[2] = c;
+	addr.bytes[3] = d;
+	return addr.value;
+}
+
+/*
+ * Format the four bytes of @addr, in memory order, as a dotted-quad string
+ * into @str. The caller must supply a buffer large enough for the result;
+ * no length is checked here.
+ */
+void inet_ntoa(struct in_addr addr, char *str)
+{
+	const unsigned char *octet = (const unsigned char *) &addr;
+
+	sprintf(str, "%d.%d.%d.%d", octet[0], octet[1], octet[2], octet[3]);
+}
diff --git a/src/kernel/utils.h b/src/kernel/utils.h
new file mode 100644
index 0000000..e54b3cf
--- /dev/null
+++ b/src/kernel/utils.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef UTILS_H_
+#define UTILS_H_
+
+#include <linux/in.h>
+
+// Parse a dotted-quad IPv4 string into four octets packed in memory order
+unsigned int inet_addr(char *str);
+// Format the bytes of addr as a dotted-quad string into str (unbounded write)
+void inet_ntoa(struct in_addr addr, char *str);
+
+#endif /* UTILS_H_ */
diff --git a/src/serialize.c b/src/serialize.c
new file mode 100644
index 0000000..0bc0dcd
--- /dev/null
+++ b/src/serialize.c
@@ -0,0 +1,84 @@
+#include "serialize.h"
+#include "types.h"
+
+
+/*
+ * Prepare the buffer for reading data_len bytes of payload: the read
+ * position is rewound to the start and the end marker is placed after the
+ * payload, capped at MAX_PAYLOAD.
+ */
+void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len)
+{
+	const size_t readable = MIN(MAX_PAYLOAD, data_len);
+
+	buffer->buffer_pointer = buffer->buffer;
+	buffer->buffer_end = buffer->buffer + readable;
+}
+
+/*
+ * Prepare the buffer for writing: the write position is rewound to the
+ * start and the full MAX_PAYLOAD bytes become available.
+ */
+void serializer_reset_write(serialized_buffer_t *buffer)
+{
+	char * const base = buffer->buffer;
+
+	buffer->buffer_pointer = base;
+	buffer->buffer_end = base + MAX_PAYLOAD;
+}
+
+/*
+ * Read one byte and advance the read position.
+ * Returns 0 without advancing if no byte is left.
+ */
+uint8_t serializer_get_uint8(serialized_buffer_t *buffer)
+{
+	char *src = buffer->buffer_pointer;
+
+	if (src + 1 > buffer->buffer_end) return 0;
+	buffer->buffer_pointer = src + 1;
+	return (uint8_t)*src;
+}
+
+/*
+ * Read a 16 bit value, convert it with net_order_16() and advance the read
+ * position. Returns 0 without advancing if fewer than two bytes are left.
+ */
+uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
+{
+	char *src = buffer->buffer_pointer;
+	uint16_t raw;
+
+	if (src + sizeof(raw) > buffer->buffer_end) return 0;
+	// memcpy because the read position need not be aligned
+	memcpy(&raw, src, sizeof(raw));
+	buffer->buffer_pointer = src + sizeof(raw);
+	return net_order_16(raw);
+}
+
+/*
+ * Read a 64 bit value, convert it with net_order_64() and advance the read
+ * position. Returns 0 without advancing if fewer than eight bytes are left.
+ */
+uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
+{
+	char *src = buffer->buffer_pointer;
+	uint64_t raw;
+
+	if (src + sizeof(raw) > buffer->buffer_end) return 0;
+	// memcpy because the read position need not be aligned
+	memcpy(&raw, src, sizeof(raw));
+	buffer->buffer_pointer = src + sizeof(raw);
+	return net_order_64(raw);
+}
+
+/*
+ * Read a NUL-terminated string. Returns a pointer into the buffer (no copy)
+ * and moves the read position past the terminator. Returns NULL without
+ * advancing if no data is left or if the string is not terminated before
+ * the end of the readable data.
+ */
+char *serializer_get_string(serialized_buffer_t *buffer)
+{
+	char *start = buffer->buffer_pointer;
+	char *ptr = start;
+
+	if (ptr >= buffer->buffer_end) return NULL;
+	while (ptr < buffer->buffer_end && *ptr) ++ptr;
+	// Reaching buffer_end means no terminator was found. Do not dereference
+	// ptr here: it points one past the readable data, and a zero byte there
+	// would make a corrupted/malicious packet look like a valid string.
+	if (ptr >= buffer->buffer_end) return NULL;
+	buffer->buffer_pointer = ptr + 1;
+	return start;
+}
+
+/*
+ * Append one byte and advance the write position.
+ * Silently does nothing if the buffer is full.
+ */
+void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value)
+{
+	char *dest = buffer->buffer_pointer;
+
+	if (dest + 1 > buffer->buffer_end) return;
+	*dest = (char)value;
+	buffer->buffer_pointer = dest + 1;
+}
+
+/*
+ * Append a 16 bit value after converting it with net_order_16().
+ * Silently does nothing if fewer than two bytes of space are left.
+ */
+void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
+{
+	char *dest = buffer->buffer_pointer;
+	uint16_t wire;
+
+	if (dest + sizeof(wire) > buffer->buffer_end) return;
+	wire = net_order_16(value);
+	memcpy(dest, &wire, sizeof(wire));
+	buffer->buffer_pointer = dest + sizeof(wire);
+}
+
+/*
+ * Append a 64 bit value after converting it with net_order_64().
+ * Silently does nothing if fewer than eight bytes of space are left.
+ */
+void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
+{
+	char *dest = buffer->buffer_pointer;
+	uint64_t wire;
+
+	if (dest + sizeof(wire) > buffer->buffer_end) return;
+	wire = net_order_64(value);
+	memcpy(dest, &wire, sizeof(wire));
+	buffer->buffer_pointer = dest + sizeof(wire);
+}
+
+/*
+ * Append a string including its terminating NUL byte.
+ * Silently does nothing if the string does not fit into the remaining space.
+ */
+void serializer_put_string(serialized_buffer_t *buffer, const char *value)
+{
+	char *dest = buffer->buffer_pointer;
+	const size_t bytes = strlen(value) + 1; // terminator is copied too
+
+	if (dest + bytes > buffer->buffer_end) return;
+	memcpy(dest, value, bytes);
+	buffer->buffer_pointer = dest + bytes;
+}
+
+/*
+ * Number of bytes between the start of the buffer and the current position.
+ */
+uint32_t serializer_get_written_length(serialized_buffer_t *buffer)
+{
+	const char *base = buffer->buffer;
+	const char *pos = buffer->buffer_pointer;
+
+	return (uint32_t)( pos - base );
+}
diff --git a/src/serialize.h b/src/serialize.h
new file mode 100644
index 0000000..1b73531
--- /dev/null
+++ b/src/serialize.h
@@ -0,0 +1,40 @@
+#ifndef SERIALIZER_H_
+#define SERIALIZER_H_
+
+// Careful with includes - this is used in kernel module too
+#include "config.h"
+
+// Fixed-size payload buffer with a cursor, used for both reading and writing
+typedef struct
+{
+ char buffer[MAX_PAYLOAD]; // This MUST be the first member or send_reply() will blow up
+ char *buffer_end; // One past the last usable byte (set by the reset functions)
+ char *buffer_pointer; // Current read/write position
+} serialized_buffer_t;
+
+// Rewind for reading data_len bytes of payload (capped at MAX_PAYLOAD)
+void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len);
+
+// Rewind for writing; the whole buffer becomes available
+void serializer_reset_write(serialized_buffer_t *buffer);
+
+// Bytes between the start of the buffer and the current position
+uint32_t serializer_get_written_length(serialized_buffer_t *buffer);
+
+// Readers: the integer getters return 0 if not enough data is left
+
+uint8_t serializer_get_uint8(serialized_buffer_t *buffer);
+
+uint16_t serializer_get_uint16(serialized_buffer_t *buffer);
+
+uint64_t serializer_get_uint64(serialized_buffer_t *buffer);
+
+// Returns a pointer into the buffer, or NULL if no terminated string is available
+char *serializer_get_string(serialized_buffer_t *buffer);
+
+// Writers: each silently does nothing if the value does not fit
+
+void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value);
+
+void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value);
+
+void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value);
+
+void serializer_put_string(serialized_buffer_t *buffer, const char *value);
+
+#endif
diff --git a/src/server/altservers.c b/src/server/altservers.c
new file mode 100644
index 0000000..b91ceab
--- /dev/null
+++ b/src/server/altservers.c
@@ -0,0 +1,612 @@
+#include "altservers.h"
+#include "locks.h"
+#include "helper.h"
+#include "image.h"
+#include "fileutil.h"
+#include "../shared/protocol.h"
+#include "../shared/timing.h"
+#include "../serverconfig.h"
+#include <assert.h>
+#include <inttypes.h>
+#include <jansson.h>
+
+// Logging helpers for code that has a local variable named `image` in scope:
+// LOG appends " (<image name>:<rid>)" to the message and needs at least one
+// variadic argument. LOG_GOTO/ERROR_GOTO log and then jump to the given label.
+// NOTE(review): LOG_GOTO ends in "while (0);" - the trailing semicolon makes
+// the macro unsafe in an unbraced if/else; confirm all uses before removing it.
+#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid)
+#define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
+#define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
+
+// Queue of uplinks waiting for an RTT measurement; NULL marks a free slot
+static dnbd3_connection_t *pending[SERVER_MAX_PENDING_ALT_CHECKS];
+static pthread_spinlock_t pendingLockWrite; // Lock for adding something to pending. (NULL -> nonNULL)
+static pthread_mutex_t pendingLockConsume = PTHREAD_MUTEX_INITIALIZER; // Lock for removing something (nonNULL -> NULL)
+static dnbd3_signal_t* runSignal = NULL; // Used to wake up the altservers thread
+
+// List of known alt servers; the first numAltServers entries are in use,
+// guarded by altServersLock
+static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
+static int numAltServers = 0;
+static pthread_spinlock_t altServersLock;
+
+// Thread running altservers_main
+static pthread_t altThread;
+
+static void *altservers_main(void *data);
+static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt);
+
+/**
+ * Initialize this module: seed rand(), set up both spinlocks and the wake-up
+ * signal, clear the alt server list, start the altservers thread and reset
+ * the queue of pending RTT checks.
+ * Terminates the process if the signal or the thread cannot be created.
+ */
+void altservers_init()
+{
+ srand( (unsigned int)time( NULL ) );
+ // Init spinlock
+ spin_init( &pendingLockWrite, PTHREAD_PROCESS_PRIVATE );
+ spin_init( &altServersLock, PTHREAD_PROCESS_PRIVATE );
+ // Init signal
+ runSignal = signal_new();
+ if ( runSignal == NULL ) {
+ logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." );
+ exit( EXIT_FAILURE );
+ }
+ memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) );
+ if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) {
+ logadd( LOG_ERROR, "Could not start altservers connector thread" );
+ exit( EXIT_FAILURE );
+ }
+ // Init waiting links queue -- this is currently a global static array so
+ // it will already be zero, but in case we refactor later do it explicitly
+ // while also holding the write lock so thread sanitizer is happy
+ // (note that the worker thread has already been started at this point)
+ spin_lock( &pendingLockWrite );
+ for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
+ pending[i] = NULL;
+ }
+ spin_unlock( &pendingLockWrite );
+}
+
+/**
+ * Wake the altservers thread up and wait for it to terminate.
+ * Does nothing if the module was never initialized (no signal object).
+ */
+void altservers_shutdown()
+{
+	if ( runSignal != NULL ) {
+		signal_call( runSignal ); // Wake altservers thread up
+		thread_join( altThread, NULL );
+	}
+}
+
+/**
+ * Callback for file_loadLineBased: handles one line of the alt-servers file.
+ * argv[0] is the server address, optionally prefixed with '-' (private
+ * server) and/or '+' (client-only server); argv[1], if present, is a
+ * comment. Lines starting with '#' are ignored.
+ * data points to an int counter that is incremented for every server that
+ * was actually added.
+ */
+static void addalt(int argc, char **argv, void *data)
+{
+	char *shost;
+	dnbd3_host_t host;
+	bool isPrivate = false;
+	bool isClientOnly = false;
+	// Use a local for the optional comment instead of storing into argv[1],
+	// which the caller might not have allocated when argc == 1
+	const char *comment = ( argc > 1 ) ? argv[1] : "";
+	if ( argv[0][0] == '#' ) return;
+	for (shost = argv[0]; *shost != '\0'; ) { // Trim left and scan for "-" prefix
+		if ( *shost == '-' ) isPrivate = true;
+		else if ( *shost == '+' ) isClientOnly = true;
+		else if ( *shost != ' ' && *shost != '\t' ) break;
+		shost++;
+	}
+	if ( !parse_address( shost, &host ) ) {
+		logadd( LOG_WARNING, "Invalid entry in alt-servers file ignored: '%s'", shost );
+		return;
+	}
+	if ( altservers_add( &host, comment, isPrivate, isClientOnly ) ) {
+		(*(int*)data)++;
+	}
+}
+
+/**
+ * Load "<_configDir>/alt-servers" and add every valid entry to the list of
+ * alt servers.
+ * Returns the number of servers added, or -1 if the file name could not be
+ * allocated.
+ */
+int altservers_load()
+{
+	char *path = NULL;
+	int added = 0;
+	if ( asprintf( &path, "%s/%s", _configDir, "alt-servers" ) == -1 ) return -1;
+	// addalt() is invoked per line and bumps the counter for each server added
+	file_loadLineBased( path, 1, 2, &addalt, (void*)&added );
+	free( path );
+	logadd( LOG_DEBUG1, "Added %d alt servers\n", added );
+	return added;
+}
+
+/**
+ * Add a server to the list of alt servers.
+ * An unused slot (host type 0) within the used part of the list is reused
+ * if there is one, otherwise the list grows by one entry.
+ * Returns false if a server with the same address and port is already
+ * known or if the list is full, true if the server was added.
+ * If comment is NULL the comment field of the slot is left untouched.
+ */
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly)
+{
+	int slot = -1;
+	spin_lock( &altServersLock );
+	for (int idx = 0; idx < numAltServers; ++idx) {
+		dnbd3_alt_server_t * const entry = &altServers[idx];
+		if ( isSameAddressPort( &entry->host, host ) ) {
+			// Duplicate, nothing to do
+			spin_unlock( &altServersLock );
+			return false;
+		}
+		if ( slot == -1 && entry->host.type == 0 ) {
+			slot = idx; // Remember the first unused slot
+		}
+	}
+	if ( slot == -1 && numAltServers >= SERVER_MAX_ALTS ) {
+		logadd( LOG_WARNING, "Cannot add another alt server, maximum of %d already reached.", (int)SERVER_MAX_ALTS );
+		spin_unlock( &altServersLock );
+		return false;
+	}
+	if ( slot == -1 ) {
+		slot = numAltServers++; // Append at the end
+	}
+	dnbd3_alt_server_t * const target = &altServers[slot];
+	target->host = *host;
+	target->isPrivate = isPrivate;
+	target->isClientOnly = isClientOnly;
+	if ( comment != NULL ) snprintf( target->comment, COMMENT_LENGTH, "%s", comment );
+	spin_unlock( &altServersLock );
+	return true;
+}
+
+/**
+ * Queue the given uplink for an RTT measurement and wake the altservers
+ * thread. Does nothing if the uplink is already queued with a measurement
+ * in progress; logs a warning and drops the request if all slots are taken.
+ * ONLY called from the passed uplink's main thread
+ */
+void altservers_findUplink(dnbd3_connection_t *uplink)
+{
+ int i;
+ // if betterFd != -1 it means the uplink is supposed to switch to another
+ // server. As this function here is called by the uplink thread, it can
+ // never be that the uplink is supposed to switch, but instead calls
+ // this function.
+ assert( uplink->betterFd == -1 );
+ spin_lock( &pendingLockWrite );
+ // it is however possible that an RTT measurement is currently in progress,
+ // so check for that case and do nothing if one is in progress
+ if ( uplink->rttTestResult == RTT_INPROGRESS ) {
+ for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
+ if ( pending[i] != uplink ) continue;
+ // Yep, measuring right now
+ spin_unlock( &pendingLockWrite );
+ return;
+ }
+ }
+ // Find free slot for measurement
+ for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
+ if ( pending[i] != NULL ) continue;
+ pending[i] = uplink;
+ uplink->rttTestResult = RTT_INPROGRESS;
+ spin_unlock( &pendingLockWrite );
+ signal_call( runSignal ); // Wake altservers thread up
+ return;
+ }
+ // End of loop - no free slot
+ spin_unlock( &pendingLockWrite );
+ logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." );
+}
+
+/**
+ * Called when the given uplink is going away: every queue slot that still
+ * refers to it is cleared and its RTT test result is set to
+ * RTT_NOT_REACHABLE. Both pending locks are held while doing so.
+ */
+void altservers_removeUplink(dnbd3_connection_t *uplink)
+{
+	int slot = 0;
+	pthread_mutex_lock( &pendingLockConsume );
+	spin_lock( &pendingLockWrite );
+	while ( slot < SERVER_MAX_PENDING_ALT_CHECKS ) {
+		if ( pending[slot] == uplink ) {
+			uplink->rttTestResult = RTT_NOT_REACHABLE;
+			pending[slot] = NULL;
+		}
+		++slot;
+	}
+	spin_unlock( &pendingLockWrite );
+	pthread_mutex_unlock( &pendingLockConsume );
+}
+
+/**
+ * Get <size> known (working) alt servers, ordered by network closeness
+ * (by finding the smallest possible subnet)
+ * Private servers are excluded, so this is what you want to call to
+ * get a list of servers you can tell a client about
+ * host is the client's address, output receives the resulting entries
+ * (best score first). Returns the number of entries written to output.
+ */
+int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size)
+{
+ // NOTE(review): numAltServers is read here without holding altServersLock
+ if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0;
+ int i, j;
+ int count = 0;
+ int scores[size]; // score of output[n]; sized with the caller's size, before it is clamped below
+ int score;
+ spin_lock( &altServersLock );
+ if ( size > numAltServers ) size = numAltServers;
+ for (i = 0; i < numAltServers; ++i) {
+ if ( altServers[i].host.type == 0 ) continue; // Slot is empty
+ if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers
+ // Score: closeness to the client minus the number of failures
+ if ( host->type == altServers[i].host.type ) {
+ score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails;
+ } else {
+ score = -( altServers[i].numFails + 128 ); // Wrong address family
+ }
+ if ( count == 0 ) {
+ // Trivial - this is the first entry
+ output[0].host = altServers[i].host;
+ output[0].failures = 0;
+ scores[0] = score;
+ count++;
+ } else {
+ // Other entries already exist, insert in proper position
+ for (j = 0; j < size; ++j) {
+ // Skip entries that score at least as high as the new one
+ if ( j < count && score <= scores[j] ) continue;
+ if ( j > count ) break; // Should never happen but just in case...
+ if ( j < count && j + 1 < size ) {
+ // Check if we're in the middle and need to move other entries...
+ memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) );
+ memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) );
+ }
+ if ( count < size ) {
+ count++;
+ }
+ output[j].host = altServers[i].host;
+ output[j].failures = 0;
+ scores[j] = score;
+ break;
+ }
+ }
+ }
+ spin_unlock( &altServersLock );
+ return count;
+}
+
+/**
+ * Get <size> alt servers. If there are more alt servers than
+ * requested, random servers will be picked.
+ * This function is suited for finding uplink servers as
+ * it includes private servers and ignores any "client only" servers
+ * If emergency is set, servers are included regardless of how often they
+ * failed, and their fail counters are left alone.
+ * Returns the number of hosts written to output.
+ */
+int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency)
+{
+ if ( size <= 0 ) return 0;
+ int count = 0, i;
+ ticks now;
+ timing_get( &now );
+ spin_lock( &altServersLock );
+ // Flip first server in list with a random one every time this is called
+ if ( numAltServers > 1 ) {
+ const dnbd3_alt_server_t tmp = altServers[0];
+ do {
+ i = rand() % numAltServers;
+ } while ( i == 0 );
+ altServers[0] = altServers[i];
+ altServers[i] = tmp;
+ }
+ // We iterate over the list twice. First run adds servers with 0 failures only,
+ // second one also considers those that failed (not too many times)
+ if ( size > numAltServers ) size = numAltServers;
+ for (i = 0; i < numAltServers * 2; ++i) {
+ dnbd3_alt_server_t *srv = &altServers[i % numAltServers];
+ if ( srv->host.type == 0 ) continue; // Slot is empty
+ if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore!
+ if ( srv->isClientOnly ) continue;
+ bool first = ( i < numAltServers );
+ if ( first ) {
+ if ( srv->numFails > 0 ) continue;
+ } else {
+ if ( srv->numFails == 0 ) continue; // Already added in first iteration
+ if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row
+ && timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore!
+ if ( !emergency ) srv->numFails--;
+ }
+ // server seems ok, include in output (its fail counter was already
+ // decreased above if this is the second pass and not an emergency)
+ output[count++] = srv->host;
+ if ( count >= size ) break;
+ }
+ spin_unlock( &altServersLock );
+ return count;
+}
+
+/**
+ * Build a JSON array describing all alt server slots currently in use.
+ * Each element is an object with the keys comment, host, rtt (array of the
+ * stored RTT probes, starting with the slot after rttIndex), isPrivate,
+ * isClientOnly and numFails.
+ * The list is copied while holding altServersLock; the JSON is built from
+ * the copy after the lock has been released.
+ * The caller owns the returned reference.
+ */
+json_t* altservers_toJson()
+{
+	json_t *list = json_array();
+	char host[100];
+
+	spin_lock( &altServersLock );
+	const int count = numAltServers;
+	// A variable length array must have at least one element, so reserve
+	// one slot even if the list is empty; only `count` entries are copied
+	dnbd3_alt_server_t src[count > 0 ? count : 1];
+	memcpy( src, altServers, sizeof(dnbd3_alt_server_t) * (size_t)count );
+	spin_unlock( &altServersLock );
+	for (int i = 0; i < count; ++i) {
+		json_t *rtts = json_array();
+		for (int j = 0; j < SERVER_RTT_PROBES; ++j) {
+			json_array_append_new( rtts, json_integer( src[i].rtt[ (j + src[i].rttIndex + 1) % SERVER_RTT_PROBES ] ) );
+		}
+		sock_printHost( &src[i].host, host, sizeof(host) );
+		json_t *server = json_pack( "{ss,ss,so,sb,sb,si}",
+			"comment", src[i].comment,
+			"host", host,
+			"rtt", rtts,
+			"isPrivate", (int)src[i].isPrivate,
+			"isClientOnly", (int)src[i].isClientOnly,
+			"numFails", src[i].numFails
+		);
+		json_array_append_new( list, server );
+	}
+	return list;
+}
+
+/**
+ * Update rtt history of given server - returns the new average for that server
+ * If the server is not in the list, the passed rtt is returned unchanged.
+ * A successful update also lowers the server's fail counter by one.
+ */
+static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt)
+{
+ unsigned int avg = rtt;
+ int i;
+ spin_lock( &altServersLock );
+ for (i = 0; i < numAltServers; ++i) {
+ if ( !isSameAddressPort( host, &altServers[i].host ) ) continue;
+ // Store the sample in the next slot of the history
+ // NOTE(review): rttIndex is only ever incremented, never wrapped - confirm its type cannot overflow into a negative index
+ altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt;
+#if SERVER_RTT_PROBES == 5
+ avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2]
+ + altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES;
+#else
+#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES
+ avg = 0;
+ for (int j = 0; j < SERVER_RTT_PROBES; ++j) {
+ avg += altServers[i].rtt[j];
+ }
+ avg /= SERVER_RTT_PROBES;
+#endif
+ // If we got a new rtt value, server must be working
+ if ( altServers[i].numFails > 0 ) {
+ altServers[i].numFails--;
+ }
+ break;
+ }
+ spin_unlock( &altServersLock );
+ return avg;
+}
+
+/**
+ * Measure how similar two addresses are by counting how many leading
+ * half-bytes (groups of 4 bits) are identical, starting at the first
+ * address byte. 4 address bytes are compared for HOST_IP4, 16 otherwise.
+ * Return: number of matching leading half-bytes - higher means closer.
+ * -1 if either host is NULL or the address types differ.
+ */
+int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2)
+{
+	if ( host1 == NULL || host2 == NULL || host1->type != host2->type ) return -1;
+	const int nibbles = ( host1->type == HOST_IP4 ? 4 : 16 ) * 2;
+	int matching = 0;
+	while ( matching < nibbles ) {
+		// Even positions are the high half of a byte, odd ones the low half
+		const int mask = ( matching % 2 == 0 ) ? 0xf0 : 0x0f;
+		const int byte = matching / 2;
+		if ( (host1->addr[byte] & mask) != (host2->addr[byte] & mask) ) break;
+		++matching;
+	}
+	return matching;
+}
+
+/**
+ * Called if an uplink server failed during normal uplink operation. This unit keeps
+ * track of how often servers fail, and consider them disabled for some time if they
+ * fail too many times.
+ * Does nothing if the host is not in the list or if its last recorded
+ * failure is too recent.
+ */
+void altservers_serverFailed(const dnbd3_host_t * const host)
+{
+ int i;
+ int foundIndex = -1, lastOk = -1;
+ ticks now;
+ timing_get( &now );
+ spin_lock( &altServersLock );
+ for (i = 0; i < numAltServers; ++i) {
+ if ( foundIndex == -1 ) {
+ // Looking for the failed server in list
+ if ( isSameAddressPort( host, &altServers[i].host ) ) {
+ foundIndex = i;
+ }
+ } else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) {
+ // Last used entry without failures that comes after the failed server
+ lastOk = i;
+ }
+ }
+ // Do only increase counter if last fail was not too recent. This is
+ // to prevent the counter from increasing rapidly if many images use the
+ // same uplink. If there's a network hiccup, all uplinks will call this
+ // function and would increase the counter too quickly, disabling the server.
+ if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) {
+ altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE;
+ altServers[foundIndex].lastFail = now;
+ if ( lastOk != -1 ) {
+ // Make sure non-working servers are put at the end of the list, so they're less likely
+ // to get picked when testing servers for uplink connections.
+ const dnbd3_alt_server_t tmp = altServers[foundIndex];
+ altServers[foundIndex] = altServers[lastOk];
+ altServers[lastOk] = tmp;
+ }
+ }
+ spin_unlock( &altServersLock );
+}
+/**
+ * Mainloop of this module. It will wait for requests by uplinks to find a
+ * suitable uplink server for them. If found, it will tell the uplink about
+ * the best server found. Currently the RTT history is kept per server and
+ * not per uplink, so if many images use the same uplink server, the history
+ * will update quite quickly. Needs to be improved some time, ie. by only
+ * updating the rtt if the last update was at least X seconds ago.
+ */
+static void *altservers_main(void *data UNUSED)
+{
+ // Upper bound on alt servers fetched per uplink. servers[] below has one extra
+ // slot so the uplink's current server can be appended if it is not in the list.
+ const int ALTS = 4;
+ int ret, itLink, itAlt, numAlts;
+ bool found;
+ char buffer[DNBD3_BLOCK_SIZE ];
+ dnbd3_reply_t reply;
+ dnbd3_host_t servers[ALTS + 1];
+ serialized_buffer_t serialized;
+ struct timespec start, end;
+ ticks nextCloseUnusedFd;
+
+ setThreadName( "altserver-check" );
+ blockNoncriticalSignals();
+ timing_gets( &nextCloseUnusedFd, 900 );
+ // LOOP
+ while ( !_shutdown ) {
+ // Wait 5 seconds max.
+ ret = signal_wait( runSignal, 5000 );
+ if ( _shutdown ) goto cleanup;
+ if ( ret == SIGNAL_ERROR ) {
+ if ( errno == EAGAIN || errno == EINTR ) continue;
+ logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno );
+ usleep( 100000 );
+ }
+ // Work your way through the queue
+ for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
+ spin_lock( &pendingLockWrite );
+ if ( pending[itLink] == NULL ) {
+ spin_unlock( &pendingLockWrite );
+ continue; // Check once before locking, as a mutex is expensive
+ }
+ spin_unlock( &pendingLockWrite );
+ // pendingLockConsume is held for the whole check of this slot; every path that
+ // leaves this iteration below unlocks it again.
+ pthread_mutex_lock( &pendingLockConsume );
+ spin_lock( &pendingLockWrite );
+ dnbd3_connection_t * const uplink = pending[itLink];
+ spin_unlock( &pendingLockWrite );
+ if ( uplink == NULL ) { // Check again after locking
+ pthread_mutex_unlock( &pendingLockConsume );
+ continue;
+ }
+ dnbd3_image_t * const image = image_lock( uplink->image );
+ if ( image == NULL ) { // Check again after locking
+ uplink->rttTestResult = RTT_NOT_REACHABLE;
+ spin_lock( &pendingLockWrite );
+ pending[itLink] = NULL;
+ spin_unlock( &pendingLockWrite );
+ pthread_mutex_unlock( &pendingLockConsume );
+ logadd( LOG_DEBUG1, "Image has gone away that was queued for RTT measurement" );
+ continue;
+ }
+ LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
+ assert( uplink->rttTestResult == RTT_INPROGRESS );
+ // Now get 4 alt servers
+ numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 );
+ if ( uplink->fd != -1 ) {
+ // Add current server if not already in list
+ found = false;
+ for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+ if ( !isSameAddressPort( &uplink->currentServer, &servers[itAlt] ) ) continue;
+ found = true;
+ break;
+ }
+ if ( !found ) servers[numAlts++] = uplink->currentServer;
+ }
+ // Test them all
+ int bestSock = -1;
+ int bestIndex = -1;
+ int bestProtocolVersion = -1;
+ unsigned long bestRtt = RTT_UNREACHABLE;
+ unsigned long currentRtt = RTT_UNREACHABLE;
+ for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+ usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...)
+ // Connect
+ // The start timestamp is taken before connecting and the end timestamp after the
+ // first block has been received, so the measured time covers connect, image
+ // selection and the transfer of one block.
+ clock_gettime( BEST_CLOCK_SOURCE, &start );
+ int sock = sock_connect( &servers[itAlt], 750, 1000 );
+ if ( sock < 0 ) continue;
+ // Select image ++++++++++++++++++++++++++++++
+ if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
+ goto server_failed;
+ }
+ // See if selecting the image succeeded ++++++++++++++++++++++++++++++
+ uint16_t protocolVersion, rid;
+ uint64_t imageSize;
+ char *name;
+ if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
+ // This label only closes the socket; altservers_serverFailed() is not called
+ goto server_image_not_available;
+ }
+ if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed;
+ if ( name == NULL || strcmp( name, image->name ) != 0 ) {
+ ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name );
+ }
+ if ( rid != image->rid ) {
+ ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid );
+ }
+ if ( imageSize != image->virtualFilesize ) {
+ ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
+ }
+ // Request first block (NOT random!) ++++++++++++++++++++++++++++++
+ if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+ LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink );
+ }
+ // See if requesting the block succeeded ++++++++++++++++++++++
+ if ( !dnbd3_get_reply( sock, &reply ) ) {
+ LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink );
+ }
+ // check reply header
+ if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+ ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
+ }
+ if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+ ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink );
+ }
+ clock_gettime( BEST_CLOCK_SOURCE, &end );
+ // Measurement done - everything fine so far
+ spin_lock( &uplink->rttLock );
+ const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
+ // Penaltize rtt if this was a cycle; this will treat this server with lower priority
+ // in the near future too, so we prevent alternating between two servers that are both
+ // part of a cycle and have the lowest latency.
+ const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000
+ + (end.tv_nsec - start.tv_nsec) / 1000
+ + ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
+ unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
+ // If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
+ if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
+ spin_unlock( &uplink->rttLock );
+ // Only the socket of the best candidate so far stays open (bestSock); all other
+ // probe sockets are closed right away.
+ if ( uplink->fd != -1 && isCurrent ) {
+ // Was measuring current server
+ currentRtt = avg;
+ close( sock );
+ } else if ( avg < bestRtt ) {
+ // Was another server, update "best"
+ if ( bestSock != -1 ) close( bestSock );
+ bestSock = sock;
+ bestRtt = avg;
+ bestIndex = itAlt;
+ bestProtocolVersion = protocolVersion;
+ } else {
+ // Was too slow, ignore
+ close( sock );
+ }
+ // We're done, call continue
+ continue;
+ // Jump here if anything went wrong
+ // This will cleanup and continue
+ server_failed: ;
+ altservers_serverFailed( &servers[itAlt] );
+ server_image_not_available: ;
+ close( sock );
+ }
+ // Done testing all servers. See if we should switch
+ if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
+ // yep
+ if ( currentRtt > 10000000 || uplink->fd == -1 ) {
+ LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
+ } else {
+ LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
+ }
+ sock_setTimeout( bestSock, _uplinkTimeout );
+ // Hand the already connected socket over to the uplink and wake it up;
+ // bestSock is intentionally not closed in this branch.
+ spin_lock( &uplink->rttLock );
+ uplink->betterFd = bestSock;
+ uplink->betterServer = servers[bestIndex];
+ uplink->betterVersion = bestProtocolVersion;
+ uplink->rttTestResult = RTT_DOCHANGE;
+ spin_unlock( &uplink->rttLock );
+ signal_call( uplink->signal );
+ } else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) {
+ // No server was reachable
+ spin_lock( &uplink->rttLock );
+ uplink->rttTestResult = RTT_NOT_REACHABLE;
+ spin_unlock( &uplink->rttLock );
+ } else {
+ // nope
+ if ( bestSock != -1 ) close( bestSock );
+ spin_lock( &uplink->rttLock );
+ uplink->rttTestResult = RTT_DONTCHANGE;
+ uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
+ spin_unlock( &uplink->rttLock );
+ if ( !image->working ) {
+ image->working = true;
+ LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink );
+ }
+ }
+ image_release( image );
+ // end of loop over all pending uplinks
+ spin_lock( &pendingLockWrite );
+ pending[itLink] = NULL;
+ spin_unlock( &pendingLockWrite );
+ pthread_mutex_unlock( &pendingLockConsume );
+ }
+ // Save cache maps of all images if applicable
+ declare_now;
+ // TODO: Has nothing to do with alt servers really, maybe move somewhere else?
+ if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
+ timing_gets( &nextCloseUnusedFd, 900 );
+ image_closeUnusedFd();
+ }
+ }
+ cleanup: ;
+ // Reached on shutdown: release the wakeup signal of this thread
+ if ( runSignal != NULL ) signal_close( runSignal );
+ runSignal = NULL;
+ return NULL ;
+}
+
diff --git a/src/server/altservers.h b/src/server/altservers.h
new file mode 100644
index 0000000..7b7b46d
--- /dev/null
+++ b/src/server/altservers.h
@@ -0,0 +1,30 @@
+#ifndef _ALTSERVERS_H_
+#define _ALTSERVERS_H_
+
+#include "globals.h"
+
+// Forward declaration so this header does not need the JSON library's header
+struct json_t;
+
+void altservers_init();
+
+void altservers_shutdown();
+
+int altservers_load();
+
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
+
+void altservers_findUplink(dnbd3_connection_t *uplink);
+
+void altservers_removeUplink(dnbd3_connection_t *uplink);
+
+int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
+
+// The alt-server check thread uses the return value as the number of entries
+// written to output, and passes (uplink->fd == -1) as emergency.
+int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency);
+
+int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
+
+// Called by the alt-server check thread when probing the given host failed
+void altservers_serverFailed(const dnbd3_host_t * const host);
+
+struct json_t* altservers_toJson();
+
+#endif /* _ALTSERVERS_H_ */
diff --git a/src/server/fileutil.c b/src/server/fileutil.c
new file mode 100644
index 0000000..336ab68
--- /dev/null
+++ b/src/server/fileutil.c
@@ -0,0 +1,128 @@
+#include "fileutil.h"
+#include "helper.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+
+/**
+ * Check whether a file can be read by this process.
+ * The check is done by actually opening the file read-only.
+ *
+ * @param file path of the file to probe
+ * @return true if the file could be opened for reading, false otherwise
+ */
+bool file_isReadable(char *file)
+{
+	const int handle = open( file, O_RDONLY );
+	const bool readable = ( handle >= 0 );
+	if ( readable ) {
+		// Only the outcome of open() matters, release the descriptor again
+		close( handle );
+	}
+	return readable;
+}
+
+/**
+ * Check whether a file can be written by this process.
+ * An existing file is probed by opening it write-only. If that fails,
+ * the function tries to create the file (mode 0600) and removes it
+ * again afterwards.
+ *
+ * @param file path of the file to probe
+ * @return true if the file is writable or could be created, false otherwise
+ */
+bool file_isWritable(char *file)
+{
+	int handle = open( file, O_WRONLY );
+	if ( handle < 0 ) {
+		// Could not open an existing file; see whether we are able to create it
+		handle = open( file, O_WRONLY | O_CREAT, 0600 );
+		if ( handle < 0 ) return false;
+		close( handle );
+		// The file was opened with O_CREAT only for this probe, remove it again
+		remove( file );
+		return true;
+	}
+	close( handle );
+	return true;
+}
+
+/**
+ * Create a directory including all missing parent directories,
+ * similar to "mkdir -p". Directories are created with mode 0755;
+ * components that already exist are not treated as an error.
+ *
+ * @param path directory to create, must not be NULL
+ * @return true if every component exists afterwards (or path is empty),
+ *         false as soon as creating one component fails
+ */
+bool mkdir_p(const char* path)
+{
+	assert( path != NULL );
+	const size_t len = strlen( path );
+	if ( len == 0 ) return true;
+	// Work on a private copy, as the path gets cut at every slash temporarily
+	char work[len + 1];
+	memcpy( work, path, len + 1 );
+	for ( char *sep = strchr( work, '/' ); sep != NULL; sep = strchr( sep + 1, '/' ) ) {
+		*sep = '\0';
+		// An empty prefix happens for absolute paths (leading slash), skip it
+		const bool failed = ( work[0] != '\0' && mkdir( work, 0755 ) != 0 && errno != EEXIST );
+		*sep = '/';
+		if ( failed ) return false;
+	}
+	// Finally the complete path itself
+	return mkdir( work, 0755 ) == 0 || errno == EEXIST;
+}
+
+/**
+ * Allocate disk space for a range of a file.
+ * Uses fallocate() on Linux and posix_fallocate() on FreeBSD; on any
+ * other platform nothing is done and false is returned.
+ *
+ * @param fd file descriptor of the file
+ * @param offset start of the range
+ * @param size length of the range
+ * @return true if the allocation call succeeded, false otherwise
+ */
+bool file_alloc(int fd, uint64_t offset, uint64_t size)
+{
+	bool allocated = false;
+#ifdef __linux__
+	allocated = ( fallocate( fd, 0, offset, size ) == 0 ); // fast way
+#elif defined(__FreeBSD__)
+	allocated = ( posix_fallocate( fd, offset, size ) == 0 ); // slow way
+#endif
+	return allocated;
+}
+
+/**
+ * Set the apparent size of a file.
+ * First tries ftruncate(). If that fails, falls back to rewriting the
+ * byte at the last position of the desired size (reading it first so
+ * existing content is preserved), which extends the file if it is
+ * too short.
+ *
+ * @param fd file descriptor, opened for writing
+ * @param size desired file size in bytes
+ * @return true if the file has (at least) the requested size afterwards
+ */
+bool file_setSize(int fd, uint64_t size)
+{
+	if ( ftruncate( fd, size ) == 0 ) return true;
+
+	// The fallback touches the byte at size - 1, which does not exist for an
+	// empty file; without this guard the offset would wrap around.
+	if ( size == 0 ) return false;
+
+	// Try really hard... image loading logic relies on the file
+	// having the proper apparent size
+	uint8_t byte = 0;
+	if ( pread( fd, &byte, 1, size - 1 ) != 1 ) {
+		// Nothing there yet (or not readable), extend the file with a zero byte
+		byte = 0;
+	}
+	return pwrite( fd, &byte, 1, size - 1 ) == 1;
+}
+
+/**
+ * Query size and free space of the file system a path resides on,
+ * using statvfs().
+ *
+ * @param path any path on the file system of interest
+ * @param total if not NULL, receives f_blocks * f_frsize (bytes)
+ * @param avail if not NULL, receives f_bavail * f_frsize (bytes)
+ * @return true on success, false if statvfs() failed (outputs untouched)
+ */
+bool file_freeDiskSpace(const char * const path, uint64_t *total, uint64_t *avail)
+{
+	struct statvfs info;
+	if ( statvfs( path, &info ) < 0 ) return false;
+	// Both block counts are expressed in units of the fragment size
+	const uint64_t fragmentSize = (uint64_t)info.f_frsize;
+	if ( avail != NULL ) *avail = (uint64_t)info.f_bavail * fragmentSize;
+	if ( total != NULL ) *total = (uint64_t)info.f_blocks * fragmentSize;
+	return true;
+}
+
+/**
+ * Get the time of last modification of a file.
+ *
+ * @param file path of the file
+ * @return st_mtime as reported by stat(), or 0 if stat() failed
+ */
+time_t file_lastModification(const char * const file)
+{
+	struct stat info;
+	return stat( file, &info ) == 0 ? info.st_mtime : 0;
+}
+
+/**
+ * Read a text file line by line, split every line into fields separated
+ * by spaces/tabs and pass the fields to a callback.
+ * Once maxFields fields have been found, the remainder of the line
+ * (including any whitespace inside it) becomes the last field.
+ * Lines with fewer than minFields fields are skipped silently.
+ * Lines are read in chunks of at most 999 characters and a line can have
+ * no more than 20 fields; see the sizes of buffer and items below.
+ *
+ * @param file path of the file to read
+ * @param minFields minimum number of fields required for cb to be called
+ * @param maxFields number of fields after which splitting stops
+ * @param cb callback invoked as cb( fieldCount, fields, data ); the field
+ *        pointers refer to a local buffer and are only valid during the call
+ * @param data opaque pointer handed through to cb
+ * @return number of lines cb was called for, or -1 if file or cb is NULL
+ *         or the file could not be opened
+ */
+int file_loadLineBased(const char * const file, int minFields, int maxFields, void (*cb)(int argc, char **argv, void *data), void *data)
+{
+ char buffer[1000], *line;
+ char *items[20];
+ int count = 0, itemCount;
+
+ if ( file == NULL || cb == NULL ) return -1;
+ FILE *fp = fopen( file, "r" );
+ if ( fp == NULL ) return -1;
+ while ( fgets( buffer, sizeof(buffer), fp ) != NULL ) {
+ itemCount = 0;
+ for (line = buffer; *line != '\0' && itemCount < 20; ) { // Split buffer in place; 20 is the capacity of items[]
+ while ( *line == ' ' || *line == '\t' ) ++line;
+ if ( *line == '\r' || *line == '\n' || *line == '\0' ) break; // Ignore empty lines
+ items[itemCount++] = line;
+ if ( itemCount >= maxFields ) {
+ // Last allowed field: keep the rest of the line as is
+ // (trim_right presumably strips trailing whitespace/newline - see helper.h)
+ trim_right( line );
+ break;
+ }
+ // Find the end of this field and terminate it
+ while ( *line != '\0' && *line != ' ' && *line != '\t' && *line != '\r' && *line != '\n' ) ++line;
+ if ( *line != '\0' ) *line++ = '\0';
+ }
+ if ( itemCount >= minFields ) {
+ cb( itemCount, items, data );
+ count++;
+ }
+ }
+ fclose( fp );
+ return count;
+}
+
diff --git a/src/server/fileutil.h b/src/server/fileutil.h
new file mode 100644
index 0000000..fcb5c20
--- /dev/null
+++ b/src/server/fileutil.h
@@ -0,0 +1,17 @@
+#ifndef _FILEUTIL_H_
+#define _FILEUTIL_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <time.h>
+
+// true if file can be opened read-only
+bool file_isReadable(char *file);
+// true if file can be opened write-only, or could be created (it is removed again in that case)
+bool file_isWritable(char *file);
+// Create path including missing parent directories (mode 0755); existing directories are fine
+bool mkdir_p(const char* path);
+// Allocate disk space for the given range (fallocate on Linux, posix_fallocate on FreeBSD, false elsewhere)
+bool file_alloc(int fd, uint64_t offset, uint64_t size);
+// Set file size via ftruncate, falling back to writing the last byte
+bool file_setSize(int fd, uint64_t size);
+// Total/available bytes of the file system containing path (statvfs); either output may be NULL
+bool file_freeDiskSpace(const char * const path, uint64_t *total, uint64_t *avail);
+// st_mtime of file, or 0 if stat fails
+time_t file_lastModification(const char * const file);
+// Split each line of file into whitespace separated fields and pass them to cb;
+// returns number of lines handed to cb, -1 if the file cannot be opened
+int file_loadLineBased(const char * const file, int minFields, int maxFields, void (*cb)(int argc, char **argv, void *data), void *data);
+
+#endif /* _FILEUTIL_H_ */
diff --git a/src/server/globals.c b/src/server/globals.c
new file mode 100644
index 0000000..c9b9411
--- /dev/null
+++ b/src/server/globals.c
@@ -0,0 +1,321 @@
+#include "globals.h"
+#include "ini.h"
+#include "../shared/log.h"
+#include <string.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sys/resource.h>
+#include <errno.h>
+
+// Definitions of the configuration variables declared in globals.h.
+// The initializers are the defaults; ini_handler() below overwrites them
+// with the values found in the config file.
+char *_configDir = NULL;
+atomic_bool _shutdown = false;
+// [dnbd3]
+atomic_int _listenPort = PORT;
+char *_basePath = NULL;
+atomic_int _serverPenalty = 0;
+atomic_int _clientPenalty = 0;
+atomic_bool _isProxy = false;
+atomic_int _backgroundReplication = BGR_FULL;
+atomic_int _bgrMinClients = 0;
+atomic_bool _lookupMissingForProxy = true;
+atomic_bool _sparseFiles = false;
+atomic_bool _removeMissingImages = true;
+atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
+atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT;
+atomic_bool _closeUnusedFd = false;
+atomic_bool _vmdkLegacyMode = false;
+// Not really needed anymore since we have '+' and '-' in alt-servers
+atomic_bool _proxyPrivateOnly = false;
+// [limits]
+atomic_int _maxClients = SERVER_MAX_CLIENTS;
+atomic_int _maxImages = SERVER_MAX_IMAGES;
+atomic_int _maxPayload = 9000000; // 9MB
+atomic_uint_fast64_t _maxReplicationSize = (uint64_t)100000000000LL;
+
+/**
+ * True when loading config the first time. Consecutive loads will
+ * ignore certain values which cannot be changed safely at runtime.
+ */
+static atomic_bool initialLoad = true;
+/** Prevents two config (re)loads from running at the same time. */
+static pthread_mutex_t loadLock = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Helpers for ini_handler(). They rely on the handler's local names
+ * 'section', 'key' and 'value'. SAVE_TO_VAR_*( ss, kk ) stores value in the
+ * global variable _kk if the current entry is key kk in section ss.
+ * The numeric variants pass "ss.kk" to the parser as option name, so that
+ * warnings about bad values name the offending option and not just its section.
+ */
+#define IS_TRUE(value) (atoi(value) != 0 || strcmp(value, "true") == 0 || strcmp(value, "True") == 0 || strcmp(value, "TRUE") == 0)
+#define SAVE_TO_VAR_STR(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) { if (_ ## kk != NULL) free(_ ## kk); _ ## kk = strdup(value); } } while (0)
+#define SAVE_TO_VAR_BOOL(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) _ ## kk = IS_TRUE(value); } while (0)
+#define SAVE_TO_VAR_INT(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) parse32(value, &_ ## kk, #ss "." #kk); } while (0)
+#define SAVE_TO_VAR_UINT(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) parse32u(value, &_ ## kk, #ss "." #kk); } while (0)
+#define SAVE_TO_VAR_UINT64(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) parse64u(value, &_ ## kk, #ss "." #kk); } while (0)
+
+static void sanitizeFixedConfig();
+
+static void handleMaskString( const char *value, void(*func)(logmask_t) );
+
+/** Unit suffixes understood by parse64(); the index determines the exponent. */
+static const char* units = "KMGTPEZY";
+
+static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname);
+static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname);
+static bool parse32(const char *in, atomic_int *out, const char *optname) UNUSED;
+static bool parse32u(const char *in, atomic_int *out, const char *optname);
+
+/**
+ * Callback handed to ini_parse() in globals_loadConfig(); called with the
+ * section, key and value of an entry of the config file.
+ * Every SAVE_TO_VAR_* line below only acts if section and key match its
+ * arguments, so one call changes at most one setting.
+ *
+ * @return always 1
+ */
+static int ini_handler(void *custom UNUSED, const char* section, const char* key, const char* value)
+{
+ // These settings are only honored on the very first load of the config
+ if ( initialLoad ) {
+ // A basePath that is already set is not overwritten by the config file
+ if ( _basePath == NULL ) SAVE_TO_VAR_STR( dnbd3, basePath );
+ SAVE_TO_VAR_BOOL( dnbd3, vmdkLegacyMode );
+ SAVE_TO_VAR_UINT( dnbd3, listenPort );
+ SAVE_TO_VAR_UINT( limits, maxClients );
+ SAVE_TO_VAR_UINT( limits, maxImages );
+ }
+ // Everything below is applied on every (re)load
+ SAVE_TO_VAR_BOOL( dnbd3, isProxy );
+ SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
+ SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+ SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
+ SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
+ SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages );
+ SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd );
+ SAVE_TO_VAR_UINT( dnbd3, serverPenalty );
+ SAVE_TO_VAR_UINT( dnbd3, clientPenalty );
+ SAVE_TO_VAR_UINT( dnbd3, uplinkTimeout );
+ SAVE_TO_VAR_UINT( dnbd3, clientTimeout );
+ SAVE_TO_VAR_UINT( limits, maxPayload );
+ SAVE_TO_VAR_UINT64( limits, maxReplicationSize );
+ // backgroundReplication is tri-state: "hashblock", any true value (full), anything else (disabled)
+ if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) {
+ if ( strcmp( value, "hashblock" ) == 0 ) {
+ _backgroundReplication = BGR_HASHBLOCK;
+ } else if ( IS_TRUE( value ) ) {
+ _backgroundReplication = BGR_FULL;
+ } else {
+ _backgroundReplication = BGR_DISABLED;
+ }
+ }
+ if ( strcmp( section, "logging" ) == 0 && strcmp( key, "fileMask" ) == 0 ) handleMaskString( value, &log_setFileMask );
+ if ( strcmp( section, "logging" ) == 0 && strcmp( key, "consoleMask" ) == 0 ) handleMaskString( value, &log_setConsoleMask );
+ if ( strcmp( section, "logging" ) == 0 && strcmp( key, "consoleTimestamps" ) == 0 ) log_setConsoleTimestamps( IS_TRUE(value) );
+ if ( strcmp( section, "logging" ) == 0 && strcmp( key, "file" ) == 0 ) {
+ if ( log_openLogFile( value ) ) {
+ logadd( LOG_INFO, "Opened log file %s", value );
+ } else {
+ // A configured log file that cannot be opened terminates the process
+ logadd( LOG_ERROR, "Could not open log file %s", value );
+ exit( EXIT_FAILURE );
+ }
+ }
+ return 1;
+}
+
+/**
+ * Load (or reload) the server configuration from
+ * <_configDir>/<CONFIG_FILENAME> and log the effective configuration.
+ * On the first call, the settings that are fixed at runtime are
+ * validated via sanitizeFixedConfig(). If another load is already in
+ * progress, the request is ignored.
+ */
+void globals_loadConfig()
+{
+	char *name = NULL;
+	// On failure asprintf returns -1 and the content of name is not to be relied on
+	if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) == -1 || name == NULL ) return;
+	if ( pthread_mutex_trylock( &loadLock ) != 0 ) {
+		logadd( LOG_INFO, "Ignoring config reload request due to already running reload" );
+		free( name ); // Don't leak the file name when bailing out
+		return;
+	}
+	ini_parse( name, &ini_handler, NULL );
+	free( name );
+	if ( initialLoad ) {
+		sanitizeFixedConfig();
+	}
+	if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
+		logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
+		_sparseFiles = false;
+	}
+	// Dump config as interpreted
+	char buffer[2000];
+	globals_dumpConfig( buffer, sizeof(buffer) );
+	logadd( LOG_DEBUG1, "Effective configuration:\n%s", buffer );
+	initialLoad = false;
+	pthread_mutex_unlock( &loadLock );
+}
+
+/**
+ * Validate and adjust the settings that are only read on initial load:
+ * basePath, listenPort, maxClients and maxImages. Also tries to raise the
+ * soft limit for open files so that the configured limits can be served.
+ * Exits the process if listenPort is out of range.
+ */
+static void sanitizeFixedConfig()
+{
+ // Validate settings after loading:
+ // base path for images valid?
+ if ( _basePath == NULL || _basePath[0] == '\0' ) {
+ logadd( LOG_WARNING, "No/empty basePath in " CONFIG_FILENAME );
+ free( _basePath );
+ _basePath = NULL;
+ } else if ( _basePath[0] != '/' ) {
+ logadd( LOG_WARNING, "basePath must be absolute!" );
+ free( _basePath );
+ _basePath = NULL;
+ } else {
+ // Strip all trailing slashes
+ char *end = _basePath + strlen( _basePath ) - 1;
+ while ( end >= _basePath && *end == '/' ) {
+ *end-- = '\0';
+ }
+ }
+ // listen port
+ if ( _listenPort < 1 || _listenPort > 65535 ) {
+ logadd( LOG_ERROR, "listenPort must be 1-65535, but is %d", _listenPort );
+ exit( EXIT_FAILURE );
+ }
+ // Cap to hard limit
+ if ( _maxClients > SERVER_MAX_CLIENTS ) _maxClients = SERVER_MAX_CLIENTS;
+ if ( _maxImages > SERVER_MAX_IMAGES ) _maxImages = SERVER_MAX_IMAGES;
+ // Consider rlimits
+ struct rlimit limit;
+ if ( getrlimit( RLIMIT_NOFILE, &limit ) != 0 ) {
+ logadd( LOG_DEBUG1, "getrlimit failed, errno %d", errno );
+ } else {
+ // Estimated number of descriptors needed: one per client, one per image
+ // (two per image in proxy mode), plus 50 on top
+ const rlim_t required = (rlim_t)( _maxClients + _maxImages * ( _isProxy ? 2 : 1 ) + 50 );
+ if ( limit.rlim_cur != RLIM_INFINITY && limit.rlim_cur < required ) {
+ rlim_t current = limit.rlim_cur;
+ // Raise the soft limit to what we need, or as far as the hard limit allows
+ if ( required <= limit.rlim_max || limit.rlim_max == RLIM_INFINITY ) {
+ limit.rlim_cur = required;
+ } else {
+ limit.rlim_cur = limit.rlim_max;
+ }
+ if ( current != limit.rlim_cur && setrlimit( RLIMIT_NOFILE, &limit ) == 0 ) {
+ current = limit.rlim_cur;
+ logadd( LOG_INFO, "LIMIT_NOFILE (ulimit -n) soft limit increased to %d", (int)current );
+ }
+ if ( current < required ) {
+ logadd( LOG_WARNING, "This process can only have %d open file handles,"
+ " which is not enough for the selected maxClients and maxImages counts."
+ " Consider increasing the limit to at least %d (RLIMIT_NOFILE, ulimit -n)"
+ " to support the current configuration. maxClients and maxImages have"
+ " been lowered for this session.", (int)current, (int)required );
+ // Shrink both limits step by step until the estimate fits, but stop
+ // once maxImages has reached 100 or less
+ do {
+ if ( _maxClients > 500 && _maxImages > 150 ) {
+ _maxImages -= _maxImages / 20 + 1;
+ _maxClients -= _maxClients / 20 + 1;
+ } else if ( _maxImages > 100 ) {
+ _maxImages -= _maxImages / 20 + 1;
+ if ( _maxClients > 200 ) _maxClients -= _maxClients / 25 + 1;
+ } else {
+ break;
+ }
+ } while ( (rlim_t)( _maxClients + _maxImages * ( _isProxy ? 2 : 1 ) + 50 ) > current );
+ }
+ }
+ }
+}
+
+// Sets LOG_<name> in the local variable 'mask' if <name> occurs anywhere in the local string 'value'
+#define SETLOGBIT(name) do { if ( strstr( value, #name ) != NULL ) mask |= LOG_ ## name; } while (0)
+/**
+ * Translate a textual list of log levels into a log mask and pass it to func.
+ * Level names are found by substring search, so any kind of separator
+ * between the names works. Unknown names are ignored; if no known name
+ * is found, func is called with a mask of 0.
+ *
+ * @param value string containing the names of the log levels to enable
+ * @param func function that receives the resulting mask
+ */
+static void handleMaskString( const char *value, void(*func)(logmask_t) )
+{
+ logmask_t mask = 0;
+ SETLOGBIT( ERROR );
+ SETLOGBIT( WARNING );
+ SETLOGBIT( MINOR );
+ SETLOGBIT( INFO );
+ SETLOGBIT( DEBUG1 );
+ SETLOGBIT( DEBUG2 );
+ (*func)( mask );
+}
+
+/**
+ * Parse a signed decimal number with optional unit suffix.
+ * The number may be followed by spaces and one of the letters in 'units'
+ * (K, M, G, ..., case insensitive). By default a unit stands for a power
+ * of 1024; if the unit letter is directly followed by 'B' or 'b', powers
+ * of 1000 are used instead. Values that cannot be represented in 64 bits
+ * are rejected instead of silently overflowing.
+ *
+ * @param in string to parse
+ * @param out receives the parsed value; untouched on failure
+ * @param optname name of the option, only used for log messages
+ * @return true on success, false if the value was rejected
+ */
+static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname)
+{
+	if ( *in == '\0' ) {
+		logadd( LOG_WARNING, "Ignoring empty numeric setting '%s'", optname );
+		return false;
+	}
+	char *end;
+	errno = 0; // strtoll only reports range errors via errno
+	long long int num = strtoll( in, &end, 10 );
+	if ( end == in ) {
+		logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Not a number", in, optname );
+		return false;
+	}
+	if ( errno == ERANGE ) {
+		logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Out of range", in, optname );
+		return false;
+	}
+	int exp, base = 1024;
+	while ( *end == ' ' ) end++;
+	if ( *end == '\0' ) {
+		exp = 0;
+	} else {
+		// Lowercase letters are mapped to uppercase before the lookup
+		const char *pos = strchr( units, *end > 'Z' ? (*end - 32) : *end );
+		if ( pos == NULL ) {
+			logadd( LOG_ERROR, "Invalid unit '%s' for '%s'", end, optname );
+			return false;
+		}
+		// Position in 'units' is the exponent: K = 1, M = 2, ...
+		exp = (int)( pos - units ) + 1;
+		end++;
+		if ( *end == 'B' || *end == 'b' ) {
+			base = 1000;
+		}
+	}
+	while ( exp-- > 0 ) {
+		// Signed overflow is undefined behavior, so check before multiplying
+		if ( num > LLONG_MAX / base || num < LLONG_MIN / base ) {
+			logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Out of range", in, optname );
+			return false;
+		}
+		num *= base;
+	}
+	*out = (int64_t)num;
+	return true;
+}
+
+/**
+ * Like parse64(), but rejects negative values.
+ *
+ * @param in string to parse
+ * @param out receives the parsed value; untouched on failure
+ * @param optname name of the option, only used for log messages
+ * @return true on success, false if parsing failed or the value is negative
+ */
+static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname)
+{
+	atomic_int_fast64_t parsed;
+	if ( !parse64( in, &parsed, optname ) ) return false;
+	const int64_t value = parsed;
+	if ( value >= 0 ) {
+		*out = (uint64_t)value;
+		return true;
+	}
+	logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Cannot be negative", in, optname );
+	return false;
+}
+
+/**
+ * Like parse64(), but the result has to fit into an int.
+ *
+ * @param in string to parse
+ * @param out receives the parsed value; untouched on failure
+ * @param optname name of the option, only used for log messages
+ * @return true on success, false if parsing failed or the value is
+ *         outside of INT_MIN..INT_MAX
+ */
+static bool parse32(const char *in, atomic_int *out, const char *optname)
+{
+	atomic_int_fast64_t parsed;
+	if ( !parse64( in, &parsed, optname ) ) return false;
+	const int64_t value = parsed;
+	if ( value >= INT_MIN && value <= INT_MAX ) {
+		*out = (int)value;
+		return true;
+	}
+	logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)INT_MIN, (int)INT_MAX, in );
+	return false;
+}
+
+/**
+ * Like parse64(), but the result has to be a non-negative value
+ * that fits into an int.
+ *
+ * @param in string to parse
+ * @param out receives the parsed value; untouched on failure
+ * @param optname name of the option, only used for log messages
+ * @return true on success, false if parsing failed or the value is
+ *         outside of 0..INT_MAX
+ */
+static bool parse32u(const char *in, atomic_int *out, const char *optname)
+{
+	atomic_int_fast64_t parsed;
+	if ( !parse64( in, &parsed, optname ) ) return false;
+	const int64_t value = parsed;
+	if ( value >= 0 && value <= INT_MAX ) {
+		*out = (int)value;
+		return true;
+	}
+	logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)0, (int)INT_MAX, in );
+	return false;
+}
+
+/*
+ * Helpers for globals_dumpConfig(). They operate on its 'buffer', 'size' and
+ * 'rem' variables. P_ARG appends formatted text; if it does not fit anymore,
+ * it makes the enclosing function return the length of what fits into the
+ * buffer (size - 1, or 0 for a buffer of size 0).
+ */
+#define P_ARG(...) do { \
+	int r = snprintf(buffer, rem, __VA_ARGS__); \
+	if ( r < 0 || (size_t)r >= rem ) return size == 0 ? 0 : size - 1; \
+	rem -= r; \
+	buffer += r; \
+} while (0)
+#define PVAR(var,type,cast) P_ARG(#var "=%" type "\n", (cast) _ ## var)
+#define PINT(var) PVAR(var, "d", int)
+#define PUINT64(var) PVAR(var, PRIu64, uint64_t)
+// Strings may be NULL (eg. basePath after failed validation); print them as empty
+#define PSTR(var) P_ARG(#var "=%s\n", _ ## var != NULL ? (const char*) _ ## var : "")
+#define PBOOL(var) P_ARG(#var "=%s\n", _ ## var ? "true" : "false")
+
+/**
+ * Write the effective configuration to a buffer, using the syntax
+ * of the config file.
+ *
+ * @param buffer destination buffer
+ * @param size size of buffer in bytes
+ * @return number of characters written, excluding the terminating
+ *         nul byte. If the buffer is too small, output stops at the
+ *         first entry that does not fit.
+ */
+size_t globals_dumpConfig(char *buffer, size_t size)
+{
+	size_t rem = size;
+	P_ARG("[dnbd3]\n");
+	PINT(listenPort);
+	PSTR(basePath);
+	PINT(serverPenalty);
+	PINT(clientPenalty);
+	PBOOL(isProxy);
+	// backgroundReplication is tri-state, only "hashblock" needs special treatment
+	if ( _backgroundReplication == BGR_HASHBLOCK ) {
+		P_ARG("backgroundReplication=hashblock\n");
+	} else {
+		PBOOL(backgroundReplication);
+	}
+	PINT(bgrMinClients);
+	PBOOL(lookupMissingForProxy);
+	PBOOL(sparseFiles);
+	PBOOL(removeMissingImages);
+	PINT(uplinkTimeout);
+	PINT(clientTimeout);
+	PBOOL(closeUnusedFd);
+	PBOOL(vmdkLegacyMode);
+	PBOOL(proxyPrivateOnly);
+	P_ARG("[limits]\n");
+	PINT(maxClients);
+	PINT(maxImages);
+	PINT(maxPayload);
+	PUINT64(maxReplicationSize);
+	return size - rem;
+}
+
diff --git a/src/server/globals.h b/src/server/globals.h
new file mode 100644
index 0000000..2b30bc2
--- /dev/null
+++ b/src/server/globals.h
@@ -0,0 +1,277 @@
+#ifndef _GLOBALS_H_
+#define _GLOBALS_H_
+
+#include "../types.h"
+#include "../shared/fdsignal.h"
+#include "../serverconfig.h"
+#include <stdint.h>
+#include <stdatomic.h>
+#include <time.h>
+#include <pthread.h>
+
+typedef struct timespec ticks;
+
+// ######### All structs/types used by the server ########
+
+typedef struct _dnbd3_connection dnbd3_connection_t;
+typedef struct _dnbd3_image dnbd3_image_t;
+typedef struct _dnbd3_client dnbd3_client_t;
+
+// Slot is free, can be used.
+// Must only be set in uplink_handle_receive() or uplink_remove_client()
+#define ULR_FREE 0
+// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
+// Must only be set in uplink_request()
+#define ULR_NEW 1
+// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
+// Must only be set in uplink_mainloop() or uplink_request()
+#define ULR_PENDING 2
+// Slot is being processed, do not consider for hop on.
+// Must only be set in uplink_handle_receive()
+#define ULR_PROCESSING 3
+// One slot of an uplink's request queue (see the queue member of struct _dnbd3_connection)
+typedef struct
+{
+ uint64_t handle; // Client defined handle to pass back in reply
+ uint64_t from; // First byte offset of requested block (ie. 4096)
+ uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+ dnbd3_client_t * client; // Client to send reply to
+ int status; // status of this entry: ULR_*
+#ifdef _DEBUG
+ ticks entered; // When this request entered the queue (for debugging)
+#endif
+ uint8_t hopCount; // How many hops this request has already taken across proxies
+} dnbd3_queued_request_t;
+
+// Values for rttTestResult of struct _dnbd3_connection
+#define RTT_IDLE 0 // Not in progress
+#define RTT_INPROGRESS 1 // In progress, not finished
+#define RTT_DONTCHANGE 2 // Finished, but no better alternative found
+#define RTT_DOCHANGE 3 // Finished, better alternative written to .betterServer + .betterFd
+#define RTT_NOT_REACHABLE 4 // No uplink was reachable
+// Uplink of an image: the connection to a remote server, its request
+// queue, and the outcome of the latest RTT measurement
+struct _dnbd3_connection
+{
+ int fd; // socket fd to remote server
+ int version; // remote server protocol version
+ dnbd3_signal_t* signal; // used to wake up the process
+ pthread_t thread; // thread holding the connection
+ pthread_spinlock_t queueLock; // lock for synchronization on request queue etc.
+ dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer
+ dnbd3_host_t currentServer; // Current server we're connected to
+ pthread_spinlock_t rttLock; // When accessing rttTestResult, betterFd or betterServer
+ int rttTestResult; // RTT_*
+ int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD!
+ int betterVersion; // protocol version of better server
+ int betterFd; // Active connection to better server, ready to use
+ dnbd3_host_t betterServer; // The better server
+ uint8_t *recvBuffer; // Buffer for receiving payload
+ uint32_t recvBufferLen; // Len of ^^
+ volatile bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
+ bool replicatedLastBlock; // bool telling if the last block has been replicated yet
+ bool cycleDetected; // connection cycle between proxies detected for current remote server
+ int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at
+ // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
+ uint64_t replicationHandle; // Handle of pending replication request
+ atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
+ int queueLen; // length of queue
+ uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives)
+ dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+};
+
+// A known alternative server, together with its RTT and failure bookkeeping
+typedef struct
+{
+ char comment[COMMENT_LENGTH]; // Free text comment (see comment parameter of altservers_add)
+ dnbd3_host_t host; // Address of the server
+ unsigned int rtt[SERVER_RTT_PROBES]; // Recent RTT values - presumably used as ring buffer, TODO confirm in altservers.c
+ unsigned int rttIndex; // Presumably the position in rtt[] to write next - TODO confirm
+ bool isPrivate, isClientOnly; // Flags as passed to altservers_add
+ ticks lastFail; // Presumably time of the most recent failure - TODO confirm
+ int numFails; // Presumably count of failures - TODO confirm when it is reset
+} dnbd3_alt_server_t;
+
+// NOTE(review): Looks like one entry of an address based access list (cf. conf/rpc.acl);
+// the semantics of the members below are assumptions, verify against the code using them
+typedef struct
+{
+ uint8_t host[16]; // Address to match; 16 bytes is enough for an IPv6 address
+ int bytes; // Presumably number of full bytes of host to compare - TODO confirm
+ int bitMask; // Presumably mask for the partial byte following those - TODO confirm
+ int permissions; // Presumably permission bits granted on match - TODO confirm
+} dnbd3_access_rule_t;
+
+/**
+ * Image struct. An image path could be something like
+ * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1
+ * and the name would then be
+ * rz/zfs/windows7 zfs.vmdk
+ */
+struct _dnbd3_image
+{
+ char *path; // absolute path of the image
+ char *name; // public name of the image (usually relative path minus revision ID)
+ dnbd3_connection_t *uplink; // pointer to a server connection
+ uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete
+ uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k)
+ uint64_t realFilesize; // actual file size on disk
+ ticks atime; // last access time
+ ticks lastWorkCheck; // last time a non-working image has been checked
+ ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
+ uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image
+ uint32_t masterCrc32; // CRC-32 of the crc-32 list
+ int readFd; // used to read the image. Used from multiple threads, so use atomic operations (pread et al)
+ int completenessEstimate; // Completeness estimate in percent
+ int users; // clients currently using this image
+ int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
+ bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected
+ uint16_t rid; // revision of image
+ pthread_spinlock_t lock; // NOTE(review): which members this protects is not stated here - check image.c
+};
+
+// State of one connection to this server
+struct _dnbd3_client
+{
+#define HOSTNAMELEN (48)
+ atomic_uint_fast64_t bytesSent; // Byte counter for this client.
+ dnbd3_image_t *image; // Image in use by this client, or NULL during handshake
+ int sock; // Socket of the connection to the client
+ bool isServer; // true if a server in proxy mode, false if real client
+ dnbd3_host_t host; // Address of the client
+ char hostName[HOSTNAMELEN]; // inet_ntop version of host
+ pthread_mutex_t sendMutex; // Held while writing to sock if image is incomplete (since uplink uses socket too)
+ pthread_spinlock_t lock; // NOTE(review): which members this protects is not stated here - check the users
+};
+
+// #######################################################
+#define CONFIG_FILENAME "server.conf"
+
+/**
+ * Base directory where the configuration files reside. Will never have a trailing slash.
+ */
+extern char *_configDir;
+
+/**
+ * Base directory where all images are stored in. Will never have a trailing slash.
+ */
+extern char *_basePath;
+
+/**
+ * Whether or not simple *.vmdk files should be treated as revision 1
+ */
+extern atomic_bool _vmdkLegacyMode;
+
+/**
+ * How much artificial delay should we add when a server connects to us?
+ */
+extern atomic_int _serverPenalty;
+
+/**
+ * How much artificial delay should we add when a client connects to us?
+ */
+extern atomic_int _clientPenalty;
+
+/**
+ * Is server shutting down?
+ */
+extern atomic_bool _shutdown;
+
+/**
+ * Is server allowed to provide images in proxy mode?
+ */
+extern atomic_bool _isProxy;
+
+/**
+ * Only use servers as upstream proxy which are private?
+ */
+extern atomic_bool _proxyPrivateOnly;
+
+/**
+ * Whether to remove missing images from image list on SIGHUP
+ */
+extern atomic_bool _removeMissingImages;
+
+/**
+ * Read timeout when waiting for or sending data on an uplink
+ */
+extern atomic_int _uplinkTimeout;
+
+/**
+ * Read timeout when waiting for or sending data from/to client
+ */
+extern atomic_int _clientTimeout;
+
+/**
+ * If true, images with no active client will have their fd closed after some
+ * idle time.
+ */
+extern atomic_bool _closeUnusedFd;
+
+/**
+ * Should we replicate incomplete images in the background?
+ * Otherwise, only blocks that were explicitly requested will be cached.
+ */
+extern atomic_int _backgroundReplication;
+#define BGR_DISABLED (0)
+#define BGR_FULL (1)
+#define BGR_HASHBLOCK (2)
+
+/**
+ * Minimum connected clients for background replication to kick in
+ */
+extern atomic_int _bgrMinClients;
+
+/**
+ * (In proxy mode): If connecting client is a proxy, and the requested image
+ * is not known locally, should we ask our known alt servers for it?
+ * Otherwise the request is rejected.
+ */
+extern atomic_bool _lookupMissingForProxy;
+
+/**
+ * Should we preallocate proxied images right at the start to make
+ * sure we can cache it entirely, or rather create sparse files
+ * with holes in them? With sparse files, we just keep writing
+ * cached blocks to disk until it is full, and only then will we
+ * start to delete old images. This might be a bit flaky so use
+ * only in space restricted environments. Also make sure your
+ * file system actually supports sparse files / files with holes
+ * in them, or you might get really shitty performance.
+ * This setting will have no effect if background replication is
+ * turned on.
+ */
+extern atomic_bool _sparseFiles;
+
+/**
+ * Port to listen on (default: #define PORT (5003))
+ */
+extern atomic_int _listenPort;
+
+/**
+ * Max number of DNBD3 clients we accept
+ */
+extern atomic_int _maxClients;
+
+/**
+ * Max number of Images we support (in baseDir)
+ */
+extern atomic_int _maxImages;
+
+/**
+ * Maximum payload length we accept on uplinks and thus indirectly
+ * from clients in case the requested range is not cached locally.
+ * Usually this isn't even a megabyte for "real" clients (blockdev
+ * or fuse).
+ */
+extern atomic_int _maxPayload;
+
+/**
+ * If in proxy mode, don't replicate images that are
+ * larger than this according to the uplink server.
+ */
+extern atomic_uint_fast64_t _maxReplicationSize;
+
+/**
+ * Load the server configuration.
+ */
+void globals_loadConfig();
+
+/**
+ * Dump the effective configuration in use to given buffer.
+ */
+size_t globals_dumpConfig(char *buffer, size_t size);
+
+#endif /* GLOBALS_H_ */
diff --git a/src/server/helper.c b/src/server/helper.c
new file mode 100644
index 0000000..2dbc3ea
--- /dev/null
+++ b/src/server/helper.c
@@ -0,0 +1,146 @@
+#include "helper.h"
+#include <arpa/inet.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/socket.h>
+
+#ifdef HAVE_THREAD_NAMES
+#include <sys/prctl.h> // For thread names
+#endif
+
+/**
+ * Convert a textual IPv4 or IPv6 address into a dnbd3_host_t usable with the BSD socket API.
+ * !! The buffer behind 'string' may be modified by this function !!
+ *
+ * @param string address such as "1.2.3.4" or "2a01::10:5", optionally followed by a port,
+ *        e.g. "1.2.3.4:6666" or "[2a01::10:5]:6666"
+ * @param host output structure; it is zeroed first and then filled as follows:
+ *        .type is set to HOST_IP4 or HOST_IP6
+ *        .addr receives the address in network representation
+ *        .port receives the port in network byte order; PORT is used if none was given
+ * @return true on success, false on failure; the contents of *host are undefined in the latter case
+ */
+bool parse_address(char *string, dnbd3_host_t *host)
+{
+	struct in_addr addr4;
+	struct in6_addr addr6;
+
+	memset( host, 0, sizeof(*host) );
+
+	// First attempt: the whole string is a bare address, so the default port applies
+	if ( inet_pton( AF_INET, string, &addr4 ) == 1 ) {
+		host->type = HOST_IP4;
+		memcpy( host->addr, &addr4, 4 );
+		host->port = htons( PORT );
+		return true;
+	}
+	if ( inet_pton( AF_INET6, string, &addr6 ) == 1 ) {
+		host->type = HOST_IP6;
+		memcpy( host->addr, &addr6, 16 );
+		host->port = htons( PORT );
+		return true;
+	}
+
+	// Second attempt: whatever follows the last ':' has to be the port
+	char * const colon = strrchr( string, ':' );
+	if ( colon == NULL ) return false; // No port in string
+	// The address part may be wrapped in [ ]; strip the brackets
+	if ( string[0] == '[' && colon[-1] == ']' ) {
+		string++;
+		colon[-1] = '\0';
+	}
+	*colon = '\0'; // Cut the string in two: address | port
+	const int port = atoi( colon + 1 );
+	if ( port < 1 || port > 65535 ) return false; // Invalid port
+	host->port = htons( (uint16_t)port );
+
+	if ( inet_pton( AF_INET, string, &addr4 ) == 1 ) {
+		host->type = HOST_IP4;
+		memcpy( host->addr, &addr4, 4 );
+		return true;
+	}
+	if ( inet_pton( AF_INET6, string, &addr6 ) == 1 ) {
+		host->type = HOST_IP6;
+		memcpy( host->addr, &addr6, 16 );
+		return true;
+	}
+
+	// Neither address family matched
+	return false;
+}
+
+/**
+ * Convert a host and port (network byte order) to printable representation.
+ * Worst case required buffer len is 48, eg. [1234:1234:1234:1234:1234:1234:1234:1234]:12345 (+ \0)
+ *
+ * @param host address to format; .type selects IPv4 or IPv6 formatting
+ * @param target output buffer
+ * @param targetlen size of the output buffer in bytes; anything below 10 is rejected
+ * @return true on success, false on error. On error, target either is untouched
+ *         (buffer shorter than 10 bytes), holds an empty string (address did not fit),
+ *         or holds a "<?addrtype=N>" marker (unknown address type).
+ */
+bool host_to_string(const dnbd3_host_t *host, char *target, size_t targetlen)
+{
+	// Worst case: Port 5 chars, ':' to separate ip and port 1 char, terminating null 1 char = 7, [] for IPv6
+	if ( targetlen < 10 ) return false;
+	char * const start = target;
+	if ( host->type == HOST_IP6 ) {
+		*target++ = '[';
+		// inet_ntop returns NULL if the address does not fit; the buffer content is
+		// unspecified in that case, so we must not run strlen() on it
+		if ( inet_ntop( AF_INET6, host->addr, target, (socklen_t)targetlen - 10 ) == NULL ) {
+			*start = '\0';
+			return false;
+		}
+		target += strlen( target );
+		*target++ = ']';
+	} else if ( host->type == HOST_IP4 ) {
+		if ( inet_ntop( AF_INET, host->addr, target, (socklen_t)targetlen - 8 ) == NULL ) {
+			*start = '\0';
+			return false;
+		}
+		target += strlen( target );
+	} else {
+		snprintf( target, targetlen, "<?addrtype=%d>", (int)host->type );
+		return false;
+	}
+	*target = '\0';
+	if ( host->port != 0 ) {
+		// There are still at least 7 bytes left in the buffer, port is at most 5 bytes + ':' + '\0' = 7
+		snprintf( target, 7, ":%d", (int)ntohs( host->port ) );
+	}
+	return true;
+}
+
+/**
+ * Strip every trailing '/' from the given string, in place.
+ * A string consisting only of slashes becomes the empty string.
+ *
+ * @param string NUL-terminated, writable string
+ */
+void remove_trailing_slash(char *string)
+{
+	// Work with a length instead of a pointer so that an empty string never
+	// makes us form a pointer to before the start of the buffer
+	size_t len = strlen( string );
+	while ( len > 0 && string[len - 1] == '/' ) {
+		string[--len] = '\0';
+	}
+}
+
+/**
+ * Remove trailing whitespace (CR, LF, space, tab) from the given string, in place.
+ *
+ * @param string NUL-terminated, writable string
+ */
+void trim_right(char * const string)
+{
+	// Index-based so that an empty string never yields a pointer before the buffer
+	size_t len = strlen( string );
+	while ( len > 0 ) {
+		const char c = string[len - 1];
+		if ( c != '\r' && c != '\n' && c != ' ' && c != '\t' ) break;
+		string[--len] = '\0';
+	}
+}
+
+/**
+ * Set the name of the calling thread. Names longer than 15 characters
+ * are cut down to their first 15 characters. The name is only applied
+ * when HAVE_THREAD_NAMES is defined (via prctl); otherwise this is a no-op.
+ */
+void setThreadName(const char *name)
+{
+	char truncated[16];
+	const char *effective = name;
+	if ( strlen( name ) >= sizeof(truncated) ) {
+		// Too long: keep the first 15 characters plus terminator
+		memcpy( truncated, name, sizeof(truncated) - 1 );
+		truncated[sizeof(truncated) - 1] = '\0';
+		effective = truncated;
+	}
+#ifdef HAVE_THREAD_NAMES
+	prctl( PR_SET_NAME, (unsigned long)effective, 0, 0, 0 );
+#else
+	(void)effective;
+#endif
+	//TODO: On FreeBSD set threadname with pthread_setname_np
+}
+
+/**
+ * Block SIGUSR1, SIGUSR2, SIGHUP and SIGPIPE for the calling thread.
+ */
+void blockNoncriticalSignals()
+{
+	static const int blocked[] = { SIGUSR1, SIGUSR2, SIGHUP, SIGPIPE };
+	sigset_t set;
+	sigemptyset( &set );
+	for ( size_t i = 0; i < sizeof(blocked) / sizeof(blocked[0]); ++i ) {
+		sigaddset( &set, blocked[i] );
+	}
+	pthread_sigmask( SIG_BLOCK, &set, NULL );
+}
+
diff --git a/src/server/helper.h b/src/server/helper.h
new file mode 100644
index 0000000..102cb36
--- /dev/null
+++ b/src/server/helper.h
@@ -0,0 +1,42 @@
+#ifndef HELPER_H_
+#define HELPER_H_
+
+#include "server.h"
+#include "../shared/log.h"
+#include "../types.h"
+#include <netinet/in.h>
+#include <string.h>
+#include <unistd.h>
+
+bool parse_address(char *string, dnbd3_host_t *host);
+bool host_to_string(const dnbd3_host_t *host, char *target, size_t targetlen);
+void remove_trailing_slash(char *string);
+void trim_right(char * const string);
+void setThreadName(const char *name);
+void blockNoncriticalSignals();
+
+/**
+ * Compare two hosts by address type and address bytes; the port is ignored.
+ * 4 address bytes are compared for HOST_IP4, 16 for any other type.
+ */
+static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+{
+	if ( a->type != b->type ) return false;
+	const size_t addrLen = ( a->type == HOST_IP4 ) ? 4 : 16;
+	return memcmp( a->addr, b->addr, addrLen ) == 0;
+}
+
+/**
+ * Compare two hosts by address type, port and address bytes.
+ * 4 address bytes are compared for HOST_IP4, 16 for any other type.
+ */
+static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+{
+	if ( a->type != b->type ) return false;
+	if ( a->port != b->port ) return false;
+	const size_t addrLen = ( a->type == HOST_IP4 ) ? 4 : 16;
+	return memcmp( a->addr, b->addr, addrLen ) == 0;
+}
+
+/**
+ * Check whether a string ends with the given suffix.
+ * A NULL string never matches; a NULL or empty suffix matches any non-NULL string.
+ * @return true if string =~ /suffix$/
+ */
+static inline bool strend(char *string, char *suffix)
+{
+	if ( string == NULL ) return false;
+	if ( suffix == NULL ) return true;
+	const size_t suffixLen = strlen( suffix );
+	if ( suffixLen == 0 ) return true;
+	const size_t stringLen = strlen( string );
+	// Compare only the tail of the string, provided it is long enough to contain the suffix
+	return stringLen >= suffixLen
+			&& memcmp( string + ( stringLen - suffixLen ), suffix, suffixLen ) == 0;
+}
+
+#endif
diff --git a/src/server/image.c b/src/server/image.c
new file mode 100644
index 0000000..061f9a3
--- /dev/null
+++ b/src/server/image.c
@@ -0,0 +1,1794 @@
+#include "image.h"
+#include "helper.h"
+#include "fileutil.h"
+#include "uplink.h"
+#include "locks.h"
+#include "integrity.h"
+#include "altservers.h"
+#include "../shared/protocol.h"
+#include "../shared/timing.h"
+#include "../shared/crc32.h"
+
+#include <assert.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <inttypes.h>
+#include <glob.h>
+#include <jansson.h>
+
+#define PATHLEN (2000)
+#define NONWORKING_RECHECK_INTERVAL_SECONDS (60)
+
+// ##########################################
+
+static dnbd3_image_t *_images[SERVER_MAX_IMAGES];
+static int _num_images = 0;
+
+static pthread_spinlock_t imageListLock;
+static pthread_mutex_t remoteCloneLock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t reloadLock = PTHREAD_MUTEX_INITIALIZER;
+#define NAMELEN 500
+#define CACHELEN 20
+// Entry type of the remoteCloneCache array declared below.
+// NOTE(review): the code using this cache is outside the visible part of the file;
+// presumably it remembers recent remote clone lookups until 'deadline' - confirm.
+typedef struct
+{
+ char name[NAMELEN]; // image name, at most NAMELEN - 1 characters plus terminator
+ uint16_t rid; // revision id of the image
+ ticks deadline; // point in time associated with this entry (presumably its expiry - confirm)
+} imagecache;
+static imagecache remoteCloneCache[CACHELEN];
+
+// ##########################################
+
+static bool isForbiddenExtension(const char* name);
+static dnbd3_image_t* image_remove(dnbd3_image_t *image);
+static dnbd3_image_t* image_free(dnbd3_image_t *image);
+static bool image_load_all_internal(char *base, char *path);
+static bool image_addToList(dnbd3_image_t *image);
+static bool image_load(char *base, char *path, int withUplink);
+static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize);
+static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
+static bool image_ensureDiskSpace(uint64_t size, bool force);
+
+static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
+static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
+static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map);
+
+// ##########################################
+
+/**
+ * One-time initialisation of the image module: initialises the spinlock
+ * guarding the image list and seeds rand() from the current time.
+ */
+void image_serverStartup()
+{
+	spin_init( &imageListLock, PTHREAD_PROCESS_PRIVATE );
+	const unsigned int seed = (unsigned int)time( NULL );
+	srand( seed );
+}
+
+/**
+ * Update cache-map of given image for the given byte range
+ * start (inclusive) - end (exclusive).
+ * When marking as cached, the range is shrunk to whole 4k blocks; when marking
+ * as not cached, it is grown to whole 4k blocks. If blocks were newly marked as
+ * cached and the image has a crc32 list, every hash block touched by the range
+ * that is now complete is handed to integrity_check().
+ * Locks on: images[].lock
+ *
+ * @param image image whose cache map is to be updated
+ * @param start first byte of the range
+ * @param end first byte after the range, must not exceed image->virtualFilesize
+ * @param set true to mark the range as cached, false to mark it as not cached
+ */
+void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set)
+{
+	assert( image != NULL );
+	// This should always be block borders due to how the protocol works, but better be safe
+	// than accidentally mark blocks as cached when they really aren't entirely cached.
+	assert( end <= image->virtualFilesize );
+	assert( start <= end );
+	if ( set ) {
+		// If we set as cached, move "inwards" in case we're not at 4k border
+		end &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		start = (uint64_t)(start + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+	} else {
+		// If marking as NOT cached, move "outwards" in case we're not at 4k border
+		start &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		end = (uint64_t)(end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+	}
+	if ( start >= end )
+		return;
+	bool setNewBlocks = false;
+	uint64_t pos = start;
+	spin_lock( &image->lock );
+	if ( image->cache_map == NULL ) {
+		// Image seems already complete
+		if ( set ) {
+			// This makes no sense
+			spin_unlock( &image->lock );
+			logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path );
+			return;
+		}
+		// Recreate a cache map, set it to all 1 initially as we assume the image was complete
+		const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+		image->cache_map = malloc( byteSize );
+		if ( image->cache_map == NULL ) {
+			// Out of memory: bail out instead of writing through a NULL pointer
+			spin_unlock( &image->lock );
+			logadd( LOG_ERROR, "image_updateCachemap: could not allocate cache_map for %s", image->path );
+			return;
+		}
+		memset( image->cache_map, 0xff, byteSize );
+	}
+	while ( pos < end ) {
+		// One byte of the map covers 8 blocks of 4k, i.e. 32k (1 << 15) of image data
+		const size_t map_y = (size_t)( pos >> 15 );
+		const int map_x = (int)( (pos >> 12) & 7 ); // mod 8
+		const int bit_mask = 1 << map_x;
+		if ( set ) {
+			if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true;
+			image->cache_map[map_y] |= (uint8_t)bit_mask;
+		} else {
+			image->cache_map[map_y] &= (uint8_t)~bit_mask;
+		}
+		pos += DNBD3_BLOCK_SIZE;
+	}
+	if ( setNewBlocks && image->crc32 != NULL ) {
+		// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
+		// for checking, even though this might lead to checking some hash block again, if it was
+		// already complete and the block range spanned at least two hash blocks.
+		// First set start and end to borders of hash blocks
+		start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1);
+		end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1);
+		pos = start;
+		while ( pos < end ) {
+			// The lock is dropped around integrity_check(), so the map may have vanished meanwhile
+			if ( image->cache_map == NULL ) break;
+			const int block = (int)( pos / HASH_BLOCK_SIZE );
+			if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) {
+				spin_unlock( &image->lock );
+				integrity_check( image, block );
+				spin_lock( &image->lock );
+			}
+			pos += HASH_BLOCK_SIZE;
+		}
+	}
+	spin_unlock( &image->lock );
+}
+
+/**
+ * Tell whether the given image is completely cached.
+ * An image with virtualFilesize 0 is never complete; an image without a
+ * cache_map is always complete. If the cache_map shows that every block is
+ * present, the map is freed and the "<path>.map" file is unlinked.
+ * Locks on: image.lock
+ */
+bool image_isComplete(dnbd3_image_t *image)
+{
+	assert( image != NULL );
+	spin_lock( &image->lock );
+	if ( image->virtualFilesize == 0 ) {
+		spin_unlock( &image->lock );
+		return false;
+	}
+	if ( image->cache_map == NULL ) {
+		spin_unlock( &image->lock );
+		return true;
+	}
+	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	// All bytes but the last one have to be fully set
+	bool complete = true;
+	for ( int idx = 0; complete && idx < mapBytes - 1; ++idx ) {
+		complete = ( image->cache_map[idx] == 0xFF );
+	}
+	if ( complete ) {
+		// The last byte may cover fewer than 8 blocks; only those bits count
+		const int tailBlocks = (image->virtualFilesize >> 12) & 7;
+		const uint8_t tailMask = ( tailBlocks == 0 ) ? (uint8_t)0xFF : (uint8_t)( ( 1 << tailBlocks ) - 1 );
+		complete = ( ( image->cache_map[mapBytes - 1] & tailMask ) == tailMask );
+	}
+	if ( !complete ) {
+		spin_unlock( &image->lock );
+		return false;
+	}
+	// Complete: the map is not needed anymore, neither in memory nor on disk
+	free( image->cache_map );
+	image->cache_map = NULL;
+	char mapfile[PATHLEN] = "";
+	snprintf( mapfile, PATHLEN, "%s.map", image->path );
+	spin_unlock( &image->lock );
+	unlink( mapfile );
+	return true;
+}
+
+/**
+ * Make sure readFd is open, useful when closeUnusedFd is active.
+ * This function assumes you called image_lock first, so its known
+ * to be active and the fd won't be closed halfway through the
+ * function.
+ * Does not update atime, so the fd might be closed again very soon.
+ * Since the caller should have image_lock()ed first, it could do
+ * a quick operation on it before calling image_release which
+ * guarantees that the fd will not be closed meanwhile.
+ *
+ * If the file cannot be opened, or its size differs from image->realFilesize,
+ * the image is marked as not working.
+ *
+ * @return true if image->readFd is open, false otherwise
+ */
+bool image_ensureOpen(dnbd3_image_t *image)
+{
+	if ( image->readFd != -1 ) return true;
+	int newFd = open( image->path, O_RDONLY );
+	if ( newFd != -1 ) {
+		// Check size
+		const off_t flen = lseek( newFd, 0, SEEK_END );
+		if ( flen == -1 ) {
+			logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+			close( newFd );
+			newFd = -1;
+		} else if ( (uint64_t)flen != image->realFilesize ) {
+			logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen );
+			close( newFd );
+			newFd = -1;
+		}
+	}
+	if ( newFd == -1 ) {
+		// Open or size check failed; flag the image so others know
+		spin_lock( &image->lock );
+		image->working = false;
+		spin_unlock( &image->lock );
+		return false;
+	}
+	spin_lock( &image->lock );
+	if ( image->readFd == -1 ) {
+		image->readFd = newFd;
+		spin_unlock( &image->lock );
+	} else {
+		// There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed
+		spin_unlock( &image->lock );
+		close( newFd );
+	}
+	return image->readFd != -1;
+}
+
+/**
+ * Get an image by name+rid. This function increases a reference counter,
+ * so you HAVE TO CALL image_release for every image_get() call at some
+ * point...
+ * Locks on: imageListLock, _images[].lock
+ *
+ * @param name image name; must not be empty and must neither start nor end with '/'
+ * @param revision revision id to look for; 0 selects the highest rid found for that name
+ * @param checkIfWorking if true, (re)check whether the image is usable: make sure it
+ *        can be opened and, at most once per NONWORKING_RECHECK_INTERVAL_SECONDS, that
+ *        its size is unchanged and that it can be read from
+ * @return the image with its users counter increased, or NULL if no matching image
+ *         is in the list (or the name failed the sanity check)
+ */
+dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
+{
+ int i;
+ // Suffix for log messages below; only depends on the config flag, not on what actually happens
+ const char *removingText = _removeMissingImages ? ", removing from list" : "";
+ dnbd3_image_t *candidate = NULL;
+ // Simple sanity check
+ const size_t slen = strlen( name );
+ if ( slen == 0 || name[slen - 1] == '/' || name[0] == '/' ) return NULL ;
+ // Go through array
+ spin_lock( &imageListLock );
+ for (i = 0; i < _num_images; ++i) {
+ dnbd3_image_t * const image = _images[i];
+ if ( image == NULL || strcmp( image->name, name ) != 0 ) continue;
+ if ( revision == image->rid ) {
+ // Exact match, stop searching
+ candidate = image;
+ break;
+ } else if ( revision == 0 && (candidate == NULL || candidate->rid < image->rid) ) {
+ // No specific revision requested: remember the highest rid seen so far
+ candidate = image;
+ }
+ }
+
+ // Not found
+ if ( candidate == NULL ) {
+ spin_unlock( &imageListLock );
+ return NULL ;
+ }
+
+ // Take the image's lock before letting go of the list lock, then count ourselves as user
+ spin_lock( &candidate->lock );
+ spin_unlock( &imageListLock );
+ candidate->users++;
+ spin_unlock( &candidate->lock );
+
+ // Found, see if it works
+// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
+// TODO: But remember size-changed images forever
+ if ( candidate->working || checkIfWorking ) {
+ // Is marked working, but might not have an fd open
+ if ( !image_ensureOpen( candidate ) ) {
+ spin_lock( &candidate->lock );
+ timing_get( &candidate->lastWorkCheck );
+ spin_unlock( &candidate->lock );
+ if ( _removeMissingImages ) {
+ candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller
+ }
+ return candidate;
+ }
+ }
+
+ if ( !checkIfWorking ) return candidate; // Not interested in re-checking working state
+
+ // ...not working...
+
+ // Don't re-check too often
+ spin_lock( &candidate->lock );
+ bool check;
+ declare_now;
+ check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS;
+ if ( check ) {
+ candidate->lastWorkCheck = now;
+ }
+ spin_unlock( &candidate->lock );
+ if ( !check ) {
+ return candidate;
+ }
+
+ // reaching this point means:
+ // 1) We should check if the image is working, it might or might not be in working state right now
+ // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.)
+ // 3) We made sure not to re-check this image too often
+
+ // Common for ro and rw images: Size check, read check
+ const off_t len = lseek( candidate->readFd, 0, SEEK_END );
+ bool reload = false;
+ if ( len == -1 ) {
+ logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
+ reload = true;
+ } else if ( (uint64_t)len != candidate->realFilesize ) {
+ // Size mismatch: only logged here, neither 'working' nor 'reload' is changed
+ logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
+ ". Try sending SIGHUP to server if you know what you're doing.",
+ candidate->path, candidate->realFilesize, (uint64_t)len );
+ } else {
+ // Seek worked, file size is same, now see if we can read from file
+ char buffer[100];
+ if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
+ logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.",
+ (int)sizeof(buffer), candidate->path, errno, removingText );
+ reload = true;
+ } else if ( !candidate->working ) {
+ // Seems everything is fine again \o/
+ candidate->working = true;
+ logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid );
+ }
+ }
+
+ if ( reload ) {
+ // Could not access the image with existing fd - mark for reload which will re-open the file.
+ // make a copy of the image struct but keep the old one around. If/When it's not being used
+ // anymore, it will be freed automatically.
+ // NOTE(review): the results of calloc/strdup/malloc below are not checked for NULL
+ dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
+ img->path = strdup( candidate->path );
+ img->name = strdup( candidate->name );
+ img->virtualFilesize = candidate->virtualFilesize;
+ img->realFilesize = candidate->realFilesize;
+ img->atime = now;
+ img->masterCrc32 = candidate->masterCrc32;
+ img->readFd = -1;
+ img->rid = candidate->rid;
+ img->users = 1; // the caller of image_get becomes the first user of the copy
+ img->working = false;
+ spin_init( &img->lock, PTHREAD_PROCESS_PRIVATE );
+ if ( candidate->crc32 != NULL ) {
+ const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t);
+ img->crc32 = malloc( mb );
+ memcpy( img->crc32, candidate->crc32, mb );
+ }
+ // The cache map is copied while holding the old image's lock
+ spin_lock( &candidate->lock );
+ if ( candidate->cache_map != NULL ) {
+ const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize );
+ img->cache_map = malloc( mb );
+ memcpy( img->cache_map, candidate->cache_map, mb );
+ }
+ spin_unlock( &candidate->lock );
+ if ( image_addToList( img ) ) {
+ // Copy is in the list now; drop our reference to the old image and hand out the copy
+ image_release( candidate );
+ candidate = img;
+ } else {
+ // List is full: discard the copy and keep returning the old image
+ img->users = 0;
+ image_free( img );
+ }
+ // readFd == -1 and working == FALSE at this point,
+ // this function needs some splitting up for handling as we need to run most
+ // of the above code again. for now we know that the next call for this
+ // name:rid will get the newly inserted "img" and try to re-open the file.
+ }
+
+ // Check if image is incomplete, handle
+ if ( candidate->cache_map != NULL ) {
+ if ( candidate->uplink == NULL ) {
+ uplink_init( candidate, -1, NULL, -1 );
+ }
+ }
+
+ return candidate; // We did all we can, hopefully it's working
+}
+
+/**
+ * Take a reference on the given image by incrementing its users counter,
+ * provided the image is still part of the image list.
+ * Each successful image_lock() has to be paired with an image_release() later on.
+ * Locks on: imageListLock, _images[].lock
+ * @return the image on success, NULL if image is NULL or not in the image list
+ */
+dnbd3_image_t* image_lock(dnbd3_image_t *image) // TODO: get rid, fix places that do image->users--
+{
+	if ( image == NULL ) return NULL ;
+	bool listed = false;
+	spin_lock( &imageListLock );
+	for ( int idx = 0; idx < _num_images; ++idx ) {
+		if ( _images[idx] == image ) {
+			listed = true;
+			break;
+		}
+	}
+	if ( !listed ) {
+		spin_unlock( &imageListLock );
+		return NULL ;
+	}
+	// Grab the image lock before releasing the list lock
+	spin_lock( &image->lock );
+	spin_unlock( &imageListLock );
+	image->users++;
+	spin_unlock( &image->lock );
+	return image;
+}
+
+/**
+ * Drop one reference to the given image by decrementing its users counter.
+ * When the counter hits 0 and the image is no longer contained in the
+ * images array, the image is freed.
+ * Locks on: imageListLock, _images[].lock
+ * @return always NULL
+ */
+dnbd3_image_t* image_release(dnbd3_image_t *image)
+{
+	if ( image == NULL ) return NULL;
+	spin_lock( &imageListLock );
+	spin_lock( &image->lock );
+	assert( image->users > 0 );
+	image->users--;
+	const bool lastUser = ( image->users == 0 );
+	spin_unlock( &image->lock );
+	// Only the last user needs to find out whether the list still references the image
+	bool listed = false;
+	if ( lastUser ) {
+		for ( int idx = 0; idx < _num_images && !listed; ++idx ) {
+			listed = ( _images[idx] == image );
+		}
+	}
+	spin_unlock( &imageListLock );
+	// Nobody uses it and the list doesn't know it anymore, so we have to clean up
+	if ( lastUser && !listed ) image_free( image );
+	return NULL;
+}
+
+/**
+ * Tell whether the given file name ends in one of the extensions used
+ * for meta data files (".crc", ".map", ".meta"), so that such files are
+ * not loaded as images.
+ */
+static bool isForbiddenExtension(const char* name)
+{
+	// .crc: CRC list, .map: cache map for incomplete images, .meta: meta data (currently not in use)
+	static const char * const suffixes[] = { ".crc", ".map", ".meta" };
+	const size_t nameLen = strlen( name );
+	for ( size_t s = 0; s < sizeof(suffixes) / sizeof(suffixes[0]); ++s ) {
+		const size_t sufLen = strlen( suffixes[s] );
+		if ( nameLen < sufLen ) continue;
+		if ( strcmp( name + nameLen - sufLen, suffixes[s] ) == 0 ) return true;
+	}
+	return false;
+}
+
+/**
+ * Take the image out of the images array. The image is freed only if it
+ * was actually found in the array and has no users left.
+ * Trailing empty slots of the array are dropped by lowering _num_images.
+ * Locks on: imageListLock, image[].lock
+ * @return NULL if image was also freed, image otherwise
+ */
+static dnbd3_image_t* image_remove(dnbd3_image_t *image)
+{
+	bool mustFree = false;
+	spin_lock( &imageListLock );
+	spin_lock( &image->lock );
+	// Clear every slot referring to this image
+	for ( int idx = 0; idx < _num_images; ++idx ) {
+		if ( _images[idx] != image ) continue;
+		_images[idx] = NULL;
+		mustFree = ( image->users == 0 );
+	}
+	// Shrink the used part of the array past any empty slots at its end
+	while ( _num_images > 0 && _images[_num_images - 1] == NULL ) {
+		_num_images--;
+	}
+	spin_unlock( &image->lock );
+	spin_unlock( &imageListLock );
+	if ( mustFree ) image = image_free( image );
+	return image;
+}
+
+/**
+ * Kill all uplinks: for every listed image that has an uplink, detach the
+ * uplink thread and set its shutdown flag (unless already set), then
+ * signal the uplink.
+ * Locks on: imageListLock, _images[].lock, uplink.queueLock
+ */
+void image_killUplinks()
+{
+	spin_lock( &imageListLock );
+	for ( int idx = 0; idx < _num_images; ++idx ) {
+		dnbd3_image_t * const img = _images[idx];
+		if ( img == NULL ) continue;
+		spin_lock( &img->lock );
+		if ( img->uplink != NULL ) {
+			spin_lock( &img->uplink->queueLock );
+			if ( !img->uplink->shutdown ) {
+				thread_detach( img->uplink->thread );
+				img->uplink->shutdown = true;
+			}
+			spin_unlock( &img->uplink->queueLock );
+			signal_call( img->uplink->signal );
+		}
+		spin_unlock( &img->lock );
+	}
+	spin_unlock( &imageListLock );
+}
+
+/**
+ * Load all images in given path recursively.
+ * Pass NULL to use path from config.
+ * If _removeMissingImages is set, images whose file is no longer readable
+ * are first removed from the list (and freed if they have no users).
+ * Only one (re)load can run at a time; a concurrent call returns false immediately.
+ * Locks on: reloadLock, imageListLock, _images[].lock
+ *
+ * @return false if a reload is already in progress, true if the server is shutting
+ *         down during the vanished-image check, otherwise the result of the recursive scan
+ */
+bool image_loadAll(char *path)
+{
+ bool ret;
+ char imgPath[PATHLEN];
+ int imgId;
+ dnbd3_image_t *imgHandle;
+
+ if ( path == NULL ) path = _basePath;
+ // trylock: don't queue up behind a running reload, just report failure
+ if ( pthread_mutex_trylock( &reloadLock ) != 0 ) {
+ logadd( LOG_MINOR, "Could not (re)load image list, already in progress." );
+ return false;
+ }
+ if ( _removeMissingImages ) {
+ // Check if all loaded images still exist on disk
+ logadd( LOG_INFO, "Checking for vanished images" );
+ spin_lock( &imageListLock );
+ for ( int i = _num_images - 1; i >= 0; --i ) {
+ if ( _shutdown ) break;
+ if ( _images[i] == NULL ) {
+ // Empty slot at the end of the used range: shrink the range
+ if ( i + 1 == _num_images ) _num_images--;
+ continue;
+ }
+ // Remember id and path so the slot can be re-validated after the list lock was dropped
+ imgId = _images[i]->id;
+ snprintf( imgPath, PATHLEN, "%s", _images[i]->path );
+ spin_unlock( &imageListLock ); // isReadable hits the fs; unlock
+ // Check if file can still be opened for reading
+ ret = file_isReadable( imgPath );
+ // Lock again, see if image is still there, free if required
+ spin_lock( &imageListLock );
+ // Skip if readable, or if the slot changed while we were unlocked
+ if ( ret || i >= _num_images || _images[i] == NULL || _images[i]->id != imgId ) continue;
+ // Image needs to be removed
+ imgHandle = _images[i];
+ _images[i] = NULL;
+ if ( i + 1 == _num_images ) _num_images--;
+ spin_lock( &imgHandle->lock );
+ const bool freeImg = ( imgHandle->users == 0 );
+ spin_unlock( &imgHandle->lock );
+ // We unlocked, but the image has been removed from the list already, so
+ // there's no way the users-counter can increase at this point.
+ if ( freeImg ) {
+ // Image is not in use anymore, free the dangling entry immediately
+ spin_unlock( &imageListLock ); // image_free might do several fs operations; unlock
+ image_free( imgHandle );
+ spin_lock( &imageListLock );
+ }
+ }
+ spin_unlock( &imageListLock );
+ if ( _shutdown ) {
+ pthread_mutex_unlock( &reloadLock );
+ return true;
+ }
+ }
+ // Now scan for new images
+ logadd( LOG_INFO, "Scanning for new or modified images" );
+ ret = image_load_all_internal( path, path );
+ pthread_mutex_unlock( &reloadLock );
+ logadd( LOG_INFO, "Finished scanning %s", path );
+ return ret;
+}
+
+/**
+ * Free all images we have, but only if they're not in use anymore.
+ * The users counter of each image is read while holding that image's lock,
+ * and the final result is determined while still holding the list lock.
+ * Locks on: imageListLock, _images[].lock
+ * @return true if all images have been freed
+ */
+bool image_tryFreeAll()
+{
+	spin_lock( &imageListLock );
+	for ( int i = _num_images - 1; i >= 0; --i ) {
+		dnbd3_image_t * const image = _images[i];
+		if ( image != NULL ) {
+			// Same pattern as in image_loadAll: read the counter under the image's lock.
+			// image_get/image_lock acquire the image lock while holding the list lock,
+			// so no new user can show up through them while we hold the list lock.
+			spin_lock( &image->lock );
+			const bool unused = ( image->users == 0 );
+			spin_unlock( &image->lock );
+			if ( unused ) {
+				_images[i] = NULL;
+				spin_unlock( &imageListLock ); // image_free might block; don't hold the spinlock
+				image_free( image );
+				spin_lock( &imageListLock );
+			}
+		}
+		if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--;
+	}
+	// Evaluate while still locked so the result is consistent with what we just did
+	const bool allFreed = ( _num_images == 0 );
+	spin_unlock( &imageListLock );
+	return allFreed;
+}
+
+/**
+ * Release all resources held by the image and the image struct itself:
+ * shuts down its uplink, frees cache map, crc list, path and name, closes
+ * the read fd and destroys the lock. DOES NOT check if the image is in use.
+ * Indirectly locks on imageListLock, image.lock, uplink.queueLock
+ * @return always NULL
+ */
+static dnbd3_image_t* image_free(dnbd3_image_t *image)
+{
+	assert( image != NULL );
+	if ( !_shutdown ) {
+		logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid );
+	}
+	// The uplink has to go first
+	uplink_shutdown( image );
+	spin_lock( &image->lock );
+	free( image->name );
+	image->name = NULL;
+	free( image->path );
+	image->path = NULL;
+	free( image->crc32 );
+	image->crc32 = NULL;
+	free( image->cache_map );
+	image->cache_map = NULL;
+	spin_unlock( &image->lock );
+	const int fd = image->readFd;
+	if ( fd != -1 ) close( fd );
+	spin_destroy( &image->lock );
+	// Wipe the struct before handing the memory back
+	memset( image, 0, sizeof(*image) );
+	free( image );
+	return NULL ;
+}
+
+/**
+ * Check whether every 4k block belonging to the given hash block is marked
+ * as cached in the cache map.
+ *
+ * @param cacheMap cache map of the image (one bit per DNBD3_BLOCK_SIZE block);
+ *        NULL means there is no map, which counts as complete
+ * @param block index of the hash block (of HASH_BLOCK_SIZE bytes) to check
+ * @param realFilesize size of the image file in bytes
+ * @return true if the hash block is completely cached
+ */
+bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
+{
+	if ( cacheMap == NULL ) return true;
+	const uint64_t end = (block + 1) * HASH_BLOCK_SIZE;
+	if ( end <= realFilesize ) {
+		// Trivial case: block in question is not the last block (well, or image size is multiple of HASH_BLOCK_SIZE)
+		// Indices are size_t, like in the branch below, so large images don't overflow an int
+		const size_t startCacheIndex = (size_t)( ( block * HASH_BLOCK_SIZE ) / ( DNBD3_BLOCK_SIZE * 8 ) );
+		const size_t endCacheIndex = startCacheIndex + (size_t)( HASH_BLOCK_SIZE / ( DNBD3_BLOCK_SIZE * 8 ) );
+		for ( size_t i = startCacheIndex; i < endCacheIndex; ++i ) {
+			if ( cacheMap[i] != 0xff ) {
+				return false;
+			}
+		}
+	} else {
+		// Special case: Checking last block, which is smaller than HASH_BLOCK_SIZE
+		for (uint64_t mapPos = block * HASH_BLOCK_SIZE; mapPos < realFilesize; mapPos += DNBD3_BLOCK_SIZE ) {
+			const size_t map_y = (size_t)( mapPos >> 15 );
+			const int map_x = (int)( (mapPos >> 12) & 7 ); // mod 8
+			const int mask = 1 << map_x;
+			if ( (cacheMap[map_y] & mask) == 0 ) return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * Load all images in the given path recursively,
+ * consider *base the base path that is to be cut off.
+ * Entries with names longer than SUBDIR_LEN bytes and entries that cannot
+ * be stat()ed are skipped with a warning. Files with a meta data extension
+ * are not loaded. Scanning stops early when _shutdown is set.
+ *
+ * @param base base directory of all images
+ * @param path directory to scan, has to be an absolute path
+ * @return false if path could not be opened as a directory, true otherwise
+ *         (failures of individual entries or sub directories are not reported)
+ */
+static bool image_load_all_internal(char *base, char *path)
+{
+#define SUBDIR_LEN 150
+	assert( path != NULL );
+	assert( *path == '/' );
+	struct dirent *entry;
+	const size_t pathLen = strlen( path );
+	char subpath[PATHLEN];
+	struct stat st;
+	DIR * const dir = opendir( path );
+
+	if ( dir == NULL ) {
+		logadd( LOG_ERROR, "Could not opendir '%s' for loading", path );
+		return false;
+	}
+
+	while ( !_shutdown && (entry = readdir( dir )) != NULL ) {
+		// Use the entry in place; copying a struct dirent by value is not guaranteed
+		// to carry over the complete d_name. The name is no longer needed once
+		// subpath has been built, i.e. before we recurse or load anything.
+		const char * const entryName = entry->d_name;
+		if ( strcmp( entryName, "." ) == 0 || strcmp( entryName, ".." ) == 0 ) continue;
+		if ( strlen( entryName ) > SUBDIR_LEN ) {
+			logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entryName, (int)SUBDIR_LEN );
+			continue;
+		}
+		// Avoid a double slash when joining directory and entry name
+		if ( entryName[0] == '/' || path[pathLen - 1] == '/' ) {
+			snprintf( subpath, PATHLEN, "%s%s", path, entryName );
+		} else {
+			snprintf( subpath, PATHLEN, "%s/%s", path, entryName );
+		}
+		if ( stat( subpath, &st ) < 0 ) {
+			logadd( LOG_WARNING, "stat() for '%s' failed. Ignoring....", subpath );
+			continue;
+		}
+		if ( S_ISDIR( st.st_mode ) ) {
+			image_load_all_internal( base, subpath ); // Recurse
+		} else if ( !isForbiddenExtension( subpath ) ) {
+			image_load( base, subpath, true ); // Load image if possible
+		}
+	}
+	closedir( dir );
+	return true;
+#undef SUBDIR_LEN
+}
+
+/**
+ * Put the image into the images array and assign it a numeric id that is
+ * unique for this running server instance. An empty slot within the used
+ * part of the array is reused if there is one; otherwise the image is
+ * appended, unless _maxImages entries are in use already.
+ * Locks on: imageListLock
+ * @return true if the image was added, false if the list is full
+ */
+static bool image_addToList(dnbd3_image_t *image)
+{
+	static int imgIdCounter = 0; // Used to assign unique numeric IDs to images
+	bool added = false;
+	spin_lock( &imageListLock );
+	// Now we're locked, assign unique ID to image (unique for this running server instance!)
+	image->id = ++imgIdCounter;
+	// Look for the first free slot in the range currently in use
+	int slot = 0;
+	while ( slot < _num_images && _images[slot] != NULL ) {
+		++slot;
+	}
+	if ( slot < _num_images ) {
+		_images[slot] = image;
+		added = true;
+	} else if ( _num_images < _maxImages ) {
+		// No gap found, append
+		_images[_num_images++] = image;
+		added = true;
+	}
+	spin_unlock( &imageListLock );
+	return added;
+}
+
+/**
+ * Load image from given path. This will check if the image is
+ * already loaded and updates its information in that case.
+ * Note that this is NOT THREAD SAFE so make sure its always
+ * called on one thread only.
+ */
+static bool image_load(char *base, char *path, int withUplink)
+{
+ // Overview:
+ //  1. Derive image name and revision from "path" (relative to "base").
+ //  2. Open the file, determine real and block-aligned (virtual) size.
+ //  3. Load the optional cache map and crc32 list that sit next to the image.
+ //  4. If an image with the same name:rid is already known, either keep it
+ //     (returning true) or remove it so the freshly loaded one replaces it.
+ //  5. Build a new dnbd3_image_t and add it to the image list.
+ // Returns true if the image is (still) loaded afterwards, false otherwise.
+ int revision = -1;
+ struct stat st;
+ uint8_t *cache_map = NULL;
+ uint32_t *crc32list = NULL;
+ dnbd3_image_t *existing = NULL;
+ int fdImage = -1;
+ bool function_return = false; // Return false by default
+ assert( base != NULL );
+ assert( path != NULL );
+ assert( *path == '/' );
+ assert( strncmp( path, base, strlen(base)) == 0 );
+ assert( base[strlen(base) - 1] != '/' );
+ assert( strlen(path) > strlen(base) );
+ char *lastSlash = strrchr( path, '/' );
+ char *fileName = lastSlash + 1;
+ char imgName[strlen( path )];
+ const size_t fileNameLen = strlen( fileName );
+ if ( fileNameLen == 0 ) {
+ // Path ends in '/': there is no file name to parse, and the revision
+ // scan below would otherwise index fileName with (size_t)-1
+ logadd( LOG_WARNING, "Image '%s' has invalid revision ID %d", path, revision );
+ goto load_error;
+ }
+
+ // Copy virtual path (relative path in "base")
+ char * const virtBase = path + strlen( base ) + 1;
+ assert( *virtBase != '/' );
+ char *src = virtBase, *dst = imgName;
+ while ( src <= lastSlash ) {
+ *dst++ = *src++;
+ }
+ *dst = '\0';
+
+ do {
+ // Parse file name for revision
+ // Try to parse *.r<ID> syntax
+ size_t i;
+ for (i = fileNameLen - 1; i > 1; --i) {
+ if ( fileName[i] < '0' || fileName[i] > '9' ) break;
+ }
+ if ( i != fileNameLen - 1 && fileName[i] == 'r' && fileName[i - 1] == '.' ) {
+ revision = atoi( fileName + i + 1 );
+ // Append the file name without the ".r<ID>" suffix to the virtual directory
+ src = fileName;
+ while ( src < fileName + i - 1 ) {
+ *dst++ = *src++;
+ }
+ *dst = '\0';
+ }
+ } while (0);
+
+ // Legacy mode enabled and no rid extracted from filename?
+ if ( _vmdkLegacyMode && revision == -1 ) {
+ fdImage = open( path, O_RDONLY ); // Check if it exists
+ if ( fdImage == -1 ) goto load_error;
+ // Yes, simply append full file name and set rid to 1
+ strcat( dst, fileName );
+ revision = 1;
+ }
+ // Did we get anything?
+ if ( revision <= 0 || revision >= 65536 ) {
+ logadd( LOG_WARNING, "Image '%s' has invalid revision ID %d", path, revision );
+ goto load_error;
+ }
+
+ // Get pointer to already existing image if possible
+ existing = image_get( imgName, (uint16_t)revision, true );
+
+ // ### Now load the actual image related data ###
+ if ( fdImage == -1 ) {
+ fdImage = open( path, O_RDONLY );
+ }
+ if ( fdImage == -1 ) {
+ logadd( LOG_ERROR, "Could not open '%s' for reading...", path );
+ goto load_error;
+ }
+ // Determine file size
+ const off_t seekret = lseek( fdImage, 0, SEEK_END );
+ if ( seekret < 0 ) {
+ logadd( LOG_ERROR, "Could not seek to end of file '%s'", path );
+ goto load_error;
+ } else if ( seekret == 0 ) {
+ logadd( LOG_WARNING, "Empty image file '%s'", path );
+ goto load_error;
+ }
+ const uint64_t realFilesize = (uint64_t)seekret;
+ // Virtual size is the real size rounded up to the next multiple of DNBD3_BLOCK_SIZE
+ const uint64_t virtualFilesize = ( realFilesize + (DNBD3_BLOCK_SIZE - 1) ) & ~(DNBD3_BLOCK_SIZE - 1);
+ if ( realFilesize != virtualFilesize ) {
+ logadd( LOG_DEBUG1, "Image size of '%s' is %" PRIu64 ", virtual size: %" PRIu64, path, realFilesize, virtualFilesize );
+ }
+
+ // 1. Allocate memory for the cache map if the image is incomplete
+ cache_map = image_loadCacheMap( path, virtualFilesize );
+
+ // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented)
+
+ // 2. Load CRC-32 list of image
+ bool doFullCheck = false;
+ uint32_t masterCrc = 0;
+ const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( virtualFilesize );
+ crc32list = image_loadCrcList( path, virtualFilesize, &masterCrc );
+
+ // Check CRC32
+ if ( crc32list != NULL ) {
+ if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) {
+ logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path );
+ doFullCheck = true;
+ }
+ }
+
+ // Compare data just loaded to identical image we apparently already loaded
+ if ( existing != NULL ) {
+ if ( existing->realFilesize != realFilesize ) {
+ logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ // Image will be replaced below
+ } else if ( existing->crc32 != NULL && crc32list != NULL
+ && memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) {
+ logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." );
+ logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" );
+ // Image will be replaced below
+ } else if ( existing->crc32 == NULL && crc32list != NULL ) {
+ logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid );
+ // Hand ownership of the list over to the existing image
+ existing->crc32 = crc32list;
+ existing->masterCrc32 = masterCrc;
+ crc32list = NULL;
+ function_return = true;
+ goto load_error; // Keep existing
+ } else if ( existing->cache_map != NULL && cache_map == NULL ) {
+ // Just ignore that fact, if replication is really complete the cache map will be removed anyways
+ logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
+ function_return = true;
+ goto load_error; // Keep existing
+ } else {
+ // Nothing changed about the existing image, so do nothing
+ logadd( LOG_DEBUG1, "Did not change" );
+ function_return = true;
+ goto load_error; // Keep existing
+ }
+ // Remove existing image from images array, so it will be replaced by the reloaded image
+ existing = image_remove( existing );
+ existing = image_release( existing );
+ }
+
+ // Load fresh image
+ dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) );
+ if ( image == NULL ) {
+ logadd( LOG_ERROR, "Out of memory while loading image '%s'", path );
+ goto load_error; // cache_map and crc32list are still owned by us and get freed below
+ }
+ image->path = strdup( path );
+ image->name = strdup( imgName );
+ if ( image->path == NULL || image->name == NULL ) {
+ logadd( LOG_ERROR, "Out of memory while loading image '%s'", path );
+ // Nothing else has been attached to the struct yet, so tear it down by hand
+ free( image->path );
+ free( image->name );
+ free( image );
+ goto load_error;
+ }
+ image->cache_map = cache_map;
+ image->crc32 = crc32list;
+ image->masterCrc32 = masterCrc;
+ image->uplink = NULL;
+ image->realFilesize = realFilesize;
+ image->virtualFilesize = virtualFilesize;
+ image->rid = (uint16_t)revision;
+ image->users = 0;
+ image->readFd = -1;
+ image->working = (image->cache_map == NULL );
+ timing_get( &image->nextCompletenessEstimate );
+ image->completenessEstimate = -1;
+ spin_init( &image->lock, PTHREAD_PROCESS_PRIVATE );
+ int32_t offset;
+ if ( stat( path, &st ) == 0 ) {
+ // Negatively offset atime by file modification time
+ offset = (int32_t)( st.st_mtime - time( NULL ) );
+ if ( offset > 0 ) offset = 0;
+ } else {
+ offset = 0;
+ }
+ timing_gets( &image->atime, offset );
+
+ // Prevent freeing in cleanup
+ cache_map = NULL;
+ crc32list = NULL;
+
+ // Get rid of cache map if image is complete
+ if ( image->cache_map != NULL ) {
+ image_isComplete( image );
+ }
+
+ // Image is definitely incomplete, initialize uplink worker
+ if ( image->cache_map != NULL ) {
+ image->working = false;
+ if ( withUplink ) {
+ uplink_init( image, -1, NULL, -1 );
+ }
+ }
+
+ // ### Reaching this point means loading succeeded
+ image->readFd = fdImage;
+ if ( image_addToList( image ) ) {
+ // Keep fd for reading
+ fdImage = -1;
+ } else {
+ logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
+ image->readFd = -1; // Keep fdImage instead, will be closed below
+ image = image_free( image );
+ goto load_error;
+ }
+ logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid );
+ // CRC errors found...
+ if ( doFullCheck ) {
+ logadd( LOG_INFO, "Queueing full CRC32 check for '%s:%d'\n", image->name, (int)image->rid );
+ integrity_check( image, -1 );
+ }
+
+ function_return = true;
+
+ // Clean exit:
+load_error: ;
+ // Shared cleanup for success and failure; only releases what is still owned locally
+ if ( existing != NULL ) existing = image_release( existing );
+ if ( crc32list != NULL ) free( crc32list );
+ if ( cache_map != NULL ) free( cache_map );
+ if ( fdImage != -1 ) close( fdImage );
+ return function_return;
+}
+
+/**
+ * Load the cache map file "<imagePath>.map" into a freshly allocated buffer.
+ * imagePath: path of the image file the map belongs to
+ * fileSize: size of the image, used to compute the expected map size
+ * Returns: zero-initialised buffer of IMGSIZE_TO_MAPBYTES( fileSize ) bytes,
+ * filled with as much of the map file as could be read, or NULL if the map
+ * file could not be opened or memory could not be allocated.
+ * The caller owns the returned buffer and has to free() it.
+ */
+static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
+{
+ char mapFile[strlen( imagePath ) + 10 + 1];
+ sprintf( mapFile, "%s.map", imagePath );
+ int fdMap = open( mapFile, O_RDONLY );
+ if ( fdMap < 0 ) return NULL;
+ const int map_size = IMGSIZE_TO_MAPBYTES( fileSize );
+ uint8_t *retval = calloc( 1, map_size );
+ if ( retval == NULL ) {
+ logadd( LOG_WARNING, "Could not allocate %d bytes for cache map of '%s'", (int)map_size, imagePath );
+ close( fdMap );
+ return NULL;
+ }
+ // read() may return early or be interrupted; keep going until the map is
+ // complete, EOF is hit or a real error occurs
+ ssize_t total = 0;
+ while ( total < map_size ) {
+ const ssize_t rd = read( fdMap, retval + total, (size_t)( map_size - total ) );
+ if ( rd == -1 && ( errno == EINTR || errno == EAGAIN ) ) continue;
+ if ( rd <= 0 ) break;
+ total += rd;
+ }
+ if ( map_size != total ) {
+ logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)total, (int)map_size, imagePath );
+ // Could not read complete map, that means the rest of the image file will be considered incomplete
+ }
+ close( fdMap );
+ // Later on we check if the hash map says the image is complete
+ return retval;
+}
+
+/**
+ * Load the crc32 list file "<imagePath>.crc".
+ * File layout as read here: one uint32_t master crc, followed by one
+ * uint32_t per hash block of the image.
+ * imagePath: path of the image file the list belongs to
+ * fileSize: size of the image, determines the number of hash blocks
+ * masterCrc: receives the master crc stored in the file (must not be NULL);
+ * it may be written even if the function ends up returning NULL
+ * Returns: allocated list of IMGSIZE_TO_HASHBLOCKS( fileSize ) entries that the
+ * caller has to free(), or NULL if the file is missing, too short, unreadable,
+ * or the crc32 over the list does not match the stored master crc.
+ */
+static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc)
+{
+ assert( masterCrc != NULL );
+ uint32_t *retval = NULL;
+ const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( fileSize );
+ // Currently this should only prevent accidental corruption (esp. regarding transparent proxy mode)
+ // but maybe later on you want better security
+ char hashFile[strlen( imagePath ) + 10 + 1];
+ sprintf( hashFile, "%s.crc", imagePath );
+ int fdHash = open( hashFile, O_RDONLY );
+ if ( fdHash < 0 ) return NULL;
+ off_t fs = lseek( fdHash, 0, SEEK_END );
+ if ( fs < (hashBlocks + 1) * 4 ) {
+ logadd( LOG_WARNING, "Ignoring crc32 list for '%s' as it is too short", imagePath );
+ } else if ( pread( fdHash, masterCrc, sizeof(uint32_t), 0 ) != sizeof(uint32_t) ) {
+ logadd( LOG_WARNING, "Error reading first crc32 of '%s'", imagePath );
+ } else if ( ( retval = calloc( hashBlocks, sizeof(uint32_t) ) ) == NULL ) {
+ logadd( LOG_WARNING, "Could not allocate memory for crc32 list of '%s'", imagePath );
+ } else {
+ const size_t crcFileLen = hashBlocks * sizeof(uint32_t);
+ // "pos" counts BYTES, so the destination has to be addressed through a byte pointer
+ uint8_t * const rawList = (uint8_t*)retval;
+ size_t pos = 0;
+ while ( pos < crcFileLen ) {
+ ssize_t ret = pread( fdHash, rawList + pos, crcFileLen - pos, pos + sizeof(uint32_t) /* skip master-crc */ );
+ if ( ret == -1 ) {
+ if ( errno == EINTR || errno == EAGAIN ) continue;
+ }
+ if ( ret <= 0 ) break;
+ pos += ret;
+ }
+ if ( pos != crcFileLen ) {
+ free( retval );
+ retval = NULL;
+ logadd( LOG_WARNING, "Could not read crc32 list of '%s'", imagePath );
+ } else {
+ // Verify the list itself against the master crc stored in front of it
+ uint32_t lists_crc = crc32( 0, NULL, 0 );
+ lists_crc = crc32( lists_crc, (uint8_t*)retval, hashBlocks * sizeof(uint32_t) );
+ lists_crc = net_order_32( lists_crc );
+ if ( lists_crc != *masterCrc ) {
+ free( retval );
+ retval = NULL;
+ logadd( LOG_WARNING, "CRC-32 of CRC-32 list mismatch. CRC-32 list of '%s' might be corrupted.", imagePath );
+ }
+ }
+ }
+ close( fdHash );
+ return retval;
+}
+
+static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map)
+{
+ // Quick and sloppy sanity check: verify the first hash block plus a few
+ // randomly chosen ones against the known crc32 list. Only blocks that
+ // image_isHashBlockComplete() reports as complete are considered. Meant to
+ // catch accidental corruption (broken proxy functionality, file system
+ // corruption), nothing more.
+ assert( count > 0 );
+ const int numHashBlocks = IMGSIZE_TO_HASHBLOCKS( realFilesize );
+ int picked[count + 1];
+ int numPicked = 0;
+ // The first block is always a candidate
+ if ( image_isHashBlockComplete( cache_map, 0, realFilesize ) ) {
+ picked[numPicked++] = 0;
+ }
+ // Bounded number of attempts to find further complete, non-duplicate blocks
+ int attemptsLeft = count * 5;
+ while ( numPicked + 1 < count && --attemptsLeft > 0 ) {
+ const int candidate = rand() % numHashBlocks;
+ bool duplicate = false;
+ for ( int k = 0; k < numPicked; ++k ) {
+ if ( picked[k] == candidate ) {
+ duplicate = true;
+ break;
+ }
+ }
+ if ( duplicate ) continue;
+ if ( image_isHashBlockComplete( cache_map, candidate, realFilesize ) ) {
+ picked[numPicked++] = candidate;
+ }
+ }
+ // image_checkBlocksCrc32 expects the list to be terminated by -1
+ picked[MIN(numPicked, count)] = -1;
+ return image_checkBlocksCrc32( fdImage, crc32list, picked, realFilesize );
+}
+
+/**
+ * Create the files for a new, still empty image below _basePath:
+ * the image file "<_basePath>/<image>.r<revision>" and its cache map
+ * "<image file>.map". Missing directories in the image name are created.
+ * The size is rounded up to a multiple of DNBD3_BLOCK_SIZE.
+ * On failure both files are removed again.
+ * Returns true on success, false otherwise
+ */
+bool image_create(char *image, int revision, uint64_t size)
+{
+ assert( image != NULL );
+ assert( size >= DNBD3_BLOCK_SIZE );
+ if ( revision <= 0 ) {
+ logadd( LOG_ERROR, "revision id invalid: %d", revision );
+ return false;
+ }
+ char path[PATHLEN], cache[PATHLEN];
+ char * const dirEnd = strrchr( image, '/' );
+ if ( dirEnd != NULL ) {
+ // Image name has a directory part: cut the name there temporarily
+ // and make sure that directory exists
+ *dirEnd = '\0';
+ snprintf( path, PATHLEN, "%s/%s", _basePath, image );
+ mkdir_p( path );
+ *dirEnd = '/';
+ }
+ snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
+ snprintf( cache, PATHLEN, "%s.map", path );
+ size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ const int mapsize = IMGSIZE_TO_MAPBYTES(size);
+ // Create (or truncate) both files before looking at any errors
+ int fdImage = open( path, O_RDWR | O_TRUNC | O_CREAT, 0644 );
+ int fdCache = open( cache, O_RDWR | O_TRUNC | O_CREAT, 0644 );
+ if ( fdImage < 0 ) {
+ logadd( LOG_ERROR, "Could not open %s for writing.", path );
+ goto create_failed;
+ }
+ if ( fdCache < 0 ) {
+ logadd( LOG_ERROR, "Could not open %s for writing.", cache );
+ goto create_failed;
+ }
+ // Cache map first; failing to size it is only logged, not treated as fatal
+ if ( !file_alloc( fdCache, 0, mapsize ) ) {
+ if ( !file_setSize( fdCache, mapsize ) ) {
+ const int err = errno;
+ logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err );
+ }
+ }
+ // Then the image itself: either just set the size (sparse) or preallocate
+ if ( _sparseFiles ) {
+ if ( !file_setSize( fdImage, size ) ) {
+ logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
+ logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." );
+ goto create_failed;
+ }
+ } else {
+ if ( !file_alloc( fdImage, 0, size ) ) {
+ logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
+ logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk"
+ " space without actually writing all zeroes to the block device." );
+ logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect"
+ " divine performance during replication." );
+ goto create_failed;
+ }
+ }
+ close( fdImage );
+ close( fdCache );
+ return true;
+
+create_failed: ;
+ // Close whatever was opened and do not leave half-created files behind
+ if ( fdImage >= 0 ) close( fdImage );
+ if ( fdCache >= 0 ) close( fdCache );
+ remove( path );
+ remove( cache );
+ return false;
+}
+
+static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, const size_t len);
+static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requestedRid);
+
+/**
+ * Like image_get, but if the image is not known locally, or if revision 0
+ * is requested, the request is handed on to
+ * a) loadImageProxy, if the server is running in proxy mode (_isProxy), or
+ * b) loadImageServer otherwise.
+ * Names that are empty, start with '/' or '.', end with '/', or contain
+ * "/." are rejected and yield NULL.
+ *
+ * If the return value is not NULL,
+ * image_release needs to be called on the image at some point.
+ * Locks on: remoteCloneLock, imageListLock, _images[].lock
+ */
+dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
+{
+ // A specific revision may already be loaded - shortcut
+ if ( revision != 0 ) {
+ dnbd3_image_t * const loaded = image_get( name, revision, true );
+ if ( loaded != NULL ) return loaded;
+ }
+ const size_t nameLen = strlen( name );
+ // Sanity checks on the requested name
+ if ( nameLen == 0 ) return NULL;
+ if ( name[0] == '/' || name[0] == '.' ) return NULL;
+ if ( name[nameLen - 1] == '/' ) return NULL;
+ if ( strstr( name, "/." ) != NULL ) return NULL;
+ // Dispatch depending on whether this is a proxy or not
+ return _isProxy
+ ? loadImageProxy( name, revision, nameLen )
+ : loadImageServer( name, revision );
+}
+
+/**
+ * Called if specific rid is not loaded, or if rid is 0 (some version might be loaded locally,
+ * but we should check if there's a higher rid on a remote server).
+ * name: image name as requested
+ * revision: requested rid, 0 meaning "latest"
+ * len: strlen( name )
+ * Recently checked name:rid pairs are remembered in remoteCloneCache so the
+ * uplink servers are not queried for every request.
+ * Returns the image (caller has to image_release it), or NULL if unavailable.
+ */
+static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, const size_t len)
+{
+ // Already existing locally?
+ dnbd3_image_t *image = NULL;
+ if ( revision == 0 ) {
+ image = image_get( name, revision, true );
+ }
+
+ // Doesn't exist or is rid 0, try remote if not already tried it recently
+ declare_now;
+ char *cmpname = name;
+ int useIndex = -1, fallbackIndex = 0;
+ // Cache entries hold at most NAMELEN - 1 chars, so compare the tail of long names only
+ if ( len >= NAMELEN ) cmpname += 1 + len - NAMELEN;
+ pthread_mutex_lock( &remoteCloneLock );
+ for (int i = 0; i < CACHELEN; ++i) {
+ if ( remoteCloneCache[i].rid == revision && strcmp( cmpname, remoteCloneCache[i].name ) == 0 ) {
+ useIndex = i;
+ if ( timing_reached( &remoteCloneCache[i].deadline, &now ) ) break;
+ pthread_mutex_unlock( &remoteCloneLock ); // Was recently checked...
+ return image;
+ }
+ // Remember the entry with the earliest deadline as replacement candidate
+ if ( timing_1le2( &remoteCloneCache[i].deadline, &remoteCloneCache[fallbackIndex].deadline ) ) {
+ fallbackIndex = i;
+ }
+ }
+ // Re-check to prevent two clients at the same time triggering this,
+ // but only if rid != 0, since we would just get an old rid then
+ if ( revision != 0 ) {
+ if ( image == NULL ) image = image_get( name, revision, true );
+ if ( image != NULL ) {
+ pthread_mutex_unlock( &remoteCloneLock );
+ return image;
+ }
+ }
+ // Reaching this point means we should contact an authority server
+ serialized_buffer_t serialized;
+ // Mark as recently checked
+ if ( useIndex == -1 ) {
+ useIndex = fallbackIndex;
+ }
+ timing_set( &remoteCloneCache[useIndex].deadline, &now, SERVER_REMOTE_IMAGE_CHECK_CACHETIME );
+ snprintf( remoteCloneCache[useIndex].name, NAMELEN, "%s", cmpname );
+ remoteCloneCache[useIndex].rid = revision;
+ pthread_mutex_unlock( &remoteCloneLock );
+
+ // Get some alt servers and try to get the image from there
+#define REP_NUM_SRV (8)
+ dnbd3_host_t servers[REP_NUM_SRV];
+ int uplinkSock = -1;
+ dnbd3_host_t uplinkServer;
+ const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false );
+ uint16_t remoteProtocolVersion;
+ uint16_t remoteRid = revision;
+ // Rid to look up once we are done. Only changed when a clone actually
+ // succeeded; remoteRid is overwritten by every reply, including replies
+ // we reject, and must not decide which local image gets returned.
+ uint16_t acceptedRid = revision;
+ uint64_t remoteImageSize;
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ poll_list_t *cons = sock_newPollList();
+ logadd( LOG_DEBUG2, "Trying to clone %s:%d from %d hosts", name, (int)revision, count );
+ for (int i = 0; i < count + 5; ++i) { // "i < count + 5" for 5 additional iterations, waiting on pending connects
+ char *remoteName;
+ bool ok = false;
+ int sock;
+ if ( i >= count ) {
+ sock = sock_multiConnect( cons, NULL, 100, 1000 );
+ if ( sock == -2 ) break;
+ } else {
+ if ( log_hasMask( LOG_DEBUG2 ) ) {
+ char host[50];
+ size_t hostLen = sock_printHost( &servers[i], host, sizeof(host) );
+ host[hostLen] = '\0';
+ logadd( LOG_DEBUG2, "Trying to replicate from %s", host );
+ }
+ sock = sock_multiConnect( cons, &servers[i], 100, 1000 );
+ }
+ if ( sock == -1 || sock == -2 ) continue;
+ salen = sizeof(sa);
+ if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
+ logadd( LOG_MINOR, "getpeername on successful connection failed!? (errno=%d)", errno );
+ goto server_fail;
+ }
+ if ( !dnbd3_select_image( sock, name, revision, SI_SERVER_FLAGS ) ) goto server_fail;
+ if ( !dnbd3_select_image_reply( &serialized, sock, &remoteProtocolVersion, &remoteName, &remoteRid, &remoteImageSize ) ) goto server_fail;
+ if ( remoteProtocolVersion < MIN_SUPPORTED_SERVER || remoteRid == 0 ) goto server_fail;
+ if ( revision != 0 && remoteRid != revision ) goto server_fail; // Want specific revision but uplink supplied different rid
+ if ( revision == 0 && image != NULL && image->rid >= remoteRid ) goto server_fail; // Not actually a failure: Highest remote rid is <= highest local rid - don't clone!
+ if ( remoteImageSize < DNBD3_BLOCK_SIZE || remoteName == NULL || strcmp( name, remoteName ) != 0 ) goto server_fail;
+ if ( remoteImageSize > _maxReplicationSize ) {
+ logadd( LOG_MINOR, "Won't proxy '%s:%d': Larger than maxReplicationSize", name, (int)revision );
+ goto server_fail;
+ }
+ pthread_mutex_lock( &reloadLock );
+ // Ensure disk space entirely if not using sparse files, otherwise just make sure we have some room at least
+ if ( _sparseFiles ) {
+ ok = image_ensureDiskSpace( 2ull * 1024 * 1024 * 1024, false ); // 2GiB, maybe configurable one day
+ } else {
+ ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
+ }
+ ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ pthread_mutex_unlock( &reloadLock );
+ if ( !ok ) goto server_fail;
+
+ // Cloning worked :-)
+ uplinkSock = sock;
+ acceptedRid = remoteRid;
+ if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &uplinkServer ) ) {
+ uplinkServer.type = 0;
+ }
+ break;
+
+server_fail: ;
+ close( sock );
+ }
+ sock_destroyPollList( cons );
+
+ // If we still have a pointer to a local image, release the reference
+ if ( image != NULL ) image_release( image );
+ // If everything worked out, this call should now actually return the image.
+ // Without a successful clone this looks up the originally requested rid.
+ image = image_get( name, acceptedRid, false );
+ if ( image != NULL && uplinkSock != -1 ) {
+ // If so, init the uplink and pass it the socket
+ sock_setTimeout( uplinkSock, _uplinkTimeout );
+ if ( !uplink_init( image, uplinkSock, &uplinkServer, remoteProtocolVersion ) ) {
+ close( uplinkSock );
+ } else {
+ // Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
+ int i = 0;
+ while ( !image->working && ++i < 100 )
+ usleep( 2000 );
+ }
+ } else if ( uplinkSock != -1 ) {
+ close( uplinkSock );
+ }
+ return image;
+}
+
+/**
+ * Called if specific rid is not loaded, or if rid is 0, in which case we check on
+ * disk which revision is latest.
+ * name: image name, relative to _basePath
+ * requestedRid: requested revision, 0 meaning "latest"
+ * Returns the image (caller has to image_release it), or NULL if it could not
+ * be found or loaded.
+ */
+static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requestedRid)
+{
+ char imageFile[PATHLEN] = "";
+ uint16_t detectedRid = 0;
+
+ if ( requestedRid != 0 ) {
+ snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)requestedRid );
+ detectedRid = requestedRid;
+ } else {
+ // No specific rid: glob for "<name>.r*" and pick the highest valid revision
+ glob_t g;
+ snprintf( imageFile, PATHLEN, "%s/%s.r*", _basePath, name );
+ const int ret = glob( imageFile, GLOB_NOSORT | GLOB_MARK, NULL, &g );
+ imageFile[0] = '\0';
+ if ( ret == 0 ) {
+ long int best = 0;
+ for ( size_t i = 0; i < g.gl_pathc; ++i ) {
+ const char * const path = g.gl_pathv[i];
+ const char * rev = strrchr( path, 'r' );
+ if ( rev == NULL || rev == path || *(rev - 1) != '.' ) continue;
+ rev++;
+ if ( *rev < '0' || *rev > '9' ) continue;
+ char *err = NULL;
+ long int val = strtol( rev, &err, 10 );
+ // Trailing garbage (including the '/' GLOB_MARK appends to directories) disqualifies the entry
+ if ( err == NULL || *err != '\0' ) continue;
+ // Ignore rids that do not fit into 16 bits, so they cannot shadow valid ones
+ if ( val > best && val < 65536 ) {
+ best = val;
+ snprintf( imageFile, PATHLEN, "%s", g.gl_pathv[i] );
+ }
+ }
+ if ( best > 0 && best < 65536 ) {
+ detectedRid = (uint16_t)best;
+ }
+ }
+ globfree( &g );
+ }
+ // Legacy mode: fall back to the plain file name (without .r<ID>) as rid 1
+ if ( _vmdkLegacyMode && requestedRid <= 1
+ && !isForbiddenExtension( name )
+ && ( detectedRid == 0 || !file_isReadable( imageFile ) ) ) {
+ snprintf( imageFile, PATHLEN, "%s/%s", _basePath, name );
+ detectedRid = 1;
+ }
+ logadd( LOG_DEBUG2, "Trying to load %s:%d ( -> %d) as %s", name, (int)requestedRid, (int)detectedRid, imageFile );
+ // No file was determined, or it doesn't seem to exist/be readable
+ if ( detectedRid == 0 ) {
+ logadd( LOG_DEBUG2, "Not found, bailing out" );
+ return image_get( name, requestedRid, true );
+ }
+ if ( !_vmdkLegacyMode && requestedRid == 0 ) {
+ // rid 0 requested - check if detected rid is readable, decrease rid if not until we reach 0
+ while ( detectedRid != 0 ) {
+ dnbd3_image_t *image = image_get( name, detectedRid, true );
+ if ( image != NULL ) {
+ // globbed rid already loaded, return
+ return image;
+ }
+ if ( file_isReadable( imageFile ) ) {
+ // globbed rid is readable on disk, go and load it
+ break;
+ }
+ logadd( LOG_DEBUG2, "%s: rid %d globbed but not readable, trying lower rid...", name, (int)detectedRid );
+ detectedRid--;
+ // Build the path for the rid we try next (requestedRid is always 0 in this branch)
+ snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)detectedRid );
+ }
+ if ( detectedRid == 0 ) {
+ // Ran out of candidates; there is no ".r0" image to load
+ logadd( LOG_DEBUG2, "Not found, bailing out" );
+ return image_get( name, requestedRid, true );
+ }
+ }
+
+ // Now lock on the loading mutex, then check again if the image exists (we're multi-threaded)
+ pthread_mutex_lock( &reloadLock );
+ dnbd3_image_t* image = image_get( name, detectedRid, true );
+ if ( image != NULL ) {
+ // The image magically appeared in the meantime
+ logadd( LOG_DEBUG2, "Magically appeared" );
+ pthread_mutex_unlock( &reloadLock );
+ return image;
+ }
+ // Still not loaded, let's try to do so
+ logadd( LOG_DEBUG2, "Calling load" );
+ image_load( _basePath, imageFile, false );
+ pthread_mutex_unlock( &reloadLock );
+ // If loading succeeded, this will return the image
+ logadd( LOG_DEBUG2, "Calling get" );
+ return image_get( name, requestedRid, true );
+}
+
+/**
+ * Prepare a cloned image:
+ * 1. Allocate empty image file and its cache map
+ * 2. Use passed socket to request the crc32 list and save it to disk
+ * 3. Load the image from disk
+ * sock: connection to the server the image is cloned from
+ * name, revision: identify the image, the files are created below _basePath
+ * imageSize: size of the image as reported by the remote server
+ * A missing or corrupted crc32 list is not fatal; the image is then loaded
+ * without one.
+ * Returns: true on success, false otherwise
+ */
+static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize)
+{
+ // Allocate disk space and create cache map
+ if ( !image_create( name, revision, imageSize ) ) return false;
+ // CRC32
+ const size_t len = strlen( _basePath ) + strlen( name ) + 20;
+ char crcFile[len];
+ snprintf( crcFile, len, "%s/%s.r%d.crc", _basePath, name, (int)revision );
+ if ( !file_isReadable( crcFile ) ) {
+ // Get crc32list from remote server
+ size_t crc32len = IMGSIZE_TO_HASHBLOCKS(imageSize) * sizeof(uint32_t);
+ uint32_t masterCrc;
+ uint8_t *crc32list = malloc( crc32len );
+ if ( crc32list == NULL ) {
+ logadd( LOG_WARNING, "OTF-Clone: Out of memory allocating CRC-32 list. (%s)", name );
+ return false;
+ }
+ if ( !dnbd3_get_crc32( sock, &masterCrc, crc32list, &crc32len ) ) {
+ free( crc32list );
+ return false;
+ }
+ if ( crc32len != 0 ) {
+ // Only store the list if it matches the master crc sent along with it
+ uint32_t lists_crc = crc32( 0, NULL, 0 );
+ lists_crc = crc32( lists_crc, (uint8_t*)crc32list, crc32len );
+ lists_crc = net_order_32( lists_crc );
+ if ( lists_crc != masterCrc ) {
+ logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name );
+ } else {
+ int fd = open( crcFile, O_WRONLY | O_CREAT | O_TRUNC, 0644 );
+ if ( fd == -1 ) {
+ logadd( LOG_WARNING, "OTF-Clone: Could not create CRC-32 file (errno=%d). (%s)", errno, name );
+ } else {
+ const bool written = write( fd, &masterCrc, sizeof(uint32_t) ) == (ssize_t)sizeof(uint32_t)
+ && write( fd, crc32list, crc32len ) == (ssize_t)crc32len;
+ close( fd );
+ if ( !written ) {
+ // Better no crc file at all than a truncated one
+ logadd( LOG_WARNING, "OTF-Clone: Could not write CRC-32 file. (%s)", name );
+ remove( crcFile );
+ }
+ }
+ }
+ }
+ free( crc32list );
+ }
+ // HACK: Chop of ".crc" to get the image file name
+ crcFile[strlen( crcFile ) - 4] = '\0';
+ return image_load( _basePath, crcFile, false );
+}
+
+/**
+ * Generate the crc32 block list file for the given file.
+ * This function wants a plain file name instead of a dnbd3_image_t,
+ * as it can be used directly from the command line.
+ * The result is written to "<image>.crc": the crc32 over the whole list
+ * first, followed by one crc32 per hash block of the image.
+ * Refuses to run if the crc file already exists. A partially written crc
+ * file is removed again on failure.
+ * Returns true on success, false otherwise.
+ */
+bool image_generateCrcFile(char *image)
+{
+ int fdCrc = -1;
+ bool crcCreated = false; // Whether this call created the crc file (and may remove it again)
+ uint32_t crc;
+ char crcFile[strlen( image ) + 4 + 1];
+ int fdImage = open( image, O_RDONLY );
+
+ if ( fdImage == -1 ) {
+ logadd( LOG_ERROR, "Could not open %s.", image );
+ return false;
+ }
+
+ const int64_t fileLen = lseek( fdImage, 0, SEEK_END );
+ if ( fileLen <= 0 ) {
+ logadd( LOG_ERROR, "Error seeking to end, or file is empty." );
+ goto cleanup_fail;
+ }
+
+ struct stat sst;
+ sprintf( crcFile, "%s.crc", image );
+ if ( stat( crcFile, &sst ) == 0 ) {
+ logadd( LOG_ERROR, "CRC File for %s already exists! Delete it first if you want to regen.", image );
+ goto cleanup_fail;
+ }
+
+ // O_EXCL: never take over (and later delete) a file somebody else created in the meantime
+ fdCrc = open( crcFile, O_RDWR | O_CREAT | O_EXCL, 0644 );
+ if ( fdCrc == -1 ) {
+ logadd( LOG_ERROR, "Could not open CRC File %s for writing..", crcFile );
+ goto cleanup_fail;
+ }
+ crcCreated = true;
+ // CRC of all CRCs goes first. Don't know it yet, write 4 bytes dummy data.
+ if ( write( fdCrc, crcFile, sizeof(crc) ) != sizeof(crc) ) {
+ logadd( LOG_ERROR, "Write error" );
+ goto cleanup_fail;
+ }
+
+ printf( "Generating CRC32" );
+ fflush( stdout );
+ const int blockCount = IMGSIZE_TO_HASHBLOCKS( fileLen );
+ for ( int i = 0; i < blockCount; ++i ) {
+ if ( !image_calcBlockCrc32( fdImage, i, fileLen, &crc ) ) {
+ goto cleanup_fail;
+ }
+ if ( write( fdCrc, &crc, sizeof(crc) ) != sizeof(crc) ) {
+ printf( "\nWrite error writing crc file: %d\n", errno );
+ goto cleanup_fail;
+ }
+ putchar( '.' );
+ fflush( stdout );
+ }
+ close( fdImage );
+ fdImage = -1;
+ printf( "done!\n" );
+
+ logadd( LOG_INFO, "Generating master-crc..." );
+ fflush( stdout );
+ // File is written - read again to calc master crc
+ if ( lseek( fdCrc, 4, SEEK_SET ) != 4 ) {
+ logadd( LOG_ERROR, "Could not seek to beginning of crc list in file" );
+ goto cleanup_fail;
+ }
+ char buffer[400];
+ int blocksToGo = blockCount;
+ crc = crc32( 0, NULL, 0 );
+ while ( blocksToGo > 0 ) {
+ // Feed the list through crc32 in chunks of up to sizeof(buffer) bytes
+ const int numBlocks = MIN( (int)( sizeof(buffer) / sizeof(crc) ), blocksToGo );
+ if ( read( fdCrc, buffer, numBlocks * sizeof(crc) ) != numBlocks * (int)sizeof(crc) ) {
+ logadd( LOG_ERROR, "Could not re-read from crc32 file" );
+ goto cleanup_fail;
+ }
+ crc = crc32( crc, (uint8_t*)buffer, numBlocks * sizeof(crc) );
+ blocksToGo -= numBlocks;
+ }
+ crc = net_order_32( crc );
+ // Replace the dummy bytes at the start of the file with the real master crc
+ if ( pwrite( fdCrc, &crc, sizeof(crc), 0 ) != sizeof(crc) ) {
+ logadd( LOG_ERROR, "Could not write master crc to file" );
+ goto cleanup_fail;
+ }
+ close( fdCrc ); // Was leaked on the success path before
+ logadd( LOG_INFO, "CRC-32 file successfully generated." );
+ fflush( stdout );
+ return true;
+
+cleanup_fail:;
+ if ( fdImage != -1 ) close( fdImage );
+ if ( fdCrc != -1 ) close( fdCrc );
+ // An incomplete crc file is useless and would block the next attempt
+ if ( crcCreated ) remove( crcFile );
+ return false;
+}
+
+/**
+ * Build a JSON array with one object per entry in _images.
+ * Each object has the keys id, name, rid, users, complete, idle and size;
+ * bytesReceived and uplinkServer are only added when they are non-zero /
+ * non-empty.
+ * Takes imageListLock and each image's lock while collecting the values.
+ * Returns the array created by json_array().
+ */
+json_t* image_getListAsJson()
+{
+ json_t *imagesJson = json_array();
+ json_t *jsonImage;
+ int i;
+ char uplinkName[100] = { 0 };
+ uint64_t bytesReceived;
+ int users, completeness, idleTime;
+ declare_now;
+
+ spin_lock( &imageListLock );
+ for ( i = 0; i < _num_images; ++i ) {
+ if ( _images[i] == NULL ) continue;
+ dnbd3_image_t *image = _images[i];
+ // Take the image's own lock before letting go of the list lock
+ spin_lock( &image->lock );
+ spin_unlock( &imageListLock );
+ // Snapshot everything that is read under the image lock
+ users = image->users;
+ idleTime = (int)timing_diff( &image->atime, &now );
+ completeness = image_getCompletenessEstimate( image );
+ if ( image->uplink == NULL ) {
+ bytesReceived = 0;
+ uplinkName[0] = '\0';
+ } else {
+ bytesReceived = image->uplink->bytesReceived;
+ // No uplink name if there is no connection (fd == -1) or the host cannot be formatted
+ if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) {
+ uplinkName[0] = '\0';
+ }
+ }
+ image->users++; // Prevent freeing after we unlock
+ spin_unlock( &image->lock );
+
+ // No lock is held from here until the list lock is re-acquired below
+ jsonImage = json_pack( "{sisssisisisisI}",
+ "id", image->id, // id, name, rid never change, so access them without locking
+ "name", image->name,
+ "rid", (int) image->rid,
+ "users", users,
+ "complete", completeness,
+ "idle", idleTime,
+ "size", (json_int_t)image->virtualFilesize );
+ if ( bytesReceived != 0 ) {
+ json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) );
+ }
+ if ( uplinkName[0] != '\0' ) {
+ json_object_set_new( jsonImage, "uplinkServer", json_string( uplinkName ) );
+ }
+ json_array_append_new( imagesJson, jsonImage );
+
+ image = image_release( image ); // Since we did image->users++;
+ // Re-acquire the list lock before the loop condition reads _num_images / _images again
+ spin_lock( &imageListLock );
+ }
+ spin_unlock( &imageListLock );
+ return imagesJson;
+}
+
+/**
+ * Get completeness of an image in percent. Only estimated, not exact.
+ * Every cache map byte counts as 100% if it is 0xff, as 50% if it is
+ * anything else but 0, and as 0% otherwise; the result is the average.
+ * The value is cached in the image and only recalculated once
+ * nextCompletenessEstimate has been reached.
+ * Without a cache map, the result is 100 if the image is working, 0 otherwise.
+ * Returns: 0-100 (or the cached completenessEstimate value)
+ * DOES NOT LOCK, so make sure to do so before calling
+ */
+int image_getCompletenessEstimate(dnbd3_image_t * const image)
+{
+ assert( image != NULL );
+ if ( image->cache_map == NULL ) return image->working ? 100 : 0;
+ declare_now;
+ if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) {
+ // Since this operation is relatively expensive, we cache the result for a while
+ return image->completenessEstimate;
+ }
+ int i;
+ // 64 bit accumulator: up to 100 is added per map byte, which overflows
+ // an int once the cache map has more than about 21 million bytes
+ uint64_t percent = 0;
+ const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( len == 0 ) return 0;
+ for ( i = 0; i < len; ++i ) {
+ if ( image->cache_map[i] == 0xff ) {
+ percent += 100;
+ } else if ( image->cache_map[i] != 0 ) {
+ percent += 50;
+ }
+ }
+ image->completenessEstimate = (int)( percent / (uint64_t)len );
+ // Recalculate after a randomized interval (8 + rand() % 32)
+ timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 );
+ return image->completenessEstimate;
+}
+
+/**
+ * Verify the CRC-32 of a list of hash blocks against crc32list.
+ * "blocks" is a variable length array of block indexes.
+ * !! The last entry has to be -1 so the function knows where to stop !!
+ * No bounds checking is done on the block indexes.
+ * Returns true if all listed blocks match, false on the first
+ * mismatch or read error.
+ */
+bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks, const uint64_t realFilesize)
+{
+ for ( const int *cur = blocks; *cur != -1; ++cur ) {
+ uint32_t actual;
+ if ( !image_calcBlockCrc32( fd, *cur, realFilesize, &actual ) ) {
+ return false;
+ }
+ const uint32_t expected = crc32list[*cur];
+ if ( actual != expected ) {
+ logadd( LOG_WARNING, "Block %d is %x, should be %x", *cur, actual, expected );
+ return false;
+ }
+ }
+ return true;
+}
+
+/**
+ * Calculate the CRC-32 of one hash block of the file behind fd.
+ * If the file size is not a multiple of DNBD3_BLOCK_SIZE, the last block is
+ * treated as if it were padded with zero bytes up to the next multiple.
+ * The result is passed through net_order_32 before it is stored in *crc.
+ * Returns true on success, false if reading from the file failed.
+ */
+static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc)
+{
+ // Read buffer, aligned to DNBD3_BLOCK_SIZE in case fd has O_DIRECT set
+#define BSIZE 262144
+ char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE];
+ char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) );
+ // Number of bytes of this hash block that really exist in the file
+ const uint64_t bytesFromFile = MIN( HASH_BLOCK_SIZE, realFilesize - ( block * HASH_BLOCK_SIZE) );
+ // Number of bytes of this hash block if the file size were rounded up to a
+ // multiple of DNBD3_BLOCK_SIZE; equal to the above if it already is one
+ const uint64_t vbs = ( ( realFilesize + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) ) - ( block * HASH_BLOCK_SIZE );
+ const uint64_t virtualBytesFromFile = MIN( HASH_BLOCK_SIZE, vbs );
+ const off_t readPos = (int64_t)block * HASH_BLOCK_SIZE;
+ assert( vbs >= bytesFromFile );
+ *crc = crc32( 0, NULL, 0 );
+ // First part: data that is actually stored in the file
+ for ( size_t done = 0; done < bytesFromFile; ) {
+ const size_t chunk = (size_t)MIN( BSIZE, bytesFromFile - done );
+ const ssize_t got = pread( fd, buffer, chunk, readPos + done );
+ if ( got <= 0 ) {
+ logadd( LOG_WARNING, "CRC: Read error (errno=%d)", errno );
+ return false;
+ }
+ *crc = crc32( *crc, (uint8_t*)buffer, got );
+ done += (size_t)got;
+ }
+ // Second part: zero padding up to the virtual size, if any
+ if ( virtualBytesFromFile > bytesFromFile ) {
+ memset( buffer, 0, BSIZE );
+ for ( size_t padding = (size_t)( virtualBytesFromFile - bytesFromFile ); padding != 0; ) {
+ const size_t len = MIN( BSIZE, padding );
+ *crc = crc32( *crc, (uint8_t*)buffer, len );
+ padding -= len;
+ }
+ }
+ *crc = net_order_32( *crc );
+ return true;
+#undef BSIZE
+}
+
+/**
+ * Wrapper around image_ensureDiskSpace (below) that acquires
+ * reloadLock for the duration of the call.
+ * Returns whatever image_ensureDiskSpace returns.
+ */
+bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
+{
+ pthread_mutex_lock( &reloadLock );
+ const bool enough = image_ensureDiskSpace( size, force );
+ pthread_mutex_unlock( &reloadLock );
+ return enough;
+}
+
+/**
+ * Make sure more than size bytes are available in _basePath.
+ * Will delete old images to make room for new ones; at most 20
+ * attempts are made per call.
+ * TODO: Store last access time of images. Currently the
+ * last access time is reset to the file modification time
+ * on server restart. Thus it will
+ * currently only delete images if server uptime is > 10 hours.
+ * This can be overridden by setting force to true, in case
+ * free space is desperately needed.
+ * Unless _sparseFiles is set, an image is only deleted if it has
+ * not been in use for 24 hours.
+ * Return true iff enough space is available. false in random other cases.
+ * Note that true is also returned if the free space cannot be determined.
+ */
+static bool image_ensureDiskSpace(uint64_t size, bool force)
+{
+	for ( int maxtries = 0; maxtries < 20; ++maxtries ) {
+		uint64_t available;
+		if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
+			const int e = errno;
+			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", e );
+			return true;
+		}
+		if ( available > size ) return true;
+		if ( !force && dnbd3_serverUptime() < 10 * 3600 ) {
+			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)),
+					(int)(size / (1024 * 1024)) );
+			return false;
+		}
+		logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
+				(int)(size / (1024 * 1024)) );
+		// Find least recently used image
+		dnbd3_image_t *oldest = NULL;
+		int i; // XXX improve locking
+		for (i = 0; i < _num_images; ++i) {
+			if ( _images[i] == NULL ) continue;
+			dnbd3_image_t *current = image_lock( _images[i] );
+			if ( current == NULL ) continue;
+			if ( current->users == 1 ) { // Just from the lock above
+				if ( oldest == NULL || timing_1le2( &current->atime, &oldest->atime ) ) {
+					// Oldest access time so far
+					oldest = current;
+				}
+			}
+			current = image_release( current );
+		}
+		declare_now;
+		if ( oldest == NULL || ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) ) {
+			if ( oldest == NULL ) {
+				logadd( LOG_INFO, "All images are currently in use :-(" );
+			} else {
+				logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" );
+			}
+			return false;
+		}
+		oldest = image_lock( oldest );
+		if ( oldest == NULL ) continue; // Image freed in the meantime? Try again
+		logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid );
+		// Keep a private copy of the path, it is needed after the image has been removed
+		char *filename = strdup( oldest->path );
+		if ( filename == NULL ) {
+			// Without the path the files cannot be deleted; leave the image alone
+			logadd( LOG_WARNING, "Out of memory while trying to free '%s:%d'", oldest->name, (int)oldest->rid );
+			oldest = image_release( oldest );
+			return false;
+		}
+		oldest = image_remove( oldest );
+		oldest = image_release( oldest );
+		unlink( filename );
+		// Room for the longest suffix (".meta") plus terminator
+		size_t len = strlen( filename ) + 10;
+		char buffer[len];
+		snprintf( buffer, len, "%s.map", filename );
+		unlink( buffer );
+		snprintf( buffer, len, "%s.crc", filename );
+		unlink( buffer );
+		snprintf( buffer, len, "%s.meta", filename );
+		unlink( buffer );
+		free( filename );
+	}
+	return false;
+}
+
+/**
+ * Walk the image list and close the read fd of every image that has
+ * no users, no uplink, and whose atime passes the timing_reached()
+ * check against a deadline derived from UNUSED_FD_TIMEOUT.
+ * imageListLock is dropped while a single image is being handled and
+ * re-acquired before moving on to the next list entry.
+ */
+void image_closeUnusedFd()
+{
+	char imgstr[300];
+	ticks deadline;
+	timing_gets( &deadline, -UNUSED_FD_TIMEOUT );
+	spin_lock( &imageListLock );
+	for (int idx = 0; idx < _num_images; ++idx) {
+		dnbd3_image_t * const image = _images[idx];
+		if ( image != NULL ) {
+			int staleFd = -1;
+			spin_lock( &image->lock );
+			spin_unlock( &imageListLock );
+			if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) {
+				// Remember name and fd while holding the lock; the close happens after unlocking
+				snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid );
+				staleFd = image->readFd;
+				image->readFd = -1;
+			}
+			spin_unlock( &image->lock );
+			if ( staleFd != -1 ) {
+				close( staleFd );
+				logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr );
+			}
+			spin_lock( &imageListLock );
+		}
+	}
+	spin_unlock( &imageListLock );
+}
+
+/*
+ void image_find_latest()
+ {
+ // Not in array or most recent rid is requested, try file system
+ if (revision != 0) {
+ // Easy case - specific RID
+ char
+ } else {
+ // Determine base directory where the image in question has to reside.
+ // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04"
+ // Then searchPath has to be set to "/srv/rz/ubuntu"
+ char searchPath[strlen(_basePath) + len + 1];
+ char *lastSlash = strrchr(name, '/');
+ char *baseName; // Name of the image. In the example above, it will be "default-13.04"
+ if ( lastSlash == NULL ) {
+ *searchPath = '\0';
+ baseName = name;
+ } else {
+ char *from = name, *to = searchPath;
+ while (from < lastSlash) *to++ = *from++;
+ *to = '\0';
+ baseName = lastSlash + 1;
+ }
+ // Now we have the search path in our real file system and the expected image name.
+ // The revision naming scheme is <IMAGENAME>.r<RID>, so if we're looking for revision 13,
+ // our example image has to be named default-13.04.r13
+ }
+ }
+ */
diff --git a/src/server/image.h b/src/server/image.h
new file mode 100644
index 0000000..4668eff
--- /dev/null
+++ b/src/server/image.h
@@ -0,0 +1,63 @@
+#ifndef _IMAGE_H_
+#define _IMAGE_H_
+
+#include "globals.h"
+
+struct json_t;
+
+void image_serverStartup();
+
+bool image_isComplete(dnbd3_image_t *image);
+
+// NOTE(review): integrity.c uses this to decide whether a hash block can be
+// checked ("skip incomplete blocks") - confirm exact semantics in image.c
+bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
+
+void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set);
+
+void image_markComplete(dnbd3_image_t *image);
+
+bool image_ensureOpen(dnbd3_image_t *image);
+
+dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
+
+bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
+
+dnbd3_image_t* image_getOrLoad(char *name, uint16_t revision);
+
+// NOTE(review): callers treat a NULL return value as "image not available
+// (anymore)" and pair every successful call with image_release() - confirm
+dnbd3_image_t* image_lock(dnbd3_image_t *image);
+
+// NOTE(review): callers assign the return value back to their image pointer;
+// presumably it is always NULL - confirm
+dnbd3_image_t* image_release(dnbd3_image_t *image);
+
+// NOTE(review): integrity.c passes blocks as an int array terminated by -1 -
+// confirm that this is the required format
+bool image_checkBlocksCrc32(int fd, uint32_t *crc32list, const int *blocks, const uint64_t fileSize);
+
+void image_killUplinks();
+
+bool image_loadAll(char *path);
+
+bool image_tryFreeAll();
+
+bool image_create(char *image, int revision, uint64_t size);
+
+bool image_generateCrcFile(char *image);
+
+struct json_t* image_getListAsJson();
+
+int image_getCompletenessEstimate(dnbd3_image_t * const image);
+
+// Closes the read fd of images that have no users and no uplink and that
+// pass the UNUSED_FD_TIMEOUT based deadline check
+void image_closeUnusedFd();
+
+// Acquires reloadLock, then tries to make sure more than size bytes are free
+// in _basePath, deleting unused images if required. force skips the
+// "server uptime < 10 hours" restriction
+bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
+
+// one byte in the map covers 8 4kib blocks, so 32kib per byte
+// "+ (1 << 15) - 1" is required to account for the last bit of
+// the image that is smaller than 32kib
+// this would be the case whenever the image file size is not a
+// multiple of 32kib (= the number of blocks is not divisible by 8)
+// ie: if the image is 49152 bytes and you do 49152 >> 15 you get 1,
+// but you actually need 2 bytes to have a complete cache map
+// note that the result is cast to int
+#define IMGSIZE_TO_MAPBYTES(bytes) ((int)(((bytes) + (1 << 15) - 1) >> 15))
+
+// calculate number of hash blocks in file. One hash block is 16MiB
+// HASH_BLOCK_SIZE is an int64_t, so arithmetic involving it is done in 64 bit
+#define HASH_BLOCK_SIZE ((int64_t)(1 << 24))
+// rounds up, so a partial hash block at the end of the file is counted too
+#define IMGSIZE_TO_HASHBLOCKS(bytes) ((int)(((bytes) + HASH_BLOCK_SIZE - 1) / HASH_BLOCK_SIZE))
+
+#endif
diff --git a/src/server/ini.c b/src/server/ini.c
new file mode 100644
index 0000000..216543b
--- /dev/null
+++ b/src/server/ini.c
@@ -0,0 +1,164 @@
+/* inih -- simple .INI file parser
+
+ inih is released under the New BSD license (see LICENSE.txt). Go to the project
+ home page for more info:
+
+ http://code.google.com/p/inih/
+
+ */
+
+#include "ini.h"
+
+#include <ctype.h>
+#include <string.h>
+
+#if !INI_USE_STACK
+#include <stdlib.h>
+#endif
+
+#define MAX_SECTION 50
+#define MAX_NAME 50
+
+/* Remove trailing whitespace from s by overwriting it with NUL bytes.
+ The string is modified in place; s itself is returned. */
+static char* rstrip(char* s)
+{
+	size_t len = strlen( s );
+	while ( len > 0 && isspace( (unsigned char)s[len - 1] ) ) {
+		s[--len] = '\0';
+	}
+	return s;
+}
+
+/* Skip leading whitespace: returns a pointer to the first char of s that is
+ not whitespace, or to the terminating NUL if there is none. */
+static char* lskip(const char* s)
+{
+	const char *cursor = s;
+	for ( ; *cursor != '\0'; ++cursor ) {
+		if ( !isspace( (unsigned char)*cursor ) ) break;
+	}
+	return (char*)cursor;
+}
+
+/* Scan s for the first occurrence of c or of a ';' that starts a comment.
+ A ';' only counts as comment start if the char right before it is
+ whitespace. Returns a pointer to the match, or to the terminating NUL
+ of s if neither is found. */
+static char* find_char_or_comment(const char* s, char c)
+{
+	int prevWasSpace = 0;
+	for ( ; *s != '\0'; ++s ) {
+		if ( *s == c ) break;
+		if ( prevWasSpace && *s == ';' ) break;
+		prevWasSpace = isspace( (unsigned char)*s );
+	}
+	return (char*)s;
+}
+
+/* Version of strncpy that ensures dest (size bytes) is null-terminated.
+ At most size - 1 chars of src end up in dest. If size is 0, dest is
+ not touched at all (there is no room for the terminator). Returns dest. */
+static char* strncpy0(char* dest, const char* src, size_t size)
+{
+	/* dest[size - 1] would be a write before the buffer for size == 0 */
+	if ( size == 0 ) return dest;
+	strncpy( dest, src, size );
+	dest[size - 1] = '\0';
+	return dest;
+}
+
+/* See documentation in header file.
+ Reads the file line by line and calls handler for every name/value pair.
+ Returns 0 if everything went fine, the line number of the first line that
+ could not be parsed or for which handler returned zero, or -2 if the line
+ buffer could not be allocated (only when INI_USE_STACK is zero).
+ Parsing continues after an error; only the first error line is reported. */
+int ini_parse_file(FILE* file, int (*handler)(void*, const char*, const char*, const char*), void* user)
+{
+ /* Uses a fair bit of stack (use heap instead if you need to) */
+#if INI_USE_STACK
+ char line[INI_MAX_LINE];
+#else
+ char* line;
+#endif
+ /* Name of the section we are currently in, and the most recently seen
+ name (needed for multi-line values). Both are truncated to fit. */
+ char section[MAX_SECTION] = "";
+ char prev_name[MAX_NAME] = "";
+
+ char* start;
+ char* end;
+ char* name;
+ char* value;
+ int lineno = 0;
+ int error = 0;
+
+#if !INI_USE_STACK
+ line = (char*)malloc( INI_MAX_LINE );
+ if ( !line ) {
+ return -2;
+ }
+#endif
+
+ /* Scan through file line by line */
+ while ( fgets( line, INI_MAX_LINE, file ) != NULL ) {
+ lineno++;
+
+ start = line;
+#if INI_ALLOW_BOM
+ /* Skip UTF-8 byte order mark at the very beginning of the file */
+ if (lineno == 1 && (unsigned char)start[0] == 0xEF &&
+ (unsigned char)start[1] == 0xBB &&
+ (unsigned char)start[2] == 0xBF) {
+ start += 3;
+ }
+#endif
+ /* Strip whitespace on both ends; start > line afterwards means the
+ line had leading whitespace (or a BOM) */
+ start = lskip( rstrip( start ) );
+
+ if ( *start == ';' || *start == '#' ) {
+ /* Per Python ConfigParser, allow '#' comments at start of line */
+ }
+#if INI_ALLOW_MULTILINE
+ else if (*prev_name && *start && start > line) {
+ /* Non-blank line with leading whitespace, treat as continuation
+ of previous name's value (as per Python ConfigParser). */
+ if (!handler(user, section, prev_name, start) && !error)
+ error = lineno;
+ }
+#endif
+ else if ( *start == '[' ) {
+ /* A "[section]" line */
+ end = find_char_or_comment( start + 1, ']' );
+ if ( *end == ']' ) {
+ *end = '\0';
+ strncpy0( section, start + 1, sizeof(section) );
+ /* A new section ends any pending multi-line value */
+ *prev_name = '\0';
+ } else if ( !error ) {
+ /* No ']' found on section line */
+ error = lineno;
+ }
+ } else if ( *start && *start != ';' ) {
+ /* Not a comment, must be a name[=:]value pair */
+ /* '=' is preferred as separator; ':' is only tried if no '=' was found */
+ end = find_char_or_comment( start, '=' );
+ if ( *end != '=' ) {
+ end = find_char_or_comment( start, ':' );
+ }
+ if ( *end == '=' || *end == ':' ) {
+ *end = '\0';
+ name = rstrip( start );
+ value = lskip( end + 1 );
+ /* Cut off an inline comment (whitespace followed by ';'), if any */
+ end = find_char_or_comment( value, '\0' );
+ if ( *end == ';' ) *end = '\0';
+ rstrip( value );
+
+ /* Valid name[=:]value pair found, call handler */
+ strncpy0( prev_name, name, sizeof(prev_name) );
+ if ( !handler( user, section, name, value ) && !error ) error = lineno;
+ } else if ( !error ) {
+ /* No '=' or ':' found on name[=:]value line */
+ error = lineno;
+ }
+ }
+ }
+
+#if !INI_USE_STACK
+ free( line );
+#endif
+
+ return error;
+}
+
+/* See documentation in header file.
+ Opens filename for reading, hands the stream to ini_parse_file() and
+ closes it again. Returns -1 if the file cannot be opened, otherwise
+ whatever ini_parse_file() returned. */
+int ini_parse(const char* filename, int (*handler)(void*, const char*, const char*, const char*), void* user)
+{
+	FILE* const file = fopen( filename, "r" );
+	if ( file == NULL ) {
+		return -1;
+	}
+	const int error = ini_parse_file( file, handler, user );
+	fclose( file );
+	return error;
+}
diff --git a/src/server/ini.h b/src/server/ini.h
new file mode 100644
index 0000000..06f1123
--- /dev/null
+++ b/src/server/ini.h
@@ -0,0 +1,66 @@
+/* inih -- simple .INI file parser
+
+ inih is released under the New BSD license (see LICENSE.txt). Go to the project
+ home page for more info:
+
+ http://code.google.com/p/inih/
+
+ */
+
+#ifndef __INI_H__
+#define __INI_H__
+
+/* Make this header file easier to include in C++ code */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+/* Parse given INI-style file. May have [section]s, name=value pairs
+ (whitespace stripped), and comments starting with ';' (semicolon). Section
+ is "" if name=value pair parsed before any section heading. name:value
+ pairs are also supported as a concession to Python's ConfigParser.
+
+ For each name=value pair parsed, call handler function with given user
+ pointer as well as section, name, and value (data only valid for duration
+ of handler call). Handler should return nonzero on success, zero on error.
+
+ Returns 0 on success, line number of first error on parse error (doesn't
+ stop on first error), -1 on file open error, or -2 on memory allocation
+ error (only when INI_USE_STACK is zero).
+ */
+int ini_parse(const char* filename, int (*handler)(void* user, const char* section, const char* name, const char* value), void* user);
+
+/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't
+ close the file when it's finished -- the caller must do that. */
+int ini_parse_file(FILE* file, int (*handler)(void* user, const char* section, const char* name, const char* value), void* user);
+
+/* The settings below can be overridden by defining the respective macro
+ before this header is included; the values given here are the defaults. */
+
+/* Nonzero to allow multi-line value parsing, in the style of Python's
+ ConfigParser. If allowed, ini_parse() will call the handler with the same
+ name for each subsequent line parsed. */
+#ifndef INI_ALLOW_MULTILINE
+#define INI_ALLOW_MULTILINE 1
+#endif
+
+/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of
+ the file. See http://code.google.com/p/inih/issues/detail?id=21 */
+#ifndef INI_ALLOW_BOM
+#define INI_ALLOW_BOM 1
+#endif
+
+/* Nonzero to use stack, zero to use heap (malloc/free). */
+#ifndef INI_USE_STACK
+#define INI_USE_STACK 1
+#endif
+
+/* Maximum line length for any line in INI file.
+ This is the size of the buffer handed to fgets(), so it has to hold the
+ line break and the terminating NUL too. fgets() delivers a longer line in
+ several pieces, and each piece is then parsed as if it were a line of
+ its own. */
+#ifndef INI_MAX_LINE
+#define INI_MAX_LINE 200
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INI_H__ */
diff --git a/src/server/integrity.c b/src/server/integrity.c
new file mode 100644
index 0000000..88b7487
--- /dev/null
+++ b/src/server/integrity.c
@@ -0,0 +1,274 @@
+#include "integrity.h"
+
+#include "helper.h"
+#include "locks.h"
+#include "image.h"
+#include "uplink.h"
+
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+// Number of slots in checkQueue; requests are discarded when all are taken
+#define CHECK_QUEUE_SIZE 200
+
+// Special value for queue_entry.count: request covers the whole image
+// (all hash blocks starting at .block) instead of a fixed number of blocks
+#define CHECK_ALL (0x7fffffff)
+
+// One pending check request. A slot with image == NULL is free
+typedef struct
+{
+ dnbd3_image_t *image; // Image to check
+ int block; // Block to check
+ int count; // How many blocks to check starting at .block
+} queue_entry;
+
+// The checker thread, started by integrity_init()
+static pthread_t thread;
+// Pending requests; only the first queueLen slots are considered in use
+static queue_entry checkQueue[CHECK_QUEUE_SIZE];
+// Protects checkQueue and queueLen
+static pthread_mutex_t integrityQueueLock;
+// Signalled when a request is added and on shutdown
+static pthread_cond_t queueSignal;
+// -1 until integrity_init() has been called
+static int queueLen = -1;
+// true from integrity_init() until the checker thread exits (or failed to start)
+static volatile bool bRunning = false;
+
+static void* integrity_main(void *data);
+
+/**
+ * Initialize the integrity check thread: set up the queue lock and
+ * condition variable, mark the queue as empty and start integrity_main
+ * in its own thread. Must only be called once (asserted via queueLen).
+ * If the thread cannot be started, a warning is logged and all later
+ * check requests are ignored.
+ */
+void integrity_init()
+{
+	assert( queueLen == -1 );
+	pthread_mutex_init( &integrityQueueLock, NULL );
+	pthread_cond_init( &queueSignal, NULL );
+	pthread_mutex_lock( &integrityQueueLock );
+	queueLen = 0;
+	pthread_mutex_unlock( &integrityQueueLock );
+	// Set before the thread starts, it clears the flag itself when it exits
+	bRunning = true;
+	const bool started = ( thread_create( &thread, NULL, &integrity_main, (void *)NULL ) == 0 );
+	if ( !started ) {
+		bRunning = false;
+		logadd( LOG_WARNING, "Could not start integrity check thread. Corrupted images will not be detected." );
+	}
+}
+
+/**
+ * Wake up the integrity check thread, wait for it to exit and destroy
+ * the queue lock and condition variable. The thread only leaves its
+ * loop once _shutdown is set, which is not done here.
+ * NOTE(review): thread_join is called even if integrity_init() failed
+ * to start the thread - confirm this cannot happen in practice.
+ */
+void integrity_shutdown()
+{
+	assert( queueLen != -1 );
+	logadd( LOG_DEBUG1, "Shutting down integrity checker...\n" );
+	// Wake the thread in case it is waiting for new requests
+	pthread_mutex_lock( &integrityQueueLock );
+	pthread_cond_signal( &queueSignal );
+	pthread_mutex_unlock( &integrityQueueLock );
+	thread_join( thread, NULL );
+	for ( ;; ) {
+		if ( !bRunning ) break;
+		usleep( 10000 );
+	}
+	pthread_mutex_destroy( &integrityQueueLock );
+	pthread_cond_destroy( &queueSignal );
+	logadd( LOG_DEBUG1, "Integrity checker exited normally.\n" );
+}
+
+/**
+ * Schedule an integrity check on the given image for the given hash block.
+ * It is not checked whether the block is completely cached locally, so
+ * make sure it is before calling, otherwise it will result in falsely
+ * detected corruption.
+ * Passing -1 as block queues a check of the whole image.
+ * If an already queued request for the same image covers the block, or
+ * ends right before it, no new request is queued; in the latter case the
+ * existing request is extended by one block.
+ * The request is dropped if the checker thread is not running or the
+ * queue is full.
+ */
+void integrity_check(dnbd3_image_t *image, int block)
+{
+	if ( !bRunning ) {
+		logadd( LOG_MINOR, "Ignoring check request; thread not running..." );
+		return;
+	}
+	int i, freeSlot = -1;
+	pthread_mutex_lock( &integrityQueueLock );
+	for (i = 0; i < queueLen; ++i) {
+		if ( freeSlot == -1 && checkQueue[i].image == NULL ) {
+			freeSlot = i;
+		} else if ( checkQueue[i].image == image && checkQueue[i].block <= block ) {
+			// Use 64 bit math: for CHECK_ALL entries whose .block has advanced
+			// past 0, .block + .count does not fit into an int
+			const int64_t rangeEnd = (int64_t)checkQueue[i].block + checkQueue[i].count;
+			if ( rangeEnd < block ) continue; // Queued range ends before this block
+			// Already queued check dominates this one, or at least lies directly before this block
+			if ( rangeEnd == block && checkQueue[i].count != CHECK_ALL ) {
+				// It's directly before this one; expand range
+				checkQueue[i].count += 1;
+			}
+			logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (%d +%d)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
+			pthread_mutex_unlock( &integrityQueueLock );
+			return;
+		}
+	}
+	if ( freeSlot == -1 ) {
+		// No free slot within the used part of the queue, append
+		if ( queueLen >= CHECK_QUEUE_SIZE ) {
+			pthread_mutex_unlock( &integrityQueueLock );
+			logadd( LOG_INFO, "Check queue full, discarding check request...\n" );
+			return;
+		}
+		freeSlot = queueLen++;
+	}
+	checkQueue[freeSlot].image = image;
+	if ( block == -1 ) {
+		checkQueue[freeSlot].block = 0;
+		checkQueue[freeSlot].count = CHECK_ALL;
+	} else {
+		checkQueue[freeSlot].block = block;
+		checkQueue[freeSlot].count = 1;
+	}
+	pthread_cond_signal( &queueSignal );
+	pthread_mutex_unlock( &integrityQueueLock );
+}
+
+/**
+ * Main loop of the integrity check thread. Waits for requests in
+ * checkQueue and verifies the requested hash blocks against the crc32
+ * list of the image, at most 5 complete blocks per request and round.
+ * Blocks that fail the check are marked as missing in the cache map,
+ * a full check of the image is queued, and the uplink is (re)started
+ * if required.
+ * integrityQueueLock is held while the queue is examined and released
+ * while file I/O is going on.
+ * Runs until _shutdown is set; clears bRunning right before exiting.
+ */
+static void* integrity_main(void * data UNUSED)
+{
+	int i;
+	uint8_t *buffer = NULL;
+	size_t bufferSize = 0;
+	setThreadName( "image-check" );
+	blockNoncriticalSignals();
+#if defined(linux) || defined(__linux)
+	// Setting nice of this thread - this is not POSIX conforming, so check if other platforms support this.
+	// POSIX says that setpriority() should set the nice value of all threads belonging to the current process,
+	// but on linux you can do this per thread.
+	pid_t tid = (pid_t)syscall( SYS_gettid );
+	setpriority( PRIO_PROCESS, tid, 10 );
+#endif
+	pthread_mutex_lock( &integrityQueueLock );
+	while ( !_shutdown ) {
+		if ( queueLen == 0 ) {
+			pthread_cond_wait( &queueSignal, &integrityQueueLock );
+		}
+		for (i = queueLen - 1; i >= 0; --i) {
+			if ( _shutdown ) break;
+			dnbd3_image_t * const image = image_lock( checkQueue[i].image );
+			if ( checkQueue[i].count == 0 || image == NULL ) {
+				// Nothing (left) to do for this slot, free it
+				checkQueue[i].image = image_release( image );
+				if ( i + 1 == queueLen ) queueLen--;
+				continue;
+			}
+			// We have the image. Call image_release() some time
+			const int qCount = checkQueue[i].count;
+			bool foundCorrupted = false;
+			spin_lock( &image->lock );
+			if ( image->crc32 != NULL && image->realFilesize != 0 ) {
+				int blocks[2] = { checkQueue[i].block, -1 };
+				pthread_mutex_unlock( &integrityQueueLock );
+				// Make copy of crc32 list as it might go away
+				const uint64_t fileSize = image->realFilesize;
+				const int numHashBlocks = IMGSIZE_TO_HASHBLOCKS(fileSize);
+				const size_t required = numHashBlocks * sizeof(uint32_t);
+				if ( buffer == NULL || required > bufferSize ) {
+					bufferSize = required;
+					if ( buffer != NULL ) free( buffer );
+					buffer = malloc( bufferSize );
+					if ( buffer == NULL ) {
+						// Out of memory: the crc list cannot be copied, so drop this request
+						bufferSize = 0;
+						spin_unlock( &image->lock );
+						logadd( LOG_WARNING, "Cannot hash check %s: out of memory", image->path );
+						pthread_mutex_lock( &integrityQueueLock );
+						checkQueue[i].image = NULL;
+						if ( i + 1 == queueLen ) queueLen--;
+						image_release( image );
+						continue;
+					}
+				}
+				memcpy( buffer, image->crc32, required );
+				spin_unlock( &image->lock );
+				// Open for direct I/O if possible; this prevents polluting the fs cache
+				int fd = open( image->path, O_RDONLY | O_DIRECT );
+				bool direct = fd != -1;
+				if ( unlikely( !direct ) ) {
+					// Try unbuffered; flush to disk for that
+					logadd( LOG_DEBUG1, "O_DIRECT failed for %s", image->path );
+					image_ensureOpen( image );
+					fd = image->readFd;
+				}
+				int checkCount = MIN( qCount, 5 );
+				if ( fd != -1 ) {
+					while ( blocks[0] < numHashBlocks && !_shutdown ) {
+						const uint64_t start = blocks[0] * HASH_BLOCK_SIZE;
+						const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
+						bool complete = true;
+						if ( qCount == CHECK_ALL ) {
+							// When checking full image, skip incomplete blocks, otherwise assume block is complete
+							spin_lock( &image->lock );
+							complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize );
+							spin_unlock( &image->lock );
+						}
+#if defined(linux) || defined(__linux)
+						if ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) {
+#else
+						if ( fsync( fd ) == -1 ) {
+#endif
+							logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path );
+							exit( 1 );
+						}
+						// Use direct I/O only if read length is multiple of 4096 to be on the safe side
+						int tfd;
+						if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
+							// Suitable for direct io
+							tfd = fd;
+						} else if ( !image_ensureOpen( image ) ) {
+							logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+							break;
+						} else {
+							tfd = image->readFd;
+							// Evict from cache so we have to re-read, making sure data was properly stored
+							posix_fadvise( fd, start, end - start, POSIX_FADV_DONTNEED );
+						}
+						if ( complete && !image_checkBlocksCrc32( tfd, (uint32_t*)buffer, blocks, fileSize ) ) {
+							logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name );
+							image_updateCachemap( image, start, end, false );
+							// If this is not a full check, queue one
+							if ( qCount != CHECK_ALL ) {
+								logadd( LOG_INFO, "Queueing full check for %s", image->name );
+								integrity_check( image, -1 );
+							}
+							foundCorrupted = true;
+						}
+						blocks[0]++; // Increase before break, so it always points to the next block to check after loop
+						if ( complete && --checkCount == 0 ) break;
+					}
+					if ( direct ) {
+						close( fd );
+					}
+				}
+				pthread_mutex_lock( &integrityQueueLock );
+				assert( checkQueue[i].image == image );
+				if ( qCount != CHECK_ALL ) {
+					// Not a full check; update the counter
+					checkQueue[i].count -= ( blocks[0] - checkQueue[i].block );
+					if ( checkQueue[i].count < 0 ) {
+						logadd( LOG_WARNING, "BUG! checkQueue counter ran negative" );
+					}
+				}
+				if ( checkCount > 0 || checkQueue[i].count <= 0 || fd == -1 ) {
+					// Done with this task as nothing left, OR we don't have an fd to read from
+					if ( fd == -1 ) {
+						logadd( LOG_WARNING, "Cannot hash check %s: bad fd", image->path );
+					}
+					checkQueue[i].image = NULL;
+					if ( i + 1 == queueLen ) queueLen--;
+					// Mark as working again if applicable
+					if ( !foundCorrupted ) {
+						spin_lock( &image->lock );
+						if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper?
+							image->working = image->uplink->fd != -1 && image->readFd != -1;
+						}
+						spin_unlock( &image->lock );
+					}
+				} else {
+					// Still more blocks to go...
+					checkQueue[i].block = blocks[0];
+				}
+			} else {
+				spin_unlock( &image->lock );
+				// No crc list (or empty file), so there is nothing to verify. Drop the
+				// request; otherwise the entry would stay queued forever and this loop
+				// would keep spinning without ever releasing integrityQueueLock
+				checkQueue[i].image = NULL;
+				if ( i + 1 == queueLen ) queueLen--;
+			}
+			if ( foundCorrupted ) {
+				// Something was fishy, make sure uplink exists
+				spin_lock( &image->lock );
+				image->working = false;
+				bool restart = image->uplink == NULL || image->uplink->shutdown;
+				spin_unlock( &image->lock );
+				if ( restart ) {
+					uplink_shutdown( image );
+					uplink_init( image, -1, NULL, -1 );
+				}
+			}
+			// Release :-)
+			image_release( image );
+		}
+	}
+	pthread_mutex_unlock( &integrityQueueLock );
+	if ( buffer != NULL ) free( buffer );
+	bRunning = false;
+	return NULL;
+}
+
diff --git a/src/server/integrity.h b/src/server/integrity.h
new file mode 100644
index 0000000..c3c2b44
--- /dev/null
+++ b/src/server/integrity.h
@@ -0,0 +1,12 @@
+#ifndef _INTEGRITY_H_
+#define _INTEGRITY_H_
+
+#include "globals.h"
+
+// Set up the check queue and start the integrity check thread
+void integrity_init();
+
+// Wake up the integrity check thread, wait for it to exit and clean up
+void integrity_shutdown();
+
+// Queue a check of the given hash block of image; block == -1 queues a
+// check of the whole image. Requests are ignored if the thread isn't running
+void integrity_check(dnbd3_image_t *image, int block);
+
+#endif /* _INTEGRITY_H_ */
diff --git a/src/server/locks.c b/src/server/locks.c
new file mode 100644
index 0000000..71a1845
--- /dev/null
+++ b/src/server/locks.c
@@ -0,0 +1,306 @@
+/*
+ * locks.c
+ *
+ * Created on: 16.07.2013
+ * Author: sr
+ */
+
+#include "locks.h"
+#include "helper.h"
+#include "../shared/timing.h"
+
+#ifdef _DEBUG
+// Capacity of the lock tracking table below
+#define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
+// Capacity of the table of threads currently trying to get a lock
+#define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
+// Size of the name/where string buffers; longer strings get truncated
+#define LOCKLEN 60
+// Tracking information for one initialized spinlock
+typedef struct
+{
+ void *lock; // Address of the tracked lock, NULL if this slot is free
+ ticks locktime; // When the lock was acquired most recently
+ char locked; // Nonzero while the lock is held
+ pthread_t thread; // Thread holding the lock, 0 if unlocked
+ int lockId; // Serial number assigned on each successful lock
+ char name[LOCKLEN]; // Expression the lock was initialized with
+ char where[LOCKLEN]; // Last operation: I/L/U/D (init, lock, unlock, destroy) plus file:line
+} debug_lock_t;
+
+// One thread that is currently inside debug_spin_lock/debug_spin_trylock
+typedef struct
+{
+ pthread_t tid; // The thread, 0 if this slot is free
+ ticks time; // When the thread started trying to get the lock
+ char name[LOCKLEN]; // Name of the lock it wants
+ char where[LOCKLEN]; // file:line of the call
+
+} debug_thread_t;
+
+int debugThreadCount = 0;
+
+static debug_lock_t locks[MAXLOCKS];
+static debug_thread_t threads[MAXTHREADS];
+// Set by the first call to debug_spin_init, which initializes the tables
+static int init_done = 0;
+// Protects the locks and threads tables as well as lockId
+static pthread_spinlock_t initdestory;
+static int lockId = 0;
+static pthread_t watchdog = 0;
+static dnbd3_signal_t* watchdogSignal = NULL;
+
+static void *debug_thread_watchdog(void *something);
+
+/**
+ * Debug replacement for pthread_spin_init: registers the lock in the
+ * tracking table, then initializes it. The very first call also sets up
+ * the tracking tables and their own spinlock.
+ * Exits the process if the lock is already registered or the table is full.
+ * name, file and line identify the call site (see spin_init macro).
+ */
+int debug_spin_init(const char *name, const char *file, int line, pthread_spinlock_t *lock, int shared)
+{
+	if ( !init_done ) {
+		memset( locks, 0, MAXLOCKS * sizeof(debug_lock_t) );
+		memset( threads, 0, MAXTHREADS * sizeof(debug_thread_t) );
+		pthread_spin_init( &initdestory, PTHREAD_PROCESS_PRIVATE );
+		init_done = 1;
+	}
+	debug_lock_t *slot = NULL;
+	pthread_spin_lock( &initdestory );
+	for (int idx = 0; idx < MAXLOCKS; ++idx) {
+		if ( locks[idx].lock == lock ) {
+			logadd( LOG_ERROR, "Lock %p (%s) already initialized (%s:%d)\n", (void*)lock, name, file, line );
+			exit( 4 );
+		}
+		// Remember the first free slot, but keep scanning for duplicates
+		if ( slot == NULL && locks[idx].lock == NULL ) {
+			slot = &locks[idx];
+		}
+	}
+	if ( slot == NULL ) {
+		logadd( LOG_ERROR, "No more free debug locks (%s:%d)\n", file, line );
+		pthread_spin_unlock( &initdestory );
+		debug_dump_lock_stats();
+		exit( 4 );
+	}
+	slot->lock = (void*)lock;
+	slot->locked = 0;
+	snprintf( slot->name, LOCKLEN, "%s", name );
+	snprintf( slot->where, LOCKLEN, "I %s:%d", file, line );
+	pthread_spin_unlock( &initdestory );
+	return pthread_spin_init( lock, shared );
+}
+
+/**
+ * Debug replacement for pthread_spin_lock. While waiting for the lock,
+ * the calling thread is listed in the threads table, which is what the
+ * watchdog inspects. After the lock was acquired, its table entry is
+ * updated with holder, time, call site and a new serial number.
+ * Exits the process if the lock was never initialized, if no free slot
+ * is left in the threads table, or if the entry is already marked as locked.
+ * Returns the return value of pthread_spin_lock.
+ */
+int debug_spin_lock(const char *name, const char *file, int line, pthread_spinlock_t *lock)
+{
+ debug_lock_t *l = NULL;
+ // Look up the tracking entry of this lock
+ pthread_spin_lock( &initdestory );
+ for (int i = 0; i < MAXLOCKS; ++i) {
+ if ( locks[i].lock == lock ) {
+ l = &locks[i];
+ break;
+ }
+ }
+ pthread_spin_unlock( &initdestory );
+ if ( l == NULL ) {
+ logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+ debug_dump_lock_stats();
+ exit( 4 );
+ }
+ debug_thread_t *t = NULL;
+ // Register this thread as waiting, using the first free slot
+ pthread_spin_lock( &initdestory );
+ for (int i = 0; i < MAXTHREADS; ++i) {
+ if ( threads[i].tid != 0 ) continue;
+ threads[i].tid = pthread_self();
+ timing_get( &threads[i].time );
+ snprintf( threads[i].name, LOCKLEN, "%s", name );
+ snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
+ t = &threads[i];
+ break;
+ }
+ pthread_spin_unlock( &initdestory );
+ if ( t == NULL ) {
+ logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+ exit( 4 );
+ }
+ // The actual locking; may block
+ const int retval = pthread_spin_lock( lock );
+ // Not waiting anymore, free the slot in the threads table
+ pthread_spin_lock( &initdestory );
+ t->tid = 0;
+ pthread_spin_unlock( &initdestory );
+ // We hold the lock now, so the entry must not be marked as locked
+ if ( l->locked ) {
+ logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
+ exit( 4 );
+ }
+ l->locked = 1;
+ timing_get( &l->locktime );
+ l->thread = pthread_self();
+ snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
+ pthread_spin_lock( &initdestory );
+ l->lockId = ++lockId;
+ pthread_spin_unlock( &initdestory );
+ return retval;
+}
+
+/**
+ * Debug replacement for pthread_spin_trylock. Works like debug_spin_lock,
+ * but the tracking entry of the lock is only updated if the lock was
+ * actually acquired (pthread_spin_trylock returned 0).
+ * Exits the process if the lock was never initialized, if no free slot
+ * is left in the threads table, or if the lock was acquired although its
+ * entry is marked as locked.
+ * Returns the return value of pthread_spin_trylock.
+ */
+int debug_spin_trylock(const char *name, const char *file, int line, pthread_spinlock_t *lock)
+{
+ debug_lock_t *l = NULL;
+ // Look up the tracking entry of this lock
+ pthread_spin_lock( &initdestory );
+ for (int i = 0; i < MAXLOCKS; ++i) {
+ if ( locks[i].lock == lock ) {
+ l = &locks[i];
+ break;
+ }
+ }
+ pthread_spin_unlock( &initdestory );
+ if ( l == NULL ) {
+ logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+ debug_dump_lock_stats();
+ exit( 4 );
+ }
+ debug_thread_t *t = NULL;
+ // Register this thread in the threads table for the duration of the attempt
+ pthread_spin_lock( &initdestory );
+ for (int i = 0; i < MAXTHREADS; ++i) {
+ if ( threads[i].tid != 0 ) continue;
+ threads[i].tid = pthread_self();
+ timing_get( &threads[i].time );
+ snprintf( threads[i].name, LOCKLEN, "%s", name );
+ snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
+ t = &threads[i];
+ break;
+ }
+ pthread_spin_unlock( &initdestory );
+ if ( t == NULL ) {
+ logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+ exit( 4 );
+ }
+ const int retval = pthread_spin_trylock( lock );
+ // Attempt is over, free the slot in the threads table
+ pthread_spin_lock( &initdestory );
+ t->tid = 0;
+ pthread_spin_unlock( &initdestory );
+ if ( retval == 0 ) {
+ // Got the lock, so the entry must not be marked as locked
+ if ( l->locked ) {
+ logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
+ exit( 4 );
+ }
+ l->locked = 1;
+ timing_get( &l->locktime );
+ l->thread = pthread_self();
+ snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
+ pthread_spin_lock( &initdestory );
+ l->lockId = ++lockId;
+ pthread_spin_unlock( &initdestory );
+ }
+ return retval;
+}
+
+/**
+ * Debug replacement for pthread_spin_unlock: marks the lock as free in
+ * the tracking table, records the call site and then unlocks it.
+ * Exits the process if the lock is not registered or not marked as locked.
+ * Returns the return value of pthread_spin_unlock.
+ */
+int debug_spin_unlock(const char *name, const char *file, int line, pthread_spinlock_t *lock)
+{
+	debug_lock_t *entry = NULL;
+	pthread_spin_lock( &initdestory );
+	for (int idx = 0; idx < MAXLOCKS && entry == NULL; ++idx) {
+		if ( locks[idx].lock == lock ) {
+			entry = &locks[idx];
+		}
+	}
+	pthread_spin_unlock( &initdestory );
+	if ( entry == NULL ) {
+		logadd( LOG_ERROR, "Tried to unlock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+		exit( 4 );
+	}
+	if ( !entry->locked ) {
+		logadd( LOG_ERROR, "Unlock sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
+		exit( 4 );
+	}
+	// Update the bookkeeping first, while we still hold the lock
+	entry->locked = 0;
+	entry->thread = 0;
+	snprintf( entry->where, LOCKLEN, "U %s:%d", file, line );
+	return pthread_spin_unlock( lock );
+}
+
+/**
+ * Debug replacement for pthread_spin_destroy: removes the lock from the
+ * tracking table and destroys it.
+ * Exits the process if the lock is not registered or still marked as locked.
+ * Returns the return value of pthread_spin_destroy.
+ */
+int debug_spin_destroy(const char *name, const char *file, int line, pthread_spinlock_t *lock)
+{
+	debug_lock_t *entry = NULL;
+	pthread_spin_lock( &initdestory );
+	for (int idx = 0; idx < MAXLOCKS; ++idx) {
+		if ( locks[idx].lock == lock ) {
+			entry = &locks[idx];
+			break;
+		}
+	}
+	if ( entry == NULL ) {
+		logadd( LOG_ERROR, "Tried to destroy non-existent lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+		exit( 4 );
+	}
+	if ( entry->locked ) {
+		logadd( LOG_ERROR, "Tried to destroy lock %p (%s) at %s:%d when it is still locked\n", (void*)lock, name, file, line );
+		exit( 4 );
+	}
+	// Free the slot; the call site is kept in .where
+	entry->lock = NULL;
+	snprintf( entry->where, LOCKLEN, "D %s:%d", file, line );
+	pthread_spin_unlock( &initdestory );
+	return pthread_spin_destroy( lock );
+}
+
+/**
+ * Print all registered locks and all threads currently trying to get a
+ * lock to stdout. Takes the table lock itself, so the caller must not
+ * hold it.
+ */
+void debug_dump_lock_stats()
+{
+	declare_now;
+	pthread_spin_lock( &initdestory );
+	printf( "\n **** LOCKS ****\n\n" );
+	for (int idx = 0; idx < MAXLOCKS; ++idx) {
+		debug_lock_t * const entry = &locks[idx];
+		if ( entry->lock == NULL ) continue;
+		if ( !entry->locked ) {
+			printf( "* *** %s ***\n"
+					"* Where: %s\n"
+					"* Locked: %d\n", entry->name, entry->where, (int)entry->locked );
+			continue;
+		}
+		// Lock is held: also print holder, serial number and for how long
+		const int heldFor = (int)timing_diff( &entry->locktime, &now );
+		printf( "* *** %s ***\n"
+				"* Where: %s\n"
+				"* When: %d secs ago\n"
+				"* Locked: %d\n"
+				"* Serial: %d\n"
+				"* Thread: %d\n", entry->name, entry->where, heldFor, (int)entry->locked, entry->lockId,
+				(int)entry->thread );
+	}
+	printf( "\n **** WAITING THREADS ****\n\n" );
+	for (int idx = 0; idx < MAXTHREADS; ++idx) {
+		debug_thread_t * const waiter = &threads[idx];
+		if ( waiter->tid == 0 ) continue;
+		const int waitingFor = (int)timing_diff( &waiter->time, &now );
+		printf( "* *** Thread %d ***\n"
+				"* Lock: %s\n"
+				"* Where: %s\n"
+				"* How long: %d secs\n", (int)waiter->tid, waiter->name, waiter->where, waitingFor );
+	}
+	pthread_spin_unlock( &initdestory );
+}
+
+/**
+ * Thread function of the debug watchdog. Until _shutdown is set, it
+ * checks the threads table, then waits on watchdogSignal with a timeout
+ * of 5000 (sleeping 5 seconds instead if there is no signal or waiting
+ * failed). If any thread has been trying to get a lock for more than 6
+ * (and less than 100000) seconds, this is considered a deadlock: the
+ * lock stats are dumped and the process exits with code 99.
+ */
+static void *debug_thread_watchdog(void *something UNUSED)
+{
+	setThreadName( "debug-watchdog" );
+	while ( !_shutdown ) {
+		if ( init_done ) {
+			declare_now;
+			pthread_spin_lock( &initdestory );
+			for (int slot = 0; slot < MAXTHREADS; ++slot) {
+				if ( threads[slot].tid == 0 ) continue;
+				const uint32_t waited = timing_diff( &threads[slot].time, &now );
+				if ( waited <= 6 || waited >= 100000 ) continue;
+				printf( "\n\n +++++++++ DEADLOCK ++++++++++++\n\n" );
+				// Release the table lock first, the dump function takes it itself
+				pthread_spin_unlock( &initdestory );
+				debug_dump_lock_stats();
+				exit( 99 );
+			}
+			pthread_spin_unlock( &initdestory );
+		}
+		if ( watchdogSignal == NULL || signal_wait( watchdogSignal, 5000 ) == SIGNAL_ERROR ) {
+			sleep( 5 );
+		}
+	}
+	return NULL ;
+}
+
+#endif
+
+/**
+ * Start the debug watchdog thread. Does nothing unless built with _DEBUG.
+ * Failure to start the thread is logged but otherwise ignored.
+ */
+void debug_locks_start_watchdog()
+{
+#ifdef _DEBUG
+	// Signal used by debug_locks_stop_watchdog to wake the watchdog up
+	watchdogSignal = signal_new();
+	if ( thread_create( &watchdog, NULL, &debug_thread_watchdog, (void *)NULL ) != 0 ) {
+		logadd( LOG_ERROR, "Could not start debug-lock watchdog." );
+	}
+#endif
+}
+
+/**
+ * Stop the debug watchdog thread and wait for it to exit.
+ * Does nothing unless built with _DEBUG. Note that this sets the
+ * global _shutdown flag.
+ * NOTE(review): watchdog and watchdogSignal are used unconditionally, so
+ * this presumably requires a successful debug_locks_start_watchdog() - confirm
+ */
+void debug_locks_stop_watchdog()
+{
+#ifdef _DEBUG
+ // The watchdog loop runs until this flag is set
+ _shutdown = true;
+ printf( "Killing debug watchdog...\n" );
+ // Wake the watchdog up in case it is waiting on the signal
+ pthread_spin_lock( &initdestory );
+ signal_call( watchdogSignal );
+ pthread_spin_unlock( &initdestory );
+ thread_join( watchdog, NULL );
+ signal_close( watchdogSignal );
+#endif
+}
diff --git a/src/server/locks.h b/src/server/locks.h
new file mode 100644
index 0000000..16b59a7
--- /dev/null
+++ b/src/server/locks.h
@@ -0,0 +1,85 @@
+#ifndef _LOCKS_H_
+#define _LOCKS_H_
+
+#include <pthread.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef _DEBUG
+
+// Debug build: every spin_* call is routed to a debug_spin_* function that
+// additionally receives the lock expression as a string plus the call site.
+#define spin_init( lock, type ) debug_spin_init( #lock, __FILE__, __LINE__, lock, type)
+#define spin_lock( lock ) debug_spin_lock( #lock, __FILE__, __LINE__, lock)
+#define spin_trylock( lock ) debug_spin_trylock( #lock, __FILE__, __LINE__, lock)
+#define spin_unlock( lock ) debug_spin_unlock( #lock, __FILE__, __LINE__, lock)
+#define spin_destroy( lock ) debug_spin_destroy( #lock, __FILE__, __LINE__, lock)
+
+// name: stringified lock expression, file/line: location of the calling code
+int debug_spin_init(const char *name, const char *file, int line, pthread_spinlock_t *lock, int shared);
+int debug_spin_lock(const char *name, const char *file, int line, pthread_spinlock_t *lock);
+int debug_spin_trylock(const char *name, const char *file, int line, pthread_spinlock_t *lock);
+int debug_spin_unlock(const char *name, const char *file, int line, pthread_spinlock_t *lock);
+int debug_spin_destroy(const char *name, const char *file, int line, pthread_spinlock_t *lock);
+
+// Print the state of all tracked locks and waiting threads to stdout
+void debug_dump_lock_stats();
+
+
+#else
+
+// Release build: spin_* maps directly to the pthread spinlock API
+#define spin_init( lock, type ) pthread_spin_init(lock, type)
+#define spin_lock( lock ) pthread_spin_lock(lock)
+#define spin_trylock( lock ) pthread_spin_trylock(lock)
+#define spin_unlock( lock ) pthread_spin_unlock(lock)
+#define spin_destroy( lock ) pthread_spin_destroy(lock)
+
+#endif
+
+#ifdef DEBUG_THREADS
+
+extern int debugThreadCount;
+#define thread_create(thread,attr,routine,arg) (logadd( LOG_THREAD CREATE, "%d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
+/**
+ * Wrapper around pthread_create() that keeps debugThreadCount up to date.
+ * The counter is incremented when the new thread will be joinable, i.e. when
+ * no attributes are given, the detach state cannot be queried, or the detach
+ * state is PTHREAD_CREATE_JOINABLE.
+ * Returns the result of pthread_create(): 0 on success, an error number
+ * otherwise. (The return type used to be pthread_t, which does not match
+ * what pthread_create() returns.)
+ */
+static inline int debug_thread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void*), void *arg)
+{
+ int i;
+ if (attr == NULL || pthread_attr_getdetachstate(attr, &i) != 0 || i == PTHREAD_CREATE_JOINABLE) {
+ ++debugThreadCount;
+ }
+ return pthread_create( thread, attr, start_routine, arg );
+}
+
+#define thread_detach(thread) (logadd( LOG_THREAD DETACH, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
+/**
+ * Wrapper around pthread_detach() that decrements debugThreadCount on
+ * success. On failure the error is logged and the process exits with
+ * status 1, so a non-zero value is never actually returned.
+ */
+static inline int debug_thread_detach(pthread_t thread)
+{
+ const int ret = pthread_detach(thread);
+ if (ret == 0) {
+ --debugThreadCount;
+ } else {
+ // pthread functions return the error number directly and do not set errno
+ logadd( LOG_THREAD DETACH, "Tried to detach invalid thread (error %d)\n", ret);
+ exit(1);
+ }
+ return ret;
+}
+#define thread_join(thread,value) (logadd( LOG_THREAD JOIN, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
+/**
+ * Wrapper around pthread_join() that decrements debugThreadCount on
+ * success. value_ptr is passed through to pthread_join(). On failure the
+ * error is logged and the process exits with status 1, so a non-zero value
+ * is never actually returned.
+ */
+static inline int debug_thread_join(pthread_t thread, void **value_ptr)
+{
+ const int ret = pthread_join(thread, value_ptr);
+ if (ret == 0) {
+ --debugThreadCount;
+ } else {
+ // pthread functions return the error number directly and do not set errno
+ logadd( LOG_THREAD JOIN, "Tried to join invalid thread (error %d)\n", ret);
+ exit(1);
+ }
+ return ret;
+}
+
+#else
+
+// Without DEBUG_THREADS the thread_* wrappers map directly to the pthread API
+#define thread_create(thread,attr,routine,param) pthread_create( thread, attr, routine, param )
+#define thread_detach(thread) pthread_detach( thread )
+#define thread_join(thread,value) pthread_join( thread, value )
+
+#endif
+
+void debug_locks_start_watchdog();
+void debug_locks_stop_watchdog();
+
+#endif /* _LOCKS_H_ */
diff --git a/src/server/net.c b/src/server/net.c
new file mode 100644
index 0000000..00e88e0
--- /dev/null
+++ b/src/server/net.c
@@ -0,0 +1,731 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "helper.h"
+#include "image.h"
+#include "uplink.h"
+#include "locks.h"
+#include "rpc.h"
+#include "altservers.h"
+
+#include "../shared/sockhelper.h"
+#include "../shared/timing.h"
+#include "../shared/protocol.h"
+#include "../serialize.h"
+
+#include <assert.h>
+
+#ifdef __linux__
+#include <sys/sendfile.h>
+#endif
+#ifdef __FreeBSD__
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#endif
+#include <jansson.h>
+#include <inttypes.h>
+#include <stdatomic.h>
+
+static dnbd3_client_t *_clients[SERVER_MAX_CLIENTS];
+static int _num_clients = 0;
+static pthread_spinlock_t _clients_lock;
+
+static char nullbytes[500];
+
+static atomic_uint_fast64_t totalBytesSent = 0;
+
+// Adding and removing clients -- list management
+static bool addToList(dnbd3_client_t *client);
+static void removeFromList(dnbd3_client_t *client);
+static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client);
+
+/**
+ * Receive one request header from the given socket and validate it.
+ * The header is converted to host byte order via fixup_request().
+ * Returns true if a complete header with the correct magic and a sane
+ * payload size was received, false otherwise (the caller is expected to
+ * drop the client in that case).
+ */
+static inline bool recv_request_header(int sock, dnbd3_request_t *request)
+{
+ ssize_t ret, fails = 0;
+#ifdef AFL_MODE
+ sock = 0;
+#endif
+ // Read request header from socket
+ while ( ( ret = recv( sock, request, sizeof(*request), MSG_WAITALL ) ) != sizeof(*request) ) {
+ // EOF or a short read: errno is not meaningful here (it may be stale), and retrying
+ // would continue in the middle of the header, so give up right away
+ if ( ret >= 0 ) return false;
+ if ( errno == EINTR && ++fails < 10 ) continue;
+ if ( ++fails > SOCKET_TIMEOUT_CLIENT_RETRIES ) return false;
+ if ( errno == EAGAIN ) continue;
+ logadd( LOG_DEBUG2, "Error receiving request: Could not read message header (%d/%d, e=%d)\n", (int)ret, (int)sizeof(*request), errno );
+ return false;
+ }
+ // Make sure all bytes are in the right order (endianness)
+ fixup_request( *request );
+ if ( request->magic != dnbd3_packet_magic ) {
+ logadd( LOG_DEBUG2, "Magic in client request incorrect (cmd: %d, len: %d)\n", (int)request->cmd, (int)request->size );
+ return false;
+ }
+ // Payload sanity check; for CMD_GET_BLOCK the size field is not limited by MAX_PAYLOAD
+ if ( request->cmd != CMD_GET_BLOCK && request->size > MAX_PAYLOAD ) {
+ logadd( LOG_WARNING, "Client tries to send a packet of type %d with %d bytes payload. Dropping client.", (int)request->cmd, (int)request->size );
+ return false;
+ }
+ return true;
+}
+
+/**
+ * Read exactly `size` payload bytes from the socket into payload->buffer
+ * and set the buffer up for deserialization. A size of 0 or a size larger
+ * than MAX_PAYLOAD is rejected. Returns true on success.
+ */
+static inline bool recv_request_payload(int sock, uint32_t size, serialized_buffer_t *payload)
+{
+ bool success = false;
+#ifdef AFL_MODE
+ sock = 0;
+#endif
+ if ( size == 0 ) {
+ logadd( LOG_ERROR, "Called recv_request_payload() to receive 0 bytes" );
+ } else if ( size > MAX_PAYLOAD ) {
+ logadd( LOG_ERROR, "Called recv_request_payload() for more bytes than the passed buffer could hold!" );
+ } else if ( sock_recv( sock, payload->buffer, size ) != (ssize_t)size ) {
+ logadd( LOG_DEBUG1, "Could not receive request payload of length %d\n", (int)size );
+ } else {
+ // Everything arrived, prepare payload buffer for reading
+ serializer_reset_read( payload, size );
+ success = true;
+ }
+ return success;
+}
+
+/**
+ * Transmit a reply header, followed by reply->size bytes of payload if a
+ * payload pointer is given. payload may be NULL, in which case only the
+ * header is sent. The header is passed through fixup_reply() in place
+ * before sending. The caller has to hold the client's sendMutex.
+ * Returns true if everything was sent.
+ */
+static inline bool send_reply(int sock, dnbd3_reply_t *reply, void *payload)
+{
+ // Grab the length before the header is converted for the wire
+ const uint32_t payloadLen = reply->size;
+ fixup_reply( *reply );
+ if ( sock_sendAll( sock, reply, sizeof(dnbd3_reply_t), 1 ) != sizeof(dnbd3_reply_t) ) {
+ logadd( LOG_DEBUG1, "Sending reply header to client failed" );
+ return false;
+ }
+ if ( payloadLen == 0 || payload == NULL ) {
+ return true; // header only
+ }
+ if ( sock_sendAll( sock, payload, payloadLen, 1 ) == (ssize_t)payloadLen ) {
+ return true;
+ }
+ logadd( LOG_DEBUG1, "Sending payload of %"PRIu32" bytes to client failed", payloadLen );
+ return false;
+}
+
+/**
+ * Write `bytes` zero bytes to fd, taken from the static nullbytes buffer
+ * in chunks of at most sizeof(nullbytes). The caller has to hold the
+ * client's sendMutex. Returns true if all bytes were sent.
+ */
+static inline bool sendPadding( const int fd, uint32_t bytes )
+{
+ ssize_t chunk;
+ // Full-sized chunks first
+ for ( ; bytes >= sizeof(nullbytes); bytes -= (uint32_t)chunk ) {
+ chunk = sock_sendAll( fd, nullbytes, sizeof(nullbytes), 2 );
+ if ( chunk <= 0 )
+ return false;
+ }
+ // Remainder (smaller than the buffer)
+ const ssize_t tail = sock_sendAll( fd, nullbytes, bytes, 2 );
+ return tail == (ssize_t)bytes;
+}
+
+/**
+ * Initialize the lock that protects the client list (_clients, _num_clients).
+ */
+void net_init()
+{
+ spin_init( &_clients_lock, PTHREAD_PROCESS_PRIVATE );
+}
+
+/**
+ * Thread entry point that serves one freshly accepted connection.
+ * clientPtr is a dnbd3_client_t*; ownership passes to this function, which
+ * frees it on every path. The function
+ *  1. reads the first header; anything without the dnbd3 magic that starts
+ *     with 'G' or 'P' is handed to rpc_sendStatsJson(), everything else
+ *     that is not CMD_SELECT_IMAGE is dropped,
+ *  2. adds the client to the client list and performs the image handshake,
+ *  3. runs the request loop until the client disconnects, an error occurs
+ *     or the server shuts down,
+ *  4. removes the client from the list and releases all resources.
+ * Always returns NULL.
+ */
+void* net_handleNewConnection(void *clientPtr)
+{
+ dnbd3_client_t * const client = (dnbd3_client_t *)clientPtr;
+ dnbd3_request_t request;
+
+ // Await data from client. Since this is a fresh connection, we expect data right away
+ sock_setTimeout( client->sock, _clientTimeout );
+ do {
+#ifdef AFL_MODE
+ const int ret = (int)recv( 0, &request, sizeof(request), MSG_WAITALL );
+#else
+ const int ret = (int)recv( client->sock, &request, sizeof(request), MSG_WAITALL );
+#endif
+ // It's expected to be a real dnbd3 client
+ // Check request for validity. This implicitly dictates that all HTTP requests are more than 24 bytes...
+ if ( ret != (int)sizeof(request) ) {
+ logadd( LOG_DEBUG2, "Error receiving request: Could not read message header (%d/%d, e=%d)", (int)ret, (int)sizeof(request), errno );
+ goto fail_preadd;
+ }
+
+ if ( request.magic != dnbd3_packet_magic ) {
+ // Let's see if this looks like an HTTP request
+ if ( ((char*)&request)[0] == 'G' || ((char*)&request)[0] == 'P' ) {
+ // Close enough...
+ rpc_sendStatsJson( client->sock, &client->host, &request, ret );
+ } else {
+ logadd( LOG_DEBUG1, "Magic in client handshake incorrect" );
+ }
+ goto fail_preadd;
+ }
+ // Magic OK, untangle byte order if required
+ fixup_request( request );
+ if ( request.cmd != CMD_SELECT_IMAGE ) {
+ logadd( LOG_WARNING, "Client sent != CMD_SELECT_IMAGE in handshake (got cmd=%d, size=%d), dropping client.", (int)request.cmd, (int)request.size );
+ goto fail_preadd;
+ }
+ } while (0);
+ // Fully init client struct
+ spin_init( &client->lock, PTHREAD_PROCESS_PRIVATE );
+ pthread_mutex_init( &client->sendMutex, NULL );
+
+ spin_lock( &client->lock );
+ host_to_string( &client->host, client->hostName, HOSTNAMELEN );
+ client->hostName[HOSTNAMELEN-1] = '\0';
+ spin_unlock( &client->lock );
+ client->bytesSent = 0;
+
+ if ( !addToList( client ) ) {
+ freeClientStruct( client );
+ logadd( LOG_WARNING, "Could not add new client to list when connecting" );
+ return NULL;
+ }
+
+ dnbd3_reply_t reply;
+
+ dnbd3_image_t *image = NULL;
+ int image_file = -1;
+
+ int num;
+ bool bOk = false;
+ bool hasName = false;
+
+ serialized_buffer_t payload;
+ uint16_t rid, client_version;
+ uint64_t start, end;
+
+ dnbd3_server_entry_t server_list[NUMBER_SERVERS];
+
+ // Set to zero to make valgrind happy
+ memset( &reply, 0, sizeof(reply) );
+ memset( &payload, 0, sizeof(payload) );
+ reply.magic = dnbd3_packet_magic;
+
+ // Receive first packet's payload
+ if ( recv_request_payload( client->sock, request.size, &payload ) ) {
+ char *image_name;
+ client_version = serializer_get_uint16( &payload );
+ image_name = serializer_get_string( &payload );
+ rid = serializer_get_uint16( &payload );
+ const uint8_t flags = serializer_get_uint8( &payload );
+ client->isServer = ( flags & FLAGS8_SERVER );
+ if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) {
+ if ( client_version < MIN_SUPPORTED_CLIENT ) {
+ logadd( LOG_DEBUG1, "Client %s too old", client->hostName );
+ } else {
+ logadd( LOG_DEBUG1, "Incomplete handshake received from %s", client->hostName );
+ }
+ } else {
+ if ( !client->isServer || !_isProxy ) {
+ // Is a normal client, or we're not proxy
+ image = image_getOrLoad( image_name, rid );
+ } else if ( _backgroundReplication != BGR_FULL && ( flags & FLAGS8_BG_REP ) ) {
+ // We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does...
+ // Reject, as this would basically force this proxy to do BGR too.
+ image = image_get( image_name, rid, true );
+ if ( image != NULL && image->cache_map != NULL ) {
+ // Only exception is if the image is complete locally
+ image = image_release( image );
+ }
+ } else if ( _lookupMissingForProxy ) {
+ // No BGR mismatch and we're told to lookup missing images on a known uplink server
+ // if the requesting client is a proxy
+ image = image_getOrLoad( image_name, rid );
+ } else {
+ // No BGR mismatch, but don't lookup if image is unknown locally
+ image = image_get( image_name, rid, true );
+ }
+ spin_lock( &client->lock );
+ client->image = image;
+ spin_unlock( &client->lock );
+ if ( image == NULL ) {
+ //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
+ } else if ( !image->working ) {
+ logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
+ client->hostName, image_name, (int)rid );
+ } else {
+ bool penalty;
+ // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
+ bOk = true;
+ if ( image->cache_map != NULL ) {
+ spin_lock( &image->lock );
+ if ( image->uplink == NULL || image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+ bOk = ( rand() % 4 ) == 1;
+ }
+ penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1;
+ spin_unlock( &image->lock );
+ if ( penalty ) { // Wait 100ms if local caching is not working so this
+ usleep( 100000 ); // server gets a penalty and is less likely to be selected
+ }
+ }
+ if ( bOk ) {
+ spin_lock( &image->lock );
+ image_file = image->readFd;
+ if ( !client->isServer ) {
+ // Only update immediately if this is a client. Servers are handled on disconnect.
+ timing_get( &image->atime );
+ }
+ spin_unlock( &image->lock );
+ serializer_reset_write( &payload );
+ serializer_put_uint16( &payload, client_version < 3 ? client_version : PROTOCOL_VERSION ); // XXX: Since messed up fuse client was messed up before :(
+ serializer_put_string( &payload, image->name );
+ serializer_put_uint16( &payload, (uint16_t)image->rid );
+ serializer_put_uint64( &payload, image->virtualFilesize );
+ reply.cmd = CMD_SELECT_IMAGE;
+ reply.size = serializer_get_written_length( &payload );
+ if ( !send_reply( client->sock, &reply, &payload ) ) {
+ bOk = false;
+ }
+ }
+ }
+ }
+ }
+
+ if ( bOk ) {
+ // add artificial delay if applicable
+ if ( client->isServer && _serverPenalty != 0 ) {
+ usleep( _serverPenalty );
+ } else if ( !client->isServer && _clientPenalty != 0 ) {
+ usleep( _clientPenalty );
+ }
+ // client handling mainloop
+ while ( recv_request_header( client->sock, &request ) ) {
+ if ( _shutdown ) break;
+ switch ( request.cmd ) {
+
+ case CMD_GET_BLOCK:;
+ const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
+ if ( offset >= image->virtualFilesize ) {
+ // Sanity check
+ logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName );
+ reply.size = 0;
+ reply.cmd = CMD_ERROR;
+ // send_reply() requires the sendMutex to be held by the caller
+ pthread_mutex_lock( &client->sendMutex );
+ send_reply( client->sock, &reply, NULL );
+ pthread_mutex_unlock( &client->sendMutex );
+ break;
+ }
+ if ( offset + request.size > image->virtualFilesize ) {
+ // Sanity check
+ logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName );
+ reply.size = 0;
+ reply.cmd = CMD_ERROR;
+ // send_reply() requires the sendMutex to be held by the caller
+ pthread_mutex_lock( &client->sendMutex );
+ send_reply( client->sock, &reply, NULL );
+ pthread_mutex_unlock( &client->sendMutex );
+ break;
+ }
+
+ if ( request.size != 0 && image->cache_map != NULL ) {
+ // This is a proxyed image, check if we need to relay the request...
+ // Round the requested range outwards to full blocks
+ start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ bool isCached = true;
+ spin_lock( &image->lock );
+ // Check again as we only acquired the lock just now
+ if ( image->cache_map != NULL ) {
+ // One bit per 4KiB (>> 12), so one byte of the map covers 32KiB (>> 15)
+ const uint64_t firstByteInMap = start >> 15;
+ const uint64_t lastByteInMap = (end - 1) >> 15;
+ uint64_t pos;
+ // Middle - quick checking
+ if ( isCached ) {
+ pos = firstByteInMap + 1;
+ while ( pos < lastByteInMap ) {
+ if ( image->cache_map[pos] != 0xff ) {
+ isCached = false;
+ break;
+ }
+ ++pos;
+ }
+ }
+ // First byte
+ if ( isCached ) {
+ pos = start;
+ do {
+ const int map_x = (pos >> 12) & 7; // mod 8
+ const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+ if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) {
+ isCached = false;
+ break;
+ }
+ pos += DNBD3_BLOCK_SIZE;
+ } while ( firstByteInMap == (pos >> 15) && pos < end );
+ }
+ // Last byte - only check if request spans multiple bytes in cache map
+ if ( isCached && firstByteInMap != lastByteInMap ) {
+ pos = lastByteInMap << 15;
+ while ( pos < end ) {
+ assert( lastByteInMap == (pos >> 15) );
+ const int map_x = (pos >> 12) & 7; // mod 8
+ const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+ if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) {
+ isCached = false;
+ break;
+ }
+ pos += DNBD3_BLOCK_SIZE;
+ }
+ }
+ }
+ spin_unlock( &image->lock );
+ if ( !isCached ) {
+ if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
+ logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
+ client->hostName, image->name, image->rid );
+ image->working = false;
+ goto exit_client_cleanup;
+ }
+ break; // DONE, exit request.cmd switch
+ }
+ }
+
+ reply.cmd = CMD_GET_BLOCK;
+ reply.size = request.size;
+ reply.handle = request.handle;
+
+ fixup_reply( reply );
+ // The send mutex is only taken if the image has an uplink
+ const bool lock = image->uplink != NULL;
+ if ( lock ) pthread_mutex_lock( &client->sendMutex );
+ // Send reply header
+ if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) {
+ if ( lock ) pthread_mutex_unlock( &client->sendMutex );
+ logadd( LOG_DEBUG1, "Sending CMD_GET_BLOCK reply header to %s failed", client->hostName );
+ goto exit_client_cleanup;
+ }
+
+ if ( request.size != 0 ) {
+ // Send payload if request length > 0
+ size_t done = 0;
+ off_t foffset = (off_t)offset;
+ size_t realBytes;
+ if ( offset + request.size <= image->realFilesize ) {
+ realBytes = request.size;
+ } else if ( offset < image->realFilesize ) {
+ // Request extends past the end of the real file, the remainder is sent as padding
+ realBytes = (size_t)(image->realFilesize - offset);
+ } else {
+ // Request starts at or after the end of the real file (but below virtualFilesize):
+ // nothing to read from disk, everything is padding. Without this case the
+ // subtraction above would wrap around.
+ realBytes = 0;
+ }
+ while ( done < realBytes ) {
+ // TODO: Should we consider EOPNOTSUPP on BSD for sendfile and fallback to read/write?
+ // Linux would set EINVAL or ENOSYS instead, which it unfortunately also does for a couple of other failures :/
+ // read/write would kill performance anyways so a fallback would probably be of little use either way.
+#ifdef AFL_MODE
+ char buf[1000];
+ size_t cnt = realBytes - done;
+ if ( cnt > 1000 ) {
+ cnt = 1000;
+ }
+ const ssize_t sent = pread( image_file, buf, cnt, foffset );
+ if ( sent > 0 ) {
+ //write( client->sock, buf, sent ); // This is not verified in any way, so why even do it...
+ } else {
+ const int err = errno;
+#elif defined(__linux__)
+ const ssize_t sent = sendfile( client->sock, image_file, &foffset, realBytes - done );
+ if ( sent <= 0 ) {
+ const int err = errno;
+#elif defined(__FreeBSD__)
+ off_t sent;
+ const int ret = sendfile( image_file, client->sock, foffset, realBytes - done, NULL, &sent, 0 );
+ if ( ret == -1 || sent == 0 ) {
+ const int err = errno;
+ if ( ret == -1 ) {
+ if ( err == EAGAIN || err == EINTR ) { // EBUSY? manpage doesn't explicitly mention *sent here.. But then again we dont set the according flag anyways
+ done += sent;
+ continue;
+ }
+ sent = -1;
+ }
+#endif
+ // Common error path of all three variants above
+ if ( lock ) pthread_mutex_unlock( &client->sendMutex );
+ if ( sent == -1 ) {
+ if ( err != EPIPE && err != ECONNRESET && err != ESHUTDOWN
+ && err != EAGAIN && err != EWOULDBLOCK ) {
+ logadd( LOG_DEBUG1, "sendfile to %s failed (image to net. sent %d/%d, errno=%d)",
+ client->hostName, (int)done, (int)realBytes, err );
+ }
+ if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) {
+ logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid );
+ image->working = false;
+ }
+ }
+ goto exit_client_cleanup;
+ }
+ done += sent;
+ }
+ if ( request.size > (uint32_t)realBytes ) {
+ if ( !sendPadding( client->sock, request.size - (uint32_t)realBytes ) ) {
+ if ( lock ) pthread_mutex_unlock( &client->sendMutex );
+ goto exit_client_cleanup;
+ }
+ }
+ }
+ if ( lock ) pthread_mutex_unlock( &client->sendMutex );
+ // Global per-client counter
+ client->bytesSent += request.size; // Increase counter for statistics.
+ break;
+
+ case CMD_GET_SERVERS:
+ // Build list of known working alt servers
+ num = altservers_getListForClient( &client->host, server_list, NUMBER_SERVERS );
+ reply.cmd = CMD_GET_SERVERS;
+ reply.size = (uint32_t)( num * sizeof(dnbd3_server_entry_t) );
+ pthread_mutex_lock( &client->sendMutex );
+ send_reply( client->sock, &reply, server_list );
+ pthread_mutex_unlock( &client->sendMutex );
+ goto set_name;
+ break;
+
+ case CMD_KEEPALIVE:
+ reply.cmd = CMD_KEEPALIVE;
+ reply.size = 0;
+ pthread_mutex_lock( &client->sendMutex );
+ send_reply( client->sock, &reply, NULL );
+ pthread_mutex_unlock( &client->sendMutex );
+set_name: ;
+ // Name the thread after the client once, on the first CMD_GET_SERVERS or CMD_KEEPALIVE
+ if ( !hasName ) {
+ hasName = true;
+ setThreadName( client->hostName );
+ }
+ break;
+
+ case CMD_SET_CLIENT_MODE:
+ client->isServer = false;
+ break;
+
+ case CMD_GET_CRC32:
+ reply.cmd = CMD_GET_CRC32;
+ pthread_mutex_lock( &client->sendMutex );
+ if ( image->crc32 == NULL ) {
+ reply.size = 0;
+ send_reply( client->sock, &reply, NULL );
+ } else {
+ // Payload is the master crc32 followed by the crc32 list; it is sent manually
+ // below, so send_reply() only transmits the header here
+ const uint32_t size = reply.size = (uint32_t)( (IMGSIZE_TO_HASHBLOCKS(image->realFilesize) + 1) * sizeof(uint32_t) );
+ send_reply( client->sock, &reply, NULL );
+ send( client->sock, &image->masterCrc32, sizeof(uint32_t), MSG_MORE );
+ send( client->sock, image->crc32, size - sizeof(uint32_t), 0 );
+ }
+ pthread_mutex_unlock( &client->sendMutex );
+ break;
+
+ default:
+ logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd );
+ break;
+
+ }
+ }
+ }
+exit_client_cleanup: ;
+ // First remove from list, then add to counter to prevent race condition
+ removeFromList( client );
+ totalBytesSent += client->bytesSent;
+ // Access time, but only if client didn't just probe
+ if ( image != NULL ) {
+ spin_lock( &image->lock );
+ if ( client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
+ timing_get( &image->atime );
+ }
+ spin_unlock( &image->lock );
+ }
+ freeClientStruct( client ); // This will also call image_release on client->image
+ return NULL ;
+fail_preadd: ;
+ // Client was never added to the list and its locks were never initialized
+ close( client->sock );
+ free( client );
+ return NULL;
+}
+
+/**
+ * Get list of all clients.
+ * Returns a newly created JSON array with one object per listed client
+ * that has an image assigned; each object holds the keys "address",
+ * "imageId", "isServer" and "bytesSent".
+ */
+struct json_t* net_getListAsJson()
+{
+ json_t *jsonClients = json_array();
+ json_t *clientStats;
+ int imgId, isServer;
+ uint64_t bytesSent;
+ char host[HOSTNAMELEN];
+ host[HOSTNAMELEN-1] = '\0'; // strncpy below copies at most HOSTNAMELEN - 1 bytes, so this terminator stays intact
+
+ spin_lock( &_clients_lock );
+ for ( int i = 0; i < _num_clients; ++i ) {
+ dnbd3_client_t * const client = _clients[i];
+ if ( client == NULL || client->image == NULL )
+ continue;
+ // Take the client's lock before letting go of the list lock
+ spin_lock( &client->lock );
+ // Unlock so we give other threads a chance to access the client list.
+ // We might not get an atomic snapshot of the currently connected clients,
+ // but that doesn't really make a difference anyways.
+ spin_unlock( &_clients_lock );
+ // Copy everything needed into locals so the JSON object can be built without holding any lock
+ strncpy( host, client->hostName, HOSTNAMELEN - 1 );
+ imgId = client->image->id;
+ isServer = (int)client->isServer;
+ bytesSent = client->bytesSent;
+ spin_unlock( &client->lock );
+ clientStats = json_pack( "{sssisisI}",
+ "address", host,
+ "imageId", imgId,
+ "isServer", isServer,
+ "bytesSent", (json_int_t)bytesSent );
+ json_array_append_new( jsonClients, clientStats );
+ // Re-acquire the list lock for the next loop condition check / iteration
+ spin_lock( &_clients_lock );
+ }
+ spin_unlock( &_clients_lock );
+ return jsonClients;
+}
+
+/**
+ * Count connected clients and servers and sum up the bytes sent.
+ * Each of the three output pointers may be NULL if the caller is not
+ * interested in that value. The list stays locked for the whole scan so
+ * that a client disconnecting in the meantime cannot distort the result.
+ * Only entries that have an image assigned are counted.
+ */
+void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
+{
+ int numClients = 0, numServers = 0;
+ uint64_t liveBytes = 0;
+
+ spin_lock( &_clients_lock );
+ for ( int idx = 0; idx < _num_clients; ++idx ) {
+ const dnbd3_client_t * const cur = _clients[idx];
+ if ( cur != NULL && cur->image != NULL ) {
+ if ( cur->isServer ) {
+ ++numServers;
+ } else {
+ ++numClients;
+ }
+ liveBytes += cur->bytesSent;
+ }
+ }
+ spin_unlock( &_clients_lock );
+ if ( clientCount != NULL ) *clientCount = numClients;
+ if ( serverCount != NULL ) *serverCount = numServers;
+ // Bytes of already disconnected clients plus those of the live ones
+ if ( bytesSent != NULL ) *bytesSent = totalBytesSent + liveBytes;
+}
+
+/**
+ * Shut down the sockets of all clients in the list (both directions).
+ * The client structs themselves are not freed here.
+ */
+void net_disconnectAll()
+{
+ spin_lock( &_clients_lock );
+ for ( int idx = 0; idx < _num_clients; ++idx ) {
+ dnbd3_client_t * const client = _clients[idx];
+ if ( client != NULL ) {
+ spin_lock( &client->lock );
+ if ( client->sock >= 0 ) {
+ shutdown( client->sock, SHUT_RDWR );
+ }
+ spin_unlock( &client->lock );
+ }
+ }
+ spin_unlock( &_clients_lock );
+}
+
+/**
+ * Poll the client list once per second, for at most 10 rounds, until no
+ * client is left in it. Afterwards _num_clients is reset to 0, whether or
+ * not all clients are gone.
+ */
+void net_waitForAllDisconnected()
+{
+ int retries = 10;
+ for ( ;; ) {
+ int active = 0;
+ spin_lock( &_clients_lock );
+ for ( int idx = 0; idx < _num_clients; ++idx ) {
+ if ( _clients[idx] != NULL ) ++active;
+ }
+ spin_unlock( &_clients_lock );
+ if ( active == 0 ) break;
+ logadd( LOG_INFO, "%d clients still active...\n", active );
+ sleep( 1 );
+ if ( --retries <= 0 ) break;
+ }
+ _num_clients = 0;
+}
+
+/* +++
+ * Client list.
+ *
+ * Adding and removing clients.
+ */
+
+/**
+ * Clear every slot of the clients array that points to the given client.
+ * While walking the array from the end, _num_clients is lowered for each
+ * empty slot that is currently the last one, so trailing empty slots get
+ * trimmed.
+ * Locks on: _clients_lock
+ */
+static void removeFromList(dnbd3_client_t *client)
+{
+ spin_lock( &_clients_lock );
+ int idx = _num_clients;
+ while ( --idx >= 0 ) {
+ if ( _clients[idx] == client ) {
+ _clients[idx] = NULL;
+ }
+ const bool isLastSlot = ( idx + 1 == _num_clients );
+ if ( isLastSlot && _clients[idx] == NULL ) {
+ --_num_clients;
+ }
+ }
+ spin_unlock( &_clients_lock );
+}
+
+/**
+ * Free the client struct recursively: closes the socket, detaches the
+ * client from its image's uplink (if any), releases the image reference,
+ * destroys the client's lock and mutex and frees the struct itself.
+ * !! Make sure to call this function after removing the client from _clients !!
+ * Locks on: _clients[].lock, _images[].lock
+ * might call functions that lock on _images, _image[], uplink.queueLock, client.sendMutex
+ * Always returns NULL.
+ */
+static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
+{
+ spin_lock( &client->lock );
+ // Close the socket while holding sendMutex
+ // NOTE(review): presumably so no other thread is in the middle of sending to it - confirm
+ pthread_mutex_lock( &client->sendMutex );
+ if ( client->sock != -1 ) close( client->sock );
+ client->sock = -1;
+ pthread_mutex_unlock( &client->sendMutex );
+ if ( client->image != NULL ) {
+ spin_lock( &client->image->lock );
+ if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
+ spin_unlock( &client->image->lock );
+ client->image = image_release( client->image );
+ }
+ spin_unlock( &client->lock );
+ spin_destroy( &client->lock );
+ pthread_mutex_destroy( &client->sendMutex );
+ free( client );
+ return NULL ;
+}
+
+//###//
+
+/**
+ * Add client to the clients array. A free slot below _num_clients is
+ * reused if there is one, otherwise the client is appended.
+ * Appending is refused once _num_clients has reached _maxClients or the
+ * capacity of the _clients array (SERVER_MAX_CLIENTS), whichever is lower.
+ * Returns true if the client was stored, false if the list is full.
+ * Locks on: _clients_lock
+ */
+static bool addToList(dnbd3_client_t *client)
+{
+ int i;
+ spin_lock( &_clients_lock );
+ for (i = 0; i < _num_clients; ++i) {
+ if ( _clients[i] != NULL ) continue;
+ _clients[i] = client;
+ spin_unlock( &_clients_lock );
+ return true;
+ }
+ // Also check against the array size, so a too large _maxClients cannot cause an out-of-bounds write
+ if ( _num_clients >= _maxClients || _num_clients >= SERVER_MAX_CLIENTS ) {
+ spin_unlock( &_clients_lock );
+ logadd( LOG_ERROR, "Maximum number of clients reached!" );
+ return false;
+ }
+ _clients[_num_clients++] = client;
+ spin_unlock( &_clients_lock );
+ return true;
+}
+
diff --git a/src/server/net.h b/src/server/net.h
new file mode 100644
index 0000000..6813b49
--- /dev/null
+++ b/src/server/net.h
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef NET_H_
+#define NET_H_
+
+#include "globals.h"
+
+struct json_t;
+
+void net_init();
+
+void* net_handleNewConnection(void *clientPtr);
+
+struct json_t* net_getListAsJson();
+
+void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent);
+
+void net_disconnectAll();
+
+void net_waitForAllDisconnected();
+
+#endif /* NET_H_ */
diff --git a/src/server/picohttpparser/README.md b/src/server/picohttpparser/README.md
new file mode 100644
index 0000000..cb32f58
--- /dev/null
+++ b/src/server/picohttpparser/README.md
@@ -0,0 +1,116 @@
+PicoHTTPParser
+=============
+
+Copyright (c) 2009-2014 [Kazuho Oku](https://github.com/kazuho), [Tokuhiro Matsuno](https://github.com/tokuhirom), [Daisuke Murase](https://github.com/typester), [Shigeo Mitsunari](https://github.com/herumi)
+
+PicoHTTPParser is a tiny, primitive, fast HTTP request/response parser.
+
+Unlike most parsers, it is stateless and does not allocate memory by itself.
+All it does is accept a pointer to a buffer and an output structure, and set up the pointers in the latter to point at the necessary portions of the buffer.
+
+The code is widely deployed within Perl applications through popular modules that use it, including [Plack](https://metacpan.org/pod/Plack), [Starman](https://metacpan.org/pod/Starman), [Starlet](https://metacpan.org/pod/Starlet), [Furl](https://metacpan.org/pod/Furl). It is also the HTTP/1 parser of [H2O](https://github.com/h2o/h2o).
+
+Check out [test.c] to find out how to use the parser.
+
+The software is dual-licensed under the Perl License or the MIT License.
+
+Usage
+-----
+
+The library exposes four functions: `phr_parse_request`, `phr_parse_response`, `phr_parse_headers`, `phr_decode_chunked`.
+
+### phr_parse_request
+
+The example below reads an HTTP request from socket `sock` using `read(2)`, parses it using `phr_parse_request`, and prints the details.
+
+```c
+char buf[4096], *method, *path;
+int pret, minor_version;
+struct phr_header headers[100];
+size_t buflen = 0, prevbuflen = 0, method_len, path_len, num_headers;
+ssize_t rret;
+
+while (1) {
+ /* read the request */
+ while ((rret = read(sock, buf + buflen, sizeof(buf) - buflen)) == -1 && errno == EINTR)
+ ;
+ if (rret <= 0)
+ return IOError;
+ prevbuflen = buflen;
+ buflen += rret;
+ /* parse the request */
+ num_headers = sizeof(headers) / sizeof(headers[0]);
+ pret = phr_parse_request(buf, buflen, &method, &method_len, &path, &path_len,
+ &minor_version, headers, &num_headers, prevbuflen);
+ if (pret > 0)
+ break; /* successfully parsed the request */
+ else if (pret == -1)
+ return ParseError;
+ /* request is incomplete, continue the loop */
+ assert(pret == -2);
+ if (buflen == sizeof(buf))
+ return RequestIsTooLongError;
+}
+
+printf("request is %d bytes long\n", pret);
+printf("method is %.*s\n", (int)method_len, method);
+printf("path is %.*s\n", (int)path_len, path);
+printf("HTTP version is 1.%d\n", minor_version);
+printf("headers:\n");
+for (i = 0; i != num_headers; ++i) {
+ printf("%.*s: %.*s\n", (int)headers[i].name_len, headers[i].name,
+ (int)headers[i].value_len, headers[i].value);
+}
+```
+
+### phr_parse_response, phr_parse_headers
+
+`phr_parse_response` and `phr_parse_headers` provide similar interfaces as `phr_parse_request`. `phr_parse_response` parses an HTTP response, and `phr_parse_headers` parses the headers only.
+
+### phr_decode_chunked
+
+The example below decodes incoming data in chunked-encoding. The data is decoded in-place.
+
+```c
+struct phr_chunked_decoder decoder = {}; /* zero-clear */
+char *buf = malloc(4096);
+size_t size = 0, capacity = 4096, rsize;
+ssize_t rret, pret;
+
+/* set consume_trailer to 1 to discard the trailing header, or the application
+ * should call phr_parse_headers to parse the trailing header */
+decoder.consume_trailer = 1;
+
+do {
+ /* expand the buffer if necessary */
+ if (size == capacity) {
+ capacity *= 2;
+ buf = realloc(buf, capacity);
+ assert(buf != NULL);
+ }
+ /* read */
+ while ((rret = read(sock, buf + size, capacity - size)) == -1 && errno == EINTR)
+ ;
+ if (rret <= 0)
+ return IOError;
+ /* decode */
+ rsize = rret;
+ pret = phr_decode_chunked(&decoder, buf + size, &rsize);
+ if (pret == -1)
+ return ParseError;
+ size += rsize;
+} while (pret == -2);
+
+/* successfully decoded the chunked data */
+assert(pret >= 0);
+printf("decoded data is at %p (%zu bytes)\n", buf, size);
+```
+
+Benchmark
+---------
+
+![benchmark results](http://i.gyazo.com/a85c18d3162dfb46b485bb41e0ad443a.png)
+
+The benchmark code is from [fukamachi/fast-http@6b91103](https://github.com/fukamachi/fast-http/tree/6b9110347c7a3407310c08979aefd65078518478).
+
+The internals of picohttpparser have been described to some extent in [my blog entry](http://blog.kazuhooku.com/2014/11/the-internals-h2o-or-how-to-write-fast.html).
diff --git a/src/server/picohttpparser/picohttpparser.c b/src/server/picohttpparser/picohttpparser.c
new file mode 100644
index 0000000..cfa05ef
--- /dev/null
+++ b/src/server/picohttpparser/picohttpparser.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase,
+ * Shigeo Mitsunari
+ *
+ * The software is licensed under either the MIT License (below) or the Perl
+ * license.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#ifdef __SSE4_2__
+#ifdef _MSC_VER
+#include <nmmintrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+#include "picohttpparser.h"
+
+/* $Id$ */
+/* Branch-prediction hints: on compilers defining __GNUC__ >= 3 likely() and
+ * unlikely() wrap __builtin_expect; elsewhere they expand to the bare
+ * expression. */
+#if __GNUC__ >= 3
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
+/* ALIGNED(n): compiler-specific spelling of an n-byte alignment request. */
+#ifdef _MSC_VER
+#define ALIGNED(n) _declspec(align(n))
+#else
+#define ALIGNED(n) __attribute__((aligned(n)))
+#endif
+/* True for bytes 0x20 (SP) through 0x7e ('~'). The unsigned subtraction makes
+ * bytes below 0x20 wrap around to large values, so one comparison suffices. */
+#define IS_PRINTABLE_ASCII(c) ((unsigned char)(c)-040u < 0137u)
+/* The macros below require `buf`, `buf_end` and `int *ret` to be in scope and
+ * make the ENCLOSING function return NULL on failure.
+ * CHECK_EOF: input exhausted -> *ret = -2 (incomplete). */
+#define CHECK_EOF() \
+ if (buf == buf_end) { \
+ *ret = -2; \
+ return NULL; \
+ }
+/* EXPECT_CHAR_NO_CHECK: consume one byte without a bounds check; if it is not
+ * `ch` -> *ret = -1 (parse error). */
+#define EXPECT_CHAR_NO_CHECK(ch) \
+ if (*buf++ != ch) { \
+ *ret = -1; \
+ return NULL; \
+ }
+/* EXPECT_CHAR: bounds-checked variant (CHECK_EOF, then EXPECT_CHAR_NO_CHECK). */
+#define EXPECT_CHAR(ch) \
+ CHECK_EOF(); \
+ EXPECT_CHAR_NO_CHECK(ch);
+/* ADVANCE_TOKEN(tok, toklen): scan one space-terminated token starting at
+ * `buf` (requires `buf`, `buf_end` and `int *ret` in scope).
+ *
+ * findchar_fast first skips ahead to a byte in 0x00-0x20 or equal to 0x7f, then
+ * the loop walks byte by byte until it sees ' '. On success `buf` is left ON
+ * the terminating space and tok/toklen receive the token start and length.
+ * A byte below 0x20 or equal to 0x7f makes the enclosing function return NULL
+ * with *ret = -1; running out of input returns NULL with *ret = -2. Bytes
+ * with the high bit set are accepted as part of the token.
+ */
+#define ADVANCE_TOKEN(tok, toklen) \
+ do { \
+ const char *tok_start = buf; \
+ static const char ALIGNED(16) ranges2[] = "\000\040\177\177"; \
+ int found2; \
+ buf = findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, &found2); \
+ if (!found2) { \
+ CHECK_EOF(); \
+ } \
+ while (1) { \
+ if (*buf == ' ') { \
+ break; \
+ } else if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { \
+ if ((unsigned char)*buf < '\040' || *buf == '\177') { \
+ *ret = -1; \
+ return NULL; \
+ } \
+ } \
+ ++buf; \
+ CHECK_EOF(); \
+ } \
+ tok = tok_start; \
+ toklen = buf - tok_start; \
+ } while (0)
+/* Lookup table with one entry per byte value (8 rows of 32 entries), indexed
+ * by (unsigned char). A \1 entry marks a byte that parse_headers accepts
+ * inside a header field name; a \0 entry makes it reject the header. */
+static const char *token_char_map = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0"
+ "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1"
+ "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
+ "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+/* Fast pre-scan of [buf, buf_end) for the first byte that matches `ranges`
+ * (`ranges_size` bytes, handed to _mm_cmpestri in _SIDD_CMP_RANGES mode).
+ *
+ * Returns a pointer to the matching byte with *found = 1. If nothing matched,
+ * *found = 0 and the returned pointer is where the caller must continue byte
+ * by byte: only whole 16-byte blocks are examined, and without SSE4.2 support
+ * `buf` is returned unchanged.
+ */
+static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found)
+{
+ *found = 0;
+#if __SSE4_2__
+ if (likely(buf_end - buf >= 16)) {
+ __m128i ranges16 = _mm_loadu_si128((const __m128i *)ranges); /* NOTE(review): always loads 16 bytes from `ranges`, whatever ranges_size is */
+
+ size_t left = (buf_end - buf) & ~15; /* byte count rounded down to whole 16-byte blocks */
+ do {
+ __m128i b16 = _mm_loadu_si128((const __m128i *)buf);
+ int r = _mm_cmpestri(ranges16, ranges_size, b16, 16, _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS);
+ if (unlikely(r != 16)) { /* r is the index of the first match inside this block, 16 means none */
+ buf += r;
+ *found = 1;
+ break;
+ }
+ buf += 16;
+ left -= 16;
+ } while (likely(left != 0));
+ }
+#else
+ /* suppress unused parameter warning */
+ (void)buf_end;
+ (void)ranges;
+ (void)ranges_size;
+#endif
+ return buf;
+}
+/* Reads the remainder of the current line into *token.
+ *
+ * Scans forward from `buf` to the first control byte other than HT (a byte
+ * below 0x20 except 0x09) or DEL (0x7f). That byte must start a line
+ * terminator: either CR LF or a bare LF. On success token->s / token->l
+ * describe the text before the terminator and the returned pointer is just
+ * past the terminator.
+ *
+ * Returns NULL with *ret = -1 if the stopping byte is not CR/LF or CR is not
+ * followed by LF, and NULL with *ret = -2 if the buffer ends first.
+ */
+static const char *get_token_to_eol(const char *buf, const char *buf_end, struct string *token, int *ret)
+{
+ const char *token_start = buf;
+
+#ifdef __SSE4_2__
+ static const char ranges1[] = "\0\010"
+ /* allow HT */
+ "\012\037"
+ /* allow SP and up to but not including DEL */
+ "\177\177"
+ /* allow chars w. MSB set */
+ ;
+ int found;
+ buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
+ if (found)
+ goto FOUND_CTL;
+#else
+ /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
+ while (likely(buf_end - buf >= 8)) {
+#define DOIT() \
+ do { \
+ if (unlikely(!IS_PRINTABLE_ASCII(*buf))) \
+ goto NonPrintable; \
+ ++buf; \
+ } while (0)
+ DOIT();
+ DOIT();
+ DOIT();
+ DOIT();
+ DOIT();
+ DOIT();
+ DOIT();
+ DOIT();
+#undef DOIT
+ continue;
+ NonPrintable:
+ if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) {
+ goto FOUND_CTL;
+ }
+ ++buf; /* HT or a byte >= 0x80: allowed inside the token, keep scanning */
+ }
+#endif
+ for (;; ++buf) { /* byte-wise scan of whatever the fast path left over */
+ CHECK_EOF();
+ if (unlikely(!IS_PRINTABLE_ASCII(*buf))) {
+ if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) {
+ goto FOUND_CTL;
+ }
+ }
+ }
+FOUND_CTL:
+ if (likely(*buf == '\015')) {
+ ++buf;
+ EXPECT_CHAR('\012');
+ token->l = buf - 2 - token_start; /* buf is past CR LF, so drop both from the length */
+ } else if (*buf == '\012') {
+ token->l = buf - token_start;
+ ++buf;
+ } else {
+ *ret = -1;
+ return NULL;
+ }
+ token->s = token_start;
+
+ return buf;
+}
+/* Checks whether the buffer contains two consecutive line terminators (CR LF
+ * or bare LF), i.e. the blank line that ends a header block.
+ *
+ * `last_len` is the amount of data that was already inspected earlier; when it
+ * is at least 3 the scan starts 3 bytes before that offset instead of at the
+ * beginning. Returns a pointer just past the second terminator, or NULL with
+ * *ret = -2 if the buffer ends first, or NULL with *ret = -1 if a CR is not
+ * followed by LF.
+ */
+static const char *is_complete(const char *buf, const char *buf_end, size_t last_len, int *ret)
+{
+ int ret_cnt = 0; /* number of consecutive line terminators seen so far */
+ buf = last_len < 3 ? buf : buf + last_len - 3;
+
+ while (1) {
+ CHECK_EOF();
+ if (*buf == '\015') {
+ ++buf;
+ CHECK_EOF(); /* EXPECT_CHAR below performs the same end-of-buffer check again */
+ EXPECT_CHAR('\012');
+ ++ret_cnt;
+ } else if (*buf == '\012') {
+ ++buf;
+ ++ret_cnt;
+ } else {
+ ++buf;
+ ret_cnt = 0;
+ }
+ if (ret_cnt == 2) {
+ return buf;
+ }
+ }
+
+ *ret = -2; /* not reached: the loop above is only left via return */
+ return NULL;
+}
+/* PARSE_INT(valp_, mul_): consume one decimal digit at `buf` and store
+ * mul_ * digit in *valp_. A non-digit makes the enclosing function return
+ * NULL with *ret = -1. No bounds check is done here; callers verify the
+ * available length beforehand. Requires `buf` and `int *ret` in scope. */
+#define PARSE_INT(valp_, mul_) \
+ if (*buf < '0' || '9' < *buf) { \
+ buf++; \
+ *ret = -1; \
+ return NULL; \
+ } \
+ *(valp_) = (mul_) * (*buf++ - '0');
+/* PARSE_INT_3(valp_): consume exactly three decimal digits (hundreds, tens,
+ * ones) and store their value in *valp_. */
+#define PARSE_INT_3(valp_) \
+ do { \
+ int res_ = 0; \
+ PARSE_INT(&res_, 100) \
+ *valp_ = res_; \
+ PARSE_INT(&res_, 10) \
+ *valp_ += res_; \
+ PARSE_INT(&res_, 1) \
+ *valp_ += res_; \
+ } while (0)
+/* Parses the literal "HTTP/1." followed by one decimal digit, which is stored
+ * in *minor_version. With fewer than 9 bytes available it returns NULL with
+ * *ret = -2; on any mismatch it returns NULL with *ret = -1. Exactly 8 bytes
+ * are consumed on success. */
+/* returned pointer is always within [buf, buf_end), or null */
+static const char *parse_http_version(const char *buf, const char *buf_end, int *minor_version, int *ret)
+{
+ /* we want at least [HTTP/1.<two chars>] to try to parse */
+ if (buf_end - buf < 9) {
+ *ret = -2;
+ return NULL;
+ }
+ EXPECT_CHAR_NO_CHECK('H');
+ EXPECT_CHAR_NO_CHECK('T');
+ EXPECT_CHAR_NO_CHECK('T');
+ EXPECT_CHAR_NO_CHECK('P');
+ EXPECT_CHAR_NO_CHECK('/');
+ EXPECT_CHAR_NO_CHECK('1');
+ EXPECT_CHAR_NO_CHECK('.');
+ PARSE_INT(minor_version, 1);
+ return buf;
+}
+/* Parses header lines up to and including the empty line that ends the header
+ * block.
+ *
+ * headers[*num_headers] is filled for every line and *num_headers is advanced
+ * once per stored line; the caller must set *num_headers before the call (the
+ * public entry points in this file set it to 0). A line that begins with SP or
+ * HT, other than the very first one, is stored as a continuation line with
+ * name.s == NULL and name.l == 0. Spaces and tabs after the colon are skipped
+ * before the value is read with get_token_to_eol.
+ *
+ * Returns a pointer just past the terminating empty line, or NULL with
+ *   *ret = -1  malformed input, an empty header name, or more than
+ *              max_headers lines;
+ *   *ret = -2  the buffer ended before the header block did.
+ */
+static const char *parse_headers(const char *buf, const char *buf_end, struct phr_header *headers, size_t *num_headers,
+ size_t max_headers, int *ret)
+{
+ for (;; ++*num_headers) {
+ CHECK_EOF();
+ if (*buf == '\015') {
+ ++buf;
+ EXPECT_CHAR('\012');
+ break;
+ } else if (*buf == '\012') {
+ ++buf;
+ break;
+ }
+ if (*num_headers == max_headers) {
+ *ret = -1;
+ return NULL;
+ }
+ if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) {
+ /* parsing name, but do not discard SP before colon, see
+ * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */
+ headers[*num_headers].name.s = buf;
+ static const char ALIGNED(16) ranges1[] = "\x00 " /* control chars and up to SP */
+ "\"\"" /* 0x22 */
+ "()" /* 0x28,0x29 */
+ ",," /* 0x2c */
+ "//" /* 0x2f */
+ ":@" /* 0x3a-0x40 */
+ "[]" /* 0x5b-0x5d */
+ "{\377"; /* 0x7b-0xff */
+ int found;
+ buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
+ if (!found) {
+ CHECK_EOF();
+ }
+ while (1) {
+ if (*buf == ':') {
+ break;
+ } else if (!token_char_map[(unsigned char)*buf]) {
+ *ret = -1;
+ return NULL;
+ }
+ ++buf;
+ CHECK_EOF();
+ }
+ if ((headers[*num_headers].name.l = buf - headers[*num_headers].name.s) == 0) {
+ *ret = -1;
+ return NULL;
+ }
+ ++buf; /* step over ':' */
+ for (;; ++buf) {
+ CHECK_EOF();
+ if (!(*buf == ' ' || *buf == '\t')) {
+ break;
+ }
+ }
+ } else {
+ headers[*num_headers].name.s = NULL;
+ headers[*num_headers].name.l = 0;
+ }
+ if ((buf = get_token_to_eol(buf, buf_end, &headers[*num_headers].value, ret)) == NULL) {
+ return NULL;
+ }
+ }
+ return buf;
+}
+/* Parses a request line ("<method> <path> HTTP/1.<digit>" terminated by CR LF
+ * or a bare LF) followed by the header block.
+ *
+ * A single leading empty line is skipped. method/path receive slices of the
+ * input buffer, *minor_version the digit after "HTTP/1.". Returns whatever
+ * parse_headers returns on success, or NULL with *ret = -1 (malformed) or
+ * *ret = -2 (incomplete).
+ */
+static const char *parse_request(const char *buf, const char *buf_end, struct string *method, struct string *path,
+ int *minor_version, struct phr_header *headers, size_t *num_headers,
+ size_t max_headers, int *ret)
+{
+ /* skip first empty line (some clients add CRLF after POST content) */
+ CHECK_EOF();
+ if (*buf == '\015') {
+ ++buf;
+ EXPECT_CHAR('\012');
+ } else if (*buf == '\012') {
+ ++buf;
+ }
+
+ /* parse request line */
+ ADVANCE_TOKEN(method->s, method->l);
+ ++buf; /* ADVANCE_TOKEN stops on the space; step over it */
+ ADVANCE_TOKEN(path->s, path->l);
+ ++buf;
+ if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
+ return NULL;
+ }
+ if (*buf == '\015') { /* in bounds: parse_http_version demands 9 bytes and consumes 8 */
+ ++buf;
+ EXPECT_CHAR('\012');
+ } else if (*buf == '\012') {
+ ++buf;
+ } else {
+ *ret = -1;
+ return NULL;
+ }
+
+ return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret);
+}
+
+/* Public entry point: parse an HTTP request held in buf_start[0..len).
+ *
+ * On entry *num_headers is the capacity of `headers`; on return it holds the
+ * number of header lines stored. All output parameters are reset before
+ * parsing starts. `last_len` is the amount of data a previous, incomplete call
+ * already saw (0 on the first call).
+ *
+ * Returns the number of bytes consumed, -2 if the request is incomplete, or
+ * -1 if it is malformed.
+ */
+int phr_parse_request(const char *buf_start, size_t len, struct string *method, struct string *path,
+                      int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len)
+{
+    const char *const input_end = buf_start + len;
+    const size_t header_capacity = *num_headers;
+    const char *cursor;
+    int status;
+
+    method->s = NULL;
+    method->l = 0;
+    path->s = NULL;
+    path->l = 0;
+    *minor_version = -1;
+    *num_headers = 0;
+
+    /* if last_len != 0, check if the request is complete (a fast countermeasure
+       against slowloris) */
+    if (last_len != 0) {
+        if (is_complete(buf_start, input_end, last_len, &status) == NULL)
+            return status;
+    }
+
+    cursor = parse_request(buf_start, input_end, method, path, minor_version, headers, num_headers,
+                           header_capacity, &status);
+    if (cursor == NULL)
+        return status;
+
+    return (int)(cursor - buf_start);
+}
+/* Parses a status line ("HTTP/1.<digit> <3-digit status> <message>" plus line
+ * terminator) followed by the header block.
+ *
+ * *minor_version and *status receive the parsed numbers, *msg a slice of the
+ * input holding the reason phrase. Returns whatever parse_headers returns on
+ * success, or NULL with *ret = -1 (malformed) or *ret = -2 (incomplete).
+ */
+static const char *parse_response(const char *buf, const char *buf_end, int *minor_version, int *status, struct string *msg,
+ struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret)
+{
+ /* parse "HTTP/1.x" */
+ if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
+ return NULL;
+ }
+ /* skip space */
+ if (*buf++ != ' ') { /* in bounds: parse_http_version demands 9 bytes and consumes 8 */
+ *ret = -1;
+ return NULL;
+ }
+ /* parse status code, we want at least [:digit:][:digit:][:digit:]<other char> to try to parse */
+ if (buf_end - buf < 4) {
+ *ret = -2;
+ return NULL;
+ }
+ PARSE_INT_3(status);
+
+ /* skip space */
+ if (*buf++ != ' ') {
+ *ret = -1;
+ return NULL;
+ }
+ /* get message */
+ if ((buf = get_token_to_eol(buf, buf_end, msg, ret)) == NULL) {
+ return NULL;
+ }
+
+ return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret);
+}
+
+/* Public entry point: parse an HTTP response held in buf_start[0..len).
+ *
+ * On entry *num_headers is the capacity of `headers`; on return it holds the
+ * number of header lines stored. All output parameters are reset before
+ * parsing starts. `last_len` is the amount of data a previous, incomplete call
+ * already saw (0 on the first call).
+ *
+ * Returns the number of bytes consumed, -2 if the response is incomplete, or
+ * -1 if it is malformed.
+ */
+int phr_parse_response(const char *buf_start, size_t len, int *minor_version, int *status, struct string *msg,
+                       struct phr_header *headers, size_t *num_headers, size_t last_len)
+{
+    const char *const input_end = buf_start + len;
+    const size_t header_capacity = *num_headers;
+    const char *cursor;
+    int result;
+
+    *minor_version = -1;
+    *status = 0;
+    msg->s = NULL;
+    msg->l = 0;
+    *num_headers = 0;
+
+    /* if last_len != 0, check if the response is complete (a fast countermeasure
+       against slowloris) */
+    if (last_len != 0) {
+        if (is_complete(buf_start, input_end, last_len, &result) == NULL)
+            return result;
+    }
+
+    cursor = parse_response(buf_start, input_end, minor_version, status, msg, headers, num_headers,
+                            header_capacity, &result);
+    if (cursor == NULL)
+        return result;
+
+    return (int)(cursor - buf_start);
+}
+
+/* Public entry point: parse a bare header block held in buf_start[0..len).
+ *
+ * On entry *num_headers is the capacity of `headers`; on return it holds the
+ * number of header lines stored. `last_len` is the amount of data a previous,
+ * incomplete call already saw (0 on the first call).
+ *
+ * Returns the number of bytes consumed, -2 if the block is incomplete, or -1
+ * if it is malformed.
+ */
+int phr_parse_headers(const char *buf_start, size_t len, struct phr_header *headers, size_t *num_headers, size_t last_len)
+{
+    const char *const input_end = buf_start + len;
+    const size_t header_capacity = *num_headers;
+    const char *cursor;
+    int result;
+
+    *num_headers = 0;
+
+    /* if last_len != 0, check if the header block is complete (a fast
+       countermeasure against slowloris) */
+    if (last_len != 0) {
+        if (is_complete(buf_start, input_end, last_len, &result) == NULL)
+            return result;
+    }
+
+    cursor = parse_headers(buf_start, input_end, headers, num_headers, header_capacity, &result);
+    if (cursor == NULL)
+        return result;
+
+    return (int)(cursor - buf_start);
+}
+/* States of the chunked-encoding decoder; phr_decode_chunked keeps the current
+ * one in phr_chunked_decoder._state. */
+enum {
+ CHUNKED_IN_CHUNK_SIZE,
+ CHUNKED_IN_CHUNK_EXT,
+ CHUNKED_IN_CHUNK_DATA,
+ CHUNKED_IN_CHUNK_CRLF,
+ CHUNKED_IN_TRAILERS_LINE_HEAD,
+ CHUNKED_IN_TRAILERS_LINE_MIDDLE
+};
+
+/* Converts one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F') to its
+ * numeric value 0..15. Returns -1 for any other input. */
+static int decode_hex(int ch)
+{
+    if (ch >= '0' && ch <= '9')
+        return ch - '0';
+    if (ch >= 'a' && ch <= 'f')
+        return 0xa + (ch - 'a');
+    if (ch >= 'A' && ch <= 'F')
+        return 0xa + (ch - 'A');
+    return -1;
+}
+/* Decodes chunked transfer-encoding in place.
+ *
+ * `buf` holds *_bufsz bytes of newly received data. `src` is the read index
+ * into the encoded input and `dst` the write index for decoded payload; chunk
+ * payload is moved down over the chunk-size lines and CRLFs that precede it.
+ * The parsing position is kept in *decoder so that the function can be called
+ * again with the next piece of data.
+ *
+ * On return *_bufsz is the number of decoded payload bytes now at the start of
+ * `buf`. Return value:
+ *   -2    more input is needed;
+ *   -1    malformed input;
+ *   >= 0  the end of the chunked data was reached; the value is the number of
+ *         bytes that followed it, which have been moved directly behind the
+ *         decoded payload.
+ */
+ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *_bufsz)
+{
+ size_t dst = 0, src = 0, bufsz = *_bufsz;
+ ssize_t ret = -2; /* incomplete */
+
+ while (1) {
+ switch (decoder->_state) {
+ case CHUNKED_IN_CHUNK_SIZE:
+ for (;; ++src) {
+ int v;
+ if (src == bufsz)
+ goto Exit;
+ if ((v = decode_hex(buf[src])) == -1) {
+ if (decoder->_hex_count == 0) { /* a chunk-size line must start with a hex digit */
+ ret = -1;
+ goto Exit;
+ }
+ break;
+ }
+ if (decoder->_hex_count == sizeof(size_t) * 2) { /* one more digit would not fit into size_t */
+ ret = -1;
+ goto Exit;
+ }
+ decoder->bytes_left_in_chunk = decoder->bytes_left_in_chunk * 16 + v;
+ ++decoder->_hex_count;
+ }
+ decoder->_hex_count = 0;
+ decoder->_state = CHUNKED_IN_CHUNK_EXT;
+ /* fallthru */
+ case CHUNKED_IN_CHUNK_EXT:
+ /* RFC 7230 A.2 "Line folding in chunk extensions is disallowed" */
+ for (;; ++src) { /* skip the rest of the chunk-size line up to LF */
+ if (src == bufsz)
+ goto Exit;
+ if (buf[src] == '\012')
+ break;
+ }
+ ++src;
+ if (decoder->bytes_left_in_chunk == 0) { /* a zero-sized chunk marks the end of the data */
+ if (decoder->consume_trailer) {
+ decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD;
+ break;
+ } else {
+ goto Complete;
+ }
+ }
+ decoder->_state = CHUNKED_IN_CHUNK_DATA;
+ /* fallthru */
+ case CHUNKED_IN_CHUNK_DATA: {
+ size_t avail = bufsz - src;
+ if (avail < decoder->bytes_left_in_chunk) { /* chunk continues beyond this buffer */
+ if (dst != src)
+ memmove(buf + dst, buf + src, avail);
+ src += avail;
+ dst += avail;
+ decoder->bytes_left_in_chunk -= avail;
+ goto Exit;
+ }
+ if (dst != src)
+ memmove(buf + dst, buf + src, decoder->bytes_left_in_chunk);
+ src += decoder->bytes_left_in_chunk;
+ dst += decoder->bytes_left_in_chunk;
+ decoder->bytes_left_in_chunk = 0;
+ decoder->_state = CHUNKED_IN_CHUNK_CRLF;
+ }
+ /* fallthru */
+ case CHUNKED_IN_CHUNK_CRLF:
+ for (;; ++src) { /* skip any number of CR bytes */
+ if (src == bufsz)
+ goto Exit;
+ if (buf[src] != '\015')
+ break;
+ }
+ if (buf[src] != '\012') {
+ ret = -1;
+ goto Exit;
+ }
+ ++src;
+ decoder->_state = CHUNKED_IN_CHUNK_SIZE;
+ break;
+ case CHUNKED_IN_TRAILERS_LINE_HEAD:
+ for (;; ++src) {
+ if (src == bufsz)
+ goto Exit;
+ if (buf[src] != '\015')
+ break;
+ }
+ if (buf[src++] == '\012') /* an empty line ends the trailer section */
+ goto Complete;
+ decoder->_state = CHUNKED_IN_TRAILERS_LINE_MIDDLE;
+ /* fallthru */
+ case CHUNKED_IN_TRAILERS_LINE_MIDDLE:
+ for (;; ++src) { /* discard the trailer line up to LF */
+ if (src == bufsz)
+ goto Exit;
+ if (buf[src] == '\012')
+ break;
+ }
+ ++src;
+ decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD;
+ break;
+ default:
+ assert(!"decoder is corrupt");
+ }
+ }
+
+Complete:
+ ret = bufsz - src;
+Exit:
+ if (dst != src)
+ memmove(buf + dst, buf + src, bufsz - src); /* move unconsumed input directly behind the decoded payload */
+ *_bufsz = dst;
+ return ret;
+}
+
+/* Reports whether the decoder is currently inside the payload of a chunk
+ * (state CHUNKED_IN_CHUNK_DATA): returns 1 if so, 0 otherwise. */
+int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder)
+{
+    if (decoder->_state != CHUNKED_IN_CHUNK_DATA)
+        return 0;
+    return 1;
+}
+
+#undef CHECK_EOF
+#undef EXPECT_CHAR
+#undef ADVANCE_TOKEN
diff --git a/src/server/picohttpparser/picohttpparser.h b/src/server/picohttpparser/picohttpparser.h
new file mode 100644
index 0000000..b315795
--- /dev/null
+++ b/src/server/picohttpparser/picohttpparser.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase,
+ * Shigeo Mitsunari
+ *
+ * The software is licensed under either the MIT License (below) or the Perl
+ * license.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef picohttpparser_h
+#define picohttpparser_h
+
+#include <sys/types.h>
+
+#ifdef _MSC_VER
+#define ssize_t intptr_t
+#endif
+
+/* $Id$ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Length-delimited string slice: `l` bytes starting at `s`. The parser fills
+ * these with pointers into the caller's input buffer, so the bytes are not
+ * NUL-terminated at `s + l`. */
+struct string {
+ const char *s;
+ size_t l;
+};
+
+/* contains name and value of a header (name.s == NULL and name.l == 0 if this
+ * is a continuation line of a multiline header) */
+struct phr_header {
+ struct string name;
+ struct string value;
+};
+
+/* Parses an HTTP request from buf[0..len). On entry *num_headers is the
+ * capacity of `headers`, on return the number of header lines stored.
+ * `last_len` is the amount of data that was already present on a previous call
+ * that returned -2 (pass 0 on the first call).
+ * returns number of bytes consumed if successful, -2 if request is partial,
+ * -1 if failed */
+int phr_parse_request(const char *buf, size_t len, struct string *method, struct string *path,
+ int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len);
+
+/* same conventions and return values, for an HTTP response (status line plus headers) */
+int phr_parse_response(const char *_buf, size_t len, int *minor_version, int *status, struct string *msg,
+ struct phr_header *headers, size_t *num_headers, size_t last_len);
+
+/* same conventions and return values, for a bare header block */
+int phr_parse_headers(const char *buf, size_t len, struct phr_header *headers, size_t *num_headers, size_t last_len);
+/* State of a chunked-encoding decoder, carried across phr_decode_chunked
+ * calls. The members starting with an underscore are written by the decoder
+ * itself. */
+/* should be zero-filled before start */
+struct phr_chunked_decoder {
+ size_t bytes_left_in_chunk; /* number of bytes left in current chunk */
+ char consume_trailer; /* if trailing headers should be consumed */
+ char _hex_count; /* hex digits of the current chunk-size line read so far */
+ char _state; /* current decoder state (the CHUNKED_IN_* values in picohttpparser.c) */
+};
+
+/* the function rewrites the buffer given as (buf, bufsz) removing the chunked-
+ * encoding headers. When the function returns without an error, bufsz is
+ * updated to the length of the decoded data available. Applications should
+ * repeatedly call the function while it returns -2 (incomplete) every time
+ * supplying newly arrived data. If the end of the chunked-encoded data is
+ * found, the function returns a non-negative number indicating the number of
+ * octets left undecoded at the tail of the supplied buffer. Returns -1 on
+ * error.
+ */
+ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *bufsz);
+
+/* returns non-zero if the chunked decoder is in the middle of chunk data */
+int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/server/rpc.c b/src/server/rpc.c
new file mode 100644
index 0000000..1ea09cb
--- /dev/null
+++ b/src/server/rpc.c
@@ -0,0 +1,504 @@
+#include "rpc.h"
+#include "helper.h"
+#include "net.h"
+#include "uplink.h"
+#include "locks.h"
+#include "image.h"
+#include "altservers.h"
+#include "../shared/sockhelper.h"
+#include "fileutil.h"
+#include "picohttpparser/picohttpparser.h"
+#include "urldecode.h"
+
+#include <jansson.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#if JANSSON_VERSION_HEX < 0x020600
+#define json_stringn_nocheck(a,b) json_string_nocheck(a)
+#endif
+/* Permission bits of an ACL rule. handleStatus() checks them before exposing
+ * the matching part of the status document; ACL_ALL sets every bit. */
+#define ACL_ALL 0x7fffffff
+#define ACL_STATS 1
+#define ACL_CLIENT_LIST 2
+#define ACL_IMAGE_LIST 4
+#define ACL_CONFIG 8
+#define ACL_LOG 16
+#define ACL_ALTSERVERS 32
+/* Values of the keepAlive argument of sendReply() and handleStatus(). */
+#define HTTP_CLOSE 4
+#define HTTP_KEEPALIVE 9
+/* Helpers for struct string (pointer + length, see picohttpparser.h) used
+ * together with string LITERALS - they rely on sizeof(literal):
+ *   STRCMP(str, lit)    true if str is non-NULL and equal to lit
+ *   STRSTART(str, lit)  true if str is non-NULL and begins with lit
+ *   SETSTR(name, lit)   point an existing struct string at lit
+ *   DEFSTR(name, lit)   define a static struct string holding lit
+ *   chartolower(c)      lower-case conversion for 'A'..'Z' only
+ */
+// Make sure compiler does not reserve more space for static strings than required (or rather, does not tell so in sizeof calls)
+// TODO Might be time for a dedicated string.h
+_Static_assert( sizeof("test") == 5 && sizeof("test2") == 6, "Stringsize messup :/" );
+#define STRCMP(str,chr) ( (str).s != NULL && (str).l == sizeof(chr)-1 && strncmp( (str).s, (chr), MIN((str).l, sizeof(chr)-1) ) == 0 )
+#define STRSTART(str,chr) ( (str).s != NULL && (str).l >= sizeof(chr)-1 && strncmp( (str).s, (chr), MIN((str).l, sizeof(chr)-1) ) == 0 )
+#define SETSTR(name,value) do { name.s = value; name.l = sizeof(value)-1; } while (0)
+#define DEFSTR(name,value) static struct string name = { .s = value, .l = sizeof(value)-1 };
+#define chartolower(c) ((char)( (c) >= 'A' && (c) <= 'Z' ? (c) + ('a'-'A') : (c) ))
+/* Constant strings the request handling compares against. */
+DEFSTR(STR_CONNECTION, "connection")
+DEFSTR(STR_CLOSE, "close")
+DEFSTR(STR_QUERY, "/query")
+DEFSTR(STR_Q, "q")
+
+/**
+ * Byte-wise, case-sensitive comparison of two string slices.
+ * Two slices whose data pointer is NULL are considered equal; a NULL slice
+ * never equals a non-NULL one.
+ */
+static inline bool equals(struct string *s1,struct string *s2)
+{
+    const bool firstMissing = ( s1->s == NULL );
+    const bool secondMissing = ( s2->s == NULL );
+    if ( firstMissing || secondMissing ) {
+        return firstMissing && secondMissing;
+    }
+    return s1->l == s2->l && memcmp( s1->s, s2->s, s1->l ) == 0;
+}
+
+/**
+ * Case-insensitive comparison of two string slices.
+ * Only cmpMixed is lower-cased (ASCII 'A'..'Z' via chartolower) before the
+ * comparison, so cmpLower has to be all lower case already.
+ * Two slices whose data pointer is NULL are considered equal; a NULL slice
+ * never equals a non-NULL one.
+ */
+static inline bool iequals(struct string *cmpMixed, struct string *cmpLower)
+{
+    if ( cmpMixed->s == NULL || cmpLower->s == NULL ) {
+        return cmpMixed->s == NULL && cmpLower->s == NULL;
+    }
+    if ( cmpMixed->l != cmpLower->l ) return false;
+    const char *mixed = cmpMixed->s;
+    const char *lower = cmpLower->s;
+    for ( size_t left = cmpMixed->l; left != 0; --left, ++mixed, ++lower ) {
+        if ( chartolower( *mixed ) != *lower ) return false;
+    }
+    return true;
+}
+/* Module state.
+ * aclRules/aclCount hold the access rules that getacl() evaluates; addacl()
+ * takes aclLock while it appends to them (at most MAX_ACLS entries).
+ * randomRunId is computed once in rpc_init() and reported as "runId".
+ * status.count is the number of HTTP clients currently inside
+ * rpc_sendStatsJson(); more than MAX_CLIENTS are turned away with a 503.
+ * status.overloaded is maintained by UPDATE_LOADSTATE around CUTOFF_START and
+ * disables keep-alive while set. Both status fields are accessed under
+ * statusLock.
+ */
+#define MAX_ACLS 100
+static int aclCount = 0;
+static dnbd3_access_rule_t aclRules[MAX_ACLS];
+static json_int_t randomRunId;
+static pthread_spinlock_t aclLock;
+#define MAX_CLIENTS 50
+#define CUTOFF_START 40
+static pthread_spinlock_t statusLock;
+static struct {
+ int count;
+ bool overloaded;
+} status;
+
+static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
+static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive);
+static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc);
+static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value);
+static int getacl(dnbd3_host_t *host);
+static void addacl(int argc, char **argv, void *data);
+static void loadAcl();
+
+/**
+ * One-time initialisation of the RPC/HTTP status module.
+ *
+ * Initialises both spin locks, derives randomRunId from the process id and the
+ * current time and, if json_int_t is wider than 32 bits, shifts that value up
+ * and mixes in 32 bits read from /dev/urandom. Finally the ACL is loaded.
+ *
+ * If /dev/urandom cannot be opened the run id is left as derived from pid and
+ * time; if it cannot be read completely the fallback value 1 is mixed in.
+ */
+void rpc_init()
+{
+    spin_init( &aclLock, PTHREAD_PROCESS_PRIVATE );
+    spin_init( &statusLock, PTHREAD_PROCESS_PRIVATE );
+    randomRunId = (((json_int_t)getpid()) << 16) | (json_int_t)time(NULL);
+    if ( sizeof(randomRunId) > 4 ) {
+        int fd = open( "/dev/urandom", O_RDONLY );
+        if ( fd != -1 ) {
+            uint32_t bla = 1;
+            if ( read( fd, &bla, sizeof(bla) ) != (ssize_t)sizeof(bla) ) {
+                bla = 1; // short or failed read: do not use partially filled data
+            }
+            // Only close a descriptor that was actually opened (was close( -1 ) on failure)
+            close( fd );
+            randomRunId = (randomRunId << 32) | bla;
+        }
+    }
+    loadAcl();
+}
+/* Hysteresis for status.overloaded: the flag is set once cnt exceeds
+ * CUTOFF_START and cleared again only when cnt drops below CUTOFF_START/2;
+ * in between it keeps its value. Both uses in rpc_sendStatsJson() run while
+ * statusLock is held. */
+#define UPDATE_LOADSTATE(cnt) do { \
+ if ( cnt < (CUTOFF_START/2) ) { \
+ if ( status.overloaded ) status.overloaded = false; \
+ } else if ( cnt > CUTOFF_START ) { \
+ if ( !status.overloaded ) status.overloaded = true; \
+ } \
+} while (0)
+/**
+ * Serves HTTP requests on the given, already connected socket until the
+ * connection is to be closed.
+ *
+ * - host: address of the peer, looked up in the ACL via getacl(); a result of
+ *   0 is answered with "403 Forbidden".
+ * - data/dataLen: bytes of the request that the caller has already read from
+ *   the socket; they are copied to the start of the request buffer.
+ *
+ * The number of clients inside this function is tracked in status.count; when
+ * it exceeds MAX_CLIENTS the client gets a 503 reply. Each parsed request for
+ * the path "/query" is passed to handleStatus(), any other path yields a 404.
+ * The loop ends when a reply could not be sent or asked for the connection to
+ * be closed (sendReply() returns false in both cases), when the peer closes
+ * the connection, on a receive error or timeout, on a malformed request, or
+ * when the request does not fit into the 3000 byte buffer.
+ */
+void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int dataLen)
+{
+ int permissions = getacl( host );
+ if ( permissions == 0 ) {
+ sendReply( sock, "403 Forbidden", "text/plain", "Access denied", -1, HTTP_CLOSE );
+ return;
+ }
+ do {
+ spin_lock( &statusLock );
+ const int curCount = ++status.count;
+ UPDATE_LOADSTATE( curCount );
+ spin_unlock( &statusLock );
+ if ( curCount > MAX_CLIENTS ) {
+ sendReply( sock, "503 Service Temporarily Unavailable", "text/plain", "Too many HTTP clients", -1, HTTP_CLOSE );
+ goto func_return; // func_return decrements status.count again
+ }
+ } while (0);
+ char headerBuf[3000];
+ if ( dataLen > 0 ) {
+ // We call this function internally with a maximum data len of sizeof(dnbd3_request_t) so no bounds checking
+ memcpy( headerBuf, data, dataLen );
+ }
+ size_t hoff = dataLen; // number of valid bytes in headerBuf
+ bool hasName = false;
+ bool ok;
+ int keepAlive = HTTP_KEEPALIVE;
+ do {
+ // Read request from client
+ struct phr_header headers[100];
+ size_t numHeaders, prevLen = 0, consumed;
+ struct string method, path;
+ int minorVersion;
+ do {
+ // Parse before calling recv, there might be a complete pipelined request in the buffer already.
+ // If the request is incomplete we recv() more data and parse again.
+ // NOTE(review): earlier wording said "exactly one additional recv()", but this loop has no
+ // recv() counter - it repeats until the parser returns a result, the peer closes, recv() fails
+ // or headerBuf is full. Presumably a socket receive timeout bounds the wait - confirm.
+ // Pipelining still works as we re-enter this loop after processing the requests one by one,
+ // so a potential partial request in the buffer will get another recv() (blocking mode).
+ // The alternative would be manual tracking of idle/request time to protect
+ // against never ending requests (slowloris)
+ int pret;
+ if ( hoff >= sizeof(headerBuf) ) goto func_return; // Request too large
+ if ( hoff != 0 ) {
+ numHeaders = 100;
+ pret = phr_parse_request( headerBuf, hoff, &method, &path, &minorVersion, headers, &numHeaders, prevLen );
+ } else {
+ // Nothing in buffer yet, just set to -2 which is the phr return code for "partial request"
+ pret = -2;
+ }
+ if ( pret > 0 ) {
+ // > 0 means parsing completed without error
+ consumed = (size_t)pret;
+ break;
+ }
+ // Reaching here means partial request or parse error
+ if ( pret == -2 ) { // Partial, keep reading
+ prevLen = hoff;
+#ifdef AFL_MODE
+ ssize_t ret = recv( 0, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
+#else
+ ssize_t ret = recv( sock, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
+#endif
+ if ( ret == 0 ) goto func_return; // peer closed the connection
+ if ( ret == -1 ) {
+ if ( errno == EINTR ) continue;
+ if ( errno != EAGAIN && errno != EWOULDBLOCK ) {
+ sendReply( sock, "500 Internal Server Error", "text/plain", "Server made a boo-boo", -1, HTTP_CLOSE );
+ }
+ goto func_return; // Timeout or unknown error
+ }
+ hoff += ret;
+ } else { // Parse error
+ sendReply( sock, "400 Bad Request", "text/plain", "Server cannot understand what you're trying to say", -1, HTTP_CLOSE );
+ goto func_return;
+ }
+ } while ( true );
+ if ( keepAlive == HTTP_KEEPALIVE ) {
+ // Only keep the connection alive (and indicate so) if the client seems to support this
+ if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) {
+ keepAlive = HTTP_CLOSE;
+ } else { // And if there aren't too many active HTTP sessions
+ spin_lock( &statusLock );
+ if ( status.overloaded ) keepAlive = HTTP_CLOSE;
+ spin_unlock( &statusLock );
+ }
+ }
+ if ( method.s != NULL && path.s != NULL ) {
+ // Basic data filled from request parser
+ // Handle stuff
+ struct string file;
+ struct field getv[10];
+ size_t getc = 10;
+ parsePath( &path, &file, getv, &getc );
+ if ( method.s && method.s[0] == 'P' ) {
+ // POST only methods
+ }
+ // Don't care if GET or POST
+ if ( equals( &file, &STR_QUERY ) ) {
+ ok = handleStatus( sock, permissions, getv, getc, keepAlive );
+ } else {
+ ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive );
+ }
+ if ( !ok ) break; // sending failed or the connection was closed by sendReply()
+ }
+ // hoff might be beyond end if the client sent another request (burst)
+ const ssize_t extra = hoff - consumed;
+ if ( extra > 0 ) {
+ memmove( headerBuf, headerBuf + consumed, extra ); // keep the pipelined remainder for the next round
+ }
+ hoff = extra;
+ if ( !hasName ) {
+ hasName = true;
+ setThreadName( "HTTP" );
+ }
+ } while (true);
+func_return:;
+ do {
+ spin_lock( &statusLock );
+ const int curCount = --status.count;
+ UPDATE_LOADSTATE( curCount );
+ spin_unlock( &statusLock );
+ } while (0);
+}
+
+/**
+ * Sends a JSON object of the form {"errorMsg": <message>} as HTTP reply and
+ * closes the connection (HTTP_CLOSE).
+ *
+ * The reply is encoded for every call. The previous implementation cached the
+ * encoded text of the FIRST call in a static variable, so any later call with
+ * a different message still sent the first one; the unsynchronised static was
+ * also shared between threads, and a failed json_dumps() ended in
+ * strlen( NULL ).
+ *
+ * If the message cannot be encoded, a plain text 500 reply is sent instead.
+ */
+void rpc_sendErrorMessage(int sock, const char* message)
+{
+    json_t *tmp = json_pack( "{ss}", "errorMsg", message );
+    char *encoded = ( tmp == NULL ) ? NULL : json_dumps( tmp, 0 );
+    json_decref( tmp ); // json_decref() ignores NULL
+    if ( encoded == NULL ) {
+        sendReply( sock, "500 Internal Server Error", "text/plain", "Could not encode error message", -1, HTTP_CLOSE );
+        return;
+    }
+    sendReply( sock, "200 Somewhat OK", "application/json", encoded, strlen( encoded ), HTTP_CLOSE );
+    free( encoded ); // json_dumps() result is owned by the caller
+}
+
+/**
+ * Answers a "/query" request with a JSON status document.
+ *
+ * Every request field named "q" selects one section by its value: stats,
+ * space, images, clients, logfile, config or altservers. If a selected section
+ * is not covered by `permissions` (ACL_* bits) a "403 Forbidden" reply is sent
+ * instead. The document always contains "runId".
+ *
+ * @param sock socket to reply on
+ * @param permissions ACL_* bit mask of the requesting client
+ * @param fields decoded request fields
+ * @param fields_num number of entries in fields
+ * @param keepAlive HTTP_KEEPALIVE or HTTP_CLOSE, passed on to sendReply()
+ * @return result of sendReply(): true if the reply was sent and the connection
+ *         stays open, false otherwise
+ */
+static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive)
+{
+    bool ok;
+    bool stats = false, images = false, clients = false, space = false;
+    bool logfile = false, config = false, altservers = false;
+    // SETVAR(x): set flag x if the current field's value is the text "x"
+#define SETVAR(var) if ( !var && STRCMP(fields[i].value, #var) ) var = true
+    for (size_t i = 0; i < fields_num; ++i) {
+        if ( !equals( &fields[i].name, &STR_Q ) ) continue;
+        SETVAR(stats);
+        else SETVAR(space);
+        else SETVAR(images);
+        else SETVAR(clients);
+        else SETVAR(logfile);
+        else SETVAR(config);
+        else SETVAR(altservers);
+    }
+#undef SETVAR
+    if ( ( stats || space ) && !(permissions & ACL_STATS) ) {
+        return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access statistics", -1, keepAlive );
+    }
+    if ( images && !(permissions & ACL_IMAGE_LIST) ) {
+        return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access image list", -1, keepAlive );
+    }
+    if ( clients && !(permissions & ACL_CLIENT_LIST) ) {
+        return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access client list", -1, keepAlive );
+    }
+    if ( logfile && !(permissions & ACL_LOG) ) {
+        return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access log", -1, keepAlive );
+    }
+    if ( config && !(permissions & ACL_CONFIG) ) {
+        return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access config", -1, keepAlive );
+    }
+    if ( altservers && !(permissions & ACL_ALTSERVERS) ) {
+        return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access altservers", -1, keepAlive );
+    }
+
+    json_t *statisticsJson;
+    if ( stats ) {
+        int clientCount, serverCount;
+        uint64_t bytesSent;
+        const uint64_t bytesReceived = uplink_getTotalBytesReceived();
+        net_getStats( &clientCount, &serverCount, &bytesSent );
+        statisticsJson = json_pack( "{sIsIsisisIsI}",
+                "bytesReceived", (json_int_t) bytesReceived,
+                "bytesSent", (json_int_t) bytesSent,
+                "clientCount", clientCount,
+                "serverCount", serverCount,
+                "uptime", (json_int_t) dnbd3_serverUptime(),
+                "runId", randomRunId );
+    } else {
+        statisticsJson = json_pack( "{sI}",
+                "runId", randomRunId );
+    }
+    if ( space ) {
+        uint64_t spaceTotal = 0, spaceAvail = 0;
+        file_freeDiskSpace( _basePath, &spaceTotal, &spaceAvail );
+        json_object_set_new( statisticsJson, "spaceTotal", json_integer( spaceTotal ) );
+        json_object_set_new( statisticsJson, "spaceFree", json_integer( spaceAvail ) );
+    }
+    if ( clients ) {
+        json_object_set_new( statisticsJson, "clients", net_getListAsJson() );
+    }
+    if ( images ) {
+        json_object_set_new( statisticsJson, "images", image_getListAsJson() );
+    }
+    if ( logfile ) {
+        char logbuf[4000];
+        ssize_t len = log_fetch( logbuf, sizeof(logbuf) );
+        json_t *val;
+        if ( len <= 0 ) {
+            val = json_null();
+        } else {
+            val = json_stringn_nocheck( logbuf, (size_t)len );
+        }
+        json_object_set_new( statisticsJson, "logfile", val );
+    }
+    if ( config ) {
+        char buf[2000];
+        size_t len = globals_dumpConfig( buf, sizeof(buf) );
+        json_object_set_new( statisticsJson, "config", json_stringn_nocheck( buf, len ) );
+    }
+    if ( altservers ) {
+        json_object_set_new( statisticsJson, "altservers", altservers_toJson() );
+    }
+
+    char *jsonString = json_dumps( statisticsJson, 0 );
+    json_decref( statisticsJson );
+    if ( jsonString == NULL ) {
+        // json_dumps() failed (e.g. out of memory, or json_pack() above returned NULL).
+        // Passing NULL on with plen == -1 would make sendReply() call strlen( NULL ).
+        return sendReply( sock, "500 Internal Server Error", "text/plain", "Could not create JSON reply", -1, keepAlive );
+    }
+    ok = sendReply( sock, "200 OK", "application/json", jsonString, -1, keepAlive );
+    free( jsonString );
+    return ok;
+}
+
+/**
+ * Sends a complete HTTP/1.1 reply (status line, headers, payload).
+ *
+ * @param sock socket to send on
+ * @param status status code and reason phrase, e.g. "200 OK"
+ * @param ctype value for the Content-Type header (charset=utf-8 is appended)
+ * @param payload reply body
+ * @param plen length of payload, or -1 to use strlen( payload )
+ * @param keepAlive HTTP_KEEPALIVE to announce and keep a persistent
+ *        connection, HTTP_CLOSE to shut the sending side down after the reply
+ *        and drain the socket
+ * @return true only if everything was sent and the connection remains usable;
+ *         false on any send failure, if the header did not fit the buffer,
+ *         and always when keepAlive is HTTP_CLOSE
+ */
+static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive)
+{
+    char buffer[600];
+    if ( plen == -1 ) {
+        plen = strlen( payload );
+    }
+    const bool persistent = ( keepAlive == HTTP_KEEPALIVE );
+    const int hlen = snprintf(buffer, sizeof(buffer), "HTTP/1.1 %s\r\n"
+            "Connection: %s\r\n"
+            "Content-Type: %s; charset=utf-8\r\n"
+            "Content-Length: %u\r\n"
+            "\r\n",
+            status, persistent ? "Keep-Alive" : "Close", ctype, (unsigned int)plen );
+    const bool headerFits = ( hlen >= 0 && hlen < (int)sizeof(buffer) );
+    if ( !headerFits ) return false; // Truncated
+    // Header first (MSG_MORE: more data follows), then the payload
+    const bool sent = send( sock, buffer, hlen, MSG_MORE ) == hlen
+            && sock_sendAll( sock, payload, plen, 10 );
+    if ( !sent ) return false;
+    if ( keepAlive != HTTP_CLOSE ) return true;
+    // Wait for flush
+    shutdown( sock, SHUT_WR );
+#ifdef AFL_MODE
+    sock = 0;
+#endif
+    while ( read( sock, buffer, sizeof buffer ) > 0 ) {
+        // discard whatever the peer still sends until it closes its side
+    }
+    return false;
+}
+
+/**
+ * Split a request path at the first '?'.
+ * file receives the part before the '?' (or the whole path if there is none).
+ * The part after the '?' is handed to urldecode(), which fills getv/getc.
+ * If there is no '?', *getc is set to 0 and getv is left untouched.
+ * path itself points to the same data with the same length on return.
+ */
+static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc)
+{
+	size_t qpos;
+	for ( qpos = 0; qpos < path->l; ++qpos ) {
+		if ( path->s[qpos] == '?' ) break;
+	}
+	if ( qpos == path->l ) {
+		// No query string present
+		*getc = 0;
+		*file = *path;
+		return;
+	}
+	file->s = path->s;
+	file->l = qpos;
+	// Temporarily narrow path down to everything after the '?', decode, then restore
+	const size_t skip = qpos + 1;
+	path->s += skip;
+	path->l -= skip;
+	urldecode( path, getv, getc );
+	path->s -= skip;
+	path->l += skip;
+}
+
+/**
+ * Check whether any of the numHeaders entries in headers has the given
+ * name and the given value, both compared via iequals().
+ * A header of that name with a different value does not end the search.
+ */
+static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value)
+{
+	struct phr_header *hdr = headers;
+	for ( size_t left = numHeaders; left > 0; --left, ++hdr ) {
+		if ( iequals( &hdr->name, name ) && iequals( &hdr->value, value ) ) return true;
+	}
+	return false;
+}
+
+/**
+ * Look up the permission mask for the given host.
+ * Rules are checked in order, first match wins. With no rules at all,
+ * everything is allowed (0x7fffff). If no rule matches, 0 is returned
+ * (0x7fffff when built with AFL_MODE).
+ */
+static int getacl(dnbd3_host_t *host)
+{
+	if ( aclCount == 0 ) return 0x7fffff; // For now compat mode - no rules defined == all access
+	for ( int i = 0; i < aclCount; ++i ) {
+		const int prefixBytes = aclRules[i].bytes;
+		// Zero-length prefix: rule applies to every address
+		if ( prefixBytes == 0 && aclRules[i].bitMask == 0 ) return aclRules[i].permissions;
+		// All whole bytes of the prefix have to be identical
+		if ( memcmp( aclRules[i].host, host->addr, prefixBytes ) != 0 ) continue;
+		// Either there is no partial byte, or its masked bits have to be identical too
+		if ( aclRules[i].bitMask == 0
+				|| aclRules[i].host[prefixBytes] == ( host->addr[prefixBytes] & aclRules[i].bitMask ) ) {
+			return aclRules[i].permissions;
+		}
+	}
+#ifdef AFL_MODE
+	return 0x7fffff;
+#else
+	return 0;
+#endif
+}
+
+// Expands to one "else if" arm that ORs ACL_<x> into mask if argv[i] equals the flag name
+#define SETBIT(x) else if ( strcmp( argv[i], #x ) == 0 ) mask |= ACL_ ## x
+
+/**
+ * Line callback passed to file_loadLineBased() by loadAcl().
+ * argv[0] is an address, optionally followed by "/<prefix length>"; the remaining
+ * arguments are permission flag names (ALL, STATS, CLIENT_LIST, IMAGE_LIST).
+ * Lines starting with '#' are skipped. On success one entry is appended to aclRules.
+ * Rules that grant nothing, cannot be parsed, or carry a negative prefix length
+ * are ignored.
+ * Locks on: aclLock
+ */
+static void addacl(int argc, char **argv, void *data UNUSED)
+{
+	if ( argv[0][0] == '#' ) return;
+	spin_lock( &aclLock );
+	if ( aclCount >= MAX_ACLS ) {
+		logadd( LOG_WARNING, "Too many ACL rules, ignoring %s", argv[0] );
+		goto unlock_end;
+	}
+	int mask = 0;
+	for (int i = 1; i < argc; ++i) {
+		if (false) {}
+		SETBIT(ALL);
+		SETBIT(STATS);
+		SETBIT(CLIENT_LIST);
+		SETBIT(IMAGE_LIST);
+		else logadd( LOG_WARNING, "Invalid ACL flag '%s' for %s", argv[i], argv[0] );
+	}
+	if ( mask == 0 ) {
+		logadd( LOG_INFO, "Ignoring empty rule for %s", argv[0] );
+		goto unlock_end;
+	}
+	dnbd3_host_t host;
+	// Cut off the "/bits" part so only the address is left in argv[0]
+	char *slash = strchr( argv[0], '/' );
+	if ( slash != NULL ) {
+		*slash++ = '\0';
+	}
+	if ( !parse_address( argv[0], &host ) ) goto unlock_end;
+	long int bits;
+	if ( slash != NULL ) {
+		char *last;
+		bits = strtol( slash, &last, 10 );
+		if ( last == slash ) {
+			slash = NULL; // Nothing numeric after the slash, use full address length below
+		} else if ( bits < 0 ) {
+			// A negative length would end up as negative byte count for memcmp in getacl
+			logadd( LOG_WARNING, "Ignoring ACL rule for %s: negative prefix length %ld", argv[0], bits );
+			goto unlock_end;
+		}
+		if ( host.type == HOST_IP4 && bits > 32 ) bits = 32;
+		if ( bits > 128 ) bits = 128;
+	}
+	if ( slash == NULL ) {
+		if ( host.type == HOST_IP4 ) {
+			bits = 32;
+		} else {
+			bits = 128;
+		}
+	}
+	memcpy( aclRules[aclCount].host, host.addr, 16 );
+	aclRules[aclCount].bytes = (int)( bits / 8 );
+	aclRules[aclCount].bitMask = 0;
+	aclRules[aclCount].permissions = mask;
+	bits %= 8;
+	if ( bits != 0 ) {
+		// Build a mask with the topmost 'bits' bits set
+		for (long int i = 0; i < bits; ++i) {
+			aclRules[aclCount].bitMask = ( aclRules[aclCount].bitMask >> 1 ) | 0x80;
+		}
+		aclRules[aclCount].host[aclRules[aclCount].bytes] &= (uint8_t)aclRules[aclCount].bitMask;
+	}
+	// We now have .bytes set to the number of bytes to memcmp.
+	// In case we have an odd bitmask, .bitMask will be != 0, so when comparing,
+	// we need to AND host[.bytes] of the address to compare with the value
+	// in .bitMask, and compare it; otherwise, a simple memcmp will do.
+	aclCount++;
+unlock_end:;
+	spin_unlock( &aclLock );
+}
+
+/**
+ * (Re)load the ACL rules from "<_configDir>/rpc.acl".
+ * The rule count is reset to zero, then every line of the file is passed to addacl().
+ * If another load is already in progress, this call does nothing.
+ * Locks on: aclLock
+ */
+static void loadAcl()
+{
+	static bool inProgress = false;
+	char *fn;
+	if ( asprintf( &fn, "%s/%s", _configDir, "rpc.acl" ) == -1 ) return;
+	spin_lock( &aclLock );
+	if ( inProgress ) {
+		spin_unlock( &aclLock );
+		free( fn ); // Don't leak the file name when bailing out early
+		return;
+	}
+	aclCount = 0;
+	inProgress = true;
+	spin_unlock( &aclLock );
+	// Lock is not held while reading the file; addacl takes it for every rule it adds
+	file_loadLineBased( fn, 1, 20, &addacl, NULL );
+	spin_lock( &aclLock );
+	inProgress = false;
+	spin_unlock( &aclLock );
+	free( fn );
+	logadd( LOG_INFO, "%d HTTPRPC ACL rules loaded", (int)aclCount );
+}
+
diff --git a/src/server/rpc.h b/src/server/rpc.h
new file mode 100644
index 0000000..285242c
--- /dev/null
+++ b/src/server/rpc.h
@@ -0,0 +1,10 @@
+#ifndef _RPC_H_
+#define _RPC_H_
+
+struct dnbd3_host_t;
+
+/* Takes no arguments - declared with (void) so calls passing arguments are rejected. */
+void rpc_init(void);
+/* sock: socket to answer on; host: peer address; data/dataLen: presumably the part
+ * of the request that has already been read from the socket - TODO confirm */
+void rpc_sendStatsJson(int sock, struct dnbd3_host_t* host, const void *data, const int dataLen);
+/* Used by the server's --errormsg mode for every accepted connection. */
+void rpc_sendErrorMessage(int sock, const char* message);
+
+#endif
diff --git a/src/server/serialize.c b/src/server/serialize.c
new file mode 100644
index 0000000..4934132
--- /dev/null
+++ b/src/server/serialize.c
@@ -0,0 +1,5 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "../serialize.c"
diff --git a/src/server/server.c b/src/server/server.c
new file mode 100644
index 0000000..10ab208
--- /dev/null
+++ b/src/server/server.c
@@ -0,0 +1,495 @@
+ /*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "server.h"
+#include "helper.h"
+
+#include "locks.h"
+#include "image.h"
+#include "uplink.h"
+#include "net.h"
+#include "altservers.h"
+#include "integrity.h"
+#include "threadpool.h"
+#include "rpc.h"
+
+#include "../version.h"
+#include "../shared/sockhelper.h"
+#include "../shared/timing.h"
+
+#include <signal.h>
+#include <getopt.h>
+#include <assert.h>
+
+// Values returned by getopt_long for options that only have a long form
+#define LONGOPT_CRC4 1000
+#define LONGOPT_ASSERT 1001
+#define LONGOPT_CREATE 1002
+#define LONGOPT_REVISION 1003
+#define LONGOPT_SIZE 1004
+#define LONGOPT_ERRORMSG 1005
+
+/**
+ * Sockets we are listening on for incoming connections
+ */
+static poll_list_t *listeners = NULL;
+
+/**
+ * Time the server was started
+ */
+static ticks startupTime;
+
+/**
+ * Set by the signal handler, polled and reset by the main loop.
+ * volatile sig_atomic_t is the only object type that may portably be
+ * written from within a signal handler.
+ */
+static volatile sig_atomic_t sigReload = false, sigLogCycle = false;
+
+/**
+ * Copied to in signal handler so we can print info
+ * later on
+ */
+static siginfo_t lastSignal;
+
+void printSignal();
+
+static poll_list_t* setupNetwork(char *bindAddress);
+
+static dnbd3_client_t* dnbd3_prepareClient(struct sockaddr_storage *client, int fd);
+
+static void dnbd3_handleSignal(int signum);
+
+static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data);
+
+static void* server_asyncImageListLoad(void *data);
+
+/**
+ * Write version and usage instructions to stdout, then terminate
+ * the process with exit code 0.
+ * @param argv_0 program name to show in the usage line
+ */
+void dnbd3_printHelp(char *argv_0)
+{
+	printf( "Version: %s\n\n"
+			"Usage: %s [OPTIONS]...\n",
+			VERSION_STRING, argv_0 );
+	fputs( "Start the DNBD3 server\n"
+			"-c or --config Configuration directory (default /etc/dnbd3-server/)\n"
+			"-n or --nodaemon Start server in foreground\n"
+			"-b or --bind Local Address to bind to\n"
+			"-h or --help Show this help text and quit\n"
+			"-v or --version Show version and quit\n"
+			"\nManagement functions:\n"
+			"--crc [image-file] Generate crc block list for given image\n"
+			"--create [image-name] --revision [rid] --size [filesize]\n"
+			"\tCreate a local empty image file with a zeroed cache-map for the specified image\n"
+			"--errormsg [text] Just serve given error message via HTTP, no service otherwise\n"
+			"\n", stdout );
+	exit( 0 );
+}
+
+/**
+ * Write the version string to stdout, then terminate
+ * the process with exit code 0.
+ */
+void dnbd3_printVersion()
+{
+	fprintf( stdout, "Version: %s\n", VERSION_STRING );
+	exit( 0 );
+}
+
+/**
+ * Orderly shutdown: flag shutdown, stop listening, stop all subsystems,
+ * wait for clients, free images, then exit the process with EXIT_SUCCESS.
+ * This function does not return.
+ */
+void dnbd3_cleanup()
+{
+	_shutdown = true;
+	logadd( LOG_INFO, "Cleanup..." );
+
+	// Stop accepting new connections
+	if ( listeners != NULL ) {
+		sock_destroyPollList( listeners );
+		listeners = NULL;
+	}
+
+	net_disconnectAll();           // Kill connection to all clients
+	threadpool_close();            // Disable threadpool
+	altservers_shutdown();         // Terminate the altserver checking thread
+	image_killUplinks();           // Terminate all uplinks
+	integrity_shutdown();          // Terminate integrity checker
+	net_waitForAllDisconnected();  // Wait for clients to disconnect
+	debug_locks_stop_watchdog();   // Watchdog not needed anymore
+
+	// Clean up images; retry for a while if some cannot be freed yet
+	for ( int attemptsLeft = 5; !image_tryFreeAll() && --attemptsLeft > 0; ) {
+		logadd( LOG_INFO, "Waiting for images to free...\n" );
+		sleep( 1 );
+	}
+
+	free( _basePath );
+	free( _configDir );
+	exit( EXIT_SUCCESS );
+}
+
+/**
+ * Program entry point.
+ *
+ * 1. Parse the command line. --crc and --assert are handled right away.
+ * 2. Load the configuration from the config directory.
+ * 3. Handle the remaining one-shot (--create), or the --errormsg mode, in which
+ *    nothing but the given error message is served.
+ * 4. Otherwise initialize all subsystems, install signal handlers, load the images,
+ *    start listening and hand every accepted connection to the thread pool until
+ *    shutdown is requested.
+ */
+int main(int argc, char *argv[])
+{
+	int demonize = 1;
+	int opt = 0;
+	int longIndex = 0;
+	char *paramCreate = NULL;
+	char *bindAddress = NULL;
+	char *errorMsg = NULL;
+	int64_t paramSize = -1;
+	int paramRevision = -1;
+	static const char *optString = "b:c:d:hnv?";
+	static const struct option longOpts[] = {
+	        { "config", required_argument, NULL, 'c' },
+	        { "nodaemon", no_argument, NULL, 'n' },
+	        { "reload", no_argument, NULL, 'r' },
+	        { "help", no_argument, NULL, 'h' },
+	        { "version", no_argument, NULL, 'v' },
+	        { "bind", required_argument, NULL, 'b' },
+	        { "crc", required_argument, NULL, LONGOPT_CRC4 },
+	        { "assert", no_argument, NULL, LONGOPT_ASSERT },
+	        { "create", required_argument, NULL, LONGOPT_CREATE },
+	        { "revision", required_argument, NULL, LONGOPT_REVISION },
+	        { "size", required_argument, NULL, LONGOPT_SIZE },
+	        { "errormsg", required_argument, NULL, LONGOPT_ERRORMSG },
+	        { 0, 0, 0, 0 }
+	};
+
+	opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
+
+	while ( opt != -1 ) {
+		switch ( opt ) {
+		case 'c':
+			_configDir = strdup( optarg );
+			break;
+		case 'n':
+			demonize = 0;
+			break;
+		case 'h':
+		case '?':
+			dnbd3_printHelp( argv[0] );
+			break;
+		case 'v':
+			dnbd3_printVersion();
+			break;
+		case 'b':
+			bindAddress = strdup( optarg );
+			break;
+		case LONGOPT_CRC4:
+			return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE;
+		case LONGOPT_ASSERT:
+			printf( "Testing a failing assertion:\n" );
+			assert( 4 == 5 );
+			printf( "Assertion 4 == 5 seems to hold. ;-)\n" );
+			return EXIT_SUCCESS;
+		case LONGOPT_CREATE:
+			paramCreate = strdup( optarg );
+			break;
+		case LONGOPT_REVISION:
+			paramRevision = atoi( optarg );
+			break;
+		case LONGOPT_SIZE:
+			paramSize = strtoll( optarg, NULL, 10 );
+			break;
+		case LONGOPT_ERRORMSG:
+			errorMsg = strdup( optarg );
+			break;
+		}
+		opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
+	}
+
+	// Load general config
+
+	if ( _configDir == NULL ) _configDir = strdup( "/etc/dnbd3-server" );
+	globals_loadConfig();
+	// The errormsg server doesn't touch any images, so it can do without a base path
+	if ( _basePath == NULL && errorMsg == NULL ) {
+		logadd( LOG_ERROR, "Aborting, set proper basePath in %s/%s", _configDir, CONFIG_FILENAME );
+		exit( EXIT_FAILURE );
+	}
+
+	timing_setBase();
+	timing_get( &startupTime );
+
+#ifdef AFL_MODE
+	// ###### AFL
+	//
+	image_serverStartup();
+	net_init();
+	uplink_globalsInit();
+	rpc_init();
+	if ( !image_loadAll( NULL ) || _shutdown ) {
+		fprintf( stderr, "Error loading images\n" );
+		exit( 3 );
+	}
+	{
+		struct sockaddr_storage client;
+		memset( &client, 0, sizeof client );
+		client.ss_family = AF_INET;
+		dnbd3_client_t *dnbd3_client = dnbd3_prepareClient( &client, 1 );
+		if ( dnbd3_client == NULL ) {
+			fprintf( stderr, "New client failed\n" );
+			exit( 1 );
+		}
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+		__AFL_INIT();
+#endif
+		net_handleNewConnection( dnbd3_client );
+		exit( 0 );
+	}
+	//
+	// ###### AFL END
+#endif
+
+
+	// One-shots first:
+
+	if ( paramCreate != NULL ) {
+		return image_create( paramCreate, paramRevision, paramSize ) ? 0 : EXIT_FAILURE;
+	}
+
+	// No one-shot detected, normal server operation or errormsg serving
+	if ( demonize ) {
+		logadd( LOG_INFO, "Forking into background, see log file for further information" );
+		// Don't silently keep running attached to the terminal if detaching failed
+		if ( daemon( 1, 0 ) == -1 ) {
+			logadd( LOG_ERROR, "Could not fork into background (errno=%d)", (int)errno );
+			exit( EXIT_FAILURE );
+		}
+	}
+	if ( errorMsg != NULL ) {
+		setupNetwork( bindAddress );
+		logadd( LOG_INFO, "Running errormsg server" );
+		while ( true ) {
+			const int fd = sock_accept( listeners, NULL, NULL );
+			if ( fd >= 0 ) {
+				rpc_sendErrorMessage( fd, errorMsg );
+			} else {
+				const int err = errno;
+				if ( err == EINTR || err == EAGAIN ) continue;
+				logadd( LOG_ERROR, "Client accept failure (err=%d)", err );
+				usleep( 10000 ); // 10ms
+			}
+		}
+		exit( 0 );
+	}
+	image_serverStartup();
+	altservers_init();
+	integrity_init();
+	net_init();
+	uplink_globalsInit();
+	rpc_init();
+	logadd( LOG_INFO, "DNBD3 server starting.... Machine type: " ENDIAN_MODE );
+
+	if ( altservers_load() < 0 ) {
+		logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
+	}
+
+#ifdef _DEBUG
+	debug_locks_start_watchdog();
+#endif
+
+	// setup signal handler
+	struct sigaction sa;
+	memset( &sa, 0, sizeof(sa) );
+	sa.sa_sigaction = dnbd3_handleSignal2;
+	sa.sa_flags = SA_SIGINFO;
+	//sa.sa_mask = ;
+	sigaction( SIGTERM, &sa, NULL );
+	sigaction( SIGINT, &sa, NULL );
+	sigaction( SIGUSR1, &sa, NULL );
+	sigaction( SIGHUP, &sa, NULL );
+	sigaction( SIGUSR2, &sa, NULL );
+	// Writing to a socket closed by the peer must not kill the process
+	signal( SIGPIPE, SIG_IGN );
+
+	logadd( LOG_INFO, "Loading images...." );
+	// Load all images in base path
+	if ( !image_loadAll( NULL ) || _shutdown ) {
+		if ( _shutdown ) {
+			logadd( LOG_ERROR, "Received shutdown request while loading images." );
+		} else {
+			logadd( LOG_ERROR, "Could not load images." );
+		}
+		free( bindAddress );
+		// NOTE: dnbd3_cleanup() ends with exit(), so the return below is not reached
+		dnbd3_cleanup();
+		return _shutdown ? 0 : 1;
+	}
+
+	// Give other threads some time to start up before accepting connections
+	usleep( 100000 );
+
+	// setup network
+	listeners = setupNetwork( bindAddress );
+
+	// Initialize thread pool
+	if ( !threadpool_init( 8 ) ) {
+		logadd( LOG_ERROR, "Could not init thread pool!\n" );
+		exit( EXIT_FAILURE );
+	}
+
+	logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING );
+
+	// +++++++++++++++++++++++++++++++++++++++++++++++++++ main loop
+	struct sockaddr_storage client;
+	socklen_t len;
+	int fd;
+	while ( !_shutdown ) {
+		// Handle signals
+		printSignal();
+		if ( sigReload ) {
+			sigReload = false;
+			logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" );
+			threadpool_run( &server_asyncImageListLoad, NULL );
+		}
+		if ( sigLogCycle ) {
+			sigLogCycle = false;
+			logadd( LOG_INFO, "SIGUSR2 received, reopening log file..." );
+			if ( log_openLogFile( NULL ) )
+				logadd( LOG_INFO, "Log file has been reopened." );
+			else
+				logadd( LOG_WARNING, "Could not cycle log file." );
+		}
+		//
+		len = sizeof(client);
+		fd = sock_accept( listeners, &client, &len );
+		if ( fd < 0 ) {
+			const int err = errno;
+			// Interrupted (e.g. by one of our signals): go back up and process the flags
+			if ( err == EINTR || err == EAGAIN ) continue;
+			logadd( LOG_ERROR, "Client accept failure (err=%d)", err );
+			usleep( 10000 ); // 10ms
+			continue;
+		}
+
+		dnbd3_client_t *dnbd3_client = dnbd3_prepareClient( &client, fd );
+		if ( dnbd3_client == NULL ) {
+			close( fd );
+			continue;
+		}
+
+		if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) {
+			logadd( LOG_ERROR, "Could not start thread for new connection." );
+			// Nobody is going to handle this connection, so don't leak its descriptor
+			close( fd );
+			free( dnbd3_client );
+			continue;
+		}
+	}
+	printSignal();
+	free( bindAddress );
+	dnbd3_cleanup();
+	return 0;
+}
+
+/**
+ * If a signal has been recorded in lastSignal, log its number, code and
+ * the sender's pid and uid. If a sender pid is known, also log what
+ * /proc/<pid>/exe points to. The record is cleared afterwards.
+ */
+void printSignal()
+{
+	if ( lastSignal.si_signo == 0 ) return; // Nothing recorded
+	logadd( LOG_INFO, "Signal %d (via %d) by pid %u, uid %u",
+			lastSignal.si_signo, lastSignal.si_code,
+			(unsigned int)lastSignal.si_pid, (unsigned int)lastSignal.si_uid );
+	if ( lastSignal.si_pid != 0 ) {
+		char procPath[100];
+		char exeName[500];
+		snprintf( procPath, sizeof(procPath), "/proc/%u/exe", (unsigned int)lastSignal.si_pid );
+		// readlink doesn't terminate the string, so print with explicit length
+		const ssize_t nameLen = readlink( procPath, exeName, sizeof(exeName) );
+		if ( nameLen > 0 ) {
+			logadd( LOG_INFO, "%u is %.*s", (unsigned int)lastSignal.si_pid, (int)nameLen, exeName );
+		}
+	}
+	lastSignal.si_signo = 0;
+}
+
+/**
+ * Create the global list of listening sockets and start listening on
+ * bindAddress, port _listenPort. Exits the process with EXIT_FAILURE
+ * if either step fails.
+ * @return the newly created list, which is also stored in listeners
+ */
+static poll_list_t* setupNetwork(char *bindAddress)
+{
+	if ( ( listeners = sock_newPollList() ) == NULL ) {
+		logadd( LOG_ERROR, "Didnt get a poll list!" );
+		exit( EXIT_FAILURE );
+	}
+	const bool listening = sock_listen( listeners, bindAddress, (uint16_t)_listenPort );
+	if ( !listening ) {
+		logadd( LOG_ERROR, "Could not listen on any local interface." );
+		exit( EXIT_FAILURE );
+	}
+	return listeners;
+}
+
+/**
+ * Allocate a zeroed client struct for a freshly accepted connection and fill in
+ * the peer's address, port and the socket. As this might turn out to be an HTTP
+ * request, locks are not initialized here; that would happen later once we know.
+ * @param client address of the peer, must be AF_INET or AF_INET6
+ * @param fd socket of the connection
+ * @return new client struct, NULL on allocation failure or unknown address family
+ */
+static dnbd3_client_t* dnbd3_prepareClient(struct sockaddr_storage *client, int fd)
+{
+	dnbd3_client_t *newClient = calloc( 1, sizeof(*newClient) );
+	if ( newClient == NULL ) { // This will never happen thanks to memory overcommit
+		logadd( LOG_ERROR, "Could not alloc dnbd3_client_t for new client." );
+		return NULL;
+	}
+
+	switch ( client->ss_family ) {
+	case AF_INET: {
+		struct sockaddr_in *addr4 = (struct sockaddr_in *)client;
+		newClient->host.type = HOST_IP4;
+		memcpy( newClient->host.addr, &(addr4->sin_addr), 4 );
+		newClient->host.port = addr4->sin_port;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)client;
+		newClient->host.type = HOST_IP6;
+		memcpy( newClient->host.addr, &(addr6->sin6_addr), 16 );
+		newClient->host.port = addr6->sin6_port;
+		break;
+	}
+	default:
+		logadd( LOG_ERROR, "New client has unknown address family %d, disconnecting...", (int)client->ss_family );
+		free( newClient );
+		return NULL;
+	}
+	newClient->sock = fd;
+	return newClient;
+}
+
+/**
+ * Translate a received signal into the matching flag:
+ * SIGINT/SIGTERM request shutdown, SIGUSR1/SIGHUP request a reload,
+ * SIGUSR2 requests reopening the log file.
+ * Once shutdown has been requested, further signals are ignored.
+ */
+static void dnbd3_handleSignal(int signum)
+{
+	if ( _shutdown ) return;
+	switch ( signum ) {
+	case SIGINT:
+	case SIGTERM:
+		_shutdown = true;
+		break;
+	case SIGUSR1:
+	case SIGHUP:
+		sigReload = true;
+		break;
+	case SIGUSR2:
+		sigLogCycle = true;
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * SA_SIGINFO style signal handler: keep a copy of the signal's details
+ * in lastSignal, then let dnbd3_handleSignal set the flags.
+ */
+static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
+{
+	lastSignal = *info;
+	dnbd3_handleSignal( signum );
+}
+
+/**
+ * @return result of timing_diff between startupTime (taken in main) and now
+ */
+uint32_t dnbd3_serverUptime()
+{
+	ticks current;
+	timing_get( &current );
+	const uint32_t elapsed = timing_diff( &startupTime, &current );
+	return elapsed;
+}
+
+/**
+ * Thread pool job started by the main loop when a reload was requested:
+ * reloads the configuration, then loads the image list again.
+ * @return always NULL
+ */
+static void* server_asyncImageListLoad(void *data UNUSED)
+{
+	setThreadName( "img-list-loader" );
+	globals_loadConfig();
+	(void)image_loadAll( NULL ); // Result is of no interest here
+	return NULL;
+}
+
diff --git a/src/server/server.h b/src/server/server.h
new file mode 100644
index 0000000..bab8421
--- /dev/null
+++ b/src/server/server.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef SERVER_H_
+#define SERVER_H_
+
+#include "globals.h"
+#include "../types.h"
+
+/* Shut everything down and exit the process. Does not return. */
+void dnbd3_cleanup(void);
+/* Time elapsed since the server was started. */
+uint32_t dnbd3_serverUptime(void);
+
+/* Refuse to build if file offsets are not 64 bits wide. */
+#if !defined(_FILE_OFFSET_BITS) || _FILE_OFFSET_BITS != 64
+#error Please set _FILE_OFFSET_BITS to 64 in your makefile/configuration
+#endif
+
+#endif /* SERVER_H_ */
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
new file mode 100644
index 0000000..b55fe19
--- /dev/null
+++ b/src/server/threadpool.c
@@ -0,0 +1,126 @@
+#include "threadpool.h"
+#include "globals.h"
+#include "helper.h"
+#include "locks.h"
+
+/**
+ * One worker thread of the pool along with the job assigned to it.
+ */
+typedef struct _entry_t {
+	struct _entry_t *next;          // Next idle worker in the pool list
+	pthread_t thread;               // Thread handle as filled in by thread_create
+	dnbd3_signal_t* signal;         // Used to wake up the worker
+	void *(*startRoutine)(void *);  // Job to run, NULL if none assigned
+	void * arg;                     // Argument passed to startRoutine
+} entry_t;
+
+static void *threadpool_worker(void *entryPtr);
+
+// Attributes for new worker threads (created detached)
+static pthread_attr_t threadAttrs;
+
+// Protects pool and maxIdleThreads
+static pthread_spinlock_t poolLock;
+// Singly linked list of idle workers
+static entry_t *pool = NULL;
+// Negative as long as the pool is not initialized, or after it has been closed
+static int maxIdleThreads = -1;
+
+/**
+ * Set up the pool. Fails if maxIdle is negative or if the pool
+ * has been initialized already.
+ */
+bool threadpool_init(int maxIdle)
+{
+	const bool alreadyInitialized = ( maxIdleThreads >= 0 );
+	if ( alreadyInitialized || maxIdle < 0 ) return false;
+	// Workers are never joined, so create them detached
+	pthread_attr_init( &threadAttrs );
+	pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
+	spin_init( &poolLock, PTHREAD_PROCESS_PRIVATE );
+	maxIdleThreads = maxIdle;
+	return true;
+}
+
+/**
+ * Flag global shutdown and, if the pool was initialized, wake up all idle
+ * workers so they terminate. Workers free their own entry when they exit,
+ * so the idle list is emptied here instead of leaving it pointing at
+ * memory that is about to be freed.
+ */
+void threadpool_close()
+{
+	_shutdown = true;
+	if ( maxIdleThreads < 0 ) return;
+	spin_lock( &poolLock );
+	maxIdleThreads = -1;
+	entry_t *ptr = pool;
+	pool = NULL; // Entries are gone once their workers have been woken
+	while ( ptr != NULL ) {
+		entry_t *current = ptr;
+		// Fetch next before waking the worker, as it will free 'current'
+		ptr = ptr->next;
+		signal_call( current->signal );
+	}
+	spin_unlock( &poolLock );
+	spin_destroy( &poolLock );
+}
+
+/**
+ * Run startRoutine( arg ) in a pool thread. An idle worker is taken from
+ * the pool if there is one, otherwise a new worker thread is created.
+ * @return false if no worker could be created, true otherwise
+ */
+bool threadpool_run(void *(*startRoutine)(void *), void *arg)
+{
+	entry_t *worker;
+	// Try to grab an idle worker
+	spin_lock( &poolLock );
+	worker = pool;
+	if ( worker != NULL ) {
+		pool = worker->next;
+	}
+	spin_unlock( &poolLock );
+	if ( worker == NULL ) {
+		// None available, spawn a new one
+		worker = (entry_t*)malloc( sizeof(*worker) );
+		if ( worker == NULL ) {
+			logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" );
+			return false;
+		}
+		worker->signal = signal_newBlocking();
+		if ( worker->signal == NULL ) {
+			logadd( LOG_WARNING, "Could not create signal for new thread pool thread\n" );
+			free( worker );
+			return false;
+		}
+		const int err = thread_create( &(worker->thread), &threadAttrs, threadpool_worker, (void*)worker );
+		if ( err != 0 ) {
+			logadd( LOG_WARNING, "Could not create new thread for thread pool\n" );
+			signal_close( worker->signal );
+			free( worker );
+			return false;
+		}
+	}
+	// Assign the job first, then wake up the worker
+	worker->next = NULL;
+	worker->startRoutine = startRoutine;
+	worker->arg = arg;
+	signal_call( worker->signal );
+	return true;
+}
+
+/**
+ * Body of every thread in the pool. Waits on the entry's signal, runs the
+ * assigned job, then either puts itself back into the list of idle workers
+ * or terminates if that list already holds maxIdleThreads entries.
+ * On termination the worker closes its signal and frees its entry.
+ */
+static void *threadpool_worker(void *entryPtr)
+{
+	blockNoncriticalSignals();
+	entry_t * const self = (entry_t*)entryPtr;
+	while ( true ) {
+		// Wait for signal from outside that we have work to do
+		const int ret = signal_clear( self->signal );
+		if ( _shutdown ) break;
+		if ( ret <= 0 ) {
+			logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
+			continue;
+		}
+		if ( self->startRoutine == NULL ) {
+			logadd( LOG_DEBUG1, "Worker woke up but has no work to do!" );
+			continue;
+		}
+		// Start assigned work
+		(*self->startRoutine)( self->arg );
+		// Reset vars for safety
+		self->startRoutine = NULL;
+		self->arg = NULL;
+		if ( _shutdown ) break;
+		// Count idle workers; join them if there's room, die otherwise
+		spin_lock( &poolLock );
+		int idleCount = 0;
+		for ( entry_t *it = pool; it != NULL; it = it->next ) {
+			idleCount++;
+		}
+		const bool poolIsFull = ( idleCount >= maxIdleThreads );
+		if ( !poolIsFull ) {
+			self->next = pool;
+			pool = self;
+		}
+		spin_unlock( &poolLock );
+		if ( poolIsFull ) break;
+		setThreadName( "[pool]" );
+	}
+	signal_close( self->signal );
+	free( self );
+	return NULL;
+}
+
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
new file mode 100644
index 0000000..15dd151
--- /dev/null
+++ b/src/server/threadpool.h
@@ -0,0 +1,29 @@
+#ifndef _THREADPOOL_H_
+#define _THREADPOOL_H_
+
+#include "../types.h"
+
+/**
+ * Initialize the thread pool. This must be called before using
+ * threadpool_run, and must only be called once.
+ * @param maxIdleThreadCount maximum number of idle threads in the pool
+ * @return true if initialized successfully
+ */
+bool threadpool_init(int maxIdleThreadCount);
+
+/**
+ * Shut down threadpool.
+ * Only call if it has been initialized before.
+ */
+void threadpool_close(void);
+
+/**
+ * Run a thread using the thread pool.
+ * @param startRoutine function to run in new thread
+ * @param arg argument to pass to thread
+ * @return true if thread was started
+ */
+bool threadpool_run(void *(*startRoutine)(void *), void *arg);
+
+#endif
+
diff --git a/src/server/uplink.c b/src/server/uplink.c
new file mode 100644
index 0000000..31b220d
--- /dev/null
+++ b/src/server/uplink.c
@@ -0,0 +1,1034 @@
+#include "uplink.h"
+#include "helper.h"
+#include "locks.h"
+#include "image.h"
+#include "altservers.h"
+#include "../shared/sockhelper.h"
+#include "../shared/protocol.h"
+#include "../shared/timing.h"
+#include "../shared/crc32.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <unistd.h>
+#include <stdatomic.h>
+
+// Image bytes covered by one byte of the cache map (8 blocks of DNBD3_BLOCK_SIZE;
+// presumably one bit per block - confirm against the cache map code)
+#define FILE_BYTES_PER_MAP_BYTE ( DNBD3_BLOCK_SIZE * 8 )
+// Cache map bytes covering one hash block
+#define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE )
+// AND a cache map index with this to round it down to a multiple of MAP_BYTES_PER_HASH_BLOCK
+#define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) )
+
+// Value of replicationHandle for "none" (set by uplink_init)
+#define REP_NONE ( (uint64_t)0xffffffffffffffff )
+
+// Counter exposed through uplink_getTotalBytesReceived
+static atomic_uint_fast64_t totalBytesReceived = 0;
+
+// Helpers defined further down in this file
+static void* uplink_mainloop(void *data);
+static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly);
+static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int lastBlockIndex);
+static void uplink_handleReceive(dnbd3_connection_t *link);
+static int uplink_sendKeepalive(const int fd);
+static void uplink_addCrc32(dnbd3_connection_t *uplink);
+static void uplink_sendReplicationRequest(dnbd3_connection_t *link);
+static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force);
+static bool uplink_saveCacheMap(dnbd3_connection_t *link);
+static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link);
+static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew);
+
+// ############ uplink connection handling
+
+/**
+ * Called once during server startup. Currently there is nothing to set up.
+ */
+void uplink_globalsInit()
+{
+	// Intentionally empty
+}
+
+/**
+ * @return current value of the totalBytesReceived counter
+ */
+uint64_t uplink_getTotalBytesReceived()
+{
+	const uint64_t snapshot = (uint64_t)atomic_load( &totalBytesReceived );
+	return snapshot;
+}
+
+/**
+ * Create and initialize an uplink instance for the given
+ * image. Uplinks run in their own thread.
+ * Locks on: _images[].lock
+ *
+ * @param image image to create the uplink for, must not be NULL
+ * @param sock if >= 0, an already established connection the uplink thread should
+ *        switch to; it is closed here if the image already has a working uplink
+ * @param host server that sock is connected to, only used if sock >= 0
+ * @param version value stored as betterVersion alongside sock, only used if sock >= 0
+ * @return true if an uplink was created or a working one already exists; false if
+ *         not running as proxy, shutting down, the image has no cache map,
+ *         or memory or thread could not be allocated
+ */
+bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version)
+{
+	if ( !_isProxy || _shutdown ) return false;
+	dnbd3_connection_t *link = NULL;
+	assert( image != NULL );
+	spin_lock( &image->lock );
+	if ( image->uplink != NULL && !image->uplink->shutdown ) {
+		spin_unlock( &image->lock );
+		if ( sock >= 0 ) close( sock );
+		return true; // There's already an uplink, so should we consider this success or failure?
+	}
+	if ( image->cache_map == NULL ) {
+		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
+		goto failure;
+	}
+	link = calloc( 1, sizeof(dnbd3_connection_t) );
+	if ( link == NULL ) {
+		// Bail out instead of dereferencing NULL below
+		logadd( LOG_ERROR, "Could not allocate memory for new uplink." );
+		goto failure;
+	}
+	image->uplink = link;
+	spin_init( &link->queueLock, PTHREAD_PROCESS_PRIVATE );
+	spin_init( &link->rttLock, PTHREAD_PROCESS_PRIVATE );
+	link->image = image;
+	link->bytesReceived = 0;
+	link->idleTime = 0;
+	link->queueLen = 0;
+	link->fd = -1;
+	link->cacheFd = -1;
+	link->signal = NULL;
+	link->replicationHandle = REP_NONE;
+	spin_lock( &link->rttLock );
+	link->cycleDetected = false;
+	if ( sock >= 0 ) {
+		// Hand over the existing connection as "better server" so the thread switches to it
+		link->betterFd = sock;
+		link->betterServer = *host;
+		link->rttTestResult = RTT_DOCHANGE;
+		link->betterVersion = version;
+	} else {
+		link->betterFd = -1;
+		link->rttTestResult = RTT_IDLE;
+	}
+	spin_unlock( &link->rttLock );
+	link->recvBufferLen = 0;
+	link->shutdown = false;
+	if ( 0 != thread_create( &(link->thread), NULL, &uplink_mainloop, (void *)link ) ) {
+		logadd( LOG_ERROR, "Could not start thread for new uplink." );
+		goto failure;
+	}
+	spin_unlock( &image->lock );
+	return true;
+failure: ;
+	if ( link != NULL ) {
+		// Only reached after both locks have been initialized
+		spin_destroy( &link->queueLock );
+		spin_destroy( &link->rttLock );
+		free( link );
+		link = image->uplink = NULL;
+	}
+	spin_unlock( &image->lock );
+	return false;
+}
+
+/**
+ * Request shutdown of the image's uplink and wait until it is gone.
+ * The caller that actually flips the shutdown flag joins the uplink thread;
+ * every caller then polls until the image either has no uplink anymore, or
+ * has one that is not shutting down.
+ * Locks on image.lock, uplink.queueLock
+ * Calling it multiple times, even concurrently, will
+ * not break anything.
+ */
+void uplink_shutdown(dnbd3_image_t *image)
+{
+	assert( image != NULL );
+	spin_lock( &image->lock );
+	dnbd3_connection_t * const uplink = image->uplink;
+	if ( uplink == NULL ) {
+		// Nothing to shut down
+		spin_unlock( &image->lock );
+		return;
+	}
+	pthread_t uplinkThread;
+	bool initiated = false;
+	spin_lock( &uplink->queueLock );
+	if ( !uplink->shutdown ) {
+		// We are the first to ask for shutdown, so we're in charge of joining
+		uplink->shutdown = true;
+		signal_call( uplink->signal );
+		uplinkThread = uplink->thread;
+		initiated = true;
+	}
+	spin_unlock( &uplink->queueLock );
+	bool stillActive = ( image->uplink != NULL );
+	spin_unlock( &image->lock );
+	if ( initiated ) {
+		thread_join( uplinkThread, NULL );
+	}
+	while ( stillActive ) {
+		usleep( 5000 );
+		spin_lock( &image->lock );
+		stillActive = ( image->uplink != NULL && image->uplink->shutdown );
+		spin_unlock( &image->lock );
+	}
+}
+
+/**
+ * Free all request queue slots that belong to the given client.
+ * Walks the queue from its end, so unused slots at the tail are
+ * cut off by decreasing queueLen along the way.
+ * Locks on: uplink.queueLock
+ */
+void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client)
+{
+	spin_lock( &uplink->queueLock );
+	int idx = uplink->queueLen;
+	while ( --idx >= 0 ) {
+		if ( uplink->queue[idx].client == client ) {
+			uplink->queue[idx].client = NULL;
+			uplink->queue[idx].status = ULR_FREE;
+		}
+		// Shrink the queue if its last slot has no client
+		const bool isTail = ( uplink->queueLen == idx + 1 );
+		if ( isTail && uplink->queue[idx].client == NULL ) {
+			uplink->queueLen--;
+		}
+	}
+	spin_unlock( &uplink->queueLock );
+}
+
+/**
+ * Request a chunk of data through an uplink server.
+ * The request is stored in a slot of the uplink's request queue. If the queue
+ * already holds a request covering the same range, the new request may be attached
+ * to it (status ULR_PENDING) instead of being sent upstream again (status ULR_NEW).
+ * The uplink thread is only woken up if the request needs to be relayed.
+ *
+ * @param client client asking for the data; client and client->image must not be NULL
+ * @param handle request handle, stored in the queue slot
+ * @param start first byte of the requested range
+ * @param length number of bytes requested, must not exceed _maxPayload
+ * @param hops hop count of the incoming request, used for detecting proxy cycles
+ * @return true if the request was queued. false if client or image are NULL, length is
+ *         too large, the image has no uplink or it is shutting down, a proxy cycle
+ *         is suspected, or the queue is full
+ *
+ * image.lock is held until uplink.queueLock has been acquired, then released.
+ * Locks on: image.lock, uplink.queueLock, uplink.rttLock
+ */
+bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+{
+ if ( client == NULL || client->image == NULL ) return false;
+ if ( length > (uint32_t)_maxPayload ) {
+ logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
+ return false;
+ }
+ spin_lock( &client->image->lock );
+ if ( client->image->uplink == NULL ) {
+ spin_unlock( &client->image->lock );
+ logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+ return false;
+ }
+ dnbd3_connection_t * const uplink = client->image->uplink;
+ if ( uplink->shutdown ) {
+ spin_unlock( &client->image->lock );
+ logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
+ return false;
+ }
+ // Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
+ // This might be a false positive if there are multiple instances running on the same host (IP)
+ if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) {
+ spin_unlock( &client->image->lock );
+ logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
+ spin_lock( &uplink->rttLock );
+ uplink->cycleDetected = true;
+ spin_unlock( &uplink->rttLock );
+ signal_call( uplink->signal ); // Wake the uplink thread after flagging the cycle
+ return false;
+ }
+
+ int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
+ int existingType = -1; // ULR_* type of existing request
+ int i;
+ int freeSlot = -1; // Index of the first ULR_FREE slot seen, -1 if none
+ bool requestLoop = false; // Set if an identical range is already queued with a lower hop count
+ const uint64_t end = start + length; // Exclusive end of the requested range
+
+ spin_lock( &uplink->queueLock ); // Taken before image.lock is released below
+ spin_unlock( &client->image->lock );
+ for (i = 0; i < uplink->queueLen; ++i) {
+ if ( freeSlot == -1 && uplink->queue[i].status == ULR_FREE ) {
+ freeSlot = i;
+ continue;
+ }
+ if ( uplink->queue[i].status != ULR_PENDING && uplink->queue[i].status != ULR_NEW ) continue;
+ if ( uplink->queue[i].from <= start && uplink->queue[i].to >= end ) {
+ if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end ) {
+ requestLoop = true;
+ break;
+ }
+ if ( foundExisting == -1 || existingType == ULR_PENDING ) { // A later match replaces an earlier one only if that was ULR_PENDING
+ foundExisting = i;
+ existingType = uplink->queue[i].status;
+ if ( freeSlot != -1 ) break; // Have both a free slot and a match, stop searching
+ }
+ }
+ }
+ if ( requestLoop ) {
+ spin_unlock( &uplink->queueLock );
+ logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
+ spin_lock( &uplink->rttLock );
+ uplink->cycleDetected = true;
+ spin_unlock( &uplink->rttLock );
+ signal_call( uplink->signal );
+ return false;
+ }
+ if ( freeSlot == -1 ) { // No free slot inside the used part of the queue, append at its end
+ if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
+ spin_unlock( &uplink->queueLock );
+ logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
+ return false;
+ }
+ freeSlot = uplink->queueLen++;
+ }
+ // Only attach to the matching request found above (i.e. do not send a request to the uplink server
+ // ourselves) if it has the status ULR_NEW, OR if the free slot we are going to use has a LOWER index
+ // than the request we attach to. Otherwise explicitly send this request to the uplink server.
+ // The index condition is to prevent a race condition where the reply for the outstanding request
+ // already arrived and the uplink thread is currently traversing the request queue. As it is processing
+ // the queue from highest to lowest index, it might already have passed the index of the free slot
+ // we determined, but not reached the existing request we just found above.
+ if ( foundExisting != -1 && existingType != ULR_NEW && freeSlot > foundExisting ) foundExisting = -1; // -1 means "send request"
+#ifdef _DEBUG
+ if ( foundExisting != -1 ) {
+ logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, existingType == ULR_NEW ? "ULR_NEW" : "ULR_PENDING", foundExisting, freeSlot );
+ logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
+ "New %" PRIu64 "-%" PRIu64 " (%p)\n",
+ uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
+ start, end, (void*)client );
+ }
+#endif
+ // Fill structure
+ uplink->queue[freeSlot].from = start;
+ uplink->queue[freeSlot].to = end;
+ uplink->queue[freeSlot].handle = handle;
+ uplink->queue[freeSlot].client = client;
+ //int old = uplink->queue[freeSlot].status;
+ uplink->queue[freeSlot].status = (foundExisting == -1 ? ULR_NEW : ULR_PENDING);
+ uplink->queue[freeSlot].hopCount = hops;
+#ifdef _DEBUG
+ timing_get( &uplink->queue[freeSlot].entered );
+ //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
+#endif
+ spin_unlock( &uplink->queueLock );
+
+ if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed
+ if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
+ logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
+ }
+ }
+ return true;
+}
+
+/**
+ * Uplink thread: main loop serving a single uplink connection.
+ * The data argument is the dnbd3_connection_t this thread works on.
+ * Each iteration poll()s on the wake-up signal and the uplink socket, then
+ * - switches to the server prepared by the RTT check if rttTestResult is RTT_DOCHANGE,
+ * - relays queued client requests when the signal fired,
+ * - handles replies (or a broken connection) on the uplink socket,
+ * - once per SERVER_UPLINK_KEEPALIVE_INTERVAL: saves the cache map, sends a
+ *   keep-alive and closes the socket if the link has been idle for too long,
+ * - triggers a new RTT measurement when due.
+ * When the loop is left, all resources of the connection are released, the
+ * struct itself is freed and, if the image still has a cache map, a new
+ * uplink is initialized for it. Always returns NULL.
+ * Locks are irrelevant as this is never called from another function
+ */
+static void* uplink_mainloop(void *data)
+{
+#define EV_SIGNAL (0) // index into events[]: fd of the wake-up signal
+#define EV_SOCKET (1) // index into events[]: socket to the uplink server
+#define EV_COUNT (2) // number of entries in events[]
+ struct pollfd events[EV_COUNT];
+ dnbd3_connection_t * const link = (dnbd3_connection_t*)data;
+ int numSocks, i, waitTime;
+ int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
+ uint32_t discoverFailCount = 0;
+ uint32_t unsavedSeconds = 0;
+ ticks nextAltCheck, lastKeepalive;
+ char buffer[200];
+ memset( events, 0, sizeof(events) );
+ timing_get( &nextAltCheck );
+ lastKeepalive = nextAltCheck;
+ //
+ assert( link != NULL );
+ setThreadName( "idle-uplink" );
+ blockNoncriticalSignals();
+ // Make sure file is open for writing
+ if ( !uplink_reopenCacheFd( link, false ) ) {
+ // It might have failed - still offer proxy mode, we just can't cache
+ logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", link->image->path, errno );
+ }
+ //
+ link->signal = signal_new();
+ if ( link->signal == NULL ) {
+ logadd( LOG_WARNING, "error creating signal. Uplink unavailable." );
+ goto cleanup;
+ }
+ events[EV_SIGNAL].events = POLLIN;
+ events[EV_SIGNAL].fd = signal_getWaitFd( link->signal );
+ events[EV_SOCKET].fd = -1;
+ while ( !_shutdown && !link->shutdown ) {
+ // poll()
+ spin_lock( &link->rttLock );
+ waitTime = link->rttTestResult == RTT_DOCHANGE ? 0 : -1; // 0: a server switch is pending, do not block in poll(); -1 is always replaced below
+ spin_unlock( &link->rttLock );
+ if ( waitTime == 0 ) {
+ // Nothing
+ } else if ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) {
+ waitTime = 1000;
+ } else {
+ declare_now;
+ waitTime = (int)timing_diffMs( &now, &nextAltCheck );
+ if ( waitTime < 100 ) waitTime = 100;
+ if ( waitTime > 5000 ) waitTime = 5000;
+ }
+ events[EV_SOCKET].fd = link->fd; // may be -1 while there is no connection
+ numSocks = poll( events, EV_COUNT, waitTime );
+ if ( _shutdown || link->shutdown ) goto cleanup;
+ if ( numSocks == -1 ) { // Error?
+ if ( errno == EINTR ) continue;
+ logadd( LOG_DEBUG1, "poll() error %d", (int)errno );
+ usleep( 10000 );
+ continue;
+ }
+ // Check if server switch is in order
+ spin_lock( &link->rttLock );
+ if ( link->rttTestResult != RTT_DOCHANGE ) {
+ spin_unlock( &link->rttLock );
+ } else {
+ link->rttTestResult = RTT_IDLE;
+ // The rttTest worker thread has finished our request.
+ // And says it's better to switch to another server
+ const int fd = link->fd; // previous socket; closed below once rttLock is released
+ link->fd = link->betterFd;
+ link->betterFd = -1;
+ link->currentServer = link->betterServer;
+ link->version = link->betterVersion;
+ link->cycleDetected = false;
+ spin_unlock( &link->rttLock );
+ discoverFailCount = 0;
+ if ( fd != -1 ) close( fd );
+ link->replicationHandle = REP_NONE;
+ link->image->working = true;
+ link->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
+ buffer[0] = '@';
+ if ( host_to_string( &link->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) {
+ logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", link->image->name, buffer + 1 );
+ setThreadName( buffer );
+ }
+ // If we don't have a crc32 list yet, see if the new server has one
+ if ( link->image->crc32 == NULL ) {
+ uplink_addCrc32( link );
+ }
+ // Re-send all pending requests
+ uplink_sendRequests( link, false );
+ uplink_sendReplicationRequest( link );
+ events[EV_SOCKET].events = POLLIN | POLLRDHUP;
+ timing_gets( &nextAltCheck, altCheckInterval );
+ // The rtt worker already did the handshake for our image, so there's nothing
+ // more to do here
+ }
+ // Check events
+ // Signal
+ if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+ logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
+ goto cleanup;
+ } else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
+ // signal triggered -> pending requests
+ if ( signal_clear( link->signal ) == SIGNAL_ERROR ) {
+ logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", link->image->name );
+ }
+ if ( link->fd != -1 ) {
+ // Uplink seems fine, relay requests to it...
+ uplink_sendRequests( link, true );
+ } else { // No uplink; maybe it was shutdown since it was idle for too long
+ link->idleTime = 0;
+ }
+ }
+ // Uplink socket
+ if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+ uplink_connectionFailed( link, true );
+ logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" );
+ setThreadName( "panic-uplink" );
+ } else if ( (events[EV_SOCKET].revents & POLLIN) ) {
+ uplink_handleReceive( link );
+ if ( _shutdown || link->shutdown ) goto cleanup;
+ }
+ declare_now;
+ uint32_t timepassed = timing_diff( &lastKeepalive, &now ); // presumably seconds, given the "every 4 minutes" check below -- TODO confirm
+ if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
+ lastKeepalive = now;
+ link->idleTime += timepassed;
+ unsavedSeconds += timepassed;
+ if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && link->idleTime >= 20 && link->idleTime <= 70 ) ) {
+ // fsync/save every 4 minutes, or every 60 seconds if link is idle
+ unsavedSeconds = 0;
+ uplink_saveCacheMap( link );
+ }
+ // Keep-alive
+ if ( link->fd != -1 && link->replicationHandle == REP_NONE ) {
+ // Send keep-alive if nothing is happening
+ if ( uplink_sendKeepalive( link->fd ) ) {
+ // Re-trigger periodically, in case it requires a minimum user count
+ uplink_sendReplicationRequest( link );
+ } else {
+ uplink_connectionFailed( link, true );
+ logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
+ setThreadName( "panic-uplink" );
+ }
+ }
+ // Don't keep link established if we're idle for too much
+ if ( link->fd != -1 && uplink_connectionShouldShutdown( link ) ) {
+ close( link->fd );
+ link->fd = events[EV_SOCKET].fd = -1;
+ link->cycleDetected = false;
+ if ( link->recvBufferLen != 0 ) {
+ link->recvBufferLen = 0;
+ free( link->recvBuffer );
+ link->recvBuffer = NULL;
+ }
+ logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", link->image->name, (int)link->image->rid );
+ setThreadName( "idle-uplink" );
+ }
+ }
+ // See if we should trigger an RTT measurement
+ spin_lock( &link->rttLock );
+ const int rttTestResult = link->rttTestResult;
+ spin_unlock( &link->rttLock );
+ if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
+ if ( timing_reached( &nextAltCheck, &now ) || ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) || link->cycleDetected ) {
+ // It seems it's time for a check
+ if ( image_isComplete( link->image ) ) {
+ // Quit work if image is complete
+ logadd( LOG_INFO, "Replication of %s complete.", link->image->name );
+ setThreadName( "finished-uplink" );
+ goto cleanup;
+ } else if ( !uplink_connectionShouldShutdown( link ) ) {
+ // Not complete - do measurement
+ altservers_findUplink( link ); // This will set RTT_INPROGRESS (synchronous)
+ if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) {
+ link->nextReplicationIndex = 0;
+ }
+ }
+ altCheckInterval = MIN(altCheckInterval + 1, SERVER_RTT_INTERVAL_MAX);
+ timing_set( &nextAltCheck, &now, altCheckInterval );
+ }
+ } else if ( rttTestResult == RTT_NOT_REACHABLE ) {
+ spin_lock( &link->rttLock );
+ link->rttTestResult = RTT_IDLE;
+ spin_unlock( &link->rttLock );
+ discoverFailCount++;
+ timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
+ }
+#ifdef _DEBUG
+ if ( link->fd != -1 && !link->shutdown ) {
+ bool resend = false;
+ ticks deadline;
+ timing_set( &deadline, &now, -10 );
+ spin_lock( &link->queueLock );
+ for (i = 0; i < link->queueLen; ++i) {
+ if ( link->queue[i].status != ULR_FREE && timing_reached( &link->queue[i].entered, &deadline ) ) {
+ snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
+ "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name,
+ link->queue[i].from, link->queue[i].to, link->queue[i].status );
+ link->queue[i].entered = now;
+#ifdef _DEBUG_RESEND_STARVING
+ link->queue[i].status = ULR_NEW;
+ resend = true;
+#endif
+ spin_unlock( &link->queueLock ); // the queue lock is released around the logadd() call
+ logadd( LOG_WARNING, "%s", buffer );
+ spin_lock( &link->queueLock );
+ }
+ }
+ spin_unlock( &link->queueLock );
+ if ( resend )
+ uplink_sendRequests( link, true );
+ }
+#endif
+ }
+ cleanup: ;
+ altservers_removeUplink( link );
+ uplink_saveCacheMap( link );
+ spin_lock( &link->image->lock );
+ if ( link->image->uplink == link ) {
+ link->image->uplink = NULL;
+ }
+ spin_lock( &link->queueLock );
+ const int fd = link->fd;
+ const dnbd3_signal_t* signal = link->signal;
+ link->fd = -1;
+ link->signal = NULL;
+ if ( !link->shutdown ) {
+ link->shutdown = true;
+ thread_detach( link->thread );
+ }
+ // Do not access link->image after unlocking, since we set
+ // image->uplink to NULL. Acquire with image_lock first,
+ // like done below when checking whether to re-init uplink
+ spin_unlock( &link->image->lock );
+ spin_unlock( &link->queueLock );
+ if ( fd != -1 ) close( fd );
+ if ( signal != NULL ) signal_close( signal );
+ // Wait for the RTT check to finish/fail if it's in progress
+ while ( link->rttTestResult == RTT_INPROGRESS )
+ usleep( 10000 );
+ if ( link->betterFd != -1 ) {
+ close( link->betterFd );
+ }
+ spin_destroy( &link->queueLock );
+ spin_destroy( &link->rttLock );
+ free( link->recvBuffer );
+ link->recvBuffer = NULL;
+ if ( link->cacheFd != -1 ) {
+ close( link->cacheFd );
+ }
+ dnbd3_image_t *image = image_lock( link->image );
+ free( link ); // !!!
+ if ( image != NULL ) {
+ if ( !_shutdown && image->cache_map != NULL ) {
+ // Integrity checker must have found something in the meantime
+ uplink_init( image, -1, NULL, 0 );
+ }
+ image_release( image );
+ }
+ return NULL ;
+}
+
+/**
+ * Walk the request queue of the given uplink and forward requests to the
+ * uplink server. Entries with status ULR_NEW are always sent; entries with
+ * status ULR_PENDING are sent again only if newOnly is false. Every entry
+ * that gets sent is marked ULR_PENDING before the queue lock is released
+ * for the actual network I/O.
+ * The requested range is widened to DNBD3_BLOCK_SIZE boundaries, and the
+ * aligned start offset doubles as the request handle.
+ * Stops at the first request that cannot be sent.
+ * Locks on: link->queueLock
+ */
+static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly)
+{
+	const uint64_t blockMask = ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+	int slot = 0;
+	spin_lock( &link->queueLock );
+	while ( slot < link->queueLen ) {
+		dnbd3_queued_request_t * const req = &link->queue[slot++];
+		const bool wanted = req->status == ULR_NEW || ( !newOnly && req->status == ULR_PENDING );
+		if ( !wanted ) continue;
+		req->status = ULR_PENDING;
+		// Copy everything we need out of the slot while we still hold the lock
+		uint8_t relayHops = req->hopCount;
+		const uint64_t alignedStart = req->from & blockMask;
+		const uint64_t alignedEnd = ( req->to + DNBD3_BLOCK_SIZE - 1 ) & blockMask;
+		const uint32_t alignedLen = (uint32_t)( alignedEnd - alignedStart );
+		spin_unlock( &link->queueLock );
+		// One more hop on the way upstream, capped at 200
+		if ( relayHops < 200 ) {
+			relayHops++;
+		}
+		if ( !dnbd3_get_block( link->fd, alignedStart, alignedLen, alignedStart, COND_HOPCOUNT( link->version, relayHops ) ) ) {
+			// Non-critical - if the connection dropped or the server was changed
+			// the thread will re-send this request as soon as the connection
+			// is reestablished.
+			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
+			altservers_serverFailed( &link->currentServer );
+			return;
+		}
+		spin_lock( &link->queueLock );
+	}
+	spin_unlock( &link->queueLock );
+}
+
+/**
+ * Send a block request to an uplink server without really having
+ * any client that needs that data. This will be used for background replication.
+ *
+ * We'll go through the cache map of the image and look for bytes that don't have
+ * all bits set. We then request the corresponding 8 blocks of 4kb from the uplink
+ * server. This means we might request data we already have, but it makes
+ * the code simpler. Worst case would be only one bit is zero, which means
+ * 4kb are missing, but we will request 32kb.
+ *
+ * Does nothing if there is no connection, background replication is disabled,
+ * there is no cache fd, replication is switched off for this link
+ * (nextReplicationIndex == -1), a replication request is already in flight
+ * (replicationHandle != REP_NONE), or the image has fewer than _bgrMinClients users.
+ * Locks on: image.lock
+ */
+static void uplink_sendReplicationRequest(dnbd3_connection_t *link)
+{
+	if ( link == NULL || link->fd == -1 ) return;
+	if ( _backgroundReplication == BGR_DISABLED || link->cacheFd == -1 ) return; // Don't do background replication
+	if ( link->nextReplicationIndex == -1 || link->replicationHandle != REP_NONE )
+		return;
+	dnbd3_image_t * const image = link->image;
+	// Check for NULL *before* the first dereference of image
+	if ( image == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
+	spin_lock( &image->lock );
+	if ( image->cache_map == NULL || image->users < _bgrMinClients ) {
+		// No cache map (=image complete), or replication pending, or not enough users, do nothing
+		spin_unlock( &image->lock );
+		return;
+	}
+	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	const int lastBlockIndex = mapBytes - 1;
+	int endByte;
+	if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
+		endByte = link->nextReplicationIndex + mapBytes;
+	} else { // Hashblock based: Only look for match in current hash block
+		endByte = ( link->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+		if ( endByte > mapBytes ) {
+			endByte = mapBytes;
+		}
+	}
+	int replicationIndex = -1;
+	for ( int j = link->nextReplicationIndex; j < endByte; ++j ) {
+		const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
+		if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !link->replicatedLastBlock ) ) {
+			// Found incomplete one
+			replicationIndex = i;
+			break;
+		}
+	}
+	spin_unlock( &image->lock );
+	if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
+		// Nothing left in current block, find next one
+		replicationIndex = uplink_findNextIncompleteHashBlock( link, endByte );
+	}
+	if ( replicationIndex == -1 ) {
+		// Replication might be complete, uplink_mainloop should take care....
+		link->nextReplicationIndex = -1;
+		return;
+	}
+	const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
+	link->replicationHandle = offset;
+	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
+	if ( !dnbd3_get_block( link->fd, offset, size, link->replicationHandle, COND_HOPCOUNT( link->version, 1 ) ) ) {
+		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
+		// Nothing is in flight, so forget the handle again. Otherwise no reply would
+		// ever clear it, and both replication and keep-alives (which are only sent
+		// while replicationHandle == REP_NONE) would stall on this connection.
+		link->replicationHandle = REP_NONE;
+		return;
+	}
+	if ( replicationIndex == lastBlockIndex ) {
+		link->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+	}
+	link->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+	if ( _backgroundReplication == BGR_HASHBLOCK
+			&& link->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+		// Just crossed a hash block boundary, look for new candidate starting at this very index
+		link->nextReplicationIndex = uplink_findNextIncompleteHashBlock( link, link->nextReplicationIndex );
+	}
+}
+
+/**
+ * find next index into cache_map that corresponds to the beginning
+ * of a hash block which is neither completely empty nor completely
+ * replicated yet. Returns -1 if no match.
+ *
+ * Scans at most one full pass over the cache map, wrapping around at its end,
+ * starting at startMapIndex masked with MAP_INDEX_HASH_START_MASK.
+ * A map byte counts as full if it is 0xff, or if it is the last byte of the
+ * map and link->replicatedLastBlock is set; it counts as empty if it is 0.
+ * The scan stops at the first byte showing that a hash block is mixed: a
+ * partially filled byte, a full byte after the block started empty, or an
+ * empty byte after the block started full. The result is the index that was
+ * remembered at a block start (block began with an empty byte) if there is
+ * one, otherwise the index of the byte that stopped the scan.
+ * Also returns -1 if the image has no cache map.
+ * Locks on: image.lock
+ */
+static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int startMapIndex)
+{
+ int retval = -1;
+ spin_lock( &link->image->lock );
+ const int mapBytes = IMGSIZE_TO_MAPBYTES( link->image->virtualFilesize );
+ const uint8_t *cache_map = link->image->cache_map;
+ if ( cache_map != NULL ) {
+ int j;
+ const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); // presumably rounds down to the first map byte of a hash block -- confirm against the mask's definition
+ for (j = 0; j < mapBytes; ++j) {
+ const int i = ( start + j ) % mapBytes; // wrap around at the end of the map
+ const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && link->replicatedLastBlock );
+ const bool isEmpty = cache_map[i] == 0;
+ if ( !isEmpty && !isFull ) {
+ // Neither full nor empty, replicate
+ if ( retval == -1 ) {
+ retval = i;
+ }
+ break;
+ }
+ if ( ( i & MAP_INDEX_HASH_START_MASK ) == i ) {
+ // Reset state if we just crossed into the next hash chunk
+ retval = ( isEmpty ) ? ( i ) : ( -1 );
+ } else if ( isFull ) {
+ if ( retval != -1 ) {
+ // It's a full one, previous one was empty -> replicate
+ break;
+ }
+ } else if ( isEmpty ) {
+ if ( retval == -1 ) { // Previous one was full -> replicate
+ retval = i;
+ break;
+ }
+ }
+ }
+ if ( j == mapBytes ) { // Nothing found, loop ran until end
+ retval = -1;
+ }
+ }
+ spin_unlock( &link->image->lock );
+ return retval;
+}
+
+/**
+ * Receive data from uplink server and process/dispatch.
+ * Reads replies from link->fd in a loop until dnbd3_read_reply() reports
+ * REPLY_AGAIN. For every CMD_GET_BLOCK reply the payload is
+ * 1) written to the cache file (if one is open) and the cache map is updated,
+ * 2) matched against all queued requests whose range lies inside the reply,
+ * 3) sent to the clients of the matching requests, whose slots are then freed.
+ * Replies with any other command are read and discarded.
+ * After the loop a background replication request is sent if none is in
+ * flight and the request queue is empty.
+ * On any receive or protocol error uplink_connectionFailed() is called.
+ * Locks on: link->queueLock, client->sendMutex
+ * (callees may take further locks, e.g. on the image -- not visible here)
+ */
+static void uplink_handleReceive(dnbd3_connection_t *link)
+{
+ dnbd3_reply_t inReply, outReply;
+ int ret, i;
+ for (;;) {
+ ret = dnbd3_read_reply( link->fd, &inReply, false );
+ if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue;
+ if ( ret == REPLY_AGAIN ) break; // leave the loop; the replication check after the loop still runs
+ if ( unlikely( ret == REPLY_CLOSED ) ) {
+ logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path );
+ goto error_cleanup;
+ }
+ if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
+ logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", link->image->path );
+ goto error_cleanup;
+ }
+ if ( unlikely( ret != REPLY_OK ) ) {
+ logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, link->image->path );
+ goto error_cleanup;
+ }
+ if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
+ logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, link->image->path );
+ goto error_cleanup;
+ }
+
+ if ( unlikely( link->recvBufferLen < inReply.size ) ) {
+ link->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536); // grow with 64KiB headroom, but never beyond _maxPayload
+ link->recvBuffer = realloc( link->recvBuffer, link->recvBufferLen );
+ if ( link->recvBuffer == NULL ) {
+ logadd( LOG_ERROR, "Out of memory when trying to allocate receive buffer for uplink" );
+ exit( 1 );
+ }
+ }
+ if ( unlikely( (uint32_t)sock_recv( link->fd, link->recvBuffer, inReply.size ) != inReply.size ) ) {
+ logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", link->image->path );
+ goto error_cleanup;
+ }
+ // Payload read completely
+ // Bail out if we're not interested
+ if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
+ // Is a legit block reply
+ struct iovec iov[2];
+ const uint64_t start = inReply.handle; // the reply handle is used as the byte offset of the payload within the image
+ const uint64_t end = inReply.handle + inReply.size;
+ totalBytesReceived += inReply.size;
+ link->bytesReceived += inReply.size;
+ // 1) Write to cache file
+ if ( unlikely( link->cacheFd == -1 ) ) {
+ uplink_reopenCacheFd( link, false );
+ }
+ if ( likely( link->cacheFd != -1 ) ) {
+ int err = 0;
+ bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid
+ uint32_t done = 0;
+ ret = 0;
+ while ( done < inReply.size ) {
+ ret = (int)pwrite( link->cacheFd, link->recvBuffer + done, inReply.size - done, start + done );
+ if ( unlikely( ret == -1 ) ) {
+ err = errno;
+ if ( err == EINTR ) continue;
+ if ( err == ENOSPC || err == EDQUOT ) {
+ // try to free 256MiB
+ if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
+ tryAgain = false;
+ continue; // Success, retry write
+ }
+ if ( err == EBADF || err == EINVAL || err == EIO ) {
+ if ( !tryAgain || !uplink_reopenCacheFd( link, true ) )
+ break;
+ tryAgain = false;
+ continue; // Write handle to image successfully re-opened, try again
+ }
+ logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err );
+ break;
+ }
+ if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
+ logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, link->image->name, (int)link->image->rid );
+ break;
+ }
+ done += (uint32_t)ret;
+ }
+ if ( likely( done > 0 ) ) {
+ image_updateCachemap( link->image, start, start + done, true ); // only the part that was actually written
+ }
+ if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
+ logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
+ link->image->name, (int)link->image->rid, err );
+ }
+ }
+ // 2) Figure out which clients are interested in it
+ spin_lock( &link->queueLock );
+ for (i = 0; i < link->queueLen; ++i) {
+ dnbd3_queued_request_t * const req = &link->queue[i];
+ assert( req->status != ULR_PROCESSING );
+ if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
+ assert( req->client != NULL );
+ if ( req->from >= start && req->to <= end ) { // Match :-)
+ req->status = ULR_PROCESSING;
+ }
+ }
+ // 3) Send to interested clients - iterate backwards so request collaboration works, and
+ // so we can decrease queueLen on the fly while iterating. Should you ever change this to start
+ // from 0, you also need to change the "attach to existing request"-logic in uplink_request()
+ outReply.magic = dnbd3_packet_magic;
+ bool served = false;
+ for ( i = link->queueLen - 1; i >= 0; --i ) {
+ dnbd3_queued_request_t * const req = &link->queue[i];
+ if ( req->status == ULR_PROCESSING ) {
+ size_t bytesSent = 0;
+ assert( req->from >= start && req->to <= end );
+ dnbd3_client_t * const client = req->client;
+ outReply.cmd = CMD_GET_BLOCK;
+ outReply.handle = req->handle;
+ outReply.size = (uint32_t)( req->to - req->from );
+ iov[0].iov_base = &outReply;
+ iov[0].iov_len = sizeof outReply;
+ iov[1].iov_base = link->recvBuffer + (req->from - start); // part of the payload this client asked for
+ iov[1].iov_len = outReply.size;
+ fixup_reply( outReply );
+ req->status = ULR_FREE;
+ req->client = NULL;
+ served = true;
+ pthread_mutex_lock( &client->sendMutex );
+ spin_unlock( &link->queueLock ); // the queue lock is not held while writing to the client socket
+ if ( client->sock != -1 ) {
+ ssize_t sent = writev( client->sock, iov, 2 );
+ if ( sent > (ssize_t)sizeof outReply ) {
+ bytesSent = (size_t)sent - sizeof outReply; // count payload bytes only, not the reply header
+ }
+ }
+ pthread_mutex_unlock( &client->sendMutex );
+ if ( bytesSent != 0 ) {
+ client->bytesSent += bytesSent;
+ }
+ spin_lock( &link->queueLock );
+ }
+ if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--; // shrink the queue while its last slot is free
+ }
+ spin_unlock( &link->queueLock );
+#ifdef _DEBUG
+ if ( !served && start != link->replicationHandle ) {
+ logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, link->image->name, start, end );
+ }
+#endif
+ if ( start == link->replicationHandle ) {
+ // Was our background replication
+ link->replicationHandle = REP_NONE;
+ // Try to remove from fs cache if no client was interested in this data
+ if ( !served && link->cacheFd != -1 ) {
+ posix_fadvise( link->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+ }
+ }
+ if ( served ) {
+ // Was some client -- reset idle counter
+ link->idleTime = 0;
+ // Re-enable replication if disabled
+ if ( link->nextReplicationIndex == -1 ) {
+ link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
+ }
+ }
+ }
+ if ( link->replicationHandle == REP_NONE ) {
+ spin_lock( &link->queueLock );
+ const bool rep = ( link->queueLen == 0 );
+ spin_unlock( &link->queueLock );
+ if ( rep ) uplink_sendReplicationRequest( link );
+ }
+ return;
+ // Error handling from failed receive or message parsing
+ error_cleanup: ;
+ uplink_connectionFailed( link, true );
+}
+
+/**
+ * Handle a broken uplink connection: report the current server as failed,
+ * close the socket and forget any in-flight replication request. In BGR_FULL
+ * mode, replication is re-armed if it had been switched off.
+ * Does nothing if there is no open connection.
+ *
+ * @param findNew if true, look for a new uplink server right away, unless
+ *    an RTT measurement is in progress or a better server is already waiting.
+ * Locks on: link->rttLock
+ */
+static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew)
+{
+	if ( link->fd != -1 ) {
+		altservers_serverFailed( &link->currentServer );
+		close( link->fd );
+		link->fd = -1;
+		link->replicationHandle = REP_NONE;
+		if ( link->nextReplicationIndex == -1 && _backgroundReplication == BGR_FULL ) {
+			link->nextReplicationIndex = 0;
+		}
+		if ( findNew ) {
+			spin_lock( &link->rttLock );
+			const bool switchPending = ( link->betterFd != -1 || link->rttTestResult == RTT_INPROGRESS );
+			spin_unlock( &link->rttLock );
+			if ( !switchPending ) {
+				altservers_findUplink( link );
+			}
+		}
+	}
+}
+
+/**
+ * Send keep alive request to server
+ *
+ * @param fd socket connected to the uplink server
+ * @return nonzero if the complete request was sent, 0 otherwise
+ */
+static int uplink_sendKeepalive(const int fd)
+{
+	// Build the request on the stack for every call. A lazily initialized static
+	// would be shared by all uplink threads without any synchronization, so one
+	// thread could send it while another is still filling it in, or
+	// fixup_request() could be applied to it more than once.
+	dnbd3_request_t request;
+	memset( &request, 0, sizeof(request) );
+	request.magic = dnbd3_packet_magic;
+	request.cmd = CMD_KEEPALIVE;
+	fixup_request( request );
+	return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == (ssize_t)sizeof(request);
+}
+
+/**
+ * Request the crc32 list of the uplink's image from the uplink server.
+ * If a list is received and its checksum matches the master crc sent along
+ * with it, the list is attached to the image and written to "<image path>.crc"
+ * (master crc first, then the list). Failure to write the file is only logged.
+ * Does nothing if the image has no size, memory cannot be allocated, or the
+ * server does not deliver a valid list.
+ */
+static void uplink_addCrc32(dnbd3_connection_t *uplink)
+{
+	dnbd3_image_t *image = uplink->image;
+	if ( image == NULL || image->virtualFilesize == 0 ) return;
+	size_t bytes = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize ) * sizeof(uint32_t);
+	uint32_t masterCrc;
+	uint32_t *buffer = malloc( bytes );
+	if ( buffer == NULL ) {
+		logadd( LOG_WARNING, "Out of memory when trying to fetch crc32 list from uplink server (%s)", image->name );
+		return;
+	}
+	if ( !dnbd3_get_crc32( uplink->fd, &masterCrc, buffer, &bytes ) || bytes == 0 ) {
+		free( buffer );
+		return;
+	}
+	uint32_t lists_crc = crc32( 0, NULL, 0 );
+	lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes );
+	lists_crc = net_order_32( lists_crc );
+	if ( lists_crc != masterCrc ) {
+		logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name );
+		free( buffer );
+		return;
+	}
+	uplink->image->masterCrc32 = masterCrc;
+	uplink->image->crc32 = buffer; // ownership of buffer passes to the image
+	const size_t len = strlen( uplink->image->path ) + 30;
+	char path[len];
+	snprintf( path, len, "%s.crc", uplink->image->path );
+	// O_TRUNC: do not leave stale bytes behind if a longer file already exists
+	const int fd = open( path, O_WRONLY | O_CREAT | O_TRUNC, 0644 );
+	if ( fd >= 0 ) {
+		if ( write( fd, &masterCrc, sizeof(uint32_t) ) != (ssize_t)sizeof(uint32_t)
+				|| write( fd, buffer, bytes ) != (ssize_t)bytes ) {
+			logadd( LOG_WARNING, "Could not write crc32 list to %s", path );
+		}
+		close( fd );
+	}
+}
+
+/**
+ * (Re-)open the main image file of the uplink's image for writing and
+ * store the descriptor in link->cacheFd. The file is created if missing.
+ *
+ * @param force controls what happens if cacheFd already holds a descriptor
+ *    (!= -1): if true, that descriptor is closed and the file opened anew;
+ *    if false, nothing is done and true is returned right away.
+ * @return true if cacheFd refers to an open file afterwards
+ */
+static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force)
+{
+	const bool haveFd = ( link->cacheFd != -1 );
+	if ( haveFd && !force ) {
+		return true;
+	}
+	if ( haveFd ) {
+		close( link->cacheFd );
+	}
+	const int newFd = open( link->image->path, O_WRONLY | O_CREAT, 0644 );
+	link->cacheFd = newFd;
+	return newFd != -1;
+}
+
+/**
+ * Saves the cache map of the given image.
+ * First fsync()s the cache file (if open), so the image data is on disk
+ * before the map claims it is; the process exits if that fsync fails.
+ * Then writes a snapshot of the cache map to "<image path>.map".
+ * Return true on success (or if there is nothing to save), false if the
+ * map could not be copied or not be written completely.
+ * Locks on: imageListLock, image.lock
+ */
+static bool uplink_saveCacheMap(dnbd3_connection_t *link)
+{
+	dnbd3_image_t *image = link->image;
+	assert( image != NULL );
+
+	if ( link->cacheFd != -1 ) {
+		if ( fsync( link->cacheFd ) == -1 ) {
+			// A failing fsync means we have no guarantee that any data
+			// since the last fsync (or open if none) has been saved. Apart
+			// from keeping the cache_map from the last successful fsync
+			// around and restoring it there isn't much we can do to recover
+			// a consistent state. Bail out.
+			logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
+			logadd( LOG_ERROR, "Bailing out immediately" );
+			exit( 1 );
+		}
+	}
+
+	if ( image->cache_map == NULL ) return true;
+	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
+	spin_lock( &image->lock );
+	// Lock and get a copy of the cache map, as it could be freed by another thread that is just about to
+	// figure out that this image's cache copy is complete
+	if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) {
+		spin_unlock( &image->lock );
+		return true;
+	}
+	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
+	uint8_t *map = malloc( size );
+	if ( map == NULL ) {
+		spin_unlock( &image->lock );
+		logadd( LOG_WARNING, "Out of memory while trying to save cache map of %s:%d", image->name, (int)image->rid );
+		return false;
+	}
+	memcpy( map, image->cache_map, size );
+	// Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
+	// cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O
+	spin_unlock( &image->lock );
+	assert( image->path != NULL );
+	char mapfile[strlen( image->path ) + 4 + 1];
+	strcpy( mapfile, image->path );
+	strcat( mapfile, ".map" );
+
+	int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
+	if ( fd == -1 ) {
+		const int err = errno;
+		free( map );
+		logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+		return false;
+	}
+
+	size_t done = 0;
+	while ( done < size ) {
+		// Continue after what has been written so far; a short write must not
+		// make us write the beginning of the map again
+		const ssize_t ret = write( fd, map + done, size - done );
+		if ( ret == -1 ) {
+			if ( errno == EINTR ) continue;
+			logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
+			break;
+		}
+		if ( ret <= 0 ) {
+			logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+			break;
+		}
+		done += (size_t)ret;
+	}
+	if ( fsync( fd ) == -1 ) {
+		logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
+	}
+	close( fd );
+	free( map );
+	// Only claim success if the whole map made it into the file
+	return done == size;
+}
+
+/**
+ * Tell whether the connection to the uplink server should be closed
+ * because it has been idle for more than SERVER_UPLINK_IDLE_TIMEOUT.
+ * Never the case while background replication runs in BGR_FULL mode.
+ */
+static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link)
+{
+	if ( _backgroundReplication == BGR_FULL ) {
+		return false;
+	}
+	return link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT;
+}
+
diff --git a/src/server/uplink.h b/src/server/uplink.h
new file mode 100644
index 0000000..2b41dfc
--- /dev/null
+++ b/src/server/uplink.h
@@ -0,0 +1,19 @@
+#ifndef _UPLINK_H_
+#define _UPLINK_H_
+
+#include "globals.h"
+#include "../types.h"
+
+/* Functions without parameters are declared with (void): in C an empty list */
+/* is not a prototype and would let calls with stray arguments go unnoticed. */
+void uplink_globalsInit(void);
+
+uint64_t uplink_getTotalBytesReceived(void);
+
+bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version);
+
+void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client);
+
+bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
+
+void uplink_shutdown(dnbd3_image_t *image);
+
+#endif /* _UPLINK_H_ */
diff --git a/src/server/urldecode.c b/src/server/urldecode.c
new file mode 100644
index 0000000..4553097
--- /dev/null
+++ b/src/server/urldecode.c
@@ -0,0 +1,61 @@
+#include "urldecode.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+/**
+ * Replace the hex digit stored in the char variable a by its numeric
+ * value (0-15). If a is not a hex digit, jump to the label "normie",
+ * which has to exist in the function using this macro.
+ * Both upper and lower case digits are accepted.
+ */
+#define hex2int(a) do { \
+	if ( a > 'f' ) { \
+		goto normie; \
+	} else if ( a >= 'a' ) { \
+		a = (char)(a - ( 'a' - 10 )); \
+	} else if ( a > 'F' ) { \
+		goto normie; \
+	} else if ( a >= 'A' ) { \
+		a = (char)(a - ( 'A' - 10 )); \
+	} else if ( a < '0' || a > '9' ) { \
+		goto normie; \
+	} else { \
+		a = (char)(a - '0'); \
+	} \
+} while (0)
+
+/**
+ * Decode a string of name=value pairs separated by '&' in place.
+ * Names are copied as they are; in values, "%XX" is replaced by the byte
+ * with hex value XX and '+' by a space. A '%' that is not followed by two
+ * hex digits is copied literally.
+ * The name and value strings stored in out point into the buffer of str
+ * and are not NUL-terminated; use the stored lengths.
+ *
+ * @param str string to decode; its memory is modified
+ * @param out array receiving the decoded fields
+ * @param out_num in: number of elements out can hold,
+ *    out: number of fields stored in out. Parsing stops once out is full.
+ *    A trailing name without '=' is not stored.
+ */
+void urldecode(struct string* str, struct field *out, size_t *out_num)
+{
+	char *src = (char*)str->s;
+	char *dst = src; // write position; never ahead of src, as decoding only shrinks the data
+	const char * const end = str->s + str->l;
+	char a, b;
+	size_t max_out = *out_num;
+	*out_num = 0;
+	do {
+		if ( *out_num == max_out ) return;
+		out->name.s = dst;
+		while ( src < end && *src != '=' ) {
+			*dst++ = *src++;
+		}
+		if ( src == end ) return;
+		out->name.l = (size_t)( dst - out->name.s );
+		++src;
+		out->value.s = ++dst;
+		while ( src < end && *src != '&' ) {
+			if ( *src == '%' && src + 2 < end ) {
+				// hex2int jumps to normie if either char is not a hex digit;
+				// nothing has been consumed or written at that point
+				a = src[1];
+				hex2int(a);
+				b = src[2];
+				hex2int(b);
+				*dst++ = (char)( (16 * a) + b );
+				src += 3;
+			} else if (*src == '+') {
+				*dst++ = (char)' ';
+				++src;
+			} else {
+				normie:;
+				*dst++ = *src++;
+			}
+		}
+		out->value.l = (size_t)( dst - out->value.s );
+		out++;
+		(*out_num)++;
+		if ( src++ >= end ) return;
+		++dst;
+	} while ( 1 );
+}
+
diff --git a/src/server/urldecode.h b/src/server/urldecode.h
new file mode 100644
index 0000000..e27f8f8
--- /dev/null
+++ b/src/server/urldecode.h
@@ -0,0 +1,19 @@
+#ifndef _URLENCODE_H_
+#define _URLENCODE_H_
+
+#include "picohttpparser/picohttpparser.h"
+
+/* One name/value pair produced by urldecode(). Both strings are length-delimited. */
+struct field {
+ struct string name; /* part before the '=' */
+ struct string value; /* part after the '=', up to the next '&' or end of input */
+};
+
+/**
+ * Decode the given x-www-form-urlencoded string. Breaks constness rules by
+ * casting the const char* s from str to char* and modifying it, then
+ * populating out with pointers into it, so make sure the memory
+ * is actually writable.
+ *
+ * @param str     the string to decode; its buffer is modified in place
+ * @param out     array that receives the name/value pairs found
+ * @param out_num on entry, the number of elements out can hold;
+ *                on return, the number of elements that were filled in
+ */
+void urldecode(struct string* str, struct field *out, size_t *out_num);
+
+#endif
diff --git a/src/serverconfig.h b/src/serverconfig.h
new file mode 100644
index 0000000..0cbb320
--- /dev/null
+++ b/src/serverconfig.h
@@ -0,0 +1,56 @@
+#ifndef _SERVERCONFIG_H_
+#define _SERVERCONFIG_H_
+
+#include "config.h"
+
+// +++++ Performance/memory related
+// NOTE(review): presumably upper bounds for the respective server-side lists - confirm against their users
+#define SERVER_MAX_CLIENTS 4000
+#define SERVER_MAX_IMAGES 5000
+#define SERVER_MAX_ALTS 100
+// +++++ Uplink handling (proxy mode)
+#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value
+#define SERVER_BAD_UPLINK_THRES 40 // Threshold for numFails at which we ignore a server for the time span below
+#define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
+#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink
+#define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients
+#define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
+
+// NOTE(review): presumably seconds between two saves of the cache map - confirm
+#define SERVER_CACHE_MAP_SAVE_INTERVAL 90
+
+// Time in ms to wait for a read/write call to complete on an uplink connection
+#define SOCKET_TIMEOUT_UPLINK 5000
+// Same for client connections. Be a bit more liberal here
+#define SOCKET_TIMEOUT_CLIENT 15000
+// When waiting for the next request header from client, allow the timeout from above
+// to expire this many times. This allows for greater idle times without also increasing
+// the timeout for cases where we wait for additional data or are actively sending a reply
+#define SOCKET_TIMEOUT_CLIENT_RETRIES 3
+
+#define SERVER_UPLINK_KEEPALIVE_INTERVAL 10 // (Seconds) Send keep-alive if nothing else is happening on the uplink
+#define SERVER_UPLINK_IDLE_TIMEOUT 1800 // (Seconds) Timeout after which we tear down an uplink connection if no blocks needed to be fetched
+
+// +++++ Other magic constants
+// NOTE(review): the RTT intervals below are presumably given in seconds - confirm
+#define SERVER_RTT_PROBES 5 // How many probes to average over
+#define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes
+#define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes
+#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad
+#define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times
+
+#define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes
+
+// Which is the minimum protocol version the server expects from the client
+#define MIN_SUPPORTED_CLIENT 2
+// Same for when we're a proxy talking to another server
+#define MIN_SUPPORTED_SERVER 2
+
+// Length of comment fields (for alt server etc.)
+#define COMMENT_LENGTH 120
+
+// Evaluates to 2/3 of the given value using integer arithmetic.
+#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
+// 0x7FFFFFF is 134217727, i.e. roughly 134 seconds when the RTT is counted in microseconds
+#define RTT_UNREACHABLE 0x7FFFFFFu // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
+
+// How many seconds have to pass after the last client disconnected until the imagefd is closed
+#define UNUSED_FD_TIMEOUT 3600
+
+#endif
+
diff --git a/src/shared/crc32.c b/src/shared/crc32.c
new file mode 100644
index 0000000..db941d3
--- /dev/null
+++ b/src/shared/crc32.c
@@ -0,0 +1,621 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ *
+ * Modified for use in dnbd3
+ * Original comment:
+ *
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ *
+ * Original zlib.h license text:
+ *
+
+ Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
+
+*/
+
+#include "../types.h"
+#include <stddef.h>
+
+/* Shims for macros used by the original zlib source: FAR expands to nothing,
+ * OF(args) expands to its argument unchanged, and local means static. */
+#define FAR
+#define OF(args) args
+#define local static
+
+/* Definitions for doing the crc four data bytes at a time. */
+/* BYFOUR is enabled unless NOBYFOUR is defined. */
+#if !defined(NOBYFOUR)
+# define BYFOUR
+#endif
+/* TBLS is the number of 256-entry lookup tables in crc_table: 8 with BYFOUR, otherwise 1. */
+#ifdef BYFOUR
+# define TBLS 8
+#else
+# define TBLS 1
+#endif /* BYFOUR */
+
+/*
+ * Precomputed CRC-32 lookup tables: TBLS tables of 256 entries each.
+ * Table 0 is the byte-at-a-time table (indexed by the low byte of the running
+ * crc xor the next input byte); its entry 128 is 0xedb88320, the reflected
+ * CRC-32 polynomial. The remaining seven tables are only compiled in when
+ * BYFOUR is defined.
+ * NOTE(review): tables 1..7 are presumably used by the four-bytes-at-a-time
+ * code further down in this file - confirm there.
+ */
+local const uint32_t crc_table[TBLS][256] =
+{
+ {
+ 0x00000000U, 0x77073096U, 0xee0e612cU, 0x990951baU, 0x076dc419U,
+ 0x706af48fU, 0xe963a535U, 0x9e6495a3U, 0x0edb8832U, 0x79dcb8a4U,
+ 0xe0d5e91eU, 0x97d2d988U, 0x09b64c2bU, 0x7eb17cbdU, 0xe7b82d07U,
+ 0x90bf1d91U, 0x1db71064U, 0x6ab020f2U, 0xf3b97148U, 0x84be41deU,
+ 0x1adad47dU, 0x6ddde4ebU, 0xf4d4b551U, 0x83d385c7U, 0x136c9856U,
+ 0x646ba8c0U, 0xfd62f97aU, 0x8a65c9ecU, 0x14015c4fU, 0x63066cd9U,
+ 0xfa0f3d63U, 0x8d080df5U, 0x3b6e20c8U, 0x4c69105eU, 0xd56041e4U,
+ 0xa2677172U, 0x3c03e4d1U, 0x4b04d447U, 0xd20d85fdU, 0xa50ab56bU,
+ 0x35b5a8faU, 0x42b2986cU, 0xdbbbc9d6U, 0xacbcf940U, 0x32d86ce3U,
+ 0x45df5c75U, 0xdcd60dcfU, 0xabd13d59U, 0x26d930acU, 0x51de003aU,
+ 0xc8d75180U, 0xbfd06116U, 0x21b4f4b5U, 0x56b3c423U, 0xcfba9599U,
+ 0xb8bda50fU, 0x2802b89eU, 0x5f058808U, 0xc60cd9b2U, 0xb10be924U,
+ 0x2f6f7c87U, 0x58684c11U, 0xc1611dabU, 0xb6662d3dU, 0x76dc4190U,
+ 0x01db7106U, 0x98d220bcU, 0xefd5102aU, 0x71b18589U, 0x06b6b51fU,
+ 0x9fbfe4a5U, 0xe8b8d433U, 0x7807c9a2U, 0x0f00f934U, 0x9609a88eU,
+ 0xe10e9818U, 0x7f6a0dbbU, 0x086d3d2dU, 0x91646c97U, 0xe6635c01U,
+ 0x6b6b51f4U, 0x1c6c6162U, 0x856530d8U, 0xf262004eU, 0x6c0695edU,
+ 0x1b01a57bU, 0x8208f4c1U, 0xf50fc457U, 0x65b0d9c6U, 0x12b7e950U,
+ 0x8bbeb8eaU, 0xfcb9887cU, 0x62dd1ddfU, 0x15da2d49U, 0x8cd37cf3U,
+ 0xfbd44c65U, 0x4db26158U, 0x3ab551ceU, 0xa3bc0074U, 0xd4bb30e2U,
+ 0x4adfa541U, 0x3dd895d7U, 0xa4d1c46dU, 0xd3d6f4fbU, 0x4369e96aU,
+ 0x346ed9fcU, 0xad678846U, 0xda60b8d0U, 0x44042d73U, 0x33031de5U,
+ 0xaa0a4c5fU, 0xdd0d7cc9U, 0x5005713cU, 0x270241aaU, 0xbe0b1010U,
+ 0xc90c2086U, 0x5768b525U, 0x206f85b3U, 0xb966d409U, 0xce61e49fU,
+ 0x5edef90eU, 0x29d9c998U, 0xb0d09822U, 0xc7d7a8b4U, 0x59b33d17U,
+ 0x2eb40d81U, 0xb7bd5c3bU, 0xc0ba6cadU, 0xedb88320U, 0x9abfb3b6U,
+ 0x03b6e20cU, 0x74b1d29aU, 0xead54739U, 0x9dd277afU, 0x04db2615U,
+ 0x73dc1683U, 0xe3630b12U, 0x94643b84U, 0x0d6d6a3eU, 0x7a6a5aa8U,
+ 0xe40ecf0bU, 0x9309ff9dU, 0x0a00ae27U, 0x7d079eb1U, 0xf00f9344U,
+ 0x8708a3d2U, 0x1e01f268U, 0x6906c2feU, 0xf762575dU, 0x806567cbU,
+ 0x196c3671U, 0x6e6b06e7U, 0xfed41b76U, 0x89d32be0U, 0x10da7a5aU,
+ 0x67dd4accU, 0xf9b9df6fU, 0x8ebeeff9U, 0x17b7be43U, 0x60b08ed5U,
+ 0xd6d6a3e8U, 0xa1d1937eU, 0x38d8c2c4U, 0x4fdff252U, 0xd1bb67f1U,
+ 0xa6bc5767U, 0x3fb506ddU, 0x48b2364bU, 0xd80d2bdaU, 0xaf0a1b4cU,
+ 0x36034af6U, 0x41047a60U, 0xdf60efc3U, 0xa867df55U, 0x316e8eefU,
+ 0x4669be79U, 0xcb61b38cU, 0xbc66831aU, 0x256fd2a0U, 0x5268e236U,
+ 0xcc0c7795U, 0xbb0b4703U, 0x220216b9U, 0x5505262fU, 0xc5ba3bbeU,
+ 0xb2bd0b28U, 0x2bb45a92U, 0x5cb36a04U, 0xc2d7ffa7U, 0xb5d0cf31U,
+ 0x2cd99e8bU, 0x5bdeae1dU, 0x9b64c2b0U, 0xec63f226U, 0x756aa39cU,
+ 0x026d930aU, 0x9c0906a9U, 0xeb0e363fU, 0x72076785U, 0x05005713U,
+ 0x95bf4a82U, 0xe2b87a14U, 0x7bb12baeU, 0x0cb61b38U, 0x92d28e9bU,
+ 0xe5d5be0dU, 0x7cdcefb7U, 0x0bdbdf21U, 0x86d3d2d4U, 0xf1d4e242U,
+ 0x68ddb3f8U, 0x1fda836eU, 0x81be16cdU, 0xf6b9265bU, 0x6fb077e1U,
+ 0x18b74777U, 0x88085ae6U, 0xff0f6a70U, 0x66063bcaU, 0x11010b5cU,
+ 0x8f659effU, 0xf862ae69U, 0x616bffd3U, 0x166ccf45U, 0xa00ae278U,
+ 0xd70dd2eeU, 0x4e048354U, 0x3903b3c2U, 0xa7672661U, 0xd06016f7U,
+ 0x4969474dU, 0x3e6e77dbU, 0xaed16a4aU, 0xd9d65adcU, 0x40df0b66U,
+ 0x37d83bf0U, 0xa9bcae53U, 0xdebb9ec5U, 0x47b2cf7fU, 0x30b5ffe9U,
+ 0xbdbdf21cU, 0xcabac28aU, 0x53b39330U, 0x24b4a3a6U, 0xbad03605U,
+ 0xcdd70693U, 0x54de5729U, 0x23d967bfU, 0xb3667a2eU, 0xc4614ab8U,
+ 0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
+ 0x2d02ef8dU
+#ifdef BYFOUR
+ },
+ {
+ 0x00000000U, 0x191b3141U, 0x32366282U, 0x2b2d53c3U, 0x646cc504U,
+ 0x7d77f445U, 0x565aa786U, 0x4f4196c7U, 0xc8d98a08U, 0xd1c2bb49U,
+ 0xfaefe88aU, 0xe3f4d9cbU, 0xacb54f0cU, 0xb5ae7e4dU, 0x9e832d8eU,
+ 0x87981ccfU, 0x4ac21251U, 0x53d92310U, 0x78f470d3U, 0x61ef4192U,
+ 0x2eaed755U, 0x37b5e614U, 0x1c98b5d7U, 0x05838496U, 0x821b9859U,
+ 0x9b00a918U, 0xb02dfadbU, 0xa936cb9aU, 0xe6775d5dU, 0xff6c6c1cU,
+ 0xd4413fdfU, 0xcd5a0e9eU, 0x958424a2U, 0x8c9f15e3U, 0xa7b24620U,
+ 0xbea97761U, 0xf1e8e1a6U, 0xe8f3d0e7U, 0xc3de8324U, 0xdac5b265U,
+ 0x5d5daeaaU, 0x44469febU, 0x6f6bcc28U, 0x7670fd69U, 0x39316baeU,
+ 0x202a5aefU, 0x0b07092cU, 0x121c386dU, 0xdf4636f3U, 0xc65d07b2U,
+ 0xed705471U, 0xf46b6530U, 0xbb2af3f7U, 0xa231c2b6U, 0x891c9175U,
+ 0x9007a034U, 0x179fbcfbU, 0x0e848dbaU, 0x25a9de79U, 0x3cb2ef38U,
+ 0x73f379ffU, 0x6ae848beU, 0x41c51b7dU, 0x58de2a3cU, 0xf0794f05U,
+ 0xe9627e44U, 0xc24f2d87U, 0xdb541cc6U, 0x94158a01U, 0x8d0ebb40U,
+ 0xa623e883U, 0xbf38d9c2U, 0x38a0c50dU, 0x21bbf44cU, 0x0a96a78fU,
+ 0x138d96ceU, 0x5ccc0009U, 0x45d73148U, 0x6efa628bU, 0x77e153caU,
+ 0xbabb5d54U, 0xa3a06c15U, 0x888d3fd6U, 0x91960e97U, 0xded79850U,
+ 0xc7cca911U, 0xece1fad2U, 0xf5facb93U, 0x7262d75cU, 0x6b79e61dU,
+ 0x4054b5deU, 0x594f849fU, 0x160e1258U, 0x0f152319U, 0x243870daU,
+ 0x3d23419bU, 0x65fd6ba7U, 0x7ce65ae6U, 0x57cb0925U, 0x4ed03864U,
+ 0x0191aea3U, 0x188a9fe2U, 0x33a7cc21U, 0x2abcfd60U, 0xad24e1afU,
+ 0xb43fd0eeU, 0x9f12832dU, 0x8609b26cU, 0xc94824abU, 0xd05315eaU,
+ 0xfb7e4629U, 0xe2657768U, 0x2f3f79f6U, 0x362448b7U, 0x1d091b74U,
+ 0x04122a35U, 0x4b53bcf2U, 0x52488db3U, 0x7965de70U, 0x607eef31U,
+ 0xe7e6f3feU, 0xfefdc2bfU, 0xd5d0917cU, 0xcccba03dU, 0x838a36faU,
+ 0x9a9107bbU, 0xb1bc5478U, 0xa8a76539U, 0x3b83984bU, 0x2298a90aU,
+ 0x09b5fac9U, 0x10aecb88U, 0x5fef5d4fU, 0x46f46c0eU, 0x6dd93fcdU,
+ 0x74c20e8cU, 0xf35a1243U, 0xea412302U, 0xc16c70c1U, 0xd8774180U,
+ 0x9736d747U, 0x8e2de606U, 0xa500b5c5U, 0xbc1b8484U, 0x71418a1aU,
+ 0x685abb5bU, 0x4377e898U, 0x5a6cd9d9U, 0x152d4f1eU, 0x0c367e5fU,
+ 0x271b2d9cU, 0x3e001cddU, 0xb9980012U, 0xa0833153U, 0x8bae6290U,
+ 0x92b553d1U, 0xddf4c516U, 0xc4eff457U, 0xefc2a794U, 0xf6d996d5U,
+ 0xae07bce9U, 0xb71c8da8U, 0x9c31de6bU, 0x852aef2aU, 0xca6b79edU,
+ 0xd37048acU, 0xf85d1b6fU, 0xe1462a2eU, 0x66de36e1U, 0x7fc507a0U,
+ 0x54e85463U, 0x4df36522U, 0x02b2f3e5U, 0x1ba9c2a4U, 0x30849167U,
+ 0x299fa026U, 0xe4c5aeb8U, 0xfdde9ff9U, 0xd6f3cc3aU, 0xcfe8fd7bU,
+ 0x80a96bbcU, 0x99b25afdU, 0xb29f093eU, 0xab84387fU, 0x2c1c24b0U,
+ 0x350715f1U, 0x1e2a4632U, 0x07317773U, 0x4870e1b4U, 0x516bd0f5U,
+ 0x7a468336U, 0x635db277U, 0xcbfad74eU, 0xd2e1e60fU, 0xf9ccb5ccU,
+ 0xe0d7848dU, 0xaf96124aU, 0xb68d230bU, 0x9da070c8U, 0x84bb4189U,
+ 0x03235d46U, 0x1a386c07U, 0x31153fc4U, 0x280e0e85U, 0x674f9842U,
+ 0x7e54a903U, 0x5579fac0U, 0x4c62cb81U, 0x8138c51fU, 0x9823f45eU,
+ 0xb30ea79dU, 0xaa1596dcU, 0xe554001bU, 0xfc4f315aU, 0xd7626299U,
+ 0xce7953d8U, 0x49e14f17U, 0x50fa7e56U, 0x7bd72d95U, 0x62cc1cd4U,
+ 0x2d8d8a13U, 0x3496bb52U, 0x1fbbe891U, 0x06a0d9d0U, 0x5e7ef3ecU,
+ 0x4765c2adU, 0x6c48916eU, 0x7553a02fU, 0x3a1236e8U, 0x230907a9U,
+ 0x0824546aU, 0x113f652bU, 0x96a779e4U, 0x8fbc48a5U, 0xa4911b66U,
+ 0xbd8a2a27U, 0xf2cbbce0U, 0xebd08da1U, 0xc0fdde62U, 0xd9e6ef23U,
+ 0x14bce1bdU, 0x0da7d0fcU, 0x268a833fU, 0x3f91b27eU, 0x70d024b9U,
+ 0x69cb15f8U, 0x42e6463bU, 0x5bfd777aU, 0xdc656bb5U, 0xc57e5af4U,
+ 0xee530937U, 0xf7483876U, 0xb809aeb1U, 0xa1129ff0U, 0x8a3fcc33U,
+ 0x9324fd72U
+ },
+ {
+ 0x00000000U, 0x01c26a37U, 0x0384d46eU, 0x0246be59U, 0x0709a8dcU,
+ 0x06cbc2ebU, 0x048d7cb2U, 0x054f1685U, 0x0e1351b8U, 0x0fd13b8fU,
+ 0x0d9785d6U, 0x0c55efe1U, 0x091af964U, 0x08d89353U, 0x0a9e2d0aU,
+ 0x0b5c473dU, 0x1c26a370U, 0x1de4c947U, 0x1fa2771eU, 0x1e601d29U,
+ 0x1b2f0bacU, 0x1aed619bU, 0x18abdfc2U, 0x1969b5f5U, 0x1235f2c8U,
+ 0x13f798ffU, 0x11b126a6U, 0x10734c91U, 0x153c5a14U, 0x14fe3023U,
+ 0x16b88e7aU, 0x177ae44dU, 0x384d46e0U, 0x398f2cd7U, 0x3bc9928eU,
+ 0x3a0bf8b9U, 0x3f44ee3cU, 0x3e86840bU, 0x3cc03a52U, 0x3d025065U,
+ 0x365e1758U, 0x379c7d6fU, 0x35dac336U, 0x3418a901U, 0x3157bf84U,
+ 0x3095d5b3U, 0x32d36beaU, 0x331101ddU, 0x246be590U, 0x25a98fa7U,
+ 0x27ef31feU, 0x262d5bc9U, 0x23624d4cU, 0x22a0277bU, 0x20e69922U,
+ 0x2124f315U, 0x2a78b428U, 0x2bbade1fU, 0x29fc6046U, 0x283e0a71U,
+ 0x2d711cf4U, 0x2cb376c3U, 0x2ef5c89aU, 0x2f37a2adU, 0x709a8dc0U,
+ 0x7158e7f7U, 0x731e59aeU, 0x72dc3399U, 0x7793251cU, 0x76514f2bU,
+ 0x7417f172U, 0x75d59b45U, 0x7e89dc78U, 0x7f4bb64fU, 0x7d0d0816U,
+ 0x7ccf6221U, 0x798074a4U, 0x78421e93U, 0x7a04a0caU, 0x7bc6cafdU,
+ 0x6cbc2eb0U, 0x6d7e4487U, 0x6f38fadeU, 0x6efa90e9U, 0x6bb5866cU,
+ 0x6a77ec5bU, 0x68315202U, 0x69f33835U, 0x62af7f08U, 0x636d153fU,
+ 0x612bab66U, 0x60e9c151U, 0x65a6d7d4U, 0x6464bde3U, 0x662203baU,
+ 0x67e0698dU, 0x48d7cb20U, 0x4915a117U, 0x4b531f4eU, 0x4a917579U,
+ 0x4fde63fcU, 0x4e1c09cbU, 0x4c5ab792U, 0x4d98dda5U, 0x46c49a98U,
+ 0x4706f0afU, 0x45404ef6U, 0x448224c1U, 0x41cd3244U, 0x400f5873U,
+ 0x4249e62aU, 0x438b8c1dU, 0x54f16850U, 0x55330267U, 0x5775bc3eU,
+ 0x56b7d609U, 0x53f8c08cU, 0x523aaabbU, 0x507c14e2U, 0x51be7ed5U,
+ 0x5ae239e8U, 0x5b2053dfU, 0x5966ed86U, 0x58a487b1U, 0x5deb9134U,
+ 0x5c29fb03U, 0x5e6f455aU, 0x5fad2f6dU, 0xe1351b80U, 0xe0f771b7U,
+ 0xe2b1cfeeU, 0xe373a5d9U, 0xe63cb35cU, 0xe7fed96bU, 0xe5b86732U,
+ 0xe47a0d05U, 0xef264a38U, 0xeee4200fU, 0xeca29e56U, 0xed60f461U,
+ 0xe82fe2e4U, 0xe9ed88d3U, 0xebab368aU, 0xea695cbdU, 0xfd13b8f0U,
+ 0xfcd1d2c7U, 0xfe976c9eU, 0xff5506a9U, 0xfa1a102cU, 0xfbd87a1bU,
+ 0xf99ec442U, 0xf85cae75U, 0xf300e948U, 0xf2c2837fU, 0xf0843d26U,
+ 0xf1465711U, 0xf4094194U, 0xf5cb2ba3U, 0xf78d95faU, 0xf64fffcdU,
+ 0xd9785d60U, 0xd8ba3757U, 0xdafc890eU, 0xdb3ee339U, 0xde71f5bcU,
+ 0xdfb39f8bU, 0xddf521d2U, 0xdc374be5U, 0xd76b0cd8U, 0xd6a966efU,
+ 0xd4efd8b6U, 0xd52db281U, 0xd062a404U, 0xd1a0ce33U, 0xd3e6706aU,
+ 0xd2241a5dU, 0xc55efe10U, 0xc49c9427U, 0xc6da2a7eU, 0xc7184049U,
+ 0xc25756ccU, 0xc3953cfbU, 0xc1d382a2U, 0xc011e895U, 0xcb4dafa8U,
+ 0xca8fc59fU, 0xc8c97bc6U, 0xc90b11f1U, 0xcc440774U, 0xcd866d43U,
+ 0xcfc0d31aU, 0xce02b92dU, 0x91af9640U, 0x906dfc77U, 0x922b422eU,
+ 0x93e92819U, 0x96a63e9cU, 0x976454abU, 0x9522eaf2U, 0x94e080c5U,
+ 0x9fbcc7f8U, 0x9e7eadcfU, 0x9c381396U, 0x9dfa79a1U, 0x98b56f24U,
+ 0x99770513U, 0x9b31bb4aU, 0x9af3d17dU, 0x8d893530U, 0x8c4b5f07U,
+ 0x8e0de15eU, 0x8fcf8b69U, 0x8a809decU, 0x8b42f7dbU, 0x89044982U,
+ 0x88c623b5U, 0x839a6488U, 0x82580ebfU, 0x801eb0e6U, 0x81dcdad1U,
+ 0x8493cc54U, 0x8551a663U, 0x8717183aU, 0x86d5720dU, 0xa9e2d0a0U,
+ 0xa820ba97U, 0xaa6604ceU, 0xaba46ef9U, 0xaeeb787cU, 0xaf29124bU,
+ 0xad6fac12U, 0xacadc625U, 0xa7f18118U, 0xa633eb2fU, 0xa4755576U,
+ 0xa5b73f41U, 0xa0f829c4U, 0xa13a43f3U, 0xa37cfdaaU, 0xa2be979dU,
+ 0xb5c473d0U, 0xb40619e7U, 0xb640a7beU, 0xb782cd89U, 0xb2cddb0cU,
+ 0xb30fb13bU, 0xb1490f62U, 0xb08b6555U, 0xbbd72268U, 0xba15485fU,
+ 0xb853f606U, 0xb9919c31U, 0xbcde8ab4U, 0xbd1ce083U, 0xbf5a5edaU,
+ 0xbe9834edU
+ },
+ {
+ 0x00000000U, 0xb8bc6765U, 0xaa09c88bU, 0x12b5afeeU, 0x8f629757U,
+ 0x37def032U, 0x256b5fdcU, 0x9dd738b9U, 0xc5b428efU, 0x7d084f8aU,
+ 0x6fbde064U, 0xd7018701U, 0x4ad6bfb8U, 0xf26ad8ddU, 0xe0df7733U,
+ 0x58631056U, 0x5019579fU, 0xe8a530faU, 0xfa109f14U, 0x42acf871U,
+ 0xdf7bc0c8U, 0x67c7a7adU, 0x75720843U, 0xcdce6f26U, 0x95ad7f70U,
+ 0x2d111815U, 0x3fa4b7fbU, 0x8718d09eU, 0x1acfe827U, 0xa2738f42U,
+ 0xb0c620acU, 0x087a47c9U, 0xa032af3eU, 0x188ec85bU, 0x0a3b67b5U,
+ 0xb28700d0U, 0x2f503869U, 0x97ec5f0cU, 0x8559f0e2U, 0x3de59787U,
+ 0x658687d1U, 0xdd3ae0b4U, 0xcf8f4f5aU, 0x7733283fU, 0xeae41086U,
+ 0x525877e3U, 0x40edd80dU, 0xf851bf68U, 0xf02bf8a1U, 0x48979fc4U,
+ 0x5a22302aU, 0xe29e574fU, 0x7f496ff6U, 0xc7f50893U, 0xd540a77dU,
+ 0x6dfcc018U, 0x359fd04eU, 0x8d23b72bU, 0x9f9618c5U, 0x272a7fa0U,
+ 0xbafd4719U, 0x0241207cU, 0x10f48f92U, 0xa848e8f7U, 0x9b14583dU,
+ 0x23a83f58U, 0x311d90b6U, 0x89a1f7d3U, 0x1476cf6aU, 0xaccaa80fU,
+ 0xbe7f07e1U, 0x06c36084U, 0x5ea070d2U, 0xe61c17b7U, 0xf4a9b859U,
+ 0x4c15df3cU, 0xd1c2e785U, 0x697e80e0U, 0x7bcb2f0eU, 0xc377486bU,
+ 0xcb0d0fa2U, 0x73b168c7U, 0x6104c729U, 0xd9b8a04cU, 0x446f98f5U,
+ 0xfcd3ff90U, 0xee66507eU, 0x56da371bU, 0x0eb9274dU, 0xb6054028U,
+ 0xa4b0efc6U, 0x1c0c88a3U, 0x81dbb01aU, 0x3967d77fU, 0x2bd27891U,
+ 0x936e1ff4U, 0x3b26f703U, 0x839a9066U, 0x912f3f88U, 0x299358edU,
+ 0xb4446054U, 0x0cf80731U, 0x1e4da8dfU, 0xa6f1cfbaU, 0xfe92dfecU,
+ 0x462eb889U, 0x549b1767U, 0xec277002U, 0x71f048bbU, 0xc94c2fdeU,
+ 0xdbf98030U, 0x6345e755U, 0x6b3fa09cU, 0xd383c7f9U, 0xc1366817U,
+ 0x798a0f72U, 0xe45d37cbU, 0x5ce150aeU, 0x4e54ff40U, 0xf6e89825U,
+ 0xae8b8873U, 0x1637ef16U, 0x048240f8U, 0xbc3e279dU, 0x21e91f24U,
+ 0x99557841U, 0x8be0d7afU, 0x335cb0caU, 0xed59b63bU, 0x55e5d15eU,
+ 0x47507eb0U, 0xffec19d5U, 0x623b216cU, 0xda874609U, 0xc832e9e7U,
+ 0x708e8e82U, 0x28ed9ed4U, 0x9051f9b1U, 0x82e4565fU, 0x3a58313aU,
+ 0xa78f0983U, 0x1f336ee6U, 0x0d86c108U, 0xb53aa66dU, 0xbd40e1a4U,
+ 0x05fc86c1U, 0x1749292fU, 0xaff54e4aU, 0x322276f3U, 0x8a9e1196U,
+ 0x982bbe78U, 0x2097d91dU, 0x78f4c94bU, 0xc048ae2eU, 0xd2fd01c0U,
+ 0x6a4166a5U, 0xf7965e1cU, 0x4f2a3979U, 0x5d9f9697U, 0xe523f1f2U,
+ 0x4d6b1905U, 0xf5d77e60U, 0xe762d18eU, 0x5fdeb6ebU, 0xc2098e52U,
+ 0x7ab5e937U, 0x680046d9U, 0xd0bc21bcU, 0x88df31eaU, 0x3063568fU,
+ 0x22d6f961U, 0x9a6a9e04U, 0x07bda6bdU, 0xbf01c1d8U, 0xadb46e36U,
+ 0x15080953U, 0x1d724e9aU, 0xa5ce29ffU, 0xb77b8611U, 0x0fc7e174U,
+ 0x9210d9cdU, 0x2aacbea8U, 0x38191146U, 0x80a57623U, 0xd8c66675U,
+ 0x607a0110U, 0x72cfaefeU, 0xca73c99bU, 0x57a4f122U, 0xef189647U,
+ 0xfdad39a9U, 0x45115eccU, 0x764dee06U, 0xcef18963U, 0xdc44268dU,
+ 0x64f841e8U, 0xf92f7951U, 0x41931e34U, 0x5326b1daU, 0xeb9ad6bfU,
+ 0xb3f9c6e9U, 0x0b45a18cU, 0x19f00e62U, 0xa14c6907U, 0x3c9b51beU,
+ 0x842736dbU, 0x96929935U, 0x2e2efe50U, 0x2654b999U, 0x9ee8defcU,
+ 0x8c5d7112U, 0x34e11677U, 0xa9362eceU, 0x118a49abU, 0x033fe645U,
+ 0xbb838120U, 0xe3e09176U, 0x5b5cf613U, 0x49e959fdU, 0xf1553e98U,
+ 0x6c820621U, 0xd43e6144U, 0xc68bceaaU, 0x7e37a9cfU, 0xd67f4138U,
+ 0x6ec3265dU, 0x7c7689b3U, 0xc4caeed6U, 0x591dd66fU, 0xe1a1b10aU,
+ 0xf3141ee4U, 0x4ba87981U, 0x13cb69d7U, 0xab770eb2U, 0xb9c2a15cU,
+ 0x017ec639U, 0x9ca9fe80U, 0x241599e5U, 0x36a0360bU, 0x8e1c516eU,
+ 0x866616a7U, 0x3eda71c2U, 0x2c6fde2cU, 0x94d3b949U, 0x090481f0U,
+ 0xb1b8e695U, 0xa30d497bU, 0x1bb12e1eU, 0x43d23e48U, 0xfb6e592dU,
+ 0xe9dbf6c3U, 0x516791a6U, 0xccb0a91fU, 0x740cce7aU, 0x66b96194U,
+ 0xde0506f1U
+ },
+ {
+ 0x00000000U, 0x96300777U, 0x2c610eeeU, 0xba510999U, 0x19c46d07U,
+ 0x8ff46a70U, 0x35a563e9U, 0xa395649eU, 0x3288db0eU, 0xa4b8dc79U,
+ 0x1ee9d5e0U, 0x88d9d297U, 0x2b4cb609U, 0xbd7cb17eU, 0x072db8e7U,
+ 0x911dbf90U, 0x6410b71dU, 0xf220b06aU, 0x4871b9f3U, 0xde41be84U,
+ 0x7dd4da1aU, 0xebe4dd6dU, 0x51b5d4f4U, 0xc785d383U, 0x56986c13U,
+ 0xc0a86b64U, 0x7af962fdU, 0xecc9658aU, 0x4f5c0114U, 0xd96c0663U,
+ 0x633d0ffaU, 0xf50d088dU, 0xc8206e3bU, 0x5e10694cU, 0xe44160d5U,
+ 0x727167a2U, 0xd1e4033cU, 0x47d4044bU, 0xfd850dd2U, 0x6bb50aa5U,
+ 0xfaa8b535U, 0x6c98b242U, 0xd6c9bbdbU, 0x40f9bcacU, 0xe36cd832U,
+ 0x755cdf45U, 0xcf0dd6dcU, 0x593dd1abU, 0xac30d926U, 0x3a00de51U,
+ 0x8051d7c8U, 0x1661d0bfU, 0xb5f4b421U, 0x23c4b356U, 0x9995bacfU,
+ 0x0fa5bdb8U, 0x9eb80228U, 0x0888055fU, 0xb2d90cc6U, 0x24e90bb1U,
+ 0x877c6f2fU, 0x114c6858U, 0xab1d61c1U, 0x3d2d66b6U, 0x9041dc76U,
+ 0x0671db01U, 0xbc20d298U, 0x2a10d5efU, 0x8985b171U, 0x1fb5b606U,
+ 0xa5e4bf9fU, 0x33d4b8e8U, 0xa2c90778U, 0x34f9000fU, 0x8ea80996U,
+ 0x18980ee1U, 0xbb0d6a7fU, 0x2d3d6d08U, 0x976c6491U, 0x015c63e6U,
+ 0xf4516b6bU, 0x62616c1cU, 0xd8306585U, 0x4e0062f2U, 0xed95066cU,
+ 0x7ba5011bU, 0xc1f40882U, 0x57c40ff5U, 0xc6d9b065U, 0x50e9b712U,
+ 0xeab8be8bU, 0x7c88b9fcU, 0xdf1ddd62U, 0x492dda15U, 0xf37cd38cU,
+ 0x654cd4fbU, 0x5861b24dU, 0xce51b53aU, 0x7400bca3U, 0xe230bbd4U,
+ 0x41a5df4aU, 0xd795d83dU, 0x6dc4d1a4U, 0xfbf4d6d3U, 0x6ae96943U,
+ 0xfcd96e34U, 0x468867adU, 0xd0b860daU, 0x732d0444U, 0xe51d0333U,
+ 0x5f4c0aaaU, 0xc97c0dddU, 0x3c710550U, 0xaa410227U, 0x10100bbeU,
+ 0x86200cc9U, 0x25b56857U, 0xb3856f20U, 0x09d466b9U, 0x9fe461ceU,
+ 0x0ef9de5eU, 0x98c9d929U, 0x2298d0b0U, 0xb4a8d7c7U, 0x173db359U,
+ 0x810db42eU, 0x3b5cbdb7U, 0xad6cbac0U, 0x2083b8edU, 0xb6b3bf9aU,
+ 0x0ce2b603U, 0x9ad2b174U, 0x3947d5eaU, 0xaf77d29dU, 0x1526db04U,
+ 0x8316dc73U, 0x120b63e3U, 0x843b6494U, 0x3e6a6d0dU, 0xa85a6a7aU,
+ 0x0bcf0ee4U, 0x9dff0993U, 0x27ae000aU, 0xb19e077dU, 0x44930ff0U,
+ 0xd2a30887U, 0x68f2011eU, 0xfec20669U, 0x5d5762f7U, 0xcb676580U,
+ 0x71366c19U, 0xe7066b6eU, 0x761bd4feU, 0xe02bd389U, 0x5a7ada10U,
+ 0xcc4add67U, 0x6fdfb9f9U, 0xf9efbe8eU, 0x43beb717U, 0xd58eb060U,
+ 0xe8a3d6d6U, 0x7e93d1a1U, 0xc4c2d838U, 0x52f2df4fU, 0xf167bbd1U,
+ 0x6757bca6U, 0xdd06b53fU, 0x4b36b248U, 0xda2b0dd8U, 0x4c1b0aafU,
+ 0xf64a0336U, 0x607a0441U, 0xc3ef60dfU, 0x55df67a8U, 0xef8e6e31U,
+ 0x79be6946U, 0x8cb361cbU, 0x1a8366bcU, 0xa0d26f25U, 0x36e26852U,
+ 0x95770cccU, 0x03470bbbU, 0xb9160222U, 0x2f260555U, 0xbe3bbac5U,
+ 0x280bbdb2U, 0x925ab42bU, 0x046ab35cU, 0xa7ffd7c2U, 0x31cfd0b5U,
+ 0x8b9ed92cU, 0x1daede5bU, 0xb0c2649bU, 0x26f263ecU, 0x9ca36a75U,
+ 0x0a936d02U, 0xa906099cU, 0x3f360eebU, 0x85670772U, 0x13570005U,
+ 0x824abf95U, 0x147ab8e2U, 0xae2bb17bU, 0x381bb60cU, 0x9b8ed292U,
+ 0x0dbed5e5U, 0xb7efdc7cU, 0x21dfdb0bU, 0xd4d2d386U, 0x42e2d4f1U,
+ 0xf8b3dd68U, 0x6e83da1fU, 0xcd16be81U, 0x5b26b9f6U, 0xe177b06fU,
+ 0x7747b718U, 0xe65a0888U, 0x706a0fffU, 0xca3b0666U, 0x5c0b0111U,
+ 0xff9e658fU, 0x69ae62f8U, 0xd3ff6b61U, 0x45cf6c16U, 0x78e20aa0U,
+ 0xeed20dd7U, 0x5483044eU, 0xc2b30339U, 0x612667a7U, 0xf71660d0U,
+ 0x4d476949U, 0xdb776e3eU, 0x4a6ad1aeU, 0xdc5ad6d9U, 0x660bdf40U,
+ 0xf03bd837U, 0x53aebca9U, 0xc59ebbdeU, 0x7fcfb247U, 0xe9ffb530U,
+ 0x1cf2bdbdU, 0x8ac2bacaU, 0x3093b353U, 0xa6a3b424U, 0x0536d0baU,
+ 0x9306d7cdU, 0x2957de54U, 0xbf67d923U, 0x2e7a66b3U, 0xb84a61c4U,
+ 0x021b685dU, 0x942b6f2aU, 0x37be0bb4U, 0xa18e0cc3U, 0x1bdf055aU,
+ 0x8def022dU
+ },
+ {
+ 0x00000000U, 0x41311b19U, 0x82623632U, 0xc3532d2bU, 0x04c56c64U,
+ 0x45f4777dU, 0x86a75a56U, 0xc796414fU, 0x088ad9c8U, 0x49bbc2d1U,
+ 0x8ae8effaU, 0xcbd9f4e3U, 0x0c4fb5acU, 0x4d7eaeb5U, 0x8e2d839eU,
+ 0xcf1c9887U, 0x5112c24aU, 0x1023d953U, 0xd370f478U, 0x9241ef61U,
+ 0x55d7ae2eU, 0x14e6b537U, 0xd7b5981cU, 0x96848305U, 0x59981b82U,
+ 0x18a9009bU, 0xdbfa2db0U, 0x9acb36a9U, 0x5d5d77e6U, 0x1c6c6cffU,
+ 0xdf3f41d4U, 0x9e0e5acdU, 0xa2248495U, 0xe3159f8cU, 0x2046b2a7U,
+ 0x6177a9beU, 0xa6e1e8f1U, 0xe7d0f3e8U, 0x2483dec3U, 0x65b2c5daU,
+ 0xaaae5d5dU, 0xeb9f4644U, 0x28cc6b6fU, 0x69fd7076U, 0xae6b3139U,
+ 0xef5a2a20U, 0x2c09070bU, 0x6d381c12U, 0xf33646dfU, 0xb2075dc6U,
+ 0x715470edU, 0x30656bf4U, 0xf7f32abbU, 0xb6c231a2U, 0x75911c89U,
+ 0x34a00790U, 0xfbbc9f17U, 0xba8d840eU, 0x79dea925U, 0x38efb23cU,
+ 0xff79f373U, 0xbe48e86aU, 0x7d1bc541U, 0x3c2ade58U, 0x054f79f0U,
+ 0x447e62e9U, 0x872d4fc2U, 0xc61c54dbU, 0x018a1594U, 0x40bb0e8dU,
+ 0x83e823a6U, 0xc2d938bfU, 0x0dc5a038U, 0x4cf4bb21U, 0x8fa7960aU,
+ 0xce968d13U, 0x0900cc5cU, 0x4831d745U, 0x8b62fa6eU, 0xca53e177U,
+ 0x545dbbbaU, 0x156ca0a3U, 0xd63f8d88U, 0x970e9691U, 0x5098d7deU,
+ 0x11a9ccc7U, 0xd2fae1ecU, 0x93cbfaf5U, 0x5cd76272U, 0x1de6796bU,
+ 0xdeb55440U, 0x9f844f59U, 0x58120e16U, 0x1923150fU, 0xda703824U,
+ 0x9b41233dU, 0xa76bfd65U, 0xe65ae67cU, 0x2509cb57U, 0x6438d04eU,
+ 0xa3ae9101U, 0xe29f8a18U, 0x21cca733U, 0x60fdbc2aU, 0xafe124adU,
+ 0xeed03fb4U, 0x2d83129fU, 0x6cb20986U, 0xab2448c9U, 0xea1553d0U,
+ 0x29467efbU, 0x687765e2U, 0xf6793f2fU, 0xb7482436U, 0x741b091dU,
+ 0x352a1204U, 0xf2bc534bU, 0xb38d4852U, 0x70de6579U, 0x31ef7e60U,
+ 0xfef3e6e7U, 0xbfc2fdfeU, 0x7c91d0d5U, 0x3da0cbccU, 0xfa368a83U,
+ 0xbb07919aU, 0x7854bcb1U, 0x3965a7a8U, 0x4b98833bU, 0x0aa99822U,
+ 0xc9fab509U, 0x88cbae10U, 0x4f5def5fU, 0x0e6cf446U, 0xcd3fd96dU,
+ 0x8c0ec274U, 0x43125af3U, 0x022341eaU, 0xc1706cc1U, 0x804177d8U,
+ 0x47d73697U, 0x06e62d8eU, 0xc5b500a5U, 0x84841bbcU, 0x1a8a4171U,
+ 0x5bbb5a68U, 0x98e87743U, 0xd9d96c5aU, 0x1e4f2d15U, 0x5f7e360cU,
+ 0x9c2d1b27U, 0xdd1c003eU, 0x120098b9U, 0x533183a0U, 0x9062ae8bU,
+ 0xd153b592U, 0x16c5f4ddU, 0x57f4efc4U, 0x94a7c2efU, 0xd596d9f6U,
+ 0xe9bc07aeU, 0xa88d1cb7U, 0x6bde319cU, 0x2aef2a85U, 0xed796bcaU,
+ 0xac4870d3U, 0x6f1b5df8U, 0x2e2a46e1U, 0xe136de66U, 0xa007c57fU,
+ 0x6354e854U, 0x2265f34dU, 0xe5f3b202U, 0xa4c2a91bU, 0x67918430U,
+ 0x26a09f29U, 0xb8aec5e4U, 0xf99fdefdU, 0x3accf3d6U, 0x7bfde8cfU,
+ 0xbc6ba980U, 0xfd5ab299U, 0x3e099fb2U, 0x7f3884abU, 0xb0241c2cU,
+ 0xf1150735U, 0x32462a1eU, 0x73773107U, 0xb4e17048U, 0xf5d06b51U,
+ 0x3683467aU, 0x77b25d63U, 0x4ed7facbU, 0x0fe6e1d2U, 0xccb5ccf9U,
+ 0x8d84d7e0U, 0x4a1296afU, 0x0b238db6U, 0xc870a09dU, 0x8941bb84U,
+ 0x465d2303U, 0x076c381aU, 0xc43f1531U, 0x850e0e28U, 0x42984f67U,
+ 0x03a9547eU, 0xc0fa7955U, 0x81cb624cU, 0x1fc53881U, 0x5ef42398U,
+ 0x9da70eb3U, 0xdc9615aaU, 0x1b0054e5U, 0x5a314ffcU, 0x996262d7U,
+ 0xd85379ceU, 0x174fe149U, 0x567efa50U, 0x952dd77bU, 0xd41ccc62U,
+ 0x138a8d2dU, 0x52bb9634U, 0x91e8bb1fU, 0xd0d9a006U, 0xecf37e5eU,
+ 0xadc26547U, 0x6e91486cU, 0x2fa05375U, 0xe836123aU, 0xa9070923U,
+ 0x6a542408U, 0x2b653f11U, 0xe479a796U, 0xa548bc8fU, 0x661b91a4U,
+ 0x272a8abdU, 0xe0bccbf2U, 0xa18dd0ebU, 0x62defdc0U, 0x23efe6d9U,
+ 0xbde1bc14U, 0xfcd0a70dU, 0x3f838a26U, 0x7eb2913fU, 0xb924d070U,
+ 0xf815cb69U, 0x3b46e642U, 0x7a77fd5bU, 0xb56b65dcU, 0xf45a7ec5U,
+ 0x370953eeU, 0x763848f7U, 0xb1ae09b8U, 0xf09f12a1U, 0x33cc3f8aU,
+ 0x72fd2493U
+ },
+ {
+ 0x00000000U, 0x376ac201U, 0x6ed48403U, 0x59be4602U, 0xdca80907U,
+ 0xebc2cb06U, 0xb27c8d04U, 0x85164f05U, 0xb851130eU, 0x8f3bd10fU,
+ 0xd685970dU, 0xe1ef550cU, 0x64f91a09U, 0x5393d808U, 0x0a2d9e0aU,
+ 0x3d475c0bU, 0x70a3261cU, 0x47c9e41dU, 0x1e77a21fU, 0x291d601eU,
+ 0xac0b2f1bU, 0x9b61ed1aU, 0xc2dfab18U, 0xf5b56919U, 0xc8f23512U,
+ 0xff98f713U, 0xa626b111U, 0x914c7310U, 0x145a3c15U, 0x2330fe14U,
+ 0x7a8eb816U, 0x4de47a17U, 0xe0464d38U, 0xd72c8f39U, 0x8e92c93bU,
+ 0xb9f80b3aU, 0x3cee443fU, 0x0b84863eU, 0x523ac03cU, 0x6550023dU,
+ 0x58175e36U, 0x6f7d9c37U, 0x36c3da35U, 0x01a91834U, 0x84bf5731U,
+ 0xb3d59530U, 0xea6bd332U, 0xdd011133U, 0x90e56b24U, 0xa78fa925U,
+ 0xfe31ef27U, 0xc95b2d26U, 0x4c4d6223U, 0x7b27a022U, 0x2299e620U,
+ 0x15f32421U, 0x28b4782aU, 0x1fdeba2bU, 0x4660fc29U, 0x710a3e28U,
+ 0xf41c712dU, 0xc376b32cU, 0x9ac8f52eU, 0xada2372fU, 0xc08d9a70U,
+ 0xf7e75871U, 0xae591e73U, 0x9933dc72U, 0x1c259377U, 0x2b4f5176U,
+ 0x72f11774U, 0x459bd575U, 0x78dc897eU, 0x4fb64b7fU, 0x16080d7dU,
+ 0x2162cf7cU, 0xa4748079U, 0x931e4278U, 0xcaa0047aU, 0xfdcac67bU,
+ 0xb02ebc6cU, 0x87447e6dU, 0xdefa386fU, 0xe990fa6eU, 0x6c86b56bU,
+ 0x5bec776aU, 0x02523168U, 0x3538f369U, 0x087faf62U, 0x3f156d63U,
+ 0x66ab2b61U, 0x51c1e960U, 0xd4d7a665U, 0xe3bd6464U, 0xba032266U,
+ 0x8d69e067U, 0x20cbd748U, 0x17a11549U, 0x4e1f534bU, 0x7975914aU,
+ 0xfc63de4fU, 0xcb091c4eU, 0x92b75a4cU, 0xa5dd984dU, 0x989ac446U,
+ 0xaff00647U, 0xf64e4045U, 0xc1248244U, 0x4432cd41U, 0x73580f40U,
+ 0x2ae64942U, 0x1d8c8b43U, 0x5068f154U, 0x67023355U, 0x3ebc7557U,
+ 0x09d6b756U, 0x8cc0f853U, 0xbbaa3a52U, 0xe2147c50U, 0xd57ebe51U,
+ 0xe839e25aU, 0xdf53205bU, 0x86ed6659U, 0xb187a458U, 0x3491eb5dU,
+ 0x03fb295cU, 0x5a456f5eU, 0x6d2fad5fU, 0x801b35e1U, 0xb771f7e0U,
+ 0xeecfb1e2U, 0xd9a573e3U, 0x5cb33ce6U, 0x6bd9fee7U, 0x3267b8e5U,
+ 0x050d7ae4U, 0x384a26efU, 0x0f20e4eeU, 0x569ea2ecU, 0x61f460edU,
+ 0xe4e22fe8U, 0xd388ede9U, 0x8a36abebU, 0xbd5c69eaU, 0xf0b813fdU,
+ 0xc7d2d1fcU, 0x9e6c97feU, 0xa90655ffU, 0x2c101afaU, 0x1b7ad8fbU,
+ 0x42c49ef9U, 0x75ae5cf8U, 0x48e900f3U, 0x7f83c2f2U, 0x263d84f0U,
+ 0x115746f1U, 0x944109f4U, 0xa32bcbf5U, 0xfa958df7U, 0xcdff4ff6U,
+ 0x605d78d9U, 0x5737bad8U, 0x0e89fcdaU, 0x39e33edbU, 0xbcf571deU,
+ 0x8b9fb3dfU, 0xd221f5ddU, 0xe54b37dcU, 0xd80c6bd7U, 0xef66a9d6U,
+ 0xb6d8efd4U, 0x81b22dd5U, 0x04a462d0U, 0x33cea0d1U, 0x6a70e6d3U,
+ 0x5d1a24d2U, 0x10fe5ec5U, 0x27949cc4U, 0x7e2adac6U, 0x494018c7U,
+ 0xcc5657c2U, 0xfb3c95c3U, 0xa282d3c1U, 0x95e811c0U, 0xa8af4dcbU,
+ 0x9fc58fcaU, 0xc67bc9c8U, 0xf1110bc9U, 0x740744ccU, 0x436d86cdU,
+ 0x1ad3c0cfU, 0x2db902ceU, 0x4096af91U, 0x77fc6d90U, 0x2e422b92U,
+ 0x1928e993U, 0x9c3ea696U, 0xab546497U, 0xf2ea2295U, 0xc580e094U,
+ 0xf8c7bc9fU, 0xcfad7e9eU, 0x9613389cU, 0xa179fa9dU, 0x246fb598U,
+ 0x13057799U, 0x4abb319bU, 0x7dd1f39aU, 0x3035898dU, 0x075f4b8cU,
+ 0x5ee10d8eU, 0x698bcf8fU, 0xec9d808aU, 0xdbf7428bU, 0x82490489U,
+ 0xb523c688U, 0x88649a83U, 0xbf0e5882U, 0xe6b01e80U, 0xd1dadc81U,
+ 0x54cc9384U, 0x63a65185U, 0x3a181787U, 0x0d72d586U, 0xa0d0e2a9U,
+ 0x97ba20a8U, 0xce0466aaU, 0xf96ea4abU, 0x7c78ebaeU, 0x4b1229afU,
+ 0x12ac6fadU, 0x25c6adacU, 0x1881f1a7U, 0x2feb33a6U, 0x765575a4U,
+ 0x413fb7a5U, 0xc429f8a0U, 0xf3433aa1U, 0xaafd7ca3U, 0x9d97bea2U,
+ 0xd073c4b5U, 0xe71906b4U, 0xbea740b6U, 0x89cd82b7U, 0x0cdbcdb2U,
+ 0x3bb10fb3U, 0x620f49b1U, 0x55658bb0U, 0x6822d7bbU, 0x5f4815baU,
+ 0x06f653b8U, 0x319c91b9U, 0xb48adebcU, 0x83e01cbdU, 0xda5e5abfU,
+ 0xed3498beU
+ },
+ {
+ 0x00000000U, 0x6567bcb8U, 0x8bc809aaU, 0xeeafb512U, 0x5797628fU,
+ 0x32f0de37U, 0xdc5f6b25U, 0xb938d79dU, 0xef28b4c5U, 0x8a4f087dU,
+ 0x64e0bd6fU, 0x018701d7U, 0xb8bfd64aU, 0xddd86af2U, 0x3377dfe0U,
+ 0x56106358U, 0x9f571950U, 0xfa30a5e8U, 0x149f10faU, 0x71f8ac42U,
+ 0xc8c07bdfU, 0xada7c767U, 0x43087275U, 0x266fcecdU, 0x707fad95U,
+ 0x1518112dU, 0xfbb7a43fU, 0x9ed01887U, 0x27e8cf1aU, 0x428f73a2U,
+ 0xac20c6b0U, 0xc9477a08U, 0x3eaf32a0U, 0x5bc88e18U, 0xb5673b0aU,
+ 0xd00087b2U, 0x6938502fU, 0x0c5fec97U, 0xe2f05985U, 0x8797e53dU,
+ 0xd1878665U, 0xb4e03addU, 0x5a4f8fcfU, 0x3f283377U, 0x8610e4eaU,
+ 0xe3775852U, 0x0dd8ed40U, 0x68bf51f8U, 0xa1f82bf0U, 0xc49f9748U,
+ 0x2a30225aU, 0x4f579ee2U, 0xf66f497fU, 0x9308f5c7U, 0x7da740d5U,
+ 0x18c0fc6dU, 0x4ed09f35U, 0x2bb7238dU, 0xc518969fU, 0xa07f2a27U,
+ 0x1947fdbaU, 0x7c204102U, 0x928ff410U, 0xf7e848a8U, 0x3d58149bU,
+ 0x583fa823U, 0xb6901d31U, 0xd3f7a189U, 0x6acf7614U, 0x0fa8caacU,
+ 0xe1077fbeU, 0x8460c306U, 0xd270a05eU, 0xb7171ce6U, 0x59b8a9f4U,
+ 0x3cdf154cU, 0x85e7c2d1U, 0xe0807e69U, 0x0e2fcb7bU, 0x6b4877c3U,
+ 0xa20f0dcbU, 0xc768b173U, 0x29c70461U, 0x4ca0b8d9U, 0xf5986f44U,
+ 0x90ffd3fcU, 0x7e5066eeU, 0x1b37da56U, 0x4d27b90eU, 0x284005b6U,
+ 0xc6efb0a4U, 0xa3880c1cU, 0x1ab0db81U, 0x7fd76739U, 0x9178d22bU,
+ 0xf41f6e93U, 0x03f7263bU, 0x66909a83U, 0x883f2f91U, 0xed589329U,
+ 0x546044b4U, 0x3107f80cU, 0xdfa84d1eU, 0xbacff1a6U, 0xecdf92feU,
+ 0x89b82e46U, 0x67179b54U, 0x027027ecU, 0xbb48f071U, 0xde2f4cc9U,
+ 0x3080f9dbU, 0x55e74563U, 0x9ca03f6bU, 0xf9c783d3U, 0x176836c1U,
+ 0x720f8a79U, 0xcb375de4U, 0xae50e15cU, 0x40ff544eU, 0x2598e8f6U,
+ 0x73888baeU, 0x16ef3716U, 0xf8408204U, 0x9d273ebcU, 0x241fe921U,
+ 0x41785599U, 0xafd7e08bU, 0xcab05c33U, 0x3bb659edU, 0x5ed1e555U,
+ 0xb07e5047U, 0xd519ecffU, 0x6c213b62U, 0x094687daU, 0xe7e932c8U,
+ 0x828e8e70U, 0xd49eed28U, 0xb1f95190U, 0x5f56e482U, 0x3a31583aU,
+ 0x83098fa7U, 0xe66e331fU, 0x08c1860dU, 0x6da63ab5U, 0xa4e140bdU,
+ 0xc186fc05U, 0x2f294917U, 0x4a4ef5afU, 0xf3762232U, 0x96119e8aU,
+ 0x78be2b98U, 0x1dd99720U, 0x4bc9f478U, 0x2eae48c0U, 0xc001fdd2U,
+ 0xa566416aU, 0x1c5e96f7U, 0x79392a4fU, 0x97969f5dU, 0xf2f123e5U,
+ 0x05196b4dU, 0x607ed7f5U, 0x8ed162e7U, 0xebb6de5fU, 0x528e09c2U,
+ 0x37e9b57aU, 0xd9460068U, 0xbc21bcd0U, 0xea31df88U, 0x8f566330U,
+ 0x61f9d622U, 0x049e6a9aU, 0xbda6bd07U, 0xd8c101bfU, 0x366eb4adU,
+ 0x53090815U, 0x9a4e721dU, 0xff29cea5U, 0x11867bb7U, 0x74e1c70fU,
+ 0xcdd91092U, 0xa8beac2aU, 0x46111938U, 0x2376a580U, 0x7566c6d8U,
+ 0x10017a60U, 0xfeaecf72U, 0x9bc973caU, 0x22f1a457U, 0x479618efU,
+ 0xa939adfdU, 0xcc5e1145U, 0x06ee4d76U, 0x6389f1ceU, 0x8d2644dcU,
+ 0xe841f864U, 0x51792ff9U, 0x341e9341U, 0xdab12653U, 0xbfd69aebU,
+ 0xe9c6f9b3U, 0x8ca1450bU, 0x620ef019U, 0x07694ca1U, 0xbe519b3cU,
+ 0xdb362784U, 0x35999296U, 0x50fe2e2eU, 0x99b95426U, 0xfcdee89eU,
+ 0x12715d8cU, 0x7716e134U, 0xce2e36a9U, 0xab498a11U, 0x45e63f03U,
+ 0x208183bbU, 0x7691e0e3U, 0x13f65c5bU, 0xfd59e949U, 0x983e55f1U,
+ 0x2106826cU, 0x44613ed4U, 0xaace8bc6U, 0xcfa9377eU, 0x38417fd6U,
+ 0x5d26c36eU, 0xb389767cU, 0xd6eecac4U, 0x6fd61d59U, 0x0ab1a1e1U,
+ 0xe41e14f3U, 0x8179a84bU, 0xd769cb13U, 0xb20e77abU, 0x5ca1c2b9U,
+ 0x39c67e01U, 0x80fea99cU, 0xe5991524U, 0x0b36a036U, 0x6e511c8eU,
+ 0xa7166686U, 0xc271da3eU, 0x2cde6f2cU, 0x49b9d394U, 0xf0810409U,
+ 0x95e6b8b1U, 0x7b490da3U, 0x1e2eb11bU, 0x483ed243U, 0x2d596efbU,
+ 0xc3f6dbe9U, 0xa6916751U, 0x1fa9b0ccU, 0x7ace0c74U, 0x9461b966U,
+ 0xf10605deU
+#endif
+ }
+};
+
+#ifdef NO_ENDIAN
+// Currently not in use, always use the BYFOUR method with known endianness
+/* ========================================================================= */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+/*
+ * Byte-at-a-time CRC-32 (only compiled when NO_ENDIAN is defined).
+ *
+ * crc: running CRC of the data processed so far.
+ * buf: data to add to the checksum; if NULL, 0 is returned.
+ * len: number of bytes at buf.
+ *
+ * Returns the updated CRC. The value is inverted on entry and on exit,
+ * the table lookups (DO1/DO8) work on the inverted form.
+ *
+ * Written with a prototype instead of the old K&R parameter list, which
+ * is obsolescent and no longer valid in C23.
+ */
+uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+	if (buf == NULL) return 0;
+
+	crc = crc ^ 0xffffffffU;
+	// Unrolled main loop, eight bytes per iteration
+	while (len >= 8) {
+		DO8;
+		len -= 8;
+	}
+	// Remaining 0..7 bytes
+	if (len) do {
+		DO1;
+	} while (--len);
+	return crc ^ 0xffffffffU;
+}
+#endif
+
+#ifdef BYFOUR
+
+/*
+ This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
+ integer pointer type. This violates the strict aliasing rule, where a
+ compiler can assume, for optimization purposes, that two pointers to
+ fundamentally different types won't ever point to the same memory. This can
+ manifest as a problem only if one of the pointers is written to. This code
+ only reads from those pointers. So long as this code remains isolated in
+ this compilation unit, there won't be a problem. For this reason, this code
+ should not be copied and pasted into a compilation unit in which other code
+ writes to the buffer that is passed to these routines.
+ */
+
+#ifdef LITTLE_ENDIAN
+/* ========================================================================= */
+#define DOLIT4 c ^= *buf4++; \
+ c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+ crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* ========================================================================= */
+/*
+ * CRC-32, four bytes at a time, little-endian variant.
+ *
+ * crc: running CRC of the data processed so far.
+ * buf: data to add to the checksum; if NULL, 0 is returned.
+ * len: number of bytes at buf.
+ *
+ * Returns the updated CRC.
+ *
+ * Written with a prototype instead of the old K&R parameter list, which
+ * is obsolescent and no longer valid in C23.
+ */
+uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+	if (buf == NULL) return 0;
+	register uint32_t c;
+	register const uint32_t FAR *buf4;
+
+	c = ~crc;
+	// Go byte-wise until buf is 4-byte aligned (or the input is used up)
+	while (len && ((uintptr_t)buf & 3)) {
+		c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+		len--;
+	}
+
+	// Aligned part: read whole 32-bit words, 32 bytes per unrolled round first
+	buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+	while (len >= 32) {
+		DOLIT32;
+		len -= 32;
+	}
+	while (len >= 4) {
+		DOLIT4;
+		len -= 4;
+	}
+	buf = (const uint8_t FAR *)buf4;
+
+	// Trailing 0..3 bytes
+	if (len) do {
+		c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+	} while (--len);
+	c = ~c;
+	return c;
+}
+#endif
+
+#ifdef BIG_ENDIAN
+/* ========================================================================= */
+#define DOBIG4 c ^= *buf4++; \
+ c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+ crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* ========================================================================= */
+/*
+ * CRC-32, four bytes at a time, big-endian variant.
+ *
+ * crc: running CRC of the data processed so far.
+ * buf: data to add to the checksum; if NULL, 0 is returned.
+ * len: number of bytes at buf.
+ *
+ * Returns the updated CRC. The working value is passed through
+ * net_order_32() on entry and on exit (presumably a byte swap between
+ * host and CRC byte order -- defined elsewhere, confirm there).
+ *
+ * Written with a prototype instead of the old K&R parameter list, which
+ * is obsolescent and no longer valid in C23.
+ */
+uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len)
+{
+	if (buf == NULL) return 0;
+	register uint32_t c;
+	register const uint32_t FAR *buf4;
+
+	c = ~net_order_32(crc);
+	// Go byte-wise until buf is 4-byte aligned (or the input is used up)
+	while (len && ((uintptr_t)buf & 3)) {
+		c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+		len--;
+	}
+
+	// Aligned part: read whole 32-bit words, 32 bytes per unrolled round first
+	buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+	while (len >= 32) {
+		DOBIG32;
+		len -= 32;
+	}
+	while (len >= 4) {
+		DOBIG4;
+		len -= 4;
+	}
+	buf = (const uint8_t FAR *)buf4;
+
+	// Trailing 0..3 bytes
+	if (len) do {
+		c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+	} while (--len);
+	c = ~c;
+	return net_order_32(c);
+}
+#endif
+
+#endif /* BYFOUR */
+
diff --git a/src/shared/crc32.h b/src/shared/crc32.h
new file mode 100644
index 0000000..00b8bdd
--- /dev/null
+++ b/src/shared/crc32.h
@@ -0,0 +1,9 @@
+#ifndef _CRC32_H_
+#define _CRC32_H_
+
+#include <stddef.h> // size_t -- <stdint.h> alone is not guaranteed to declare it
+#include <stdint.h>
+
+/*
+ * Update a running CRC-32 with len bytes from buf and return the new value.
+ * Returns 0 if buf is NULL.
+ */
+uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len);
+
+#endif
+
diff --git a/src/shared/fdsignal.c b/src/shared/fdsignal.c
new file mode 100644
index 0000000..5e5cf7f
--- /dev/null
+++ b/src/shared/fdsignal.c
@@ -0,0 +1,14 @@
+#include "fdsignal.h"
+
+#if defined(linux) || defined(__linux) || defined(__linux__)
+//#warning "Using eventfd based signalling"
+#include "fdsignal.inc/eventfd.c"
+#elif __SIZEOF_INT__ == 4 && __SIZEOF_POINTER__ == 8
+//#warning "Using pointer-packing pipe based signalling"
+#include "fdsignal.inc/pipe64.c"
+#else
+_Static_assert( sizeof(int) != 4 || sizeof(void*) != 8, "Something's goofy, fix preprocessor check above!" );
+//#warning "Using fallback pipe based signalling"
+#include "fdsignal.inc/pipe_malloc.c"
+#endif
+
diff --git a/src/shared/fdsignal.h b/src/shared/fdsignal.h
new file mode 100644
index 0000000..960a2a9
--- /dev/null
+++ b/src/shared/fdsignal.h
@@ -0,0 +1,57 @@
+#ifndef _FD_SIGNAL_H_
+#define _FD_SIGNAL_H_
+
+#define SIGNAL_OK (0)
+#define SIGNAL_TIMEOUT (-2)
+#define SIGNAL_ERROR (-1)
+
+typedef struct _dnbd3_signal dnbd3_signal_t;
+
+/**
+ * Create a new signal, nonblocking.
+ * @return NULL on error, pointer to dnbd3_signal_t on success.
+ */
+dnbd3_signal_t* signal_new();
+
+/**
+ * Create a new signal, blocking.
+ * @return NULL on error, pointer to dnbd3_signal_t on success.
+ */
+dnbd3_signal_t* signal_newBlocking();
+
+/**
+ * Trigger the given signal, so a wait or clear call will succeed.
+ * @return SIGNAL_OK on success, SIGNAL_ERROR on error
+ */
+int signal_call(const dnbd3_signal_t* const signal);
+
+/**
+ * Wait for given signal, with an optional timeout.
+ * If timeout == 0, just poll once.
+ * If timeout < 0, wait forever.
+ * @return > 0 telling how many times the signal was called,
+ * SIGNAL_TIMEOUT if the timeout was reached,
+ * SIGNAL_ERROR if some error occurred
+ */
+int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs);
+
+/**
+ * Clears any pending signals on this signal.
+ * @return number of signals that were pending,
+ * SIGNAL_ERROR if some error occurred
+ */
+int signal_clear(const dnbd3_signal_t* const signal);
+
+/**
+ * Close the given signal.
+ */
+void signal_close(const dnbd3_signal_t* const signal);
+
+/**
+ * Get a file descriptor for the given signal that can be
+ * waited on using poll or similar.
+ * @return -1 if the signal is invalid
+ */
+int signal_getWaitFd(const dnbd3_signal_t* const signal);
+
+#endif
diff --git a/src/shared/fdsignal.inc/eventfd.c b/src/shared/fdsignal.inc/eventfd.c
new file mode 100644
index 0000000..358d41c
--- /dev/null
+++ b/src/shared/fdsignal.inc/eventfd.c
@@ -0,0 +1,74 @@
+#include <sys/eventfd.h>
+#include <poll.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <unistd.h>
+
+/*
+ * Linux implementation of signals.
+ * Internally, eventfds are used for signalling, as they
+ * provide the least overhead. We don't allocate any struct
+ * ever, but cast the event fd+1 to dnbd3_signal_t*
+ * to save all the malloc() and free() calls.
+ */
+
+// Create a non-blocking signal. The handle is the eventfd descriptor plus
+// one, disguised as a pointer; no memory is allocated.
+dnbd3_signal_t* signal_new()
+{
+	// eventfd() reports failure as -1, which the +1 below turns into 0,
+	// i.e. NULL (assuming NULL is represented as 0, as it is in practice).
+	const int fd = eventfd( 0, EFD_NONBLOCK );
+	return (dnbd3_signal_t*)(intptr_t)( fd + 1 );
+}
+
+// Create a blocking signal. Same fd+1-as-pointer encoding as signal_new(),
+// so an eventfd() failure (-1) comes out as NULL.
+dnbd3_signal_t* signal_newBlocking()
+{
+	const int fd = eventfd( 0, 0 );
+	return (dnbd3_signal_t*)(intptr_t)( fd + 1 );
+}
+
+// Trigger the signal by adding 1 to the eventfd counter.
+// Returns SIGNAL_OK if the full 8 byte value was written, SIGNAL_ERROR otherwise.
+int signal_call(const dnbd3_signal_t* const signal)
+{
+	static const uint64_t increment = 1;
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	const int fd = (int)(intptr_t)signal - 1; // undo the +1 applied on creation
+	const ssize_t written = write( fd, &increment, sizeof increment );
+	if ( written != (ssize_t)sizeof increment ) return SIGNAL_ERROR;
+	return SIGNAL_OK;
+}
+
+// Wait up to timeoutMs for the signal to become readable, then consume it.
+// Returns SIGNAL_TIMEOUT if poll() timed out, SIGNAL_ERROR if poll() failed
+// or flagged the fd as bad, otherwise whatever signal_clear() returns.
+int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs)
+{
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	struct pollfd pfd = {
+		.fd = (int)(intptr_t)signal - 1,
+		.events = POLLIN
+	};
+	switch ( poll( &pfd, 1, timeoutMs ) ) {
+	case 0:
+		return SIGNAL_TIMEOUT;
+	case -1:
+		return SIGNAL_ERROR;
+	default:
+		break;
+	}
+	if ( ( pfd.revents & ( POLLERR | POLLNVAL ) ) != 0 ) return SIGNAL_ERROR;
+	return signal_clear( signal );
+}
+
+// Read (and thereby reset) the eventfd counter.
+// Returns the counter value, 0 if the read failed with EAGAIN,
+// SIGNAL_ERROR on any other failure.
+int signal_clear(const dnbd3_signal_t* const signal)
+{
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	uint64_t counter;
+	const int fd = (int)(intptr_t)signal - 1;
+	const ssize_t got = read( fd, &counter, sizeof counter );
+	if ( got == (ssize_t)sizeof counter ) return (int)counter;
+	return errno == EAGAIN ? 0 : SIGNAL_ERROR;
+}
+
+// Close the eventfd behind the given signal. A NULL signal is ignored,
+// like in the other functions of this file; previously it resulted in
+// close( -1 ), which fails and overwrites errno.
+void signal_close(const dnbd3_signal_t* const signal)
+{
+	if ( signal == NULL ) return;
+	const int signalFd = ( (int)(intptr_t)signal ) - 1;
+	close( signalFd );
+}
+
+// Return the eventfd descriptor encoded in the handle, or -1 for NULL.
+int signal_getWaitFd(const dnbd3_signal_t* const signal)
+{
+	return signal == NULL ? -1 : (int)(intptr_t)signal - 1;
+}
+
diff --git a/src/shared/fdsignal.inc/pipe64.c b/src/shared/fdsignal.inc/pipe64.c
new file mode 100644
index 0000000..4f0614b
--- /dev/null
+++ b/src/shared/fdsignal.inc/pipe64.c
@@ -0,0 +1,88 @@
+#include <poll.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define P_READ (0)
+#define P_WRITE (1)
+
+/*
+ * Generic (posix) implementation of signals, using pipes.
+ * 64bit version, packing two ints into a pointer.
+ * This version requires that you use -fno-strict-aliasing
+ * since it's doing evil pointer casting.
+ */
+
+// Create a non-blocking signal backed by a pipe. Both descriptors are
+// packed into the returned pointer value. Returns NULL if pipe() fails.
+dnbd3_signal_t* signal_new()
+{
+	int pipeFds[2];
+	if ( pipe( pipeFds ) == -1 ) return NULL;
+	// Read end first, then write end
+	for ( int end = P_READ; end <= P_WRITE; ++end ) {
+		fcntl( pipeFds[end], F_SETFL, O_NONBLOCK );
+	}
+	return (dnbd3_signal_t*)*((uintptr_t*)pipeFds);
+}
+
+// Create a blocking signal backed by a pipe; both descriptors are packed
+// into the returned pointer value. Returns NULL if pipe() fails.
+dnbd3_signal_t* signal_newBlocking()
+{
+	int pipeFds[2];
+	if ( pipe( pipeFds ) != -1 ) {
+		return (dnbd3_signal_t*)*((uintptr_t*)pipeFds);
+	}
+	return NULL;
+}
+
+// Trigger the signal by writing a single byte to the pipe, so that the
+// number of bytes a reader finds equals the number of calls.
+int signal_call(const dnbd3_signal_t* const signal)
+{
+	static char one = 1;
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	// The pointer value itself holds the two descriptors
+	const int writeFd = ( (const int*)&signal )[P_WRITE];
+	if ( write( writeFd, &one, 1 ) > 0 ) return SIGNAL_OK;
+	return SIGNAL_ERROR;
+}
+
+// Wait up to timeoutMs for the read end of the pipe to become readable,
+// then drain it. Returns SIGNAL_TIMEOUT if poll() timed out, SIGNAL_ERROR
+// if poll() failed or flagged the fd as bad, otherwise the result of
+// signal_clear().
+int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs)
+{
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	struct pollfd pfd = {
+		.fd = ( (const int*)&signal )[P_READ],
+		.events = POLLIN
+	};
+	switch ( poll( &pfd, 1, timeoutMs ) ) {
+	case 0:
+		return SIGNAL_TIMEOUT;
+	case -1:
+		return SIGNAL_ERROR;
+	default:
+		break;
+	}
+	if ( ( pfd.revents & ( POLLERR | POLLNVAL ) ) != 0 ) return SIGNAL_ERROR;
+	return signal_clear( signal );
+}
+
+// Drain the pipe and return how many bytes (= pending calls) were read.
+// EAGAIN ends the drain and yields the count so far; any other read
+// error yields SIGNAL_ERROR.
+int signal_clear(const dnbd3_signal_t* const signal)
+{
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	const int readFd = ( (const int*)&signal )[P_READ];
+	char sink[100];
+	ssize_t drained = 0;
+	for ( ;; ) {
+		const ssize_t got = read( readFd, sink, sizeof sink );
+		if ( got < 0 ) {
+			return errno == EAGAIN ? (int)drained : SIGNAL_ERROR;
+		}
+		drained += got;
+		// A short read means the pipe is empty for now
+		if ( (size_t)got != sizeof sink ) break;
+	}
+	return (int)drained;
+}
+
+// Close both ends of the pipe behind the given signal.
+void signal_close(const dnbd3_signal_t* const signal)
+{
+	// A NULL handle unpacks to the descriptor pair {0, 0}; without this
+	// guard we would close fd 0 (stdin) of the process.
+	if ( signal == NULL ) return;
+	const int* fds = (int*)&signal;
+	close( fds[P_READ] );
+	close( fds[P_WRITE] );
+}
+
+// Return the read end of the pipe (the fd to poll on), or -1 for NULL.
+int signal_getWaitFd(const dnbd3_signal_t* const signal)
+{
+	return signal == NULL ? -1 : ( (const int*)&signal )[P_READ];
+}
+
diff --git a/src/shared/fdsignal.inc/pipe_malloc.c b/src/shared/fdsignal.inc/pipe_malloc.c
new file mode 100644
index 0000000..b23ddcd
--- /dev/null
+++ b/src/shared/fdsignal.inc/pipe_malloc.c
@@ -0,0 +1,89 @@
+#include <poll.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+struct _dnbd3_signal { // one heap-allocated instance per signal, holds both pipe descriptors
+ int read; // read end of the pipe (fds[0]); this is the fd that gets polled and drained
+ int write; // write end of the pipe (fds[1]); signal_call() writes one byte here
+};
+
+/*
+ * Generic (posix) implementation of signals, using pipes.
+ * A struct containing both fds will be malloc()ed for each
+ * signal.
+ */
+
+// Create a non-blocking signal: build a blocking one, then switch both
+// pipe ends to O_NONBLOCK. Returns NULL if creation failed.
+dnbd3_signal_t* signal_new()
+{
+	dnbd3_signal_t *sig = signal_newBlocking();
+	if ( sig != NULL ) {
+		fcntl( sig->read, F_SETFL, O_NONBLOCK );
+		fcntl( sig->write, F_SETFL, O_NONBLOCK );
+	}
+	return sig;
+}
+
+// Create a blocking signal backed by a pipe.
+// Returns a malloc()ed handle holding both descriptors, or NULL if
+// either pipe() or malloc() fails.
+dnbd3_signal_t* signal_newBlocking()
+{
+	int fds[2];
+	if ( pipe( fds ) == -1 ) return NULL;
+	dnbd3_signal_t* ret = malloc( sizeof(*ret) );
+	if ( ret == NULL ) {
+		// Out of memory: don't leak the freshly created pipe
+		close( fds[0] );
+		close( fds[1] );
+		return NULL;
+	}
+	ret->read = fds[0];
+	ret->write = fds[1];
+	return ret;
+}
+
+// Trigger the signal by writing a single byte to the pipe, so that the
+// number of bytes a reader finds equals the number of calls.
+int signal_call(const dnbd3_signal_t* const signal)
+{
+	static char one = 1;
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	const ssize_t written = write( signal->write, &one, 1 );
+	if ( written > 0 ) return SIGNAL_OK;
+	return SIGNAL_ERROR;
+}
+
+// Wait up to timeoutMs for the read end of the pipe to become readable,
+// then drain it. Returns SIGNAL_TIMEOUT if poll() timed out, SIGNAL_ERROR
+// if poll() failed or flagged the fd as bad, otherwise the result of
+// signal_clear().
+int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs)
+{
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	struct pollfd pfd = {
+		.fd = signal->read,
+		.events = POLLIN
+	};
+	switch ( poll( &pfd, 1, timeoutMs ) ) {
+	case 0:
+		return SIGNAL_TIMEOUT;
+	case -1:
+		return SIGNAL_ERROR;
+	default:
+		break;
+	}
+	if ( ( pfd.revents & ( POLLERR | POLLNVAL ) ) != 0 ) return SIGNAL_ERROR;
+	return signal_clear( signal );
+}
+
+// Drain the pipe and return how many bytes (= pending calls) were read.
+// EAGAIN ends the drain and yields the count so far; any other read
+// error yields SIGNAL_ERROR.
+int signal_clear(const dnbd3_signal_t* const signal)
+{
+	if ( signal == NULL ) return SIGNAL_ERROR;
+	char sink[100];
+	ssize_t drained = 0;
+	for ( ;; ) {
+		const ssize_t got = read( signal->read, sink, sizeof sink );
+		if ( got < 0 ) {
+			return errno == EAGAIN ? (int)drained : SIGNAL_ERROR;
+		}
+		drained += got;
+		// A short read means the pipe is empty for now
+		if ( (size_t)got != sizeof sink ) break;
+	}
+	return (int)drained;
+}
+
+// Close both pipe ends and release the handle. A NULL signal is ignored
+// (it used to be dereferenced), matching the NULL handling of the other
+// functions in this file.
+void signal_close(const dnbd3_signal_t* const signal)
+{
+	if ( signal == NULL ) return;
+	close( signal->read );
+	close( signal->write );
+	free( (void*)signal );
+}
+
+// Return the read end of the pipe (the fd to poll on), or -1 for NULL.
+int signal_getWaitFd(const dnbd3_signal_t* const signal)
+{
+	return signal == NULL ? -1 : signal->read;
+}
+
diff --git a/src/shared/log.c b/src/shared/log.c
new file mode 100644
index 0000000..055acb4
--- /dev/null
+++ b/src/shared/log.c
@@ -0,0 +1,204 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Simon Rettberg
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "log.h"
+#include <stdarg.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+
+#define LINE_LEN (800)
+
+static pthread_mutex_t logLock = PTHREAD_MUTEX_INITIALIZER;
+static _Atomic logmask_t maskFile = 31;
+static _Atomic logmask_t maskCon = 15;
+
+static char *logFile = NULL;
+static int logFd = -1;
+
+static bool consoleTimestamps = false;
+
+
+static int writeLevel(char *buffer, logmask_t level);
+
+
+// True if every bit of mask is enabled in the file mask, the console
+// mask, or the two combined.
+bool log_hasMask(const logmask_t mask)
+{
+	const logmask_t active = maskFile | maskCon;
+	return ( active & mask ) == mask;
+}
+
+void log_setFileMask(logmask_t mask)
+{
+ maskFile = mask; // set of levels written to the log file; maskFile is _Atomic, so no lock is taken
+}
+
+void log_setConsoleMask(logmask_t mask)
+{
+ maskCon = mask; // set of levels printed to the console; maskCon is _Atomic, so no lock is taken
+}
+
+void log_setConsoleTimestamps(bool on)
+{
+ consoleTimestamps = on; // when true, logadd() prefixes console lines with the timestamp too (plain bool, not atomic)
+}
+
+// Open or reopen the log file.
+// path: file to log to; if NULL, the path passed on an earlier call is
+//       used again.
+// Returns true if the log file is open afterwards, false if there is no
+// path to open, the path could not be duplicated, or open() failed.
+bool log_openLogFile(const char *path)
+{
+	bool ok;
+	pthread_mutex_lock( &logLock );
+	if ( logFd >= 0 ) {
+		close( logFd );
+		// Forget the closed descriptor. Otherwise a failed reopen would
+		// leave a stale fd number behind that logadd() keeps writing to
+		// and that makes this function report success.
+		logFd = -1;
+	}
+	if ( path != NULL ) {
+		free( logFile );
+		logFile = strdup( path ); // NULL if out of memory, handled below
+	}
+	if ( logFile != NULL ) {
+		logFd = open( logFile, O_WRONLY | O_CREAT | O_APPEND, 0644 );
+	}
+	// Take the result while still holding the lock
+	ok = logFd >= 0;
+	pthread_mutex_unlock( &logLock );
+	return ok;
+}
+
+void logadd(const logmask_t mask, const char *fmt, ...) // format one printf-style log line and send it to the log file and/or console, depending on the masks
+{
+ if ( ( (maskFile | maskCon) & mask ) == 0 ) // neither file nor console is interested in this level
+ return;
+ va_list ap;
+ int ret;
+ time_t rawtime;
+ struct tm timeinfo;
+ char buffer[LINE_LEN];
+ bool toFile = maskFile & mask;
+ bool toStdout = maskCon & mask;
+ size_t offset;
+
+ if ( toFile || ( toStdout && consoleTimestamps ) ) { // the timestamp is only rendered if some output will show it
+ time( &rawtime );
+ localtime_r( &rawtime, &timeinfo );
+ offset = strftime( buffer, LINE_LEN, "[%d.%m. %H:%M:%S] ", &timeinfo );
+ } else {
+ offset = 0;
+ }
+ const char *stdoutLine = buffer + offset; // console line starts after the timestamp by default
+ offset += writeLevel( buffer + offset, mask ); // level prefix, e.g. "ERROR: "
+ va_start( ap, fmt );
+ ret = vsnprintf( buffer + offset, LINE_LEN - offset, fmt, ap );
+ va_end( ap );
+ if ( ret < 0 ) return;
+ offset += ret; // ret is the untruncated length, so offset may now point past the buffer
+ if ( offset + 1 >= LINE_LEN ) { // clamp, leaving room for a newline plus the terminator
+ buffer[LINE_LEN-2] = '\0';
+ offset = LINE_LEN - 2;
+ }
+ if ( buffer[offset-1] != '\n' ) { // make sure the line ends with a newline
+ buffer[offset++] = '\n';
+ buffer[offset] = '\0';
+ }
+ if ( toFile ) {
+ pthread_mutex_lock( &logLock ); // logFd is only accessed while holding logLock
+ if ( logFd >= 0 ) {
+ size_t done = 0;
+ while (done < offset ) { // keep writing until the whole line is out
+ const ssize_t wr = write( logFd, buffer + done, offset - done );
+ if ( wr < 0 ) {
+ if ( errno == EINTR ) continue; // interrupted, just retry
+ printf( "Logging to file failed! (errno=%d)\n", errno );
+ break;
+ }
+ done += (size_t)wr;
+ }
+ }
+ pthread_mutex_unlock( &logLock );
+ }
+ if ( toStdout ) {
+ if ( consoleTimestamps ) stdoutLine = buffer; // include the timestamp on the console as well
+#ifdef AFL_MODE
+ fputs( stdoutLine, stderr );
+ fflush( stderr );
+#else
+ fputs( stdoutLine, stdout );
+ fflush( stdout );
+#endif
+ }
+}
+
+// Copy the tail of the log file into buffer as a NUL-terminated string.
+// buffer: destination, must have room for size bytes.
+// size:   capacity of buffer; at most size - 1 bytes of log are returned.
+// Returns the number of bytes copied (without the terminator), or -1 if
+// no log file is set, the arguments are unusable, or reading failed.
+ssize_t log_fetch(char *buffer, int size)
+{
+	if ( logFile == NULL || buffer == NULL || size <= 1 )
+		return -1;
+	int fd = open( logFile, O_RDONLY );
+	if ( fd < 0 )
+		return -1;
+	const off_t end = lseek( fd, 0, SEEK_END );
+	if ( end == (off_t)-1 ) {
+		close( fd );
+		return -1;
+	}
+	// One byte of the buffer is reserved for the terminator. Start exactly
+	// that many bytes before the end, so the very last byte of the log is
+	// included (starting size bytes back used to cut it off).
+	const off_t want = (off_t)size - 1;
+	const off_t start = end > want ? end - want : 0;
+	ssize_t ret = pread( fd, buffer, (size_t)want, start );
+	close( fd );
+	if ( ret < 0 ) // must not be used as an index below
+		return -1;
+	buffer[ret] = '\0';
+	return ret;
+}
+
+// Write the textual name of the given log level, followed by ": ", to
+// buffer. No terminating NUL is written after the trailing space.
+// Returns the number of characters written.
+static int writeLevel(char *buffer, logmask_t level)
+{
+	static const struct {
+		logmask_t level;
+		const char *label;
+	} names[] = {
+		{ LOG_ERROR, "ERROR" },
+		{ LOG_WARNING, "WARNING" },
+		{ LOG_MINOR, "Warning" },
+		{ LOG_INFO, "Info" },
+		{ LOG_DEBUG1, "DEBUG1" },
+		{ LOG_DEBUG2, "DEBUG2" },
+	};
+	// Anything that is not exactly one known level gets the fallback label
+	const char *label = "!?!?!?";
+	for ( size_t i = 0; i < sizeof(names) / sizeof(names[0]); ++i ) {
+		if ( names[i].level == level ) {
+			label = names[i].label;
+			break;
+		}
+	}
+	const size_t labelLen = strlen( label );
+	memcpy( buffer, label, labelLen );
+	buffer[labelLen] = ':';
+	buffer[labelLen + 1] = ' ';
+	return (int)( labelLen + 2 );
+}
+
diff --git a/src/shared/log.h b/src/shared/log.h
new file mode 100644
index 0000000..5b1e8f7
--- /dev/null
+++ b/src/shared/log.h
@@ -0,0 +1,65 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Simon Rettberg
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef LOG_H_
+#define LOG_H_
+
+#include <stdbool.h>
+#include <unistd.h>
+
+typedef unsigned int logmask_t;
+#define LOG_ERROR ((logmask_t)1) // Fatal error, server will terminate
+#define LOG_WARNING ((logmask_t)2) // Major issue, something is broken but keep running
+#define LOG_MINOR ((logmask_t)4) // Minor issue, more of a hickup than serious problem
+#define LOG_INFO ((logmask_t)8) // Informational message
+#define LOG_DEBUG1 ((logmask_t)16) // Debug information, use this for non-spammy stuff
+#define LOG_DEBUG2 ((logmask_t)32) // Use this for debug messages that will show up a lot
+
+
+/**
+ * Check if consoleMask | fileMask has all of mask set.
+ */
+bool log_hasMask(const logmask_t mask);
+
+void log_setFileMask(logmask_t mask);
+
+void log_setConsoleMask(logmask_t mask);
+
+void log_setConsoleTimestamps(bool on);
+
+/**
+ * Open or reopen the log file. If path is NULL and the
+ * function was called with a path before, the same path
+ * will be used again.
+ */
+bool log_openLogFile(const char *path);
+
+/**
+ * Add a line to the log
+ */
+void logadd(const logmask_t mask, const char *text, ...)
+ __attribute__ ((format (printf, 2, 3)));
+
+/**
+ * Return last size bytes of log.
+ */
+ssize_t log_fetch(char *buffer, int size);
+
+#endif /* LOG_H_ */
diff --git a/src/shared/protocol.h b/src/shared/protocol.h
new file mode 100644
index 0000000..d87bbd8
--- /dev/null
+++ b/src/shared/protocol.h
@@ -0,0 +1,159 @@
+#ifndef _PROTOCOL_H_
+#define _PROTOCOL_H_
+
+#include "sockhelper.h"
+
+#include "../types.h"
+#include "../serialize.h"
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+
+// Client tells server that it is another server
+#define FLAGS8_SERVER (1)
+// Client (which is a proxy) tells server that it has background-replication enabled
+#define FLAGS8_BG_REP (2)
+
+// 2017-10-16: We now support hop-counting, macro to pass hop count conditionally to a function
+#define COND_HOPCOUNT(vers,hopcount) ( (vers) >= 3 ? (hopcount) : 0 )
+
+// 2017-11-02: Macro to set flags in select image message properly if we're a server, as BG_REP depends on global var
+#define SI_SERVER_FLAGS ( FLAGS8_SERVER | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) )
+
+#define REPLY_OK (0)
+#define REPLY_ERRNO (-1)
+#define REPLY_AGAIN (-2)
+#define REPLY_INTR (-3)
+#define REPLY_CLOSED (-4)
+#define REPLY_INCOMPLETE (-5)
+#define REPLY_WRONGMAGIC (-6)
+
+/**
+ * Read one reply header from sock into *reply.
+ * wait: if true, block until the whole header arrived (MSG_WAITALL),
+ * otherwise only start reading if data is available right now.
+ * Returns REPLY_OK on success, or one of the other REPLY_* codes.
+ */
+static inline int dnbd3_read_reply(int sock, dnbd3_reply_t *reply, bool wait)
+{
+	const int flags = ( wait ? MSG_WAITALL : MSG_DONTWAIT ) | MSG_NOSIGNAL;
+	ssize_t got = recv( sock, reply, sizeof(*reply), flags );
+	if ( got == 0 ) return REPLY_CLOSED;
+	if ( got < 0 ) {
+		if ( errno == EINTR ) return REPLY_INTR;
+		if ( errno == EAGAIN || errno == EWOULDBLOCK ) return REPLY_AGAIN;
+		return REPLY_ERRNO;
+	}
+	if ( !wait && got != sizeof(*reply) ) {
+		// Non-blocking read delivered only part of the header: now that a
+		// reply has started, block for the remainder.
+		got += recv( sock, ((char*)reply) + got, sizeof(*reply) - got, MSG_WAITALL | MSG_NOSIGNAL );
+	}
+	if ( got != sizeof(*reply) ) return REPLY_INCOMPLETE;
+	fixup_reply( *reply );
+	if ( reply->magic != dnbd3_packet_magic ) return REPLY_WRONGMAGIC;
+	return REPLY_OK;
+}
+
+/**
+ * Blocking read of one reply header, retrying as long as the read
+ * reports REPLY_INTR. Returns true only if the final result is REPLY_OK.
+ */
+static inline bool dnbd3_get_reply(int sock, dnbd3_reply_t *reply)
+{
+	for ( ;; ) {
+		const int ret = dnbd3_read_reply( sock, reply, true );
+		if ( ret != REPLY_INTR ) return ret == REPLY_OK;
+	}
+}
+
+/**
+ * Send a CMD_SELECT_IMAGE request on sock.
+ * name: name of the image to select.
+ * rid: revision id that is put into the request.
+ * flags8: FLAGS8_* bits to pass along.
+ * Returns true if header and payload were written completely by a single
+ * writev() call, false otherwise.
+ */
+static inline bool dnbd3_select_image(int sock, const char *name, uint16_t rid, uint8_t flags8)
+{
+	serialized_buffer_t serialized;
+	dnbd3_request_t request;
+	struct iovec iov[2];
+	serializer_reset_write( &serialized );
+	serializer_put_uint16( &serialized, PROTOCOL_VERSION );
+	serializer_put_string( &serialized, name );
+	serializer_put_uint16( &serialized, rid );
+	serializer_put_uint8( &serialized, flags8 );
+	const ssize_t len = serializer_get_written_length( &serialized );
+	request.magic = dnbd3_packet_magic;
+	request.cmd = CMD_SELECT_IMAGE;
+	request.size = (uint32_t)len;
+	// These two fields are not used by this command, but the whole struct
+	// goes out on the wire. Always clear them (this used to happen in
+	// _DEBUG builds only) so no uninitialized stack bytes are sent.
+	request.handle = 0;
+	request.offset = 0;
+	fixup_request( request );
+	iov[0].iov_base = &request;
+	iov[0].iov_len = sizeof(request);
+	iov[1].iov_base = &serialized;
+	iov[1].iov_len = len;
+	ssize_t ret;
+	do {
+		ret = writev( sock, iov, 2 );
+	} while ( ret == -1 && errno == EINTR );
+	return ret == len + (ssize_t)sizeof(request);
+}
+
+/**
+ * Send a CMD_GET_BLOCK request for size bytes at offset.
+ * handle is stored in the request as given; hopCount goes into the
+ * request's hops field.
+ * Returns true if the complete request header was sent.
+ */
+static inline bool dnbd3_get_block(int sock, uint64_t offset, uint32_t size, uint64_t handle, uint8_t hopCount)
+{
+	dnbd3_request_t request;
+	request.magic = dnbd3_packet_magic;
+	request.cmd = CMD_GET_BLOCK;
+	request.size = size;
+	request.handle = handle;
+	// Order matters for the next two: the full-width offset has to be
+	// stored first and hops afterwards. That way we can assign to offset
+	// instead of offset_small before "fixup", which is faster on 64bit.
+	request.offset = offset;
+	request.hops = hopCount;
+	fixup_request( request );
+	const ssize_t sent = sock_sendAll( sock, &request, sizeof(request), 2 );
+	return sent == (ssize_t)sizeof(request);
+}
+
+/**
+ * Request the CRC32 data via CMD_GET_CRC32 and read the answer.
+ * master: receives the first 32 bit value of the payload.
+ * buffer: receives the rest of the payload.
+ * bufferLen: in: capacity of buffer, out: number of bytes stored in it
+ * (0 if the reply carried no payload at all; master and buffer are left
+ * untouched in that case).
+ * Returns true on success, false on any send/receive problem, on an
+ * unexpected reply or if the payload does not fit into buffer.
+ */
+static inline bool dnbd3_get_crc32(int sock, uint32_t *master, void *buffer, size_t *bufferLen)
+{
+	dnbd3_request_t request;
+	dnbd3_reply_t reply;
+	request.magic = dnbd3_packet_magic;
+	request.cmd = CMD_GET_CRC32;
+	request.handle = 0;
+	request.offset = 0;
+	request.size = 0;
+	fixup_request( request );
+	const ssize_t sent = sock_sendAll( sock, &request, sizeof(request), 2 );
+	if ( sent != (ssize_t)sizeof(request) ) return false;
+	if ( !dnbd3_get_reply( sock, &reply ) ) return false;
+	if ( reply.size == 0 ) {
+		// Empty reply: nothing to read
+		*bufferLen = 0;
+		return true;
+	}
+	if ( reply.size < 4 ) return false;
+	// The first four bytes are the master value, the remainder goes to buffer
+	reply.size -= 4;
+	if ( reply.cmd != CMD_GET_CRC32 ) return false;
+	if ( reply.size > *bufferLen ) return false;
+	*bufferLen = reply.size;
+	if ( sock_recv( sock, master, sizeof(uint32_t) ) != (ssize_t)sizeof(uint32_t) ) return false;
+	const ssize_t got = sock_recv( sock, buffer, reply.size );
+	return got == (ssize_t)reply.size;
+}
+
+/**
+ * Receive and parse the answer to a select image request.
+ * Reads the reply header from sock, then the payload into the passed
+ * serialized_buffer_t, and returns the parsed fields via the remaining
+ * arguments. The returned name points into buffer, so it must not be freed
+ * and is only valid as long as buffer is.
+ * The header is read here too, as this message can only occur during
+ * connection setup, where no unrequested messages could arrive inbetween.
+ * Returns false if the reply could not be read, is not a CMD_SELECT_IMAGE
+ * reply, or has a payload size outside of 3..MAX_PAYLOAD.
+ */
+static inline bool dnbd3_select_image_reply(serialized_buffer_t *buffer, int sock, uint16_t *protocol_version, char **name, uint16_t *rid,
+		uint64_t *imageSize)
+{
+	dnbd3_reply_t reply;
+	errno = 0;
+	const bool haveHeader = dnbd3_get_reply( sock, &reply );
+	if ( !haveHeader ) return false;
+	errno = 0;
+	if ( reply.cmd != CMD_SELECT_IMAGE ) return false;
+	if ( reply.size < 3 || reply.size > MAX_PAYLOAD ) return false;
+	// Fetch the payload
+	if ( sock_recv( sock, buffer, reply.size ) != (ssize_t)reply.size ) return false;
+	// Unpack the fields in wire order
+	serializer_reset_read( buffer, reply.size );
+	*protocol_version = serializer_get_uint16( buffer );
+	*name = serializer_get_string( buffer );
+	*rid = serializer_get_uint16( buffer );
+	*imageSize = serializer_get_uint64( buffer );
+	return true;
+}
+
+#endif
diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c
new file mode 100644
index 0000000..ab34aa1
--- /dev/null
+++ b/src/shared/sockhelper.c
@@ -0,0 +1,430 @@
+#include "sockhelper.h"
+#include "log.h"
+#include <arpa/inet.h> // inet_ntop
+#include <netdb.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdlib.h>
+
+// Capacity of a poll list: maximum number of sockets a single list can hold
+#define MAXLISTEN 20
+
+// Fixed-size collection of pollfd entries; the first `count` slots of `entry` are in use
+struct _poll_list {
+ int count;
+ struct pollfd entry[MAXLISTEN];
+};
+
+/**
+ * Open a TCP connection to the given host.
+ * If connect_ms is -1, the socket is switched to non-blocking mode before connecting,
+ * so the returned socket may still be connecting, and no timeouts are applied.
+ * Otherwise connect_ms is applied as send/receive timeout for the connect, and
+ * rw_ms replaces it afterwards if it differs.
+ * Returns the socket fd, or -1 on error.
+ */
+int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms)
+{
+	// TODO: Move out of here, this unit should contain general socket functions
+	// TODO: Abstract away from sockaddr_in* like the rest of the functions here do,
+	// so WITH_IPV6 can finally be removed as everything is transparent. b- but how?
+	struct sockaddr_storage ss;
+	int proto, addrlen;
+	memset( &ss, 0, sizeof ss );
+	if ( addr->type == HOST_IP4 ) {
+		// Set host (IPv4)
+		struct sockaddr_in *addr4 = (struct sockaddr_in*)&ss;
+		addr4->sin_family = AF_INET;
+		memcpy( &addr4->sin_addr, addr->addr, 4 );
+		addr4->sin_port = addr->port;
+		proto = PF_INET;
+		addrlen = sizeof *addr4;
+	}
+#ifdef WITH_IPV6
+	else if ( addr->type == HOST_IP6 ) {
+		// Set host (IPv6)
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)&ss;
+		addr6->sin6_family = AF_INET6;
+		memcpy( &addr6->sin6_addr, addr->addr, 16 );
+		addr6->sin6_port = addr->port;
+		proto = PF_INET6;
+		addrlen = sizeof *addr6;
+	}
+#endif
+	else {
+		logadd( LOG_DEBUG1, "Unsupported address type: %d\n", (int)addr->type );
+		return -1;
+	}
+	int client_sock = socket( proto, SOCK_STREAM, IPPROTO_TCP );
+	if ( client_sock == -1 ) return -1;
+	// Apply connect timeout
+	if ( connect_ms == -1 ) {
+		sock_set_nonblock( client_sock );
+	} else {
+		sock_setTimeout( client_sock, connect_ms );
+	}
+	// Set once connect() succeeded or is known to be proceeding; stays false if
+	// every attempt got interrupted, so we never hand out an unconnected socket
+	bool established = false;
+	for ( int i = 0; i < 5; ++i ) {
+		int ret = connect( client_sock, (struct sockaddr *)&ss, addrlen );
+		if ( ret != -1 || errno == EINPROGRESS || errno == EISCONN ) {
+			established = true;
+			break;
+		}
+		if ( errno == EINTR ) {
+			// http://www.madore.org/~david/computers/connect-intr.html
+#ifdef __linux__
+			continue;
+#else
+			// The interrupted connect keeps going in the background; wait until the
+			// socket becomes writable. Uses its own retry counter so the outer loop
+			// index is left alone and poll() is always called at least once.
+			struct pollfd pfd = { .fd = client_sock, .events = POLLOUT | POLLIN };
+			for ( int pollTries = 5; pollTries > 0; --pollTries ) {
+				int pr = poll( &pfd, 1, connect_ms == 0 ? -1 : connect_ms );
+				if ( pr == 1 && ( pfd.revents & POLLOUT ) ) break;
+				if ( pr == -1 && errno == EINTR ) continue;
+				close( client_sock );
+				return -1;
+			}
+			// getpeername() only succeeds on a connected socket
+			struct sockaddr_storage peer;
+			socklen_t peerLen = sizeof(peer);
+			if ( getpeername( client_sock, (struct sockaddr*)&peer, &peerLen ) == -1 ) {
+				close( client_sock );
+				return -1;
+			}
+			established = true;
+			break;
+#endif
+		} // EINTR
+		close( client_sock );
+		return -1;
+	}
+	if ( !established ) {
+		// All attempts were interrupted
+		close( client_sock );
+		return -1;
+	}
+	if ( connect_ms != -1 && connect_ms != rw_ms ) {
+		// Apply read/write timeout
+		sock_setTimeout( client_sock, rw_ms );
+	}
+	return client_sock;
+}
+
+// TODO: Pretty much same as in server/*
+/**
+ * Resolve/parse the given address string and store at most count results in dest.
+ * A trailing ":port" is honoured for "host:port" and "[ipv6]:port"; a string with
+ * more than one colon that does not start with '[' is passed on unchanged.
+ * If no port is found, PORT is used.
+ * Returns the number of entries written to dest (0 on failure or if count <= 0).
+ */
+int sock_resolveToDnbd3Host(const char * const address, dnbd3_host_t * const dest, const int count)
+{
+	if ( count <= 0 )
+		return 0;
+	struct addrinfo hints, *res, *ptr;
+	char bufferAddr[100], bufferPort[6];
+	char *addr = bufferAddr;
+	const char *portStr = NULL;
+	int addCount = 0;
+
+	// See if we have a port
+	snprintf( bufferAddr, sizeof bufferAddr, "%s", address );
+	char *c1, *c2;
+	c1 = strchr( addr, ':' );
+	if ( c1 != NULL ) {
+		c2 = strchr( c1 + 1, ':' );
+		if ( c2 == NULL ) {
+			// Exactly one colon: host:port
+			*c1 = '\0';
+			portStr = c1 + 1;
+		} else if ( *addr == '[' ) {
+			// IPv6 - support [1:2::3]:123
+			// Advance c2 to the last colon in the string
+			do {
+				c1 = strchr( c2 + 1, ':' );
+				if ( c1 != NULL ) c2 = c1;
+			} while ( c1 != NULL );
+			if ( *(c2 - 1 ) == ']' ) {
+				// Strip the brackets and split off the port
+				*( c2 - 1 ) = '\0';
+				*c2 = '\0';
+				addr += 1;
+				portStr = c2 + 1;
+			}
+		}
+	}
+	if ( portStr == NULL ) {
+		portStr = bufferPort;
+		snprintf( bufferPort, sizeof bufferPort, "%d", (int)PORT );
+	}
+
+	// Set hints for local addresses.
+	memset( &hints, 0, sizeof( hints ) );
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	if ( getaddrinfo( addr, portStr, &hints, &res ) != 0 || res == NULL ) {
+		return 0;
+	}
+	// Stop as soon as dest is full, so we never write past its end
+	for ( ptr = res; ptr != NULL && addCount < count; ptr = ptr->ai_next ) {
+		if ( sock_sockaddrToDnbd3( ptr->ai_addr, &dest[addCount] ) ) {
+			addCount += 1;
+		}
+	}
+
+	freeaddrinfo( res );
+	return addCount;
+}
+
+/**
+ * Convert a sockaddr into a dnbd3_host_t (type, port and raw address bytes).
+ * Handles AF_INET, and AF_INET6 if built WITH_IPV6.
+ * Returns false for any other address family, leaving host untouched.
+ */
+bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
+{
+	switch ( sa->sa_family ) {
+	case AF_INET: {
+		// IPv4: 4 address bytes, port stays in network byte order
+		struct sockaddr_in *in4 = (struct sockaddr_in*)sa;
+		memcpy( host->addr, &in4->sin_addr, 4 );
+		host->port = in4->sin_port;
+		host->type = HOST_IP4;
+		return true;
+	}
+#ifdef WITH_IPV6
+	case AF_INET6: {
+		// IPv6: 16 address bytes, port stays in network byte order
+		struct sockaddr_in6 *in6 = (struct sockaddr_in6*)sa;
+		memcpy( host->addr, &in6->sin6_addr, 16 );
+		host->port = in6->sin6_port;
+		host->type = HOST_IP6;
+		return true;
+	}
+#endif
+	default:
+		return false;
+	}
+}
+
+/**
+ * Set both the receive and the send timeout of the given socket.
+ * @param sockfd socket to modify
+ * @param milliseconds timeout to apply, in milliseconds
+ */
+void sock_setTimeout(const int sockfd, const int milliseconds)
+{
+	struct timeval tv;
+	tv.tv_sec = milliseconds / 1000;
+	// Take the remainder first: multiplying the full value by 1000 would
+	// overflow int for timeouts larger than about 35 minutes
+	tv.tv_usec = ( milliseconds % 1000 ) * 1000;
+	setsockopt( sockfd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) );
+	setsockopt( sockfd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv) );
+}
+
+/**
+ * Allocate a new, empty poll list.
+ * Returns NULL if the allocation fails. Release with sock_destroyPollList().
+ */
+poll_list_t* sock_newPollList()
+{
+	poll_list_t *list = (poll_list_t*)malloc( sizeof( poll_list_t ) );
+	if ( list == NULL ) return NULL; // Out of memory - don't touch the list
+	list->count = 0;
+	return list;
+}
+
+/**
+ * Close every socket still contained in the list, then free the list itself.
+ * Passing NULL is a no-op, like free().
+ */
+void sock_destroyPollList(poll_list_t *list)
+{
+	if ( list == NULL ) return;
+	for ( int i = 0; i < list->count; ++i ) {
+		if ( list->entry[i].fd >= 0 ) close( list->entry[i].fd );
+	}
+	free( list );
+}
+
+/**
+ * Write a printable representation of host into buffer: "a.b.c.d[:port]" for IPv4,
+ * "[v6addr][:port]" for IPv6, "<?addrtype=N>" for anything else. The port is
+ * omitted if it is 0.
+ * @param host host to print
+ * @param buffer destination buffer
+ * @param len size of buffer; nothing is written if it is smaller than 10
+ * @return number of characters written, not counting the terminating null;
+ *    0 if the buffer is too small or the address could not be formatted
+ */
+size_t sock_printHost(const dnbd3_host_t * const host, char * const buffer, const size_t len)
+{
+	// Worst case: Port 5 chars, ':' to separate ip and port 1 char, terminating null 1 char = 7, [] for IPv6
+	if ( len < 10 ) return 0;
+	char *output = buffer;
+	if ( host->type == HOST_IP6 ) {
+		*output++ = '[';
+		// inet_ntop leaves the buffer undefined on failure, so bail out instead of calling strlen on it
+		if ( inet_ntop( AF_INET6, host->addr, output, (socklen_t)( len - 10 ) ) == NULL ) {
+			*buffer = '\0';
+			return 0;
+		}
+		output += strlen( output );
+		*output++ = ']';
+	} else if ( host->type == HOST_IP4 ) {
+		if ( inet_ntop( AF_INET, host->addr, output, (socklen_t)( len - 8 ) ) == NULL ) {
+			*buffer = '\0';
+			return 0;
+		}
+		output += strlen( output );
+	} else {
+		int ret = snprintf( output, len, "<?addrtype=%d>", (int)host->type );
+		if ( ret <= 0 ) return 0;
+		return MIN( (size_t)ret, len-1 );
+	}
+	*output = '\0';
+	if ( host->port != 0 ) {
+		// There are still at least 7 bytes left in the buffer, port is at most 5 bytes + ':' + '\0' = 7
+		int ret = snprintf( output, 7, ":%d", (int)ntohs( host->port ) );
+		if ( ret < 0 ) ret = 0;
+		output += MIN( ret, 6 );
+	}
+	return output - buffer;
+}
+
+/**
+ * Format a socket address numerically as "host:port" (AF_INET) or "[host]:port"
+ * (any other family) into output.
+ * @param addr address to print
+ * @param addrLen size of addr
+ * @param output destination buffer
+ * @param len size of output
+ * @return number of characters stored in output (excluding the terminating null),
+ *    0 if len is 0 or the address could not be converted
+ */
+size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLen, char *output, const size_t len)
+{
+	// Without this guard, len-1 below would wrap around and we'd report
+	// characters as written although nothing fits into the buffer
+	if ( len == 0 ) return 0;
+	char host[100], port[10];
+	int outlen = 0;
+	int ret = getnameinfo( addr, addrLen, host, sizeof(host), port, sizeof(port), NI_NUMERICHOST | NI_NUMERICSERV );
+	if ( ret == 0 ) {
+		if ( addr->sa_family == AF_INET ) {
+			outlen = snprintf( output, len, "%s:%s", host, port );
+		} else {
+			outlen = snprintf( output, len, "[%s]:%s", host, port );
+		}
+	}
+	if ( outlen <= 0 ) return 0;
+	// snprintf returns the untruncated length; clamp to what actually fits
+	return MIN( (size_t)outlen, len-1 );
+}
+
+/**
+ * Resolve bind_addr/port with getaddrinfo() and open a listening TCP socket for each
+ * resulting address, appending the sockets to list until it holds MAXLISTEN entries.
+ * Addresses for which socket(), bind() or listen() fails are logged and skipped.
+ * @param list poll list the listening sockets are appended to
+ * @param bind_addr address to bind to, passed on to getaddrinfo() (may be NULL, see sock_listenAny)
+ * @param port port to listen on
+ * @return true if at least one socket was added by this call
+ */
+bool sock_listen(poll_list_t* list, char* bind_addr, uint16_t port)
+{
+ if ( list->count >= MAXLISTEN ) return false;
+ struct addrinfo hints, *res = NULL, *ptr;
+ char portStr[6];
+ const int on = 1;
+ int openCount = 0;
+ // Set hints for local addresses.
+ memset( &hints, 0, sizeof(hints) );
+ hints.ai_flags = AI_PASSIVE;
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ snprintf( portStr, sizeof portStr, "%d", (int)port );
+ if ( getaddrinfo( bind_addr, portStr, &hints, &res ) != 0 || res == NULL ) return false;
+ // Attempt to bind to all of the addresses as long as there's room in the poll list
+ for( ptr = res; ptr != NULL; ptr = ptr->ai_next ) {
+ // Printable form of the current address, only used for log messages
+ char bla[100];
+ if ( !sock_printable( (struct sockaddr*)ptr->ai_addr, ptr->ai_addrlen, bla, 100 ) ) snprintf( bla, 100, "[invalid]" );
+ logadd( LOG_DEBUG1, "Binding to %s...", bla );
+ int sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol );
+ if ( sock < 0 ) {
+ logadd( LOG_WARNING, "(Bind to %s): cannot socket(), errno=%d", bla, errno );
+ continue;
+ }
+ setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on) );
+ // Restrict IPv6 sockets to IPv6 only
+ if ( ptr->ai_family == PF_INET6 ) setsockopt( sock, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on) );
+ if ( bind( sock, ptr->ai_addr, ptr->ai_addrlen ) == -1 ) {
+ logadd( LOG_WARNING, "(Bind to %s): cannot bind(), errno=%d", bla, errno );
+ close( sock );
+ continue;
+ }
+ if ( listen( sock, 20 ) == -1 ) {
+ logadd( LOG_WARNING, "(Bind to %s): cannot listen(), errno=%d", bla, errno );
+ close( sock );
+ continue;
+ }
+ // Socket is ready, register it in the poll list
+ list->entry[list->count].fd = sock;
+ list->entry[list->count].events = POLLIN | POLLRDHUP;
+ list->count++;
+ openCount++;
+ if ( list->count >= MAXLISTEN ) break;
+ }
+ freeaddrinfo( res );
+ return openCount > 0;
+}
+
+/**
+ * Listen on the given port without binding to a specific address:
+ * forwards to sock_listen() with a NULL bind address.
+ */
+bool sock_listenAny(poll_list_t* list, uint16_t port)
+{
+	char * const anyAddress = NULL;
+	return sock_listen( list, anyAddress, port );
+}
+
+/**
+ * Start a non-blocking connect to host (if not NULL and the list has room), then wait
+ * up to connect_ms for any pending socket in the list to finish connecting.
+ * The first socket that connected is removed from the list, switched back to blocking
+ * mode, gets rw_ms applied as read/write timeout (unless rw_ms is -1) and is returned.
+ * Sockets that failed are closed and removed.
+ * @return connected socket fd, -1 on timeout/error, -2 if the list is empty
+ */
+int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_ms, int rw_ms)
+{
+	// Nonblocking connect seems to be hard to get right in a portable fashion
+	// that's why you might see some weird checks here and there. For now there's
+	// only Linux and FreeBSD, but let's try to not make this code fall on its nose
+	// should dnbd3 be ported to other platforms.
+	if ( list->count < MAXLISTEN && host != NULL ) {
+		int sock = sock_connect( host, -1, -1 );
+		if ( sock != -1 ) {
+			list->entry[list->count].fd = sock;
+			list->entry[list->count].events = POLLIN | POLLOUT | POLLRDHUP;
+			list->count++;
+		}
+	}
+	if ( list->count == 0 ) {
+		return -2;
+	}
+	int ret, tries = 5;
+	do {
+		ret = poll( list->entry, list->count, connect_ms );
+		if ( ret > 0 ) break;
+		if ( ret == 0 ) return -1;
+		if ( ret == -1 && ( errno == EINTR || errno == EAGAIN ) ) {
+			if ( --tries == 0 ) return -1;
+			if ( connect_ms > 1 ) connect_ms /= 2; // Maybe properly account time one day
+			continue;
+		}
+		return -1;
+	} while ( true );
+	for ( int i = list->count - 1; i >= 0; --i ) {
+		int fd = -1;
+		if ( list->entry[i].revents & ( POLLIN | POLLOUT ) ) {
+			struct sockaddr_storage tmp;
+			socklen_t len = sizeof(tmp);
+			fd = list->entry[i].fd;
+			if ( getpeername( fd, (struct sockaddr*)&tmp, &len ) == -1 ) { // More portable then SO_ERROR ...
+				close( fd );
+				fd = -1;
+			}
+		} else if ( list->entry[i].revents != 0 ) {
+			close( list->entry[i].fd );
+		} else {
+			continue;
+		}
+		// Either error or connect success
+		// Remove the entry by moving the last one into its slot
+		list->count--;
+		if ( i != list->count ) list->entry[i] = list->entry[list->count];
+		if ( fd != -1 ) {
+			sock_set_block( fd );
+			// The socket was created non-blocking without any timeout, so the
+			// read/write timeout has to be applied even if it equals connect_ms
+			if ( rw_ms != -1 ) {
+				sock_setTimeout( fd, rw_ms );
+			}
+			return fd;
+		}
+	}
+	return -1;
+}
+
+/**
+ * Block in poll() until something happens on one of the sockets in list.
+ * The first socket (scanning from the end of the list) whose revents is exactly POLLIN
+ * gets accept()ed and the result is returned. Sockets reporting POLLNVAL, POLLHUP,
+ * POLLERR or POLLRDHUP are logged, closed and removed from the list.
+ * Returns -1 if poll() fails or no connection was accepted.
+ */
+int sock_accept(poll_list_t *list, struct sockaddr_storage *addr, socklen_t *length_ptr)
+{
+	if ( poll( list->entry, list->count, -1 ) < 0 ) return -1;
+	int idx = list->count;
+	while ( --idx >= 0 ) {
+		struct pollfd * const slot = &list->entry[idx];
+		if ( slot->revents == 0 ) continue;
+		if ( slot->revents == POLLIN ) {
+			return accept( slot->fd, (struct sockaddr *)addr, length_ptr );
+		}
+		if ( ( slot->revents & ( POLLNVAL | POLLHUP | POLLERR | POLLRDHUP ) ) == 0 ) continue;
+		// Socket is dead: drop it and fill the gap with the last entry
+		logadd( LOG_DEBUG1, "poll fd revents=%d for index=%d and fd=%d", (int)slot->revents, idx, slot->fd );
+		close( slot->fd );
+		list->count--;
+		if ( idx != list->count ) *slot = list->entry[list->count];
+	}
+	return -1;
+}
+
+/**
+ * Add O_NONBLOCK to the file status flags of sock.
+ * If the current flags cannot be read, they are assumed to be 0.
+ */
+void sock_set_nonblock(int sock)
+{
+	const int current = fcntl( sock, F_GETFL, 0 );
+	const int base = ( current == -1 ) ? 0 : current;
+	fcntl( sock, F_SETFL, base | O_NONBLOCK );
+}
+
+/**
+ * Remove O_NONBLOCK from the file status flags of sock.
+ * If the current flags cannot be read, they are assumed to be 0.
+ */
+void sock_set_block(int sock)
+{
+	const int current = fcntl( sock, F_GETFL, 0 );
+	const int base = ( current == -1 ) ? 0 : current;
+	fcntl( sock, F_SETFL, base & ~(int)O_NONBLOCK );
+}
+
+/**
+ * Append the given socket to the end of the poll list.
+ * @param list the poll list to add the socket to
+ * @param sock socket fd to add
+ * @param wantRead whether to poll for POLLIN
+ * @param wantWrite whether to poll for POLLOUT
+ * @return true on success, false if sock is -1 or the list is full
+ */
+bool sock_append(poll_list_t *list, const int sock, bool wantRead, bool wantWrite)
+{
+	if ( sock == -1 || list->count >= MAXLISTEN ) return false;
+	// fd and events must go into the same slot; bump count exactly once afterwards
+	list->entry[list->count].fd = sock;
+	list->entry[list->count].events = (short)( ( wantRead ? POLLIN : 0 ) | ( wantWrite ? POLLOUT : 0 ) | POLLRDHUP );
+	list->count++;
+	return true;
+}
+
+/**
+ * Keep calling send() until len bytes from buffer were sent, an error other than
+ * EINTR/EAGAIN/EWOULDBLOCK occurs, send() returns 0, or send() was called maxtries
+ * times (maxtries < 0 means no limit).
+ * Returns the number of bytes sent; if nothing was sent, the result of the
+ * last send() call (0 if send() was never called).
+ */
+ssize_t sock_sendAll(const int sock, const void *buffer, const size_t len, int maxtries)
+{
+	const uint8_t * const bytes = (const uint8_t*)buffer;
+	size_t sent = 0;
+	ssize_t rc = 0;
+	for ( ;; ) {
+		if ( sent >= len ) break;
+		// Every attempt counts, including ones that end up being retried
+		if ( maxtries >= 0 && --maxtries == -1 ) break;
+		rc = send( sock, bytes + sent, len - sent, MSG_NOSIGNAL );
+		if ( rc == -1 ) {
+			const bool retry = ( errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK );
+			if ( retry ) continue;
+			break;
+		}
+		if ( rc == 0 ) break;
+		sent += rc;
+	}
+	return ( sent == 0 ) ? rc : (ssize_t)sent;
+}
+
+/**
+ * Keep calling recv() until len bytes were stored in buffer, recv() returns 0,
+ * or recv() fails. A failure with EINTR is retried, up to nine times in total.
+ * Returns the number of bytes received; if nothing was received, the result
+ * of the last recv() call (0 if len is 0).
+ */
+ssize_t sock_recv(const int sock, void *buffer, const size_t len)
+{
+	char * const dst = (char*)buffer;
+	size_t received = 0;
+	ssize_t rc = 0;
+	int interrupts = 0;
+	for ( ;; ) {
+		if ( received >= len ) break;
+		rc = recv( sock, dst + received, len - received, MSG_NOSIGNAL );
+		if ( rc == 0 ) break;
+		if ( rc == -1 ) {
+			if ( errno == EINTR && ++interrupts < 10 ) continue;
+			break;
+		}
+		received += rc;
+	}
+	return ( received == 0 ) ? rc : (ssize_t)received;
+}
+
diff --git a/src/shared/sockhelper.h b/src/shared/sockhelper.h
new file mode 100644
index 0000000..8d70789
--- /dev/null
+++ b/src/shared/sockhelper.h
@@ -0,0 +1,120 @@
+#ifndef SOCKHELPER_H_
+#define SOCKHELPER_H_
+
+/*
+ * Helper functions for dealing with sockets. These functions should
+ * abstract from the IP version by using getaddrinfo() and the like.
+ */
+
+#include "../types.h"
+#include <stdint.h>
+#include <sys/socket.h>
+#include <string.h>
+
+typedef struct _poll_list poll_list_t;
+
+/**
+ * Connect to given dnbd3_host_t.
+ * @param addr - address of host to connect to
+ * @param connect_ms - timeout in milliseconds after which the connection attempt fails
+ * @param rw_ms - read/write timeout in milliseconds to apply on successful connect
+ * @return socket file descriptor, or -1 on error
+ */
+int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms);
+
+/**
+ * Resolve/parse given address and put the result(s) into passed dnbd3_host_t array,
+ * but only up to count entries.
+ * @return Number of items added to array
+ */
+int sock_resolveToDnbd3Host(const char * const address, dnbd3_host_t * const dest, const int count);
+
+bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host);
+
+void sock_setTimeout(const int sockfd, const int milliseconds);
+
+size_t sock_printHost(const dnbd3_host_t * const host, char *output, const size_t len);
+
+size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLen, char *output, const size_t len);
+
+/**
+ * Create new poll list.
+ */
+poll_list_t* sock_newPollList();
+
+/**
+ * Delete a poll list, closing all sockets first if necessary.
+ */
+void sock_destroyPollList(poll_list_t *list);
+
+/**
+ * Listen on all interfaces/available IP addresses.
+ * IPv4 and IPv6 are supported.
+ * @param list poll list the listening sockets get added to
+ * @param port port to listen on
+ * @return true if any listen call was successful
+ */
+bool sock_listenAny(poll_list_t* list, uint16_t port);
+
+/**
+ * Listen on a specific address and port.
+ * @param bind_addr human readable address to bind to for listening
+ * @param port to listen on
+ */
+bool sock_listen(poll_list_t* list, char* bind_addr, uint16_t port);
+
+/**
+ * Asynchronously connect to multiple hosts.
+ * This can be called multiple times with varying timeouts. Calling it
+ * the first time on an empty list is identical to sock_connect(). On
+ * consecutive calls, more nonblocking sockets in connecting state will
+ * be added to the list, and on each of these calls, all the pending
+ * sockets will be checked for successful connection (or error), respecting
+ * the passed timeout.
+ * host can be NULL to just wait on the sockets already in the list.
+ * If at least one socket completed the connection
+ * within the given timeout, it will be removed from the list and
+ * returned. On error or timeout, -1 is returned. If there are no more sockets
+ * in the list, -2 is returned.
+ */
+int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_ms, int rw_ms);
+
+/**
+ * This is a multi-socket version of accept. Pass in a poll list of listening sockets.
+ * If any of the sockets has an incoming connection, accept it and return the new socket's fd.
+ * On error, return -1, just like accept().
+ * @param list poll list containing the listening sockets
+ * @param addr receives the address of the connecting peer (passed on to accept())
+ * @param length_ptr size of addr, in/out (passed on to accept())
+ * @return fd of new client socket, -1 on error
+ */
+int sock_accept(poll_list_t *list, struct sockaddr_storage *addr, socklen_t *length_ptr);
+
+void sock_set_nonblock(int sock);
+
+void sock_set_block(int sock);
+
+/**
+ * Append given socket to the end of the poll list.
+ *
+ * @param list the poll list to add the socket to
+ * @param sock socket fd to add
+ * @param wantRead whether to set the POLLIN flag
+ * @param wantWrite whether to set the POLLOUT flag
+ * @return true on success, false iff the list is already full or socket is -1
+ */
+bool sock_append(poll_list_t *list, const int sock, bool wantRead, bool wantWrite);
+
+/**
+ * Send the whole buffer, calling send() multiple times if necessary.
+ * Give up after calling send() maxtries times.
+ * Set maxtries < 0 to try infinitely.
+ */
+ssize_t sock_sendAll(const int sock, const void *buffer, const size_t len, int maxtries);
+
+/**
+ * Receive into given buffer, repeatedly calling recv() on partial reads or EINTR
+ * until len bytes were read, the connection was closed, or an error occurred.
+ */
+ssize_t sock_recv(const int sock, void *buffer, const size_t len);
+
+#endif /* SOCKHELPER_H_ */
diff --git a/src/shared/timing.c b/src/shared/timing.c
new file mode 100644
index 0000000..4ca1002
--- /dev/null
+++ b/src/shared/timing.c
@@ -0,0 +1,21 @@
+#include "timing.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+struct timespec basetime;
+
+/**
+ * Report that the monotonic clock cannot be read, then terminate the process
+ * with exit code 1. Never returns.
+ */
+void timing_abort()
+{
+	const int clockErr = errno;
+	printf( "Cannot get CLOCK_MONOTONIC(_RAW), errno=%d\n", clockErr );
+	exit( 1 );
+}
+
+/**
+ * Store the current reading of BEST_CLOCK_SOURCE in basetime.
+ * If the clock cannot be read, basetime is zeroed instead.
+ */
+void timing_setBase()
+{
+	const int rc = clock_gettime( BEST_CLOCK_SOURCE, &basetime );
+	if ( rc != -1 ) return;
+	// Clock unavailable: fall back to an all-zero base
+	memset( &basetime, 0, sizeof(basetime) );
+}
+
diff --git a/src/shared/timing.h b/src/shared/timing.h
new file mode 100644
index 0000000..f3d8802
--- /dev/null
+++ b/src/shared/timing.h
@@ -0,0 +1,162 @@
+#ifndef _D_TIMING_H
+#define _D_TIMING_H
+
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 199309L
+#endif
+
+#include <time.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef CLOCK_MONOTONIC_RAW
+#define BEST_CLOCK_SOURCE CLOCK_MONOTONIC_RAW
+#else
+#define BEST_CLOCK_SOURCE CLOCK_MONOTONIC
+#endif
+
+typedef struct timespec ticks;
+
+extern struct timespec basetime;
+
+/**
+ * Assign src to dst while adding secs seconds.
+ * dst and src are pointers to ticks. secs is parenthesized in the expansion,
+ * so expressions with low-precedence operators (e.g. a ? b : c) are safe to pass.
+ */
+#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + (secs); (dst)->tv_nsec = (src)->tv_nsec; } while (0)
+
+/**
+ * Define variable now, initialize to timing_get.
+ * Expands to a declaration followed by a call (two statements), so it can
+ * only be used where a declaration is allowed and must be followed by ';'.
+ */
+#define declare_now ticks now; timing_get( &now )
+
+/**
+ * Call this once to calibrate on startup.
+ * Although overflows of CLOCK_MONOTONIC(_RAW) should
+ * by definition never happen, we still have a fixed size
+ * int that could at some point. By forcing the counter
+ * to start at 0 on startup the point of overflow
+ * will be very far in the future (decades for 32bit time_t,
+ * end of universe for 64bit).
+ */
+void timing_setBase();
+
+/**
+ * Internal, do not use. Moved to another function
+ * to prevent inlining of error handling code, which
+ * should be very unlikely to ever trigger.
+ */
+_Noreturn void timing_abort();
+
+/**
+ * Read BEST_CLOCK_SOURCE into retval and make its seconds relative to basetime.
+ * Calls timing_abort() if the clock cannot be read.
+ */
+static inline void timing_get(ticks* retval)
+{
+	const int rc = clock_gettime( BEST_CLOCK_SOURCE, retval );
+	if ( rc == -1 ) {
+		timing_abort();
+	}
+	retval->tv_sec -= basetime.tv_sec;
+}
+
+/**
+ * Get the current time, moved addSeconds seconds into the future.
+ * Handy for setting up timeouts.
+ */
+static inline void timing_gets(ticks* retval, int32_t addSeconds)
+{
+	timing_get( retval );
+	retval->tv_sec = retval->tv_sec + addSeconds;
+}
+
+/**
+ * Set retval to base plus addSeconds seconds; the nanosecond part is copied as is.
+ */
+static inline void timing_addSeconds(ticks* retval, ticks* base, int32_t addSeconds)
+{
+	retval->tv_nsec = base->tv_nsec;
+	retval->tv_sec = base->tv_sec + addSeconds;
+}
+
+/**
+ * Coarse timeout check that only compares the seconds of timeout and now.
+ * Might trigger up to one second early.
+ */
+static inline bool timing_reached(const ticks* timeout, const ticks* now)
+{
+	return !( now->tv_sec < timeout->tv_sec );
+}
+#define timing_1le2(one,two) timing_reached(one,two)
+
+/**
+ * Timeout check with nanosecond resolution: true once now lies strictly after timeout.
+ */
+static inline bool timing_reachedPrecise(const ticks* timeout, const ticks* now)
+{
+	if ( now->tv_sec != timeout->tv_sec ) {
+		return now->tv_sec > timeout->tv_sec;
+	}
+	// Same second, let the nanoseconds decide
+	return now->tv_nsec > timeout->tv_nsec;
+}
+
+/**
+ * Fetch the current time and run timing_reached() against it.
+ * Useful if not used in a loop. Might trigger up to one second early.
+ */
+static inline bool timing_isReached(const ticks* timeout)
+{
+	declare_now;
+	return timing_reached( timeout, &now );
+}
+/**
+ * Fetch the current time and run timing_reachedPrecise() against it.
+ * Useful if not used in a loop.
+ */
+static inline bool timing_isReachedPrecise(const ticks* timeout)
+{
+	declare_now;
+	return timing_reachedPrecise( timeout, &now );
+}
+
+
+/**
+ * Whole seconds elapsed from start to end, rounded down.
+ * Argument order matters: unless end lies in a later second than start, 0 is returned.
+ */
+static inline uint32_t timing_diff(const ticks *start, const ticks *end)
+{
+	if ( end->tv_sec <= start->tv_sec ) return 0;
+	uint32_t seconds = (uint32_t)( end->tv_sec - start->tv_sec );
+	// The last second only counts if it has fully elapsed
+	if ( start->tv_nsec > end->tv_nsec ) seconds -= 1;
+	return seconds;
+}
+
+/**
+ * Get difference between two ticks, rounded down to milliseconds.
+ * Make sure you pass the arguments in the proper order. If end is
+ * not after start, 0 is returned.
+ */
+static inline uint64_t timing_diffMs(const ticks *start, const ticks *end)
+{
+	if ( end->tv_sec < start->tv_sec ) return 0;
+	// Work on the exact nanosecond difference and divide once at the end. Subtracting
+	// a truncated millisecond value from the seconds part would round the result up.
+	const int64_t ns = (int64_t)( end->tv_sec - start->tv_sec ) * 1000000000
+			+ ( (int64_t)end->tv_nsec - (int64_t)start->tv_nsec );
+	if ( ns <= 0 ) return 0;
+	return (uint64_t)ns / 1000000;
+}
+
+/**
+ * Get difference between two ticks, rounded down to microseconds.
+ * Make sure you pass the arguments in the proper order. If end is
+ * not after start, 0 is returned.
+ */
+static inline uint64_t timing_diffUs(const ticks *start, const ticks *end)
+{
+	if ( end->tv_sec < start->tv_sec ) return 0;
+	// Work on the exact nanosecond difference and divide once at the end. Subtracting
+	// a truncated microsecond value from the seconds part would round the result up.
+	const int64_t ns = (int64_t)( end->tv_sec - start->tv_sec ) * 1000000000
+			+ ( (int64_t)end->tv_nsec - (int64_t)start->tv_nsec );
+	if ( ns <= 0 ) return 0;
+	return (uint64_t)ns / 1000;
+}
+
+
+#endif
diff --git a/src/types.h b/src/types.h
new file mode 100644
index 0000000..ec37d9b
--- /dev/null
+++ b/src/types.h
@@ -0,0 +1,196 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef TYPES_H_
+#define TYPES_H_
+
+#include "config.h"
+#ifndef KERNEL_MODULE
+#include <stdint.h>
+#include <stdbool.h>
+#endif
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
+#ifdef __GNUC__
+#define UNUSED __attribute__ ((unused))
+#else
+#error "Please add define for your compiler for UNUSED, or define to nothing for your compiler if not supported"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
+
+#ifdef __linux__
+#define HAVE_THREAD_NAMES
+#endif
+
+#ifdef __FreeBSD__
+#ifndef MSG_MORE
+#define MSG_MORE 0
+#endif
+#ifndef POLLRDHUP
+#define POLLRDHUP 0x2000
+#endif
+#include <netinet/in.h>
+#endif
+
+#ifdef AFL_MODE
+#define send(a,b,c,d) write(a,b,c)
+#define recv(a,b,c,d) read(a,b,c)
+#endif
+
+
+// ioctl
+#define DNBD3_MAGIC 'd'
+#define IOCTL_OPEN _IO(0xab, 1)
+#define IOCTL_CLOSE _IO(0xab, 2)
+#define IOCTL_SWITCH _IO(0xab, 3)
+#define IOCTL_ADD_SRV _IO(0xab, 4)
+#define IOCTL_REM_SRV _IO(0xab, 5)
+
+// Byte order handling. The wire format is little endian: on big endian hosts the
+// net_order_* macros swap bytes and fixup_request/fixup_reply convert the multi-byte
+// header fields (cmd, size and - for requests - offset) in place; on little endian
+// hosts all of them are no-ops. dnbd3_packet_magic is defined so that its in-memory
+// byte sequence is 0x73, 0x72 on both kinds of hosts.
+#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+static const uint16_t dnbd3_packet_magic = (0x73 << 8) | (0x72);
+// Flip bytes around on big endian when putting stuff on the net
+#define net_order_64(a) ((uint64_t)((((a) & 0xFFull) << 56) | (((a) & 0xFF00ull) << 40) | (((a) & 0xFF0000ull) << 24) | (((a) & 0xFF000000ull) << 8) | (((a) & 0xFF00000000ull) >> 8) | (((a) & 0xFF0000000000ull) >> 24) | (((a) & 0xFF000000000000ull) >> 40) | (((a) & 0xFF00000000000000ull) >> 56)))
+#define net_order_32(a) ((uint32_t)((((a) & (uint32_t)0xFF) << 24) | (((a) & (uint32_t)0xFF00) << 8) | (((a) & (uint32_t)0xFF0000) >> 8) | (((a) & (uint32_t)0xFF000000) >> 24)))
+#define net_order_16(a) ((uint16_t)((((a) & (uint16_t)0xFF) << 8) | (((a) & (uint16_t)0xFF00) >> 8)))
+#define fixup_request(a) do { \
+ (a).cmd = net_order_16((a).cmd); \
+ (a).size = net_order_32((a).size); \
+ (a).offset = net_order_64((a).offset); \
+} while (0)
+#define fixup_reply(a) do { \
+ (a).cmd = net_order_16((a).cmd); \
+ (a).size = net_order_32((a).size); \
+} while (0)
+#define ENDIAN_MODE "Big Endian"
+#ifndef BIG_ENDIAN
+#define BIG_ENDIAN
+#endif
+#elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64)
+static const uint16_t dnbd3_packet_magic = (0x73) | (0x72 << 8);
+// Make little endian our network byte order as probably 99.999% of machines this will be used on are LE
+#define net_order_64(a) (a)
+#define net_order_32(a) (a)
+#define net_order_16(a) (a)
+// Expand to an empty loop so that "fixup_request( x );" remains a valid statement
+#define fixup_request(a) while(0)
+#define fixup_reply(a) while(0)
+#define ENDIAN_MODE "Little Endian"
+#ifndef LITTLE_ENDIAN
+#define LITTLE_ENDIAN
+#endif
+#else
+#error "Unknown Endianness"
+#endif
+
+typedef uint8_t dnbd3_af;
+
+static const dnbd3_af HOST_NONE = (dnbd3_af)0;
+static const dnbd3_af HOST_IP4 = (dnbd3_af)2;
+static const dnbd3_af HOST_IP6 = (dnbd3_af)10;
+
+#pragma pack(1)
+typedef struct dnbd3_host_t
+{
+ uint8_t addr[16]; // 16byte (network representation, so it can be directly passed to socket functions)
+ uint16_t port; // 2byte (network representation, so it can be directly passed to socket functions)
+ dnbd3_af type; // 1byte (ip version. HOST_IP4 or HOST_IP6. 0 means this struct is empty and should be ignored)
+} dnbd3_host_t;
+#pragma pack(0)
+
+#pragma pack(1)
+// Byte-packed parameter block. NOTE(review): presumably the argument passed along with
+// the IOCTL_* requests defined above - confirm against the ioctl handler in the kernel module.
+typedef struct
+{
+ uint16_t len;
+ dnbd3_host_t host;
+ uint16_t imgnamelen;
+ char *imgname;
+ int rid;
+ int read_ahead_kb;
+ uint8_t use_server_provided_alts;
+} dnbd3_ioctl_t;
+#pragma pack(0)
+
+// network
+#define CMD_GET_BLOCK 1
+#define CMD_SELECT_IMAGE 2
+#define CMD_GET_SERVERS 3
+#define CMD_ERROR 4
+#define CMD_KEEPALIVE 5
+#define CMD_LATEST_RID 6
+#define CMD_SET_CLIENT_MODE 7
+#define CMD_GET_CRC32 8
+
+// Size in bytes the packed request struct must have (enforced by the static assert below)
+#define DNBD3_REQUEST_SIZE 24
+#pragma pack(1)
+// Request header. cmd takes one of the CMD_* values defined above.
+// The anonymous union overlays the full 64 bit offset with a 56 bit offset plus a
+// one byte hops field; the member order of that pair depends on the host's endianness.
+typedef struct
+{
+ uint16_t magic; // 2byte
+ uint16_t cmd; // 2byte
+ uint32_t size; // 4byte
+ union {
+ struct {
+#ifdef LITTLE_ENDIAN
+ uint64_t offset_small:56; // 7byte
+ uint8_t hops; // 1byte
+#elif defined(BIG_ENDIAN)
+ uint8_t hops; // 1byte
+ uint64_t offset_small:56; // 7byte
+#endif
+ };
+ uint64_t offset; // 8byte
+ };
+ uint64_t handle; // 8byte
+} dnbd3_request_t;
+#pragma pack(0)
+_Static_assert( sizeof(dnbd3_request_t) == DNBD3_REQUEST_SIZE, "dnbd3_request_t is messed up" );
+
+// Size in bytes the packed reply struct must have (enforced by the static assert below)
+#define DNBD3_REPLY_SIZE 16
+#pragma pack(1)
+// Reply header. cmd takes one of the CMD_* values defined above; size is the number
+// of payload bytes that follow the header (see how dnbd3_select_image_reply reads it).
+typedef struct
+{
+ uint16_t magic; // 2byte
+ uint16_t cmd; // 2byte
+ uint32_t size; // 4byte
+ uint64_t handle; // 8byte
+} dnbd3_reply_t;
+#pragma pack(0)
+_Static_assert( sizeof(dnbd3_reply_t) == DNBD3_REPLY_SIZE, "dnbd3_reply_t is messed up" );
+
+#pragma pack(1)
+typedef struct
+{
+ dnbd3_host_t host;
+ uint8_t failures; // 1byte (number of times server has been consecutively unreachable)
+} dnbd3_server_entry_t;
+#pragma pack(0)
+
+#endif /* TYPES_H_ */
diff --git a/src/version.c.in b/src/version.c.in
new file mode 100644
index 0000000..54854c9
--- /dev/null
+++ b/src/version.c.in
@@ -0,0 +1,4 @@
+#include "version.h"
+
+const char * VERSION_STRING = "@VERSION@";
+
diff --git a/src/version.h b/src/version.h
new file mode 100644
index 0000000..0c4a66b
--- /dev/null
+++ b/src/version.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef VERSION_H_
+#define VERSION_H_
+
+extern const char *VERSION_STRING;
+
+// This is done in a little weird way but otherwise eclipse complains about
+// unresolvable symbols etc...
+#include "version.c"
+
+#endif /* VERSION_H_ */