From 43e57ce5e11e9052f5a7db66f2e8613f1784f919 Mon Sep 17 00:00:00 2001 From: Frederic Robra Date: Tue, 25 Jun 2019 17:03:28 +0200 Subject: first version of dnbd3-ng --- .gitignore | 9 + CMakeLists.txt | 240 ++++ COPYING | 340 ++++++ Kbuild.in | 2 + LOCKS | 80 ++ build.sh | 6 + cmake/FindFuse.cmake | 30 + cmake/FindJansson.cmake | 59 + conf/README.server | 30 + conf/alt-servers | 4 + conf/rpc.acl | 5 + conf/server.conf | 57 + get-version.sh | 22 + pack.sh | 6 + src/bench/connection.c | 133 +++ src/bench/connection.h | 26 + src/bench/helper.c | 37 + src/bench/helper.h | 38 + src/bench/main.c | 154 +++ src/bench/serialize.c | 5 + src/client/client.c | 670 +++++++++++ src/clientconfig.h | 36 + src/config.h | 43 + src/fuse/connection.c | 927 ++++++++++++++ src/fuse/connection.h | 35 + src/fuse/helper.c | 36 + src/fuse/helper.h | 35 + src/fuse/main.c | 420 +++++++ src/fuse/serialize.c | 5 + src/kernel/core.c | 484 ++++++++ src/kernel/dnbd3.h | 86 ++ src/kernel/sysfs.c | 205 ++++ src/kernel/sysfs.h | 45 + src/kernel/utils.c | 41 + src/kernel/utils.h | 29 + src/serialize.c | 84 ++ src/serialize.h | 40 + src/server/altservers.c | 612 ++++++++++ src/server/altservers.h | 30 + src/server/fileutil.c | 128 ++ src/server/fileutil.h | 17 + src/server/globals.c | 321 +++++ src/server/globals.h | 277 +++++ src/server/helper.c | 146 +++ src/server/helper.h | 42 + src/server/image.c | 1794 ++++++++++++++++++++++++++++ src/server/image.h | 63 + src/server/ini.c | 164 +++ src/server/ini.h | 66 + src/server/integrity.c | 274 +++++ src/server/integrity.h | 12 + src/server/locks.c | 306 +++++ src/server/locks.h | 85 ++ src/server/net.c | 731 ++++++++++++ src/server/net.h | 40 + src/server/picohttpparser/README.md | 116 ++ src/server/picohttpparser/picohttpparser.c | 620 ++++++++++ src/server/picohttpparser/picohttpparser.h | 92 ++ src/server/rpc.c | 504 ++++++++ src/server/rpc.h | 10 + src/server/serialize.c | 5 + src/server/server.c | 495 ++++++++ src/server/server.h | 34 + 
src/server/threadpool.c | 126 ++ src/server/threadpool.h | 29 + src/server/uplink.c | 1034 ++++++++++++++++ src/server/uplink.h | 19 + src/server/urldecode.c | 61 + src/server/urldecode.h | 19 + src/serverconfig.h | 56 + src/shared/crc32.c | 621 ++++++++++ src/shared/crc32.h | 9 + src/shared/fdsignal.c | 14 + src/shared/fdsignal.h | 57 + src/shared/fdsignal.inc/eventfd.c | 74 ++ src/shared/fdsignal.inc/pipe64.c | 88 ++ src/shared/fdsignal.inc/pipe_malloc.c | 89 ++ src/shared/log.c | 204 ++++ src/shared/log.h | 65 + src/shared/protocol.h | 159 +++ src/shared/sockhelper.c | 430 +++++++ src/shared/sockhelper.h | 120 ++ src/shared/timing.c | 21 + src/shared/timing.h | 162 +++ src/types.h | 196 +++ src/version.c.in | 4 + src/version.h | 30 + 87 files changed, 15175 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 COPYING create mode 100644 Kbuild.in create mode 100644 LOCKS create mode 100755 build.sh create mode 100644 cmake/FindFuse.cmake create mode 100644 cmake/FindJansson.cmake create mode 100644 conf/README.server create mode 100644 conf/alt-servers create mode 100644 conf/rpc.acl create mode 100644 conf/server.conf create mode 100755 get-version.sh create mode 100755 pack.sh create mode 100644 src/bench/connection.c create mode 100644 src/bench/connection.h create mode 100644 src/bench/helper.c create mode 100644 src/bench/helper.h create mode 100644 src/bench/main.c create mode 100644 src/bench/serialize.c create mode 100644 src/client/client.c create mode 100644 src/clientconfig.h create mode 100644 src/config.h create mode 100644 src/fuse/connection.c create mode 100644 src/fuse/connection.h create mode 100644 src/fuse/helper.c create mode 100644 src/fuse/helper.h create mode 100644 src/fuse/main.c create mode 100644 src/fuse/serialize.c create mode 100644 src/kernel/core.c create mode 100644 src/kernel/dnbd3.h create mode 100644 src/kernel/sysfs.c create mode 100644 src/kernel/sysfs.h create mode 100644 
src/kernel/utils.c create mode 100644 src/kernel/utils.h create mode 100644 src/serialize.c create mode 100644 src/serialize.h create mode 100644 src/server/altservers.c create mode 100644 src/server/altservers.h create mode 100644 src/server/fileutil.c create mode 100644 src/server/fileutil.h create mode 100644 src/server/globals.c create mode 100644 src/server/globals.h create mode 100644 src/server/helper.c create mode 100644 src/server/helper.h create mode 100644 src/server/image.c create mode 100644 src/server/image.h create mode 100644 src/server/ini.c create mode 100644 src/server/ini.h create mode 100644 src/server/integrity.c create mode 100644 src/server/integrity.h create mode 100644 src/server/locks.c create mode 100644 src/server/locks.h create mode 100644 src/server/net.c create mode 100644 src/server/net.h create mode 100644 src/server/picohttpparser/README.md create mode 100644 src/server/picohttpparser/picohttpparser.c create mode 100644 src/server/picohttpparser/picohttpparser.h create mode 100644 src/server/rpc.c create mode 100644 src/server/rpc.h create mode 100644 src/server/serialize.c create mode 100644 src/server/server.c create mode 100644 src/server/server.h create mode 100644 src/server/threadpool.c create mode 100644 src/server/threadpool.h create mode 100644 src/server/uplink.c create mode 100644 src/server/uplink.h create mode 100644 src/server/urldecode.c create mode 100644 src/server/urldecode.h create mode 100644 src/serverconfig.h create mode 100644 src/shared/crc32.c create mode 100644 src/shared/crc32.h create mode 100644 src/shared/fdsignal.c create mode 100644 src/shared/fdsignal.h create mode 100644 src/shared/fdsignal.inc/eventfd.c create mode 100644 src/shared/fdsignal.inc/pipe64.c create mode 100644 src/shared/fdsignal.inc/pipe_malloc.c create mode 100644 src/shared/log.c create mode 100644 src/shared/log.h create mode 100644 src/shared/protocol.h create mode 100644 src/shared/sockhelper.c create mode 100644 
src/shared/sockhelper.h create mode 100644 src/shared/timing.c create mode 100644 src/shared/timing.h create mode 100644 src/types.h create mode 100644 src/version.c.in create mode 100644 src/version.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ef0f43e --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +build/ +.cproject +.project +*.swp +.autotools +.idea +/version.txt +.settings/ +.gdbinit diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..0141b05 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,240 @@ +################################################################################ +# GENERAL # +################################################################################ + +PROJECT(dnbd3 C) +CMAKE_MINIMUM_REQUIRED(VERSION 2.6.2) +IF (CMAKE_BUILD_TYPE STREQUAL "") + SET(CMAKE_BUILD_TYPE Debug) +ENDIF() + +SET(CMAKE_INSTALL_PREFIX "/usr/local" CACHE PATH "Path prefix for system installation") +OPTION(BUILD_FUSE_CLIENT "Build dnbd3 fuse client" ON) +OPTION(BUILD_SERVER "Build dnbd3 server" ON) +OPTION(BUILD_STRESSTEST "Build dnbd3 stress testing tool" OFF) +SET(EXTRA_C_FLAGS "" CACHE STRING "Additional options to pass to compiler") + +OPTION(SERVER_FOR_AFL "Build dnbd3-server for usage with afl-fuzz" OFF) + +# Is there a non-retarded way to check if build type is debug or release? +# When specifying, it is case insensitive, so DeBuG would also enable debug builds, +# but in cmake, we can only do case sensitive matches... 
:/ +string( TOLOWER "${CMAKE_BUILD_TYPE}" bt_lower ) +if (NOT bt_lower MATCHES "^(debug|release)$") + message( FATAL_ERROR "Build type needs to be either Debug or Release" ) +endif() + +message( "Build Type selected: ${CMAKE_BUILD_TYPE}" ) + +IF(CMAKE_SYSTEM_NAME MATCHES "BSD") + message("Detected *BSD System: disable build of Kernel Module.") + SET(BUILD_KERNEL_MODULE False) +ELSE() + OPTION(BUILD_KERNEL_MODULE "Build the dnbd3 Linux kernel module" ON) +ENDIF() + +if(CMAKE_C_COMPILER MATCHES "clang") + message( "Using clang flags." ) + SET(CMAKE_C_FLAGS_DEBUG "-std=c11 -O1 -fno-omit-frame-pointer -g -Wall -Wextra -Wpedantic -Wno-unused-result -D_GNU_SOURCE -D_DEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}") + SET(CMAKE_C_FLAGS_RELEASE "-std=c11 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}") +elseif (CMAKE_C_COMPILER MATCHES "(cc-)|(cc$)") + message( "Using (g)cc flags." ) + SET(CMAKE_C_FLAGS_DEBUG "-std=c11 -O0 -g -Wall -Wextra -Wpedantic -Wconversion -Wno-sign-conversion -D_GNU_SOURCE -D_DEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}") + SET(CMAKE_C_FLAGS_RELEASE "-std=c11 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG -Wno-multichar -fno-strict-aliasing ${EXTRA_C_FLAGS}") +else() + message( FATAL_ERROR "Could not determine compiler type." 
) +endif() +#SET(CMAKE_CXX_FLAGS_DEBUG "-std=c99 -O0 -g -Wall -Wno-unused-result -D_GNU_SOURCE -D_DEBUG") +#SET(CMAKE_CXX_FLAGS_RELEASE "-std=c99 -O2 -Wno-unused-result -D_GNU_SOURCE -DNDEBUG" ) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") + +ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64) +ADD_DEFINITIONS(-DWITH_IPV6) + +FIND_PACKAGE(Threads) + +SET(DO_ABORT False) + +message( " *************************************************" ) +if(BUILD_FUSE_CLIENT) + FIND_PACKAGE(Fuse) + if(NOT FUSE_FOUND) + message( " *** No fuse dev libs found, can't build dnbd3-fuse" ) + SET(DO_ABORT True) + endif() + if(NOT THREADS_FOUND) + message( " *** No threads found, can't build dnbd3-fuse" ) + SET(DO_ABORT True) + endif() +endif() +if(BUILD_SERVER) + FIND_PACKAGE(Jansson) + if(NOT THREADS_FOUND) + message( " *** No threads found, can't build dnbd3-server" ) + SET(DO_ABORT True) + endif() + if(NOT JANSSON_FOUND) + message( " *** No jansson lib found, can't build dnbd3-server" ) + SET(DO_ABORT True) + endif() +endif() +if(BUILD_STRESSTEST) + if(NOT THREADS_FOUND) + message( " *** No threads found, can't build dnbd3-bench" ) + SET(DO_ABORT True) + endif() +endif() +message( " *************************************************" ) +if(DO_ABORT) + message( FATAL_ERROR "Aborting." 
) +endif() + +#SET(FUSE_INCLUDE_DIR "") +#SET(JANSSON_INCLUDE_DIR "") + +################################################################################ +# VERSION HEADER # +################################################################################ + +FILE(WRITE ${CMAKE_BINARY_DIR}/version.cmake +"EXECUTE_PROCESS( + COMMAND \${CMD} + OUTPUT_VARIABLE VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + CONFIGURE_FILE(\${SRC} \${DST} @ONLY) +") +ADD_CUSTOM_TARGET( + version + ${CMAKE_COMMAND} -D SRC=${CMAKE_SOURCE_DIR}/src/version.c.in + -D DST=${CMAKE_BINARY_DIR}/version.c + -D CMD=${CMAKE_SOURCE_DIR}/get-version.sh + -P ${CMAKE_BINARY_DIR}/version.cmake +) + +## This is required if you're not building the kernel module +## TODO: Find a nicer way to avoid parent includes, +## especially the ../version.h -> version.c -> version.h cycle +FILE(GLOB COMMON_HEADER_FILES src/*.h) +FOREACH(COMMON_HEADER_FILE ${COMMON_HEADER_FILES}) + CONFIGURE_FILE(${COMMON_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY) +ENDFOREACH( COMMON_HEADER_FILE ) + + +################################################################################ +# CLIENT # +################################################################################ + +if(BUILD_KERNEL_MODULE) + INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR}) + FILE(GLOB_RECURSE CLIENT_SRCS src/client/*.c) + ADD_EXECUTABLE(dnbd3-client ${CLIENT_SRCS}) + TARGET_LINK_LIBRARIES(dnbd3-client) + ADD_DEPENDENCIES(dnbd3-client version) + INSTALL(TARGETS dnbd3-client RUNTIME DESTINATION sbin) +ENDIF() + + +################################################################################ +# SERVER # +################################################################################ + +if(BUILD_SERVER) + IF(SERVER_FOR_AFL) + message(" ######################## Building server for AFL mode - will be useless otherwise!") + ADD_DEFINITIONS(-DAFL_MODE) + ENDIF() + INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${JANSSON_INCLUDE_DIR}) + 
FILE(GLOB SERVER_SRCS src/server/*.c src/shared/*.c src/server/picohttpparser/*.c) + ADD_EXECUTABLE(dnbd3-server ${SERVER_SRCS}) + TARGET_LINK_LIBRARIES(dnbd3-server ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES}) + if(UNIX AND NOT APPLE) + target_link_libraries(dnbd3-server rt) + endif() + ADD_DEPENDENCIES(dnbd3-server version) + INSTALL(TARGETS dnbd3-server RUNTIME DESTINATION sbin) +endif() + + + +################################################################################ +# FUSE # +################################################################################ + +if(BUILD_FUSE_CLIENT) + INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR} ${CMAKE_BINARY_DIR} ${FUSE_INCLUDE_DIR}) + FILE(GLOB FUSE_SRCS src/fuse/*.c src/shared/*.c) + ADD_EXECUTABLE(dnbd3-fuse ${FUSE_SRCS}) + TARGET_LINK_LIBRARIES(dnbd3-fuse ${CMAKE_THREAD_LIBS_INIT} ${FUSE_LIBRARIES}) + ADD_DEPENDENCIES(dnbd3-fuse version) + INSTALL(TARGETS dnbd3-fuse RUNTIME DESTINATION bin) +endif() + +################################################################################ +# STRESSTEST # +################################################################################ + +if(BUILD_STRESSTEST) + FILE(GLOB BENCH_SRCS src/bench/*.c src/shared/*.c) + ADD_EXECUTABLE(dnbd3-bench ${BENCH_SRCS}) + TARGET_LINK_LIBRARIES(dnbd3-bench ${CMAKE_THREAD_LIBS_INIT}) + ADD_DEPENDENCIES(dnbd3-bench version) + INSTALL(TARGETS dnbd3-bench RUNTIME DESTINATION bin) +endif() + +################################################################################ +# MODULE # +################################################################################ + +IF(BUILD_KERNEL_MODULE) + SET(MODULE_NAME dnbd3) + SET(MODULE_FILE ${MODULE_NAME}.ko) + FILE(GLOB MODULE_SOURCE_FILES src/kernel/*.c src/serialize.c) + FILE(GLOB MODULE_HEADER_FILES src/kernel/*.h) + + SET(KERNEL_DIR "" CACHE PATH "Path to kernel sources to compile against") + IF(KERNEL_DIR STREQUAL "") + SET(KERNEL_DIR "/lib/modules/${CMAKE_SYSTEM_VERSION}/build") + ENDIF() + + 
SET(KBUILD_COMMAND ${CMAKE_MAKE_PROGRAM} -C ${KERNEL_DIR} + M=${CMAKE_BINARY_DIR} modules + ) + + CONFIGURE_FILE(Kbuild.in ${CMAKE_BINARY_DIR}/Kbuild) + + FOREACH(MODULE_SOURCE_FILE ${MODULE_SOURCE_FILES}) + CONFIGURE_FILE(${MODULE_SOURCE_FILE} ${CMAKE_BINARY_DIR} COPYONLY) + ENDFOREACH( MODULE_SOURCE_FILE ) + + FOREACH(MODULE_HEADER_FILE ${MODULE_HEADER_FILES}) + CONFIGURE_FILE(${MODULE_HEADER_FILE} ${CMAKE_BINARY_DIR} COPYONLY) + ENDFOREACH( MODULE_HEADER_FILE ) + + ADD_CUSTOM_COMMAND( + OUTPUT ${CMAKE_BINARY_DIR}/${MODULE_FILE} + COMMAND ${KBUILD_COMMAND} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS ${MODULE_SOURCE_FILES} Kbuild.in + VERBATIM + ) + + ADD_CUSTOM_TARGET(${MODULE_NAME} ALL DEPENDS ${CMAKE_BINARY_DIR}/${MODULE_FILE}) + + INSTALL(FILES ${CMAKE_BINARY_DIR}/${MODULE_NAME}.ko + DESTINATION /lib/modules/${CMAKE_SYSTEM_VERSION}/kernel/drivers/block + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ + ) + + INSTALL(CODE "EXECUTE_PROCESS(COMMAND depmod -a)") +ENDIF() + + +# +# Other install files +# + +FILE(GLOB conf_files "${CMAKE_CURRENT_SOURCE_DIR}/conf/*") +INSTALL(FILES ${conf_files} DESTINATION /etc/dnbd3-server/sample/) + diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. 
This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. 
We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) 
+ +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. 
Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. 
It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Kbuild.in b/Kbuild.in new file mode 100644 index 0000000..ec8b830 --- /dev/null +++ b/Kbuild.in @@ -0,0 +1,2 @@ +obj-m := ${MODULE_NAME}.o +${MODULE_NAME}-objs += core.o sysfs.o utils.o diff --git a/LOCKS b/LOCKS new file mode 100644 index 0000000..935dadb --- /dev/null +++ b/LOCKS @@ -0,0 +1,80 @@ +Some notes about locking in dnbd3 + +The order of acquiring multiple locks is +VERY IMPORTANT, as you'll produce a possible deadlock +if you do it in the wrong order. +Take very good care of locking order if you have lots +of functions that call each other. You might lose +track of what's going on. ;) + +===== FUSE ===== +mutexInit +newAltLock +altLock +connection.sendMutex +requests.lock + +===== SERVER ===== +This is a list of used locks, in the order they +have to be acquired if you must hold multiple locks: +remoteCloneLock | reloadLock +_clients_lock +_clients[].lock +integrityQueueLock +_images_lock +_images[].lock +pendingLockConsume +pendingLockProduce +uplink.queueLock +altServersLock +client.sendMutex +client.statsLock +statisticsSentLock +statisticsReceivedLock +uplink.rttLock + +If you need to lock multiple clients/images/... at once, +lock the client with the lowest array index first. + +If the program logic would require you to acquire the +locks in a different order, you HAVE TO rework the +code. +For example, if you hold the lock for client 10 and +you need to look up some other client.
You MUST NOT +simply fetch the _clients_lock now and then iterate +over the clients until you find the one you need, +as it violates the above order to first lock on the +clients array and then the clients lock. +Instead, you need to release client 10's lock, +then lock on _clients_lock and iterate over the +clients. Now you check if you either encounter +the client you originally held the lock on, or +the client you are looking for. You immediately +lock on those two. You can then release the +_clients_lock and work with both clients. +pseudo code: + +// client10 is assumed to be a pointer to +// a client, which happens to be at index 10 +lock (client10->lock); +.... +// oh, i need another client +unlock(client10->lock); +lock(_clients_lock); +client clientA = NULL, clientB = NULL; +for (i = 0; i < _num_clients; ++i) { + if (client[i] == client10) { + clientA = client[i]; + lock(clientA.lock); + } else if (client[i].something == ) { + clientB = client[i]; + lock(clientB.lock); + } +} +unlock(_clients_lock); +if (clientA && clientB) { // Make sure we actually found both! + // DO something important with both clients +} +if (clientA) unlock(clientA.lock); +if (clientB) unlock(clientB.lock); + diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..6726a86 --- /dev/null +++ b/build.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +mkdir -p build +cd build/ +cmake .. +make diff --git a/cmake/FindFuse.cmake b/cmake/FindFuse.cmake new file mode 100644 index 0000000..b9c6f91 --- /dev/null +++ b/cmake/FindFuse.cmake @@ -0,0 +1,30 @@ +# - Find fuse +# Find the native fuse includes and library +# +# FUSE_INCLUDE_DIR - where to find fuse/fuse.h. +# FUSE_LIBRARIES - List of libraries when using fuse. +# FUSE_FOUND - True if fuse found. 
+ + +IF (FUSE_INCLUDE_DIR) + # Already in cache, be silent + SET(FUSE_FIND_QUIETLY TRUE) +ENDIF (FUSE_INCLUDE_DIR) + +FIND_PATH(FUSE_INCLUDE_DIR fuse/fuse.h) + +SET(FUSE_NAMES fuse) +FIND_LIBRARY(FUSE_LIBRARY NAMES ${FUSE_NAMES} ) + +# handle the QUIETLY and REQUIRED arguments and set FUSE_FOUND to TRUE if +# all listed variables are TRUE +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(FUSE REQUIRED FUSE_LIBRARY FUSE_INCLUDE_DIR) + +IF(FUSE_FOUND) + SET( FUSE_LIBRARIES ${FUSE_LIBRARY} ) +ELSE(FUSE_FOUND) + SET( FUSE_LIBRARIES ) +ENDIF(FUSE_FOUND) + +MARK_AS_ADVANCED( FUSE_LIBRARY FUSE_INCLUDE_DIR ) diff --git a/cmake/FindJansson.cmake b/cmake/FindJansson.cmake new file mode 100644 index 0000000..3225923 --- /dev/null +++ b/cmake/FindJansson.cmake @@ -0,0 +1,59 @@ +# - Try to find Jansson +# Once done this will define +# +# JANSSON_FOUND - system has Jansson +# JANSSON_INCLUDE_DIRS - the Jansson include directory +# JANSSON_LIBRARIES - Link these to use Jansson +# +# Copyright (c) 2011 Lee Hambley +# +# Redistribution and use is allowed according to the terms of the New +# BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+# + +if (JANSSON_LIBRARIES AND JANSSON_INCLUDE_DIRS) + # in cache already + set(JANSSON_FOUND TRUE) +else (JANSSON_LIBRARIES AND JANSSON_INCLUDE_DIRS) + find_path(JANSSON_INCLUDE_DIR + NAMES + jansson.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ) + +find_library(JANSSON_LIBRARY + NAMES + jansson + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ) + +set(JANSSON_INCLUDE_DIRS + ${JANSSON_INCLUDE_DIR} + ) + +if (JANSSON_LIBRARY) + set(JANSSON_LIBRARIES + ${JANSSON_LIBRARIES} + ${JANSSON_LIBRARY} + ) +endif (JANSSON_LIBRARY) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(Jansson DEFAULT_MSG + JANSSON_LIBRARIES JANSSON_INCLUDE_DIRS) + + # show the JANSSON_INCLUDE_DIRS and JANSSON_LIBRARIES variables only in the advanced view + mark_as_advanced(JANSSON_INCLUDE_DIRS JANSSON_LIBRARIES) + +endif (JANSSON_LIBRARIES AND JANSSON_INCLUDE_DIRS) + + diff --git a/conf/README.server b/conf/README.server new file mode 100644 index 0000000..285758b --- /dev/null +++ b/conf/README.server @@ -0,0 +1,30 @@ +Configuration for dnbd3-server + +The server requires a config directory. +Start it like so: ./dnbd3-server -c ./my-config/ + +There are two files in that dir + +== alt-servers == +List of known alt-servers for this server. +Format: +[PREFIX] [Comment] + +Prefix can be: ++ - Only report server to clients as alt-server, but don't use for replication +- - Only use server for replication, but don't advertise to clients +No prefix means server will be advertised to clients and is used for replication + +If you're not running in proxy mode, this file won't do much for you + +== server.conf == + +Main configuration file. Ini format. 
+ +[dnbd3] +basePath=/srv/openslx/dnbd3 # virtual root of image files +serverPenalty=1234 # artificial acceptance delay for incoming server connections (µs) +clientPenalty=2345 # artificial acceptance delay for incoming client connection (µs) +isProxy=true # enable proxy mode - will try to replicate from alt-servers if a client requests unknown image +uplinkTimeout=1250 # r/w timeout for connections to uplink servers + diff --git a/conf/alt-servers b/conf/alt-servers new file mode 100644 index 0000000..fd2f2ec --- /dev/null +++ b/conf/alt-servers @@ -0,0 +1,4 @@ +192.168.100.10 Some alt server ++192.168.100.100 My first alt server that will not be used for replication +-192.168.100.50 Super secret alt server that will be used for replication, but clients don't know about it + diff --git a/conf/rpc.acl b/conf/rpc.acl new file mode 100644 index 0000000..5167ae3 --- /dev/null +++ b/conf/rpc.acl @@ -0,0 +1,5 @@ +# Everything from localhost +127.0.0.0/8 ALL +# Some info reading for another machine +132.230.8.113 STATS CLIENT_LIST IMAGE_LIST + diff --git a/conf/server.conf b/conf/server.conf new file mode 100644 index 0000000..2f43247 --- /dev/null +++ b/conf/server.conf @@ -0,0 +1,57 @@ +[dnbd3] +; port to listen on (default: 5003) +listenPort=5003 +; relative root directory for images, ending in .r[1-9][0-9]* +basePath=/mnt/storage/dnbd3 +; artificial connection delay for connecting servers +serverPenalty=100000 +; artificial connection delay for connecting clients +clientPenalty=0 +; is this server a proxy? if true, requests for non-existing images will be relayed to known alt-servers +isProxy=true +; if proxy is true and an image is incomplete, should idle bandwidth be used to replicate missing blocks? +backgroundReplication=true +; minimum amount of connected clients for background replication to kick in +bgrMinClients=0 +; if isProxy==true and another proxy requests an image that we don't have, should we ask our alt-servers for it?
+lookupMissingForProxy=true +; create sparse files instead of preallocating; ignored if backgroundReplication=true -- only recommended if cache space is small +sparseFiles=false +; if true (which is the default), images will automatically be removed from the list if they can't be accessed +removeMissingImages=true +; timeout in ms for send/recv on connections to uplink servers (used for replication) +uplinkTimeout=1250 +; timeout in ms for send/recv on connections to clients (using an image on this server) +clientTimeout=15000 +; set this to true to close handles of unused images after some timeout +closeUnusedFd=false +; set this to true to load files without the .r[0-9]+ extension too, assuming RID=1 +vmdkLegacyMode=false + +[limits] +maxClients=2000 +maxImages=1000 +maxPayload=9M +maxReplicationSize=150G + +; Log related config +[logging] +; log file path and name +; comment out to disable logging to file +; protip: use SIGUSR2 to reopen log file +file=./dnbd3.log +; which type of messages to log to file +fileMask=ERROR WARNING MINOR INFO DEBUG1 +; which to log to console (stdout) +consoleMask=ERROR WARNING MINOR INFO +; Valid types (warning: specifying invalid types will not yield an error!) +; ERROR Fatal error, server will terminate +; WARNING Major issue, something is broken but keep running +; MINOR Minor issue, more of a hickup than serious problem +; INFO Informational message +; DEBUG1 Debug information, used for medium verbosity +; DEBUG2 Used for debug messages that would show up a lot +; +; Whether timestamps should be output to console too (or just to file if false) +consoleTimestamps=false + diff --git a/get-version.sh b/get-version.sh new file mode 100755 index 0000000..1d4a8cb --- /dev/null +++ b/get-version.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +# Always create version string for repository this script lies in, +# not the cwd... 
Makes usage easier in cmake +ARG0="$0" +SELF="$(readlink -f "${ARG0}")" +ROOT_DIR="$(dirname "${SELF}")" +cd "$ROOT_DIR" + +if [ -d .git ]; then + [ -n "$(git diff)" ] && MODDED='+MOD' + echo $(git describe)$MODDED, branch $(git rev-parse --abbrev-ref HEAD), built "$(date +%Y-%m-%d)" + exit 0 +fi + +if [ -f "version.txt" ]; then + cat "version.txt" + exit 0 +fi + +echo "-unknown-" + diff --git a/pack.sh b/pack.sh new file mode 100755 index 0000000..9cbe5c4 --- /dev/null +++ b/pack.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +./get-version.sh > version.txt +tar ckzf dnbd3.tar.gz src cmake CMakeLists.txt get-version.sh version.txt +rm -- version.txt + diff --git a/src/bench/connection.c b/src/bench/connection.c new file mode 100644 index 0000000..129ae3c --- /dev/null +++ b/src/bench/connection.c @@ -0,0 +1,133 @@ +#include "connection.h" +#include "helper.h" +#include "../config.h" +#include "../shared/protocol.h" +#include "../shared/fdsignal.h" +#include "../shared/sockhelper.h" +#include "../shared/log.h" + +#include +#include +#include +#include +#include +#include + +/* Constants */ +static const size_t SHORTBUF = 100; +#define SOCKET_KEEPALIVE_TIMEOUT (3) +#define MAX_ALTS (8) +#define MAX_HOSTS_PER_ADDRESS (2) +// If a server wasn't reachable this many times, we slowly start skipping it on measurements +static const int FAIL_BACKOFF_START_COUNT = 8; +#define RTT_COUNT (4) + +/* Module variables */ + +// Init guard +static bool connectionInitDone = false; +static bool keepRunning = true; + +static struct { + int sockFd; + pthread_mutex_t sendMutex; + dnbd3_signal_t* panicSignal; + dnbd3_host_t currentServer; + uint64_t startupTime; +} connection; + +// Known alt servers +typedef struct _alt_server { + dnbd3_host_t host; + int consecutiveFails; + int rtt; + int rtts[RTT_COUNT]; + int rttIndex; + int bestCount; +} alt_server_t; +alt_server_t altservers[MAX_ALTS]; +dnbd3_server_entry_t newservers[MAX_ALTS]; +pthread_spinlock_t altLock; + +bool connection_init_n_times( + 
const char *hosts, + const char *lowerImage, + const uint16_t rid, + int ntimes, + BenchCounters* counters, + bool closeSockets + ) { + for (int run_i = 0; run_i < ntimes; ++run_i) { + counters->attempts++; + + printf("."); + int sock = -1; + char host[SHORTBUF]; + serialized_buffer_t buffer; + uint16_t remoteVersion, remoteRid; + char *remoteName; + uint64_t remoteSize; + + if ( !connectionInitDone && keepRunning ) { + dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS]; + const char *current, *end; + int altIndex = 0; + memset( altservers, 0, sizeof altservers ); + connection.sockFd = -1; + current = hosts; + do { + // Get next host from string + while ( *current == ' ' ) current++; + end = strchr( current, ' ' ); + size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1); + if ( len > SHORTBUF ) len = SHORTBUF; + snprintf( host, len, "%s", current ); + int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS ); + for ( int i = 0; i < newHosts; ++i ) { + if ( altIndex >= MAX_ALTS ) + break; + altservers[altIndex].host = tempHosts[i]; + altIndex += 1; + } + current = end + 1; + } while ( end != NULL && altIndex < MAX_ALTS ); + logadd( LOG_INFO, "Got %d servers from init call", altIndex ); + // Connect + for ( int i = 0; i < altIndex; ++i ) { + if ( altservers[i].host.type == 0 ) + continue; + // Try to connect + sock = sock_connect( &altservers[i].host, 500, SOCKET_KEEPALIVE_TIMEOUT * 1000 ); + if ( sock == -1 ) { + counters->fails++; + logadd( LOG_ERROR, "Could not connect to host" ); + } else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) { + counters->fails++; + logadd( LOG_ERROR, "Could not send select image" ); + } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) { + counters->fails++; + logadd( LOG_ERROR, "Could not read select image reply (%d)", errno ); + } else if ( rid != 0 && rid != remoteRid ) { + counters->fails++; + logadd( LOG_ERROR, "rid mismatch" ); + 
} else { + counters->success++; + break; + } + // Failed + logadd( LOG_DEBUG1, "Server does not offer requested image... " ); + if ( sock != -1 ) { + close( sock ); + sock = -1; + } + } + if ( sock != -1 ) { + // connectionInitDone = true; + if (closeSockets) { + close( sock ); + } + } + } + } + return true; +} diff --git a/src/bench/connection.h b/src/bench/connection.h new file mode 100644 index 0000000..9cb59ef --- /dev/null +++ b/src/bench/connection.h @@ -0,0 +1,26 @@ +#ifndef _CONNECTION_H_ +#define _CONNECTION_H_ + +#include "../shared/fdsignal.h" +#include +#include +#include "helper.h" + +struct _dnbd3_async; + +typedef struct _dnbd3_async { + struct _dnbd3_async *next; // Next in this linked list (provate field, not set by caller) + char* buffer; // Caller-provided buffer to be filled + uint64_t offset; + uint32_t length; + dnbd3_signal_t* signal; // Used to signal the caller + bool finished; // Will be set to true if the request has been handled + bool success; // Will be set to true if the request succeeded +} dnbd3_async_t; + + +bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, BenchCounters* counters, bool closeSockets); + +bool connection_init(const char *hosts, const char *image, const uint16_t rid); + +#endif /* CONNECTION_H_ */ diff --git a/src/bench/helper.c b/src/bench/helper.c new file mode 100644 index 0000000..c89b614 --- /dev/null +++ b/src/bench/helper.c @@ -0,0 +1,37 @@ +#include "helper.h" + +#include +#include +#include + +//BenchCounters benchC = { .attempts = 0, .success = 0, .fails = 0}; + +void printLog( log_info *info ) +{ + FILE *logFile; + + // Create logfile + + logFile = fopen( "log.txt", "w" ); + if ( logFile == NULL ) { + printf( "Error creating/opening log.txt\n" ); + return; + } + + //rewind(file); + fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", ( uint64_t )( info->imageSize/ ( 1024ll*1024ll ) ) ); + fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", ( uint64_t )( 
info->receivedBytes/ ( 1024ll*1024ll ) ) ); + fprintf( logFile, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount ); + fprintf( logFile, "Blocksize: 4KiB\n\n" ); + fprintf( logFile, "Block access count:\n" ); + + uint64_t i = 0; + for ( ; i < info->imageBlockCount; i++ ) { + if ( i % 50 == 0 ) { + fprintf( logFile, "\n" ); + } + fprintf( logFile, "%i ", ( int ) info->blockRequestCount[i] ); + } + fprintf( logFile, "\n" ); + fclose( logFile ); +} diff --git a/src/bench/helper.h b/src/bench/helper.h new file mode 100644 index 0000000..8342a79 --- /dev/null +++ b/src/bench/helper.h @@ -0,0 +1,38 @@ +#ifndef IMAGEHELPER_H +#define IMAGEHELPER_H + +#include "../types.h" + +#include +#include +#include +#include +#include + +typedef struct log_info { + uint64_t imageSize; + uint64_t receivedBytes; + uint64_t imageBlockCount; + uint8_t *blockRequestCount; +} log_info; + + +typedef struct BenchCounters { + int attempts; + int success; + int fails; +} BenchCounters; + + +typedef struct BenchThreadData { + BenchCounters* counter; + char* server_address; + char * image_name; + int runs; + int threadNumber; + bool closeSockets; +} BenchThreadData; + + + +#endif diff --git a/src/bench/main.c b/src/bench/main.c new file mode 100644 index 0000000..2f32dbf --- /dev/null +++ b/src/bench/main.c @@ -0,0 +1,154 @@ +/* +* Butchered from the dnbd3-fuse by C.K. +**/ + +#include "connection.h" +#include "helper.h" +#include "../shared/protocol.h" +#include "../shared/log.h" + +#include +#include +#include +#include +#include +#include + +#define debugf(...) 
do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0) + + +/* Debug/Benchmark variables */ +static bool useDebug = false; + + +static void printUsage(char *argv0, int exitCode) +{ + printf( "Usage: %s [--debug] --host --image [--rid revision]\n", argv0 ); + printf( "Or: %s [-d] -h -i [-r revision]\n", argv0 ); + printf( " -h --host List of space separated hosts to use\n" ); + printf( " -i --image Remote image name to request\n" ); + printf( " -r --rid Revision to use (omit or pass 0 for latest)\n" ); + printf( " -n --nruns Number of connection attempts per thread\n" ); + printf( " -t --threads number of threads\n" ); + printf( " -l --log Write log to given location\n" ); + printf( " -d --debug Don't fork and print debug output (fuse > stderr, dnbd3 > stdout)\n" ); + // // fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL ); + exit( exitCode ); +} + +static const char *optString = "h:i:n:t:cHvVd"; +static const struct option longOpts[] = { + { "host", required_argument, NULL, 'h' }, + { "image", required_argument, NULL, 'i' }, + { "nruns", required_argument, NULL, 'n' }, + { "threads", required_argument, NULL, 't' }, + { "help", no_argument, NULL, 'H' }, + { "version", no_argument, NULL, 'v' }, + { 0, 0, 0, 0 } +}; + + +static void printBenchCounters(BenchCounters* c) { + printf ("Attempts:\t%d\n", c->attempts); + printf ("Success :\t%d\n", c->success); + printf ("Fails :\t%d\n", c->fails); +} + + +void* runBenchThread(void* t) { + BenchThreadData* data = t; + connection_init_n_times( + data->server_address, + data->image_name, + 0, + data->runs, + data->counter, + data->closeSockets); + printf("Thread #%d finished\n", data->threadNumber); + return NULL; +} + +int main(int argc, char *argv[]) +{ + char *server_address = NULL; + char *image_Name = NULL; + int opt, lidx; + + bool closeSockets = false; + int n_runs = 100; + int n_threads = 1; + + if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) { + printUsage( argv[0], 0
); + } + + while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) { + switch ( opt ) { + case 'h': + server_address = optarg; + break; + case 'i': + image_Name = optarg; + break; + case 'n': + n_runs = atoi(optarg); + break; + case 't': + n_threads = atoi(optarg); + break; + case 'c': + closeSockets = true; + break; + case 'H': + printUsage( argv[0], 0 ); + break; + case 'd': + useDebug = true; + break; + default: + printUsage( argv[0], EXIT_FAILURE ); + } + } + + printf("Welcome to dnbd3 benchmark tool\n"); + + /* all counters */ + BenchCounters counters[n_threads]; + BenchThreadData threadData[n_threads]; + pthread_t threads[n_threads]; + + /* create all threads */ + for (int i = 0; i < n_threads; i++) { + BenchCounters tmp1 = {0,0,0}; + counters[i] = tmp1; + BenchThreadData tmp2 = { + &(counters[i]), + server_address, + image_Name, + n_runs, + i, + closeSockets}; + threadData[i] = tmp2; + pthread_create(&(threads[i]), NULL, runBenchThread, &(threadData[i])); + } + + + /* join all threads*/ + for (int i = 0; i < n_threads; ++i) { + pthread_join(threads[i], NULL); + } + + /* print out all counters & sum up */ + BenchCounters total = {0,0,0}; + for (int i = 0; i < n_threads; ++i) { + printf("#### Thread %d\n", i); + printBenchCounters(&counters[i]); + total.attempts += counters[i].attempts; + total.success += counters[i].success; + total.fails += counters[i].fails; + } + /* print out summary */ + printf("\n\n#### SUMMARY\n"); + printBenchCounters(&total); + printf("\n-- End of program"); +} diff --git a/src/bench/serialize.c b/src/bench/serialize.c new file mode 100644 index 0000000..4934132 --- /dev/null +++ b/src/bench/serialize.c @@ -0,0 +1,5 @@ +#include +#include +#include + +#include "../serialize.c" diff --git a/src/client/client.c b/src/client/client.c new file mode 100644 index 0000000..37f0558 --- /dev/null +++ b/src/client/client.c @@ -0,0 +1,670 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * 
Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#include "../clientconfig.h" +#include "../types.h" +#include "../version.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SOCK_PATH "/var/run/dnbd3.socket" +#define SOCK_BUFFER 1000 +#define DEV_LEN 15 +#define MAX_DEVS 50 + + +static int openDevices[MAX_DEVS]; +static const char *optString = "f:h:i:r:d:a:cs:HV?k"; +static const struct option longOpts[] = { + { "file", required_argument, NULL, 'f' }, + { "host", required_argument, NULL, 'h' }, + { "image", required_argument, NULL, 'i' }, + { "rid", required_argument, NULL, 'r' }, + { "device", required_argument, NULL, 'd' }, + { "ahead", required_argument, NULL, 'a' }, + { "close", no_argument, NULL, 'c' }, + { "switch", required_argument, NULL, 's' }, + { "add", required_argument, NULL, 'adds' }, + { "remove", required_argument, NULL, 'rems' }, + { "help", no_argument, NULL, 'H' }, + { "version", no_argument, NULL, 'V' }, + { "daemon", no_argument, NULL, 'D' }, + { "nofork", no_argument, NULL, 'N' }, + { "kill", no_argument, NULL, 'k' }, + { "user", required_argument, NULL, 'U' }, // Only used in daemon mode + { 0, 0, 0, 0 } +}; + +static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg); +static void dnbd3_client_daemon(); +static void dnbd3_daemon_action(int 
client, int argc, char **argv); +static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host); +static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead); +static int dnbd3_daemon_send(int argc, char **argv); +static void dnbd3_print_help(char *argv_0); +static void dnbd3_print_version(); + +/** + * Convert a host and port (network byte order) to printable representation. + * Worst case required buffer len is 48, eg. [1234:1234:1234:1234:1234:1234:1234:1234]:12345 (+ \0) + * Returns true on success, false on error + */ +static char host_to_string(const dnbd3_host_t *host, char *target, size_t targetlen) +{ + // Worst case: Port 5 chars, ':' to separate ip and port 1 char, terminating null 1 char = 7, [] for IPv6 + if ( targetlen < 10 ) return false; + if ( host->type == HOST_IP6 ) { + *target++ = '['; + inet_ntop( AF_INET6, host->addr, target, targetlen - 10 ); + target += strlen( target ); + *target++ = ']'; + } else if ( host->type == HOST_IP4 ) { + inet_ntop( AF_INET, host->addr, target, targetlen - 8 ); + target += strlen( target ); + } else { + snprintf( target, targetlen, "", (int)host->type ); + return false; + } + *target = '\0'; + if ( host->port != 0 ) { + // There are still at least 7 bytes left in the buffer, port is at most 5 bytes + ':' + '\0' = 7 + snprintf( target, 7, ":%d", (int)ntohs( host->port ) ); + } + return true; +} + + +/** + * Parse IPv4 or IPv6 address in string representation to a suitable format usable by the BSD socket library + * @string eg. "1.2.3.4" or "2a01::10:5", optially with port appended, eg "1.2.3.4:6666" or "[2a01::10:5]:6666" + * @af will contain either HOST_IP4 or HOST_IP6 + * @addr will contain the address in network representation + * @port will contain the port in network representation, defaulting to #define PORT if none was given + * returns 1 on success, 0 in failure. contents of af, addr and port are undefined in the latter case + * !! 
Contents of @string might be modified by this function !! + */ +static char parse_address(char *string, dnbd3_host_t *host) +{ + struct in_addr v4; + struct in6_addr v6; + + // Try IPv4 without port + if ( 1 == inet_pton( AF_INET, string, &v4 ) ) { + host->type = HOST_IP4; + memcpy( host->addr, &v4, 4 ); + host->port = htons( PORT ); + return 1; + } + // Try IPv6 without port + if ( 1 == inet_pton( AF_INET6, string, &v6 ) ) { + host->type = HOST_IP6; + memcpy( host->addr, &v6, 16 ); + host->port = htons( PORT ); + return 1; + } + + // Scan for port + char *portpos = NULL, *ptr = string; + while ( *ptr ) { + if ( *ptr == ':' ) + portpos = ptr; + ++ptr; + } + if ( portpos == NULL ) return 0; // No port in string + // Consider IP being surrounded by [ ] + if ( *string == '[' && *(portpos - 1) == ']' ) { + ++string; + *(portpos - 1) = '\0'; + } + *portpos++ = '\0'; + int p = atoi( portpos ); + if ( p < 1 || p > 65535 ) + return 0; // Invalid port + host->port = htons( (uint16_t)p ); + + // Try IPv4 with port + if ( 1 == inet_pton( AF_INET, string, &v4 ) ) { + host->type = HOST_IP4; + memcpy( host->addr, &v4, 4 ); + return 1; + } + // Try IPv6 with port + if ( 1 == inet_pton( AF_INET6, string, &v6 ) ) { + host->type = HOST_IP6; + memcpy( host->addr, &v6, 16 ); + return 1; + } + + // FAIL + return 0; +} + +static int dnbd3_get_ip(char *hostname, dnbd3_host_t *host) +{ + if ( parse_address( hostname, host ) ) return true; + // TODO: Parse port too for host names + struct hostent *hent; + if ( (hent = gethostbyname( hostname )) == NULL ) { + printf( "Unknown host '%s'\n", hostname ); + return false; + } + + if ( hent->h_addrtype == AF_INET ) { + host->type = HOST_IP4; + memcpy( host->addr, hent->h_addr, 4); + } else if (hent->h_addrtype == AF_INET6) { + host->type = HOST_IP6; + memcpy(host->addr, hent->h_addr, 16); + } else { + printf("FATAL: Unknown address type: %d\n", hent->h_addrtype); + return false; + } + host->port = htons( PORT ); + return true; +} + +int main(int 
argc, char *argv[]) +{ + char *dev = NULL; + char host[50]; + + int action = -1; + + dnbd3_ioctl_t msg; + memset( &msg, 0, sizeof(dnbd3_ioctl_t) ); + msg.len = (uint16_t)sizeof(dnbd3_ioctl_t); + msg.read_ahead_kb = DEFAULT_READ_AHEAD_KB; + msg.host.port = htons( PORT ); + msg.host.type = 0; + msg.imgname = NULL; + msg.use_server_provided_alts = true; + + int opt = 0; + int longIndex = 0; + + opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); + + while ( opt != -1 ) { + switch ( opt ) { + case 'f': + break; + case 'h': + if ( !dnbd3_get_ip( optarg, &msg.host ) ) exit( EXIT_FAILURE ); + break; + case 'i': + action = IOCTL_OPEN; + msg.imgname = strdup( optarg ); + break; + case 'r': + msg.rid = atoi( optarg ); + break; + case 'd': + dev = strdup( optarg ); + printf( "Device is %s\n", dev ); + break; + case 'a': + msg.read_ahead_kb = atoi( optarg ); + break; + case 'c': + action = IOCTL_CLOSE; + break; + case 's': + dnbd3_get_ip( optarg, &msg.host ); + action = IOCTL_SWITCH; + break; + case 'adds': + dnbd3_get_ip( optarg, &msg.host ); + action = IOCTL_ADD_SRV; + break; + case 'rems': + dnbd3_get_ip( optarg, &msg.host ); + action = IOCTL_REM_SRV; + break; + case 'H': + dnbd3_print_help( argv[0] ); + break; + case 'V': + dnbd3_print_version(); + break; + case '?': + dnbd3_print_help( argv[0] ); + break; + case 'D': + dnbd3_client_daemon(); + break; + } + opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); + } + + // See if socket exists, if so, try to send to daemon + struct stat st; + if ( stat( SOCK_PATH, &st ) == 0 ) { + if ( dnbd3_daemon_send( argc, argv ) ) exit( 0 ); + printf( "\nFailed.\n" ); + exit( 1 ); + } + + // Direct requests + + // In case the client was invoked as a suid binary, change uid back to original user + // when being used for direct ioctl, so that the device's permissions are taken into account + if ( geteuid() == 0 ) { + setgid( getgid() ); + setuid( getuid() ); + } + + host_to_string( &msg.host, host, 50 ); + + // 
close device + if ( action == IOCTL_CLOSE && msg.host.type == 0 && dev && (msg.imgname == NULL )) { + printf( "INFO: Closing device %s\n", dev ); + if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) ) exit( EXIT_SUCCESS ); + printf( "Couldn't close device.\n" ); + exit( EXIT_FAILURE ); + } + + // switch host + if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.host.type != 0 && dev && (msg.imgname == NULL )) { + if ( action == IOCTL_SWITCH ) printf( "INFO: Switching device %s to %s\n", dev, host ); + if ( action == IOCTL_ADD_SRV ) printf( "INFO: %s: adding %s\n", dev, host ); + if ( action == IOCTL_REM_SRV ) printf( "INFO: %s: removing %s\n", dev, host ); + if ( dnbd3_ioctl( dev, action, &msg ) ) exit( EXIT_SUCCESS ); + printf( "Failed! Maybe the device is not connected?\n" ); + exit( EXIT_FAILURE ); + } + + // connect + if ( action == IOCTL_OPEN && msg.host.type != 0 && dev && (msg.imgname != NULL )) { + printf( "INFO: Connecting device %s to %s for image %s\n", dev, host, msg.imgname ); + if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) exit( EXIT_SUCCESS ); + printf( "ERROR: connecting device failed. 
Maybe it's already connected?\n" ); + exit( EXIT_FAILURE ); + } + + dnbd3_print_help( argv[0] ); + exit( EXIT_FAILURE ); +} + +static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg) +{ + const int fd = open( dev, O_WRONLY ); + if ( fd < 0 ) { + printf( "open() for %s failed.\n", dev ); + return false; + } + if ( msg != NULL && msg->imgname != NULL ) msg->imgnamelen = (uint16_t)strlen( msg->imgname ); + const int ret = ioctl( fd, command, msg ); + if ( ret < 0 ) { + printf( "ioctl() failed.\n" ); + } + close( fd ); + return ret >= 0; +} + +static void dnbd3_client_daemon() +{ + int listener, client; + struct sockaddr_un addrLocal, addrRemote; + char buffer[SOCK_BUFFER]; + struct timeval tv; + int done, ret, len; + socklen_t socklen; + + if ( geteuid() != 0 ) { + printf( "Only root can run the dnbd3-client in daemon mode!\n" ); + exit( 1 ); + } + + if ( (listener = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) { + perror( "socket" ); + exit( 1 ); + } + + addrLocal.sun_family = AF_UNIX; + snprintf( addrLocal.sun_path, sizeof(addrLocal.sun_path), "%s", SOCK_PATH ); + unlink( addrLocal.sun_path ); + if ( bind( listener, (struct sockaddr *)&addrLocal, sizeof(addrLocal) ) < 0 ) { + perror( "bind" ); + exit( 1 ); + } + chmod( addrLocal.sun_path, 0600 ); + if ( listen( listener, 5 ) == -1 ) { + perror( "listen" ); + exit( 1 ); + } + + memset( openDevices, -1, sizeof(openDevices) ); + + for (;;) { + socklen = sizeof(addrRemote); + if ( (client = accept( listener, (struct sockaddr *)&addrRemote, &socklen )) == -1 ) { + printf( "accept error %d\n", (int)errno); + sleep( 1 ); + continue; + } + + tv.tv_sec = 1; + tv.tv_usec = 0; + setsockopt( client, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) ); + setsockopt( client, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv) ); + + ret = recv( client, &len, sizeof(len), MSG_WAITALL ); + if ( ret != sizeof(len) || len <= 0 || len + 4 > SOCK_BUFFER ) { // Leave a little room (at least one byte for the appended nullchar) + 
printf( "Error reading length field (ret: %d, len: %d)\n", ret, len ); + close( client ); + continue; + } + done = recv( client, buffer, len, MSG_WAITALL ); + + if ( done != len ) { + printf( "receiving payload from client failed (%d/%d)\n", done, len ); + } else { + buffer[len] = '\0'; + char *pos = buffer, *end = buffer + len; + int argc = 1; + char *argv[20] = { "dnbd3-client" }; + while ( pos < end && argc < 20 ) { + while ( *pos == '\0' ) { + if ( ++pos >= end ) break; + } + if ( pos >= end ) break; + argv[argc++] = pos; + printf("Arg %d: '%s'\n", argc, pos); + while ( *pos != '\0' ) { // This will always be in bounds because of -4 above + if ( ++pos >= end ) break; + } + } + dnbd3_daemon_action( client, argc, argv ); + } + + close( client ); + } +} + +static void dnbd3_daemon_action(int client, int argc, char **argv) +{ + int opt = 0; + int longIndex = 0; + char *host = NULL, *image = NULL, *device = NULL; + int rid = 0, uid = 0, killMe = false, ahead = 512; + int len; + int action = -1; + const char *actionName = NULL; + + optind = 1; + opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); + + while ( opt != -1 ) { + switch ( opt ) { + case 'd': + device = optarg; + break; + case 'h': + host = optarg; + break; + case 'i': + image = optarg; + action = IOCTL_OPEN; + actionName = "Open"; + break; + case 'r': + rid = atoi( optarg ); + break; + case 'U': + uid = atoi( optarg ); + break; + case 'c': + action = IOCTL_CLOSE; + actionName = "Close"; + break; + case 'adds': + action = IOCTL_ADD_SRV; + actionName = "Add Server"; + break; + case 'rems': + action = IOCTL_REM_SRV; + actionName = "Remove Server"; + break; + case 'a': + ahead = atoi( optarg ); + break; + case 'k': + killMe = true; + break; + } + opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); + } + + if ( killMe ) { + if ( uid != 0 ) { + printf( "Ignoring kill request by user %d\n", uid ); + close( client ); + return; + } + printf( "Received kill request; exiting.\n" ); + 
close( client ); + unlink( SOCK_PATH ); + exit( 0 ); + } + + if ( (action == IOCTL_CLOSE || ((action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && host != NULL)) && device != NULL ) { + if ( dnbd3_daemon_ioctl( uid, device, action, actionName, host ) ) { + len = 0; + } else { + len = -1; + } + send( client, &len, sizeof(len), 0 ); + return; + } + if ( action == IOCTL_OPEN && host != NULL && image != NULL && rid >= 0 ) { + device = dnbd3_daemon_open( uid, host, image, rid, ahead ); + if ( device != NULL ) { + len = strlen( device ); + send( client, &len, sizeof(len), 0 ); + send( client, device, len, 0 ); + } else { + len = -1; + send( client, &len, sizeof(len), 0 ); + } + return; + } + printf( "Received a client request I cannot understand.\n" ); +} + +static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host) +{ + int index = -1; + char dev[DEV_LEN]; + if ( strncmp( device, "/dev/dnbd", 9 ) == 0 ) { + index = atoi( device + 9 ); + } else { + index = atoi( device ); + } + dnbd3_ioctl_t msg; + memset( &msg, 0, sizeof(msg) ); + msg.len = (uint16_t)sizeof(msg); + if ( host != NULL ) { + dnbd3_get_ip( host, &msg.host ); + } + if ( index < 0 || index >= MAX_DEVS ) { + printf( "%s request with invalid device id %d\n", actionName, index ); + return false; + } + snprintf( dev, DEV_LEN, "/dev/dnbd%d", index ); + if ( openDevices[index] == -1 ) { + printf( "%s request by %d for closed device %s\n", actionName, uid, dev ); + return true; + } + if ( openDevices[index] != uid ) { + printf( "%s: User %d cannot access %s owned by %d\n", actionName, uid, dev, openDevices[index] ); + return false; + } + if ( dnbd3_ioctl( dev, action, &msg ) ) { + printf( "%s request for device %s of user %d successful\n", actionName, dev, uid ); + openDevices[index] = -1; + return true; + } + printf( "%s: Error on device %s, requested by %d\n", actionName, dev, uid ); + return false; +} + +static char* dnbd3_daemon_open(int uid, char *host, char *image, 
int rid, int readAhead) +{ + int i, sameUser = 0; + struct stat st; + static char dev[DEV_LEN]; + printf( "Opening a device for %s on %s\n", image, host ); + // Check number of open devices + for (i = 0; i < MAX_DEVS; ++i) { + if ( openDevices[i] == uid ) sameUser++; + } + if ( sameUser > 1 ) { + printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser ); + return NULL ; + } + // Find free device + for (i = 0; i < MAX_DEVS; ++i) { + if ( openDevices[i] != -1 ) continue; + snprintf( dev, DEV_LEN, "/dev/dnbd%d", i ); + if ( stat( dev, &st ) == -1 ) { + break; + } + // Open + dnbd3_ioctl_t msg; + msg.len = (uint16_t)sizeof(msg); + if ( !dnbd3_get_ip( host, &msg.host ) ) { + printf( "Cannot parse host address %s\n", host ); + return NULL ; + } + msg.imgname = image; + msg.imgnamelen = strlen( image ); + msg.rid = rid; + msg.use_server_provided_alts = true; + msg.read_ahead_kb = readAhead; + if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) { + openDevices[i] = uid; + printf( "Device %s now occupied by %d\n", dev, uid ); + return dev; + } + printf( "ioctl to open device %s failed, trying next...\n", dev ); + } + // All devices in use + printf( "No more free devices. 
All %d are in use :-(\n", i ); + return NULL ; +} + +static int dnbd3_daemon_send(int argc, char **argv) +{ + const int uid = getuid(); + int s, i, len; + struct sockaddr_un remote; + char buffer[SOCK_BUFFER]; + + if ( (s = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) { + perror( "socket" ); + return false; + } + + remote.sun_family = AF_UNIX; + snprintf( remote.sun_path, sizeof(remote.sun_path), "%s", SOCK_PATH ); + if ( connect( s, (struct sockaddr *)&remote, sizeof(remote) ) == -1 ) { + perror( "connect" ); + close( s ); + return false; + } + // (Re)build argument string into a single one, arguments separated by null chars + char *pos = buffer; + char *end = buffer + SOCK_BUFFER; + pos += snprintf( pos, end - pos, "--user%c%d", (int)'\0', uid ) + 1; + for (i = 1; i < argc && pos < end; ++i) { + pos += snprintf( pos, end - pos, "%s", argv[i] ) + 1; + } + // Send + len = (int)(pos - buffer); + if ( send( s, &len, sizeof(len), 0 ) != sizeof(len) || send( s, buffer, len, 0 ) != len ) { + perror( "Sending request to daemon failed" ); + close( s ); + return false; + } + // Read reply + if ( recv( s, &len, sizeof(len), MSG_WAITALL ) != sizeof(len) ) { + perror( "Reading length-field from daemon failed" ); + close( s ); + return false; + } + if ( len <= 0 ) { + printf( "Daemon returned exit code %d\n", -len ); + close( s ); + exit( -len ); + } + if ( len + 4 > SOCK_BUFFER ) { + printf( "Reply too long (is %d bytes)\n", len ); + close( s ); + return false; + } + if ( recv( s, buffer, len, MSG_WAITALL ) != len ) { + perror( "Reading reply payload from daemon failed" ); + close( s ); + return false; + } + buffer[len] = '\0'; + printf( "%s", buffer ); + return true; +} + +static void dnbd3_print_help(char *argv_0) +{ + printf( "Version: %s\n\n", VERSION_STRING ); + printf( "\nUsage: %s\n" + "\t-h -i [-r ] -d [-a ] || -c -d \n\n", argv_0 ); + printf( "Start the DNBD3 client.\n" ); + //printf("-f or --file \t\t Configuration file (default /etc/dnbd3-client.conf)\n"); + printf( 
"-h or --host \t\t Host running dnbd3-server.\n" ); + printf( "-i or --image \t\t Image name of exported image.\n" ); + printf( "-r or --rid \t\t Release-ID of exported image (default 0, latest).\n" ); + printf( "-d or --device \t\t DNBD3 device name.\n" ); + printf( "-a or --ahead \t\t Read ahead in KByte (default %i).\n", DEFAULT_READ_AHEAD_KB ); + printf( "-c or --close \t\t Disconnect and close device.\n" ); + printf( "-s or --switch \t\t Switch dnbd3-server on device (DEBUG).\n" ); + printf( "-H or --help \t\t Show this help text and quit.\n" ); + printf( "-V or --version \t Show version and quit.\n\n" ); + printf( "\t--daemon \t Run as helper daemon\n" ); + printf( "\t--kill \t Kill running helper daemon\n" ); + printf( "The helper daemon makes it possible for normal users to connect dnbd3 devices.\n" ); + printf( "The client binary needs to be a setuid program for this to work!\n\n" ); +} + +void dnbd3_print_version() +{ + printf( "Version: %s\n", VERSION_STRING ); + exit( EXIT_SUCCESS ); +} diff --git a/src/clientconfig.h b/src/clientconfig.h new file mode 100644 index 0000000..f35f673 --- /dev/null +++ b/src/clientconfig.h @@ -0,0 +1,36 @@ +#ifndef _CLIENTCONFIG_H_ +#define _CLIENTCONFIG_H_ + +// Which is the minimum protocol version the client expects from the server +#define MIN_SUPPORTED_SERVER 2 + +// in seconds if not stated otherwise (MS = milliseconds) +#define SOCKET_TIMEOUT_CLIENT_DATA 2 +#define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1 + +#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse +#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse +#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 
0x7FFFFFF = 134 seconds +// This must be a power of two: +#define RTT_BLOCK_SIZE 4096 + +#define STARTUP_MODE_DURATION 30 +// Interval of several repeating tasks (in seconds) +#define TIMER_INTERVAL_PROBE_STARTUP 4 +#define TIMER_INTERVAL_PROBE_NORMAL 22 +#define TIMER_INTERVAL_PROBE_PANIC 2 +#define TIMER_INTERVAL_KEEPALIVE_PACKET 6 + +// Expect a keepalive response every X seconds +#define SOCKET_KEEPALIVE_TIMEOUT 8 + +// Number of unsuccessful alt_server probes before read errors are reported to the block layer +// (ALL servers will be probed this many times) +// Set to 0 to disable +#define PROBE_COUNT_TIMEOUT 0 + +// ++ Kernel module ++ +#define DEFAULT_READ_AHEAD_KB 512 +#define NUMBER_DEVICES 8 + +#endif diff --git a/src/config.h b/src/config.h new file mode 100644 index 0000000..50336af --- /dev/null +++ b/src/config.h @@ -0,0 +1,43 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef CONFIG_H_ +#define CONFIG_H_ + +// +++++ Network +++++ +// Default port +#define PORT 5003 +#define RPC_PORT (PORT+1) + +// No serialized payload allowed exceeding this many bytes (so actual data from client->server is not affected by this limit!) 
+#define MAX_PAYLOAD 1000 + +// Protocol version should be increased whenever new features/messages are added, +// so either the client or server can run in compatibility mode, or they can +// cancel the connection right away if the protocol has changed too much +#define PROTOCOL_VERSION 3 +// 2017-10-16: Update to v3: Change header to support request hop-counting + +#define NUMBER_SERVERS 8 // Number of alt servers per image/device + +// +++++ Block Device +++++ +#define DNBD3_BLOCK_SIZE ((uint64_t)4096) // NEVER CHANGE THIS OR THE WORLD WILL END! + +#endif /* CONFIG_H_ */ diff --git a/src/fuse/connection.c b/src/fuse/connection.c new file mode 100644 index 0000000..fc9f05b --- /dev/null +++ b/src/fuse/connection.c @@ -0,0 +1,927 @@ +#include "connection.h" +#include "helper.h" +#include "../clientconfig.h" +#include "../shared/protocol.h" +#include "../shared/fdsignal.h" +#include "../shared/sockhelper.h" +#include "../shared/log.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Constants */ +static const size_t SHORTBUF = 100; +#define MAX_ALTS (16) +#define MAX_ALTS_ACTIVE (5) +#define MAX_HOSTS_PER_ADDRESS (2) +// If a server wasn't reachable this many times, we slowly start skipping it on measurements +static const int FAIL_BACKOFF_START_COUNT = 8; +#define RTT_COUNT (4) + +/* Module variables */ + +// Init guard +static bool connectionInitDone = false; +static bool threadInitDone = false; +static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER; +static bool keepRunning = true; +static bool learnNewServers; + +// List of pending requests +static struct { + dnbd3_async_t *head; + dnbd3_async_t *tail; + pthread_spinlock_t lock; +} requests; + +// Connection for the image +static struct { + char *name; + uint16_t rid; + uint64_t size; +} image; + +static struct { + int sockFd; + pthread_mutex_t sendMutex; + dnbd3_signal_t* panicSignal; + dnbd3_host_t currentServer; + ticks startupTime; +} connection; + +// Known alt 
servers +typedef struct _alt_server { + dnbd3_host_t host; + int consecutiveFails; + int rtt; + int rtts[RTT_COUNT]; + int rttIndex; + int bestCount; + int liveRtt; +} alt_server_t; + +static dnbd3_server_entry_t newservers[MAX_ALTS]; +static pthread_mutex_t newAltLock = PTHREAD_MUTEX_INITIALIZER; +static alt_server_t altservers[MAX_ALTS]; +// WR: Use when re-assigning or sorting altservers, i.e. an index in altservers +// changes its meaning (host). Also used for newservers. +// RD: Use when reading the list or modifying individual entries data, like RTT +// and fail count. Isn't super clean as we still might have races here, but mostly +// the code is clean in this regard, so we should only have stale data somewhere +// but nothing nonsensical. +static pthread_rwlock_t altLock = PTHREAD_RWLOCK_INITIALIZER; +#define lock_read pthread_rwlock_rdlock +#define lock_write pthread_rwlock_wrlock +#define unlock_rw pthread_rwlock_unlock + +/* Static methods */ + + +static void* connection_receiveThreadMain(void *sock); +static void* connection_backgroundThread(void *something); + +static void addAltServers(); +static void sortAltServers(); +static void probeAltServers(); +static void switchConnection(int sockFd, alt_server_t *srv); +static void requestAltServers(); +static bool throwDataAway(int sockFd, uint32_t amount); + +static void enqueueRequest(dnbd3_async_t *request); +static dnbd3_async_t* removeRequest(dnbd3_async_t *request); + +bool connection_init(const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew) +{ + int sock = -1; + char host[SHORTBUF]; + size_t hlen; + serialized_buffer_t buffer; + uint16_t remoteVersion, remoteRid; + char *remoteName; + uint64_t remoteSize; + struct sockaddr_storage sa; + socklen_t salen; + poll_list_t *cons = sock_newPollList(); + + timing_setBase(); + pthread_mutex_lock( &mutexInit ); + if ( !connectionInitDone && keepRunning ) { + dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS]; + const char *current, 
*end; + int altIndex = 0; + learnNewServers = doLearnNew; + memset( altservers, 0, sizeof altservers ); + connection.sockFd = -1; + current = hosts; + do { + // Get next host from string + while ( *current == ' ' ) current++; + end = strchr( current, ' ' ); + size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1); + if ( len > SHORTBUF ) len = SHORTBUF; + snprintf( host, len, "%s", current ); + int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS ); + for ( int i = 0; i < newHosts; ++i ) { + if ( altIndex >= MAX_ALTS ) + break; + altservers[altIndex].host = tempHosts[i]; + altIndex += 1; + } + current = end + 1; + } while ( end != NULL && altIndex < MAX_ALTS ); + logadd( LOG_INFO, "Got %d servers from init call", altIndex ); + // Connect + for ( int i = 0; i < altIndex + 5; ++i ) { + if ( i >= altIndex ) { + // Additional iteration - no corresponding slot in altservers, this + // is just so we can make a final calls with longer timeout + sock = sock_multiConnect( cons, NULL, 400, 1000 ); + if ( sock == -2 ) { + logadd( LOG_ERROR, "Could not connect to any host" ); + sock = -1; + break; + } + } else { + if ( altservers[i].host.type == 0 ) + continue; + // Try to connect - 100ms timeout + sock = sock_multiConnect( cons, &altservers[i].host, 100, 1000 ); + } + if ( sock == -2 || sock == -1 ) + continue; + salen = sizeof(sa); + if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) { + logadd( LOG_ERROR, "getpeername on successful connection failed!? 
(errno=%d)", errno ); + close( sock ); + sock = -1; + continue; + } + hlen = sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) ); + logadd( LOG_INFO, "Connected to %.*s", (int)hlen, host ); + if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) { + logadd( LOG_ERROR, "Could not send select image" ); + } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) { + logadd( LOG_ERROR, "Could not read select image reply (%d)", errno ); + } else if ( rid != 0 && rid != remoteRid ) { + logadd( LOG_ERROR, "rid mismatch (want: %d, got: %d)", (int)rid, (int)remoteRid ); + } else { + logadd( LOG_INFO, "Requested: '%s:%d'", lowerImage, (int)rid ); + logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid ); + sock_setTimeout( sock, SOCKET_KEEPALIVE_TIMEOUT * 1000 ); + image.name = strdup( remoteName ); + image.rid = remoteRid; + image.size = remoteSize; + if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &connection.currentServer ) ) { + logadd( LOG_ERROR, "sockaddr to dnbd3_host_t failed!?" 
); + connection.currentServer.type = 0; + } + connection.panicSignal = signal_new(); + timing_get( &connection.startupTime ); + connection.sockFd = sock; + requests.head = NULL; + requests.tail = NULL; + requestAltServers(); + break; + } + // Failed + if ( sock != -1 ) { + close( sock ); + sock = -1; + } + } + if ( sock != -1 ) { + connectionInitDone = true; + } + } + pthread_mutex_unlock( &mutexInit ); + sock_destroyPollList( cons ); + return sock != -1; +} + +bool connection_initThreads() +{ + pthread_mutex_lock( &mutexInit ); + if ( !keepRunning || !connectionInitDone || threadInitDone || connection.sockFd == -1 ) { + pthread_mutex_unlock( &mutexInit ); + return false; + } + bool success = true; + pthread_t thread; + threadInitDone = true; + logadd( LOG_DEBUG1, "Initializing stuff" ); + if ( pthread_mutex_init( &connection.sendMutex, NULL ) != 0 + || pthread_spin_init( &requests.lock, PTHREAD_PROCESS_PRIVATE ) != 0 ) { + logadd( LOG_ERROR, "Mutex or spinlock init failure" ); + success = false; + } else { + if ( pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)connection.sockFd ) != 0 ) { + logadd( LOG_ERROR, "Could not create receive thread" ); + success = false; + } else if ( pthread_create( &thread, NULL, &connection_backgroundThread, NULL ) != 0 ) { + logadd( LOG_ERROR, "Could not create background thread" ); + success = false; + } + } + if ( !success ) { + close( connection.sockFd ); + connection.sockFd = -1; + } + pthread_mutex_unlock( &mutexInit ); + return success; +} + +uint64_t connection_getImageSize() +{ + return image.size; +} + +bool connection_read(dnbd3_async_t *request) +{ + if ( !connectionInitDone ) return false; + pthread_mutex_lock( &connection.sendMutex ); + enqueueRequest( request ); + if ( connection.sockFd != -1 ) { + if ( !dnbd3_get_block( connection.sockFd, request->offset, request->length, (uint64_t)request, 0 ) ) { + shutdown( connection.sockFd, SHUT_RDWR ); + connection.sockFd = -1; + pthread_mutex_unlock( 
&connection.sendMutex ); + signal_call( connection.panicSignal ); + return true; + } + } + pthread_mutex_unlock( &connection.sendMutex ); + return true; +} + +void connection_close() +{ + if ( keepRunning ) { + logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" ); + } + pthread_mutex_lock( &mutexInit ); + keepRunning = false; + if ( !connectionInitDone ) { + pthread_mutex_unlock( &mutexInit ); + return; + } + pthread_mutex_unlock( &mutexInit ); + pthread_mutex_lock( &connection.sendMutex ); + if ( connection.sockFd != -1 ) { + shutdown( connection.sockFd, SHUT_RDWR ); + } + pthread_mutex_unlock( &connection.sendMutex ); +} + +size_t connection_printStats(char *buffer, const size_t len) +{ + int ret; + size_t remaining = len; + declare_now; + if ( remaining > 0 ) { + ret = snprintf( buffer, remaining, "Image: %s\nRevision: %d\n\nCurrent connection time: %" PRIu32 "s\n\n", + image.name, (int)image.rid, timing_diff( &connection.startupTime, &now ) ); + if ( ret < 0 ) { + ret = 0; + } + if ( (size_t)ret >= remaining ) { + return len; + } + remaining -= ret; + buffer += ret; + } + int i = -1; + lock_read( &altLock ); + while ( remaining > 3 && ++i < MAX_ALTS ) { + if ( altservers[i].host.type == 0 ) + continue; + if ( isSameAddressPort( &connection.currentServer, &altservers[i].host ) ) { + *buffer++ = '*'; + } else if ( i >= MAX_ALTS_ACTIVE ) { + *buffer++ = '-'; + } else { + *buffer++ = ' '; + } + const size_t addrlen = sock_printHost( &altservers[i].host, buffer, remaining ); + remaining -= (addrlen + 1); // For space or * above + buffer += addrlen; + if ( remaining < 3 ) + break; + int width = addrlen >= 35 ? 
/**
 * Receive loop for one server connection; runs as a detached thread.
 *
 * Reads reply headers from the socket until keepRunning is cleared or an
 * error occurs, and dispatches on the reply command:
 * - CMD_GET_BLOCK: looks up the pending request identified by the reply's
 *   handle, reads the payload into its buffer, updates the live RTT of the
 *   current server and signals the waiting caller.
 * - CMD_GET_SERVERS: copies the received list into newservers.
 * - anything else: the payload is read and discarded.
 * On exit, the connection is marked as gone (if it's still ours), the panic
 * signal is raised and the socket is closed.
 *
 * @param sockPtr the socket fd to read from, passed by value in the pointer
 * @return always NULL
 */
static void* connection_receiveThreadMain(void *sockPtr)
{
	int sockFd = (int)(size_t)sockPtr;
	dnbd3_reply_t reply;
	pthread_detach( pthread_self() );

	while ( keepRunning ) {
		int ret;
		// Retry for as long as the read was merely interrupted or timed out
		do {
			ret = dnbd3_read_reply( sockFd, &reply, true );
			if ( ret == REPLY_OK ) break;
		} while ( ret == REPLY_INTR || ret == REPLY_AGAIN );
		if ( ret != REPLY_OK ) {
			logadd( LOG_DEBUG1, "Error receiving reply on receiveThread (%d)", ret );
			goto fail;
		}

		if ( reply.cmd == CMD_GET_BLOCK ) {
			// Get block reply. find matching request
			// (the handle is the address of the request, see connection_read)
			dnbd3_async_t *request = removeRequest( (dnbd3_async_t*)reply.handle );
			if ( request == NULL ) {
				// This happens if the alt server probing thread tears down our connection
				// and did a direct RTT probe to satisfy this very request.
				logadd( LOG_DEBUG1, "Got block reply with no matching request" );
				if ( reply.size != 0 && !throwDataAway( sockFd, reply.size ) ) {
					logadd( LOG_DEBUG1, "....and choked on reply payload" );
					goto fail;
				}
			} else {
				// Found a match
				const ssize_t ret = sock_recv( sockFd, request->buffer, request->length );
				if ( ret != (ssize_t)request->length ) {
					logadd( LOG_DEBUG1, "receiving payload for a block reply failed" );
					// Put the request back into the queue before giving up on this socket
					connection_read( request );
					goto fail;
				}
				// Check RTT
				declare_now;
				uint64_t diff = timing_diffUs( &request->time, &now );
				if ( diff < 30ull * 1000 * 1000 ) { // Sanity check - ignore if > 30s
					lock_read( &altLock );
					for ( int i = 0; i < MAX_ALTS; ++i ) {
						if ( altservers[i].host.type == 0 )
							continue;
						if ( isSameAddressPort( &connection.currentServer, &altservers[i].host ) ) {
							// Moving average: 3/4 old value, 1/4 new measurement
							altservers[i].liveRtt = ( altservers[i].liveRtt * 3 + (int)diff ) / 4;
							break;
						}
					}
					unlock_rw( &altLock );
				}
				// Success, wake up caller
				request->success = true;
				request->finished = true;
				signal_call( request->signal );
			}
		} else if ( reply.cmd == CMD_GET_SERVERS ) {
			// List of known alt servers
			dnbd3_server_entry_t entries[MAX_ALTS];
			// Take at most MAX_ALTS entries, anything beyond that is discarded
			const int count = MIN( reply.size / sizeof(dnbd3_server_entry_t), MAX_ALTS );
			const size_t relevantSize = sizeof(dnbd3_server_entry_t) * count;
			if ( sock_recv( sockFd, entries, relevantSize ) != (ssize_t)relevantSize
					|| !throwDataAway( sockFd, reply.size - (uint32_t)relevantSize ) ) {
				logadd( LOG_DEBUG1, "Error receiving list of alt servers." );
				goto fail;
			}
			pthread_mutex_lock( &newAltLock );
			memcpy( newservers, entries, relevantSize );
			pthread_mutex_unlock( &newAltLock );
		} else {
			// TODO: Handle the others?
			if ( reply.size != 0 && !throwDataAway( sockFd, reply.size ) ) {
				logadd( LOG_DEBUG1, "Could not throw %d bytes away on CMD %d", (int)reply.size, (int)reply.cmd );
				goto fail;
			}
		}
	}
	// Only reached if keepRunning was cleared, i.e. a regular shutdown
	logadd( LOG_DEBUG1, "Aus der Schleife rausgeflogen! ARRRRRRRRRR" );
fail:;
	// Make sure noone is trying to use the socket for sending by locking,
	pthread_mutex_lock( &connection.sendMutex );
	// then just set the fd to -1, but only if it's the same fd as ours,
	// as someone could have established a new connection already
	if ( connection.sockFd == sockFd ) {
		connection.sockFd = -1;
		signal_call( connection.panicSignal );
	}
	pthread_mutex_unlock( &connection.sendMutex );
	// As we're the only reader, it's safe to close the socket now
	close( sockFd );
	return NULL;
}
Errno = %d", errno ); + } + timing_get( &now ); + } + // Woken up, see what we have to do + const bool panic = connection.sockFd == -1; + // Check alt servers + if ( panic || timing_reachedPrecise( &nextRttCheck, &now ) ) { + if ( learnNewServers ) { + addAltServers(); + } + sortAltServers(); + probeAltServers(); + if ( panic || timing_diff( &connection.startupTime, &now ) <= STARTUP_MODE_DURATION ) { + timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_STARTUP ); + } else { + timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_NORMAL ); + } + } + // Send keepalive packet + if ( timing_reachedPrecise( &nextKeepalive, &now ) ) { + pthread_mutex_lock( &connection.sendMutex ); + if ( connection.sockFd != -1 ) { + dnbd3_request_t request; + request.magic = dnbd3_packet_magic; + request.cmd = CMD_KEEPALIVE; + request.handle = request.offset = request.size = 0; + fixup_request( request ); + ssize_t ret = sock_sendAll( connection.sockFd, &request, sizeof request, 2 ); + if ( (size_t)ret != sizeof request ) { + shutdown( connection.sockFd, SHUT_RDWR ); + connection.sockFd = -1; + nextRttCheck = now; + } + } + pthread_mutex_unlock( &connection.sendMutex ); + timing_addSeconds( &nextKeepalive, &now, TIMER_INTERVAL_KEEPALIVE_PACKET ); + } + } + return NULL; +} + +// Private quick helpers + +static void addAltServers() +{ + pthread_mutex_lock( &newAltLock ); + lock_write( &altLock ); + for ( int nIdx = 0; nIdx < MAX_ALTS; ++nIdx ) { + if ( newservers[nIdx].host.type == 0 ) + continue; + // Got a new alt server, see if it's already known + for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) { + if ( isSameAddress( &newservers[nIdx].host, &altservers[eIdx].host ) ) { + goto skip_server; + } + } + // Not known yet, add - find free slot + int slot = -1; + for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) { + if ( altservers[eIdx].host.type == 0 ) { + slot = eIdx; // free - bail out and use this one + break; + } + if ( altservers[eIdx].consecutiveFails > 
FAIL_BACKOFF_START_COUNT + && slot != -1 && altservers[slot].consecutiveFails < altservers[eIdx].consecutiveFails ) { + // Replace an existing alt-server that failed recently if we got no more slots + slot = eIdx; + } + } + if ( slot != -1 ) { + char txt[200]; + sock_printHost( &newservers[nIdx].host, txt, 200 ); + logadd( LOG_DEBUG1, "new server %s in slot %d", txt, slot ); + altservers[slot].consecutiveFails = 0; + altservers[slot].bestCount = 0; + altservers[slot].rtts[0] = RTT_UNREACHABLE; + altservers[slot].rttIndex = 1; + altservers[slot].host = newservers[nIdx].host; + altservers[slot].liveRtt = 0; + } +skip_server:; + } + memset( newservers, 0, sizeof(newservers) ); + unlock_rw( &altLock ); + pthread_mutex_unlock( &newAltLock ); +} + +/** + * Find a server at index >= MAX_ALTS_ACTIVE (one that isn't considered for switching over) + * that has been inactive for a while, then look if there's an active server that's failed + * a couple of times recently. Swap both if found. + */ +static void sortAltServers() +{ + int ac = 0; + lock_write( &altLock ); + for ( int ia = MAX_ALTS_ACTIVE; ia < MAX_ALTS; ++ia ) { + alt_server_t * const inactive = &altservers[ia]; + if ( inactive->host.type == 0 || inactive->consecutiveFails > 0 ) + continue; + while ( ac < MAX_ALTS_ACTIVE ) { + if ( altservers[ac].host.type == 0 || altservers[ac].consecutiveFails > FAIL_BACKOFF_START_COUNT ) + break; + ac++; + } + if ( ac == MAX_ALTS_ACTIVE ) + break; + // Switch! 
+ alt_server_t * const active = &altservers[ac]; + dnbd3_host_t tmp = inactive->host; + inactive->host = active->host; + inactive->consecutiveFails = FAIL_BACKOFF_START_COUNT * 4; + inactive->bestCount = 0; + inactive->rtts[0] = RTT_UNREACHABLE; + inactive->rttIndex = 1; + inactive->liveRtt = 0; + active->host = tmp; + active->consecutiveFails = 0; + active->bestCount = 0; + active->rtts[0] = RTT_UNREACHABLE; + active->rttIndex = 1; + active->liveRtt = 0; + } + unlock_rw( &altLock ); +} + +static void probeAltServers() +{ + serialized_buffer_t buffer; + dnbd3_reply_t reply; + int bestSock = -1; + uint16_t remoteRid, remoteProto; + uint64_t remoteSize; + char *remoteName; + bool doSwitch; + bool panic = connection.sockFd == -1; + uint64_t testOffset = 0; + uint32_t testLength = RTT_BLOCK_SIZE; + dnbd3_async_t *request = NULL; + alt_server_t *current = NULL, *best = NULL; + + if ( !panic ) { + lock_read( &altLock ); + for ( int altIndex = 0; altIndex < MAX_ALTS; ++altIndex ) { + if ( altservers[altIndex].host.type != 0 + && isSameAddressPort( &altservers[altIndex].host, &connection.currentServer ) ) { + current = &altservers[altIndex]; + break; + } + } + unlock_rw( &altLock ); + } + declare_now; + pthread_spin_lock( &requests.lock ); + if ( requests.head != NULL ) { + if ( !panic && current != NULL ) { + const int maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second + dnbd3_async_t *iterator; + for ( iterator = requests.head; iterator != NULL; iterator = iterator->next ) { + // A request with measurement tag is pending + if ( timing_diffUs( &iterator->time, &now ) > maxDelay ) { + panic = true; + break; + } + } + } + if ( panic ) { + request = requests.head; + testOffset = requests.head->offset; + testLength = requests.head->length; + } + } + pthread_spin_unlock( &requests.lock ); + if ( testOffset != 0 ) { + logadd( LOG_DEBUG1, "Panic with pending %" PRIu64 ":%" PRIu32, testOffset, testLength ); + } + + lock_read( &altLock ); + for ( int altIndex 
= 0; altIndex < (panic ? MAX_ALTS : MAX_ALTS_ACTIVE); ++altIndex ) { + alt_server_t * const srv = &altservers[altIndex]; + if ( srv->host.type == 0 ) + continue; + if ( !panic && srv->consecutiveFails > FAIL_BACKOFF_START_COUNT + && rand() % srv->consecutiveFails >= FAIL_BACKOFF_START_COUNT ) { + continue; + } + if ( srv->rttIndex >= RTT_COUNT ) { + srv->rttIndex = 0; + } else { + srv->rttIndex += 1; + } + // Probe + ticks start; + timing_get( &start ); + errno = 0; + int sock = sock_connect( &srv->host, panic ? 1000 : 333, 1000 ); + if ( sock == -1 ) { + logadd( LOG_DEBUG1, "Could not connect for probing. errno = %d", errno ); + goto fail; + } + if ( !dnbd3_select_image( sock, image.name, image.rid, 0 ) ) { + logadd( LOG_DEBUG1, "probe: select_image failed" ); + goto fail; + } + if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize )) { + logadd( LOG_DEBUG1, "probe: select image reply failed" ); + goto fail; + } + if ( remoteProto < MIN_SUPPORTED_SERVER ) { + logadd( LOG_WARNING, "Unsupported remote version (local: %d, remote: %d)", (int)PROTOCOL_VERSION, (int)remoteProto ); + srv->consecutiveFails += 10; + goto fail; + } + if ( remoteRid != image.rid || strcmp( remoteName, image.name ) != 0 ) { + logadd( LOG_WARNING, "Remote rid or name mismatch (got '%s')", remoteName ); + srv->consecutiveFails += 10; + goto fail; + } + if ( !dnbd3_get_block( sock, testOffset, testLength, 0, 0 ) ) { + logadd( LOG_DEBUG1, "-> block request fail" ); + goto fail; + } + int a = 111; + if ( !(a = dnbd3_get_reply( sock, &reply )) || reply.size != testLength ) { + logadd( LOG_DEBUG1, "<- get block reply fail %d %d", a, (int)reply.size ); + goto fail; + } + if ( request != NULL && removeRequest( request ) != NULL ) { + // Request successfully removed from queue + const ssize_t ret = sock_recv( sock, request->buffer, request->length ); + if ( ret != (ssize_t)request->length ) { + logadd( LOG_DEBUG1, "[RTT] receiving payload for a block reply 
failed" ); + // Failure, add to queue again + connection_read( request ); + goto fail; + } + // Success, wake up caller + logadd( LOG_DEBUG1, "[RTT] Successful direct probe" ); + request->success = true; + request->finished = true; + signal_call( request->signal ); + } else { + // Wasn't a request that's in our request queue + if ( !throwDataAway( sock, testLength ) ) { + logadd( LOG_DEBUG1, "<- get block reply payload fail" ); + goto fail; + } + } + + // Yay, success + // Panic mode? Just switch to server + if ( panic ) { + unlock_rw( &altLock ); + switchConnection( sock, srv ); + return; + } + // Non-panic mode: + // Update stats of server + ticks end; + timing_get( &end ); + srv->consecutiveFails = 0; + srv->rtts[srv->rttIndex] = (int)timing_diffUs( &start, &end ); + int newRtt = 0; + for ( int i = 0; i < RTT_COUNT; ++i ) { + newRtt += srv->rtts[i]; + } + if ( srv->liveRtt != 0 ) { + // Make live rtt measurement influence result + newRtt = ( newRtt + srv->liveRtt ) / ( RTT_COUNT + 1 ); + } else { + newRtt /= RTT_COUNT; + } + srv->rtt = newRtt; + + // Keep socket open if this is currently the best one + if ( best == NULL || best->rtt > srv->rtt ) { + best = srv; + if ( bestSock != -1 ) { + close( bestSock ); + } + bestSock = sock; + } else { + close( sock ); + } + continue; +fail:; + if ( sock != -1 ) { + close( sock ); + } + srv->rtts[srv->rttIndex] = RTT_UNREACHABLE; + srv->consecutiveFails += 1; + } + doSwitch = false; + if ( best != NULL ) { + // Time-sensitive switch decision: If a server was best for some consecutive measurements, + // we switch no matter how small the difference to the current server is + for ( int altIndex = 0; altIndex < MAX_ALTS_ACTIVE; ++altIndex ) { + alt_server_t * const srv = &altservers[altIndex]; + // Decay liveRtt slowly... 
+ if ( srv->liveRtt > current->liveRtt && srv->liveRtt > srv->rtt ) { + srv->liveRtt -= ( ( srv->liveRtt / 100 ) + 1 ); + } + if ( srv == best ) { + if ( srv->bestCount < 50 ) { + srv->bestCount += 2; + } + // Switch with increasing probability the higher the bestCount is + if ( srv->bestCount > 12 && ( current == NULL || srv->rtt < current->rtt ) && srv->bestCount > rand() % 50 ) { + doSwitch = true; + } + } else if ( srv->bestCount > 0 ) { + srv->bestCount--; + } + } + for ( int i = MAX_ALTS_ACTIVE; i < MAX_ALTS; ++i ) { + if ( altservers[i].consecutiveFails > 0 ) { + altservers[i].consecutiveFails--; + } + } + // This takes care of the situation where two servers alternate being the best server all the time + if ( doSwitch && current != NULL && best->bestCount - current->bestCount < 8 ) { + doSwitch = false; + } + // Regular logic: Apply threshold when considering switch + if ( !doSwitch && current != NULL ) { + doSwitch = current->rtt > best->rtt + RTT_ABSOLUTE_THRESHOLD + || RTT_THRESHOLD_FACTOR(current->rtt) > best->rtt + 1000; + } + } + // Switch if a better server was found + if ( doSwitch ) { + logadd( LOG_INFO, "Current: %dµs, best: %dµs. Will switch!", current == NULL ? 
0 : current->rtt, best->rtt ); + for ( int i = 0; i < MAX_ALTS; ++i ) { + if ( &altservers[i] != best ) { + altservers[i].bestCount = 0; + } + } + unlock_rw( &altLock ); + switchConnection( bestSock, best ); + return; + } + // No switch + unlock_rw( &altLock ); + if ( best != NULL ) { + close( bestSock ); + } +} + +static void switchConnection(int sockFd, alt_server_t *srv) +{ + pthread_t thread; + struct sockaddr_storage addr; + socklen_t addrLen = sizeof(addr); + char message[200] = "Connection switched to "; + const size_t len = strlen( message ); + int ret; + dnbd3_async_t *queue, *it; + + pthread_mutex_lock( &connection.sendMutex ); + if ( connection.sockFd != -1 ) { + shutdown( connection.sockFd, SHUT_RDWR ); + } + ret = getpeername( sockFd, (struct sockaddr*)&addr, &addrLen ); + if ( ret == 0 ) { + connection.currentServer = srv->host; + connection.sockFd = sockFd; + pthread_spin_lock( &requests.lock ); + queue = requests.head; + requests.head = requests.tail = NULL; + pthread_spin_unlock( &requests.lock ); + } else { + connection.sockFd = -1; + } + requestAltServers(); + pthread_mutex_unlock( &connection.sendMutex ); + if ( ret != 0 ) { + close( sockFd ); + logadd( LOG_WARNING, "Could not getpeername after connection switch, assuming connection already dead again. 
(Errno=%d)", errno ); + signal_call( connection.panicSignal ); + return; + } + timing_get( &connection.startupTime ); + pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)sockFd ); + sock_printable( (struct sockaddr*)&addr, sizeof(addr), message + len, sizeof(message) - len ); + logadd( LOG_INFO, "%s", message ); + // resend queue + if ( queue != NULL ) { + pthread_mutex_lock( &connection.sendMutex ); + dnbd3_async_t *next = NULL; + for ( it = queue; it != NULL; it = next ) { + logadd( LOG_DEBUG1, "Requeue after server change" ); + next = it->next; + enqueueRequest( it ); + if ( connection.sockFd != -1 && !dnbd3_get_block( connection.sockFd, it->offset, it->length, (uint64_t)it, 0 ) ) { + logadd( LOG_WARNING, "Resending pending request failed, re-entering panic mode" ); + shutdown( connection.sockFd, SHUT_RDWR ); + connection.sockFd = -1; + signal_call( connection.panicSignal ); + } + } + pthread_mutex_unlock( &connection.sendMutex ); + } +} + +/** + * Does not lock, so get the sendMutex first! 
+ */ +static void requestAltServers() +{ + if ( connection.sockFd == -1 || !learnNewServers ) + return; + dnbd3_request_t request = { 0 }; + request.magic = dnbd3_packet_magic; + request.cmd = CMD_GET_SERVERS; + fixup_request( request ); + if ( sock_sendAll( connection.sockFd, &request, sizeof(request), 2 ) != (ssize_t)sizeof(request) ) { + logadd( LOG_WARNING, "Connection failed while requesting alt server list" ); + shutdown( connection.sockFd, SHUT_RDWR ); + connection.sockFd = -1; + } +} + +static bool throwDataAway(int sockFd, uint32_t amount) +{ + size_t done = 0; + char tempBuffer[SHORTBUF]; + while ( done < amount ) { + const ssize_t ret = sock_recv( sockFd, tempBuffer, MIN( amount - done, SHORTBUF ) ); + if ( ret <= 0 ) + return false; + done += (size_t)ret; + } + return true; +} + +static void enqueueRequest(dnbd3_async_t *request) +{ + request->next = NULL; + request->finished = false; + request->success = false; + //logadd( LOG_DEBUG2, "Queue: %p @ %s : %d", request, file, line ); + // Measure latency and add to switch formula + timing_get( &request->time ); + pthread_spin_lock( &requests.lock ); + if ( requests.head == NULL ) { + requests.head = requests.tail = request; + } else { + requests.tail->next = request; + requests.tail = request; + } + pthread_spin_unlock( &requests.lock ); +} + +static dnbd3_async_t* removeRequest(dnbd3_async_t *request) +{ + pthread_spin_lock( &requests.lock ); + //logadd( LOG_DEBUG2, "Remov: %p @ %s : %d", request, file, line ); + dnbd3_async_t *iterator, *prev = NULL; + for ( iterator = requests.head; iterator != NULL; iterator = iterator->next ) { + if ( iterator == request ) { + // Found it, break! 
+ if ( prev != NULL ) { + prev->next = iterator->next; + } else { + requests.head = iterator->next; + } + if ( requests.tail == iterator ) { + requests.tail = prev; + } + break; + } + prev = iterator; + } + pthread_spin_unlock( &requests.lock ); + return iterator; +} + diff --git a/src/fuse/connection.h b/src/fuse/connection.h new file mode 100644 index 0000000..cae554c --- /dev/null +++ b/src/fuse/connection.h @@ -0,0 +1,35 @@ +#ifndef _CONNECTION_H_ +#define _CONNECTION_H_ + +#include "../shared/fdsignal.h" +#include "../shared/timing.h" +#include +#include +#include + +struct _dnbd3_async; + +typedef struct _dnbd3_async { + struct _dnbd3_async *next; // Next in this linked list (provate field, not set by caller) + dnbd3_signal_t* signal; // Used to signal the caller + char* buffer; // Caller-provided buffer to be filled + ticks time; // When request was put on wire, 0 if not measuring + uint64_t offset; + uint32_t length; + bool finished; // Will be set to true if the request has been handled + bool success; // Will be set to true if the request succeeded +} dnbd3_async_t; + +bool connection_init(const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers); + +bool connection_initThreads(); + +uint64_t connection_getImageSize(); + +bool connection_read(dnbd3_async_t *request); + +void connection_close(); + +size_t connection_printStats(char *buffer, const size_t len); + +#endif /* CONNECTION_H_ */ diff --git a/src/fuse/helper.c b/src/fuse/helper.c new file mode 100644 index 0000000..d81b08f --- /dev/null +++ b/src/fuse/helper.c @@ -0,0 +1,36 @@ +#include "helper.h" + +#include +#include +#include + + +void printLog( log_info *info ) +{ + FILE *logFile; + + // Create logfile + + logFile = fopen( "log.txt", "w" ); + if ( logFile == NULL ) { + printf( "Error creating/opening log.txt\n" ); + return; + } + + //rewind(file); + fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", ( uint64_t )( info->imageSize/ ( 1024ll*1024ll ) ) ); + fprintf( 
logFile, "ReceivedMiB: %"PRIu64" MiB\n", ( uint64_t )( info->receivedBytes/ ( 1024ll*1024ll ) ) ); + fprintf( logFile, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount ); + fprintf( logFile, "Blocksize: 4KiB\n\n" ); + fprintf( logFile, "Block access count:\n" ); + + uint64_t i = 0; + for ( ; i < info->imageBlockCount; i++ ) { + if ( i % 50 == 0 ) { + fprintf( logFile, "\n" ); + } + fprintf( logFile, "%i ", ( int ) info->blockRequestCount[i] ); + } + fprintf( logFile, "\n" ); + fclose( logFile ); +} diff --git a/src/fuse/helper.h b/src/fuse/helper.h new file mode 100644 index 0000000..9e5d127 --- /dev/null +++ b/src/fuse/helper.h @@ -0,0 +1,35 @@ +#ifndef IMAGEHELPER_H +#define IMAGEHELPER_H + +#include "../types.h" + +#include +#include +#include +#include +#include + +typedef struct log_info { + uint64_t imageSize; + uint64_t receivedBytes; + uint64_t imageBlockCount; + uint8_t *blockRequestCount; +} log_info; + + + +void printLog(log_info *info); + +int connect_to_server(char *server_adress, int port); + +static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b) +{ + return (a->type == b->type) && (a->port == b->port) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) )); +} + +static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b) +{ + return (a->type == b->type) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) )); +} + +#endif diff --git a/src/fuse/main.c b/src/fuse/main.c new file mode 100644 index 0000000..1a5643c --- /dev/null +++ b/src/fuse/main.c @@ -0,0 +1,420 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. 
+ * + * Changed by Stephan Schwaer + * */ + +#include "connection.h" +#include "helper.h" +#include "../shared/protocol.h" +#include "../shared/log.h" + +#define FUSE_USE_VERSION 30 +#include +#include +#include +#include +/* for printing uint */ +#define __STDC_FORMAT_MACROS +#include +#include +#include +#include +#include + +#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0) + +static const char * const IMAGE_PATH = "/img"; +static const char * const STATS_PATH = "/status"; + +static uint64_t imageSize; +/* Debug/Benchmark variables */ +static bool useDebug = false; +static log_info logInfo; +static struct timespec startupTime; +static uid_t owner; +static bool keepRunning = true; +static void (*fuse_sigIntHandler)(int) = NULL; +static void (*fuse_sigTermHandler)(int) = NULL; +static struct fuse_operations dnbd3_fuse_no_operations; + +#define SIGPOOLSIZE 6 +static pthread_spinlock_t sigLock; +static dnbd3_signal_t *signalPool[SIGPOOLSIZE]; +static dnbd3_signal_t **sigEnd = signalPool + SIGPOOLSIZE; +static void signalInit() +{ + pthread_spin_init( &sigLock, PTHREAD_PROCESS_PRIVATE ); + for ( size_t i = 0; i < SIGPOOLSIZE; ++i ) { + signalPool[i] = NULL; + } +} +static inline dnbd3_signal_t *signalGet() +{ + pthread_spin_lock( &sigLock ); + for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) { + if ( *it != NULL ) { + dnbd3_signal_t *ret = *it; + *it = NULL; + pthread_spin_unlock( &sigLock ); + return ret; + } + } + pthread_spin_unlock( &sigLock ); + return signal_newBlocking(); +} +static inline void signalPut(dnbd3_signal_t *signal) +{ + pthread_spin_lock( &sigLock ); + for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) { + if ( *it == NULL ) { + *it = signal; + pthread_spin_unlock( &sigLock ); + return; + } + } + pthread_spin_unlock( &sigLock ); + signal_close( signal ); +} + +static int image_getattr(const char *path, struct stat *stbuf) +{ + int res = 0; + memset( stbuf, 0, sizeof( struct stat ) ); + stbuf->st_ctim = 
/**
 * Serve a read on the status file.
 *
 * For offset 0 the statistics are rendered straight into the caller's
 * buffer. For any other offset they are rendered into a temporary buffer
 * and the requested window is copied out.
 *
 * @param buf destination buffer
 * @param size capacity of buf (the caller guarantees it fits into an int)
 * @param offset byte offset into the rendered statistics text
 * @return number of bytes placed in buf, 0 when offset is at or past the
 *         end of the text, or -EINVAL for a negative offset
 */
static int fillStatsFile(char *buf, size_t size, off_t offset) {
	if ( offset == 0 ) {
		return (int)connection_printStats( buf, size );
	}
	if ( offset < 0 ) {
		return -EINVAL;
	}
	char buffer[4096];
	size_t total = connection_printStats( buffer, sizeof buffer );
	if ( total > sizeof buffer ) {
		// Never trust a reported length beyond what the buffer can hold
		total = sizeof buffer;
	}
	// Compare in the wide domain; the original truncated offset to int, which let
	// huge offsets slip through, and returned -EOF (== +1) for reads past the end.
	if ( (uint64_t)offset >= (uint64_t)total ) {
		return 0;
	}
	size_t len = total - (size_t)offset;
	if ( len > size ) {
		len = size;
	}
	memcpy( buf, buffer + offset, len );
	return (int)len;
}
Since there is no + // mention of a guarantee that this will never happen, better add a safety check. + // Way to go fuse. + return -EIO; + } + if ( path[1] == STATS_PATH[1] ) { + return fillStatsFile( buf, size, offset ); + } + + if ( (uint64_t)offset >= imageSize ) { + return 0; + } + + if ( offset + size > imageSize ) { + size = imageSize - offset; + } + + if ( useDebug ) { + /* count the requested blocks */ + uint64_t startBlock = offset / ( 4096 ); + const uint64_t endBlock = ( offset + size - 1 ) / ( 4096 ); + + for ( ; startBlock <= endBlock; startBlock++ ) { + ++logInfo.blockRequestCount[startBlock]; + } + } + + dnbd3_async_t request; + request.buffer = buf; + request.length = (uint32_t)size; + request.offset = offset; + request.signal = signalGet(); + + if ( !connection_read( &request ) ) { + signalPut( request.signal ); + return -EINVAL; + } + while ( !request.finished ) { + int ret = signal_wait( request.signal, 5000 ); + if ( !keepRunning ) { + connection_close(); + break; + } + if ( ret < 0 ) { + debugf( "fuse_read signal wait returned %d", ret ); + } + } + signalPut( request.signal ); + if ( request.success ) { + return request.length; + } else { + return -EIO; + } +} + +static void image_sigHandler(int signum) { + keepRunning = false; + if ( signum == SIGINT && fuse_sigIntHandler != NULL ) { + fuse_sigIntHandler(signum); + } + if ( signum == SIGTERM && fuse_sigTermHandler != NULL ) { + fuse_sigTermHandler(signum); + } +} + +static void* image_init(struct fuse_conn_info *conn UNUSED) +{ + if ( !connection_initThreads() ) { + logadd( LOG_ERROR, "Could not initialize threads for dnbd3 connection, exiting..." 
); + exit( EXIT_FAILURE ); + } + // Prepare our handler + struct sigaction newHandler; + memset( &newHandler, 0, sizeof(newHandler) ); + newHandler.sa_handler = &image_sigHandler; + sigemptyset( &newHandler.sa_mask ); + struct sigaction oldHandler; + // Retrieve old handlers when setting + sigaction( SIGINT, &newHandler, &oldHandler ); + fuse_sigIntHandler = oldHandler.sa_handler; + logadd( LOG_DEBUG1, "Previous SIGINT handler was %p", (void*)(uintptr_t)fuse_sigIntHandler ); + sigaction( SIGTERM, &newHandler, &oldHandler ); + fuse_sigTermHandler = oldHandler.sa_handler; + logadd( LOG_DEBUG1, "Previous SIGTERM handler was %p", (void*)(uintptr_t)fuse_sigIntHandler ); + return NULL; +} + +/* close the connection */ +static void image_destroy(void *private_data UNUSED) +{ + if ( useDebug ) { + printLog( &logInfo ); + } + connection_close(); + return; +} + +/* map the implemented fuse operations */ +static struct fuse_operations image_oper = { + .getattr = image_getattr, + .readdir = image_readdir, + .open = image_open, + .read = image_read, + .init = image_init, + .destroy = image_destroy, +}; + +static void printVersion() +{ + char *arg[] = { "foo", "-V" }; + printf( "DNBD3-Fuse Version 1.2.3.4, protocol version %d\n", (int)PROTOCOL_VERSION ); + fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL ); + exit( 0 ); +} + +static void printUsage(char *argv0, int exitCode) +{ + char *arg[] = { argv0, "-h" }; + fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL ); + printf( "\n" ); + printf( "Usage: %s [--debug] [--option mountOpts] --host --image [--rid revision] \n", argv0 ); + printf( "Or: %s [-d] [-o mountOpts] -h -i [-r revision] \n", argv0 ); + printf( " -d --debug Don't fork, write stats file, and print debug output (fuse -> stderr, dnbd3 -> stdout)\n" ); + printf( " -f Don't fork (dnbd3 -> stdout)\n" ); + printf( " -h --host List of space separated hosts to use\n" ); + printf( " -i --image Remote image name to request\n" ); + printf( " -l --log Write log to given 
location\n" ); + printf( " -o --option Mount options to pass to libfuse\n" ); + printf( " -r --rid Revision to use (omit or pass 0 for latest)\n" ); + printf( " -S --sticky Use only servers from command line (no learning from servers)\n" ); + printf( " -s Single threaded mode\n" ); + exit( exitCode ); +} + +static const char *optString = "dfHh:i:l:o:r:SsVv"; +static const struct option longOpts[] = { + { "debug", no_argument, NULL, 'd' }, + { "help", no_argument, NULL, 'H' }, + { "host", required_argument, NULL, 'h' }, + { "image", required_argument, NULL, 'i' }, + { "log", required_argument, NULL, 'l' }, + { "option", required_argument, NULL, 'o' }, + { "rid", required_argument, NULL, 'r' }, + { "sticky", no_argument, NULL, 'S' }, + { "version", no_argument, NULL, 'v' }, + { 0, 0, 0, 0 } +}; + +int main(int argc, char *argv[]) +{ + char *server_address = NULL; + char *image_Name = NULL; + char *log_file = NULL; + uint16_t rid = 0; + char **newArgv; + int newArgc; + int opt, lidx; + bool learnNewServers = true; + + if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) { + printUsage( argv[0], 0 ); + } + + // TODO Make log mask configurable + log_setConsoleMask( 65535 ); + log_setConsoleTimestamps( true ); + log_setFileMask( 65535 ); + + newArgv = calloc( argc + 10, sizeof(char*) ); + newArgv[0] = argv[0]; + newArgc = 1; + while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) { + switch ( opt ) { + case 'h': + server_address = optarg; + break; + case 'i': + image_Name = optarg; + break; + case 'r': + rid = (uint16_t)atoi(optarg); + break; + case 'o': + newArgv[newArgc++] = "-o"; + newArgv[newArgc++] = optarg; + if ( strstr( optarg, "use_ino" ) != NULL ) { + logadd( LOG_WARNING, "************************" ); + logadd( LOG_WARNING, "* WARNING: use_ino mount option is unsupported, use at your own risk!" 
); + logadd( LOG_WARNING, "************************" ); + } + if ( strstr( optarg, "intr" ) != NULL ) { + logadd( LOG_WARNING, "************************" ); + logadd( LOG_WARNING, "* WARNING: intr mount option is unsupported, use at your own risk!" ); + logadd( LOG_WARNING, "************************" ); + } + break; + case 'l': + log_file = optarg; + break; + case 'H': + printUsage( argv[0], 0 ); + break; + case 'v': + case 'V': + printVersion(); + break; + case 'd': + useDebug = true; + newArgv[newArgc++] = "-d"; + break; + case 's': + newArgv[newArgc++] = "-s"; + break; + case 'S': + learnNewServers = false; + break; + case 'f': + newArgv[newArgc++] = "-f"; + break; + default: + printUsage( argv[0], EXIT_FAILURE ); + } + } + + if ( optind >= argc ) { // Missing mount point + printUsage( argv[0], EXIT_FAILURE ); + } + + if ( server_address == NULL || image_Name == NULL ) { + printUsage( argv[0], EXIT_FAILURE ); + } + + if ( log_file != NULL ) { + if ( !log_openLogFile( log_file ) ) { + logadd( LOG_WARNING, "Could not open log file at '%s'", log_file ); + } + } + + if ( !connection_init( server_address, image_Name, rid, learnNewServers ) ) { + logadd( LOG_ERROR, "Could not connect to any server. 
Bye.\n" ); + return EXIT_FAILURE; + } + imageSize = connection_getImageSize(); + + /* initialize benchmark variables */ + logInfo.receivedBytes = 0; + logInfo.imageSize = imageSize; + logInfo.imageBlockCount = ( imageSize + 4095 ) / 4096; + if ( useDebug ) { + logInfo.blockRequestCount = calloc( logInfo.imageBlockCount, sizeof(uint8_t) ); + } else { + logInfo.blockRequestCount = NULL; + } + + // Since dnbd3 is always read only and the remote image will not change + newArgv[newArgc++] = "-o"; + newArgv[newArgc++] = "ro,auto_cache,default_permissions"; + // Mount point goes last + newArgv[newArgc++] = argv[optind]; + + printf( "ImagePathName: %s\nFuseArgs:",IMAGE_PATH ); + for ( int i = 0; i < newArgc; ++i ) { + printf( " '%s'", newArgv[i] ); + } + putchar('\n'); + clock_gettime( CLOCK_REALTIME, &startupTime ); + owner = getuid(); + signalInit(); + return fuse_main( newArgc, newArgv, &image_oper, NULL ); +} diff --git a/src/fuse/serialize.c b/src/fuse/serialize.c new file mode 100644 index 0000000..4934132 --- /dev/null +++ b/src/fuse/serialize.c @@ -0,0 +1,5 @@ +#include +#include +#include + +#include "../serialize.c" diff --git a/src/kernel/core.c b/src/kernel/core.c new file mode 100644 index 0000000..bfa8d22 --- /dev/null +++ b/src/kernel/core.c @@ -0,0 +1,484 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2019 Frederic Robra + * Parts copyright 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. 
If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "dnbd3.h" +#include "clientconfig.h" +#include "sysfs.h" + +static DEFINE_IDR(dnbd3_index_idr); +static DEFINE_MUTEX(dnbd3_index_mutex); + +static unsigned int max_devs = NUMBER_DEVICES; +static dnbd3_device_t *dnbd3_device; +int major; + + +static int dnbd3_open(struct block_device *bdev, fmode_t mode) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_open"); + + return 0; +} + +static void dnbd3_release(struct gendisk *disk, fmode_t mode) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_release"); + +} + + +void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_blk_fail_all_requests"); +} + + +int dnbd3_net_connect(dnbd3_device_t *dev) +{ + return 0; +} + + +int dnbd3_net_disconnect(dnbd3_device_t *dev) +{ + return 0; +} + +static int dnbd3_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_ioctl"); + + int result = -100; + dnbd3_device_t *dev = bdev->bd_disk->private_data; + char *imgname = NULL; + dnbd3_ioctl_t *msg = NULL; + //unsigned long irqflags; + + while (dev->disconnecting) + { + // do nothing + } + + if (arg != 0) + { + msg = kmalloc(sizeof(*msg), GFP_KERNEL); + if (msg == NULL) return -ENOMEM; + if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) + { + result = -ENOEXEC; + goto cleanup_return; + } + if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) + { + result = -ENOENT; + goto cleanup_return; + } + if (msg->imgname != NULL && msg->imgnamelen > 0) + { + imgname = kmalloc(msg->imgnamelen + 
1, GFP_KERNEL); + if (imgname == NULL) + { + result = -ENOMEM; + goto cleanup_return; + } + if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) + { + result = -ENOENT; + goto cleanup_return; + } + imgname[msg->imgnamelen] = '\0'; + //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname); + } + } + + + switch (cmd) + { + case IOCTL_OPEN: + if (dev->imgname != NULL) + { + result = -EBUSY; + } + else if (imgname == NULL) + { + result = -EINVAL; + } + else if (msg == NULL) + { + result = -EINVAL; + } + else + { + if (sizeof(msg->host) != sizeof(dev->cur_server.host)) + printk("Odd size bug#1 triggered in IOCTL\n"); + memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host)); + dev->cur_server.failures = 0; + memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server)); + dev->imgname = imgname; + dev->rid = msg->rid; + dev->use_server_provided_alts = msg->use_server_provided_alts; + // Forget all alt servers on explicit connect, set first al server to initial server + memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS); + memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0])); +//#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +// if (blk_queue->backing_dev_info != NULL) { +// blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE; +// } +//#else +// blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE; +//#endif + if (dnbd3_net_connect(dev) == 0) + { + result = 0; + imgname = NULL; // Prevent kfree at the end + } + else + { + result = -ENOENT; + dev->imgname = NULL; + } + } + break; + + case IOCTL_CLOSE: + dnbd3_blk_fail_all_requests(dev); + result = dnbd3_net_disconnect(dev); + dnbd3_blk_fail_all_requests(dev); + set_capacity(dev->disk, 0); + if (dev->imgname) + { + kfree(dev->imgname); + dev->imgname = NULL; + } + break; + + case IOCTL_SWITCH: + result = -EINVAL; + break; + + case IOCTL_ADD_SRV: + case IOCTL_REM_SRV: + 
if (dev->imgname == NULL) + { + result = -ENOENT; + } + else if (dev->new_servers_num >= NUMBER_SERVERS) + { + result = -EAGAIN; + } + else if (msg == NULL) + { + result = -EINVAL; + } + else + { + memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host)); + dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM + ++dev->new_servers_num; + result = 0; + } + break; + + case BLKFLSBUF: + result = 0; + break; + + default: + result = -EIO; + break; + } + +cleanup_return: + if (msg) kfree(msg); + if (imgname) kfree(imgname); + return result; + +} + +static const struct block_device_operations dnbd3_fops = +{ + .owner = THIS_MODULE, + .open = dnbd3_open, + .release = dnbd3_release, + .ioctl = dnbd3_ioctl, + .compat_ioctl = dnbd3_ioctl, +}; + +static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_queue_rq"); + return 0; +} + +static void dnbd3_complete_rq(struct request *req) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_complete_rq"); + +} + +static int dnbd3_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) +{ + struct dnbd3_cmd *cmd = blk_mq_rq_to_pdu(rq); + cmd->dnbd3 = set->driver_data; + cmd->flags = 0; + mutex_init(&cmd->lock); + return 0; +} +static enum blk_eh_timer_return dnbd3_xmit_timeout(struct request *req, bool reserved) +{ + printk(KERN_DEBUG "dnbd3: dnbd3_xmit_timeout"); + return BLK_EH_DONE; +} + + +static const struct blk_mq_ops dnbd3_mq_ops = { + .queue_rq = dnbd3_queue_rq, + .complete = dnbd3_complete_rq, + .init_request = dnbd3_init_request, + .timeout = dnbd3_xmit_timeout, +}; + + +static int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor) +{ + struct gendisk *disk; + struct request_queue *q; + int err = -ENOMEM; + printk(KERN_DEBUG "dnbd3: adding device %i", minor); + + + disk = alloc_disk(1); + if (!disk) { + printk(KERN_DEBUG "dnbd3: 
alloc_disc failed, device %i", minor); + goto out_free_nbd; + } + + err = idr_alloc(&dnbd3_index_idr, dev, minor, minor + 1, GFP_KERNEL); + if (err == -ENOSPC) { + printk(KERN_DEBUG "dnbd3: idr_alloc failed, device %i", minor); + err = -EEXIST; + } + + if (err < 0) + goto out_free_disk; + + dev->minor = minor; + dev->disk = disk; + dev->tag_set.ops = &dnbd3_mq_ops; + dev->tag_set.nr_hw_queues = 1; + dev->tag_set.queue_depth = 128; + dev->tag_set.numa_node = NUMA_NO_NODE; + dev->tag_set.cmd_size = sizeof(dnbd3_cmd); + dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; + dev->tag_set.driver_data = dev; + + err = blk_mq_alloc_tag_set(&dev->tag_set); + if (err) + goto out_free_idr; + + q = blk_mq_init_queue(&dev->tag_set); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto out_free_tags; + } + disk->queue = q; + + /* + * Tell the block layer that we are not a rotational device + */ + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); + disk->queue->limits.discard_granularity = 0; + disk->queue->limits.discard_alignment = 0; + blk_queue_max_discard_sectors(disk->queue, 0); + blk_queue_max_segment_size(disk->queue, UINT_MAX); + blk_queue_max_segments(disk->queue, USHRT_MAX); + blk_queue_max_hw_sectors(disk->queue, 65536); + disk->queue->limits.max_sectors = 256; + + mutex_init(&dev->config_lock); + refcount_set(&dev->config_refs, 0); + refcount_set(&dev->refs, 1); + INIT_LIST_HEAD(&dev->list); + disk->major = major; + disk->first_minor = minor; + disk->fops = &dnbd3_fops; + disk->private_data = dev; + sprintf(disk->disk_name, "dnbd%d", minor); +// sprintf(disk->disk_name, "dnbd3%d", minor); + printk(KERN_DEBUG "dnbd3: add disk, device %s", disk->disk_name); + add_disk(disk); + dnbd3_sysfs_init(dev); + return minor; + +out_free_tags: + blk_mq_free_tag_set(&dev->tag_set); +out_free_idr: + idr_remove(&dnbd3_index_idr, minor); +out_free_disk: + put_disk(disk); +out_free_nbd: + 
kfree(dev); + printk(KERN_DEBUG "dnbd3: destroy device %i", minor); + return err; +} + + + +static int __init dnbd3_init(void) +{ + int i; + printk(KERN_DEBUG "dnbd3: starting kernel module"); + + if (max_devs < 0) { + printk(KERN_ERR "dnbd3: max_devs must be >= 0"); + return -EINVAL; + } + + + dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL); + if (!dnbd3_device) { + printk(KERN_ERR "dnbd3: failed to create dnbd3 device"); + return -ENOMEM; + } + + // initialize block device + major = register_blkdev(0, "dnbd3"); + if (major == 0) { + printk(KERN_ERR "dnbd3: register_blkdev failed"); + return -EIO; + } + + printk(KERN_DEBUG "dnbd3: kernel module loaded. Machine type: " ENDIAN_MODE); + + // add MAX_NUMBER_DEVICES devices + mutex_lock(&dnbd3_index_mutex); + for (i = 0; i < max_devs; i++) { + dnbd3_blk_add_device(&dnbd3_device[i], i); + } + mutex_unlock(&dnbd3_index_mutex); + + printk(KERN_INFO "dnbd3: init successful (%i devices).\n", max_devs); + + return 0; +} + + +static int dnbd3_exit_cb(int id, void *ptr, void *data) +{ + struct list_head *list = (struct list_head *)data; + struct dnbd3_device_t *dnbd3 = ptr; + + list_add_tail(&dnbd3->list, list); + return 0; +} + +static void dnbd3_dev_remove(struct dnbd3_device_t *dnbd3) +{ + struct gendisk *disk = dnbd3->disk; + struct request_queue *q; + + if (disk) { + q = disk->queue; + del_gendisk(disk); + blk_cleanup_queue(q); + blk_mq_free_tag_set(&dnbd3->tag_set); + disk->private_data = NULL; + put_disk(disk); + } +} + +static void dnbd3_put(struct dnbd3_device_t *dnbd3) +{ + if (refcount_dec_and_mutex_lock(&dnbd3->refs, &dnbd3_index_mutex)) { + idr_remove(&dnbd3_index_idr, dnbd3->minor); + mutex_unlock(&dnbd3_index_mutex); + dnbd3_dev_remove(dnbd3); + } +} + + +static void __exit dnbd3_exit(void) +{ + dnbd3_device_t *dnbd3; + LIST_HEAD(del_list); + printk(KERN_DEBUG "dnbd3: stopping kernel module"); + + mutex_lock(&dnbd3_index_mutex); + idr_for_each(&dnbd3_index_idr, &dnbd3_exit_cb, &del_list); + 
mutex_unlock(&dnbd3_index_mutex); + + while (!list_empty(&del_list)) { + dnbd3 = list_first_entry(&del_list, struct dnbd3_device_t, list); + dnbd3_sysfs_exit(dnbd3); + list_del_init(&dnbd3->list); + if (refcount_read(&dnbd3->refs) != 1) { + printk(KERN_ERR "dnbd3: possibly leaking a device\n"); + } + dnbd3_put(dnbd3); + } + + idr_destroy(&dnbd3_index_idr); + unregister_blkdev(major, "dnbd3"); + + kfree(dnbd3_device); + + printk(KERN_INFO "dnbd3: stopped kernel module"); +} + + +module_init(dnbd3_init); +module_exit(dnbd3_exit); + +MODULE_DESCRIPTION("Distributed Network Block Device 3"); +MODULE_LICENSE("GPL"); + +module_param(max_devs, int, 0444); +MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)"); diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h new file mode 100644 index 0000000..2575cd8 --- /dev/null +++ b/src/kernel/dnbd3.h @@ -0,0 +1,86 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2019 Frederic Robra + * Parts copyright 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + */ +#ifndef DNBD_H_ +#define DNBD_H_ + +#include +#include +#include +#include + +#define KERNEL_MODULE +#include "types.h" + +typedef struct +{ + dnbd3_host_t host; + uint64_t rtts[4]; // Last four round trip time measurements in microsecond + uint16_t protocol_version; // dnbd3 protocol version of this server + uint8_t failures; // How many times the server was unreachable +} dnbd3_server_t; + + +typedef struct dnbd3_device_t { + int minor; + struct blk_mq_tag_set tag_set; + struct request_queue queue; + struct mutex config_lock; + refcount_t config_refs; + refcount_t refs; + struct list_head list; + + // block + struct gendisk *disk; + + // sysfs + struct kobject kobj; + + // network + char *imgname; + struct socket *sock; + dnbd3_server_t cur_server, initial_server; + unsigned long cur_rtt; +// serialized_buffer_t payload_buffer; + dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers + int new_servers_num; // number of new alt servers that are waiting to be copied to above array + dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers + uint8_t discover, panic, disconnecting, update_available, panic_count; + uint8_t use_server_provided_alts; + uint16_t rid; + uint32_t heartbeat_count; + uint64_t reported_size; + // server switch + struct socket *better_sock; + +} dnbd3_device_t; + + +typedef struct dnbd3_cmd { + struct dnbd3_device_t *dnbd3; + struct mutex lock; + int index; + int cookie; + blk_status_t status; + unsigned long flags; + u32 cmd_cookie; +} dnbd3_cmd; + +#endif /* DNBD_H_ */ diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c new file mode 100644 index 0000000..4406072 --- /dev/null +++ b/src/kernel/sysfs.c @@ -0,0 +1,205 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). 
+ * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#include + +#include "sysfs.h" +#include "utils.h" + +#ifndef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif + +ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev) +{ + if (dev->cur_server.host.type == HOST_IP4) + return MIN(snprintf(buf, PAGE_SIZE, "%pI4,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE); + else if (dev->cur_server.host.type == HOST_IP6) + return MIN(snprintf(buf, PAGE_SIZE, "%pI6,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE); + *buf = '\0'; + return 0; +} + +ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev) +{ + return MIN(snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt), PAGE_SIZE); +} + +ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev) +{ + int i, num = 0; + for (i = 0; i < NUMBER_SERVERS; ++i) + { + if (dev->alt_servers[i].host.type) ++num; + } + return MIN(snprintf(buf, PAGE_SIZE, "%d\n", num), PAGE_SIZE); +} + +ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev) +{ + int i, size = PAGE_SIZE, ret; + for (i = 0; i < NUMBER_SERVERS; ++i) + { + if (dev->alt_servers[i].host.type == HOST_IP4) + ret = MIN(snprintf(buf, size, "%pI4,%d,%llu,%d\n", + dev->alt_servers[i].host.addr, + (int)ntohs(dev->alt_servers[i].host.port), + (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4), + (int)dev->alt_servers[i].failures) + , size); + else if 
(dev->alt_servers[i].host.type == HOST_IP6) + ret = MIN(snprintf(buf, size, "%pI6,%d,%llu,%d\n", + dev->alt_servers[i].host.addr, + (int)ntohs(dev->alt_servers[i].host.port), + (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4), + (int)dev->alt_servers[i].failures) + , size); + else + continue; + size -= ret; + buf += ret; + if (size <= 0) + { + size = 0; + break; + } + } + return PAGE_SIZE - size; +} + +ssize_t show_image_name(char *buf, dnbd3_device_t *dev) +{ + if (dev->imgname == NULL) return sprintf(buf, "(null)"); + return MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE); +} + +ssize_t show_rid(char *buf, dnbd3_device_t *dev) +{ + return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->rid), PAGE_SIZE); +} + +ssize_t show_update_available(char *buf, dnbd3_device_t *dev) +{ + return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->update_available), PAGE_SIZE); +} + +device_attr_t cur_server_addr = +{ + .attr = {.name = "cur_server_addr", .mode = 0444 }, + .show = show_cur_server_addr, + .store = NULL, +}; + +device_attr_t cur_server_rtt = +{ + .attr = {.name = "cur_server_rtt", .mode = 0444 }, + .show = show_cur_server_rtt, + .store = NULL, +}; + +device_attr_t alt_server_num = +{ + .attr = {.name = "alt_server_num", .mode = 0444 }, + .show = show_alt_server_num, + .store = NULL, +}; + +device_attr_t alt_servers = +{ + .attr = {.name = "alt_servers", .mode = 0444 }, + .show = show_alt_servers, + .store = NULL, +}; + +device_attr_t image_name = +{ + .attr = {.name = "image_name", .mode = 0444 }, + .show = show_image_name, + .store = NULL, +}; + +device_attr_t rid = +{ + .attr = {.name = "rid", .mode = 0444 }, + .show = show_rid, + .store = NULL, +}; + +device_attr_t update_available = +{ + .attr = {.name = "update_available", .mode = 0444 }, + .show = show_update_available, + .store = NULL, +}; + +ssize_t device_show(struct kobject *kobj, struct attribute *attr, char 
*buf) +{ + device_attr_t *device_attr = container_of(attr, device_attr_t, attr); + dnbd3_device_t *dev = container_of(kobj, dnbd3_device_t, kobj); + return device_attr->show(buf, dev); +} + +struct attribute *device_attrs[] = +{ + &cur_server_addr.attr, + &cur_server_rtt.attr, + &alt_server_num.attr, + &alt_servers.attr, + &image_name.attr, + &rid.attr, + &update_available.attr, + NULL, +}; + + +struct sysfs_ops device_ops = +{ + .show = device_show, +}; + +void release(struct kobject *kobj) +{ + kobj->state_initialized = 0; +} + +struct kobj_type device_ktype = +{ + .default_attrs = device_attrs, + .sysfs_ops = &device_ops, + .release = release, +}; + + +void dnbd3_sysfs_init(dnbd3_device_t *dev) +{ + int error; + struct kobject *kobj = &dev->kobj; + struct kobj_type *ktype = &device_ktype; + struct kobject *parent = &disk_to_dev(dev->disk)->kobj; + + error = kobject_init_and_add(kobj, ktype, parent, "%s", "net"); + if (error) + printk("Error initializing dnbd3 device!\n"); +} + +void dnbd3_sysfs_exit(dnbd3_device_t *dev) +{ + kobject_put(&dev->kobj); +} diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h new file mode 100644 index 0000000..0a747a5 --- /dev/null +++ b/src/kernel/sysfs.h @@ -0,0 +1,45 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + */ + +#ifndef SYSFS_H_ +#define SYSFS_H_ + +#include "dnbd3.h" + +void dnbd3_sysfs_init(dnbd3_device_t *dev); + +void dnbd3_sysfs_exit(dnbd3_device_t *dev); + +typedef struct +{ + struct attribute attr; + ssize_t (*show)(char *, dnbd3_device_t *); + ssize_t (*store)(const char *, size_t, dnbd3_device_t *); +} device_attr_t; + +typedef struct +{ + struct attribute attr; + ssize_t (*show)(char *, dnbd3_server_t *); + ssize_t (*store)(const char *, size_t, dnbd3_server_t *); +} server_attr_t; + + +#endif /* SYSFS_H_ */ diff --git a/src/kernel/utils.c b/src/kernel/utils.c new file mode 100644 index 0000000..902025f --- /dev/null +++ b/src/kernel/utils.c @@ -0,0 +1,41 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ *
+ */
+
+#include
+
+#include "utils.h"
+
+/*
+ * Parses a dotted-quad IPv4 string ("a.b.c.d") and returns the four octets
+ * packed into an unsigned int, first octet at the lowest address.
+ *
+ * Returns 0 if str is NULL, does not contain four dot-separated numbers,
+ * or any number is outside 0..255.
+ */
+unsigned int inet_addr(char *str)
+{
+	int a, b, c, d;
+	/* A union avoids reading a char array through an unsigned int pointer */
+	union {
+		unsigned char bytes[4];
+		unsigned int value;
+	} addr = { .value = 0 };
+
+	/* Without this check a malformed string would leave a..d uninitialised */
+	if (!str || sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d) != 4)
+		return 0;
+	if (a < 0 || a > 255 || b < 0 || b > 255 || c < 0 || c > 255 || d < 0 || d > 255)
+		return 0;
+	addr.bytes[0] = (unsigned char)a;
+	addr.bytes[1] = (unsigned char)b;
+	addr.bytes[2] = (unsigned char)c;
+	addr.bytes[3] = (unsigned char)d;
+	return addr.value;
+}
+
+/*
+ * Writes addr as dotted-quad text into str. The caller must supply a buffer
+ * large enough for the result (up to 16 bytes including the terminator).
+ */
+void inet_ntoa(struct in_addr addr, char *str)
+{
+	unsigned char *ptr = (unsigned char *) &addr;
+	sprintf(str, "%d.%d.%d.%d", ptr[0] & 0xff, ptr[1] & 0xff, ptr[2] & 0xff, ptr[3] & 0xff);
+}
diff --git a/src/kernel/utils.h b/src/kernel/utils.h
new file mode 100644
index 0000000..e54b3cf
--- /dev/null
+++ b/src/kernel/utils.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef UTILS_H_
+#define UTILS_H_
+
+#include
+
+unsigned int inet_addr(char *str);
+void inet_ntoa(struct in_addr addr, char *str);
+
+#endif /* UTILS_H_ */
diff --git a/src/serialize.c b/src/serialize.c
new file mode 100644
index 0000000..0bc0dcd
--- /dev/null
+++ b/src/serialize.c
@@ -0,0 +1,84 @@
+#include "serialize.h"
+#include "types.h"
+
+
+void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len)
+{
+	buffer->buffer_end = buffer->buffer + MIN(MAX_PAYLOAD, data_len);
+	buffer->buffer_pointer = buffer->buffer;
+}
+
+void serializer_reset_write(serialized_buffer_t *buffer)
+{
+	buffer->buffer_end = buffer->buffer + MAX_PAYLOAD;
+	buffer->buffer_pointer = buffer->buffer;
+}
+
+uint8_t serializer_get_uint8(serialized_buffer_t *buffer)
+{
+	if (buffer->buffer_pointer + 1 > buffer->buffer_end) return 0;
+	return (uint8_t)*buffer->buffer_pointer++;
+}
+
+uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
+{
+	uint16_t ret;
+	if (buffer->buffer_pointer + 2 > buffer->buffer_end) return 0;
+	memcpy(&ret, buffer->buffer_pointer, 2);
+	buffer->buffer_pointer += 2;
+	return net_order_16(ret);
+}
+
+uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
+{
+	uint64_t ret;
+	if (buffer->buffer_pointer + 8 > buffer->buffer_end) return 0;
+	memcpy(&ret, buffer->buffer_pointer, 8);
+	buffer->buffer_pointer += 8;
+	return net_order_64(ret);
+}
+
+/*
+ * Reads a NUL-terminated string from the current read position.
+ *
+ * Returns a pointer into the buffer itself (not a copy) and advances the
+ * read position past the terminator. Returns NULL without advancing if no
+ * data is left or no terminator is found before buffer_end.
+ */
+char *serializer_get_string(serialized_buffer_t *buffer)
+{
+	char *ptr = buffer->buffer_pointer, *start = buffer->buffer_pointer;
+	if (ptr >= buffer->buffer_end) return NULL;
+	while (ptr < buffer->buffer_end && *ptr) ++ptr;
+	// Compare against the end instead of dereferencing: ptr may equal buffer_end here, which is past the valid data
+	if (ptr >= buffer->buffer_end) return NULL; // String did not terminate within buffer (possibly corrupted/malicious packet)
+	buffer->buffer_pointer = ptr + 1;
+	return start;
+}
+
+void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value)
+{
+	if (buffer->buffer_pointer + 1 > buffer->buffer_end) return;
+	*buffer->buffer_pointer++ = (char)value;
+}
+
+void
serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value) +{ + if (buffer->buffer_pointer + 2 > buffer->buffer_end) return; + value = net_order_16(value); + memcpy(buffer->buffer_pointer, &value, 2); + buffer->buffer_pointer += 2; +} + +void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value) +{ + if (buffer->buffer_pointer + 8 > buffer->buffer_end) return; + value = net_order_64(value); + memcpy(buffer->buffer_pointer, &value, 8); + buffer->buffer_pointer += 8; +} + +void serializer_put_string(serialized_buffer_t *buffer, const char *value) +{ + const size_t len = strlen(value) + 1; + if (buffer->buffer_pointer + len > buffer->buffer_end) return; + memcpy(buffer->buffer_pointer, value, len); + buffer->buffer_pointer += len; +} + +uint32_t serializer_get_written_length(serialized_buffer_t *buffer) +{ + return (uint32_t)( buffer->buffer_pointer - buffer->buffer ); +} diff --git a/src/serialize.h b/src/serialize.h new file mode 100644 index 0000000..1b73531 --- /dev/null +++ b/src/serialize.h @@ -0,0 +1,40 @@ +#ifndef SERIALIZER_H_ +#define SERIALIZER_H_ + +// Careful with includes - this is used in kernel module too +#include "config.h" + +typedef struct +{ + char buffer[MAX_PAYLOAD]; // This MUST be the first member or send_reply() will blow up + char *buffer_end; + char *buffer_pointer; +} serialized_buffer_t; + +void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len); + +void serializer_reset_write(serialized_buffer_t *buffer); + +uint32_t serializer_get_written_length(serialized_buffer_t *buffer); + +// + +uint8_t serializer_get_uint8(serialized_buffer_t *buffer); + +uint16_t serializer_get_uint16(serialized_buffer_t *buffer); + +uint64_t serializer_get_uint64(serialized_buffer_t *buffer); + +char *serializer_get_string(serialized_buffer_t *buffer); + +// + +void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value); + +void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value); + +void 
serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value); + +void serializer_put_string(serialized_buffer_t *buffer, const char *value); + +#endif diff --git a/src/server/altservers.c b/src/server/altservers.c new file mode 100644 index 0000000..b91ceab --- /dev/null +++ b/src/server/altservers.c @@ -0,0 +1,612 @@ +#include "altservers.h" +#include "locks.h" +#include "helper.h" +#include "image.h" +#include "fileutil.h" +#include "../shared/protocol.h" +#include "../shared/timing.h" +#include "../serverconfig.h" +#include +#include +#include + +#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid) +#define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0); +#define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__) + +static dnbd3_connection_t *pending[SERVER_MAX_PENDING_ALT_CHECKS]; +static pthread_spinlock_t pendingLockWrite; // Lock for adding something to pending. (NULL -> nonNULL) +static pthread_mutex_t pendingLockConsume = PTHREAD_MUTEX_INITIALIZER; // Lock for removing something (nonNULL -> NULL) +static dnbd3_signal_t* runSignal = NULL; + +static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS]; +static int numAltServers = 0; +static pthread_spinlock_t altServersLock; + +static pthread_t altThread; + +static void *altservers_main(void *data); +static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt); + +void altservers_init() +{ + srand( (unsigned int)time( NULL ) ); + // Init spinlock + spin_init( &pendingLockWrite, PTHREAD_PROCESS_PRIVATE ); + spin_init( &altServersLock, PTHREAD_PROCESS_PRIVATE ); + // Init signal + runSignal = signal_new(); + if ( runSignal == NULL ) { + logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." 
);
+		exit( EXIT_FAILURE );
+	}
+	memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) );
+	if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) {
+		logadd( LOG_ERROR, "Could not start altservers connector thread" );
+		exit( EXIT_FAILURE );
+	}
+	// Init waiting links queue -- this is currently a global static array so
+	// it will already be zero, but in case we refactor later do it explicitly
+	// while also holding the write lock so thread sanitizer is happy
+	spin_lock( &pendingLockWrite );
+	for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
+		pending[i] = NULL;
+	}
+	spin_unlock( &pendingLockWrite );
+}
+
+void altservers_shutdown()
+{
+	if ( runSignal == NULL ) return;
+	signal_call( runSignal ); // Wake altservers thread up
+	thread_join( altThread, NULL );
+}
+
+/**
+ * Line callback for the alt-servers file. argv[0] is the address, optionally
+ * prefixed with '-' (private) and/or '+' (client only); argv[1], if present,
+ * is a comment. Lines whose first field starts with '#' are ignored.
+ * data points to an int that is incremented for every server added.
+ */
+static void addalt(int argc, char **argv, void *data)
+{
+	char *shost;
+	const char *comment;
+	dnbd3_host_t host;
+	bool isPrivate = false;
+	bool isClientOnly = false;
+	if ( argc < 1 || argv[0][0] == '#' ) return;
+	for (shost = argv[0]; *shost != '\0'; ) { // Trim left and scan for "-" prefix
+		if ( *shost == '-' ) isPrivate = true;
+		else if ( *shost == '+' ) isClientOnly = true;
+		else if ( *shost != ' ' && *shost != '\t' ) break;
+		shost++;
+	}
+	if ( !parse_address( shost, &host ) ) {
+		logadd( LOG_WARNING, "Invalid entry in alt-servers file ignored: '%s'", shost );
+		return;
+	}
+	// Use a local for the optional comment instead of storing into argv[1],
+	// a slot the caller may not have provided when argc == 1
+	comment = ( argc > 1 ) ? argv[1] : "";
+	if ( altservers_add( &host, comment, isPrivate, isClientOnly ) ) {
+		(*(int*)data)++;
+	}
+}
+
+int altservers_load()
+{
+	int count = 0;
+	char *name;
+	if ( asprintf( &name, "%s/%s", _configDir, "alt-servers" ) == -1 ) return -1;
+	file_loadLineBased( name, 1, 2, &addalt, (void*)&count );
+	free( name );
+	logadd( LOG_DEBUG1, "Added %d alt servers\n", count );
+	return count;
+}
+
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly)
+{
+	int i, freeSlot = -1;
+	spin_lock( &altServersLock );
+	for (i = 0; i <
numAltServers; ++i) { + if ( isSameAddressPort( &altServers[i].host, host ) ) { + spin_unlock( &altServersLock ); + return false; + } else if ( freeSlot == -1 && altServers[i].host.type == 0 ) { + freeSlot = i; + } + } + if ( freeSlot == -1 ) { + if ( numAltServers >= SERVER_MAX_ALTS ) { + logadd( LOG_WARNING, "Cannot add another alt server, maximum of %d already reached.", (int)SERVER_MAX_ALTS ); + spin_unlock( &altServersLock ); + return false; + } + freeSlot = numAltServers++; + } + altServers[freeSlot].host = *host; + altServers[freeSlot].isPrivate = isPrivate; + altServers[freeSlot].isClientOnly = isClientOnly; + if ( comment != NULL ) snprintf( altServers[freeSlot].comment, COMMENT_LENGTH, "%s", comment ); + spin_unlock( &altServersLock ); + return true; +} + +/** + * ONLY called from the passed uplink's main thread + */ +void altservers_findUplink(dnbd3_connection_t *uplink) +{ + int i; + // if betterFd != -1 it means the uplink is supposed to switch to another + // server. As this function here is called by the uplink thread, it can + // never be that the uplink is supposed to switch, but instead calls + // this function. 
+ assert( uplink->betterFd == -1 ); + spin_lock( &pendingLockWrite ); + // it is however possible that an RTT measurement is currently in progress, + // so check for that case and do nothing if one is in progress + if ( uplink->rttTestResult == RTT_INPROGRESS ) { + for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { + if ( pending[i] != uplink ) continue; + // Yep, measuring right now + spin_unlock( &pendingLockWrite ); + return; + } + } + // Find free slot for measurement + for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { + if ( pending[i] != NULL ) continue; + pending[i] = uplink; + uplink->rttTestResult = RTT_INPROGRESS; + spin_unlock( &pendingLockWrite ); + signal_call( runSignal ); // Wake altservers thread up + return; + } + // End of loop - no free slot + spin_unlock( &pendingLockWrite ); + logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." ); +} + +/** + * The given uplink is about to disappear, so remove it from any queues + */ +void altservers_removeUplink(dnbd3_connection_t *uplink) +{ + pthread_mutex_lock( &pendingLockConsume ); + spin_lock( &pendingLockWrite ); + for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) { + if ( pending[i] == uplink ) { + uplink->rttTestResult = RTT_NOT_REACHABLE; + pending[i] = NULL; + } + } + spin_unlock( &pendingLockWrite ); + pthread_mutex_unlock( &pendingLockConsume ); +} + +/** + * Get known (working) alt servers, ordered by network closeness + * (by finding the smallest possible subnet) + * Private servers are excluded, so this is what you want to call to + * get a list of servers you can tell a client about + */ +int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size) +{ + if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0; + int i, j; + int count = 0; + int scores[size]; + int score; + spin_lock( &altServersLock ); + if ( size > numAltServers ) size = numAltServers; + for (i = 0; i < 
numAltServers; ++i) { + if ( altServers[i].host.type == 0 ) continue; // Slot is empty + if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers + if ( host->type == altServers[i].host.type ) { + score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails; + } else { + score = -( altServers[i].numFails + 128 ); // Wrong address family + } + if ( count == 0 ) { + // Trivial - this is the first entry + output[0].host = altServers[i].host; + output[0].failures = 0; + scores[0] = score; + count++; + } else { + // Other entries already exist, insert in proper position + for (j = 0; j < size; ++j) { + if ( j < count && score <= scores[j] ) continue; + if ( j > count ) break; // Should never happen but just in case... + if ( j < count && j + 1 < size ) { + // Check if we're in the middle and need to move other entries... + memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) ); + memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) ); + } + if ( count < size ) { + count++; + } + output[j].host = altServers[i].host; + output[j].failures = 0; + scores[j] = score; + break; + } + } + } + spin_unlock( &altServersLock ); + return count; +} + +/** + * Get alt servers. If there are more alt servers than + * requested, random servers will be picked. + * This function is suited for finding uplink servers as + * it includes private servers and ignores any "client only" servers + */ +int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency) +{ + if ( size <= 0 ) return 0; + int count = 0, i; + ticks now; + timing_get( &now ); + spin_lock( &altServersLock ); + // Flip first server in list with a random one every time this is called + if ( numAltServers > 1 ) { + const dnbd3_alt_server_t tmp = altServers[0]; + do { + i = rand() % numAltServers; + } while ( i == 0 ); + altServers[0] = altServers[i]; + altServers[i] = tmp; + } + // We iterate over the list twice. 
First run adds servers with 0 failures only, + // second one also considers those that failed (not too many times) + if ( size > numAltServers ) size = numAltServers; + for (i = 0; i < numAltServers * 2; ++i) { + dnbd3_alt_server_t *srv = &altServers[i % numAltServers]; + if ( srv->host.type == 0 ) continue; // Slot is empty + if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore! + if ( srv->isClientOnly ) continue; + bool first = ( i < numAltServers ); + if ( first ) { + if ( srv->numFails > 0 ) continue; + } else { + if ( srv->numFails == 0 ) continue; // Already added in first iteration + if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row + && timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore! + if ( !emergency ) srv->numFails--; + } + // server seems ok, include in output and decrease its fail counter + output[count++] = srv->host; + if ( count >= size ) break; + } + spin_unlock( &altServersLock ); + return count; +} + +json_t* altservers_toJson() +{ + json_t *list = json_array(); + + spin_lock( &altServersLock ); + char host[100]; + const int count = numAltServers; + dnbd3_alt_server_t src[count]; + memcpy( src, altServers, sizeof(src) ); + spin_unlock( &altServersLock ); + for (int i = 0; i < count; ++i) { + json_t *rtts = json_array(); + for (int j = 0; j < SERVER_RTT_PROBES; ++j) { + json_array_append_new( rtts, json_integer( src[i].rtt[ (j + src[i].rttIndex + 1) % SERVER_RTT_PROBES ] ) ); + } + sock_printHost( &src[i].host, host, sizeof(host) ); + json_t *server = json_pack( "{ss,ss,so,sb,sb,si}", + "comment", src[i].comment, + "host", host, + "rtt", rtts, + "isPrivate", (int)src[i].isPrivate, + "isClientOnly", (int)src[i].isClientOnly, + "numFails", src[i].numFails + ); + json_array_append_new( list, server ); + } + return list; +} + +/** + * Update rtt history of given server - 
returns the new average for that server + */ +static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt) +{ + unsigned int avg = rtt; + int i; + spin_lock( &altServersLock ); + for (i = 0; i < numAltServers; ++i) { + if ( !isSameAddressPort( host, &altServers[i].host ) ) continue; + altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt; +#if SERVER_RTT_PROBES == 5 + avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2] + + altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES; +#else +#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES + avg = 0; + for (int j = 0; j < SERVER_RTT_PROBES; ++j) { + avg += altServers[i].rtt[j]; + } + avg /= SERVER_RTT_PROBES; +#endif + // If we got a new rtt value, server must be working + if ( altServers[i].numFails > 0 ) { + altServers[i].numFails--; + } + break; + } + spin_unlock( &altServersLock ); + return avg; +} + +/** + * Determine how close two addresses are to each other by comparing the number of + * matching bits from the left of the address. Does not count individual bits but + * groups of 4 for speed. + * Return: Closeness - higher number means closer + */ +int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2) +{ + if ( host1 == NULL || host2 == NULL || host1->type != host2->type ) return -1; + int retval = 0; + const int max = host1->type == HOST_IP4 ? 4 : 16; + for (int i = 0; i < max; ++i) { + if ( (host1->addr[i] & 0xf0) != (host2->addr[i] & 0xf0) ) return retval; + ++retval; + if ( (host1->addr[i] & 0x0f) != (host2->addr[i] & 0x0f) ) return retval; + ++retval; + } + return retval; +} + +/** + * Called if an uplink server failed during normal uplink operation. This unit keeps + * track of how often servers fail, and consider them disabled for some time if they + * fail too many times. 
+ */ +void altservers_serverFailed(const dnbd3_host_t * const host) +{ + int i; + int foundIndex = -1, lastOk = -1; + ticks now; + timing_get( &now ); + spin_lock( &altServersLock ); + for (i = 0; i < numAltServers; ++i) { + if ( foundIndex == -1 ) { + // Looking for the failed server in list + if ( isSameAddressPort( host, &altServers[i].host ) ) { + foundIndex = i; + } + } else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) { + lastOk = i; + } + } + // Do only increase counter if last fail was not too recent. This is + // to prevent the counter from increasing rapidly if many images use the + // same uplink. If there's a network hickup, all uplinks will call this + // function and would increase the counter too quickly, disabling the server. + if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) { + altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE; + altServers[foundIndex].lastFail = now; + if ( lastOk != -1 ) { + // Make sure non-working servers are put at the end of the list, so they're less likely + // to get picked when testing servers for uplink connections. + const dnbd3_alt_server_t tmp = altServers[foundIndex]; + altServers[foundIndex] = altServers[lastOk]; + altServers[lastOk] = tmp; + } + } + spin_unlock( &altServersLock ); +} +/** + * Mainloop of this module. It will wait for requests by uplinks to find a + * suitable uplink server for them. If found, it will tell the uplink about + * the best server found. Currently the RTT history is kept per server and + * not per uplink, so if many images use the same uplink server, the history + * will update quite quickly. Needs to be improved some time, ie. by only + * updating the rtt if the last update was at least X seconds ago. 
+ *
+ * Besides the RTT checks, this loop also closes the file descriptors of
+ * unused images from time to time, if _closeUnusedFd is set.
+ */
+static void *altservers_main(void *data UNUSED)
+{
+	// Number of alt servers requested per check; the uplink's current server
+	// may be appended on top of that, hence ALTS + 1 slots in 'servers'.
+	const int ALTS = 4;
+	int ret, itLink, itAlt, numAlts;
+	bool found;
+	char buffer[DNBD3_BLOCK_SIZE ];
+	dnbd3_reply_t reply;
+	dnbd3_host_t servers[ALTS + 1];
+	serialized_buffer_t serialized;
+	struct timespec start, end;
+	ticks nextCloseUnusedFd;
+
+	setThreadName( "altserver-check" );
+	blockNoncriticalSignals();
+	timing_gets( &nextCloseUnusedFd, 900 );
+	// LOOP
+	while ( !_shutdown ) {
+		// Wait 5 seconds max.
+		ret = signal_wait( runSignal, 5000 );
+		if ( _shutdown ) goto cleanup;
+		if ( ret == SIGNAL_ERROR ) {
+			if ( errno == EAGAIN || errno == EINTR ) continue;
+			logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno );
+			usleep( 100000 );
+		}
+		// Work your way through the queue
+		for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
+			spin_lock( &pendingLockWrite );
+			if ( pending[itLink] == NULL ) {
+				spin_unlock( &pendingLockWrite );
+				continue; // Check once before locking, as a mutex is expensive
+			}
+			spin_unlock( &pendingLockWrite );
+			pthread_mutex_lock( &pendingLockConsume );
+			spin_lock( &pendingLockWrite );
+			dnbd3_connection_t * const uplink = pending[itLink];
+			spin_unlock( &pendingLockWrite );
+			if ( uplink == NULL ) { // Check again after locking
+				pthread_mutex_unlock( &pendingLockConsume );
+				continue;
+			}
+			dnbd3_image_t * const image = image_lock( uplink->image );
+			if ( image == NULL ) { // Check again after locking
+				uplink->rttTestResult = RTT_NOT_REACHABLE;
+				spin_lock( &pendingLockWrite );
+				pending[itLink] = NULL;
+				spin_unlock( &pendingLockWrite );
+				pthread_mutex_unlock( &pendingLockConsume );
+				logadd( LOG_DEBUG1, "Image has gone away that was queued for RTT measurement" );
+				continue;
+			}
+			LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
+			assert( uplink->rttTestResult == RTT_INPROGRESS );
+			// Now get 4 alt servers
+			numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 );
+			if ( uplink->fd != -1 ) {
+				// Add current server if not already in list
+				found = false;
+				for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+					if ( !isSameAddressPort( &uplink->currentServer, &servers[itAlt] ) ) continue;
+					found = true;
+					break;
+				}
+				if ( !found ) servers[numAlts++] = uplink->currentServer;
+			}
+			// Test them all
+			int bestSock = -1;
+			int bestIndex = -1;
+			int bestProtocolVersion = -1;
+			unsigned long bestRtt = RTT_UNREACHABLE;
+			unsigned long currentRtt = RTT_UNREACHABLE;
+			for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+				usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...)
+				// Connect
+				clock_gettime( BEST_CLOCK_SOURCE, &start );
+				int sock = sock_connect( &servers[itAlt], 750, 1000 );
+				if ( sock < 0 ) continue;
+				// Select image ++++++++++++++++++++++++++++++
+				if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
+					goto server_failed;
+				}
+				// See if selecting the image succeeded ++++++++++++++++++++++++++++++
+				uint16_t protocolVersion, rid;
+				uint64_t imageSize;
+				char *name;
+				if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
+					// Not counted as a server failure: this label skips altservers_serverFailed()
+					goto server_image_not_available;
+				}
+				if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed;
+				if ( name == NULL || strcmp( name, image->name ) != 0 ) {
+					// name may be NULL here; never hand a NULL pointer to %s
+					ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name == NULL ? "(null)" : name );
+				}
+				if ( rid != image->rid ) {
+					ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid );
+				}
+				if ( imageSize != image->virtualFilesize ) {
+					ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
+				}
+				// Request first block (NOT random!) ++++++++++++++++++++++++++++++
+				if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+					LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink );
+				}
+				// See if requesting the block succeeded ++++++++++++++++++++++
+				if ( !dnbd3_get_reply( sock, &reply ) ) {
+					LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink );
+				}
+				// check reply header
+				if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+					ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
+				}
+				if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+					ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink );
+				}
+				clock_gettime( BEST_CLOCK_SOURCE, &end );
+				// Measurement done - everything fine so far
+				spin_lock( &uplink->rttLock );
+				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
+				// Penalize rtt if this was a cycle; this will treat this server with lower priority
+				// in the near future too, so we prevent alternating between two servers that are both
+				// part of a cycle and have the lowest latency.
+				const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000
+						+ (end.tv_nsec - start.tv_nsec) / 1000
+						+ ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
+				unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
+				// If a cycle was detected, or we lost connection to the current (last) server, penalize it one time
+				if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
+				spin_unlock( &uplink->rttLock );
+				if ( uplink->fd != -1 && isCurrent ) {
+					// Was measuring current server
+					currentRtt = avg;
+					close( sock );
+				} else if ( avg < bestRtt ) {
+					// Was another server, update "best"
+					if ( bestSock != -1 ) close( bestSock );
+					bestSock = sock;
+					bestRtt = avg;
+					bestIndex = itAlt;
+					bestProtocolVersion = protocolVersion;
+				} else {
+					// Was too slow, ignore
+					close( sock );
+				}
+				// We're done, call continue
+				continue;
+				// Jump here if anything went wrong
+				// This will cleanup and continue
+				server_failed: ;
+				altservers_serverFailed( &servers[itAlt] );
+				server_image_not_available: ;
+				close( sock );
+			}
+			// Done testing all servers. See if we should switch
+			if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
+				// yep
+				if ( currentRtt > 10000000 || uplink->fd == -1 ) {
+					LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
+				} else {
+					LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
+				}
+				sock_setTimeout( bestSock, _uplinkTimeout );
+				// Hand the already connected socket over to the uplink and wake it up
+				spin_lock( &uplink->rttLock );
+				uplink->betterFd = bestSock;
+				uplink->betterServer = servers[bestIndex];
+				uplink->betterVersion = bestProtocolVersion;
+				uplink->rttTestResult = RTT_DOCHANGE;
+				spin_unlock( &uplink->rttLock );
+				signal_call( uplink->signal );
+			} else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) {
+				// No server was reachable
+				spin_lock( &uplink->rttLock );
+				uplink->rttTestResult = RTT_NOT_REACHABLE;
+				spin_unlock( &uplink->rttLock );
+			} else {
+				// nope
+				if ( bestSock != -1 ) close( bestSock );
+				spin_lock( &uplink->rttLock );
+				uplink->rttTestResult = RTT_DONTCHANGE;
+				uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
+				spin_unlock( &uplink->rttLock );
+				if ( !image->working ) {
+					image->working = true;
+					LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink );
+				}
+			}
+			image_release( image );
+			// end of loop over all pending uplinks
+			spin_lock( &pendingLockWrite );
+			pending[itLink] = NULL;
+			spin_unlock( &pendingLockWrite );
+			pthread_mutex_unlock( &pendingLockConsume );
+		}
+		// Housekeeping: close fds of unused images once the deadline has passed
+		// (declare_now presumably declares and fills 'now' - TODO confirm)
+		declare_now;
+		// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
+		if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
+			timing_gets( &nextCloseUnusedFd, 900 );
+			image_closeUnusedFd();
+		}
+	}
+	cleanup: ;
+	if ( runSignal != NULL ) signal_close( runSignal );
+	runSignal = NULL;
+	return NULL ;
+}
+
diff --git a/src/server/altservers.h b/src/server/altservers.h
new file mode 100644
index 0000000..7b7b46d
--- /dev/null
+++ b/src/server/altservers.h
@@ -0,0 +1,30 @@
+#ifndef _ALTSERVERS_H_
+#define _ALTSERVERS_H_
+
+#include "globals.h"
+
+struct json_t;
+
+void altservers_init();
+
+void altservers_shutdown();
+
+int altservers_load();
+
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
+
+void altservers_findUplink(dnbd3_connection_t *uplink);
+
+void altservers_removeUplink(dnbd3_connection_t *uplink);
+
+int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
+
+int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency);
+
+int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
+
+void altservers_serverFailed(const dnbd3_host_t * const host);
+
+struct json_t* altservers_toJson();
+
+#endif /* _ALTSERVERS_H_ */
diff --git a/src/server/fileutil.c b/src/server/fileutil.c
new file mode 100644
index 0000000..336ab68
--- /dev/null
+++
b/src/server/fileutil.c
@@ -0,0 +1,128 @@
+#include "fileutil.h"
+#include "helper.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/**
+ * Check whether the given file can be opened for reading.
+ * @return true if open() with O_RDONLY succeeds, false otherwise
+ */
+bool file_isReadable(char *file)
+{
+	int fd = open( file, O_RDONLY );
+	if ( fd < 0 ) return false;
+	close( fd );
+	return true;
+}
+
+/**
+ * Check whether the given file can be written to. If it cannot be opened
+ * for writing as is, try to create it (mode 0600); a file created
+ * that way is removed again before returning.
+ * @return true if the file could be opened or created for writing
+ */
+bool file_isWritable(char *file)
+{
+	int fd = open( file, O_WRONLY );
+	if ( fd >= 0 ) {
+		close( fd );
+		return true;
+	}
+	fd = open( file, O_WRONLY | O_CREAT, 0600 );
+	if ( fd < 0 ) return false;
+	close( fd );
+	remove( file );
+	return true;
+}
+
+/**
+ * Create the given directory including all missing parent directories
+ * (mode 0755). Path components that already exist (EEXIST) are not
+ * treated as an error. An empty path is a no-op that returns true.
+ * @return false as soon as one mkdir() call fails for another reason
+ */
+bool mkdir_p(const char* path)
+{
+	assert( path != NULL );
+	if ( *path == '\0' ) return true;
+	char buffer[strlen( path ) + 1];
+	strcpy( buffer, path );
+	char *current = buffer;
+	char *slash;
+	while ( (slash = strchr( current, '/' )) != NULL ) {
+		// Temporarily cut the path at this slash and create the prefix;
+		// an empty prefix (leading slash of an absolute path) is skipped
+		*slash = '\0';
+		if ( *buffer != '\0' && mkdir( buffer, 0755 ) != 0 && errno != EEXIST ) return false;
+		*slash = '/';
+		current = slash + 1;
+	}
+	if ( mkdir( buffer, 0755 ) != 0 && errno != EEXIST ) return false;
+	return true;
+}
+
+/**
+ * Allocate disk space for the given range of the file, using fallocate()
+ * on Linux and posix_fallocate() on FreeBSD.
+ * @return true on success; false on failure and on every other platform
+ */
+bool file_alloc(int fd, uint64_t offset, uint64_t size)
+{
+#ifdef __linux__
+	if ( fallocate( fd, 0, offset, size ) == 0 ) return true; // fast way
+#elif defined(__FreeBSD__)
+	if ( posix_fallocate( fd, offset, size ) == 0 ) return true; // slow way
+#endif
+	return false;
+}
+
+/**
+ * Set the apparent size of the file to 'size' bytes. ftruncate() is tried
+ * first; if that fails, fall back to (re)writing the last byte of the
+ * requested size so the file spans at least 'size' bytes.
+ * @return true if either approach succeeded
+ */
+bool file_setSize(int fd, uint64_t size)
+{
+	if ( ftruncate( fd, size ) == 0 ) return true;
+	// The fallback addresses offset size - 1, which would wrap around for 0
+	if ( size == 0 ) return false;
+
+	// Try really hard... image loading logic relies on the file
+	// having the proper apparent size
+	uint8_t byte = 0;
+	// Keep the current value of the last byte if there is one; otherwise
+	// (beyond EOF, or read error) write a zero byte
+	if ( pread( fd, &byte, 1, size - 1 ) != 1 ) byte = 0;
+	if ( pwrite( fd, &byte, 1, size - 1 ) == 1 ) return true;
+	return false;
+}
+
+/**
+ * Query total and available space of the file system 'path' lives on, in bytes
+ * (f_blocks resp. f_bavail multiplied by f_frsize, as reported by statvfs()).
+ * @param total if not NULL, receives the total size
+ * @param avail if not NULL, receives the available space
+ * @return false if statvfs() fails, true otherwise
+ */
+bool file_freeDiskSpace(const char * const path, uint64_t *total, uint64_t *avail)
+{
+	struct statvfs fiData;
+	if ( statvfs( path, &fiData ) < 0 ) {
+		return false;
+	}
+	if ( avail != NULL ) {
+		*avail = ((uint64_t)fiData.f_bavail * (uint64_t)fiData.f_frsize);
+	}
+	if ( total != NULL ) {
+		*total = ((uint64_t)fiData.f_blocks * (uint64_t)fiData.f_frsize);
+	}
+	return true;
+}
+
+/**
+ * Get the modification time (st_mtime) of the given file.
+ * @return the mtime, or 0 if stat() fails
+ */
+time_t file_lastModification(const char * const file)
+{
+	struct stat st;
+	if ( stat( file, &st ) != 0 ) return 0;
+	return st.st_mtime;
+}
+
+/**
+ * Read a text file line by line, split each line into fields separated by
+ * spaces or tabs, and pass the fields to the callback.
+ * Lines are read in chunks of at most sizeof(buffer) - 1 characters, and at
+ * most 20 fields are collected per line. As soon as maxFields fields have
+ * been found, the last field takes the remainder of the line (passed through
+ * trim_right()). Lines yielding fewer than minFields fields are skipped.
+ * @param cb called as cb( fieldCount, fields, data ) for every accepted line;
+ *           the field pointers reference a buffer that is reused for the next line
+ * @return number of lines passed to cb, or -1 if file or cb is NULL or the
+ *         file cannot be opened
+ */
+int file_loadLineBased(const char * const file, int minFields, int maxFields, void (*cb)(int argc, char **argv, void *data), void *data)
+{
+	char buffer[1000], *line;
+	char *items[20];
+	int count = 0, itemCount;
+
+	if ( file == NULL || cb == NULL ) return -1;
+	FILE *fp = fopen( file, "r" );
+	if ( fp == NULL ) return -1;
+	while ( fgets( buffer, sizeof(buffer), fp ) != NULL ) {
+		itemCount = 0;
+		for (line = buffer; *line != '\0' && itemCount < 20; ) { // Skip leading blanks, then collect the next field
+			while ( *line == ' ' || *line == '\t' ) ++line;
+			if ( *line == '\r' || *line == '\n' || *line == '\0' ) break; // Ignore empty lines
+			items[itemCount++] = line;
+			if ( itemCount >= maxFields ) {
+				trim_right( line );
+				break;
+			}
+			while ( *line != '\0' && *line != ' ' && *line != '\t' && *line != '\r' && *line != '\n' ) ++line;
+			if ( *line != '\0' ) *line++ = '\0';
+		}
+		if ( itemCount >= minFields ) {
+			cb( itemCount, items, data );
+			count++;
+		}
+	}
+	fclose( fp );
+	return count;
+}
+
diff --git a/src/server/fileutil.h b/src/server/fileutil.h
new file mode 100644
index 0000000..fcb5c20
--- /dev/null
+++ b/src/server/fileutil.h
@@ -0,0 +1,17 @@
+#ifndef _FILEUTIL_H_
+#define _FILEUTIL_H_
+
+#include
+#include
+#include
+
+bool file_isReadable(char *file);
+bool file_isWritable(char *file);
+bool mkdir_p(const char* path);
+bool file_alloc(int fd, uint64_t offset, uint64_t size);
+bool file_setSize(int fd, uint64_t size);
+bool file_freeDiskSpace(const char * const path, uint64_t *total, uint64_t *avail);
+time_t file_lastModification(const char * const file);
+int file_loadLineBased(const char * const file, int minFields, int maxFields, void (*cb)(int argc, char **argv, void *data), void *data);
+
+#endif /* FILEUTIL_H_ */
diff --git a/src/server/globals.c b/src/server/globals.c
new file mode 100644
index 0000000..c9b9411
--- /dev/null
+++ b/src/server/globals.c
@@ -0,0 +1,321 @@
+#include "globals.h"
+#include "ini.h"
+#include "../shared/log.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+char *_configDir = NULL;
+atomic_bool _shutdown = false;
+// [dnbd3]
+atomic_int _listenPort = PORT;
+char *_basePath = NULL;
+atomic_int _serverPenalty = 0;
+atomic_int _clientPenalty = 0;
+atomic_bool _isProxy = false;
+atomic_int _backgroundReplication = BGR_FULL;
+atomic_int _bgrMinClients = 0;
+atomic_bool _lookupMissingForProxy = true;
+atomic_bool _sparseFiles = false;
+atomic_bool _removeMissingImages = true;
+atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
+atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT;
+atomic_bool _closeUnusedFd = false;
+atomic_bool _vmdkLegacyMode = false;
+// Not really needed anymore since we have '+' and '-' in alt-servers
+atomic_bool _proxyPrivateOnly = false;
+// [limits]
+atomic_int _maxClients = SERVER_MAX_CLIENTS;
+atomic_int _maxImages = SERVER_MAX_IMAGES;
+atomic_int _maxPayload = 9000000; // 9MB
+atomic_uint_fast64_t _maxReplicationSize = (uint64_t)100000000000LL;
+
+/**
+ * True when loading config the first time. Consecutive loads will
+ * ignore certain values which cannot be changed safely at runtime.
+ */
+static atomic_bool initialLoad = true;
+// Serializes config (re)loads; a reload request that finds it taken is dropped
+static pthread_mutex_t loadLock = PTHREAD_MUTEX_INITIALIZER;
+
+// Helpers for ini_handler(): they rely on its 'section', 'key' and 'value'
+// parameters and store into the global named _<kk> if section and key match.
+// The numeric variants pass the key name on, so warnings about a bad value
+// name the offending option.
+#define IS_TRUE(value) (atoi(value) != 0 || strcmp(value, "true") == 0 || strcmp(value, "True") == 0 || strcmp(value, "TRUE") == 0)
+#define SAVE_TO_VAR_STR(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) { if (_ ## kk != NULL) free(_ ## kk); _ ## kk = strdup(value); } } while (0)
+#define SAVE_TO_VAR_BOOL(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) _ ## kk = IS_TRUE(value); } while (0)
+#define SAVE_TO_VAR_INT(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) parse32(value, &_ ## kk, #kk); } while (0)
+#define SAVE_TO_VAR_UINT(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) parse32u(value, &_ ## kk, #kk); } while (0)
+#define SAVE_TO_VAR_UINT64(ss, kk) do { if (strcmp(section, #ss) == 0 && strcmp(key, #kk) == 0) parse64u(value, &_ ## kk, #kk); } while (0)
+
+static void sanitizeFixedConfig();
+
+static void handleMaskString( const char *value, void(*func)(logmask_t) );
+
+// Unit suffixes understood by parse64(); the position in this string (plus one)
+// is the exponent applied to the base
+static const char* units = "KMGTPEZY";
+
+static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname);
+static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname);
+static bool parse32(const char *in, atomic_int *out, const char *optname) UNUSED;
+static bool parse32u(const char *in, atomic_int *out, const char *optname);
+
+/**
+ * Callback passed to ini_parse(); receives one section/key/value triple per
+ * call and stores recognized settings in the matching global. Settings in the
+ * first block are only honored during the initial load. Exits the process if
+ * the configured log file cannot be opened.
+ * @return always 1
+ */
+static int ini_handler(void *custom UNUSED, const char* section, const char* key, const char* value)
+{
+	if ( initialLoad ) {
+		if ( _basePath == NULL ) SAVE_TO_VAR_STR( dnbd3, basePath );
+		SAVE_TO_VAR_BOOL( dnbd3, vmdkLegacyMode );
+		SAVE_TO_VAR_UINT( dnbd3, listenPort );
+		SAVE_TO_VAR_UINT( limits, maxClients );
+		SAVE_TO_VAR_UINT( limits, maxImages );
+	}
+	SAVE_TO_VAR_BOOL( dnbd3, isProxy );
+	SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
+	SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+	SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
+	SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
+	SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages );
+	SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd );
+	SAVE_TO_VAR_UINT( dnbd3, serverPenalty );
+	SAVE_TO_VAR_UINT( dnbd3, clientPenalty );
+	SAVE_TO_VAR_UINT( dnbd3, uplinkTimeout );
+	SAVE_TO_VAR_UINT( dnbd3, clientTimeout );
+	SAVE_TO_VAR_UINT( limits, maxPayload );
+	SAVE_TO_VAR_UINT64( limits, maxReplicationSize );
+	if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) {
+		if ( strcmp( value, "hashblock" ) == 0 ) {
+			_backgroundReplication = BGR_HASHBLOCK;
+		} else if ( IS_TRUE( value ) ) {
+			_backgroundReplication = BGR_FULL;
+		} else {
+			_backgroundReplication = BGR_DISABLED;
+		}
+	}
+	if ( strcmp( section, "logging" ) == 0 && strcmp( key, "fileMask" ) == 0 ) handleMaskString( value, &log_setFileMask );
+	if ( strcmp( section, "logging" ) == 0 && strcmp( key, "consoleMask" ) == 0 ) handleMaskString( value, &log_setConsoleMask );
+	if ( strcmp( section, "logging" ) == 0 && strcmp( key, "consoleTimestamps" ) == 0 ) log_setConsoleTimestamps( IS_TRUE(value) );
+	if ( strcmp( section, "logging" ) == 0 && strcmp( key, "file" ) == 0 ) {
+		if ( log_openLogFile( value ) ) {
+			logadd( LOG_INFO, "Opened log file %s", value );
+		} else {
+			logadd( LOG_ERROR, "Could not open log file %s", value );
+			exit( EXIT_FAILURE );
+		}
+	}
+	return 1;
+}
+
+/**
+ * (Re)load CONFIG_FILENAME from _configDir. If another load is already in
+ * progress, the request is ignored. On the initial load, the settings that
+ * cannot change at runtime are validated via sanitizeFixedConfig().
+ * The effective configuration is logged afterwards.
+ */
+void globals_loadConfig()
+{
+	char *name = NULL;
+	// On failure asprintf() leaves the pointer undefined, so only its return value is reliable
+	if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) < 0 ) return;
+	if ( pthread_mutex_trylock( &loadLock ) != 0 ) {
+		logadd( LOG_INFO, "Ignoring config reload request due to already running reload" );
+		free( name ); // don't leak the file name on this early exit
+		return;
+	}
+	ini_parse( name, &ini_handler, NULL );
+	free( name );
+	if ( initialLoad ) {
+		sanitizeFixedConfig();
+	}
+	if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
+		logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
+		_sparseFiles = false;
+	}
+	// Dump config as interpreted
+	char buffer[2000];
+	globals_dumpConfig( buffer, sizeof(buffer) );
+	logadd( LOG_DEBUG1, "Effective configuration:\n%s", buffer );
+	initialLoad = false;
+	pthread_mutex_unlock( &loadLock );
+}
+
+/**
+ * Validate and adjust the settings that are only read on the initial load:
+ * basePath (must be absolute; trailing slashes are stripped), listenPort
+ * (exits the process if out of range), and maxClients/maxImages, which are
+ * capped to the compile-time limits and lowered further if the open file
+ * limit (RLIMIT_NOFILE) cannot be raised far enough.
+ */
+static void sanitizeFixedConfig()
+{
+	// Validate settings after loading:
+	// base path for images valid?
+	if ( _basePath == NULL || _basePath[0] == '\0' ) {
+		logadd( LOG_WARNING, "No/empty basePath in " CONFIG_FILENAME );
+		free( _basePath );
+		_basePath = NULL;
+	} else if ( _basePath[0] != '/' ) {
+		logadd( LOG_WARNING, "basePath must be absolute!" );
+		free( _basePath );
+		_basePath = NULL;
+	} else {
+		char *end = _basePath + strlen( _basePath ) - 1;
+		while ( end >= _basePath && *end == '/' ) {
+			*end-- = '\0';
+		}
+	}
+	// listen port
+	if ( _listenPort < 1 || _listenPort > 65535 ) {
+		logadd( LOG_ERROR, "listenPort must be 1-65535, but is %d", _listenPort );
+		exit( EXIT_FAILURE );
+	}
+	// Cap to hard limit
+	if ( _maxClients > SERVER_MAX_CLIENTS ) _maxClients = SERVER_MAX_CLIENTS;
+	if ( _maxImages > SERVER_MAX_IMAGES ) _maxImages = SERVER_MAX_IMAGES;
+	// Consider rlimits
+	struct rlimit limit;
+	if ( getrlimit( RLIMIT_NOFILE, &limit ) != 0 ) {
+		logadd( LOG_DEBUG1, "getrlimit failed, errno %d", errno );
+	} else {
+		// One fd per client, one (two in proxy mode) per image, plus 50 spare
+		const rlim_t required = (rlim_t)( _maxClients + _maxImages * ( _isProxy ? 2 : 1 ) + 50 );
+		if ( limit.rlim_cur != RLIM_INFINITY && limit.rlim_cur < required ) {
+			rlim_t current = limit.rlim_cur;
+			if ( required <= limit.rlim_max || limit.rlim_max == RLIM_INFINITY ) {
+				limit.rlim_cur = required;
+			} else {
+				limit.rlim_cur = limit.rlim_max;
+			}
+			if ( current != limit.rlim_cur && setrlimit( RLIMIT_NOFILE, &limit ) == 0 ) {
+				current = limit.rlim_cur;
+				logadd( LOG_INFO, "LIMIT_NOFILE (ulimit -n) soft limit increased to %d", (int)current );
+			}
+			if ( current < required ) {
+				logadd( LOG_WARNING, "This process can only have %d open file handles,"
+						" which is not enough for the selected maxClients and maxImages counts."
+						" Consider increasing the limit to at least %d (RLIMIT_NOFILE, ulimit -n)"
+						" to support the current configuration. maxClients and maxImages have"
+						" been lowered for this session.", (int)current, (int)required );
+				// Shrink both limits step by step until they fit, or until the
+				// lower bounds of this loop are reached
+				do {
+					if ( _maxClients > 500 && _maxImages > 150 ) {
+						_maxImages -= _maxImages / 20 + 1;
+						_maxClients -= _maxClients / 20 + 1;
+					} else if ( _maxImages > 100 ) {
+						_maxImages -= _maxImages / 20 + 1;
+						if ( _maxClients > 200 ) _maxClients -= _maxClients / 25 + 1;
+					} else {
+						break;
+					}
+				} while ( (rlim_t)( _maxClients + _maxImages * ( _isProxy ? 2 : 1 ) + 50 ) > current );
+			}
+		}
+	}
+}
+
+#define SETLOGBIT(name) do { if ( strstr( value, #name ) != NULL ) mask |= LOG_ ## name; } while (0)
+/**
+ * Build a log mask from a string: every log level whose name occurs anywhere
+ * in 'value' is enabled. The resulting mask is passed to 'func'.
+ */
+static void handleMaskString( const char *value, void(*func)(logmask_t) )
+{
+	logmask_t mask = 0;
+	SETLOGBIT( ERROR );
+	SETLOGBIT( WARNING );
+	SETLOGBIT( MINOR );
+	SETLOGBIT( INFO );
+	SETLOGBIT( DEBUG1 );
+	SETLOGBIT( DEBUG2 );
+	(*func)( mask );
+}
+
+/**
+ * Parse a decimal number with an optional unit suffix from 'units'
+ * (case-insensitive, may be preceded by spaces). The multiplier is a power
+ * of 1024, or of 1000 if the unit letter is followed by 'B' or 'b'.
+ * @param optname option name, only used for log messages
+ * @return true and *out set on success; false (with *out untouched) if the
+ *         input is empty, not a number, has an unknown unit, or the scaled
+ *         value does not fit into a long long
+ */
+static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname)
+{
+	if ( *in == '\0' ) {
+		logadd( LOG_WARNING, "Ignoring empty numeric setting '%s'", optname );
+		return false;
+	}
+	char *end;
+	long long int num = strtoll( in, &end, 10 );
+	if ( end == in ) {
+		logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Not a number", in, optname );
+		return false;
+	}
+	int exp, base = 1024;
+	while ( *end == ' ' ) end++;
+	if ( *end == '\0' ) {
+		exp = 0;
+	} else {
+		// Map lower case letters to upper case before the lookup
+		const char *pos = strchr( units, *end > 'Z' ? (*end - 32) : *end );
+		if ( pos == NULL ) {
+			logadd( LOG_ERROR, "Invalid unit '%s' for '%s'", end, optname );
+			return false;
+		}
+		exp = (int)( pos - units ) + 1;
+		end++;
+		if ( *end == 'B' || *end == 'b' ) {
+			base = 1000;
+		}
+	}
+	while ( exp-- > 0 ) {
+		// Signed overflow is undefined behavior, so check before multiplying
+		if ( num > LLONG_MAX / base || num < LLONG_MIN / base ) {
+			logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Out of range", in, optname );
+			return false;
+		}
+		num *= base;
+	}
+	*out = (int64_t)num;
+	return true;
+}
+
+/**
+ * Like parse64(), but rejects negative values.
+ */
+static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname)
+{
+	atomic_int_fast64_t v;
+	if ( !parse64( in, &v, optname ) ) return false;
+	if ( v < 0 ) {
+		logadd( LOG_WARNING, "Ignoring value '%s' for '%s': Cannot be negative", in, optname );
+		return false;
+	}
+	*out = (uint64_t)v;
+	return true;
+}
+
+/**
+ * Like parse64(), but the result has to fit into an int.
+ */
+static bool parse32(const char *in, atomic_int *out, const char *optname)
+{
+	atomic_int_fast64_t v;
+	if ( !parse64( in, &v, optname ) ) return false;
+	if ( v < INT_MIN || v > INT_MAX ) {
+		logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)INT_MIN, (int)INT_MAX, in );
+		return false;
+	}
+	*out = (int)v;
+	return true;
+}
+
+/**
+ * Like parse64(), but the result has to be within 0..INT_MAX.
+ */
+static bool parse32u(const char *in, atomic_int *out, const char *optname)
+{
+	atomic_int_fast64_t v;
+	if ( !parse64( in, &v, optname ) ) return false;
+	if ( v < 0 || v > INT_MAX ) {
+		logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)0, (int)INT_MAX, in );
+		return false;
+	}
+	*out = (int)v;
+	return true;
+}
+
+// Helpers for globals_dumpConfig(): append formatted text at 'buffer', keeping
+// track of the remaining space in 'rem'. If the text does not fit (or snprintf
+// fails), the enclosing function returns size - 1.
+#define P_ARG(...) do { \
+	int r = snprintf(buffer, rem, __VA_ARGS__); \
+	if ( r < 0 || (size_t)r >= rem ) return size - 1; \
+	rem -= r; \
+	buffer += r; \
+} while (0)
+#define PVAR(var,type,cast) P_ARG(#var "=%" type "\n", (cast) _ ## var)
+#define PINT(var) PVAR(var, "d", int)
+#define PUINT64(var) PVAR(var, PRIu64, uint64_t)
+// String settings may be NULL (e.g. an invalid basePath); never pass NULL to %s
+#define PSTR(var) P_ARG(#var "=%s\n", _ ## var != NULL ? (const char*) _ ## var : "(null)")
+#define PBOOL(var) P_ARG(#var "=%s\n", _ ## var ? "true" : "false")
+
+/**
+ * Write the effective configuration to 'buffer' in the same section/key
+ * layout that ini_handler() reads.
+ * @param size capacity of buffer in bytes
+ * @return number of characters written, excluding the terminating null byte;
+ *         size - 1 if the output had to be truncated; 0 if size is 0
+ */
+size_t globals_dumpConfig(char *buffer, size_t size)
+{
+	if ( size == 0 ) return 0; // nothing can be written; size - 1 would wrap around
+	size_t rem = size;
+	P_ARG("[dnbd3]\n");
+	PINT(listenPort);
+	PSTR(basePath);
+	PINT(serverPenalty);
+	PINT(clientPenalty);
+	PBOOL(isProxy);
+	if ( _backgroundReplication == BGR_HASHBLOCK ) {
+		P_ARG("backgroundReplication=hashblock\n");
+	} else {
+		PBOOL(backgroundReplication);
+	}
+	PINT(bgrMinClients);
+	PBOOL(lookupMissingForProxy);
+	PBOOL(sparseFiles);
+	PBOOL(removeMissingImages);
+	PINT(uplinkTimeout);
+	PINT(clientTimeout);
+	PBOOL(closeUnusedFd);
+	PBOOL(vmdkLegacyMode);
+	PBOOL(proxyPrivateOnly);
+	P_ARG("[limits]\n");
+	PINT(maxClients);
+	PINT(maxImages);
+	PINT(maxPayload);
+	PUINT64(maxReplicationSize);
+	return size - rem;
+}
+
diff --git a/src/server/globals.h b/src/server/globals.h
new file mode 100644
index 0000000..2b30bc2
--- /dev/null
+++ b/src/server/globals.h
@@ -0,0 +1,277 @@
+#ifndef _GLOBALS_H_
+#define _GLOBALS_H_
+
+#include "../types.h"
+#include "../shared/fdsignal.h"
+#include "../serverconfig.h"
+#include
+#include
+#include
+#include
+
+typedef struct timespec ticks;
+
+// ######### All structs/types used by the server ########
+
+typedef struct _dnbd3_connection dnbd3_connection_t;
+typedef struct _dnbd3_image dnbd3_image_t;
+typedef struct _dnbd3_client dnbd3_client_t;
+
+// Slot is free, can be used.
+// Must only be set in uplink_handle_receive() or uplink_remove_client()
+#define ULR_FREE 0
+// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
+// Must only be set in uplink_request()
+#define ULR_NEW 1
+// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
+// Must only be set in uplink_mainloop() or uplink_request()
+#define ULR_PENDING 2
+// Slot is being processed, do not consider for hop on.
+// Must only be set in uplink_handle_receive()
+#define ULR_PROCESSING 3
+// One slot of an uplink's request queue (see queue[] in struct _dnbd3_connection)
+typedef struct
+{
+	uint64_t handle; // Client defined handle to pass back in reply
+	uint64_t from; // First byte offset of requested block (ie. 4096)
+	uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+	dnbd3_client_t * client; // Client to send reply to
+	int status; // status of this entry: ULR_*
+#ifdef _DEBUG
+	ticks entered; // When this request entered the queue (for debugging)
+#endif
+	uint8_t hopCount; // How many hops this request has already taken across proxies
+} dnbd3_queued_request_t;
+
+// Values of rttTestResult in struct _dnbd3_connection
+#define RTT_IDLE 0 // Not in progress
+#define RTT_INPROGRESS 1 // In progress, not finished
+#define RTT_DONTCHANGE 2 // Finished, but no better alternative found
+#define RTT_DOCHANGE 3 // Finished, better alternative written to .betterServer + .betterFd
+#define RTT_NOT_REACHABLE 4 // No uplink was reachable
+// State of one uplink, i.e. the connection to the upstream server of an image
+struct _dnbd3_connection
+{
+	int fd; // socket fd to remote server
+	int version; // remote server protocol version
+	dnbd3_signal_t* signal; // used to wake up the process
+	pthread_t thread; // thread holding the connection
+	pthread_spinlock_t queueLock; // lock for synchronization on request queue etc.
+	dnbd3_image_t *image; // image that this uplink is used for; do not call get/release for this pointer
+	dnbd3_host_t currentServer; // Current server we're connected to
+	pthread_spinlock_t rttLock; // When accessing rttTestResult, betterFd or betterServer
+	int rttTestResult; // RTT_*
+	int cacheFd; // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD!
+	int betterVersion; // protocol version of better server
+	int betterFd; // Active connection to better server, ready to use
+	dnbd3_host_t betterServer; // The better server
+	uint8_t *recvBuffer; // Buffer for receiving payload
+	uint32_t recvBufferLen; // Len of ^^
+	volatile bool shutdown; // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
+	bool replicatedLastBlock; // bool telling if the last block has been replicated yet
+	bool cycleDetected; // connection cycle between proxies detected for current remote server
+	int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at
+	// If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
+	uint64_t replicationHandle; // Handle of pending replication request
+	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
+	int queueLen; // length of queue
+	uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives)
+	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+};
+
+// Entry in the list of known alt servers
+typedef struct
+{
+	char comment[COMMENT_LENGTH];
+	dnbd3_host_t host; // address of the server; altservers_serverFailed() treats host.type == 0 as "not in use"
+	unsigned int rtt[SERVER_RTT_PROBES]; // presumably the most recent RTT measurements - TODO confirm
+	unsigned int rttIndex; // presumably the next slot of rtt[] to write - TODO confirm
+	bool isPrivate, isClientOnly;
+	ticks lastFail; // last time a failure was counted by altservers_serverFailed()
+	int numFails; // failure counter, increased by altservers_serverFailed(); 0 means no recorded failures
+} dnbd3_alt_server_t;
+
+typedef struct
+{
+	uint8_t host[16];
+	int bytes;
+	int bitMask;
+	int permissions;
+} dnbd3_access_rule_t;
+
+/**
+ * Image struct. An image path could be something like
+ * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1
+ * and the name would then be
+ * rz/zfs/windows7 zfs.vmdk
+ */
+struct _dnbd3_image
+{
+	char *path; // absolute path of the image
+	char *name; // public name of the image (usually relative path minus revision ID)
+	dnbd3_connection_t *uplink; // pointer to a server connection
+	uint8_t *cache_map; // cache map telling which parts are locally cached, NULL if complete
+	uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k)
+	uint64_t realFilesize; // actual file size on disk
+	ticks atime; // last access time
+	ticks lastWorkCheck; // last time a non-working image has been checked
+	ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
+	uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image
+	uint32_t masterCrc32; // CRC-32 of the crc-32 list
+	int readFd; // used to read the image. Used from multiple threads, so use atomic operations (pread et al)
+	int completenessEstimate; // Completeness estimate in percent
+	int users; // clients currently using this image
+	int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
+	bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected
+	uint16_t rid; // revision of image
+	pthread_spinlock_t lock;
+};
+
+struct _dnbd3_client
+{
+#define HOSTNAMELEN (48)
+	atomic_uint_fast64_t bytesSent; // Byte counter for this client.
+	dnbd3_image_t *image; // Image in use by this client, or NULL during handshake
+	int sock;
+	bool isServer; // true if a server in proxy mode, false if real client
+	dnbd3_host_t host;
+	char hostName[HOSTNAMELEN]; // inet_ntop version of host
+	pthread_mutex_t sendMutex; // Held while writing to sock if image is incomplete (since uplink uses socket too)
+	pthread_spinlock_t lock;
+};
+
+// #######################################################
+#define CONFIG_FILENAME "server.conf"
+
+/**
+ * Base directory where the configuration files reside. Will never have a trailing slash.
+ */
+extern char *_configDir;
+
+/**
+ * Base directory where all images are stored in. Will never have a trailing slash.
+ */
+extern char *_basePath;
+
+/**
+ * Whether or not simple *.vmdk files should be treated as revision 1
+ */
+extern atomic_bool _vmdkLegacyMode;
+
+/**
+ * How much artificial delay should we add when a server connects to us?
+ */
+extern atomic_int _serverPenalty;
+
+/**
+ * How much artificial delay should we add when a client connects to us?
+ */
+extern atomic_int _clientPenalty;
+
+/**
+ * Is server shutting down?
+ */
+extern atomic_bool _shutdown;
+
+/**
+ * Is server allowed to provide images in proxy mode?
+ */
+extern atomic_bool _isProxy;
+
+/**
+ * Only use servers as upstream proxy which are private?
+ */
+extern atomic_bool _proxyPrivateOnly;
+
+/**
+ * Whether to remove missing images from image list on SIGHUP
+ */
+extern atomic_bool _removeMissingImages;
+
+/**
+ * Read timeout when waiting for or sending data on an uplink
+ */
+extern atomic_int _uplinkTimeout;
+
+/**
+ * Read timeout when waiting for or sending data from/to client
+ */
+extern atomic_int _clientTimeout;
+
+/**
+ * If true, images with no active client will have their fd closed after some
+ * idle time.
+ */
+extern atomic_bool _closeUnusedFd;
+
+/**
+ * Should we replicate incomplete images in the background?
+ * Otherwise, only blocks that were explicitly requested will be cached.
+ */
+extern atomic_int _backgroundReplication;
+#define BGR_DISABLED (0)
+#define BGR_FULL (1)
+#define BGR_HASHBLOCK (2)
+
+/**
+ * Minimum connected clients for background replication to kick in
+ */
+extern atomic_int _bgrMinClients;
+
+/**
+ * (In proxy mode): If connecting client is a proxy, and the requested image
+ * is not known locally, should we ask our known alt servers for it?
+ * Otherwise the request is rejected.
+ */
+extern atomic_bool _lookupMissingForProxy;
+
+/**
+ * Should we preallocate proxied images right at the start to make
+ * sure we can cache it entirely, or rather create sparse files
+ * with holes in them? With sparse files, we just keep writing
+ * cached blocks to disk until it is full, and only then will we
+ * start to delete old images. This might be a bit flaky so use
+ * only in space restricted environments. Also make sure your
+ * file system actually supports sparse files / files with holes
+ * in them, or you might get really shitty performance.
+ * This setting is ignored (forced to false when the config is loaded)
+ * if background replication is set to full and bgrMinClients is
+ * lower than 5.
+ */
+extern atomic_bool _sparseFiles;
+
+/**
+ * Port to listen on (default: #define PORT (5003))
+ */
+extern atomic_int _listenPort;
+
+/**
+ * Max number of DNBD3 clients we accept
+ */
+extern atomic_int _maxClients;
+
+/**
+ * Max number of Images we support (in baseDir)
+ */
+extern atomic_int _maxImages;
+
+/**
+ * Maximum payload length we accept on uplinks and thus indirectly
+ * from clients in case the requested range is not cached locally.
+ * Usually this isn't even a megabyte for "real" clients (blockdev
+ * or fuse).
+ */
+extern atomic_int _maxPayload;
+
+/**
+ * If in proxy mode, don't replicate images that are
+ * larger than this according to the uplink server.
+ */
+extern atomic_uint_fast64_t _maxReplicationSize;
+
+/**
+ * Load the server configuration.
+ */
+void globals_loadConfig();
+
+/**
+ * Dump the effective configuration in use to given buffer.
+ */
+size_t globals_dumpConfig(char *buffer, size_t size);
+
+#endif /* _GLOBALS_H_ */
diff --git a/src/server/helper.c b/src/server/helper.c
new file mode 100644
index 0000000..2dbc3ea
--- /dev/null
+++ b/src/server/helper.c
@@ -0,0 +1,146 @@
+#include "helper.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef HAVE_THREAD_NAMES
+#include // For thread names
+#endif
+
+/**
+ * Parse IPv4 or IPv6 address in string representation to a suitable format usable by the BSD socket library
+ * !! Contents of 'string' might be modified by this function !!
+ *
+ * @param string eg. "1.2.3.4" or "2a01::10:5", optionally with port appended, eg "1.2.3.4:6666" or "[2a01::10:5]:6666"
+ * @param host pointer to dnbd3_host_t that will be filled with the following data:
+ * .type will contain either HOST_IP4 or HOST_IP6
+ * .addr will contain the address in network representation
+ * .port will contain the port in network representation, defaulting to #define PORT if none was given
+ * @return true on success, false on failure.
contents of af, addr and port are undefined in the latter case + */ +bool parse_address(char *string, dnbd3_host_t *host) +{ + struct in_addr v4; + struct in6_addr v6; + + memset( host, 0, sizeof(*host) ); + // Try IPv4 without port + if ( 1 == inet_pton( AF_INET, string, &v4 ) ) { + host->type = HOST_IP4; + memcpy( host->addr, &v4, 4 ); + host->port = htons( PORT ); + return true; + } + // Try IPv6 without port + if ( 1 == inet_pton( AF_INET6, string, &v6 ) ) { + host->type = HOST_IP6; + memcpy( host->addr, &v6, 16 ); + host->port = htons( PORT ); + return true; + } + + // Scan for port + char *portpos = NULL, *ptr = string; + while ( *ptr ) { + if ( *ptr == ':' ) portpos = ptr; + ++ptr; + } + if ( portpos == NULL ) return false; // No port in string + // Consider IP being surrounded by [ ] + if ( *string == '[' && *(portpos - 1) == ']' ) { + ++string; + *(portpos - 1) = '\0'; + } + *portpos++ = '\0'; + int p = atoi( portpos ); + if ( p < 1 || p > 65535 ) return false; // Invalid port + host->port = htons( (uint16_t)p ); + + // Try IPv4 with port + if ( 1 == inet_pton( AF_INET, string, &v4 ) ) { + host->type = HOST_IP4; + memcpy( host->addr, &v4, 4 ); + return true; + } + // Try IPv6 with port + if ( 1 == inet_pton( AF_INET6, string, &v6 ) ) { + host->type = HOST_IP6; + memcpy( host->addr, &v6, 16 ); + return true; + } + + // FAIL + return false; +} + +/** + * Convert a host and port (network byte order) to printable representation. + * Worst case required buffer len is 48, eg. 
[1234:1234:1234:1234:1234:1234:1234:1234]:12345 (+ \0) + * Returns true on success, false on error + */ +bool host_to_string(const dnbd3_host_t *host, char *target, size_t targetlen) +{ + // Worst case: Port 5 chars, ':' to separate ip and port 1 char, terminating null 1 char = 7, [] for IPv6 + if ( targetlen < 10 ) return false; + if ( host->type == HOST_IP6 ) { + *target++ = '['; + inet_ntop( AF_INET6, host->addr, target, (socklen_t)targetlen - 10 ); + target += strlen( target ); + *target++ = ']'; + } else if ( host->type == HOST_IP4 ) { + inet_ntop( AF_INET, host->addr, target, (socklen_t)targetlen - 8 ); + target += strlen( target ); + } else { + snprintf( target, targetlen, "", (int)host->type ); + return false; + } + *target = '\0'; + if ( host->port != 0 ) { + // There are still at least 7 bytes left in the buffer, port is at most 5 bytes + ':' + '\0' = 7 + snprintf( target, 7, ":%d", (int)ntohs( host->port ) ); + } + return true; +} + +void remove_trailing_slash(char *string) +{ + char *ptr = string + strlen( string ) - 1; + while ( ptr >= string && *ptr == '/' ) + *ptr-- = '\0'; +} + +void trim_right(char * const string) +{ + char *end = string + strlen( string ) - 1; + while ( end >= string && (*end == '\r' || *end == '\n' || *end == ' ' || *end == '\t') ) + *end-- = '\0'; +} + +void setThreadName(const char *name) +{ + char newName[16]; + if ( strlen( name ) > 15 ) { + snprintf( newName, sizeof(newName), "%s", name ); + newName[15] = '\0'; + name = newName; + } +#ifdef HAVE_THREAD_NAMES + prctl( PR_SET_NAME, (unsigned long)name, 0, 0, 0 ); +#endif + //TODO: On FreeBSD set threadname with pthread_setname_np +} + +void blockNoncriticalSignals() +{ + sigset_t sigmask; + sigemptyset( &sigmask ); + sigaddset( &sigmask, SIGUSR1 ); + sigaddset( &sigmask, SIGUSR2 ); + sigaddset( &sigmask, SIGHUP ); + sigaddset( &sigmask, SIGPIPE ); + pthread_sigmask( SIG_BLOCK, &sigmask, NULL ); +} + diff --git a/src/server/helper.h b/src/server/helper.h new file mode 100644 
index 0000000..102cb36 --- /dev/null +++ b/src/server/helper.h @@ -0,0 +1,42 @@ +#ifndef HELPER_H_ +#define HELPER_H_ + +#include "server.h" +#include "../shared/log.h" +#include "../types.h" +#include +#include +#include + +bool parse_address(char *string, dnbd3_host_t *host); +bool host_to_string(const dnbd3_host_t *host, char *target, size_t targetlen); +void remove_trailing_slash(char *string); +void trim_right(char * const string); +void setThreadName(const char *name); +void blockNoncriticalSignals(); + +static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b) +{ + return (a->type == b->type) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) )); +} + +static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b) +{ + return (a->type == b->type) && (a->port == b->port) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) )); +} + +/** + * Test whether string ends in suffix. + * @return true if string =~ /suffix$/ + */ +static inline bool strend(char *string, char *suffix) +{ + if ( string == NULL ) return false; + if ( suffix == NULL || *suffix == '\0' ) return true; + const size_t len1 = strlen( string ); + const size_t len2 = strlen( suffix ); + if ( len2 > len1 ) return false; + return strcmp( string + len1 - len2, suffix ) == 0; +} + +#endif diff --git a/src/server/image.c b/src/server/image.c new file mode 100644 index 0000000..061f9a3 --- /dev/null +++ b/src/server/image.c @@ -0,0 +1,1794 @@ +#include "image.h" +#include "helper.h" +#include "fileutil.h" +#include "uplink.h" +#include "locks.h" +#include "integrity.h" +#include "altservers.h" +#include "../shared/protocol.h" +#include "../shared/timing.h" +#include "../shared/crc32.h" + +#include +#include +#include +#include +#include +#include +#include + +#define PATHLEN (2000) +#define NONWORKING_RECHECK_INTERVAL_SECONDS (60) + +// ########################################## + +static dnbd3_image_t 
*_images[SERVER_MAX_IMAGES]; +static int _num_images = 0; + +static pthread_spinlock_t imageListLock; +static pthread_mutex_t remoteCloneLock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t reloadLock = PTHREAD_MUTEX_INITIALIZER; +#define NAMELEN 500 +#define CACHELEN 20 +typedef struct +{ + char name[NAMELEN]; + uint16_t rid; + ticks deadline; +} imagecache; +static imagecache remoteCloneCache[CACHELEN]; + +// ########################################## + +static bool isForbiddenExtension(const char* name); +static dnbd3_image_t* image_remove(dnbd3_image_t *image); +static dnbd3_image_t* image_free(dnbd3_image_t *image); +static bool image_load_all_internal(char *base, char *path); +static bool image_addToList(dnbd3_image_t *image); +static bool image_load(char *base, char *path, int withUplink); +static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize); +static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc); +static bool image_ensureDiskSpace(uint64_t size, bool force); + +static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize); +static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc); +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map); + +// ########################################## + +void image_serverStartup() +{ + srand( (unsigned int)time( NULL ) ); + spin_init( &imageListLock, PTHREAD_PROCESS_PRIVATE ); +} + +/** + * Update cache-map of given image for the given byte range + * start (inclusive) - end (exclusive) + * Locks on: images[].lock + */ +void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set) +{ + assert( image != NULL ); + // This should always be block borders due to how the protocol works, but better be safe + // than accidentally mark blocks 
as cached when they really aren't entirely cached. + assert( end <= image->virtualFilesize ); + assert( start <= end ); + if ( set ) { + // If we set as cached, move "inwards" in case we're not at 4k border + end &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + start = (uint64_t)(start + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + } else { + // If marking as NOT cached, move "outwards" in case we're not at 4k border + start &= ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + end = (uint64_t)(end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + } + if ( start >= end ) + return; + bool setNewBlocks = false; + uint64_t pos = start; + spin_lock( &image->lock ); + if ( image->cache_map == NULL ) { + // Image seems already complete + if ( set ) { + // This makes no sense + spin_unlock( &image->lock ); + logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path ); + return; + } + // Recreate a cache map, set it to all 1 initially as we assume the image was complete + const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + image->cache_map = malloc( byteSize ); + memset( image->cache_map, 0xff, byteSize ); + } + while ( pos < end ) { + const size_t map_y = (int)( pos >> 15 ); + const int map_x = (int)( (pos >> 12) & 7 ); // mod 8 + const int bit_mask = 1 << map_x; + if ( set ) { + if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true; + image->cache_map[map_y] |= (uint8_t)bit_mask; + } else { + image->cache_map[map_y] &= (uint8_t)~bit_mask; + } + pos += DNBD3_BLOCK_SIZE; + } + if ( setNewBlocks && image->crc32 != NULL ) { + // If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks + // for checking, even though this might lead to checking some hash block again, if it was + // already complete and the block range spanned at least two hash blocks. 
+ // First set start and end to borders of hash blocks + start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1); + end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1); + pos = start; + while ( pos < end ) { + if ( image->cache_map == NULL ) break; + const int block = (int)( pos / HASH_BLOCK_SIZE ); + if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) { + spin_unlock( &image->lock ); + integrity_check( image, block ); + spin_lock( &image->lock ); + } + pos += HASH_BLOCK_SIZE; + } + } + spin_unlock( &image->lock ); +} + +/** + * Returns true if the given image is complete. + * Also frees cache_map and deletes it on disk + * if it hasn't been complete before + * Locks on: image.lock + */ +bool image_isComplete(dnbd3_image_t *image) +{ + assert( image != NULL ); + spin_lock( &image->lock ); + if ( image->virtualFilesize == 0 ) { + spin_unlock( &image->lock ); + return false; + } + if ( image->cache_map == NULL ) { + spin_unlock( &image->lock ); + return true; + } + bool complete = true; + int j; + const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + for (j = 0; j < map_len_bytes - 1; ++j) { + if ( image->cache_map[j] != 0xFF ) { + complete = false; + break; + } + } + if ( complete ) { // Every block except the last one is complete + // Last one might need extra treatment if it's not a full byte + const int blocks_in_last_byte = (image->virtualFilesize >> 12) & 7; + uint8_t last_byte = 0; + if ( blocks_in_last_byte == 0 ) { + last_byte = 0xFF; + } else { + for (j = 0; j < blocks_in_last_byte; ++j) + last_byte |= (uint8_t)(1 << j); + } + complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte); + } + if ( !complete ) { + spin_unlock( &image->lock ); + return false; + } + char mapfile[PATHLEN] = ""; + free( image->cache_map ); + image->cache_map = NULL; + snprintf( mapfile, PATHLEN, "%s.map", image->path ); + spin_unlock( &image->lock ); + unlink( mapfile ); + return true; +} + +/** + * Make sure 
readFd is open, useful when closeUnusedFd is active. + * This function assumes you called image_lock first, so its known + * to be active and the fd won't be closed halfway through the + * function. + * Does not update atime, so the fd might be closed again very soon. + * Since the caller should have image_lock()ed first, it could do + * a quick operation on it before calling image_release which + * guarantees that the fd will not be closed meanwhile. + */ +bool image_ensureOpen(dnbd3_image_t *image) +{ + if ( image->readFd != -1 ) return image; + int newFd = open( image->path, O_RDONLY ); + if ( newFd != -1 ) { + // Check size + const off_t flen = lseek( newFd, 0, SEEK_END ); + if ( flen == -1 ) { + logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno ); + close( newFd ); + newFd = -1; + } else if ( (uint64_t)flen != image->realFilesize ) { + logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen ); + close( newFd ); + newFd = -1; + } + } + if ( newFd == -1 ) { + spin_lock( &image->lock ); + image->working = false; + spin_unlock( &image->lock ); + return false; + } + spin_lock( &image->lock ); + if ( image->readFd == -1 ) { + image->readFd = newFd; + spin_unlock( &image->lock ); + } else { + // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed + spin_unlock( &image->lock ); + close( newFd ); + } + return image->readFd != -1; +} + +/** + * Get an image by name+rid. This function increases a reference counter, + * so you HAVE TO CALL image_release for every image_get() call at some + * point... + * Locks on: imageListLock, _images[].lock + */ +dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking) +{ + int i; + const char *removingText = _removeMissingImages ? 
", removing from list" : ""; + dnbd3_image_t *candidate = NULL; + // Simple sanity check + const size_t slen = strlen( name ); + if ( slen == 0 || name[slen - 1] == '/' || name[0] == '/' ) return NULL ; + // Go through array + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + dnbd3_image_t * const image = _images[i]; + if ( image == NULL || strcmp( image->name, name ) != 0 ) continue; + if ( revision == image->rid ) { + candidate = image; + break; + } else if ( revision == 0 && (candidate == NULL || candidate->rid < image->rid) ) { + candidate = image; + } + } + + // Not found + if ( candidate == NULL ) { + spin_unlock( &imageListLock ); + return NULL ; + } + + spin_lock( &candidate->lock ); + spin_unlock( &imageListLock ); + candidate->users++; + spin_unlock( &candidate->lock ); + + // Found, see if it works +// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list +// TODO: But remember size-changed images forever + if ( candidate->working || checkIfWorking ) { + // Is marked working, but might not have an fd open + if ( !image_ensureOpen( candidate ) ) { + spin_lock( &candidate->lock ); + timing_get( &candidate->lastWorkCheck ); + spin_unlock( &candidate->lock ); + if ( _removeMissingImages ) { + candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller + } + return candidate; + } + } + + if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state + + // ...not working... 
+ + // Don't re-check too often + spin_lock( &candidate->lock ); + bool check; + declare_now; + check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS; + if ( check ) { + candidate->lastWorkCheck = now; + } + spin_unlock( &candidate->lock ); + if ( !check ) { + return candidate; + } + + // reaching this point means: + // 1) We should check if the image is working, it might or might not be in working state right now + // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.) + // 3) We made sure not to re-check this image too often + + // Common for ro and rw images: Size check, read check + const off_t len = lseek( candidate->readFd, 0, SEEK_END ); + bool reload = false; + if ( len == -1 ) { + logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText ); + reload = true; + } else if ( (uint64_t)len != candidate->realFilesize ) { + logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64 + ". Try sending SIGHUP to server if you know what you're doing.", + candidate->path, candidate->realFilesize, (uint64_t)len ); + } else { + // Seek worked, file size is same, now see if we can read from file + char buffer[100]; + if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) { + logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.", + (int)sizeof(buffer), candidate->path, errno, removingText ); + reload = true; + } else if ( !candidate->working ) { + // Seems everything is fine again \o/ + candidate->working = true; + logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid ); + } + } + + if ( reload ) { + // Could not access the image with exising fd - mark for reload which will re-open the file. + // make a copy of the image struct but keep the old one around. 
If/When it's not being used + // anymore, it will be freed automatically. + dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 ); + img->path = strdup( candidate->path ); + img->name = strdup( candidate->name ); + img->virtualFilesize = candidate->virtualFilesize; + img->realFilesize = candidate->realFilesize; + img->atime = now; + img->masterCrc32 = candidate->masterCrc32; + img->readFd = -1; + img->rid = candidate->rid; + img->users = 1; + img->working = false; + spin_init( &img->lock, PTHREAD_PROCESS_PRIVATE ); + if ( candidate->crc32 != NULL ) { + const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t); + img->crc32 = malloc( mb ); + memcpy( img->crc32, candidate->crc32, mb ); + } + spin_lock( &candidate->lock ); + if ( candidate->cache_map != NULL ) { + const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize ); + img->cache_map = malloc( mb ); + memcpy( img->cache_map, candidate->cache_map, mb ); + } + spin_unlock( &candidate->lock ); + if ( image_addToList( img ) ) { + image_release( candidate ); + candidate = img; + } else { + img->users = 0; + image_free( img ); + } + // readFd == -1 and working == FALSE at this point, + // this function needs some splitting up for handling as we need to run most + // of the above code again. for now we know that the next call for this + // name:rid will get ne newly inserted "img" and try to re-open the file. + } + + // Check if image is incomplete, handle + if ( candidate->cache_map != NULL ) { + if ( candidate->uplink == NULL ) { + uplink_init( candidate, -1, NULL, -1 ); + } + } + + return candidate; // We did all we can, hopefully it's working +} + +/** + * Lock the image by increasing its users count + * Returns the image on success, NULL if it is not found in the image list + * Every call to image_lock() needs to be followed by a call to image_release() at some point. 
+ * Locks on: imageListLock, _images[].lock + */ +dnbd3_image_t* image_lock(dnbd3_image_t *image) // TODO: get rid, fix places that do image->users-- +{ + if ( image == NULL ) return NULL ; + int i; + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + if ( _images[i] == image ) { + spin_lock( &image->lock ); + spin_unlock( &imageListLock ); + image->users++; + spin_unlock( &image->lock ); + return image; + } + } + spin_unlock( &imageListLock ); + return NULL ; +} + +/** + * Release given image. This will decrease the reference counter of the image. + * If the usage counter reaches 0 and the image is not in the images array + * anymore, the image will be freed + * Locks on: imageListLock, _images[].lock + */ +dnbd3_image_t* image_release(dnbd3_image_t *image) +{ + if ( image == NULL ) return NULL; + spin_lock( &imageListLock ); + spin_lock( &image->lock ); + assert( image->users > 0 ); + image->users--; + bool inUse = image->users != 0; + spin_unlock( &image->lock ); + if ( inUse ) { // Still in use, do nothing + spin_unlock( &imageListLock ); + return NULL; + } + // Getting here means we decreased the usage counter to zero + // If the image is not in the images list anymore, we're + // responsible for freeing it + for (int i = 0; i < _num_images; ++i) { + if ( _images[i] == image ) { // Found, do nothing + spin_unlock( &imageListLock ); + return NULL; + } + } + spin_unlock( &imageListLock ); + // So it wasn't in the images list anymore either, get rid of it + if ( !inUse ) image = image_free( image ); + return NULL; +} + +/** + * Returns true if the given file name ends in one of our meta data + * file extensions. Used to prevent loading them as images. 
+ */ +static bool isForbiddenExtension(const char* name) +{ + const size_t len = strlen( name ); + if ( len < 4 ) return false; + const char *ptr = name + len - 4; + if ( strcmp( ptr, ".crc" ) == 0 ) return true; // CRC list + if ( strcmp( ptr, ".map" ) == 0 ) return true; // cache map for incomplete images + if ( len < 5 ) return false; + --ptr; + if ( strcmp( ptr, ".meta" ) == 0 ) return true; // Meta data (currently not in use) + return false; +} + +/** + * Remove image from images array. Only free it if it has + * no active users and was actually in the list. + * Locks on: imageListLock, image[].lock + * @return NULL if image was also freed, image otherwise + */ +static dnbd3_image_t* image_remove(dnbd3_image_t *image) +{ + bool mustFree = false; + spin_lock( &imageListLock ); + spin_lock( &image->lock ); + for ( int i = _num_images - 1; i >= 0; --i ) { + if ( _images[i] == image ) { + _images[i] = NULL; + mustFree = ( image->users == 0 ); + } + if ( _images[i] == NULL && i + 1 == _num_images ) _num_images--; + } + spin_unlock( &image->lock ); + spin_unlock( &imageListLock ); + if ( mustFree ) image = image_free( image ); + return image; +} + +/** + * Kill all uplinks + */ +void image_killUplinks() +{ + int i; + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + if ( _images[i] == NULL ) continue; + spin_lock( &_images[i]->lock ); + if ( _images[i]->uplink != NULL ) { + spin_lock( &_images[i]->uplink->queueLock ); + if ( !_images[i]->uplink->shutdown ) { + thread_detach( _images[i]->uplink->thread ); + _images[i]->uplink->shutdown = true; + } + spin_unlock( &_images[i]->uplink->queueLock ); + signal_call( _images[i]->uplink->signal ); + } + spin_unlock( &_images[i]->lock ); + } + spin_unlock( &imageListLock ); +} + +/** + * Load all images in given path recursively. + * Pass NULL to use path from config. 
+ */ +bool image_loadAll(char *path) +{ + bool ret; + char imgPath[PATHLEN]; + int imgId; + dnbd3_image_t *imgHandle; + + if ( path == NULL ) path = _basePath; + if ( pthread_mutex_trylock( &reloadLock ) != 0 ) { + logadd( LOG_MINOR, "Could not (re)load image list, already in progress." ); + return false; + } + if ( _removeMissingImages ) { + // Check if all loaded images still exist on disk + logadd( LOG_INFO, "Checking for vanished images" ); + spin_lock( &imageListLock ); + for ( int i = _num_images - 1; i >= 0; --i ) { + if ( _shutdown ) break; + if ( _images[i] == NULL ) { + if ( i + 1 == _num_images ) _num_images--; + continue; + } + imgId = _images[i]->id; + snprintf( imgPath, PATHLEN, "%s", _images[i]->path ); + spin_unlock( &imageListLock ); // isReadable hits the fs; unlock + // Check if fill can still be opened for reading + ret = file_isReadable( imgPath ); + // Lock again, see if image is still there, free if required + spin_lock( &imageListLock ); + if ( ret || i >= _num_images || _images[i] == NULL || _images[i]->id != imgId ) continue; + // Image needs to be removed + imgHandle = _images[i]; + _images[i] = NULL; + if ( i + 1 == _num_images ) _num_images--; + spin_lock( &imgHandle->lock ); + const bool freeImg = ( imgHandle->users == 0 ); + spin_unlock( &imgHandle->lock ); + // We unlocked, but the image has been removed from the list already, so + // there's no way the users-counter can increase at this point. 
+ if ( freeImg ) { + // Image is not in use anymore, free the dangling entry immediately + spin_unlock( &imageListLock ); // image_free might do several fs operations; unlock + image_free( imgHandle ); + spin_lock( &imageListLock ); + } + } + spin_unlock( &imageListLock ); + if ( _shutdown ) { + pthread_mutex_unlock( &reloadLock ); + return true; + } + } + // Now scan for new images + logadd( LOG_INFO, "Scanning for new or modified images" ); + ret = image_load_all_internal( path, path ); + pthread_mutex_unlock( &reloadLock ); + logadd( LOG_INFO, "Finished scanning %s", path ); + return ret; +} + +/** + * Free all images we have, but only if they're not in use anymore. + * Locks on imageListLock + * @return true if all images have been freed + */ +bool image_tryFreeAll() +{ + spin_lock( &imageListLock ); + for (int i = _num_images - 1; i >= 0; --i) { + if ( _images[i] != NULL && _images[i]->users == 0 ) { // XXX Data race... + dnbd3_image_t *image = _images[i]; + _images[i] = NULL; + spin_unlock( &imageListLock ); + image = image_free( image ); + spin_lock( &imageListLock ); + } + if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--; + } + spin_unlock( &imageListLock ); + return _num_images == 0; +} + +/** + * Free image. DOES NOT check if it's in use. 
+ * Indirectly locks on imageListLock, image.lock, uplink.queueLock + */ +static dnbd3_image_t* image_free(dnbd3_image_t *image) +{ + assert( image != NULL ); + if ( !_shutdown ) { + logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid ); + } + // + uplink_shutdown( image ); + spin_lock( &image->lock ); + free( image->cache_map ); + free( image->crc32 ); + free( image->path ); + free( image->name ); + image->cache_map = NULL; + image->crc32 = NULL; + image->path = NULL; + image->name = NULL; + spin_unlock( &image->lock ); + if ( image->readFd != -1 ) close( image->readFd ); + spin_destroy( &image->lock ); + // + memset( image, 0, sizeof(*image) ); + free( image ); + return NULL ; +} + +bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize) +{ + if ( cacheMap == NULL ) return true; + const uint64_t end = (block + 1) * HASH_BLOCK_SIZE; + if ( end <= realFilesize ) { + // Trivial case: block in question is not the last block (well, or image size is multiple of HASH_BLOCK_SIZE) + const int startCacheIndex = (int)( ( block * HASH_BLOCK_SIZE ) / ( DNBD3_BLOCK_SIZE * 8 ) ); + const int endCacheIndex = startCacheIndex + (int)( HASH_BLOCK_SIZE / ( DNBD3_BLOCK_SIZE * 8 ) ); + for ( int i = startCacheIndex; i < endCacheIndex; ++i ) { + if ( cacheMap[i] != 0xff ) { + return false; + } + } + } else { + // Special case: Checking last block, which is smaller than HASH_BLOCK_SIZE + for (uint64_t mapPos = block * HASH_BLOCK_SIZE; mapPos < realFilesize; mapPos += DNBD3_BLOCK_SIZE ) { + const size_t map_y = (size_t)( mapPos >> 15 ); + const int map_x = (int)( (mapPos >> 12) & 7 ); // mod 8 + const int mask = 1 << map_x; + if ( (cacheMap[map_y] & mask) == 0 ) return false; + } + } + return true; +} + +/** + * Load all images in the given path recursively, + * consider *base the base path that is to be cut off + */ +static bool image_load_all_internal(char *base, char *path) +{ +#define SUBDIR_LEN 150 + assert( 
path != NULL ); + assert( *path == '/' ); + struct dirent entry, *entryPtr; + const size_t pathLen = strlen( path ); + char subpath[PATHLEN]; + struct stat st; + DIR * const dir = opendir( path ); + + if ( dir == NULL ) { + logadd( LOG_ERROR, "Could not opendir '%s' for loading", path ); + return false; + } + + while ( !_shutdown && (entryPtr = readdir( dir )) != NULL ) { + entry = *entryPtr; + if ( strcmp( entry.d_name, "." ) == 0 || strcmp( entry.d_name, ".." ) == 0 ) continue; + if ( strlen( entry.d_name ) > SUBDIR_LEN ) { + logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry.d_name, (int)SUBDIR_LEN ); + continue; + } + if ( entry.d_name[0] == '/' || path[pathLen - 1] == '/' ) { + snprintf( subpath, PATHLEN, "%s%s", path, entry.d_name ); + } else { + snprintf( subpath, PATHLEN, "%s/%s", path, entry.d_name ); + } + if ( stat( subpath, &st ) < 0 ) { + logadd( LOG_WARNING, "stat() for '%s' failed. Ignoring....", subpath ); + continue; + } + if ( S_ISDIR( st.st_mode ) ) { + image_load_all_internal( base, subpath ); // Recurse + } else if ( !isForbiddenExtension( subpath ) ) { + image_load( base, subpath, true ); // Load image if possible + } + } + closedir( dir ); + return true; +#undef SUBDIR_LEN +} + +/** + */ +static bool image_addToList(dnbd3_image_t *image) +{ + int i; + static int imgIdCounter = 0; // Used to assign unique numeric IDs to images + spin_lock( &imageListLock ); + // Now we're locked, assign unique ID to image (unique for this running server instance!) + image->id = ++imgIdCounter; + for ( i = 0; i < _num_images; ++i ) { + if ( _images[i] != NULL ) continue; + _images[i] = image; + break; + } + if ( i >= _num_images ) { + if ( _num_images >= _maxImages ) { + spin_unlock( &imageListLock ); + return false; + } + _images[_num_images++] = image; + } + spin_unlock( &imageListLock ); + return true; +} + +/** + * Load image from given path. This will check if the image is + * already loaded and updates its information in that case. 
+ * Note that this is NOT THREAD SAFE so make sure its always + * called on one thread only. + */ +static bool image_load(char *base, char *path, int withUplink) +{ + int revision = -1; + struct stat st; + uint8_t *cache_map = NULL; + uint32_t *crc32list = NULL; + dnbd3_image_t *existing = NULL; + int fdImage = -1; + bool function_return = false; // Return false by default + assert( base != NULL ); + assert( path != NULL ); + assert( *path == '/' ); + assert( strncmp( path, base, strlen(base)) == 0 ); + assert( base[strlen(base) - 1] != '/' ); + assert( strlen(path) > strlen(base) ); + char *lastSlash = strrchr( path, '/' ); + char *fileName = lastSlash + 1; + char imgName[strlen( path )]; + const size_t fileNameLen = strlen( fileName ); + + // Copy virtual path (relative path in "base") + char * const virtBase = path + strlen( base ) + 1; + assert( *virtBase != '/' ); + char *src = virtBase, *dst = imgName; + while ( src <= lastSlash ) { + *dst++ = *src++; + } + *dst = '\0'; + + do { + // Parse file name for revision + // Try to parse *.r syntax + size_t i; + for (i = fileNameLen - 1; i > 1; --i) { + if ( fileName[i] < '0' || fileName[i] > '9' ) break; + } + if ( i != fileNameLen - 1 && fileName[i] == 'r' && fileName[i - 1] == '.' ) { + revision = atoi( fileName + i + 1 ); + src = fileName; + while ( src < fileName + i - 1 ) { + *dst++ = *src++; + } + *dst = '\0'; + } + } while (0); + + // Legacy mode enabled and no rid extracted from filename? + if ( _vmdkLegacyMode && revision == -1 ) { + fdImage = open( path, O_RDONLY ); // Check if it exists + if ( fdImage == -1 ) goto load_error; + // Yes, simply append full file name and set rid to 1 + strcat( dst, fileName ); + revision = 1; + } + // Did we get anything? 
+ if ( revision <= 0 || revision >= 65536 ) { + logadd( LOG_WARNING, "Image '%s' has invalid revision ID %d", path, revision ); + goto load_error; + } + + // Get pointer to already existing image if possible + existing = image_get( imgName, (uint16_t)revision, true ); + + // ### Now load the actual image related data ### + if ( fdImage == -1 ) { + fdImage = open( path, O_RDONLY ); + } + if ( fdImage == -1 ) { + logadd( LOG_ERROR, "Could not open '%s' for reading...", path ); + goto load_error; + } + // Determine file size + const off_t seekret = lseek( fdImage, 0, SEEK_END ); + if ( seekret < 0 ) { + logadd( LOG_ERROR, "Could not seek to end of file '%s'", path ); + goto load_error; + } else if ( seekret == 0 ) { + logadd( LOG_WARNING, "Empty image file '%s'", path ); + goto load_error; + } + const uint64_t realFilesize = (uint64_t)seekret; + const uint64_t virtualFilesize = ( realFilesize + (DNBD3_BLOCK_SIZE - 1) ) & ~(DNBD3_BLOCK_SIZE - 1); + if ( realFilesize != virtualFilesize ) { + logadd( LOG_DEBUG1, "Image size of '%s' is %" PRIu64 ", virtual size: %" PRIu64, path, realFilesize, virtualFilesize ); + } + + // 1. Allocate memory for the cache map if the image is incomplete + cache_map = image_loadCacheMap( path, virtualFilesize ); + + // XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented) + + // 2. Load CRC-32 list of image + bool doFullCheck = false; + uint32_t masterCrc = 0; + const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( virtualFilesize ); + crc32list = image_loadCrcList( path, virtualFilesize, &masterCrc ); + + // Check CRC32 + if ( crc32list != NULL ) { + if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) { + logadd( LOG_ERROR, "quick crc32 check of %s failed. 
Data corruption?", path ); + doFullCheck = true; + } + } + + // Compare data just loaded to identical image we apparently already loaded + if ( existing != NULL ) { + if ( existing->realFilesize != realFilesize ) { + logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + // Image will be replaced below + } else if ( existing->crc32 != NULL && crc32list != NULL + && memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) { + logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid ); + logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." ); + logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" ); + // Image will be replaced below + } else if ( existing->crc32 == NULL && crc32list != NULL ) { + logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid ); + existing->crc32 = crc32list; + existing->masterCrc32 = masterCrc; + crc32list = NULL; + function_return = true; + goto load_error; // Keep existing + } else if ( existing->cache_map != NULL && cache_map == NULL ) { + // Just ignore that fact, if replication is really complete the cache map will be removed anyways + logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid ); + function_return = true; + goto load_error; // Keep existing + } else { + // Nothing changed about the existing image, so do nothing + logadd( LOG_DEBUG1, "Did not change" ); + function_return = true; + goto load_error; // Keep existing + } + // Remove existing image from images array, so it will be replaced by the reloaded image + existing = image_remove( existing ); + existing = image_release( existing ); + } + + // Load fresh image + dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) ); + image->path = 
strdup( path ); + image->name = strdup( imgName ); + image->cache_map = cache_map; + image->crc32 = crc32list; + image->masterCrc32 = masterCrc; + image->uplink = NULL; + image->realFilesize = realFilesize; + image->virtualFilesize = virtualFilesize; + image->rid = (uint16_t)revision; + image->users = 0; + image->readFd = -1; + image->working = (image->cache_map == NULL ); + timing_get( &image->nextCompletenessEstimate ); + image->completenessEstimate = -1; + spin_init( &image->lock, PTHREAD_PROCESS_PRIVATE ); + int32_t offset; + if ( stat( path, &st ) == 0 ) { + // Negatively offset atime by file modification time + offset = (int32_t)( st.st_mtime - time( NULL ) ); + if ( offset > 0 ) offset = 0; + } else { + offset = 0; + } + timing_gets( &image->atime, offset ); + + // Prevent freeing in cleanup + cache_map = NULL; + crc32list = NULL; + + // Get rid of cache map if image is complete + if ( image->cache_map != NULL ) { + image_isComplete( image ); + } + + // Image is definitely incomplete, initialize uplink worker + if ( image->cache_map != NULL ) { + image->working = false; + if ( withUplink ) { + uplink_init( image, -1, NULL, -1 ); + } + } + + // ### Reaching this point means loading succeeded + image->readFd = fdImage; + if ( image_addToList( image ) ) { + // Keep fd for reading + fdImage = -1; + } else { + logadd( LOG_ERROR, "Image list full: Could not add image %s", path ); + image->readFd = -1; // Keep fdImage instead, will be closed below + image = image_free( image ); + goto load_error; + } + logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid ); + // CRC errors found... 
+ if ( doFullCheck ) { + logadd( LOG_INFO, "Queueing full CRC32 check for '%s:%d'\n", image->name, (int)image->rid ); + integrity_check( image, -1 ); + } + + function_return = true; + + // Clean exit: +load_error: ; + if ( existing != NULL ) existing = image_release( existing ); + if ( crc32list != NULL ) free( crc32list ); + if ( cache_map != NULL ) free( cache_map ); + if ( fdImage != -1 ) close( fdImage ); + return function_return; +} + +static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize) +{ + uint8_t *retval = NULL; + char mapFile[strlen( imagePath ) + 10 + 1]; + sprintf( mapFile, "%s.map", imagePath ); + int fdMap = open( mapFile, O_RDONLY ); + if ( fdMap >= 0 ) { + const int map_size = IMGSIZE_TO_MAPBYTES( fileSize ); + retval = calloc( 1, map_size ); + const ssize_t rd = read( fdMap, retval, map_size ); + if ( map_size != rd ) { + logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath ); + // Could not read complete map, that means the rest of the image file will be considered incomplete + } + close( fdMap ); + // Later on we check if the hash map says the image is complete + } + return retval; +} + +static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc) +{ + assert( masterCrc != NULL ); + uint32_t *retval = NULL; + const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( fileSize ); + // Currently this should only prevent accidental corruption (esp. 
regarding transparent proxy mode) + // but maybe later on you want better security + char hashFile[strlen( imagePath ) + 10 + 1]; + sprintf( hashFile, "%s.crc", imagePath ); + int fdHash = open( hashFile, O_RDONLY ); + if ( fdHash >= 0 ) { + off_t fs = lseek( fdHash, 0, SEEK_END ); + if ( fs < (hashBlocks + 1) * 4 ) { + logadd( LOG_WARNING, "Ignoring crc32 list for '%s' as it is too short", imagePath ); + } else { + if ( pread( fdHash, masterCrc, sizeof(uint32_t), 0 ) != sizeof(uint32_t) ) { + logadd( LOG_WARNING, "Error reading first crc32 of '%s'", imagePath ); + } else { + const size_t crcFileLen = hashBlocks * sizeof(uint32_t); + size_t pos = 0; + retval = calloc( hashBlocks, sizeof(uint32_t) ); + while ( pos < crcFileLen ) { + ssize_t ret = pread( fdHash, retval + pos, crcFileLen - pos, pos + sizeof(uint32_t) /* skip master-crc */ ); + if ( ret == -1 ) { + if ( errno == EINTR || errno == EAGAIN ) continue; + } + if ( ret <= 0 ) break; + pos += ret; + } + if ( pos != crcFileLen ) { + free( retval ); + retval = NULL; + logadd( LOG_WARNING, "Could not read crc32 list of '%s'", imagePath ); + } else { + uint32_t lists_crc = crc32( 0, NULL, 0 ); + lists_crc = crc32( lists_crc, (uint8_t*)retval, hashBlocks * sizeof(uint32_t) ); + lists_crc = net_order_32( lists_crc ); + if ( lists_crc != *masterCrc ) { + free( retval ); + retval = NULL; + logadd( LOG_WARNING, "CRC-32 of CRC-32 list mismatch. CRC-32 list of '%s' might be corrupted.", imagePath ); + } + } + } + } + close( fdHash ); + } + return retval; +} + +static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map) +{ + // This checks the first block and (up to) count - 1 random blocks for corruption + // via the known crc32 list. This is very sloppy and is merely supposed to detect + // accidental corruption due to broken dnbd3-proxy functionality or file system + // corruption. 
+ assert( count > 0 ); + const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( realFilesize ); + int blocks[count + 1]; + int index = 0, j; + int block; + if ( image_isHashBlockComplete( cache_map, 0, realFilesize ) ) blocks[index++] = 0; + int tries = count * 5; // Try only so many times to find a non-duplicate complete block + while ( index + 1 < count && --tries > 0 ) { + block = rand() % hashBlocks; // Random block + for ( j = 0; j < index; ++j ) { // Random block already in list? + if ( blocks[j] == block ) goto while_end; + } + // Block complete? If yes, add to list + if ( image_isHashBlockComplete( cache_map, block, realFilesize ) ) blocks[index++] = block; +while_end: ; + } + blocks[MIN(index, count)] = -1; // End of array has to be marked by a -1 + return image_checkBlocksCrc32( fdImage, crc32list, blocks, realFilesize ); // Return result of check +} + +/** + * Create a new image with the given image name and revision id in _basePath + * Returns true on success, false otherwise + */ +bool image_create(char *image, int revision, uint64_t size) +{ + assert( image != NULL ); + assert( size >= DNBD3_BLOCK_SIZE ); + if ( revision <= 0 ) { + logadd( LOG_ERROR, "revision id invalid: %d", revision ); + return false; + } + char path[PATHLEN], cache[PATHLEN]; + char *lastSlash = strrchr( image, '/' ); + if ( lastSlash == NULL ) { + snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); + } else { + *lastSlash = '\0'; + snprintf( path, PATHLEN, "%s/%s", _basePath, image ); + mkdir_p( path ); + *lastSlash = '/'; + snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision ); + } + snprintf( cache, PATHLEN, "%s.map", path ); + size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + const int mapsize = IMGSIZE_TO_MAPBYTES(size); + // Write files + int fdImage = -1, fdCache = -1; + fdImage = open( path, O_RDWR | O_TRUNC | O_CREAT, 0644 ); + fdCache = open( cache, O_RDWR | O_TRUNC | O_CREAT, 0644 ); + if ( fdImage < 0 ) { + logadd( 
LOG_ERROR, "Could not open %s for writing.", path ); + goto failure_cleanup; + } + if ( fdCache < 0 ) { + logadd( LOG_ERROR, "Could not open %s for writing.", cache ); + goto failure_cleanup; + } + // Try cache map first + if ( !file_alloc( fdCache, 0, mapsize ) && !file_setSize( fdCache, mapsize ) ) { + const int err = errno; + logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err ); + } + // Now write image + if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) { + logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno ); + logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk" + " space without actually writing all zeroes to the block device." ); + logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect" + " divine performance during replication." ); + goto failure_cleanup; + } else if ( _sparseFiles && !file_setSize( fdImage, size ) ) { + logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno ); + logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." ); + goto failure_cleanup; + } + close( fdImage ); + close( fdCache ); + return true; + // +failure_cleanup: ; + if ( fdImage >= 0 ) close( fdImage ); + if ( fdCache >= 0 ) close( fdCache ); + remove( path ); + remove( cache ); + return false; +} + +static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, const size_t len); +static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requestedRid); + +/** + * Does the same as image_get, but if the image is not known locally, or if + * revision 0 is requested, it will: + * a) Try to clone it from an authoritative dnbd3 server, if + * the server is running in proxy mode. 
/**
 * Called if specific rid is not loaded, or if rid is 0 (some version might be loaded locally,
 * but we should check if there's a higher rid on a remote server).
 *
 * name:     image name as requested by the client
 * revision: requested rid, 0 meaning "latest"
 * len:      strlen( name ), used to pick the tail of over-long names as cache key
 * Returns: image as returned by image_get (see the individual return
 *          statements), or NULL. May return a local image without contacting
 *          any server if the same name:rid was checked recently.
 */
static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision, const size_t len)
{
	// Already existing locally?
	dnbd3_image_t *image = NULL;
	if ( revision == 0 ) {
		image = image_get( name, revision, true );
	}

	// Doesn't exist or is rid 0, try remote if not already tried it recently
	declare_now;
	char *cmpname = name;
	int useIndex = -1, fallbackIndex = 0;
	// Name longer than a cache slot: only compare/store its last NAMELEN - 1 chars
	if ( len >= NAMELEN ) cmpname += 1 + len - NAMELEN;
	pthread_mutex_lock( &remoteCloneLock );
	for (int i = 0; i < CACHELEN; ++i) {
		if ( remoteCloneCache[i].rid == revision && strcmp( cmpname, remoteCloneCache[i].name ) == 0 ) {
			// Found an entry for this name:rid - reuse the slot if its deadline passed
			useIndex = i;
			if ( timing_reached( &remoteCloneCache[i].deadline, &now ) ) break;
			pthread_mutex_unlock( &remoteCloneLock ); // Was recently checked...
			return image;
		}
		// Remember slot with the earliest deadline, to be overwritten if no entry matches
		if ( timing_1le2( &remoteCloneCache[i].deadline, &remoteCloneCache[fallbackIndex].deadline ) ) {
			fallbackIndex = i;
		}
	}
	// Re-check to prevent two clients at the same time triggering this,
	// but only if rid != 0, since we would just get an old rid then
	if ( revision != 0 ) {
		if ( image == NULL ) image = image_get( name, revision, true );
		if ( image != NULL ) {
			pthread_mutex_unlock( &remoteCloneLock );
			return image;
		}
	}
	// Reaching this point means we should contact an authority server
	serialized_buffer_t serialized;
	// Mark as recently checked
	if ( useIndex == -1 ) {
		useIndex = fallbackIndex;
	}
	timing_set( &remoteCloneCache[useIndex].deadline, &now, SERVER_REMOTE_IMAGE_CHECK_CACHETIME );
	snprintf( remoteCloneCache[useIndex].name, NAMELEN, "%s", cmpname );
	remoteCloneCache[useIndex].rid = revision;
	pthread_mutex_unlock( &remoteCloneLock );

	// Get some alt servers and try to get the image from there
#define REP_NUM_SRV (8)
	dnbd3_host_t servers[REP_NUM_SRV];
	int uplinkSock = -1; // Socket of the server we cloned from, -1 if cloning did not succeed
	dnbd3_host_t uplinkServer;
	const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false );
	uint16_t remoteProtocolVersion;
	uint16_t remoteRid = revision;
	uint64_t remoteImageSize;
	struct sockaddr_storage sa;
	socklen_t salen;
	poll_list_t *cons = sock_newPollList();
	logadd( LOG_DEBUG2, "Trying to clone %s:%d from %d hosts", name, (int)revision, count );
	for (int i = 0; i < count + 5; ++i) { // "i < count + 5" for 5 additional iterations, waiting on pending connects
		char *remoteName;
		bool ok = false;
		int sock;
		if ( i >= count ) {
			// All servers have been handed to sock_multiConnect; only wait for pending ones
			sock = sock_multiConnect( cons, NULL, 100, 1000 );
			if ( sock == -2 ) break;
		} else {
			if ( log_hasMask( LOG_DEBUG2 ) ) {
				char host[50];
				// NOTE(review): this len shadows the function parameter; assumes
				// sock_printHost returns less than sizeof(host) - TODO confirm
				size_t len = sock_printHost( &servers[i], host, sizeof(host) );
				host[len] = '\0';
				logadd( LOG_DEBUG2, "Trying to replicate from %s", host );
			}
			sock = sock_multiConnect( cons, &servers[i], 100, 1000 );
		}
		if ( sock == -1 || sock == -2 ) continue;
		salen = sizeof(sa);
		if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
			logadd( LOG_MINOR, "getpeername on successful connection failed!? (errno=%d)", errno );
			goto server_fail;
		}
		if ( !dnbd3_select_image( sock, name, revision, SI_SERVER_FLAGS ) ) goto server_fail;
		if ( !dnbd3_select_image_reply( &serialized, sock, &remoteProtocolVersion, &remoteName, &remoteRid, &remoteImageSize ) ) goto server_fail;
		if ( remoteProtocolVersion < MIN_SUPPORTED_SERVER || remoteRid == 0 ) goto server_fail;
		if ( revision != 0 && remoteRid != revision ) goto server_fail; // Want specific revision but uplink supplied different rid
		if ( revision == 0 && image != NULL && image->rid >= remoteRid ) goto server_fail; // Not actually a failure: Highest remote rid is <= highest local rid - don't clone!
		if ( remoteImageSize < DNBD3_BLOCK_SIZE || remoteName == NULL || strcmp( name, remoteName ) != 0 ) goto server_fail;
		if ( remoteImageSize > _maxReplicationSize ) {
			logadd( LOG_MINOR, "Won't proxy '%s:%d': Larger than maxReplicationSize", name, (int)revision );
			goto server_fail;
		}
		pthread_mutex_lock( &reloadLock );
		// Ensure disk space entirely if not using sparse files, otherwise just make sure we have some room at least
		if ( _sparseFiles ) {
			ok = image_ensureDiskSpace( 2ull * 1024 * 1024 * 1024, false ); // 2GiB, maybe configurable one day
		} else {
			ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
		}
		ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
		pthread_mutex_unlock( &reloadLock );
		if ( !ok ) goto server_fail;

		// Cloning worked :-)
		uplinkSock = sock;
		if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &uplinkServer ) ) {
			uplinkServer.type = 0;
		}
		break;

server_fail: ;
		close( sock );
	}
	sock_destroyPollList( cons );

	// If we still have a pointer to a local image, release the reference
	if ( image != NULL ) image_release( image );
	// If everything worked out, this call should now actually return the image
	// NOTE(review): remoteRid holds whatever the last server that replied reported,
	// even if that server was rejected above - confirm this is intended for rid 0
	image = image_get( name, remoteRid, false );
	if ( image != NULL && uplinkSock != -1 ) {
		// If so, init the uplink and pass it the socket
		sock_setTimeout( uplinkSock, _uplinkTimeout );
		if ( !uplink_init( image, uplinkSock, &uplinkServer, remoteProtocolVersion ) ) {
			close( uplinkSock );
		} else {
			// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
			int i = 0;
			while ( !image->working && ++i < 100 )
				usleep( 2000 );
		}
	} else if ( uplinkSock != -1 ) {
		close( uplinkSock );
	}
	return image;
}
+ */ +static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requestedRid) +{ + char imageFile[PATHLEN] = ""; + uint16_t detectedRid = 0; + + if ( requestedRid != 0 ) { + snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)requestedRid ); + detectedRid = requestedRid; + } else { + glob_t g; + snprintf( imageFile, PATHLEN, "%s/%s.r*", _basePath, name ); + const int ret = glob( imageFile, GLOB_NOSORT | GLOB_MARK, NULL, &g ); + imageFile[0] = '\0'; + if ( ret == 0 ) { + long int best = 0; + for ( size_t i = 0; i < g.gl_pathc; ++i ) { + const char * const path = g.gl_pathv[i]; + const char * rev = strrchr( path, 'r' ); + if ( rev == NULL || rev == path || *(rev - 1) != '.' ) continue; + rev++; + if ( *rev < '0' || *rev > '9' ) continue; + char *err = NULL; + long int val = strtol( rev, &err, 10 ); + if ( err == NULL || *err != '\0' ) continue; + if ( val > best ) { + best = val; + snprintf( imageFile, PATHLEN, "%s", g.gl_pathv[i] ); + } + } + if ( best > 0 && best < 65536 ) { + detectedRid = (uint16_t)best; + } + } + globfree( &g ); + } + if ( _vmdkLegacyMode && requestedRid <= 1 + && !isForbiddenExtension( name ) + && ( detectedRid == 0 || !file_isReadable( imageFile ) ) ) { + snprintf( imageFile, PATHLEN, "%s/%s", _basePath, name ); + detectedRid = 1; + } + logadd( LOG_DEBUG2, "Trying to load %s:%d ( -> %d) as %s", name, (int)requestedRid, (int)detectedRid, imageFile ); + // No file was determined, or it doesn't seem to exist/be readable + if ( detectedRid == 0 ) { + logadd( LOG_DEBUG2, "Not found, bailing out" ); + return image_get( name, requestedRid, true ); + } + if ( !_vmdkLegacyMode && requestedRid == 0 ) { + // rid 0 requested - check if detected rid is readable, decrease rid if not until we reach 0 + while ( detectedRid != 0 ) { + dnbd3_image_t *image = image_get( name, detectedRid, true ); + if ( image != NULL ) { + // globbed rid already loaded, return + return image; + } + if ( file_isReadable( imageFile ) ) { + // globbed rid 
is + break; + } + logadd( LOG_DEBUG2, "%s: rid %d globbed but not readable, trying lower rid...", name, (int)detectedRid ); + detectedRid--; + snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, requestedRid ); + } + } + + // Now lock on the loading mutex, then check again if the image exists (we're multi-threaded) + pthread_mutex_lock( &reloadLock ); + dnbd3_image_t* image = image_get( name, detectedRid, true ); + if ( image != NULL ) { + // The image magically appeared in the meantime + logadd( LOG_DEBUG2, "Magically appeared" ); + pthread_mutex_unlock( &reloadLock ); + return image; + } + // Still not loaded, let's try to do so + logadd( LOG_DEBUG2, "Calling load" ); + image_load( _basePath, imageFile, false ); + pthread_mutex_unlock( &reloadLock ); + // If loading succeeded, this will return the image + logadd( LOG_DEBUG2, "Calling get" ); + return image_get( name, requestedRid, true ); +} + +/** + * Prepare a cloned image: + * 1. Allocate empty image file and its cache map + * 2. Use passed socket to request the crc32 list and save it to disk + * 3. 
Load the image from disk + * Returns: true on success, false otherwise + */ +static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize) +{ + // Allocate disk space and create cache map + if ( !image_create( name, revision, imageSize ) ) return false; + // CRC32 + const size_t len = strlen( _basePath ) + strlen( name ) + 20; + char crcFile[len]; + snprintf( crcFile, len, "%s/%s.r%d.crc", _basePath, name, (int)revision ); + if ( !file_isReadable( crcFile ) ) { + // Get crc32list from remote server + size_t crc32len = IMGSIZE_TO_HASHBLOCKS(imageSize) * sizeof(uint32_t); + uint32_t masterCrc; + uint8_t *crc32list = malloc( crc32len ); + if ( !dnbd3_get_crc32( sock, &masterCrc, crc32list, &crc32len ) ) { + free( crc32list ); + return false; + } + if ( crc32len != 0 ) { + uint32_t lists_crc = crc32( 0, NULL, 0 ); + lists_crc = crc32( lists_crc, (uint8_t*)crc32list, crc32len ); + lists_crc = net_order_32( lists_crc ); + if ( lists_crc != masterCrc ) { + logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name ); + } else { + int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 ); + write( fd, &masterCrc, sizeof(uint32_t) ); + write( fd, crc32list, crc32len ); + close( fd ); + } + } + free( crc32list ); + } + // HACK: Chop of ".crc" to get the image file name + crcFile[strlen( crcFile ) - 4] = '\0'; + return image_load( _basePath, crcFile, false ); +} + +/** + * Generate the crc32 block list file for the given file. + * This function wants a plain file name instead of a dnbd3_image_t, + * as it can be used directly from the command line. 
+ */ +bool image_generateCrcFile(char *image) +{ + int fdCrc = -1; + uint32_t crc; + char crcFile[strlen( image ) + 4 + 1]; + int fdImage = open( image, O_RDONLY ); + + if ( fdImage == -1 ) { + logadd( LOG_ERROR, "Could not open %s.", image ); + return false; + } + + const int64_t fileLen = lseek( fdImage, 0, SEEK_END ); + if ( fileLen <= 0 ) { + logadd( LOG_ERROR, "Error seeking to end, or file is empty." ); + goto cleanup_fail; + } + + struct stat sst; + sprintf( crcFile, "%s.crc", image ); + if ( stat( crcFile, &sst ) == 0 ) { + logadd( LOG_ERROR, "CRC File for %s already exists! Delete it first if you want to regen.", image ); + goto cleanup_fail; + } + + fdCrc = open( crcFile, O_RDWR | O_CREAT, 0644 ); + if ( fdCrc == -1 ) { + logadd( LOG_ERROR, "Could not open CRC File %s for writing..", crcFile ); + goto cleanup_fail; + } + // CRC of all CRCs goes first. Don't know it yet, write 4 bytes dummy data. + if ( write( fdCrc, crcFile, sizeof(crc) ) != sizeof(crc) ) { + logadd( LOG_ERROR, "Write error" ); + goto cleanup_fail; + } + + printf( "Generating CRC32" ); + fflush( stdout ); + const int blockCount = IMGSIZE_TO_HASHBLOCKS( fileLen ); + for ( int i = 0; i < blockCount; ++i ) { + if ( !image_calcBlockCrc32( fdImage, i, fileLen, &crc ) ) { + goto cleanup_fail; + } + if ( write( fdCrc, &crc, sizeof(crc) ) != sizeof(crc) ) { + printf( "\nWrite error writing crc file: %d\n", errno ); + goto cleanup_fail; + } + putchar( '.' ); + fflush( stdout ); + } + close( fdImage ); + fdImage = -1; + printf( "done!\n" ); + + logadd( LOG_INFO, "Generating master-crc..." 
); + fflush( stdout ); + // File is written - read again to calc master crc + if ( lseek( fdCrc, 4, SEEK_SET ) != 4 ) { + logadd( LOG_ERROR, "Could not seek to beginning of crc list in file" ); + goto cleanup_fail; + } + char buffer[400]; + int blocksToGo = blockCount; + crc = crc32( 0, NULL, 0 ); + while ( blocksToGo > 0 ) { + const int numBlocks = MIN( (int)( sizeof(buffer) / sizeof(crc) ), blocksToGo ); + if ( read( fdCrc, buffer, numBlocks * sizeof(crc) ) != numBlocks * (int)sizeof(crc) ) { + logadd( LOG_ERROR, "Could not re-read from crc32 file" ); + goto cleanup_fail; + } + crc = crc32( crc, (uint8_t*)buffer, numBlocks * sizeof(crc) ); + blocksToGo -= numBlocks; + } + crc = net_order_32( crc ); + if ( pwrite( fdCrc, &crc, sizeof(crc), 0 ) != sizeof(crc) ) { + logadd( LOG_ERROR, "Could not write master crc to file" ); + goto cleanup_fail; + } + logadd( LOG_INFO, "CRC-32 file successfully generated." ); + fflush( stdout ); + return true; + +cleanup_fail:; + if ( fdImage != -1 ) close( fdImage ); + if ( fdCrc != -1 ) close( fdCrc ); + return false; +} + +json_t* image_getListAsJson() +{ + json_t *imagesJson = json_array(); + json_t *jsonImage; + int i; + char uplinkName[100] = { 0 }; + uint64_t bytesReceived; + int users, completeness, idleTime; + declare_now; + + spin_lock( &imageListLock ); + for ( i = 0; i < _num_images; ++i ) { + if ( _images[i] == NULL ) continue; + dnbd3_image_t *image = _images[i]; + spin_lock( &image->lock ); + spin_unlock( &imageListLock ); + users = image->users; + idleTime = (int)timing_diff( &image->atime, &now ); + completeness = image_getCompletenessEstimate( image ); + if ( image->uplink == NULL ) { + bytesReceived = 0; + uplinkName[0] = '\0'; + } else { + bytesReceived = image->uplink->bytesReceived; + if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) { + uplinkName[0] = '\0'; + } + } + image->users++; // Prevent freeing after we unlock + spin_unlock( &image->lock 
); + + jsonImage = json_pack( "{sisssisisisisI}", + "id", image->id, // id, name, rid never change, so access them without locking + "name", image->name, + "rid", (int) image->rid, + "users", users, + "complete", completeness, + "idle", idleTime, + "size", (json_int_t)image->virtualFilesize ); + if ( bytesReceived != 0 ) { + json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) ); + } + if ( uplinkName[0] != '\0' ) { + json_object_set_new( jsonImage, "uplinkServer", json_string( uplinkName ) ); + } + json_array_append_new( imagesJson, jsonImage ); + + image = image_release( image ); // Since we did image->users++; + spin_lock( &imageListLock ); + } + spin_unlock( &imageListLock ); + return imagesJson; +} + +/** + * Get completeness of an image in percent. Only estimated, not exact. + * Returns: 0-100 + * DOES NOT LOCK, so make sure to do so before calling + */ +int image_getCompletenessEstimate(dnbd3_image_t * const image) +{ + assert( image != NULL ); + if ( image->cache_map == NULL ) return image->working ? 100 : 0; + declare_now; + if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) { + // Since this operation is relatively expensive, we cache the result for a while + return image->completenessEstimate; + } + int i; + int percent = 0; + const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + if ( len == 0 ) return 0; + for ( i = 0; i < len; ++i ) { + if ( image->cache_map[i] == 0xff ) { + percent += 100; + } else if ( image->cache_map[i] != 0 ) { + percent += 50; + } + } + image->completenessEstimate = percent / len; + timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 ); + return image->completenessEstimate; +} + +/** + * Check the CRC-32 of the given blocks. The array "blocks" is of variable length. + * !! pass -1 as the last block so the function knows when to stop !! + * Does NOT check whether block index is within image. 
+ * Returns true or false + */ +bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks, const uint64_t realFilesize) +{ + while ( *blocks != -1 ) { + uint32_t crc; + if ( !image_calcBlockCrc32( fd, *blocks, realFilesize, &crc ) ) { + return false; + } + if ( crc != crc32list[*blocks] ) { + logadd( LOG_WARNING, "Block %d is %x, should be %x", *blocks, crc, crc32list[*blocks] ); + return false; + } + blocks++; + } + return true; +} + +/** + * Calc CRC-32 of block. Value is returned as little endian. + */ +static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc) +{ + // Make buffer 4k aligned in case fd has O_DIRECT set +#define BSIZE 262144 + char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE]; + char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) ); + // How many bytes to read from the input file + const uint64_t bytesFromFile = MIN( HASH_BLOCK_SIZE, realFilesize - ( block * HASH_BLOCK_SIZE) ); + // Determine how many bytes we had to read if the file size were a multiple of 4k + // This might be the same value if the real file's size is a multiple of 4k + const uint64_t vbs = ( ( realFilesize + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) ) - ( block * HASH_BLOCK_SIZE ); + const uint64_t virtualBytesFromFile = MIN( HASH_BLOCK_SIZE, vbs ); + const off_t readPos = (int64_t)block * HASH_BLOCK_SIZE; + size_t bytes = 0; + assert( vbs >= bytesFromFile ); + *crc = crc32( 0, NULL, 0 ); + // Calculate the crc32 by reading data from the file + while ( bytes < bytesFromFile ) { + const size_t n = (size_t)MIN( BSIZE, bytesFromFile - bytes ); + const ssize_t r = pread( fd, buffer, n, readPos + bytes ); + if ( r <= 0 ) { + logadd( LOG_WARNING, "CRC: Read error (errno=%d)", errno ); + return false; + } + *crc = crc32( *crc, (uint8_t*)buffer, r ); + bytes += (size_t)r; + } + // If the virtual file size is different, keep going using nullbytes + 
if ( bytesFromFile < virtualBytesFromFile ) { + memset( buffer, 0, BSIZE ); + bytes = (size_t)( virtualBytesFromFile - bytesFromFile ); + while ( bytes != 0 ) { + const size_t len = MIN( BSIZE, bytes ); + *crc = crc32( *crc, (uint8_t*)buffer, len ); + bytes -= len; + } + } + *crc = net_order_32( *crc ); + return true; +#undef BSIZE +} + +/** + * Call image_ensureDiskSpace (below), but aquire + * reloadLock first. + */ +bool image_ensureDiskSpaceLocked(uint64_t size, bool force) +{ + bool ret; + pthread_mutex_lock( &reloadLock ); + ret = image_ensureDiskSpace( size, force ); + pthread_mutex_unlock( &reloadLock ); + return ret; +} + +/** + * Make sure at least size bytes are available in _basePath. + * Will delete old images to make room for new ones. + * TODO: Store last access time of images. Currently the + * last access time is reset to the file modification time + * on server restart. Thus it will + * currently only delete images if server uptime is > 10 hours. + * This can be overridden by setting force to true, in case + * free space is desperately needed. + * Return true iff enough space is available. false in random other cases + */ +static bool image_ensureDiskSpace(uint64_t size, bool force) +{ + for ( int maxtries = 0; maxtries < 20; ++maxtries ) { + uint64_t available; + if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) { + const int e = errno; + logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... 
;-)\n", e ); + return true; + } + if ( available > size ) return true; + if ( !force && dnbd3_serverUptime() < 10 * 3600 ) { + logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)), + (int)(size / (1024 * 1024)) ); + return false; + } + logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)), + (int)(size / (1024 * 1024)) ); + // Find least recently used image + dnbd3_image_t *oldest = NULL; + int i; // XXX improve locking + for (i = 0; i < _num_images; ++i) { + if ( _images[i] == NULL ) continue; + dnbd3_image_t *current = image_lock( _images[i] ); + if ( current == NULL ) continue; + if ( current->users == 1 ) { // Just from the lock above + if ( oldest == NULL || timing_1le2( ¤t->atime, &oldest->atime ) ) { + // Oldest access time so far + oldest = current; + } + } + current = image_release( current ); + } + declare_now; + if ( oldest == NULL || ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) ) { + if ( oldest == NULL ) { + logadd( LOG_INFO, "All images are currently in use :-(" ); + } else { + logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" ); + } + return false; + } + oldest = image_lock( oldest ); + if ( oldest == NULL ) continue; // Image freed in the meantime? 
Try again + logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid ); + char *filename = strdup( oldest->path ); + oldest = image_remove( oldest ); + oldest = image_release( oldest ); + unlink( filename ); + size_t len = strlen( filename ) + 10; + char buffer[len]; + snprintf( buffer, len, "%s.map", filename ); + unlink( buffer ); + snprintf( buffer, len, "%s.crc", filename ); + unlink( buffer ); + snprintf( buffer, len, "%s.meta", filename ); + unlink( buffer ); + free( filename ); + } + return false; +} + +void image_closeUnusedFd() +{ + int fd, i; + ticks deadline; + timing_gets( &deadline, -UNUSED_FD_TIMEOUT ); + char imgstr[300]; + spin_lock( &imageListLock ); + for (i = 0; i < _num_images; ++i) { + dnbd3_image_t * const image = _images[i]; + if ( image == NULL ) + continue; + spin_lock( &image->lock ); + spin_unlock( &imageListLock ); + if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) { + snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid ); + fd = image->readFd; + image->readFd = -1; + } else { + fd = -1; + } + spin_unlock( &image->lock ); + if ( fd != -1 ) { + close( fd ); + logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr ); + } + spin_lock( &imageListLock ); + } + spin_unlock( &imageListLock ); +} + +/* + void image_find_latest() + { + // Not in array or most recent rid is requested, try file system + if (revision != 0) { + // Easy case - specific RID + char + } else { + // Determine base directory where the image in question has to reside. + // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04" + // Then searchPath has to be set to "/srv/rz/ubuntu" + char searchPath[strlen(_basePath) + len + 1]; + char *lastSlash = strrchr(name, '/'); + char *baseName; // Name of the image. 
In the example above, it will be "default-13.04" + if ( lastSlash == NULL ) { + *searchPath = '\0'; + baseName = name; + } else { + char *from = name, *to = searchPath; + while (from < lastSlash) *to++ = *from++; + *to = '\0'; + baseName = lastSlash + 1; + } + // Now we have the search path in our real file system and the expected image name. + // The revision naming sceme is .r, so if we're looking for revision 13, + // our example image has to be named default-13.04.r13 + } + } + */ diff --git a/src/server/image.h b/src/server/image.h new file mode 100644 index 0000000..4668eff --- /dev/null +++ b/src/server/image.h @@ -0,0 +1,63 @@ +#ifndef _IMAGE_H_ +#define _IMAGE_H_ + +#include "globals.h" + +struct json_t; + +void image_serverStartup(); + +bool image_isComplete(dnbd3_image_t *image); + +bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize); + +void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set); + +void image_markComplete(dnbd3_image_t *image); + +bool image_ensureOpen(dnbd3_image_t *image); + +dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking); + +bool image_reopenCacheFd(dnbd3_image_t *image, const bool force); + +dnbd3_image_t* image_getOrLoad(char *name, uint16_t revision); + +dnbd3_image_t* image_lock(dnbd3_image_t *image); + +dnbd3_image_t* image_release(dnbd3_image_t *image); + +bool image_checkBlocksCrc32(int fd, uint32_t *crc32list, const int *blocks, const uint64_t fileSize); + +void image_killUplinks(); + +bool image_loadAll(char *path); + +bool image_tryFreeAll(); + +bool image_create(char *image, int revision, uint64_t size); + +bool image_generateCrcFile(char *image); + +struct json_t* image_getListAsJson(); + +int image_getCompletenessEstimate(dnbd3_image_t * const image); + +void image_closeUnusedFd(); + +bool image_ensureDiskSpaceLocked(uint64_t size, bool force); + +// one byte in the map covers 8 4kib blocks, so 32kib 
/* Remove trailing whitespace from s by overwriting it with NUL bytes.
   The string is modified in place; s itself is returned. */
static char* rstrip(char* s)
{
	size_t len = strlen( s );
	while ( len > 0 && isspace( (unsigned char)s[len - 1] ) ) {
		s[--len] = '\0';
	}
	return s;
}

/* Skip leading whitespace: returns a pointer to the first character of s
   that is not whitespace (or to the terminating NUL if there is none). */
static char* lskip(const char* s)
{
	const char *cursor = s;
	for ( ; *cursor != '\0'; ++cursor ) {
		if ( !isspace( (unsigned char)*cursor ) ) break;
	}
	return (char*)cursor;
}
/* Return pointer to first char c or ';' comment in given string, or pointer to
   null at end of string if neither found. ';' must be prefixed by a whitespace
   character to register as a comment. */
static char* find_char_or_comment(const char* s, char c)
{
	int was_whitespace = 0;
	while ( *s && *s != c && !(was_whitespace && *s == ';') ) {
		was_whitespace = isspace((unsigned char)(*s));
		s++;
	}
	return (char*)s;
}

/* Version of strncpy that ensures dest (size bytes) is null-terminated.
   A size of zero leaves dest untouched instead of writing before its start.
   Returns dest. */
static char* strncpy0(char* dest, const char* src, size_t size)
{
	if ( size == 0 ) {
		return dest;
	}
	/* Copy at most size - 1 bytes; strncpy pads the rest with NULs, and the
	   final byte is always forced to NUL so truncated input stays terminated. */
	strncpy( dest, src, size - 1 );
	dest[size - 1] = '\0';
	return dest;
}
/* See documentation in header file.
   Opens filename for reading, hands the stream to ini_parse_file() and
   closes it again. Returns -1 if the file cannot be opened, otherwise
   whatever ini_parse_file() returned. */
int ini_parse(const char* filename, int (*handler)(void*, const char*, const char*, const char*), void* user)
{
	FILE* const stream = fopen( filename, "r" );
	if ( stream == NULL ) {
		return -1;
	}
	const int result = ini_parse_file( stream, handler, user );
	fclose( stream );
	return result;
}
Go to the project + home page for more info: + + http://code.google.com/p/inih/ + + */ + +#ifndef __INI_H__ +#define __INI_H__ + +/* Make this header file easier to include in C++ code */ +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Parse given INI-style file. May have [section]s, name=value pairs + (whitespace stripped), and comments starting with ';' (semicolon). Section + is "" if name=value pair parsed before any section heading. name:value + pairs are also supported as a concession to Python's ConfigParser. + + For each name=value pair parsed, call handler function with given user + pointer as well as section, name, and value (data only valid for duration + of handler call). Handler should return nonzero on success, zero on error. + + Returns 0 on success, line number of first error on parse error (doesn't + stop on first error), -1 on file open error, or -2 on memory allocation + error (only when INI_USE_STACK is zero). + */ +int ini_parse(const char* filename, int (*handler)(void* user, const char* section, const char* name, const char* value), void* user); + +/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't + close the file when it's finished -- the caller must do that. */ +int ini_parse_file(FILE* file, int (*handler)(void* user, const char* section, const char* name, const char* value), void* user); + +/* Nonzero to allow multi-line value parsing, in the style of Python's + ConfigParser. If allowed, ini_parse() will call the handler with the same + name for each subsequent line parsed. */ +#ifndef INI_ALLOW_MULTILINE +#define INI_ALLOW_MULTILINE 1 +#endif + +/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of + the file. See http://code.google.com/p/inih/issues/detail?id=21 */ +#ifndef INI_ALLOW_BOM +#define INI_ALLOW_BOM 1 +#endif + +/* Nonzero to use stack, zero to use heap (malloc/free). 
*/ +#ifndef INI_USE_STACK +#define INI_USE_STACK 1 +#endif + +/* Maximum line length for any line in INI file. */ +#ifndef INI_MAX_LINE +#define INI_MAX_LINE 200 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __INI_H__ */ diff --git a/src/server/integrity.c b/src/server/integrity.c new file mode 100644 index 0000000..88b7487 --- /dev/null +++ b/src/server/integrity.c @@ -0,0 +1,274 @@ +#include "integrity.h" + +#include "helper.h" +#include "locks.h" +#include "image.h" +#include "uplink.h" + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_QUEUE_SIZE 200 + +#define CHECK_ALL (0x7fffffff) + +typedef struct +{ + dnbd3_image_t *image; // Image to check + int block; // Block to check + int count; // How many blocks to check starting at .block +} queue_entry; + +static pthread_t thread; +static queue_entry checkQueue[CHECK_QUEUE_SIZE]; +static pthread_mutex_t integrityQueueLock; +static pthread_cond_t queueSignal; +static int queueLen = -1; +static volatile bool bRunning = false; + +static void* integrity_main(void *data); + +/** + * Initialize the integrity check thread + */ +void integrity_init() +{ + assert( queueLen == -1 ); + pthread_mutex_init( &integrityQueueLock, NULL ); + pthread_cond_init( &queueSignal, NULL ); + pthread_mutex_lock( &integrityQueueLock ); + queueLen = 0; + pthread_mutex_unlock( &integrityQueueLock ); + bRunning = true; + if ( 0 != thread_create( &thread, NULL, &integrity_main, (void *)NULL ) ) { + bRunning = false; + logadd( LOG_WARNING, "Could not start integrity check thread. Corrupted images will not be detected." 
); + return; + } +} + +void integrity_shutdown() +{ + assert( queueLen != -1 ); + logadd( LOG_DEBUG1, "Shutting down integrity checker...\n" ); + pthread_mutex_lock( &integrityQueueLock ); + pthread_cond_signal( &queueSignal ); + pthread_mutex_unlock( &integrityQueueLock ); + thread_join( thread, NULL ); + while ( bRunning ) + usleep( 10000 ); + pthread_mutex_destroy( &integrityQueueLock ); + pthread_cond_destroy( &queueSignal ); + logadd( LOG_DEBUG1, "Integrity checker exited normally.\n" ); +} + +/** + * Schedule an integrity check on the given image for the given hash block. + * It is not checked whether the block is completely cached locally, so + * make sure it is before calling, otherwise it will result in falsely + * detected corruption. + */ +void integrity_check(dnbd3_image_t *image, int block) +{ + if ( !bRunning ) { + logadd( LOG_MINOR, "Ignoring check request; thread not running..." ); + return; + } + int i, freeSlot = -1; + pthread_mutex_lock( &integrityQueueLock ); + for (i = 0; i < queueLen; ++i) { + if ( freeSlot == -1 && checkQueue[i].image == NULL ) { + freeSlot = i; + } else if ( checkQueue[i].image == image + && checkQueue[i].block <= block && checkQueue[i].block + checkQueue[i].count >= block ) { + // Already queued check dominates this one, or at least lies directly before this block + if ( checkQueue[i].block + checkQueue[i].count == block ) { + // It's directly before this one; expand range + checkQueue[i].count += 1; + } + logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (%d +%d)", i, queueLen, checkQueue[i].block, checkQueue[i].count ); + pthread_mutex_unlock( &integrityQueueLock ); + return; + } + } + if ( freeSlot == -1 ) { + if ( queueLen >= CHECK_QUEUE_SIZE ) { + pthread_mutex_unlock( &integrityQueueLock ); + logadd( LOG_INFO, "Check queue full, discarding check request...\n" ); + return; + } + freeSlot = queueLen++; + } + checkQueue[freeSlot].image = image; + if ( block == -1 ) { + checkQueue[freeSlot].block = 0; + 
checkQueue[freeSlot].count = CHECK_ALL; + } else { + checkQueue[freeSlot].block = block; + checkQueue[freeSlot].count = 1; + } + pthread_cond_signal( &queueSignal ); + pthread_mutex_unlock( &integrityQueueLock ); +} + +static void* integrity_main(void * data UNUSED) +{ + int i; + uint8_t *buffer = NULL; + size_t bufferSize = 0; + setThreadName( "image-check" ); + blockNoncriticalSignals(); +#if defined(linux) || defined(__linux) + // Setting nice of this thread - this is not POSIX conforming, so check if other platforms support this. + // POSIX says that setpriority() should set the nice value of all threads belonging to the current process, + // but on linux you can do this per thread. + pid_t tid = (pid_t)syscall( SYS_gettid ); + setpriority( PRIO_PROCESS, tid, 10 ); +#endif + pthread_mutex_lock( &integrityQueueLock ); + while ( !_shutdown ) { + if ( queueLen == 0 ) { + pthread_cond_wait( &queueSignal, &integrityQueueLock ); + } + for (i = queueLen - 1; i >= 0; --i) { + if ( _shutdown ) break; + dnbd3_image_t * const image = image_lock( checkQueue[i].image ); + if ( checkQueue[i].count == 0 || image == NULL ) { + checkQueue[i].image = image_release( image ); + if ( i + 1 == queueLen ) queueLen--; + continue; + } + // We have the image. 
Call image_release() some time + const int qCount = checkQueue[i].count; + bool foundCorrupted = false; + spin_lock( &image->lock ); + if ( image->crc32 != NULL && image->realFilesize != 0 ) { + int blocks[2] = { checkQueue[i].block, -1 }; + pthread_mutex_unlock( &integrityQueueLock ); + // Make copy of crc32 list as it might go away + const uint64_t fileSize = image->realFilesize; + const int numHashBlocks = IMGSIZE_TO_HASHBLOCKS(fileSize); + const size_t required = numHashBlocks * sizeof(uint32_t); + if ( buffer == NULL || required > bufferSize ) { + bufferSize = required; + if ( buffer != NULL ) free( buffer ); + buffer = malloc( bufferSize ); + } + memcpy( buffer, image->crc32, required ); + spin_unlock( &image->lock ); + // Open for direct I/O if possible; this prevents polluting the fs cache + int fd = open( image->path, O_RDONLY | O_DIRECT ); + bool direct = fd != -1; + if ( unlikely( !direct ) ) { + // Try unbuffered; flush to disk for that + logadd( LOG_DEBUG1, "O_DIRECT failed for %s", image->path ); + image_ensureOpen( image ); + fd = image->readFd; + } + int checkCount = MIN( qCount, 5 ); + if ( fd != -1 ) { + while ( blocks[0] < numHashBlocks && !_shutdown ) { + const uint64_t start = blocks[0] * HASH_BLOCK_SIZE; + const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize ); + bool complete = true; + if ( qCount == CHECK_ALL ) { + // When checking full image, skip incomplete blocks, otherwise assume block is complete + spin_lock( &image->lock ); + complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize ); + spin_unlock( &image->lock ); + } +#if defined(linux) || defined(__linux) + if ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) { +#else + if ( fsync( fd ) == -1 ) { +#endif + logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path ); + exit( 1 ); + } + // Use direct I/O only if read length is 
multiple of 4096 to be on the safe side + int tfd; + if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) { + // Suitable for direct io + tfd = fd; + } else if ( !image_ensureOpen( image ) ) { + logadd( LOG_WARNING, "Cannot open %s for reading", image->path ); + break; + } else { + tfd = image->readFd; + // Evict from cache so we have to re-read, making sure data was properly stored + posix_fadvise( fd, start, end - start, POSIX_FADV_DONTNEED ); + } + if ( complete && !image_checkBlocksCrc32( tfd, (uint32_t*)buffer, blocks, fileSize ) ) { + logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name ); + image_updateCachemap( image, start, end, false ); + // If this is not a full check, queue one + if ( qCount != CHECK_ALL ) { + logadd( LOG_INFO, "Queueing full check for %s", image->name ); + integrity_check( image, -1 ); + } + foundCorrupted = true; + } + blocks[0]++; // Increase before break, so it always points to the next block to check after loop + if ( complete && --checkCount == 0 ) break; + } + if ( direct ) { + close( fd ); + } + } + pthread_mutex_lock( &integrityQueueLock ); + assert( checkQueue[i].image == image ); + if ( qCount != CHECK_ALL ) { + // Not a full check; update the counter + checkQueue[i].count -= ( blocks[0] - checkQueue[i].block ); + if ( checkQueue[i].count < 0 ) { + logadd( LOG_WARNING, "BUG! checkQueue counter ran negative" ); + } + } + if ( checkCount > 0 || checkQueue[i].count <= 0 || fd == -1 ) { + // Done with this task as nothing left, OR we don't have an fd to read from + if ( fd == -1 ) { + logadd( LOG_WARNING, "Cannot hash check %s: bad fd", image->path ); + } + checkQueue[i].image = NULL; + if ( i + 1 == queueLen ) queueLen--; + // Mark as working again if applicable + if ( !foundCorrupted ) { + spin_lock( &image->lock ); + if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper? 
+ image->working = image->uplink->fd != -1 && image->readFd != -1; + } + spin_unlock( &image->lock ); + } + } else { + // Still more blocks to go... + checkQueue[i].block = blocks[0]; + } + } else { + spin_unlock( &image->lock ); + } + if ( foundCorrupted ) { + // Something was fishy, make sure uplink exists + spin_lock( &image->lock ); + image->working = false; + bool restart = image->uplink == NULL || image->uplink->shutdown; + spin_unlock( &image->lock ); + if ( restart ) { + uplink_shutdown( image ); + uplink_init( image, -1, NULL, -1 ); + } + } + // Release :-) + image_release( image ); + } + } + pthread_mutex_unlock( &integrityQueueLock ); + if ( buffer != NULL ) free( buffer ); + bRunning = false; + return NULL; +} + diff --git a/src/server/integrity.h b/src/server/integrity.h new file mode 100644 index 0000000..c3c2b44 --- /dev/null +++ b/src/server/integrity.h @@ -0,0 +1,12 @@ +#ifndef _INTEGRITY_H_ +#define _INTEGRITY_H_ + +#include "globals.h" + +void integrity_init(); + +void integrity_shutdown(); + +void integrity_check(dnbd3_image_t *image, int block); + +#endif /* INTEGRITY_H_ */ diff --git a/src/server/locks.c b/src/server/locks.c new file mode 100644 index 0000000..71a1845 --- /dev/null +++ b/src/server/locks.c @@ -0,0 +1,306 @@ +/* + * locks.c + * + * Created on: 16.07.2013 + * Author: sr + */ + +#include "locks.h" +#include "helper.h" +#include "../shared/timing.h" + +#ifdef _DEBUG +#define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES) +#define MAXTHREADS (SERVER_MAX_CLIENTS + 100) +#define LOCKLEN 60 +typedef struct +{ + void *lock; + ticks locktime; + char locked; + pthread_t thread; + int lockId; + char name[LOCKLEN]; + char where[LOCKLEN]; +} debug_lock_t; + +typedef struct +{ + pthread_t tid; + ticks time; + char name[LOCKLEN]; + char where[LOCKLEN]; + +} debug_thread_t; + +int debugThreadCount = 0; + +static debug_lock_t locks[MAXLOCKS]; +static debug_thread_t threads[MAXTHREADS]; +static int init_done = 0; 
+static pthread_spinlock_t initdestory; +static int lockId = 0; +static pthread_t watchdog = 0; +static dnbd3_signal_t* watchdogSignal = NULL; + +static void *debug_thread_watchdog(void *something); + +int debug_spin_init(const char *name, const char *file, int line, pthread_spinlock_t *lock, int shared) +{ + if ( !init_done ) { + memset( locks, 0, MAXLOCKS * sizeof(debug_lock_t) ); + memset( threads, 0, MAXTHREADS * sizeof(debug_thread_t) ); + pthread_spin_init( &initdestory, PTHREAD_PROCESS_PRIVATE ); + init_done = 1; + } + int first = -1; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXLOCKS; ++i) { + if ( locks[i].lock == lock ) { + logadd( LOG_ERROR, "Lock %p (%s) already initialized (%s:%d)\n", (void*)lock, name, file, line ); + exit( 4 ); + } + if ( first == -1 && locks[i].lock == NULL ) first = i; + } + if ( first == -1 ) { + logadd( LOG_ERROR, "No more free debug locks (%s:%d)\n", file, line ); + pthread_spin_unlock( &initdestory ); + debug_dump_lock_stats(); + exit( 4 ); + } + locks[first].lock = (void*)lock; + locks[first].locked = 0; + snprintf( locks[first].name, LOCKLEN, "%s", name ); + snprintf( locks[first].where, LOCKLEN, "I %s:%d", file, line ); + pthread_spin_unlock( &initdestory ); + return pthread_spin_init( lock, shared ); +} + +int debug_spin_lock(const char *name, const char *file, int line, pthread_spinlock_t *lock) +{ + debug_lock_t *l = NULL; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXLOCKS; ++i) { + if ( locks[i].lock == lock ) { + l = &locks[i]; + break; + } + } + pthread_spin_unlock( &initdestory ); + if ( l == NULL ) { + logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + debug_dump_lock_stats(); + exit( 4 ); + } + debug_thread_t *t = NULL; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXTHREADS; ++i) { + if ( threads[i].tid != 0 ) continue; + threads[i].tid = pthread_self(); + timing_get( &threads[i].time ); + snprintf( 
threads[i].name, LOCKLEN, "%s", name ); + snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line ); + t = &threads[i]; + break; + } + pthread_spin_unlock( &initdestory ); + if ( t == NULL ) { + logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); + } + const int retval = pthread_spin_lock( lock ); + pthread_spin_lock( &initdestory ); + t->tid = 0; + pthread_spin_unlock( &initdestory ); + if ( l->locked ) { + logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); + } + l->locked = 1; + timing_get( &l->locktime ); + l->thread = pthread_self(); + snprintf( l->where, LOCKLEN, "L %s:%d", file, line ); + pthread_spin_lock( &initdestory ); + l->lockId = ++lockId; + pthread_spin_unlock( &initdestory ); + return retval; +} + +int debug_spin_trylock(const char *name, const char *file, int line, pthread_spinlock_t *lock) +{ + debug_lock_t *l = NULL; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXLOCKS; ++i) { + if ( locks[i].lock == lock ) { + l = &locks[i]; + break; + } + } + pthread_spin_unlock( &initdestory ); + if ( l == NULL ) { + logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + debug_dump_lock_stats(); + exit( 4 ); + } + debug_thread_t *t = NULL; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXTHREADS; ++i) { + if ( threads[i].tid != 0 ) continue; + threads[i].tid = pthread_self(); + timing_get( &threads[i].time ); + snprintf( threads[i].name, LOCKLEN, "%s", name ); + snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line ); + t = &threads[i]; + break; + } + pthread_spin_unlock( &initdestory ); + if ( t == NULL ) { + logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); + } + const int retval = pthread_spin_trylock( lock ); + pthread_spin_lock( 
&initdestory ); + t->tid = 0; + pthread_spin_unlock( &initdestory ); + if ( retval == 0 ) { + if ( l->locked ) { + logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); + } + l->locked = 1; + timing_get( &l->locktime ); + l->thread = pthread_self(); + snprintf( l->where, LOCKLEN, "L %s:%d", file, line ); + pthread_spin_lock( &initdestory ); + l->lockId = ++lockId; + pthread_spin_unlock( &initdestory ); + } + return retval; +} + +int debug_spin_unlock(const char *name, const char *file, int line, pthread_spinlock_t *lock) +{ + debug_lock_t *l = NULL; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXLOCKS; ++i) { + if ( locks[i].lock == lock ) { + l = &locks[i]; + break; + } + } + pthread_spin_unlock( &initdestory ); + if ( l == NULL ) { + logadd( LOG_ERROR, "Tried to unlock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); + } + if ( !l->locked ) { + logadd( LOG_ERROR, "Unlock sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); + } + l->locked = 0; + l->thread = 0; + snprintf( l->where, LOCKLEN, "U %s:%d", file, line ); + int retval = pthread_spin_unlock( lock ); + return retval; +} + +int debug_spin_destroy(const char *name, const char *file, int line, pthread_spinlock_t *lock) +{ + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXLOCKS; ++i) { + if ( locks[i].lock == lock ) { + if ( locks[i].locked ) { + logadd( LOG_ERROR, "Tried to destroy lock %p (%s) at %s:%d when it is still locked\n", (void*)lock, name, file, line ); + exit( 4 ); + } + locks[i].lock = NULL; + snprintf( locks[i].where, LOCKLEN, "D %s:%d", file, line ); + pthread_spin_unlock( &initdestory ); + return pthread_spin_destroy( lock ); + } + } + logadd( LOG_ERROR, "Tried to destroy non-existent lock %p (%s) at %s:%d\n", (void*)lock, name, file, line ); + exit( 4 ); +} + +void debug_dump_lock_stats() +{ + declare_now; + 
pthread_spin_lock( &initdestory ); + printf( "\n **** LOCKS ****\n\n" ); + for (int i = 0; i < MAXLOCKS; ++i) { + if ( locks[i].lock == NULL ) continue; + if ( locks[i].locked ) { + printf( "* *** %s ***\n" + "* Where: %s\n" + "* When: %d secs ago\n" + "* Locked: %d\n" + "* Serial: %d\n" + "* Thread: %d\n", locks[i].name, locks[i].where, (int)timing_diff( &locks[i].locktime, &now ), (int)locks[i].locked, locks[i].lockId, + (int)locks[i].thread ); + } else { + printf( "* *** %s ***\n" + "* Where: %s\n" + "* Locked: %d\n", locks[i].name, locks[i].where, (int)locks[i].locked ); + } + } + printf( "\n **** WAITING THREADS ****\n\n" ); + for (int i = 0; i < MAXTHREADS; ++i) { + if ( threads[i].tid == 0 ) continue; + printf( "* *** Thread %d ***\n" + "* Lock: %s\n" + "* Where: %s\n" + "* How long: %d secs\n", (int)threads[i].tid, threads[i].name, threads[i].where, (int)timing_diff( &threads[i].time, &now ) ); + } + pthread_spin_unlock( &initdestory ); +} + +static void *debug_thread_watchdog(void *something UNUSED) +{ + setThreadName( "debug-watchdog" ); + while ( !_shutdown ) { + if ( init_done ) { + declare_now; + pthread_spin_lock( &initdestory ); + for (int i = 0; i < MAXTHREADS; ++i) { + if ( threads[i].tid == 0 ) continue; + const uint32_t diff = timing_diff( &threads[i].time, &now ); + if ( diff > 6 && diff < 100000 ) { + printf( "\n\n +++++++++ DEADLOCK ++++++++++++\n\n" ); + pthread_spin_unlock( &initdestory ); + debug_dump_lock_stats(); + exit( 99 ); + } + } + pthread_spin_unlock( &initdestory ); + } + if ( watchdogSignal == NULL || signal_wait( watchdogSignal, 5000 ) == SIGNAL_ERROR ) sleep( 5 ); + } + return NULL ; +} + +#endif + +void debug_locks_start_watchdog() +{ +#ifdef _DEBUG + watchdogSignal = signal_new(); + if ( 0 != thread_create( &watchdog, NULL, &debug_thread_watchdog, (void *)NULL ) ) { + logadd( LOG_ERROR, "Could not start debug-lock watchdog." 
); + return; + } +#endif +} + +void debug_locks_stop_watchdog() +{ +#ifdef _DEBUG + _shutdown = true; + printf( "Killing debug watchdog...\n" ); + pthread_spin_lock( &initdestory ); + signal_call( watchdogSignal ); + pthread_spin_unlock( &initdestory ); + thread_join( watchdog, NULL ); + signal_close( watchdogSignal ); +#endif +} diff --git a/src/server/locks.h b/src/server/locks.h new file mode 100644 index 0000000..16b59a7 --- /dev/null +++ b/src/server/locks.h @@ -0,0 +1,85 @@ +#ifndef _LOCKS_H_ +#define _LOCKS_H_ + +#include +#include +#include +#include + +#ifdef _DEBUG + +#define spin_init( lock, type ) debug_spin_init( #lock, __FILE__, __LINE__, lock, type) +#define spin_lock( lock ) debug_spin_lock( #lock, __FILE__, __LINE__, lock) +#define spin_trylock( lock ) debug_spin_trylock( #lock, __FILE__, __LINE__, lock) +#define spin_unlock( lock ) debug_spin_unlock( #lock, __FILE__, __LINE__, lock) +#define spin_destroy( lock ) debug_spin_destroy( #lock, __FILE__, __LINE__, lock) + +int debug_spin_init(const char *name, const char *file, int line, pthread_spinlock_t *lock, int shared); +int debug_spin_lock(const char *name, const char *file, int line, pthread_spinlock_t *lock); +int debug_spin_trylock(const char *name, const char *file, int line, pthread_spinlock_t *lock); +int debug_spin_unlock(const char *name, const char *file, int line, pthread_spinlock_t *lock); +int debug_spin_destroy(const char *name, const char *file, int line, pthread_spinlock_t *lock); + +void debug_dump_lock_stats(); + + +#else + +#define spin_init( lock, type ) pthread_spin_init(lock, type) +#define spin_lock( lock ) pthread_spin_lock(lock) +#define spin_trylock( lock ) pthread_spin_trylock(lock) +#define spin_unlock( lock ) pthread_spin_unlock(lock) +#define spin_destroy( lock ) pthread_spin_destroy(lock) + +#endif + +#ifdef DEBUG_THREADS + +extern int debugThreadCount; +#define thread_create(thread,attr,routine,arg) (logadd( LOG_THREAD CREATE, "%d @ %s:%d\n", debugThreadCount, 
__FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg)) +static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void*), void *arg) +{ + int i; + if (attr == NULL || pthread_attr_getdetachstate(attr, &i) != 0 || i == PTHREAD_CREATE_JOINABLE) { + ++debugThreadCount; + } + return pthread_create( thread, attr, start_routine, arg ); +} + +#define thread_detach(thread) (logadd( LOG_THREAD DETACH, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread)) +static inline int debug_thread_detach(pthread_t thread) +{ + const int ret = pthread_detach(thread); + if (ret == 0) { + --debugThreadCount; + } else { + logadd( LOG_THREAD DETACH, "Tried to detach invalid thread (error %d)\n", (int)errno); + exit(1); + } + return ret; +} +#define thread_join(thread,value) (logadd( LOG_THREAD JOIN, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value)) +static inline int debug_thread_join(pthread_t thread, void **value_ptr) +{ + const int ret = pthread_join(thread, value_ptr); + if (ret == 0) { + --debugThreadCount; + } else { + logadd( LOG_THREAD JOIN, "Tried to join invalid thread (error %d)\n", (int)errno); + exit(1); + } + return ret; +} + +#else + +#define thread_create(thread,attr,routine,param) pthread_create( thread, attr, routine, param ) +#define thread_detach(thread) pthread_detach( thread ) +#define thread_join(thread,value) pthread_join( thread, value ) + +#endif + +void debug_locks_start_watchdog(); +void debug_locks_stop_watchdog(); + +#endif /* LOCKS_H_ */ diff --git a/src/server/net.c b/src/server/net.c new file mode 100644 index 0000000..00e88e0 --- /dev/null +++ b/src/server/net.c @@ -0,0 +1,731 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). 
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#include "helper.h"
+#include "image.h"
+#include "uplink.h"
+#include "locks.h"
+#include "rpc.h"
+#include "altservers.h"
+
+#include "../shared/sockhelper.h"
+#include "../shared/timing.h"
+#include "../shared/protocol.h"
+#include "../serialize.h"
+
+#include
+
+#ifdef __linux__
+#include
+#endif
+#ifdef __FreeBSD__
+#include
+#include
+#include
+#endif
+#include
+#include
+#include
+
+static dnbd3_client_t *_clients[SERVER_MAX_CLIENTS]; // slots may be NULL; only indices below _num_clients are scanned
+static int _num_clients = 0;
+static pthread_spinlock_t _clients_lock; // taken around every scan or update of _clients / _num_clients in the list helpers
+
+static char nullbytes[500]; // zero-filled source buffer for sendPadding()
+
+static atomic_uint_fast64_t totalBytesSent = 0; // per-client byte counters are added here when a client is cleaned up
+
+// Adding and removing clients -- list management
+static bool addToList(dnbd3_client_t *client);
+static void removeFromList(dnbd3_client_t *client);
+static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client);
+// Read one request header from sock, fix its byte order and validate magic and payload size; false on receive failure or invalid header
+static inline bool recv_request_header(int sock, dnbd3_request_t *request)
+{
+	ssize_t ret, fails = 0;
+#ifdef AFL_MODE
+	sock = 0;
+#endif
+	// Read request header from socket
+	while ( ( ret = recv( sock, request, sizeof(*request), MSG_WAITALL ) ) != sizeof(*request) ) {
+		if ( ret < 0 && errno == EINTR && ++fails < 10 ) continue; // errno is only meaningful when recv() failed; a short read or EOF must not retry on a stale EINTR
+		if ( ret >= 0 || ++fails > SOCKET_TIMEOUT_CLIENT_RETRIES ) return false; // short read, EOF, or too many failed attempts
+		if ( errno == EAGAIN ) continue;
+		logadd( LOG_DEBUG2, "Error receiving request: Could not read message header (%d/%d, e=%d)\n", (int)ret, (int)sizeof(*request), errno );
+		return false;
+	}
+	// Make sure all bytes are in the right order (endianness)
+	fixup_request( *request );
+	if ( request->magic != dnbd3_packet_magic ) {
+		logadd( LOG_DEBUG2, "Magic in client request incorrect (cmd: %d, len: %d)\n", (int)request->cmd, (int)request->size );
+		return false;
+	}
+	// Payload sanity check
+	if ( request->cmd != CMD_GET_BLOCK && request->size > MAX_PAYLOAD ) { // for CMD_GET_BLOCK the size field is exempt from this limit
+		logadd( LOG_WARNING, "Client tries to send a packet of type %d with %d bytes payload. Dropping client.", (int)request->cmd, (int)request->size );
+		return false;
+	}
+	return true;
+}
+// Receive exactly size payload bytes into payload->buffer and reset the serializer for reading; false if size is 0, exceeds MAX_PAYLOAD, or the read is short
+static inline bool recv_request_payload(int sock, uint32_t size, serialized_buffer_t *payload)
+{
+#ifdef AFL_MODE
+	sock = 0;
+#endif
+	if ( size == 0 ) {
+		logadd( LOG_ERROR, "Called recv_request_payload() to receive 0 bytes" );
+		return false;
+	}
+	if ( size > MAX_PAYLOAD ) {
+		logadd( LOG_ERROR, "Called recv_request_payload() for more bytes than the passed buffer could hold!" );
+		return false;
+	}
+	if ( sock_recv( sock, payload->buffer, size ) != (ssize_t)size ) {
+		logadd( LOG_DEBUG1, "Could not receive request payload of length %d\n", (int)size );
+		return false;
+	}
+	// Prepare payload buffer for reading
+	serializer_reset_read( payload, size );
+	return true;
+}
+
+/**
+ * Send reply with optional payload. payload can be null. The caller has to
+ * acquire the sendMutex first.
+ */
+static inline bool send_reply(int sock, dnbd3_reply_t *reply, void *payload)
+{
+	const uint32_t size = reply->size; // remember the size before fixup_reply() is applied to the header
+	fixup_reply( *reply );
+	if ( sock_sendAll( sock, reply, sizeof(dnbd3_reply_t), 1 ) != sizeof(dnbd3_reply_t) ) {
+		logadd( LOG_DEBUG1, "Sending reply header to client failed" );
+		return false;
+	}
+	if ( size != 0 && payload != NULL ) { // header-only reply when there is nothing to append
+		if ( sock_sendAll( sock, payload, size, 1 ) != (ssize_t)size ) {
+			logadd( LOG_DEBUG1, "Sending payload of %"PRIu32" bytes to client failed", size );
+			return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * Send given amount of null bytes. The caller has to acquire the sendMutex first.
+ */ +static inline bool sendPadding( const int fd, uint32_t bytes ) +{ + ssize_t ret; + while ( bytes >= sizeof(nullbytes) ) { + ret = sock_sendAll( fd, nullbytes, sizeof(nullbytes), 2 ); + if ( ret <= 0 ) + return false; + bytes -= (uint32_t)ret; + } + return sock_sendAll( fd, nullbytes, bytes, 2 ) == (ssize_t)bytes; +} + +void net_init() +{ + spin_init( &_clients_lock, PTHREAD_PROCESS_PRIVATE ); +} + +void* net_handleNewConnection(void *clientPtr) +{ + dnbd3_client_t * const client = (dnbd3_client_t *)clientPtr; + dnbd3_request_t request; + + // Await data from client. Since this is a fresh connection, we expect data right away + sock_setTimeout( client->sock, _clientTimeout ); + do { +#ifdef AFL_MODE + const int ret = (int)recv( 0, &request, sizeof(request), MSG_WAITALL ); +#else + const int ret = (int)recv( client->sock, &request, sizeof(request), MSG_WAITALL ); +#endif + // It's expected to be a real dnbd3 client + // Check request for validity. This implicitly dictates that all HTTP requests are more than 24 bytes... + if ( ret != (int)sizeof(request) ) { + logadd( LOG_DEBUG2, "Error receiving request: Could not read message header (%d/%d, e=%d)", (int)ret, (int)sizeof(request), errno ); + goto fail_preadd; + } + + if ( request.magic != dnbd3_packet_magic ) { + // Let's see if this looks like an HTTP request + if ( ((char*)&request)[0] == 'G' || ((char*)&request)[0] == 'P' ) { + // Close enough... 
+ rpc_sendStatsJson( client->sock, &client->host, &request, ret ); + } else { + logadd( LOG_DEBUG1, "Magic in client handshake incorrect" ); + } + goto fail_preadd; + } + // Magic OK, untangle byte order if required + fixup_request( request ); + if ( request.cmd != CMD_SELECT_IMAGE ) { + logadd( LOG_WARNING, "Client sent != CMD_SELECT_IMAGE in handshake (got cmd=%d, size=%d), dropping client.", (int)request.cmd, (int)request.size ); + goto fail_preadd; + } + } while (0); + // Fully init client struct + spin_init( &client->lock, PTHREAD_PROCESS_PRIVATE ); + pthread_mutex_init( &client->sendMutex, NULL ); + + spin_lock( &client->lock ); + host_to_string( &client->host, client->hostName, HOSTNAMELEN ); + client->hostName[HOSTNAMELEN-1] = '\0'; + spin_unlock( &client->lock ); + client->bytesSent = 0; + + if ( !addToList( client ) ) { + freeClientStruct( client ); + logadd( LOG_WARNING, "Could not add new client to list when connecting" ); + return NULL; + } + + dnbd3_reply_t reply; + + dnbd3_image_t *image = NULL; + int image_file = -1; + + int num; + bool bOk = false; + bool hasName = false; + + serialized_buffer_t payload; + uint16_t rid, client_version; + uint64_t start, end; + + dnbd3_server_entry_t server_list[NUMBER_SERVERS]; + + // Set to zero to make valgrind happy + memset( &reply, 0, sizeof(reply) ); + memset( &payload, 0, sizeof(payload) ); + reply.magic = dnbd3_packet_magic; + + // Receive first packet's payload + if ( recv_request_payload( client->sock, request.size, &payload ) ) { + char *image_name; + client_version = serializer_get_uint16( &payload ); + image_name = serializer_get_string( &payload ); + rid = serializer_get_uint16( &payload ); + const uint8_t flags = serializer_get_uint8( &payload ); + client->isServer = ( flags & FLAGS8_SERVER ); + if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) { + if ( client_version < MIN_SUPPORTED_CLIENT ) { + logadd( LOG_DEBUG1, "Client %s too old", client->hostName ); + } else { + 
logadd( LOG_DEBUG1, "Incomplete handshake received from %s", client->hostName ); + } + } else { + if ( !client->isServer || !_isProxy ) { + // Is a normal client, or we're not proxy + image = image_getOrLoad( image_name, rid ); + } else if ( _backgroundReplication != BGR_FULL && ( flags & FLAGS8_BG_REP ) ) { + // We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does... + // Reject, as this would basically force this proxy to do BGR too. + image = image_get( image_name, rid, true ); + if ( image != NULL && image->cache_map != NULL ) { + // Only exception is if the image is complete locally + image = image_release( image ); + } + } else if ( _lookupMissingForProxy ) { + // No BGR mismatch and we're told to lookup missing images on a known uplink server + // if the requesting client is a proxy + image = image_getOrLoad( image_name, rid ); + } else { + // No BGR mismatch, but don't lookup if image is unknown locally + image = image_get( image_name, rid, true ); + } + spin_lock( &client->lock ); + client->image = image; + spin_unlock( &client->lock ); + if ( image == NULL ) { + //logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid ); + } else if ( !image->working ) { + logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n", + client->hostName, image_name, (int)rid ); + } else { + bool penalty; + // Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable + bOk = true; + if ( image->cache_map != NULL ) { + spin_lock( &image->lock ); + if ( image->uplink == NULL || image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) { + bOk = ( rand() % 4 ) == 1; + } + penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1; + spin_unlock( &image->lock ); + if ( penalty ) { // Wait 100ms if local caching is not working so this + usleep( 100000 ); // server gets a penalty and 
is less likely to be selected + } + } + if ( bOk ) { + spin_lock( &image->lock ); + image_file = image->readFd; + if ( !client->isServer ) { + // Only update immediately if this is a client. Servers are handled on disconnect. + timing_get( &image->atime ); + } + spin_unlock( &image->lock ); + serializer_reset_write( &payload ); + serializer_put_uint16( &payload, client_version < 3 ? client_version : PROTOCOL_VERSION ); // XXX: Since messed up fuse client was messed up before :( + serializer_put_string( &payload, image->name ); + serializer_put_uint16( &payload, (uint16_t)image->rid ); + serializer_put_uint64( &payload, image->virtualFilesize ); + reply.cmd = CMD_SELECT_IMAGE; + reply.size = serializer_get_written_length( &payload ); + if ( !send_reply( client->sock, &reply, &payload ) ) { + bOk = false; + } + } + } + } + } + + if ( bOk ) { + // add artificial delay if applicable + if ( client->isServer && _serverPenalty != 0 ) { + usleep( _serverPenalty ); + } else if ( !client->isServer && _clientPenalty != 0 ) { + usleep( _clientPenalty ); + } + // client handling mainloop + while ( recv_request_header( client->sock, &request ) ) { + if ( _shutdown ) break; + switch ( request.cmd ) { + + case CMD_GET_BLOCK:; + const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking + if ( offset >= image->virtualFilesize ) { + // Sanity check + logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName ); + reply.size = 0; + reply.cmd = CMD_ERROR; + send_reply( client->sock, &reply, NULL ); + break; + } + if ( offset + request.size > image->virtualFilesize ) { + // Sanity check + logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName ); + reply.size = 0; + reply.cmd = CMD_ERROR; + send_reply( client->sock, &reply, NULL ); + break; + } + + if ( request.size != 0 && image->cache_map != NULL ) { + // This is a proxyed image, check if we need to relay the request... 
+ start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1); + bool isCached = true; + spin_lock( &image->lock ); + // Check again as we only aquired the lock just now + if ( image->cache_map != NULL ) { + const uint64_t firstByteInMap = start >> 15; + const uint64_t lastByteInMap = (end - 1) >> 15; + uint64_t pos; + // Middle - quick checking + if ( isCached ) { + pos = firstByteInMap + 1; + while ( pos < lastByteInMap ) { + if ( image->cache_map[pos] != 0xff ) { + isCached = false; + break; + } + ++pos; + } + } + // First byte + if ( isCached ) { + pos = start; + do { + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) { + isCached = false; + break; + } + pos += DNBD3_BLOCK_SIZE; + } while ( firstByteInMap == (pos >> 15) && pos < end ); + } + // Last byte - only check if request spans multiple bytes in cache map + if ( isCached && firstByteInMap != lastByteInMap ) { + pos = lastByteInMap << 15; + while ( pos < end ) { + assert( lastByteInMap == (pos >> 15) ); + const int map_x = (pos >> 12) & 7; // mod 8 + const uint8_t bit_mask = (uint8_t)( 1 << map_x ); + if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) { + isCached = false; + break; + } + pos += DNBD3_BLOCK_SIZE; + } + } + } + spin_unlock( &image->lock ); + if ( !isCached ) { + if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) { + logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d", + client->hostName, image->name, image->rid ); + image->working = false; + goto exit_client_cleanup; + } + break; // DONE, exit request.cmd switch + } + } + + reply.cmd = CMD_GET_BLOCK; + reply.size = request.size; + reply.handle = request.handle; + + fixup_reply( reply ); + const bool lock = image->uplink != NULL; + if ( lock ) pthread_mutex_lock( 
&client->sendMutex ); + // Send reply header + if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) { + if ( lock ) pthread_mutex_unlock( &client->sendMutex ); + logadd( LOG_DEBUG1, "Sending CMD_GET_BLOCK reply header to %s failed", client->hostName ); + goto exit_client_cleanup; + } + + if ( request.size != 0 ) { + // Send payload if request length > 0 + size_t done = 0; + off_t foffset = (off_t)offset; + size_t realBytes; + if ( offset + request.size <= image->realFilesize ) { + realBytes = request.size; + } else { + realBytes = (size_t)(image->realFilesize - offset); + } + while ( done < realBytes ) { + // TODO: Should we consider EOPNOTSUPP on BSD for sendfile and fallback to read/write? + // Linux would set EINVAL or ENOSYS instead, which it unfortunately also does for a couple of other failures :/ + // read/write would kill performance anyways so a fallback would probably be of little use either way. +#ifdef AFL_MODE + char buf[1000]; + size_t cnt = realBytes - done; + if ( cnt > 1000 ) { + cnt = 1000; + } + const ssize_t sent = pread( image_file, buf, cnt, foffset ); + if ( sent > 0 ) { + //write( client->sock, buf, sent ); // This is not verified in any way, so why even do it... + } else { + const int err = errno; +#elif defined(__linux__) + const ssize_t sent = sendfile( client->sock, image_file, &foffset, realBytes - done ); + if ( sent <= 0 ) { + const int err = errno; +#elif defined(__FreeBSD__) + off_t sent; + const int ret = sendfile( image_file, client->sock, foffset, realBytes - done, NULL, &sent, 0 ); + if ( ret == -1 || sent == 0 ) { + const int err = errno; + if ( ret == -1 ) { + if ( err == EAGAIN || err == EINTR ) { // EBUSY? manpage doesn't explicitly mention *sent here.. 
But then again we dont set the according flag anyways + done += sent; + continue; + } + sent = -1; + } +#endif + if ( lock ) pthread_mutex_unlock( &client->sendMutex ); + if ( sent == -1 ) { + if ( err != EPIPE && err != ECONNRESET && err != ESHUTDOWN + && err != EAGAIN && err != EWOULDBLOCK ) { + logadd( LOG_DEBUG1, "sendfile to %s failed (image to net. sent %d/%d, errno=%d)", + client->hostName, (int)done, (int)realBytes, err ); + } + if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) { + logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid ); + image->working = false; + } + } + goto exit_client_cleanup; + } + done += sent; + } + if ( request.size > (uint32_t)realBytes ) { + if ( !sendPadding( client->sock, request.size - (uint32_t)realBytes ) ) { + if ( lock ) pthread_mutex_unlock( &client->sendMutex ); + goto exit_client_cleanup; + } + } + } + if ( lock ) pthread_mutex_unlock( &client->sendMutex ); + // Global per-client counter + client->bytesSent += request.size; // Increase counter for statistics. 
+ break; + + case CMD_GET_SERVERS: + // Build list of known working alt servers + num = altservers_getListForClient( &client->host, server_list, NUMBER_SERVERS ); + reply.cmd = CMD_GET_SERVERS; + reply.size = (uint32_t)( num * sizeof(dnbd3_server_entry_t) ); + pthread_mutex_lock( &client->sendMutex ); + send_reply( client->sock, &reply, server_list ); + pthread_mutex_unlock( &client->sendMutex ); + goto set_name; + break; + + case CMD_KEEPALIVE: + reply.cmd = CMD_KEEPALIVE; + reply.size = 0; + pthread_mutex_lock( &client->sendMutex ); + send_reply( client->sock, &reply, NULL ); + pthread_mutex_unlock( &client->sendMutex ); +set_name: ; + if ( !hasName ) { + hasName = true; + setThreadName( client->hostName ); + } + break; + + case CMD_SET_CLIENT_MODE: + client->isServer = false; + break; + + case CMD_GET_CRC32: + reply.cmd = CMD_GET_CRC32; + pthread_mutex_lock( &client->sendMutex ); + if ( image->crc32 == NULL ) { + reply.size = 0; + send_reply( client->sock, &reply, NULL ); + } else { + const uint32_t size = reply.size = (uint32_t)( (IMGSIZE_TO_HASHBLOCKS(image->realFilesize) + 1) * sizeof(uint32_t) ); + send_reply( client->sock, &reply, NULL ); + send( client->sock, &image->masterCrc32, sizeof(uint32_t), MSG_MORE ); + send( client->sock, image->crc32, size - sizeof(uint32_t), 0 ); + } + pthread_mutex_unlock( &client->sendMutex ); + break; + + default: + logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd ); + break; + + } + } + } +exit_client_cleanup: ; + // First remove from list, then add to counter to prevent race condition + removeFromList( client ); + totalBytesSent += client->bytesSent; + // Access time, but only if client didn't just probe + if ( image != NULL ) { + spin_lock( &image->lock ); + if ( client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) { + timing_get( &image->atime ); + } + spin_unlock( &image->lock ); + } + freeClientStruct( client ); // This will also call image_release on client->image + return NULL ; 
+fail_preadd: ; + close( client->sock ); + free( client ); + return NULL; +} + +/** + * Get list of all clients. + */ +struct json_t* net_getListAsJson() +{ + json_t *jsonClients = json_array(); + json_t *clientStats; + int imgId, isServer; + uint64_t bytesSent; + char host[HOSTNAMELEN]; + host[HOSTNAMELEN-1] = '\0'; + + spin_lock( &_clients_lock ); + for ( int i = 0; i < _num_clients; ++i ) { + dnbd3_client_t * const client = _clients[i]; + if ( client == NULL || client->image == NULL ) + continue; + spin_lock( &client->lock ); + // Unlock so we give other threads a chance to access the client list. + // We might not get an atomic snapshot of the currently connected clients, + // but that doesn't really make a difference anyways. + spin_unlock( &_clients_lock ); + strncpy( host, client->hostName, HOSTNAMELEN - 1 ); + imgId = client->image->id; + isServer = (int)client->isServer; + bytesSent = client->bytesSent; + spin_unlock( &client->lock ); + clientStats = json_pack( "{sssisisI}", + "address", host, + "imageId", imgId, + "isServer", isServer, + "bytesSent", (json_int_t)bytesSent ); + json_array_append_new( jsonClients, clientStats ); + spin_lock( &_clients_lock ); + } + spin_unlock( &_clients_lock ); + return jsonClients; +} + +/** + * Get number of clients connected, total bytes sent, or both. + * we don't unlock the list while iterating or we might get an + * incorrect result if a client is disconnecting while iterating. 
+ */
+void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
+{
+	int cc = 0, sc = 0;
+	uint64_t bs = 0;
+
+	spin_lock( &_clients_lock );
+	for ( int i = 0; i < _num_clients; ++i ) {
+		const dnbd3_client_t * const client = _clients[i];
+		if ( client == NULL || client->image == NULL ) // skip empty slots and clients without an image
+			continue;
+		if ( client->isServer ) {
+			sc += 1;
+		} else {
+			cc += 1;
+		}
+		bs += client->bytesSent;
+	}
+	spin_unlock( &_clients_lock );
+	if ( clientCount != NULL ) { // every output pointer is optional
+		*clientCount = cc;
+	}
+	if ( serverCount != NULL ) {
+		*serverCount = sc;
+	}
+	if ( bytesSent != NULL ) {
+		*bytesSent = totalBytesSent + bs; // global counter plus the bytes of the clients counted above
+	}
+}
+// Call shutdown( SHUT_RDWR ) on the socket of every client currently in the list
+void net_disconnectAll()
+{
+	int i;
+	spin_lock( &_clients_lock );
+	for (i = 0; i < _num_clients; ++i) {
+		if ( _clients[i] == NULL ) continue;
+		dnbd3_client_t * const client = _clients[i];
+		spin_lock( &client->lock );
+		if ( client->sock >= 0 ) shutdown( client->sock, SHUT_RDWR );
+		spin_unlock( &client->lock );
+	}
+	spin_unlock( &_clients_lock );
+}
+// Poll the client list once per second, at most 10 times, until no client is listed anymore
+void net_waitForAllDisconnected()
+{
+	int retries = 10, count, i;
+	do {
+		count = 0;
+		spin_lock( &_clients_lock );
+		for (i = 0; i < _num_clients; ++i) {
+			if ( _clients[i] == NULL ) continue;
+			count++;
+		}
+		if ( count == 0 ) _num_clients = 0; // all slots are empty: truncate the list while the lock is still held
+		spin_unlock( &_clients_lock );
+		if ( count != 0 ) {
+			logadd( LOG_INFO, "%d clients still active...\n", count );
+			sleep( 1 );
+		}
+	} while ( count != 0 && --retries > 0 ); // on timeout the remaining entries stay listed instead of being dropped without the lock
+}
+
+/* +++
+ * Client list.
+ *
+ * Adding and removing clients.
+ */
+
+/**
+ * Remove a client from the clients array
+ * Locks on: _clients_lock
+ */
+static void removeFromList(dnbd3_client_t *client)
+{
+	int i;
+	spin_lock( &_clients_lock );
+	for ( i = _num_clients - 1; i >= 0; --i ) { // walk backwards so empty slots at the end can be trimmed in the same pass
+		if ( _clients[i] == client ) {
+			_clients[i] = NULL;
+		}
+		if ( _clients[i] == NULL && i + 1 == _num_clients ) --_num_clients;
+	}
+	spin_unlock( &_clients_lock );
+}
+
+/**
+ * Free the client struct recursively.
+ * !! 
Make sure to call this function after removing the client from _dnbd3_clients !!
+ * Locks on: _clients[].lock, _images[].lock
+ * might call functions that lock on _images, _image[], uplink.queueLock, client.sendMutex
+ */
+static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
+{
+	spin_lock( &client->lock );
+	pthread_mutex_lock( &client->sendMutex ); // the socket is closed and reset to -1 while holding sendMutex
+	if ( client->sock != -1 ) close( client->sock );
+	client->sock = -1;
+	pthread_mutex_unlock( &client->sendMutex );
+	if ( client->image != NULL ) {
+		spin_lock( &client->image->lock );
+		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
+		spin_unlock( &client->image->lock );
+		client->image = image_release( client->image ); // drop this client's reference; the return value replaces the pointer
+	}
+	spin_unlock( &client->lock );
+	spin_destroy( &client->lock );
+	pthread_mutex_destroy( &client->sendMutex );
+	free( client );
+	return NULL ; // always NULL, so callers can assign the result to their pointer
+}
+
+//###//
+
+/**
+ * Add client to the clients array.
+ * Locks on: _clients_lock
+ */
+static bool addToList(dnbd3_client_t *client)
+{
+	int i;
+	spin_lock( &_clients_lock );
+	for (i = 0; i < _num_clients; ++i) { // reuse the first empty slot below _num_clients, if any
+		if ( _clients[i] != NULL ) continue;
+		_clients[i] = client;
+		spin_unlock( &_clients_lock );
+		return true;
+	}
+	if ( _num_clients >= _maxClients || _num_clients >= SERVER_MAX_CLIENTS ) { // also respect the array's capacity, not only the configured limit
+		spin_unlock( &_clients_lock );
+		logadd( LOG_ERROR, "Maximum number of clients reached!" );
+		return false;
+	}
+	_clients[_num_clients++] = client; // no free slot found: append at the end
+	spin_unlock( &_clients_lock );
+	return true;
+}
+
diff --git a/src/server/net.h b/src/server/net.h
new file mode 100644
index 0000000..6813b49
--- /dev/null
+++ b/src/server/net.h
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha
+ *
+ * This file may be licensed under the terms of of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. 
See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef NET_H_ +#define NET_H_ + +#include "globals.h" + +struct json_t; + +void net_init(); + +void* net_handleNewConnection(void *clientPtr); + +struct json_t* net_getListAsJson(); + +void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent); + +void net_disconnectAll(); + +void net_waitForAllDisconnected(); + +#endif /* NET_H_ */ diff --git a/src/server/picohttpparser/README.md b/src/server/picohttpparser/README.md new file mode 100644 index 0000000..cb32f58 --- /dev/null +++ b/src/server/picohttpparser/README.md @@ -0,0 +1,116 @@ +PicoHTTPParser +============= + +Copyright (c) 2009-2014 [Kazuho Oku](https://github.com/kazuho), [Tokuhiro Matsuno](https://github.com/tokuhirom), [Daisuke Murase](https://github.com/typester), [Shigeo Mitsunari](https://github.com/herumi) + +PicoHTTPParser is a tiny, primitive, fast HTTP request/response parser. + +Unlike most parsers, it is stateless and does not allocate memory by itself. +All it does is accept pointer to buffer and the output structure, and setups the pointers in the latter to point at the necessary portions of the buffer. + +The code is widely deployed within Perl applications through popular modules that use it, including [Plack](https://metacpan.org/pod/Plack), [Starman](https://metacpan.org/pod/Starman), [Starlet](https://metacpan.org/pod/Starlet), [Furl](https://metacpan.org/pod/Furl). It is also the HTTP/1 parser of [H2O](https://github.com/h2o/h2o). + +Check out [test.c] to find out how to use the parser. + +The software is dual-licensed under the Perl License or the MIT License. 
+ +Usage +----- + +The library exposes four functions: `phr_parse_request`, `phr_parse_response`, `phr_parse_headers`, `phr_decode_chunked`. + +### phr_parse_request + +The example below reads an HTTP request from socket `sock` using `read(2)`, parses it using `phr_parse_request`, and prints the details. + +```c +char buf[4096], *method, *path; +int pret, minor_version; +struct phr_header headers[100]; +size_t buflen = 0, prevbuflen = 0, method_len, path_len, num_headers; +ssize_t rret; + +while (1) { + /* read the request */ + while ((rret = read(sock, buf + buflen, sizeof(buf) - buflen)) == -1 && errno == EINTR) + ; + if (rret <= 0) + return IOError; + prevbuflen = buflen; + buflen += rret; + /* parse the request */ + num_headers = sizeof(headers) / sizeof(headers[0]); + pret = phr_parse_request(buf, buflen, &method, &method_len, &path, &path_len, + &minor_version, headers, &num_headers, prevbuflen); + if (pret > 0) + break; /* successfully parsed the request */ + else if (pret == -1) + return ParseError; + /* request is incomplete, continue the loop */ + assert(pret == -2); + if (buflen == sizeof(buf)) + return RequestIsTooLongError; +} + +printf("request is %d bytes long\n", pret); +printf("method is %.*s\n", (int)method_len, method); +printf("path is %.*s\n", (int)path_len, path); +printf("HTTP version is 1.%d\n", minor_version); +printf("headers:\n"); +for (i = 0; i != num_headers; ++i) { + printf("%.*s: %.*s\n", (int)headers[i].name_len, headers[i].name, + (int)headers[i].value_len, headers[i].value); +} +``` + +### phr_parse_response, phr_parse_headers + +`phr_parse_response` and `phr_parse_headers` provide similar interfaces as `phr_parse_request`. `phr_parse_response` parses an HTTP response, and `phr_parse_headers` parses the headers only. + +### phr_decode_chunked + +The example below decodes incoming data in chunked-encoding. The data is decoded in-place. 
+ +```c +struct phr_chunked_decoder decoder = {}; /* zero-clear */ +char *buf = malloc(4096); +size_t size = 0, capacity = 4096, rsize; +ssize_t rret, pret; + +/* set consume_trailer to 1 to discard the trailing header, or the application + * should call phr_parse_headers to parse the trailing header */ +decoder.consume_trailer = 1; + +do { + /* expand the buffer if necessary */ + if (size == capacity) { + capacity *= 2; + buf = realloc(buf, capacity); + assert(buf != NULL); + } + /* read */ + while ((rret = read(sock, buf + size, capacity - size)) == -1 && errno == EINTR) + ; + if (rret <= 0) + return IOError; + /* decode */ + rsize = rret; + pret = phr_decode_chunked(&decoder, buf + size, &rsize); + if (pret == -1) + return ParseError; + size += rsize; +} while (pret == -2); + +/* successfully decoded the chunked data */ +assert(pret >= 0); +printf("decoded data is at %p (%zu bytes)\n", buf, size); +``` + +Benchmark +--------- + +![benchmark results](http://i.gyazo.com/a85c18d3162dfb46b485bb41e0ad443a.png) + +The benchmark code is from [fukamachi/fast-http@6b91103](https://github.com/fukamachi/fast-http/tree/6b9110347c7a3407310c08979aefd65078518478). + +The internals of picohttpparser has been described to some extent in [my blog entry]( http://blog.kazuhooku.com/2014/11/the-internals-h2o-or-how-to-write-fast.html). diff --git a/src/server/picohttpparser/picohttpparser.c b/src/server/picohttpparser/picohttpparser.c new file mode 100644 index 0000000..cfa05ef --- /dev/null +++ b/src/server/picohttpparser/picohttpparser.c @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase, + * Shigeo Mitsunari + * + * The software is licensed under either the MIT License (below) or the Perl + * license. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
/*
 * Fast pre-scan used by the tokenizers.
 *
 * With SSE4.2 available, the input is examined in whole 16-byte chunks using a
 * range comparison (_SIDD_CMP_RANGES) against the byte pairs in `ranges`
 * (`ranges_size` bytes are significant). On the first hit the returned pointer
 * addresses the matching byte and *found is set to 1. If no chunk matches, the
 * returned pointer addresses the first byte that was not examined (fewer than
 * 16 bytes remain) and *found stays 0; the caller has to scan that tail
 * itself.
 *
 * Without SSE4.2 nothing is scanned: `buf` is returned unchanged, *found = 0.
 */
static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found)
{
    *found = 0;
#if __SSE4_2__
    if (likely(buf_end - buf >= 16)) {
        const __m128i range_set = _mm_loadu_si128((const __m128i *)ranges);
        /* number of bytes covered by complete 16-byte chunks (>= 16 here) */
        size_t remaining = (buf_end - buf) & ~15;

        while (1) {
            const __m128i chunk = _mm_loadu_si128((const __m128i *)buf);
            const int idx =
                _mm_cmpestri(range_set, ranges_size, chunk, 16, _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS);
            if (unlikely(idx != 16)) {
                /* idx is the offset of the first byte inside one of the ranges */
                buf += idx;
                *found = 1;
                break;
            }
            buf += 16;
            remaining -= 16;
            if (unlikely(remaining == 0)) {
                break;
            }
        }
    }
#else
    /* suppress unused parameter warning */
    (void)ranges_size;
    (void)ranges;
    (void)buf_end;
#endif
    return buf;
}
/*
 * Cheap check whether [buf, buf_end) already contains the empty line that
 * terminates an HTTP header block (two consecutive line endings, each being
 * either CRLF or a bare LF).
 *
 * last_len is the number of bytes that were already present during the
 * previous call; scanning resumes three bytes before that position so a
 * terminator that straddles the old/new boundary is still detected.
 *
 * Returns a pointer just past the terminating empty line. Otherwise returns
 * NULL with *ret = -2 (incomplete, need more data) or *ret = -1 (a CR that is
 * not followed by LF).
 */
static const char *is_complete(const char *buf, const char *buf_end, size_t last_len, int *ret)
{
    const size_t len = (size_t)(buf_end - buf);
    const size_t skip = last_len < 3 ? 0 : last_len - 3;
    int eol_count = 0;

    /* A last_len beyond the buffer would move buf past buf_end, where the
     * "buf == buf_end" test can never fire; treat it as "incomplete". */
    if (skip > len) {
        *ret = -2;
        return NULL;
    }
    buf += skip;

    while (buf != buf_end) {
        if (*buf == '\015') {
            ++buf;
            if (buf == buf_end) {
                break; /* CR is the last byte so far */
            }
            if (*buf++ != '\012') {
                *ret = -1;
                return NULL;
            }
            ++eol_count;
        } else if (*buf == '\012') {
            ++buf;
            ++eol_count;
        } else {
            ++buf;
            eol_count = 0;
        }
        if (eol_count == 2) {
            return buf;
        }
    }

    *ret = -2;
    return NULL;
}
":@" /* 0x3a-0x40 */ + "[]" /* 0x5b-0x5d */ + "{\377"; /* 0x7b-0xff */ + int found; + buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found); + if (!found) { + CHECK_EOF(); + } + while (1) { + if (*buf == ':') { + break; + } else if (!token_char_map[(unsigned char)*buf]) { + *ret = -1; + return NULL; + } + ++buf; + CHECK_EOF(); + } + if ((headers[*num_headers].name.l = buf - headers[*num_headers].name.s) == 0) { + *ret = -1; + return NULL; + } + ++buf; + for (;; ++buf) { + CHECK_EOF(); + if (!(*buf == ' ' || *buf == '\t')) { + break; + } + } + } else { + headers[*num_headers].name.s = NULL; + headers[*num_headers].name.l = 0; + } + if ((buf = get_token_to_eol(buf, buf_end, &headers[*num_headers].value, ret)) == NULL) { + return NULL; + } + } + return buf; +} + +static const char *parse_request(const char *buf, const char *buf_end, struct string *method, struct string *path, + int *minor_version, struct phr_header *headers, size_t *num_headers, + size_t max_headers, int *ret) +{ + /* skip first empty line (some clients add CRLF after POST content) */ + CHECK_EOF(); + if (*buf == '\015') { + ++buf; + EXPECT_CHAR('\012'); + } else if (*buf == '\012') { + ++buf; + } + + /* parse request line */ + ADVANCE_TOKEN(method->s, method->l); + ++buf; + ADVANCE_TOKEN(path->s, path->l); + ++buf; + if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) { + return NULL; + } + if (*buf == '\015') { + ++buf; + EXPECT_CHAR('\012'); + } else if (*buf == '\012') { + ++buf; + } else { + *ret = -1; + return NULL; + } + + return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret); +} + +int phr_parse_request(const char *buf_start, size_t len, struct string *method, struct string *path, + int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len) +{ + const char *buf = buf_start, *buf_end = buf_start + len; + size_t max_headers = *num_headers; + int r; + + method->s = NULL; + method->l = 0; + path->s = NULL; + 
/*
 * Parses a status line ("HTTP/1.x SP 3-digit-status SP message EOL") followed
 * by the header block.
 *
 * On success the minor version, numeric status and reason phrase are stored in
 * *minor_version, *status and *msg, and the result of parse_headers() is
 * returned. On failure NULL is returned and *ret is set to -1 (malformed) or
 * -2 (more data required).
 */
static const char *parse_response(const char *buf, const char *buf_end, int *minor_version, int *status, struct string *msg,
                                  struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret)
{
    /* "HTTP/1.x" */
    buf = parse_http_version(buf, buf_end, minor_version, ret);
    if (buf == NULL) {
        return NULL;
    }

    /* separator between version and status code */
    if (*buf != ' ') {
        *ret = -1;
        return NULL;
    }
    ++buf;

    /* three status digits plus the byte that follows them must be present */
    if (buf_end - buf < 4) {
        *ret = -2;
        return NULL;
    }
    PARSE_INT_3(status);

    /* separator between status code and message */
    if (*buf != ' ') {
        *ret = -1;
        return NULL;
    }
    ++buf;

    /* reason phrase runs up to the end of the line */
    buf = get_token_to_eol(buf, buf_end, msg, ret);
    if (buf == NULL) {
        return NULL;
    }

    return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret);
}
/* Maps one hexadecimal digit character (0-9, A-F, a-f) to its value 0..15;
 * yields -1 for any other input. */
static int decode_hex(int ch)
{
    int value = -1;

    if (ch >= '0' && ch <= '9') {
        value = ch - '0';
    } else if (ch >= 'A' && ch <= 'F') {
        value = 0xa + (ch - 'A');
    } else if (ch >= 'a' && ch <= 'f') {
        value = 0xa + (ch - 'a');
    }
    return value;
}
(decoder->consume_trailer) { + decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD; + break; + } else { + goto Complete; + } + } + decoder->_state = CHUNKED_IN_CHUNK_DATA; + /* fallthru */ + case CHUNKED_IN_CHUNK_DATA: { + size_t avail = bufsz - src; + if (avail < decoder->bytes_left_in_chunk) { + if (dst != src) + memmove(buf + dst, buf + src, avail); + src += avail; + dst += avail; + decoder->bytes_left_in_chunk -= avail; + goto Exit; + } + if (dst != src) + memmove(buf + dst, buf + src, decoder->bytes_left_in_chunk); + src += decoder->bytes_left_in_chunk; + dst += decoder->bytes_left_in_chunk; + decoder->bytes_left_in_chunk = 0; + decoder->_state = CHUNKED_IN_CHUNK_CRLF; + } + /* fallthru */ + case CHUNKED_IN_CHUNK_CRLF: + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] != '\015') + break; + } + if (buf[src] != '\012') { + ret = -1; + goto Exit; + } + ++src; + decoder->_state = CHUNKED_IN_CHUNK_SIZE; + break; + case CHUNKED_IN_TRAILERS_LINE_HEAD: + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] != '\015') + break; + } + if (buf[src++] == '\012') + goto Complete; + decoder->_state = CHUNKED_IN_TRAILERS_LINE_MIDDLE; + /* fallthru */ + case CHUNKED_IN_TRAILERS_LINE_MIDDLE: + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] == '\012') + break; + } + ++src; + decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD; + break; + default: + assert(!"decoder is corrupt"); + } + } + +Complete: + ret = bufsz - src; +Exit: + if (dst != src) + memmove(buf + dst, buf + src, bufsz - src); + *_bufsz = dst; + return ret; +} + +int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder) +{ + return decoder->_state == CHUNKED_IN_CHUNK_DATA; +} + +#undef CHECK_EOF +#undef EXPECT_CHAR +#undef ADVANCE_TOKEN diff --git a/src/server/picohttpparser/picohttpparser.h b/src/server/picohttpparser/picohttpparser.h new file mode 100644 index 0000000..b315795 --- /dev/null +++ b/src/server/picohttpparser/picohttpparser.h @@ -0,0 +1,92 @@ +/* + * 
Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase, + * Shigeo Mitsunari + * + * The software is licensed under either the MIT License (below) or the Perl + * license. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
/* Holds the name and value of one parsed header line. For a continuation line
 * of a multi-line (folded) header, the parser stores no name: name.s is NULL
 * and name.l is 0, and only value is filled in. */
struct phr_header {
    struct string name;
    struct string value;
};
+ */ +ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *bufsz); + +/* returns if the chunked decoder is in middle of chunked data */ +int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/server/rpc.c b/src/server/rpc.c new file mode 100644 index 0000000..1ea09cb --- /dev/null +++ b/src/server/rpc.c @@ -0,0 +1,504 @@ +#include "rpc.h" +#include "helper.h" +#include "net.h" +#include "uplink.h" +#include "locks.h" +#include "image.h" +#include "altservers.h" +#include "../shared/sockhelper.h" +#include "fileutil.h" +#include "picohttpparser/picohttpparser.h" +#include "urldecode.h" + +#include +#include +#include +#include +#include + +#if JANSSON_VERSION_HEX < 0x020600 +#define json_stringn_nocheck(a,b) json_string_nocheck(a) +#endif + +#define ACL_ALL 0x7fffffff +#define ACL_STATS 1 +#define ACL_CLIENT_LIST 2 +#define ACL_IMAGE_LIST 4 +#define ACL_CONFIG 8 +#define ACL_LOG 16 +#define ACL_ALTSERVERS 32 + +#define HTTP_CLOSE 4 +#define HTTP_KEEPALIVE 9 + +// Make sure compiler does not reserve more space for static strings than required (or rather, does not tell so in sizeof calls) +// TODO Might be time for a dedicated string.h +_Static_assert( sizeof("test") == 5 && sizeof("test2") == 6, "Stringsize messup :/" ); +#define STRCMP(str,chr) ( (str).s != NULL && (str).l == sizeof(chr)-1 && strncmp( (str).s, (chr), MIN((str).l, sizeof(chr)-1) ) == 0 ) +#define STRSTART(str,chr) ( (str).s != NULL && (str).l >= sizeof(chr)-1 && strncmp( (str).s, (chr), MIN((str).l, sizeof(chr)-1) ) == 0 ) +#define SETSTR(name,value) do { name.s = value; name.l = sizeof(value)-1; } while (0) +#define DEFSTR(name,value) static struct string name = { .s = value, .l = sizeof(value)-1 }; +#define chartolower(c) ((char)( (c) >= 'A' && (c) <= 'Z' ? 
(c) + ('a'-'A') : (c) )) + +DEFSTR(STR_CONNECTION, "connection") +DEFSTR(STR_CLOSE, "close") +DEFSTR(STR_QUERY, "/query") +DEFSTR(STR_Q, "q") + +static inline bool equals(struct string *s1,struct string *s2) +{ + if ( s1->s == NULL ) { + return s2->s == NULL; + } else if ( s2->s == NULL || s1->l != s2->l ) { + return false; + } + return memcmp( s1->s, s2->s, s1->l ) == 0; +} + +static inline bool iequals(struct string *cmpMixed, struct string *cmpLower) +{ + if ( cmpMixed->s == NULL ) { + return cmpLower->s == NULL; + } else if ( cmpLower->s == NULL || cmpMixed->l != cmpLower->l ) { + return false; + } + for ( size_t i = 0; i < cmpMixed->l; ++i ) { + if ( chartolower( cmpMixed->s[i] ) != cmpLower->s[i] ) return false; + } + return true; +} + +#define MAX_ACLS 100 +static int aclCount = 0; +static dnbd3_access_rule_t aclRules[MAX_ACLS]; +static json_int_t randomRunId; +static pthread_spinlock_t aclLock; +#define MAX_CLIENTS 50 +#define CUTOFF_START 40 +static pthread_spinlock_t statusLock; +static struct { + int count; + bool overloaded; +} status; + +static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive); +static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive); +static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc); +static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value); +static int getacl(dnbd3_host_t *host); +static void addacl(int argc, char **argv, void *data); +static void loadAcl(); + +void rpc_init() +{ + spin_init( &aclLock, PTHREAD_PROCESS_PRIVATE ); + spin_init( &statusLock, PTHREAD_PROCESS_PRIVATE ); + randomRunId = (((json_int_t)getpid()) << 16) | (json_int_t)time(NULL); + // + if ( sizeof(randomRunId) > 4 ) { + int fd = open( "/dev/urandom", O_RDONLY ); + if ( fd != -1 ) { + uint32_t bla = 1; + read( fd, &bla, 4 ); + randomRunId = 
(randomRunId << 32) | bla; + } + close( fd ); + } + loadAcl(); +} + +#define UPDATE_LOADSTATE(cnt) do { \ + if ( cnt < (CUTOFF_START/2) ) { \ + if ( status.overloaded ) status.overloaded = false; \ + } else if ( cnt > CUTOFF_START ) { \ + if ( !status.overloaded ) status.overloaded = true; \ + } \ +} while (0) + +void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int dataLen) +{ + int permissions = getacl( host ); + if ( permissions == 0 ) { + sendReply( sock, "403 Forbidden", "text/plain", "Access denied", -1, HTTP_CLOSE ); + return; + } + do { + spin_lock( &statusLock ); + const int curCount = ++status.count; + UPDATE_LOADSTATE( curCount ); + spin_unlock( &statusLock ); + if ( curCount > MAX_CLIENTS ) { + sendReply( sock, "503 Service Temporarily Unavailable", "text/plain", "Too many HTTP clients", -1, HTTP_CLOSE ); + goto func_return; + } + } while (0); + char headerBuf[3000]; + if ( dataLen > 0 ) { + // We call this function internally with a maximum data len of sizeof(dnbd3_request_t) so no bounds checking + memcpy( headerBuf, data, dataLen ); + } + size_t hoff = dataLen; + bool hasName = false; + bool ok; + int keepAlive = HTTP_KEEPALIVE; + do { + // Read request from client + struct phr_header headers[100]; + size_t numHeaders, prevLen = 0, consumed; + struct string method, path; + int minorVersion; + do { + // Parse before calling recv, there might be a complete pipelined request in the buffer already + // If the request is incomplete, we allow exactly one additional recv() to complete it. + // This should suffice for real world scenarios as I don't know of any + // HTTP client that sends the request headers in multiple packets. 
Even + // with pipelining this should not break as we re-enter this loop after + // processing the requests one by one, so a potential partial request in the + // buffer will get another recv() (blocking mode) + // The alternative would be manual tracking of idle/request time to protect + // against never ending requests (slowloris) + int pret; + if ( hoff >= sizeof(headerBuf) ) goto func_return; // Request too large + if ( hoff != 0 ) { + numHeaders = 100; + pret = phr_parse_request( headerBuf, hoff, &method, &path, &minorVersion, headers, &numHeaders, prevLen ); + } else { + // Nothing in buffer yet, just set to -2 which is the phr goto func_return code for "partial request" + pret = -2; + } + if ( pret > 0 ) { + // > 0 means parsing completed without error + consumed = (size_t)pret; + break; + } + // Reaching here means partial request or parse error + if ( pret == -2 ) { // Partial, keep reading + prevLen = hoff; +#ifdef AFL_MODE + ssize_t ret = recv( 0, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 ); +#else + ssize_t ret = recv( sock, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 ); +#endif + if ( ret == 0 ) goto func_return; + if ( ret == -1 ) { + if ( errno == EINTR ) continue; + if ( errno != EAGAIN && errno != EWOULDBLOCK ) { + sendReply( sock, "500 Internal Server Error", "text/plain", "Server made a boo-boo", -1, HTTP_CLOSE ); + } + goto func_return; // Timeout or unknown error + } + hoff += ret; + } else { // Parse error + sendReply( sock, "400 Bad Request", "text/plain", "Server cannot understand what you're trying to say", -1, HTTP_CLOSE ); + goto func_return; + } + } while ( true ); + if ( keepAlive == HTTP_KEEPALIVE ) { + // Only keep the connection alive (and indicate so) if the client seems to support this + if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) { + keepAlive = HTTP_CLOSE; + } else { // And if there aren't too many active HTTP sessions + spin_lock( &statusLock ); + if ( status.overloaded ) 
keepAlive = HTTP_CLOSE; + spin_unlock( &statusLock ); + } + } + if ( method.s != NULL && path.s != NULL ) { + // Basic data filled from request parser + // Handle stuff + struct string file; + struct field getv[10]; + size_t getc = 10; + parsePath( &path, &file, getv, &getc ); + if ( method.s && method.s[0] == 'P' ) { + // POST only methods + } + // Don't care if GET or POST + if ( equals( &file, &STR_QUERY ) ) { + ok = handleStatus( sock, permissions, getv, getc, keepAlive ); + } else { + ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive ); + } + if ( !ok ) break; + } + // hoff might be beyond end if the client sent another request (burst) + const ssize_t extra = hoff - consumed; + if ( extra > 0 ) { + memmove( headerBuf, headerBuf + consumed, extra ); + } + hoff = extra; + if ( !hasName ) { + hasName = true; + setThreadName( "HTTP" ); + } + } while (true); +func_return:; + do { + spin_lock( &statusLock ); + const int curCount = --status.count; + UPDATE_LOADSTATE( curCount ); + spin_unlock( &statusLock ); + } while (0); +} + +void rpc_sendErrorMessage(int sock, const char* message) +{ + static const char *encoded = NULL; + static size_t len; + if ( encoded == NULL ) { + json_t *tmp = json_pack( "{ss}", "errorMsg", message ); + encoded = json_dumps( tmp, 0 ); + json_decref( tmp ); + len = strlen( encoded ); + } + sendReply( sock, "200 Somewhat OK", "application/json", encoded, len, HTTP_CLOSE ); +} + +static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive) +{ + bool ok; + bool stats = false, images = false, clients = false, space = false; + bool logfile = false, config = false, altservers = false; +#define SETVAR(var) if ( !var && STRCMP(fields[i].value, #var) ) var = true + for (size_t i = 0; i < fields_num; ++i) { + if ( !equals( &fields[i].name, &STR_Q ) ) continue; + SETVAR(stats); + else SETVAR(space); + else SETVAR(images); + else SETVAR(clients); + else SETVAR(logfile); + else 
SETVAR(config); + else SETVAR(altservers); + } +#undef SETVAR + if ( ( stats || space ) && !(permissions & ACL_STATS) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access statistics", -1, keepAlive ); + } + if ( images && !(permissions & ACL_IMAGE_LIST) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access image list", -1, keepAlive ); + } + if ( clients && !(permissions & ACL_CLIENT_LIST) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access client list", -1, keepAlive ); + } + if ( logfile && !(permissions & ACL_LOG) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access log", -1, keepAlive ); + } + if ( config && !(permissions & ACL_CONFIG) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access config", -1, keepAlive ); + } + if ( altservers && !(permissions & ACL_ALTSERVERS) ) { + return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access altservers", -1, keepAlive ); + } + + json_t *statisticsJson; + if ( stats ) { + int clientCount, serverCount; + uint64_t bytesSent; + const uint64_t bytesReceived = uplink_getTotalBytesReceived(); + net_getStats( &clientCount, &serverCount, &bytesSent ); + statisticsJson = json_pack( "{sIsIsisisIsI}", + "bytesReceived", (json_int_t) bytesReceived, + "bytesSent", (json_int_t) bytesSent, + "clientCount", clientCount, + "serverCount", serverCount, + "uptime", (json_int_t) dnbd3_serverUptime(), + "runId", randomRunId ); + } else { + statisticsJson = json_pack( "{sI}", + "runId", randomRunId ); + } + if ( space ) { + uint64_t spaceTotal = 0, spaceAvail = 0; + file_freeDiskSpace( _basePath, &spaceTotal, &spaceAvail ); + json_object_set_new( statisticsJson, "spaceTotal", json_integer( spaceTotal ) ); + json_object_set_new( statisticsJson, "spaceFree", json_integer( spaceAvail ) ); + } + if ( clients ) { + json_object_set_new( statisticsJson, "clients", 
net_getListAsJson() ); + } + if ( images ) { + json_object_set_new( statisticsJson, "images", image_getListAsJson() ); + } + if ( logfile ) { + char logbuf[4000]; + ssize_t len = log_fetch( logbuf, sizeof(logbuf) ); + json_t *val; + if ( len <= 0 ) { + val = json_null(); + } else { + val = json_stringn_nocheck( logbuf, (size_t)len ); + + } + json_object_set_new( statisticsJson, "logfile", val ); + } + if ( config ) { + char buf[2000]; + size_t len = globals_dumpConfig( buf, sizeof(buf) ); + json_object_set_new( statisticsJson, "config", json_stringn_nocheck( buf, len ) ); + } + if ( altservers ) { + json_object_set_new( statisticsJson, "altservers", altservers_toJson() ); + } + + char *jsonString = json_dumps( statisticsJson, 0 ); + json_decref( statisticsJson ); + ok = sendReply( sock, "200 OK", "application/json", jsonString, -1, keepAlive ); + free( jsonString ); + return ok; +} + +static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive) +{ + if ( plen == -1 ) plen = strlen( payload ); + char buffer[600]; + const char *connection = ( keepAlive == HTTP_KEEPALIVE ) ? "Keep-Alive" : "Close"; + int hlen = snprintf(buffer, sizeof(buffer), "HTTP/1.1 %s\r\n" + "Connection: %s\r\n" + "Content-Type: %s; charset=utf-8\r\n" + "Content-Length: %u\r\n" + "\r\n", + status, connection, ctype, (unsigned int)plen ); + if ( hlen < 0 || hlen >= (int)sizeof(buffer) ) return false; // Truncated + if ( send( sock, buffer, hlen, MSG_MORE ) != hlen ) return false; + if ( !sock_sendAll( sock, payload, plen, 10 ) ) return false; + if ( keepAlive == HTTP_CLOSE ) { + // Wait for flush + shutdown( sock, SHUT_WR ); +#ifdef AFL_MODE + sock = 0; +#endif + while ( read( sock, buffer, sizeof buffer ) > 0 ); + return false; + } + return true; +} + +static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc) +{ + size_t i = 0; + while ( i < path->l && path->s[i] != '?' 
) ++i; + if ( i == path->l ) { + *getc = 0; + *file = *path; + return; + } + file->s = path->s; + file->l = i; + ++i; + path->s += i; + path->l -= i; + urldecode( path, getv, getc ); + path->s -= i; + path->l += i; +} + +static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value) +{ + for (size_t i = 0; i < numHeaders; ++i) { + if ( !iequals( &headers[i].name, name ) ) continue; + if ( iequals( &headers[i].value, value ) ) return true; + } + return false; +} + +static int getacl(dnbd3_host_t *host) +{ + if ( aclCount == 0 ) return 0x7fffff; // For now compat mode - no rules defined == all access + for (int i = 0; i < aclCount; ++i) { + if ( aclRules[i].bytes == 0 && aclRules[i].bitMask == 0 ) return aclRules[i].permissions; + if ( memcmp( aclRules[i].host, host->addr, aclRules[i].bytes ) != 0 ) continue; + if ( aclRules[i].bitMask != 0 && aclRules[i].host[aclRules[i].bytes] != ( host->addr[aclRules[i].bytes] & aclRules[i].bitMask ) ) continue; + return aclRules[i].permissions; + } +#ifdef AFL_MODE + return 0x7fffff; +#else + return 0; +#endif +} + +#define SETBIT(x) else if ( strcmp( argv[i], #x ) == 0 ) mask |= ACL_ ## x + +static void addacl(int argc, char **argv, void *data UNUSED) +{ + if ( argv[0][0] == '#' ) return; + spin_lock( &aclLock ); + if ( aclCount >= MAX_ACLS ) { + logadd( LOG_WARNING, "Too many ACL rules, ignoring %s", argv[0] ); + goto unlock_end; + } + int mask = 0; + for (int i = 1; i < argc; ++i) { + if (false) {} + SETBIT(ALL); + SETBIT(STATS); + SETBIT(CLIENT_LIST); + SETBIT(IMAGE_LIST); + else logadd( LOG_WARNING, "Invalid ACL flag '%s' for %s", argv[i], argv[0] ); + } + if ( mask == 0 ) { + logadd( LOG_INFO, "Ignoring empty rule for %s", argv[0] ); + goto unlock_end; + } + dnbd3_host_t host; + char *slash = strchr( argv[0], '/' ); + if ( slash != NULL ) { + *slash++ = '\0'; + } + if ( !parse_address( argv[0], &host ) ) goto unlock_end; + long int bits; + if ( slash != NULL ) { + char 
*last; + bits = strtol( slash, &last, 10 ); + if ( last == slash ) slash = NULL; + if ( host.type == HOST_IP4 && bits > 32 ) bits = 32; + if ( bits > 128 ) bits = 128; + } + if ( slash == NULL ) { + if ( host.type == HOST_IP4 ) { + bits = 32; + } else { + bits = 128; + } + } + memcpy( aclRules[aclCount].host, host.addr, 16 ); + aclRules[aclCount].bytes = (int)( bits / 8 ); + aclRules[aclCount].bitMask = 0; + aclRules[aclCount].permissions = mask; + bits %= 8; + if ( bits != 0 ) { + for (long int i = 0; i < bits; ++i) { + aclRules[aclCount].bitMask = ( aclRules[aclCount].bitMask >> 1 ) | 0x80; + } + aclRules[aclCount].host[aclRules[aclCount].bytes] &= (uint8_t)aclRules[aclCount].bitMask; + } + // We now have .bytes set to the number of bytes to memcmp. + // In case we have an odd bitmask, .bitMask will be != 0, so when comparing, + // we need AND the host[.bytes] of the address to compare with the value + // in .bitMask, and compate it, otherwise, a simple memcmp will do. + aclCount++; +unlock_end:; + spin_unlock( &aclLock ); +} + +static void loadAcl() +{ + static bool inProgress = false; + char *fn; + if ( asprintf( &fn, "%s/%s", _configDir, "rpc.acl" ) == -1 ) return; + spin_lock( &aclLock ); + if ( inProgress ) { + spin_unlock( &aclLock ); + return; + } + aclCount = 0; + inProgress = true; + spin_unlock( &aclLock ); + file_loadLineBased( fn, 1, 20, &addacl, NULL ); + spin_lock( &aclLock ); + inProgress = false; + spin_unlock( &aclLock ); + free( fn ); + logadd( LOG_INFO, "%d HTTPRPC ACL rules loaded", (int)aclCount ); +} + diff --git a/src/server/rpc.h b/src/server/rpc.h new file mode 100644 index 0000000..285242c --- /dev/null +++ b/src/server/rpc.h @@ -0,0 +1,10 @@ +#ifndef _RPC_H_ +#define _RPC_H_ + +struct dnbd3_host_t; + +void rpc_init(); +void rpc_sendStatsJson(int sock, struct dnbd3_host_t* host, const void *data, const int dataLen); +void rpc_sendErrorMessage(int sock, const char* message); + +#endif diff --git a/src/server/serialize.c 
b/src/server/serialize.c new file mode 100644 index 0000000..4934132 --- /dev/null +++ b/src/server/serialize.c @@ -0,0 +1,5 @@ +#include +#include +#include + +#include "../serialize.c" diff --git a/src/server/server.c b/src/server/server.c new file mode 100644 index 0000000..10ab208 --- /dev/null +++ b/src/server/server.c @@ -0,0 +1,495 @@ + /* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + */ + +#include "server.h" +#include "helper.h" + +#include "locks.h" +#include "image.h" +#include "uplink.h" +#include "net.h" +#include "altservers.h" +#include "integrity.h" +#include "threadpool.h" +#include "rpc.h" + +#include "../version.h" +#include "../shared/sockhelper.h" +#include "../shared/timing.h" + +#include +#include +#include + +#define LONGOPT_CRC4 1000 +#define LONGOPT_ASSERT 1001 +#define LONGOPT_CREATE 1002 +#define LONGOPT_REVISION 1003 +#define LONGOPT_SIZE 1004 +#define LONGOPT_ERRORMSG 1005 + +static poll_list_t *listeners = NULL; + +/** + * Time the server was started + */ +static ticks startupTime; +static bool sigReload = false, sigLogCycle = false; + +/** + * Copied to in signal handler so we can print info + * later on + */ +static siginfo_t lastSignal; + +void printSignal(); + +static poll_list_t* setupNetwork(char *bindAddress); + +static dnbd3_client_t* dnbd3_prepareClient(struct sockaddr_storage *client, int fd); + +static void dnbd3_handleSignal(int signum); + +static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data); + +static void* server_asyncImageListLoad(void *data); + +/** + * Print help text for usage instructions + */ +void dnbd3_printHelp(char *argv_0) +{ + printf( "Version: %s\n\n", VERSION_STRING ); + printf( "Usage: %s [OPTIONS]...\n", argv_0 ); + printf( "Start the DNBD3 server\n" ); + printf( "-c or --config Configuration directory (default /etc/dnbd3-server/)\n" ); + printf( "-n or --nodaemon Start server in foreground\n" ); + printf( "-b or --bind Local Address to bind to\n" ); + printf( "-h or --help Show this help text and quit\n" ); + printf( "-v or --version Show version and quit\n" ); + printf( "\nManagement functions:\n" ); + printf( "--crc [image-file] Generate crc block list for given image\n" ); + printf( "--create [image-name] --revision [rid] --size [filesize]\n" + "\tCreate a local empty image file with a zeroed cache-map for the specified image\n" ); + printf( "--errormsg [text] 
Just serve given error message via HTTP, no service otherwise\n" ); + printf( "\n" ); + exit( 0 ); +} + +/** + * Print version information + */ +void dnbd3_printVersion() +{ + printf( "Version: %s\n", VERSION_STRING ); + exit( 0 ); +} + +/** + * Clean up structs, connections, write out data, then exit + */ +void dnbd3_cleanup() +{ + int retries; + + _shutdown = true; + logadd( LOG_INFO, "Cleanup..." ); + + if ( listeners != NULL ) sock_destroyPollList( listeners ); + listeners = NULL; + + // Kill connection to all clients + net_disconnectAll(); + + // Disable threadpool + threadpool_close(); + + // Terminate the altserver checking thread + altservers_shutdown(); + + // Terminate all uplinks + image_killUplinks(); + + // Terminate integrity checker + integrity_shutdown(); + + // Wait for clients to disconnect + net_waitForAllDisconnected(); + + // Watchdog not needed anymore + debug_locks_stop_watchdog(); + + // Clean up images + retries = 5; + while ( !image_tryFreeAll() && --retries > 0 ) { + logadd( LOG_INFO, "Waiting for images to free...\n" ); + sleep( 1 ); + } + + free( _basePath ); + free( _configDir ); + exit( EXIT_SUCCESS ); +} + +/** + * Program entry point + */ +int main(int argc, char *argv[]) +{ + int demonize = 1; + int opt = 0; + int longIndex = 0; + char *paramCreate = NULL; + char *bindAddress = NULL; + char *errorMsg = NULL; + int64_t paramSize = -1; + int paramRevision = -1; + static const char *optString = "b:c:d:hnv?"; + static const struct option longOpts[] = { + { "config", required_argument, NULL, 'c' }, + { "nodaemon", no_argument, NULL, 'n' }, + { "reload", no_argument, NULL, 'r' }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + { "bind", required_argument, NULL, 'b' }, + { "crc", required_argument, NULL, LONGOPT_CRC4 }, + { "assert", no_argument, NULL, LONGOPT_ASSERT }, + { "create", required_argument, NULL, LONGOPT_CREATE }, + { "revision", required_argument, NULL, LONGOPT_REVISION }, + { "size", 
required_argument, NULL, LONGOPT_SIZE }, + { "errormsg", required_argument, NULL, LONGOPT_ERRORMSG }, + { 0, 0, 0, 0 } + }; + + opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); + + while ( opt != -1 ) { + switch ( opt ) { + case 'c': + _configDir = strdup( optarg ); + break; + case 'n': + demonize = 0; + break; + case 'h': + case '?': + dnbd3_printHelp( argv[0] ); + break; + case 'v': + dnbd3_printVersion(); + break; + case 'b': + bindAddress = strdup( optarg ); + break; + case LONGOPT_CRC4: + return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE; + case LONGOPT_ASSERT: + printf( "Testing a failing assertion:\n" ); + assert( 4 == 5 ); + printf( "Assertion 4 == 5 seems to hold. ;-)\n" ); + return EXIT_SUCCESS; + case LONGOPT_CREATE: + paramCreate = strdup( optarg ); + break; + case LONGOPT_REVISION: + paramRevision = atoi( optarg ); + break; + case LONGOPT_SIZE: + paramSize = strtoll( optarg, NULL, 10 ); + break; + case LONGOPT_ERRORMSG: + errorMsg = strdup( optarg ); + break; + } + opt = getopt_long( argc, argv, optString, longOpts, &longIndex ); + } + + // Load general config + + if ( _configDir == NULL ) _configDir = strdup( "/etc/dnbd3-server" ); + globals_loadConfig(); + if ( _basePath == NULL && errorMsg == NULL ) { + logadd( LOG_ERROR, "Aborting, set proper basePath in %s/%s", _configDir, CONFIG_FILENAME ); + exit( EXIT_FAILURE ); + } + + timing_setBase(); + timing_get( &startupTime ); + +#ifdef AFL_MODE + // ###### AFL + // + image_serverStartup(); + net_init(); + uplink_globalsInit(); + rpc_init(); + if ( !image_loadAll( NULL ) || _shutdown ) { + fprintf( stderr, "Error loading images\n" ); + exit( 3 ); + } + { + struct sockaddr_storage client; + memset( &client, 0, sizeof client ); + client.ss_family = AF_INET; + dnbd3_client_t *dnbd3_client = dnbd3_prepareClient( &client, 1 ); + if ( dnbd3_client == NULL ) { + fprintf( stderr, "New client failed\n" ); + exit( 1 ); + } +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + 
net_handleNewConnection( dnbd3_client ); + exit( 0 ); + } + // + // ###### AFL END +#endif + + + // One-shots first: + + if ( paramCreate != NULL ) { + return image_create( paramCreate, paramRevision, paramSize ) ? 0 : EXIT_FAILURE; + } + + // No one-shot detected, normal server operation or errormsg serving + if ( demonize ) { + logadd( LOG_INFO, "Forking into background, see log file for further information" ); + daemon( 1, 0 ); + } + if ( errorMsg != NULL ) { + setupNetwork( bindAddress ); + logadd( LOG_INFO, "Running errormsg server" ); + while ( true ) { + const int fd = sock_accept( listeners, NULL, NULL ); + if ( fd >= 0 ) { + rpc_sendErrorMessage( fd, errorMsg ); + } else { + const int err = errno; + if ( err == EINTR || err == EAGAIN ) continue; + logadd( LOG_ERROR, "Client accept failure (err=%d)", err ); + usleep( 10000 ); // 10ms + } + } + exit( 0 ); + } + image_serverStartup(); + altservers_init(); + integrity_init(); + net_init(); + uplink_globalsInit(); + rpc_init(); + logadd( LOG_INFO, "DNBD3 server starting.... Machine type: " ENDIAN_MODE ); + + if ( altservers_load() < 0 ) { + logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir ); + } + +#ifdef _DEBUG + debug_locks_start_watchdog(); +#endif + + // setup signal handler + struct sigaction sa; + memset( &sa, 0, sizeof(sa) ); + sa.sa_sigaction = dnbd3_handleSignal2; + sa.sa_flags = SA_SIGINFO; + //sa.sa_mask = ; + sigaction( SIGTERM, &sa, NULL ); + sigaction( SIGINT, &sa, NULL ); + sigaction( SIGUSR1, &sa, NULL ); + sigaction( SIGHUP, &sa, NULL ); + sigaction( SIGUSR2, &sa, NULL ); + signal( SIGPIPE, SIG_IGN ); + + logadd( LOG_INFO, "Loading images...." ); + // Load all images in base path + if ( !image_loadAll( NULL ) || _shutdown ) { + if ( _shutdown ) { + logadd( LOG_ERROR, "Received shutdown request while loading images." ); + } else { + logadd( LOG_ERROR, "Could not load images." ); + } + free( bindAddress ); + dnbd3_cleanup(); + return _shutdown ? 
0 : 1; + } + + // Give other threads some time to start up before accepting connections + usleep( 100000 ); + + // setup network + listeners = setupNetwork( bindAddress ); + + // Initialize thread pool + if ( !threadpool_init( 8 ) ) { + logadd( LOG_ERROR, "Could not init thread pool!\n" ); + exit( EXIT_FAILURE ); + } + + logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING ); + + // +++++++++++++++++++++++++++++++++++++++++++++++++++ main loop + struct sockaddr_storage client; + socklen_t len; + int fd; + while ( !_shutdown ) { + // Handle signals + printSignal(); + if ( sigReload ) { + sigReload = false; + logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" ); + threadpool_run( &server_asyncImageListLoad, NULL ); + } + if ( sigLogCycle ) { + sigLogCycle = false; + logadd( LOG_INFO, "SIGUSR2 received, reopening log file..." ); + if ( log_openLogFile( NULL ) ) + logadd( LOG_INFO, "Log file has been reopened." ); + else + logadd( LOG_WARNING, "Could not cycle log file." ); + } + // + len = sizeof(client); + fd = sock_accept( listeners, &client, &len ); + if ( fd < 0 ) { + const int err = errno; + if ( err == EINTR || err == EAGAIN ) continue; + logadd( LOG_ERROR, "Client accept failure (err=%d)", err ); + usleep( 10000 ); // 10ms + continue; + } + + dnbd3_client_t *dnbd3_client = dnbd3_prepareClient( &client, fd ); + if ( dnbd3_client == NULL ) { + close( fd ); + continue; + } + + if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) { + logadd( LOG_ERROR, "Could not start thread for new connection." 
); + free( dnbd3_client ); + continue; + } + } + printSignal(); + free( bindAddress ); + dnbd3_cleanup(); + return 0; +} + +void printSignal() +{ + if ( lastSignal.si_signo != 0 ) { + logadd( LOG_INFO, "Signal %d (via %d) by pid %u, uid %u", + lastSignal.si_signo, lastSignal.si_code, + (unsigned int)lastSignal.si_pid, (unsigned int)lastSignal.si_uid ); + if ( lastSignal.si_pid != 0 ) { + char buffer[500], path[100]; + snprintf( path, sizeof(path), "/proc/%u/exe", (unsigned int)lastSignal.si_pid ); + ssize_t len = readlink( path, buffer, sizeof(buffer) ); + if ( len > 0 ) { + logadd( LOG_INFO, "%u is %.*s", (unsigned int)lastSignal.si_pid, (int)len, buffer ); + } + } + lastSignal.si_signo = 0; + } +} + +static poll_list_t* setupNetwork(char *bindAddress) +{ + listeners = sock_newPollList(); + if ( listeners == NULL ) { + logadd( LOG_ERROR, "Didnt get a poll list!" ); + exit( EXIT_FAILURE ); + } + if ( !sock_listen( listeners, bindAddress, (uint16_t)_listenPort ) ) { + logadd( LOG_ERROR, "Could not listen on any local interface." ); + exit( EXIT_FAILURE ); + } + return listeners; +} + +/** + * Initialize and partially populate the client struct - called when an incoming + * connection is accepted. As this might be an HTTP request we don't initialize the + * locks, that would happen later once we know. + */ +static dnbd3_client_t* dnbd3_prepareClient(struct sockaddr_storage *client, int fd) +{ + dnbd3_client_t *dnbd3_client = calloc( 1, sizeof(dnbd3_client_t) ); + if ( dnbd3_client == NULL ) { // This will never happen thanks to memory overcommit + logadd( LOG_ERROR, "Could not alloc dnbd3_client_t for new client." 
); + return NULL; + } + + if ( client->ss_family == AF_INET ) { + struct sockaddr_in *v4 = (struct sockaddr_in *)client; + dnbd3_client->host.type = HOST_IP4; + memcpy( dnbd3_client->host.addr, &(v4->sin_addr), 4 ); + dnbd3_client->host.port = v4->sin_port; + } else if ( client->ss_family == AF_INET6 ) { + struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)client; + dnbd3_client->host.type = HOST_IP6; + memcpy( dnbd3_client->host.addr, &(v6->sin6_addr), 16 ); + dnbd3_client->host.port = v6->sin6_port; + } else { + logadd( LOG_ERROR, "New client has unknown address family %d, disconnecting...", (int)client->ss_family ); + free( dnbd3_client ); + return NULL; + } + dnbd3_client->sock = fd; + return dnbd3_client; +} + +static void dnbd3_handleSignal(int signum) +{ + if ( _shutdown ) return; + if ( signum == SIGINT || signum == SIGTERM ) { + _shutdown = true; + } else if ( signum == SIGUSR1 || signum == SIGHUP ) { + sigReload = true; + } else if ( signum == SIGUSR2 ) { + sigLogCycle = true; + } +} + +static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED) +{ + memcpy( &lastSignal, info, sizeof(siginfo_t) ); + dnbd3_handleSignal( signum ); +} + +uint32_t dnbd3_serverUptime() +{ + ticks now; + timing_get( &now ); + return timing_diff( &startupTime, &now ); +} + +static void* server_asyncImageListLoad(void *data UNUSED) +{ + setThreadName( "img-list-loader" ); + globals_loadConfig(); + image_loadAll( NULL ); + return NULL; +} + diff --git a/src/server/server.h b/src/server/server.h new file mode 100644 index 0000000..bab8421 --- /dev/null +++ b/src/server/server.h @@ -0,0 +1,34 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). 
+ * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef SERVER_H_ +#define SERVER_H_ + +#include "globals.h" +#include "../types.h" + +void dnbd3_cleanup(); +uint32_t dnbd3_serverUptime(); + +#if !defined(_FILE_OFFSET_BITS) || _FILE_OFFSET_BITS != 64 +#error Please set _FILE_OFFSET_BITS to 64 in your makefile/configuration +#endif + +#endif /* SERVER_H_ */ diff --git a/src/server/threadpool.c b/src/server/threadpool.c new file mode 100644 index 0000000..b55fe19 --- /dev/null +++ b/src/server/threadpool.c @@ -0,0 +1,126 @@ +#include "threadpool.h" +#include "globals.h" +#include "helper.h" +#include "locks.h" + +typedef struct _entry_t { + struct _entry_t *next; + pthread_t thread; + dnbd3_signal_t* signal; + void *(*startRoutine)(void *); + void * arg; +} entry_t; + +static void *threadpool_worker(void *entryPtr); + +static pthread_attr_t threadAttrs; + +static int maxIdleThreads = -1; +static entry_t *pool = NULL; +static pthread_spinlock_t poolLock; + +bool threadpool_init(int maxIdle) +{ + if ( maxIdle < 0 || maxIdleThreads >= 0 ) return false; + spin_init( &poolLock, PTHREAD_PROCESS_PRIVATE ); + maxIdleThreads = maxIdle; + pthread_attr_init( &threadAttrs ); + pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED ); + return true; +} + +void threadpool_close() +{ + _shutdown = true; + if ( maxIdleThreads < 0 ) return; + spin_lock( &poolLock ); + maxIdleThreads = -1; + entry_t *ptr = pool; + while ( ptr != NULL ) { + entry_t *current = ptr; + ptr = ptr->next; + signal_call( current->signal ); + } + spin_unlock( &poolLock 
); + spin_destroy( &poolLock ); +} + +bool threadpool_run(void *(*startRoutine)(void *), void *arg) +{ + spin_lock( &poolLock ); + entry_t *entry = pool; + if ( entry != NULL ) pool = entry->next; + spin_unlock( &poolLock ); + if ( entry == NULL ) { + entry = (entry_t*)malloc( sizeof(entry_t) ); + if ( entry == NULL ) { + logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" ); + return false; + } + entry->signal = signal_newBlocking(); + if ( entry->signal == NULL ) { + logadd( LOG_WARNING, "Could not create signal for new thread pool thread\n" ); + free( entry ); + return false; + } + if ( 0 != thread_create( &(entry->thread), &threadAttrs, threadpool_worker, (void*)entry ) ) { + logadd( LOG_WARNING, "Could not create new thread for thread pool\n" ); + signal_close( entry->signal ); + free( entry ); + return false; + } + } + entry->next = NULL; + entry->startRoutine = startRoutine; + entry->arg = arg; + signal_call( entry->signal ); + return true; +} + +/** + * This is a worker thread of our thread pool. + */ +static void *threadpool_worker(void *entryPtr) +{ + blockNoncriticalSignals(); + entry_t *entry = (entry_t*)entryPtr; + for ( ;; ) { + // Wait for signal from outside that we have work to do + int ret = signal_clear( entry->signal ); + if ( _shutdown ) break; + if ( ret > 0 ) { + if ( entry->startRoutine == NULL ) { + logadd( LOG_DEBUG1, "Worker woke up but has no work to do!" 
); + continue; + } + // Start assigned work + (*entry->startRoutine)( entry->arg ); + // Reset vars for safety + entry->startRoutine = NULL; + entry->arg = NULL; + if ( _shutdown ) break; + // Put thread back into pool if there are less than maxIdleThreds threads, just die otherwise + int threadCount = 0; + spin_lock( &poolLock ); + entry_t *ptr = pool; + while ( ptr != NULL ) { + threadCount++; + ptr = ptr->next; + } + if ( threadCount >= maxIdleThreads ) { + spin_unlock( &poolLock ); + break; + } + entry->next = pool; + pool = entry; + spin_unlock( &poolLock ); + setThreadName( "[pool]" ); + } else { + logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret ); + } + } + signal_close( entry->signal ); + free( entry ); + return NULL; +} + diff --git a/src/server/threadpool.h b/src/server/threadpool.h new file mode 100644 index 0000000..15dd151 --- /dev/null +++ b/src/server/threadpool.h @@ -0,0 +1,29 @@ +#ifndef _THREADPOOL_H_ +#define _THREADPOOL_H_ + +#include "../types.h" + +/** + * Initialize the thread pool. This must be called before using + * threadpool_run, and must only be called once. + * @param maxIdleThreadCount maximum number of idle threads in the pool + * @return true if initialized successfully + */ +bool threadpool_init(int maxIdleThreadCount); + +/** + * Shut down threadpool. + * Only call if it has been initialized before. + */ +void threadpool_close(); + +/** + * Run a thread using the thread pool. 
+ * @param startRoutine function to run in new thread + * @param arg argument to pass to thead + * @return true if thread was started + */ +bool threadpool_run(void *(*startRoutine)(void *), void *arg); + +#endif + diff --git a/src/server/uplink.c b/src/server/uplink.c new file mode 100644 index 0000000..31b220d --- /dev/null +++ b/src/server/uplink.c @@ -0,0 +1,1034 @@ +#include "uplink.h" +#include "helper.h" +#include "locks.h" +#include "image.h" +#include "altservers.h" +#include "../shared/sockhelper.h" +#include "../shared/protocol.h" +#include "../shared/timing.h" +#include "../shared/crc32.h" + +#include +#include +#include +#include +#include +#include + +#define FILE_BYTES_PER_MAP_BYTE ( DNBD3_BLOCK_SIZE * 8 ) +#define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE ) +#define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) ) + +#define REP_NONE ( (uint64_t)0xffffffffffffffff ) + +static atomic_uint_fast64_t totalBytesReceived = 0; + +static void* uplink_mainloop(void *data); +static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly); +static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int lastBlockIndex); +static void uplink_handleReceive(dnbd3_connection_t *link); +static int uplink_sendKeepalive(const int fd); +static void uplink_addCrc32(dnbd3_connection_t *uplink); +static void uplink_sendReplicationRequest(dnbd3_connection_t *link); +static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force); +static bool uplink_saveCacheMap(dnbd3_connection_t *link); +static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link); +static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew); + +// ############ uplink connection handling + +void uplink_globalsInit() +{ +} + +uint64_t uplink_getTotalBytesReceived() +{ + return (uint64_t)totalBytesReceived; +} + +/** + * Create and initialize an uplink instance for the given + * image. 
Uplinks run in their own thread. + * Locks on: _images[].lock + */ +bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version) +{ + if ( !_isProxy || _shutdown ) return false; + dnbd3_connection_t *link = NULL; + assert( image != NULL ); + spin_lock( &image->lock ); + if ( image->uplink != NULL && !image->uplink->shutdown ) { + spin_unlock( &image->lock ); + if ( sock >= 0 ) close( sock ); + return true; // There's already an uplink, so should we consider this success or failure? + } + if ( image->cache_map == NULL ) { + logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name ); + goto failure; + } + link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) ); + spin_init( &link->queueLock, PTHREAD_PROCESS_PRIVATE ); + spin_init( &link->rttLock, PTHREAD_PROCESS_PRIVATE ); + link->image = image; + link->bytesReceived = 0; + link->idleTime = 0; + link->queueLen = 0; + link->fd = -1; + link->cacheFd = -1; + link->signal = NULL; + link->replicationHandle = REP_NONE; + spin_lock( &link->rttLock ); + link->cycleDetected = false; + if ( sock >= 0 ) { + link->betterFd = sock; + link->betterServer = *host; + link->rttTestResult = RTT_DOCHANGE; + link->betterVersion = version; + } else { + link->betterFd = -1; + link->rttTestResult = RTT_IDLE; + } + spin_unlock( &link->rttLock ); + link->recvBufferLen = 0; + link->shutdown = false; + if ( 0 != thread_create( &(link->thread), NULL, &uplink_mainloop, (void *)link ) ) { + logadd( LOG_ERROR, "Could not start thread for new uplink." ); + goto failure; + } + spin_unlock( &image->lock ); + return true; +failure: ; + if ( link != NULL ) { + free( link ); + link = image->uplink = NULL; + } + spin_unlock( &image->lock ); + return false; +} + +/** + * Locks on image.lock, uplink.lock + * Calling it multiple times, even concurrently, will + * not break anything. 
/**
 * Request shutdown of the uplink of the given image and wait until it is gone.
 * Locks on image.lock, uplink.lock
 * Calling it multiple times, even concurrently, will
 * not break anything.
 *
 * @param image image whose uplink should be shut down; must not be NULL
 */
void uplink_shutdown(dnbd3_image_t *image)
{
	bool join = false;
	pthread_t thread;
	assert( image != NULL );
	spin_lock( &image->lock );
	if ( image->uplink == NULL ) {
		spin_unlock( &image->lock );
		return;
	}
	dnbd3_connection_t * const uplink = image->uplink;
	spin_lock( &uplink->queueLock );
	if ( !uplink->shutdown ) {
		// We are the first caller to request shutdown: flag it, wake the
		// uplink thread and remember to join it below
		uplink->shutdown = true;
		signal_call( uplink->signal );
		thread = uplink->thread;
		join = true;
	}
	spin_unlock( &uplink->queueLock );
	bool wait = image->uplink != NULL;
	spin_unlock( &image->lock );
	if ( join ) thread_join( thread, NULL );
	// Poll until image->uplink is NULL or points to an uplink that is not
	// flagged for shutdown
	while ( wait ) {
		usleep( 5000 );
		spin_lock( &image->lock );
		wait = image->uplink != NULL && image->uplink->shutdown;
		spin_unlock( &image->lock );
	}
}

/**
 * Remove given client from uplink request queue
 * Locks on: uplink.queueLock
 *
 * @param uplink uplink whose queue is searched
 * @param client client whose queue entries are released
 */
void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client)
{
	spin_lock( &uplink->queueLock );
	// Walk from the end so that trailing released slots can shrink queueLen
	for (int i = uplink->queueLen - 1; i >= 0; --i) {
		if ( uplink->queue[i].client == client ) {
			uplink->queue[i].client = NULL;
			uplink->queue[i].status = ULR_FREE;
		}
		if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--;
	}
	spin_unlock( &uplink->queueLock );
}

/**
 * Request a chunk of data through an uplink server
 * Locks on: image.lock, uplink.queueLock
 *
 * @param client client the request is made for; its image's uplink is used
 * @param handle value stored in the queue entry's handle field
 * @param start  first byte of the requested range
 * @param length number of bytes requested; must not exceed _maxPayload
 * @param hops   hop count of the incoming request, used for cycle detection
 * @return true if the request was put into the uplink queue, false if it was
 *         rejected (no client/image, too long, no usable uplink, suspected
 *         proxy cycle, or queue full)
 */
bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
{
	if ( client == NULL || client->image == NULL ) return false;
	if ( length > (uint32_t)_maxPayload ) {
		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
		return false;
	}
	spin_lock( &client->image->lock );
	if ( client->image->uplink == NULL ) {
		spin_unlock( &client->image->lock );
		logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
		return false;
	}
	dnbd3_connection_t * const uplink = client->image->uplink;
	if ( uplink->shutdown ) {
		spin_unlock( &client->image->lock );
		logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
		return false;
	}
	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
	// This might be a false positive if there are multiple instances running on the same host (IP)
	if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) {
		spin_unlock( &client->image->lock );
		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
		spin_lock( &uplink->rttLock );
		uplink->cycleDetected = true;
		spin_unlock( &uplink->rttLock );
		signal_call( uplink->signal );
		return false;
	}

	int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
	int existingType = -1; // ULR_* type of existing request
	int i;
	int freeSlot = -1;
	bool requestLoop = false;
	const uint64_t end = start + length;

	// Take the queue lock before releasing the image lock (hand-over-hand)
	spin_lock( &uplink->queueLock );
	spin_unlock( &client->image->lock );
	// One pass over the queue: remember the first free slot and look for an
	// active request that already covers [start, end)
	for (i = 0; i < uplink->queueLen; ++i) {
		if ( freeSlot == -1 && uplink->queue[i].status == ULR_FREE ) {
			freeSlot = i;
			continue;
		}
		if ( uplink->queue[i].status != ULR_PENDING && uplink->queue[i].status != ULR_NEW ) continue;
		if ( uplink->queue[i].from <= start && uplink->queue[i].to >= end ) {
			// Exactly the same range with a higher hop count is treated as a loop
			if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end ) {
				requestLoop = true;
				break;
			}
			// Keep looking while the match so far is only ULR_PENDING
			if ( foundExisting == -1 || existingType == ULR_PENDING ) {
				foundExisting = i;
				existingType = uplink->queue[i].status;
				if ( freeSlot != -1 ) break;
			}
		}
	}
	if ( requestLoop ) {
		spin_unlock( &uplink->queueLock );
		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
		spin_lock( &uplink->rttLock );
		uplink->cycleDetected = true;
		spin_unlock( &uplink->rttLock );
		signal_call( uplink->signal );
		return false;
	}
	if ( freeSlot == -1 ) {
		// No reusable slot, append at the end if there is room left
		if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
			spin_unlock( &uplink->queueLock );
			logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
			return false;
		}
		freeSlot = uplink->queueLen++;
	}
	// Do not send request to uplink server if we have a matching pending request AND the request either has the
	// status ULR_NEW OR we found a free slot with LOWER index than the one we attach to. Otherwise
	// explicitly send this request to the uplink server. The second condition mentioned here is to prevent
	// a race condition where the reply for the outstanding request already arrived and the uplink thread
	// is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
	// already have passed the index of the free slot we determined, but not reached the existing request we just found above.
	if ( foundExisting != -1 && existingType != ULR_NEW && freeSlot > foundExisting ) foundExisting = -1; // -1 means "send request"
#ifdef _DEBUG
	if ( foundExisting != -1 ) {
		logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, existingType == ULR_NEW ? "ULR_NEW" : "ULR_PENDING", foundExisting, freeSlot );
		logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
				"New %" PRIu64 "-%" PRIu64 " (%p)\n",
				uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
				start, end, (void*)client );
	}
#endif
	// Fill structure
	uplink->queue[freeSlot].from = start;
	uplink->queue[freeSlot].to = end;
	uplink->queue[freeSlot].handle = handle;
	uplink->queue[freeSlot].client = client;
	//int old = uplink->queue[freeSlot].status;
	// ULR_NEW entries still have to be sent; ULR_PENDING ones attach to the existing request
	uplink->queue[freeSlot].status = (foundExisting == -1 ? ULR_NEW : ULR_PENDING);
	uplink->queue[freeSlot].hopCount = hops;
#ifdef _DEBUG
	timing_get( &uplink->queue[freeSlot].entered );
	//logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
#endif
	spin_unlock( &uplink->queueLock );

	if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed
		if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
			logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
		}
	}
	return true;
}
 * Locks are irrelevant as this is never called from another function
 *
 * Outline of one loop iteration:
 *  1. poll() on the wake-up signal and on the uplink socket
 *  2. if the RTT check asked for a server switch (RTT_DOCHANGE), adopt the
 *     connection it prepared (betterFd/betterServer/betterVersion)
 *  3. handle the wake-up signal (relay queued requests) and socket events
 *     (receive replies, or treat the connection as failed)
 *  4. every SERVER_UPLINK_KEEPALIVE_INTERVAL seconds: idle accounting, saving
 *     of the cache map, keep-alive, closing of an idle connection
 *  5. trigger a new RTT measurement when one is due
 * The cleanup section at the end releases everything owned by the uplink,
 * frees it, and may re-create an uplink for the image.
 *
 * @param data the dnbd3_connection_t of this uplink, passed as void*
 * @return always NULL
 */
static void* uplink_mainloop(void *data)
{
// Indexes into the pollfd array below
#define EV_SIGNAL (0)
#define EV_SOCKET (1)
#define EV_COUNT  (2)
	struct pollfd events[EV_COUNT];
	dnbd3_connection_t * const link = (dnbd3_connection_t*)data;
	int numSocks, i, waitTime;
	int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
	uint32_t discoverFailCount = 0; // Consecutive RTT_NOT_REACHABLE results, reset on server switch
	uint32_t unsavedSeconds = 0; // Time accumulated since the cache map was last saved
	ticks nextAltCheck, lastKeepalive;
	char buffer[200];
	memset( events, 0, sizeof(events) );
	timing_get( &nextAltCheck );
	lastKeepalive = nextAltCheck;
	//
	assert( link != NULL );
	setThreadName( "idle-uplink" );
	blockNoncriticalSignals();
	// Make sure file is open for writing
	if ( !uplink_reopenCacheFd( link, false ) ) {
		// It might have failed - still offer proxy mode, we just can't cache
		logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", link->image->path, errno );
	}
	//
	link->signal = signal_new();
	if ( link->signal == NULL ) {
		logadd( LOG_WARNING, "error creating signal. Uplink unavailable." );
		goto cleanup;
	}
	events[EV_SIGNAL].events = POLLIN;
	events[EV_SIGNAL].fd = signal_getWaitFd( link->signal );
	events[EV_SOCKET].fd = -1;
	while ( !_shutdown && !link->shutdown ) {
		// poll()
		// Pick the timeout: 0 if a server switch is waiting to be applied,
		// 1s while disconnected (and not idle-closed), otherwise the time
		// until the next alt server check, clamped to 100ms..5s
		spin_lock( &link->rttLock );
		waitTime = link->rttTestResult == RTT_DOCHANGE ? 0 : -1;
		spin_unlock( &link->rttLock );
		if ( waitTime == 0 ) {
			// Nothing
		} else if ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) {
			waitTime = 1000;
		} else {
			declare_now;
			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
			if ( waitTime < 100 ) waitTime = 100;
			if ( waitTime > 5000 ) waitTime = 5000;
		}
		events[EV_SOCKET].fd = link->fd;
		numSocks = poll( events, EV_COUNT, waitTime );
		if ( _shutdown || link->shutdown ) goto cleanup;
		if ( numSocks == -1 ) { // Error?
			if ( errno == EINTR ) continue;
			logadd( LOG_DEBUG1, "poll() error %d", (int)errno );
			usleep( 10000 );
			continue;
		}
		// Check if server switch is in order
		spin_lock( &link->rttLock );
		if ( link->rttTestResult != RTT_DOCHANGE ) {
			spin_unlock( &link->rttLock );
		} else {
			link->rttTestResult = RTT_IDLE;
			// The rttTest worker thread has finished our request.
			// And says it's better to switch to another server
			const int fd = link->fd;
			link->fd = link->betterFd;
			link->betterFd = -1;
			link->currentServer = link->betterServer;
			link->version = link->betterVersion;
			link->cycleDetected = false;
			spin_unlock( &link->rttLock );
			discoverFailCount = 0;
			// Close the previous connection only after the new one is in place
			if ( fd != -1 ) close( fd );
			link->replicationHandle = REP_NONE;
			link->image->working = true;
			link->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
			// Thread name becomes '@' followed by the server address
			buffer[0] = '@';
			if ( host_to_string( &link->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) {
				logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", link->image->name, buffer + 1 );
				setThreadName( buffer );
			}
			// If we don't have a crc32 list yet, see if the new server has one
			if ( link->image->crc32 == NULL ) {
				uplink_addCrc32( link );
			}
			// Re-send all pending requests
			uplink_sendRequests( link, false );
			uplink_sendReplicationRequest( link );
			events[EV_SOCKET].events = POLLIN | POLLRDHUP;
			timing_gets( &nextAltCheck, altCheckInterval );
			// The rtt worker already did the handshake for our image, so there's nothing
			// more to do here
		}
		// Check events
		// Signal
		if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
			logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
			goto cleanup;
		} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
			// signal triggered -> pending requests
			if ( signal_clear( link->signal ) == SIGNAL_ERROR ) {
				logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", link->image->name );
			}
			if ( link->fd != -1 ) {
				// Uplink seems fine, relay requests to it...
				uplink_sendRequests( link, true );
			} else { // No uplink; maybe it was shutdown since it was idle for too long
				link->idleTime = 0;
			}
		}
		// Uplink socket
		if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
			uplink_connectionFailed( link, true );
			logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" );
			setThreadName( "panic-uplink" );
		} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
			uplink_handleReceive( link );
			if ( _shutdown || link->shutdown ) goto cleanup;
		}
		// Periodic housekeeping; "now" declared here is also used by the RTT
		// scheduling and the debug block further down
		declare_now;
		uint32_t timepassed = timing_diff( &lastKeepalive, &now );
		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
			lastKeepalive = now;
			link->idleTime += timepassed;
			unsavedSeconds += timepassed;
			if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && link->idleTime >= 20 && link->idleTime <= 70 ) ) {
				// fsync/save every 4 minutes, or every 60 seconds if link is idle
				unsavedSeconds = 0;
				uplink_saveCacheMap( link );
			}
			// Keep-alive
			if ( link->fd != -1 && link->replicationHandle == REP_NONE ) {
				// Send keep-alive if nothing is happening
				if ( uplink_sendKeepalive( link->fd ) ) {
					// Re-trigger periodically, in case it requires a minimum user count
					uplink_sendReplicationRequest( link );
				} else {
					uplink_connectionFailed( link, true );
					logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
					setThreadName( "panic-uplink" );
				}
			}
			// Don't keep link established if we're idle for too much
			if ( link->fd != -1 && uplink_connectionShouldShutdown( link ) ) {
				close( link->fd );
				link->fd = events[EV_SOCKET].fd = -1;
				link->cycleDetected = false;
				// Release the receive buffer while the connection is closed
				if ( link->recvBufferLen != 0 ) {
					link->recvBufferLen = 0;
					free( link->recvBuffer );
					link->recvBuffer = NULL;
				}
				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", link->image->name, (int)link->image->rid );
				setThreadName( "idle-uplink" );
			}
		}
		// See if we should trigger an RTT measurement
		spin_lock( &link->rttLock );
		const int rttTestResult = link->rttTestResult;
		spin_unlock( &link->rttLock );
		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
			if ( timing_reached( &nextAltCheck, &now ) || ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) || link->cycleDetected ) {
				// It seems it's time for a check
				if ( image_isComplete( link->image ) ) {
					// Quit work if image is complete
					logadd( LOG_INFO, "Replication of %s complete.", link->image->name );
					setThreadName( "finished-uplink" );
					goto cleanup;
				} else if ( !uplink_connectionShouldShutdown( link ) ) {
					// Not complete - do measurement
					altservers_findUplink( link ); // This will set RTT_INPROGRESS (synchronous)
					if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) {
						link->nextReplicationIndex = 0;
					}
				}
				// Back off: the interval grows by one per check up to SERVER_RTT_INTERVAL_MAX
				altCheckInterval = MIN(altCheckInterval + 1, SERVER_RTT_INTERVAL_MAX);
				timing_set( &nextAltCheck, &now, altCheckInterval );
			}
		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
			spin_lock( &link->rttLock );
			link->rttTestResult = RTT_IDLE;
			spin_unlock( &link->rttLock );
			discoverFailCount++;
			// After SERVER_RTT_BACKOFF_COUNT failures in a row, use the (longer) failure interval
			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
		}
#ifdef _DEBUG
		// Debug aid: report queue slots that have been occupied for more than 10 seconds
		if ( link->fd != -1 && !link->shutdown ) {
			bool resend = false;
			ticks deadline;
			timing_set( &deadline, &now, -10 );
			spin_lock( &link->queueLock );
			for (i = 0; i < link->queueLen; ++i) {
				if ( link->queue[i].status != ULR_FREE && timing_reached( &link->queue[i].entered, &deadline ) ) {
					snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name,
							link->queue[i].from, link->queue[i].to, link->queue[i].status );
					link->queue[i].entered = now;
#ifdef _DEBUG_RESEND_STARVING
					link->queue[i].status = ULR_NEW;
					resend = true;
#endif
					// Message was formatted under the lock; log it without holding the spinlock
					spin_unlock( &link->queueLock );
					logadd( LOG_WARNING, "%s", buffer );
					spin_lock( &link->queueLock );
				}
			}
			spin_unlock( &link->queueLock );
			if ( resend )
				uplink_sendRequests( link, true );
		}
#endif
	}
	cleanup: ;
	altservers_removeUplink( link );
	uplink_saveCacheMap( link );
	// Detach from the image and take over fd and signal while holding both locks
	spin_lock( &link->image->lock );
	if ( link->image->uplink == link ) {
		link->image->uplink = NULL;
	}
	spin_lock( &link->queueLock );
	const int fd = link->fd;
	const dnbd3_signal_t* signal = link->signal;
	link->fd = -1;
	link->signal = NULL;
	if ( !link->shutdown ) {
		// Exiting on our own accord rather than on request: flag it and detach the thread
		link->shutdown = true;
		thread_detach( link->thread );
	}
	// Do not access link->image after unlocking, since we set
	// image->uplink to NULL. Acquire with image_lock first,
	// like done below when checking whether to re-init uplink
	spin_unlock( &link->image->lock );
	spin_unlock( &link->queueLock );
	if ( fd != -1 ) close( fd );
	if ( signal != NULL ) signal_close( signal );
	// Wait for the RTT check to finish/fail if it's in progress
	while ( link->rttTestResult == RTT_INPROGRESS )
		usleep( 10000 );
	if ( link->betterFd != -1 ) {
		close( link->betterFd );
	}
	spin_destroy( &link->queueLock );
	spin_destroy( &link->rttLock );
	free( link->recvBuffer );
	link->recvBuffer = NULL;
	if ( link->cacheFd != -1 ) {
		close( link->cacheFd );
	}
	dnbd3_image_t *image = image_lock( link->image );
	free( link ); // !!!
	if ( image != NULL ) {
		if ( !_shutdown && image->cache_map != NULL ) {
			// Integrity checker must have found something in the meantime
			uplink_init( image, -1, NULL, 0 );
		}
		image_release( image );
	}
	return NULL ;
}
Non-critical - if the connection dropped or the server was changed + // the thread will re-send this request as soon as the connection + // is reestablished. + logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" ); + altservers_serverFailed( &link->currentServer ); + return; + } + spin_lock( &link->queueLock ); + } + spin_unlock( &link->queueLock ); +} + +/** + * Send a block request to an uplink server without really having + * any client that needs that data. This will be used for background replication. + * + * We'll go through the cache map of the image and look for bytes that don't have + * all bits set. We then request the corresponding 8 blocks of 4kb from the uplink + * server. This means we might request data we already have, but it makes + * the code simpler. Worst case would be only one bit is zero, which means + * 4kb are missing, but we will request 32kb. + */ +static void uplink_sendReplicationRequest(dnbd3_connection_t *link) +{ + if ( link == NULL || link->fd == -1 ) return; + if ( _backgroundReplication == BGR_DISABLED || link->cacheFd == -1 ) return; // Don't do background replication + if ( link->nextReplicationIndex == -1 || link->replicationHandle != REP_NONE ) + return; + dnbd3_image_t * const image = link->image; + if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return; + spin_lock( &image->lock ); + if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) { + // No cache map (=image complete), or replication pending, or not enough users, do nothing + spin_unlock( &image->lock ); + return; + } + const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize ); + const int lastBlockIndex = mapBytes - 1; + int endByte; + if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks + endByte = link->nextReplicationIndex + mapBytes; + } else { // Hashblock based: Only look for match in current hash block + endByte = ( link->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & 
MAP_INDEX_HASH_START_MASK; + if ( endByte > mapBytes ) { + endByte = mapBytes; + } + } + int replicationIndex = -1; + for ( int j = link->nextReplicationIndex; j < endByte; ++j ) { + const int i = j % ( mapBytes ); // Wrap around for BGR_FULL + if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !link->replicatedLastBlock ) ) { + // Found incomplete one + replicationIndex = i; + break; + } + } + spin_unlock( &image->lock ); + if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) { + // Nothing left in current block, find next one + replicationIndex = uplink_findNextIncompleteHashBlock( link, endByte ); + } + if ( replicationIndex == -1 ) { + // Replication might be complete, uplink_mainloop should take care.... + link->nextReplicationIndex = -1; + return; + } + const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE; + link->replicationHandle = offset; + const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE ); + if ( !dnbd3_get_block( link->fd, offset, size, link->replicationHandle, COND_HOPCOUNT( link->version, 1 ) ) ) { + logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" ); + return; + } + if ( replicationIndex == lastBlockIndex ) { + link->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks + } + link->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter + if ( _backgroundReplication == BGR_HASHBLOCK + && link->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) { + // Just crossed a hash block boundary, look for new candidate starting at this very index + link->nextReplicationIndex = uplink_findNextIncompleteHashBlock( link, link->nextReplicationIndex ); + } +} + +/** + * find next index into cache_map that corresponds to the beginning + * of a hash block which is neither completely empty nor completely + * 
replicated yet. Returns -1 if no match. + */ +static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int startMapIndex) +{ + int retval = -1; + spin_lock( &link->image->lock ); + const int mapBytes = IMGSIZE_TO_MAPBYTES( link->image->virtualFilesize ); + const uint8_t *cache_map = link->image->cache_map; + if ( cache_map != NULL ) { + int j; + const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK ); + for (j = 0; j < mapBytes; ++j) { + const int i = ( start + j ) % mapBytes; + const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && link->replicatedLastBlock ); + const bool isEmpty = cache_map[i] == 0; + if ( !isEmpty && !isFull ) { + // Neither full nor empty, replicate + if ( retval == -1 ) { + retval = i; + } + break; + } + if ( ( i & MAP_INDEX_HASH_START_MASK ) == i ) { + // Reset state if we just crossed into the next hash chunk + retval = ( isEmpty ) ? ( i ) : ( -1 ); + } else if ( isFull ) { + if ( retval != -1 ) { + // It's a full one, previous one was empty -> replicate + break; + } + } else if ( isEmpty ) { + if ( retval == -1 ) { // Previous one was full -> replicate + retval = i; + break; + } + } + } + if ( j == mapBytes ) { // Nothing found, loop ran until end + retval = -1; + } + } + spin_unlock( &link->image->lock ); + return retval; +} + +/** + * Receive data from uplink server and process/dispatch + * Locks on: link.lock, images[].lock + */ +static void uplink_handleReceive(dnbd3_connection_t *link) +{ + dnbd3_reply_t inReply, outReply; + int ret, i; + for (;;) { + ret = dnbd3_read_reply( link->fd, &inReply, false ); + if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue; + if ( ret == REPLY_AGAIN ) break; + if ( unlikely( ret == REPLY_CLOSED ) ) { + logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path ); + goto error_cleanup; + } + if ( unlikely( ret == REPLY_WRONGMAGIC ) ) { + logadd( LOG_WARNING, "Uplink server's packet did not start with 
/**
 * Receive data from uplink server and process/dispatch
 * Locks on: link.lock, images[].lock
 *
 * Reads replies until dnbd3_read_reply reports REPLY_AGAIN. For every
 * CMD_GET_BLOCK reply the payload is
 *  1) written to the cache file and marked in the cache map,
 *  2) matched against the request queue,
 *  3) sent to every client whose queued request lies within the received range.
 * Any protocol or connection error ends in uplink_connectionFailed().
 */
static void uplink_handleReceive(dnbd3_connection_t *link)
{
	dnbd3_reply_t inReply, outReply;
	int ret, i;
	for (;;) {
		ret = dnbd3_read_reply( link->fd, &inReply, false );
		// Interrupted: retry unless we are shutting down (then falls through to the error path)
		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue;
		if ( ret == REPLY_AGAIN ) break;
		if ( unlikely( ret == REPLY_CLOSED ) ) {
			logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path );
			goto error_cleanup;
		}
		if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
			logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", link->image->path );
			goto error_cleanup;
		}
		if ( unlikely( ret != REPLY_OK ) ) {
			logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, link->image->path );
			goto error_cleanup;
		}
		if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
			logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, link->image->path );
			goto error_cleanup;
		}

		// Grow the receive buffer if needed: payload size plus 64KiB headroom, capped at _maxPayload
		if ( unlikely( link->recvBufferLen < inReply.size ) ) {
			link->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536);
			link->recvBuffer = realloc( link->recvBuffer, link->recvBufferLen );
			if ( link->recvBuffer == NULL ) {
				logadd( LOG_ERROR, "Out of memory when trying to allocate receive buffer for uplink" );
				exit( 1 );
			}
		}
		if ( unlikely( (uint32_t)sock_recv( link->fd, link->recvBuffer, inReply.size ) != inReply.size ) ) {
			logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", link->image->path );
			goto error_cleanup;
		}
		// Payload read completely
		// Bail out if we're not interested
		if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
		// Is a legit block reply
		struct iovec iov[2];
		// The handle of a request is the file offset it was sent for, so it
		// gives the position of the received payload within the image
		const uint64_t start = inReply.handle;
		const uint64_t end = inReply.handle + inReply.size;
		totalBytesReceived += inReply.size;
		link->bytesReceived += inReply.size;
		// 1) Write to cache file
		if ( unlikely( link->cacheFd == -1 ) ) {
			uplink_reopenCacheFd( link, false );
		}
		if ( likely( link->cacheFd != -1 ) ) {
			int err = 0;
			bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid
			uint32_t done = 0;
			ret = 0;
			while ( done < inReply.size ) {
				ret = (int)pwrite( link->cacheFd, link->recvBuffer + done, inReply.size - done, start + done );
				if ( unlikely( ret == -1 ) ) {
					err = errno;
					if ( err == EINTR ) continue;
					if ( err == ENOSPC || err == EDQUOT ) {
						// try to free 256MiB
						if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
						tryAgain = false;
						continue; // Success, retry write
					}
					if ( err == EBADF || err == EINVAL || err == EIO ) {
						if ( !tryAgain || !uplink_reopenCacheFd( link, true ) )
							break;
						tryAgain = false;
						continue; // Write handle to image successfully re-opened, try again
					}
					logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err );
					break;
				}
				if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
					logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, link->image->name, (int)link->image->rid );
					break;
				}
				done += (uint32_t)ret;
			}
			// Only the part that actually made it to disk is marked as cached
			if ( likely( done > 0 ) ) {
				image_updateCachemap( link->image, start, start + done, true );
			}
			// NOTE(review): the message says caching gets disabled, but nothing
			// in this function closes or resets cacheFd - confirm intent
			if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
				logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
						link->image->name, (int)link->image->rid, err );
			}
		}
		// 2) Figure out which clients are interested in it
		spin_lock( &link->queueLock );
		for (i = 0; i < link->queueLen; ++i) {
			dnbd3_queued_request_t * const req = &link->queue[i];
			assert( req->status != ULR_PROCESSING );
			if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
			assert( req->client != NULL );
			if ( req->from >= start && req->to <= end ) { // Match :-)
				req->status = ULR_PROCESSING;
			}
		}
		// 3) Send to interested clients - iterate backwards so request collaboration works, and
		// so we can decrease queueLen on the fly while iterating. Should you ever change this to start
		// from 0, you also need to change the "attach to existing request"-logic in uplink_request()
		outReply.magic = dnbd3_packet_magic;
		bool served = false;
		for ( i = link->queueLen - 1; i >= 0; --i ) {
			dnbd3_queued_request_t * const req = &link->queue[i];
			if ( req->status == ULR_PROCESSING ) {
				size_t bytesSent = 0;
				assert( req->from >= start && req->to <= end );
				dnbd3_client_t * const client = req->client;
				outReply.cmd = CMD_GET_BLOCK;
				outReply.handle = req->handle;
				outReply.size = (uint32_t)( req->to - req->from );
				// Header followed by the requested slice of the received payload
				iov[0].iov_base = &outReply;
				iov[0].iov_len = sizeof outReply;
				iov[1].iov_base = link->recvBuffer + (req->from - start);
				iov[1].iov_len = outReply.size;
				fixup_reply( outReply );
				// Free the slot while still holding the queue lock
				req->status = ULR_FREE;
				req->client = NULL;
				served = true;
				// Take the client's send mutex before dropping the queue lock,
				// then do the actual network I/O without the spinlock held
				pthread_mutex_lock( &client->sendMutex );
				spin_unlock( &link->queueLock );
				if ( client->sock != -1 ) {
					ssize_t sent = writev( client->sock, iov, 2 );
					if ( sent > (ssize_t)sizeof outReply ) {
						bytesSent = (size_t)sent - sizeof outReply;
					}
				}
				pthread_mutex_unlock( &client->sendMutex );
				if ( bytesSent != 0 ) {
					client->bytesSent += bytesSent;
				}
				spin_lock( &link->queueLock );
			}
			// Shrink the queue if its last slot is free
			if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--;
		}
		spin_unlock( &link->queueLock );
#ifdef _DEBUG
		if ( !served && start != link->replicationHandle ) {
			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, link->image->name, start, end );
		}
#endif
		if ( start == link->replicationHandle ) {
			// Was our background replication
			link->replicationHandle = REP_NONE;
			// Try to remove from fs cache if no client was interested in this data
			if ( !served && link->cacheFd != -1 ) {
				posix_fadvise( link->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
			}
		}
		if ( served ) {
			// Was some client -- reset idle counter
			link->idleTime = 0;
			// Re-enable replication if disabled
			if ( link->nextReplicationIndex == -1 ) {
				link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
			}
		}
	}
	// Nothing more to read: if no replication request is in flight and no
	// client request is queued, use the idle connection for background replication
	if ( link->replicationHandle == REP_NONE ) {
		spin_lock( &link->queueLock );
		const bool rep = ( link->queueLen == 0 );
		spin_unlock( &link->queueLock );
		if ( rep ) uplink_sendReplicationRequest( link );
	}
	return;
	// Error handling from failed receive or message parsing
	error_cleanup: ;
	uplink_connectionFailed( link, true );
}
( link->nextReplicationIndex == -1 ) { + link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK; + } + } + } + if ( link->replicationHandle == REP_NONE ) { + spin_lock( &link->queueLock ); + const bool rep = ( link->queueLen == 0 ); + spin_unlock( &link->queueLock ); + if ( rep ) uplink_sendReplicationRequest( link ); + } + return; + // Error handling from failed receive or message parsing + error_cleanup: ; + uplink_connectionFailed( link, true ); +} + +static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew) +{ + if ( link->fd == -1 ) + return; + altservers_serverFailed( &link->currentServer ); + close( link->fd ); + link->fd = -1; + link->replicationHandle = REP_NONE; + if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) { + link->nextReplicationIndex = 0; + } + if ( !findNew ) + return; + spin_lock( &link->rttLock ); + bool bail = link->rttTestResult == RTT_INPROGRESS || link->betterFd != -1; + spin_unlock( &link->rttLock ); + if ( bail ) + return; + altservers_findUplink( link ); +} + +/** + * Send keep alive request to server + */ +static int uplink_sendKeepalive(const int fd) +{ + static dnbd3_request_t request = { 0 }; + if ( request.magic == 0 ) { + request.magic = dnbd3_packet_magic; + request.cmd = CMD_KEEPALIVE; + fixup_request( request ); + } + return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request); +} + +static void uplink_addCrc32(dnbd3_connection_t *uplink) +{ + dnbd3_image_t *image = uplink->image; + if ( image == NULL || image->virtualFilesize == 0 ) return; + size_t bytes = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize ) * sizeof(uint32_t); + uint32_t masterCrc; + uint32_t *buffer = malloc( bytes ); + if ( !dnbd3_get_crc32( uplink->fd, &masterCrc, buffer, &bytes ) || bytes == 0 ) { + free( buffer ); + return; + } + uint32_t lists_crc = crc32( 0, NULL, 0 ); + lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes ); + lists_crc = 
net_order_32( lists_crc ); + if ( lists_crc != masterCrc ) { + logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name ); + free( buffer ); + return; + } + uplink->image->masterCrc32 = masterCrc; + uplink->image->crc32 = buffer; + const size_t len = strlen( uplink->image->path ) + 30; + char path[len]; + snprintf( path, len, "%s.crc", uplink->image->path ); + const int fd = open( path, O_WRONLY | O_CREAT, 0644 ); + if ( fd >= 0 ) { + write( fd, &masterCrc, sizeof(uint32_t) ); + write( fd, buffer, bytes ); + close( fd ); + } +} + +/** + * Open the given image's main image file in + * rw mode, assigning it to the cacheFd struct member. + * + * @param force If cacheFd was previously assigned a file descriptor (not == -1), + * it will be closed first. Otherwise, nothing will happen and true will be returned + * immediately. + */ +static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force) +{ + if ( link->cacheFd != -1 ) { + if ( !force ) return true; + close( link->cacheFd ); + } + link->cacheFd = open( link->image->path, O_WRONLY | O_CREAT, 0644 ); + return link->cacheFd != -1; +} + +/** + * Saves the cache map of the given image. + * Return true on success. + * Locks on: imageListLock, image.lock + */ +static bool uplink_saveCacheMap(dnbd3_connection_t *link) +{ + dnbd3_image_t *image = link->image; + assert( image != NULL ); + + if ( link->cacheFd != -1 ) { + if ( fsync( link->cacheFd ) == -1 ) { + // A failing fsync means we have no guarantee that any data + // since the last fsync (or open if none) has been saved. Apart + // from keeping the cache_map from the last successful fsync + // around and restoring it there isn't much we can do to recover + // a consistent state. Bail out. 
+ logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno ); + logadd( LOG_ERROR, "Bailing out immediately" ); + exit( 1 ); + } + } + + if ( image->cache_map == NULL ) return true; + logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid ); + spin_lock( &image->lock ); + // Lock and get a copy of the cache map, as it could be freed by another thread that is just about to + // figure out that this image's cache copy is complete + if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) { + spin_unlock( &image->lock ); + return true; + } + const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize); + uint8_t *map = malloc( size ); + memcpy( map, image->cache_map, size ); + // Unlock. Use path and cacheFd without locking. path should never change after initialization of the image, + // cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O + spin_unlock( &image->lock ); + assert( image->path != NULL ); + char mapfile[strlen( image->path ) + 4 + 1]; + strcpy( mapfile, image->path ); + strcat( mapfile, ".map" ); + + int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); + if ( fd == -1 ) { + const int err = errno; + free( map ); + logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile ); + return false; + } + + size_t done = 0; + while ( done < size ) { + const ssize_t ret = write( fd, map, size - done ); + if ( ret == -1 ) { + if ( errno == EINTR ) continue; + logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile ); + break; + } + if ( ret <= 0 ) { + logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile ); + break; + } + done += (size_t)ret; + } + if ( fsync( fd ) == -1 ) { + logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno ); + } + close( fd ); + free( map ); + return true; +} + +static bool 
/*
 * Convert the hex digit character stored in the char lvalue `a` to its
 * numeric value (0-15), in place. Upper and lower case digits are accepted.
 * If `a` is not a hex digit, control jumps to a label named `normie`, which
 * the function using this macro has to provide; `a` is left unchanged then.
 */
#define hex2int(a) do { \
	if ( (a) > 'f' ) { \
		goto normie; \
	} else if ( (a) >= 'a' ) { \
		(a) = (char)((a) - ( 'a' - 10 )); \
	} else if ( (a) > 'F' ) { \
		goto normie; \
	} else if ( (a) >= 'A' ) { \
		(a) = (char)((a) - ( 'A' - 10 )); \
	} else if ( (a) < '0' || (a) > '9' ) { \
		goto normie; \
	} else { \
		(a) = (char)((a) - '0'); \
	} \
} while (0)
hex2int(b); + *dst++ = (char)( (16 * a) + b ); + src += 3; + } else if (*src == '+') { + *dst++ = (char)' '; + ++src; + } else { + normie:; + *dst++ = *src++; + } + } + out->value.l = (size_t)( dst - out->value.s ); + out++; + (*out_num)++; + if ( src++ >= end ) return; + ++dst; + } while ( 1 ); +} + diff --git a/src/server/urldecode.h b/src/server/urldecode.h new file mode 100644 index 0000000..e27f8f8 --- /dev/null +++ b/src/server/urldecode.h @@ -0,0 +1,19 @@ +#ifndef _URLENCODE_H_ +#define _URLENCODE_H_ + +#include "picohttpparser/picohttpparser.h" + +struct field { + struct string name; + struct string value; +}; + +/** + * decode given x-form-urlencoded string. Breaks constness rules by + * casting the const char* s from str to char* and modifying it, then + * populating out with pointers into it, so make sure the memory + * is actually writable. + */ +void urldecode(struct string* str, struct field *out, size_t *out_num); + +#endif diff --git a/src/serverconfig.h b/src/serverconfig.h new file mode 100644 index 0000000..0cbb320 --- /dev/null +++ b/src/serverconfig.h @@ -0,0 +1,56 @@ +#ifndef _SERVERCONFIG_H_ +#define _SERVERCONFIG_H_ + +#include "config.h" + +// +++++ Performance/memory related +#define SERVER_MAX_CLIENTS 4000 +#define SERVER_MAX_IMAGES 5000 +#define SERVER_MAX_ALTS 100 +// +++++ Uplink handling (proxy mode) +#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value +#define SERVER_BAD_UPLINK_THRES 40 // Thresold for numFails at which we ignore a server for the time span below +#define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored +#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink +#define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients +#define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks + +#define SERVER_CACHE_MAP_SAVE_INTERVAL 90 + +// Time in ms to wait for 
a read/write call to complete on an uplink connection +#define SOCKET_TIMEOUT_UPLINK 5000 +// Same for client connections. Be a bit more liberal here +#define SOCKET_TIMEOUT_CLIENT 15000 +// When waiting for the next request header from client, allow the timeout from above +// to expire this many times. This allows for greater idle times without also increasing +// the timeout for cases where we wait for additional data or are actively sending a reply +#define SOCKET_TIMEOUT_CLIENT_RETRIES 3 + +#define SERVER_UPLINK_KEEPALIVE_INTERVAL 10 // (Seconds) Send keep-alive if nothing else is happening on the uplink +#define SERVER_UPLINK_IDLE_TIMEOUT 1800 // (Seconds) Timeout after which we tear down an uplink connection if no blocks needed to be fetched + +// +++++ Other magic constants +#define SERVER_RTT_PROBES 5 // How many probes to average over +#define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes +#define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes +#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad +#define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times + +#define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes + +// Which is the minimum protocol version the server expects from the client +#define MIN_SUPPORTED_CLIENT 2 +// Same for when we're a proxy talking to another server +#define MIN_SUPPORTED_SERVER 2 + +// Length of comment fields (for alt server etc.) +#define COMMENT_LENGTH 120 + +#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse +#define RTT_UNREACHABLE 0x7FFFFFFu // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 
0x7FFFFFF = 134 seconds + +// How many seconds have to pass after the last client disconnected until the imagefd is closed +#define UNUSED_FD_TIMEOUT 3600 + +#endif + diff --git a/src/shared/crc32.c b/src/shared/crc32.c new file mode 100644 index 0000000..db941d3 --- /dev/null +++ b/src/shared/crc32.c @@ -0,0 +1,621 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * + * Modified for use in dnbd3 + * Original comment: + * + * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results in about a + * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + * + * Original zlib.h license text: + * + + Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+ + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + +*/ + +#include "../types.h" +#include + +#define FAR +#define OF(args) args +#define local static + +/* Definitions for doing the crc four data bytes at a time. */ +#if !defined(NOBYFOUR) +# define BYFOUR +#endif +#ifdef BYFOUR +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +local const uint32_t crc_table[TBLS][256] = +{ + { + 0x00000000U, 0x77073096U, 0xee0e612cU, 0x990951baU, 0x076dc419U, + 0x706af48fU, 0xe963a535U, 0x9e6495a3U, 0x0edb8832U, 0x79dcb8a4U, + 0xe0d5e91eU, 0x97d2d988U, 0x09b64c2bU, 0x7eb17cbdU, 0xe7b82d07U, + 0x90bf1d91U, 0x1db71064U, 0x6ab020f2U, 0xf3b97148U, 0x84be41deU, + 0x1adad47dU, 0x6ddde4ebU, 0xf4d4b551U, 0x83d385c7U, 0x136c9856U, + 0x646ba8c0U, 0xfd62f97aU, 0x8a65c9ecU, 0x14015c4fU, 0x63066cd9U, + 0xfa0f3d63U, 0x8d080df5U, 0x3b6e20c8U, 0x4c69105eU, 0xd56041e4U, + 0xa2677172U, 0x3c03e4d1U, 0x4b04d447U, 0xd20d85fdU, 0xa50ab56bU, + 0x35b5a8faU, 0x42b2986cU, 0xdbbbc9d6U, 0xacbcf940U, 0x32d86ce3U, + 0x45df5c75U, 0xdcd60dcfU, 0xabd13d59U, 0x26d930acU, 0x51de003aU, + 0xc8d75180U, 0xbfd06116U, 0x21b4f4b5U, 0x56b3c423U, 0xcfba9599U, + 0xb8bda50fU, 0x2802b89eU, 0x5f058808U, 0xc60cd9b2U, 0xb10be924U, + 0x2f6f7c87U, 0x58684c11U, 0xc1611dabU, 0xb6662d3dU, 0x76dc4190U, + 0x01db7106U, 0x98d220bcU, 0xefd5102aU, 0x71b18589U, 0x06b6b51fU, + 0x9fbfe4a5U, 0xe8b8d433U, 0x7807c9a2U, 0x0f00f934U, 0x9609a88eU, + 0xe10e9818U, 0x7f6a0dbbU, 0x086d3d2dU, 0x91646c97U, 0xe6635c01U, + 0x6b6b51f4U, 0x1c6c6162U, 0x856530d8U, 0xf262004eU, 0x6c0695edU, + 0x1b01a57bU, 0x8208f4c1U, 0xf50fc457U, 0x65b0d9c6U, 0x12b7e950U, + 0x8bbeb8eaU, 0xfcb9887cU, 0x62dd1ddfU, 0x15da2d49U, 0x8cd37cf3U, + 0xfbd44c65U, 0x4db26158U, 0x3ab551ceU, 0xa3bc0074U, 0xd4bb30e2U, + 0x4adfa541U, 0x3dd895d7U, 0xa4d1c46dU, 0xd3d6f4fbU, 0x4369e96aU, + 0x346ed9fcU, 0xad678846U, 0xda60b8d0U, 0x44042d73U, 0x33031de5U, + 0xaa0a4c5fU, 0xdd0d7cc9U, 0x5005713cU, 0x270241aaU, 0xbe0b1010U, + 0xc90c2086U, 0x5768b525U, 
0x206f85b3U, 0xb966d409U, 0xce61e49fU, + 0x5edef90eU, 0x29d9c998U, 0xb0d09822U, 0xc7d7a8b4U, 0x59b33d17U, + 0x2eb40d81U, 0xb7bd5c3bU, 0xc0ba6cadU, 0xedb88320U, 0x9abfb3b6U, + 0x03b6e20cU, 0x74b1d29aU, 0xead54739U, 0x9dd277afU, 0x04db2615U, + 0x73dc1683U, 0xe3630b12U, 0x94643b84U, 0x0d6d6a3eU, 0x7a6a5aa8U, + 0xe40ecf0bU, 0x9309ff9dU, 0x0a00ae27U, 0x7d079eb1U, 0xf00f9344U, + 0x8708a3d2U, 0x1e01f268U, 0x6906c2feU, 0xf762575dU, 0x806567cbU, + 0x196c3671U, 0x6e6b06e7U, 0xfed41b76U, 0x89d32be0U, 0x10da7a5aU, + 0x67dd4accU, 0xf9b9df6fU, 0x8ebeeff9U, 0x17b7be43U, 0x60b08ed5U, + 0xd6d6a3e8U, 0xa1d1937eU, 0x38d8c2c4U, 0x4fdff252U, 0xd1bb67f1U, + 0xa6bc5767U, 0x3fb506ddU, 0x48b2364bU, 0xd80d2bdaU, 0xaf0a1b4cU, + 0x36034af6U, 0x41047a60U, 0xdf60efc3U, 0xa867df55U, 0x316e8eefU, + 0x4669be79U, 0xcb61b38cU, 0xbc66831aU, 0x256fd2a0U, 0x5268e236U, + 0xcc0c7795U, 0xbb0b4703U, 0x220216b9U, 0x5505262fU, 0xc5ba3bbeU, + 0xb2bd0b28U, 0x2bb45a92U, 0x5cb36a04U, 0xc2d7ffa7U, 0xb5d0cf31U, + 0x2cd99e8bU, 0x5bdeae1dU, 0x9b64c2b0U, 0xec63f226U, 0x756aa39cU, + 0x026d930aU, 0x9c0906a9U, 0xeb0e363fU, 0x72076785U, 0x05005713U, + 0x95bf4a82U, 0xe2b87a14U, 0x7bb12baeU, 0x0cb61b38U, 0x92d28e9bU, + 0xe5d5be0dU, 0x7cdcefb7U, 0x0bdbdf21U, 0x86d3d2d4U, 0xf1d4e242U, + 0x68ddb3f8U, 0x1fda836eU, 0x81be16cdU, 0xf6b9265bU, 0x6fb077e1U, + 0x18b74777U, 0x88085ae6U, 0xff0f6a70U, 0x66063bcaU, 0x11010b5cU, + 0x8f659effU, 0xf862ae69U, 0x616bffd3U, 0x166ccf45U, 0xa00ae278U, + 0xd70dd2eeU, 0x4e048354U, 0x3903b3c2U, 0xa7672661U, 0xd06016f7U, + 0x4969474dU, 0x3e6e77dbU, 0xaed16a4aU, 0xd9d65adcU, 0x40df0b66U, + 0x37d83bf0U, 0xa9bcae53U, 0xdebb9ec5U, 0x47b2cf7fU, 0x30b5ffe9U, + 0xbdbdf21cU, 0xcabac28aU, 0x53b39330U, 0x24b4a3a6U, 0xbad03605U, + 0xcdd70693U, 0x54de5729U, 0x23d967bfU, 0xb3667a2eU, 0xc4614ab8U, + 0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU, + 0x2d02ef8dU +#ifdef BYFOUR + }, + { + 0x00000000U, 0x191b3141U, 0x32366282U, 0x2b2d53c3U, 0x646cc504U, + 0x7d77f445U, 0x565aa786U, 0x4f4196c7U, 
0xc8d98a08U, 0xd1c2bb49U, + 0xfaefe88aU, 0xe3f4d9cbU, 0xacb54f0cU, 0xb5ae7e4dU, 0x9e832d8eU, + 0x87981ccfU, 0x4ac21251U, 0x53d92310U, 0x78f470d3U, 0x61ef4192U, + 0x2eaed755U, 0x37b5e614U, 0x1c98b5d7U, 0x05838496U, 0x821b9859U, + 0x9b00a918U, 0xb02dfadbU, 0xa936cb9aU, 0xe6775d5dU, 0xff6c6c1cU, + 0xd4413fdfU, 0xcd5a0e9eU, 0x958424a2U, 0x8c9f15e3U, 0xa7b24620U, + 0xbea97761U, 0xf1e8e1a6U, 0xe8f3d0e7U, 0xc3de8324U, 0xdac5b265U, + 0x5d5daeaaU, 0x44469febU, 0x6f6bcc28U, 0x7670fd69U, 0x39316baeU, + 0x202a5aefU, 0x0b07092cU, 0x121c386dU, 0xdf4636f3U, 0xc65d07b2U, + 0xed705471U, 0xf46b6530U, 0xbb2af3f7U, 0xa231c2b6U, 0x891c9175U, + 0x9007a034U, 0x179fbcfbU, 0x0e848dbaU, 0x25a9de79U, 0x3cb2ef38U, + 0x73f379ffU, 0x6ae848beU, 0x41c51b7dU, 0x58de2a3cU, 0xf0794f05U, + 0xe9627e44U, 0xc24f2d87U, 0xdb541cc6U, 0x94158a01U, 0x8d0ebb40U, + 0xa623e883U, 0xbf38d9c2U, 0x38a0c50dU, 0x21bbf44cU, 0x0a96a78fU, + 0x138d96ceU, 0x5ccc0009U, 0x45d73148U, 0x6efa628bU, 0x77e153caU, + 0xbabb5d54U, 0xa3a06c15U, 0x888d3fd6U, 0x91960e97U, 0xded79850U, + 0xc7cca911U, 0xece1fad2U, 0xf5facb93U, 0x7262d75cU, 0x6b79e61dU, + 0x4054b5deU, 0x594f849fU, 0x160e1258U, 0x0f152319U, 0x243870daU, + 0x3d23419bU, 0x65fd6ba7U, 0x7ce65ae6U, 0x57cb0925U, 0x4ed03864U, + 0x0191aea3U, 0x188a9fe2U, 0x33a7cc21U, 0x2abcfd60U, 0xad24e1afU, + 0xb43fd0eeU, 0x9f12832dU, 0x8609b26cU, 0xc94824abU, 0xd05315eaU, + 0xfb7e4629U, 0xe2657768U, 0x2f3f79f6U, 0x362448b7U, 0x1d091b74U, + 0x04122a35U, 0x4b53bcf2U, 0x52488db3U, 0x7965de70U, 0x607eef31U, + 0xe7e6f3feU, 0xfefdc2bfU, 0xd5d0917cU, 0xcccba03dU, 0x838a36faU, + 0x9a9107bbU, 0xb1bc5478U, 0xa8a76539U, 0x3b83984bU, 0x2298a90aU, + 0x09b5fac9U, 0x10aecb88U, 0x5fef5d4fU, 0x46f46c0eU, 0x6dd93fcdU, + 0x74c20e8cU, 0xf35a1243U, 0xea412302U, 0xc16c70c1U, 0xd8774180U, + 0x9736d747U, 0x8e2de606U, 0xa500b5c5U, 0xbc1b8484U, 0x71418a1aU, + 0x685abb5bU, 0x4377e898U, 0x5a6cd9d9U, 0x152d4f1eU, 0x0c367e5fU, + 0x271b2d9cU, 0x3e001cddU, 0xb9980012U, 0xa0833153U, 0x8bae6290U, + 0x92b553d1U, 0xddf4c516U, 
0xc4eff457U, 0xefc2a794U, 0xf6d996d5U, + 0xae07bce9U, 0xb71c8da8U, 0x9c31de6bU, 0x852aef2aU, 0xca6b79edU, + 0xd37048acU, 0xf85d1b6fU, 0xe1462a2eU, 0x66de36e1U, 0x7fc507a0U, + 0x54e85463U, 0x4df36522U, 0x02b2f3e5U, 0x1ba9c2a4U, 0x30849167U, + 0x299fa026U, 0xe4c5aeb8U, 0xfdde9ff9U, 0xd6f3cc3aU, 0xcfe8fd7bU, + 0x80a96bbcU, 0x99b25afdU, 0xb29f093eU, 0xab84387fU, 0x2c1c24b0U, + 0x350715f1U, 0x1e2a4632U, 0x07317773U, 0x4870e1b4U, 0x516bd0f5U, + 0x7a468336U, 0x635db277U, 0xcbfad74eU, 0xd2e1e60fU, 0xf9ccb5ccU, + 0xe0d7848dU, 0xaf96124aU, 0xb68d230bU, 0x9da070c8U, 0x84bb4189U, + 0x03235d46U, 0x1a386c07U, 0x31153fc4U, 0x280e0e85U, 0x674f9842U, + 0x7e54a903U, 0x5579fac0U, 0x4c62cb81U, 0x8138c51fU, 0x9823f45eU, + 0xb30ea79dU, 0xaa1596dcU, 0xe554001bU, 0xfc4f315aU, 0xd7626299U, + 0xce7953d8U, 0x49e14f17U, 0x50fa7e56U, 0x7bd72d95U, 0x62cc1cd4U, + 0x2d8d8a13U, 0x3496bb52U, 0x1fbbe891U, 0x06a0d9d0U, 0x5e7ef3ecU, + 0x4765c2adU, 0x6c48916eU, 0x7553a02fU, 0x3a1236e8U, 0x230907a9U, + 0x0824546aU, 0x113f652bU, 0x96a779e4U, 0x8fbc48a5U, 0xa4911b66U, + 0xbd8a2a27U, 0xf2cbbce0U, 0xebd08da1U, 0xc0fdde62U, 0xd9e6ef23U, + 0x14bce1bdU, 0x0da7d0fcU, 0x268a833fU, 0x3f91b27eU, 0x70d024b9U, + 0x69cb15f8U, 0x42e6463bU, 0x5bfd777aU, 0xdc656bb5U, 0xc57e5af4U, + 0xee530937U, 0xf7483876U, 0xb809aeb1U, 0xa1129ff0U, 0x8a3fcc33U, + 0x9324fd72U + }, + { + 0x00000000U, 0x01c26a37U, 0x0384d46eU, 0x0246be59U, 0x0709a8dcU, + 0x06cbc2ebU, 0x048d7cb2U, 0x054f1685U, 0x0e1351b8U, 0x0fd13b8fU, + 0x0d9785d6U, 0x0c55efe1U, 0x091af964U, 0x08d89353U, 0x0a9e2d0aU, + 0x0b5c473dU, 0x1c26a370U, 0x1de4c947U, 0x1fa2771eU, 0x1e601d29U, + 0x1b2f0bacU, 0x1aed619bU, 0x18abdfc2U, 0x1969b5f5U, 0x1235f2c8U, + 0x13f798ffU, 0x11b126a6U, 0x10734c91U, 0x153c5a14U, 0x14fe3023U, + 0x16b88e7aU, 0x177ae44dU, 0x384d46e0U, 0x398f2cd7U, 0x3bc9928eU, + 0x3a0bf8b9U, 0x3f44ee3cU, 0x3e86840bU, 0x3cc03a52U, 0x3d025065U, + 0x365e1758U, 0x379c7d6fU, 0x35dac336U, 0x3418a901U, 0x3157bf84U, + 0x3095d5b3U, 0x32d36beaU, 0x331101ddU, 0x246be590U, 
0x25a98fa7U, + 0x27ef31feU, 0x262d5bc9U, 0x23624d4cU, 0x22a0277bU, 0x20e69922U, + 0x2124f315U, 0x2a78b428U, 0x2bbade1fU, 0x29fc6046U, 0x283e0a71U, + 0x2d711cf4U, 0x2cb376c3U, 0x2ef5c89aU, 0x2f37a2adU, 0x709a8dc0U, + 0x7158e7f7U, 0x731e59aeU, 0x72dc3399U, 0x7793251cU, 0x76514f2bU, + 0x7417f172U, 0x75d59b45U, 0x7e89dc78U, 0x7f4bb64fU, 0x7d0d0816U, + 0x7ccf6221U, 0x798074a4U, 0x78421e93U, 0x7a04a0caU, 0x7bc6cafdU, + 0x6cbc2eb0U, 0x6d7e4487U, 0x6f38fadeU, 0x6efa90e9U, 0x6bb5866cU, + 0x6a77ec5bU, 0x68315202U, 0x69f33835U, 0x62af7f08U, 0x636d153fU, + 0x612bab66U, 0x60e9c151U, 0x65a6d7d4U, 0x6464bde3U, 0x662203baU, + 0x67e0698dU, 0x48d7cb20U, 0x4915a117U, 0x4b531f4eU, 0x4a917579U, + 0x4fde63fcU, 0x4e1c09cbU, 0x4c5ab792U, 0x4d98dda5U, 0x46c49a98U, + 0x4706f0afU, 0x45404ef6U, 0x448224c1U, 0x41cd3244U, 0x400f5873U, + 0x4249e62aU, 0x438b8c1dU, 0x54f16850U, 0x55330267U, 0x5775bc3eU, + 0x56b7d609U, 0x53f8c08cU, 0x523aaabbU, 0x507c14e2U, 0x51be7ed5U, + 0x5ae239e8U, 0x5b2053dfU, 0x5966ed86U, 0x58a487b1U, 0x5deb9134U, + 0x5c29fb03U, 0x5e6f455aU, 0x5fad2f6dU, 0xe1351b80U, 0xe0f771b7U, + 0xe2b1cfeeU, 0xe373a5d9U, 0xe63cb35cU, 0xe7fed96bU, 0xe5b86732U, + 0xe47a0d05U, 0xef264a38U, 0xeee4200fU, 0xeca29e56U, 0xed60f461U, + 0xe82fe2e4U, 0xe9ed88d3U, 0xebab368aU, 0xea695cbdU, 0xfd13b8f0U, + 0xfcd1d2c7U, 0xfe976c9eU, 0xff5506a9U, 0xfa1a102cU, 0xfbd87a1bU, + 0xf99ec442U, 0xf85cae75U, 0xf300e948U, 0xf2c2837fU, 0xf0843d26U, + 0xf1465711U, 0xf4094194U, 0xf5cb2ba3U, 0xf78d95faU, 0xf64fffcdU, + 0xd9785d60U, 0xd8ba3757U, 0xdafc890eU, 0xdb3ee339U, 0xde71f5bcU, + 0xdfb39f8bU, 0xddf521d2U, 0xdc374be5U, 0xd76b0cd8U, 0xd6a966efU, + 0xd4efd8b6U, 0xd52db281U, 0xd062a404U, 0xd1a0ce33U, 0xd3e6706aU, + 0xd2241a5dU, 0xc55efe10U, 0xc49c9427U, 0xc6da2a7eU, 0xc7184049U, + 0xc25756ccU, 0xc3953cfbU, 0xc1d382a2U, 0xc011e895U, 0xcb4dafa8U, + 0xca8fc59fU, 0xc8c97bc6U, 0xc90b11f1U, 0xcc440774U, 0xcd866d43U, + 0xcfc0d31aU, 0xce02b92dU, 0x91af9640U, 0x906dfc77U, 0x922b422eU, + 0x93e92819U, 0x96a63e9cU, 0x976454abU, 
0x9522eaf2U, 0x94e080c5U, + 0x9fbcc7f8U, 0x9e7eadcfU, 0x9c381396U, 0x9dfa79a1U, 0x98b56f24U, + 0x99770513U, 0x9b31bb4aU, 0x9af3d17dU, 0x8d893530U, 0x8c4b5f07U, + 0x8e0de15eU, 0x8fcf8b69U, 0x8a809decU, 0x8b42f7dbU, 0x89044982U, + 0x88c623b5U, 0x839a6488U, 0x82580ebfU, 0x801eb0e6U, 0x81dcdad1U, + 0x8493cc54U, 0x8551a663U, 0x8717183aU, 0x86d5720dU, 0xa9e2d0a0U, + 0xa820ba97U, 0xaa6604ceU, 0xaba46ef9U, 0xaeeb787cU, 0xaf29124bU, + 0xad6fac12U, 0xacadc625U, 0xa7f18118U, 0xa633eb2fU, 0xa4755576U, + 0xa5b73f41U, 0xa0f829c4U, 0xa13a43f3U, 0xa37cfdaaU, 0xa2be979dU, + 0xb5c473d0U, 0xb40619e7U, 0xb640a7beU, 0xb782cd89U, 0xb2cddb0cU, + 0xb30fb13bU, 0xb1490f62U, 0xb08b6555U, 0xbbd72268U, 0xba15485fU, + 0xb853f606U, 0xb9919c31U, 0xbcde8ab4U, 0xbd1ce083U, 0xbf5a5edaU, + 0xbe9834edU + }, + { + 0x00000000U, 0xb8bc6765U, 0xaa09c88bU, 0x12b5afeeU, 0x8f629757U, + 0x37def032U, 0x256b5fdcU, 0x9dd738b9U, 0xc5b428efU, 0x7d084f8aU, + 0x6fbde064U, 0xd7018701U, 0x4ad6bfb8U, 0xf26ad8ddU, 0xe0df7733U, + 0x58631056U, 0x5019579fU, 0xe8a530faU, 0xfa109f14U, 0x42acf871U, + 0xdf7bc0c8U, 0x67c7a7adU, 0x75720843U, 0xcdce6f26U, 0x95ad7f70U, + 0x2d111815U, 0x3fa4b7fbU, 0x8718d09eU, 0x1acfe827U, 0xa2738f42U, + 0xb0c620acU, 0x087a47c9U, 0xa032af3eU, 0x188ec85bU, 0x0a3b67b5U, + 0xb28700d0U, 0x2f503869U, 0x97ec5f0cU, 0x8559f0e2U, 0x3de59787U, + 0x658687d1U, 0xdd3ae0b4U, 0xcf8f4f5aU, 0x7733283fU, 0xeae41086U, + 0x525877e3U, 0x40edd80dU, 0xf851bf68U, 0xf02bf8a1U, 0x48979fc4U, + 0x5a22302aU, 0xe29e574fU, 0x7f496ff6U, 0xc7f50893U, 0xd540a77dU, + 0x6dfcc018U, 0x359fd04eU, 0x8d23b72bU, 0x9f9618c5U, 0x272a7fa0U, + 0xbafd4719U, 0x0241207cU, 0x10f48f92U, 0xa848e8f7U, 0x9b14583dU, + 0x23a83f58U, 0x311d90b6U, 0x89a1f7d3U, 0x1476cf6aU, 0xaccaa80fU, + 0xbe7f07e1U, 0x06c36084U, 0x5ea070d2U, 0xe61c17b7U, 0xf4a9b859U, + 0x4c15df3cU, 0xd1c2e785U, 0x697e80e0U, 0x7bcb2f0eU, 0xc377486bU, + 0xcb0d0fa2U, 0x73b168c7U, 0x6104c729U, 0xd9b8a04cU, 0x446f98f5U, + 0xfcd3ff90U, 0xee66507eU, 0x56da371bU, 0x0eb9274dU, 0xb6054028U, + 
0xa4b0efc6U, 0x1c0c88a3U, 0x81dbb01aU, 0x3967d77fU, 0x2bd27891U, + 0x936e1ff4U, 0x3b26f703U, 0x839a9066U, 0x912f3f88U, 0x299358edU, + 0xb4446054U, 0x0cf80731U, 0x1e4da8dfU, 0xa6f1cfbaU, 0xfe92dfecU, + 0x462eb889U, 0x549b1767U, 0xec277002U, 0x71f048bbU, 0xc94c2fdeU, + 0xdbf98030U, 0x6345e755U, 0x6b3fa09cU, 0xd383c7f9U, 0xc1366817U, + 0x798a0f72U, 0xe45d37cbU, 0x5ce150aeU, 0x4e54ff40U, 0xf6e89825U, + 0xae8b8873U, 0x1637ef16U, 0x048240f8U, 0xbc3e279dU, 0x21e91f24U, + 0x99557841U, 0x8be0d7afU, 0x335cb0caU, 0xed59b63bU, 0x55e5d15eU, + 0x47507eb0U, 0xffec19d5U, 0x623b216cU, 0xda874609U, 0xc832e9e7U, + 0x708e8e82U, 0x28ed9ed4U, 0x9051f9b1U, 0x82e4565fU, 0x3a58313aU, + 0xa78f0983U, 0x1f336ee6U, 0x0d86c108U, 0xb53aa66dU, 0xbd40e1a4U, + 0x05fc86c1U, 0x1749292fU, 0xaff54e4aU, 0x322276f3U, 0x8a9e1196U, + 0x982bbe78U, 0x2097d91dU, 0x78f4c94bU, 0xc048ae2eU, 0xd2fd01c0U, + 0x6a4166a5U, 0xf7965e1cU, 0x4f2a3979U, 0x5d9f9697U, 0xe523f1f2U, + 0x4d6b1905U, 0xf5d77e60U, 0xe762d18eU, 0x5fdeb6ebU, 0xc2098e52U, + 0x7ab5e937U, 0x680046d9U, 0xd0bc21bcU, 0x88df31eaU, 0x3063568fU, + 0x22d6f961U, 0x9a6a9e04U, 0x07bda6bdU, 0xbf01c1d8U, 0xadb46e36U, + 0x15080953U, 0x1d724e9aU, 0xa5ce29ffU, 0xb77b8611U, 0x0fc7e174U, + 0x9210d9cdU, 0x2aacbea8U, 0x38191146U, 0x80a57623U, 0xd8c66675U, + 0x607a0110U, 0x72cfaefeU, 0xca73c99bU, 0x57a4f122U, 0xef189647U, + 0xfdad39a9U, 0x45115eccU, 0x764dee06U, 0xcef18963U, 0xdc44268dU, + 0x64f841e8U, 0xf92f7951U, 0x41931e34U, 0x5326b1daU, 0xeb9ad6bfU, + 0xb3f9c6e9U, 0x0b45a18cU, 0x19f00e62U, 0xa14c6907U, 0x3c9b51beU, + 0x842736dbU, 0x96929935U, 0x2e2efe50U, 0x2654b999U, 0x9ee8defcU, + 0x8c5d7112U, 0x34e11677U, 0xa9362eceU, 0x118a49abU, 0x033fe645U, + 0xbb838120U, 0xe3e09176U, 0x5b5cf613U, 0x49e959fdU, 0xf1553e98U, + 0x6c820621U, 0xd43e6144U, 0xc68bceaaU, 0x7e37a9cfU, 0xd67f4138U, + 0x6ec3265dU, 0x7c7689b3U, 0xc4caeed6U, 0x591dd66fU, 0xe1a1b10aU, + 0xf3141ee4U, 0x4ba87981U, 0x13cb69d7U, 0xab770eb2U, 0xb9c2a15cU, + 0x017ec639U, 0x9ca9fe80U, 0x241599e5U, 0x36a0360bU, 
0x8e1c516eU, + 0x866616a7U, 0x3eda71c2U, 0x2c6fde2cU, 0x94d3b949U, 0x090481f0U, + 0xb1b8e695U, 0xa30d497bU, 0x1bb12e1eU, 0x43d23e48U, 0xfb6e592dU, + 0xe9dbf6c3U, 0x516791a6U, 0xccb0a91fU, 0x740cce7aU, 0x66b96194U, + 0xde0506f1U + }, + { + 0x00000000U, 0x96300777U, 0x2c610eeeU, 0xba510999U, 0x19c46d07U, + 0x8ff46a70U, 0x35a563e9U, 0xa395649eU, 0x3288db0eU, 0xa4b8dc79U, + 0x1ee9d5e0U, 0x88d9d297U, 0x2b4cb609U, 0xbd7cb17eU, 0x072db8e7U, + 0x911dbf90U, 0x6410b71dU, 0xf220b06aU, 0x4871b9f3U, 0xde41be84U, + 0x7dd4da1aU, 0xebe4dd6dU, 0x51b5d4f4U, 0xc785d383U, 0x56986c13U, + 0xc0a86b64U, 0x7af962fdU, 0xecc9658aU, 0x4f5c0114U, 0xd96c0663U, + 0x633d0ffaU, 0xf50d088dU, 0xc8206e3bU, 0x5e10694cU, 0xe44160d5U, + 0x727167a2U, 0xd1e4033cU, 0x47d4044bU, 0xfd850dd2U, 0x6bb50aa5U, + 0xfaa8b535U, 0x6c98b242U, 0xd6c9bbdbU, 0x40f9bcacU, 0xe36cd832U, + 0x755cdf45U, 0xcf0dd6dcU, 0x593dd1abU, 0xac30d926U, 0x3a00de51U, + 0x8051d7c8U, 0x1661d0bfU, 0xb5f4b421U, 0x23c4b356U, 0x9995bacfU, + 0x0fa5bdb8U, 0x9eb80228U, 0x0888055fU, 0xb2d90cc6U, 0x24e90bb1U, + 0x877c6f2fU, 0x114c6858U, 0xab1d61c1U, 0x3d2d66b6U, 0x9041dc76U, + 0x0671db01U, 0xbc20d298U, 0x2a10d5efU, 0x8985b171U, 0x1fb5b606U, + 0xa5e4bf9fU, 0x33d4b8e8U, 0xa2c90778U, 0x34f9000fU, 0x8ea80996U, + 0x18980ee1U, 0xbb0d6a7fU, 0x2d3d6d08U, 0x976c6491U, 0x015c63e6U, + 0xf4516b6bU, 0x62616c1cU, 0xd8306585U, 0x4e0062f2U, 0xed95066cU, + 0x7ba5011bU, 0xc1f40882U, 0x57c40ff5U, 0xc6d9b065U, 0x50e9b712U, + 0xeab8be8bU, 0x7c88b9fcU, 0xdf1ddd62U, 0x492dda15U, 0xf37cd38cU, + 0x654cd4fbU, 0x5861b24dU, 0xce51b53aU, 0x7400bca3U, 0xe230bbd4U, + 0x41a5df4aU, 0xd795d83dU, 0x6dc4d1a4U, 0xfbf4d6d3U, 0x6ae96943U, + 0xfcd96e34U, 0x468867adU, 0xd0b860daU, 0x732d0444U, 0xe51d0333U, + 0x5f4c0aaaU, 0xc97c0dddU, 0x3c710550U, 0xaa410227U, 0x10100bbeU, + 0x86200cc9U, 0x25b56857U, 0xb3856f20U, 0x09d466b9U, 0x9fe461ceU, + 0x0ef9de5eU, 0x98c9d929U, 0x2298d0b0U, 0xb4a8d7c7U, 0x173db359U, + 0x810db42eU, 0x3b5cbdb7U, 0xad6cbac0U, 0x2083b8edU, 0xb6b3bf9aU, + 0x0ce2b603U, 
0x9ad2b174U, 0x3947d5eaU, 0xaf77d29dU, 0x1526db04U, + 0x8316dc73U, 0x120b63e3U, 0x843b6494U, 0x3e6a6d0dU, 0xa85a6a7aU, + 0x0bcf0ee4U, 0x9dff0993U, 0x27ae000aU, 0xb19e077dU, 0x44930ff0U, + 0xd2a30887U, 0x68f2011eU, 0xfec20669U, 0x5d5762f7U, 0xcb676580U, + 0x71366c19U, 0xe7066b6eU, 0x761bd4feU, 0xe02bd389U, 0x5a7ada10U, + 0xcc4add67U, 0x6fdfb9f9U, 0xf9efbe8eU, 0x43beb717U, 0xd58eb060U, + 0xe8a3d6d6U, 0x7e93d1a1U, 0xc4c2d838U, 0x52f2df4fU, 0xf167bbd1U, + 0x6757bca6U, 0xdd06b53fU, 0x4b36b248U, 0xda2b0dd8U, 0x4c1b0aafU, + 0xf64a0336U, 0x607a0441U, 0xc3ef60dfU, 0x55df67a8U, 0xef8e6e31U, + 0x79be6946U, 0x8cb361cbU, 0x1a8366bcU, 0xa0d26f25U, 0x36e26852U, + 0x95770cccU, 0x03470bbbU, 0xb9160222U, 0x2f260555U, 0xbe3bbac5U, + 0x280bbdb2U, 0x925ab42bU, 0x046ab35cU, 0xa7ffd7c2U, 0x31cfd0b5U, + 0x8b9ed92cU, 0x1daede5bU, 0xb0c2649bU, 0x26f263ecU, 0x9ca36a75U, + 0x0a936d02U, 0xa906099cU, 0x3f360eebU, 0x85670772U, 0x13570005U, + 0x824abf95U, 0x147ab8e2U, 0xae2bb17bU, 0x381bb60cU, 0x9b8ed292U, + 0x0dbed5e5U, 0xb7efdc7cU, 0x21dfdb0bU, 0xd4d2d386U, 0x42e2d4f1U, + 0xf8b3dd68U, 0x6e83da1fU, 0xcd16be81U, 0x5b26b9f6U, 0xe177b06fU, + 0x7747b718U, 0xe65a0888U, 0x706a0fffU, 0xca3b0666U, 0x5c0b0111U, + 0xff9e658fU, 0x69ae62f8U, 0xd3ff6b61U, 0x45cf6c16U, 0x78e20aa0U, + 0xeed20dd7U, 0x5483044eU, 0xc2b30339U, 0x612667a7U, 0xf71660d0U, + 0x4d476949U, 0xdb776e3eU, 0x4a6ad1aeU, 0xdc5ad6d9U, 0x660bdf40U, + 0xf03bd837U, 0x53aebca9U, 0xc59ebbdeU, 0x7fcfb247U, 0xe9ffb530U, + 0x1cf2bdbdU, 0x8ac2bacaU, 0x3093b353U, 0xa6a3b424U, 0x0536d0baU, + 0x9306d7cdU, 0x2957de54U, 0xbf67d923U, 0x2e7a66b3U, 0xb84a61c4U, + 0x021b685dU, 0x942b6f2aU, 0x37be0bb4U, 0xa18e0cc3U, 0x1bdf055aU, + 0x8def022dU + }, + { + 0x00000000U, 0x41311b19U, 0x82623632U, 0xc3532d2bU, 0x04c56c64U, + 0x45f4777dU, 0x86a75a56U, 0xc796414fU, 0x088ad9c8U, 0x49bbc2d1U, + 0x8ae8effaU, 0xcbd9f4e3U, 0x0c4fb5acU, 0x4d7eaeb5U, 0x8e2d839eU, + 0xcf1c9887U, 0x5112c24aU, 0x1023d953U, 0xd370f478U, 0x9241ef61U, + 0x55d7ae2eU, 0x14e6b537U, 0xd7b5981cU, 
0x96848305U, 0x59981b82U, + 0x18a9009bU, 0xdbfa2db0U, 0x9acb36a9U, 0x5d5d77e6U, 0x1c6c6cffU, + 0xdf3f41d4U, 0x9e0e5acdU, 0xa2248495U, 0xe3159f8cU, 0x2046b2a7U, + 0x6177a9beU, 0xa6e1e8f1U, 0xe7d0f3e8U, 0x2483dec3U, 0x65b2c5daU, + 0xaaae5d5dU, 0xeb9f4644U, 0x28cc6b6fU, 0x69fd7076U, 0xae6b3139U, + 0xef5a2a20U, 0x2c09070bU, 0x6d381c12U, 0xf33646dfU, 0xb2075dc6U, + 0x715470edU, 0x30656bf4U, 0xf7f32abbU, 0xb6c231a2U, 0x75911c89U, + 0x34a00790U, 0xfbbc9f17U, 0xba8d840eU, 0x79dea925U, 0x38efb23cU, + 0xff79f373U, 0xbe48e86aU, 0x7d1bc541U, 0x3c2ade58U, 0x054f79f0U, + 0x447e62e9U, 0x872d4fc2U, 0xc61c54dbU, 0x018a1594U, 0x40bb0e8dU, + 0x83e823a6U, 0xc2d938bfU, 0x0dc5a038U, 0x4cf4bb21U, 0x8fa7960aU, + 0xce968d13U, 0x0900cc5cU, 0x4831d745U, 0x8b62fa6eU, 0xca53e177U, + 0x545dbbbaU, 0x156ca0a3U, 0xd63f8d88U, 0x970e9691U, 0x5098d7deU, + 0x11a9ccc7U, 0xd2fae1ecU, 0x93cbfaf5U, 0x5cd76272U, 0x1de6796bU, + 0xdeb55440U, 0x9f844f59U, 0x58120e16U, 0x1923150fU, 0xda703824U, + 0x9b41233dU, 0xa76bfd65U, 0xe65ae67cU, 0x2509cb57U, 0x6438d04eU, + 0xa3ae9101U, 0xe29f8a18U, 0x21cca733U, 0x60fdbc2aU, 0xafe124adU, + 0xeed03fb4U, 0x2d83129fU, 0x6cb20986U, 0xab2448c9U, 0xea1553d0U, + 0x29467efbU, 0x687765e2U, 0xf6793f2fU, 0xb7482436U, 0x741b091dU, + 0x352a1204U, 0xf2bc534bU, 0xb38d4852U, 0x70de6579U, 0x31ef7e60U, + 0xfef3e6e7U, 0xbfc2fdfeU, 0x7c91d0d5U, 0x3da0cbccU, 0xfa368a83U, + 0xbb07919aU, 0x7854bcb1U, 0x3965a7a8U, 0x4b98833bU, 0x0aa99822U, + 0xc9fab509U, 0x88cbae10U, 0x4f5def5fU, 0x0e6cf446U, 0xcd3fd96dU, + 0x8c0ec274U, 0x43125af3U, 0x022341eaU, 0xc1706cc1U, 0x804177d8U, + 0x47d73697U, 0x06e62d8eU, 0xc5b500a5U, 0x84841bbcU, 0x1a8a4171U, + 0x5bbb5a68U, 0x98e87743U, 0xd9d96c5aU, 0x1e4f2d15U, 0x5f7e360cU, + 0x9c2d1b27U, 0xdd1c003eU, 0x120098b9U, 0x533183a0U, 0x9062ae8bU, + 0xd153b592U, 0x16c5f4ddU, 0x57f4efc4U, 0x94a7c2efU, 0xd596d9f6U, + 0xe9bc07aeU, 0xa88d1cb7U, 0x6bde319cU, 0x2aef2a85U, 0xed796bcaU, + 0xac4870d3U, 0x6f1b5df8U, 0x2e2a46e1U, 0xe136de66U, 0xa007c57fU, + 0x6354e854U, 0x2265f34dU, 
0xe5f3b202U, 0xa4c2a91bU, 0x67918430U, + 0x26a09f29U, 0xb8aec5e4U, 0xf99fdefdU, 0x3accf3d6U, 0x7bfde8cfU, + 0xbc6ba980U, 0xfd5ab299U, 0x3e099fb2U, 0x7f3884abU, 0xb0241c2cU, + 0xf1150735U, 0x32462a1eU, 0x73773107U, 0xb4e17048U, 0xf5d06b51U, + 0x3683467aU, 0x77b25d63U, 0x4ed7facbU, 0x0fe6e1d2U, 0xccb5ccf9U, + 0x8d84d7e0U, 0x4a1296afU, 0x0b238db6U, 0xc870a09dU, 0x8941bb84U, + 0x465d2303U, 0x076c381aU, 0xc43f1531U, 0x850e0e28U, 0x42984f67U, + 0x03a9547eU, 0xc0fa7955U, 0x81cb624cU, 0x1fc53881U, 0x5ef42398U, + 0x9da70eb3U, 0xdc9615aaU, 0x1b0054e5U, 0x5a314ffcU, 0x996262d7U, + 0xd85379ceU, 0x174fe149U, 0x567efa50U, 0x952dd77bU, 0xd41ccc62U, + 0x138a8d2dU, 0x52bb9634U, 0x91e8bb1fU, 0xd0d9a006U, 0xecf37e5eU, + 0xadc26547U, 0x6e91486cU, 0x2fa05375U, 0xe836123aU, 0xa9070923U, + 0x6a542408U, 0x2b653f11U, 0xe479a796U, 0xa548bc8fU, 0x661b91a4U, + 0x272a8abdU, 0xe0bccbf2U, 0xa18dd0ebU, 0x62defdc0U, 0x23efe6d9U, + 0xbde1bc14U, 0xfcd0a70dU, 0x3f838a26U, 0x7eb2913fU, 0xb924d070U, + 0xf815cb69U, 0x3b46e642U, 0x7a77fd5bU, 0xb56b65dcU, 0xf45a7ec5U, + 0x370953eeU, 0x763848f7U, 0xb1ae09b8U, 0xf09f12a1U, 0x33cc3f8aU, + 0x72fd2493U + }, + { + 0x00000000U, 0x376ac201U, 0x6ed48403U, 0x59be4602U, 0xdca80907U, + 0xebc2cb06U, 0xb27c8d04U, 0x85164f05U, 0xb851130eU, 0x8f3bd10fU, + 0xd685970dU, 0xe1ef550cU, 0x64f91a09U, 0x5393d808U, 0x0a2d9e0aU, + 0x3d475c0bU, 0x70a3261cU, 0x47c9e41dU, 0x1e77a21fU, 0x291d601eU, + 0xac0b2f1bU, 0x9b61ed1aU, 0xc2dfab18U, 0xf5b56919U, 0xc8f23512U, + 0xff98f713U, 0xa626b111U, 0x914c7310U, 0x145a3c15U, 0x2330fe14U, + 0x7a8eb816U, 0x4de47a17U, 0xe0464d38U, 0xd72c8f39U, 0x8e92c93bU, + 0xb9f80b3aU, 0x3cee443fU, 0x0b84863eU, 0x523ac03cU, 0x6550023dU, + 0x58175e36U, 0x6f7d9c37U, 0x36c3da35U, 0x01a91834U, 0x84bf5731U, + 0xb3d59530U, 0xea6bd332U, 0xdd011133U, 0x90e56b24U, 0xa78fa925U, + 0xfe31ef27U, 0xc95b2d26U, 0x4c4d6223U, 0x7b27a022U, 0x2299e620U, + 0x15f32421U, 0x28b4782aU, 0x1fdeba2bU, 0x4660fc29U, 0x710a3e28U, + 0xf41c712dU, 0xc376b32cU, 0x9ac8f52eU, 0xada2372fU, 
0xc08d9a70U, + 0xf7e75871U, 0xae591e73U, 0x9933dc72U, 0x1c259377U, 0x2b4f5176U, + 0x72f11774U, 0x459bd575U, 0x78dc897eU, 0x4fb64b7fU, 0x16080d7dU, + 0x2162cf7cU, 0xa4748079U, 0x931e4278U, 0xcaa0047aU, 0xfdcac67bU, + 0xb02ebc6cU, 0x87447e6dU, 0xdefa386fU, 0xe990fa6eU, 0x6c86b56bU, + 0x5bec776aU, 0x02523168U, 0x3538f369U, 0x087faf62U, 0x3f156d63U, + 0x66ab2b61U, 0x51c1e960U, 0xd4d7a665U, 0xe3bd6464U, 0xba032266U, + 0x8d69e067U, 0x20cbd748U, 0x17a11549U, 0x4e1f534bU, 0x7975914aU, + 0xfc63de4fU, 0xcb091c4eU, 0x92b75a4cU, 0xa5dd984dU, 0x989ac446U, + 0xaff00647U, 0xf64e4045U, 0xc1248244U, 0x4432cd41U, 0x73580f40U, + 0x2ae64942U, 0x1d8c8b43U, 0x5068f154U, 0x67023355U, 0x3ebc7557U, + 0x09d6b756U, 0x8cc0f853U, 0xbbaa3a52U, 0xe2147c50U, 0xd57ebe51U, + 0xe839e25aU, 0xdf53205bU, 0x86ed6659U, 0xb187a458U, 0x3491eb5dU, + 0x03fb295cU, 0x5a456f5eU, 0x6d2fad5fU, 0x801b35e1U, 0xb771f7e0U, + 0xeecfb1e2U, 0xd9a573e3U, 0x5cb33ce6U, 0x6bd9fee7U, 0x3267b8e5U, + 0x050d7ae4U, 0x384a26efU, 0x0f20e4eeU, 0x569ea2ecU, 0x61f460edU, + 0xe4e22fe8U, 0xd388ede9U, 0x8a36abebU, 0xbd5c69eaU, 0xf0b813fdU, + 0xc7d2d1fcU, 0x9e6c97feU, 0xa90655ffU, 0x2c101afaU, 0x1b7ad8fbU, + 0x42c49ef9U, 0x75ae5cf8U, 0x48e900f3U, 0x7f83c2f2U, 0x263d84f0U, + 0x115746f1U, 0x944109f4U, 0xa32bcbf5U, 0xfa958df7U, 0xcdff4ff6U, + 0x605d78d9U, 0x5737bad8U, 0x0e89fcdaU, 0x39e33edbU, 0xbcf571deU, + 0x8b9fb3dfU, 0xd221f5ddU, 0xe54b37dcU, 0xd80c6bd7U, 0xef66a9d6U, + 0xb6d8efd4U, 0x81b22dd5U, 0x04a462d0U, 0x33cea0d1U, 0x6a70e6d3U, + 0x5d1a24d2U, 0x10fe5ec5U, 0x27949cc4U, 0x7e2adac6U, 0x494018c7U, + 0xcc5657c2U, 0xfb3c95c3U, 0xa282d3c1U, 0x95e811c0U, 0xa8af4dcbU, + 0x9fc58fcaU, 0xc67bc9c8U, 0xf1110bc9U, 0x740744ccU, 0x436d86cdU, + 0x1ad3c0cfU, 0x2db902ceU, 0x4096af91U, 0x77fc6d90U, 0x2e422b92U, + 0x1928e993U, 0x9c3ea696U, 0xab546497U, 0xf2ea2295U, 0xc580e094U, + 0xf8c7bc9fU, 0xcfad7e9eU, 0x9613389cU, 0xa179fa9dU, 0x246fb598U, + 0x13057799U, 0x4abb319bU, 0x7dd1f39aU, 0x3035898dU, 0x075f4b8cU, + 0x5ee10d8eU, 0x698bcf8fU, 0xec9d808aU, 
0xdbf7428bU, 0x82490489U, + 0xb523c688U, 0x88649a83U, 0xbf0e5882U, 0xe6b01e80U, 0xd1dadc81U, + 0x54cc9384U, 0x63a65185U, 0x3a181787U, 0x0d72d586U, 0xa0d0e2a9U, + 0x97ba20a8U, 0xce0466aaU, 0xf96ea4abU, 0x7c78ebaeU, 0x4b1229afU, + 0x12ac6fadU, 0x25c6adacU, 0x1881f1a7U, 0x2feb33a6U, 0x765575a4U, + 0x413fb7a5U, 0xc429f8a0U, 0xf3433aa1U, 0xaafd7ca3U, 0x9d97bea2U, + 0xd073c4b5U, 0xe71906b4U, 0xbea740b6U, 0x89cd82b7U, 0x0cdbcdb2U, + 0x3bb10fb3U, 0x620f49b1U, 0x55658bb0U, 0x6822d7bbU, 0x5f4815baU, + 0x06f653b8U, 0x319c91b9U, 0xb48adebcU, 0x83e01cbdU, 0xda5e5abfU, + 0xed3498beU + }, + { + 0x00000000U, 0x6567bcb8U, 0x8bc809aaU, 0xeeafb512U, 0x5797628fU, + 0x32f0de37U, 0xdc5f6b25U, 0xb938d79dU, 0xef28b4c5U, 0x8a4f087dU, + 0x64e0bd6fU, 0x018701d7U, 0xb8bfd64aU, 0xddd86af2U, 0x3377dfe0U, + 0x56106358U, 0x9f571950U, 0xfa30a5e8U, 0x149f10faU, 0x71f8ac42U, + 0xc8c07bdfU, 0xada7c767U, 0x43087275U, 0x266fcecdU, 0x707fad95U, + 0x1518112dU, 0xfbb7a43fU, 0x9ed01887U, 0x27e8cf1aU, 0x428f73a2U, + 0xac20c6b0U, 0xc9477a08U, 0x3eaf32a0U, 0x5bc88e18U, 0xb5673b0aU, + 0xd00087b2U, 0x6938502fU, 0x0c5fec97U, 0xe2f05985U, 0x8797e53dU, + 0xd1878665U, 0xb4e03addU, 0x5a4f8fcfU, 0x3f283377U, 0x8610e4eaU, + 0xe3775852U, 0x0dd8ed40U, 0x68bf51f8U, 0xa1f82bf0U, 0xc49f9748U, + 0x2a30225aU, 0x4f579ee2U, 0xf66f497fU, 0x9308f5c7U, 0x7da740d5U, + 0x18c0fc6dU, 0x4ed09f35U, 0x2bb7238dU, 0xc518969fU, 0xa07f2a27U, + 0x1947fdbaU, 0x7c204102U, 0x928ff410U, 0xf7e848a8U, 0x3d58149bU, + 0x583fa823U, 0xb6901d31U, 0xd3f7a189U, 0x6acf7614U, 0x0fa8caacU, + 0xe1077fbeU, 0x8460c306U, 0xd270a05eU, 0xb7171ce6U, 0x59b8a9f4U, + 0x3cdf154cU, 0x85e7c2d1U, 0xe0807e69U, 0x0e2fcb7bU, 0x6b4877c3U, + 0xa20f0dcbU, 0xc768b173U, 0x29c70461U, 0x4ca0b8d9U, 0xf5986f44U, + 0x90ffd3fcU, 0x7e5066eeU, 0x1b37da56U, 0x4d27b90eU, 0x284005b6U, + 0xc6efb0a4U, 0xa3880c1cU, 0x1ab0db81U, 0x7fd76739U, 0x9178d22bU, + 0xf41f6e93U, 0x03f7263bU, 0x66909a83U, 0x883f2f91U, 0xed589329U, + 0x546044b4U, 0x3107f80cU, 0xdfa84d1eU, 0xbacff1a6U, 0xecdf92feU, + 
0x89b82e46U, 0x67179b54U, 0x027027ecU, 0xbb48f071U, 0xde2f4cc9U, + 0x3080f9dbU, 0x55e74563U, 0x9ca03f6bU, 0xf9c783d3U, 0x176836c1U, + 0x720f8a79U, 0xcb375de4U, 0xae50e15cU, 0x40ff544eU, 0x2598e8f6U, + 0x73888baeU, 0x16ef3716U, 0xf8408204U, 0x9d273ebcU, 0x241fe921U, + 0x41785599U, 0xafd7e08bU, 0xcab05c33U, 0x3bb659edU, 0x5ed1e555U, + 0xb07e5047U, 0xd519ecffU, 0x6c213b62U, 0x094687daU, 0xe7e932c8U, + 0x828e8e70U, 0xd49eed28U, 0xb1f95190U, 0x5f56e482U, 0x3a31583aU, + 0x83098fa7U, 0xe66e331fU, 0x08c1860dU, 0x6da63ab5U, 0xa4e140bdU, + 0xc186fc05U, 0x2f294917U, 0x4a4ef5afU, 0xf3762232U, 0x96119e8aU, + 0x78be2b98U, 0x1dd99720U, 0x4bc9f478U, 0x2eae48c0U, 0xc001fdd2U, + 0xa566416aU, 0x1c5e96f7U, 0x79392a4fU, 0x97969f5dU, 0xf2f123e5U, + 0x05196b4dU, 0x607ed7f5U, 0x8ed162e7U, 0xebb6de5fU, 0x528e09c2U, + 0x37e9b57aU, 0xd9460068U, 0xbc21bcd0U, 0xea31df88U, 0x8f566330U, + 0x61f9d622U, 0x049e6a9aU, 0xbda6bd07U, 0xd8c101bfU, 0x366eb4adU, + 0x53090815U, 0x9a4e721dU, 0xff29cea5U, 0x11867bb7U, 0x74e1c70fU, + 0xcdd91092U, 0xa8beac2aU, 0x46111938U, 0x2376a580U, 0x7566c6d8U, + 0x10017a60U, 0xfeaecf72U, 0x9bc973caU, 0x22f1a457U, 0x479618efU, + 0xa939adfdU, 0xcc5e1145U, 0x06ee4d76U, 0x6389f1ceU, 0x8d2644dcU, + 0xe841f864U, 0x51792ff9U, 0x341e9341U, 0xdab12653U, 0xbfd69aebU, + 0xe9c6f9b3U, 0x8ca1450bU, 0x620ef019U, 0x07694ca1U, 0xbe519b3cU, + 0xdb362784U, 0x35999296U, 0x50fe2e2eU, 0x99b95426U, 0xfcdee89eU, + 0x12715d8cU, 0x7716e134U, 0xce2e36a9U, 0xab498a11U, 0x45e63f03U, + 0x208183bbU, 0x7691e0e3U, 0x13f65c5bU, 0xfd59e949U, 0x983e55f1U, + 0x2106826cU, 0x44613ed4U, 0xaace8bc6U, 0xcfa9377eU, 0x38417fd6U, + 0x5d26c36eU, 0xb389767cU, 0xd6eecac4U, 0x6fd61d59U, 0x0ab1a1e1U, + 0xe41e14f3U, 0x8179a84bU, 0xd769cb13U, 0xb20e77abU, 0x5ca1c2b9U, + 0x39c67e01U, 0x80fea99cU, 0xe5991524U, 0x0b36a036U, 0x6e511c8eU, + 0xa7166686U, 0xc271da3eU, 0x2cde6f2cU, 0x49b9d394U, 0xf0810409U, + 0x95e6b8b1U, 0x7b490da3U, 0x1e2eb11bU, 0x483ed243U, 0x2d596efbU, + 0xc3f6dbe9U, 0xa6916751U, 0x1fa9b0ccU, 0x7ace0c74U, 
0x9461b966U, + 0xf10605deU +#endif + } +}; + +#ifdef NO_ENDIAN +// Currently not in use, always use the BYFOUR method with known endianness +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 +/* Byte-at-a-time crc32(): inverts crc, folds in len bytes from buf via crc_table[0] (eight per loop pass while at least eight remain) and returns the inverted result. Returns 0 when buf is NULL. NOTE(review): old-style (K&R) parameter declarations; C23 removed this definition form. */ +/* ========================================================================= */ +uint32_t crc32(crc, buf, len) + uint32_t crc; + const uint8_t *buf; + size_t len; +{ + if (buf == NULL) return 0; + + crc = crc ^ 0xffffffffU; + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc ^ 0xffffffffU; +} +#endif + +#ifdef BYFOUR + +/* + This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit + integer pointer type. This violates the strict aliasing rule, where a + compiler can assume, for optimization purposes, that two pointers to + fundamentally different types won't ever point to the same memory. This can + manifest as a problem only if one of the pointers is written to. This code + only reads from those pointers. So long as this code remains isolated in + this compilation unit, there won't be a problem. For this reason, this code + should not be copied and pasted into a compilation unit in which other code + writes to the buffer that is passed to these routines. 
+ */ + +#ifdef LITTLE_ENDIAN +/* ========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 +/* Little-endian crc32(): starts from ~crc, consumes single bytes until buf is 4-byte aligned, then 32-bit words (32 and 4 bytes per step, tables 0..3), then any trailing bytes, and returns the inverted result. Returns 0 when buf is NULL. NOTE(review): glibc's endian.h also defines LITTLE_ENDIAN and BIG_ENDIAN; confirm that only one of the two #ifdef branches can be active, otherwise crc32 is defined twice. */ +/* ========================================================================= */ +uint32_t crc32(crc, buf, len) + uint32_t crc; + const uint8_t *buf; + size_t len; +{ + if (buf == NULL) return 0; + register uint32_t c; + register const uint32_t FAR *buf4; + + c = ~crc; + while (len && ((uintptr_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const uint32_t FAR *)(const void FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const uint8_t FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return c; +} +#endif + +#ifdef BIG_ENDIAN +/* ========================================================================= */ +#define DOBIG4 c ^= *buf4++; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 +/* Big-endian crc32(): same alignment / word / tail structure as the little-endian variant, but using tables 4..7 and shifting left. crc is passed through net_order_32() on entry and the result again on return (net_order_32 is defined outside this file). Returns 0 when buf is NULL. */ +/* ========================================================================= */ +uint32_t crc32(crc, buf, len) + uint32_t crc; + const uint8_t *buf; + size_t len; +{ + if (buf == NULL) return 0; + register uint32_t c; + register const uint32_t FAR *buf4; + + c = ~net_order_32(crc); + while (len && ((uintptr_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const uint32_t FAR *)(const void FAR *)buf; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf = (const uint8_t FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ 
*buf++] ^ (c << 8); + } while (--len); + c = ~c; + return net_order_32(c); +} +#endif + +#endif /* BYFOUR */ + diff --git a/src/shared/crc32.h b/src/shared/crc32.h new file mode 100644 index 0000000..00b8bdd --- /dev/null +++ b/src/shared/crc32.h @@ -0,0 +1,9 @@ +#ifndef _CRC32_H_ +#define _CRC32_H_ + +#include +/** Continue the CRC-32 value crc over len bytes at buf and return the new value; the implementations in crc32.c return 0 if buf is NULL. */ +uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len); + +#endif + diff --git a/src/shared/fdsignal.c b/src/shared/fdsignal.c new file mode 100644 index 0000000..5e5cf7f --- /dev/null +++ b/src/shared/fdsignal.c @@ -0,0 +1,14 @@ +#include "fdsignal.h" + +#if defined(linux) || defined(__linux) || defined(__linux__) +//#warning "Using eventfd based signalling" +#include "fdsignal.inc/eventfd.c" +#elif __SIZEOF_INT__ == 4 && __SIZEOF_POINTER__ == 8 +//#warning "Using pointer-packing pipe based signalling" +#include "fdsignal.inc/pipe64.c" +#else +_Static_assert( sizeof(int) != 4 || sizeof(void*) != 8, "Something's goofy, fix preprocessor check above!" ); +//#warning "Using fallback pipe based signalling" +#include "fdsignal.inc/pipe_malloc.c" +#endif + diff --git a/src/shared/fdsignal.h b/src/shared/fdsignal.h new file mode 100644 index 0000000..960a2a9 --- /dev/null +++ b/src/shared/fdsignal.h @@ -0,0 +1,57 @@ +#ifndef _FD_SIGNAL_H_ +#define _FD_SIGNAL_H_ + +#define SIGNAL_OK (0) +#define SIGNAL_TIMEOUT (-2) +#define SIGNAL_ERROR (-1) + +typedef struct _dnbd3_signal dnbd3_signal_t; + +/** + * Create a new signal, nonblocking. + * @return NULL on error, pointer to dnbd3_signal_t on success. + */ +dnbd3_signal_t* signal_new(); + +/** + * Create a new signal, blocking. + * @return NULL on error, pointer to dnbd3_signal_t on success. + */ +dnbd3_signal_t* signal_newBlocking(); + +/** + * Trigger the given signal, so a wait or clear call will succeed. + * @return SIGNAL_OK on success, SIGNAL_ERROR on error + */ +int signal_call(const dnbd3_signal_t* const signal); + +/** + * Wait for given signal, with an optional timeout. 
+ * If timeout == 0, just poll once. + * If timeout < 0, wait forever. + * @return > 0 telling how many times the signal was called, + * SIGNAL_TIMEOUT if the timeout was reached, + * SIGNAL_ERROR if some error occurred + */ +int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs); + +/** + * Clears any pending signals on this signal. + * @return number of signals that were pending, + * SIGNAL_ERROR if some error occurred + */ +int signal_clear(const dnbd3_signal_t* const signal); + +/** + * Close the given signal. + */ +void signal_close(const dnbd3_signal_t* const signal); + +/** + * Get a file descriptor for the given signal that can be + * waited on using poll or similar. + * @return -1 if the signal is invalid + */ +int signal_getWaitFd(const dnbd3_signal_t* const signal); + +#endif diff --git a/src/shared/fdsignal.inc/eventfd.c b/src/shared/fdsignal.inc/eventfd.c new file mode 100644 index 0000000..358d41c --- /dev/null +++ b/src/shared/fdsignal.inc/eventfd.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include + +/* + * Linux implementation of signals. + * Internally, eventfds are used for signalling, as they + * provide the least overhead. We don't allocate any struct + * ever, but cast the event fd+1 to dnbd3_signal_t* + * to save all the malloc() and free() calls. + */ + +dnbd3_signal_t* signal_new() +{ + // On error, eventfd() returns -1, so essentially we return NULL on error. + // (Yes, NULL doesn't have to be 0 everywhere, but cmon) + return (dnbd3_signal_t*)(intptr_t)( eventfd( 0, EFD_NONBLOCK ) + 1 ); +} +/* Same as signal_new(), but the eventfd is created without EFD_NONBLOCK. */ +dnbd3_signal_t* signal_newBlocking() +{ + return (dnbd3_signal_t*)(intptr_t)( eventfd( 0, 0 ) + 1 ); +} +/* Writes a 64-bit 1 to the eventfd (the fd is recovered as (int)signal - 1); SIGNAL_OK only if the full value was written. */ +int signal_call(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + static const uint64_t one = 1; + const int signalFd = ( (int)(intptr_t)signal ) - 1; + return write( signalFd, &one, sizeof one ) == sizeof one ? 
SIGNAL_OK : SIGNAL_ERROR; +} +/* Polls the fd for POLLIN with the given timeout, then returns the result of signal_clear(). NOTE(review): signal_clear() returns 0 when its read fails with EAGAIN, so 0 is a possible return value here although the header documents > 0 on success; confirm callers handle it. */ +int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + const int signalFd = ( (int)(intptr_t)signal ) - 1; + struct pollfd ps = { + .fd = signalFd, + .events = POLLIN + }; + int ret = poll( &ps, 1, timeoutMs ); + if ( ret == 0 ) return SIGNAL_TIMEOUT; + if ( ret == -1 ) return SIGNAL_ERROR; + if ( ps.revents & ( POLLERR | POLLNVAL ) ) return SIGNAL_ERROR; + return signal_clear( signal ); +} +/* Reads the 64-bit value from the eventfd and returns it cast to int (values above INT_MAX do not fit); returns 0 if the read fails with EAGAIN, SIGNAL_ERROR on any other short or failed read. */ +int signal_clear(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + uint64_t ret; + const int signalFd = ( (int)(intptr_t)signal ) - 1; + if ( read( signalFd, &ret, sizeof ret ) != sizeof ret ) { + if ( errno == EAGAIN ) return 0; + return SIGNAL_ERROR; + } + return (int)ret; +} +/* NOTE(review): no NULL check here, unlike the other functions; a NULL signal decodes to fd -1, which is then passed to close(). */ +void signal_close(const dnbd3_signal_t* const signal) +{ + const int signalFd = ( (int)(intptr_t)signal ) - 1; + close( signalFd ); +} +/* Returns the underlying eventfd, or -1 for a NULL signal. */ +int signal_getWaitFd(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return -1; + const int signalFd = ( (int)(intptr_t)signal ) - 1; + return signalFd; +} + diff --git a/src/shared/fdsignal.inc/pipe64.c b/src/shared/fdsignal.inc/pipe64.c new file mode 100644 index 0000000..4f0614b --- /dev/null +++ b/src/shared/fdsignal.inc/pipe64.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include + +#define P_READ (0) +#define P_WRITE (1) + +/* + * Generic (posix) implementation of signals, using pipes. + * 64bit version, packing two ints into a pointer. + * This version requires that you use -fno-strict-aliasing + * since it's doing evil pointer casting. 
+ */ + +dnbd3_signal_t* signal_new() +{ + int fds[2]; + if ( pipe( fds ) == -1 ) return NULL; + fcntl( fds[P_READ], F_SETFL, O_NONBLOCK ); + fcntl( fds[P_WRITE], F_SETFL, O_NONBLOCK ); + return (dnbd3_signal_t*)*((uintptr_t*)fds); +} + +dnbd3_signal_t* signal_newBlocking() +{ + int fds[2]; + if ( pipe( fds ) == -1 ) return NULL; + return (dnbd3_signal_t*)*((uintptr_t*)fds); +} + +int signal_call(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + static char one = 1; + const int* fds = (int*)&signal; + // Write one byte on every call, so the number of bytes read will + // match the number of events + return write( fds[P_WRITE], &one, 1 ) > 0 ? SIGNAL_OK : SIGNAL_ERROR; +} + +int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + const int* fds = (int*)&signal; + struct pollfd ps = { + .fd = fds[P_READ], + .events = POLLIN + }; + int ret = poll( &ps, 1, timeoutMs ); + if ( ret == 0 ) return SIGNAL_TIMEOUT; + if ( ret == -1 ) return SIGNAL_ERROR; + if ( ps.revents & ( POLLERR | POLLNVAL ) ) return SIGNAL_ERROR; + return signal_clear( signal ); +} + +int signal_clear(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + char throwaway[100]; + const int* fds = (int*)&signal; + ssize_t ret, total = 0; + do { + ret = read( fds[P_READ], throwaway, sizeof throwaway ); + if ( ret < 0 ) { + if ( errno == EAGAIN ) return total; + return SIGNAL_ERROR; + } + total += ret; + } while ( (size_t)ret == sizeof throwaway ); + return (int)total; +} + +void signal_close(const dnbd3_signal_t* const signal) +{ + const int* fds = (int*)&signal; + close( fds[P_READ] ); + close( fds[P_WRITE] ); +} + +int signal_getWaitFd(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return -1; + const int* fds = (int*)&signal; + return fds[P_READ]; +} + diff --git a/src/shared/fdsignal.inc/pipe_malloc.c b/src/shared/fdsignal.inc/pipe_malloc.c new file mode 100644 
index 0000000..b23ddcd --- /dev/null +++ b/src/shared/fdsignal.inc/pipe_malloc.c @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include + +struct _dnbd3_signal { + int read; + int write; +}; + +/* + * Generic (posix) implelentation of signals, using pipes. + * A struct containing both fds will be malloc()ed for each + * signal. + */ + +dnbd3_signal_t* signal_new() +{ + dnbd3_signal_t *ret = signal_newBlocking(); + if ( ret == NULL ) return NULL; + fcntl( ret->read, F_SETFL, O_NONBLOCK ); + fcntl( ret->write, F_SETFL, O_NONBLOCK ); + return ret; +} + +dnbd3_signal_t* signal_newBlocking() +{ + int fds[2]; + if ( pipe( fds ) == -1 ) return NULL; + dnbd3_signal_t* ret = malloc( sizeof(dnbd3_signal_t) ); + ret->read = fds[0]; + ret->write = fds[1]; + return ret; +} + +int signal_call(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + static char one = 1; + // Write one byte on every call, so the number of bytes read will + // match the number of events + return write( signal->write, &one, 1 ) > 0 ? 
SIGNAL_OK : SIGNAL_ERROR; +} + +int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + struct pollfd ps = { + .fd = signal->read, + .events = POLLIN + }; + int ret = poll( &ps, 1, timeoutMs ); + if ( ret == 0 ) return SIGNAL_TIMEOUT; + if ( ret == -1 ) return SIGNAL_ERROR; + if ( ps.revents & ( POLLERR | POLLNVAL ) ) return SIGNAL_ERROR; + return signal_clear( signal ); +} + +int signal_clear(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return SIGNAL_ERROR; + char throwaway[100]; + ssize_t ret, total = 0; + do { + ret = read( signal->read, throwaway, sizeof throwaway ); + if ( ret < 0 ) { + if ( errno == EAGAIN ) return (int)total; + return SIGNAL_ERROR; + } + total += ret; + } while ( (size_t)ret == sizeof throwaway ); + return (int)total; +} + +void signal_close(const dnbd3_signal_t* const signal) +{ + close( signal->read ); + close( signal->write ); + free( (void*)signal ); +} + +int signal_getWaitFd(const dnbd3_signal_t* const signal) +{ + if ( signal == NULL ) return -1; + return signal->read; +} + diff --git a/src/shared/log.c b/src/shared/log.c new file mode 100644 index 0000000..055acb4 --- /dev/null +++ b/src/shared/log.c @@ -0,0 +1,204 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Simon Rettberg + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + */ + +#include "log.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define LINE_LEN (800) + +static pthread_mutex_t logLock = PTHREAD_MUTEX_INITIALIZER; +static _Atomic logmask_t maskFile = 31; +static _Atomic logmask_t maskCon = 15; + +static char *logFile = NULL; +static int logFd = -1; + +static bool consoleTimestamps = false; + + +static int writeLevel(char *buffer, logmask_t level); + + +bool log_hasMask(const logmask_t mask) +{ + return ( ( maskFile | maskCon ) & mask ) == mask; +} + +void log_setFileMask(logmask_t mask) +{ + maskFile = mask; +} + +void log_setConsoleMask(logmask_t mask) +{ + maskCon = mask; +} + +void log_setConsoleTimestamps(bool on) +{ + consoleTimestamps = on; +} + +bool log_openLogFile(const char *path) +{ + pthread_mutex_lock( &logLock ); + if ( logFd >= 0 ) { + close( logFd ); + } + if ( path == NULL && logFile == NULL ) + goto unlock; + if ( path != NULL ) { + free( logFile ); + logFile = strdup( path ); + } + logFd = open( logFile, O_WRONLY | O_CREAT | O_APPEND, 0644 ); + if ( logFd < 0 ) + goto unlock; +unlock: ; + pthread_mutex_unlock( &logLock ); + return logFd >= 0; +} + +void logadd(const logmask_t mask, const char *fmt, ...) +{ + if ( ( (maskFile | maskCon) & mask ) == 0 ) + return; + va_list ap; + int ret; + time_t rawtime; + struct tm timeinfo; + char buffer[LINE_LEN]; + bool toFile = maskFile & mask; + bool toStdout = maskCon & mask; + size_t offset; + + if ( toFile || ( toStdout && consoleTimestamps ) ) { + time( &rawtime ); + localtime_r( &rawtime, &timeinfo ); + offset = strftime( buffer, LINE_LEN, "[%d.%m. 
%H:%M:%S] ", &timeinfo ); + } else { + offset = 0; + } + const char *stdoutLine = buffer + offset; + offset += writeLevel( buffer + offset, mask ); + va_start( ap, fmt ); + ret = vsnprintf( buffer + offset, LINE_LEN - offset, fmt, ap ); + va_end( ap ); + if ( ret < 0 ) return; + offset += ret; + if ( offset + 1 >= LINE_LEN ) { + buffer[LINE_LEN-2] = '\0'; + offset = LINE_LEN - 2; + } + if ( buffer[offset-1] != '\n' ) { + buffer[offset++] = '\n'; + buffer[offset] = '\0'; + } + if ( toFile ) { + pthread_mutex_lock( &logLock ); + if ( logFd >= 0 ) { + size_t done = 0; + while (done < offset ) { + const ssize_t wr = write( logFd, buffer + done, offset - done ); + if ( wr < 0 ) { + if ( errno == EINTR ) continue; + printf( "Logging to file failed! (errno=%d)\n", errno ); + break; + } + done += (size_t)wr; + } + } + pthread_mutex_unlock( &logLock ); + } + if ( toStdout ) { + if ( consoleTimestamps ) stdoutLine = buffer; +#ifdef AFL_MODE + fputs( stdoutLine, stderr ); + fflush( stderr ); +#else + fputs( stdoutLine, stdout ); + fflush( stdout ); +#endif + } +} + +ssize_t log_fetch(char *buffer, int size) +{ + if ( logFile == NULL || size <= 1 ) + return -1; + int fd = open( logFile, O_RDONLY ); + if ( fd < 0 ) + return -1; + off_t off = lseek( fd, 0, SEEK_END ); + if ( off == (off_t)-1 ) { + close( fd ); + return -1; + } + if ( (off_t)size <= off ) { + off -= size; + } else { + off = 0; + } + ssize_t ret = pread( fd, buffer, size - 1, off ); + close( fd ); + buffer[ret] = '\0'; + return ret; +} + +static int writeLevel(char *buffer, logmask_t level) +{ + const char *word; + char *dest = buffer; + switch ( level ) { + case LOG_ERROR: + word = "ERROR"; + break; + case LOG_WARNING: + word = "WARNING"; + break; + case LOG_MINOR: + word = "Warning"; + break; + case LOG_INFO: + word = "Info"; + break; + case LOG_DEBUG1: + word = "DEBUG1"; + break; + case LOG_DEBUG2: + word = "DEBUG2"; + break; + default: + word = "!?!?!?"; + break; + } + while ( ( *dest++ = *word++ ) ); + 
*--dest = ':'; + *++dest = ' '; + return (int)( dest - buffer ) + 1; +} + diff --git a/src/shared/log.h b/src/shared/log.h new file mode 100644 index 0000000..5b1e8f7 --- /dev/null +++ b/src/shared/log.h @@ -0,0 +1,65 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Simon Rettberg + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef LOG_H_ +#define LOG_H_ + +#include +#include + +typedef unsigned int logmask_t; +#define LOG_ERROR ((logmask_t)1) // Fatal error, server will terminate +#define LOG_WARNING ((logmask_t)2) // Major issue, something is broken but keep running +#define LOG_MINOR ((logmask_t)4) // Minor issue, more of a hickup than serious problem +#define LOG_INFO ((logmask_t)8) // Informational message +#define LOG_DEBUG1 ((logmask_t)16) // Debug information, use this for non-spammy stuff +#define LOG_DEBUG2 ((logmask_t)32) // Use this for debug messages that will show up a lot + + +/** + * Check if cansoleMask | fileMask has all of mask set. + */ +bool log_hasMask(const logmask_t mask); + +void log_setFileMask(logmask_t mask); + +void log_setConsoleMask(logmask_t mask); + +void log_setConsoleTimestamps(bool on); + +/** + * Open or reopen the log file. If path is NULL and the + * function was called with a path before, the same path + * will be used again. 
+ */ +bool log_openLogFile(const char *path); + +/** + * Add a line to the log + */ +void logadd(const logmask_t mask, const char *text, ...) + __attribute__ ((format (printf, 2, 3))); + +/** + * Return last size bytes of log. + */ +ssize_t log_fetch(char *buffer, int size); + +#endif /* LOG_H_ */ diff --git a/src/shared/protocol.h b/src/shared/protocol.h new file mode 100644 index 0000000..d87bbd8 --- /dev/null +++ b/src/shared/protocol.h @@ -0,0 +1,159 @@ +#ifndef _PROTOCOL_H_ +#define _PROTOCOL_H_ + +#include "sockhelper.h" + +#include "../types.h" +#include "../serialize.h" + +#include +#include +#include +#include + +// Client tells server that it is another server +#define FLAGS8_SERVER (1) +// Client (which is a proxy) tells server that it has background-replication enabled +#define FLAGS8_BG_REP (2) + +// 2017-10-16: We now support hop-counting, macro to pass hop count conditinally to a function +#define COND_HOPCOUNT(vers,hopcount) ( (vers) >= 3 ? (hopcount) : 0 ) + +// 2017-11-02: Macro to set flags in select image message properly if we're a server, as BG_REP depends on global var +#define SI_SERVER_FLAGS ( FLAGS8_SERVER | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) ) + +#define REPLY_OK (0) +#define REPLY_ERRNO (-1) +#define REPLY_AGAIN (-2) +#define REPLY_INTR (-3) +#define REPLY_CLOSED (-4) +#define REPLY_INCOMPLETE (-5) +#define REPLY_WRONGMAGIC (-6) + +static inline int dnbd3_read_reply(int sock, dnbd3_reply_t *reply, bool wait) +{ + ssize_t ret = recv( sock, reply, sizeof(*reply), (wait ? 
MSG_WAITALL : MSG_DONTWAIT) | MSG_NOSIGNAL ); + if ( ret == 0 ) return REPLY_CLOSED; + if ( ret < 0 ) { + if ( errno == EAGAIN || errno == EWOULDBLOCK ) return REPLY_AGAIN; + if ( errno == EINTR ) return REPLY_INTR; + return REPLY_ERRNO; + } + if ( !wait && ret != sizeof(*reply) ) ret += recv( sock, ((char*)reply) + ret, sizeof(*reply) - ret, MSG_WAITALL | MSG_NOSIGNAL ); + if ( ret != sizeof(*reply) ) return REPLY_INCOMPLETE; + fixup_reply( *reply ); + if ( reply->magic != dnbd3_packet_magic ) return REPLY_WRONGMAGIC; + return REPLY_OK; +} + +static inline bool dnbd3_get_reply(int sock, dnbd3_reply_t *reply) +{ + int ret; + do { + ret = dnbd3_read_reply( sock, reply, true ); + } while ( ret == REPLY_INTR ); + return ret == REPLY_OK; +} + +static inline bool dnbd3_select_image(int sock, const char *name, uint16_t rid, uint8_t flags8) +{ + serialized_buffer_t serialized; + dnbd3_request_t request; + struct iovec iov[2]; + serializer_reset_write( &serialized ); + serializer_put_uint16( &serialized, PROTOCOL_VERSION ); + serializer_put_string( &serialized, name ); + serializer_put_uint16( &serialized, rid ); + serializer_put_uint8( &serialized, flags8 ); + const ssize_t len = serializer_get_written_length( &serialized ); + request.magic = dnbd3_packet_magic; + request.cmd = CMD_SELECT_IMAGE; + request.size = (uint32_t)len; +#ifdef _DEBUG + request.handle = 0; + request.offset = 0; +#endif + fixup_request( request ); + iov[0].iov_base = &request; + iov[0].iov_len = sizeof(request); + iov[1].iov_base = &serialized; + iov[1].iov_len = len; + ssize_t ret; + do { + ret = writev( sock, iov, 2 ); + } while ( ret == -1 && errno == EINTR ); + return ret == len + (ssize_t)sizeof(request); +} + +static inline bool dnbd3_get_block(int sock, uint64_t offset, uint32_t size, uint64_t handle, uint8_t hopCount) +{ + dnbd3_request_t request; + request.magic = dnbd3_packet_magic; + request.handle = handle; + request.cmd = CMD_GET_BLOCK; + // When writing before "fixup", we can get away 
with assigning to offset instead of offset_small if we + // do it before assigning to .hops. Faster on 64bit machines (so, on everything) + request.offset = offset; + request.hops = hopCount; + request.size = size; + fixup_request( request ); + return sock_sendAll( sock, &request, sizeof(request), 2 ) == (ssize_t)sizeof(request); +} + +static inline bool dnbd3_get_crc32(int sock, uint32_t *master, void *buffer, size_t *bufferLen) +{ + dnbd3_request_t request; + dnbd3_reply_t reply; + request.magic = dnbd3_packet_magic; + request.handle = 0; + request.cmd = CMD_GET_CRC32; + request.offset = 0; + request.size = 0; + fixup_request( request ); + if ( sock_sendAll( sock, &request, sizeof(request), 2 ) != (ssize_t)sizeof(request) ) return false; + if ( !dnbd3_get_reply( sock, &reply ) ) return false; + if ( reply.size == 0 ) { + *bufferLen = 0; + return true; + } + if ( reply.size < 4 ) return false; + reply.size -= 4; + if ( reply.cmd != CMD_GET_CRC32 || reply.size > *bufferLen ) return false; + *bufferLen = reply.size; + if ( sock_recv( sock, master, sizeof(uint32_t) ) != (ssize_t)sizeof(uint32_t) ) return false; + return sock_recv( sock, buffer, reply.size ) == (ssize_t)reply.size; +} + +/** + * Pass a full serialized_buffer_t and a socket fd. Parsed data will be returned in further arguments. + * Note that all strings will point into the passed buffer, so there's no need to free them. + * This function will also read the header for you, as this message can only occur during connection, + * where no unrequested messages could arrive inbetween. 
+ */ +static inline bool dnbd3_select_image_reply(serialized_buffer_t *buffer, int sock, uint16_t *protocol_version, char **name, uint16_t *rid, + uint64_t *imageSize) +{ + errno = 0; + dnbd3_reply_t reply; + if ( !dnbd3_get_reply( sock, &reply ) ) { + return false; + } + errno = 0; + if ( reply.cmd != CMD_SELECT_IMAGE || reply.size < 3 || reply.size > MAX_PAYLOAD ) { + return false; + } + // receive reply payload + ssize_t ret = sock_recv( sock, buffer, reply.size ); + if ( ret != (ssize_t)reply.size ) { + return false; + } + // handle/check reply payload + serializer_reset_read( buffer, reply.size ); + *protocol_version = serializer_get_uint16( buffer ); + *name = serializer_get_string( buffer ); + *rid = serializer_get_uint16( buffer ); + *imageSize = serializer_get_uint64( buffer ); + return true; +} + +#endif diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c new file mode 100644 index 0000000..ab34aa1 --- /dev/null +++ b/src/shared/sockhelper.c @@ -0,0 +1,430 @@ +#include "sockhelper.h" +#include "log.h" +#include // inet_ntop +#include +#include +#include +#include +#include +#include +#include + +#define MAXLISTEN 20 + +struct _poll_list { + int count; + struct pollfd entry[MAXLISTEN]; +}; + +int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms) +{ + // TODO: Move out of here, this unit should contain general socket functions + // TODO: Abstract away from sockaddr_in* like the rest of the functions here do, + // so WITH_IPV6 can finally be removed as everything is transparent. b- but how? 
+ struct sockaddr_storage ss; + int proto, addrlen; + memset( &ss, 0, sizeof ss ); + if ( addr->type == HOST_IP4 ) { + // Set host (IPv4) + struct sockaddr_in *addr4 = (struct sockaddr_in*)&ss; + addr4->sin_family = AF_INET; + memcpy( &addr4->sin_addr, addr->addr, 4 ); + addr4->sin_port = addr->port; + proto = PF_INET; + addrlen = sizeof *addr4; + } +#ifdef WITH_IPV6 + else if ( addr->type == HOST_IP6 ) { + // Set host (IPv6) + struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)&ss; + addr6->sin6_family = AF_INET6; + memcpy( &addr6->sin6_addr, addr->addr, 16 ); + addr6->sin6_port = addr->port; + proto = PF_INET6; + addrlen = sizeof *addr6; + } +#endif + else { + logadd( LOG_DEBUG1, "Unsupported address type: %d\n", (int)addr->type ); + return -1; + } + int client_sock = socket( proto, SOCK_STREAM, IPPROTO_TCP ); + if ( client_sock == -1 ) return -1; + // Apply connect timeout + if ( connect_ms == -1 ) { + sock_set_nonblock( client_sock ); + } else { + sock_setTimeout( client_sock, connect_ms ); + } + for ( int i = 0; i < 5; ++i ) { + int ret = connect( client_sock, (struct sockaddr *)&ss, addrlen ); + if ( ret != -1 || errno == EINPROGRESS || errno == EISCONN ) break; + if ( errno == EINTR ) { + // http://www.madore.org/~david/computers/connect-intr.html +#ifdef __linux__ + continue; +#else + struct pollfd unix_really_sucks = { .fd = client_sock, .events = POLLOUT | POLLIN }; + while ( i-- > 0 ) { + int pr = poll( &unix_really_sucks, 1, connect_ms == 0 ? 
-1 : connect_ms ); + if ( pr == 1 && ( unix_really_sucks.revents & POLLOUT ) ) break; + if ( pr == -1 && errno == EINTR ) continue; + close( client_sock ); + return -1; + } + sockaddr_storage junk; + socklen_t more_junk = sizeof(junk); + if ( getpeername( client_sock, (struct sockaddr*)&junk, &more_junk ) == -1 ) { + close( client_sock ); + return -1; + } + break; +#endif + } // EINTR + close( client_sock ); + return -1; + } + if ( connect_ms != -1 && connect_ms != rw_ms ) { + // Apply read/write timeout + sock_setTimeout( client_sock, rw_ms ); + } + return client_sock; +} + +// TODO: Pretty much same as in server/* +int sock_resolveToDnbd3Host(const char * const address, dnbd3_host_t * const dest, const int count) +{ + if ( count <= 0 ) + return 0; + struct addrinfo hints, *res, *ptr; + char bufferAddr[100], bufferPort[6]; + char *addr = bufferAddr; + const char *portStr = NULL; + int addCount = 0; + + // See if we have a port + snprintf( bufferAddr, sizeof bufferAddr, "%s", address ); + char *c1, *c2; + c1 = strchr( addr, ':' ); + if ( c1 != NULL ) { + c2 = strchr( c1 + 1, ':' ); + if ( c2 == NULL ) { + *c1 = '\0'; + portStr = c1 + 1; + } else if ( *addr == '[' ) { + // IPv6 - support [1:2::3]:123 + do { + c1 = strchr( c2 + 1, ':' ); + if ( c1 != NULL ) c2 = c1; + } while ( c1 != NULL ); + if ( *(c2 - 1 ) == ']' ) { + *( c2 - 1 ) = '\0'; + *c2 = '\0'; + addr += 1; + portStr = c2 + 1; + } + } + } + if ( portStr == NULL ) { + portStr = bufferPort; + snprintf( bufferPort, sizeof bufferPort, "%d", (int)PORT ); + } + + // Set hints for local addresses. 
+ memset( &hints, 0, sizeof( hints ) ); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if ( getaddrinfo( addr, portStr, &hints, &res ) != 0 || res == NULL ) { + return 0; + } + for ( ptr = res; ptr != NULL && count > 0; ptr = ptr->ai_next ) { + if ( sock_sockaddrToDnbd3( ptr->ai_addr, &dest[addCount] ) ) { + addCount += 1; + } + } + + freeaddrinfo( res ); + return addCount; +} + +bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host) +{ + if ( sa->sa_family == AF_INET ) { + // Set host (IPv4) + struct sockaddr_in *addr4 = (struct sockaddr_in*)sa; + host->type = HOST_IP4; + host->port = addr4->sin_port; + memcpy( host->addr, &addr4->sin_addr, 4 ); + return true; + } +#ifdef WITH_IPV6 + if ( sa->sa_family == AF_INET6 ) { + // Set host (IPv6) + struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)sa; + host->type = HOST_IP6; + host->port = addr6->sin6_port; + memcpy( host->addr, &addr6->sin6_addr, 16 ); + return true; + } +#endif + return false; +} + +void sock_setTimeout(const int sockfd, const int milliseconds) +{ + struct timeval tv; + tv.tv_sec = milliseconds / 1000; + tv.tv_usec = (milliseconds * 1000) % 1000000; + setsockopt( sockfd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) ); + setsockopt( sockfd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv) ); +} + +poll_list_t* sock_newPollList() +{ + poll_list_t *list = (poll_list_t*)malloc( sizeof( poll_list_t ) ); + list->count = 0; + return list; +} + +void sock_destroyPollList(poll_list_t *list) +{ + for ( int i = 0; i < list->count; ++i ) { + if ( list->entry[i].fd >= 0 ) close( list->entry[i].fd ); + } + free( list ); +} + +size_t sock_printHost(const dnbd3_host_t * const host, char * const buffer, const size_t len) +{ + // Worst case: Port 5 chars, ':' to separate ip and port 1 char, terminating null 1 char = 7, [] for IPv6 + if ( len < 10 ) return 0; + char *output = buffer; + if ( host->type == HOST_IP6 ) { + *output++ = '['; + inet_ntop( AF_INET6, host->addr, output, (socklen_t)( len - 10 
) ); + output += strlen( output ); + *output++ = ']'; + } else if ( host->type == HOST_IP4 ) { + inet_ntop( AF_INET, host->addr, output, (socklen_t)( len - 8 ) ); + output += strlen( output ); + } else { + int ret = snprintf( output, len, "", (int)host->type ); + if ( ret <= 0 ) return 0; + return MIN( (size_t)ret, len-1 ); + } + *output = '\0'; + if ( host->port != 0 ) { + // There are still at least 7 bytes left in the buffer, port is at most 5 bytes + ':' + '\0' = 7 + int ret = snprintf( output, 7, ":%d", (int)ntohs( host->port ) ); + if ( ret < 0 ) ret = 0; + output += MIN( ret, 6 ); + } + return output - buffer; +} + +size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLen, char *output, const size_t len) +{ + char host[100], port[10]; + int outlen = 0; + int ret = getnameinfo( addr, addrLen, host, sizeof(host), port, sizeof(port), NI_NUMERICHOST | NI_NUMERICSERV ); + if ( ret == 0 ) { + if ( addr->sa_family == AF_INET ) { + outlen = snprintf( output, len, "%s:%s", host, port ); + } else { + outlen = snprintf( output, len, "[%s]:%s", host, port ); + } + } + if ( outlen <= 0 ) return 0; + return MIN( (size_t)outlen, len-1 ); +} + +bool sock_listen(poll_list_t* list, char* bind_addr, uint16_t port) +{ + if ( list->count >= MAXLISTEN ) return false; + struct addrinfo hints, *res = NULL, *ptr; + char portStr[6]; + const int on = 1; + int openCount = 0; + // Set hints for local addresses. 
+ memset( &hints, 0, sizeof(hints) ); + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + snprintf( portStr, sizeof portStr, "%d", (int)port ); + if ( getaddrinfo( bind_addr, portStr, &hints, &res ) != 0 || res == NULL ) return false; + // Attempt to bind to all of the addresses as long as there's room in the poll list + for( ptr = res; ptr != NULL; ptr = ptr->ai_next ) { + char bla[100]; + if ( !sock_printable( (struct sockaddr*)ptr->ai_addr, ptr->ai_addrlen, bla, 100 ) ) snprintf( bla, 100, "[invalid]" ); + logadd( LOG_DEBUG1, "Binding to %s...", bla ); + int sock = socket( ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol ); + if ( sock < 0 ) { + logadd( LOG_WARNING, "(Bind to %s): cannot socket(), errno=%d", bla, errno ); + continue; + } + setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on) ); + if ( ptr->ai_family == PF_INET6 ) setsockopt( sock, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on) ); + if ( bind( sock, ptr->ai_addr, ptr->ai_addrlen ) == -1 ) { + logadd( LOG_WARNING, "(Bind to %s): cannot bind(), errno=%d", bla, errno ); + close( sock ); + continue; + } + if ( listen( sock, 20 ) == -1 ) { + logadd( LOG_WARNING, "(Bind to %s): cannot listen(), errno=%d", bla, errno ); + close( sock ); + continue; + } + list->entry[list->count].fd = sock; + list->entry[list->count].events = POLLIN | POLLRDHUP; + list->count++; + openCount++; + if ( list->count >= MAXLISTEN ) break; + } + freeaddrinfo( res ); + return openCount > 0; +} + +bool sock_listenAny(poll_list_t* list, uint16_t port) +{ + return sock_listen( list, NULL, port ); +} + +int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_ms, int rw_ms) +{ + // Nonblocking connect seems to be hard to get right in a portable fashion + // that's why you might see some weird checks here and there. For now there's + // only Linux and FreeBSD, but let's try to not make this code fall on its nose + // should dnbd3 be ported to other platforms. 
+ if ( list->count < MAXLISTEN && host != NULL ) { + int sock = sock_connect( host, -1, -1 ); + if ( sock != -1 ) { + list->entry[list->count].fd = sock; + list->entry[list->count].events = POLLIN | POLLOUT | POLLRDHUP; + list->count++; + } + } + if ( list->count == 0 ) { + return -2; + } + int ret, tries = 5; + do { + ret = poll( list->entry, list->count, connect_ms ); + if ( ret > 0 ) break; + if ( ret == 0 ) return -1; + if ( ret == -1 && ( errno == EINTR || errno == EAGAIN ) ) { + if ( --tries == 0 ) return -1; + if ( connect_ms > 1 ) connect_ms /= 2; // Maybe properly account time one day + continue; + } + return -1; + } while ( true ); + for ( int i = list->count - 1; i >= 0; --i ) { + int fd = -1; + if ( list->entry[i].revents & ( POLLIN | POLLOUT ) ) { + struct sockaddr_storage tmp; + socklen_t len = sizeof(tmp); + fd = list->entry[i].fd; + if ( getpeername( fd, (struct sockaddr*)&tmp, &len ) == -1 ) { // More portable then SO_ERROR ... + close( fd ); + fd = -1; + } + } else if ( list->entry[i].revents != 0 ) { + close( list->entry[i].fd ); + } else { + continue; + } + // Either error or connect success + list->count--; + if ( i != list->count ) list->entry[i] = list->entry[list->count]; + if ( fd != -1 ) { + sock_set_block( fd ); + if ( rw_ms != -1 && rw_ms != connect_ms ) { + sock_setTimeout( fd, rw_ms ); + } + return fd; + } + } + return -1; +} + +int sock_accept(poll_list_t *list, struct sockaddr_storage *addr, socklen_t *length_ptr) +{ + int ret = poll( list->entry, list->count, -1 ); + if ( ret < 0 ) { + return -1; + } + for ( int i = list->count - 1; i >= 0; --i ) { + if ( list->entry[i].revents == 0 ) continue; + if ( list->entry[i].revents == POLLIN ) return accept( list->entry[i].fd, (struct sockaddr *)addr, length_ptr ); + if ( list->entry[i].revents & ( POLLNVAL | POLLHUP | POLLERR | POLLRDHUP ) ) { + logadd( LOG_DEBUG1, "poll fd revents=%d for index=%d and fd=%d", (int)list->entry[i].revents, i, list->entry[i].fd ); + close( list->entry[i].fd 
); + list->count--; + if ( i != list->count ) list->entry[i] = list->entry[list->count]; + } + } + return -1; +} + +void sock_set_nonblock(int sock) +{ + int flags = fcntl( sock, F_GETFL, 0 ); + if ( flags == -1 ) flags = 0; + fcntl( sock, F_SETFL, flags | O_NONBLOCK ); +} + +void sock_set_block(int sock) +{ + int flags = fcntl( sock, F_GETFL, 0 ); + if ( flags == -1 ) flags = 0; + fcntl( sock, F_SETFL, flags & ~(int)O_NONBLOCK ); +} + +bool sock_append(poll_list_t *list, const int sock, bool wantRead, bool wantWrite) +{ + if ( sock == -1 || list->count >= MAXLISTEN ) return false; + list->entry[list->count++].fd = sock; + list->entry[list->count++].events = (short)( ( wantRead ? POLLIN : 0 ) | ( wantWrite ? POLLOUT : 0 ) | POLLRDHUP ); + list->count++; + return true; +} + +ssize_t sock_sendAll(const int sock, const void *buffer, const size_t len, int maxtries) +{ + size_t done = 0; + ssize_t ret = 0; + while ( done < len ) { + if ( maxtries >= 0 && --maxtries == -1 ) break; + ret = send( sock, (const uint8_t*)buffer + done, len - done, MSG_NOSIGNAL ); + if ( ret == -1 ) { + if ( errno == EINTR ) continue; + if ( errno == EAGAIN || errno == EWOULDBLOCK ) { + continue; + } + break; + } + if ( ret == 0 ) break; + done += ret; + } + if ( done == 0 ) return ret; + return done; +} + +ssize_t sock_recv(const int sock, void *buffer, const size_t len) +{ + size_t done = 0; + ssize_t ret = 0; + int intrs = 0; + while ( done < len ) { + ret = recv( sock, (char*)buffer + done, len - done, MSG_NOSIGNAL ); + if ( ret == -1 ) { + if ( errno == EINTR && ++intrs < 10 ) continue; + break; + } + if ( ret == 0 ) break; + done += ret; + } + if ( done == 0 ) return ret; + return done; +} + diff --git a/src/shared/sockhelper.h b/src/shared/sockhelper.h new file mode 100644 index 0000000..8d70789 --- /dev/null +++ b/src/shared/sockhelper.h @@ -0,0 +1,120 @@ +#ifndef SOCKHELPER_H_ +#define SOCKHELPER_H_ + +/* + * Helper functions for dealing with sockets. 
These functions should + * abstract from the IP version by using getaddrinfo() and thelike. + */ + +#include "../types.h" +#include +#include +#include + +typedef struct _poll_list poll_list_t; + +/** + * Connect to given dnbd3_host_t. + * @param addr - address of host to connect to + * @param connect_ms - timeout in milliseconds after which the connection attempt fails + * @param rw_ms - read/write timeout in milliseconds to apply on successful connect + * @return socket file descriptor, or -1 on error + */ +int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms); + +/** + * Resolve/parse given address and put the result(s) into passed dnbd3_host_t array, + * but only up to count entries. + * @return Number of items added to array + */ +int sock_resolveToDnbd3Host(const char * const address, dnbd3_host_t * const dest, const int count); + +bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host); + +void sock_setTimeout(const int sockfd, const int milliseconds); + +size_t sock_printHost(const dnbd3_host_t * const host, char *output, const size_t len); + +size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLen, char *output, const size_t len); + +/** + * Create new poll list. + */ +poll_list_t* sock_newPollList(); + +/** + * Delete a poll list, closing all sockets first if necessary. + */ +void sock_destroyPollList(poll_list_t *list); + +/** + * Listen on all interfaces/available IP addresses, using the given protocol. + * IPv4 and IPv6 are supported. + * @param protocol_family PF_INET or PF_INET6 + * @param port port to listen on + * @return true if any listen call was successful + */ +bool sock_listenAny(poll_list_t* list, uint16_t port); + +/** + * Listen on a specific address and port. 
 + * @param bind_addr human readable address to bind to for listening + * @param port to listen on + */ +bool sock_listen(poll_list_t* list, char* bind_addr, uint16_t port); + +/** + * Asynchronously connect to multiple hosts. + * This can be called multiple times with varying timeouts. Calling it + * the first time on an empty list is identical to sock_connect(). On + * consecutive calls, more nonblocking sockets in connecting state will + * be added to the list, and on each of these calls, all the pending + * sockets will be checked for successful connection (or error), respecting + * the passed timeout. + * host can be NULL to just wait on the sockets already in the list. + * If at least one socket completed the connection + * within the given timeout, it will be removed from the list and + * returned. On error or timeout, -1 is returned. If there are no more sockets + * in the list, -2 is returned. + */ +int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_ms, int rw_ms); + +/** + * This is a multi-socket version of accept. Pass in a poll list of listening sockets. + * If any of the sockets has an incoming connection, accept it and return the new socket's fd. + * On error, return -1, just like accept(). + * @param list poll list of listening sockets (presumably filled via sock_listen/sock_listenAny - confirm) + * @param addr, length_ptr presumably filled in like the address arguments of accept() - confirm + * @return fd of new client socket, -1 on error + */ +int sock_accept(poll_list_t *list, struct sockaddr_storage *addr, socklen_t *length_ptr); + +void sock_set_nonblock(int sock); + +void sock_set_block(int sock); + +/** + * Add given socket to the poll list. The entry is always appended after the + * current last one; free slots are not searched for or reused. 
 + * + * @param list the poll list to add the socket to + * @param sock socket fd to add + * @param wantRead whether to set the POLLIN flag + * @param wantWrite whether to set the POLLOUT flag + * @return true on success, false iff the list is already full or sock is -1 + */ +bool sock_append(poll_list_t *list, const int sock, bool wantRead, bool wantWrite); + +/** + * Send the whole buffer, calling send() multiple times if necessary. + * Give up after calling send() maxtries times. + * Set maxtries < 0 to try infinitely. + */ +ssize_t sock_sendAll(const int sock, const void *buffer, const size_t len, int maxtries); + +/** + * Receive into given buffer, repeatedly calling recv() on partial receive or EINTR. + */ +ssize_t sock_recv(const int sock, void *buffer, const size_t len); + +#endif /* SOCKHELPER_H_ */ diff --git a/src/shared/timing.c b/src/shared/timing.c new file mode 100644 index 0000000..4ca1002 --- /dev/null +++ b/src/shared/timing.c @@ -0,0 +1,21 @@ +#include "timing.h" +#include +#include +#include +#include + +struct timespec basetime; + +void timing_abort() +{ + printf( "Cannot get CLOCK_MONOTONIC(_RAW), errno=%d\n", errno ); + exit( 1 ); +} + +void timing_setBase() +{ + if ( clock_gettime( BEST_CLOCK_SOURCE, &basetime ) == -1 ) { + memset( &basetime, 0, sizeof(basetime) ); /* clock unavailable: fall back to a zero base */ + } +} + diff --git a/src/shared/timing.h b/src/shared/timing.h new file mode 100644 index 0000000..f3d8802 --- /dev/null +++ b/src/shared/timing.h @@ -0,0 +1,162 @@ +#ifndef _D_TIMING_H +#define _D_TIMING_H + +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 199309L +#endif + +#include +#include +#include + +#ifdef CLOCK_MONOTONIC_RAW +#define BEST_CLOCK_SOURCE CLOCK_MONOTONIC_RAW +#else +#define BEST_CLOCK_SOURCE CLOCK_MONOTONIC +#endif + +typedef struct timespec ticks; + +extern struct timespec basetime; + +/** + * Assign src to dst while adding secs seconds. 
+ */ +#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + secs; (dst)->tv_nsec = (src)->tv_nsec; } while (0) + +/** + * Define variable now, initialize to timing_get. + */ +#define declare_now ticks now; timing_get( &now ) + +/** + * Call this once to calibrate on startup. + * Although overflows of CLOCK_MONOTONIC(_RAW) should + * by definition never happen, we still have a fixed size + * int that could at some point. By forcing the counter + * to start at 0 on startup the point of overflow + * will be very far in the future (decades for 32bit time_t, + * end of universe for 64bit). + */ +void timing_setBase(); + +/** + * Internal, do not use. Moved to another function + * to prevent inlining of error handling code, which + * should be very unlikely to ever trigger. + */ +_Noreturn void timing_abort(); + +/** + * Get current time. Shortcut for clock_gettime with error check. + */ +static inline void timing_get(ticks* retval) +{ + if ( clock_gettime( BEST_CLOCK_SOURCE, retval ) == -1 ) timing_abort(); + retval->tv_sec -= basetime.tv_sec; +} + +/** + * Get a ticks instance somewhere in the future. + * Useful for timeouts. + */ +static inline void timing_gets(ticks* retval, int32_t addSeconds) +{ + timing_get( retval ); + retval->tv_sec += addSeconds; +} + +static inline void timing_addSeconds(ticks* retval, ticks* base, int32_t addSeconds) +{ + retval->tv_sec = base->tv_sec + addSeconds; + retval->tv_nsec = base->tv_nsec; +} + +/** + * Check whether given timeout is reached. + * Might trigger up to one second early. + */ +static inline bool timing_reached(const ticks* timeout, const ticks* now) +{ + return now->tv_sec >= timeout->tv_sec; +} +#define timing_1le2(one,two) timing_reached(one,two) + +/** + * Precise check whether given timeout has been reached. 
+ */ +static inline bool timing_reachedPrecise(const ticks* timeout, const ticks* now) +{ + return now->tv_sec > timeout->tv_sec + || (now->tv_sec == timeout->tv_sec && now->tv_nsec > timeout->tv_nsec); +} + +/** + * Shortcut for above. Useful if not used in loop. + * Might trigger up to one second early. + */ +static inline bool timing_isReached(const ticks* timeout) +{ + ticks now; + timing_get( &now ); + return timing_reached( timeout, &now ); +} +/** + * Shortcut for above. Useful if not used in loop. + */ +static inline bool timing_isReachedPrecise(const ticks* timeout) +{ + ticks now; + timing_get( &now ); + return timing_reachedPrecise( timeout, &now ); +} + + +/** + * Get difference between two ticks, rounded down to seconds. + * Make sure you pass the arguments in the proper order. If + * end is before start, 0 will always be returned. + */ +static inline uint32_t timing_diff(const ticks *start, const ticks *end) +{ + if ( end->tv_sec <= start->tv_sec ) return 0; + return (uint32_t)( ( end->tv_sec - start->tv_sec ) + + ( start->tv_nsec > end->tv_nsec ? -1 : 0 ) ); +} + +/** + * Get difference between two ticks, rounded down to milliseconds. + * Same as above; passing arguments in reverse will always return 0. + */ +static inline uint64_t timing_diffMs(const ticks *start, const ticks *end) +{ + if ( end->tv_sec < start->tv_sec ) return 0; + uint64_t diff = (uint64_t)( end->tv_sec - start->tv_sec ) * 1000; + if ( start->tv_nsec >= end->tv_nsec ) { + if ( diff == 0 ) return 0; + diff -= (start->tv_nsec - end->tv_nsec) / 1000000; + } else { + diff += (end->tv_nsec - start->tv_nsec) / 1000000; + } + return diff; +} + +/** + * Get difference between two ticks, rounded down to microseconds. + * Same as above; passing arguments in reverse will always return 0. 
+ */ +static inline uint64_t timing_diffUs(const ticks *start, const ticks *end) +{ + if ( end->tv_sec < start->tv_sec ) return 0; + uint64_t diff = (uint64_t)( end->tv_sec - start->tv_sec ) * 1000000; + if ( start->tv_nsec >= end->tv_nsec ) { + if ( diff == 0 ) return 0; + diff -= ( start->tv_nsec - end->tv_nsec ) / 1000; + } else { + diff += ( end->tv_nsec - start->tv_nsec ) / 1000; + } + return diff; +} + + +#endif diff --git a/src/types.h b/src/types.h new file mode 100644 index 0000000..ec37d9b --- /dev/null +++ b/src/types.h @@ -0,0 +1,196 @@ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef TYPES_H_ +#define TYPES_H_ + +#include "config.h" +#ifndef KERNEL_MODULE +#include +#include +#endif + +#ifndef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) +#endif + +#ifdef __GNUC__ +#define UNUSED __attribute__ ((unused)) +#else +#error "Please add define for your compiler for UNUSED, or define to nothing for your compiler if not supported" +#endif + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + +#ifdef __linux__ +#define HAVE_THREAD_NAMES +#endif + +#ifdef __FreeBSD__ +#ifndef MSG_MORE +#define MSG_MORE 0 +#endif +#ifndef POLLRDHUP +#define POLLRDHUP 0x2000 +#endif +#include +#endif + +#ifdef AFL_MODE +#define send(a,b,c,d) write(a,b,c) +#define recv(a,b,c,d) read(a,b,c) +#endif + + +// ioctl request codes (all built with type 0xab; DNBD3_MAGIC is defined here but not used by them) +#define DNBD3_MAGIC 'd' +#define IOCTL_OPEN _IO(0xab, 1) +#define IOCTL_CLOSE _IO(0xab, 2) +#define IOCTL_SWITCH _IO(0xab, 3) +#define IOCTL_ADD_SRV _IO(0xab, 4) +#define IOCTL_REM_SRV _IO(0xab, 5) + +#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +static const uint16_t dnbd3_packet_magic = (0x73 << 8) | (0x72); +// Flip bytes around on big endian when putting stuff on the net (net_order_* are plain byte swaps here) +#define net_order_64(a) ((uint64_t)((((a) & 0xFFull) << 56) | (((a) & 0xFF00ull) << 40) | (((a) & 0xFF0000ull) << 24) | (((a) & 0xFF000000ull) << 8) | (((a) & 0xFF00000000ull) >> 8) | (((a) & 0xFF0000000000ull) >> 24) | (((a) & 0xFF000000000000ull) >> 40) | (((a) & 0xFF00000000000000ull) >> 56))) +#define net_order_32(a) ((uint32_t)((((a) & (uint32_t)0xFF) << 24) | (((a) & (uint32_t)0xFF00) << 8) | (((a) & (uint32_t)0xFF0000) >> 8) | (((a) & (uint32_t)0xFF000000) >> 24))) +#define net_order_16(a) ((uint16_t)((((a) & (uint16_t)0xFF) << 8) | (((a) & (uint16_t)0xFF00) >> 8))) +#define fixup_request(a) do { \ + (a).cmd = net_order_16((a).cmd); \ + (a).size = net_order_32((a).size); \ + (a).offset = net_order_64((a).offset); \ +} 
while (0) +#define fixup_reply(a) do { \ + (a).cmd = net_order_16((a).cmd); \ + (a).size = net_order_32((a).size); \ +} while (0) +#define ENDIAN_MODE "Big Endian" +#ifndef BIG_ENDIAN +#define BIG_ENDIAN +#endif +#elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64) +static const uint16_t dnbd3_packet_magic = (0x73) | (0x72 << 8); +// Little endian is our network byte order (probably 99.999% of machines this will be used on are LE), so the conversions below are no-ops here +#define net_order_64(a) (a) +#define net_order_32(a) (a) +#define net_order_16(a) (a) +#define fixup_request(a) while(0) +#define fixup_reply(a) while(0) +#define ENDIAN_MODE "Little Endian" +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN +#endif +#else +#error "Unknown Endianness" +#endif + +typedef uint8_t dnbd3_af; + +static const dnbd3_af HOST_NONE = (dnbd3_af)0; +static const dnbd3_af HOST_IP4 = (dnbd3_af)2; +static const dnbd3_af HOST_IP6 = (dnbd3_af)10; + +#pragma pack(1) +typedef struct dnbd3_host_t +{ + uint8_t addr[16]; // 16byte (network representation, so it can be directly passed to socket functions) + uint16_t port; // 2byte (network representation, so it can be directly passed to socket functions) + dnbd3_af type; // 1byte (ip version. HOST_IP4 or HOST_IP6. 
0 means this struct is empty and should be ignored) +} dnbd3_host_t; +#pragma pack(0) /* NOTE(review): presumably meant to restore default packing; "#pragma pack()" is the commonly documented reset form - confirm */ + +#pragma pack(1) +typedef struct +{ + uint16_t len; + dnbd3_host_t host; + uint16_t imgnamelen; + char *imgname; + int rid; + int read_ahead_kb; + uint8_t use_server_provided_alts; +} dnbd3_ioctl_t; +#pragma pack(0) + +// network: command codes (presumably the values of the cmd field in dnbd3_request_t / dnbd3_reply_t - confirm) +#define CMD_GET_BLOCK 1 +#define CMD_SELECT_IMAGE 2 +#define CMD_GET_SERVERS 3 +#define CMD_ERROR 4 +#define CMD_KEEPALIVE 5 +#define CMD_LATEST_RID 6 +#define CMD_SET_CLIENT_MODE 7 +#define CMD_GET_CRC32 8 + +#define DNBD3_REQUEST_SIZE 24 +#pragma pack(1) +typedef struct +{ + uint16_t magic; // 2byte + uint16_t cmd; // 2byte + uint32_t size; // 4byte + union { + struct { +#ifdef LITTLE_ENDIAN /* NOTE(review): system headers such as <endian.h> may define both LITTLE_ENDIAN and BIG_ENDIAN; this branch would then be taken regardless of host order - confirm */ + uint64_t offset_small:56; // 7byte + uint8_t hops; // 1byte +#elif defined(BIG_ENDIAN) + uint8_t hops; // 1byte + uint64_t offset_small:56; // 7byte +#endif + }; + uint64_t offset; // 8byte (overlays offset_small and hops) + }; + uint64_t handle; // 8byte +} dnbd3_request_t; +#pragma pack(0) +_Static_assert( sizeof(dnbd3_request_t) == DNBD3_REQUEST_SIZE, "dnbd3_request_t is messed up" ); + +#define DNBD3_REPLY_SIZE 16 +#pragma pack(1) +typedef struct +{ + uint16_t magic; // 2byte + uint16_t cmd; // 2byte + uint32_t size; // 4byte + uint64_t handle; // 8byte +} dnbd3_reply_t; +#pragma pack(0) +_Static_assert( sizeof(dnbd3_reply_t) == DNBD3_REPLY_SIZE, "dnbd3_reply_t is messed up" ); + +#pragma pack(1) +typedef struct +{ + dnbd3_host_t host; + uint8_t failures; // 1byte (number of times server has been consecutively unreachable) +} dnbd3_server_entry_t; +#pragma pack(0) + +#endif /* TYPES_H_ */ diff --git a/src/version.c.in b/src/version.c.in new file mode 100644 index 0000000..54854c9 --- /dev/null +++ b/src/version.c.in @@ -0,0 +1,4 @@ +#include "version.h" + +const char * VERSION_STRING = "@VERSION@"; + diff --git a/src/version.h b/src/version.h new file mode 100644 index 0000000..0c4a66b --- /dev/null +++ b/src/version.h @@ -0,0 +1,30 @@ +/* + * This file is part of the Distributed Network 
Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha + * + * This file may be licensed under the terms of of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef VERSION_H_ +#define VERSION_H_ + +extern const char *VERSION_STRING; + +// This is done in a little weird way but otherwise eclipse complains about +// unresolvable symbols etc... NOTE(review): version.c presumably defines VERSION_STRING, so every file including this header gets its own definition - confirm only one translation unit does +#include "version.c" + +#endif /* VERSION_H_ */ -- cgit v1.2.3-55-g7522