summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/CMakeLists.txt28
-rw-r--r--src/bench/CMakeLists.txt22
-rw-r--r--src/bench/connection.c12
-rw-r--r--src/bench/connection.h2
-rw-r--r--src/bench/helper.h2
-rw-r--r--src/bench/main.c8
-rw-r--r--src/bench/serialize.c5
-rw-r--r--src/client/CMakeLists.txt18
-rw-r--r--src/client/client.c284
-rw-r--r--src/clientconfig.h36
-rw-r--r--src/config.h43
-rw-r--r--src/cowtest/CMakeLists.txt27
-rw-r--r--src/cowtest/main.c1305
-rw-r--r--src/cowtest/readme.md110
-rw-r--r--src/fuse/CMakeLists.txt35
-rw-r--r--src/fuse/connection.c585
-rw-r--r--src/fuse/connection.h33
-rw-r--r--src/fuse/cowDoc/img/datastructure.jpgbin0 -> 397688 bytes
-rw-r--r--src/fuse/cowDoc/img/readrequest.svg4
-rw-r--r--src/fuse/cowDoc/readme.md367
-rw-r--r--src/fuse/cowfile.c1777
-rw-r--r--src/fuse/cowfile.h146
-rw-r--r--src/fuse/helper.c6
-rw-r--r--src/fuse/helper.h14
-rw-r--r--src/fuse/main.c604
-rw-r--r--src/fuse/main.h12
-rw-r--r--src/fuse/serialize.c5
-rw-r--r--src/kernel/.clang-format552
-rw-r--r--src/kernel/CMakeLists.txt66
-rw-r--r--src/kernel/Kbuild5
-rw-r--r--src/kernel/blk.c740
-rw-r--r--src/kernel/blk.h18
-rw-r--r--src/kernel/core.c81
-rw-r--r--src/kernel/dnbd3.h84
-rw-r--r--src/kernel/dnbd3_main.c250
-rw-r--r--src/kernel/dnbd3_main.h148
-rw-r--r--src/kernel/net.c1929
-rw-r--r--src/kernel/net.h29
l---------src/kernel/serialize.c1
-rw-r--r--src/kernel/serialize_kmod.c5
-rw-r--r--src/kernel/sysfs.c177
-rw-r--r--src/kernel/sysfs.h20
-rw-r--r--src/kernel/utils.c41
-rw-r--r--src/kernel/utils.h29
-rw-r--r--src/serialize.h40
-rw-r--r--src/server/CMakeLists.txt112
-rw-r--r--src/server/altservers.c79
-rw-r--r--src/server/altservers.h2
-rw-r--r--src/server/fileutil.c2
-rw-r--r--src/server/fuse.c661
-rw-r--r--src/server/fuse.h10
-rw-r--r--src/server/globals.c72
-rw-r--r--src/server/globals.h93
-rw-r--r--src/server/helper.h4
-rw-r--r--src/server/image.c685
-rw-r--r--src/server/image.h48
-rw-r--r--src/server/ini.c2
-rw-r--r--src/server/integrity.c20
-rw-r--r--src/server/locks.c4
-rw-r--r--src/server/locks.h20
-rw-r--r--src/server/net.c174
-rw-r--r--src/server/net.h4
-rw-r--r--src/server/picohttpparser/CMakeLists.txt11
-rw-r--r--src/server/reference.h5
-rw-r--r--src/server/rpc.c29
-rw-r--r--src/server/serialize.c5
-rw-r--r--src/server/server.c84
-rw-r--r--src/server/server.h4
-rw-r--r--src/server/threadpool.c19
-rw-r--r--src/server/threadpool.h5
-rw-r--r--src/server/uplink.c1127
-rw-r--r--src/server/uplink.h8
-rw-r--r--src/serverconfig.h58
-rw-r--r--src/shared/CMakeLists.txt28
-rw-r--r--src/shared/crc32.c238
-rw-r--r--src/shared/crc32.h9
-rw-r--r--src/shared/fdsignal.c2
-rw-r--r--src/shared/fdsignal.h57
-rw-r--r--src/shared/log.c36
-rw-r--r--src/shared/log.h65
-rw-r--r--src/shared/protocol.h159
-rw-r--r--src/shared/serialize.c (renamed from src/serialize.c)43
-rw-r--r--src/shared/sockhelper.c36
-rw-r--r--src/shared/sockhelper.h120
-rw-r--r--src/shared/timing.c2
-rw-r--r--src/shared/timing.h162
-rw-r--r--src/types.h186
-rw-r--r--src/version.c.in3
-rw-r--r--src/version.h30
89 files changed, 10117 insertions, 4111 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..bea33ed
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.10)
+
+# umbrella project for all components located below src/
+project(dnbd3-src LANGUAGES C)
+
+# benchmark tool (optional)
+if(DNBD3_BENCHMARK)
+	add_subdirectory(bench)
+endif()
+
+# kernel module (optional); the user space client is enabled by the same switch
+if(DNBD3_KERNEL_MODULE)
+	add_subdirectory(client)
+	add_subdirectory(kernel)
+endif()
+
+# FUSE client (optional)
+if(DNBD3_CLIENT_FUSE)
+	add_subdirectory(fuse)
+endif()
+
+# test program for the FUSE client's cow feature (optional)
+if(DNBD3_CLIENT_FUSE_COW_TEST)
+	add_subdirectory(cowtest)
+endif()
+
+# server (optional)
+if(DNBD3_SERVER)
+	add_subdirectory(server)
+endif()
+
+# shared code is added unconditionally
+add_subdirectory(shared)
diff --git a/src/bench/CMakeLists.txt b/src/bench/CMakeLists.txt
new file mode 100644
index 0000000..24542a7
--- /dev/null
+++ b/src/bench/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.10)
+
+# project for the dnbd3-bench executable
+project(dnbd3-bench LANGUAGES C)
+
+# _GNU_SOURCE enables the enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+# source and header files of the benchmark tool
+set(DNBD3_BENCH_SOURCE_FILES
+	${CMAKE_CURRENT_SOURCE_DIR}/connection.c
+	${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+	${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+set(DNBD3_BENCH_HEADER_FILES
+	${CMAKE_CURRENT_SOURCE_DIR}/connection.h
+	${CMAKE_CURRENT_SOURCE_DIR}/helper.h)
+
+# build and link the executable, install it as part of the 'bench' component
+add_executable(dnbd3-bench ${DNBD3_BENCH_SOURCE_FILES})
+target_link_libraries(dnbd3-bench
+	dnbd3-version
+	dnbd3-shared
+	${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS dnbd3-bench
+	RUNTIME DESTINATION bin
+	COMPONENT bench)
+
+# lint targets covering the sources and headers listed above
+add_linter(dnbd3-bench-lint "${DNBD3_BENCH_SOURCE_FILES}" "${DNBD3_BENCH_HEADER_FILES}")
+add_linter_fix(dnbd3-bench-lint-fix "${DNBD3_BENCH_SOURCE_FILES}" "${DNBD3_BENCH_HEADER_FILES}")
diff --git a/src/bench/connection.c b/src/bench/connection.c
index 26be440..974bc8a 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -1,10 +1,10 @@
#include "connection.h"
#include "helper.h"
-#include "../config.h"
-#include "../shared/protocol.h"
-#include "../shared/fdsignal.h"
-#include "../shared/sockhelper.h"
-#include "../shared/log.h"
+#include <dnbd3/config.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
#include <stdlib.h>
#include <pthread.h>
@@ -97,7 +97,7 @@ bool connection_init_n_times(
counters->fails++;
logadd( LOG_ERROR, "rid mismatch" );
//} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
- } else if ( !dnbd3_get_block( sock, (((uint64_t)rand()) << 16 + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
+ } else if ( !dnbd3_get_block( sock, (((uint64_t)rand() << 16) + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
counters->fails++;
logadd( LOG_ERROR, "send: get block failed" );
} else if ( !dnbd3_get_reply( sock, &reply ) ) {
diff --git a/src/bench/connection.h b/src/bench/connection.h
index 770bf0d..422c93e 100644
--- a/src/bench/connection.h
+++ b/src/bench/connection.h
@@ -1,7 +1,7 @@
#ifndef _CONNECTION_H_
#define _CONNECTION_H_
-#include "../shared/fdsignal.h"
+#include <dnbd3/shared/fdsignal.h>
#include <stdbool.h>
#include <stdint.h>
#include "helper.h"
diff --git a/src/bench/helper.h b/src/bench/helper.h
index e0c0262..53f32bf 100644
--- a/src/bench/helper.h
+++ b/src/bench/helper.h
@@ -1,7 +1,7 @@
#ifndef IMAGEHELPER_H
#define IMAGEHELPER_H
-#include "../types.h"
+#include <dnbd3/types.h>
#include <netdb.h>
#include <stdbool.h>
diff --git a/src/bench/main.c b/src/bench/main.c
index f8c55c3..37e2821 100644
--- a/src/bench/main.c
+++ b/src/bench/main.c
@@ -4,8 +4,9 @@
#include "connection.h"
#include "helper.h"
-#include "../shared/protocol.h"
-#include "../shared/log.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/version.h>
#include <stdio.h>
#include <stdlib.h>
@@ -19,6 +20,7 @@
static void printUsage(char *argv0, int exitCode)
{
+ printf( "Version: %s\n", DNBD3_VERSION_LONG );
printf( "Usage: %s [--debug] --host <serverAddress(es)> --image <imageName> [--rid revision]\n", argv0 );
printf( "Or: %s [-d] -h <serverAddress(es)> -i <imageName> [-r revision]\n", argv0 );
printf( " -h --host List of space separated hosts to use\n" );
@@ -74,6 +76,8 @@ int main(int argc, char *argv[])
int n_threads = 1;
int bs = 4096;
+ log_init();
+
if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
printUsage( argv[0], 0 );
}
diff --git a/src/bench/serialize.c b/src/bench/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/bench/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
new file mode 100644
index 0000000..41f182e
--- /dev/null
+++ b/src/client/CMakeLists.txt
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.10)
+
+# project for the dnbd3-client executable
+project(dnbd3-client LANGUAGES C)
+
+# _GNU_SOURCE enables the enhanced BSD netdb features
+add_definitions(-D_GNU_SOURCE)
+
+# the client consists of a single source file
+set(DNBD3_CLIENT_SOURCE_FILES
+	${CMAKE_CURRENT_SOURCE_DIR}/client.c)
+
+# build and link the executable, install it as part of the 'kernel' component
+add_executable(dnbd3-client ${DNBD3_CLIENT_SOURCE_FILES})
+target_link_libraries(dnbd3-client
+	dnbd3-version
+	dnbd3-build
+	dnbd3-shared)
+install(TARGETS dnbd3-client
+	RUNTIME DESTINATION bin
+	COMPONENT kernel)
+
+# lint targets covering the source file listed above
+add_linter(dnbd3-client-lint "${DNBD3_CLIENT_SOURCE_FILES}")
+add_linter_fix(dnbd3-client-lint-fix "${DNBD3_CLIENT_SOURCE_FILES}")
diff --git a/src/client/client.c b/src/client/client.c
index 37f0558..0cf222e 100644
--- a/src/client/client.c
+++ b/src/client/client.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,9 +18,10 @@
*
*/
-#include "../clientconfig.h"
-#include "../types.h"
-#include "../version.h"
+#include <dnbd3/config/client.h>
+#include <dnbd3/types.h>
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
#include <stdio.h>
#include <stdlib.h>
@@ -33,19 +34,19 @@
#include <arpa/inet.h>
#include <string.h>
#include <sys/stat.h>
+#include <sys/socket.h>
#include <sys/un.h>
#include <errno.h>
-#define SOCK_PATH "/var/run/dnbd3.socket"
+#define SOCK_PATH "/run/dnbd3.socket"
#define SOCK_BUFFER 1000
#define DEV_LEN 15
#define MAX_DEVS 50
-
+#define TMP_STR_LEN 100
static int openDevices[MAX_DEVS];
-static const char *optString = "f:h:i:r:d:a:cs:HV?k";
+static const char *optString = "h:i:r:d:a:cs:SA:R:HV?k";
static const struct option longOpts[] = {
- { "file", required_argument, NULL, 'f' },
{ "host", required_argument, NULL, 'h' },
{ "image", required_argument, NULL, 'i' },
{ "rid", required_argument, NULL, 'r' },
@@ -53,8 +54,9 @@ static const struct option longOpts[] = {
{ "ahead", required_argument, NULL, 'a' },
{ "close", no_argument, NULL, 'c' },
{ "switch", required_argument, NULL, 's' },
- { "add", required_argument, NULL, 'adds' },
- { "remove", required_argument, NULL, 'rems' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "add", required_argument, NULL, 'A' },
+ { "remove", required_argument, NULL, 'R' },
{ "help", no_argument, NULL, 'H' },
{ "version", no_argument, NULL, 'V' },
{ "daemon", no_argument, NULL, 'D' },
@@ -66,9 +68,9 @@ static const struct option longOpts[] = {
static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg);
static void dnbd3_client_daemon();
-static void dnbd3_daemon_action(int client, int argc, char **argv);
+static void dnbd3_daemon_action(int client, int uid, int argc, char **argv);
static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *actionName, char *host);
-static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead);
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead, const bool doLearnNewServers);
static int dnbd3_daemon_send(int argc, char **argv);
static void dnbd3_print_help(char *argv_0);
static void dnbd3_print_version();
@@ -84,11 +86,11 @@ static char host_to_string(const dnbd3_host_t *host, char *target, size_t target
if ( targetlen < 10 ) return false;
if ( host->type == HOST_IP6 ) {
*target++ = '[';
- inet_ntop( AF_INET6, host->addr, target, targetlen - 10 );
+ inet_ntop( AF_INET6, host->addr, target, (socklen_t)targetlen - 10 );
target += strlen( target );
*target++ = ']';
} else if ( host->type == HOST_IP4 ) {
- inet_ntop( AF_INET, host->addr, target, targetlen - 8 );
+ inet_ntop( AF_INET, host->addr, target, (socklen_t)targetlen - 8 );
target += strlen( target );
} else {
snprintf( target, targetlen, "<?addrtype=%d>", (int)host->type );
@@ -135,8 +137,9 @@ static char parse_address(char *string, dnbd3_host_t *host)
// Scan for port
char *portpos = NULL, *ptr = string;
while ( *ptr ) {
- if ( *ptr == ':' )
- portpos = ptr;
+ if ( *ptr == ':' ) {
+ portpos = ptr;
+ }
++ptr;
}
if ( portpos == NULL ) return 0; // No port in string
@@ -192,33 +195,77 @@ static int dnbd3_get_ip(char *hostname, dnbd3_host_t *host)
return true;
}
+/**
+ * Parses a space separated list of hosts from a command line string, resolves
+ * every entry via dnbd3_get_ip() and stores the results in hosts.
+ *
+ * Leading, trailing and repeated spaces are ignored. At most hosts_len entries
+ * are resolved; any further entries in the string are ignored.
+ *
+ * @param hosts_str  space separated list of hosts (is not modified)
+ * @param hosts      destination array for the resolved hosts
+ * @param hosts_len  number of elements available in hosts
+ * @return number of entries stored in hosts, or false (0) if the list contains
+ *         no host, an entry does not fit into the temporary buffer, or an
+ *         entry cannot be resolved (hosts may be partially filled then)
+ */
+static int dnbd3_get_resolved_hosts(char *hosts_str, dnbd3_host_t *hosts, const size_t hosts_len)
+{
+	const char *cursor = hosts_str;
+	size_t hosts_index = 0;
+	char host_str[TMP_STR_LEN];
+
+	if ( hosts_str == NULL || hosts == NULL )
+		return false;
+
+	/* checking the capacity first guarantees that hosts[] is never written out of bounds */
+	while ( hosts_index < hosts_len ) {
+		/* skip separators in front of the next host */
+		cursor += strspn( cursor, " " );
+		if ( *cursor == '\0' )
+			break; /* end of list, possibly after trailing spaces */
+
+		/* buffer substring of host to get ip from it */
+		const size_t host_str_len = strcspn( cursor, " " );
+		if ( host_str_len >= sizeof(host_str) ) {
+			/* resolving a silently truncated name would yield the wrong host */
+			fprintf( stderr, "Host name too long (max. %d characters)\n", TMP_STR_LEN - 1 );
+			return false;
+		}
+		memcpy( host_str, cursor, host_str_len );
+		host_str[host_str_len] = '\0';
+
+		if ( !dnbd3_get_ip( host_str, &hosts[hosts_index] ) )
+			return false;
+
+		hosts_index++;
+
+		/* continue processing of hosts behind the current one */
+		cursor += host_str_len;
+	}
+
+	return (int)hosts_index;
+}
+
+
int main(int argc, char *argv[])
{
char *dev = NULL;
char host[50];
int action = -1;
+ bool learnNewServers = true;
+ int active_device_num = 0;
- dnbd3_ioctl_t msg;
- memset( &msg, 0, sizeof(dnbd3_ioctl_t) );
- msg.len = (uint16_t)sizeof(dnbd3_ioctl_t);
+ dnbd3_ioctl_t msg = { .len = (uint16_t)sizeof(msg) };
+ msg.hosts_num = 0;
msg.read_ahead_kb = DEFAULT_READ_AHEAD_KB;
- msg.host.port = htons( PORT );
- msg.host.type = 0;
msg.imgname = NULL;
- msg.use_server_provided_alts = true;
int opt = 0;
int longIndex = 0;
+ // In case the client was invoked as a suid binary, change uid back to original user
+ // and warn the user as this was legacy mode
+ if ( geteuid() == 0 && getuid() != 0 ) {
+ fprintf( stderr, "Warning! %s is a setuid binary. This is deprecated and not needed anymore.\n", argv[0] );
+ fprintf( stderr, "Switching back o user %d\n", (int)getuid() );
+ setgid( getgid() );
+ setuid( getuid() );
+ }
+
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
while ( opt != -1 ) {
switch ( opt ) {
- case 'f':
- break;
case 'h':
- if ( !dnbd3_get_ip( optarg, &msg.host ) ) exit( EXIT_FAILURE );
+ msg.hosts_num = (uint8_t)dnbd3_get_resolved_hosts( optarg, msg.hosts, MAX_HOSTS_PER_IOCTL );
+ if ( !msg.hosts_num )
+ exit( EXIT_FAILURE );
break;
case 'i':
action = IOCTL_OPEN;
@@ -238,25 +285,34 @@ int main(int argc, char *argv[])
action = IOCTL_CLOSE;
break;
case 's':
- dnbd3_get_ip( optarg, &msg.host );
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_SWITCH;
break;
- case 'adds':
- dnbd3_get_ip( optarg, &msg.host );
+ case 'S':
+ learnNewServers = false;
+ break;
+ case 'A':
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_ADD_SRV;
break;
- case 'rems':
- dnbd3_get_ip( optarg, &msg.host );
+ case 'R':
+ dnbd3_get_ip( optarg, &msg.hosts[0] );
+ msg.hosts_num = 1;
action = IOCTL_REM_SRV;
break;
case 'H':
dnbd3_print_help( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case 'V':
- dnbd3_print_version();
+ dnbd3_print_version( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case '?':
dnbd3_print_help( argv[0] );
+ exit( EXIT_SUCCESS );
break;
case 'D':
dnbd3_client_daemon();
@@ -265,6 +321,14 @@ int main(int argc, char *argv[])
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
}
+ /* abort if sticky mode is set and image will not be opened */
+ if ( !learnNewServers && action != IOCTL_OPEN ) {
+ printf( "ERROR: sticky mode can only be set if image will be opened.\n" );
+ exit( EXIT_FAILURE );
+ }
+
+ msg.use_server_provided_alts = learnNewServers;
+
// See if socket exists, if so, try to send to daemon
struct stat st;
if ( stat( SOCK_PATH, &st ) == 0 ) {
@@ -275,39 +339,37 @@ int main(int argc, char *argv[])
// Direct requests
- // In case the client was invoked as a suid binary, change uid back to original user
- // when being used for direct ioctl, so that the device's permissions are taken into account
- if ( geteuid() == 0 ) {
- setgid( getgid() );
- setuid( getuid() );
- }
-
- host_to_string( &msg.host, host, 50 );
-
// close device
- if ( action == IOCTL_CLOSE && msg.host.type == 0 && dev && (msg.imgname == NULL )) {
+ if ( action == IOCTL_CLOSE && msg.hosts_num == 0 && dev && (msg.imgname == NULL )) {
printf( "INFO: Closing device %s\n", dev );
- if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) ) exit( EXIT_SUCCESS );
+ if ( dnbd3_ioctl( dev, IOCTL_CLOSE, &msg ) == 0 ) exit( EXIT_SUCCESS );
printf( "Couldn't close device.\n" );
exit( EXIT_FAILURE );
}
// switch host
- if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.host.type != 0 && dev && (msg.imgname == NULL )) {
+ if ( (action == IOCTL_SWITCH || action == IOCTL_ADD_SRV || action == IOCTL_REM_SRV) && msg.hosts_num == 1 && dev && (msg.imgname == NULL )) {
+ host_to_string( &msg.hosts[0], host, 50 );
if ( action == IOCTL_SWITCH ) printf( "INFO: Switching device %s to %s\n", dev, host );
if ( action == IOCTL_ADD_SRV ) printf( "INFO: %s: adding %s\n", dev, host );
if ( action == IOCTL_REM_SRV ) printf( "INFO: %s: removing %s\n", dev, host );
- if ( dnbd3_ioctl( dev, action, &msg ) ) exit( EXIT_SUCCESS );
+ if ( dnbd3_ioctl( dev, action, &msg ) == 0 ) exit( EXIT_SUCCESS );
printf( "Failed! Maybe the device is not connected?\n" );
exit( EXIT_FAILURE );
}
// connect
- if ( action == IOCTL_OPEN && msg.host.type != 0 && dev && (msg.imgname != NULL )) {
- printf( "INFO: Connecting device %s to %s for image %s\n", dev, host, msg.imgname );
- if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) exit( EXIT_SUCCESS );
- printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
- exit( EXIT_FAILURE );
+ if ( action == IOCTL_OPEN && msg.hosts_num > 0 && dev && (msg.imgname != NULL )) {
+ printf( "INFO: Connecting device %s for image %s\n", dev, msg.imgname );
+ active_device_num = dnbd3_ioctl( dev, IOCTL_OPEN, &msg );
+ if ( active_device_num >= 0 ) {
+ host_to_string( &msg.hosts[active_device_num], host, 50 );
+ printf( "INFO: Device %s for image %s is connected to server %s\n", dev, msg.imgname, host);
+ exit( EXIT_SUCCESS );
+ } else {
+ printf( "ERROR: connecting device failed. Maybe it's already connected?\n" );
+ exit( EXIT_FAILURE );
+ }
}
dnbd3_print_help( argv[0] );
@@ -317,17 +379,19 @@ int main(int argc, char *argv[])
static int dnbd3_ioctl(const char *dev, const int command, dnbd3_ioctl_t * const msg)
{
const int fd = open( dev, O_WRONLY );
- if ( fd < 0 ) {
- printf( "open() for %s failed.\n", dev );
- return false;
+ if ( fd == -1 ) {
+ perror( "open() failed" );
+ return -ENODEV;
+ }
+ if ( msg != NULL && msg->imgname != NULL ) {
+ msg->imgnamelen = (uint16_t)strlen( msg->imgname );
}
- if ( msg != NULL && msg->imgname != NULL ) msg->imgnamelen = (uint16_t)strlen( msg->imgname );
const int ret = ioctl( fd, command, msg );
if ( ret < 0 ) {
- printf( "ioctl() failed.\n" );
+ perror( "ioctl() failed" );
}
close( fd );
- return ret >= 0;
+ return ret;
}
static void dnbd3_client_daemon()
@@ -338,11 +402,8 @@ static void dnbd3_client_daemon()
struct timeval tv;
int done, ret, len;
socklen_t socklen;
-
- if ( geteuid() != 0 ) {
- printf( "Only root can run the dnbd3-client in daemon mode!\n" );
- exit( 1 );
- }
+ struct ucred ucred;
+ int fdTest;
if ( (listener = socket( AF_UNIX, SOCK_STREAM, 0 )) == -1 ) {
perror( "socket" );
@@ -356,12 +417,21 @@ static void dnbd3_client_daemon()
perror( "bind" );
exit( 1 );
}
- chmod( addrLocal.sun_path, 0600 );
+ fchmod( listener, 0666 );
+ chmod( SOCK_PATH, 0666 );
if ( listen( listener, 5 ) == -1 ) {
perror( "listen" );
+ unlink( addrLocal.sun_path );
exit( 1 );
}
+ fdTest = open( "/dev/dnbd0", O_RDWR );
+ if ( fdTest == -1 ) {
+ perror( "Opening /dev/dnbd0 failed. Daemon will probably not work" );
+ } else {
+ close( fdTest );
+ }
+
memset( openDevices, -1, sizeof(openDevices) );
for (;;) {
@@ -372,6 +442,14 @@ static void dnbd3_client_daemon()
continue;
}
+ socklen = sizeof(ucred);
+ if ( getsockopt( client, SOL_SOCKET, SO_PEERCRED, &ucred, &socklen ) == -1 ) {
+ perror( "Could not get credentials of connection" );
+ close( client );
+ continue;
+ }
+ printf("Call from user %d\n", (int)ucred.uid );
+
tv.tv_sec = 1;
tv.tv_usec = 0;
setsockopt( client, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv) );
@@ -398,27 +476,28 @@ static void dnbd3_client_daemon()
}
if ( pos >= end ) break;
argv[argc++] = pos;
- printf("Arg %d: '%s'\n", argc, pos);
+ //printf("Arg %d: '%s'\n", argc, pos);
while ( *pos != '\0' ) { // This will always be in bounds because of -4 above
if ( ++pos >= end ) break;
}
}
- dnbd3_daemon_action( client, argc, argv );
+ dnbd3_daemon_action( client, (int)ucred.uid, argc, argv );
}
close( client );
}
}
-static void dnbd3_daemon_action(int client, int argc, char **argv)
+static void dnbd3_daemon_action(int client, int uid, int argc, char **argv)
{
int opt = 0;
int longIndex = 0;
char *host = NULL, *image = NULL, *device = NULL;
- int rid = 0, uid = 0, killMe = false, ahead = 512;
+ int rid = 0, killMe = false, ahead = 512;
int len;
int action = -1;
const char *actionName = NULL;
+ bool learnNewServers = true;
optind = 1;
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
@@ -439,18 +518,18 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
case 'r':
rid = atoi( optarg );
break;
- case 'U':
- uid = atoi( optarg );
- break;
case 'c':
action = IOCTL_CLOSE;
actionName = "Close";
break;
- case 'adds':
+ case 'S':
+ learnNewServers = false;
+ break;
+ case 'A':
action = IOCTL_ADD_SRV;
actionName = "Add Server";
break;
- case 'rems':
+ case 'R':
action = IOCTL_REM_SRV;
actionName = "Remove Server";
break;
@@ -465,14 +544,14 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
}
if ( killMe ) {
- if ( uid != 0 ) {
+ if ( uid != geteuid() ) {
printf( "Ignoring kill request by user %d\n", uid );
close( client );
return;
}
printf( "Received kill request; exiting.\n" );
- close( client );
unlink( SOCK_PATH );
+ close( client );
exit( 0 );
}
@@ -486,7 +565,7 @@ static void dnbd3_daemon_action(int client, int argc, char **argv)
return;
}
if ( action == IOCTL_OPEN && host != NULL && image != NULL && rid >= 0 ) {
- device = dnbd3_daemon_open( uid, host, image, rid, ahead );
+ device = dnbd3_daemon_open( uid, host, image, rid, ahead, learnNewServers);
if ( device != NULL ) {
len = strlen( device );
send( client, &len, sizeof(len), 0 );
@@ -509,11 +588,9 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
} else {
index = atoi( device );
}
- dnbd3_ioctl_t msg;
- memset( &msg, 0, sizeof(msg) );
- msg.len = (uint16_t)sizeof(msg);
+ dnbd3_ioctl_t msg = { .len = (uint16_t)sizeof(msg) };
if ( host != NULL ) {
- dnbd3_get_ip( host, &msg.host );
+ dnbd3_get_ip( host, &msg.hosts[0] );
}
if ( index < 0 || index >= MAX_DEVS ) {
printf( "%s request with invalid device id %d\n", actionName, index );
@@ -528,7 +605,7 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
printf( "%s: User %d cannot access %s owned by %d\n", actionName, uid, dev, openDevices[index] );
return false;
}
- if ( dnbd3_ioctl( dev, action, &msg ) ) {
+ if ( dnbd3_ioctl( dev, action, &msg ) == 0 ) {
printf( "%s request for device %s of user %d successful\n", actionName, dev, uid );
openDevices[index] = -1;
return true;
@@ -537,23 +614,26 @@ static int dnbd3_daemon_ioctl(int uid, char *device, int action, const char *act
return false;
}
-static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead)
+static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int readAhead, const bool doLearnNewServers)
{
int i, sameUser = 0;
struct stat st;
static char dev[DEV_LEN];
printf( "Opening a device for %s on %s\n", image, host );
// Check number of open devices
- for (i = 0; i < MAX_DEVS; ++i) {
- if ( openDevices[i] == uid ) sameUser++;
- }
- if ( sameUser > 1 ) {
- printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
- return NULL ;
+ if ( uid != 0 ) {
+ for ( i = 0; i < MAX_DEVS; ++i ) {
+ if ( openDevices[i] == uid ) sameUser++;
+ }
+ if ( sameUser > 1 ) {
+ printf( "Ignoring request by %d as there are already %d open devices for that user.\n", uid, sameUser );
+ return NULL;
+ }
}
// Find free device
- for (i = 0; i < MAX_DEVS; ++i) {
- if ( openDevices[i] != -1 ) continue;
+ for ( i = 0; i < MAX_DEVS; ++i ) {
+ if ( openDevices[i] != -1 )
+ continue;
snprintf( dev, DEV_LEN, "/dev/dnbd%d", i );
if ( stat( dev, &st ) == -1 ) {
break;
@@ -561,16 +641,16 @@ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int re
// Open
dnbd3_ioctl_t msg;
msg.len = (uint16_t)sizeof(msg);
- if ( !dnbd3_get_ip( host, &msg.host ) ) {
+ if ( !dnbd3_get_ip( host, &msg.hosts[0] ) ) {
printf( "Cannot parse host address %s\n", host );
return NULL ;
}
msg.imgname = image;
msg.imgnamelen = strlen( image );
msg.rid = rid;
- msg.use_server_provided_alts = true;
+ msg.use_server_provided_alts = doLearnNewServers;
msg.read_ahead_kb = readAhead;
- if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) ) {
+ if ( dnbd3_ioctl( dev, IOCTL_OPEN, &msg ) >= 0 ) {
openDevices[i] = uid;
printf( "Device %s now occupied by %d\n", dev, uid );
return dev;
@@ -584,7 +664,6 @@ static char* dnbd3_daemon_open(int uid, char *host, char *image, int rid, int re
static int dnbd3_daemon_send(int argc, char **argv)
{
- const int uid = getuid();
int s, i, len;
struct sockaddr_un remote;
char buffer[SOCK_BUFFER];
@@ -604,7 +683,6 @@ static int dnbd3_daemon_send(int argc, char **argv)
// (Re)build argument string into a single one, arguments separated by null chars
char *pos = buffer;
char *end = buffer + SOCK_BUFFER;
- pos += snprintf( pos, end - pos, "--user%c%d", (int)'\0', uid ) + 1;
for (i = 1; i < argc && pos < end; ++i) {
pos += snprintf( pos, end - pos, "%s", argv[i] ) + 1;
}
@@ -643,28 +721,28 @@ static int dnbd3_daemon_send(int argc, char **argv)
static void dnbd3_print_help(char *argv_0)
{
- printf( "Version: %s\n\n", VERSION_STRING );
- printf( "\nUsage: %s\n"
- "\t-h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n", argv_0 );
- printf( "Start the DNBD3 client.\n" );
- //printf("-f or --file \t\t Configuration file (default /etc/dnbd3-client.conf)\n");
- printf( "-h or --host \t\t Host running dnbd3-server.\n" );
+ printf( "Usage: %s\n", argv_0 );
+ printf( " -h <host> -i <image name> [-r <rid>] -d <device> [-a <KB>] || -c -d <device>\n\n" );
+ printf( "Start the DNBD3 client.\n\n" );
+ printf( "-h or --host \t\t List of space separated hosts to use.\n" );
printf( "-i or --image \t\t Image name of exported image.\n" );
printf( "-r or --rid \t\t Release-ID of exported image (default 0, latest).\n" );
printf( "-d or --device \t\t DNBD3 device name.\n" );
printf( "-a or --ahead \t\t Read ahead in KByte (default %i).\n", DEFAULT_READ_AHEAD_KB );
printf( "-c or --close \t\t Disconnect and close device.\n" );
printf( "-s or --switch \t\t Switch dnbd3-server on device (DEBUG).\n" );
+ printf( "-S or --sticky \t\t Use only servers from command line (no learning from servers)\n" );
+ printf( "-A or --add \t\t Add given dnbd3-server on device.\n");
+ printf( "-R or --remove \t\t Remove given dnbd3-server on device.\n");
printf( "-H or --help \t\t Show this help text and quit.\n" );
printf( "-V or --version \t Show version and quit.\n\n" );
- printf( "\t--daemon \t Run as helper daemon\n" );
- printf( "\t--kill \t Kill running helper daemon\n" );
+ printf( " --daemon \t\t Run as helper daemon\n" );
+ printf( " --kill \t\t Kill running helper daemon\n\n" );
printf( "The helper daemon makes it possible for normal users to connect dnbd3 devices.\n" );
- printf( "The client binary needs to be a setuid program for this to work!\n\n" );
}
void dnbd3_print_version()
{
- printf( "Version: %s\n", VERSION_STRING );
- exit( EXIT_SUCCESS );
+ printf( "dnbd3-client version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
}
diff --git a/src/clientconfig.h b/src/clientconfig.h
deleted file mode 100644
index f35f673..0000000
--- a/src/clientconfig.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _CLIENTCONFIG_H_
-#define _CLIENTCONFIG_H_
-
-// Which is the minimum protocol version the client expects from the server
-#define MIN_SUPPORTED_SERVER 2
-
-// in seconds if not stated otherwise (MS = milliseconds)
-#define SOCKET_TIMEOUT_CLIENT_DATA 2
-#define SOCKET_TIMEOUT_CLIENT_DISCOVERY 1
-
-#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
-#define RTT_ABSOLUTE_THRESHOLD (80000) // Or 80ms worse
-#define RTT_UNREACHABLE 0x7FFFFFFul // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
-// This must be a power of two:
-#define RTT_BLOCK_SIZE 4096
-
-#define STARTUP_MODE_DURATION 30
-// Interval of several repeating tasks (in seconds)
-#define TIMER_INTERVAL_PROBE_STARTUP 4
-#define TIMER_INTERVAL_PROBE_NORMAL 22
-#define TIMER_INTERVAL_PROBE_PANIC 2
-#define TIMER_INTERVAL_KEEPALIVE_PACKET 6
-
-// Expect a keepalive response every X seconds
-#define SOCKET_KEEPALIVE_TIMEOUT 8
-
-// Number of unsuccessful alt_server probes before read errors are reported to the block layer
-// (ALL servers will be probed this many times)
-// Set to 0 to disable
-#define PROBE_COUNT_TIMEOUT 0
-
-// ++ Kernel module ++
-#define DEFAULT_READ_AHEAD_KB 512
-#define NUMBER_DEVICES 8
-
-#endif
diff --git a/src/config.h b/src/config.h
deleted file mode 100644
index 50336af..0000000
--- a/src/config.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef CONFIG_H_
-#define CONFIG_H_
-
-// +++++ Network +++++
-// Default port
-#define PORT 5003
-#define RPC_PORT (PORT+1)
-
-// No serialized payload allowed exceeding this many bytes (so actual data from client->server is not affected by this limit!)
-#define MAX_PAYLOAD 1000
-
-// Protocol version should be increased whenever new features/messages are added,
-// so either the client or server can run in compatibility mode, or they can
-// cancel the connection right away if the protocol has changed too much
-#define PROTOCOL_VERSION 3
-// 2017-10-16: Update to v3: Change header to support request hop-counting
-
-#define NUMBER_SERVERS 8 // Number of alt servers per image/device
-
-// +++++ Block Device +++++
-#define DNBD3_BLOCK_SIZE ((uint64_t)4096) // NEVER CHANGE THIS OR THE WORLD WILL END!
-
-#endif /* CONFIG_H_ */
diff --git a/src/cowtest/CMakeLists.txt b/src/cowtest/CMakeLists.txt
new file mode 100644
index 0000000..235a371
--- /dev/null
+++ b/src/cowtest/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Build script for the dnbd3-fuse-cow-test executable (single C source: main.c).
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-fuse-cow-test
+ LANGUAGES C)
+
+
+# find atomic library required by DNBD3_CLIENT_FUSE_COW_TEST
+# (both lookups are currently commented out)
+#find_package(Stdatomic REQUIRED)
+#find_package(Libatomic REQUIRED)
+
+# add compile option to enable enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+# source files of the test executable; there are no dedicated header files yet
+set(DNBD3_CLIENT_FUSE_COW_TEST_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+#set(DNBD3_CLIENT_FUSE_COW_TEST_HEADER_FILES )
+
+
+# build the executable and link it against the dnbd3-version and dnbd3-shared
+# targets as well as the thread library
+add_executable(dnbd3-fuse-cow-test ${DNBD3_CLIENT_FUSE_COW_TEST_SOURCE_FILES})
+target_link_libraries(dnbd3-fuse-cow-test dnbd3-version dnbd3-shared ${CMAKE_THREAD_LIBS_INIT})
+
+
+# install the binary into <prefix>/bin as part of the "cowtest" component
+install(TARGETS dnbd3-fuse-cow-test RUNTIME DESTINATION bin
+ COMPONENT cowtest)
+
+# linter targets (currently commented out)
+#add_linter(dnbd3-fuse-lint "${DNBD3_CLIENT_FUSE_COW_TEST_SOURCE_FILES}" "${DNBD3_CLIENT_FUSE_COW_TEST_HEADER_FILES}")
+#add_linter_fix(dnbd3-fuse-lint-fix "${DNBD3_CLIENT_FUSE_COW_TEST_SOURCE_FILES}" "${DNBD3_CLIENT_FUSE_COW_TEST_HEADER_FILES}")
diff --git a/src/cowtest/main.c b/src/cowtest/main.c
new file mode 100644
index 0000000..dc2bac5
--- /dev/null
+++ b/src/cowtest/main.c
@@ -0,0 +1,1305 @@
+#include <dnbd3/config/cow.h>
+#include <dnbd3/types.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdatomic.h>
+#include <time.h>
+#include <pthread.h>
+#include <getopt.h>
+#include <signal.h>
+
+// Signature of the verify* functions: no arguments, returns true on success.
+typedef bool ( *func_ptr )();
+// One entry of the verification table filled by verifyTests() and walked by
+// verifyFinalFile().
+typedef struct verify_test
+{
+ off_t offset; // file offset at which the verify function is invoked
+ size_t size; // number of bytes verifyFinalFile() skips after the function succeeded
+ func_ptr test; // verify function to call for this region
+} verify_test_t;
+
+// One step of specialTwoFilesTest(): a size of 0 truncates both files to
+// `offset`, any other size writes that many random bytes at `offset`.
+typedef struct special_test
+{
+ off_t offset;
+ size_t size;
+
+} special_test_t;
+
+
+
+// Sizes used to derive the test offsets.
+// NOTE(review): presumably these mirror the cow file layout (entries per L2
+// table, bitfield bytes per entry) — confirm against the cow implementation.
+const size_t l2Size = 1024;
+const size_t bitfieldByteSize = 40;
+// number of image bytes spanned by l2Size entries of bitfieldByteSize * 8 blocks each
+const size_t l2Capacity = l2Size * DNBD3_BLOCK_SIZE * bitfieldByteSize * 8;
+// size of the generated test image: 2.9 times l2Capacity
+const size_t testFileSize = l2Capacity * 2.9L;
+
+// cleared to end the random write loop
+atomic_bool randomTestLoop = true;
+
+// Upper bound (bytes) for a single random write; also the size of the I/O buffers.
+// Parenthesized so that expressions such as `x % RND_MAX_WRITE_SIZE` and
+// `RND_MAX_WRITE_SIZE / 2` bind to the whole product.
+#define RND_MAX_WRITE_SIZE ( 4096 * 320 )
+// probabilities in percent, compared against rand() % 100
+#define RND_TRUNCATE_PROBABILITY 5
+#define RND_UNALIGNED_WRITE_PROBABILITY 80
+#define RND_DEFAULT_MIN_SIZE_PERCENT 0.9f
+#define RND_DEFAULT_MAX_SIZE_PERCENT 1.1f
+// byte value the generated test file is filled with
+#define BASE_DATA (char)42
+// limits x to the range [min, max]: cap at max first, then raise to min
+#define CLAMP( x, min, max ) MAX( MIN( ( x ), ( max ) ), ( min ) )
+
+// seconds to wait between the rounds of multipleWrites()
+int delay = 0;
+// path and handle of the file the standard tests operate on
+static char filePath[400];
+static int fh = 0;
+
+// when true, compare() prints a hex diff on mismatch
+bool printOnError = true;
+
+/**
+ * @brief generates a test file that is filled entirely with BASE_DATA
+ *
+ * The file is created (or truncated if it already exists) with read/write
+ * permission for the owner only.
+ *
+ * @param path Location where the file is created
+ * @param size Size of the file in byte
+ * @return true if the file was created and completely written, false otherwise
+ */
+bool generateTestFile( char *path, size_t size )
+{
+	int fd = open( path, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR );
+	if ( fd == -1 ) {
+		perror( "Could not create test file: " );
+		return false;
+	}
+
+	bool ok = true;
+	size_t writtenSize = 0;
+	char buf[DNBD3_BLOCK_SIZE * 50];
+	memset( buf, BASE_DATA, sizeof( buf ) );
+	while ( writtenSize < size ) {
+		size_t sizeToWrite = MIN( sizeof( buf ), size - writtenSize );
+		ssize_t tmp = pwrite( fd, buf, sizeToWrite, (off_t)writtenSize );
+		if ( tmp == 0 ) {
+			// no progress and no errno to report; bail out instead of looping forever
+			printf( "Error while populating the test file: no data written\n" );
+			ok = false;
+			break;
+		}
+		if ( tmp == -1 ) {
+			perror( "Error while populating the test file: " );
+			ok = false;
+			break;
+		}
+		writtenSize += (size_t)tmp;
+	}
+
+	// always release the descriptor, also on the error paths
+	if ( close( fd ) != 0 && ok ) {
+		perror( "Error while closing the test file: " );
+		ok = false;
+	}
+	if ( ok ) {
+		printf( "Generated Test File of size: %zu bytes. \n", size );
+	}
+	return ok;
+}
+
+/**
+ * @brief prints the differing parts of two buffers as hexadecimal values
+ *
+ * Only the surroundings of a mismatch are printed: when a difference is
+ * found, the output steps back up to three bytes and prints a window of
+ * four bytes. Equal bytes are printed once, differing bytes as
+ * "[expected/got]". Skipped parts are marked with "..".
+ *
+ * @param str expected data
+ * @param got data that was actually read
+ * @param len number of bytes to compare in both buffers
+ */
+void printCharInHexadecimal( const char *str, const char *got, int len )
+{
+	int pr = 0; // bytes left to print in the current window
+	for ( int i = 0; i < len; ++i ) {
+		if ( pr > 0 ) {
+			pr--;
+			// go through unsigned char so bytes >= 0x80 are not sign-extended to ffffffXX
+			if ( str[i] != got[i] ) {
+				printf( "[%02x/%02x] ", (unsigned)(unsigned char)str[i], (unsigned)(unsigned char)got[i] );
+			} else {
+				printf( "%02x ", (unsigned)(unsigned char)str[i] );
+			}
+			if ( pr == 0 ) {
+				printf( " .." );
+			}
+		} else if ( str[i] != got[i] ) {
+			// open a new window; the loop increment moves i to the first byte printed
+			pr = 4;
+			i = MAX( -1, i - 4 );
+			if ( i != -1 ) {
+				printf(".. " );
+			}
+		}
+	}
+	printf( "\n" );
+}
+
+/**
+ * @brief checks two buffers for equality. On a mismatch the given error
+ * message is printed, followed by a hexadecimal diff if printOnError is set.
+ *
+ * @param buff data that was read
+ * @param expected data that should have been read
+ * @param size number of bytes to compare
+ * @param errorMessage message printed when the buffers differ
+ * @return true if both arrays are equal
+ * @return false if both arrays are not equal
+ */
+bool compare( char buff[], char expected[], size_t size, char errorMessage[] )
+{
+	const bool identical = ( memcmp( buff, expected, size ) == 0 );
+	if ( identical ) {
+		return true;
+	}
+
+	printf( "%s\n", errorMessage );
+	if ( printOnError ) {
+		printf( "Diff [want/got]: \n" );
+		printCharInHexadecimal( expected, buff, (int)size );
+	}
+	return false;
+}
+
+/**
+ * @brief performs a file read and reports failure if the read fails or
+ * returns fewer bytes than requested.
+ *
+ * @param fh fileHandle
+ * @param buf buffer
+ * @param size size to read
+ * @param off offset in file
+ * @param error error message which should be displayed if read is smaller than expected
+ * @return true only if exactly `size` bytes (or more) were read
+ */
+bool readSizeTested( int fh, char *buf, ssize_t size, off_t off, char *error )
+{
+	ssize_t readSize = pread( fh, buf, size, off );
+	if ( readSize == -1 ) {
+		// a failed read must not be reported as success: buf holds no valid data
+		perror( "Read failed: " );
+		return false;
+	}
+	if ( readSize < size ) {
+		printf( "%s \n size read: %zd\n Expected %zd\n", error, readSize, size );
+		return false;
+	}
+	return true;
+}
+
+/**
+ * @brief writes a buffer to a file at the given offset. If fewer bytes than
+ * requested were written, the given error message is printed.
+ *
+ * @param fh filehandle
+ * @param buf buffer which contains the data
+ * @param size size to write
+ * @param off offset in file
+ * @param error error Message
+ * @return true if at least `size` bytes were written, false otherwise
+ */
+bool writeSizeTested( int fh, char *buf, ssize_t size, off_t off, char *error )
+{
+	const ssize_t written = pwrite( fh, buf, size, off );
+	if ( written == -1 ) {
+		perror( "write failed: " );
+	}
+	if ( written >= size ) {
+		return true;
+	}
+	printf( "%s", error );
+	return false;
+}
+
+/**
+ * @brief similar to writeSizeTested, only that it writes the same data to
+ * two files. The second file is only written if the first write succeeded.
+ *
+ * @param fhm filehandle one (mounted image)
+ * @param fhn filehandle two (normal image)
+ * @param buf buffer which contains the data
+ * @param size size to write
+ * @param off offset in file
+ * @return true if both writes were complete
+ */
+bool writeTwoFilesTested( int fhm, int fhn, char *buf, ssize_t size, off_t off )
+{
+	// off_t and ssize_t are signed; print them with matching conversions
+	printf( "write offset: %lld size: %zd\n", (long long)off, size );
+
+	if ( !writeSizeTested( fhm, buf, size, off, "failed to write on mounted image" ) ) {
+		return false;
+	}
+	if ( !writeSizeTested( fhn, buf, size, off, "failed to write on normal image" ) ) {
+		return false;
+	}
+	return true;
+}
+
+/**
+ * @brief changes a file to a given size and then verifies that size
+ *
+ * @param filePath path of the file to truncate
+ * @param size new size of the file in bytes
+ * @return true if truncate succeeded and stat reports exactly `size` bytes
+ */
+bool changeFileSizeAndVerify( char *filePath, size_t size )
+{
+	if ( truncate( filePath, size ) != 0 ) {
+		perror( "truncate failed: " );
+		return false;
+	}
+	// verify
+	struct stat st;
+	if ( stat( filePath, &st ) != 0 ) {
+		// without this check st would be read uninitialized
+		perror( "stat after truncate failed: " );
+		return false;
+	}
+	size_t newSize = (size_t)st.st_size;
+
+	if ( size != newSize ) {
+		printf( "truncate failed, wrong file size\n expectedSize: %zu\n got: %zu\n", size, newSize );
+		return false;
+	}
+	return true;
+}
+
+/**
+ * @brief resizes two files to the same size and verifies each new size.
+ * The second file is only touched if the first one was resized successfully.
+ *
+ * @param filePath path of the first file
+ * @param filePath2 path of the second file
+ * @param size new size in bytes for both files
+ * @return true if both files have the requested size afterwards
+ */
+bool changeTwoFileSizeAndVerify( char *filePath, char *filePath2, size_t size )
+{
+	printf( "change filesize to: %zu\n", size );
+	if ( !changeFileSizeAndVerify( filePath, size ) ) {
+		return false;
+	}
+	return changeFileSizeAndVerify( filePath2, size );
+}
+
+/*
+* Most tests below implement a test and a verify method.
+* The tests read and modify the data while the verify methods
+* compare the data modified by the tests with the expected data.
+* The verify method is mostly used by the tests at the end and by
+* the verifyFinalFile method to compare the whole image.
+*/
+
+/**
+ * @brief verifies the data written by testSingleBit: in block 0 only the
+ * first byte is 1, in block 1 only the middle byte is 1, everything else
+ * is BASE_DATA.
+ *
+ * @return true if both blocks could be read and match the expectation
+ */
+bool verifySingleBit()
+{
+	char actual[DNBD3_BLOCK_SIZE];
+	char wanted[DNBD3_BLOCK_SIZE];
+
+	// block 0: first byte overwritten
+	memset( wanted, BASE_DATA, DNBD3_BLOCK_SIZE );
+	wanted[0] = 1;
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE, 0, "SingleBit test Failed: first read to small" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE, "SingleBit test Failed: first read not as expected" ) ) {
+		return false;
+	}
+
+	// block 1: byte in the middle overwritten
+	wanted[0] = BASE_DATA;
+	wanted[DNBD3_BLOCK_SIZE / 2] = 1;
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read to small" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read not as expected" ) ) {
+		return false;
+	}
+
+	printf( "testSingleBit successful!\n" );
+	return true;
+}
+
+/**
+ * @brief This test checks that individual bytes can be written.
+ * First block 0 is rewritten with its first byte set to 1, then a single
+ * byte with value 1 is written into the middle of block 1. Before each
+ * write the affected block is checked to still contain only BASE_DATA.
+ *
+ * @return true if all reads, writes and the final verification succeeded
+ */
+bool testSingleBit()
+{
+	char actual[DNBD3_BLOCK_SIZE];
+	char data[DNBD3_BLOCK_SIZE];
+	const off_t secondBlock = DNBD3_BLOCK_SIZE;
+
+	// block 0 must be untouched, then gets rewritten with a changed first byte
+	memset( data, BASE_DATA, DNBD3_BLOCK_SIZE );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE, 0, "SingleBit test Failed: first read to small" )
+			|| !compare( actual, data, DNBD3_BLOCK_SIZE, "SingleBit test Failed: initial read" ) ) {
+		return false;
+	}
+	data[0] = 1;
+	if ( !writeSizeTested( fh, data, DNBD3_BLOCK_SIZE, 0, "SingleBit test Failed: first write failed" ) ) {
+		return false;
+	}
+
+	// block 1 must be untouched, then one byte in its middle is written
+	data[0] = BASE_DATA;
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE, secondBlock, "SingleBit test Failed: second read to small" )
+			|| !compare( actual, data, DNBD3_BLOCK_SIZE, "SingleBit test Failed: second read" ) ) {
+		return false;
+	}
+	data[0] = 1; // only this first byte of data is sent by the next write
+	if ( !writeSizeTested( fh, data, 1, DNBD3_BLOCK_SIZE + DNBD3_BLOCK_SIZE / 2,
+			"SingleBit test Failed: second write failed" ) ) {
+		return false;
+	}
+	return verifySingleBit();
+}
+
+/**
+ * @brief verifies the region written by writeOverTwoBlocks: the two blocks
+ * starting at block 3 must consist only of bytes with value 1.
+ *
+ * @return true if the region could be read and matches
+ */
+bool verifyWriteOverTwoBlocks()
+{
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char wanted[DNBD3_BLOCK_SIZE * 2];
+
+	memset( wanted, 1, DNBD3_BLOCK_SIZE * 2 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, DNBD3_BLOCK_SIZE * 3,
+			"writeOverTwoBlocks test Failed: read to small" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE * 2, "OverTwoBlocks test Failed: write not as expected" ) ) {
+		return false;
+	}
+	printf( "writeOverTwoBlocks successful!\n" );
+	return true;
+}
+
+/**
+ * @brief This test checks that a single write operation spanning two dnbd3
+ * blocks is possible. Blocks 3 and 4 are first checked to contain only
+ * BASE_DATA and are then overwritten with bytes of value 1.
+ *
+ * @return true if the reads, the write and the final verification succeeded
+ */
+
+bool writeOverTwoBlocks()
+{
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char data[DNBD3_BLOCK_SIZE * 2];
+
+	// region must still hold the original image data
+	memset( data, BASE_DATA, DNBD3_BLOCK_SIZE * 2 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, DNBD3_BLOCK_SIZE * 3,
+			"writeOverTwoBlocks test Failed: read to small" )
+			|| !compare( actual, data, DNBD3_BLOCK_SIZE * 2, "OverTwoBlocks test Failed: initial read" ) ) {
+		return false;
+	}
+
+	// overwrite both blocks in one go
+	memset( data, 1, DNBD3_BLOCK_SIZE * 2 );
+	if ( !writeSizeTested( fh, data, DNBD3_BLOCK_SIZE * 2, DNBD3_BLOCK_SIZE * 3,
+			"writeOverTwoBlocks test Failed: write failed" ) ) {
+		return false;
+	}
+	return verifyWriteOverTwoBlocks();
+}
+
+/**
+ * @brief verifies the region written by writeOverL2: two blocks starting
+ * one block before l2Capacity * 2 must consist only of bytes with value 1.
+ *
+ * @return true if the region could be read and matches
+ */
+bool verifyWriteOverL2()
+{
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char wanted[DNBD3_BLOCK_SIZE * 2];
+	const size_t start = l2Capacity * 2 - DNBD3_BLOCK_SIZE;
+
+	memset( wanted, 1, DNBD3_BLOCK_SIZE * 2 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, start, "writeOverL2 test Failed: read to small" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE * 2, "writeOverL2 test Failed: write not as expected" ) ) {
+		return false;
+	}
+	printf( "writeOverL2 successful!\n" );
+	return true;
+}
+
+
+/**
+ * @brief This test checks that a write operation crossing an L2 border is
+ * possible. Two blocks starting one block before l2Capacity * 2 are checked
+ * to contain BASE_DATA and are then overwritten with bytes of value 1.
+ *
+ * @return true if the reads, the write and the final verification succeeded
+ */
+bool writeOverL2()
+{
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char data[DNBD3_BLOCK_SIZE * 2];
+	const size_t start = l2Capacity * 2 - DNBD3_BLOCK_SIZE;
+
+	// region must still hold the original image data
+	memset( data, BASE_DATA, DNBD3_BLOCK_SIZE * 2 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, start, "writeOverL2 test Failed: read to small" )
+			|| !compare( actual, data, DNBD3_BLOCK_SIZE * 2, "writeOverL2 test Failed: initial read" ) ) {
+		return false;
+	}
+
+	memset( data, 1, DNBD3_BLOCK_SIZE * 2 );
+	if ( !writeSizeTested( fh, data, DNBD3_BLOCK_SIZE * 2, start, "writeOverL2 test Failed: write failed" ) ) {
+		return false;
+	}
+	return verifyWriteOverL2();
+}
+
+
+/**
+ * @brief verifies the region written by writeNotOnBlockBorder: two blocks
+ * worth of data starting half a block before block 11 must consist only of
+ * bytes with value 1.
+ *
+ * @return true if the region could be read and matches
+ */
+bool verifyWriteNotOnBlockBorder()
+{
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char wanted[DNBD3_BLOCK_SIZE * 2];
+	const size_t start = DNBD3_BLOCK_SIZE * 11 - DNBD3_BLOCK_SIZE / 2;
+
+	memset( wanted, 1, DNBD3_BLOCK_SIZE * 2 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, start, "writeNotOnBlockBorder test Failed: read to small" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE * 2,
+					"writeNotOnBlockBorder test Failed: write not as expected" ) ) {
+		return false;
+	}
+	printf( "writeNotOnBlockBorder successful!\n" );
+	return true;
+}
+
+
+/**
+ * @brief This test checks writes that do not start at a 4096 byte block
+ * boundary. Two blocks worth of data are written starting half a block
+ * before block 11, after checking that the region holds BASE_DATA.
+ *
+ * @return true if the reads, the write and the final verification succeeded
+ * @return false otherwise
+ */
+bool writeNotOnBlockBorder()
+{
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char data[DNBD3_BLOCK_SIZE * 2];
+	const size_t start = DNBD3_BLOCK_SIZE * 11 - DNBD3_BLOCK_SIZE / 2;
+
+	// region must still hold the original image data
+	memset( data, BASE_DATA, DNBD3_BLOCK_SIZE * 2 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, start, "writeNotOnBlockBorder test Failed: read to small" )
+			|| !compare( actual, data, DNBD3_BLOCK_SIZE * 2, "writeNotOnBlockBorder test Failed: initial read" ) ) {
+		return false;
+	}
+
+	memset( data, 1, DNBD3_BLOCK_SIZE * 2 );
+	if ( !writeSizeTested( fh, data, DNBD3_BLOCK_SIZE * 2, start,
+			"writeNotOnBlockBorder test Failed: write failed" ) ) {
+		return false;
+	}
+	return verifyWriteNotOnBlockBorder();
+}
+
+/**
+ * @brief verifies the data written by writeLongNonAlignedPattern.
+ * The region of l2Capacity + 2 bytes starting at l2Capacity * 3 - 1 is read
+ * in chunks of ten blocks; every chunk must start with the pattern
+ * byte[i] = i % 255.
+ *
+ * @return true if every chunk could be read and matches the pattern
+ */
+bool verifyLongNonAlignedPattern()
+{
+	int chunk = DNBD3_BLOCK_SIZE * 10;
+	char actual[chunk];
+	char pattern[chunk];
+	int idx = 0;
+	while ( idx < chunk ) {
+		pattern[idx] = (char)( idx % 255 );
+		idx++;
+	}
+
+	off_t pos = l2Capacity * 3 - 1;
+	size_t regionSize = l2Capacity + 2;
+	const off_t end = pos + regionSize;
+
+	for ( ; pos < end; ) {
+		size_t step = MIN( chunk, end - pos );
+		if ( !readSizeTested( fh, actual, step, pos, "writeLongNonAlignedPattern test Failed: read failed" ) ) {
+			return false;
+		}
+		if ( !compare( actual, pattern, step, "writeLongNonAlignedPattern test Failed: compare failed" ) ) {
+			return false;
+		}
+		pos += step;
+	}
+	printf( "LongNonAlignedPattern successful!\n" );
+	return true;
+}
+
+/**
+ * @brief This test performs a longer write across an L2 boundary.
+ * l2Capacity + 2 bytes are written starting at l2Capacity * 3 - 1, in chunks
+ * of ten blocks. The data is a pattern (byte[i] = i % 255) whose period is
+ * not a multiple of 4096 bytes.
+ *
+ * @return true if all writes and the final verification succeeded
+ */
+bool writeLongNonAlignedPattern()
+{
+	int chunk = DNBD3_BLOCK_SIZE * 10;
+	char pattern[chunk];
+
+	int idx = 0;
+	while ( idx < chunk ) {
+		pattern[idx] = (char)( idx % 255 );
+		idx++;
+	}
+
+	off_t pos = l2Capacity * 3 - 1;
+	size_t regionSize = l2Capacity + 2;
+	const off_t end = pos + regionSize;
+
+	for ( ; pos < end; ) {
+		size_t step = MIN( chunk, end - pos );
+		if ( !writeSizeTested( fh, pattern, step, pos, "writeLongNonAlignedPattern test Failed: write failed" ) ) {
+			return false;
+		}
+		pos += step;
+	}
+	return verifyLongNonAlignedPattern();
+}
+
+/**
+ * @brief verifies the state left behind by fileSizeChanges.
+ * The last block before 90% of testFileSize must still be BASE_DATA, and
+ * everything from there up to l2Capacity * 3 - 1 must read as 0.
+ *
+ * @return true if all reads succeeded and the data matches
+ */
+bool verifyFileSizeChanges()
+{
+	printf( "verify size changes...\n" );
+	char actual[DNBD3_BLOCK_SIZE * 2];
+	char wanted[DNBD3_BLOCK_SIZE * 2];
+
+	// first block: original data, second block: zeros from the re-grown part
+	memset( wanted, BASE_DATA, DNBD3_BLOCK_SIZE );
+	memset( wanted + DNBD3_BLOCK_SIZE, 0, DNBD3_BLOCK_SIZE );
+	off_t pos = (size_t)( ( (double)testFileSize ) * 0.9 ) - DNBD3_BLOCK_SIZE;
+
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 2, pos, "verifyFileSizeChanges test Failed: read to small\n" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE * 2,
+					"verifyFileSizeChanges test Failed: increased data not as expected.\n" ) ) {
+		return false;
+	}
+
+	// continue behind the two blocks just checked, rounded down to a block border
+	pos += DNBD3_BLOCK_SIZE * 2;
+	pos = pos - ( pos % DNBD3_BLOCK_SIZE );
+
+	// from here on both blocks of the reference buffer are zero
+	memset( wanted, 0, DNBD3_BLOCK_SIZE );
+
+	for ( ; pos < (off_t) (l2Capacity * 3 - 1 ); ) {
+		size_t step = MIN( DNBD3_BLOCK_SIZE * 2, ( l2Capacity * 3 - 1 ) - pos );
+		if ( !readSizeTested( fh, actual, step, pos, "verifyFileSizeChanges test Failed: read to small" )
+				|| !compare( actual, wanted, step, "verifyFileSizeChanges test Failed: data not 0.\n" ) ) {
+			return false;
+		}
+		pos += step;
+	}
+	printf( "verified fileSizeChanges.\n" );
+	return true;
+}
+
+/**
+ * @brief An increase and decrease in the size of the file is tested.
+ * Furthermore, after a decrease in size and subsequent increase in size,
+ * the increased part of the file is completely 0.
+ *
+ * Resizing is done via the global filePath, reading and writing via the
+ * global file handle fh. The steps depend on each other and must run in
+ * exactly this order. Finishes by calling verifyFileSizeChanges().
+ *
+ * @return true if every resize, read, write and comparison succeeded
+ */
+bool fileSizeChanges()
+{
+ // check if increased is 0
+ char buff[DNBD3_BLOCK_SIZE * 10];
+ char expected[DNBD3_BLOCK_SIZE * 10];
+ memset( expected, 0, DNBD3_BLOCK_SIZE * 10 );
+
+ // decrease FileSize
+ printf( "Decrease Filesize to: %zu\n", (size_t)( ( (double)testFileSize ) * 0.9 ) );
+ if ( !changeFileSizeAndVerify( filePath, (size_t)( ( (double)testFileSize ) * 0.9 ) ) ) {
+ return false;
+ }
+
+ // grow back to the original size
+ printf( "increase Filesize to: %zu\n", testFileSize );
+ if ( !changeFileSizeAndVerify( filePath, testFileSize ) ) {
+ return false;
+ }
+
+ // read ten blocks starting one block before the cut: the first block must
+ // still be BASE_DATA, the following nine (cut off and re-added) must be 0
+ memset( expected, BASE_DATA, DNBD3_BLOCK_SIZE );
+ if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE * 10, (size_t)( ( (double)testFileSize ) * 0.9 ) - DNBD3_BLOCK_SIZE,
+ "fileSizeChanges test Failed: read to small" ) )
+ return false;
+ if ( !compare( buff, expected, DNBD3_BLOCK_SIZE * 10, "fileSizeChanges test Failed: increased not as expected.\n" ) )
+ return false;
+
+ // expected is all zero again from here on
+ memset( expected, 0, DNBD3_BLOCK_SIZE );
+ // increase filesize
+
+ if ( !changeFileSizeAndVerify( filePath, testFileSize + 2 * l2Capacity ) ) {
+ return false;
+ }
+ // sample ten blocks from inside the newly added area
+ if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE * 10, testFileSize + l2Capacity,
+ "fileSizeChanges test Failed: read to small" ) )
+ return false;
+ if ( !compare( buff, expected, DNBD3_BLOCK_SIZE * 10, "fileSizeChanges test Failed: increased data not 0" ) )
+ return false;
+ printf( "increased data is 0 as expected\n" );
+ // write on increased blocks
+ memset( expected, 1, DNBD3_BLOCK_SIZE * 10 );
+ if ( !writeSizeTested(
+ fh, expected, DNBD3_BLOCK_SIZE * 10, testFileSize, "fileSizeChanges test Failed: write failed" ) )
+ return false;
+ if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE * 10, testFileSize, "fileSizeChanges test Failed: read to small" ) )
+ return false;
+ if ( !compare(
+ buff, expected, DNBD3_BLOCK_SIZE * 10, "fileSizeChanges test Failed: write on increased size failed" ) )
+ return false;
+ printf( "writes to new Block Ok\n" );
+ // decrease filesize
+ printf( "Truncate file to: %zu \n", testFileSize );
+ if ( !changeFileSizeAndVerify( filePath, testFileSize ) ) {
+ return false;
+ }
+ printf( "size verified\n" );
+ // increase again, check its 0 again
+ printf( "Truncate file to: %zu\n", testFileSize + 2 * l2Capacity );
+ if ( !changeFileSizeAndVerify( filePath, testFileSize + 2 * l2Capacity ) ) {
+ return false;
+ }
+
+ printf( "size verified\n" );
+ memset( expected, 0, DNBD3_BLOCK_SIZE * 10 );
+
+
+ // the ten blocks written with 1 above were cut off and re-added, so they
+ // have to read as 0 now
+ if ( !readSizeTested( fh, buff, DNBD3_BLOCK_SIZE * 10, testFileSize, "fileSizeChanges test Failed: read to small" ) )
+ return false;
+ // NOTE(review): ten blocks are read but only the first two are compared
+ // here — confirm whether DNBD3_BLOCK_SIZE * 10 was intended.
+ if ( !compare( buff, expected, DNBD3_BLOCK_SIZE * 2,
+ "fileSizeChanges test Failed: increased data (second time) not 0" ) )
+ return false;
+ return verifyFileSizeChanges();
+}
+
+
+/**
+ * @brief verifies the data written by interleavedTest. In the ten blocks
+ * starting at block 35, the blocks 0, 2, 4, 5 and 8 must be filled with the
+ * value 10 + block index; the remaining blocks must be BASE_DATA.
+ *
+ * @return true if the region could be read and matches
+ */
+bool verifyInterleavedTest()
+{
+	// block indices (relative to the start of the region) that were written
+	static const int writtenBlocks[] = { 0, 2, 4, 5, 8 };
+	char actual[DNBD3_BLOCK_SIZE * 10];
+	char wanted[DNBD3_BLOCK_SIZE * 10];
+	off_t start = 35 * DNBD3_BLOCK_SIZE;
+
+	memset( wanted, BASE_DATA, DNBD3_BLOCK_SIZE * 10 );
+	for ( size_t n = 0; n < sizeof( writtenBlocks ) / sizeof( writtenBlocks[0] ); n++ ) {
+		const int block = writtenBlocks[n];
+		memset( wanted + ( DNBD3_BLOCK_SIZE * block ), 10 + block, DNBD3_BLOCK_SIZE );
+	}
+
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 10, start, "interleavedTest test Failed: read 2 to small" )
+			|| !compare( actual, wanted, DNBD3_BLOCK_SIZE * 10, "interleavedTest test Failed: read not as expected" ) ) {
+		return false;
+	}
+	printf( "interleavedTest successful!\n" );
+	return true;
+}
+
+/**
+ * @brief Writes several DNBD3 blocks into a ten block region starting at
+ * block 35, leaving gaps between the written blocks. Every written block is
+ * filled with the value 10 + its index inside the region.
+ *
+ * @return true if the initial check, all writes and the final verification succeeded
+ */
+bool interleavedTest()
+{
+	// the four write operations: first block, number of blocks, error message
+	static const struct {
+		int firstBlock;
+		int blockCount;
+		char *error;
+	} writes[] = {
+		{ 0, 1, "interleavedTest test Failed: write 1 failed" },
+		{ 2, 1, "interleavedTest test Failed: write 2 failed" },
+		{ 4, 2, "interleavedTest test Failed: write 3 failed" },
+		{ 8, 1, "interleavedTest test Failed: write 4 failed" },
+	};
+
+	printf( "starting interleavedTest \n" );
+	char actual[DNBD3_BLOCK_SIZE * 10];
+	char data[DNBD3_BLOCK_SIZE * 10];
+	off_t start = 35 * DNBD3_BLOCK_SIZE;
+
+	// the whole region must still hold the original image data
+	memset( data, BASE_DATA, DNBD3_BLOCK_SIZE * 10 );
+	if ( !readSizeTested( fh, actual, DNBD3_BLOCK_SIZE * 10, start, "interleavedTest test Failed: read to small" )
+			|| !compare( actual, data, DNBD3_BLOCK_SIZE * 10, "interleavedTest test Failed: read data not 0" ) ) {
+		return false;
+	}
+
+	for ( size_t n = 0; n < sizeof( writes ) / sizeof( writes[0] ); n++ ) {
+		char *chunk = data + ( DNBD3_BLOCK_SIZE * writes[n].firstBlock );
+		// fill each block of this write with its own marker value
+		for ( int b = 0; b < writes[n].blockCount; b++ ) {
+			memset( chunk + ( DNBD3_BLOCK_SIZE * b ), 10 + writes[n].firstBlock + b, DNBD3_BLOCK_SIZE );
+		}
+		if ( !writeSizeTested( fh, chunk, DNBD3_BLOCK_SIZE * writes[n].blockCount,
+				start + DNBD3_BLOCK_SIZE * writes[n].firstBlock, writes[n].error ) ) {
+			return false;
+		}
+	}
+	return verifyInterleavedTest();
+}
+
+/**
+ * @brief verifies the final state left by multipleWrites: the region of
+ * DNBD3_BLOCK_SIZE * 10 * bitfieldByteSize bytes starting at
+ * 100 * DNBD3_BLOCK_SIZE * bitfieldByteSize must be filled with the value 3.
+ *
+ * @return true if the region could be read and matches
+ */
+bool verifyMultipleWrites()
+{
+	size_t regionSize = DNBD3_BLOCK_SIZE * 10 * bitfieldByteSize;
+	char actual[regionSize];
+	char wanted[regionSize];
+	off_t start = 100 * DNBD3_BLOCK_SIZE * bitfieldByteSize;
+
+	memset( wanted, 3, regionSize );
+	if ( !readSizeTested( fh, actual, regionSize, start, "multipleWrites test Failed: read to small" )
+			|| !compare( actual, wanted, regionSize, "multipleWrites: read incorrect data" ) ) {
+		return false;
+	}
+	printf( "MultipleWrites successful!\n" );
+	return true;
+}
+/**
+ * @brief Tests writing the same blocks multiple times.
+ * The region is written three times with the values 1, 2 and 3 and read
+ * back after every write. A delay between the rounds can be set with the
+ * parameter -d. In this way, for example, it can be ensured that the block
+ * is uploaded between the write operations.
+ *
+ * @return true if all rounds and the final verification succeeded
+ */
+bool multipleWrites()
+{
+	printf( "starting multipleWrites\n" );
+	size_t regionSize = DNBD3_BLOCK_SIZE * 10 * bitfieldByteSize;
+	char actual[regionSize];
+	char data[regionSize];
+	off_t start = 100 * DNBD3_BLOCK_SIZE * bitfieldByteSize;
+
+	int round = 1;
+	while ( round <= 3 ) {
+		printf( "multipleWrites: %i/3 \n", round );
+
+		memset( data, round, regionSize );
+		if ( !writeSizeTested( fh, data, regionSize, start, "multipleWrites: write Failed" )
+				|| !readSizeTested( fh, actual, regionSize, start, "multipleWrites test Failed: read to small" )
+				|| !compare( actual, data, regionSize, "multipleWrites: read incorrect data" ) ) {
+			return false;
+		}
+		// no need to wait after the last round
+		if ( delay > 0 && round < 3 ) {
+			printf( "waiting %is\n", delay );
+			sleep( delay );
+		}
+		round++;
+	}
+	return verifyMultipleWrites();
+}
+
+/**
+ * @brief runs the different tests of the standard test, one after another,
+ * and stops at the first failing one.
+ *
+ * Opens the file into the global handle fh and stores the path in the
+ * global filePath, both of which are used by the individual tests.
+ *
+ * @param path path of the file the tests are run on
+ * @return true if the file could be opened and all tests succeeded
+ */
+bool runTest( char *path )
+{
+	// execution order of the standard tests
+	const func_ptr steps[] = {
+		testSingleBit,
+		writeOverTwoBlocks,
+		writeNotOnBlockBorder,
+		writeOverL2,
+		fileSizeChanges,
+		interleavedTest,
+		multipleWrites,
+		writeLongNonAlignedPattern,
+	};
+
+	// filePath is a fixed size buffer; refuse paths that would overflow it
+	if ( strlen( path ) >= sizeof( filePath ) ) {
+		printf( "Given path is too long (max. %zu characters): %s \n", sizeof( filePath ) - 1, path );
+		return false;
+	}
+	if ( ( fh = open( path, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open test file" );
+		printf( "Given path: %s \n", path );
+		return false;
+	}
+	snprintf( filePath, sizeof( filePath ), "%s", path );
+	printf( "file opened: %s\n", path );
+
+	for ( size_t i = 0; i < sizeof( steps ) / sizeof( steps[0] ); i++ ) {
+		if ( !steps[i]() )
+			return false;
+	}
+	printf( "All test's successful.\n" );
+	return true;
+}
+
+
+/**
+ * @brief fills the table of regions modified by the standard test together
+ * with the function that verifies each region.
+ *
+ * @param tests array with room for at least 8 entries; entries are written
+ * in ascending offset order
+ */
+void verifyTests( verify_test_t *tests )
+{
+	off_t fileSizeOffset = (size_t)( ( (double)testFileSize * 0.9 ) - DNBD3_BLOCK_SIZE );
+	size_t fileSizeSize = ( l2Capacity * 3 - 1 ) - fileSizeOffset;
+
+	tests[0] = ( verify_test_t ){
+		.offset = 0,
+		.size = 2 * DNBD3_BLOCK_SIZE,
+		.test = verifySingleBit,
+	};
+	tests[1] = ( verify_test_t ){
+		.offset = DNBD3_BLOCK_SIZE * 3,
+		.size = DNBD3_BLOCK_SIZE * 3,
+		.test = verifyWriteOverTwoBlocks,
+	};
+	tests[2] = ( verify_test_t ){
+		.offset = DNBD3_BLOCK_SIZE * 11 - DNBD3_BLOCK_SIZE / 2,
+		.size = DNBD3_BLOCK_SIZE * 2,
+		.test = verifyWriteNotOnBlockBorder,
+	};
+	tests[3] = ( verify_test_t ){
+		.offset = 35 * DNBD3_BLOCK_SIZE,
+		.size = DNBD3_BLOCK_SIZE * 10,
+		.test = verifyInterleavedTest,
+	};
+	tests[4] = ( verify_test_t ){
+		.offset = 100 * DNBD3_BLOCK_SIZE * bitfieldByteSize,
+		.size = DNBD3_BLOCK_SIZE * 10 * bitfieldByteSize,
+		.test = verifyMultipleWrites,
+	};
+	tests[5] = ( verify_test_t ){
+		.offset = l2Capacity * 2 - DNBD3_BLOCK_SIZE,
+		.size = DNBD3_BLOCK_SIZE * 2,
+		.test = verifyWriteOverL2,
+	};
+	tests[6] = ( verify_test_t ){
+		.offset = fileSizeOffset,
+		.size = fileSizeSize,
+		.test = verifyFileSizeChanges,
+	};
+	tests[7] = ( verify_test_t ){
+		.offset = l2Capacity * 3 - 1,
+		.size = l2Capacity + 2,
+		.test = verifyLongNonAlignedPattern,
+	};
+}
+
+/**
+ * @brief verifies a file tested with the standard test.
+ * The file is read from start to finish. Regions that were modified by a
+ * test are checked by the matching verify function from verifyTests(); all
+ * other data must be BASE_DATA up to testFileSize and 0 behind it.
+ *
+ * @param path path of the file to verify
+ * @return true if the file has the expected size and content
+ */
+bool verifyFinalFile( char *path )
+{
+	if ( ( fh = open( path, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open test file" );
+		printf( "Given path: %s \n", path );
+		return false;
+	}
+	// verify file size
+
+	size_t fileSize = testFileSize + 2 * l2Capacity;
+	struct stat st;
+	if ( fstat( fh, &st ) != 0 ) {
+		// without this check st would be read uninitialized
+		perror( "Could not stat test file" );
+		return false;
+	}
+	size_t size = (size_t)st.st_size;
+	if ( size != fileSize ) {
+		printf( "verify Failed, wrong file size\n expectedSize: %zu\n got: %zu\n", fileSize, size );
+		return false;
+	}
+
+	// read the whole file
+
+	int maxReadSize = DNBD3_BLOCK_SIZE * COW_BITFIELD_SIZE * 8;
+	char buffer[maxReadSize];
+	char emptyData[maxReadSize];
+	memset( emptyData, BASE_DATA, maxReadSize );
+	size_t offset = 0;
+
+
+	int numberOfTests = 8;
+	verify_test_t tests[numberOfTests];
+	verifyTests( tests );
+
+	int currentTest = 0;
+	bool swapToIncreased = false;
+
+
+	while ( offset < fileSize ) {
+		// never read into the region of the next test
+		size_t sizeToRead = MIN( (size_t)maxReadSize, fileSize - offset );
+		if ( currentTest < numberOfTests ) {
+			sizeToRead = MIN( sizeToRead, tests[currentTest].offset - offset );
+		}
+		if ( currentTest < numberOfTests && tests[currentTest].offset == (off_t)offset ) {
+			if ( !tests[currentTest].test() ) {
+				return false;
+			}
+			offset += tests[currentTest].size;
+			currentTest++;
+		} else {
+			// if offset > testFileSize filler data is 0
+			if ( !swapToIncreased && offset > testFileSize ) {
+				memset( emptyData, 0, maxReadSize );
+				// switch only once instead of clearing the buffer on every iteration
+				swapToIncreased = true;
+			}
+			ssize_t sizeRead = pread( fh, buffer, sizeToRead, offset );
+			if ( sizeRead < 0 ) {
+				perror( "Error while reading data: " );
+				printf( "verify failed. \n" );
+				return false;
+			}
+			if ( sizeRead == 0 ) {
+				// end of file is not an error for pread, so errno is meaningless here
+				printf( "Error while reading data: unexpected end of file at offset %zu\n", offset );
+				printf( "verify failed. \n" );
+				return false;
+			}
+			if ( !compare( buffer, emptyData, sizeRead, "verify failed. Expected 0 data" ) ) {
+				printf( "Offset: %zu \n", offset );
+				return false;
+			}
+			offset += (size_t)sizeRead;
+		}
+	}
+	printf( "file verified successful.\n" );
+	return true;
+}
+
+/**
+ * @brief generates random data by reading it from "/dev/urandom".
+ *
+ * Reads until the buffer is completely filled; short and interrupted reads
+ * are continued. If reading fails for good, an error is printed and the
+ * rest of the buffer is set to 0 so that it never holds stale data.
+ *
+ * @param fhr filehandle to "/dev/urandom"
+ * @param dest buffer into which the random data is to be stored
+ * @param size number of bytes to store in dest
+ */
+void generateRandomData( int fhr, char *dest, size_t size )
+{
+	size_t filled = 0;
+	while ( filled < size ) {
+		ssize_t ret = read( fhr, dest + filled, size - filled );
+		if ( ret > 0 ) {
+			filled += (size_t)ret;
+			continue;
+		}
+		if ( ret == -1 && errno == EINTR ) {
+			// interrupted by a signal before any data arrived, simply retry
+			continue;
+		}
+		if ( ret == -1 ) {
+			perror( "Could not read random data" );
+		} else {
+			printf( "Could not read random data: unexpected end of file\n" );
+		}
+		memset( dest + filled, 0, size - filled );
+		return;
+	}
+}
+
+/**
+ * @brief prints a progress bar of 50 characters followed by the progress in
+ * percent. The output starts with the escape sequence "\033[F", so that a
+ * previously printed bar is overwritten.
+ *
+ * @param progress Percent of the progress bar [0.0-1.0]; values outside of
+ * this range are limited to it
+ */
+void printProgress( float progress )
+{
+	int barWidth = 50;
+	char bar[barWidth + 1];
+
+	// limit to [0, 1]
+	progress = MIN( 1, progress );
+	progress = MAX( 0, progress );
+
+	// start with an empty bar and fill the completed part from the left
+	int done = (int)( (float)barWidth * progress );
+	memset( bar, ' ', barWidth );
+	memset( bar, '=', done );
+	bar[barWidth] = 0;
+
+	printf( "\033[F[%s] %i%%\n", bar, (int)( progress * 100 ) );
+}
+
+/**
+ * @brief Calculates the offset of the first byte in which two arrays differ
+ *
+ * @param buf1 first array
+ * @param buf2 second array
+ * @param size number of bytes to compare
+ * @return index of the first differing byte, or -1 if the first `size`
+ * bytes of both arrays are equal
+ */
+off_t findDiffOffset( char *buf1, char *buf2, size_t size )
+{
+	size_t pos = 0;
+	// skip the common prefix
+	while ( pos < size && buf1[pos] == buf2[pos] ) {
+		pos++;
+	}
+	if ( pos == size ) {
+		return -1;
+	}
+	return pos;
+}
+
+/**
+ * @brief reads exactly `size` bytes at offset `off`, continuing after short
+ * reads. Prints an error naming `what` on failure or premature end of file.
+ */
+static bool compareReadChunk( int fd, char *buf, size_t size, off_t off, const char *what )
+{
+	size_t done = 0;
+	while ( done < size ) {
+		ssize_t ret = pread( fd, buf + done, size - done, off + (off_t)done );
+		if ( ret == -1 ) {
+			perror( "Read failed: " );
+			printf( "Error: could not read %s image\n", what );
+			return false;
+		}
+		if ( ret == 0 ) {
+			printf( "Error: unexpected end of %s image\n", what );
+			return false;
+		}
+		done += (size_t)ret;
+	}
+	return true;
+}
+
+/**
+ * @brief compare two files for equality
+ *
+ * Both files must have the same size and the same content. The content is
+ * read with explicit offsets, so the current file position of the handles
+ * is neither used nor changed.
+ *
+ * @param mountedImagePath path to file one
+ * @param normalImagePath path to file two
+ * @param fhm filehandle of file one
+ * @param fhn filehandle of file two
+ * @return true if both files are equal
+ */
+bool compareTwoFiles( char *mountedImagePath, char *normalImagePath, int fhm, int fhn )
+{
+	char buf[RND_MAX_WRITE_SIZE];
+	char exBuf[RND_MAX_WRITE_SIZE];
+	off_t offset = 0;
+	struct stat st;
+	if ( stat( mountedImagePath, &st ) != 0 ) {
+		perror( "Could not stat mounted Image" );
+		return false;
+	}
+	size_t sizeMounted = (size_t)st.st_size;
+	if ( stat( normalImagePath, &st ) != 0 ) {
+		perror( "Could not stat normal Image" );
+		return false;
+	}
+	size_t sizeNormal = (size_t)st.st_size;
+
+	if ( sizeMounted != sizeNormal ) {
+		printf( "Error size difference, mounted: %zu normal: %zu \n", sizeMounted, sizeNormal );
+		return false;
+	}
+	printf( "\n" );
+	while ( offset < (off_t)sizeMounted ) {
+		size_t sizeToRead = MIN( sizeof( buf ), sizeMounted - (size_t)offset );
+		if ( !compareReadChunk( fhm, buf, sizeToRead, offset, "mounted" )
+				|| !compareReadChunk( fhn, exBuf, sizeToRead, offset, "normal" ) ) {
+			return false;
+		}
+
+		if ( memcmp( buf, exBuf, sizeToRead ) != 0 ) {
+			off_t dif = findDiffOffset( buf, exBuf, sizeToRead );
+			printf( "Error: Different data, offset: %zu \n expected: %i got %i \n", (size_t)( offset + dif ),
+					(int)exBuf[dif], (int)buf[dif] );
+			return false;
+		}
+
+		offset += (off_t)sizeToRead;
+		printProgress( ( (float)offset ) / ( (float)sizeMounted ) );
+	}
+	printf( "\nTest successful !!!\n" );
+	return true;
+}
+
+/**
+ * @brief opens two files and then compares them for equality.
+ * Both files are always tried to be opened so that every failure is
+ * reported; the handles are closed again before returning.
+ *
+ * @param mountedImagePath path of the mounted image
+ * @param normalImagePath path of the normal image
+ * @return true if both files could be opened and are equal
+ */
+bool startCompareTwoFiles( char *mountedImagePath, char *normalImagePath )
+{
+	int fhm, fhn;
+	bool ok = true;
+	if ( ( fhm = open( mountedImagePath, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open mounted Image" );
+		printf( "Given path: %s \n", mountedImagePath );
+		ok = false;
+	}
+	if ( ( fhn = open( normalImagePath, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open normal Image" );
+		printf( "Given path: %s \n", normalImagePath );
+		ok = false;
+	}
+	if ( ok ) {
+		ok = compareTwoFiles( mountedImagePath, normalImagePath, fhm, fhn );
+	}
+	// release whatever was opened, no matter how we got here
+	if ( fhm != -1 ) {
+		close( fhm );
+	}
+	if ( fhn != -1 ) {
+		close( fhn );
+	}
+	return ok;
+}
+
+
+/**
+ * @brief replays a fixed sequence of writes and truncates on two files and
+ * then compares both files for equality.
+ *
+ * Every step is applied to both files. A step with size 0 truncates the
+ * files to the given offset, every other step writes `size` random bytes at
+ * the given offset. Failing steps are reported by the called helpers but do
+ * not abort the sequence; the final comparison decides the result.
+ *
+ * @param mountedImagePath path of the mounted image
+ * @param normalImagePath path of the normal image
+ * @return true if all files could be opened and both images are equal afterwards
+ */
+bool specialTwoFilesTest( char *mountedImagePath, char *normalImagePath )
+{
+	// { offset, size } of every step
+	static const special_test_t tests[] = {
+		{976314368, 569344},
+		{970432512, 1253376},
+		{959447040, 692224},
+		{782128012, 0},
+		{945591351, 0},
+		{956534784, 344064},
+		{966615040, 397312 },
+		{906517288, 0 },
+		{2062985199, 0},
+		{966663420, 1097920 },
+		{969617408, 327680},
+		{957513728, 1105920},
+		{964941207, 1183680},
+		{958701568, 741376},
+		{958701568, 102400},
+		{970027008, 20480},
+	};
+	int fhm;
+	int fhn;
+	int fhr;
+	char buf[RND_MAX_WRITE_SIZE];
+	bool ok = true;
+	if ( ( fhm = open( mountedImagePath, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open mounted Image" );
+		printf( "Given path: %s \n", mountedImagePath );
+		ok = false;
+	}
+	if ( ( fhn = open( normalImagePath, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open normal Image" );
+		printf( "Given path: %s \n", normalImagePath );
+		ok = false;
+	}
+	if ( ( fhr = open( "/dev/urandom", O_RDONLY ) ) == -1 ) {
+		perror( "Could not open /dev/urandom" );
+		ok = false;
+	}
+
+	if ( ok ) {
+		for ( size_t i = 0; i < sizeof( tests ) / sizeof( tests[0] ); i++ ) {
+			if ( tests[i].size == 0 ) {
+				changeTwoFileSizeAndVerify( mountedImagePath, normalImagePath, tests[i].offset );
+			} else {
+				generateRandomData( fhr, buf, tests[i].size );
+				writeTwoFilesTested( fhm, fhn, buf, tests[i].size, tests[i].offset );
+			}
+		}
+		printf( "\n" );
+		ok = compareTwoFiles( mountedImagePath, normalImagePath, fhm, fhn );
+	}
+
+	// release whatever was opened, no matter how we got here
+	if ( fhm != -1 ) {
+		close( fhm );
+	}
+	if ( fhn != -1 ) {
+		close( fhn );
+	}
+	if ( fhr != -1 ) {
+		close( fhr );
+	}
+	return ok;
+}
+
+/**
+ * @brief Performs random writes and size changes on both images until interrupted, then compares them.
+ *
+ * The loop runs as long as the global randomTestLoop flag is set (it is cleared
+ * by the SIGINT handler). Each iteration either truncates both files to a random
+ * size between minSizePercent and maxSizePercent of the start size, or writes the
+ * same random data at the same offset into both files.
+ *
+ * @param mountedImagePath path of the mounted image
+ * @param normalImagePath path of the image copy on the normal file system
+ * @param minSizePercent lower bound for resizes, as a fraction of the start size (1.0 == 100%)
+ * @param maxSizePercent upper bound for resizes, as a fraction of the start size
+ * @return false if a file could not be opened/stat'ed, a write or resize failed,
+ *         or the final comparison failed; true otherwise
+ */
+bool randomWriteTest( char *mountedImagePath, char *normalImagePath, float minSizePercent, float maxSizePercent)
+{
+	int fhm;
+	int fhn;
+	int fhr;
+	srand( (unsigned)time( NULL ) );
+	struct stat st;
+	if ( stat( normalImagePath, &st ) == -1 ) {
+		// Without a valid start size all derived limits would be garbage
+		perror( "Could not stat normal Image" );
+		printf( "Given path: %s \n", normalImagePath );
+		return false;
+	}
+	size_t startFileSize = (size_t)st.st_size;
+	size_t maxOffset = (size_t)( startFileSize * 1.1L );
+	if ( maxOffset == 0 ) {
+		// Keep the modulo below defined for an empty image
+		maxOffset = 1;
+	}
+	double minFileSize = (double)startFileSize * minSizePercent;
+	// Width of the resize window in bytes: (max% - min%) of the start size
+	double fileSizeVariation = (double)startFileSize * ( maxSizePercent - minSizePercent );
+	// rand() cannot exceed RAND_MAX, and the modulus must be a positive int
+	int sizeVariationRange = 1;
+	if ( fileSizeVariation >= (double)RAND_MAX ) {
+		sizeVariationRange = RAND_MAX;
+	} else if ( fileSizeVariation >= 1.0 ) {
+		sizeVariationRange = (int)fileSizeVariation;
+	}
+
+	char buf[RND_MAX_WRITE_SIZE];
+
+	printf( "===starting random write test ===\n" );
+	printf( "mounted image path %s\n", mountedImagePath );
+	printf( "normal image path %s\n", normalImagePath );
+
+	bool ok = true;
+	if ( ( fhm = open( mountedImagePath, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open mounted Image" );
+		printf( "Given path: %s \n", mountedImagePath );
+		ok = false;
+	}
+	if ( ( fhn = open( normalImagePath, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		perror( "Could not open normal Image" );
+		printf( "Given path: %s \n", normalImagePath );
+		ok = false;
+	}
+	if ( ( fhr = open( "/dev/urandom", O_RDONLY ) ) == -1 ) {
+		perror( "Could not open /dev/urandom" );
+		ok = false;
+	}
+	if ( !ok ) {
+		return false;
+	}
+	// RANDOM WRITE LOOP
+	printf( "Press Ctrl-C to stop and compare\n" );
+	while ( randomTestLoop ) {
+		// select test
+		int r = rand() % 100;
+		if ( r < RND_TRUNCATE_PROBABILITY ) {
+			// truncate both images
+			size_t size = (size_t)( ( rand() % sizeVariationRange ) + minFileSize );
+
+			printf( "change filesize to: %zu\n", size );
+			if ( !changeFileSizeAndVerify( mountedImagePath, size ) ) {
+				return false;
+			}
+			if ( !changeFileSizeAndVerify( normalImagePath, size ) ) {
+				return false;
+			}
+
+		} else {
+			// write test
+			off_t offset = rand() % maxOffset;
+			size_t size = ( rand() + offset ) % RND_MAX_WRITE_SIZE;
+			if ( size < RND_MAX_WRITE_SIZE / 2 ) {
+				// Bias the distribution towards small writes
+				size /= 1 + ( rand() % 8192 );
+			}
+			size = MAX( size, 1 );
+			if ( r > RND_TRUNCATE_PROBABILITY + RND_UNALIGNED_WRITE_PROBABILITY ) {
+				// align to block
+				offset = offset - ( offset % 4096 );
+				size = MAX( size - ( size % 4096 ), 4096 );
+			}
+
+			generateRandomData( fhr, buf, size );
+			// off_t has no printf length modifier of its own, so print it as long long
+			printf( "write offset: %lld size: %zu r: %d\n", (long long)offset, size, r );
+			if ( !writeSizeTested( fhm, buf, size, offset, "failed to write on mounted image" ) ) {
+				return false;
+			}
+			if ( !writeSizeTested( fhn, buf, size, offset, "failed to write on normal image" ) ) {
+				return false;
+			}
+		}
+	}
+
+	// COMPARE BOTH IMAGES
+	printf( "comparing both files: \n\n" );
+	// The comparison is the actual verdict of this test, so hand it to the caller
+	return compareTwoFiles( mountedImagePath, normalImagePath, fhm, fhn );
+}
+/**
+ * @brief SIGINT (Ctrl+C) handler that ends the random test loop.
+ *
+ * Only clears the global randomTestLoop flag, which randomWriteTest() uses as
+ * its loop condition.
+ *
+ * @param s signal number (unused)
+ */
+void cancled_handler(__attribute__( ( unused ) ) int s) {
+ randomTestLoop = false;
+}
+/**
+ * @brief starts the random write test
+ *
+ * Validates the size limits (falling back to the defaults if they are out of
+ * range), installs cancled_handler() for SIGINT and then runs randomWriteTest().
+ *
+ * @param mountedImagePath path of the mounted file
+ * @param normalImagePath path of the file on the drive
+ * @param minSizePercent Minimum size the file is resized to, as a fraction of the start size (1.0 == 100%).
+ * @param maxSizePercent Maximum size the file is resized to, as a fraction of the start size.
+ * @return false if the signal handler could not be installed, otherwise the result of randomWriteTest()
+ */
+bool startRandomWriteTest( char *mountedImagePath, char *normalImagePath, float minSizePercent, float maxSizePercent )
+{
+	if ( minSizePercent < 0 || maxSizePercent < minSizePercent || maxSizePercent < 0.1 ) {
+		printf( "minSizePercent or maxSizePercent of wrong value, reverting to default.\n" );
+		minSizePercent = RND_DEFAULT_MIN_SIZE_PERCENT;
+		maxSizePercent = RND_DEFAULT_MAX_SIZE_PERCENT;
+	}
+	printf( "minSizePercent: %.1f%% maxSizePercent: %.1f%%\n", minSizePercent * 100, maxSizePercent * 100 );
+
+	// Zero the whole struct so no field is passed to the kernel uninitialized
+	struct sigaction sigIntHandler = { 0 };
+	sigIntHandler.sa_handler = cancled_handler;
+	sigemptyset( &sigIntHandler.sa_mask );
+	// The handler is a plain one-argument sa_handler, so SA_SIGINFO must not be set
+	sigIntHandler.sa_flags = 0;
+	if ( sigaction( SIGINT, &sigIntHandler, NULL ) == -1 ) {
+		// Without the handler the test loop could never be stopped cleanly
+		perror( "Could not install SIGINT handler" );
+		return false;
+	}
+	return randomWriteTest( mountedImagePath, normalImagePath, minSizePercent, maxSizePercent );
+}
+
+// Short options; every option takes an argument (trailing ':').
+static const char *optString = "d:c:t:v:r:x:y:z:w:h:";
+// Long options, mapped onto the short option characters above.
+// The argument requirement has to match optString: an optional_argument entry
+// would leave optarg NULL for "--testFile <path>" (only "--testFile=<path>" is
+// recognized in that mode), while "-c <path>" always requires the argument.
+static const struct option longOpts[] = {
+	{ "delay", required_argument, NULL, 'd' },
+	{ "testFile", required_argument, NULL, 'c' },
+	{ "test", required_argument, NULL, 't' },
+	{ "verify", required_argument, NULL, 'v' },
+	{ "specialTwoFiles", required_argument, NULL, 'w' },
+	{ "randomTest", required_argument, NULL, 'r' },
+	{ "compare", required_argument, NULL, 'x' },
+	{ "minSizePercent", required_argument, NULL, 'y' },
+	{ "maxSizePercent", required_argument, NULL, 'z' },
+	{ "help", required_argument, NULL, 'h' },
+	{ 0, 0, 0, 0 }
+};
+
+
+/**
+ * @brief Prints the step-by-step instructions for the standard test to stdout.
+ */
+void printUsageStandardTest()
+{
+	// One entry per output chunk, emitted in order
+	static const char *const helpLines[] = {
+		"The standard test, tests certain edge cases. Here the previously prepared file (created with -c) must be mounted via the dnbd3 server.\n",
+		"The test performs certain read, write and resize operations to ensure functionality and test for edge cases.\n",
+		"\n",
+		"Instructions on how to use the standard test: \n",
+		"1. Generate the test image with -c <path> and copy it to the image location of the dnbd3 server. Also make sure that the cow servers OriginalImageDirectory points to the same Directory or copied in that Directory too. This step is only needed once for setting up.\n",
+		"2. Start the dnbd3 and cow server.\n",
+		"3. Mount the image in cow mode.\n",
+		"4. Run the test with -t <path>, where the path points to the mounted image.\n",
+		"5. Optional verify again with -v <path>.\n",
+		"6. Optional unmount the image and then load it again (with -L <path> in the fuse client). Then verify the loaded image with -v <path>.\n",
+		"7. Unmount and merge the image (to merge the image use -m on the fuse client).\n",
+		"8. Verify the merged image from the cow server with -v <path>.\n",
+	};
+	const size_t lineCount = sizeof( helpLines ) / sizeof( helpLines[0] );
+
+	for ( size_t lineIdx = 0; lineIdx < lineCount; ++lineIdx ) {
+		fputs( helpLines[lineIdx], stdout );
+	}
+}
+
+/**
+ * @brief Prints the step-by-step instructions for the random test to stdout.
+ */
+void printUsageRandomTest()
+{
+	printf( "For the random test, a test file (created with -c) is mounted via the dnbd3 server and a copy of it on the normal hard disk is required.\n" );
+	printf( "The test performs random identical write and size changes on both test files. When the test is cancelled (ctrl+c), both files are tested for equality. \n" );
+	printf( "\n" );
+	printf( "Instructions on how to use the random test: \n" );
+	printf( "1. Generate the test image with -c <path> and copy it to the image location of the dnbd3 server. Also make sure that the cow servers OriginalImageDirectory points to the same Directory or copied in that Directory too. This step is only needed once for setting up.\n" );
+	printf( "2. Copy the generated image to another location.\n" );
+	printf( "3. Start the dnbd3 and cow server.\n" );
+	printf( "4. Mount the image in cow mode.\n" );
+	// The random test is selected with -r; -t would start the standard test
+	printf( "5. Run the test with -r <mountedFile> <normalFile>, where the <mountedFile> points to the mounted image and <normalFile> points to the copied image on the disk.\n" );
+	printf( "6. After some time press strg+c and both images will be compared for equalness.\n" );
+	printf( "7. Unmount the image and merge.\n" );
+	printf( "8. Run -x <mergedFile> <normalFile> where the <mergedFile> points to the merged image and <normalFile> points to the copied image on the disk. This will verify that the merged image is equal to the image on the disk.\n" );
+}
+
+/**
+ * @brief Prints the general usage text with all command line options to stdout.
+ */
+void printUsage()
+{
+	printf( "There are two test variants, the standard test in which different edge cases are tested and "
+			"a random test in which data or size changes are randomly made in a mounted file and a file "
+			"located on the normal file system and then compared. "
+			"To get information about the two tests and how to run them use -h test and -h randomTest.\n" );
+	printf( "Usage: \n" );
+	printf( " -c --testFile <file> Creates test file at the path. \n" );
+	printf( " -d --delay <seconds> Delay in Seconds for multiple block write in the standard test.\n" );
+	printf( " -t --test <file> Runs the standard test procedure. \n" );
+	printf( " -v --verify <file> verifies a file. \n" );
+	printf( " -r --randomTest <mountedFile> <normalFile> randomly writes in both file's and after cancel(strg+c) compares them if they are equal.\n" );
+	printf( " -y --minSizePercent <percent> sets the minimum size in percent(integer) the file will be reduced to in the random test.\n" );
+	printf( " -z --maxSizePercent <percent> sets the maximum size in percent(integer) the file will be enlarged to in the random test.\n" );
+	printf( " -x --compare <mountedFile> <normalFile> compares two files for equalness.\n" );
+	// The following options are accepted by the option parser but were missing from the help text
+	printf( " -w --specialTwoFiles <mountedFile> <normalFile> replays a fixed sequence of writes and size changes on both files and compares them.\n" );
+	printf( " -h --help <test|randomTest> prints detailed instructions for the standard test or the random test.\n" );
+}
+
+/**
+ * @brief Takes the two paths of a two-file option (-r, -x, -w).
+ *
+ * The first path is the option argument (optarg), the second one is the next
+ * command line word, which is consumed by advancing optind.
+ *
+ * @return false if the second path is missing or looks like another option
+ */
+static bool takeTwoPaths( int argc, char *argv[], char **firstPath, char **secondPath )
+{
+	if ( optarg == NULL || optind >= argc || argv[optind] == NULL || argv[optind][0] == '-' ) {
+		return false;
+	}
+	*firstPath = optarg;
+	*secondPath = argv[optind];
+	optind++;
+	return true;
+}
+
+/**
+ * @brief Parses the command line and runs the selected test.
+ *
+ * If several test options are given, only one test runs; precedence is
+ * -c, -t, -v, -r, -x, -w. "-c <file>" may be followed by an optional size in
+ * bytes for the generated file.
+ *
+ * @return EXIT_SUCCESS if the selected test succeeded (or only usage was printed), EXIT_FAILURE otherwise
+ */
+int main( int argc, char *argv[] )
+{
+	if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
+		printUsage();
+		return 0;
+	}
+	int opt, lidx;
+
+	bool runTestFile = false;
+	bool runStandardTest = false;
+	bool runVerifyTest = false;
+	bool runRandomTest = false;
+	bool runCompare = false;
+	bool runSpecialTwoFiles = false;
+	char fileCreationPath[400] = "";
+	char *standardTestPath = NULL;
+	char *verifyPath = NULL;
+	char *rndMountedPath = NULL;
+	char *rndNormalPath = NULL;
+	size_t generateFileSize = testFileSize;
+	float minSizePercent = RND_DEFAULT_MIN_SIZE_PERCENT;
+	float maxSizePercent = RND_DEFAULT_MAX_SIZE_PERCENT;
+
+	while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
+		switch ( opt ) {
+		case 'd':
+			delay = (int)strtol( optarg, NULL, 10 );
+			printf( "Delay set to %i\n", delay );
+			break;
+		case 'c':
+			if ( optarg == NULL ) {
+				// Can happen for a long option declared with an optional argument
+				printUsage();
+				return 0;
+			}
+			// snprintf always terminates the buffer, unlike strncpy
+			snprintf( fileCreationPath, sizeof( fileCreationPath ), "%s", optarg );
+			// Optional second word: size of the file to generate
+			if ( optind < argc && argv[optind] != NULL && argv[optind][0] != '-' ) {
+				char *pEnd = NULL;
+				long requestedSize = strtol( argv[optind], &pEnd, 10 );
+				if ( pEnd == argv[optind] || *pEnd != '\0' || requestedSize <= 0 ) {
+					printUsage();
+					return 0;
+				}
+				generateFileSize = (size_t)requestedSize;
+				++optind;
+			}
+			runTestFile = true;
+			break;
+		case 't':
+			standardTestPath = optarg;
+			runStandardTest = true;
+			break;
+		case 'v':
+			verifyPath = optarg;
+			runVerifyTest = true;
+			break;
+		case 'r':
+			if ( !takeTwoPaths( argc, argv, &rndMountedPath, &rndNormalPath ) ) {
+				printUsage();
+				return 0;
+			}
+			runRandomTest = true;
+			break;
+		case 'x':
+			if ( !takeTwoPaths( argc, argv, &rndMountedPath, &rndNormalPath ) ) {
+				printUsage();
+				return 0;
+			}
+			runCompare = true;
+			break;
+		case 'w':
+			if ( !takeTwoPaths( argc, argv, &rndMountedPath, &rndNormalPath ) ) {
+				printUsage();
+				return 0;
+			}
+			runSpecialTwoFiles = true;
+			break;
+		case 'y':
+			minSizePercent = ( (float)strtol( optarg, NULL, 10 ) ) / 100;
+			break;
+		case 'z':
+			maxSizePercent = ( (float)strtol( optarg, NULL, 10 ) ) / 100;
+			break;
+		case 'h':
+			if ( strcmp( optarg, "test" ) == 0 ) {
+				printUsageStandardTest();
+			} else if ( strcmp( optarg, "randomTest" ) == 0 ) {
+				printUsageRandomTest();
+			} else {
+				printUsage();
+			}
+			return 0;
+		default:
+			printUsage();
+			return 0;
+		}
+	}
+	bool result = true;
+	if ( runTestFile ) {
+		result = generateTestFile( fileCreationPath, generateFileSize );
+	} else if ( runStandardTest ) {
+		printf( "starting standard test\n" );
+		result = runTest( standardTestPath );
+	} else if ( runVerifyTest ) {
+		printf( "verifying file \n" );
+		result = verifyFinalFile( verifyPath );
+	} else if ( runRandomTest ) {
+		result = startRandomWriteTest( rndMountedPath, rndNormalPath, minSizePercent, maxSizePercent );
+	} else if ( runCompare ) {
+		result = startCompareTwoFiles( rndMountedPath, rndNormalPath );
+	} else if ( runSpecialTwoFiles ) {
+		result = specialTwoFilesTest( rndMountedPath, rndNormalPath );
+	} else {
+		printUsage();
+	}
+	if ( !result ) {
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
diff --git a/src/cowtest/readme.md b/src/cowtest/readme.md
new file mode 100644
index 0000000..dae6ecf
--- /dev/null
+++ b/src/cowtest/readme.md
@@ -0,0 +1,110 @@
+# Cowtest
+
+### Table of Contents
+1. [Introduction](#introduction)
+2. [Usage](#usage)
+3. [Tests](#tests)
+
+
+
+# Introduction
+
+This test collection is used to check whether the cow implementation of the fuse client is working correctly. It can check whether read and write operations on the cow fuse client work correctly and whether the cow server merges the image correctly.
+
+
+# Usage
+
+### Parameters
+- `-c <path>` generates a test image in the specified path. This image is needed for the tests.
+- `-t <file>` performs the standard tests on the image at the specified location.
+- `-v <file>` checks if previous tests of the image were successful (also reads the image completely).
+- `-r <mountedImageFile> <normalImageFile>` performs random writes and size changes on both images. After pressing ctrl+c, both images are compared for equality.
+- `-x <mergedImageFile> <normalImageFile>` Checks if both images are equal.
+
+
+
+### Example usage for standard test
+
+1. Generate the test image with `-c <path>` and copy it into the image directory of the dnbd3 server. Also make sure that the `OriginalImageDirectory` of the Cow server points to the same directory or has also been copied to this directory. This step is only required once for the setup.
+2. Start the dnbd3 and cow server.
+3. Mount the image in cow mode (`-c <path>` and `-C <address>` on the fuse client).
+4. Run the test with `-t <path>`, with the path pointing to the mounted image.
+5. Optionally verify again with `-v <path>`.
+6. Optionally unmount the image and then load it again (with `-L <path>` instead of `-C <path>` on the Fuse client). Then verify the loaded image with `-v <path>`.
+7. Unmount and merge the image (`-m` on the fuse client).
+8. Verify the merged image from the cow server with `-v <path>`.
+
+### Example usage for random writes
+1. Generate the test image with `-c <path>` and copy it into the image directory of the dnbd3 server. Also make sure that the `OriginalImageDirectory` of the Cow server points to the same directory or has also been copied to this directory. This step is only required once for the setup.
+2. Make a copy of the created image in another location.
+3. Start the dnbd3 and cow server.
+4. Mount the image in cow mode (`-c <path>` and `-C <address>` on the fuse client).
+5. Run the test with `-r <mountedImagePath> <normalImagePath>`, where `<mountedImagePath>` refers to the mounted image and `<normalImagePath>` refers to the copied image on the hard disk.
+6. After some time, press ctrl+c to end the test. Afterwards, both images are automatically compared for equality.
+7. Unmount the image and merge it (`-m` on the fuse client).
+8. Run `-x <mergedImagePath> <normalImagePath>` where `<mergedImagePath>` points to the merged image and `<normalImagePath>` points to the copied image on the hard disk. This verifies that the merged image matches the image on the hard disk.
+
+Another help for running or setting up the tests can be the git ci test script [test-cow-fuse.yml](../../.github/workflows/test-cow-fuse.yml). There, a complete test setup is created and the standard test as well as the random test is executed.
+
+# Tests
+
+### TestSingleBit
+Reads the first block and checks whether all bits are 0. Then sets the first bit to 1 and writes it.
+This test checks the basic functions and whether the image is still "clean".
+Then sets a single bit in the second block to 1 to verify that padding works correctly.
+
+| offset | size |
+| -------| -----|
+| 0 | 2 * DNBD3_BLOCK_SIZE|
+
+
+### WriteOverTwoBlocks
+Tests that a single contiguous write spanning two DNBD3_BLOCKs is possible.
+
+| offset | size |
+| -------| -----|
+| DNBD3_BLOCK_SIZE * 3| DNBD3_BLOCK_SIZE * 2|
+
+
+### WriteNotOnBlockBorder
+Verifies that writes which are not aligned to block boundaries (multiples of 4096) work correctly.
+
+| offset | size |
+| -------| -----|
+| DNBD3_BLOCK_SIZE * 11 - DNBD3_BLOCK_SIZE / 2| DNBD3_BLOCK_SIZE * 2 |
+
+
+### InterleavedTest
+
+| offset | size |
+| -------| -----|
+|DNBD3_BLOCK_SIZE * 35 | DNBD3_BLOCK_SIZE * 10|
+
+### WriteOverL2
+Tests whether continuous writes across L2 boundaries are possible.
+
+| offset | size |
+| -------| -----|
+|l2Capacity * 2 - DNBD3_BLOCK_SIZE | DNBD3_BLOCK_SIZE * 2 |
+
+
+### MultipleWrites
+Writes different data several times on the same block. The individual writes can be delayed with the parameter `-d`. This is useful to test whether uploading the same blocks multiple times works as intended.
+
+
+| offset | size |
+| -------| -----|
+| 100 * DNBD3_BLOCK_SIZE * bitfieldByteSize | DNBD3_BLOCK_SIZE * 10 * bitfieldByteSize |
+
+
+### fileSizeChanges
+Tests changes to the file size. First it increases the file size by 2 * l2Capacity with a truncate. Then it checks if all bits in the newly allocated memory space are set to 0. Then it writes data to the file to check if writes are possible. After that, it is truncated back to the original size. Then it is enlarged again to
+the original size + 2 * l2Capacity and checks whether all bits in the newly allocated memory space are 0 again (so that the previously written data is set to 0 again).
+
+### LongNonAlignedPattern
+This test writes a long pattern over 3 l2 blocks. The pattern repeats chars from 0 to 254, so it is not a multiple of 4096, which results in all blocks being filled with different data. Furthermore, this test is not block-aligned.
+
+
+| offset | size |
+| -------| -----|
+|l2Capacity * 3 - 1|l2Capacity + 2|
diff --git a/src/fuse/CMakeLists.txt b/src/fuse/CMakeLists.txt
new file mode 100644
index 0000000..05b3fcd
--- /dev/null
+++ b/src/fuse/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-fuse
+ LANGUAGES C)
+
+# find the FUSE library; configuration fails if it is missing (REQUIRED)
+find_package(Fuse REQUIRED)
+
+# find atomic library required by dnbd3-fuse
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# find curl for cow
+find_package(CURL REQUIRED)
+
+# add compile option to enable enhanced POSIX pthread features
+add_definitions(-D_GNU_SOURCE)
+
+# source files compiled into the executable; the header list is only handed
+# to the lint targets at the end of this file
+set(DNBD3_FUSE_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/cowfile.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/connection.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+set(DNBD3_FUSE_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/cowfile.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/connection.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.h)
+
+# build the dnbd3-fuse executable, link it against the project-internal
+# dnbd3-* targets plus FUSE, threads and curl, and install it into bin
+add_executable(dnbd3-fuse ${DNBD3_FUSE_SOURCE_FILES})
+target_include_directories(dnbd3-fuse PRIVATE ${FUSE_INCLUDE_DIRS} ${CURL_INCLUDE_DIR} )
+target_link_libraries(dnbd3-fuse dnbd3-build dnbd3-version dnbd3-shared ${FUSE_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${CURL_LIBRARIES} )
+install(TARGETS dnbd3-fuse RUNTIME DESTINATION bin
+ COMPONENT fuse)
+
+# lint targets over sources and headers (add_linter / add_linter_fix are not
+# defined in this file - presumably provided by the top-level project)
+add_linter(dnbd3-fuse-lint "${DNBD3_FUSE_SOURCE_FILES}" "${DNBD3_FUSE_HEADER_FILES}")
+add_linter_fix(dnbd3-fuse-lint-fix "${DNBD3_FUSE_SOURCE_FILES}" "${DNBD3_FUSE_HEADER_FILES}")
diff --git a/src/fuse/connection.c b/src/fuse/connection.c
index 98b1d36..dede680 100644
--- a/src/fuse/connection.c
+++ b/src/fuse/connection.c
@@ -1,19 +1,23 @@
#include "connection.h"
#include "helper.h"
-#include "../clientconfig.h"
-#include "../shared/protocol.h"
-#include "../shared/fdsignal.h"
-#include "../shared/sockhelper.h"
-#include "../shared/log.h"
+#include <dnbd3/config/client.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
+#include "main.h"
+#include "cowfile.h"
#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <stdio.h>
+#include <stdatomic.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <inttypes.h>
+#include <signal.h>
/* Constants */
static const size_t SHORTBUF = 100;
@@ -30,9 +34,18 @@ static const int FAIL_BACKOFF_START_COUNT = 8;
static bool connectionInitDone = false;
static bool threadInitDone = false;
static pthread_mutex_t mutexInit = PTHREAD_MUTEX_INITIALIZER;
-static bool keepRunning = true;
+// For multi-threaded concurrent connection during init
+static pthread_mutex_t mutexCondConn = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t condConn = PTHREAD_COND_INITIALIZER;
+static atomic_int pendingConnectionAttempts = 0;
+// Shutdown flag
+atomic_bool keepRunning = true;
+// Should we learn new alt-servers from servers we connect to?
static bool learnNewServers;
+static pthread_t tidReceiver;
+static pthread_t tidBackground;
+
// List of pending requests
static struct {
dnbd3_async_t *head;
@@ -55,15 +68,21 @@ static struct {
ticks startupTime;
} connection;
+struct conn_data { // arguments handed from connection_init() to one connectThread()
+ char *lowerImage; // strdup()ed copy of the image name to request from the server
+ uint16_t rid; // requested rid; 0 skips the rid mismatch check in connectThread()
+ int idx; // index into altservers[] of the server this thread connects to
+};
+
// Known alt servers
typedef struct _alt_server {
dnbd3_host_t host;
- int consecutiveFails;
- int rtt;
+ atomic_int consecutiveFails;
+ atomic_int rtt;
int rtts[RTT_COUNT];
int rttIndex;
- int bestCount;
- int liveRtt;
+ atomic_int bestCount;
+ atomic_int liveRtt;
} alt_server_t;
static dnbd3_server_entry_t newservers[MAX_ALTS];
@@ -83,136 +102,233 @@ static pthread_rwlock_t altLock = PTHREAD_RWLOCK_INITIALIZER;
/* Static methods */
-static void* connection_receiveThreadMain(void *sock);
-static void* connection_backgroundThread(void *something);
+static void* connectThread(void * data);
+static void* connection_receiveThreadMain( void *sock );
+static void* connection_backgroundThread( void *something );
-static void addAltServers();
+static bool hasAltServer( dnbd3_host_t *host );
+static void addAltServers( void );
static void sortAltServers();
static void probeAltServers();
-static void switchConnection(int sockFd, alt_server_t *srv);
-static void requestAltServers();
-static bool throwDataAway(int sockFd, uint32_t amount);
+static size_t receiveRequest(const int sock, dnbd3_async_t* request );
+static void switchConnection( int sockFd, alt_server_t *srv );
+static void requestAltServers( void );
+static bool sendAltServerRequest( int sock );
+static bool throwDataAway( int sockFd, uint32_t amount );
+
+static void enqueueRequest( dnbd3_async_t *request );
+static dnbd3_async_t* removeRequest( dnbd3_async_t *request );
-static void enqueueRequest(dnbd3_async_t *request);
-static dnbd3_async_t* removeRequest(dnbd3_async_t *request);
+static void blockSignals();
-bool connection_init(const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew)
+/**
+ * Resolve the given host list and establish the initial server connection.
+ *
+ * hosts is a list of addresses separated by spaces, tabs or newlines. Every
+ * resolved address that is not yet known is stored in altservers[]. As long as
+ * no connection exists (connection.sockFd == -1), a detached connectThread is
+ * started for each new address, and this function waits up to 200ms for it
+ * before moving on to the next address. After the list is processed, it waits
+ * up to five more seconds if attempts are still pending.
+ *
+ * @param hosts whitespace separated list of server addresses
+ * @param lowerImage name of the image to request
+ * @param rid rid to request
+ * @param doLearnNew stored in learnNewServers
+ * @return true if a connect thread established a connection, false otherwise
+ *         (including the case that init was already done before)
+ */
+bool connection_init( const char *hosts, const char *lowerImage, const uint16_t rid, const bool doLearnNew )
{
- int sock = -1;
char host[SHORTBUF];
- size_t hlen;
- serialized_buffer_t buffer;
- uint16_t remoteVersion, remoteRid;
- char *remoteName;
- uint64_t remoteSize;
- struct sockaddr_storage sa;
- socklen_t salen;
- poll_list_t *cons = sock_newPollList();
+ dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+ const char *current, *end;
+ int altIndex = 0;
timing_setBase();
pthread_mutex_lock( &mutexInit );
- if ( !connectionInitDone && keepRunning ) {
- dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
- const char *current, *end;
- int altIndex = 0;
- learnNewServers = doLearnNew;
- memset( altservers, 0, sizeof altservers );
- connection.sockFd = -1;
- current = hosts;
- do {
- // Get next host from string
- while ( *current == ' ' ) current++;
- end = strchr( current, ' ' );
- size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
- if ( len > SHORTBUF ) len = SHORTBUF;
- snprintf( host, len, "%s", current );
- int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
- for ( int i = 0; i < newHosts; ++i ) {
- if ( altIndex >= MAX_ALTS )
- break;
- altservers[altIndex].host = tempHosts[i];
- altIndex += 1;
- }
- current = end + 1;
- } while ( end != NULL && altIndex < MAX_ALTS );
- logadd( LOG_INFO, "Got %d servers from init call", altIndex );
- // Connect
- for ( int i = 0; i < altIndex + 5; ++i ) {
- if ( i >= altIndex ) {
- // Additional iteration - no corresponding slot in altservers, this
- // is just so we can make a final calls with longer timeout
- sock = sock_multiConnect( cons, NULL, 400, 3000 );
- if ( sock == -2 ) {
- logadd( LOG_ERROR, "Could not connect to any host" );
- sock = -1;
- break;
- }
- } else {
- if ( altservers[i].host.type == 0 )
- continue;
- // Try to connect - 100ms timeout
- sock = sock_multiConnect( cons, &altservers[i].host, 100, 3000 );
- }
- if ( sock == -2 || sock == -1 )
- continue;
- salen = sizeof(sa);
- if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
- logadd( LOG_ERROR, "getpeername on successful connection failed!? (errno=%d)", errno );
- close( sock );
- sock = -1;
+ if ( connectionInitDone ) {
+ pthread_mutex_unlock( &mutexInit );
+ return false;
+ }
+ learnNewServers = doLearnNew;
+ memset( altservers, 0, sizeof altservers );
+ connection.sockFd = -1;
+ current = hosts;
+ pthread_attr_t threadAttrs;
+ pthread_attr_init( &threadAttrs );
+ pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
+ // Resolve all hosts and connect
+ pthread_mutex_lock( &mutexCondConn );
+ do {
+ // Get next host from string
+ while ( *current == ' ' || *current == '\t' || *current == '\n' ) {
+ current++;
+ }
+ end = current;
+ while ( *end != ' ' && *end != '\t' && *end != '\n' && *end != '\0' ) {
+ end++;
+ }
+ if ( end == current )
+ break;
+ size_t len = (size_t)( end - current ) + 1; // +1: snprintf counts the terminating NUL in its size argument
+ if ( len > SHORTBUF ) {
+ len = SHORTBUF;
+ }
+ snprintf( host, len, "%s", current );
+ int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+ for ( int i = 0; i < newHosts; ++i ) {
+ if ( altIndex >= MAX_ALTS )
+ break;
+ if ( hasAltServer( &tempHosts[i] ) )
continue;
- }
- hlen = sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
- logadd( LOG_INFO, "Connected to %.*s", (int)hlen, host );
- if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
- logadd( LOG_ERROR, "Could not send select image" );
- } else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
- logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
- } else if ( rid != 0 && rid != remoteRid ) {
- logadd( LOG_ERROR, "rid mismatch (want: %d, got: %d)", (int)rid, (int)remoteRid );
- } else {
- logadd( LOG_INFO, "Requested: '%s:%d'", lowerImage, (int)rid );
- logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
- sock_setTimeout( sock, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
- image.name = strdup( remoteName );
- image.rid = remoteRid;
- image.size = remoteSize;
- if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &connection.currentServer ) ) {
- logadd( LOG_ERROR, "sockaddr to dnbd3_host_t failed!?" );
- connection.currentServer.type = 0;
+ altservers[altIndex].host = tempHosts[i];
+ // Start thread for async connect if not connected yet
+ atomic_thread_fence( memory_order_acquire );
+ if ( connection.sockFd == -1 ) {
+ pthread_t t;
+ struct conn_data *cd = malloc( sizeof(*cd) );
+ // We cannot be sure a thread is taking longer than this function runs, so better copy
+ cd->lowerImage = strdup( lowerImage );
+ cd->rid = rid;
+ cd->idx = altIndex;
+ pendingConnectionAttempts++;
+ if ( ( errno = pthread_create( &t, &threadAttrs, &connectThread, (void*)cd ) ) != 0 ) {
+ pendingConnectionAttempts--;
+ logadd( LOG_ERROR, "Could not create connect thread %d, errno=%d", cd->idx, errno );
+ free( cd->lowerImage );
+ free( cd );
+ continue;
}
- connection.panicSignal = signal_new();
- timing_get( &connection.startupTime );
- connection.sockFd = sock;
- requests.head = NULL;
- requests.tail = NULL;
- requestAltServers();
- break;
- }
- // Failed
- if ( sock != -1 ) {
- close( sock );
- sock = -1;
+ struct timespec timeout;
+ clock_gettime( CLOCK_REALTIME, &timeout );
+ timeout.tv_nsec += 200 * 1000 * 1000;
+ if ( timeout.tv_nsec >= 1000 * 1000 * 1000 ) {
+ timeout.tv_nsec -= 1000 * 1000 * 1000;
+ timeout.tv_sec += 1;
+ }
+ pthread_cond_timedwait( &condConn, &mutexCondConn, &timeout ); // wait up to 200ms for this attempt before trying the next host
}
+ // End async connect
+ altIndex += 1;
}
- if ( sock != -1 ) {
- connectionInitDone = true;
- }
+ current = end + 1;
+ } while ( *end != '\0' && altIndex < MAX_ALTS );
+ logadd( LOG_INFO, "Got %d servers from init call", altIndex );
+ // Wait a maximum of five seconds if we're not connected yet
+ if ( connection.sockFd == -1 && pendingConnectionAttempts > 0 ) {
+ struct timespec end;
+ clock_gettime( CLOCK_REALTIME, &end );
+ end.tv_sec += 5;
+ pthread_cond_timedwait( &condConn, &mutexCondConn, &end );
+ }
+ pthread_mutex_unlock( &mutexCondConn );
+ pthread_attr_destroy( &threadAttrs );
+ if ( connection.sockFd != -1 ) {
+ connectionInitDone = true;
}
pthread_mutex_unlock( &mutexInit );
- sock_destroyPollList( cons );
- return sock != -1;
+ return connectionInitDone;
+}
+
+/**
+ * Thread entry point for one asynchronous connection attempt.
+ *
+ * Connects to altservers[cd->idx], selects the requested image and - if no
+ * other connect thread was faster - publishes the socket as the active
+ * connection (connection.sockFd) and wakes up connection_init() via condConn.
+ * The thread takes ownership of the passed conn_data and frees it on every
+ * path.
+ *
+ * @param data pointer to a heap-allocated struct conn_data
+ * @return always NULL
+ */
+static void* connectThread(void * data)
+{
+	struct conn_data *cd = (struct conn_data*)data;
+	int idx = cd->idx;
+	int sock = -1;
+	serialized_buffer_t buffer;
+	uint16_t remoteVersion, remoteRid;
+	char *remoteName;
+	uint64_t remoteSize;
+	char host[SHORTBUF];
+	struct sockaddr_storage sa;
+	socklen_t salen = sizeof(sa);
+
+	if ( idx < 0 || idx >= MAX_ALTS || altservers[idx].host.type == 0 ) {
+		logadd( LOG_ERROR, "BUG: Index out of range, or empty server in connect thread (%d)", idx );
+		goto bailout;
+	}
+
+	sock_printHost( &altservers[idx].host, host, sizeof(host) );
+	logadd( LOG_INFO, "Trying to connect to %s", host );
+	sock = sock_connect( &altservers[idx].host, 1500, SOCKET_TIMEOUT_RECV * 1000 );
+	if ( sock == -1 ) {
+		logadd( LOG_INFO, "[%s] Connection failed", host );
+		goto bailout;
+	}
+
+	salen = sizeof( sa );
+	if ( getpeername( sock, (struct sockaddr*)&sa, &salen ) == -1 ) {
+		logadd( LOG_ERROR, "[%s] getpeername on successful connection failed!? (errno=%d)", host, errno );
+		goto bailout;
+	}
+	// Cheap unlocked check: give up early if another thread already won
+	atomic_thread_fence( memory_order_acquire );
+	if ( connection.sockFd != -1 )
+		goto bailout;
+
+	sock_printable( (struct sockaddr*)&sa, salen, host, sizeof(host) );
+	logadd( LOG_INFO, "[%s] Connected", host );
+	if ( !dnbd3_select_image( sock, cd->lowerImage, cd->rid, 0 ) ) {
+		logadd( LOG_ERROR, "[%s] Could not send select image", host );
+		goto bailout;
+	}
+
+	if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+		logadd( LOG_ERROR, "[%s] Could not read select image reply (%d)", host, errno );
+		goto bailout;
+	}
+	atomic_thread_fence( memory_order_acquire );
+	if ( connection.sockFd != -1 )
+		goto bailout;
+
+	if ( cd->rid != 0 && cd->rid != remoteRid ) {
+		logadd( LOG_ERROR, "[%s] rid mismatch (want: %d, got: %d)",
+				host, (int)cd->rid, (int)remoteRid );
+		goto bailout;
+	}
+	// Seems we got a winner
+	pthread_mutex_lock( &mutexCondConn );
+	if ( connection.sockFd != -1 || connectionInitDone ) {
+		pthread_mutex_unlock( &mutexCondConn );
+		logadd( LOG_INFO, "[%s] Raced by other connection", host );
+		goto bailout;
+	}
+	// Request the alt server list before any global state is touched, and drop
+	// the mutex before bailing out: the bailout path locks mutexCondConn itself,
+	// and leaving it locked would block connection_init() forever.
+	// NOTE(review): assumes sendAltServerRequest() only uses the passed socket - confirm
+	if ( learnNewServers && !sendAltServerRequest( sock ) ) {
+		pthread_mutex_unlock( &mutexCondConn );
+		goto bailout;
+	}
+	logadd( LOG_INFO, "Requested: '%s:%d'", cd->lowerImage, (int)cd->rid );
+	logadd( LOG_INFO, "Returned: '%s:%d'", remoteName, (int)remoteRid );
+	image.name = strdup( remoteName );
+	image.rid = remoteRid;
+	image.size = remoteSize;
+	connection.currentServer = altservers[idx].host;
+	connection.panicSignal = signal_new();
+	timing_get( &connection.startupTime );
+	requests.head = NULL;
+	requests.tail = NULL;
+	// Everything good, tell main connect function
+	connection.sockFd = sock;
+	atomic_thread_fence( memory_order_release );
+	pendingConnectionAttempts--;
+	if ( idx != 0 ) {
+		// Make server first in list - enough to swap host, other data has not changed yet
+		lock_write( &altLock );
+		dnbd3_host_t tmp = altservers[idx].host;
+		altservers[idx].host = altservers[0].host;
+		altservers[0].host = tmp;
+		unlock_rw( &altLock );
+	}
+	pthread_cond_signal( &condConn );
+	pthread_mutex_unlock( &mutexCondConn );
+	// The success path does not run through bailout, so release the arguments here
+	free( cd->lowerImage );
+	free( cd );
+	return NULL;
+
+bailout:
+	if ( sock != -1 ) {
+		close( sock );
+	}
+	free( cd->lowerImage );
+	free( cd );
+	// Last one has to wake up main thread, which is waiting for up to 5 seconds for
+	// any connect thread to succeed. If none succeeded, there is no point in waiting
+	// any longer.
+	if ( --pendingConnectionAttempts == 0 ) {
+		pthread_mutex_lock( &mutexCondConn );
+		pthread_cond_signal( &condConn );
+		pthread_mutex_unlock( &mutexCondConn );
+	}
+	return NULL;
}
bool connection_initThreads()
{
pthread_mutex_lock( &mutexInit );
- if ( !keepRunning || !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
+ if ( !connectionInitDone || threadInitDone || connection.sockFd == -1 ) {
pthread_mutex_unlock( &mutexInit );
return false;
}
bool success = true;
- pthread_t thread;
threadInitDone = true;
logadd( LOG_DEBUG1, "Initializing stuff" );
if ( pthread_mutex_init( &connection.sendMutex, NULL ) != 0
@@ -220,10 +336,10 @@ bool connection_initThreads()
logadd( LOG_ERROR, "Mutex or spinlock init failure" );
success = false;
} else {
- if ( pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)connection.sockFd ) != 0 ) {
+ if ( pthread_create( &tidReceiver, NULL, &connection_receiveThreadMain, ( void* )(size_t)connection.sockFd ) != 0 ) {
logadd( LOG_ERROR, "Could not create receive thread" );
success = false;
- } else if ( pthread_create( &thread, NULL, &connection_backgroundThread, NULL ) != 0 ) {
+ } else if ( pthread_create( &tidBackground, NULL, &connection_backgroundThread, NULL ) != 0 ) {
logadd( LOG_ERROR, "Could not create background thread" );
success = false;
}
@@ -236,12 +352,23 @@ bool connection_initThreads()
return success;
}
+/**
+ * Return the pointer currently stored in image.name.
+ * The pointer is handed out as-is; no copy is made.
+ */
+char * connection_getImageName()
+{
+	char * const name = image.name;
+	return name;
+}
+
+/**
+ * Return the revision id currently stored in image.rid.
+ */
+uint16_t connection_getImageRID()
+{
+	const uint16_t rid = image.rid;
+	return rid;
+}
+
+
uint64_t connection_getImageSize()
{
return image.size;
}
-bool connection_read(dnbd3_async_t *request)
+bool connection_read( dnbd3_async_t *request )
{
if ( !connectionInitDone ) return false;
pthread_mutex_lock( &connection.sendMutex );
@@ -250,9 +377,7 @@ bool connection_read(dnbd3_async_t *request)
if ( !dnbd3_get_block( connection.sockFd, request->offset, request->length, (uint64_t)request, 0 ) ) {
shutdown( connection.sockFd, SHUT_RDWR );
connection.sockFd = -1;
- pthread_mutex_unlock( &connection.sendMutex );
signal_call( connection.panicSignal );
- return true;
}
}
pthread_mutex_unlock( &connection.sendMutex );
@@ -261,24 +386,36 @@ bool connection_read(dnbd3_async_t *request)
void connection_close()
{
- if ( keepRunning ) {
- logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
- }
+ static bool signalled = false;
+ logadd( LOG_INFO, "Tearing down dnbd3 connections and workers" );
pthread_mutex_lock( &mutexInit );
keepRunning = false;
+ if ( threadInitDone && !signalled ) {
+ signalled = true;
+ pthread_kill( tidReceiver, SIGHUP );
+ pthread_kill( tidBackground, SIGHUP );
+ }
+ pthread_mutex_unlock( &mutexInit );
if ( !connectionInitDone ) {
- pthread_mutex_unlock( &mutexInit );
return;
}
- pthread_mutex_unlock( &mutexInit );
pthread_mutex_lock( &connection.sendMutex );
if ( connection.sockFd != -1 ) {
+ logadd( LOG_DEBUG1, "Shutting down socket..." );
shutdown( connection.sockFd, SHUT_RDWR );
}
pthread_mutex_unlock( &connection.sendMutex );
}
-size_t connection_printStats(char *buffer, const size_t len)
+/**
+ * Wait for the receiver thread and the background thread to terminate.
+ * Does nothing if threadInitDone is not set.
+ */
+void connection_join()
+{
+	if ( threadInitDone ) {
+		pthread_join( tidReceiver, NULL );
+		pthread_join( tidBackground, NULL );
+	}
+}
+
+size_t connection_printStats( char *buffer, const size_t len )
{
int ret;
size_t remaining = len;
@@ -308,7 +445,7 @@ size_t connection_printStats(char *buffer, const size_t len)
*buffer++ = ' ';
}
const size_t addrlen = sock_printHost( &altservers[i].host, buffer, remaining );
- remaining -= (addrlen + 1); // For space or * above
+ remaining -= ( addrlen + 1 ); // For space or * above
buffer += addrlen;
if ( remaining < 3 )
break;
@@ -324,7 +461,7 @@ size_t connection_printStats(char *buffer, const size_t len)
width += 3;
}
ret = snprintf( buffer, remaining, "% *d %s Unreachable:% 5d BestCount:% 5d Live:% 5dµs\n",
- width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
+ width, value, unit, altservers[i].consecutiveFails, altservers[i].bestCount, altservers[i].liveRtt );
if ( ret < 0 ) {
ret = 0;
}
@@ -339,23 +476,23 @@ size_t connection_printStats(char *buffer, const size_t len)
return len - remaining;
}
-static void* connection_receiveThreadMain(void *sockPtr)
+static void* connection_receiveThreadMain( void *sockPtr )
{
int sockFd = (int)(size_t)sockPtr;
dnbd3_reply_t reply;
- pthread_detach( pthread_self() );
+ blockSignals();
while ( keepRunning ) {
int ret;
do {
ret = dnbd3_read_reply( sockFd, &reply, true );
+ if ( !keepRunning ) goto fail;
if ( ret == REPLY_OK ) break;
} while ( ret == REPLY_INTR || ret == REPLY_AGAIN );
if ( ret != REPLY_OK ) {
logadd( LOG_DEBUG1, "Error receiving reply on receiveThread (%d)", ret );
goto fail;
}
-
if ( reply.cmd == CMD_GET_BLOCK ) {
// Get block reply. find matching request
dnbd3_async_t *request = removeRequest( (dnbd3_async_t*)reply.handle );
@@ -369,7 +506,7 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
} else {
// Found a match
- const ssize_t ret = sock_recv( sockFd, request->buffer, request->length );
+ const ssize_t ret = receiveRequest( sockFd, request );
if ( ret != (ssize_t)request->length ) {
logadd( LOG_DEBUG1, "receiving payload for a block reply failed" );
connection_read( request );
@@ -390,10 +527,14 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
unlock_rw( &altLock );
}
- // Success, wake up caller
- request->success = true;
- request->finished = true;
- signal_call( request->signal );
+ // TODO: See comment in receiveRequest()
+ if( useCow ) {
+ cowfile_handleCallback( request );
+ }
+ else {
+ fuse_reply_buf( request->fuse_req, container_of( request, dnbd3_async_parent_t, request )->buffer, request->length );
+ free( request );
+ }
}
} else if ( reply.cmd == CMD_GET_SERVERS ) {
// List of known alt servers
@@ -416,7 +557,6 @@ static void* connection_receiveThreadMain(void *sockPtr)
}
}
}
- logadd( LOG_DEBUG1, "Aus der Schleife rausgeflogen! ARRRRRRRRRR" );
fail:;
// Make sure noone is trying to use the socket for sending by locking,
pthread_mutex_lock( &connection.sendMutex );
@@ -424,7 +564,9 @@ fail:;
// as someone could have established a new connection already
if ( connection.sockFd == sockFd ) {
connection.sockFd = -1;
- signal_call( connection.panicSignal );
+ if ( keepRunning ) {
+ signal_call( connection.panicSignal );
+ }
}
pthread_mutex_unlock( &connection.sendMutex );
// As we're the only reader, it's safe to close the socket now
@@ -432,20 +574,23 @@ fail:;
return NULL;
}
-static void* connection_backgroundThread(void *something UNUSED)
+static void* connection_backgroundThread( void *something UNUSED )
{
ticks nextKeepalive;
ticks nextRttCheck;
+ blockSignals();
timing_get( &nextKeepalive );
nextRttCheck = nextKeepalive;
while ( keepRunning ) {
ticks now;
timing_get( &now );
- uint32_t wt1 = timing_diffMs( &now, &nextKeepalive );
- uint32_t wt2 = timing_diffMs( &now, &nextRttCheck );
+ uint32_t wt1 = (uint32_t)timing_diffMs( &now, &nextKeepalive );
+ uint32_t wt2 = (uint32_t)timing_diffMs( &now, &nextRttCheck );
if ( wt1 > 0 && wt2 > 0 ) {
int waitRes = signal_wait( connection.panicSignal, (int)MIN( wt1, wt2 ) + 1 );
+ if ( !keepRunning )
+ break;
if ( waitRes == SIGNAL_ERROR ) {
logadd( LOG_WARNING, "Error waiting on signal in background thread! Errno = %d", errno );
}
@@ -460,20 +605,20 @@ static void* connection_backgroundThread(void *something UNUSED)
}
sortAltServers();
probeAltServers();
- if ( panic || timing_diff( &connection.startupTime, &now ) <= STARTUP_MODE_DURATION ) {
+ if ( panic || timing_diff( &connection.startupTime, &now ) <= DISCOVER_STARTUP_PHASE_COUNT * TIMER_INTERVAL_PROBE_STARTUP ) {
timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_STARTUP );
} else {
- timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_NORMAL );
+ timing_addSeconds( &nextRttCheck, &now, TIMER_INTERVAL_PROBE_MAX );
}
}
// Send keepalive packet
if ( timing_reachedPrecise( &nextKeepalive, &now ) ) {
pthread_mutex_lock( &connection.sendMutex );
if ( connection.sockFd != -1 ) {
- dnbd3_request_t request;
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_KEEPALIVE;
- request.handle = request.offset = request.size = 0;
+ dnbd3_request_t request = {
+ .magic = dnbd3_packet_magic,
+ .cmd = CMD_KEEPALIVE,
+ };
fixup_request( request );
ssize_t ret = sock_sendAll( connection.sockFd, &request, sizeof request, 2 );
if ( (size_t)ret != sizeof request ) {
@@ -483,7 +628,7 @@ static void* connection_backgroundThread(void *something UNUSED)
}
}
pthread_mutex_unlock( &connection.sendMutex );
- timing_addSeconds( &nextKeepalive, &now, TIMER_INTERVAL_KEEPALIVE_PACKET );
+ timing_addSeconds( &nextKeepalive, &now, KEEPALIVE_INTERVAL );
}
}
return NULL;
@@ -491,7 +636,20 @@ static void* connection_backgroundThread(void *something UNUSED)
// Private quick helpers
-static void addAltServers()
+/**
+ * Tell whether 'host' is already present in the list of alt servers.
+ * All MAX_ALTS slots are compared via isSameAddress() until a match is found.
+ * This function does not take 'altLock'; the caller has to hold it.
+ */
+static bool hasAltServer( dnbd3_host_t *host )
+{
+	bool known = false;
+	for ( int slot = 0; slot < MAX_ALTS && !known; ++slot ) {
+		known = isSameAddress( host, &altservers[slot].host );
+	}
+	return known;
+}
+
+static void addAltServers( void )
{
pthread_mutex_lock( &newAltLock );
lock_write( &altLock );
@@ -499,11 +657,8 @@ static void addAltServers()
if ( newservers[nIdx].host.type == 0 )
continue;
// Got a new alt server, see if it's already known
- for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
- if ( isSameAddress( &newservers[nIdx].host, &altservers[eIdx].host ) ) {
- goto skip_server;
- }
- }
+ if ( hasAltServer( &newservers[nIdx].host ) )
+ continue;
// Not known yet, add - find free slot
int slot = -1;
for ( int eIdx = 0; eIdx < MAX_ALTS; ++eIdx ) {
@@ -528,9 +683,8 @@ static void addAltServers()
altservers[slot].host = newservers[nIdx].host;
altservers[slot].liveRtt = 0;
}
-skip_server:;
}
- memset( newservers, 0, sizeof(newservers) );
+ memset( newservers, 0, sizeof( newservers ) );
unlock_rw( &altLock );
pthread_mutex_unlock( &newAltLock );
}
@@ -604,7 +758,7 @@ static void probeAltServers()
pthread_spin_lock( &requests.lock );
if ( requests.head != NULL ) {
if ( !panic && current != NULL ) {
- const int maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
+ const uint64_t maxDelay = MAX( current->rtt * 5, 1000000 ); // Give at least one second
dnbd3_async_t *iterator;
for ( iterator = requests.head; iterator != NULL; iterator = iterator->next ) {
// A request with measurement tag is pending
@@ -626,7 +780,7 @@ static void probeAltServers()
}
lock_read( &altLock );
- for ( int altIndex = 0; altIndex < (panic ? MAX_ALTS : MAX_ALTS_ACTIVE); ++altIndex ) {
+ for ( int altIndex = 0; altIndex < ( panic ? MAX_ALTS : MAX_ALTS_ACTIVE ); ++altIndex ) {
alt_server_t * const srv = &altservers[altIndex];
if ( srv->host.type == 0 )
continue;
@@ -634,65 +788,70 @@ static void probeAltServers()
&& rand() % srv->consecutiveFails >= FAIL_BACKOFF_START_COUNT ) {
continue;
}
+ srv->rttIndex += 1;
if ( srv->rttIndex >= RTT_COUNT ) {
srv->rttIndex = 0;
- } else {
- srv->rttIndex += 1;
}
// Probe
+ char hstr[100];
+ sock_printHost( &srv->host, hstr, 100 );
ticks start;
timing_get( &start );
errno = 0;
int sock = sock_connect( &srv->host, panic ? 1000 : 333, 1000 );
if ( sock == -1 ) {
- logadd( LOG_DEBUG1, "Could not connect for probing. errno = %d", errno );
+ logadd( LOG_DEBUG1, "%s probe: Could not connect for probing. errno = %d", hstr, errno );
goto fail;
}
if ( !dnbd3_select_image( sock, image.name, image.rid, 0 ) ) {
- logadd( LOG_DEBUG1, "probe: select_image failed" );
+ logadd( LOG_DEBUG1, "%s probe: select_image failed (sock=%d, errno=%d)", hstr, sock, errno );
goto fail;
}
- if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize )) {
- logadd( LOG_DEBUG1, "probe: select image reply failed" );
+ if ( !dnbd3_select_image_reply( &buffer, sock, &remoteProto, &remoteName, &remoteRid, &remoteSize ) ) {
+ logadd( LOG_DEBUG1, "%s probe: select image reply failed", hstr );
goto fail;
}
if ( remoteProto < MIN_SUPPORTED_SERVER ) {
- logadd( LOG_WARNING, "Unsupported remote version (local: %d, remote: %d)", (int)PROTOCOL_VERSION, (int)remoteProto );
+ logadd( LOG_WARNING, "%s probe: Unsupported remote version (local: %d, remote: %d)", hstr, (int)PROTOCOL_VERSION, (int)remoteProto );
srv->consecutiveFails += 10;
goto fail;
}
if ( remoteRid != image.rid || strcmp( remoteName, image.name ) != 0 ) {
- logadd( LOG_WARNING, "Remote rid or name mismatch (got '%s')", remoteName );
+ logadd( LOG_WARNING, "%s probe: Remote rid or name mismatch (got '%s')", hstr, remoteName );
srv->consecutiveFails += 10;
goto fail;
}
if ( !dnbd3_get_block( sock, testOffset, testLength, 0, 0 ) ) {
- logadd( LOG_DEBUG1, "-> block request fail" );
+ logadd( LOG_DEBUG1, "%s probe: -> block request fail", hstr );
goto fail;
}
int a = 111;
- if ( !(a = dnbd3_get_reply( sock, &reply )) || reply.size != testLength ) {
- logadd( LOG_DEBUG1, "<- get block reply fail %d %d", a, (int)reply.size );
+ if ( !( a = dnbd3_get_reply( sock, &reply ) ) || reply.size != testLength ) {
+ logadd( LOG_DEBUG1, "%s probe: <- get block reply fail %d %d", hstr, a, (int)reply.size );
goto fail;
}
if ( request != NULL && removeRequest( request ) != NULL ) {
// Request successfully removed from queue
- const ssize_t ret = sock_recv( sock, request->buffer, request->length );
+ ssize_t const ret = receiveRequest( sock, request);
if ( ret != (ssize_t)request->length ) {
- logadd( LOG_DEBUG1, "[RTT] receiving payload for a block reply failed" );
+ logadd( LOG_DEBUG1, "%s probe: receiving payload for a block reply failed", hstr );
// Failure, add to queue again
connection_read( request );
goto fail;
}
- // Success, wake up caller
- logadd( LOG_DEBUG1, "[RTT] Successful direct probe" );
- request->success = true;
- request->finished = true;
- signal_call( request->signal );
+ // Success, reply to fuse
+ if( useCow ) {
+ cowfile_handleCallback( request );
+ }
+ else {
+ fuse_reply_buf( request->fuse_req, container_of( request, dnbd3_async_parent_t, request )->buffer, request->length );
+ free( request );
+ }
+ logadd( LOG_DEBUG1, "%s probe: Successful direct probe", hstr );
} else {
// Wasn't a request that's in our request queue
if ( !throwDataAway( sock, testLength ) ) {
- logadd( LOG_DEBUG1, "<- get block reply payload fail" );
+ logadd( LOG_DEBUG1, "%s probe: <- get block reply payload fail", hstr );
goto fail;
}
}
@@ -701,7 +860,7 @@ static void probeAltServers()
// Panic mode? Just switch to server
if ( panic ) {
unlock_rw( &altLock );
- switchConnection( sock, srv );
+ if ( keepRunning ) switchConnection( sock, srv );
return;
}
// Non-panic mode:
@@ -733,7 +892,8 @@ static void probeAltServers()
close( sock );
}
continue;
-fail:;
+fail:
+ ;
if ( sock != -1 ) {
close( sock );
}
@@ -774,7 +934,7 @@ fail:;
// Regular logic: Apply threshold when considering switch
if ( !doSwitch && current != NULL ) {
doSwitch = current->rtt > best->rtt + RTT_ABSOLUTE_THRESHOLD
- || RTT_THRESHOLD_FACTOR(current->rtt) > best->rtt + 1000;
+ || RTT_THRESHOLD_FACTOR( current->rtt ) > best->rtt + 1000;
}
}
// Switch if a better server was found
@@ -796,11 +956,20 @@ fail:;
}
}
-static void switchConnection(int sockFd, alt_server_t *srv)
+/**
+ * Read the payload of a block reply from 'sock' into the buffer that
+ * belongs to 'request'. Where that buffer lives depends on the mode:
+ * with useCow set, 'request' is the dRequest member of a cow_sub_request_t,
+ * otherwise it is the request member of a dnbd3_async_parent_t.
+ * Reads request->length bytes and returns the result of sock_recv(),
+ * converted to size_t (callers in this file store it in an ssize_t again).
+ */
+static size_t receiveRequest(const int sock, dnbd3_async_t* request )
+{
+	void *target;
+	if ( useCow ) {
+		target = container_of( request, cow_sub_request_t, dRequest )->buffer;
+	} else {
+		target = container_of( request, dnbd3_async_parent_t, request )->buffer;
+	}
+	return sock_recv( sock, target, request->length );
+}
+
+static void switchConnection( int sockFd, alt_server_t *srv )
{
- pthread_t thread;
struct sockaddr_storage addr;
- socklen_t addrLen = sizeof(addr);
+ socklen_t addrLen = sizeof( addr );
char message[200] = "Connection switched to ";
const size_t len = strlen( message );
int ret;
@@ -829,9 +998,10 @@ static void switchConnection(int sockFd, alt_server_t *srv)
signal_call( connection.panicSignal );
return;
}
+ pthread_detach( tidReceiver );
timing_get( &connection.startupTime );
- pthread_create( &thread, NULL, &connection_receiveThreadMain, (void*)(size_t)sockFd );
- sock_printable( (struct sockaddr*)&addr, sizeof(addr), message + len, sizeof(message) - len );
+ pthread_create( &tidReceiver, NULL, &connection_receiveThreadMain, ( void* )(size_t)sockFd );
+ sock_printable( (struct sockaddr*)&addr, sizeof( addr ), message + len, sizeof( message ) - len );
logadd( LOG_INFO, "%s", message );
// resend queue
if ( queue != NULL ) {
@@ -855,22 +1025,28 @@ static void switchConnection(int sockFd, alt_server_t *srv)
/**
* Does not lock, so get the sendMutex first!
*/
-static void requestAltServers()
+static void requestAltServers( void )
{
if ( connection.sockFd == -1 || !learnNewServers )
return;
- dnbd3_request_t request = { 0 };
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_GET_SERVERS;
- fixup_request( request );
- if ( sock_sendAll( connection.sockFd, &request, sizeof(request), 2 ) != (ssize_t)sizeof(request) ) {
- logadd( LOG_WARNING, "Connection failed while requesting alt server list" );
+ if ( !sendAltServerRequest( connection.sockFd ) ) {
+ logadd( LOG_WARNING, "Main connection failed while requesting alt server list" );
shutdown( connection.sockFd, SHUT_RDWR );
connection.sockFd = -1;
}
}
-static bool throwDataAway(int sockFd, uint32_t amount)
+/**
+ * Send a CMD_GET_SERVERS request on 'sock'.
+ * Returns true iff sock_sendAll() reports that the complete request
+ * structure was sent.
+ */
+static bool sendAltServerRequest( int sock )
+{
+	dnbd3_request_t req = {
+		.magic = dnbd3_packet_magic,
+		.cmd = CMD_GET_SERVERS,
+	};
+	fixup_request( req );
+	const ssize_t sent = sock_sendAll( sock, &req, sizeof( req ), 2 );
+	return sent == (ssize_t)sizeof( req );
+}
+
+static bool throwDataAway( int sockFd, uint32_t amount )
{
size_t done = 0;
char tempBuffer[SHORTBUF];
@@ -883,11 +1059,9 @@ static bool throwDataAway(int sockFd, uint32_t amount)
return true;
}
-static void enqueueRequest(dnbd3_async_t *request)
+static void enqueueRequest( dnbd3_async_t *request )
{
request->next = NULL;
- request->finished = false;
- request->success = false;
//logadd( LOG_DEBUG2, "Queue: %p @ %s : %d", request, file, line );
// Measure latency and add to switch formula
timing_get( &request->time );
@@ -901,7 +1075,7 @@ static void enqueueRequest(dnbd3_async_t *request)
pthread_spin_unlock( &requests.lock );
}
-static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
+static dnbd3_async_t* removeRequest( dnbd3_async_t *request )
{
pthread_spin_lock( &requests.lock );
//logadd( LOG_DEBUG2, "Remov: %p @ %s : %d", request, file, line );
@@ -925,3 +1099,20 @@ static dnbd3_async_t* removeRequest(dnbd3_async_t *request)
return iterator;
}
+/**
+ * Adjust the signal mask of the calling thread: SIGUSR1, SIGUSR2, SIGPIPE,
+ * SIGINT and SIGTERM get blocked, while SIGHUP is explicitly left unblocked
+ * (connection_close() sends SIGHUP to the worker threads via pthread_kill()).
+ * Failures are only logged as warnings; the function never aborts.
+ */
+static void blockSignals( void )
+{
+	sigset_t sigmask;
+	// pthread_sigmask() returns 0 on success and a positive error number on
+	// failure - it never returns -1 and does not set errno, so compare with 0.
+	// The 'how' argument is not significant while 'set' is NULL.
+	if ( pthread_sigmask( 0, NULL, &sigmask ) != 0 ) {
+		logadd( LOG_WARNING, "Cannot get current sigmask of thread" );
+		// Content of sigmask is unspecified after a failure, start from scratch
+		sigemptyset( &sigmask );
+	}
+	sigaddset( &sigmask, SIGUSR1 );
+	sigaddset( &sigmask, SIGUSR2 );
+	sigaddset( &sigmask, SIGPIPE );
+	sigaddset( &sigmask, SIGINT );
+	sigaddset( &sigmask, SIGTERM );
+	sigdelset( &sigmask, SIGHUP );
+	if ( pthread_sigmask( SIG_SETMASK, &sigmask, NULL ) != 0 ) {
+		logadd( LOG_WARNING, "Cannot set sigmask of thread" );
+	}
+}
diff --git a/src/fuse/connection.h b/src/fuse/connection.h
index cae554c..b22e3ce 100644
--- a/src/fuse/connection.h
+++ b/src/fuse/connection.h
@@ -1,35 +1,50 @@
#ifndef _CONNECTION_H_
#define _CONNECTION_H_
-#include "../shared/fdsignal.h"
-#include "../shared/timing.h"
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/shared/timing.h>
+#include <stdatomic.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
+#include <sys/socket.h>
+#define FUSE_USE_VERSION 30
+#include <fuse_lowlevel.h>
+
+
+extern atomic_bool keepRunning;
struct _dnbd3_async;
typedef struct _dnbd3_async {
struct _dnbd3_async *next; // Next in this linked list (provate field, not set by caller)
- dnbd3_signal_t* signal; // Used to signal the caller
- char* buffer; // Caller-provided buffer to be filled
ticks time; // When request was put on wire, 0 if not measuring
uint64_t offset;
uint32_t length;
- bool finished; // Will be set to true if the request has been handled
- bool success; // Will be set to true if the request succeeded
+ fuse_req_t fuse_req;
} dnbd3_async_t;
-bool connection_init(const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers);
+typedef struct _dnbd3_async_parent { // A dnbd3_async_t bundled with the buffer its payload is received into
+ dnbd3_async_t request; // Embedded request; container_of() on this member yields the enclosing parent
+ char buffer[]; // Flexible array member for the payload. Must be last member!
+} dnbd3_async_parent_t;
+
+bool connection_init( const char *hosts, const char *image, const uint16_t rid, const bool learnNewServers );
bool connection_initThreads();
uint64_t connection_getImageSize();
-bool connection_read(dnbd3_async_t *request);
+char * connection_getImageName();
+
+uint16_t connection_getImageRID();
+
+bool connection_read( dnbd3_async_t *request );
void connection_close();
-size_t connection_printStats(char *buffer, const size_t len);
+void connection_join();
+
+size_t connection_printStats( char *buffer, const size_t len );
#endif /* CONNECTION_H_ */
diff --git a/src/fuse/cowDoc/img/datastructure.jpg b/src/fuse/cowDoc/img/datastructure.jpg
new file mode 100644
index 0000000..d471d2a
--- /dev/null
+++ b/src/fuse/cowDoc/img/datastructure.jpg
Binary files differ
diff --git a/src/fuse/cowDoc/img/readrequest.svg b/src/fuse/cowDoc/img/readrequest.svg
new file mode 100644
index 0000000..a16f95c
--- /dev/null
+++ b/src/fuse/cowDoc/img/readrequest.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Do not edit this file with editors other than diagrams.net -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="719px" height="701px" viewBox="-0.5 -0.5 719 701" content="&lt;mxfile host=&quot;app.diagrams.net&quot; modified=&quot;2022-06-27T15:42:31.682Z&quot; agent=&quot;5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36 OPR/87.0.4390.45&quot; etag=&quot;_RkRrWUXEJff4oZ7Tmkg&quot; version=&quot;20.0.3&quot;&gt;&lt;diagram id=&quot;otKvBIWVLEUd5MDrhZ-X&quot; name=&quot;Page-1&quot;&gt;5VpZc5swEP4tffBjOiAO48fm6jHpTGfcmTSPspENLSAihI376yuBZA45NrGxoclMxmFXu5LY3W8lrRgZN2H2mcDY+45dFIyA5mYj43YEgG4CMOJ/mrspOOOJVTCWxHeFUMmY+n+RYGqCm/ouSmqCFOOA+nGdOcdRhOa0xoOE4HVdbIGD+qgxXCKFMZ3DQOU++i71Cq4DxiX/C/KXnhxZtydFSwilsHiTxIMuXldYxt3IuCEY0+IpzG5QwI0n7VLo3b/Qup0YQRFto4BW8ZRkP52Hr4+mt/LJ+ttmdqWbRTcrGKTijcVs6UaaALnMIoLEhHp4iSMY3JXca4LTyEV8HI1RpcwDxjFj6oz5G1G6Ee6FKcWM5dEwEK0LHFHRqPMuijnwgV98WcFKcErmaM8bGiJoIFkiukdusnUJi2WEQ0TJhukRFEDqr+rzgCKolls5ofqJELipCMTYj2hS6fkHZzABgQ/LFMEh0KHbNR+yh6JHSVWmVrJyP7/C54bicoKgm7/rc4oSqgRA6V7uq7XnUzSNYW71NUN93ZWQzIUnTbD15AoRirL9vlRtLxRMp24kabN1iUddgsyrYFHK7fJWxcJHGFBTLLiAQYLeGHLGLZEDToXOSc4Y70hgdsDme53EMKr5w35OebLNbXaV5Eb7xAR0Lc5yy8l29rTk/2cBnv9hAlyZ62uMhnxCTACGPO6LX5XMx58R2RFeLBJEq4pBMY6GIlc22oJ3X8y70FSx6OFwliaHcdgB8GQ22gKvLfKccyEPOIq3KUkHATxmU7L5JRpz4ol3/tGS5G0mBiuojaQyn27V2HNFi1GlEiekTocgd9qC3OwT5AZ4Fxm3rTOMF3B7GWeoGOw84354l/nWAkPLt7qKu6Hk2w5hN2kJO71X2BmT4ebA7eJXWe+easvd7sWvByca4xOdeNRBz9Tq0J40zuqK+AnStnVIfK/8eQ6davB2tmj8p9neaJ79Qe/Z3n4P2V6XZcSD6d7qM90Da7jpvg9v9HsAkdO8ZJlh+CmrdSnufClrV/nnbQHEagmQU/c1p/lBzVYhZoPy2xr2E6GML9Ai0hv+uXwhy2gbt/bZCllqEb6vpbbcu7+ucKWcAUQhSz5fuJDVtlp98qq+e1PduKOw9EbkFPMXSo3g6WB7DdSt2xvLg8Bs6WCn112bep86oDzYLDD1nweNYcbtsTmxy3hvW9Do5eK6me4OVCga4he55gZqyUHcc+fwS8qNdu/7EcvuHYfD/A7k2Eu1DnFotD2g9oLDMXgdEJvyl/ngRD08N5G4IDjk3kCEAWsAK+MAEKmebIXRXBypB4Wev9FpVgZsp6X9jvhIh5HlV3NFjJbfHhp3/wA=&lt;/diagram&gt;&lt;/mxfile&gt;" style="background-color: rgb(255, 255, 255);"><defs/><g><path d="M 
520 50 L 520 130 L 446.37 130" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 441.12 130 L 448.12 126.5 L 446.37 130 L 448.12 133.5 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><rect x="460" y="10" width="120" height="40" rx="16.8" ry="16.8" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 30px; margin-left: 461px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">read request</div></div></div></foreignObject><text x="520" y="34" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">read request</text></switch></g><path d="M 200 490 L 200 523.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 200 528.88 L 196.5 521.88 L 200 523.63 L 203.5 521.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 510px; margin-left: 200px;"><div 
data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">false</div></div></div></foreignObject><text x="200" y="513" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">false</text></switch></g><path d="M 200 410 L 260 450 L 200 490 L 140 450 Z" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 450px; margin-left: 141px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><span style="font-size: 10px;">block == local &amp;&amp;<br />offset &lt; endoffset</span></div></div></div></foreignObject><text x="200" y="454" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">block == local &amp;&amp;...</text></switch></g><path d="M 620 450 L 640 450 L 640 340 L 626.37 340" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 621.12 340 L 628.12 336.5 L 626.37 340 L 628.12 343.5 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject 
pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 395px; margin-left: 640px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">true</div></div></div></foreignObject><text x="640" y="398" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">true</text></switch></g><path d="M 560 490 L 560 523.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 560 528.88 L 556.5 521.88 L 560 523.63 L 563.5 521.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 510px; margin-left: 560px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">false</div></div></div></foreignObject><text x="560" y="513" fill="rgb(0, 0, 0)" 
font-family="Helvetica" font-size="10px" text-anchor="middle">false</text></switch></g><path d="M 560 410 L 620 450 L 560 490 L 500 450 Z" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 450px; margin-left: 501px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><span style="font-size: 10px;">block != local &amp;&amp;<br />offset &lt; endoffset</span></div></div></div></foreignObject><text x="560" y="454" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">block != local &amp;&amp;...</text></switch></g><path d="M 380 170 L 380 203.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 380 208.88 L 376.5 201.88 L 380 203.63 L 383.5 201.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 190px; margin-left: 380px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " 
style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">true</div></div></div></foreignObject><text x="380" y="193" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">true</text></switch></g><path d="M 380 90 L 380 60 L 20 60 L 20 620 L 380 620 L 380 643.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 380 648.88 L 376.5 641.88 L 380 643.63 L 383.5 641.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 340px; margin-left: 20px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">false</div></div></div></foreignObject><text x="20" y="343" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">false</text></switch></g><path d="M 380 90 L 440 130 L 380 170 L 320 130 Z" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: 
visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 130px; margin-left: 321px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><span style="font-size: 10px;">offset &lt; endoffset</span></div></div></div></foreignObject><text x="380" y="134" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">offset &lt; endoffset</text></switch></g><path d="M 320 250 L 200 250 L 200 303.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 200 308.88 L 196.5 301.88 L 200 303.63 L 203.5 301.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 250px; margin-left: 230px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">true</div></div></div></foreignObject><text x="230" y="253" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">true</text></switch></g><path d="M 440 250 L 560 250 L 560 
303.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 560 308.88 L 556.5 301.88 L 560 303.63 L 563.5 301.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 250px; margin-left: 530px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">false</div></div></div></foreignObject><text x="530" y="253" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">false</text></switch></g><path d="M 380 210 L 440 250 L 380 290 L 320 250 Z" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 250px; margin-left: 321px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: 
normal; overflow-wrap: normal;"><span style="font-size: 10px;">block == local</span></div></div></div></foreignObject><text x="380" y="254" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">block == local</text></switch></g><path d="M 200 370 L 200 403.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 200 408.88 L 196.5 401.88 L 200 403.63 L 203.5 401.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><rect x="140" y="310" width="120" height="60" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 340px; margin-left: 141px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">move to next block</div></div></div></foreignObject><text x="200" y="344" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">move to next block</text></switch></g><path d="M 140 450 L 120 450 L 120 340 L 133.63 340" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 138.88 340 L 131.88 343.5 L 133.63 340 L 131.88 336.5 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" 
requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 1px; height: 1px; padding-top: 395px; margin-left: 120px;"><div data-drawio-colors="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 10px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; background-color: rgb(255, 255, 255); white-space: nowrap;">true</div></div></div></foreignObject><text x="120" y="398" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="10px" text-anchor="middle">true</text></switch></g><path d="M 560 370 L 560 403.63" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 560 408.88 L 556.5 401.88 L 560 403.63 L 563.5 401.88 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><rect x="500" y="310" width="120" height="60" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 340px; margin-left: 501px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">move to next block</div></div></div></foreignObject><text x="560" y="344" 
fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">move to next block</text></switch></g><path d="M 140 560 L 60 560 L 60 130 L 313.63 130" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 318.88 130 L 311.88 133.5 L 313.63 130 L 311.88 126.5 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><rect x="140" y="530" width="120" height="60" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 560px; margin-left: 141px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">read blocks local</div></div></div></foreignObject><text x="200" y="564" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">read blocks local</text></switch></g><path d="M 620 560 L 700 560 L 700 130 L 446.37 130" fill="none" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 441.12 130 L 448.12 126.5 L 446.37 130 L 448.12 133.5 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-miterlimit="10" pointer-events="all"/><rect x="500" y="530" width="120" height="60" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" 
requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 560px; margin-left: 501px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">read blocks from server</div></div></div></foreignObject><text x="560" y="564" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">read blocks from ser...</text></switch></g><rect x="320" y="650" width="120" height="40" rx="16.8" ry="16.8" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" pointer-events="all"/><g transform="translate(-0.5 -0.5)"><switch><foreignObject pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility" style="overflow: visible; text-align: left;"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 670px; margin-left: 321px;"><div data-drawio-colors="color: rgb(0, 0, 0); " style="box-sizing: border-box; font-size: 0px; text-align: center;"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">read done</div></div></div></foreignObject><text x="380" y="674" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">read done</text></switch></g></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" 
xlink:href="https://www.diagrams.net/doc/faq/svg-export-text-problems" target="_blank"><text text-anchor="middle" font-size="10px" x="50%" y="100%">Text is not SVG - cannot display</text></a></switch></svg> \ No newline at end of file
diff --git a/src/fuse/cowDoc/readme.md b/src/fuse/cowDoc/readme.md
new file mode 100644
index 0000000..51a0052
--- /dev/null
+++ b/src/fuse/cowDoc/readme.md
@@ -0,0 +1,367 @@
+
+# Fuse Copy on Write (CoW)
+
+### Table of Contents
+1. [Introduction](#introduction)
+2. [Usage](#usage)
+3. [Implementation Details](#implementation-details)
+4. [REST Api](#rest-api)
+
+
+# Introduction
+
+This extension of the fuse dnbd3 client makes it possible to mount images in a writable way. The changes are saved in separate files on the client computer (a technique called Copy on Write, cow for short). These changes are uploaded to the cow server in the background. As soon as the user unmounts the image, all remaining changes are uploaded. Once everything has been uploaded, the changes can be merged into a copy of the original image on the cow server (this can be enabled via the start parameters).
+
+
+A typical use case is updating or adding software to an existing image.
+
+# Usage
+
+### New Parameters
+- `-c <path>` Enables the cow functionality. The `path` parameter sets the path for the temporary `meta` and `data` files in which the changes are saved.
+- `-C <address>` sets the address of the cow server. The Cow server is responsible for merging the original image with the client's changes.
+
+- `-L <path>` Similar to `-c <path>`, but instead of creating a new session, an existing one is loaded from the specified path.
+- `-m` the client requests a merge after the image has been unmounted and all changes have been uploaded.
+
+- `--cow-stats-file` creates a status file at the same location as the data and meta file. The file contains information about the current session, for more information see [here](#status).
+- `--cow-stats-stdout` similar to `--cow-stats-file`, but the information is printed to stdout instead.
+
+Example parameters for creating a new cow session:
+```
+./dnbd3-fuse "/home/user/VMs/mount" -f -h localhost -i imagename -c "/home/user/temp" -C "192.168.178.20:5000" --cow-stats-stdout -m
+
+```
+
+# Implementation Details
+
+
+## Data structure
+
+The data structure is divided into two main parts: the actual data that has been written to the image, and the corresponding metadata. It is also important to distinguish between a dnbd3 block, which is 4096 bytes in size, and a cow cluster, which combines 320 dnbd3 blocks. Each cow cluster has a `cow_l2_entry_t` structure in its corresponding L2 table that contains the cluster's metadata. The metadata is used to determine whether a dnbd3 block has been written to, where that block is stored in the `data` file, when it was last modified and when it was uploaded. But more on this later.
+
+
+### Metadata
+
+![Datastructure](img/datastructure.jpg)
+
+The metadata file, which ultimately stores the `cow_l2_entry_t` structs, contains a layer 1 (L1) and a layer 2 (L2) table for looking up the struct, which ultimately points to the actual data in the data file.
+The entire L1 table is initialised at the beginning and cannot be resized, therefore the size of the L1 table limits the maximum size of the image.
+The L2 tables are created dynamically as needed. So at the beginning, all L1 pointers are invalid (-1).
+
+The L2 tables contain 1024 `cow_l2_entry_t` structs each. An L2 table is created as soon as any data is written to any offset in the image that corresponds to the range it covers, which is a span of 1024 * 320 * 4096 bytes.
+
+```C
+typedef struct cow_l2_entry
+{
+ atomic_int_least64_t offset;
+ atomic_uint_least64_t timeChanged;
+ _Atomic(uint32_t) uploads;
+ _Atomic(uint32_t) fails;
+ atomic_char bitfield[40];
+} cow_l2_entry_t;
+```
+Each `cow_l2_entry_t` contains a 40 byte, 320 bit bit-field. The bit-field indicates whether the corresponding dnbd3 block has been written locally. If, for example, the bit field begins with 01... the first 4096 bytes contain no data and the next 4096 do.
+So each `cow_l2_entry_t` stores the metadata for up to 320\*4096 bytes. The offset field is the offset into the data file where the corresponding data is stored. `timeChanged` contains the unix timestamp when the cluster was last written. It is 0 if it has never been changed or if the latest changes have already been uploaded.
+
+
+For example, to get the `cow_l2_entry_t` for offset 4033085440, one would take L1[3], since
+```
+4033085440 / ( COW_FULL_L2_TABLE_DATA_SIZE ) ≈ 3.005
+```
+
+Then one would take the `cow_l2_entry_t` at index 5 of the L2 array (the sixth entry, since indices start at 0) because of
+```
+(4033085440 mod COW_FULL_L2_TABLE_DATA_SIZE) / COW_DATA_CLUSTER_SIZE = 5
+```
+Where:
+```
+COW_FULL_L2_TABLE_DATA_SIZE = 1024 * 320 * 4096
+COW_DATA_CLUSTER_SIZE = 320 * 4096
+```
+
+Since the result is an integer, the offset refers to the first dnbd3 block in that cluster. Otherwise, the block number within the cluster can be calculated via
+```
+(4033085440 % (320 * 4096)) / 4096
+```
+which is the index in the bit-field that tells whether the block has been written to the data file.
+
+
+### Read Request
+
+When a read request is made, it is checked for each dnbd3 block whether it already exists in the data file (i.e. has already been written to once). If so, it is read from the data file, otherwise it needs to be requested from the dnbd3 server. To increase performance, several consecutive blocks that are all local or all non-local are combined into larger reads from disk or larger requests to the server.
+
+![readrequest](img/readrequest.svg)
+
+The diagram above is somewhat simplified for clarity. The server's read operations are asynchronous. This means that while iterating over the 4k blocks from the read request, it does not wait for a response from the server for blocks that are missing locally, but fires off a request to the dnbd3 server asynchronously, continuing to check the remaining blocks. As soon as all pending requests to the server are completed, the combined data is handed over to fuse, completing the request.
+To keep track of pending requests, each request to the dnbd3 server increments the field `workCounter` in the corresponding `cow_request_t` by one, and each time a request is completed, it is decreased by one. As soon as `workCounter` reaches `0`, all data is known to be fetched properly and assembled in a buffer that can be handed over to fuse.
+
+### Write Request
+
+If the beginning or end of a write request is not aligned to a multiple of 4096, the first and/or last block must be padded if the corresponding dnbd3 block hasn't been written to before.
+This is because the granularity of the cow bit-field represents a full dnbd3 block of 4096 bytes, so we cannot write partial data to those blocks, as there is no mechanism to annotate which parts of the block have been written to, and which are still missing.
+To work around this limitation, we need to fill the partial block's missing data with data from the dnbd3 server if it is still within the range of the original image size. If it is outside the original image size (because the image has grown), the missing bytes can simply be set to 0 and no request needs to be made.
+The write request calculates the corresponding `cow_l2_entry_t` from the offset. If the corresponding `cow_l2_entry_t` does not yet exist, it is created. The data will be written to the data file, and the offset stored in `cow_l2_entry_t.offset`.
+Then the corresponding bit in the bit-field is set and `timeChanged` is updated. If there is more data to write to the current cluster, the next `cow_l2_entry_t` is calculated and the above steps are repeated.
+The variable `workCounter` is also used here to ensure that the padding of the data occurs before the fuse request returns.
+
+
+### Background Cluster Upload
+
+For uploading clusters, there is a background thread that periodically loops over all cow clusters and checks whether `timeChanged` is not 0 and the time difference between now and `timeChanged` is greater than `COW_MIN_UPLOAD_DELAY`.
+If so, the entire cluster is uploaded. The `timeChanged` before the upload is remembered.
+After the upload, `timeChanged` is set to 0 if it still has the same time as the temporarily stored one (if not, there was a change during the upload and it has to be uploaded again).
+Once the image is unmounted, `COW_MIN_UPLOAD_DELAY` is ignored and all clusters that still have a `timeChanged` other than 0 are uploaded.
+The upload is done via a [REST request](#v1fileupdate).
+There are two different limits for the number of parallel uploads in the [config/cow.h](#config-variables).
+
+## Files
+
+When a new CoW session is started, a new `meta`, `data` and, if so set in the command line arguments, a `status` file is created.
+
+### status
+
+The file `status` can be activated with the command line parameter `--cow-stats-file`.
+
+The file will contain the following:
+
+```
+uuid=<uuid>
+state=backgroundUpload
+inQueue=0
+modifiedClusters=0
+idleClusters=0
+totalClustersUploaded=0
+activeUploads=0
+avgSpeedKb=0.00
+```
+- The `uuid` is the session uuid used by the Cow server to identify the session.
+
+- The `state` is `backgroundUpload` when the image is still mounted and cow clusters are uploaded in the background.
+It is `uploading` when the image has been unmounted and all clusters that have not yet been uploaded are uploaded.
+It is `done` when the image has been unmounted and all clusters have been uploaded.
+- `inQueue` is the number of cow clusters that are currently being uploaded or are waiting for a free slot.
+- `modifiedClusters` is the number of cow clusters that have changes that have not yet been uploaded to the server because the changes are too recent.
+- `idleClusters` is the number of cow clusters that have no changes waiting to be uploaded.
+- `totalClustersUploaded` is the total number of cow clusters uploaded since the image was mounted.
+- `activeUploads` is the number of clusters currently being uploaded.
+- `avgSpeedKb` is the current upload speed in kb/s.
+
+Once all clusters have been uploaded, the status is set to `done`.
+If you define `COW_DUMP_BLOCK_UPLOADS`, a list of all clusters, sorted by the number of uploads, is copied to the status file after the cluster upload is completed.
+
+With the command line parameter `--cow-stats-stdout`, the same information that goes into the status file is printed to stdout.
+
+### meta
+
+The `meta` file contains the following header:
+```C
+// cowfile.h
+typedef struct cowfile_metadata_header
+{
+ uint64_t magicValue; // 8byte
+ atomic_uint_least64_t imageSize; // 8byte
+ int32_t version; // 4byte
+ int32_t blocksize; // 4byte
+ uint64_t validRemoteSize; // 8byte
+ uint32_t startL1; // 4byte
+ uint32_t startL2; // 4byte
+ int32_t bitfieldSize; // 4byte
+ int32_t nextL2; // 4byte
+ atomic_int_least64_t metaSize; // 8byte
+ atomic_int_least64_t nextClusterOffset; // 8byte
+ uint64_t maxImageSize; // 8byte
+ uint64_t creationTime; // 8byte
+ char uuid[40]; // 40byte
+ char imageName[200]; // 200byte
+} cowfile_metadata_header_t;
+```
+After this header, the above-mentioned l1 and then the l2 data structure begins at byte offsets specified by members startL1 and startL2. The offsets are absolute from the beginning of the file.
+
+### data
+
+The `data` file starts with `COW_FILE_DATA_MAGIC_VALUE` and at the `COW_DATA_CLUSTER_SIZE` (40 * 8 * 4096) offset the first cluster starts.
+
+### magic values in the file headers
+
+The magic values in both files are used to ensure that an appropriate file is read and that the machine has the correct endianness.
+```C
+//config.h
+#define COW_FILE_META_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825E) // Magic Value to recognize a Cow meta file
+#define COW_FILE_DATA_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825F) // Magic Value to recognize a Cow data file
+```
+
+### Threads
+
+This extension uses two new threads:
+```
+tidCowUploader
+tidStatUpdater
+```
+`tidCowUploader` is the thread that uploads blocks to the cow server.
+
+`tidStatUpdater` updates the stats in stdout or the stats files (depending on parameters).
+
+### Locks
+
+This extension uses a new lock `cow.l2CreateLock`. It is used when a new L2 table is allocated.
+
+### Config Variables
+
+The following configuration variables have been added to `config/cow.h`.
+```c
+//config.h
+// +++++ COW +++++
+#define COW_BITFIELD_SIZE 40 // NEVER CHANGE THIS OR THE WORLD WILL ALSO END!
+#define COW_FILE_META_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825E) // Magic Value to recognize a Cow meta file
+#define COW_FILE_DATA_MAGIC_VALUE ((uint64_t)0xEBE44D6E72F7825F) // Magic Value to recognize a Cow data file
+#define COW_MIN_UPLOAD_DELAY 60 // in seconds
+#define COW_STATS_UPDATE_TIME 5 // time in seconds the cow status files gets updated (while uploading clusters)
+#define COW_MAX_PARALLEL_UPLOADS 10 // maximum number of parallel uploads
+#define COW_MAX_PARALLEL_BACKGROUND_UPLOADS 2 // maximum number of parallel uploads while the image is still mounted
+#define COW_URL_STRING_SIZE 500 // Max string size for an url
+#define COW_SHOW_UL_SPEED 1 // enable display of ul speed in cow status file
+#define COW_MAX_IMAGE_SIZE 1000LL * 1000LL * 1000LL * 1000LL; // Maximum size an image can have(tb*gb*mb*kb)
+// +++++ COW API Endpoints +++++
+#define COW_API_PREFIX "%s/v1/"
+#define COW_API_CREATE COW_API_PREFIX "file/create"
+#define COW_API_UPDATE COW_API_PREFIX "file/update?uuid=%s&clusterindex=%lu"
+#define COW_API_START_MERGE COW_API_PREFIX "file/merge"
+```
+
+- `COW_MIN_UPLOAD_DELAY` sets the minimum time in seconds that must have elapsed since the last change to a cow cluster before it is uploaded.
+This value can be fine-tuned. A larger value usually reduces redundant uploading of clusters.
+A smaller value reduces the time for the final upload after the image has been unmounted.
+If you set `COW_DUMP_BLOCK_UPLOADS` and set the command line parameter `--cow-stats-file`, then a list of all clusters, sorted by the number of uploads, will be written to the status file after the cluster upload is complete.
+This can help in fine-tuning `COW_MIN_UPLOAD_DELAY`.
+- `COW_STATS_UPDATE_TIME` defines the update frequency of the stdout output/statistics file in seconds. Setting it too low could affect performance as a loop runs over all clusters.
+- `COW_MAX_PARALLEL_BACKGROUND_UPLOADS` defines the maximum number of parallel cluster uploads. This number is used when the image is still mounted and the user is still using it.
+- `COW_MAX_PARALLEL_UPLOADS` defines the maximum number of parallel cluster uploads. This number is used once the image has been unmounted to upload the remaining modified clusters.
+
+
+# REST API
+
+The following Rest API is used to transmit the data and commands to the cow server:
+
+## Required methods
+
+### v1/file/create
+
+#### POST
+##### Parameters
+
+| Name | Located in | Description | Required | Schema |
+| ---- | ---------- | ----------- | -------- | ---- |
+| imageName | post | Name of image | Yes | relative path |
+| revision | post | revision id of image | Yes | integer |
+| bitfieldSize | post | number of bits per L2 cluster | Yes | integer |
+
+##### Responses
+
+| Code | Description |
+| ---- | ----------- |
+| 200 | Success |
+| 404 | Source image not found |
+
+This request is used as soon as a new cow session is created. The returned uuid is used in all subsequent requests to identify the session.
+
+
+### v1/file/update
+
+#### POST
+##### Parameters
+
+| Name | Located in | Description | Required | Schema |
+| ---- | ---------- | ----------- | -------- | ---- |
+| uuid | query | | Yes | string (uuid) |
+| clusterindex | query | | Yes | integer |
+
+##### Responses
+
+| Code | Description |
+| ---- | ----------- |
+| 200 | Success |
+| 503 | Server can't keep up. If a Retry-After header is present, it specifies the backoff interval the client should wait, in seconds. |
+
+Used to upload a cluster. The cluster number is the absolute cluster number. The body contains an "application/octet-stream", where the first bytes are the bit field, directly followed by the actual cluster data. The cluster data is sparse, i.e. only blocks for which the bit is set are present, all other blocks are skipped.
+
+
+### v1/file/merge
+
+#### POST
+##### Parameters
+
+| Name | Located in | Description | Required | Schema |
+| ---- | ---------- | ----------- | -------- | ---- |
+| uuid | Form | | Yes | string (uuid) |
+| originalFileSize | Form | | Yes | integer |
+| newFileSize | Form | | Yes | integer |
+
+##### Responses
+
+| Code | Description |
+| ---- | ----------- |
+| 200 | Success |
+
+Used to start the merge on the server.
+
+## Optional methods, not used by dnbd3-fuse
+
+### v1/File/GetTopModifiedBlocks
+
+#### GET
+##### Parameters
+
+| Name | Located in | Description | Required | Schema |
+| ---- | ---------- | ----------- | -------- | ---- |
+| uuid | query | | Yes | string (uuid) |
+| amount | query | | Yes | integer |
+
+##### Responses
+
+| Code | Description |
+| ---- | ----------- |
+| 200 | Success |
+
+This request returns a list containing the cluster IDs and the number of uploads, sorted by the number of uploads. This is useful if you want to fine-tune `COW_MIN_UPLOAD_DELAY`.
+
+### v1/File/Status
+
+#### GET
+##### Parameters
+
+| Name | Located in | Description | Required | Schema |
+| ---- | ---------- | ----------- | -------- | ---- |
+| uuid | query | | Yes | string (uuid) |
+
+##### Responses
+
+| Code | Description |
+| ---- | ----------- |
+| 200 | Success |
+
+Returns the SessionStatus model that provides information about the session.
+
+## Models
+
+#### BlockStatistics
+
+| Name | Type | Description | Required |
+| ---- | ---- | ----------- | -------- |
+| clusterNumber | integer | | Yes |
+| modifications | integer | | Yes |
+
+#### SessionState
+
+| Name | Type | Description | Required |
+| ---- | ---- | ----------- | -------- |
+| SessionState | string | | |
+
+#### SessionStatus
+
+| Name | Type | Description | Required |
+| ---- | ---- | ----------- | -------- |
+| state | string | _Enum:_ `"Copying"`, `"Active"`, `"Merging"`, `"Done"`, `"Failed"` | Yes |
+| imageName | string | | Yes |
+| originalImageVersion | integer | | Yes |
+| newImageVersion | integer | | Yes |
+| mergedClusters | integer | | Yes |
+| totalClusters | integer | | Yes |
diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c
new file mode 100644
index 0000000..525eef0
--- /dev/null
+++ b/src/fuse/cowfile.c
@@ -0,0 +1,1777 @@
+#include "cowfile.h"
+#include "main.h"
+#include "connection.h"
+
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
+#include <dnbd3/shared/log.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <curl/curl.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#define UUID_STRLEN 36
+// Maximum assumed page size, in case the cow data gets transferred between different architectures
+// 16k should be the largest minimum in existence (Itanium)
+#define MAX_PAGE_SIZE 16384
+
+extern void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+
+static const int CURRENT_COW_VERSION = 3; // NOTE(review): presumably the cow file format version matched against the metadata header's version field - confirm
+
+static bool statStdout; // if set, updateCowStatsFile() emits the status text via logadd()
+static bool statFile; // if set, updateCowStatsFile() writes the status text to cow.fdStats
+static pthread_t tidCowUploader; // thread that uploads blocks to the cow server
+static pthread_t tidStatUpdater; // thread that refreshes the stats on stdout / in the status file
+static const char *cowServerAddress; // server address inserted into the COW_API_* url templates
+static CURL *curl; // easy handle reused for the create-session and merge requests
+static cowfile_metadata_header_t *metadata = NULL; // header of the cow meta file (uuid, sizes, bitfieldSize, ...)
+static atomic_uint_fast64_t bytesUploaded; // upload byte counter, accumulated by progress_callback()
+static uint64_t totalBlocksUploaded = 0; // reported as totalClustersUploaded in the status output
+static int activeUploads = 0; // reported as activeUploads in the status output
+static int uploadLoopThrottle = 0; // NOTE(review): presumably a backoff requested by the server (Retry-After) - confirm
+static atomic_bool uploadLoop = true; // Keep upload loop running?
+static atomic_bool uploadLoopDone = false; // Upload loop has finished all work?
+static atomic_bool uploadCancelled = false; // Skip uploading remaining blocks
+static struct curl_slist *uploadHeaders = NULL; // NOTE(review): presumably extra HTTP headers attached to upload requests - confirm
+
+static struct cow // file-local state of the cow session: lookup tables and file descriptors
+{
+	char *metadata_mmap; // NOTE(review): presumably the mmap()ed contents of the meta file - confirm
+	l1 *l1; // L1 table: each entry is an index into l2, or -1 if no L2 table exists for that range yet
+	l2 *l2; // L2 tables, selected by the index stored in the matching l1 entry
+	int fdMeta; // NOTE(review): presumably the descriptor of the meta file - confirm
+	int fdData; // descriptor of the data file; cluster contents are read from it for uploading
+	int fdStats; // descriptor of the status file written by updateCowStatsFile()
+	pthread_mutex_t l2CreateLock; // NOTE(review): per the readme, taken while a new L2 table is allocated - confirm
+} cow;
+
+static size_t curlHeaderCallbackUploadBlock( char *buffer, size_t size, size_t nitems, void *userdata );
+
+/**
+ * @brief Counts how many bits are set to 1 in an array of bytes.
+ *
+ * @param bf array of bytes to inspect
+ * @param numBytes number of bytes of bf to look at
+ * @return int total number of 1-bits found in the first numBytes bytes
+ */
+static int countOneBits( atomic_uchar *bf, int numBytes )
+{
+	int total = 0;
+	for ( int idx = 0; idx < numBytes; idx++ ) {
+		// Take one snapshot of the byte, then shift it out bit by bit
+		for ( unsigned char bits = bf[idx]; bits != 0; bits >>= 1 ) {
+			total += bits & 1;
+		}
+	}
+	return total;
+}
+
+#define IS_4K_ALIGNED(v) ( ( (uint64_t)(v) & DNBD3_BLOCK_MASK ) == 0 ) // true if no bit of DNBD3_BLOCK_MASK is set in v (NOTE(review): assumes the mask is DNBD3_BLOCK_SIZE - 1 - confirm)
+
+/**
+ * @brief Writes the complete buffer to a file at the given offset.
+ *
+ * pwrite() may write fewer bytes than requested or get interrupted by a signal;
+ * this wrapper keeps going until everything has been written or a real error occurs.
+ *
+ * @param fd file descriptor to write to
+ * @param buf data to write
+ * @param count number of bytes of buf to write
+ * @param offset absolute file offset at which the first byte is written
+ * @return true if all count bytes were written, false on error (errno is left as set by pwrite)
+ */
+static bool writeAll( int fd, const char *buf, size_t count, off_t offset )
+{
+	while ( count > 0 ) {
+		ssize_t ret = pwrite( fd, buf, count, offset );
+		if ( ret < 0 ) {
+			if ( errno == EINTR )
+				continue; // Interrupted before anything was written, just retry
+			return false;
+		}
+		if ( ret == 0 )
+			return false; // No progress possible, bail out instead of spinning
+		// Short write: advance buffer AND file offset, otherwise the remainder
+		// would be written over the part that already made it to disk
+		count -= (size_t)ret;
+		buf += ret;
+		offset += ret;
+	}
+	return true;
+}
+
+/**
+ * @brief Maps an absolute file offset to the L1 table entry covering it.
+ *
+ * Every L1 entry spans COW_FULL_L2_TABLE_DATA_SIZE bytes of the image.
+ *
+ * @param offset absolute file offset
+ * @return int index into the l1 table
+ */
+static int offsetToL1Index( size_t offset )
+{
+	const size_t l1Index = offset / COW_FULL_L2_TABLE_DATA_SIZE;
+	return (int)l1Index;
+}
+
+/**
+ * @brief Maps an absolute file offset to the entry within its L2 table.
+ *
+ * The offset is first reduced to its position inside the range covered by one
+ * L2 table, then divided by the size of a single cluster.
+ *
+ * @param offset absolute file offset
+ * @return int index into the l2 table
+ */
+static int offsetToL2Index( size_t offset )
+{
+	const size_t inTable = offset % COW_FULL_L2_TABLE_DATA_SIZE;
+	return (int)( inTable / COW_DATA_CLUSTER_SIZE );
+}
+
+/**
+ * @brief Computes the bit in the bitfield from the absolute file offset
+ *
+ * The offset is converted to a dnbd3 block number, which is then reduced modulo
+ * the number of bits in one bitfield (COW_BITFIELD_SIZE * 8).
+ *
+ * @param offset absolute file offset
+ * @return int bit(0-319) in the bitfield
+ */
+static int getBitfieldOffsetBit( size_t offset )
+{
+	// Reduce in the wide type first and only then narrow to int; casting the
+	// block number itself would truncate it for very large offsets
+	return (int)( ( offset / DNBD3_BLOCK_SIZE ) % ( COW_BITFIELD_SIZE * 8 ) );
+}
+
+/**
+ * @brief Atomically sets or clears a contiguous range of bits within a single byte.
+ *
+ * @param byte byte of a bitfield to modify
+ * @param from first bit to change (0-7)
+ * @param to last bit to change, inclusive (from <= to <= 7)
+ * @param value true sets the bits to 1, false clears them to 0
+ */
+static void setBits( atomic_uchar *byte, int64_t from, int64_t to, bool value )
+{
+	// ( to - from + 1 ) one-bits, moved up so the lowest of them sits at bit 'from'.
+	// Kept unsigned throughout so no narrowing to a possibly signed char is involved.
+	const unsigned char mask = (unsigned char)( ( 0xffu >> ( 7 - ( to - from ) ) ) << from );
+	if ( value ) {
+		atomic_fetch_or( byte, mask );
+	} else {
+		atomic_fetch_and( byte, (unsigned char)~mask );
+	}
+}
+
+/**
+ * @brief Sets or clears a range of bits in a bitfield, one byte at a time.
+ *
+ * Each affected byte is updated atomically via setBits().
+ *
+ * @param bitfield of a cow_l2_entry
+ * @param from first bit to change
+ * @param to last bit to change (inclusive)
+ * @param value true sets the bits to 1, false clears them to 0
+ */
+static void setBitsInBitfield( atomic_uchar *bitfield, int64_t from, int64_t to, bool value )
+{
+	assert( from >= 0 && to < COW_BITFIELD_SIZE * 8 );
+	const int64_t firstByte = from / 8;
+	const int64_t lastByte = to / 8;
+
+	for ( int64_t idx = firstByte; idx <= lastByte; idx++ ) {
+		const int64_t base = idx * 8; // number of the bit at position 0 of this byte
+		// Only the first byte can start somewhere in the middle, all others start at bit 0
+		const int64_t lo = ( idx == firstByte ) ? from - base : 0;
+		const int64_t hi = MIN( 7, to - base );
+		setBits( &bitfield[idx], lo, hi, value );
+	}
+}
+
+/**
+ * @brief Tells whether the n-th bit of a bitfield is set.
+ *
+ * @param bitfield of a cow_l2_entry
+ * @param n number of the bit to check
+ * @return true if the bit is 1, false if it is 0
+ */
+static bool checkBit( atomic_uchar *bitfield, int64_t n )
+{
+	const unsigned char byte = bitfield[n / 8];
+	const unsigned char bit = (unsigned char)( 1u << ( n % 8 ) );
+	return ( byte & bit ) != 0;
+}
+
+
+/**
+ * Generic curl write callback that appends the received data to a 500 byte
+ * buffer holding a C string. Whatever does not fit anymore is dropped, but
+ * still reported as consumed, so the transfer is not aborted.
+ * MAKE SURE THE BUFFER IS EMPTY AT THE START! (i.e. buffer[0] = '\0')
+ */
+static size_t curlWriteCb500( char *buffer, size_t itemSize, size_t nitems, void *userpointer )
+{
+	const size_t capacity = 499; // usable characters, one byte stays reserved for the terminator
+	char *out = userpointer;
+	const size_t used = strlen( out );
+	const size_t incoming = itemSize * nitems;
+
+	assert( used < 500 );
+	if ( used >= capacity )
+		return incoming; // Already full
+
+	size_t take = capacity - used;
+	if ( incoming < take ) {
+		take = incoming;
+	}
+	memcpy( out + used, buffer, take );
+	out[used + take] = '\0';
+	return incoming;
+}
+
+/**
+ * @brief Creates a session with the cow server and stores the returned session uuid.
+ *
+ * Sends a POST request to the COW_API_CREATE endpoint, passing revision, bitfield
+ * size and image name as form fields. The reply body is taken as the session uuid
+ * and copied to metadata->uuid.
+ *
+ * @param imageName name of the image, gets url-escaped before sending
+ * @param rid revision id of the image
+ * @return true if the server answered with a 2xx status and a usable uuid, false otherwise
+ */
+static bool createSession( const char *imageName, uint16_t rid )
+{
+	CURLcode res;
+	char url[COW_URL_STRING_SIZE];
+	char body[1000], reply[500];
+	char *nameEsc;
+	int len;
+
+	curl_easy_reset( curl );
+	len = snprintf( url, COW_URL_STRING_SIZE, COW_API_CREATE, cowServerAddress );
+	if ( len < 0 || len >= COW_URL_STRING_SIZE ) {
+		// Don't talk to some random truncated URL
+		logadd( LOG_ERROR, "COW_API_CREATE failed: URL too long" );
+		return false;
+	}
+	logadd( LOG_INFO, "COW_API_CREATE URL: %s", url );
+	curl_easy_setopt( curl, CURLOPT_POST, 1L );
+	curl_easy_setopt( curl, CURLOPT_URL, url );
+
+	nameEsc = curl_easy_escape( curl, imageName, 0 );
+	if ( nameEsc == NULL ) {
+		logadd( LOG_ERROR, "Error escaping imageName" );
+	}
+	// If escaping failed, send the raw name and hope for the best
+	len = snprintf( body, sizeof body, "revision=%d&bitfieldSize=%d&imageName=%s",
+			(int)rid, (int)metadata->bitfieldSize, nameEsc != NULL ? nameEsc : imageName );
+	if ( nameEsc != NULL ) {
+		curl_free( nameEsc );
+	}
+	if ( len < 0 || (size_t)len >= sizeof body ) {
+		// A cut-off image name would create a session for the wrong image
+		logadd( LOG_ERROR, "COW_API_CREATE failed: request body too long" );
+		return false;
+	}
+	curl_easy_setopt( curl, CURLOPT_POSTFIELDS, body );
+
+	reply[0] = '\0'; // curlWriteCb500 appends, so it has to start out empty
+	curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, curlWriteCb500 );
+	curl_easy_setopt( curl, CURLOPT_WRITEDATA, reply );
+
+	res = curl_easy_perform( curl );
+
+	/* Check for errors */
+	if ( res != CURLE_OK ) {
+		logadd( LOG_ERROR, "COW_API_CREATE failed: curl says %s", curl_easy_strerror( res ) );
+		return false;
+	}
+
+	long http_code = 0;
+	curl_easy_getinfo( curl, CURLINFO_RESPONSE_CODE, &http_code );
+	if ( http_code < 200 || http_code >= 300 ) {
+		logadd( LOG_ERROR, "COW_API_CREATE failed: http code %ld, %s", http_code, reply );
+		return false;
+	}
+	if ( reply[0] == '\0' ) {
+		// Without an id, none of the following requests could refer to this session
+		logadd( LOG_ERROR, "COW_API_CREATE failed: server returned an empty session id" );
+		return false;
+	}
+	if ( strlen( reply ) > UUID_STRLEN ) {
+		logadd( LOG_ERROR, "Returned session id is too long: '%s'", reply );
+		return false;
+	}
+	// reply is at most UUID_STRLEN chars here, so strncpy terminates the string
+	// and zero-fills the rest of the uuid field
+	strncpy( metadata->uuid, reply, sizeof(metadata->uuid) );
+	logadd( LOG_DEBUG1, "Cow session started, uuid: %s", metadata->uuid );
+	return true;
+}
+
+/**
+ * @brief Implementation of CURLOPT_READFUNCTION, this function will first send the bit field and
+ * then the block data in one bitstream. This function is usually called multiple times per
+ * cluster, because curl's buffer is usually not large enough for a whole cluster and its bitfield.
+ * Only blocks whose bit is set in the bitfield copy are sent, all others are skipped.
+ * uploadBlock->position keeps track of the progress across calls: values below
+ * COW_BITFIELD_SIZE mean the bitfield is still being sent, everything above is
+ * COW_BITFIELD_SIZE plus the offset within the cluster.
+ * for more details see: https://curl.se/libcurl/c/CURLOPT_READFUNCTION.html
+ *
+ * @param ptr to the buffer
+ * @param size of one element in buffer
+ * @param nmemb number of elements in buffer
+ * @param userdata from CURLOPT_READDATA, a cow_curl_read_upload_t
+ * @return size_t number of bytes written to the buffer, or CURL_READFUNC_ABORT on read errors
+ */
+static size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb, void *userdata )
+{
+	cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata;
+	size_t len = 0;
+	// Check if we're still in the bitfield
+	if ( uploadBlock->position < COW_BITFIELD_SIZE ) {
+		size_t lenCpy = MIN( COW_BITFIELD_SIZE - uploadBlock->position, size * nmemb );
+		// Every call gets a fresh buffer that is filled from its start. Only the source
+		// is offset by what was sent already; offsetting the destination too would leave
+		// a gap at the start of the buffer and could write past its end.
+		memcpy( ptr, uploadBlock->bitfield + uploadBlock->position, lenCpy );
+		uploadBlock->position += lenCpy;
+		len += lenCpy;
+	}
+	// No elseif here, might just have crossed over...
+	if ( uploadBlock->position >= COW_BITFIELD_SIZE ) {
+		// Subtract the bitfield size from everything first
+		off_t inClusterOffset = uploadBlock->position - COW_BITFIELD_SIZE;
+		ssize_t spaceLeft = ( size * nmemb ) - len;
+		// Only read blocks that have been written to the cluster. Saves bandwidth. Not optimal since
+		// we do a lot of 4k/32k reads, but it's not that performance critical I guess...
+		while ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) {
+			int bitNumber = (int)( inClusterOffset / DNBD3_BLOCK_SIZE );
+			size_t readSize;
+			// Small performance hack: All bits one in a byte, do a 32k instead of 4k read
+			// TODO: preadv with a large iov, reading unchanged blocks into a trash-buffer
+			if ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE * 8
+					&& bitNumber % 8 == 0
+					&& uploadBlock->bitfield[bitNumber / 8] == 0xff ) {
+				readSize = DNBD3_BLOCK_SIZE * 8;
+			} else {
+				readSize = DNBD3_BLOCK_SIZE;
+			}
+			// If handling single block, check bits in our copy, as global bitfield could change
+			if ( readSize != DNBD3_BLOCK_SIZE || checkBit( uploadBlock->bitfield, bitNumber ) ) {
+				ssize_t lengthRead = pread( cow.fdData, ( ptr + len ), readSize,
+						uploadBlock->cluster->offset + inClusterOffset );
+				if ( lengthRead == -1 ) {
+					// Nothing was read and nothing advanced yet, so the same read can be retried
+					if ( errno == EAGAIN || errno == EINTR )
+						continue;
+					logadd( LOG_ERROR, "Upload: Reading from COW file failed with errno %d", errno );
+					return CURL_READFUNC_ABORT;
+				}
+				if ( lengthRead != (ssize_t)readSize ) {
+					logadd( LOG_ERROR, "Upload: Reading from COW file failed with short read (%d/%d)",
+							(int)lengthRead, (int)readSize );
+					return CURL_READFUNC_ABORT;
+				}
+				len += lengthRead;
+				spaceLeft -= lengthRead;
+			}
+			// Skipped blocks advance the position too, they just don't produce output
+			inClusterOffset += readSize;
+			uploadBlock->position += readSize;
+		}
+	}
+	return len;
+}
+
+
+/**
+ * @brief Asks the cow server to merge the uploaded changes into the image.
+ *
+ * Sends a POST request to the COW_API_START_MERGE endpoint with the original
+ * image size, the new image size and the session uuid as form fields.
+ *
+ * @return true if the server answered with a 2xx status, false otherwise
+ */
+static bool postMergeRequest()
+{
+	char endpoint[COW_URL_STRING_SIZE];
+	char form[500], answer[500];
+
+	answer[0] = '\0'; // curlWriteCb500 appends, so start out empty
+	curl_easy_reset( curl );
+	snprintf( endpoint, COW_URL_STRING_SIZE, COW_API_START_MERGE, cowServerAddress );
+	curl_easy_setopt( curl, CURLOPT_URL, endpoint );
+	curl_easy_setopt( curl, CURLOPT_POST, 1L );
+	curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, curlWriteCb500 );
+	curl_easy_setopt( curl, CURLOPT_WRITEDATA, answer );
+
+	char *escaped = curl_easy_escape( curl, metadata->uuid, 0 );
+	const bool ownsEscaped = ( escaped != NULL );
+	if ( !ownsEscaped ) {
+		logadd( LOG_ERROR, "Error escaping uuid" );
+		escaped = metadata->uuid; // Hope for the best
+	}
+	snprintf( form, sizeof form, "originalFileSize=%"PRIu64"&newFileSize=%"PRIu64"&uuid=%s",
+			metadata->validRemoteSize, metadata->imageSize, escaped );
+	if ( ownsEscaped ) {
+		curl_free( escaped );
+	}
+	curl_easy_setopt( curl, CURLOPT_POSTFIELDS, form );
+
+	const CURLcode rc = curl_easy_perform( curl );
+	if ( rc != CURLE_OK ) {
+		logadd( LOG_WARNING, "COW_API_START_MERGE failed. curl reported: %s", curl_easy_strerror( rc ) );
+		return false;
+	}
+
+	long status = 0;
+	curl_easy_getinfo( curl, CURLINFO_RESPONSE_CODE, &status );
+	const bool accepted = ( status >= 200 && status < 300 );
+	if ( !accepted ) {
+		logadd( LOG_WARNING, "COW_API_START_MERGE failed with http: %ld: %s", status, answer );
+	}
+	return accepted;
+}
+
+/**
+ * @brief Wrapper for postMergeRequest: if the request fails, it is tried again
+ * up to five more times, waiting ten seconds before each retry.
+ */
+static void requestRemoteMerge()
+{
+	const int maxRetries = 5;
+	bool success = postMergeRequest();
+
+	for ( int attempt = 1; !success && attempt <= maxRetries; attempt++ ) {
+		logadd( LOG_WARNING, "Trying again. %i/5", attempt );
+		sleep( 10 );
+		// Keep the result, so the loop ends as soon as one attempt got through
+		success = postMergeRequest();
+	}
+	if ( !success ) {
+		logadd( LOG_ERROR, "Giving up on requesting merge after %i retries", maxRetries );
+	}
+}
+
+/**
+ * @brief Implementation of the CURLOPT_XFERINFOFUNCTION.
+ * For more infos see: https://curl.se/libcurl/c/CURLOPT_XFERINFOFUNCTION.html
+ *
+ * curl calls this for every active transfer. Since ulNow is a running total per
+ * transfer, only the difference to the value seen in the previous call is added
+ * to bytesUploaded, which is used to compute the kb/s uploaded over all transfers.
+ *
+ * @param clientp the cow_curl_read_upload_t of the transfer
+ * @param ulNow number of bytes uploaded by this transfer so far.
+ * @return int always returns 0 to continue the callbacks.
+ */
+static int progress_callback( void *clientp, UNUSED curl_off_t dlTotal,
+		UNUSED curl_off_t dlNow, UNUSED curl_off_t ulTotal, curl_off_t ulNow )
+{
+	cow_curl_read_upload_t *transfer = clientp;
+
+	atomic_fetch_add( &bytesUploaded, ulNow - transfer->ulLast );
+	transfer->ulLast = ulNow; // Remember for the next call
+	return 0;
+}
+
+#ifdef COW_DUMP_BLOCK_UPLOADS
+/**
+ * @brief qsort comparator for cow_cluster_statistics_t, sorting by number of
+ * uploads in descending order.
+ *
+ * @return int negative if a has more uploads than b, positive if it has less, 0 if equal
+ */
+static int cmpfunc( const void *a, const void *b )
+{
+	const cow_cluster_statistics_t *lhs = a;
+	const cow_cluster_statistics_t *rhs = b;
+	// Compare instead of subtracting: the difference of two 64 bit counters
+	// doesn't necessarily fit into an int and could end up with the wrong sign
+	return ( rhs->uploads > lhs->uploads ) - ( rhs->uploads < lhs->uploads );
+}
+/**
+ * @brief Writes all cluster numbers, sorted by their number of uploads (most
+ * uploaded first), to the stats file as a [BlockStats] section.
+ *
+ * Only clusters belonging to an existing L2 table are listed. Output goes to the
+ * current file position of cow.fdStats.
+ */
+static void dumpBlockUploads()
+{
+	if ( metadata->imageSize == 0 ) {
+		return; // No clusters at all, and the computation below would underflow
+	}
+	long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE );
+
+	// This can be several megabytes for large images, so it goes on the heap, not the stack
+	cow_cluster_statistics_t *blockUploads = calloc( (size_t)l1MaxOffset * COW_L2_TABLE_SIZE,
+			sizeof( *blockUploads ) );
+	if ( blockUploads == NULL ) {
+		logadd( LOG_ERROR, "Could not allocate memory for dumping block uploads" );
+		return;
+	}
+	uint64_t currentBlock = 0;
+	for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) {
+		if ( cow.l1[l1Index] == -1 ) {
+			continue; // No L2 table for this range, nothing was ever written here
+		}
+		for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) {
+			cow_l2_entry_t *block = ( cow.l2[cow.l1[l1Index]] + l2Index );
+
+			blockUploads[currentBlock].uploads = block->uploads;
+			blockUploads[currentBlock].clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index );
+			currentBlock++;
+		}
+	}
+	qsort( blockUploads, currentBlock, sizeof( cow_cluster_statistics_t ), cmpfunc );
+
+	dprintf( cow.fdStats, "\n\n[BlockStats]\n" );
+	for ( uint64_t i = 0; i < currentBlock; i++ ) {
+		dprintf( cow.fdStats, "%" PRIu64 "=%" PRIu64 " \n",
+				blockUploads[i].clusterNumber, blockUploads[i].uploads );
+	}
+	free( blockUploads );
+}
+#endif
+
+/**
+ * @brief Updates the status to the stdout/statfile depending on the startup parameters.
+ *
+ * The state line is derived from uploadLoop and uploadLoopDone: "backgroundUpload"
+ * while the upload loop is kept running, "uploading" once it has been told to
+ * finish but still has work left, and "done" afterwards.
+ *
+ * @param inQueue Blocks that have changes old enough to be uploaded.
+ * @param modified Blocks that have been changed but whose changes are not old enough to be uploaded.
+ * @param idle Blocks that do not contain changes that have not yet been uploaded.
+ * @param speedBuffer ptr to char array that contains the current upload speed.
+ */
+static void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t idle, char *speedBuffer )
+{
+	char buffer[300];
+	const char *state;
+
+	if ( uploadLoop ) {
+		state = "backgroundUpload";
+	} else if ( !uploadLoopDone ) {
+		state = "uploading";
+	} else {
+		state = "done";
+	}
+
+	int len = snprintf( buffer, sizeof buffer,
+			"[General]\n"
+			"uuid=%s\n"
+			"state=%s\n"
+			"inQueue=%" PRIu64 "\n"
+			"modifiedClusters=%" PRIu64 "\n"
+			"idleClusters=%" PRIu64 "\n"
+			"totalClustersUploaded=%" PRIu64 "\n"
+			"activeUploads=%i\n"
+			"%s%s\n",
+			metadata->uuid,
+			state, inQueue, modified, idle, totalBlocksUploaded, activeUploads,
+			COW_SHOW_UL_SPEED ? "avgSpeedKb=" : "",
+			speedBuffer );
+
+	if ( len < 0 ) {
+		logadd( LOG_ERROR, "snprintf error" );
+		return;
+	}
+	if ( (size_t)len >= sizeof buffer ) {
+		// snprintf returns the length it would have needed. On truncation, go on with
+		// what is actually in the buffer, otherwise the padding below would get a
+		// negative size and the write would read past the buffer.
+		len = (int)sizeof( buffer ) - 1;
+	}
+
+	if ( statStdout ) {
+		logadd( LOG_INFO, "%s", buffer );
+	}
+
+	if ( statFile ) {
+		// Pad with a bunch of newlines so we don't change the file size all the time
+		ssize_t extra = MIN( 20, (ssize_t)sizeof(buffer) - len - 1 );
+		memset( buffer + len, '\n', extra );
+		if ( pwrite( cow.fdStats, buffer, len + extra, 0 ) != len + extra ) {
+			logadd( LOG_WARNING, "Could not update cow status file" );
+		}
+#ifdef COW_DUMP_BLOCK_UPLOADS
+		if ( !uploadLoop && uploadLoopDone ) {
+			// Everything is uploaded, append the per-cluster statistics after the padding
+			lseek( cow.fdStats, len + extra, SEEK_SET );
+			dumpBlockUploads();
+		}
+#endif
+	}
+}
+
+/**
+ * @brief Starts the upload of a given block.
+ *
+ * Creates a curl easy handle for a POST request to the COW server and adds it to
+ * the given multi handle. The request body is produced by curlReadCallbackUploadBlock().
+ *
+ * Ownership: on success, uploadingCluster is attached to the easy handle as private
+ * data and gets freed by clusterUploadDoneHandler(). On failure it is freed here,
+ * so the caller must not touch it again in either case.
+ *
+ * @param cm Curl_multi
+ * @param uploadingCluster containing the data for the block to upload.
+ * @return true if the transfer was added to cm, false otherwise.
+ */
+static bool addUpload( CURLM *cm, cow_curl_read_upload_t *uploadingCluster )
+{
+	CURL *eh = curl_easy_init();
+	if ( eh == NULL ) {
+		logadd( LOG_ERROR, "curl_easy_init failed, cannot upload cluster" );
+		free( uploadingCluster );
+		return false;
+	}
+
+	char url[COW_URL_STRING_SIZE];
+
+	snprintf( url, COW_URL_STRING_SIZE,
+			COW_API_UPDATE, cowServerAddress, metadata->uuid, uploadingCluster->clusterNumber );
+
+	curl_easy_setopt( eh, CURLOPT_URL, url );
+	curl_easy_setopt( eh, CURLOPT_POST, 1L );
+	curl_easy_setopt( eh, CURLOPT_HEADERFUNCTION, curlHeaderCallbackUploadBlock );
+	curl_easy_setopt( eh, CURLOPT_HEADERDATA, (void *)uploadingCluster );
+	curl_easy_setopt( eh, CURLOPT_READFUNCTION, curlReadCallbackUploadBlock );
+	curl_easy_setopt( eh, CURLOPT_READDATA, (void *)uploadingCluster );
+	curl_easy_setopt( eh, CURLOPT_WRITEFUNCTION, curlWriteCb500 );
+	curl_easy_setopt( eh, CURLOPT_WRITEDATA, (void *)uploadingCluster->replyBuffer );
+	curl_easy_setopt( eh, CURLOPT_PRIVATE, (void *)uploadingCluster );
+	// min upload speed of 1kb/s over 10 sec otherwise the upload is canceled.
+	curl_easy_setopt( eh, CURLOPT_LOW_SPEED_TIME, 10L );
+	curl_easy_setopt( eh, CURLOPT_LOW_SPEED_LIMIT, 1000L );
+
+	// Body is the bitfield followed by one DNBD3 block per set bit.
+	// The _LARGE variant of this option takes a curl_off_t, not a long.
+	curl_easy_setopt( eh, CURLOPT_POSTFIELDSIZE_LARGE,
+			(curl_off_t)( COW_BITFIELD_SIZE
+				+ DNBD3_BLOCK_SIZE * countOneBits( uploadingCluster->bitfield, COW_BITFIELD_SIZE ) )
+			);
+
+	if ( COW_SHOW_UL_SPEED ) {
+		uploadingCluster->ulLast = 0;
+		curl_easy_setopt( eh, CURLOPT_NOPROGRESS, 0L );
+		curl_easy_setopt( eh, CURLOPT_XFERINFOFUNCTION, progress_callback );
+		curl_easy_setopt( eh, CURLOPT_XFERINFODATA, uploadingCluster );
+	}
+	curl_easy_setopt( eh, CURLOPT_HTTPHEADER, uploadHeaders );
+
+	CURLMcode mc = curl_multi_add_handle( cm, eh );
+	if ( mc != CURLM_OK ) {
+		logadd( LOG_ERROR, "curl_multi_add_handle error %d, cannot upload cluster", (int)mc );
+		curl_easy_cleanup( eh );
+		free( uploadingCluster );
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * @brief curl header callback for cluster uploads, looks for a "Retry-After" header.
+ *
+ * If the "Retry-After" header is set, we interpret this as the server being overloaded
+ * or not ready yet to take another update. We slow down our upload loop then.
+ * We'll only accept a delay in seconds here, not an HTTP Date string.
+ *
+ * The header name is compared case-insensitively without modifying curl's buffer.
+ * The header line handed to us is not guaranteed to be NUL-terminated, so the
+ * number is parsed by hand and never reads past size * nitems bytes.
+ *
+ * @param buffer one header line, NOT NUL-terminated
+ * @param size always 1 according to curl
+ * @param nitems length of the header line
+ * @param userdata the cow_curl_read_upload_t of this transfer
+ * @return number of bytes handled, which is always the whole line.
+ */
+static size_t curlHeaderCallbackUploadBlock( char *buffer, size_t size, size_t nitems, void *userdata )
+{
+	static const char wanted[] = "retry-after:";
+	const size_t wantedLen = sizeof( wanted ) - 1;
+	const size_t len = size * nitems;
+	cow_curl_read_upload_t *uploadingCluster = (cow_curl_read_upload_t*)userdata;
+	size_t offset;
+	int delay = 0;
+
+	if ( len < 13 )
+		return len;
+	for ( offset = 0; offset < wantedLen; ++offset ) {
+		char c = buffer[offset];
+		if ( c >= 'A' && c <= 'Z' ) {
+			c = (char)( c - 'A' + 'a' );
+		}
+		if ( c != wanted[offset] )
+			return len; // Some other header
+	}
+	// Skip optional whitespace between the colon and the value
+	while ( offset < len && ( buffer[offset] == ' ' || buffer[offset] == '\t' ) ) {
+		offset++;
+	}
+	while ( offset < len && buffer[offset] >= '0' && buffer[offset] <= '9' ) {
+		delay = delay * 10 + ( buffer[offset] - '0' );
+		if ( delay > 120 ) {
+			// Cap to two minutes, which also rules out overflowing the int
+			delay = 120;
+			break;
+		}
+		offset++;
+	}
+	if ( delay > 0 ) {
+		uploadLoopThrottle = MAX( uploadLoopThrottle, delay );
+		uploadingCluster->retryTime = delay;
+	}
+	return len;
+}
+
+/**
+ * @brief After an upload completes, either successful or unsuccessful this
+ * function cleans everything up.
+ *
+ * On success, the cluster's timeChanged is reset to 0, unless the cluster was
+ * modified again while the upload was running. On failure, the fail counter is
+ * increased; unless the server sent a Retry-After delay, the cluster is marked as
+ * changed just now, so it is not retried immediately.
+ * The easy handle is removed from cm and destroyed, and the upload descriptor
+ * attached to it is freed.
+ *
+ * @param cm Curl_multi
+ * @param msg CURLMsg
+ * @return true returned if the upload was successful.
+ * @return false returned if the upload was unsuccessful.
+ */
+static bool clusterUploadDoneHandler( CURLM *cm, CURLMsg *msg )
+{
+	bool success = false;
+	cow_curl_read_upload_t *uploadingCluster = NULL;
+	CURLcode res;
+	CURLcode res2;
+	res = curl_easy_getinfo( msg->easy_handle, CURLINFO_PRIVATE, &uploadingCluster );
+	if ( res != CURLE_OK || uploadingCluster == NULL ) {
+		// Without the descriptor we can neither update the cluster nor free anything,
+		// so just get rid of the handle.
+		logadd( LOG_ERROR, "curl_easy_getinfo(CURLINFO_PRIVATE) failed after multifinish (%d)", (int)res );
+		curl_multi_remove_handle( cm, msg->easy_handle );
+		curl_easy_cleanup( msg->easy_handle );
+		return false;
+	}
+
+	long http_code = 0;
+	res2 = curl_easy_getinfo( msg->easy_handle, CURLINFO_RESPONSE_CODE, &http_code );
+
+	if ( msg->msg != CURLMSG_DONE ) {
+		logadd( LOG_ERROR, "multi_message->msg unexpectedly not DONE (%d)", (int)msg->msg );
+	} else if ( msg->data.result != CURLE_OK ) {
+		logadd( LOG_ERROR, "curl_easy returned non-OK after multi-finish: %s",
+				curl_easy_strerror( msg->data.result ) );
+		logadd( LOG_ERROR, "(%ld, %s)", http_code, uploadingCluster->replyBuffer );
+	} else if ( res2 != CURLE_OK ) {
+		logadd( LOG_ERROR, "curl_easy_getinfo failed after multifinish (%d, %d)", (int)res, (int)res2 );
+	} else if ( http_code == 503 ) {
+		if ( uploadingCluster->retryTime > 0 ) {
+			logadd( LOG_INFO, "COW server is asking to backoff for %d seconds", uploadingCluster->retryTime );
+		} else {
+			logadd( LOG_ERROR, "COW server returned 503 without Retry-After value: %s",
+					uploadingCluster->replyBuffer );
+		}
+	} else if ( http_code < 200 || http_code >= 300 ) {
+		logadd( LOG_ERROR, "COW server returned HTTP %ld: %s", http_code, uploadingCluster->replyBuffer );
+	} else {
+		// everything went ok, reset timeChanged of underlying cluster, but only if it
+		// didn't get updated again in the meantime.
+		atomic_compare_exchange_strong( &uploadingCluster->cluster->timeChanged, &uploadingCluster->time, 0 );
+		uploadingCluster->cluster->uploads++;
+		uploadingCluster->cluster->fails = 0;
+		totalBlocksUploaded++;
+		success = true;
+	}
+	if ( !success ) {
+		uploadingCluster->cluster->fails++;
+		if ( uploadingCluster->retryTime > 0 ) {
+			// Don't reset timeChanged timestamp, so the next iteration of uploadModifiedClusters
+			// will queue this upload again after the throttle time expired.
+		} else {
+			logadd( LOG_ERROR, "Uploading cluster failed %i/5 times", uploadingCluster->cluster->fails );
+			// Pretend the block changed again just now, to prevent immediate retry
+			atomic_compare_exchange_strong( &uploadingCluster->cluster->timeChanged, &uploadingCluster->time,
+					time( NULL ) );
+		}
+	}
+	curl_multi_remove_handle( cm, msg->easy_handle );
+	curl_easy_cleanup( msg->easy_handle );
+	free( uploadingCluster );
+
+	return success;
+}
+
+/**
+ * @brief Drives the transfers of the given multi handle and handles finished uploads.
+ *
+ * The number of running transfers is stored in the global activeUploads.
+ *
+ * @param cm Curl_multi
+ * @param minNumberUploads return as soon as there are less than this many transfers
+ * running. Values <= 0 are treated as 1.
+ * @return true returned if all uploads that finished were successful
+ * @return false returned if one or more uploads failed, or curl reported an error.
+ */
+static bool curlMultiLoop( CURLM *cm, int minNumberUploads )
+{
+	const int threshold = minNumberUploads > 0 ? minNumberUploads : 1;
+	bool allOk = true;
+	int pending = -1;
+
+	while ( true ) {
+		CURLMcode rc = curl_multi_perform( cm, &activeUploads );
+		if ( rc != CURLM_OK ) {
+			logadd( LOG_ERROR, "curl_multi_perform error %d, bailing out", (int)rc );
+			return false;
+		}
+
+		// Collect everything that finished during this round
+		for ( CURLMsg *done = curl_multi_info_read( cm, &pending );
+				done != NULL;
+				done = curl_multi_info_read( cm, &pending ) ) {
+			allOk = clusterUploadDoneHandler( cm, done ) && allOk;
+		}
+
+		if ( activeUploads < threshold )
+			return allOk;
+		// only wait if there are active uploads
+		if ( activeUploads <= 0 )
+			continue;
+
+		rc = curl_multi_wait( cm, NULL, 0, 1000, NULL );
+		if ( rc != CURLM_OK ) {
+			logadd( LOG_ERROR, "curl_multi_wait error %d, bailing out", (int)rc );
+			return false;
+		}
+	}
+}
+
+/**
+ * @brief loops through all clusters and uploads the modified ones.
+ *
+ * For every cluster that qualifies, a snapshot of its bitfield and change time is
+ * taken and handed to addUpload(). Before adding a new transfer, curlMultiLoop() is
+ * run so the number of parallel transfers stays below the configured limit.
+ * If the server asked us to back off (uploadLoopThrottle > 0), no further uploads
+ * are queued. Before returning, all transfers that were added are driven to
+ * completion.
+ *
+ * @param ignoreMinUploadDelay If true uploads all blocks that have changes while
+ * ignoring COW_MIN_UPLOAD_DELAY
+ * @param cm Curl_multi
+ * @return true if all blocks uploaded successful
+ * @return false if one or more blocks failed to upload
+ */
+bool uploadModifiedClusters( bool ignoreMinUploadDelay, CURLM *cm )
+{
+	bool success = true;
+	const time_t now = time( NULL );
+
+	long unsigned int l1MaxOffset = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE );
+	// Iterate over all blocks, L1 first
+	for ( long unsigned int l1Index = 0; l1Index < l1MaxOffset; l1Index++ ) {
+		if ( cow.l1[l1Index] == -1 ) {
+			continue; // Not allocated
+		}
+		// Now all L2 clusters
+		for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) {
+			cow_l2_entry_t *cluster = ( cow.l2[cow.l1[l1Index]] + l2Index );
+			if ( cluster->offset == -1 ) {
+				continue; // Not allocated
+			}
+			if ( cluster->timeChanged == 0 ) {
+				continue; // Not changed
+			}
+			if ( !ignoreMinUploadDelay && ( now - cluster->timeChanged < COW_MIN_UPLOAD_DELAY ) ) {
+				continue; // Last change not old enough
+			}
+			// Run curl mainloop at least once, but keep doing so while max concurrent uploads is reached
+			int minUploads = ignoreMinUploadDelay
+					? COW_MAX_PARALLEL_UPLOADS
+					: COW_MAX_PARALLEL_BACKGROUND_UPLOADS;
+			if ( !curlMultiLoop( cm, minUploads ) ) {
+				success = false;
+			}
+			// Maybe one of the uploads was rejected by the server asking us to slow down a bit.
+			// Check for that case and don't trigger a new upload.
+			if ( uploadLoopThrottle > 0 ) {
+				goto DONE;
+			}
+			cow_curl_read_upload_t *b = malloc( sizeof( *b ) );
+			if ( b == NULL ) {
+				logadd( LOG_ERROR, "Out of memory, cannot queue cluster upload" );
+				success = false;
+				goto DONE;
+			}
+			b->cluster = cluster;
+			b->clusterNumber = ( l1Index * COW_L2_TABLE_SIZE + l2Index );
+			b->position = 0;
+			b->retryTime = 0;
+			b->time = cluster->timeChanged;
+			b->replyBuffer[0] = '\0';
+			// Copy, so it doesn't change during upload
+			// when we assemble the data in curlReadCallbackUploadBlock()
+			for ( int i = 0; i < COW_BITFIELD_SIZE; ++i ) {
+				b->bitfield[i] = cluster->bitfield[i];
+			}
+			// b belongs to the transfer from here on, don't touch it anymore
+			if ( !addUpload( cm, b ) ) {
+				success = false;
+			}
+			if ( !ignoreMinUploadDelay && !uploadLoop ) {
+				goto DONE;
+			}
+		}
+	}
+DONE:
+	// Finish all the transfers still active. activeUploads is only refreshed by
+	// curl_multi_perform(), so it doesn't know about a transfer that was added after
+	// the last call. Hence run the loop at least once before trusting the counter.
+	do {
+		if ( !curlMultiLoop( cm, 1 ) ) {
+			success = false;
+			break;
+		}
+	} while ( activeUploads > 0 );
+	return success;
+}
+
+
+/**
+ * @brief Thread function that recomputes the status and hands it to
+ * updateCowStatsFile() every COW_STATS_UPDATE_TIME seconds, until the upload loop
+ * is done.
+ *
+ * @param something unused
+ * @return always NULL
+ */
+
+void *cowfile_statUpdater( UNUSED void *something )
+{
+	char speedText[20] = "0";
+	uint64_t previousSample = time( NULL );
+	time_t now;
+
+	while ( !uploadLoopDone ) {
+		int pendingClusters = 0;
+		int queuedClusters = 0;
+		int cleanClusters = 0;
+		long unsigned int l1Count = 1 + ( ( metadata->imageSize - 1 ) / COW_FULL_L2_TABLE_DATA_SIZE );
+
+		now = time( NULL );
+		for ( long unsigned int l1Index = 0; l1Index < l1Count; l1Index++ ) {
+			if ( cow.l1[l1Index] == -1 )
+				continue; // No L2 table for this range
+			for ( int l2Index = 0; l2Index < COW_L2_TABLE_SIZE; l2Index++ ) {
+				cow_l2_entry_t *entry = &cow.l2[cow.l1[l1Index]][l2Index];
+				if ( entry->offset == -1 )
+					continue; // Cluster has no local data
+				if ( entry->timeChanged == 0 ) {
+					cleanClusters++;
+					continue;
+				}
+				// Once the background loop stopped, everything modified counts as queued
+				if ( !uploadLoop || now > entry->timeChanged + COW_MIN_UPLOAD_DELAY ) {
+					queuedClusters++;
+				} else {
+					pendingClusters++;
+				}
+			}
+		}
+
+		if ( COW_SHOW_UL_SPEED ) {
+			// Fetch and reset the byte counter fed by progress_callback()
+			double uploaded = (double)atomic_exchange( &bytesUploaded, 0 );
+			double elapsed;
+			now = time( NULL );
+			elapsed = (double)( now - previousSample );
+			previousSample = now;
+			if ( elapsed > 0 ) {
+				snprintf( speedText, sizeof speedText, "%.2f", uploaded / 1000.0 / elapsed );
+			}
+		}
+
+		updateCowStatsFile( queuedClusters, pendingClusters, cleanClusters, speedText );
+		sleep( COW_STATS_UPDATE_TIME );
+	}
+	return NULL;
+}
+
+void quitSigHandler( int sig UNUSED ) // Installed as the SIGQUIT handler by uploaderThreadMain()
+{
+ uploadCancelled = true; // Makes uploaderThreadMain() skip the final upload of the remaining clusters
+ uploadLoop = false; // Ends the background upload loop in uploaderThreadMain()
+}
+
+/**
+ * @brief Thread function of the background uploader.
+ *
+ * Periodically uploads modified clusters while uploadLoop is set. Once the loop
+ * ends, the remaining clusters are uploaded too, unless the upload was cancelled
+ * via SIGQUIT.
+ *
+ * @param something unused
+ * @return always NULL
+ */
+static void *uploaderThreadMain( UNUSED void *something )
+{
+	CURLM *multi = curl_multi_init();
+
+	curl_multi_setopt( multi, CURLMOPT_MAXCONNECTS,
+			(long)MAX( COW_MAX_PARALLEL_UPLOADS, COW_MAX_PARALLEL_BACKGROUND_UPLOADS ) );
+
+	{
+		// Unblock so this very thread gets the signal for abandoning the upload
+		struct sigaction quitAction = { .sa_handler = &quitSigHandler };
+		sigset_t quitSet;
+
+		sigemptyset( &quitAction.sa_mask );
+		sigaction( SIGQUIT, &quitAction, NULL );
+		sigemptyset( &quitSet );
+		sigaddset( &quitSet, SIGQUIT );
+		pthread_sigmask( SIG_UNBLOCK, &quitSet, NULL );
+	}
+
+	for ( ;; ) {
+		if ( !uploadLoop )
+			break;
+		// Server asked us to back off, count down in steps of one second
+		while ( uploadLoopThrottle > 0 && uploadLoop ) {
+			sleep( 1 );
+			uploadLoopThrottle--;
+		}
+		sleep( 2 );
+		if ( !uploadLoop )
+			break;
+		uploadModifiedClusters( false, multi );
+	}
+
+	if ( uploadCancelled ) {
+		uploadLoopDone = true;
+		logadd( LOG_INFO, "Not uploading remaining clusters, SIGQUIT received" );
+	} else {
+		// force the upload of all remaining blocks because the user dismounted the image
+		logadd( LOG_INFO, "Start uploading the remaining clusters." );
+		const bool allUploaded = uploadModifiedClusters( true, multi );
+		uploadLoopDone = true;
+		if ( allUploaded ) {
+			logadd( LOG_DEBUG1, "All clusters uploaded" );
+			if ( cow_merge_after_upload ) {
+				requestRemoteMerge();
+				logadd( LOG_DEBUG1, "Requesting merge" );
+			}
+		} else {
+			logadd( LOG_ERROR, "One or more clusters failed to upload" );
+		}
+	}
+	curl_multi_cleanup( multi );
+	return NULL;
+}
+
+/**
+ * @brief Creates the cow stats file and inserts the session uuid
+ *
+ * The initial status text is logged if statStdout is set. The file "<path>/status"
+ * is only created and written if statFile is set; its descriptor is stored in
+ * cow.fdStats.
+ *
+ * @param path where the file is created
+ * @return true
+ * @return false if failed to create or to write into the file
+ */
+static bool createCowStatsFile( char *path )
+{
+	const size_t pathLen = strlen( path ) + 12;
+	char pathStatus[pathLen];
+	char buffer[100];
+
+	snprintf( pathStatus, pathLen, "%s%s", path, "/status" );
+
+	int len = snprintf( buffer, sizeof( buffer ), "[General]\nuuid=%s\nstate=active\n", metadata->uuid );
+	if ( len < 0 ) {
+		logadd( LOG_ERROR, "snprintf error" );
+		return false;
+	}
+	if ( (size_t)len >= sizeof( buffer ) ) {
+		// Text got truncated, don't write more than what is actually in the buffer
+		len = (int)sizeof( buffer ) - 1;
+	}
+	if ( statStdout ) {
+		logadd( LOG_INFO, "%s", buffer );
+	}
+	if ( statFile ) {
+		if ( ( cow.fdStats = open( pathStatus, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR ) ) == -1 ) {
+			logadd( LOG_ERROR, "Could not create cow status file. Bye.\n" );
+			return false;
+		}
+
+		if ( pwrite( cow.fdStats, buffer, len, 0 ) != len ) {
+			logadd( LOG_ERROR, "Could not write to cow status file. Bye.\n" );
+			return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * @brief Setup shared by cowfile_init() and cowfile_load().
+ *
+ * Validates the length of the optional UUID, prepares the HTTP headers used for
+ * uploads, initializes the L2 creation lock, remembers the server address and
+ * initializes curl.
+ *
+ * @param serverAddress address of the COW server, the pointer is stored as is
+ * @param cowUuid optional session UUID, may be NULL
+ * @return false if the UUID is too long or curl could not be initialized
+ */
+static bool commonInit( const char* serverAddress, const char *cowUuid )
+{
+	if ( cowUuid != NULL ) {
+		if ( strlen( cowUuid ) > UUID_STRLEN ) {
+			logadd( LOG_ERROR, "COW UUID too long: '%s'", cowUuid );
+			return false;
+		}
+	}
+
+	uploadHeaders = curl_slist_append( uploadHeaders, "Content-Type: application/octet-stream" );
+	pthread_mutex_init( &cow.l2CreateLock, NULL );
+	cowServerAddress = serverAddress;
+
+	const CURLcode initResult = curl_global_init( CURL_GLOBAL_ALL );
+	if ( initResult != CURLE_OK ) {
+		logadd( LOG_ERROR, "curl_global_init failed: %s",
+				curl_easy_strerror( initResult ) );
+		return false;
+	}
+
+	curl = curl_easy_init();
+	if ( curl != NULL )
+		return true;
+
+	logadd( LOG_ERROR, "Error on curl_easy_init" );
+	return false;
+}
+
+/**
+ * @brief initializes the cow functionality, creates the data & meta file.
+ *
+ * Opens (creating if needed) <path>/meta and <path>/data, refuses to continue if the
+ * data file is not empty, sizes and mmap()s the meta file, fills in the metadata
+ * header, marks all L1 entries as unallocated (-1) and writes the magic value to
+ * the data file. Finally either the given UUID is stored or a new session is created.
+ *
+ * @param path where the files should be stored
+ * @param image_Name name of the original file/image
+ * @param imageVersion version of the image, handed to createSession()
+ * @param imageSizePtr in: points to the current image size. out: redirected to the
+ * imageSize field inside the mmap()ed metadata
+ * @param serverAddress address of the COW server, handed to commonInit()
+ * @param sStdout stored in statStdout
+ * @param sfile stored in statFile
+ * @param cowUuid optional, use given UUID for talking to COW server instead of creating session
+ * @return true on success, false if any of the steps failed
+ */
+bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion,
+ atomic_uint_fast64_t **imageSizePtr,
+ char *serverAddress, bool sStdout, bool sfile, const char *cowUuid )
+{
+ char pathMeta[strlen( path ) + 6]; // room for path + "/meta" + NUL
+ char pathData[strlen( path ) + 6]; // room for path + "/data" + NUL
+
+ if ( !commonInit( serverAddress, cowUuid ) )
+ return false;
+
+ statStdout = sStdout;
+ statFile = sfile;
+
+ snprintf( pathMeta, strlen( path ) + 6, "%s%s", path, "/meta" );
+ snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" );
+
+ if ( ( cow.fdMeta = open( pathMeta, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ logadd( LOG_ERROR, "Could not create cow meta file. Bye.\n %s \n", pathMeta );
+ return false;
+ }
+
+ if ( ( cow.fdData = open( pathData, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ) ) == -1 ) {
+ logadd( LOG_ERROR, "Could not create cow data file. Bye.\n" );
+ return false;
+ }
+ struct stat fs;
+ if ( fstat( cow.fdData, &fs ) == -1 || fs.st_size != 0 ) { // never reuse a data file that already has content
+ logadd( LOG_ERROR, "/data file already exists and is not empty" );
+ return false;
+ }
+
+ size_t metaDataSizeHeader = sizeof( cowfile_metadata_header_t );
+
+ // Calculate how many full l2 tables we need to address COW_MAX_IMAGE_SIZE
+ size_t l1NumEntries = ( ( COW_MAX_IMAGE_SIZE + COW_FULL_L2_TABLE_DATA_SIZE - 1 )
+ / COW_FULL_L2_TABLE_DATA_SIZE );
+ // Make sure l1 and l2 are aligned to struct size
+ size_t sizeL1 = sizeof(cow.l1[0]);
+ size_t sizeL2 = sizeof(cow.l2[0]);
+ size_t startL1 = ( ( metaDataSizeHeader + sizeL1 - 1 ) / sizeL1 ) * sizeL1; // header size rounded up to a multiple of sizeL1
+ size_t startL2 = ( ( startL1 + l1NumEntries * sizeL1 + sizeL2 - 1 ) / sizeL2 ) * sizeL2; // end of l1 array rounded up to a multiple of sizeL2
+
+ // size of l1 array + number of l2's * size of l2
+ size_t ps = getpagesize();
+ if ( ps == 0 || ps > INT_MAX ) {
+ logadd( LOG_ERROR, "Cannot get native page size, aborting..." );
+ return false;
+ }
+ size_t metaSize = ( ( startL2 + l1NumEntries * sizeof( l2 ) + ps - 1 ) / ps ) * ps; // rounded up to a multiple of the page size
+
+ if ( ftruncate( cow.fdMeta, metaSize ) != 0 ) {
+ logadd( LOG_ERROR, "Could not set file size of meta data file (errno=%d). Bye.\n", errno );
+ return false;
+ }
+
+ cow.metadata_mmap = mmap( NULL, metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 );
+
+ if ( cow.metadata_mmap == MAP_FAILED ) {
+ logadd( LOG_ERROR, "Error while mmap()ing meta data, errno=%d", errno );
+ return false;
+ }
+
+ metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap ); // the header lives at the very start of the mapping
+ metadata->magicValue = COW_FILE_META_MAGIC_VALUE;
+ metadata->imageSize = **imageSizePtr;
+ metadata->version = CURRENT_COW_VERSION;
+ metadata->validRemoteSize = **imageSizePtr;
+ metadata->startL1 = (uint32_t)startL1;
+ metadata->startL2 = (uint32_t)startL2;
+ metadata->bitfieldSize = COW_BITFIELD_SIZE;
+ metadata->nextL2 = 0;
+ metadata->metaSize = ATOMIC_VAR_INIT( metaSize );
+ metadata->nextClusterOffset = ATOMIC_VAR_INIT( COW_DATA_CLUSTER_SIZE ); // first cluster offset handed out is COW_DATA_CLUSTER_SIZE, not 0
+ metadata->maxImageSize = COW_MAX_IMAGE_SIZE;
+ metadata->creationTime = time( NULL );
+ snprintf( metadata->imageName, 200, "%s", image_Name ); // NOTE(review): 200 presumably matches the size of imageName - confirm
+
+ cow.l1 = (l1 *)( cow.metadata_mmap + startL1 );
+ cow.l2 = (l2 *)( cow.metadata_mmap + startL2 );
+ for ( size_t i = 0; i < l1NumEntries; i++ ) {
+ cow.l1[i] = -1; // -1 marks an L1 entry without L2 table
+ }
+
+ // write header to data file
+ uint64_t header = COW_FILE_DATA_MAGIC_VALUE;
+ if ( pwrite( cow.fdData, &header, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) {
+ logadd( LOG_ERROR, "Could not write header to cow data file. Bye.\n" );
+ return false;
+ }
+
+ if ( cowUuid != NULL ) {
+ snprintf( metadata->uuid, sizeof(metadata->uuid), "%s", cowUuid );
+ logadd( LOG_INFO, "Using provided upload session id" );
+ } else if ( !createSession( image_Name, imageVersion ) ) {
+ return false;
+ }
+ createCowStatsFile( path ); // NOTE(review): return value is ignored, a failure here does not abort init
+ *imageSizePtr = &metadata->imageSize; // from now on the caller uses the size stored in the metadata
+ return true;
+}
+
+/**
+ * @brief loads an existing cow state from the meta & data files
+ *
+ * Opens <path>/meta and <path>/data, validates magic values, bitfield size, table
+ * offsets, file sizes and the file version, then mmap()s the meta file and sets up
+ * the l1/l2 pointers.
+ *
+ * @param path where the meta & data file is located
+ * @param imageSizePtr out: redirected to the imageSize field inside the mmap()ed metadata
+ * @param serverAddress address of the COW server, handed to commonInit()
+ * @param sStdout stored in statStdout
+ * @param sFile stored in statFile
+ * @param cowUuid optional, if given it replaces the session UUID stored in the meta file
+ * @return true on success, false if any check or step failed
+ */
+bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *serverAddress, bool sStdout, bool sFile, const char *cowUuid )
+{
+	char pathMeta[strlen( path ) + 6];
+	char pathData[strlen( path ) + 6];
+
+	if ( !commonInit( serverAddress, cowUuid ) )
+		return false;
+
+	statStdout = sStdout;
+	statFile = sFile;
+
+	snprintf( pathMeta, strlen( path ) + 6, "%s%s", path, "/meta" );
+	snprintf( pathData, strlen( path ) + 6, "%s%s", path, "/data" );
+
+	if ( ( cow.fdMeta = open( pathMeta, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		logadd( LOG_ERROR, "Could not open cow meta file. Bye.\n" );
+		return false;
+	}
+	if ( ( cow.fdData = open( pathData, O_RDWR, S_IRUSR | S_IWUSR ) ) == -1 ) {
+		logadd( LOG_ERROR, "Could not open cow data file. Bye.\n" );
+		return false;
+	}
+
+	cowfile_metadata_header_t header;
+	{
+		size_t sizeToRead = sizeof( cowfile_metadata_header_t );
+		size_t readBytes = 0;
+		while ( readBytes < sizeToRead ) {
+			// Advance byte-wise in the destination as well as in the file on partial reads
+			ssize_t bytes = pread( cow.fdMeta, (char *)&header + readBytes, sizeToRead - readBytes,
+					(off_t)readBytes );
+			if ( bytes <= 0 ) {
+				logadd( LOG_ERROR, "Error while reading meta file header. Bye.\n" );
+				return false;
+			}
+			readBytes += bytes;
+		}
+
+
+		if ( header.magicValue != COW_FILE_META_MAGIC_VALUE ) {
+			if ( __builtin_bswap64( header.magicValue ) == COW_FILE_META_MAGIC_VALUE ) {
+				logadd( LOG_ERROR, "cow meta file of wrong endianess. Bye.\n" );
+				return false;
+			}
+			logadd( LOG_ERROR, "cow meta file of unkown format. Bye.\n" );
+			return false;
+		}
+
+		if ( header.bitfieldSize != COW_BITFIELD_SIZE ) {
+			logadd( LOG_ERROR, "cow meta file has unexpected bitfield size %d", (int)header.bitfieldSize );
+			return false;
+		}
+		if ( header.startL1 >= header.startL2 || header.startL2 >= header.metaSize ) {
+			logadd( LOG_ERROR, "l1/l2 offset messed up in metadata." );
+			return false;
+		}
+
+		struct stat st;
+		if ( fstat( cow.fdMeta, &st ) != 0 ) {
+			logadd( LOG_ERROR, "Cannot stat cow meta file, errno=%d. Bye.", errno );
+			return false;
+		}
+		if ( st.st_size < (off_t)header.metaSize ) {
+			logadd( LOG_ERROR, "cow meta file too small. Bye." );
+			return false;
+		}
+	}
+	{
+		uint64_t magicValueDataFile;
+		if ( pread( cow.fdData, &magicValueDataFile, sizeof( uint64_t ), 0 ) != sizeof( uint64_t ) ) {
+			logadd( LOG_ERROR, "Error while reading cow data file, wrong file?. Bye." );
+			return false;
+		}
+
+		if ( magicValueDataFile != COW_FILE_DATA_MAGIC_VALUE ) {
+			if ( __builtin_bswap64( magicValueDataFile ) == COW_FILE_DATA_MAGIC_VALUE ) {
+				logadd( LOG_ERROR, "cow data file of wrong endianess. Bye." );
+				return false;
+			}
+			logadd( LOG_ERROR, "cow data file of unkown format. Bye." );
+			return false;
+		}
+		struct stat st;
+		if ( fstat( cow.fdData, &st ) != 0 ) {
+			logadd( LOG_ERROR, "Cannot stat cow data file, errno=%d. Bye.", errno );
+			return false;
+		}
+		// add cluster size, since we don't preallocate
+		if ( header.nextClusterOffset > st.st_size + (int)COW_DATA_CLUSTER_SIZE ) {
+			logadd( LOG_ERROR, "cow data file too small. Expected=%jd, Is=%jd.",
+					(intmax_t)header.nextClusterOffset, (intmax_t)st.st_size );
+			return false;
+		}
+	}
+
+	// Check the version before mapping; use the header we just read, as the
+	// global metadata pointer is not set up at this point.
+	if ( header.version != CURRENT_COW_VERSION ) {
+		logadd( LOG_ERROR, "Error wrong file version got: %i expected: %i. Bye.",
+				(int)header.version, (int)CURRENT_COW_VERSION );
+		return false;
+	}
+
+	cow.metadata_mmap = mmap( NULL, header.metaSize, PROT_READ | PROT_WRITE, MAP_SHARED, cow.fdMeta, 0 );
+
+	if ( cow.metadata_mmap == MAP_FAILED ) {
+		logadd( LOG_ERROR, "Error while mapping mmap, errno=%d.", errno );
+		return false;
+	}
+
+	metadata = (cowfile_metadata_header_t *)( cow.metadata_mmap );
+
+	if ( cowUuid != NULL ) {
+		logadd( LOG_INFO, "Overriding stored upload session id with provided one" );
+		snprintf( metadata->uuid, sizeof(metadata->uuid), "%s", cowUuid );
+	}
+
+	*imageSizePtr = &metadata->imageSize;
+	cow.l1 = (l1 *)( cow.metadata_mmap + metadata->startL1 );
+	cow.l2 = (l2 *)( cow.metadata_mmap + metadata->startL2 );
+	createCowStatsFile( path );
+	return true;
+}
+/**
+ * @brief Starts the cow background threads: the uploader, and, if stats output
+ * to file or stdout is enabled, the stat updater.
+ *
+ * @return false if one of the threads could not be created
+ */
+bool cowfile_startBackgroundThreads()
+{
+	const int uploaderRc = pthread_create( &tidCowUploader, NULL, &uploaderThreadMain, NULL );
+	if ( uploaderRc != 0 ) {
+		logadd( LOG_ERROR, "Could not create cow uploader thread");
+		return false;
+	}
+
+	// The stat updater is only needed if somebody consumes its output
+	if ( !statFile && !statStdout )
+		return true;
+
+	if ( pthread_create( &tidStatUpdater, NULL, &cowfile_statUpdater, NULL ) == 0 )
+		return true;
+
+	logadd( LOG_ERROR, "Could not create stat updater thread");
+	return false;
+}
+
+/**
+ * Tells whether the block at the given offset is flagged in the cluster's bitfield,
+ * i.e. has been modified and is stored locally.
+ * @param meta The cow_l2_entry for the according cluster; NULL yields false
+ * @param offset offset of data, can be absolute image offset as it will be transformed into cluster offset
+ */
+static bool isBlockLocal( cow_l2_entry_t *meta, off_t offset )
+{
+	// Reduce the offset to the block index within its cluster before testing the bit
+	return meta != NULL
+		&& checkBit( meta->bitfield, ( offset % COW_DATA_CLUSTER_SIZE ) / DNBD3_BLOCK_SIZE );
+}
+
+/**
+ * @brief Get the cow_l2_entry_t from l1Index and l2Index.
+ *
+ * @param l1Index index into the L1 table
+ * @param l2Index index into the L2 table referenced by the L1 entry
+ * @param create if the entry has no data cluster yet, assign the next free cluster
+ * offset to it instead of returning NULL
+ * @return cow_l2_entry_t* or NULL if there is no L2 table for l1Index, or the entry
+ * has no data cluster and create is false
+ */
+static cow_l2_entry_t *getL2Entry( int l1Index, int l2Index, bool create )
+{
+	if ( cow.l1[l1Index] == -1 )
+		return NULL;
+
+	cow_l2_entry_t *entry = &cow.l2[cow.l1[l1Index]][l2Index];
+	if ( entry->offset != -1 )
+		return entry;
+	if ( !create )
+		return NULL;
+
+	// Reserve the next cluster in the data file for this entry
+	entry->offset = atomic_fetch_add( &metadata->nextClusterOffset, COW_DATA_CLUSTER_SIZE );
+	return entry;
+}
+
+/**
+ * @brief creates a new L2 table and initializes the contained cow_l2_entry_t
+ *
+ * Does nothing if the L1 entry already refers to an L2 table. The L1 entry is
+ * only set after the new table has been initialized completely. Creation is
+ * serialized via cow.l2CreateLock.
+ *
+ * @param l1Index index of the L1 entry that needs an L2 table
+ * @return always true
+ */
+static bool createL2Table( int l1Index )
+{
+	pthread_mutex_lock( &cow.l2CreateLock );
+	if ( cow.l1[l1Index] == -1 ) {
+		int idx = metadata->nextL2++;
+		// Plain assignments: ATOMIC_VAR_INIT is only meant for initializers,
+		// and has been removed from the C standard in C23.
+		for ( int i = 0; i < COW_L2_TABLE_SIZE; i++ ) {
+			cow.l2[idx][i].offset = -1;
+			cow.l2[idx][i].timeChanged = 0;
+			cow.l2[idx][i].uploads = 0;
+			cow.l2[idx][i].fails = 0;
+			for ( int j = 0; j < COW_BITFIELD_SIZE; j++ ) {
+				cow.l2[idx][i].bitfield[j] = 0;
+			}
+		}
+		cow.l1[l1Index] = idx;
+	}
+	pthread_mutex_unlock( &cow.l2CreateLock );
+	return true;
+}
+
+/**
+ * @brief Is called once a part of a fuse write request is finished.
+ * Only the call that finishes the last outstanding part sends the reply to fuse:
+ * an error if one was recorded, otherwise the number of bytes written. In the
+ * latter case the image size is increased if the write went past its end.
+ * The request is freed afterwards.
+ *
+ * @param req fuse_req_t
+ * @param cowRequest
+ */
+
+static void finishWriteRequest( fuse_req_t req, cow_request_t *cowRequest )
+{
+	if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) != 1 )
+		return; // More sub-requests are pending, bail out
+
+	if ( cowRequest->errorCode == 0 ) {
+		uint64_t wantedSize = cowRequest->bytesWorkedOn + cowRequest->fuseRequestOffset;
+		if ( wantedSize > metadata->imageSize ) {
+			// Grow the image size, but never shrink it if someone else grew it even more
+			uint64_t seenSize;
+			do {
+				seenSize = metadata->imageSize;
+				wantedSize = MAX( seenSize, wantedSize );
+			} while ( !atomic_compare_exchange_weak( &metadata->imageSize, &seenSize, wantedSize ) );
+		}
+		fuse_reply_write( req, cowRequest->bytesWorkedOn );
+	} else {
+		fuse_reply_err( req, cowRequest->errorCode );
+	}
+	free( cowRequest );
+}
+
+/**
+ * @brief Called after the padding data was received from the dnbd3 server.
+ * The data from the write request will be combined with the data from the server
+ * so that we get a full DNBD3_BLOCK and is then written on the disk.
+ *
+ * If the block became local while we were waiting for the server, only the bytes of
+ * this write request are written, so the newer local data around them is kept.
+ *
+ * Finishes this part of the fuse request and frees sRequest.
+ * @param sRequest
+ */
+static void writePaddedBlock( cow_sub_request_t *sRequest )
+{
+	assert( ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ) + sRequest->size <= DNBD3_BLOCK_SIZE );
+	// Here, we again check if the block is written locally - there might have been a second write
+	// that wrote the full block, hence didn't have to wait for remote data and finished faster.
+	// In that case, don't pad from remote as we'd overwrite newer data.
+	if ( isBlockLocal( sRequest->cluster, sRequest->inClusterOffset ) ) {
+		logadd( LOG_INFO, "It happened!" );
+		// Still, the data of this request has to end up on disk, otherwise it would be lost
+		if ( !writeAll( cow.fdData, sRequest->writeSrc, sRequest->size,
+					sRequest->cluster->offset + sRequest->inClusterOffset ) ) {
+			sRequest->cowRequest->errorCode = errno;
+		} else {
+			sRequest->cowRequest->bytesWorkedOn += sRequest->size;
+			sRequest->cluster->timeChanged = time( NULL );
+		}
+	} else {
+		// copy write Data
+		// writeBuffer is the received data, patch data from fuse write into it
+		memcpy( sRequest->writeBuffer + ( sRequest->inClusterOffset % DNBD3_BLOCK_SIZE ), sRequest->writeSrc,
+				sRequest->size );
+		if ( !writeAll( cow.fdData, sRequest->writeBuffer, DNBD3_BLOCK_SIZE,
+					sRequest->cluster->offset + ( sRequest->inClusterOffset & ~DNBD3_BLOCK_MASK ) ) ) {
+			sRequest->cowRequest->errorCode = errno;
+		} else {
+			sRequest->cowRequest->bytesWorkedOn += sRequest->size;
+			int64_t bit = sRequest->inClusterOffset / DNBD3_BLOCK_SIZE;
+			setBitsInBitfield( sRequest->cluster->bitfield, bit, bit, true );
+			sRequest->cluster->timeChanged = time( NULL );
+		}
+	}
+
+	finishWriteRequest( sRequest->dRequest.fuse_req, sRequest->cowRequest );
+	free( sRequest );
+}
+
+/**
+ * @brief If a block does not start or finish on a multiple of DNBD3_BLOCK_SIZE, the blocks need to be
+ * padded. If this block is inside the original image size, the padding data will be read from the server.
+ * Otherwise it will be padded with 0 since it must be a block after the end of the image.
+ *
+ * If the block is already local, the data is simply written into it. If padding has
+ * to be fetched from the server, the write is finished asynchronously by
+ * writePaddedBlock() and the workCounter of cowRequest is increased accordingly.
+ *
+ * @param req fuse_req_t
+ * @param cowRequest cow_request_t
+ * @param startOffset Absolute offset where the real data starts
+ * @param endOffset Absolute offset where the real data ends
+ * @param srcBuffer pointer to the data that needs to be padded, ie. data from user space.
+ * @return false if writing, allocating or requesting the remote data failed; errno is
+ * set by the failing step.
+ */
+static bool padBlockForWrite( fuse_req_t req, cow_request_t *cowRequest,
+		off_t startOffset, off_t endOffset, const char *srcBuffer )
+{
+	// Make sure we pad exactly one block
+	endOffset = MIN( (uint64_t)endOffset, ( startOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK );
+	assert( startOffset < endOffset );
+	size_t size = (size_t)( endOffset - startOffset );
+	int l1Index = offsetToL1Index( startOffset );
+	int l2Index = offsetToL2Index( startOffset );
+	off_t inClusterOffset = startOffset % COW_DATA_CLUSTER_SIZE;
+	cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true );
+	if ( isBlockLocal( cluster, startOffset ) ) {
+		// No padding at all, keep existing data
+		bool ret = writeAll( cow.fdData, srcBuffer, size, cluster->offset + inClusterOffset );
+		if ( ret ) {
+			cowRequest->bytesWorkedOn += size;
+			cluster->timeChanged = time( NULL );
+		}
+		return ret;
+	}
+	// Not local, need some form of padding
+	createL2Table( l1Index );
+	if ( cluster == NULL ) {
+		cluster = getL2Entry( l1Index, l2Index, true );
+	}
+	uint64_t validImageSize = metadata->validRemoteSize; // As we don't lock
+	if ( startOffset >= (off_t)validImageSize ) {
+		// After end of remote valid data, pad with zeros entirely
+		char buf[DNBD3_BLOCK_SIZE] = {0};
+		off_t start = startOffset % DNBD3_BLOCK_SIZE;
+		assert( start + size <= DNBD3_BLOCK_SIZE );
+		memcpy( buf + start, srcBuffer, size );
+		bool ret = writeAll( cow.fdData, buf, DNBD3_BLOCK_SIZE,
+				cluster->offset + ( inClusterOffset & ~DNBD3_BLOCK_MASK ) );
+		if ( ret ) {
+			int64_t bit = inClusterOffset / DNBD3_BLOCK_SIZE;
+			setBitsInBitfield( cluster->bitfield, bit, bit, true );
+			cowRequest->bytesWorkedOn += size;
+			cluster->timeChanged = time( NULL );
+		}
+		return ret;
+	}
+	// Need to fetch padding from upstream, allocate struct plus one block.
+	// Zeroed, so anything past the valid remote data ends up as zeros.
+	cow_sub_request_t *sub = calloc( sizeof( *sub ) + DNBD3_BLOCK_SIZE, 1 );
+	if ( sub == NULL ) {
+		errno = ENOMEM;
+		return false;
+	}
+	sub->callback = writePaddedBlock;
+	sub->inClusterOffset = inClusterOffset;
+	sub->cluster = cluster;
+	sub->size = size;
+	sub->writeSrc = srcBuffer;
+	sub->cowRequest = cowRequest;
+	sub->buffer = sub->writeBuffer;
+
+	// The read starts at the beginning of the block, so the amount of valid remote
+	// data has to be measured from there too, not from the unaligned startOffset.
+	// Otherwise valid remote bytes at the end of the block would be replaced by zeros.
+	const uint64_t blockStart = (uint64_t)( startOffset & ~DNBD3_BLOCK_MASK );
+	sub->dRequest.length = (uint32_t)MIN( DNBD3_BLOCK_SIZE, validImageSize - blockStart );
+	sub->dRequest.offset = blockStart;
+	sub->dRequest.fuse_req = req;
+
+	atomic_fetch_add( &cowRequest->workCounter, 1 );
+
+	if ( !connection_read( &sub->dRequest ) ) {
+		free( sub );
+		errno = ENOTSOCK;
+		// Don't need to go via finishWriteRequest here since the caller will take care of error handling
+		atomic_fetch_sub( &cowRequest->workCounter, 1 );
+		return false;
+	}
+	return true;
+}
+
+/**
+ * @brief Will be called after a dnbd3_async_t is finished.
+ * Looks up the cow_sub_request_t the given request is embedded in and invokes the
+ * callback stored in it, e.g. writePaddedBlock or readRemoteCallback.
+ *
+ * @param request the finished request, must be the dRequest member of a cow_sub_request_t
+ */
+void cowfile_handleCallback( dnbd3_async_t *request )
+{
+	cow_sub_request_t *owner = container_of( request, cow_sub_request_t, dRequest );
+
+	( *owner->callback )( owner );
+}
+
+
+/**
+ * @brief called once dnbd3_async_t is finished. Increases bytesWorkedOn by the number of bytes
+ * this request had. If it was the last outstanding part of the fuse request, replies to
+ * fuse and frees the request including its read buffer. sRequest is freed in any case.
+ *
+ * @param sRequest the finished sub request
+ */
+static void readRemoteCallback( cow_sub_request_t *sRequest )
+{
+	cow_request_t *parent = sRequest->cowRequest;
+
+	atomic_fetch_add( &parent->bytesWorkedOn, sRequest->dRequest.length );
+
+	if ( atomic_fetch_sub( &parent->workCounter, 1 ) != 1 ) {
+		// Other parts are still outstanding, the last one will send the reply
+		free( sRequest );
+		return;
+	}
+
+	if ( parent->bytesWorkedOn == parent->fuseRequestSize ) {
+		fuse_reply_buf( sRequest->dRequest.fuse_req, parent->readBuffer,
+				parent->bytesWorkedOn );
+	} else {
+		// Because connection_read() will always return exactly as many bytes as requested,
+		// or simply never finish.
+		logadd( LOG_ERROR, "BUG? Pad read has invalid size. worked on: %"PRIu64", request size: %"
+				PRIu64", offset: %"PRIu64,
+				(uint64_t)parent->bytesWorkedOn,
+				(uint64_t)parent->fuseRequestSize,
+				(uint64_t)parent->fuseRequestOffset );
+		fuse_reply_err( sRequest->dRequest.fuse_req, EIO );
+	}
+	free( parent->readBuffer );
+	free( parent );
+	free( sRequest );
+}
+
+/**
+ * @brief changes the imageSize
+ *
+ * @param req fuse request
+ * @param size new size the image should have
+ * @param ino fuse_ino_t
+ * @param fi fuse_file_info
+ */
+
+void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi )
+{
+ if ( size < metadata->imageSize ) {
+ // truncate file
+ if ( size < metadata->validRemoteSize ) {
+ metadata->validRemoteSize = size;
+ }
+ } else if ( size > metadata->imageSize ) {
+ // grow file, pad with zeroes
+ off_t offset = metadata->imageSize;
+ int l1Index = offsetToL1Index( offset );
+ int l2Index = offsetToL2Index( offset );
+ int l1EndIndex = offsetToL1Index( size );
+ int l2EndIndex = offsetToL2Index( size );
+ // Special case, first cluster through which the size change passes
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false );
+ if ( cluster != NULL ) {
+ off_t inClusterOffset = offset % COW_DATA_CLUSTER_SIZE;
+ // if the new size is inside a DNBD3_BLOCK it might still contain old data before a truncate
+ if ( !IS_4K_ALIGNED( metadata->imageSize ) ) {
+ size_t sizeToWrite = DNBD3_BLOCK_SIZE - ( metadata->imageSize % DNBD3_BLOCK_SIZE );
+
+ if ( checkBit( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE ) ) {
+ char buf[DNBD3_BLOCK_SIZE] = {0};
+ ssize_t bytesWritten = pwrite( cow.fdData, buf, sizeToWrite, cluster->offset + inClusterOffset );
+
+ if ( bytesWritten < (ssize_t)sizeToWrite ) {
+ fuse_reply_err( req, bytesWritten == -1 ? errno : EIO );
+ return;
+ }
+ cluster->timeChanged = time( NULL );
+ offset += sizeToWrite;
+ }
+ }
+ // all remaining bits in cluster will get set to 0
+ inClusterOffset = offset % COW_DATA_CLUSTER_SIZE;
+ setBitsInBitfield( cluster->bitfield, inClusterOffset / DNBD3_BLOCK_SIZE,
+ ( COW_BITFIELD_SIZE * 8 ) - 1, false );
+ cluster->timeChanged = time( NULL );
+ l2Index++;
+ if ( l2Index >= COW_L2_TABLE_SIZE ) {
+ l2Index = 0;
+ l1Index++;
+ }
+ }
+ // normal case, if clusters exist, null bitfields
+ while ( l1Index < l1EndIndex || ( l1Index == l1EndIndex && l2Index <= l2EndIndex ) ) {
+ if ( cow.l1[l1Index] == -1 ) {
+ l1Index++;
+ l2Index = 0;
+ continue;
+ }
+ cluster = getL2Entry( l1Index, l2Index, false );
+ if ( cluster != NULL ) {
+ memset( cluster->bitfield, 0, COW_BITFIELD_SIZE );
+ cluster->timeChanged = time( NULL );
+ }
+ l2Index++;
+ if ( l2Index >= COW_L2_TABLE_SIZE ) {
+ l2Index = 0;
+ l1Index++;
+ }
+ }
+ }
+ metadata->imageSize = size;
+ if ( req != NULL ) {
+ image_ll_getattr( req, ino, fi );
+ }
+}
+
+/**
+ * @brief Implementation of a write request.
+ *
+ * @param req fuse_req_t
+ * @param cowRequest
+ * @param offset Offset where the write starts,
+ * @param size Size of the write.
+ */
+void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, size_t size )
+{
+ // if beyond end of file, pad with 0
+ if ( offset > (off_t)metadata->imageSize ) {
+ cowfile_setSize( NULL, offset, 0, NULL );
+ }
+
+
+ off_t currentOffset = offset;
+ off_t endOffset = offset + size;
+
+ if ( !IS_4K_ALIGNED( currentOffset ) ) {
+ // Handle case where start is not 4k aligned
+ if ( !padBlockForWrite( req, cowRequest, currentOffset, endOffset, cowRequest->writeBuffer ) ) {
+ goto fail;
+ }
+ // Move forward to next block border
+ currentOffset = ( currentOffset + DNBD3_BLOCK_SIZE ) & ~DNBD3_BLOCK_MASK;
+ }
+ if ( currentOffset < endOffset && !IS_4K_ALIGNED( endOffset ) ) {
+ // Handle case where end is not 4k aligned
+ off_t lastBlockStart = endOffset & ~DNBD3_BLOCK_MASK;
+ if ( !padBlockForWrite( req, cowRequest, lastBlockStart, endOffset,
+ cowRequest->writeBuffer + ( lastBlockStart - offset ) ) ) {
+ goto fail;
+ }
+ endOffset = lastBlockStart;
+ }
+
+ // From here on start and end are block-aligned
+ int l1Index = offsetToL1Index( currentOffset );
+ int l2Index = offsetToL2Index( currentOffset );
+ while ( currentOffset < endOffset ) {
+ if ( cow.l1[l1Index] == -1 ) {
+ createL2Table( l1Index );
+ }
+ //loop over L2 array (metadata)
+ while ( currentOffset < endOffset && l2Index < COW_L2_TABLE_SIZE ) {
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, true );
+ size_t inClusterOffset = currentOffset % COW_DATA_CLUSTER_SIZE;
+ // How many bytes we can write to this cluster before crossing a boundary,
+ // or before the write request is complete
+ size_t bytesToWriteToCluster =
+ MIN( (size_t)( endOffset - currentOffset ), COW_DATA_CLUSTER_SIZE - inClusterOffset );
+
+ if ( !writeAll( cow.fdData, cowRequest->writeBuffer + ( currentOffset - offset ),
+ bytesToWriteToCluster, cluster->offset + inClusterOffset ) ) {
+ goto fail;
+ }
+ int64_t f = inClusterOffset / DNBD3_BLOCK_SIZE;
+ int64_t t = ( inClusterOffset + bytesToWriteToCluster - 1 ) / DNBD3_BLOCK_SIZE;
+ setBitsInBitfield( cluster->bitfield, f, t, true );
+ cowRequest->bytesWorkedOn += bytesToWriteToCluster;
+ currentOffset += bytesToWriteToCluster;
+ cluster->timeChanged = time( NULL );
+ l2Index++;
+ }
+ l1Index++;
+ l2Index = 0;
+ }
+ goto success;
+
+fail:
+ if ( cowRequest->errorCode == 0 ) {
+ cowRequest->errorCode = errno != 0 ? errno : EIO;
+ }
+ // Fallthrough
+success:
+ finishWriteRequest( req, cowRequest );
+}
+
+
+/**
+ * @brief Request data, that is not available locally, via the network.
+ *
+ * @param req fuse_req_t
+ * @param offset from the start of the file
+ * @param size of data to request
+ * @param buffer into which the data is to be written
+ * @param cowRequest cow_request_t
+ */
+static void readRemote( fuse_req_t req, off_t offset, ssize_t size, char *buffer, cow_request_t *cowRequest )
+{
+ assert( offset < (off_t)metadata->validRemoteSize );
+ assert( offset + size <= (off_t)metadata->validRemoteSize );
+ if ( size == 0 )
+ return;
+ assert( size > 0 );
+ cow_sub_request_t *sRequest = malloc( sizeof( cow_sub_request_t ) );
+ sRequest->callback = readRemoteCallback;
+ sRequest->dRequest.length = (uint32_t)size;
+ sRequest->dRequest.offset = offset;
+ sRequest->dRequest.fuse_req = req;
+ sRequest->cowRequest = cowRequest;
+ sRequest->buffer = buffer;
+
+ atomic_fetch_add( &cowRequest->workCounter, 1 );
+ if ( !connection_read( &sRequest->dRequest ) ) {
+ cowRequest->errorCode = EIO;
+ free( sRequest );
+ if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) {
+ fuse_reply_err( req, EIO );
+ free( cowRequest->readBuffer );
+ free( cowRequest );
+ }
+ }
+}
+
+/**
+ * @brief Get the Block Data Source object
+ *
+ * @param block
+ * @param bitfieldOffset
+ * @param offset
+ * @return enum dataSource
+ */
+enum dataSource getBlockDataSource( cow_l2_entry_t *block, off_t bitfieldOffset, off_t offset )
+{
+ if ( block != NULL && checkBit( block->bitfield, bitfieldOffset ) ) {
+ return ds_local;
+ }
+ if ( offset >= (off_t)metadata->validRemoteSize ) {
+ return ds_zero;
+ }
+ return ds_remote;
+}
+
+/**
+ * @brief Reads data at given offset. If the data are available locally,
+ * they are read locally, otherwise they are requested remotely.
+ *
+ * @param req fuse_req_t
+ * @param size of date to read
+ * @param offset offset where the read starts.
+ * @return uint64_t Number of bytes read.
+ */
+void cowfile_read( fuse_req_t req, size_t size, off_t startOffset )
+{
+ cow_request_t *cowRequest = malloc( sizeof( cow_request_t ) );
+ cowRequest->fuseRequestSize = size;
+ cowRequest->bytesWorkedOn = ATOMIC_VAR_INIT( 0 );
+ cowRequest->workCounter = ATOMIC_VAR_INIT( 1 );
+ cowRequest->errorCode = ATOMIC_VAR_INIT( 0 );
+ cowRequest->readBuffer = calloc( size, 1 );
+ cowRequest->fuseRequestOffset = startOffset;
+ off_t lastReadOffset = -1;
+ off_t endOffset = startOffset + size;
+ off_t searchOffset = startOffset;
+ int l1Index = offsetToL1Index( startOffset );
+ int l2Index = offsetToL2Index( startOffset );
+ int bitfieldOffset = getBitfieldOffsetBit( startOffset );
+ cow_l2_entry_t *cluster = getL2Entry( l1Index, l2Index, false );
+ enum dataSource dataState = ds_invalid;
+ bool flushCurrentSpan = false; // Set if we need to read the current span and start the next one
+ bool newSourceType = true; // Set if we're starting a new span, and the source type needs to be determined
+
+ while ( searchOffset < endOffset ) {
+ if ( newSourceType ) {
+ newSourceType = false;
+ lastReadOffset = searchOffset;
+ dataState = getBlockDataSource( cluster, bitfieldOffset, searchOffset );
+ } else if ( getBlockDataSource( cluster, bitfieldOffset, searchOffset ) != dataState ) {
+ // Source type changed, obviously need to flush current span
+ flushCurrentSpan = true;
+ } else {
+ bitfieldOffset++;
+ // If reading from local cow file, crossing a cluster border means we need to flush
+ // since the next cluster might be somewhere else in the data file
+ if ( dataState == ds_local && bitfieldOffset == COW_BITFIELD_SIZE * 8 ) {
+ flushCurrentSpan = true;
+ }
+ }
+
+ // compute the absolute image offset from bitfieldOffset, l2Index and l1Index
+ // bitfieldOffset might be out of bounds here, but that doesn't matter for the calculation
+ searchOffset = DNBD3_BLOCK_SIZE * bitfieldOffset + l2Index * COW_DATA_CLUSTER_SIZE
+ + l1Index * COW_FULL_L2_TABLE_DATA_SIZE;
+ if ( flushCurrentSpan || searchOffset >= endOffset ) {
+ ssize_t spanEndOffset = MIN( searchOffset, endOffset );
+ if ( dataState == ds_remote ) {
+ if ( spanEndOffset > (ssize_t)metadata->validRemoteSize ) {
+ // Account for bytes we leave zero, because they're beyond the (truncated) original image size
+ atomic_fetch_add( &cowRequest->bytesWorkedOn, spanEndOffset - metadata->validRemoteSize );
+ spanEndOffset = metadata->validRemoteSize;
+ }
+ readRemote( req, lastReadOffset, spanEndOffset - lastReadOffset,
+ cowRequest->readBuffer + ( lastReadOffset - startOffset ), cowRequest );
+ } else if ( dataState == ds_zero ) {
+ // Past end of image, account for leaving them zero
+ ssize_t numBytes = spanEndOffset - lastReadOffset;
+ atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes );
+ } else if ( dataState == ds_local ) {
+ ssize_t numBytes = spanEndOffset - lastReadOffset;
+ // Compute the startOffset in the data file where the read starts
+ off_t localRead = cluster->offset + ( lastReadOffset % COW_DATA_CLUSTER_SIZE );
+ ssize_t totalBytesRead = 0;
+ while ( totalBytesRead < numBytes ) {
+ ssize_t bytesRead = pread( cow.fdData, cowRequest->readBuffer + ( lastReadOffset - startOffset ),
+ numBytes - totalBytesRead, localRead + totalBytesRead );
+ if ( bytesRead == -1 ) {
+ cowRequest->errorCode = errno;
+ goto fail;
+ } else if ( bytesRead == 0 ) {
+ logadd( LOG_ERROR, "EOF for read at localRead=%"PRIu64", totalBR=%"PRIu64,
+ (uint64_t)localRead, (uint64_t)totalBytesRead );
+ logadd( LOG_ERROR, "searchOffset=%"PRIu64", endOffset=%"PRIu64", imageSize=%"PRIu64,
+ searchOffset, endOffset, metadata->imageSize );
+ cowRequest->errorCode = EIO;
+ goto fail;
+ }
+ totalBytesRead += bytesRead;
+ }
+
+ atomic_fetch_add( &cowRequest->bytesWorkedOn, numBytes );
+ } else {
+ assert( 4 == 6 );
+ }
+ lastReadOffset = searchOffset;
+ flushCurrentSpan = false;
+ // Since the source type changed, reset
+ newSourceType = true;
+ }
+ if ( bitfieldOffset == COW_BITFIELD_SIZE * 8 ) {
+ // Advance to next cluster in current l2 table
+ bitfieldOffset = 0;
+ l2Index++;
+ if ( l2Index >= COW_L2_TABLE_SIZE ) {
+ // Advance to next l1 entry, reset l2 index
+ l2Index = 0;
+ l1Index++;
+ }
+ cluster = getL2Entry( l1Index, l2Index, false );
+ }
+ }
+fail:;
+ if ( atomic_fetch_sub( &cowRequest->workCounter, 1 ) == 1 ) {
+ if ( cowRequest->errorCode != 0 || cowRequest->bytesWorkedOn != size ) {
+ logadd( LOG_ERROR, "incomplete read or I/O error (errno=%d, workedOn: %"PRIu64", size: %"PRIu64")",
+ cowRequest->errorCode, (uint64_t)cowRequest->bytesWorkedOn, (uint64_t)size );
+ fuse_reply_err( req, cowRequest->errorCode != 0 ? cowRequest->errorCode : EIO );
+ } else {
+ fuse_reply_buf( req, cowRequest->readBuffer, cowRequest->bytesWorkedOn );
+ }
+ free( cowRequest->readBuffer );
+ free( cowRequest );
+ }
+}
+
+
+/**
+ * @brief stops the StatUpdater and CowUploader threads
+ * and waits for them to finish, then cleans up curl.
+ *
+ */
+void cowfile_close()
+{
+ uploadLoop = false;
+ pthread_join( tidCowUploader, NULL );
+ if ( statFile || statStdout ) {
+ // Send a signal in case it's hanging in the sleep call
+ pthread_kill( tidStatUpdater, SIGHUP );
+ pthread_join( tidStatUpdater, NULL );
+ }
+
+ curl_slist_free_all( uploadHeaders );
+ if ( curl ) {
+ curl_easy_cleanup( curl );
+ curl_global_cleanup();
+ }
+}
diff --git a/src/fuse/cowfile.h b/src/fuse/cowfile.h
new file mode 100644
index 0000000..3b1711c
--- /dev/null
+++ b/src/fuse/cowfile.h
@@ -0,0 +1,146 @@
+#ifndef _COWFILE_H_
+#define _COWFILE_H_
+
+#include "connection.h"
+
+#include <dnbd3/config/cow.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+
+// Net storage capacity of a single cluster in the data file
+#define COW_DATA_CLUSTER_SIZE ( COW_BITFIELD_SIZE * 8 * DNBD3_BLOCK_SIZE )
+// Number of entries per L2 table
+#define COW_L2_TABLE_SIZE 1024
+// Net storage capacity in data file represented by a full L2 table
+#define COW_FULL_L2_TABLE_DATA_SIZE ( COW_L2_TABLE_SIZE * COW_DATA_CLUSTER_SIZE )
+
+_Static_assert( ATOMIC_INT_LOCK_FREE == 2, "ATOMIC INT not lock free" );
+_Static_assert( ATOMIC_LONG_LOCK_FREE == 2, "ATOMIC LONG not lock free" );
+_Static_assert( ATOMIC_LLONG_LOCK_FREE == 2, "ATOMIC LLONG not lock free" );
+_Static_assert( sizeof( atomic_uint_least64_t ) == 8, "atomic_uint_least64_t not 8 byte" );
+_Static_assert( sizeof( _Atomic(uint32_t) ) == 4, "_Atomic(uint32_t) not 4 byte" );
+_Static_assert( sizeof( atomic_int_least64_t ) == 8, "atomic_int_least64_t not 8 byte" );
+
+ enum dataSource // where the data of a block is taken from, see getBlockDataSource()
+ {
+ ds_invalid, // initial value in cowfile_read() before a source has been determined
+ ds_local, // the block's bit is set in its cluster's bitfield
+ ds_remote, // not local and the offset is below validRemoteSize
+ ds_zero // not local and the offset is at or beyond validRemoteSize
+ };
+
+ #define COW_METADATA_HEADER_SIZE 320 // expected sizeof( cowfile_metadata_header_t ), enforced by the _Static_assert below
+ typedef struct cowfile_metadata_header
+ {
+ uint64_t magicValue; // 8byte
+ atomic_uint_least64_t imageSize; // 8byte, current size of the image, changed by cowfile_setSize()
+ int32_t version; // 4byte
+ int32_t blocksize; // 4byte
+ uint64_t validRemoteSize; // 8byte, blocks at or beyond this offset are treated as zero instead of being fetched remotely; lowered on truncate
+ uint32_t startL1; // 4byte
+ uint32_t startL2; // 4byte
+ int32_t bitfieldSize; // 4byte
+ int32_t nextL2; // 4byte
+ atomic_int_least64_t metaSize; // 8byte
+ atomic_int_least64_t nextClusterOffset; // 8byte
+ uint64_t maxImageSize; // 8byte
+ uint64_t creationTime; // 8byte
+ char uuid[40]; // 40byte
+ char imageName[200]; // 200byte
+ } cowfile_metadata_header_t;
+ _Static_assert( sizeof( cowfile_metadata_header_t ) == COW_METADATA_HEADER_SIZE,
+ "cowfile_metadata_header is messed up" );
+
+ #define COW_L2_ENTRY_SIZE 64 // expected sizeof( cow_l2_entry_t ), enforced by the _Static_assert below
+ typedef struct cow_l2_entry
+ {
+ atomic_int_least64_t offset; // position of this cluster's data in the data file (used as pread/pwrite base)
+ atomic_int_least64_t timeChanged; // set to time( NULL ) whenever the cluster's data or bitfield is modified
+ _Atomic(uint32_t) uploads; // NOTE(review): presumably counts uploads of this cluster - confirm against the uploader code
+ _Atomic(uint32_t) fails; // NOTE(review): presumably counts failed uploads - confirm against the uploader code
+ atomic_uchar bitfield[COW_BITFIELD_SIZE]; // one bit per DNBD3 block; a set bit means the block is read from the local data file
+ } cow_l2_entry_t;
+ _Static_assert( sizeof( cow_l2_entry_t ) == COW_L2_ENTRY_SIZE, "cow_l2_entry_t is messed up" );
+
+/**
+ * Open request for reading/writing the virtual image we expose.
+ */
+typedef struct cow_request
+{
+ size_t fuseRequestSize; // Number of bytes to be read/written
+ off_t fuseRequestOffset; // Absolute offset into the image, as seen by user space
+ char *readBuffer; // Used only in read case
+ const char *writeBuffer; // Used only in write case
+ atomic_size_t bytesWorkedOn; // Used for tracking how many bytes we have touched (exluding padding etc)
+ atomic_int workCounter; // How many pending sub requests (see below)
+ atomic_int errorCode; // For reporting back to fuse
+ fuse_ino_t ino; // Inode of file, used for ??? (For reporting back to fuse, dont know if needed?)
+ struct fuse_file_info *fi; // Used for ??? (For reporting back to fuse, dont know if needed?)
+ //fuse_req_t req; // Fuse request
+} cow_request_t;
+
+typedef struct cow_sub_request cow_sub_request_t;
+typedef void ( *cow_callback )( cow_sub_request_t *sRequest );
+
+/**
+ * A sub-request for above, which needs to be completed successfully
+ * before the parent cow_request can be completed.
+ * TODO Please verify field comments
+ */
+typedef struct cow_sub_request
+{
+ size_t size; // size of this sub-request
+ off_t inClusterOffset; // offset relative to the beginning of the cluster
+ const char *writeSrc; // pointer to the data of a write request which needs padding
+ char *buffer; // The pointer points to the original read buffer to the place where the sub read request should be copied to.
+ cow_l2_entry_t *cluster; // the cluster inClusterOffset refers to
+ cow_callback callback; // Callback when we're done handling this
+ cow_request_t *cowRequest; // parent request
+ dnbd3_async_t dRequest; // Probably request to dnbd3-server for non-aligned writes (wrt 4k dnbd3 block)
+ char writeBuffer[]; // buffer for a padding write request, gets filled from a remote read, then the writeSrc data gets copied into it.
+} cow_sub_request_t;
+
+ typedef struct cow_curl_read_upload // NOTE(review): not used in the code reviewed here; judging by the name, per-upload state for curl - confirm against the uploader code
+ {
+ atomic_uint_least64_t time;
+ cow_l2_entry_t *cluster; // presumably the cluster being uploaded - confirm
+ size_t position;
+ uint64_t clusterNumber;
+ int64_t ulLast;
+ int retryTime;
+ atomic_uchar bitfield[COW_BITFIELD_SIZE]; // same size as cow_l2_entry_t.bitfield
+ char replyBuffer[500];
+ } cow_curl_read_upload_t;
+
+
+ typedef struct cow_cluster_statistics // NOTE(review): not used in the code reviewed here; presumably filled for the statistics output - confirm
+ {
+ uint64_t clusterNumber;
+ uint64_t uploads;
+ } cow_cluster_statistics_t;
+
+typedef int32_t l1;
+typedef cow_l2_entry_t l2[COW_L2_TABLE_SIZE];
+
+ bool cowfile_init( char *path, const char *image_Name, uint16_t imageVersion, atomic_uint_fast64_t **imageSizePtr,
+ char *serverAddress, bool sStdout, bool sFile, const char *cowUuid );
+
+ bool cowfile_load( char *path, atomic_uint_fast64_t **imageSizePtr, char *serverAddress, bool sStdout, bool sFile, const char *cowUuid );
+ bool cowfile_startBackgroundThreads();
+ void cowfile_read( fuse_req_t req, size_t size, off_t offset ); // answers req itself via fuse_reply_buf/fuse_reply_err, possibly later from a read callback
+
+ void cowfile_write( fuse_req_t req, cow_request_t *cowRequest, off_t offset, size_t size ); // always ends by calling finishWriteRequest( req, cowRequest )
+
+ void cowfile_handleCallback( dnbd3_async_t *request ); // request must be the dRequest member of a cow_sub_request_t
+
+ void cowfile_setSize( fuse_req_t req, size_t size, fuse_ino_t ino, struct fuse_file_info *fi ); // req may be NULL (cowfile_write passes NULL), then no attributes are replied
+
+ void readRemoteData( cow_sub_request_t *sRequest ); // NOTE(review): no function of this name is visible in the reviewed part of cowfile.c - confirm it is still defined
+
+ int cow_printStats( char *buffer, const size_t len );
+
+ void cowfile_close(); // joins the background threads and cleans up curl
+
+ #endif /* COWFILE_H_ */
+
+#endif /* COWFILE_H_ */
diff --git a/src/fuse/helper.c b/src/fuse/helper.c
index d81b08f..f54073b 100644
--- a/src/fuse/helper.c
+++ b/src/fuse/helper.c
@@ -18,8 +18,8 @@ void printLog( log_info *info )
}
//rewind(file);
- fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", ( uint64_t )( info->imageSize/ ( 1024ll*1024ll ) ) );
- fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", ( uint64_t )( info->receivedBytes/ ( 1024ll*1024ll ) ) );
+ fprintf( logFile, "ImageSize: %"PRIu64" MiB\n", (uint64_t)( info->imageSize/ ( 1024ll*1024ll ) ) );
+ fprintf( logFile, "ReceivedMiB: %"PRIu64" MiB\n", (uint64_t)( info->receivedBytes/ ( 1024ll*1024ll ) ) );
fprintf( logFile, "imageBlockCount: %"PRIu64"\n", info->imageBlockCount );
fprintf( logFile, "Blocksize: 4KiB\n\n" );
fprintf( logFile, "Block access count:\n" );
@@ -29,7 +29,7 @@ void printLog( log_info *info )
if ( i % 50 == 0 ) {
fprintf( logFile, "\n" );
}
- fprintf( logFile, "%i ", ( int ) info->blockRequestCount[i] );
+ fprintf( logFile, "%i ", (int) info->blockRequestCount[i] );
}
fprintf( logFile, "\n" );
fclose( logFile );
diff --git a/src/fuse/helper.h b/src/fuse/helper.h
index 9e5d127..b1fa513 100644
--- a/src/fuse/helper.h
+++ b/src/fuse/helper.h
@@ -1,7 +1,7 @@
#ifndef IMAGEHELPER_H
#define IMAGEHELPER_H
-#include "../types.h"
+#include <dnbd3/types.h>
#include <netdb.h>
#include <stdbool.h>
@@ -18,18 +18,18 @@ typedef struct log_info {
-void printLog(log_info *info);
+void printLog( log_info *info );
-int connect_to_server(char *server_adress, int port);
+int connect_to_server( char *server_adress, int port );
-static inline bool isSameAddressPort(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+static inline bool isSameAddressPort( const dnbd3_host_t * const a, const dnbd3_host_t * const b )
{
- return (a->type == b->type) && (a->port == b->port) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) ));
+ return ( a->type == b->type ) && ( a->port == b->port ) && ( 0 == memcmp( a->addr, b->addr, ( a->type == HOST_IP4 ? 4 : 16 ) ) );
}
-static inline bool isSameAddress(const dnbd3_host_t * const a, const dnbd3_host_t * const b)
+static inline bool isSameAddress( const dnbd3_host_t * const a, const dnbd3_host_t * const b )
{
- return (a->type == b->type) && (0 == memcmp( a->addr, b->addr, (a->type == HOST_IP4 ? 4 : 16) ));
+ return ( a->type == b->type ) && ( 0 == memcmp( a->addr, b->addr, ( a->type == HOST_IP4 ? 4 : 16 ) ) );
}
#endif
diff --git a/src/fuse/main.c b/src/fuse/main.c
index 1a5643c..13dd168 100644
--- a/src/fuse/main.c
+++ b/src/fuse/main.c
@@ -5,271 +5,359 @@
* See the file COPYING.
*
* Changed by Stephan Schwaer
+ * FUSE lowlevel by Alan Reichert
* */
+#include "main.h"
+#include "cowfile.h"
#include "connection.h"
#include "helper.h"
-#include "../shared/protocol.h"
-#include "../shared/log.h"
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/config.h>
-#define FUSE_USE_VERSION 30
-#include <fuse.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
/* for printing uint */
-#define __STDC_FORMAT_MACROS
+//#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <getopt.h>
#include <time.h>
#include <signal.h>
#include <pthread.h>
-
#define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
-static const char * const IMAGE_PATH = "/img";
-static const char * const STATS_PATH = "/status";
+#define INO_ROOT (1)
+#define INO_STATS (2)
+#define INO_IMAGE (3)
+
+static const char *IMAGE_NAME = "img";
+static const char *STATS_NAME = "status";
+
+static struct fuse_session *_fuseSession = NULL;
+bool useCow = false;
+bool cow_merge_after_upload = false;
+static atomic_uint_fast64_t imageSize;
+static atomic_uint_fast64_t *imageSizePtr =&imageSize;
-static uint64_t imageSize;
/* Debug/Benchmark variables */
static bool useDebug = false;
static log_info logInfo;
static struct timespec startupTime;
static uid_t owner;
-static bool keepRunning = true;
-static void (*fuse_sigIntHandler)(int) = NULL;
-static void (*fuse_sigTermHandler)(int) = NULL;
-static struct fuse_operations dnbd3_fuse_no_operations;
-
-#define SIGPOOLSIZE 6
-static pthread_spinlock_t sigLock;
-static dnbd3_signal_t *signalPool[SIGPOOLSIZE];
-static dnbd3_signal_t **sigEnd = signalPool + SIGPOOLSIZE;
-static void signalInit()
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize );
+static void fillStatsFile( fuse_req_t req, size_t size, off_t offset );
+static void image_destroy( void *private_data );
+static void image_ll_init( void *userdata, struct fuse_conn_info *conn );
+static void image_ll_lookup( fuse_req_t req, fuse_ino_t parent, const char *name );
+static void image_ll_open( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+static void image_ll_readdir( fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi );
+static void image_ll_read( fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi );
+static void image_ll_write( fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t off, struct fuse_file_info *fi );
+static void image_ll_setattr( fuse_req_t req, fuse_ino_t ino, struct stat *attr, int to_set, struct fuse_file_info *fi );
+static int image_stat( fuse_ino_t ino, struct stat *stbuf );
+static void printUsage( char *argv0, int exitCode );
+static void printVersion();
+
+static int image_stat( fuse_ino_t ino, struct stat *stbuf )
{
- pthread_spin_init( &sigLock, PTHREAD_PROCESS_PRIVATE );
- for ( size_t i = 0; i < SIGPOOLSIZE; ++i ) {
- signalPool[i] = NULL;
+ switch ( ino ) {
+ case INO_ROOT:
+ stbuf->st_mode = S_IFDIR | 0550;
+ if( useCow ) {
+ stbuf->st_mode = S_IFDIR | 0770;
+ }
+ stbuf->st_nlink = 2;
+ stbuf->st_mtim = startupTime;
+ break;
+ case INO_IMAGE:
+ if ( useCow ) {
+ stbuf->st_mode = S_IFREG | 0660;
+ } else {
+ stbuf->st_mode = S_IFREG | 0440;
+ }
+ stbuf->st_nlink = 1;
+ stbuf->st_size = *imageSizePtr;
+ stbuf->st_mtim = startupTime;
+ break;
+ case INO_STATS:
+ stbuf->st_mode = S_IFREG | 0440;
+ stbuf->st_nlink = 1;
+ stbuf->st_size = 4096;
+ clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+ break;
+ default:
+ return -1;
}
+ stbuf->st_ctim = stbuf->st_atim = startupTime;
+ stbuf->st_uid = owner;
+ stbuf->st_ino = ino;
+ return 0;
}
-static inline dnbd3_signal_t *signalGet()
+
+void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi )
{
- pthread_spin_lock( &sigLock );
- for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) {
- if ( *it != NULL ) {
- dnbd3_signal_t *ret = *it;
- *it = NULL;
- pthread_spin_unlock( &sigLock );
- return ret;
- }
+ struct stat stbuf = { 0 };
+ ( void ) fi;
+
+ if ( image_stat( ino, &stbuf ) == -1 ) {
+ fuse_reply_err( req, ENOENT );
+ } else {
+ fuse_reply_attr( req, &stbuf, ino == INO_IMAGE ? 1200 : 1 ); // seconds validity timeout
}
- pthread_spin_unlock( &sigLock );
- return signal_newBlocking();
}
-static inline void signalPut(dnbd3_signal_t *signal)
+
+static void image_ll_lookup( fuse_req_t req, fuse_ino_t parent, const char *name )
{
- pthread_spin_lock( &sigLock );
- for ( dnbd3_signal_t **it = signalPool; it < sigEnd; ++it ) {
- if ( *it == NULL ) {
- *it = signal;
- pthread_spin_unlock( &sigLock );
+ ( void )parent;
+
+ if ( strcmp( name, IMAGE_NAME ) == 0 || strcmp( name, STATS_NAME ) == 0 ) {
+ struct fuse_entry_param e = { 0 };
+ if ( strcmp( name, IMAGE_NAME ) == 0 ) {
+ e.ino = INO_IMAGE;
+ e.attr_timeout = e.entry_timeout = 1200;
+ } else {
+ e.ino = INO_STATS;
+ e.attr_timeout = e.entry_timeout = 0;
+ }
+ if ( image_stat( e.ino, &e.attr ) == 0 ) {
+ fuse_reply_entry( req, &e );
return;
}
}
- pthread_spin_unlock( &sigLock );
- signal_close( signal );
+ fuse_reply_err( req, ENOENT );
}
-static int image_getattr(const char *path, struct stat *stbuf)
+struct dirbuf {
+ char *p;
+ size_t size;
+};
+
+static void dirbuf_add( fuse_req_t req, struct dirbuf *b, const char *name, fuse_ino_t ino )
{
- int res = 0;
- memset( stbuf, 0, sizeof( struct stat ) );
- stbuf->st_ctim = stbuf->st_atim = stbuf->st_mtim = startupTime;
- stbuf->st_uid = owner;
- if ( strcmp( path, "/" ) == 0 ) {
- stbuf->st_mode = S_IFDIR | 0550;
- stbuf->st_nlink = 2;
- } else if ( strcmp( path, IMAGE_PATH ) == 0 ) {
- stbuf->st_mode = S_IFREG | 0440;
- stbuf->st_nlink = 1;
- stbuf->st_size = imageSize;
- } else if ( strcmp( path, STATS_PATH ) == 0 ) {
- stbuf->st_mode = S_IFREG | 0440;
- stbuf->st_nlink = 1;
- stbuf->st_size = 4096;
- clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
- } else {
- res = -ENOENT;
- }
- return res;
+ struct stat stbuf = { .st_ino = ino };
+ size_t oldsize = b->size;
+ b->size += fuse_add_direntry( req, NULL, 0, name, NULL, 0 );
+ b->p = ( char * ) realloc( b->p, b->size );
+ fuse_add_direntry( req, b->p + oldsize, b->size - oldsize, name, &stbuf, b->size );
+ return;
}
-static int image_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset UNUSED, struct fuse_file_info *fi UNUSED)
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize )
{
- if ( strcmp( path, "/" ) != 0 ) {
- return -ENOENT;
+ if ( off >= 0 && off < (off_t)bufsize ) {
+ return fuse_reply_buf( req, buf + off, MIN( bufsize - off, maxsize ) );
}
- filler( buf, ".", NULL, 0 );
- filler( buf, "..", NULL, 0 );
- filler( buf, IMAGE_PATH + 1, NULL, 0 );
- filler( buf, STATS_PATH + 1, NULL, 0 );
- return 0;
+ return fuse_reply_buf( req, NULL, 0 );
}
-static int image_open(const char *path, struct fuse_file_info *fi)
+static void image_ll_readdir( fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi )
{
- if ( strcmp( path, IMAGE_PATH ) != 0 && strcmp( path, STATS_PATH ) != 0 ) {
- return -ENOENT;
- }
- if ( ( fi->flags & 3 ) != O_RDONLY ) {
- return -EACCES;
+ ( void ) fi;
+
+ if ( ino != INO_ROOT ) {
+ fuse_reply_err( req, ENOTDIR );
+ } else {
+ struct dirbuf b;
+ memset( &b, 0, sizeof( b ) );
+ dirbuf_add( req, &b, ".", INO_ROOT );
+ dirbuf_add( req, &b, "..", INO_ROOT );
+ dirbuf_add( req, &b, IMAGE_NAME, INO_IMAGE );
+ dirbuf_add( req, &b, STATS_NAME, INO_STATS );
+ reply_buf_limited( req, b.p, b.size, off, size );
+ free( b.p );
}
- return 0;
}
-static int fillStatsFile(char *buf, size_t size, off_t offset) {
- if ( offset == 0 ) {
- return (int)connection_printStats( buf, size );
+static void image_ll_open( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi )
+{
+ if ( ino != INO_IMAGE && ino != INO_STATS ) {
+ fuse_reply_err( req, EISDIR );
+ } else if ( ( fi->flags & 3 ) != O_RDONLY && !useCow ) {
+ fuse_reply_err( req, EACCES );
+ } else {
+ // auto caching
+ fi->keep_cache = 1;
+ fuse_reply_open( req, fi );
}
+}
+
+static void fillStatsFile( fuse_req_t req, size_t size, off_t offset ) {
char buffer[4096];
int ret = (int)connection_printStats( buffer, sizeof buffer );
int len = MIN( ret - (int)offset, (int)size );
- if ( len == 0 )
- return 0;
if ( len < 0 ) {
- return -EOF;
+ fuse_reply_err( req, 0 );
+ return;
}
- memcpy( buf, buffer + offset, len );
- return len;
+ fuse_reply_buf( req, buffer + offset, len );
}
-static int image_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi UNUSED)
+/**
+ * Low-level FUSE read handler for the stats file and the image file.
+ *
+ * Early-out paths (stats file, empty/oversized/out-of-range reads, allocation
+ * or submit failure) reply to req directly. Otherwise req is handed on to
+ * cowfile_read() or connection_read(); presumably those reply (and release
+ * the request buffer) once the data is available - TODO confirm there.
+ *
+ * @param req    FUSE request to answer
+ * @param ino    inode to read; must be INO_STATS or INO_IMAGE
+ * @param size   number of bytes requested; clamped to the end of the image
+ * @param offset byte offset at which to start reading
+ * @param fi     unused
+ */
+static void image_ll_read( fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, struct fuse_file_info *fi )
{
- if ( size > __INT_MAX__ ) {
- // fuse docs say we MUST fill the buffer with exactly size bytes and return size,
- // otherwise the buffer will we padded with zeros. Since the return value is just
- // an int, we could not properly fulfill read requests > 2GB. Since there is no
- // mention of a guarantee that this will never happen, better add a safety check.
- // Way to go fuse.
- return -EIO;
+ assert( ino == INO_STATS || ino == INO_IMAGE );
+
+ ( void )fi;
+
+ if ( ino == INO_STATS ) {
+ fillStatsFile( req, size, offset );
+ return;
}
- if ( path[1] == STATS_PATH[1] ) {
- return fillStatsFile( buf, size, offset );
+
+ if ( size == 0 || size > UINT32_MAX ) {
+ fuse_reply_err( req, 0 );
+ return;
}
- if ( (uint64_t)offset >= imageSize ) {
- return 0;
+ if ( (uint64_t)offset >= *imageSizePtr ) {
+ fuse_reply_err( req, 0 );
+ return;
+ }
+ if ( offset + size > *imageSizePtr ) {
+ size = *imageSizePtr - offset;
}
- if ( offset + size > imageSize ) {
- size = imageSize - offset;
+ if ( useCow ) {
+ cowfile_read(req, size, offset);
+ return;
}
if ( useDebug ) {
- /* count the requested blocks */
uint64_t startBlock = offset / ( 4096 );
const uint64_t endBlock = ( offset + size - 1 ) / ( 4096 );
- for ( ; startBlock <= endBlock; startBlock++ ) {
+ for ( ; startBlock <= endBlock; startBlock++ )
+ {
++logInfo.blockRequestCount[startBlock];
}
}
- dnbd3_async_t request;
- request.buffer = buf;
- request.length = (uint32_t)size;
- request.offset = offset;
- request.signal = signalGet();
- if ( !connection_read( &request ) ) {
- signalPut( request.signal );
- return -EINVAL;
- }
- while ( !request.finished ) {
- int ret = signal_wait( request.signal, 5000 );
- if ( !keepRunning ) {
- connection_close();
- break;
- }
- if ( ret < 0 ) {
- debugf( "fuse_read signal wait returned %d", ret );
- }
- }
- signalPut( request.signal );
- if ( request.success ) {
- return request.length;
- } else {
- return -EIO;
+ dnbd3_async_parent_t *parent = malloc( sizeof(dnbd3_async_parent_t) + size );
+ if ( parent == NULL ) {
+ // Allocation failed: answer the request instead of dereferencing NULL below
+ fuse_reply_err( req, ENOMEM );
+ return;
+ }
+ parent->request.length = (uint32_t)size;
+ parent->request.offset = offset;
+ parent->request.fuse_req = req;
+
+ if ( !connection_read( &parent->request ) ) {
+ fuse_reply_err( req, EIO );
+ free( parent );
}
}
-static void image_sigHandler(int signum) {
- keepRunning = false;
- if ( signum == SIGINT && fuse_sigIntHandler != NULL ) {
- fuse_sigIntHandler(signum);
- }
- if ( signum == SIGTERM && fuse_sigTermHandler != NULL ) {
- fuse_sigTermHandler(signum);
- }
+static void noopSigHandler( int signum )
+{
+ (void)signum;
}
-static void* image_init(struct fuse_conn_info *conn UNUSED)
+static void image_ll_init( void *userdata UNUSED, struct fuse_conn_info *conn UNUSED )
{
+ ( void ) userdata;
+ ( void ) conn;
if ( !connection_initThreads() ) {
logadd( LOG_ERROR, "Could not initialize threads for dnbd3 connection, exiting..." );
- exit( EXIT_FAILURE );
- }
- // Prepare our handler
- struct sigaction newHandler;
- memset( &newHandler, 0, sizeof(newHandler) );
- newHandler.sa_handler = &image_sigHandler;
- sigemptyset( &newHandler.sa_mask );
- struct sigaction oldHandler;
- // Retrieve old handlers when setting
- sigaction( SIGINT, &newHandler, &oldHandler );
- fuse_sigIntHandler = oldHandler.sa_handler;
- logadd( LOG_DEBUG1, "Previous SIGINT handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
- sigaction( SIGTERM, &newHandler, &oldHandler );
- fuse_sigTermHandler = oldHandler.sa_handler;
- logadd( LOG_DEBUG1, "Previous SIGTERM handler was %p", (void*)(uintptr_t)fuse_sigIntHandler );
- return NULL;
+ if ( _fuseSession != NULL ) {
+ fuse_session_exit( _fuseSession );
+ }
+ }
}
/* close the connection */
-static void image_destroy(void *private_data UNUSED)
+static void image_destroy( void *private_data UNUSED )
{
if ( useDebug ) {
printLog( &logInfo );
}
connection_close();
- return;
+}
+
+
+/**
+ * Low-level FUSE write handler (only registered in image_oper_cow).
+ *
+ * Writes to the stats file are rejected with EACCES. For the image, a
+ * cow_request_t describing the write is allocated and passed to
+ * cowfile_write(); presumably cowfile_write() replies to req and releases
+ * the request when it is done - TODO confirm in cowfile.c.
+ *
+ * @param req  FUSE request to answer
+ * @param ino  inode to write; must be INO_STATS or INO_IMAGE
+ * @param buf  data to write
+ * @param size number of bytes in buf
+ * @param off  byte offset at which to start writing
+ * @param fi   unused
+ */
+static void image_ll_write( fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t off, struct fuse_file_info *fi )
+{
+ assert( ino == INO_STATS || ino == INO_IMAGE );
+
+ ( void )fi;
+
+ if ( ino == INO_STATS ) {
+ fuse_reply_err( req, EACCES );
+ return;
+ }
+
+ cow_request_t* cowRequest = malloc(sizeof(cow_request_t));
+ if ( cowRequest == NULL ) {
+ // Allocation failed: answer the request instead of dereferencing NULL below
+ fuse_reply_err( req, ENOMEM );
+ return;
+ }
+ // Plain assignments: ATOMIC_VAR_INIT is meant for initializers only and is deprecated since C17
+ cowRequest->fuseRequestSize = size;
+ cowRequest->workCounter = 1;
+ cowRequest->writeBuffer = buf;
+ cowRequest->readBuffer = NULL;
+ cowRequest->errorCode = 0;
+ cowRequest->fuseRequestOffset = off;
+ cowRequest->bytesWorkedOn = 0;
+ cowfile_write(req, cowRequest, off, size);
+}
+
+static void image_ll_setattr( fuse_req_t req, fuse_ino_t ino, struct stat *attr, int to_set, struct fuse_file_info *fi )
+{
+ if ( ino != INO_IMAGE ) {
+ fuse_reply_err( req, EACCES );
+ return;
+ }
+ if (to_set & FUSE_SET_ATTR_SIZE) {
+ cowfile_setSize( req, attr->st_size, ino, fi);
+ return;
+ }
+ fuse_reply_err( req, EACCES );
}
/* map the implemented fuse operations */
-static struct fuse_operations image_oper = {
- .getattr = image_getattr,
- .readdir = image_readdir,
- .open = image_open,
- .read = image_read,
- .init = image_init,
+static struct fuse_lowlevel_ops image_oper = {
+ .lookup = image_ll_lookup,
+ .getattr = image_ll_getattr,
+ .readdir = image_ll_readdir,
+ .open = image_ll_open,
+ .read = image_ll_read,
+ .init = image_ll_init,
+ .destroy = image_destroy,
+};
+
+/* map the implemented fuse operations with copy on write */
+static struct fuse_lowlevel_ops image_oper_cow = {
+ .lookup = image_ll_lookup,
+ .getattr = image_ll_getattr,
+ .readdir = image_ll_readdir,
+ .open = image_ll_open,
+ .read = image_ll_read,
+ .init = image_ll_init,
.destroy = image_destroy,
+ .write = image_ll_write,
+ .setattr = image_ll_setattr,
};
+
static void printVersion()
{
char *arg[] = { "foo", "-V" };
- printf( "DNBD3-Fuse Version 1.2.3.4, protocol version %d\n", (int)PROTOCOL_VERSION );
- fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ printf( "dnbd3-fuse version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
+ printf( "Protocol version: %d\n", (int)PROTOCOL_VERSION );
+ struct fuse_args args = FUSE_ARGS_INIT( 2, arg );
+ fuse_parse_cmdline( &args, NULL, NULL, NULL );
exit( 0 );
}
-static void printUsage(char *argv0, int exitCode)
+static void printUsage( char *argv0, int exitCode )
{
char *arg[] = { argv0, "-h" };
- fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
+ struct fuse_args args = FUSE_ARGS_INIT( 2, arg );
+ fuse_parse_cmdline( &args, NULL, NULL, NULL );
printf( "\n" );
- printf( "Usage: %s [--debug] [--option mountOpts] --host <serverAddress(es)> --image <imageName> [--rid revision] <mountPoint>\n", argv0 );
- printf( "Or: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] <mountPoint>\n", argv0 );
+ printf( "Usage: %s [--debug] [--option mountOpts] --host <serverAddress(es)> --image <imageName> [--rid revision] <mountPoint>\n", argv0 );
+ printf( "Or: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] <mountPoint>\n", argv0 );
+ printf( "For cow: %s [-d] [-o mountOpts] -h <serverAddress(es)> -i <imageName> [-r revision] -c <path> -C <cowServerAddress> -m [--cow-stats-stdout] [--cow-stats-file] <mountPoint>\n", argv0 );
printf( " -d --debug Don't fork, write stats file, and print debug output (fuse -> stderr, dnbd3 -> stdout)\n" );
printf( " -f Don't fork (dnbd3 -> stdout)\n" );
printf( " -h --host List of space separated hosts to use\n" );
@@ -279,26 +367,41 @@ static void printUsage(char *argv0, int exitCode)
printf( " -r --rid Revision to use (omit or pass 0 for latest)\n" );
printf( " -S --sticky Use only servers from command line (no learning from servers)\n" );
printf( " -s Single threaded mode\n" );
+ printf( " -c Enables cow, creates the cow files at given location\n" );
+ printf( " -L Loads the cow files from the given location\n" );
+ printf( " -C Host address of the cow server\n" );
+ printf( "--upload-uuid <id> Use provided UUID as upload session id instead of asking server/loading from file\n" );
+ printf( "--cow-stats-stdout prints the cow status in stdout\n" );
+ printf( "--cow-stats-file creates and updates the cow status file\n" );
+ printf( " -m --merge tell server to merge and create new revision on exit\n" );
exit( exitCode );
}
-static const char *optString = "dfHh:i:l:o:r:SsVv";
+static const char *optString = "dfHh:i:l:o:r:SsVvc:L:C:m";
static const struct option longOpts[] = {
- { "debug", no_argument, NULL, 'd' },
- { "help", no_argument, NULL, 'H' },
- { "host", required_argument, NULL, 'h' },
- { "image", required_argument, NULL, 'i' },
- { "log", required_argument, NULL, 'l' },
- { "option", required_argument, NULL, 'o' },
- { "rid", required_argument, NULL, 'r' },
- { "sticky", no_argument, NULL, 'S' },
- { "version", no_argument, NULL, 'v' },
- { 0, 0, 0, 0 }
+ { "debug", no_argument, NULL, 'd' },
+ { "help", no_argument, NULL, 'H' },
+ { "host", required_argument, NULL, 'h' },
+ { "image", required_argument, NULL, 'i' },
+ { "log", required_argument, NULL, 'l' },
+ { "option", required_argument, NULL, 'o' },
+ { "rid", required_argument, NULL, 'r' },
+ { "sticky", no_argument, NULL, 'S' },
+ { "version", no_argument, NULL, 'v' },
+ { "cow", required_argument, NULL, 'c' },
+ { "loadcow", required_argument, NULL, 'L' },
+ { "cowServer", required_argument, NULL, 'C' },
+ { "merge", no_argument, NULL, 'm' },
+ { "upload-uuid", required_argument, NULL, 'uuid' },
+ { "cow-stats-stdout", no_argument, NULL, 'sout' },
+ { "cow-stats-file", no_argument, NULL, 'sfil' },
+ { 0, 0, 0, 0 }
};
-int main(int argc, char *argv[])
+int main( int argc, char *argv[] )
{
char *server_address = NULL;
+ char *cow_server_address = NULL;
char *image_Name = NULL;
char *log_file = NULL;
uint16_t rid = 0;
@@ -306,6 +409,17 @@ int main(int argc, char *argv[])
int newArgc;
int opt, lidx;
bool learnNewServers = true;
+ bool single_thread = false;
+ struct fuse_chan *ch;
+ char *mountpoint;
+ int foreground = 0;
+ char *cow_file_path = NULL;
+ bool loadCow = false;
+ bool sStdout = false;
+ bool sFile = false;
+ const char *cowUuidOverride = NULL;
+
+ log_init();
if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
printUsage( argv[0], 0 );
@@ -316,9 +430,10 @@ int main(int argc, char *argv[])
log_setConsoleTimestamps( true );
log_setFileMask( 65535 );
- newArgv = calloc( argc + 10, sizeof(char*) );
+ newArgv = calloc( argc + 10, sizeof( char* ) );
newArgv[0] = argv[0];
newArgc = 1;
+
while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
switch ( opt ) {
case 'h':
@@ -328,7 +443,7 @@ int main(int argc, char *argv[])
image_Name = optarg;
break;
case 'r':
- rid = (uint16_t)atoi(optarg);
+ rid = (uint16_t)atoi( optarg );
break;
case 'o':
newArgv[newArgc++] = "-o";
@@ -357,15 +472,40 @@ int main(int argc, char *argv[])
case 'd':
useDebug = true;
newArgv[newArgc++] = "-d";
+ foreground = 1;
break;
case 's':
- newArgv[newArgc++] = "-s";
+ single_thread = true;
break;
case 'S':
learnNewServers = false;
break;
case 'f':
- newArgv[newArgc++] = "-f";
+ foreground = 1;
+ break;
+ case 'c':
+ cow_file_path = optarg;
+ useCow = true;
+ break;
+ case 'C':
+ cow_server_address = optarg;
+ break;
+ case 'm':
+ cow_merge_after_upload = true;
+ break;
+ case 'L':
+ cow_file_path = optarg;
+ useCow = true;
+ loadCow = true;
+ break;
+ case 'sout':
+ sStdout = true;
+ break;
+ case 'sfil':
+ sFile = true;
+ break;
+ case 'uuid':
+ cowUuidOverride = optarg;
break;
default:
printUsage( argv[0], EXIT_FAILURE );
@@ -385,6 +525,37 @@ int main(int argc, char *argv[])
logadd( LOG_WARNING, "Could not open log file at '%s'", log_file );
}
}
+ if( useCow && cow_server_address == NULL ) {
+ printf( "for -c you also need a cow server address. Please also use -C\n" );
+ printUsage( argv[0], EXIT_FAILURE );
+ }
+ if( cow_merge_after_upload && !useCow ) {
+ printf( "-m only works if cow is enabled. \n" );
+ printUsage( argv[0], EXIT_FAILURE );
+ }
+ if ( loadCow ) {
+ if( cow_server_address == NULL ) {
+ printf( "for -L you also need a cow server address. Please also use -C\n" );
+ printUsage( argv[0], EXIT_FAILURE );
+ }
+
+ if ( !cowfile_load( cow_file_path, &imageSizePtr, cow_server_address, sStdout, sFile, cowUuidOverride ) ) {
+ return EXIT_FAILURE;
+ }
+ }
+ do {
+ // The empty handler prevents fuse from registering its own handler
+ struct sigaction newHandler = { .sa_handler = &noopSigHandler };
+ sigemptyset( &newHandler.sa_mask );
+ sigaction( SIGHUP, &newHandler, NULL );
+ sigaction( SIGQUIT, &newHandler, NULL );
+ } while ( 0 );
+ if ( useCow ) {
+ sigset_t sigmask;
+ sigemptyset( &sigmask );
+ sigaddset( &sigmask, SIGQUIT ); // Block here and unblock in cow as abort signal
+ pthread_sigmask( SIG_BLOCK, &sigmask, NULL );
+ }
if ( !connection_init( server_address, image_Name, rid, learnNewServers ) ) {
logadd( LOG_ERROR, "Could not connect to any server. Bye.\n" );
@@ -394,27 +565,82 @@ int main(int argc, char *argv[])
/* initialize benchmark variables */
logInfo.receivedBytes = 0;
- logInfo.imageSize = imageSize;
- logInfo.imageBlockCount = ( imageSize + 4095 ) / 4096;
+ logInfo.imageSize = *imageSizePtr;
+ logInfo.imageBlockCount = ( *imageSizePtr + 4095 ) / 4096;
if ( useDebug ) {
logInfo.blockRequestCount = calloc( logInfo.imageBlockCount, sizeof(uint8_t) );
} else {
logInfo.blockRequestCount = NULL;
}
-
- // Since dnbd3 is always read only and the remote image will not change
+
newArgv[newArgc++] = "-o";
- newArgv[newArgc++] = "ro,auto_cache,default_permissions";
+ if ( useCow ) {
+ newArgv[newArgc++] = "default_permissions";
+ } else {
+ newArgv[newArgc++] = "ro,default_permissions";
+ }
// Mount point goes last
newArgv[newArgc++] = argv[optind];
- printf( "ImagePathName: %s\nFuseArgs:",IMAGE_PATH );
+ printf( "ImagePathName: /%s\nFuseArgs:", IMAGE_NAME );
for ( int i = 0; i < newArgc; ++i ) {
printf( " '%s'", newArgv[i] );
}
- putchar('\n');
+ putchar( '\n' );
clock_gettime( CLOCK_REALTIME, &startupTime );
owner = getuid();
- signalInit();
- return fuse_main( newArgc, newArgv, &image_oper, NULL );
+
+ // New cow session only: with -L the existing cow files were already loaded via cowfile_load() above
+ if ( useCow && !loadCow ) {
+ if( !cowfile_init( cow_file_path, connection_getImageName(), connection_getImageRID(), &imageSizePtr, cow_server_address, sStdout, sFile, cowUuidOverride ) ) {
+ return EXIT_FAILURE;
+ }
+ }
+
+ // Fuse lowlevel loop
+ struct fuse_args args = FUSE_ARGS_INIT( newArgc, newArgv );
+ int fuse_err = 1;
+ if ( fuse_parse_cmdline( &args, &mountpoint, NULL, NULL ) == -1 ) {
+ logadd( LOG_ERROR, "FUSE: Parsing command line failed" );
+ } else if ( ( ch = fuse_mount( mountpoint, &args ) ) == NULL ) {
+ logadd( LOG_ERROR, "Mounting file system failed" );
+ } else {
+ if(useCow){
+ _fuseSession = fuse_lowlevel_new( &args, &image_oper_cow, sizeof( image_oper_cow ), NULL );
+ } else{
+ _fuseSession = fuse_lowlevel_new( &args, &image_oper, sizeof( image_oper ), NULL );
+ }
+ if ( _fuseSession == NULL ) {
+ logadd( LOG_ERROR, "Could not initialize fuse session" );
+ } else {
+ fuse_session_add_chan( _fuseSession, ch );
+ // Do not spawn any threads before we daemonize, they'd die at this point
+ fuse_daemonize( foreground );
+ if ( fuse_set_signal_handlers( _fuseSession ) == -1 ) {
+ logadd( LOG_WARNING, "Could not install fuse signal handlers" );
+ }
+ if ( useCow ) {
+ if ( !cowfile_startBackgroundThreads() ) {
+ logadd( LOG_ERROR, "Could not start cow background threads" );
+ }
+ }
+ if ( single_thread ) {
+ fuse_err = fuse_session_loop( _fuseSession );
+ } else {
+ fuse_err = fuse_session_loop_mt( _fuseSession ); //MT produces errors (race conditions) in libfuse and didn't improve speed at all
+ }
+ fuse_remove_signal_handlers( _fuseSession );
+ fuse_session_remove_chan( ch );
+ fuse_session_destroy( _fuseSession );
+ _fuseSession = NULL;
+ }
+ fuse_unmount( mountpoint, ch );
+ if( useCow ) {
+ cowfile_close();
+ }
+ }
+ fuse_opt_free_args( &args );
+ free( newArgv );
+ connection_join();
+ logadd( LOG_DEBUG1, "Terminating. FUSE REPLIED: %d\n", fuse_err );
+ return fuse_err;
}
diff --git a/src/fuse/main.h b/src/fuse/main.h
new file mode 100644
index 0000000..bf21805
--- /dev/null
+++ b/src/fuse/main.h
@@ -0,0 +1,12 @@
+#ifndef _MAIN_H_
+#define _MAIN_H_
+
+#define FUSE_USE_VERSION 30
+#include <fuse_lowlevel.h>
+#include <stdbool.h>
+
+extern bool useCow;
+extern bool cow_merge_after_upload;
+void image_ll_getattr( fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi );
+
+#endif /* _MAIN_H_ */
diff --git a/src/fuse/serialize.c b/src/fuse/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/fuse/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/kernel/.clang-format b/src/kernel/.clang-format
new file mode 100644
index 0000000..c1fe2c6
--- /dev/null
+++ b/src/kernel/.clang-format
@@ -0,0 +1,552 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+# Taken from:
+# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
+# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
+# | sort | uniq
+ForEachMacros:
+ - 'apei_estatus_for_each_section'
+ - 'ata_for_each_dev'
+ - 'ata_for_each_link'
+ - '__ata_qc_for_each'
+ - 'ata_qc_for_each'
+ - 'ata_qc_for_each_raw'
+ - 'ata_qc_for_each_with_internal'
+ - 'ax25_for_each'
+ - 'ax25_uid_for_each'
+ - '__bio_for_each_bvec'
+ - 'bio_for_each_bvec'
+ - 'bio_for_each_bvec_all'
+ - 'bio_for_each_integrity_vec'
+ - '__bio_for_each_segment'
+ - 'bio_for_each_segment'
+ - 'bio_for_each_segment_all'
+ - 'bio_list_for_each'
+ - 'bip_for_each_vec'
+ - 'bitmap_for_each_clear_region'
+ - 'bitmap_for_each_set_region'
+ - 'blkg_for_each_descendant_post'
+ - 'blkg_for_each_descendant_pre'
+ - 'blk_queue_for_each_rl'
+ - 'bond_for_each_slave'
+ - 'bond_for_each_slave_rcu'
+ - 'bpf_for_each_spilled_reg'
+ - 'btree_for_each_safe128'
+ - 'btree_for_each_safe32'
+ - 'btree_for_each_safe64'
+ - 'btree_for_each_safel'
+ - 'card_for_each_dev'
+ - 'cgroup_taskset_for_each'
+ - 'cgroup_taskset_for_each_leader'
+ - 'cpufreq_for_each_entry'
+ - 'cpufreq_for_each_entry_idx'
+ - 'cpufreq_for_each_valid_entry'
+ - 'cpufreq_for_each_valid_entry_idx'
+ - 'css_for_each_child'
+ - 'css_for_each_descendant_post'
+ - 'css_for_each_descendant_pre'
+ - 'cxl_for_each_cmd'
+ - 'device_for_each_child_node'
+ - 'dma_fence_chain_for_each'
+ - 'do_for_each_ftrace_op'
+ - 'drm_atomic_crtc_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane_state'
+ - 'drm_atomic_for_each_plane_damage'
+ - 'drm_client_for_each_connector_iter'
+ - 'drm_client_for_each_modeset'
+ - 'drm_connector_for_each_possible_encoder'
+ - 'drm_for_each_bridge_in_chain'
+ - 'drm_for_each_connector_iter'
+ - 'drm_for_each_crtc'
+ - 'drm_for_each_crtc_reverse'
+ - 'drm_for_each_encoder'
+ - 'drm_for_each_encoder_mask'
+ - 'drm_for_each_fb'
+ - 'drm_for_each_legacy_plane'
+ - 'drm_for_each_plane'
+ - 'drm_for_each_plane_mask'
+ - 'drm_for_each_privobj'
+ - 'drm_mm_for_each_hole'
+ - 'drm_mm_for_each_node'
+ - 'drm_mm_for_each_node_in_range'
+ - 'drm_mm_for_each_node_safe'
+ - 'flow_action_for_each'
+ - 'for_each_active_dev_scope'
+ - 'for_each_active_drhd_unit'
+ - 'for_each_active_iommu'
+ - 'for_each_aggr_pgid'
+ - 'for_each_available_child_of_node'
+ - 'for_each_bio'
+ - 'for_each_board_func_rsrc'
+ - 'for_each_bvec'
+ - 'for_each_card_auxs'
+ - 'for_each_card_auxs_safe'
+ - 'for_each_card_components'
+ - 'for_each_card_dapms'
+ - 'for_each_card_pre_auxs'
+ - 'for_each_card_prelinks'
+ - 'for_each_card_rtds'
+ - 'for_each_card_rtds_safe'
+ - 'for_each_card_widgets'
+ - 'for_each_card_widgets_safe'
+ - 'for_each_cgroup_storage_type'
+ - 'for_each_child_of_node'
+ - 'for_each_clear_bit'
+ - 'for_each_clear_bit_from'
+ - 'for_each_cmsghdr'
+ - 'for_each_compatible_node'
+ - 'for_each_component_dais'
+ - 'for_each_component_dais_safe'
+ - 'for_each_comp_order'
+ - 'for_each_console'
+ - 'for_each_cpu'
+ - 'for_each_cpu_and'
+ - 'for_each_cpu_not'
+ - 'for_each_cpu_wrap'
+ - 'for_each_dapm_widgets'
+ - 'for_each_dev_addr'
+ - 'for_each_dev_scope'
+ - 'for_each_displayid_db'
+ - 'for_each_dma_cap_mask'
+ - 'for_each_dpcm_be'
+ - 'for_each_dpcm_be_rollback'
+ - 'for_each_dpcm_be_safe'
+ - 'for_each_dpcm_fe'
+ - 'for_each_drhd_unit'
+ - 'for_each_dss_dev'
+ - 'for_each_efi_memory_desc'
+ - 'for_each_efi_memory_desc_in_map'
+ - 'for_each_element'
+ - 'for_each_element_extid'
+ - 'for_each_element_id'
+ - 'for_each_endpoint_of_node'
+ - 'for_each_evictable_lru'
+ - 'for_each_fib6_node_rt_rcu'
+ - 'for_each_fib6_walker_rt'
+ - 'for_each_free_mem_pfn_range_in_zone'
+ - 'for_each_free_mem_pfn_range_in_zone_from'
+ - 'for_each_free_mem_range'
+ - 'for_each_free_mem_range_reverse'
+ - 'for_each_func_rsrc'
+ - 'for_each_hstate'
+ - 'for_each_if'
+ - 'for_each_iommu'
+ - 'for_each_ip_tunnel_rcu'
+ - 'for_each_irq_nr'
+ - 'for_each_link_codecs'
+ - 'for_each_link_cpus'
+ - 'for_each_link_platforms'
+ - 'for_each_lru'
+ - 'for_each_matching_node'
+ - 'for_each_matching_node_and_match'
+ - 'for_each_member'
+ - 'for_each_memcg_cache_index'
+ - 'for_each_mem_pfn_range'
+ - '__for_each_mem_range'
+ - 'for_each_mem_range'
+ - '__for_each_mem_range_rev'
+ - 'for_each_mem_range_rev'
+ - 'for_each_mem_region'
+ - 'for_each_migratetype_order'
+ - 'for_each_msi_entry'
+ - 'for_each_msi_entry_safe'
+ - 'for_each_net'
+ - 'for_each_net_continue_reverse'
+ - 'for_each_netdev'
+ - 'for_each_netdev_continue'
+ - 'for_each_netdev_continue_rcu'
+ - 'for_each_netdev_continue_reverse'
+ - 'for_each_netdev_feature'
+ - 'for_each_netdev_in_bond_rcu'
+ - 'for_each_netdev_rcu'
+ - 'for_each_netdev_reverse'
+ - 'for_each_netdev_safe'
+ - 'for_each_net_rcu'
+ - 'for_each_new_connector_in_state'
+ - 'for_each_new_crtc_in_state'
+ - 'for_each_new_mst_mgr_in_state'
+ - 'for_each_new_plane_in_state'
+ - 'for_each_new_private_obj_in_state'
+ - 'for_each_node'
+ - 'for_each_node_by_name'
+ - 'for_each_node_by_type'
+ - 'for_each_node_mask'
+ - 'for_each_node_state'
+ - 'for_each_node_with_cpus'
+ - 'for_each_node_with_property'
+ - 'for_each_nonreserved_multicast_dest_pgid'
+ - 'for_each_of_allnodes'
+ - 'for_each_of_allnodes_from'
+ - 'for_each_of_cpu_node'
+ - 'for_each_of_pci_range'
+ - 'for_each_old_connector_in_state'
+ - 'for_each_old_crtc_in_state'
+ - 'for_each_old_mst_mgr_in_state'
+ - 'for_each_oldnew_connector_in_state'
+ - 'for_each_oldnew_crtc_in_state'
+ - 'for_each_oldnew_mst_mgr_in_state'
+ - 'for_each_oldnew_plane_in_state'
+ - 'for_each_oldnew_plane_in_state_reverse'
+ - 'for_each_oldnew_private_obj_in_state'
+ - 'for_each_old_plane_in_state'
+ - 'for_each_old_private_obj_in_state'
+ - 'for_each_online_cpu'
+ - 'for_each_online_node'
+ - 'for_each_online_pgdat'
+ - 'for_each_pci_bridge'
+ - 'for_each_pci_dev'
+ - 'for_each_pci_msi_entry'
+ - 'for_each_pcm_streams'
+ - 'for_each_physmem_range'
+ - 'for_each_populated_zone'
+ - 'for_each_possible_cpu'
+ - 'for_each_present_cpu'
+ - 'for_each_prime_number'
+ - 'for_each_prime_number_from'
+ - 'for_each_process'
+ - 'for_each_process_thread'
+ - 'for_each_property_of_node'
+ - 'for_each_registered_fb'
+ - 'for_each_requested_gpio'
+ - 'for_each_requested_gpio_in_range'
+ - 'for_each_reserved_mem_range'
+ - 'for_each_reserved_mem_region'
+ - 'for_each_rtd_codec_dais'
+ - 'for_each_rtd_components'
+ - 'for_each_rtd_cpu_dais'
+ - 'for_each_rtd_dais'
+ - 'for_each_set_bit'
+ - 'for_each_set_bit_from'
+ - 'for_each_set_clump8'
+ - 'for_each_sg'
+ - 'for_each_sg_dma_page'
+ - 'for_each_sg_page'
+ - 'for_each_sgtable_dma_page'
+ - 'for_each_sgtable_dma_sg'
+ - 'for_each_sgtable_page'
+ - 'for_each_sgtable_sg'
+ - 'for_each_sibling_event'
+ - 'for_each_subelement'
+ - 'for_each_subelement_extid'
+ - 'for_each_subelement_id'
+ - '__for_each_thread'
+ - 'for_each_thread'
+ - 'for_each_unicast_dest_pgid'
+ - 'for_each_vsi'
+ - 'for_each_wakeup_source'
+ - 'for_each_zone'
+ - 'for_each_zone_zonelist'
+ - 'for_each_zone_zonelist_nodemask'
+ - 'fwnode_for_each_available_child_node'
+ - 'fwnode_for_each_child_node'
+ - 'fwnode_graph_for_each_endpoint'
+ - 'gadget_for_each_ep'
+ - 'genradix_for_each'
+ - 'genradix_for_each_from'
+ - 'hash_for_each'
+ - 'hash_for_each_possible'
+ - 'hash_for_each_possible_rcu'
+ - 'hash_for_each_possible_rcu_notrace'
+ - 'hash_for_each_possible_safe'
+ - 'hash_for_each_rcu'
+ - 'hash_for_each_safe'
+ - 'hctx_for_each_ctx'
+ - 'hlist_bl_for_each_entry'
+ - 'hlist_bl_for_each_entry_rcu'
+ - 'hlist_bl_for_each_entry_safe'
+ - 'hlist_for_each'
+ - 'hlist_for_each_entry'
+ - 'hlist_for_each_entry_continue'
+ - 'hlist_for_each_entry_continue_rcu'
+ - 'hlist_for_each_entry_continue_rcu_bh'
+ - 'hlist_for_each_entry_from'
+ - 'hlist_for_each_entry_from_rcu'
+ - 'hlist_for_each_entry_rcu'
+ - 'hlist_for_each_entry_rcu_bh'
+ - 'hlist_for_each_entry_rcu_notrace'
+ - 'hlist_for_each_entry_safe'
+ - 'hlist_for_each_entry_srcu'
+ - '__hlist_for_each_rcu'
+ - 'hlist_for_each_safe'
+ - 'hlist_nulls_for_each_entry'
+ - 'hlist_nulls_for_each_entry_from'
+ - 'hlist_nulls_for_each_entry_rcu'
+ - 'hlist_nulls_for_each_entry_safe'
+ - 'i3c_bus_for_each_i2cdev'
+ - 'i3c_bus_for_each_i3cdev'
+ - 'ide_host_for_each_port'
+ - 'ide_port_for_each_dev'
+ - 'ide_port_for_each_present_dev'
+ - 'idr_for_each_entry'
+ - 'idr_for_each_entry_continue'
+ - 'idr_for_each_entry_continue_ul'
+ - 'idr_for_each_entry_ul'
+ - 'in_dev_for_each_ifa_rcu'
+ - 'in_dev_for_each_ifa_rtnl'
+ - 'inet_bind_bucket_for_each'
+ - 'inet_lhash2_for_each_icsk_rcu'
+ - 'key_for_each'
+ - 'key_for_each_safe'
+ - 'klp_for_each_func'
+ - 'klp_for_each_func_safe'
+ - 'klp_for_each_func_static'
+ - 'klp_for_each_object'
+ - 'klp_for_each_object_safe'
+ - 'klp_for_each_object_static'
+ - 'kunit_suite_for_each_test_case'
+ - 'kvm_for_each_memslot'
+ - 'kvm_for_each_vcpu'
+ - 'list_for_each'
+ - 'list_for_each_codec'
+ - 'list_for_each_codec_safe'
+ - 'list_for_each_continue'
+ - 'list_for_each_entry'
+ - 'list_for_each_entry_continue'
+ - 'list_for_each_entry_continue_rcu'
+ - 'list_for_each_entry_continue_reverse'
+ - 'list_for_each_entry_from'
+ - 'list_for_each_entry_from_rcu'
+ - 'list_for_each_entry_from_reverse'
+ - 'list_for_each_entry_lockless'
+ - 'list_for_each_entry_rcu'
+ - 'list_for_each_entry_reverse'
+ - 'list_for_each_entry_safe'
+ - 'list_for_each_entry_safe_continue'
+ - 'list_for_each_entry_safe_from'
+ - 'list_for_each_entry_safe_reverse'
+ - 'list_for_each_entry_srcu'
+ - 'list_for_each_prev'
+ - 'list_for_each_prev_safe'
+ - 'list_for_each_safe'
+ - 'llist_for_each'
+ - 'llist_for_each_entry'
+ - 'llist_for_each_entry_safe'
+ - 'llist_for_each_safe'
+ - 'mci_for_each_dimm'
+ - 'media_device_for_each_entity'
+ - 'media_device_for_each_intf'
+ - 'media_device_for_each_link'
+ - 'media_device_for_each_pad'
+ - 'nanddev_io_for_each_page'
+ - 'netdev_for_each_lower_dev'
+ - 'netdev_for_each_lower_private'
+ - 'netdev_for_each_lower_private_rcu'
+ - 'netdev_for_each_mc_addr'
+ - 'netdev_for_each_uc_addr'
+ - 'netdev_for_each_upper_dev_rcu'
+ - 'netdev_hw_addr_list_for_each'
+ - 'nft_rule_for_each_expr'
+ - 'nla_for_each_attr'
+ - 'nla_for_each_nested'
+ - 'nlmsg_for_each_attr'
+ - 'nlmsg_for_each_msg'
+ - 'nr_neigh_for_each'
+ - 'nr_neigh_for_each_safe'
+ - 'nr_node_for_each'
+ - 'nr_node_for_each_safe'
+ - 'of_for_each_phandle'
+ - 'of_property_for_each_string'
+ - 'of_property_for_each_u32'
+ - 'pci_bus_for_each_resource'
+ - 'pcl_for_each_chunk'
+ - 'pcl_for_each_segment'
+ - 'pcm_for_each_format'
+ - 'ping_portaddr_for_each_entry'
+ - 'plist_for_each'
+ - 'plist_for_each_continue'
+ - 'plist_for_each_entry'
+ - 'plist_for_each_entry_continue'
+ - 'plist_for_each_entry_safe'
+ - 'plist_for_each_safe'
+ - 'pnp_for_each_card'
+ - 'pnp_for_each_dev'
+ - 'protocol_for_each_card'
+ - 'protocol_for_each_dev'
+ - 'queue_for_each_hw_ctx'
+ - 'radix_tree_for_each_slot'
+ - 'radix_tree_for_each_tagged'
+ - 'rbtree_postorder_for_each_entry_safe'
+ - 'rdma_for_each_block'
+ - 'rdma_for_each_port'
+ - 'rdma_umem_for_each_dma_block'
+ - 'resource_list_for_each_entry'
+ - 'resource_list_for_each_entry_safe'
+ - 'rhl_for_each_entry_rcu'
+ - 'rhl_for_each_rcu'
+ - 'rht_for_each'
+ - 'rht_for_each_entry'
+ - 'rht_for_each_entry_from'
+ - 'rht_for_each_entry_rcu'
+ - 'rht_for_each_entry_rcu_from'
+ - 'rht_for_each_entry_safe'
+ - 'rht_for_each_from'
+ - 'rht_for_each_rcu'
+ - 'rht_for_each_rcu_from'
+ - '__rq_for_each_bio'
+ - 'rq_for_each_bvec'
+ - 'rq_for_each_segment'
+ - 'scsi_for_each_prot_sg'
+ - 'scsi_for_each_sg'
+ - 'sctp_for_each_hentry'
+ - 'sctp_skb_for_each'
+ - 'shdma_for_each_chan'
+ - '__shost_for_each_device'
+ - 'shost_for_each_device'
+ - 'sk_for_each'
+ - 'sk_for_each_bound'
+ - 'sk_for_each_entry_offset_rcu'
+ - 'sk_for_each_from'
+ - 'sk_for_each_rcu'
+ - 'sk_for_each_safe'
+ - 'sk_nulls_for_each'
+ - 'sk_nulls_for_each_from'
+ - 'sk_nulls_for_each_rcu'
+ - 'snd_array_for_each'
+ - 'snd_pcm_group_for_each_entry'
+ - 'snd_soc_dapm_widget_for_each_path'
+ - 'snd_soc_dapm_widget_for_each_path_safe'
+ - 'snd_soc_dapm_widget_for_each_sink_path'
+ - 'snd_soc_dapm_widget_for_each_source_path'
+ - 'tb_property_for_each'
+ - 'tcf_exts_for_each_action'
+ - 'udp_portaddr_for_each_entry'
+ - 'udp_portaddr_for_each_entry_rcu'
+ - 'usb_hub_for_each_child'
+ - 'v4l2_device_for_each_subdev'
+ - 'v4l2_m2m_for_each_dst_buf'
+ - 'v4l2_m2m_for_each_dst_buf_safe'
+ - 'v4l2_m2m_for_each_src_buf'
+ - 'v4l2_m2m_for_each_src_buf_safe'
+ - 'virtio_device_for_each_vq'
+ - 'while_for_each_ftrace_op'
+ - 'xa_for_each'
+ - 'xa_for_each_marked'
+ - 'xa_for_each_range'
+ - 'xa_for_each_start'
+ - 'xas_for_each'
+ - 'xas_for_each_conflict'
+ - 'xas_for_each_marked'
+ - 'xbc_array_for_each_value'
+ - 'xbc_for_each_key_value'
+ - 'xbc_node_for_each_array_value'
+ - 'xbc_node_for_each_child'
+ - 'xbc_node_for_each_key_value'
+ - 'zorro_for_each_dev'
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt
new file mode 100644
index 0000000..6bc61ff
--- /dev/null
+++ b/src/kernel/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-kernel
+ LANGUAGES C)
+
+# include macros to define Linux kernel build targets
+include(Kernel)
+
+# set C flags for a Linux kernel module
+set(KERNEL_C_FLAGS "-DDNBD3_KERNEL_MODULE -I ${PROJECT_INCLUDE_GEN_DIR}"
+ CACHE STRING "C flags to be used for building the dnbd3 kernel module")
+# set C flags for the debug mode of a Linux kernel module
+set(KERNEL_C_FLAGS_DEBUG "-g -DDEBUG"
+ CACHE STRING "Additional C flags to be used for building the dnbd3 kernel module in debug mode")
+
+# append include directories to the C flags
+get_property(KERNEL_INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+foreach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} -I ${KERNEL_INCLUDE_DIR}")
+endforeach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+
+# append debug C flags if debug mode is enabled
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} ${KERNEL_C_FLAGS_DEBUG}")
+endif(CMAKE_BUILD_TYPE MATCHES Debug)
+
+# dnbd3 Linux kernel module
+set(KERNEL_MODULE_DNBD3_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.c)
+set(KERNEL_MODULE_DNBD3_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.h)
+
+add_kernel_module(dnbd3 "${KERNEL_BUILD_DIR}"
+ "${KERNEL_INSTALL_DIR}"
+ "CONFIG_BLK_DEV_DNBD3=m"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}"
+ ${CMAKE_CURRENT_SOURCE_DIR}/Kbuild)
+
+# add dependency to generate project version header before dnbd3.ko is built
+add_dependencies(dnbd3 dnbd3-generate-version)
+
+set(CHECKPATCH_IGNORE_WARNINGS "NEW_TYPEDEFS"
+ "MSLEEP"
+ "CONSTANT_COMPARISON"
+ "DEEP_INDENTATION"
+ "PREFER_PR_LEVEL"
+ "LINUX_VERSION_CODE"
+ "JIFFIES_COMPARISON"
+ "KREALLOC_ARG_REUSE")
+
+add_kernel_linter(dnbd3-lint "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+add_kernel_linter_fix(dnbd3-lint-fix "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+
+add_linter_fix(dnbd3-lint-fix-clang "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
diff --git a/src/kernel/Kbuild b/src/kernel/Kbuild
new file mode 100644
index 0000000..26afa98
--- /dev/null
+++ b/src/kernel/Kbuild
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Linux kernel module dnbd3
+obj-$(CONFIG_BLK_DEV_DNBD3) := dnbd3.o
+dnbd3-y += dnbd3_main.o blk.o net.o serialize.o sysfs.o
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index 889b988..69e4583 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,248 +19,259 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "blk.h"
#include "net.h"
#include "sysfs.h"
+#include "dnbd3_main.h"
#include <linux/pagemap.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#define dnbd3_req_read(req) \
- req_op(req) == REQ_OP_READ
-#define dnbd3_req_fs(req) \
- dnbd3_req_read(req) || req_op(req) == REQ_OP_WRITE
-#define dnbd3_req_special(req) \
- blk_rq_is_private(req)
-#else
-#define dnbd3_req_read(req) \
- rq_data_dir(req) == READ
-#define dnbd3_req_fs(req) \
- req->cmd_type == REQ_TYPE_FS
-#define dnbd3_req_special(req) \
- req->cmd_type == REQ_TYPE_SPECIAL
-#endif
-
-int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+static int dnbd3_close_device(dnbd3_device_t *dev)
{
- struct gendisk *disk;
- struct request_queue *blk_queue;
-
- init_waitqueue_head(&dev->process_queue_send);
- init_waitqueue_head(&dev->process_queue_receive);
- init_waitqueue_head(&dev->process_queue_discover);
- INIT_LIST_HEAD(&dev->request_queue_send);
- INIT_LIST_HEAD(&dev->request_queue_receive);
+ int result;
- memset(&dev->cur_server, 0, sizeof(dev->cur_server));
- memset(&dev->initial_server, 0, sizeof(dev->initial_server));
- dev->better_sock = NULL;
+ if (dev->imgname)
+ dev_info(dnbd3_device_to_dev(dev), "closing down device.\n");
+ dev->panic = false;
+ result = dnbd3_net_disconnect(dev); // the disconnect result is what this function returns
+ kfree(dev->imgname);
dev->imgname = NULL;
- dev->rid = 0;
- dev->update_available = 0;
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- dev->thread_send = NULL;
- dev->thread_receive = NULL;
- dev->thread_discover = NULL;
- dev->discover = 0;
- dev->disconnecting = 0;
- dev->panic = 0;
- dev->panic_count = 0;
- dev->reported_size = 0;
-
- if (!(disk = alloc_disk(1)))
- {
- printk("ERROR: dnbd3 alloc_disk failed.\n");
- return -EIO;
- }
-
- disk->major = major;
- disk->first_minor = minor;
- sprintf(disk->disk_name, "dnbd%d", minor);
- set_capacity(disk, 0);
- set_disk_ro(disk, 1);
- disk->fops = &dnbd3_blk_ops;
-
- spin_lock_init(&dev->blk_lock);
- if ((blk_queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock)) == NULL)
- {
- printk("ERROR: dnbd3 blk_init_queue failed.\n");
- return -EIO;
- }
-
- blk_queue_logical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
- blk_queue_physical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
-
- disk->queue = blk_queue;
- disk->private_data = dev;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-#else
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
-#endif
-#define ONE_MEG (1048576)
- blk_queue_max_segment_size(disk->queue, ONE_MEG);
- blk_queue_max_segments(disk->queue, 0xffff);
- blk_queue_max_hw_sectors(disk->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
- disk->queue->limits.max_sectors = 256;
- dev->disk = disk;
-#undef ONE_MEG
- add_disk(disk);
- dnbd3_sysfs_init(dev);
- return 0;
+ /* requests may still have been queued up while disconnecting, but with imgname */
+ /* set to NULL dnbd3_queue_rq() rejects new ones; freeze the queue while the capacity is reset to 0 */
+ blk_mq_freeze_queue(dev->queue);
+ set_capacity(dev->disk, 0);
+ blk_mq_unfreeze_queue(dev->queue);
+ return result;
}
-int dnbd3_blk_del_device(dnbd3_device_t *dev)
-{
- dnbd3_sysfs_exit(dev);
- dnbd3_net_disconnect(dev);
- del_gendisk(dev->disk);
- put_disk(dev->disk);
- blk_cleanup_queue(dev->disk->queue);
- return 0;
-}
-
-struct block_device_operations dnbd3_blk_ops =
- { .owner = THIS_MODULE, .ioctl = dnbd3_blk_ioctl, };
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
int result = -100;
dnbd3_device_t *dev = bdev->bd_disk->private_data;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
struct request_queue *blk_queue = dev->disk->queue;
+#endif
char *imgname = NULL;
dnbd3_ioctl_t *msg = NULL;
- //unsigned long irqflags;
+ int i = 0, j;
+ u8 locked = 0;
- while (dev->disconnecting)
- {
- // do nothing
- }
-
- if (arg != 0)
- {
+ if (arg != 0) {
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
- if (msg == NULL) return -ENOMEM;
- if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg))
- {
+ if (msg == NULL)
+ return -ENOMEM;
+ if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) {
result = -ENOEXEC;
goto cleanup_return;
}
- if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0)
- {
+ if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) {
result = -ENOENT;
goto cleanup_return;
}
- if (msg->imgname != NULL && msg->imgnamelen > 0)
- {
+ if (msg->imgname != NULL && msg->imgnamelen > 0) {
imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
- if (imgname == NULL)
- {
+ if (imgname == NULL) {
result = -ENOMEM;
goto cleanup_return;
}
- if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0)
- {
+ if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) {
result = -ENOENT;
goto cleanup_return;
}
imgname[msg->imgnamelen] = '\0';
- //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname);
}
}
-
- switch (cmd)
- {
+ switch (cmd) {
case IOCTL_OPEN:
- if (dev->imgname != NULL)
- {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
+ break;
}
- else if (imgname == NULL)
- {
+ locked = 1;
+ if (dev->imgname != NULL) {
+ result = -EBUSY;
+ } else if (imgname == NULL) {
result = -EINVAL;
- }
- else if (msg == NULL)
- {
+ } else if (msg == NULL) {
result = -EINVAL;
- }
- else
- {
- if (sizeof(msg->host) != sizeof(dev->cur_server.host))
- printk("Odd size bug#1 triggered in IOCTL\n");
- memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host));
- dev->cur_server.failures = 0;
- memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server));
+ } else {
+ /* assert that at least one and not too many hosts are given */
+ if (msg->hosts_num < 1 || msg->hosts_num > NUMBER_SERVERS) {
+ result = -EINVAL;
+ break;
+ }
+
dev->imgname = imgname;
dev->rid = msg->rid;
dev->use_server_provided_alts = msg->use_server_provided_alts;
- // Forget all alt servers on explicit connect, set first al server to initial server
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0]));
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
- if (blk_queue->backing_dev_info != NULL) {
+
+ dev_info(dnbd3_device_to_dev(dev), "opening device.\n");
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set optimal request size for the queue to half the read-ahead
+ blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512));
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ // set readahead from optimal request size of the queue
+ // ra_pages are calculated by following formula: queue_io_opt() * 2 / PAGE_SIZE
+ blk_queue_update_readahead(dev->queue);
+#endif
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ if (blk_queue->backing_dev_info != NULL)
blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
- }
#else
blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
#endif
- if (dnbd3_net_connect(dev) == 0)
- {
- result = 0;
- imgname = NULL; // Prevent kfree at the end
+
+ /* add specified servers to alt server list */
+ for (i = 0; i < NUMBER_SERVERS; i++)
+ dev->alt_servers[i].host.ss_family = 0;
+ for (i = 0; i < msg->hosts_num; i++) {
+ /* copy provided host into corresponding alt server slot */
+ if (dnbd3_add_server(dev, &msg->hosts[i]) == 0)
+ dev_dbg(dnbd3_device_to_dev(dev), "adding server %pISpc\n",
+ &dev->alt_servers[i].host);
+ else
+ dev_warn(dnbd3_device_to_dev(dev), "could not add server %pISpc\n",
+ &dev->alt_servers[i].host);
}
- else
- {
- result = -ENOENT;
+
+ /*
+ * probe added alt servers in specified order and
+ * choose first working server as initial server
+ */
+ result = -EPROTONOSUPPORT;
+ for (i = 0; i < NUMBER_SERVERS; i++) {
+ /* probe added alt server */
+ if (dev->alt_servers[i].host.ss_family == 0)
+ continue; // Empty slot
+
+ result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true);
+ if (result == 0) {
+ /* connection established, store index of server and exit loop */
+ result = i;
+ break;
+ }
+ }
+
+ if (result >= 0) {
+ /* connection was successful */
+ dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n",
+ &dev->cur_server.host);
+ imgname = NULL; // Prevent kfree at the end
+ } else {
+ /* probing failed */
dev->imgname = NULL;
}
}
break;
case IOCTL_CLOSE:
- dnbd3_blk_fail_all_requests(dev);
- result = dnbd3_net_disconnect(dev);
- dnbd3_blk_fail_all_requests(dev);
- set_capacity(dev->disk, 0);
- if (dev->imgname)
- {
- kfree(dev->imgname);
- dev->imgname = NULL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
}
+ locked = 1;
+ result = dnbd3_close_device(dev);
break;
case IOCTL_SWITCH:
- result = -EINVAL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
+ }
+ locked = 1;
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ } else if (msg == NULL) {
+ result = -EINVAL;
+ } else {
+ dnbd3_alt_server_t *alt_server;
+ struct sockaddr_storage new_addr;
+
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(&msg->hosts[0], dev);
+ if (alt_server == NULL) {
+ mutex_unlock(&dev->alt_servers_lock);
+ /* specified server is not known, so do not switch */
+ result = -ENOENT;
+ } else {
+ /* specified server is known, so try to switch to it */
+ new_addr = alt_server->host;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &new_addr)) {
+ /* specified server is current server, so do not switch */
+ result = 0;
+ } else {
+ dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
+ &new_addr);
+ result = dnbd3_new_connection(dev, &new_addr, false);
+ if (result != 0) {
+ /* switching didn't work */
+ result = -EAGAIN;
+ }
+ }
+ if (result == 0) {
+ /* fake RTT so we don't switch away again soon */
+ mutex_lock(&dev->alt_servers_lock);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ alt_server = &dev->alt_servers[i];
+ if (is_same_server(&alt_server->host, &new_addr)) {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ alt_server->rtts[j] = 1;
+ alt_server->best_count = 100;
+ } else {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ if (alt_server->rtts[j] < 500000)
+ alt_server->rtts[j] = 500000;
+ alt_server->best_count = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ }
+ }
+ }
break;
case IOCTL_ADD_SRV:
- case IOCTL_REM_SRV:
- if (dev->imgname == NULL)
- {
- result = -ENOENT;
+ case IOCTL_REM_SRV: {
+ struct sockaddr_storage addr;
+ dnbd3_host_t *host;
+
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ break;
}
- else if (dev->new_servers_num >= NUMBER_SERVERS)
- {
- result = -EAGAIN;
+ if (msg == NULL) {
+ result = -EINVAL;
+ break;
}
- else if (msg == NULL)
- {
+ host = &msg->hosts[0];
+ if (!dnbd3_host_to_sockaddr(host, &addr)) {
result = -EINVAL;
+ break;
}
- else
- {
- memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host));
- dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM
- ++dev->new_servers_num;
- result = 0;
+
+ if (cmd == IOCTL_ADD_SRV) {
+ result = dnbd3_add_server(dev, host);
+ if (result == -EEXIST)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr);
+ else if (result == -ENOSPC)
+ dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr);
+ } else { // IOCTL_REM_SRV
+ result = dnbd3_rem_server(dev, host);
+ if (result == -ENOENT)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr);
}
break;
-
+ }
case BLKFLSBUF:
result = 0;
break;
@@ -270,113 +282,325 @@ int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
}
cleanup_return:
- if (msg) kfree(msg);
- if (imgname) kfree(imgname);
+ kfree(msg);
+ kfree(imgname);
+ if (locked)
+ dnbd3_flag_reset(dev->connection_lock);
return result;
}
-/**
- * dev->blk_lock and q->queue_lock are being held
- * when this is called!
+static const struct block_device_operations dnbd3_blk_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = dnbd3_blk_ioctl,
+};
+
+/*
+ * Append a request to the tail of the device's send queue and schedule
+ * the send work item.
+ *
+ * The list insertion is guarded by send_queue_lock; queue_work() is then
+ * called separately while blk_lock is held.
+ */
+static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq)
+{
+	unsigned long flags;
+
+	/* step 1: enqueue the request */
+	spin_lock_irqsave(&dev->send_queue_lock, flags);
+	list_add_tail(&rq->queuelist, &dev->send_queue);
+	spin_unlock_irqrestore(&dev->send_queue_lock, flags);
+
+	/* step 2: kick send_work on the device's send workqueue */
+	spin_lock_irqsave(&dev->blk_lock, flags);
+	queue_work(dev->send_wq, &dev->send_work);
+	spin_unlock_irqrestore(&dev->blk_lock, flags);
+}
+
+/*
+ * Linux kernel blk-mq driver function (entry point) to handle block IO requests
+ *
+ * The request is rejected with an error status unless an image is attached
+ * (imgname set and device_active()), the operation is REQ_OP_READ, and the
+ * panic counter is below PROBE_COUNT_TIMEOUT (when that limit is enabled).
+ * An accepted request gets a handle built from its blk-mq unique tag OR'ed
+ * with the current jiffies value shifted left by 32 bits, is marked as
+ * started and is handed to dnbd3_add_queue().
+ *
+ * NOTE(review): the rq_data_dir() test near the end looks unreachable, since
+ * every request that is not REQ_OP_READ has already been rejected with
+ * BLK_STS_IOERR by then - confirm whether BLK_STS_NOTSUPP was intended.
+ */
-void dnbd3_blk_request(struct request_queue *q)
+static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
- struct request *req;
- dnbd3_device_t *dev;
+ struct request *rq = bd->rq;
+ dnbd3_device_t *dev = rq->q->queuedata;
+ struct dnbd3_cmd *cmd;
- while ((req = blk_fetch_request(q)) != NULL)
- {
- dev = req->rq_disk->private_data;
+ if (dev->imgname == NULL || !device_active(dev))
+ return BLK_STS_IOERR;
- if (dev->imgname == NULL)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (req_op(rq) != REQ_OP_READ)
+ return BLK_STS_IOERR;
- if (!(dnbd3_req_fs(req)))
- {
- __blk_end_request_all(req, 0);
- continue;
- }
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
+ return BLK_STS_TIMEOUT;
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (rq_data_dir(rq) != READ)
+ return BLK_STS_NOTSUPP;
- if (!(dnbd3_req_read(req)))
- {
- __blk_end_request_all(req, -EACCES);
- continue;
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32);
+ blk_mq_start_request(rq);
+ dnbd3_add_queue(dev, rq);
+ return BLK_STS_OK;
+}
+
+static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ , bool reserved
+#endif
+ )
+{
+ unsigned long irqflags;
+ struct request *rq_iter;
+ bool found = false;
+ dnbd3_device_t *dev = req->q->queuedata;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->send_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // If still in send queue, leave it there and only ask for the timer to be reset
+ if (found)
+ return BLK_EH_RESET_TIMER;
+
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ list_del_init(&req->queuelist);
+ break;
}
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (!found) {
+ dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n");
+ // Assume it was finished concurrently
+ return BLK_EH_DONE;
+ }
+ // Found in (and removed from) the recv queue: add to send queue again and trigger work, reset timeout
+ dnbd3_add_queue(dev, req);
+ return BLK_EH_RESET_TIMER;
+}
+
+static
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+const
+#endif
+struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+ .timeout = dnbd3_rq_timeout,
+};
+
+/*
+ * Initialise one dnbd3 device and register it as a read-only blk-mq disk
+ * named "dnbd<minor>" with capacity 0.
+ *
+ * @dev:   device structure to set up; it is zeroed first
+ * @minor: minor number of the disk, also stored as dev->index
+ *
+ * Returns 0 on success or a negative errno. On failure the tag set, queue
+ * and disk that were set up here are released again.
+ *
+ * Errors that occur before a valid gendisk is attached are logged with
+ * pr_err() instead of dev_err(): dev->disk is still NULL (or an ERR_PTR)
+ * at those points. NOTE(review): this assumes dnbd3_device_to_dev() derives
+ * the struct device from dev->disk - confirm against dnbd3_main.h.
+ */
+int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+{
+	int ret;
+
+	memset(dev, 0, sizeof(*dev));
+	dev->index = minor;
+	// lock for imgname, cur_server etc.
+	spin_lock_init(&dev->blk_lock);
+	spin_lock_init(&dev->send_queue_lock);
+	spin_lock_init(&dev->recv_queue_lock);
+	INIT_LIST_HEAD(&dev->send_queue);
+	INIT_LIST_HEAD(&dev->recv_queue);
+	dnbd3_flag_reset(dev->connection_lock);
+	dnbd3_flag_reset(dev->discover_running);
+	mutex_init(&dev->alt_servers_lock);
+	dnbd3_net_work_init(dev);
+
+	// memset has done this already but I like initial values to be explicit
+	dev->imgname = NULL;
+	dev->rid = 0;
+	dev->update_available = false;
+	dev->panic = false;
+	dev->panic_count = 0;
+	dev->reported_size = 0;
+
+	// set up tag_set for blk-mq
+	dev->tag_set.ops = &dnbd3_mq_ops;
+	dev->tag_set.nr_hw_queues = 1;
+	dev->tag_set.queue_depth = 128;
+	dev->tag_set.numa_node = NUMA_NO_NODE;
+	dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
+	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	dev->tag_set.driver_data = dev;
+	dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ;
+
+	ret = blk_mq_alloc_tag_set(&dev->tag_set);
+	if (ret) {
+		// no gendisk yet, so there is no struct device to log against
+		pr_err("dnbd%d: blk_mq_alloc_tag_set failed\n", minor);
+		goto out;
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+	// set up blk-mq and disk
+	dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev);
+	if (IS_ERR(dev->disk)) {
+		// dev->disk holds an error pointer here, not a usable gendisk
+		pr_err("dnbd%d: blk_mq_alloc_disk failed\n", minor);
+		ret = PTR_ERR(dev->disk);
+		goto out_cleanup_tags;
+	}
+	dev->queue = dev->disk->queue;
+#else
+	// set up blk-mq
+	dev->queue = blk_mq_init_queue(&dev->tag_set);
+	if (IS_ERR(dev->queue)) {
+		ret = PTR_ERR(dev->queue);
+		// the disk is only allocated further down on these kernels
+		pr_err("dnbd%d: blk_mq_init_queue failed\n", minor);
+		goto out_cleanup_tags;
+	}
+	dev->queue->queuedata = dev;
+#endif
+
+	blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+	blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue);
+#else
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue);
+#endif
+#define ONE_MEG (1048576)
+	blk_queue_max_segment_size(dev->queue, ONE_MEG);
+	blk_queue_max_segments(dev->queue, 0xffff);
+	blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
+	dev->queue->limits.max_sectors = 256;
+#undef ONE_MEG
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+	// set up disk
+	dev->disk = alloc_disk(1);
+	if (!dev->disk) {
+		// dev->disk is NULL, so dev_err() cannot be used here
+		pr_err("dnbd%d: alloc_disk failed\n", minor);
+		ret = -ENOMEM;
+		goto out_cleanup_queue;
+	}
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) \
+	|| (LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 132)) \
+	|| RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+	dev->disk->flags |= GENHD_FL_NO_PART;
+#else
+	dev->disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+	dev->disk->major = major;
+	dev->disk->first_minor = minor;
+	dev->disk->minors = 1;
+	dev->disk->fops = &dnbd3_blk_ops;
+	dev->disk->private_data = dev;
+	dev->disk->queue = dev->queue;
+	// bounded write into the fixed-size disk_name array
+	snprintf(dev->disk->disk_name, sizeof(dev->disk->disk_name), "dnbd%d", minor);
+	set_capacity(dev->disk, 0);
+	set_disk_ro(dev->disk, 1);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) \
+	|| RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+	ret = add_disk(dev->disk);
+	if (ret != 0) {
+		// previously a silent failure; report it before unwinding
+		pr_err("dnbd%d: add_disk failed\n", minor);
+		goto out_cleanup_queue;
+	}
+#else
+	add_disk(dev->disk);
+#endif
+
+	// set up sysfs
+	dnbd3_sysfs_init(dev);
+
+	return 0;
+
+out_cleanup_queue:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+	blk_cleanup_queue(dev->queue);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+	&& !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+	blk_cleanup_disk(dev->disk);
+#else
+	put_disk(dev->disk);
+#endif
+out_cleanup_tags:
+	blk_mq_free_tag_set(&dev->tag_set);
+out:
+	mutex_destroy(&dev->alt_servers_lock);
+	return ret;
+}
+
+/*
+ * Shut down and release one dnbd3 device.
+ *
+ * Yields via schedule() until the connection lock flag could be taken,
+ * then closes the device, calls dnbd3_sysfs_exit(), unregisters the disk
+ * and frees disk, queue, tag set and the alt_servers mutex.
+ *
+ * Always returns 0.
+ */
+int dnbd3_blk_del_device(dnbd3_device_t *dev)
+{
+	/* the flag is also taken by the ioctl handler, so wait until it is free */
+	for (;;) {
+		if (dnbd3_flag_get(dev->connection_lock))
+			break;
+		schedule();
+	}
+
+	dnbd3_close_device(dev);
+	dnbd3_sysfs_exit(dev);
+	del_gendisk(dev->disk);
+
+	/* which calls release disk and queue depends on the kernel version */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+	blk_cleanup_queue(dev->queue);
+	put_disk(dev->disk);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+	&& !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+	blk_cleanup_disk(dev->disk);
+#else
+	put_disk(dev->disk);
+#endif
+
+	blk_mq_free_tag_set(&dev->tag_set);
+	mutex_destroy(&dev->alt_servers_lock);
+	return 0;
+}
+
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev)
+{
+ struct request *blk_request;
+ unsigned long flags;
+ struct list_head local_copy; // staging list, so recv_queue_lock and send_queue_lock are never held together
+ int count = 0; // number of requests moved, only used for the log message
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- spin_unlock_irq(q->queue_lock);
- wake_up(&dev->process_queue_send);
- spin_lock_irq(q->queue_lock);
+ INIT_LIST_HEAD(&local_copy);
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy); // NOTE(review): head insertion, staged order is the reverse of recv_queue - confirm this is intended
+ count++;
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ list_add_tail(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
}
+ // Do this even if we didn't move anything from the recv list to the send
+ // list. It might have already contained something, which needs to be
+ // re-requested anyways if this was called because of a server switch.
+ spin_lock_irqsave(&dev->blk_lock, flags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, flags);
}
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
{
- struct request *blk_request, *tmp_request;
- struct request *blk_request2, *tmp_request2;
+ struct request *blk_request;
unsigned long flags;
struct list_head local_copy;
- int dup;
+ int count = 0; // number of requests collected from both queues, only used for the log message
+
INIT_LIST_HEAD(&local_copy);
- spin_lock_irqsave(&dev->blk_lock, flags);
- while (!list_empty(&dev->request_queue_receive))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- while (!list_empty(&dev->request_queue_send))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_send, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ while (!list_empty(&dev->send_queue)) {
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- list_for_each_entry_safe(blk_request, tmp_request, &local_copy, queuelist)
- {
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
list_del_init(&blk_request->queuelist);
- if (dnbd3_req_fs(blk_request))
- {
- spin_lock_irqsave(&dev->blk_lock, flags);
- __blk_end_request_all(blk_request, -EIO);
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- }
- else if (dnbd3_req_special(blk_request))
- {
- kfree(blk_request);
- }
+ blk_mq_end_request(blk_request, BLK_STS_IOERR); // completed with an I/O error after both queue locks were released
}
}
diff --git a/src/kernel/blk.h b/src/kernel/blk.h
index 5091d19..c6dcb8d 100644
--- a/src/kernel/blk.h
+++ b/src/kernel/blk.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,22 +22,17 @@
#ifndef BLK_H_
#define BLK_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define REQ_TYPE_SPECIAL REQ_TYPE_DRV_PRIV
-#endif
-
-extern struct block_device_operations dnbd3_blk_ops;
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
-
-void dnbd3_blk_request(struct request_queue *q);
+// The device has been set up via IOCTL_OPEN and hasn't been closed yet
+#define device_active(dev) ((dev)->reported_size != 0)
int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor);
int dnbd3_blk_del_device(dnbd3_device_t *dev);
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev);
+
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev);
#endif /* BLK_H_ */
diff --git a/src/kernel/core.c b/src/kernel/core.c
deleted file mode 100644
index 69a2540..0000000
--- a/src/kernel/core.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include "clientconfig.h"
-#include "dnbd3.h"
-#include "blk.h"
-
-int major;
-static unsigned int max_devs = NUMBER_DEVICES;
-static dnbd3_device_t *dnbd3_device;
-
-static int __init dnbd3_init(void)
-{
- int i;
-
- dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL);
- if (!dnbd3_device)
- return -ENOMEM;
-
- // initialize block device
- if ((major = register_blkdev(0, "dnbd3")) == 0)
- {
- printk("ERROR: dnbd3 register_blkdev failed.\n");
- return -EIO;
- }
-
- printk("DNBD3 kernel module loaded. Machine type: " ENDIAN_MODE "\n");
-
- // add MAX_NUMBER_DEVICES devices
- for (i = 0; i < max_devs; i++)
- {
- if (dnbd3_blk_add_device(&dnbd3_device[i], i) != 0)
- {
- printk("ERROR: adding device failed.\n");
- return -EIO; // TODO: delete all devices added so far. it could happen that it's not the first one that fails. also call unregister_blkdev and free memory
- }
- }
-
- printk("INFO: dnbd3 init successful (%i devices).\n", max_devs);
- return 0;
-}
-
-static void __exit dnbd3_exit(void)
-{
- int i;
-
- for (i = 0; i < max_devs; i++)
- {
- dnbd3_blk_del_device(&dnbd3_device[i]);
- }
-
- unregister_blkdev(major, "dnbd3");
- kfree(dnbd3_device);
- printk("INFO: dnbd3 exit.\n");
-}
-
-module_init( dnbd3_init);
-module_exit( dnbd3_exit);
-
-MODULE_DESCRIPTION("Distributed Network Block Device 3");
-MODULE_LICENSE("GPL");
-
-module_param(max_devs, int, 0444);
-MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h
deleted file mode 100644
index f8af69f..0000000
--- a/src/kernel/dnbd3.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef DNBD_H_
-#define DNBD_H_
-
-#include <linux/version.h>
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <net/sock.h>
-
-#define KERNEL_MODULE
-#include "config.h"
-#include "types.h"
-#include "serialize.h"
-
-extern int major;
-
-typedef struct
-{
- dnbd3_host_t host;
- unsigned long rtts[4]; // Last four round trip time measurements in µs
- uint16_t protocol_version; // dnbd3 protocol version of this server
- uint8_t failures; // How many times the server was unreachable
-} dnbd3_server_t;
-
-typedef struct
-{
- // block
- struct gendisk *disk;
- spinlock_t blk_lock;
-
- // sysfs
- struct kobject kobj;
-
- // network
- char *imgname;
- struct socket *sock;
- dnbd3_server_t cur_server, initial_server;
- unsigned long cur_rtt;
- serialized_buffer_t payload_buffer;
- dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers
- int new_servers_num; // number of new alt servers that are waiting to be copied to above array
- dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers
- uint8_t discover, panic, disconnecting, update_available, panic_count;
- uint8_t use_server_provided_alts;
- uint16_t rid;
- uint32_t heartbeat_count;
- uint64_t reported_size;
- // server switch
- struct socket *better_sock;
-
- // process
- struct task_struct * thread_send;
- struct task_struct * thread_receive;
- struct task_struct *thread_discover;
- struct timer_list hb_timer;
- wait_queue_head_t process_queue_send;
- wait_queue_head_t process_queue_receive;
- wait_queue_head_t process_queue_discover;
- struct list_head request_queue_send;
- struct list_head request_queue_receive;
-
-} dnbd3_device_t;
-
-#endif /* DNBD_H_ */
diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c
new file mode 100644
index 0000000..cb42567
--- /dev/null
+++ b/src/kernel/dnbd3_main.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <dnbd3/config/client.h>
+#include <dnbd3/version.h>
+#include <net/ipv6.h>
+#include "dnbd3_main.h"
+#include "blk.h"
+
+int major;
+static unsigned int max_devs = NUMBER_DEVICES;
+static dnbd3_device_t *dnbd3_devices;
+
+struct device *dnbd3_device_to_dev(dnbd3_device_t *dev)
+{
+ return disk_to_dev(dev->disk);
+}
+
+int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest)
+{
+ struct sockaddr_in *sin4;
+ struct sockaddr_in6 *sin6;
+
+ memset(dest, 0, sizeof(*dest));
+ if (host->type == HOST_IP4) {
+ sin4 = (struct sockaddr_in *)dest;
+ sin4->sin_family = AF_INET;
+ memcpy(&(sin4->sin_addr), host->addr, 4);
+ sin4->sin_port = host->port;
+ } else if (host->type == HOST_IP6) {
+ sin6 = (struct sockaddr_in6 *)dest;
+ sin6->sin6_family = AF_INET6;
+ memcpy(&(sin6->sin6_addr), host->addr, 16);
+ sin6->sin6_port = host->port;
+ } else
+ return 0;
+ return 1;
+}
+
+int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y)
+{
+ if (x->ss_family != y->ss_family)
+ return 0;
+ switch (x->ss_family) {
+ case AF_INET: {
+ const struct sockaddr_in *sinx = (const struct sockaddr_in *)x;
+ const struct sockaddr_in *siny = (const struct sockaddr_in *)y;
+
+ if (sinx->sin_port != siny->sin_port)
+ return 0;
+ if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x;
+ const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y;
+
+ if (sinx->sin6_port != siny->sin6_port)
+ return 0;
+ if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+ return 0;
+ break;
+ }
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Get a free slot pointer from the alt_servers list. Tries to find an
+ * entirely empty slot first, then looks for a slot with a server that
+ * wasn't reachable recently, finally returns NULL if none of the
+ * conditions match.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+static dnbd3_alt_server_t *get_free_alt_server(dnbd3_device_t *const dev)
+{
+ int i;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
+ return &dev->alt_servers[i];
+ }
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].failures > 10)
+ return &dev->alt_servers[i];
+ }
+ return NULL;
+}
+
+dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev)
+{
+ int i;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (is_same_server(addr, &dev->alt_servers[i].host))
+ return &dev->alt_servers[i];
+ }
+ return NULL;
+}
+
+/**
+ * Returns pointer to existing entry in alt_servers that matches the given
+ * alt server, or NULL if not found.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev)
+{
+ struct sockaddr_storage addr;
+
+ if (!dnbd3_host_to_sockaddr(host, &addr))
+ return NULL;
+ return get_existing_alt_from_addr(&addr, dev);
+}
+
+int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+ int result;
+ dnbd3_alt_server_t *alt_server;
+
+ if (host->type != HOST_IP4 && host->type != HOST_IP6)
+ return -EINVAL;
+
+ /* protect access to 'alt_servers' */
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(host, dev);
+ // ADD
+ if (alt_server != NULL) {
+ // Exists
+ result = -EEXIST;
+ } else {
+ // OK add
+ alt_server = get_free_alt_server(dev);
+ if (alt_server == NULL) {
+ result = -ENOSPC;
+ } else {
+ dnbd3_host_to_sockaddr(host, &alt_server->host);
+ alt_server->protocol_version = 0;
+ alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2]
+ = alt_server->rtts[3] = RTT_UNREACHABLE;
+ alt_server->failures = 0;
+ alt_server->best_count = 0;
+ result = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ return result;
+}
+
+int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+ dnbd3_alt_server_t *alt_server;
+ int result;
+
+ /* protect access to 'alt_servers' */
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(host, dev);
+ // REMOVE
+ if (alt_server == NULL) {
+ // Not found
+ result = -ENOENT;
+ } else {
+ // Remove
+ alt_server->host.ss_family = 0;
+ result = 0;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ return result;
+}
+
+static int __init dnbd3_init(void)
+{
+ int i;
+
+ dnbd3_devices = kcalloc(max_devs, sizeof(*dnbd3_devices), GFP_KERNEL);
+ if (!dnbd3_devices)
+ return -ENOMEM;
+
+ // initialize block device
+ major = register_blkdev(0, "dnbd3");
+ if (major == 0) {
+ pr_err("register_blkdev failed\n");
+ return -EIO;
+ }
+
+ pr_info("kernel module in version %s loaded\n", DNBD3_VERSION);
+ pr_debug("machine type %s\n", DNBD3_ENDIAN_MODE);
+
+ // add max_devs devices
+ for (i = 0; i < max_devs; i++) {
+ if (dnbd3_blk_add_device(&dnbd3_devices[i], i) != 0) {
+ pr_err("dnbd3_blk_add_device failed\n");
+ // TODO: delete all devices added so far.
+ // It could happen that it's not the first one that fails.
+ // Also call unregister_blkdev and free memory.
+ return -EIO;
+ }
+ }
+
+ pr_info("init successful (%i devices)\n", max_devs);
+
+ return 0;
+}
+
+static void __exit dnbd3_exit(void)
+{
+ int i;
+
+ pr_debug("exiting kernel module...\n");
+ for (i = 0; i < max_devs; i++)
+ dnbd3_blk_del_device(&dnbd3_devices[i]);
+
+ unregister_blkdev(major, "dnbd3");
+ kfree(dnbd3_devices);
+
+ pr_info("exit kernel module done\n");
+}
+
+module_init(dnbd3_init);
+module_exit(dnbd3_exit);
+
+MODULE_DESCRIPTION("Distributed Network Block Device 3");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DNBD3_VERSION);
+
+module_param(max_devs, int, 0444);
+MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h
new file mode 100644
index 0000000..a932ba2
--- /dev/null
+++ b/src/kernel/dnbd3_main.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef DNBD_H_
+#define DNBD_H_
+
+#include <dnbd3/config/client.h>
+
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/blk-mq.h>
+
+#if defined(RHEL_RELEASE_CODE) && defined(RHEL_RELEASE_VERSION)
+#define RHEL_CHECK_VERSION(CONDITION) (CONDITION)
+#else
+#define RHEL_CHECK_VERSION(CONDITION) (0)
+#endif
+
+extern int major;
+
+typedef struct {
+ unsigned long rtts[DISCOVER_HISTORY_SIZE]; // Last X round trip time measurements in µs
+ uint16_t protocol_version; // dnbd3 protocol version of this server
+ uint8_t failures; // How many times the server was unreachable
+ uint8_t best_count; // Number of times server measured best
+ struct sockaddr_storage host; // Address of server
+} dnbd3_alt_server_t;
+
+typedef struct {
+ // block
+ int index;
+ struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
+ struct request_queue *queue;
+ spinlock_t blk_lock;
+
+ // sysfs
+ struct kobject kobj;
+
+ char *imgname;
+ uint16_t rid;
+ struct socket *sock;
+ struct { // use blk_lock
+ unsigned long rtt;
+ struct sockaddr_storage host;
+ uint16_t protocol_version;
+ } cur_server;
+ serialized_buffer_t payload_buffer;
+ struct mutex alt_servers_lock;
+ dnbd3_alt_server_t alt_servers[NUMBER_SERVERS];
+ bool use_server_provided_alts;
+ bool panic;
+ u8 panic_count;
+ bool update_available;
+ atomic_t connection_lock;
+ // Size of image/device - this is 0 if the device is not in use,
+ // otherwise this is also the value we expect from alt servers.
+ uint64_t reported_size;
+ struct delayed_work keepalive_work;
+
+ // sending
+ struct workqueue_struct *send_wq;
+ spinlock_t send_queue_lock;
+ struct list_head send_queue;
+ struct mutex send_mutex;
+ struct work_struct send_work;
+ // receiving
+ struct workqueue_struct *recv_wq;
+ spinlock_t recv_queue_lock;
+ struct list_head recv_queue;
+ struct mutex recv_mutex;
+ struct work_struct recv_work;
+ // discover
+ atomic_t discover_running;
+ struct delayed_work discover_work;
+ u32 discover_interval;
+ u32 discover_count;
+
+} dnbd3_device_t;
+
+struct dnbd3_cmd {
+ u64 handle;
+};
+
+extern inline struct device *dnbd3_device_to_dev(dnbd3_device_t *dev);
+
+extern inline int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y);
+
+extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev);
+
+extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
+extern int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
+#define dnbd3_flag_get(x) (atomic_cmpxchg(&(x), 0, 1) == 0)
+#define dnbd3_flag_reset(x) atomic_set(&(x), 0)
+#define dnbd3_flag_taken(x) (atomic_read(&(x)) != 0)
+
+/*
+ * shims for making older kernels look like the current one, if possible, to avoid too
+ * much inline #ifdef which makes code harder to read.
+ */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define BLK_EH_DONE BLK_EH_NOT_HANDLED
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)
+#define blk_status_t int
+#define BLK_STS_OK 0
+#define BLK_STS_IOERR (-EIO)
+#define BLK_STS_TIMEOUT (-ETIME)
+#define BLK_STS_NOTSUPP (-ENOTSUPP)
+#endif
+
+#endif /* DNBD_H_ */
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 9e48b86..5ef4016 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,1106 +19,1112 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "net.h"
#include "blk.h"
-#include "utils.h"
+#include "dnbd3_main.h"
-#include "serialize.h"
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/random.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
+#define get_random_u32 prandom_u32
+#endif
#include <linux/time.h>
-#include <linux/signal.h>
+#include <linux/ktime.h>
+#include <linux/tcp.h>
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
-#else
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern((af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
+#ifndef ktime_to_s
+#define ktime_to_s(kt) ktime_divns(kt, NSEC_PER_SEC)
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-// cmd_flags and cmd_type are merged into cmd_flags now
-#if REQ_FLAG_BITS > 24
-#error "Fix CMD bitshift"
-#endif
-// Pack into cmd_flags field by shifting CMD_* into unused bits of cmd_flags
-#define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = REQ_OP_DRV_IN | ((cmd) << REQ_FLAG_BITS)
-#define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS)
-#define dnbd3_req_op(req) req_op(req)
-#define DNBD3_DEV_READ REQ_OP_READ
-#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN
+#ifdef DEBUG
+#define ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+ } while (0)
#else
-// Old way with type and flags separated
-#define dnbd3_cmd_to_priv(req, cmd) do { \
- (req)->cmd_type = REQ_TYPE_SPECIAL; \
- (req)->cmd_flags = (cmd); \
-} while (0)
-#define dnbd3_priv_to_cmd(req) (req)->cmd_flags
-#define dnbd3_req_op(req) (req)->cmd_type
-#define DNBD3_DEV_READ REQ_TYPE_FS
-#define DNBD3_REQ_OP_SPECIAL REQ_TYPE_SPECIAL
+#define ASSERT(x) \
+ do { \
+ } while (0)
#endif
-/**
- * Some macros for easier debug output. Location in source-code
- * as well as server IP:port info will be printed.
- * The error_* macros include a "goto error;" at the end
- */
-#if 1 // Change to 0 to disable debug messages
-#define debug_print_va_host(_host, _fmt, ...) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " _fmt " (%s, %pI4:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " _fmt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_va_host(_host, _fmt, ...) do { \
- debug_print_va_host(_host, _fmt, __VA_ARGS__); \
- goto error; \
-} while(0)
-#define debug_dev_va(_fmt, ...) debug_print_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define error_dev_va(_fmt, ...) debug_error_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define debug_alt_va(_fmt, ...) debug_print_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-#define error_alt_va(_fmt, ...) debug_error_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-
-#define debug_print_host(_host, txt) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " txt " (%s, %pI4:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " txt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_host(_host, txt) do { \
- debug_print_host(_host, txt); \
- goto error; \
-} while(0)
-#define debug_dev(txt) debug_print_host(dev->cur_server.host, txt)
-#define error_dev(txt) debug_error_host(dev->cur_server.host, txt)
-#define debug_alt(txt) debug_print_host(dev->alt_servers[i].host, txt)
-#define error_alt(txt) debug_error_host(dev->alt_servers[i].host, txt)
-
-#else // Silent
-#define debug_dev(x) do { } while(0)
-#define error_dev(x) goto error
-#define debug_dev_va(x, ...) do { } while(0)
-#define error_dev_va(x, ...) goto error
-#define debug_alt(x) do { } while(0)
-#define error_alt(x) goto error
-#define debug_alt_va(x, ...) do { } while(0)
-#define error_alt_va(x, ...) goto error
-#endif
+#define dnbd3_dev_dbg_host(dev, host, fmt, ...) \
+ dev_dbg(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_info_host(dev, host, fmt, ...) \
+ dev_info(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_err_host(dev, host, fmt, ...) \
+ dev_err(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
-static inline int is_same_server(const dnbd3_server_t * const a, const dnbd3_server_t * const b)
-{
- return (a->host.type == b->host.type) && (a->host.port == b->host.port)
- && (0 == memcmp(a->host.addr, b->host.addr, (a->host.type == HOST_IP4 ? 4 : 16)));
-}
+#define dnbd3_dev_dbg_cur(dev, fmt, ...) \
+ dnbd3_dev_dbg_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_info_cur(dev, fmt, ...) \
+ dnbd3_dev_info_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_err_cur(dev, fmt, ...) \
+ dnbd3_dev_err_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
-static inline dnbd3_server_t *get_existing_server(const dnbd3_server_entry_t * const newserver,
- dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if ((newserver->host.type == dev->alt_servers[i].host.type)
- && (newserver->host.port == dev->alt_servers[i].host.port)
- && (0
- == memcmp(newserver->host.addr, dev->alt_servers[i].host.addr, (newserver->host.type == HOST_IP4 ? 4 : 16))))
- {
- return &dev->alt_servers[i];
- break;
- }
- }
- return NULL ;
-}
-
-static inline dnbd3_server_t *get_free_alt_server(dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == 0)
- return &dev->alt_servers[i];
- }
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].failures > 10)
- return &dev->alt_servers[i];
- }
- return NULL ;
-}
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes);
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count);
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr);
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size);
-int dnbd3_net_connect(dnbd3_device_t *dev)
-{
- struct request *req1 = NULL;
- struct timeval timeout;
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, u16 protocol_version);
- if (dev->disconnecting) {
- debug_dev("CONNECT: Still disconnecting!!!\n");
- while (dev->disconnecting)
- schedule();
- }
- if (dev->thread_receive != NULL) {
- debug_dev("CONNECT: Still receiving!!!\n");
- while (dev->thread_receive != NULL)
- schedule();
- }
- if (dev->thread_send != NULL) {
- debug_dev("CONNECT: Still sending!!!\n");
- while (dev->thread_send != NULL)
- schedule();
- }
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket **sock_out);
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA;
- timeout.tv_usec = 0;
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_image_info);
- // do some checks before connecting
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket *sock);
- req1 = kmalloc(sizeof(*req1), GFP_ATOMIC );
- if (!req1)
- error_dev("FATAL: Kmalloc(1) failed.");
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd);
- if (dev->cur_server.host.port == 0 || dev->cur_server.host.type == 0 || dev->imgname == NULL )
- error_dev("FATAL: Host, port or image name not set.");
- if (dev->sock)
- error_dev("ERROR: Already connected.");
-
- if (dev->cur_server.host.type != HOST_IP4 && dev->cur_server.host.type != HOST_IP6)
- error_dev_va("ERROR: Unknown address type %d", (int)dev->cur_server.host.type);
-
- debug_dev("INFO: Connecting...");
-
- if (dev->better_sock == NULL )
- {
- // no established connection yet from discovery thread, start new one
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov[2];
- uint16_t rid;
- char *name;
- int mlen;
- init_msghdr(msg);
-
- if (dnbd3_sock_create(dev->cur_server.host.type, SOCK_STREAM, IPPROTO_TCP, &dev->sock) < 0)
- error_dev("ERROR: Couldn't create socket (v6).");
-
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- dev->sock->sk->sk_allocation = GFP_NOIO;
- if (dev->cur_server.host.type == HOST_IP4)
- {
- struct sockaddr_in sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- memcpy(&(sin.sin_addr), dev->cur_server.host.addr, 4);
- sin.sin_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v4)");
- }
- else
- {
- struct sockaddr_in6 sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin6_family = AF_INET6;
- memcpy(&(sin.sin6_addr), dev->cur_server.host.addr, 16);
- sin.sin6_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v6)");
- }
- // Request filesize
- dnbd3_request.magic = dnbd3_packet_magic;
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(&dev->payload_buffer);
- serializer_put_uint16(&dev->payload_buffer, PROTOCOL_VERSION);
- serializer_put_string(&dev->payload_buffer, dev->imgname);
- serializer_put_uint16(&dev->payload_buffer, dev->rid);
- serializer_put_uint8(&dev->payload_buffer, 0); // is_server = false
- iov[1].iov_base = &dev->payload_buffer;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&dev->payload_buffer);
- fixup_request(dnbd3_request);
- mlen = sizeof(dnbd3_request) + iov[1].iov_len;
- if (kernel_sendmsg(dev->sock, &msg, iov, 2, mlen) != mlen)
- error_dev("ERROR: Couldn't send CMD_SIZE_REQUEST.");
- // receive reply header
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_dev("FATAL: Received corrupted reply header after CMD_SIZE_REQUEST.");
- // check reply header
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 3 || dnbd3_reply.size > MAX_PAYLOAD
- || dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("FATAL: Received invalid reply to CMD_SIZE_REQUEST, image doesn't exist on server.");
- // receive reply payload
- iov[0].iov_base = &dev->payload_buffer;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_dev("FATAL: Cold not read CMD_SELECT_IMAGE payload on handshake.");
- // handle/check reply payload
- serializer_reset_read(&dev->payload_buffer, dnbd3_reply.size);
- dev->cur_server.protocol_version = serializer_get_uint16(&dev->payload_buffer);
- if (dev->cur_server.protocol_version < MIN_SUPPORTED_SERVER)
- error_dev("FATAL: Server version is lower than min supported version.");
- name = serializer_get_string(&dev->payload_buffer);
- if (dev->rid != 0 && strcmp(name, dev->imgname) != 0)
- error_dev_va("FATAL: Server offers image '%s', requested '%s'", name, dev->imgname);
- if (strlen(dev->imgname) < strlen(name))
- {
- dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_ATOMIC );
- if (dev->imgname == NULL )
- error_dev("FATAL: Reallocating buffer for new image name failed");
- }
- strcpy(dev->imgname, name);
- rid = serializer_get_uint16(&dev->payload_buffer);
- if (dev->rid != 0 && dev->rid != rid)
- error_dev_va("FATAL: Server provides rid %d, requested was %d.", (int)rid, (int)dev->rid);
- dev->rid = rid;
- dev->reported_size = serializer_get_uint64(&dev->payload_buffer);
- if (dev->reported_size < 4096)
- error_dev("ERROR: Reported size by server is < 4096");
- // store image information
- set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
- debug_dev_va("INFO: Filesize: %llu.", dev->reported_size);
- dev->update_available = 0;
- }
- else // Switching server, connection is already established and size request was executed
- {
- debug_dev("INFO: On-the-fly server change.");
- dev->sock = dev->better_sock;
- dev->better_sock = NULL;
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- }
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic);
- dev->panic = 0;
- dev->panic_count = 0;
+static void dnbd3_discover(dnbd3_device_t *dev);
- // Enqueue request to request_queue_send for a fresh list of alt servers
- dnbd3_cmd_to_priv(req1, CMD_GET_SERVERS);
- list_add(&req1->queuelist, &dev->request_queue_send);
+static void dnbd3_internal_discover(dnbd3_device_t *dev);
- // create required threads
- dev->thread_send = kthread_create(dnbd3_net_send, dev, dev->disk->disk_name);
- dev->thread_receive = kthread_create(dnbd3_net_receive, dev, dev->disk->disk_name);
- dev->thread_discover = kthread_create(dnbd3_net_discover, dev, dev->disk->disk_name);
- // start them up
- wake_up_process(dev->thread_send);
- wake_up_process(dev->thread_receive);
- wake_up_process(dev->thread_discover);
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms);
- wake_up(&dev->process_queue_send);
+// Use as write-only dump, don't care about race conditions etc.
+static u8 __garbage_mem[PAGE_SIZE];
- // add heartbeat timer
- dev->heartbeat_count = 0;
+/**
+ * Delayed work triggering sending of keepalive packet.
+ */
+static void dnbd3_keepalive_workfn(struct work_struct *work)
+{
+ unsigned long irqflags;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, keepalive_work.work);
-// init_timer_key changed from kernel version 4.14 to 4.15, see and compare to 4.15:
-// https://elixir.bootlin.com/linux/v4.14.32/source/include/linux/timer.h#L98
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
- timer_setup(&dev->hb_timer, dnbd3_net_heartbeat, 0);
-#else
- // Old timer setup
- init_timer(&dev->hb_timer);
- dev->hb_timer.data = (unsigned long)dev;
- dev->hb_timer.function = dnbd3_net_heartbeat;
-#endif
- dev->hb_timer.expires = jiffies + HZ;
- add_timer(&dev->hb_timer);
- return 0;
- error: ;
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ dnbd3_send_empty_request(dev, CMD_KEEPALIVE);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->keepalive_work, KEEPALIVE_INTERVAL * HZ);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
- if (req1)
- kfree(req1);
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-int dnbd3_net_disconnect(dnbd3_device_t *dev)
+/**
+ * Delayed work triggering discovery (alt server check)
+ */
+static void dnbd3_discover_workfn(struct work_struct *work)
{
- if (dev->disconnecting)
- return 0;
-
- if (dev->cur_server.host.port)
- debug_dev("INFO: Disconnecting device.");
-
- dev->disconnecting = 1;
-
- // clear heartbeat timer
- del_timer(&dev->hb_timer);
-
- dev->discover = 0;
-
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
-
- // kill sending and receiving threads
- if (dev->thread_send)
- {
- kthread_stop(dev->thread_send);
- }
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, discover_work.work);
- if (dev->thread_receive)
- {
- kthread_stop(dev->thread_receive);
- }
+ dnbd3_discover(dev);
+}
- if (dev->thread_discover)
- {
- kthread_stop(dev->thread_discover);
- dev->thread_discover = NULL;
- }
+/**
+ * For manually triggering an immediate discovery
+ */
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic)
+{
+ unsigned long irqflags;
- // clear socket
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ if (!device_active(dev))
+ return;
+ if (panic && dnbd3_flag_get(dev->connection_lock)) {
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (!dev->panic) {
+ // Panic freshly turned on
+ dev->panic = true;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_PANIC;
+ }
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->connection_lock);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
-
- dev->disconnecting = 0;
-
- return 0;
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, 1);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg)
-{
- dnbd3_device_t *dev = (dnbd3_device_t *)container_of(arg, dnbd3_device_t, hb_timer);
-#else
-void dnbd3_net_heartbeat(unsigned long arg)
+/**
+ * Wrapper for the actual discover function below. Checks the run conditions
+ * and re-schedules the delayed work afterwards.
+ */
+static void dnbd3_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = (dnbd3_device_t *)arg;
-#endif
- // Because different events need different intervals, the timer is called once a second.
- // Other intervals can be derived using dev->heartbeat_count.
-#define timeout_seconds(x) (dev->heartbeat_count % (x) == 0)
-
- if (!dev->panic)
- {
- if (timeout_seconds(TIMER_INTERVAL_KEEPALIVE_PACKET))
- {
- struct request *req = kmalloc(sizeof(struct request), GFP_ATOMIC );
- // send keepalive
- if (req)
- {
- dnbd3_cmd_to_priv(req, CMD_KEEPALIVE);
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- wake_up(&dev->process_queue_send);
- }
- else
- {
- debug_dev("ERROR: Couldn't create keepalive request.");
- }
- }
- if ((dev->heartbeat_count > STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_NORMAL))
- || (dev->heartbeat_count <= STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_STARTUP)))
- {
- // Normal discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ unsigned long irqflags;
+
+ if (!device_active(dev) || dnbd3_flag_taken(dev->connection_lock))
+ return; // device not active anymore, or just about to switch
+ if (!dnbd3_flag_get(dev->discover_running))
+ return; // Already busy
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ cancel_delayed_work(&dev->discover_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_internal_discover(dev);
+ dev->discover_count++;
+ // Re-queueing logic
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, dev->discover_interval * HZ);
+ if (dev->discover_interval < TIMER_INTERVAL_PROBE_MAX
+ && dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT) {
+ dev->discover_interval += 2;
}
}
- else if (timeout_seconds(TIMER_INTERVAL_PROBE_PANIC))
- {
- // Panic discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
-
- dev->hb_timer.expires = jiffies + HZ;
-
- ++dev->heartbeat_count;
- add_timer(&dev->hb_timer);
-#undef timeout_seconds
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->discover_running);
}
-int dnbd3_net_discover(void *data)
+/**
+ * Discovery. Probe all (or some) known alt servers,
+ * and initiate connection switch if appropriate.
+ *
+ * The alt server slots are visited in shuffled order. For every server that
+ * is probed, a connection is opened, the handshake is executed and a test
+ * block request is timed. The measured time is stored in the rtt history of
+ * the slot, and the server with the lowest averaged rtt is remembered, with
+ * its socket kept open for a possible switch.
+ *
+ * In panic mode, the first server that passes the handshake becomes the
+ * primary connection, provided the connection lock flag can be acquired.
+ * Otherwise a switch is only considered once discover_count exceeds
+ * DISCOVER_STARTUP_PHASE_COUNT, the best server differs from the current
+ * one, and RTT_THRESHOLD_FACTOR(current rtt) exceeds the best rtt plus the
+ * computed threshold.
+ *
+ * If no server could be reached while in panic mode, panic_count is
+ * increased and, once it equals PROBE_COUNT_TIMEOUT + 1 (with a non-zero
+ * PROBE_COUNT_TIMEOUT), dnbd3_blk_fail_all_requests() is called to report
+ * the error to the block layer.
+ */
+static void dnbd3_internal_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = data;
- struct sockaddr_in sin4;
- struct sockaddr_in6 sin6;
struct socket *sock, *best_sock = NULL;
+ dnbd3_alt_server_t *alt;
+ struct sockaddr_storage host_compare, best_server;
+ uint16_t remote_version;
+ ktime_t start, end;
+ unsigned long rtt = 0, best_rtt = 0;
+ int i, j, k, isize, fails, rtt_threshold;
+ int do_change = 0;
+ u8 check_order[NUMBER_SERVERS];
+ const bool ready = dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT; // true once the startup phase count has been exceeded
+ const u32 turn = dev->discover_count % DISCOVER_HISTORY_SIZE; // index into the rtt history that this run writes
+
+ // Shuffle alt_servers: start with the identity order, then swap every position with a randomly chosen one
+ for (i = 0; i < NUMBER_SERVERS; ++i)
+ check_order[i] = i;
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- dnbd3_server_t *alt_server;
- struct msghdr msg;
- struct kvec iov[2];
-
- char *buf, *name;
- serialized_buffer_t *payload;
- uint64_t filesize;
- uint16_t rid;
-
- struct timeval start, end;
- unsigned long rtt, best_rtt = 0;
- unsigned long irqflags;
- int i, j, isize, best_server, current_server;
- int turn = 0;
- int ready = 0, do_change = 0;
- char check_order[NUMBER_SERVERS];
- int mlen;
-
- struct request *last_request = (struct request *)123, *cur_request = (struct request *)456;
-
- struct timeval timeout;
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DISCOVERY;
- timeout.tv_usec = 0;
-
- memset(&sin4, 0, sizeof(sin4));
- memset(&sin6, 0, sizeof(sin6));
-
- init_msghdr(msg);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ j = get_random_u32() % NUMBER_SERVERS;
+ if (j != i) {
+ int tmp = check_order[i];
- buf = kmalloc(4096, GFP_KERNEL);
- if (!buf)
- {
- debug_dev("FATAL: Kmalloc failed (discover)");
- return -1;
+ check_order[i] = check_order[j];
+ check_order[j] = tmp;
+ }
}
- payload = (serialized_buffer_t *)buf; // Reuse this buffer to save kernel mem
- dnbd3_request.magic = dnbd3_packet_magic;
+ best_server.ss_family = 0;
+ best_rtt = RTT_UNREACHABLE;
- for (i = 0; i < NUMBER_SERVERS; ++i) {
- check_order[i] = i;
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_discover,
- kthread_should_stop() || dev->discover || dev->thread_discover == NULL);
+ if (!ready || dev->panic)
+ isize = NUMBER_SERVERS; // startup phase or panic: all slots may be probed
+ else
+ isize = 3; // otherwise probe 3 servers, plus the current one (see the isize check below)
- if (kthread_should_stop() || dev->imgname == NULL || dev->thread_discover == NULL )
+ for (j = 0; j < NUMBER_SERVERS; ++j) {
+ if (!device_active(dev))
break;
+ i = check_order[j];
+ mutex_lock(&dev->alt_servers_lock);
+ host_compare = dev->alt_servers[i].host; // copy taken under the lock; the probing below runs without it
+ fails = dev->alt_servers[i].failures;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (host_compare.ss_family == 0)
+ continue; // Empty slot
+ // Reduced probability for hosts that have been unreachable (more than 50 failures: skipped in 3 of 4 cases)
+ if (!dev->panic && fails > 50 && (get_random_u32() % 4) != 0)
+ continue; // If not in panic mode, skip server if it failed too many times
+ if (isize-- <= 0 && !is_same_server(&dev->cur_server.host, &host_compare))
+ continue; // Only test isize servers plus current server
+
+ // Initialize socket and connect
+ sock = NULL;
+ if (dnbd3_connect(dev, &host_compare, &sock) != 0)
+ goto error;
- if (!dev->discover)
- continue;
- dev->discover = 0;
-
- if (dev->reported_size < 4096)
- continue;
-
- // Check if the list of alt servers needs to be updated and do so if necessary
- if (dev->new_servers_num)
- {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- for (i = 0; i < dev->new_servers_num; ++i)
- {
- if (dev->new_servers[i].host.type != HOST_IP4 && dev->new_servers[i].host.type != HOST_IP6) // Invalid entry?
- continue;
- alt_server = get_existing_server(&dev->new_servers[i], dev);
- if (alt_server != NULL ) // Server already known
- {
- if (dev->new_servers[i].failures == 1)
- {
- // REMOVE request
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Removing alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Removing alt server %pI6", alt_server->host.addr);
- alt_server->host.type = 0;
- continue;
- }
- // ADD, so just reset fail counter
- alt_server->failures = 0;
- continue;
- }
- if (dev->new_servers[i].failures == 1) // REMOVE, but server is not in list anyways
- continue;
- alt_server = get_free_alt_server(dev);
- if (alt_server == NULL ) // All NUMBER_SERVERS slots are taken, ignore entry
- continue;
- // Add new server entry
- alt_server->host = dev->new_servers[i].host;
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Adding alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Adding alt server %pI6", alt_server->host.addr);
- alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = RTT_UNREACHABLE;
- alt_server->protocol_version = 0;
- alt_server->failures = 0;
- }
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
+ remote_version = 0;
+ if (!dnbd3_execute_handshake(dev, sock, &host_compare, &remote_version, false))
+ goto error;
- current_server = best_server = -1;
- best_rtt = 0xFFFFFFFul;
- if (dev->heartbeat_count < STARTUP_MODE_DURATION || dev->panic)
- {
- isize = NUMBER_SERVERS;
- }
- else
- {
- isize = 3;
- }
- if (NUMBER_SERVERS > isize) {
- for (i = 0; i < isize; ++i) {
- j = ((start.tv_sec >> i) ^ (start.tv_usec >> j)) % NUMBER_SERVERS;
- if (j != i) {
- mlen = check_order[i];
- check_order[i] = check_order[j];
- check_order[j] = mlen;
- }
+ // panic mode, take first responding server
+ if (dev->panic) {
+ dnbd3_dev_info_host(dev, &host_compare, "panic mode, changing to new server\n");
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_host(dev, &host_compare, "...raced, ignoring\n");
+ } else {
+ // Connection lock flag acquired (a failed get above means a connect was already in progress)
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000 + 1000);
+ if (dnbd3_set_primary_connection(dev, sock, &host_compare, remote_version) != 0)
+ sock_release(sock); // released here only on failure; presumably dev takes over the socket on success - confirm
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
}
}
- for (j = 0; j < NUMBER_SERVERS; ++j)
- {
- i = check_order[j];
- if (dev->alt_servers[i].host.type == 0) // Empty slot
- continue;
- if (!dev->panic && dev->alt_servers[i].failures > 50 && (start.tv_usec & 7) != 0) // If not in panic mode, skip server if it failed too many times
- continue;
- if (isize-- <= 0 && !is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- continue;
-
- // Initialize socket and connect
- if (dnbd3_sock_create(dev->alt_servers[i].host.type, SOCK_STREAM, IPPROTO_TCP, &sock) < 0)
- {
- debug_alt("ERROR: Couldn't create socket (discover).");
- sock = NULL;
- continue;
- }
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- sock->sk->sk_allocation = GFP_NOIO;
- if (dev->alt_servers[i].host.type == HOST_IP4)
- {
- sin4.sin_family = AF_INET;
- memcpy(&sin4.sin_addr, dev->alt_servers[i].host.addr, 4);
- sin4.sin_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin4, sizeof(sin4), 0) < 0)
- goto error;
- }
- else
- {
- sin6.sin6_family = AF_INET6;
- memcpy(&sin6.sin6_addr, dev->alt_servers[i].host.addr, 16);
- sin6.sin6_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin6, sizeof(sin6), 0) < 0)
- goto error;
- }
+ // actual rtt measurement is just the first block request and its reply
+ start = ktime_get_real();
+ if (!dnbd3_request_test_block(dev, &host_compare, sock))
+ goto error;
+ end = ktime_get_real();
- // Request filesize
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(payload);
- serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
- serializer_put_string(payload, dev->imgname); // image name
- serializer_put_uint16(payload, dev->rid); // revision id
- serializer_put_uint8(payload, 0); // are we a server? (no!)
- iov[1].iov_base = payload;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(payload);
- fixup_request(dnbd3_request);
- mlen = iov[1].iov_len + sizeof(dnbd3_request);
- if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen)
- error_alt("ERROR: Requesting image size failed.");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving image size packet (header) failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4)
- error_alt("ERROR: Content of image size packet (header) mismatched (discover).");
-
- // receive data
- iov[0].iov_base = payload;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_alt("ERROR: Receiving image size packet (payload) failed (discover).");
- serializer_reset_read(payload, dnbd3_reply.size);
-
- dev->alt_servers[i].protocol_version = serializer_get_uint16(payload);
- if (dev->alt_servers[i].protocol_version < MIN_SUPPORTED_SERVER)
- error_alt_va("ERROR: Server version too old (client: %d, server: %d, min supported: %d).",
- (int)PROTOCOL_VERSION, (int)dev->alt_servers[i].protocol_version, (int)MIN_SUPPORTED_SERVER);
-
- name = serializer_get_string(payload);
- if (name == NULL )
- error_alt("ERROR: Server did not supply an image name (discover).");
-
- if (strcmp(name, dev->imgname) != 0)
- error_alt_va("ERROR: Image name does not match requested one (client: '%s', server: '%s') (discover).",
- dev->imgname, name);
-
- rid = serializer_get_uint16(payload);
- if (rid != dev->rid)
- error_alt_va("ERROR: Server supplied wrong rid (client: '%d', server: '%d') (discover).",
- (int)dev->rid, (int)rid);
-
- filesize = serializer_get_uint64(payload);
- if (filesize != dev->reported_size)
- error_alt_va("ERROR: Reported image size of %llu does not match expected value %llu.(discover).",
- (unsigned long long)filesize, (unsigned long long)dev->reported_size);
-
- // panic mode, take first responding server
- if (dev->panic)
- {
- dev->panic = 0;
- debug_alt("WARN: Panic mode, changing server:");
- if (best_sock != NULL )
- sock_release(best_sock);
- dev->better_sock = sock; // Pass over socket to take a shortcut in *_connect();
- kfree(buf);
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[i], sizeof(dev->cur_server));
- dnbd3_net_connect(dev);
- return 0;
- }
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) { // re-check: only update if the slot still holds the probed host
+ dev->alt_servers[i].protocol_version = remote_version;
+ dev->alt_servers[i].rtts[turn] =
+ (unsigned long)ktime_us_delta(end, start);
- // Request block
- dnbd3_request.cmd = CMD_GET_BLOCK;
- // Do *NOT* pick a random block as it has proven to cause severe
- // cache thrashing on the server
- dnbd3_request.offset = 0;
- dnbd3_request.size = RTT_BLOCK_SIZE;
- fixup_request(dnbd3_request);
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
-
- // start rtt measurement
- do_gettimeofday(&start);
-
- if (kernel_sendmsg(sock, &msg, iov, 1, sizeof(dnbd3_request)) <= 0)
- error_alt("ERROR: Requesting test block failed (discover).");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving test block header packet failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic
- != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE)
- error_alt_va("ERROR: Unexpected reply to block request: cmd=%d, size=%d (discover).",
- (int)dnbd3_reply.cmd, (int)dnbd3_reply.size);
-
- // receive data
- iov[0].iov_base = buf;
- iov[0].iov_len = RTT_BLOCK_SIZE;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE)
- error_alt("ERROR: Receiving test block payload failed (discover).");
-
- do_gettimeofday(&end); // end rtt measurement
-
- dev->alt_servers[i].rtts[turn] = (unsigned long)((end.tv_sec - start.tv_sec) * 1000000ull
- + (end.tv_usec - start.tv_usec));
-
- rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2]
- + dev->alt_servers[i].rtts[3]) / 4;
-
- if (best_rtt > rtt)
- {
- // This one is better, keep socket open in case we switch
- best_rtt = rtt;
- best_server = i;
- if (best_sock != NULL )
- sock_release(best_sock);
- best_sock = sock;
- sock = NULL;
- }
- else
- {
- // Not better, discard connection
- sock_release(sock);
- sock = NULL;
- }
+ rtt = 0;
- // update cur servers rtt
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = rtt;
- current_server = i;
- }
+ for (k = 0; k < DISCOVER_HISTORY_SIZE; ++k)
+ rtt += dev->alt_servers[i].rtts[k];
+ rtt /= DISCOVER_HISTORY_SIZE;
dev->alt_servers[i].failures = 0;
+ if (dev->alt_servers[i].best_count > 1)
+ dev->alt_servers[i].best_count -= 2;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
- continue;
-
- error: ;
- ++dev->alt_servers[i].failures;
+ if (best_rtt > rtt) {
+ // This one is better, keep socket open in case we switch. NOTE(review): rtt is only recomputed when the slot re-check above matched; otherwise it keeps its earlier value (0 initially) - confirm this is intended
+ best_rtt = rtt;
+ best_server = host_compare;
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ best_sock = sock;
+ sock = NULL;
+ } else {
+ // Not better, discard connection
sock_release(sock);
sock = NULL;
- dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = RTT_UNREACHABLE;
- current_server = i;
- }
- continue;
}
- if (dev->panic)
- {
- // After 21 retries, bail out by reporting errors to block layer
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count < 255 && ++dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
- dnbd3_blk_fail_all_requests(dev);
- }
+ // update the rtt of the current server if that is the one just probed
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = rtt;
- if (best_server == -1 || kthread_should_stop() || dev->thread_discover == NULL ) // No alt server could be reached at all or thread should stop
- {
- if (best_sock != NULL ) // Should never happen actually
- {
- sock_release(best_sock);
- best_sock = NULL;
- }
- continue;
- }
+ continue; // success path ends here, skipping the error handling below
- do_change = ready && best_server != current_server && (start.tv_usec & 3) != 0
- && RTT_THRESHOLD_FACTOR(dev->cur_rtt) > best_rtt + 1500;
-
- if (ready && !do_change) {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (!list_empty(&dev->request_queue_send))
- {
- cur_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- do_change = (cur_request == last_request);
- if (do_change)
- printk("WARNING: Hung request on %s\n", dev->disk->disk_name);
- }
- else
- {
- cur_request = (struct request *)123;
- }
- last_request = cur_request;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+error:
+ if (sock != NULL) {
+ sock_release(sock);
+ sock = NULL;
}
-
- // take server with lowest rtt
- if (do_change)
- {
- printk("INFO: Server %d on %s is faster (%lluµs vs. %lluµs)\n", best_server, dev->disk->disk_name,
- (unsigned long long)best_rtt, (unsigned long long)dev->cur_rtt);
- kfree(buf);
- dev->better_sock = best_sock; // Take shortcut by continuing to use open connection
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[best_server], sizeof(dev->cur_server));
- dev->cur_rtt = best_rtt;
- dnbd3_net_connect(dev);
- return 0;
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
+ if (remote_version) // non-zero presumably means the handshake got far enough to report a version - confirm
+ dev->alt_servers[i].protocol_version = remote_version;
+ ++dev->alt_servers[i].failures;
+ dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
+ if (dev->alt_servers[i].best_count > 2)
+ dev->alt_servers[i].best_count -= 3;
}
-
- // Clean up connection that was held open for quicker server switch
- if (best_sock != NULL )
- {
- sock_release(best_sock);
- best_sock = NULL;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = RTT_UNREACHABLE;
+ } // END - for loop over alt_servers
+
+ if (best_server.ss_family == 0) {
+ // No alt server could be reached
+ ASSERT(!best_sock);
+ if (dev->panic) {
+ if (dev->panic_count < 255)
+ dev->panic_count++;
+ // If probe timeout is set, report error to block layer
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
+ dnbd3_blk_fail_all_requests(dev);
}
+ return;
+ }
- if (!ready || (start.tv_usec & 15) != 0)
- turn = (turn + 1) % 4;
- if (turn == 2) // Set ready when we only have 2 of 4 measurements for quicker load balancing
- ready = 1;
-
+ // If best server was repeatedly measured best, lower the switching threshold more (best_count grows by 3 per win up to 180; threshold = 1800 - 10 * best_count)
+ mutex_lock(&dev->alt_servers_lock);
+ alt = get_existing_alt_from_addr(&best_server, dev);
+ if (alt != NULL) {
+ if (alt->best_count < 178)
+ alt->best_count += 3;
+ rtt_threshold = 1800 - (alt->best_count * 10);
+ remote_version = alt->protocol_version;
+ } else {
+ rtt_threshold = 1800;
+ remote_version = 0;
}
- kfree(buf);
- return 0;
+ mutex_unlock(&dev->alt_servers_lock);
+
+ do_change = ready && !is_same_server(&best_server, &dev->cur_server.host)
+ && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold;
+
+ // take server with lowest rtt
+ // if a (dis)connect is already in progress, we do nothing, this is not panic mode
+ if (do_change && device_active(dev) && dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_cur(dev, "server %pISpc is faster (%lluµs vs. %lluµs)\n",
+ &best_server,
+ (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt);
+ set_socket_timeout(best_sock, false, // recv; best_rtt is in microseconds, so / 1000 presumably yields milliseconds - confirm unit
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_RECV * 1000) + 500);
+ set_socket_timeout(best_sock, true, // send
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_SEND * 1000) + 500);
+ if (dnbd3_set_primary_connection(dev, best_sock, &best_server, remote_version) != 0)
+ sock_release(best_sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
+ }
+
+ // Clean up connection that was held open for quicker server switch
+ if (best_sock != NULL)
+ sock_release(best_sock);
}
-int dnbd3_net_send(void *data)
+/**
+ * Worker for sending pending requests. This will be triggered whenever
+ * we get a new request from the block layer. The worker will then
+ * work through all the requests in the send queue, request them from
+ * the server, and return again.
+ *
+ * Runs with send_mutex held. Every request is taken off the send queue and
+ * appended to the receive queue before CMD_GET_BLOCK is sent for it. The
+ * loop ends when the send queue is empty, when there is no socket, when
+ * the device is no longer active, or when sending fails. On a send failure,
+ * unless the connection lock flag is taken, an error is logged and
+ * dnbd3_start_discover(dev, true) is called.
+ */
+static void dnbd3_send_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request;
-
- dnbd3_request_t dnbd3_request;
- struct msghdr msg;
- struct kvec iov;
-
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, send_work);
+ struct request *blk_request;
+ struct dnbd3_cmd *cmd;
unsigned long irqflags;
- init_msghdr(msg);
-
- dnbd3_request.magic = dnbd3_packet_magic;
-
- set_user_nice(current, -20);
-
- // move already sent requests to request_queue_send again
- while (!list_empty(&dev->request_queue_receive))
- {
- printk("WARN: Request queue was not empty on %s\n", dev->disk->disk_name);
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- list_add(&blk_request->queuelist, &dev->request_queue_send);
- }
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_send, kthread_should_stop() || !list_empty(&dev->request_queue_send));
-
- if (kthread_should_stop())
+ mutex_lock(&dev->send_mutex);
+ while (dev->sock && device_active(dev)) {
+ // extract next block request (stop once the send queue is empty)
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ if (list_empty(&dev->send_queue)) {
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
break;
-
- // extract block request
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (list_empty(&dev->request_queue_send))
- {
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
- blk_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- // what to do?
- switch (dnbd3_req_op(blk_request))
- {
- case DNBD3_DEV_READ:
- dnbd3_request.cmd = CMD_GET_BLOCK;
- dnbd3_request.offset = blk_rq_pos(blk_request) << 9; // *512
- dnbd3_request.size = blk_rq_bytes(blk_request); // bytes left to complete entire request
- // enqueue request to request_queue_receive
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- list_add_tail(&blk_request->queuelist, &dev->request_queue_receive);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- break;
- case DNBD3_REQ_OP_SPECIAL:
- dnbd3_request.cmd = dnbd3_priv_to_cmd(blk_request);
- dnbd3_request.size = 0;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // append to receive queue before sending (the receive worker looks up replies there by handle)
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_add_tail(&blk_request->queuelist, &dev->recv_queue);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+
+ cmd = blk_mq_rq_to_pdu(blk_request);
+ if (!dnbd3_send_request(dev->sock, CMD_GET_BLOCK, cmd->handle,
+ blk_rq_pos(blk_request) << 9 /* 512-byte sectors to byte offset */, blk_rq_bytes(blk_request))) {
+ if (!dnbd3_flag_taken(dev->connection_lock)) {
+ dnbd3_dev_err_cur(dev, "connection to server lost (send)\n");
+ dnbd3_start_discover(dev, true);
+ }
break;
-
- default:
- printk("ERROR: Unknown command (send %u %u)\n", (int)blk_request->cmd_flags, (int)dnbd3_req_op(blk_request));
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
-
- // send net request
- dnbd3_request.handle = (uint64_t)(uintptr_t)blk_request; // Double cast to prevent warning on 32bit
- fixup_request(dnbd3_request);
- iov.iov_base = &dnbd3_request;
- iov.iov_len = sizeof(dnbd3_request);
- if (kernel_sendmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_request)) != sizeof(dnbd3_request))
- {
- debug_dev("ERROR: Connection to server lost (send)");
- goto error;
- }
- wake_up(&dev->process_queue_receive);
}
-
- dev->thread_send = NULL;
- return 0;
-
- error: ;
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
- dev->thread_send = NULL;
- return -1;
+ mutex_unlock(&dev->send_mutex);
}
-int dnbd3_net_receive(void *data)
+/**
+ * The receive workfn stays active for as long as the connection to a server
+ * lasts, i.e. it only gets restarted when we switch to a new server.
+ */
+static void dnbd3_recv_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request, *received_request;
-
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, recv_work);
+ struct request *blk_request;
+ struct request *rq_iter;
+ struct dnbd3_cmd *cmd;
+ dnbd3_reply_t reply_hdr;
struct req_iterator iter;
struct bio_vec bvec_inst;
struct bio_vec *bvec = &bvec_inst;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov;
void *kaddr;
unsigned long irqflags;
- sigset_t blocked, oldset;
uint16_t rid;
- unsigned long int recv_timeout = jiffies;
-
- int count, remaining, ret;
-
- init_msghdr(msg);
- set_user_nice(current, -20);
+ int remaining;
+ int ret;
- while (!kthread_should_stop())
- {
+ mutex_lock(&dev->recv_mutex);
+ while (dev->sock) {
// receive net reply
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = sizeof(dnbd3_reply);
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags);
- if (ret == -EAGAIN)
- {
- if (jiffies < recv_timeout) recv_timeout = jiffies; // Handle overflow
- if ((jiffies - recv_timeout) / HZ > SOCKET_KEEPALIVE_TIMEOUT)
- error_dev_va("ERROR: Receive timeout reached (%d of %d secs).", (int)((jiffies - recv_timeout) / HZ), (int)SOCKET_KEEPALIVE_TIMEOUT);
- continue;
+ ret = dnbd3_recv_reply(dev->sock, &reply_hdr);
+ if (ret == 0) {
+ /* have not received any data, but remote peer is shutdown properly */
+ dnbd3_dev_dbg_cur(dev, "remote peer has performed an orderly shutdown\n");
+ goto out_unlock;
+ } else if (ret < 0) {
+ if (ret == -EAGAIN) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "receive timeout reached\n");
+ } else {
+ /* for all errors other than -EAGAIN, print errno */
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "connection to server lost (receive, errno=%d)\n", ret);
+ }
+ goto out_unlock;
}
- if (ret <= 0)
- error_dev("ERROR: Connection to server lost (receive)");
- if (ret != sizeof(dnbd3_reply))
- error_dev("ERROR: Recv msg header.");
- fixup_reply(dnbd3_reply);
- // check error
- if (dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("ERROR: Wrong packet magic (Receive).");
- if (dnbd3_reply.cmd == 0)
- error_dev("ERROR: Command was 0 (Receive).");
+ /* check if arrived data is valid */
+ if (ret != sizeof(reply_hdr)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv partial msg header (%d/%d bytes)\n",
+ ret, (int)sizeof(reply_hdr));
+ goto out_unlock;
+ }
- // Update timeout
- recv_timeout = jiffies;
+ // check error
+ if (reply_hdr.magic != dnbd3_packet_magic) {
+ dnbd3_dev_err_cur(dev, "wrong packet magic (receive)\n");
+ goto out_unlock;
+ }
// what to do?
- switch (dnbd3_reply.cmd)
- {
+ switch (reply_hdr.cmd) {
case CMD_GET_BLOCK:
// search for replied request in queue
blk_request = NULL;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(received_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- if ((uint64_t)(uintptr_t)received_request == dnbd3_reply.handle) // Double cast to prevent warning on 32bit
- {
- blk_request = received_request;
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ cmd = blk_mq_rq_to_pdu(rq_iter);
+ if (cmd->handle == reply_hdr.handle) {
+ blk_request = rq_iter;
+ list_del_init(&blk_request->queuelist);
break;
}
}
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- if (blk_request == NULL )
- error_dev_va("ERROR: Received block data for unrequested handle (%llu: %llu).\n",
- (unsigned long long)dnbd3_reply.handle, (unsigned long long)dnbd3_reply.size);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (blk_request == NULL) {
+ dnbd3_dev_err_cur(dev, "received block data for unrequested handle (%llx: len=%llu)\n",
+ reply_hdr.handle,
+ (u64)reply_hdr.size);
+ goto out_unlock;
+ }
// receive data and answer to block layer
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
- rq_for_each_segment(bvec_inst, blk_request, iter)
+ rq_for_each_segment(bvec_inst, blk_request, iter) {
#else
- rq_for_each_segment(bvec, blk_request, iter)
+ rq_for_each_segment(bvec, blk_request, iter) {
#endif
- {
- siginitsetinv(&blocked, sigmask(SIGKILL));
- sigprocmask(SIG_SETMASK, &blocked, &oldset);
-
kaddr = kmap(bvec->bv_page) + bvec->bv_offset;
iov.iov_base = kaddr;
iov.iov_len = bvec->bv_len;
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags) != bvec->bv_len)
- {
- kunmap(bvec->bv_page);
- sigprocmask(SIG_SETMASK, &oldset, NULL );
- error_dev("ERROR: Receiving from net to block layer.");
- }
+ ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags);
kunmap(bvec->bv_page);
-
- sigprocmask(SIG_SETMASK, &oldset, NULL );
+ if (ret != bvec->bv_len) {
+ if (ret == 0) {
+ /* have not received any data, but remote peer is shutdown properly */
+ dnbd3_dev_dbg_cur(
+ dev, "remote peer has performed an orderly shutdown\n");
+ } else if (ret < 0) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "disconnect: receiving from net to block layer\n");
+ } else {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "receiving from net to block layer (%d bytes)\n", ret);
+ }
+ // Requeue request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ goto out_unlock;
+ }
}
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- __blk_end_request_all(blk_request, 0);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
+ blk_mq_end_request(blk_request, BLK_STS_OK);
+ break;
case CMD_GET_SERVERS:
- if (!dev->use_server_provided_alts)
- {
- remaining = dnbd3_reply.size;
- goto consume_payload;
- }
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- count = MIN(NUMBER_SERVERS, dnbd3_reply.size / sizeof(dnbd3_server_entry_t));
-
- if (count != 0)
- {
- iov.iov_base = dev->new_servers;
- iov.iov_len = count * sizeof(dnbd3_server_entry_t);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags)
- != (count * sizeof(dnbd3_server_entry_t)))
- error_dev("ERROR: Recv CMD_GET_SERVERS payload.");
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = count;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
- // If there were more servers than accepted, remove the remaining data from the socket buffer
- remaining = dnbd3_reply.size - (count * sizeof(dnbd3_server_entry_t));
- consume_payload: while (remaining > 0)
- {
- count = MIN(sizeof(dnbd3_reply), remaining); // Abuse the reply struct as the receive buffer
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = count;
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
- if (ret <= 0)
- error_dev("ERROR: Recv additional payload from CMD_GET_SERVERS.");
- remaining -= ret;
+ remaining = reply_hdr.size;
+ if (dev->use_server_provided_alts) {
+ dnbd3_server_entry_t new_server;
+
+ while (remaining >= sizeof(dnbd3_server_entry_t)) {
+ if (dnbd3_recv_bytes(dev->sock, &new_server, sizeof(new_server))
+ != sizeof(new_server)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv CMD_GET_SERVERS payload\n");
+ goto out_unlock;
+ }
+ // TODO: Log
+ if (new_server.failures == 0) { // ADD
+ dnbd3_add_server(dev, &new_server.host);
+ } else { // REM
+ dnbd3_rem_server(dev, &new_server.host);
+ }
+ remaining -= sizeof(new_server);
+ }
}
- continue;
+ if (!dnbd3_drain_socket(dev, dev->sock, remaining))
+ goto out_unlock;
+ break;
case CMD_LATEST_RID:
- if (dnbd3_reply.size != 2)
- {
- printk("ERROR: CMD_LATEST_RID.size != 2.\n");
+ if (reply_hdr.size < 2) {
+ dev_err(dnbd3_device_to_dev(dev), "CMD_LATEST_RID.size < 2\n");
continue;
}
- iov.iov_base = &rid;
- iov.iov_len = sizeof(rid);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags) <= 0)
- {
- printk("ERROR: Could not receive CMD_LATEST_RID payload.\n");
- }
- else
- {
+ if (dnbd3_recv_bytes(dev->sock, &rid, 2) != 2) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dev_err(dnbd3_device_to_dev(dev), "could not receive CMD_LATEST_RID payload\n");
+ } else {
rid = net_order_16(rid);
- printk("Latest rid of %s is %d (currently using %d)\n", dev->imgname, (int)rid, (int)dev->rid);
+ dnbd3_dev_info_cur(dev, "latest rid of %s is %d (currently using %d)\n",
+ dev->imgname, (int)rid, (int)dev->rid);
dev->update_available = (rid > dev->rid ? 1 : 0);
}
+ if (reply_hdr.size > 2)
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size - 2);
continue;
case CMD_KEEPALIVE:
- if (dnbd3_reply.size != 0)
- printk("ERROR: keep alive packet with payload.\n");
+ if (reply_hdr.size != 0) {
+ dev_dbg(dnbd3_device_to_dev(dev), "keep alive packet with payload\n");
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size);
+ }
continue;
default:
- printk("ERROR: Unknown command (Receive)\n");
- continue;
+ dev_err(dnbd3_device_to_dev(dev), "unknown command: %d (receive), aborting connection\n", (int)reply_hdr.cmd);
+ goto out_unlock;
+ }
+ }
+out_unlock:
+ // This will check if we actually still need a new connection
+ dnbd3_start_discover(dev, true);
+ mutex_unlock(&dev->recv_mutex);
+}
+/**
+ * Apply a send or receive timeout to a socket.
+ * @sock: socket to configure
+ * @set_send: true selects the send timeout, false the receive timeout
+ * @timeout_ms: timeout in milliseconds
+ *
+ * Option name, timeval type and option-value pointer type are picked at
+ * compile time depending on the kernel version. The result of
+ * sock_setsockopt() is not checked.
+ */
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
+	struct __kernel_sock_timeval tv;
+	int optname = set_send ? SO_SNDTIMEO_NEW : SO_RCVTIMEO_NEW;
+#else
+	struct timeval tv;
+	int optname = set_send ? SO_SNDTIMEO : SO_RCVTIMEO;
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+	sockptr_t optval = KERNEL_SOCKPTR(&tv);
+#else
+	char *optval = (char *)&tv;
+#endif
+
+	/* split milliseconds into whole seconds and remaining microseconds */
+	tv.tv_sec = timeout_ms / 1000;
+	tv.tv_usec = (timeout_ms % 1000) * 1000;
+	sock_setsockopt(sock, SOL_SOCKET, optname, optval, sizeof(tv));
+}
+
+/**
+ * Create a TCP socket and connect it to the given address.
+ * @dev: device the connection is for; its panic state and the rtt of the
+ *       current server are used to choose the connect timeout
+ * @addr: address to connect to; ss_family selects the sockaddr length
+ * @sock_out: receives the connected socket on success
+ *
+ * Returns 0 on success. On failure the socket (if one was created) is
+ * released and a negative value is returned (-EIO if the failing call did
+ * not supply a negative code).
+ */
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket **sock_out)
+{
+ ktime_t start;
+ int ret, connect_time_ms;
+ struct socket *sock;
+ int retries = 4;
+ const int addrlen = addr->ss_family == AF_INET ? sizeof(struct sockaddr_in)
+ : sizeof(struct sockaddr_in6);
+
+ /* sock_create_kern() takes a network namespace argument from 4.2 on */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+ ret = sock_create_kern(&init_net, addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#else
+ ret = sock_create_kern(addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#endif
+ if (ret < 0) {
+ dev_err(dnbd3_device_to_dev(dev), "couldn't create socket: %d\n", ret);
+ return ret;
+ }
+
+ /* Only one retry, TCP no delay */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+ tcp_sock_set_syncnt(sock->sk, 1);
+ tcp_sock_set_nodelay(sock->sk);
+ /* because of our aggressive timeouts, this is pointless */
+ sock_no_linger(sock->sk);
+#else
+ /* add legacy version of this, but ignore others as they're not that important */
+ /* ret is only borrowed here as the int holding the option value (1) */
+ ret = 1;
+ kernel_setsockopt(sock, IPPROTO_TCP, TCP_SYNCNT,
+ (char *)&ret, sizeof(ret));
+#endif
+ /* allow this socket to use reserved mem (vm.mem_free_kbytes) */
+ sk_set_memalloc(sock->sk);
+ sock->sk->sk_allocation = GFP_NOIO;
+
+ if (dev->panic && dev->panic_count > 1) {
+ /* in panic mode for some time, start increasing timeouts */
+ connect_time_ms = dev->panic_count * 1000;
+ } else {
+ /* otherwise, use 2*RTT of current server */
+ /* NOTE(review): the division by 1000 suggests rtt is kept in microseconds - confirm */
+ connect_time_ms = dev->cur_server.rtt * 2 / 1000;
+ }
+ /* but obey a minimal configurable value, and maximum sanity check */
+ if (connect_time_ms < SOCKET_TIMEOUT_SEND * 1000)
+ connect_time_ms = SOCKET_TIMEOUT_SEND * 1000;
+ else if (connect_time_ms > 60000)
+ connect_time_ms = 60000;
+ set_socket_timeout(sock, false, connect_time_ms); // recv
+ set_socket_timeout(sock, true, connect_time_ms); // send
+ start = ktime_get_real();
+ /*
+ * NOTE(review): every path through the loop body leaves the function
+ * (goto error on failure, return 0 on success), so the body runs at most
+ * once and 'retries' has no effect as written.
+ */
+ while (--retries > 0) {
+ ret = kernel_connect(sock, (struct sockaddr *)addr, addrlen, 0);
+ /* from here on connect_time_ms holds the measured duration of the call */
+ connect_time_ms = (int)ktime_ms_delta(ktime_get_real(), start);
+ if (connect_time_ms > 2 * SOCKET_TIMEOUT_SEND * 1000) {
+ /* Either I'm losing my mind or there was a specific build of kernel
+ * 5.x where SO_RCVTIMEO didn't affect the connect call above, so
+ * this function would hang for over a minute for unreachable hosts.
+ * Leave in this debug check for twice the configured timeout
+ */
+ dnbd3_dev_dbg_host(dev, addr, "connect: call took %dms\n",
+ connect_time_ms);
}
+ if (ret != 0) {
+ if (ret == -EINTR)
+ dnbd3_dev_dbg_host(dev, addr, "connect: interrupted system call (blocked %dms)\n",
+ connect_time_ms);
+ else
+ dnbd3_dev_dbg_host(dev, addr, "connect: failed (%d, blocked %dms)\n",
+ ret, connect_time_ms);
+ goto error;
+ }
+ /* connected: hand the socket to the caller */
+ *sock_out = sock;
+ return 0;
}
+error:
+ sock_release(sock);
+ return ret < 0 ? ret : -EIO;
+}
- printk("dnbd3_net_receive terminated normally.\n");
- dev->thread_receive = NULL;
- return 0;
+/*
+ * Log a host-related message at error level while the device is in panic
+ * mode or has no socket, and at debug level otherwise.
+ * Relies on a variable named 'dev' being in scope at the point of use;
+ * the arguments are passed through unchanged to the selected log macro.
+ */
+#define dnbd3_err_dbg_host(...) do { \
+ if (dev->panic || dev->sock == NULL) \
+ dnbd3_dev_err_host(__VA_ARGS__); \
+ else \
+ dnbd3_dev_dbg_host(__VA_ARGS__); \
+} while (0)
+
+/**
+ * Execute protocol handshake on a newly connected socket.
+ * If this is the initial connection to any server, ie. we're being called
+ * through the initial ioctl() to open a device, we'll store the rid, filesize
+ * etc. in the dev struct., otherwise, this is a potential switch to another
+ * server, so we validate the filesize, rid, name against what we expect.
+ * The server's protocol version is returned in 'remote_version'
+ *
+ * @dev: device the handshake is executed for
+ * @sock: freshly connected socket to talk to
+ * @addr: address of the server, used for log messages
+ * @remote_version: receives the protocol version reported by the server
+ * @copy_data: true to store name, rid and size in dev; false to compare
+ *             them against the values already stored in dev
+ *
+ * Returns true if the handshake succeeded, false otherwise.
+ */
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+		struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_data)
+{
+	unsigned long irqflags;
+	const char *name;
+	char *new_name = NULL;
+	char *old_name = NULL;
+	uint64_t filesize;
+	int mlen;
+	uint16_t rid;
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+	struct kvec iov[2];
+	serialized_buffer_t *payload;
+	dnbd3_reply_t reply_hdr;
+	dnbd3_request_t request_hdr = { .magic = dnbd3_packet_magic };
+
+	payload = kmalloc(sizeof(*payload), GFP_KERNEL);
+	if (payload == NULL)
+		goto error;
+
+	if (copy_data && device_active(dev))
+		dev_warn(dnbd3_device_to_dev(dev), "Called handshake function with copy_data enabled when reported_size is not zero\n");
+
+	// Request filesize
+	request_hdr.cmd = CMD_SELECT_IMAGE;
+	iov[0].iov_base = &request_hdr;
+	iov[0].iov_len = sizeof(request_hdr);
+	serializer_reset_write(payload);
+	serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
+	serializer_put_string(payload, dev->imgname); // image name
+	serializer_put_uint16(payload, dev->rid); // revision id
+	serializer_put_uint8(payload, 0); // are we a server? (no!)
+	iov[1].iov_base = payload;
+	request_hdr.size = iov[1].iov_len = serializer_get_written_length(payload);
+	fixup_request(request_hdr);
+	mlen = iov[0].iov_len + iov[1].iov_len;
+	if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen) {
+		dnbd3_err_dbg_host(dev, addr, "requesting image size failed\n");
+		goto error;
+	}
+
+	// receive net reply
+	if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+		dnbd3_err_dbg_host(dev, addr, "receiving image size packet (header) failed\n");
+		goto error;
+	}
+	if (reply_hdr.magic != dnbd3_packet_magic
+			|| reply_hdr.cmd != CMD_SELECT_IMAGE || reply_hdr.size < 4
+			|| reply_hdr.size > sizeof(*payload)) {
+		dnbd3_err_dbg_host(dev, addr,
+				"corrupt CMD_SELECT_IMAGE reply\n");
+		goto error;
+	}
+
+	// receive data
+	iov[0].iov_base = payload;
+	iov[0].iov_len = reply_hdr.size;
+	if (kernel_recvmsg(sock, &msg, iov, 1, reply_hdr.size, msg.msg_flags)
+			!= reply_hdr.size) {
+		dnbd3_err_dbg_host(dev, addr,
+				"receiving payload of CMD_SELECT_IMAGE reply failed\n");
+		goto error;
+	}
+	serializer_reset_read(payload, reply_hdr.size);
+
+	*remote_version = serializer_get_uint16(payload);
+	name = serializer_get_string(payload);
+	rid = serializer_get_uint16(payload);
+	filesize = serializer_get_uint64(payload);
+
+	if (*remote_version < MIN_SUPPORTED_SERVER) {
+		dnbd3_err_dbg_host(dev, addr,
+				"server version too old (client: %d, server: %d, min supported: %d)\n",
+				(int)PROTOCOL_VERSION, (int)*remote_version,
+				(int)MIN_SUPPORTED_SERVER);
+		goto error;
+	}
+	if (name == NULL) {
+		dnbd3_err_dbg_host(dev, addr, "server did not supply an image name\n");
+		goto error;
+	}
+	if (rid == 0) {
+		dnbd3_err_dbg_host(dev, addr, "server did not supply a revision id\n");
+		goto error;
+	}
+
+	if (copy_data) {
+		if (filesize < DNBD3_BLOCK_SIZE) {
+			dnbd3_err_dbg_host(dev, addr, "reported size by server is < 4096\n");
+			goto error;
+		}
+		/*
+		 * If the name supplied by the server is longer than the buffer
+		 * currently holding the image name, get a larger buffer first.
+		 * This has to happen before taking blk_lock: a GFP_KERNEL
+		 * allocation may sleep, which is not allowed while holding a
+		 * spinlock with interrupts disabled.
+		 */
+		if (strlen(dev->imgname) < strlen(name)) {
+			new_name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+			if (new_name == NULL) {
+				dnbd3_err_dbg_host(dev, addr, "reallocating buffer for new image name failed\n");
+				goto error;
+			}
+		}
+		spin_lock_irqsave(&dev->blk_lock, irqflags);
+		if (new_name != NULL) {
+			/* swap buffers under the lock, old one is freed after unlocking */
+			old_name = dev->imgname;
+			dev->imgname = new_name;
+		}
+		strcpy(dev->imgname, name);
+		dev->rid = rid;
+		// store image information
+		dev->reported_size = filesize;
+		dev->update_available = 0;
+		spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+		kfree(old_name);
+		set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
+		dnbd3_dev_dbg_host(dev, addr, "image size: %llu\n", dev->reported_size);
+	} else {
+		/* switching connection, sanity checks */
+		if (rid != dev->rid) {
+			dnbd3_err_dbg_host(dev, addr,
+					"server supplied wrong rid (client: '%d', server: '%d')\n",
+					(int)dev->rid, (int)rid);
+			goto error;
+		}
+
+		if (strcmp(name, dev->imgname) != 0) {
+			dnbd3_err_dbg_host(dev, addr, "server offers image '%s', requested '%s'\n", name, dev->imgname);
+			goto error;
+		}
+
+		if (filesize != dev->reported_size) {
+			dnbd3_err_dbg_host(dev, addr,
+					"reported image size of %llu does not match expected value %llu\n",
+					(unsigned long long)filesize, (unsigned long long)dev->reported_size);
+			goto error;
+		}
+	}
+	kfree(payload);
+	return true;
+
+error:
+	/* payload is NULL here if its allocation failed; kfree(NULL) is a no-op */
+	kfree(payload);
+	return false;
+}
+
+/**
+ * Build a request header from the given fields and send it on sock.
+ * No payload is sent, only the header itself.
+ * Returns true if the complete header was sent.
+ */
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size)
+{
+	dnbd3_request_t hdr = {
+		.magic = dnbd3_packet_magic,
+		.cmd = cmd,
+		.size = size,
+		.offset = offset,
+		.handle = handle,
+	};
+	struct kvec vec = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+
+	fixup_request(hdr);
+	if (kernel_sendmsg(sock, &msg, &vec, 1, sizeof(hdr)) != sizeof(hdr))
+		return false;
+	return true;
+}
+
+/**
+ * Send a header-only request of type cmd (handle, offset and size all 0)
+ * on the device's current socket while holding send_mutex.
+ * Returns false if the device has no socket or sending failed.
+ */
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd)
+{
+	bool sent = false;
+
+	mutex_lock(&dev->send_mutex);
+	if (dev->sock)
+		sent = dnbd3_send_request(dev->sock, cmd, 0, 0, 0);
+	mutex_unlock(&dev->send_mutex);
+	return sent;
+}
+
+/**
+ * Receive count bytes from sock into buffer, using MSG_WAITALL.
+ * Returns the result of kernel_recvmsg() unchanged, so callers compare it
+ * against count to detect short reads and errors.
+ */
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count)
+{
+	struct kvec vec = { .iov_base = buffer, .iov_len = count };
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+	int received;
+
+	received = kernel_recvmsg(sock, &msg, &vec, 1, count, msg.msg_flags);
+	return received;
+}
+
+/**
+ * Receive one reply header from sock and run fixup_reply() on it.
+ * The fixup is applied regardless of how many bytes arrived; the return
+ * value is the receive result, to be compared against sizeof(*reply_hdr).
+ */
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr)
+{
+	const int received = dnbd3_recv_bytes(sock, reply_hdr, sizeof(*reply_hdr));
+
+	fixup_reply(*reply_hdr);
+	return received;
+}
+
+/**
+ * Read and discard the given number of bytes from sock.
+ * Data is received into __garbage_mem in chunks of at most its size.
+ * Returns true once all bytes were consumed (also if bytes <= 0), false
+ * as soon as a receive call returns 0 or an error.
+ */
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes)
+{
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+	struct kvec vec;
+	int got;
+
+	for (; bytes > 0; bytes -= got) {
+		/* the kvec is set up again before every receive call */
+		vec.iov_base = __garbage_mem;
+		vec.iov_len = sizeof(__garbage_mem);
+		got = kernel_recvmsg(sock, &msg, &vec, 1, MIN(bytes, vec.iov_len), msg.msg_flags);
+		if (got <= 0) {
+			dnbd3_dev_err_cur(dev, "draining payload failed (ret=%d)\n", got);
+			return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * Request a test block of RTT_BLOCK_SIZE bytes (offset 0, handle 0) on
+ * sock, validate the reply header and discard the payload.
+ * @dev: device the request belongs to (used for logging)
+ * @addr: address of the server behind sock (used for logging)
+ * @sock: socket to send the request on
+ *
+ * Returns true if the request was sent, the reply header matched what was
+ * asked for and the payload could be drained completely.
+ */
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket *sock)
+{
+ dnbd3_reply_t reply_hdr;
+
+ // Request block
+ if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, 0, RTT_BLOCK_SIZE)) {
+ dnbd3_err_dbg_host(dev, addr, "requesting test block failed\n");
+ return false;
+ }
+
+ // receive net reply
+ if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+ dnbd3_err_dbg_host(dev, addr, "receiving test block header packet failed\n");
+ return false;
+ }
+ // reply must echo command, size and handle of the request sent above
+ if (reply_hdr.magic != dnbd3_packet_magic || reply_hdr.cmd != CMD_GET_BLOCK
+ || reply_hdr.size != RTT_BLOCK_SIZE || reply_hdr.handle != 0) {
+ dnbd3_err_dbg_host(dev, addr,
+ "unexpected reply to block request: cmd=%d, size=%d, handle=%llu (discover)\n",
+ (int)reply_hdr.cmd, (int)reply_hdr.size, reply_hdr.handle);
+ return false;
+ }
- error:
+ // receive data
+ // the block content is not needed, it is read into the garbage buffer
+ return dnbd3_drain_socket(dev, sock, RTT_BLOCK_SIZE);
+}
+#undef dnbd3_err_dbg_host
+
+/**
+ * Replace the device's main socket and current-server record.
+ * @dev: device to update
+ * @sock: new socket to store in dev->sock; dnbd3_net_disconnect() passes NULL
+ * @addr: address of the new server, or NULL to zero dev->cur_server
+ * @protocol_version: protocol version to store for the new server
+ *
+ * Takes send_mutex first and recv_mutex second and holds both while the
+ * old socket is released and the new one is stored.
+ */
+static void replace_main_socket(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ mutex_lock(&dev->send_mutex);
+ // First, shutdown connection, so receive worker will leave its mainloop
if (dev->sock)
kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ mutex_lock(&dev->recv_mutex);
+ // Receive worker is done, get rid of socket and replace
+ if (dev->sock)
+ sock_release(dev->sock);
+ dev->sock = sock;
+ // cur_server is updated under blk_lock
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (addr == NULL) {
+ memset(&dev->cur_server, 0, sizeof(dev->cur_server));
+ } else {
+ dev->cur_server.host = *addr;
+ dev->cur_server.rtt = 0;
+ dev->cur_server.protocol_version = protocol_version;
}
- dev->thread_receive = NULL;
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ mutex_unlock(&dev->recv_mutex);
+ mutex_unlock(&dev->send_mutex);
}
+/**
+ * Free the per-device resources set up in dnbd3_new_connection():
+ * destroy the send and receive workqueues (if they exist), reset their
+ * pointers to NULL and destroy the send and receive mutexes.
+ */
+static void dnbd3_release_resources(dnbd3_device_t *dev)
+{
+	if (dev->send_wq) {
+		destroy_workqueue(dev->send_wq);
+		dev->send_wq = NULL;
+	}
+	if (dev->recv_wq) {
+		destroy_workqueue(dev->recv_wq);
+		dev->recv_wq = NULL;
+	}
+	mutex_destroy(&dev->send_mutex);
+	mutex_destroy(&dev->recv_mutex);
+}
+
+/**
+ * Establish new connection on a dnbd3 device.
+ * @dev: device to connect; the caller must hold dev->connection_lock
+ * @addr: address of the server to connect to
+ * @init: true for the initial connection of an unconfigured device (image
+ *        data reported by the server is stored and per-device resources
+ *        are allocated), false when switching an active device to another
+ *        server
+ *
+ * Return 0 on success, errno otherwise
+ */
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init)
+{
+	unsigned long irqflags;
+	struct socket *sock = NULL;
+	uint16_t proto_version;
+	int ret;
+
+	ASSERT(dnbd3_flag_taken(dev->connection_lock));
+	if (init && device_active(dev)) {
+		dnbd3_dev_err_cur(dev, "device already configured/connected\n");
+		return -EBUSY;
+	}
+	if (!init && !device_active(dev)) {
+		dev_warn(dnbd3_device_to_dev(dev), "connection switch called on unconfigured device\n");
+		return -ENOTCONN;
+	}
+
+	dnbd3_dev_dbg_host(dev, addr, "connecting...\n");
+	ret = dnbd3_connect(dev, addr, &sock);
+	if (ret != 0 || sock == NULL)
+		goto error;
+
+	/* execute the "select image" handshake */
+	// if init is true, reported_size will be set
+	if (!dnbd3_execute_handshake(dev, sock, addr, &proto_version, init)) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	if (init) {
+		// We're setting up the device for use - allocate resources
+		// Do not goto error_dealloc before this, and only goto
+		// error_dealloc (not error) after this
+		ASSERT(!dev->send_wq);
+		ASSERT(!dev->recv_wq);
+		mutex_init(&dev->send_mutex);
+		mutex_init(&dev->recv_mutex);
+		// a designated queue for sending, that allows one active task only
+		dev->send_wq = alloc_workqueue("dnbd%d-send",
+				WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI,
+				1, dev->index);
+		dev->recv_wq = alloc_workqueue("dnbd%d-recv",
+				WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+				1, dev->index);
+		if (!dev->send_wq || !dev->recv_wq) {
+			ret = -ENOMEM;
+			goto error_dealloc;
+		}
+	}
+
+	set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000); // recv
+	// On failure the socket was not taken over by the device, so it still
+	// belongs to us and has to be released via the error path
+	ret = dnbd3_set_primary_connection(dev, sock, addr, proto_version);
+	if (ret != 0)
+		goto error_dealloc;
+	sock = NULL; // In case we ever goto error* after this point
+
+	spin_lock_irqsave(&dev->blk_lock, irqflags);
+	if (init) {
+		dev->discover_count = 0;
+		dev->discover_interval = TIMER_INTERVAL_PROBE_STARTUP;
+		// discovery and keepalive are not critical, use the power efficient queue
+		queue_delayed_work(system_power_efficient_wq, &dev->discover_work,
+				dev->discover_interval * HZ);
+		queue_delayed_work(system_power_efficient_wq, &dev->keepalive_work,
+				KEEPALIVE_INTERVAL * HZ);
+		// but the receiver is performance critical AND runs indefinitely, use
+		// the cpu intensive queue, as jobs submitted there will not count towards
+		// the concurrency limit of per-cpu worker threads. It still feels a little
+		// dirty to avoid managing our own thread, but nbd does it too.
+	}
+	spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+	return 0;
+
+error_dealloc:
+	if (init) {
+		// If anything fails during initialization, free resources again
+		dnbd3_release_resources(dev);
+	}
+error:
+	if (init)
+		dev->reported_size = 0;
+	if (sock)
+		sock_release(sock);
+	return ret < 0 ? ret : -EIO;
+}
+
+/**
+ * Bind the device's work items to their handler functions: the delayed
+ * discover and keepalive work, and the send and receive work.
+ */
+void dnbd3_net_work_init(dnbd3_device_t *dev)
+{
+	/* periodic (delayed) work */
+	INIT_DELAYED_WORK(&dev->discover_work, dnbd3_discover_workfn);
+	INIT_DELAYED_WORK(&dev->keepalive_work, dnbd3_keepalive_workfn);
+	/* data path workers */
+	INIT_WORK(&dev->send_work, dnbd3_send_workfn);
+	INIT_WORK(&dev->recv_work, dnbd3_recv_workfn);
+}
+
+/**
+ * Make the given socket the device's main connection.
+ * @dev: device to update; the caller must hold dev->connection_lock
+ * @sock: connected socket to take over
+ * @addr: address of the server behind sock
+ * @protocol_version: protocol version reported by that server
+ *
+ * Returns -EINVAL, without touching the device, if the address family is 0,
+ * no image name is set or sock is NULL; 0 otherwise.
+ */
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (addr->ss_family == 0 || dev->imgname == NULL || sock == NULL) {
+ dnbd3_dev_err_cur(dev, "connect: host, image name or sock not set\n");
+ return -EINVAL;
+ }
+
+ replace_main_socket(dev, sock, addr, protocol_version);
+ // leave panic mode and queue the receive worker for the new socket
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ dev->panic = false;
+ dev->panic_count = 0;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_SWITCH;
+ queue_work(dev->recv_wq, &dev->recv_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+
+ // ask the new server for its list of alt servers, if we make use of it
+ if (dev->use_server_provided_alts)
+ dnbd3_send_empty_request(dev, CMD_GET_SERVERS);
+
+ dnbd3_dev_info_cur(dev, "connection switched\n");
+ dnbd3_blk_requeue_all_requests(dev);
+ return 0;
+}
+
+/**
+ * Disconnect the device, shutting it down.
+ * @dev: device to disconnect; the caller must hold dev->connection_lock
+ *
+ * Returns -ENOTCONN if the device is not active, 0 otherwise.
+ */
+int dnbd3_net_disconnect(dnbd3_device_t *dev)
+{
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (!device_active(dev))
+ return -ENOTCONN;
+ dev_dbg(dnbd3_device_to_dev(dev), "disconnecting device ...\n");
+
+ dev->reported_size = 0;
+ /* quickly fail all requests */
+ dnbd3_blk_fail_all_requests(dev);
+ /* shut down and release the socket, clear the current server */
+ replace_main_socket(dev, NULL, NULL, 0);
+
+ /* wait for all workers to finish before freeing what they use */
+ cancel_delayed_work_sync(&dev->keepalive_work);
+ cancel_delayed_work_sync(&dev->discover_work);
+ cancel_work_sync(&dev->send_work);
+ cancel_work_sync(&dev->recv_work);
+
+ /* second pass after the workers are stopped */
+ /* NOTE(review): presumably catches requests queued in the meantime - confirm */
+ dnbd3_blk_fail_all_requests(dev);
+ dnbd3_release_resources(dev);
+ dev_dbg(dnbd3_device_to_dev(dev), "all workers shut down\n");
+ return 0;
+}
diff --git a/src/kernel/net.h b/src/kernel/net.h
index a06a20c..69fa523 100644
--- a/src/kernel/net.h
+++ b/src/kernel/net.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,30 +22,12 @@
#ifndef NET_H_
#define NET_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#define init_msghdr(h) do { \
- h.msg_name = NULL; \
- h.msg_namelen = 0; \
- h.msg_control = NULL; \
- h.msg_controllen = 0; \
- h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \
- } while (0)
+void dnbd3_net_work_init(dnbd3_device_t *dev);
-int dnbd3_net_connect(dnbd3_device_t *lo);
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init);
-int dnbd3_net_disconnect(dnbd3_device_t *lo);
-
-int dnbd3_net_send(void *data);
-
-int dnbd3_net_receive(void *data);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg);
-#else
-void dnbd3_net_heartbeat(unsigned long arg);
-#endif
-
-int dnbd3_net_discover(void *data);
+int dnbd3_net_disconnect(dnbd3_device_t *dev);
#endif /* NET_H_ */
diff --git a/src/kernel/serialize.c b/src/kernel/serialize.c
new file mode 120000
index 0000000..5a4e4ac
--- /dev/null
+++ b/src/kernel/serialize.c
@@ -0,0 +1 @@
+../shared/serialize.c \ No newline at end of file
diff --git a/src/kernel/serialize_kmod.c b/src/kernel/serialize_kmod.c
deleted file mode 100644
index 50746df..0000000
--- a/src/kernel/serialize_kmod.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-#define KERNEL_MODULE
-#include "serialize.c"
diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c
index 4406072..9deba96 100644
--- a/src/kernel/sysfs.c
+++ b/src/kernel/sysfs.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,156 +22,138 @@
#include <linux/kobject.h>
#include "sysfs.h"
-#include "utils.h"
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
+/**
+ * Print currently connected server IP:PORT
+ * @buf: output buffer, at most PAGE_SIZE bytes are written
+ * @dev: device to report on
+ *
+ * The address is read under blk_lock, taken with interrupts disabled like
+ * in the other users of this lock.
+ * Returns the number of bytes written, capped at PAGE_SIZE.
+ */
ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev)
{
- if (dev->cur_server.host.type == HOST_IP4)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI4,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- else if (dev->cur_server.host.type == HOST_IP6)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI6,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- *buf = '\0';
- return 0;
-}
-
-ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev)
-{
- return MIN(snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt), PAGE_SIZE);
-}
+ unsigned long irqflags;
+ ssize_t ret;
-ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev)
-{
- int i, num = 0;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type) ++num;
- }
- return MIN(snprintf(buf, PAGE_SIZE, "%d\n", num), PAGE_SIZE);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%pISpc\n", &dev->cur_server.host), PAGE_SIZE);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ return ret;
}
+/**
+ * List alt servers. One line per server, format is:
+ * IP:PORT RTT consecutive_failures best_count
+ * RTT is the average of the four stored rtt samples of that server.
+ * Entries whose address family is 0 are skipped.
+ * Returns the number of bytes written to buf (at most PAGE_SIZE), or 0 if
+ * waiting for alt_servers_lock was interrupted.
+ */
ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev)
{
- int i, size = PAGE_SIZE, ret;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == HOST_IP4)
- ret = MIN(snprintf(buf, size, "%pI4,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else if (dev->alt_servers[i].host.type == HOST_IP6)
- ret = MIN(snprintf(buf, size, "%pI6,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else
+ int i, size = PAGE_SIZE;
+ ssize_t ret;
+
+ if (mutex_lock_interruptible(&dev->alt_servers_lock) != 0)
+ return 0;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
continue;
+
+ /* capped at the space left, so buf never advances past the page */
+ ret = MIN(snprintf(buf, size, "%pISpc %llu %d %d\n", &dev->alt_servers[i].host,
+ (unsigned long long)((dev->alt_servers[i].rtts[0] +
+ dev->alt_servers[i].rtts[1] +
+ dev->alt_servers[i].rtts[2] +
+ dev->alt_servers[i].rtts[3]) / 4),
+ (int)dev->alt_servers[i].failures,
+ (int)dev->alt_servers[i].best_count),
+ size);
size -= ret;
buf += ret;
- if (size <= 0)
- {
+ if (size <= 0) {
size = 0;
break;
}
}
+ mutex_unlock(&dev->alt_servers_lock);
return PAGE_SIZE - size;
}
+/**
+ * Show name of image in use
+ * @buf: output buffer, at most PAGE_SIZE bytes are written
+ * @dev: device to report on
+ *
+ * The name is read under blk_lock, taken with interrupts disabled like in
+ * the other users of this lock.
+ * Returns the number of bytes written, capped at PAGE_SIZE.
+ */
ssize_t show_image_name(char *buf, dnbd3_device_t *dev)
{
- if (dev->imgname == NULL) return sprintf(buf, "(null)");
- return MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ unsigned long irqflags;
+ ssize_t ret;
+
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ return ret;
}
+/**
+ * Show rid of image in use
+ * Prints dev->rid as a decimal number followed by a newline.
+ * Returns the number of bytes written, capped at PAGE_SIZE.
+ */
ssize_t show_rid(char *buf, dnbd3_device_t *dev)
{
+ // No locking here, primitive type, no pointer to allocated memory
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->rid), PAGE_SIZE);
}
+/**
+ * Show the update_available flag of the device
+ * Prints dev->update_available as a decimal number followed by a newline.
+ * Returns the number of bytes written, capped at PAGE_SIZE.
+ */
ssize_t show_update_available(char *buf, dnbd3_device_t *dev)
{
+ // No locking here either: primitive type, no pointer to allocated memory
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->update_available), PAGE_SIZE);
}
-device_attr_t cur_server_addr =
-{
- .attr = {.name = "cur_server_addr", .mode = 0444 },
- .show = show_cur_server_addr,
- .store = NULL,
-};
-
-device_attr_t cur_server_rtt =
-{
- .attr = {.name = "cur_server_rtt", .mode = 0444 },
- .show = show_cur_server_rtt,
- .store = NULL,
-};
-
-device_attr_t alt_server_num =
-{
- .attr = {.name = "alt_server_num", .mode = 0444 },
- .show = show_alt_server_num,
- .store = NULL,
+device_attr_t cur_server_addr = {
+ .attr = { .name = "cur_server_addr", .mode = 0444 },
+ .show = show_cur_server_addr,
+ .store = NULL,
};
-device_attr_t alt_servers =
-{
- .attr = {.name = "alt_servers", .mode = 0444 },
- .show = show_alt_servers,
- .store = NULL,
+device_attr_t alt_servers = {
+ .attr = { .name = "alt_servers", .mode = 0444 },
+ .show = show_alt_servers,
+ .store = NULL,
};
-device_attr_t image_name =
-{
- .attr = {.name = "image_name", .mode = 0444 },
- .show = show_image_name,
- .store = NULL,
+device_attr_t image_name = {
+ .attr = { .name = "image_name", .mode = 0444 },
+ .show = show_image_name,
+ .store = NULL,
};
-device_attr_t rid =
-{
- .attr = {.name = "rid", .mode = 0444 },
- .show = show_rid,
- .store = NULL,
+device_attr_t rid = {
+ .attr = { .name = "rid", .mode = 0444 },
+ .show = show_rid,
+ .store = NULL,
};
-device_attr_t update_available =
-{
- .attr = {.name = "update_available", .mode = 0444 },
- .show = show_update_available,
- .store = NULL,
+device_attr_t update_available = {
+ .attr = { .name = "update_available", .mode = 0444 },
+ .show = show_update_available,
+ .store = NULL,
};
+/**
+ * Generic sysfs show callback for all device attributes.
+ * Recovers the device_attr_t containing attr and the dnbd3_device_t
+ * containing kobj, then calls that attribute's show handler.
+ * Returns whatever the handler returns.
+ */
ssize_t device_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
device_attr_t *device_attr = container_of(attr, device_attr_t, attr);
dnbd3_device_t *dev = container_of(kobj, dnbd3_device_t, kobj);
+
return device_attr->show(buf, dev);
}
-struct attribute *device_attrs[] =
-{
+struct attribute *device_attrs[] = {
&cur_server_addr.attr,
- &cur_server_rtt.attr,
- &alt_server_num.attr,
&alt_servers.attr,
- &image_name.attr,
- &rid.attr,
+ &image_name.attr, &rid.attr,
&update_available.attr,
NULL,
};
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+ATTRIBUTE_GROUPS(device);
+#endif
-struct sysfs_ops device_ops =
-{
+const struct sysfs_ops device_ops = {
.show = device_show,
};
@@ -179,14 +162,16 @@ void release(struct kobject *kobj)
kobj->state_initialized = 0;
}
-struct kobj_type device_ktype =
-{
+struct kobj_type device_ktype = {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
.default_attrs = device_attrs,
+#else
+ .default_groups = device_groups,
+#endif
.sysfs_ops = &device_ops,
.release = release,
};
-
void dnbd3_sysfs_init(dnbd3_device_t *dev)
{
int error;
@@ -196,7 +181,7 @@ void dnbd3_sysfs_init(dnbd3_device_t *dev)
error = kobject_init_and_add(kobj, ktype, parent, "%s", "net");
if (error)
- printk("Error initializing dnbd3 device!\n");
+ dev_err(dnbd3_device_to_dev(dev), "initializing sysfs for device failed!\n");
}
void dnbd3_sysfs_exit(dnbd3_device_t *dev)
diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h
index 0a747a5..1db4a07 100644
--- a/src/kernel/sysfs.h
+++ b/src/kernel/sysfs.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,25 +22,16 @@
#ifndef SYSFS_H_
#define SYSFS_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
void dnbd3_sysfs_init(dnbd3_device_t *dev);
void dnbd3_sysfs_exit(dnbd3_device_t *dev);
-typedef struct
-{
+typedef struct {
struct attribute attr;
- ssize_t (*show)(char *, dnbd3_device_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_device_t *);
+ ssize_t (*show)(char *buf, dnbd3_device_t *dev);
+ ssize_t (*store)(const char *buf, size_t len, dnbd3_device_t *dev);
} device_attr_t;
-typedef struct
-{
- struct attribute attr;
- ssize_t (*show)(char *, dnbd3_server_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_server_t *);
-} server_attr_t;
-
-
#endif /* SYSFS_H_ */
diff --git a/src/kernel/utils.c b/src/kernel/utils.c
deleted file mode 100644
index 902025f..0000000
--- a/src/kernel/utils.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include <linux/kernel.h>
-
-#include "utils.h"
-
-unsigned int inet_addr(char *str)
-{
- int a, b, c, d;
- char arr[4];
- sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d);
- arr[0] = a;
- arr[1] = b;
- arr[2] = c;
- arr[3] = d;
- return *(unsigned int *) arr;
-}
-
-void inet_ntoa(struct in_addr addr, char *str)
-{
- unsigned char *ptr = (unsigned char *) &addr;
- sprintf(str, "%d.%d.%d.%d", ptr[0] & 0xff, ptr[1] & 0xff, ptr[2] & 0xff, ptr[3] & 0xff);
-}
diff --git a/src/kernel/utils.h b/src/kernel/utils.h
deleted file mode 100644
index e54b3cf..0000000
--- a/src/kernel/utils.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef UTILS_H_
-#define UTILS_H_
-
-#include <linux/in.h>
-
-unsigned int inet_addr(char *str);
-void inet_ntoa(struct in_addr addr, char *str);
-
-#endif /* UTILS_H_ */
diff --git a/src/serialize.h b/src/serialize.h
deleted file mode 100644
index 1b73531..0000000
--- a/src/serialize.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef SERIALIZER_H_
-#define SERIALIZER_H_
-
-// Careful with includes - this is used in kernel module too
-#include "config.h"
-
-typedef struct
-{
- char buffer[MAX_PAYLOAD]; // This MUST be the first member or send_reply() will blow up
- char *buffer_end;
- char *buffer_pointer;
-} serialized_buffer_t;
-
-void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len);
-
-void serializer_reset_write(serialized_buffer_t *buffer);
-
-uint32_t serializer_get_written_length(serialized_buffer_t *buffer);
-
-//
-
-uint8_t serializer_get_uint8(serialized_buffer_t *buffer);
-
-uint16_t serializer_get_uint16(serialized_buffer_t *buffer);
-
-uint64_t serializer_get_uint64(serialized_buffer_t *buffer);
-
-char *serializer_get_string(serialized_buffer_t *buffer);
-
-//
-
-void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value);
-
-void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value);
-
-void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value);
-
-void serializer_put_string(serialized_buffer_t *buffer, const char *value);
-
-#endif
diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt
new file mode 100644
index 0000000..9a1e1c4
--- /dev/null
+++ b/src/server/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-server
+ LANGUAGES C)
+
+# find Jansson package required by the dnbd3-server
+find_package(Jansson)
+if(NOT JANSSON_FOUND)
+ message(FATAL_ERROR "*** No jansson lib found, can't build dnbd3-server!")
+endif(NOT JANSSON_FOUND)
+
+# find atomic library required by the dnbd3-server
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to enable enhanced POSIX features
+add_definitions(-D_GNU_SOURCE)
+
+if(DNBD3_SERVER_AFL)
+ # check if DNBD3_RELEASE_HARDEN is disabled
+ if(DNBD3_RELEASE_HARDEN)
+ message(FATAL_ERROR "DNBD3_SERVER_AFL can only be enabled if DNBD3_RELEASE_HARDEN is disabled")
+ endif(DNBD3_RELEASE_HARDEN)
+
+ # build dnbd3-server with AFL support
+ message(STATUS "Building dnbd3-server with AFL support")
+ add_definitions(-DDNBD3_SERVER_AFL)
+
+ # change compiler for dnbd3-server sources if AFL enabled
+ include(CheckAFLCCompiler)
+ check_afl_c_compiler(AFL_C_COMPILER AFL_C_COMPILER_NAME ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ID})
+ if(AFL_C_COMPILER)
+ message(STATUS "Check for working AFL C compiler: ${AFL_C_COMPILER} - done")
+ # change C compiler to a corresponding AFL C compiler
+ set(CMAKE_C_COMPILER "${AFL_C_COMPILER}")
+ else(AFL_C_COMPILER)
+ # no corresponding AFL C compiler found
+ message(STATUS "Check for working AFL C compiler: ${AFL_C_COMPILER_NAME} - failed")
+ message(FATAL_ERROR "No corresponding AFL C compiler ${AFL_C_COMPILER_NAME} was found for the C compiler ${CMAKE_C_COMPILER}!")
+ endif(AFL_C_COMPILER)
+endif(DNBD3_SERVER_AFL)
+
+set(DNBD3_SERVER_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/altservers.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fileutil.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fuse.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/globals.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/image.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/ini.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/integrity.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/locks.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/reference.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/uplink.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/urldecode.c)
+set(DNBD3_SERVER_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/altservers.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/fileutil.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/fuse.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/globals.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/helper.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/image.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/ini.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/integrity.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/locks.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/reference.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/reftypes.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/rpc.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/server.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/threadpool.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/uplink.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/urldecode.h)
+
+add_executable(dnbd3-server ${DNBD3_SERVER_SOURCE_FILES})
+target_include_directories(dnbd3-server PRIVATE ${JANSSON_INCLUDE_DIR})
+target_link_libraries(dnbd3-server dnbd3-version dnbd3-build dnbd3-shared picohttpparser Libatomic::Libatomic ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
+
+if(DNBD3_SERVER_FUSE)
+ find_package(Fuse REQUIRED)
+ # include Fuse headers and link with Fuse library
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_FUSE)
+ target_include_directories(dnbd3-server PRIVATE ${FUSE_INCLUDE_DIRS})
+ target_link_libraries(dnbd3-server ${FUSE_LIBRARIES})
+endif(DNBD3_SERVER_FUSE)
+
+if(UNIX AND NOT APPLE)
+ # link dnbd3-server with librt if server is compiled for a Unix system
+ target_link_libraries(dnbd3-server rt)
+endif(UNIX AND NOT APPLE)
+
+if(DNBD3_SERVER_DEBUG_LOCKS)
+ # enable debugging of locks used in the dnbd3-server
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_DEBUG_LOCKS)
+endif(DNBD3_SERVER_DEBUG_LOCKS)
+
+if(DNBD3_SERVER_DEBUG_THREADS)
+ # enable debugging of threads used in the dnbd3-server
+ target_compile_options(dnbd3-server PRIVATE -DDNBD3_SERVER_DEBUG_THREADS)
+endif(DNBD3_SERVER_DEBUG_THREADS)
+
+install(TARGETS dnbd3-server RUNTIME DESTINATION bin
+ COMPONENT server)
+
+add_linter(dnbd3-server-lint "${DNBD3_SERVER_SOURCE_FILES}" "${DNBD3_SERVER_HEADER_FILES}")
+add_linter_fix(dnbd3-server-lint-fix "${DNBD3_SERVER_SOURCE_FILES}" "${DNBD3_SERVER_HEADER_FILES}")
+
+# add external dependency (HTTP parser) for the dnbd3-server
+add_subdirectory(picohttpparser)
diff --git a/src/server/altservers.c b/src/server/altservers.c
index 943345c..4413ca6 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -5,16 +5,16 @@
#include "helper.h"
#include "image.h"
#include "fileutil.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../serverconfig.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/config/server.h>
#include "reference.h"
#include <assert.h>
#include <inttypes.h>
#include <jansson.h>
-#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid)
+#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, PIMG(image))
#define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
#define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
@@ -172,7 +172,7 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
if ( uplink->rttTestResult != RTT_INPROGRESS ) {
dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref );
if ( current == uplink ) {
- threadpool_run( &altservers_runCheck, uplink );
+ threadpool_run( &altservers_runCheck, uplink, "UPLINK" );
} else if ( current != NULL ) {
ref_put( &current->reference );
}
@@ -268,12 +268,32 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server
int idx[size];
int num = altservers_getListForUplink( NULL, image, idx, size, -1 );
for ( int i = 0; i < num; ++i ) {
- servers[i] = altServers[i].host;
+ servers[i] = altServers[idx[i]].host;
}
return num;
}
/**
+ * Returns true if there is at least one alt-server the
+ * given image name would be allowed to be cloned from.
+ */
+bool altservers_imageHasAltServers(const char *image)
+{
+	bool found = false;
+	mutex_lock( &altServersLock );
+	for ( int i = 0; i < numAltServers && !found; ++i ) {
+		// Usable for cloning: not a client-only server, and either private
+		// or we are not restricted to private servers.
+		const bool usable = !altServers[i].isClientOnly
+				&& ( altServers[i].isPrivate || !_proxyPrivateOnly );
+		found = usable && isImageAllowed( &altServers[i], image );
+	}
+	mutex_unlock( &altServersLock );
+	return found;
+}
+
+/**
* Get <size> alt servers. If there are more alt servers than
* requested, random servers will be picked.
* This function is suited for finding uplink servers as
@@ -450,6 +470,11 @@ static void *altservers_runCheck(void *data)
void altservers_findUplink(dnbd3_uplink_t *uplink)
{
altservers_findUplinkInternal( uplink );
+ // Above function is sync, which means normally when it
+ // returns, rttTestResult will not be RTT_INPROGRESS.
+ // But we might have an async call running in parallel, which would
+ // mean the above call returns immediately. Wait for that check
+ // to finish too.
while ( uplink->rttTestResult == RTT_INPROGRESS ) {
usleep( 5000 );
}
@@ -504,17 +529,29 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
return;
}
- LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid );
+ logadd( LOG_DEBUG2, "Running alt check for %s:%d", PIMG(image) );
assert( uplink->rttTestResult == RTT_INPROGRESS );
// Test them all
dnbd3_server_connection_t best = { .fd = -1 };
unsigned long bestRtt = RTT_UNREACHABLE;
unsigned long currentRtt = RTT_UNREACHABLE;
+ uint64_t offset = 0;
+ uint32_t length = DNBD3_BLOCK_SIZE;
+ // Try to use the range of the first request in the queue as RTT block.
+ // In case we have a cluster of servers where none of them has a complete
+ // copy, we at least make sure the one we're potentially switching to
+ // has the next block we're about to request.
+ mutex_lock( &uplink->queueLock );
+ if ( uplink->queue != NULL ) {
+ offset = uplink->queue->from;
+ length = (uint32_t)( uplink->queue->to - offset );
+ }
+ mutex_unlock( &uplink->queueLock );
for (itAlt = 0; itAlt < numAlts; ++itAlt) {
int server = servers[itAlt];
// Connect
clock_gettime( BEST_CLOCK_SOURCE, &start );
- int sock = sock_connect( &altServers[server].host, 750, 1000 );
+ int sock = sock_connect( &altServers[server].host, 750, _uplinkTimeout );
if ( sock == -1 ) { // Connection failed means global error
altservers_serverFailed( server );
continue;
@@ -524,7 +561,8 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
goto image_failed;
}
// See if selecting the image succeeded ++++++++++++++++++++++++++++++
- uint16_t protocolVersion, rid;
+ uint16_t protocolVersion = 0;
+ uint16_t rid;
uint64_t imageSize;
char *name;
serialized_buffer_t serialized;
@@ -543,9 +581,9 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
if ( imageSize != image->virtualFilesize ) {
ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
}
- // Request first block (NOT random!) ++++++++++++++++++++++++++++++
- if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
- LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server );
+ // Request block (NOT random! First or from queue) ++++++++++++
+ if ( !dnbd3_get_block( sock, offset, length, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+ LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request block", server );
}
// See if requesting the block succeeded ++++++++++++++++++++++
dnbd3_reply_t reply;
@@ -553,13 +591,18 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server );
}
// check reply header
- if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+ if ( reply.cmd != CMD_GET_BLOCK || reply.size != length ) {
// Sanity check failed; count this as global error (malicious/broken server)
ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
}
// flush payload to include this into measurement
char buffer[DNBD3_BLOCK_SIZE];
- if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+ uint32_t todo = length;
+ ssize_t ret;
+ while ( todo != 0 && ( ret = recv( sock, buffer, MIN( DNBD3_BLOCK_SIZE, todo ), MSG_WAITALL ) ) > 0 ) {
+ todo -= (uint32_t)ret;
+ }
+ if ( todo != 0 ) {
ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server );
}
clock_gettime( BEST_CLOCK_SOURCE, &end );
@@ -567,9 +610,6 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
mutex_lock( &uplink->rttLock );
const bool isCurrent = ( uplink->current.index == server );
mutex_unlock( &uplink->rttLock );
- // Penaltize rtt if this was a cycle; this will treat this server with lower priority
- // in the near future too, so we prevent alternating between two servers that are both
- // part of a cycle and have the lowest latency.
uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000
+ (end.tv_nsec - start.tv_nsec) / 1000); // µs
uint32_t avg = altservers_updateRtt( uplink, server, rtt );
@@ -614,7 +654,6 @@ failed:
} else {
LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
}
- sock_setTimeout( best.fd, _uplinkTimeout );
mutex_lock( &uplink->rttLock );
uplink->better = best;
uplink->rttTestResult = RTT_DOCHANGE;
@@ -628,10 +667,6 @@ failed:
if ( best.fd != -1 ) {
close( best.fd );
}
- if ( !image->working || uplink->cycleDetected ) {
- image->working = true;
- LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid );
- }
uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
mutex_lock( &uplink->rttLock );
uplink->rttTestResult = RTT_DONTCHANGE;
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 8e29aaa..78f6fcc 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou
int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
+bool altservers_imageHasAltServers(const char *image);
+
bool altservers_toString(int server, char *buffer, size_t len);
int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
diff --git a/src/server/fileutil.c b/src/server/fileutil.c
index 336ab68..9a9f066 100644
--- a/src/server/fileutil.c
+++ b/src/server/fileutil.c
@@ -68,7 +68,7 @@ bool file_setSize(int fd, uint64_t size)
// Try really hard... image loading logic relies on the file
// having the proper apparent size
uint8_t byte = 0;
- pread( fd, &byte, 1, size - 1 );
+ (void)!pread( fd, &byte, 1, size - 1 );
if ( pwrite( fd, &byte, 1, size - 1 ) == 1 ) return true;
return false;
}
diff --git a/src/server/fuse.c b/src/server/fuse.c
new file mode 100644
index 0000000..12913a6
--- /dev/null
+++ b/src/server/fuse.c
@@ -0,0 +1,661 @@
+#include "fuse.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/log.h>
+
+#ifndef DNBD3_SERVER_FUSE
+
+//
+/**
+ * Stub used when DNBD3_SERVER_FUSE is not defined: logs an error and
+ * reports failure. Both arguments are ignored.
+ */
+bool dfuse_init(const char *opts UNUSED, const char *dir UNUSED)
+{
+ logadd( LOG_ERROR, "FUSE: Not compiled in" );
+ return false;
+}
+
+/** Stub used when DNBD3_SERVER_FUSE is not defined; does nothing. */
+void dfuse_shutdown()
+{
+}
+
+#else
+
+#define PATHLEN (2000)
+static char nullbytes[DNBD3_BLOCK_SIZE];
+
+// FUSE ENABLED
+#define FUSE_USE_VERSION 30
+//
+#include <dnbd3/config.h>
+#include "locks.h"
+#include "threadpool.h"
+#include "image.h"
+#include "uplink.h"
+#include "reference.h"
+#include "helper.h"
+
+#include <fuse_lowlevel.h>
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include <signal.h>
+
+#define INO_ROOT (1)
+#define INO_CTRL (2)
+#define INO_DIR (3)
+static const char *NAME_CTRL = "control";
+static const char *NAME_DIR = "images";
+
+/**
+ * One image lookup issued through the control file.
+ * Allocated in ll_write(); stored in a directory entry's img pointer
+ * when addImage() creates a new entry for it.
+ */
+typedef struct {
+ fuse_req_t req; // FUSE request that triggered the lookup
+ uint16_t rid; // revision id; replaced by the image's actual rid after loading
+ char name[PATHLEN]; // image name (part before the ':' in the written string)
+} lookup_t;
+
+static fuse_ino_t inoCounter = 10;
+/**
+ * Node of the in-memory directory tree below "images".
+ * A node with img == NULL is treated as a directory, otherwise as an image file.
+ */
+typedef struct _dfuse_dir {
+ struct _dfuse_dir *next; // next sibling in the same directory
+ struct _dfuse_dir *child; // first entry inside this directory
+ const char *name; // entry name (heap-allocated by addImage for dynamic entries)
+ uint64_t size; // file size reported by stat; set from the image's virtualFilesize
+ fuse_ino_t ino; // inode number handed out to FUSE
+ int refcount; // incremented in ll_open, decremented in ll_release
+ lookup_t *img; // lookup describing the image, NULL for directories
+} dfuse_entry_t;
+
+/** Per-open-file state stored in fuse_file_info.fh by ll_open() and freed in ll_release(). */
+typedef struct {
+ dfuse_entry_t *entry; // directory entry that was opened
+ dnbd3_image_t *image; // image reference obtained via image_get(), released in ll_release()
+} cmdopen_t;
+
+/**
+ * Statically allocated entry for the "images" directory (inode INO_DIR);
+ * `root` is the pointer used as the starting point for inode lookups.
+ * NOTE(review): the initial refcount of 2 is not explained by the visible code - confirm intent.
+ */
+static dfuse_entry_t sroot = {
+ .name = "images",
+ .ino = INO_DIR,
+ .refcount = 2,
+}, *root = &sroot;
+static pthread_mutex_t dirLock;
+
+#define INIT_NONE (0)
+#define INIT_DONE (1)
+#define INIT_SHUTDOWN (2)
+#define INIT_INPROGRESS (3)
+
+static struct fuse_session *fuseSession = NULL;
+static struct fuse_chan *fuseChannel = NULL;
+static char *fuseMountPoint = NULL;
+static pthread_t fuseThreadId;
+static bool haveThread = false;
+static _Atomic(int) initState = INIT_NONE;
+static pthread_mutex_t initLock;
+static struct timespec startupTime;
+
+static dfuse_entry_t* dirLookup(dfuse_entry_t *dir, const char *name);
+static dfuse_entry_t* inoRecursive(dfuse_entry_t *dir, fuse_ino_t ino);
+
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start UNUSED, uint32_t length, const char *buffer);
+static void cleanupFuse();
+static void* fuseMainLoop(void *data);
+
+/**
+ * FUSE open handler.
+ * - Control file: may only be opened write-only; marked non-seekable.
+ * - Root directory: rejected with EISDIR.
+ * - Anything else: must be opened read-only and must resolve to an image
+ *   entry with a non-zero rid. On success a cmdopen_t holding the entry
+ *   and an image reference is stored in fi->fh; ll_release() undoes this.
+ */
+static void ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+	fi->fh = 0;
+	if ( ino == INO_CTRL ) {
+		if ( ( fi->flags & O_ACCMODE ) != O_WRONLY ) {
+			fuse_reply_err( req, EINVAL );
+		} else {
+			fi->nonseekable = 1;
+			fuse_reply_open( req, fi );
+		}
+		return;
+	}
+	if ( ino == INO_ROOT ) {
+		fuse_reply_err( req, EISDIR );
+		return;
+	}
+	if ( ( fi->flags & O_ACCMODE ) != O_RDONLY ) {
+		fuse_reply_err( req, EINVAL );
+		return;
+	}
+	mutex_lock( &dirLock );
+	dfuse_entry_t *entry = inoRecursive( root, ino );
+	int err = 0;
+	if ( entry == NULL ) {
+		err = ENOENT;
+	} else if ( entry->img == NULL ) {
+		err = EISDIR;
+	} else if ( entry->img->rid == 0 ) {
+		err = ENOENT;
+	} else {
+		// Pin the entry while the file is open
+		entry->refcount++;
+	}
+	mutex_unlock( &dirLock );
+	if ( err != 0 ) {
+		fuse_reply_err( req, err );
+		return;
+	}
+	dnbd3_image_t *image = image_get( entry->img->name, entry->img->rid, true );
+	if ( image == NULL ) {
+		fuse_reply_err( req, ENOENT );
+		mutex_lock( &dirLock );
+		entry->refcount--;
+		mutex_unlock( &dirLock );
+		return;
+	}
+	cmdopen_t *handle = malloc( sizeof(*handle) );
+	if ( handle == NULL ) {
+		// Out of memory: undo the image reference and the entry pin
+		image_release( image );
+		mutex_lock( &dirLock );
+		entry->refcount--;
+		mutex_unlock( &dirLock );
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	handle->entry = entry;
+	handle->image = image;
+	fi->fh = (uintptr_t)handle;
+	fi->keep_cache = 1;
+	fuse_reply_open( req, fi );
+}
+
+/**
+ * Allocate a zeroed directory-tree entry, assign it the next inode number
+ * and push it onto the front of the sibling list *dir.
+ * Takes ownership of `name`. Aborts if the allocation fails, matching how
+ * asprintf() failures are handled in addImage().
+ */
+static dfuse_entry_t* prependEntry(dfuse_entry_t **dir, char *name)
+{
+	dfuse_entry_t *entry = calloc( 1, sizeof( *entry ) );
+	if ( entry == NULL )
+		abort();
+	entry->next = *dir;
+	*dir = entry;
+	entry->name = name;
+	entry->ino = inoCounter++;
+	return entry;
+}
+
+/**
+ * Insert the image described by `img` into the tree rooted at *dir.
+ * `name` is a '/'-separated path: every component before the last becomes
+ * (or reuses) a directory entry, the last one becomes a file entry named
+ * "<name>:<rid>".
+ * Returns the file entry. If a file entry of that name already exists it is
+ * returned unchanged (its img is NOT replaced by `img`). Returns NULL if the
+ * existing entry of that name is a directory.
+ * HOLD LOCK (the visible caller holds dirLock).
+ */
+static dfuse_entry_t* addImage(dfuse_entry_t **dir, const char *name, lookup_t *img)
+{
+	const char *slash = strchr( name, '/' );
+	char *path = NULL;
+	if ( slash == NULL ) {
+		// Name portion at the end
+		if ( asprintf( &path, "%s:%d", name, (int)img->rid ) == -1 )
+			abort();
+		dfuse_entry_t *entry = dirLookup( *dir, path );
+		if ( entry == NULL ) {
+			entry = prependEntry( dir, path );
+			entry->img = img;
+			return entry;
+		}
+		free( path );
+		return entry->img == NULL ? NULL : entry;
+	}
+	// Dirname
+	if ( asprintf( &path, "%.*s", (int)( slash - name ), name ) == -1 )
+		abort();
+	dfuse_entry_t *entry = dirLookup( *dir, path );
+	if ( entry == NULL ) {
+		entry = prependEntry( dir, path );
+	} else {
+		free( path );
+	}
+	return addImage( &entry->child, slash + 1, img );
+}
+
+/**
+ * FUSE write handler; only the control file accepts writes.
+ * The written data is parsed as "<image name>[:<rid>]", terminated by the
+ * end of the buffer, a NUL byte or a newline. The image is loaded via
+ * image_getOrLoad() and, on success, added to the directory tree.
+ * Replies: EROFS (not the control file), ESPIPE (offset != 0), ENOSPC (too
+ * long), EINVAL (bad rid, or name collides with a directory), ENOMEM,
+ * ENOENT (image not found); otherwise the full write size.
+ */
+static void ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, off_t off, struct fuse_file_info *fi UNUSED)
+{
+	if ( ino != INO_CTRL ) {
+		fuse_reply_err( req, EROFS );
+		return;
+	}
+	if ( off != 0 ) {
+		fuse_reply_err( req, ESPIPE );
+		return;
+	}
+	if ( size >= PATHLEN ) {
+		fuse_reply_err( req, ENOSPC );
+		return;
+	}
+	size_t colon = 0;
+	int rid = 0;
+	for ( size_t i = 0; i < size; ++i ) {
+		if ( buf[i] == '\0' || buf[i] == '\n' ) {
+			if ( colon == 0 ) {
+				colon = i;
+			}
+			break;
+		}
+		if ( colon != 0 ) {
+			// <ctype.h> functions require an unsigned char value
+			if ( !isdigit( (unsigned char)buf[i] ) ) {
+				logadd( LOG_WARNING, "FUSE: Malformed rid" );
+				fuse_reply_err( req, EINVAL );
+				return;
+			}
+			rid = rid * 10 + ( buf[i] - '0' );
+			// Reject as soon as the value leaves the 16 bit range, so the
+			// accumulator can never overflow.
+			if ( rid > 65535 ) {
+				logadd( LOG_WARNING, "FUSE: Invalid rid '%d'", rid );
+				fuse_reply_err( req, EINVAL );
+				return;
+			}
+		} else if ( buf[i] == ':' ) {
+			colon = i; // Image name starting with ':' would be broken...
+		}
+	}
+	if ( colon == 0 ) {
+		colon = size;
+	}
+	lookup_t *lu = malloc( sizeof(*lu) );
+	if ( lu == NULL ) {
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	lu->rid = (uint16_t)rid;
+	lu->req = req;
+	if ( snprintf( lu->name, PATHLEN, "%.*s", (int)colon, buf ) < 0 ) {
+		free( lu );
+		fuse_reply_err( req, ENOSPC );
+		return;
+	}
+	logadd( LOG_DEBUG1, "FUSE: Request for '%s:%d'", lu->name, (int)lu->rid );
+	dnbd3_image_t *image = image_getOrLoad( lu->name, lu->rid );
+	if ( image == NULL ) {
+		fuse_reply_err( req, ENOENT );
+		free( lu );
+		return;
+	}
+	mutex_lock( &dirLock );
+	dfuse_entry_t *entry = addImage( &root->child, lu->name, lu );
+	if ( entry != NULL ) {
+		entry->size = image->virtualFilesize;
+	}
+	lu->rid = image->rid; // In case it was 0
+	// addImage only stores lu in a newly created entry; if the entry already
+	// existed (or on failure) we still own lu and must free it.
+	const bool adopted = ( entry != NULL && entry->img == lu );
+	mutex_unlock( &dirLock );
+	image_release( image );
+	if ( entry == NULL ) {
+		fuse_reply_err( req, EINVAL );
+	} else {
+		fuse_reply_write( req, size );
+	}
+	if ( !adopted ) {
+		free( lu );
+	}
+}
+
+/**
+ * FUSE read handler for image files.
+ * Requests without a file handle, or starting at/after the end of the
+ * image, get an empty reply. The size is clamped to the virtual image size.
+ * If the image has a cache map and the block-aligned range is not fully
+ * cached, the request (clamped to _maxPayload) is handed to uplink_request()
+ * and answered asynchronously by uplinkCallback(). Otherwise data is served
+ * from image->readFd, followed by zero bytes for the part beyond realFilesize.
+ */
+static void ll_read( fuse_req_t req, fuse_ino_t ino UNUSED, size_t size, off_t off, struct fuse_file_info *fi )
+{
+	if ( fi->fh == 0 ) {
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	cmdopen_t *handle = (cmdopen_t*)fi->fh;
+	dnbd3_image_t *image = handle->image;
+	if ( off < 0 || (uint64_t)off >= image->virtualFilesize ) {
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	if ( off + size > image->virtualFilesize ) {
+		size = image->virtualFilesize - off;
+	}
+
+	// Check if cached locally
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache != NULL ) {
+		// This is a proxied image, check if we need to relay the request...
+		const uint64_t start = (uint64_t)off & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		const uint64_t end = (off + size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+			ref_put( &cache->reference );
+			if ( size > (uint32_t)_maxPayload ) {
+				size = (uint32_t)_maxPayload;
+			}
+			if ( !uplink_request( image, req, &uplinkCallback, 0, off, (uint32_t)size ) ) {
+				logadd( LOG_DEBUG1, "FUSE: Could not relay uncached request to upstream proxy for image %s:%d",
+						image->name, image->rid );
+				fuse_reply_err( req, EIO );
+			}
+			return; // ASYNC
+		}
+		ref_put( &cache->reference );
+	}
+
+	// Is cached
+	size_t readSize = size;
+	if ( off + readSize > image->realFilesize ) {
+		if ( (uint64_t)off >= image->realFilesize ) {
+			readSize = 0;
+		} else {
+			readSize = image->realFilesize - off;
+		}
+	}
+	// fuse_bufvec already contains one fuse_buf; reserve room for a second one
+	struct fuse_bufvec *vec = calloc( 1, sizeof(*vec) + sizeof(struct fuse_buf) );
+	if ( vec == NULL ) {
+		fuse_reply_err( req, ENOMEM );
+		return;
+	}
+	if ( readSize != 0 ) {
+		// Real data from file
+		vec->buf[vec->count++] = (struct fuse_buf){
+			.size = readSize,
+			.flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_RETRY | FUSE_BUF_FD_SEEK,
+			.fd = image->readFd,
+			.pos = off,
+		};
+	}
+	if ( readSize != size ) {
+		// Zero padding past the end of the real file.
+		// NOTE(review): nullbytes is only DNBD3_BLOCK_SIZE bytes long; this assumes
+		// virtualFilesize exceeds realFilesize by less than one block - confirm.
+		vec->buf[vec->count++] = (struct fuse_buf){
+			.size = size - readSize,
+			.mem = nullbytes,
+			.fd = -1,
+		};
+	}
+	fuse_reply_data( req, vec, FUSE_BUF_SPLICE_MOVE );
+	free( vec );
+}
+
+/**
+ * Fill stbuf for one of the fixed inodes (root, "images" dir, control file).
+ * Returns false and leaves stbuf untouched for any other inode.
+ */
+static bool statInternal(fuse_ino_t ino, struct stat *stbuf)
+{
+	if ( ino == INO_ROOT || ino == INO_DIR ) {
+		stbuf->st_mode = S_IFDIR | 0555;
+		stbuf->st_nlink = 2;
+		stbuf->st_mtim = startupTime;
+	} else if ( ino == INO_CTRL ) {
+		stbuf->st_mode = S_IFREG | 0222;
+		stbuf->st_nlink = 1;
+		stbuf->st_size = 0;
+		// Control file always reports "now" as modification time
+		clock_gettime( CLOCK_REALTIME, &stbuf->st_mtim );
+	} else {
+		return false;
+	}
+	stbuf->st_atim = startupTime;
+	stbuf->st_ctim = stbuf->st_atim;
+	stbuf->st_uid = 0;
+	stbuf->st_ino = ino;
+	return true;
+}
+
+/**
+ * Find the entry called `name` in the sibling list starting at `dir`.
+ * Returns NULL if the list is empty or has no such entry.
+ * HOLD LOCK
+ */
+static dfuse_entry_t* dirLookup(dfuse_entry_t *dir, const char *name)
+{
+	dfuse_entry_t *cur = dir;
+	while ( cur != NULL && strcmp( cur->name, name ) != 0 ) {
+		cur = cur->next;
+	}
+	return cur;
+}
+
+/**
+ * Depth-first search for the entry with inode `ino`, starting at the sibling
+ * list `dir` and descending into the children of directory entries.
+ * Logs every visited entry at debug level. Returns NULL if not found.
+ */
+static dfuse_entry_t* inoRecursive(dfuse_entry_t *dir, fuse_ino_t ino)
+{
+	dfuse_entry_t *it = dir;
+	while ( it != NULL ) {
+		logadd( LOG_DEBUG1, "ino %d is %s", (int)it->ino, it->name );
+		if ( it->ino == ino )
+			return it;
+		if ( it->img == NULL ) {
+			// Directory: search its children before moving on to the next sibling
+			dfuse_entry_t *found = inoRecursive( it->child, ino );
+			if ( found != NULL )
+				return found;
+		}
+		it = it->next;
+	}
+	return NULL;
+}
+
+/**
+ * Fill stbuf from a dynamic tree entry: read-only directory for entries
+ * without an image, read-only regular file of entry->size bytes otherwise.
+ * All timestamps are set to the server startup time.
+ * HOLD LOCK
+ */
+static void entryToStat(dfuse_entry_t *entry, struct stat *stbuf)
+{
+	const bool isDir = ( entry->img == NULL );
+	stbuf->st_mode = isDir ? ( S_IFDIR | 0555 ) : ( S_IFREG | 0444 );
+	stbuf->st_nlink = isDir ? 2 : 1;
+	if ( !isDir ) {
+		stbuf->st_size = entry->size;
+	}
+	stbuf->st_ino = entry->ino;
+	stbuf->st_uid = 0;
+	stbuf->st_mtim = startupTime;
+	stbuf->st_atim = stbuf->st_mtim;
+	stbuf->st_ctim = stbuf->st_atim;
+}
+
+/**
+ * FUSE lookup handler: resolve `name` inside the directory with inode `parent`.
+ * In the root directory only the fixed names "images" and "control" exist;
+ * everywhere else the dynamic tree is searched under dirLock.
+ * Replies ENOTDIR if `parent` is an image file, ENOENT if nothing matches.
+ */
+static void ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+	logadd( LOG_DEBUG2, "Lookup at ino %d for '%s'", (int)parent, name );
+	if ( parent != INO_ROOT ) {
+		mutex_lock( &dirLock );
+		dfuse_entry_t *dir = inoRecursive( root, parent );
+		if ( dir != NULL && dir->img != NULL ) {
+			mutex_unlock( &dirLock );
+			fuse_reply_err( req, ENOTDIR );
+			return;
+		}
+		dfuse_entry_t *hit = ( dir == NULL ) ? NULL : dirLookup( dir->child, name );
+		if ( hit == NULL ) {
+			mutex_unlock( &dirLock );
+			fuse_reply_err( req, ENOENT );
+			return;
+		}
+		struct fuse_entry_param e = { .ino = hit->ino };
+		entryToStat( hit, &e.attr );
+		mutex_unlock( &dirLock );
+		fuse_reply_entry( req, &e );
+		return;
+	}
+	// Root directory: only the two fixed names exist
+	struct fuse_entry_param e = { 0 };
+	if ( strcmp( name, NAME_DIR ) == 0 ) {
+		e.ino = INO_DIR;
+	} else if ( strcmp( name, NAME_CTRL ) == 0 ) {
+		e.ino = INO_CTRL;
+		e.attr_timeout = e.entry_timeout = 3600;
+	}
+	if ( e.ino == 0 || !statInternal( e.ino, &e.attr ) ) {
+		fuse_reply_err( req, ENOENT );
+		return;
+	}
+	fuse_reply_entry( req, &e );
+}
+
+/** Growable byte buffer that dirbuf_add() fills with FUSE directory entries. */
+struct dirbuf {
+ char *p; // heap buffer, grown with realloc; freed by the caller
+ size_t size; // number of bytes used in p
+};
+
+/**
+ * Append one directory entry (name + inode) to the buffer `b`.
+ * The required size is obtained from a first fuse_add_direntry() call with
+ * a NULL buffer, then the buffer is grown and the entry written.
+ * If growing the buffer fails, the buffer is left unchanged, a warning is
+ * logged and the entry is dropped.
+ */
+static void dirbuf_add( fuse_req_t req, struct dirbuf *b, const char *name, fuse_ino_t ino )
+{
+	struct stat stbuf = { .st_ino = ino };
+	const size_t oldsize = b->size;
+	const size_t newsize = oldsize + fuse_add_direntry( req, NULL, 0, name, NULL, 0 );
+	// Keep the old pointer until realloc succeeded, so nothing is lost on failure
+	char *grown = realloc( b->p, newsize );
+	if ( grown == NULL ) {
+		logadd( LOG_WARNING, "FUSE: Out of memory while building directory listing" );
+		return;
+	}
+	b->p = grown;
+	b->size = newsize;
+	fuse_add_direntry( req, b->p + oldsize, newsize - oldsize, name, &stbuf, newsize );
+}
+
+/**
+ * Reply with the part of buf[0..bufsize) that starts at offset `off`,
+ * limited to `maxsize` bytes. An offset outside the buffer yields an
+ * empty reply. Returns the result of fuse_reply_buf().
+ */
+static int reply_buf_limited( fuse_req_t req, const char *buf, size_t bufsize, off_t off, size_t maxsize )
+{
+	if ( off < 0 || off >= (off_t)bufsize ) {
+		return fuse_reply_buf( req, NULL, 0 );
+	}
+	const size_t remaining = bufsize - off;
+	const size_t len = remaining < maxsize ? remaining : maxsize;
+	return fuse_reply_buf( req, buf + off, len );
+}
+
+/**
+ * FUSE readdir handler. Only the root directory can be listed; it contains
+ * ".", "..", the control file and the "images" directory. Every other
+ * inode is answered with EACCES.
+ */
+static void ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, struct fuse_file_info *fi UNUSED)
+{
+	if ( ino != INO_ROOT ) {
+		fuse_reply_err( req, EACCES );
+		return;
+	}
+	struct dirbuf listing;
+	memset( &listing, 0, sizeof( listing ) );
+	dirbuf_add( req, &listing, ".", INO_ROOT );
+	dirbuf_add( req, &listing, "..", INO_ROOT );
+	dirbuf_add( req, &listing, NAME_CTRL, INO_CTRL );
+	dirbuf_add( req, &listing, NAME_DIR, INO_DIR );
+	reply_buf_limited( req, listing.p, listing.size, off, size );
+	free( listing.p );
+}
+
+/**
+ * FUSE getattr handler: tries the fixed inodes first, then the dynamic
+ * tree. Replies ENOENT if neither knows the inode; attributes are returned
+ * with a timeout of 0.
+ */
+static void ll_getattr(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi UNUSED)
+{
+	struct stat stbuf = { .st_ino = 0 };
+	const bool isFixed = statInternal( ino, &stbuf );
+	if ( !isFixed ) {
+		mutex_lock( &dirLock );
+		dfuse_entry_t *entry = inoRecursive( root, ino );
+		if ( entry != NULL ) {
+			entryToStat( entry, &stbuf );
+		}
+		mutex_unlock( &dirLock );
+	}
+	// st_ino is still 0 if nothing filled in the stat buffer
+	if ( stbuf.st_ino != 0 ) {
+		fuse_reply_attr( req, &stbuf, 0 );
+	} else {
+		fuse_reply_err( req, ENOENT );
+	}
+}
+
+/**
+ * FUSE setattr handler: ignores the requested changes and simply replies
+ * with the current attributes, exactly like ll_getattr().
+ */
+void ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr UNUSED, int to_set UNUSED, struct fuse_file_info *fi)
+{
+ ll_getattr( req, ino, fi );
+}
+
+/**
+ * FUSE release handler: for handles created by ll_open() on an image,
+ * drops the image reference, unpins the directory entry and frees the
+ * handle. Always replies with success.
+ */
+void ll_release(fuse_req_t req, fuse_ino_t ino UNUSED, struct fuse_file_info *fi)
+{
+	cmdopen_t *handle = (cmdopen_t*)fi->fh;
+	if ( fi->fh == 0 ) {
+		// Control file or failed open: nothing was attached
+		fuse_reply_err( req, 0 );
+		return;
+	}
+	image_release( handle->image );
+	mutex_lock( &dirLock );
+	handle->entry->refcount--;
+	mutex_unlock( &dirLock );
+	free( handle );
+	fuse_reply_err( req, 0 );
+}
+
+/**
+ * Completion callback passed to uplink_request() by ll_read().
+ * `data` is the pending FUSE request; a NULL buffer is reported as EIO,
+ * otherwise `length` bytes from `buffer` are sent as the read reply.
+ */
+static void uplinkCallback(void *data, uint64_t handle UNUSED, uint64_t start UNUSED, uint32_t length, const char *buffer)
+{
+	fuse_req_t req = (fuse_req_t)data;
+	if ( buffer != NULL ) {
+		fuse_reply_buf( req, buffer, length );
+		return;
+	}
+	fuse_reply_err( req, EIO );
+}
+
+// Log one field of the connection info struct at debug level
+#define DUMP(key,type) logadd( LOG_DEBUG1, "FUSE: " #key ": " type, conn->key )
+/**
+ * FUSE init handler: logs the connection parameters and additionally
+ * requests the splice read/write/move capabilities. `userdata` is unused.
+ */
+void ll_init(void *userdata UNUSED, struct fuse_conn_info *conn)
+{
+	DUMP( capable, "%u" );
+	DUMP( congestion_threshold, "%u" );
+	DUMP( max_background, "%u" );
+	//DUMP( max_read, "%u" );
+	DUMP( max_readahead, "%u" );
+	DUMP( max_write, "%u" );
+	DUMP( want, "%u" );
+	conn->want |= FUSE_CAP_SPLICE_READ | FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
+}
+#undef DUMP
+
+/*
+ * Map the implemented fuse operations.
+ * Passed to fuse_lowlevel_new() in dfuse_init(); operations not listed
+ * here are left unset.
+ */
+static struct fuse_lowlevel_ops fuseOps = {
+ .lookup = ll_lookup,
+ .getattr = ll_getattr,
+ .setattr = ll_setattr,
+ .readdir = ll_readdir,
+ .open = ll_open,
+ .release = ll_release,
+ .read = ll_read,
+ .write = ll_write,
+ .init = ll_init,
+ //.destroy = ll_destroy,
+};
+
+/**
+ * Mount the FUSE file system at `dir` and start the main loop thread.
+ * `opts` is an optional extra argument passed to the FUSE option parser
+ * (may be NULL). May only be called once; a second call logs an error
+ * and terminates the process.
+ * Returns true if the session was created and the thread was started,
+ * false after cleaning up on any failure.
+ */
+bool dfuse_init(const char *opts, const char *dir)
+{
+ // Move INIT_NONE -> INIT_INPROGRESS atomically; anything else means a repeated call
+ int ex = INIT_NONE;
+ if ( !atomic_compare_exchange_strong( &initState, &ex, INIT_INPROGRESS ) ) {
+ logadd( LOG_ERROR, "Calling dfuse_init twice" );
+ exit( 1 );
+ }
+ mutex_init( &initLock, LOCK_FUSE_INIT );
+ mutex_lock( &initLock );
+ mutex_init( &dirLock, LOCK_FUSE_DIR );
+ clock_gettime( CLOCK_REALTIME, &startupTime );
+ // Build a synthetic command line for the FUSE argument parser
+ struct fuse_args args = FUSE_ARGS_INIT( 0, NULL );
+ fuse_opt_add_arg( &args, "dnbd3fs" ); // argv[0]
+ if ( opts != NULL ) {
+ fuse_opt_add_arg( &args, opts );
+ }
+ fuse_opt_add_arg( &args, "-odefault_permissions" );
+ fuse_opt_add_arg( &args, dir ); // last param is mount point
+ //
+ if ( fuse_parse_cmdline( &args, &fuseMountPoint, NULL, NULL ) == -1 ) {
+ logadd( LOG_ERROR, "FUSE: Error parsing command line" );
+ goto fail;
+ }
+ fuseChannel = fuse_mount( fuseMountPoint, &args );
+ if ( fuseChannel == NULL ) {
+ logadd( LOG_ERROR, "FUSE: Cannot mount to %s", dir );
+ goto fail;
+ }
+ fuseSession = fuse_lowlevel_new( &args, &fuseOps, sizeof( fuseOps ), NULL );
+ if ( fuseSession == NULL ) {
+ logadd( LOG_ERROR, "FUSE: Error initializing fuse session" );
+ goto fail;
+ }
+ fuse_session_add_chan( fuseSession, fuseChannel );
+ if ( 0 != thread_create( &fuseThreadId, NULL, &fuseMainLoop, (void *)NULL ) ) {
+ logadd( LOG_ERROR, "FUSE: Could not start thread" );
+ goto fail;
+ }
+ haveThread = true;
+ // Init OK; initState is switched to INIT_DONE by fuseMainLoop once it runs.
+ // NOTE(review): args is only freed on the failure path below - confirm
+ // whether it may be freed here as well.
+ mutex_unlock( &initLock );
+ return true;
+fail:
+ cleanupFuse();
+ fuse_opt_free_args( &args );
+ initState = INIT_SHUTDOWN;
+ mutex_unlock( &initLock );
+ return false;
+}
+
+/**
+ * Stop the FUSE main loop and wait for its thread to finish.
+ * Does nothing if dfuse_init() was never called; logs a warning and
+ * returns if the state is neither INIT_DONE nor INIT_INPROGRESS.
+ */
+void dfuse_shutdown()
+{
+ if ( initState == INIT_NONE )
+ return;
+ for ( ;; ) {
+ int ex = INIT_DONE;
+ if ( atomic_compare_exchange_strong( &initState, &ex, INIT_SHUTDOWN ) )
+ break; // OK, do the shutdown
+ if ( ex == INIT_INPROGRESS )
+ continue; // dfuse_init in progress; retry (busy-loops until the state changes)
+ // Wrong state
+ logadd( LOG_WARNING, "Called dfuse_shutdown without dfuse_init first" );
+ return;
+ }
+ logadd( LOG_INFO, "Shutting down fuse mainloop..." );
+ mutex_lock( &initLock );
+ if ( fuseSession != NULL ) {
+ fuse_session_exit( fuseSession );
+ }
+ // Without a main loop thread nobody else will clean up, so do it here
+ if ( !haveThread ) {
+ cleanupFuse();
+ }
+ mutex_unlock( &initLock );
+ if ( haveThread ) {
+ // The signal is sent before joining; presumably it interrupts a blocking
+ // call in the main loop so the exit flag gets noticed - confirm.
+ logadd( LOG_DEBUG1, "FUSE: Sending USR1 to mainloop thread" );
+ pthread_kill( fuseThreadId, SIGUSR1 );
+ pthread_join( fuseThreadId, NULL );
+ }
+}
+
+/**
+ * Thread entry point started by dfuse_init(). Marks initialization as
+ * done, runs the multi-threaded FUSE session loop until it returns, then
+ * tears down the session under initLock. Always returns NULL.
+ */
+static void* fuseMainLoop(void *data UNUSED)
+{
+ // Only proceed if dfuse_init left the state at INIT_INPROGRESS
+ int ex = INIT_INPROGRESS;
+ if ( !atomic_compare_exchange_strong( &initState, &ex, INIT_DONE ) ) {
+ logadd( LOG_WARNING, "FUSE: Unexpected state in fuseMainLoop: %d", ex );
+ return NULL;
+ }
+ setThreadName( "fuse" );
+ logadd( LOG_INFO, "FUSE: Starting mainloop" );
+ fuse_session_loop_mt( fuseSession );
+ logadd( LOG_INFO, "FUSE: Left mainloop" );
+ mutex_lock( &initLock );
+ cleanupFuse();
+ mutex_unlock( &initLock );
+ return NULL;
+}
+
+/**
+ * Tear down whatever part of the FUSE setup exists: detach the channel
+ * from the session, destroy the session, then unmount. Resets fuseSession
+ * and fuseChannel to NULL, so calling it again is harmless.
+ * All callers in this file hold initLock while calling this.
+ */
+static void cleanupFuse()
+{
+ if ( fuseChannel != NULL ) {
+ fuse_session_remove_chan( fuseChannel );
+ }
+ if ( fuseSession != NULL ) {
+ fuse_session_destroy( fuseSession );
+ fuseSession = NULL;
+ }
+ if ( fuseMountPoint != NULL && fuseChannel != NULL ) {
+ fuse_unmount( fuseMountPoint, fuseChannel );
+ }
+ fuseChannel = NULL;
+}
+
+#endif // DNBD3_SERVER_FUSE
diff --git a/src/server/fuse.h b/src/server/fuse.h
new file mode 100644
index 0000000..f01ad58
--- /dev/null
+++ b/src/server/fuse.h
@@ -0,0 +1,10 @@
+#ifndef DNBD3_SERVER_FUSE_H_
+#define DNBD3_SERVER_FUSE_H_
+
+// The guard deliberately does not start with an underscore followed by an
+// upper-case letter: such identifiers are reserved for the implementation,
+// and a generic name like this is easily shared with other headers called fuse.h.
+
+#include <stdbool.h>
+
+/**
+ * Initialize the server's FUSE support and start its main loop.
+ * NOTE(review): opts and dir are presumably the FUSE mount options and
+ * the mount point directory -- confirm against the implementation.
+ * @return true if initialization succeeded, false otherwise
+ */
+bool dfuse_init(const char *opts, const char *dir);
+
+/**
+ * Shut down the FUSE main loop. Returns immediately if FUSE was never
+ * initialized; if a main loop thread is running, blocks until it has exited.
+ */
+void dfuse_shutdown(void);
+
+#endif // DNBD3_SERVER_FUSE_H_
diff --git a/src/server/globals.c b/src/server/globals.c
index f8c3f66..f6432cb 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -1,7 +1,7 @@
#include "globals.h"
#include "ini.h"
#include "locks.h"
-#include "../shared/log.h"
+#include <dnbd3/shared/log.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
@@ -19,22 +19,26 @@ atomic_int _clientPenalty = 0;
atomic_bool _isProxy = false;
atomic_int _backgroundReplication = BGR_FULL;
atomic_int _bgrMinClients = 0;
+atomic_int _bgrWindowSize = 1;
atomic_bool _lookupMissingForProxy = true;
atomic_bool _sparseFiles = false;
+atomic_bool _ignoreAllocErrors = false;
atomic_bool _removeMissingImages = true;
-atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
-atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT;
+atomic_uint _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
+atomic_uint _clientTimeout = SOCKET_TIMEOUT_CLIENT;
atomic_bool _closeUnusedFd = false;
atomic_bool _vmdkLegacyMode = false;
// Not really needed anymore since we have '+' and '-' in alt-servers
atomic_bool _proxyPrivateOnly = false;
+atomic_bool _pretendClient = false;
atomic_int _autoFreeDiskSpaceDelay = 3600 * 10;
// [limits]
atomic_int _maxClients = SERVER_MAX_CLIENTS;
atomic_int _maxImages = SERVER_MAX_IMAGES;
-atomic_int _maxPayload = 9000000; // 9MB
+atomic_uint _maxPayload = 9000000; // 9MB
atomic_uint_fast64_t _maxReplicationSize = (uint64_t)100000000000LL;
-atomic_bool _pretendClient = false;
+atomic_uint _maxPrefetch = 262144; // 256KB
+atomic_uint _minRequestSize = 0;
/**
* True when loading config the first time. Consecutive loads will
@@ -58,31 +62,35 @@ static const char* units = "KMGTPEZY";
static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optname);
static bool parse64u(const char *in, atomic_uint_fast64_t *out, const char *optname);
-static bool parse32(const char *in, atomic_int *out, const char *optname) UNUSED;
-static bool parse32u(const char *in, atomic_int *out, const char *optname);
+static bool parse32(const char *in, atomic_int *out, const char *optname);
+static bool parse32u(const char *in, atomic_uint *out, const char *optname);
static int ini_handler(void *custom UNUSED, const char* section, const char* key, const char* value)
{
if ( initialLoad ) {
if ( _basePath == NULL ) SAVE_TO_VAR_STR( dnbd3, basePath );
SAVE_TO_VAR_BOOL( dnbd3, vmdkLegacyMode );
- SAVE_TO_VAR_UINT( dnbd3, listenPort );
- SAVE_TO_VAR_UINT( limits, maxClients );
- SAVE_TO_VAR_UINT( limits, maxImages );
+ SAVE_TO_VAR_INT( dnbd3, listenPort );
+ SAVE_TO_VAR_INT( limits, maxClients );
+ SAVE_TO_VAR_INT( limits, maxImages );
}
SAVE_TO_VAR_BOOL( dnbd3, isProxy );
SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+ SAVE_TO_VAR_INT( dnbd3, bgrWindowSize );
SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
+ SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors );
SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages );
SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd );
- SAVE_TO_VAR_UINT( dnbd3, serverPenalty );
- SAVE_TO_VAR_UINT( dnbd3, clientPenalty );
+ SAVE_TO_VAR_INT( dnbd3, serverPenalty );
+ SAVE_TO_VAR_INT( dnbd3, clientPenalty );
SAVE_TO_VAR_UINT( dnbd3, uplinkTimeout );
SAVE_TO_VAR_UINT( dnbd3, clientTimeout );
SAVE_TO_VAR_UINT( limits, maxPayload );
SAVE_TO_VAR_UINT64( limits, maxReplicationSize );
+ SAVE_TO_VAR_UINT( limits, maxPrefetch );
+ SAVE_TO_VAR_UINT( limits, minRequestSize );
SAVE_TO_VAR_BOOL( dnbd3, pretendClient );
SAVE_TO_VAR_INT( dnbd3, autoFreeDiskSpaceDelay );
if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) {
@@ -111,7 +119,10 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
void globals_loadConfig()
{
char *name = NULL;
- asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME );
+ if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) == -1 ) {
+ logadd( LOG_ERROR, "Memory allocation error for config filename" );
+ exit( 1 );
+ }
if ( name == NULL ) return;
if ( initialLoad ) {
mutex_init( &loadLock, LOCK_LOAD_CONFIG );
@@ -125,9 +136,30 @@ void globals_loadConfig()
if ( initialLoad ) {
sanitizeFixedConfig();
}
- if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
- logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
- _sparseFiles = false;
+ if ( _isProxy ) {
+ if ( _backgroundReplication == BGR_FULL && _sparseFiles && _bgrMinClients < 5 ) {
+ logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
+ _sparseFiles = false;
+ }
+ if ( _bgrWindowSize < 1 ) {
+ _bgrWindowSize = 1;
+ } else if ( _bgrWindowSize > UPLINK_MAX_QUEUE - 10 ) {
+ _bgrWindowSize = UPLINK_MAX_QUEUE - 10;
+ logadd( LOG_MINOR, "Limiting bgrWindowSize to %d, because of UPLINK_MAX_QUEUE",
+ _bgrWindowSize );
+ }
+ if ( _maxPayload < 256 * 1024 ) {
+ logadd( LOG_WARNING, "maxPayload was increased to 256k" );
+ _maxPayload = 256 * 1024;
+ }
+ if ( _maxPrefetch > _maxPayload ) {
+ logadd( LOG_WARNING, "Reducing maxPrefetch to maxPayload" );
+ _maxPrefetch = _maxPayload;
+ }
+ if ( _minRequestSize > _maxPayload ) {
+ logadd( LOG_WARNING, "Reducing minRequestSize to maxPayload" );
+ _minRequestSize = _maxPayload;
+ }
}
// Dump config as interpreted
char buffer[2000];
@@ -281,7 +313,7 @@ static bool parse32(const char *in, atomic_int *out, const char *optname)
return true;
}
-static bool parse32u(const char *in, atomic_int *out, const char *optname)
+static bool parse32u(const char *in, atomic_uint *out, const char *optname)
{
atomic_int_fast64_t v;
if ( !parse64( in, &v, optname ) ) return false;
@@ -289,7 +321,7 @@ static bool parse32u(const char *in, atomic_int *out, const char *optname)
logadd( LOG_WARNING, "'%s' must be between %d and %d, but is '%s'", optname, (int)0, (int)INT_MAX, in );
return false;
}
- *out = (int)v;
+ *out = (unsigned int)v;
return true;
}
@@ -320,8 +352,10 @@ size_t globals_dumpConfig(char *buffer, size_t size)
PBOOL(backgroundReplication);
}
PINT(bgrMinClients);
+ PINT(bgrWindowSize);
PBOOL(lookupMissingForProxy);
PBOOL(sparseFiles);
+ PBOOL(ignoreAllocErrors);
PBOOL(removeMissingImages);
PINT(uplinkTimeout);
PINT(clientTimeout);
@@ -335,6 +369,8 @@ size_t globals_dumpConfig(char *buffer, size_t size)
PINT(maxImages);
PINT(maxPayload);
PUINT64(maxReplicationSize);
+ PINT(maxPrefetch);
+ PINT(minRequestSize);
return size - rem;
}
diff --git a/src/server/globals.h b/src/server/globals.h
index df8c595..bde1184 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -1,9 +1,9 @@
#ifndef _GLOBALS_H_
#define _GLOBALS_H_
-#include "../types.h"
-#include "../shared/fdsignal.h"
-#include "../serverconfig.h"
+#include <dnbd3/types.h>
+#include <dnbd3/shared/fdsignal.h>
+#include <dnbd3/config/server.h>
#include <stdint.h>
#include <stdatomic.h>
#include <time.h>
@@ -18,18 +18,30 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t;
typedef struct _dnbd3_image dnbd3_image_t;
typedef struct _dnbd3_client dnbd3_client_t;
-typedef struct
+typedef void (*uplink_callback)(void *data, uint64_t handle, uint64_t start, uint32_t length, const char *buffer);
+
+typedef struct _dnbd3_queue_client
{
- uint64_t handle; // Client defined handle to pass back in reply
- uint64_t from; // First byte offset of requested block (ie. 4096)
- uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
- dnbd3_client_t * client; // Client to send reply to
- int status; // status of this entry: ULR_*
-#ifdef _DEBUG
- ticks entered; // When this request entered the queue (for debugging)
+ struct _dnbd3_queue_client *next;
+ void* data; // Passed back to callback
+ uint64_t handle; // Passed back to callback
+ uint64_t from, to; // Client range
+ uplink_callback callback; // Callback function
+} dnbd3_queue_client_t;
+
+typedef struct _dnbd3_queue_entry
+{
+ struct _dnbd3_queue_entry *next;
+ uint64_t handle; // Our handle for this entry
+ uint64_t from; // First byte offset of requested block (ie. 4096)
+ uint64_t to; // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+ dnbd3_queue_client_t *clients;
+#ifdef DEBUG
+ ticks entered; // When this request entered the queue (for debugging)
#endif
- uint8_t hopCount; // How many hops this request has already taken across proxies
-} dnbd3_queued_request_t;
+ uint8_t hopCount; // How many hops this request has already taken across proxies
+ bool sent; // Already sent to uplink?
+} dnbd3_queue_entry_t;
typedef struct _ns
{
@@ -91,11 +103,12 @@ struct _dnbd3_uplink
bool cycleDetected; // connection cycle between proxies detected for current remote server
int nextReplicationIndex; // Which index in the cache map we should start looking for incomplete blocks at
// If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
- uint64_t replicationHandle; // Handle of pending replication request
atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
- atomic_int queueLen; // length of queue
- uint32_t idleTime; // How many seconds the uplink was idle (apart from keep-alives)
- dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+ atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
+ int queueLen; // length of queue
+ int idleTime; // How many seconds the uplink was idle (apart from keep-alives)
+ dnbd3_queue_entry_t *queue;
+ atomic_uint_fast32_t queueId;
dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
};
@@ -110,6 +123,8 @@ typedef struct
typedef struct
{
ref reference;
+ atomic_bool dirty; // Cache map has been modified outside uplink (only integrity checker for now)
+ bool unchanged; // Flag, not a counter: presumably set when a reloaded cache map was found unchanged -- TODO confirm
_Atomic uint8_t map[];
} dnbd3_cache_map_t;
@@ -128,7 +143,6 @@ struct _dnbd3_image
uint64_t virtualFilesize; // virtual size of image (real size rounded up to multiple of 4k)
uint64_t realFilesize; // actual file size on disk
ticks atime; // last access time
- ticks lastWorkCheck; // last time a non-working image has been checked
ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
uint32_t *crc32; // list of crc32 checksums for each 16MiB block in image
uint32_t masterCrc32; // CRC-32 of the crc-32 list
@@ -136,10 +150,18 @@ struct _dnbd3_image
atomic_int completenessEstimate; // Completeness estimate in percent
atomic_int users; // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
int id; // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
- atomic_bool working; // true if image exists and completeness is == 100% or a working upstream proxy is connected
+ struct {
+ atomic_bool read; // Error reading from file
+ atomic_bool write; // Error writing to file
+ atomic_bool changed; // File disappeared or changed, thorough check required if it seems to be back
+ atomic_bool uplink; // No uplink connected
+ atomic_bool queue; // Too many requests waiting on uplink
+ } problem;
uint16_t rid; // revision of image
+ bool accessed; // image was accessed since .meta was written
pthread_mutex_t lock;
};
+// Expands to TWO comma-separated arguments: the image's name and its rid cast to int.
+// Meant to be passed to a printf-style function for a "%s:%d" pair in the format string.
+#define PIMG(x) (x)->name, (int)(x)->rid
struct _dnbd3_client
{
@@ -147,6 +169,7 @@ struct _dnbd3_client
atomic_uint_fast64_t bytesSent; // Byte counter for this client.
dnbd3_image_t * _Atomic image; // Image in use by this client, or NULL during handshake
int sock;
+ _Atomic uint8_t relayedCount; // How many requests are in-flight to the uplink server
bool isServer; // true if a server in proxy mode, false if real client
dnbd3_host_t host;
char hostName[HOSTNAMELEN]; // inet_ntop version of host
@@ -206,12 +229,12 @@ extern atomic_bool _removeMissingImages;
/**
* Read timeout when waiting for or sending data on an uplink
*/
-extern atomic_int _uplinkTimeout;
+extern atomic_uint _uplinkTimeout;
/**
* Read timeout when waiting for or sending data from/to client
*/
-extern atomic_int _clientTimeout;
+extern atomic_uint _clientTimeout;
/**
* If true, images with no active client will have their fd closed after some
@@ -234,6 +257,11 @@ extern atomic_int _backgroundReplication;
extern atomic_int _bgrMinClients;
/**
+ * How many in-flight replication requests we should target (per uplink)
+ */
+extern atomic_int _bgrWindowSize;
+
+/**
* (In proxy mode): If connecting client is a proxy, and the requested image
* is not known locally, should we ask our known alt servers for it?
* Otherwise the request is rejected.
@@ -255,6 +283,12 @@ extern atomic_bool _lookupMissingForProxy;
extern atomic_bool _sparseFiles;
/**
+ * If true, don't abort image replication if preallocating
+ * the image fails, but retry with sparse file.
+ */
+extern atomic_bool _ignoreAllocErrors;
+
+/**
* Port to listen on (default: #define PORT (5003))
*/
extern atomic_int _listenPort;
@@ -275,7 +309,7 @@ extern atomic_int _maxImages;
* Usually this isn't even a megabyte for "real" clients (blockdev
* or fuse).
*/
-extern atomic_int _maxPayload;
+extern atomic_uint _maxPayload;
/**
* If in proxy mode, don't replicate images that are
@@ -298,6 +332,21 @@ extern atomic_bool _pretendClient;
extern atomic_int _autoFreeDiskSpaceDelay;
/**
+ * When handling a client request, this sets the maximum number
+ * of bytes we prefetch, starting right at the end of the client request.
+ * The prefetch size will be MIN( length * 3, _maxPrefetch ), if
+ * length <= _maxPrefetch, so effectively, setting this to 0 disables
+ * any prefetching.
+ */
+extern atomic_uint _maxPrefetch;
+
+/**
+ * Use with care. Can severely degrade performance.
+ * Set either 0 or very high.
+ */
+extern atomic_uint _minRequestSize;
+
+/**
* Load the server configuration.
*/
void globals_loadConfig();
diff --git a/src/server/helper.h b/src/server/helper.h
index 102cb36..3e1b661 100644
--- a/src/server/helper.h
+++ b/src/server/helper.h
@@ -2,8 +2,8 @@
#define HELPER_H_
#include "server.h"
-#include "../shared/log.h"
-#include "../types.h"
+#include <dnbd3/shared/log.h>
+#include <dnbd3/types.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>
diff --git a/src/server/image.c b/src/server/image.c
index 16dae45..51fd5b6 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -5,9 +5,9 @@
#include "locks.h"
#include "integrity.h"
#include "altservers.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../shared/crc32.h"
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/crc32.h>
#include "reference.h"
#include <assert.h>
@@ -46,16 +46,21 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image);
static dnbd3_image_t* image_free(dnbd3_image_t *image);
static bool image_load_all_internal(char *base, char *path);
static bool image_addToList(dnbd3_image_t *image);
-static bool image_load(char *base, char *path, int withUplink);
+static bool image_load(char *base, char *path, bool withUplink);
static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize);
static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
static bool image_ensureDiskSpace(uint64_t size, bool force);
static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count);
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
static void* closeUnusedFds(void*);
+static bool isImageFromUpstream(dnbd3_image_t *image);
+static void* saveLoadAllCacheMaps(void*);
+static void saveCacheMap(dnbd3_image_t *image);
static void allocCacheMap(dnbd3_image_t *image, bool complete);
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime);
+static void loadImageMeta(dnbd3_image_t *image);
static void cmfree(ref *ref)
{
@@ -73,6 +78,7 @@ void image_serverStartup()
mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
mutex_init( &reloadLock, LOCK_RELOAD );
server_addJob( &closeUnusedFds, NULL, 10, 900 );
+ server_addJob( &saveLoadAllCacheMaps, NULL, 9, 20 );
}
/**
@@ -118,39 +124,35 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
const uint64_t firstByteInMap = start >> 15;
const uint64_t lastByteInMap = (end - 1) >> 15;
uint64_t pos;
- // First byte
- uint8_t fb = 0, lb = 0;
- for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- fb |= bit_mask;
- }
- // Last byte
- if ( lastByteInMap != firstByteInMap ) {
- for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
- assert( lastByteInMap == (pos >> 15) );
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- lb |= bit_mask;
- }
- }
- atomic_thread_fence( memory_order_acquire );
- if ( set ) {
- uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
- uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
- setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] );
+ // First and last byte masks
+ const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+ const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
+ if ( firstByteInMap == lastByteInMap ) {
+ if ( set ) {
+ uint8_t o = atomic_fetch_or( &cache->map[firstByteInMap], (uint8_t)(fb & lb) );
+ setNewBlocks = o != ( o | (fb & lb) );
+ } else {
+ atomic_fetch_and( &cache->map[firstByteInMap], (uint8_t)~(fb & lb) );
+ }
} else {
- atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
- atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
- }
- const uint8_t nval = set ? 0xff : 0;
- // Everything in between
- for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
- if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
- setNewBlocks = true;
+ atomic_thread_fence( memory_order_acquire );
+ if ( set ) {
+ uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
+ uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
+ setNewBlocks = ( fo != ( fo | fb ) || lo != ( lo | lb ) );
+ } else {
+ atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
+ atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+ }
+ // Everything in between
+ const uint8_t nval = set ? 0xff : 0;
+ for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+ if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+ setNewBlocks = true;
+ }
}
+ atomic_thread_fence( memory_order_release );
}
- atomic_thread_fence( memory_order_release );
if ( setNewBlocks && image->crc32 != NULL ) {
// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
// for checking, even though this might lead to checking some hash block again, if it was
@@ -164,6 +166,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
integrity_check( image, block, false );
}
}
+ } else if ( !set ) {
+ cache->dirty = true;
}
ref_put( &cache->reference );
}
@@ -239,35 +243,74 @@ bool image_isComplete(dnbd3_image_t *image)
*/
bool image_ensureOpen(dnbd3_image_t *image)
{
- if ( image->readFd != -1 ) return image;
- int newFd = open( image->path, O_RDONLY );
+ bool sizeChanged = false;
+ if ( image->readFd != -1 && !image->problem.changed )
+ return true;
+ int newFd = image->readFd == -1 ? open( image->path, O_RDONLY ) : dup( image->readFd );
if ( newFd == -1 ) {
- logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "[access] Cannot open '%s' for reading (errno=%d)", image->path, errno );
+ image->problem.read = true;
+ }
} else {
- // Check size
+ // Check size + read access
+ char buffer[100];
const off_t flen = lseek( newFd, 0, SEEK_END );
if ( flen == -1 ) {
- logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "Could not seek to end of %s (errno=%d)", image->path, errno );
+ image->problem.read = true;
+ }
close( newFd );
newFd = -1;
} else if ( (uint64_t)flen != image->realFilesize ) {
- logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen );
+ if ( !image->problem.changed ) {
+ logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64,
+ image->realFilesize, (uint64_t)flen );
+ }
+ sizeChanged = true;
+ } else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) {
+ if ( !image->problem.read ) {
+ logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)",
+ (int)sizeof(buffer), image->path, errno );
+ image->problem.read = true;
+ }
close( newFd );
newFd = -1;
}
}
if ( newFd == -1 ) {
- mutex_lock( &image->lock );
- image->working = false;
- mutex_unlock( &image->lock );
+ if ( sizeChanged ) {
+ image->problem.changed = true;
+ }
return false;
}
+
+ // Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32,
+ // but only if the size we just got above is correct.
+ if ( image->problem.changed && !sizeChanged ) {
+ if ( image->crc32 == NULL ) {
+ // Cannot verify further, hope for the best
+ image->problem.changed = false;
+ logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", PIMG(image) );
+ } else if ( image_checkRandomBlocks( image, 1, newFd ) ) {
+ // This should have checked the first block (if complete) -> All is well again
+ image->problem.changed = false;
+ logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", PIMG(image) );
+ }
+ } else {
+ image->problem.changed = sizeChanged;
+ }
+
mutex_lock( &image->lock );
if ( image->readFd == -1 ) {
image->readFd = newFd;
+ image->problem.read = false;
mutex_unlock( &image->lock );
} else {
- // There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed
+ // There was a race while opening the file (happens cause not locked cause blocking),
+ // we lost the race so close new fd and proceed.
+ // *OR* we dup()'ed above for cheating when the image changed before.
mutex_unlock( &image->lock );
close( newFd );
}
@@ -296,10 +339,9 @@ dnbd3_image_t* image_byId(int imgId)
* point...
* Locks on: imageListLock, _images[].lock
*/
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
+dnbd3_image_t* image_get(const char *name, uint16_t revision, bool ensureFdOpen)
{
int i;
- const char *removingText = _removeMissingImages ? ", removing from list" : "";
dnbd3_image_t *candidate = NULL;
// Simple sanity check
const size_t slen = strlen( name );
@@ -326,84 +368,36 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
candidate->users++;
mutex_unlock( &imageListLock );
- // Found, see if it works
- // TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
- // TODO: But remember size-changed images forever
- if ( candidate->working || checkIfWorking ) {
- // Is marked working, but might not have an fd open
- if ( !image_ensureOpen( candidate ) ) {
- mutex_lock( &candidate->lock );
- timing_get( &candidate->lastWorkCheck );
- mutex_unlock( &candidate->lock );
- if ( _removeMissingImages ) {
- candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller
- }
- return candidate;
- }
- }
-
- if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state
-
- // ...not working...
-
- // Don't re-check too often
- mutex_lock( &candidate->lock );
- bool check;
- declare_now;
- check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS;
- if ( check ) {
- candidate->lastWorkCheck = now;
- }
- mutex_unlock( &candidate->lock );
- if ( !check ) {
+ if ( !ensureFdOpen ) // Don't want to re-check
return candidate;
- }
- // reaching this point means:
- // 1) We should check if the image is working, it might or might not be in working state right now
- // 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.)
- // 3) We made sure not to re-check this image too often
-
- // Common for ro and rw images: Size check, read check
- const off_t len = lseek( candidate->readFd, 0, SEEK_END );
- bool reload = false;
- if ( len == -1 ) {
- logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
- reload = true;
- } else if ( (uint64_t)len != candidate->realFilesize ) {
- logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
- ". Try sending SIGHUP to server if you know what you're doing.",
- candidate->path, candidate->realFilesize, (uint64_t)len );
- } else {
- // Seek worked, file size is same, now see if we can read from file
- char buffer[100];
- if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
- logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.",
- (int)sizeof(buffer), candidate->path, errno, removingText );
- reload = true;
- } else if ( !candidate->working ) {
- // Seems everything is fine again \o/
- candidate->working = true;
- logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid );
- }
- }
+ if ( image_ensureOpen( candidate ) && !candidate->problem.read )
+ return candidate; // We have a read fd and no read or changed problems
- if ( reload ) {
+ // -- image could not be opened again, or is open but has problem --
+
+ if ( _removeMissingImages && !file_isReadable( candidate->path ) ) {
+ candidate = image_remove( candidate );
+ // No image_release here, the image is still returned and should be released by caller
+ } else if ( candidate->readFd != -1 ) {
+ // We cannot just close the fd as it might be in use. Make a copy and remove old entry.
+ candidate = image_remove( candidate );
// Could not access the image with existing fd - mark for reload which will re-open the file.
// make a copy of the image struct but keep the old one around. If/When it's not being used
// anymore, it will be freed automatically.
- logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path );
+ logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path );
dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
img->path = strdup( candidate->path );
img->name = strdup( candidate->name );
img->virtualFilesize = candidate->virtualFilesize;
img->realFilesize = candidate->realFilesize;
- img->atime = now;
+ timing_get( &img->atime );
img->masterCrc32 = candidate->masterCrc32;
img->readFd = -1;
img->rid = candidate->rid;
img->users = 1;
- img->working = false;
+ img->problem.read = true;
+ img->problem.changed = candidate->problem.changed;
img->ref_cacheMap = NULL;
mutex_init( &img->lock, LOCK_IMAGE );
if ( candidate->crc32 != NULL ) {
@@ -419,18 +413,17 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
if ( image_addToList( img ) ) {
image_release( candidate );
candidate = img;
+ // Check if image is incomplete, initialize uplink
+ if ( candidate->ref_cacheMap != NULL ) {
+ uplink_init( candidate, -1, NULL, -1 );
+ }
+ // Try again with new instance
+ image_ensureOpen( candidate );
} else {
img->users = 0;
image_free( img );
}
- // Check if image is incomplete, initialize uplink
- if ( candidate->ref_cacheMap != NULL ) {
- uplink_init( candidate, -1, NULL, -1 );
- }
- // readFd == -1 and working == FALSE at this point,
- // this function needs some splitting up for handling as we need to run most
- // of the above code again. for now we know that the next call for this
- // name:rid will get ne newly inserted "img" and try to re-open the file.
+ // readFd == -1 and problem.read == true
}
return candidate; // We did all we can, hopefully it's working
@@ -449,6 +442,7 @@ dnbd3_image_t* image_lock(dnbd3_image_t *image)
mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
if ( _images[i] == image ) {
+ assert( _images[i]->id == image->id );
image->users++;
mutex_unlock( &imageListLock );
return image;
@@ -479,6 +473,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
// responsible for freeing it
for (int i = 0; i < _num_images; ++i) {
if ( _images[i] == image ) { // Found, do nothing
+ assert( _images[i]->id == image->id );
mutex_unlock( &imageListLock );
return NULL;
}
@@ -518,6 +513,7 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image)
mutex_lock( &imageListLock );
for ( int i = _num_images - 1; i >= 0; --i ) {
if ( _images[i] == image ) {
+ assert( _images[i]->id == image->id );
_images[i] = NULL;
mustFree = ( image->users == 0 );
}
@@ -630,12 +626,18 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
{
assert( image != NULL );
assert( image->users == 0 );
- logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid );
+ logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", PIMG(image) );
// uplink_shutdown might return false to tell us
// that the shutdown is in progress. Bail out since
// this will get called again when the uplink is done.
if ( !uplink_shutdown( image ) )
return NULL;
+ if ( isImageFromUpstream( image ) ) {
+ saveMetaData( image, NULL, 0 );
+ if ( image->ref_cacheMap != NULL ) {
+ saveCacheMap( image );
+ }
+ }
mutex_lock( &image->lock );
ref_setref( &image->ref_cacheMap, NULL );
free( image->crc32 );
@@ -700,7 +702,8 @@ static bool image_load_all_internal(char *base, char *path)
while ( !_shutdown && (entryPtr = readdir( dir )) != NULL ) {
entry = *entryPtr;
- if ( strcmp( entry.d_name, "." ) == 0 || strcmp( entry.d_name, ".." ) == 0 ) continue;
+ if ( entry.d_name[0] == '.' )
+ continue; // No hidden files, no . or ..
if ( strlen( entry.d_name ) > SUBDIR_LEN ) {
logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry.d_name, (int)SUBDIR_LEN );
continue;
@@ -717,7 +720,7 @@ static bool image_load_all_internal(char *base, char *path)
if ( S_ISDIR( st.st_mode ) ) {
image_load_all_internal( base, subpath ); // Recurse
} else if ( !isForbiddenExtension( subpath ) ) {
- image_load( base, subpath, true ); // Load image if possible
+ image_load( base, subpath, false ); // Load image if possible
}
}
closedir( dir );
@@ -756,10 +759,9 @@ static bool image_addToList(dnbd3_image_t *image)
* Note that this is NOT THREAD SAFE so make sure its always
* called on one thread only.
*/
-static bool image_load(char *base, char *path, int withUplink)
+static bool image_load(char *base, char *path, bool withUplink)
{
int revision = -1;
- struct stat st;
dnbd3_cache_map_t *cache = NULL;
uint32_t *crc32list = NULL;
dnbd3_image_t *existing = NULL;
@@ -824,7 +826,9 @@ static bool image_load(char *base, char *path, int withUplink)
fdImage = open( path, O_RDONLY );
}
if ( fdImage == -1 ) {
- logadd( LOG_ERROR, "Could not open '%s' for reading...", path );
+ if ( errno != ENOENT ) {
+ logadd( LOG_ERROR, "[load] Cannot open '%s' for reading (errno=%d)", path, errno );
+ }
goto load_error;
}
// Determine file size
@@ -855,16 +859,16 @@ static bool image_load(char *base, char *path, int withUplink)
// Compare data just loaded to identical image we apparently already loaded
if ( existing != NULL ) {
if ( existing->realFilesize != realFilesize ) {
- logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", PIMG(existing) );
// Image will be replaced below
} else if ( existing->crc32 != NULL && crc32list != NULL
&& memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) {
- logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+ logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", PIMG(existing) );
logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." );
logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" );
// Image will be replaced below
} else if ( existing->crc32 == NULL && crc32list != NULL ) {
- logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid );
+ logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", PIMG(existing) );
existing->crc32 = crc32list;
existing->masterCrc32 = masterCrc;
crc32list = NULL;
@@ -872,7 +876,7 @@ static bool image_load(char *base, char *path, int withUplink)
goto load_error; // Keep existing
} else if ( existing->ref_cacheMap != NULL && cache == NULL ) {
// Just ignore that fact, if replication is really complete the cache map will be removed anyways
- logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
+ logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", PIMG(existing) );
function_return = true;
goto load_error; // Keep existing
} else {
@@ -900,19 +904,10 @@ static bool image_load(char *base, char *path, int withUplink)
image->rid = (uint16_t)revision;
image->users = 0;
image->readFd = -1;
- image->working = ( cache == NULL );
timing_get( &image->nextCompletenessEstimate );
image->completenessEstimate = -1;
mutex_init( &image->lock, LOCK_IMAGE );
- int32_t offset;
- if ( stat( path, &st ) == 0 ) {
- // Negatively offset atime by file modification time
- offset = (int32_t)( st.st_mtime - time( NULL ) );
- if ( offset > 0 ) offset = 0;
- } else {
- offset = 0;
- }
- timing_gets( &image->atime, offset );
+ loadImageMeta( image );
// Prevent freeing in cleanup
cache = NULL;
@@ -925,7 +920,7 @@ static bool image_load(char *base, char *path, int withUplink)
// Image is definitely incomplete, initialize uplink worker
if ( image->ref_cacheMap != NULL ) {
- image->working = false;
+ image->problem.uplink = true;
if ( withUplink ) {
uplink_init( image, -1, NULL, -1 );
}
@@ -937,14 +932,14 @@ static bool image_load(char *base, char *path, int withUplink)
// Keep fd for reading
fdImage = -1;
// Check CRC32
- image_checkRandomBlocks( image, 4 );
+ image_checkRandomBlocks( image, 4, -1 );
} else {
logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
image->readFd = -1; // Keep fdImage instead, will be closed below
image = image_free( image );
goto load_error;
}
- logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid );
+ logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", PIMG(image) );
function_return = true;
// Clean exit:
@@ -1027,10 +1022,19 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
return retval;
}
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
+/**
+ * Check up to count random blocks from given image. If fromFd is -1, the check will
+ * be run asynchronously using the integrity checker. Otherwise, the check will
+ * happen in the function and return the result of the check.
+ * @param image image to check
+ * @param count number of blocks to check (max)
+ * @param fromFd, check synchronously and use this fd for reading, -1 = async
+ * @return true = OK, false = error. Meaningless if fromFd == -1
+ */
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd)
{
if ( image->crc32 == NULL )
- return;
+ return true;
// This checks the first block and (up to) count - 1 random blocks for corruption
// via the known crc32 list. This is very sloppy and is merely supposed to detect
// accidental corruption due to broken dnbd3-proxy functionality or file system
@@ -1038,7 +1042,7 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
assert( count > 0 );
dnbd3_cache_map_t *cache = ref_get_cachemap( image );
const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize );
- int blocks[count];
+ int blocks[count+1]; // +1 for "-1" in sync case
int index = 0, j;
int block;
if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) {
@@ -1062,9 +1066,16 @@ while_end: ;
if ( cache != NULL ) {
ref_put( &cache->reference );
}
- for ( int i = 0; i < index; ++i ) {
- integrity_check( image, blocks[i], true );
+ if ( fromFd == -1 ) {
+ // Async
+ for ( int i = 0; i < index; ++i ) {
+ integrity_check( image, blocks[i], true );
+ }
+ return true;
}
+ // Sync
+ blocks[index] = -1;
+ return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize );
}
/**
@@ -1079,7 +1090,7 @@ bool image_create(char *image, int revision, uint64_t size)
logadd( LOG_ERROR, "revision id invalid: %d", revision );
return false;
}
- char path[PATHLEN], cache[PATHLEN];
+ char path[PATHLEN], cache[PATHLEN+4];
char *lastSlash = strrchr( image, '/' );
if ( lastSlash == NULL ) {
snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
@@ -1090,7 +1101,7 @@ bool image_create(char *image, int revision, uint64_t size)
*lastSlash = '/';
snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
}
- snprintf( cache, PATHLEN, "%s.map", path );
+ snprintf( cache, PATHLEN+4, "%s.map", path );
size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
const int mapsize = IMGSIZE_TO_MAPBYTES(size);
// Write files
@@ -1111,14 +1122,19 @@ bool image_create(char *image, int revision, uint64_t size)
logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err );
}
// Now write image
+ bool fallback = false;
if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) {
logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk"
" space without actually writing all zeroes to the block device." );
logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect"
" divine performance during replication." );
- goto failure_cleanup;
- } else if ( _sparseFiles && !file_setSize( fdImage, size ) ) {
+ if ( !_ignoreAllocErrors ) {
+ goto failure_cleanup;
+ }
+ fallback = true;
+ }
+ if ( ( _sparseFiles || fallback ) && !file_setSize( fdImage, size ) ) {
logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." );
goto failure_cleanup;
@@ -1162,14 +1178,18 @@ dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
// Sanity check
if ( len == 0 || name[len - 1] == '/' || name[0] == '/'
|| name[0] == '.' || strstr( name, "/." ) != NULL ) return NULL;
- // If in proxy mode, check with upstream server first
+ // Re-check latest local revision
+ image = loadImageServer( name, revision );
+ // If in proxy mode, check with upstream servers
if ( _isProxy ) {
+ // Forget the locally loaded one
+ image_release( image );
+ // Check with upstream - if unsuccessful, will return the same
+ // as loadImageServer did
image = loadImageProxy( name, revision, len );
- if ( image != NULL )
- return image;
}
// Lookup on local storage
- return loadImageServer( name, revision );
+ return image;
}
/**
@@ -1227,19 +1247,20 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
int uplinkSock = -1;
dnbd3_host_t uplinkServer;
const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV );
- uint16_t remoteProtocolVersion;
uint16_t remoteRid = revision;
- uint64_t remoteImageSize;
+ uint16_t acceptedRemoteRid = 0;
+ uint16_t remoteProtocolVersion = 0;
struct sockaddr_storage sa;
socklen_t salen;
poll_list_t *cons = sock_newPollList();
logadd( LOG_DEBUG2, "Trying to clone %s:%d from %d hosts", name, (int)revision, count );
for (int i = 0; i < count + 5; ++i) { // "i < count + 5" for 5 additional iterations, waiting on pending connects
- char *remoteName;
+ char *remoteName = NULL;
+ uint64_t remoteImageSize = 0;
bool ok = false;
int sock;
if ( i >= count ) {
- sock = sock_multiConnect( cons, NULL, 100, 1000 );
+ sock = sock_multiConnect( cons, NULL, 100, _uplinkTimeout );
if ( sock == -2 ) break;
} else {
if ( log_hasMask( LOG_DEBUG2 ) ) {
@@ -1248,7 +1269,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
host[len] = '\0';
logadd( LOG_DEBUG2, "Trying to replicate from %s", host );
}
- sock = sock_multiConnect( cons, &servers[i], 100, 1000 );
+ sock = sock_multiConnect( cons, &servers[i], 100, _uplinkTimeout );
}
if ( sock == -1 || sock == -2 ) continue;
salen = sizeof(sa);
@@ -1273,7 +1294,11 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
} else {
ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
}
- ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ if ( ok ) {
+ ok = image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+ } else {
+ logadd( LOG_INFO, "Not enough space to replicate '%s:%d'", name, (int)revision );
+ }
mutex_unlock( &reloadLock );
if ( !ok ) goto server_fail;
@@ -1282,26 +1307,32 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
if ( !sock_sockaddrToDnbd3( (struct sockaddr*)&sa, &uplinkServer ) ) {
uplinkServer.type = 0;
}
- break;
+ acceptedRemoteRid = remoteRid;
+ break; // TODO: Maybe we should try the remaining servers if rid == 0, in case there's an even newer one
server_fail: ;
close( sock );
}
sock_destroyPollList( cons );
- // If we still have a pointer to a local image, release the reference
- if ( image != NULL ) image_release( image );
+ // If we still have a pointer to a local image, compare rid
+ if ( image != NULL ) {
+ if ( ( revision == 0 && image->rid >= acceptedRemoteRid ) || ( image->rid == revision ) ) {
+ return image;
+ }
+ // release the reference
+ image_release( image );
+ }
// If everything worked out, this call should now actually return the image
- image = image_get( name, remoteRid, false );
+ image = image_get( name, acceptedRemoteRid, false );
if ( image != NULL && uplinkSock != -1 ) {
// If so, init the uplink and pass it the socket
- sock_setTimeout( uplinkSock, _uplinkTimeout );
if ( !uplink_init( image, uplinkSock, &uplinkServer, remoteProtocolVersion ) ) {
close( uplinkSock );
} else {
// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
int i = 0;
- while ( !image->working && ++i < 100 )
+ while ( image->problem.uplink && ++i < 100 )
usleep( 2000 );
}
} else if ( uplinkSock != -1 ) {
@@ -1318,6 +1349,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
{
char imageFile[PATHLEN] = "";
uint16_t detectedRid = 0;
+ bool isLegacyFile = false;
if ( requestedRid != 0 ) {
snprintf( imageFile, PATHLEN, "%s/%s.r%d", _basePath, name, (int)requestedRid );
@@ -1354,6 +1386,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
&& ( detectedRid == 0 || !file_isReadable( imageFile ) ) ) {
snprintf( imageFile, PATHLEN, "%s/%s", _basePath, name );
detectedRid = 1;
+ isLegacyFile = true;
}
logadd( LOG_DEBUG2, "Trying to load %s:%d ( -> %d) as %s", name, (int)requestedRid, (int)detectedRid, imageFile );
// No file was determined, or it doesn't seem to exist/be readable
@@ -1361,7 +1394,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
logadd( LOG_DEBUG2, "Not found, bailing out" );
return image_get( name, requestedRid, true );
}
- if ( !_vmdkLegacyMode && requestedRid == 0 ) {
+ if ( !isLegacyFile && requestedRid == 0 ) {
// rid 0 requested - check if detected rid is readable, decrease rid if not until we reach 0
while ( detectedRid != 0 ) {
dnbd3_image_t *image = image_get( name, detectedRid, true );
@@ -1429,9 +1462,13 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS
logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name );
} else {
int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 );
- write( fd, &masterCrc, sizeof(uint32_t) );
- write( fd, crc32list, crc32len );
+ ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+ ret += write( fd, crc32list, crc32len );
close( fd );
+ if ( (size_t)ret != crc32len + sizeof(masterCrc) ) {
+ logadd( LOG_WARNING, "Could not save freshly received crc32 list for %s:%d", name, (int)revision );
+ unlink( crcFile );
+ }
}
}
free( crc32list );
@@ -1564,14 +1601,23 @@ json_t* image_getListAsJson()
ref_put( &uplink->reference );
}
- jsonImage = json_pack( "{sisssisisisisI}",
+ int problems = 0;
+#define addproblem(name,val) if (image->problem.name) problems |= (1 << val)
+ addproblem(read, 0);
+ addproblem(write, 1);
+ addproblem(changed, 2);
+ addproblem(uplink, 3);
+ addproblem(queue, 4);
+
+ jsonImage = json_pack( "{sisssisisisisIsi}",
"id", image->id, // id, name, rid never change, so access them without locking
"name", image->name,
"rid", (int) image->rid,
"users", image->users,
"complete", completeness,
"idle", idleTime,
- "size", (json_int_t)image->virtualFilesize );
+ "size", (json_int_t)image->virtualFilesize,
+ "problems", problems );
if ( bytesReceived != 0 ) {
json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) );
}
@@ -1594,7 +1640,7 @@ int image_getCompletenessEstimate(dnbd3_image_t * const image)
assert( image != NULL );
dnbd3_cache_map_t *cache = ref_get_cachemap( image );
if ( cache == NULL )
- return image->working ? 100 : 0;
+ return 100;
const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
if ( unlikely( len == 0 ) ) {
ref_put( &cache->reference );
@@ -1705,46 +1751,51 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
/**
* Make sure at least size bytes are available in _basePath.
* Will delete old images to make room for new ones.
- * TODO: Store last access time of images. Currently the
- * last access time is reset to the file modification time
- * on server restart. Thus it will
- * currently only delete images if server uptime is > 24 hours.
+ * It will only delete images if a configurable uptime is
+ * reached.
* This can be overridden by setting force to true, in case
* free space is desperately needed.
* Return true iff enough space is available. false in random other cases
*/
static bool image_ensureDiskSpace(uint64_t size, bool force)
{
- for ( int maxtries = 0; maxtries < 20; ++maxtries ) {
+ for ( int maxtries = 0; maxtries < 50; ++maxtries ) {
uint64_t available;
if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
- logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", errno );
+ logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left.", errno );
return true;
}
if ( available > size )
return true; // Yay
- if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 )
+ if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 ) {
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but auto-freeing of disk space is disabled.",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)) );
return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
+ }
if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) {
- logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...", (int)(available / (1024ll * 1024ll)),
- (int)(size / (1024 * 1024)), _autoFreeDiskSpaceDelay / 60 );
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)), _autoFreeDiskSpaceDelay / 60 );
return false;
}
- logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
- (int)(size / (1024 * 1024)) );
+ logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...",
+ (int)(available / (1024ll * 1024)),
+ (int)(size / (1024ll * 1024)) );
// Find least recently used image
dnbd3_image_t *oldest = NULL;
int i;
mutex_lock( &imageListLock );
for (i = 0; i < _num_images; ++i) {
dnbd3_image_t *current = _images[i];
- if ( current == NULL ) continue;
- if ( current->users == 0 ) { // Not in use :-)
- if ( oldest == NULL || timing_1le2( &current->atime, &oldest->atime ) ) {
- // Oldest access time so far
- oldest = current;
- }
- }
+ if ( current == NULL || current->users != 0 )
+ continue; // Empty slot or in use
+ if ( oldest != NULL && timing_1le2( &oldest->atime, &current->atime ) )
+ continue; // Already got a newer one
+ if ( !isImageFromUpstream( current ) )
+ continue; // Not replicated, don't touch
+ // Oldest access time so far
+ oldest = current;
}
if ( oldest != NULL ) {
oldest->users++;
@@ -1760,7 +1811,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
image_release( oldest ); // We did users++ above; image might have to be freed entirely
return false;
}
- logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid );
+ logadd( LOG_INFO, "'%s:%d' has to go!", PIMG(oldest) );
char *filename = strdup( oldest->path ); // Copy name as we remove the image first
oldest = image_remove( oldest ); // Remove from list first...
oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed
@@ -1790,15 +1841,14 @@ static void* closeUnusedFds(void* nix UNUSED)
timing_gets( &deadline, -UNUSED_FD_TIMEOUT );
int fds[FDCOUNT];
int fdindex = 0;
+ setThreadName( "unused-fd-close" );
mutex_lock( &imageListLock );
for ( int i = 0; i < _num_images; ++i ) {
dnbd3_image_t * const image = _images[i];
if ( image == NULL || image->readFd == -1 )
continue;
- // TODO: Also close for idle uplinks (uplink_connectionShouldShutdown)
- // TODO: And close writeFd for idle uplinks....
if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
- logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid );
+ logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", PIMG(image) );
fds[fdindex++] = image->readFd;
image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock
if ( fdindex == FDCOUNT )
@@ -1813,6 +1863,177 @@ static void* closeUnusedFds(void* nix UNUSED)
return NULL;
}
+static bool isImageFromUpstream(dnbd3_image_t *image) // true iff we run as proxy and alt servers exist for image->name
+{
+ if ( !_isProxy )
+ return false; // Not running as a proxy at all
+ // Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
+ // for which we have any upstream servers configured. If there's none, don't touch
+ // the cache map on disk.
+ if ( !altservers_imageHasAltServers( image->name ) )
+ return false; // No upstream servers for this image name
+ return true;
+}
+
+static void* saveLoadAllCacheMaps(void* nix UNUSED) // Walks all images: saves maps of replicated ones, reloads maps of the others; returns NULL
+{
+ static ticks nextSave; // Kept across calls: point in time from which the next "full" run is due
+ declare_now; // NOTE(review): presumably declares and initializes the local 'now' used below - confirm
+ bool full = timing_reached( &nextSave, &now ); // Full run: lower save threshold, forced map reload, metadata is written
+ time_t walltime = 0; // Only set (and only used) on full runs
+ setThreadName( "cache-mapper" );
+ if ( full ) {
+ walltime = time( NULL );
+ // Update at start to avoid concurrent runs
+ timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY );
+ }
+ mutex_lock( &imageListLock );
+ for ( int i = 0; i < _num_images; ++i ) {
+ dnbd3_image_t * const image = _images[i];
+ if ( image == NULL )
+ continue; // Empty slot
+ image->users++; // Take a user reference before dropping imageListLock; undone by image_release() below
+ mutex_unlock( &imageListLock );
+ const bool fromUpstream = isImageFromUpstream( image );
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
+ if ( fromUpstream ) {
+ // Replicated image, we're responsible for updating the map, so save it
+ // Save if dirty bit is set, blocks were invalidated
+ bool save = cache->dirty;
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref ); // May be NULL; released again further down
+ if ( !save ) {
+ // Otherwise, consider longer timeout and byte count limits of uplink
+ if ( uplink != NULL ) {
+ assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+ uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave; // Bytes received since the last save
+ if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
+ save = true;
+ }
+ }
+ }
+ if ( save ) {
+ cache->dirty = false; // Cleared before writing, not after
+ if ( uplink != NULL ) {
+ uplink->bytesReceivedLastSave = uplink->bytesReceived;
+ }
+ saveCacheMap( image );
+ }
+ if ( uplink != NULL ) {
+ ref_put( &uplink->reference );
+ }
+ } else {
+ // We're not replicating this image, if there's a cache map, reload
+ // it periodically, since we might read from a shared storage that
+ // another server instance is writing to.
+ if ( full || ( !cache->unchanged && !image->problem.read ) ) {
+ logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
+ dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
+ if ( onDisk == NULL ) {
+ // Should be complete now
+ logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
+ ref_setref( &image->ref_cacheMap, NULL ); // Image no longer has a cache map
+ } else {
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
+ // Unchanged
+ cache->unchanged = true; // Non-full runs skip the reload from now on (see condition above)
+ onDisk->reference.free( &onDisk->reference ); // Discard the freshly loaded copy
+ } else {
+ // Replace
+ ref_setref( &image->ref_cacheMap, &onDisk->reference );
+ logadd( LOG_DEBUG2, "Map changed" );
+ }
+ }
+ }
+ } // end reload cache map
+ ref_put( &cache->reference ); // Matches ref_get_cachemap() above
+ } // end has cache map
+ if ( full && fromUpstream ) {
+ saveMetaData( image, &now, walltime );
+ }
+ image_release( image ); // Always do this instead of users-- to handle freeing
+ mutex_lock( &imageListLock ); // Re-acquire for the next loop condition check / iteration
+ }
+ mutex_unlock( &imageListLock );
+ return NULL;
+}
+
+/**
+ * Saves the cache map of the given image to "<image->path>.map".
+ * Does nothing if the image currently has no cache map. Before writing,
+ * the image file itself is fsync()ed; if that fails, the in-memory map
+ * is ANDed with the map on disk (or zeroed if that cannot be loaded).
+ * Nothing is returned; failures are only logged.
+ * @param image the image
+ */
+static void saveCacheMap(dnbd3_image_t *image)
+{
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache == NULL )
+ return; // Race - wasn't NULL in function call above...
+
+ logadd( LOG_DEBUG2, "Saving cache map of %s:%d", PIMG(image) );
+ const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
+ char mapfile[strlen( image->path ) + 4 + 1]; // Room for path + ".map" + terminating NUL
+ strcpy( mapfile, image->path );
+ strcat( mapfile, ".map" );
+
+ int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 ); // No O_TRUNC: an existing map file is overwritten in place
+ if ( fd == -1 ) {
+ const int err = errno; // Saved first, the calls below might change errno
+ ref_put( &cache->reference );
+ logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+ return;
+ }
+
+ // On Linux we could use readFd, but in general it's not guaranteed to work
+ int imgFd = open( image->path, O_WRONLY ); // Opened only to be able to fsync() the image data
+ if ( imgFd == -1 ) {
+ logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno );
+ } else {
+ if ( fsync( imgFd ) == -1 ) {
+ logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno );
+ dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize);
+ const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+ if ( old == NULL ) {
+ // Could not load old map. FS might be toast.
+ logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." );
+ memset( cache->map, 0, mapSize );
+ } else {
+ // AND the maps together to be safe
+ for ( int i = 0; i < mapSize; ++i ) {
+ cache->map[i] &= old->map[i]; // Keep only bits that are set both in memory and on disk
+ }
+ old->reference.free( &old->reference );
+ }
+ }
+ close( imgFd );
+ }
+
+ // Write current map to file
+ size_t done = 0; // Number of map bytes written so far
+ while ( done < size ) {
+ const ssize_t ret = write( fd, cache->map + done, size - done );
+ if ( ret == -1 ) {
+ if ( errno == EINTR ) continue; // Interrupted, retry the same range
+ logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
+ break;
+ }
+ if ( ret <= 0 ) {
+ logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+ break;
+ }
+ done += (size_t)ret; // Short write: loop continues with the remainder
+ }
+ ref_put( &cache->reference );
+ if ( fsync( fd ) == -1 ) {
+ logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
+ }
+ close( fd );
+ // TODO fsync on parent directory
+}
+
static void allocCacheMap(dnbd3_image_t *image, bool complete)
{
const uint8_t val = complete ? 0xff : 0;
@@ -1822,7 +2043,7 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
memset( cache->map, val, byteSize );
mutex_lock( &image->lock );
if ( image->ref_cacheMap != NULL ) {
- logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid );
+ logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a map for %s:%d", PIMG(image) );
free( cache );
} else {
ref_setref( &image->ref_cacheMap, &cache->reference );
@@ -1830,3 +2051,77 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
mutex_unlock( &image->lock );
}
+/**
+ * Writes "atime=<value>" for the image to "<image->path>.meta" if image->accessed is set. It's assumed you hold a reference to the image. If now is NULL, current ticks and wall time are fetched here and the walltime argument is ignored.
+ */
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime)
+{
+ if ( !image->accessed )
+ return; // Flag not set, nothing is written
+ ticks tmp;
+ uint32_t diff;
+ char *fn;
+ if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+ logadd( LOG_WARNING, "Cannot asprintf meta" );
+ return;
+ }
+ if ( now == NULL ) {
+ timing_get( &tmp );
+ now = &tmp;
+ walltime = time( NULL );
+ }
+ mutex_lock( &image->lock );
+ image->accessed = false; // Reset the flag; it is cleared even if writing the file fails below
+ diff = timing_diff( &image->atime, now ); // NOTE(review): presumably seconds between atime and now - confirm unit
+ mutex_unlock( &image->lock );
+ FILE *f = fopen( fn, "w" );
+ if ( f == NULL ) {
+ logadd( LOG_WARNING, "Cannot open %s for writing", fn );
+ } else {
+ fprintf( f, "[main]\natime=%"PRIu64"\n", (uint64_t)( walltime - diff ) ); // Wall clock time minus the tick difference
+ fclose( f );
+ }
+ free( fn );
+ // TODO: fsync() dir
+}
+
+static void loadImageMeta(dnbd3_image_t *image) // Initializes image->atime from "<path>.meta", falling back to the image file's mtime
+{
+ int32_t offset = 1; // 1 is the sentinel for "no atime read from .meta"; the offset finally used is always <= 0
+ char *fn;
+ if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+ logadd( LOG_WARNING, "asprintf load" );
+ } else {
+ int fh = open( fn, O_RDONLY );
+ free( fn );
+ if ( fh != -1 ) {
+ char buf[200];
+ ssize_t ret = read( fh, buf, sizeof(buf)-1 ); // Leave room for the terminating NUL
+ close( fh );
+ if ( ret > 0 ) {
+ buf[ret] = '\0';
+ // Do it the cheap way until we actually store more stuff
+ char *pos = strstr( buf, "atime=" );
+ if ( pos != NULL ) {
+ offset = (int32_t)( atol( pos + 6 ) - time( NULL ) ); // pos + 6 skips "atime="; result is the stored value relative to now
+ }
+ }
+ }
+ }
+ if ( offset == 1 ) { // NOTE(review): a stored atime of exactly now + 1 also ends up here
+ // Nothing from .meta file, use old guesstimate
+ struct stat st;
+ if ( stat( image->path, &st ) == 0 ) {
+ // Negatively offset atime by file modification time
+ offset = (int32_t)( st.st_mtime - time( NULL ) );
+ } else {
+ offset = 0;
+ }
+ image->accessed = true; // saveMetaData() returns early unless this flag is set
+ }
+ if ( offset > 0 ) { // Clamp values that lie ahead of the current time
+ offset = 0;
+ }
+ timing_gets( &image->atime, offset );
+}
+
diff --git a/src/server/image.h b/src/server/image.h
index 89791fc..7b6583c 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -19,7 +19,7 @@ bool image_ensureOpen(dnbd3_image_t *image);
dnbd3_image_t* image_byId(int imgId);
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
+dnbd3_image_t* image_get(const char *name, uint16_t revision, bool checkIfWorking);
bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
@@ -49,6 +49,52 @@ void image_closeUnusedFd();
bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
+bool image_saveCacheMap(dnbd3_image_t *image);
+
+/**
+ * Check if given range [start, end) is completely cached. Be careful when using this function because:
+ * 1) you need to hold a reference to the cache map
+ * 2) start and end are assumed to be 4k aligned
+ * 3) start and end are not checked to be in bounds (we don't know the image in this context)
+ */
+static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end)
+{
+ const uint64_t firstByteInMap = start >> 15; // One map byte covers 2^15 bytes (32KiB) of image data
+ const uint64_t lastByteInMap = (end - 1) >> 15; // end is exclusive; end == 0 would wrap around here
+ const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7)); // Bits of the first 4k block of the range and all higher ones
+ const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1))); // Bits of the last 4k block of the range and all lower ones
+ uint64_t pos;
+ uint8_t b;
+ bool isCached;
+ if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+ b = cache->map[firstByteInMap];
+ isCached = ( b & ( fb & lb ) ) == ( fb & lb ); // All bits between first and last block must be set
+ } else {
+ isCached = true;
+ atomic_thread_fence( memory_order_acquire ); // One fence up front, the loads below are all relaxed
+ // First byte
+ if ( isCached ) {
+ b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+ isCached = ( ( b & fb ) == fb );
+ }
+ // Last byte
+ if ( isCached ) {
+ b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+ isCached = ( ( b & lb ) == lb );
+ }
+ // Middle, must be all bits set (0xff)
+ if ( isCached ) {
+ for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+ if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+ isCached = false;
+ break; // First incomplete byte settles the result
+ }
+ }
+ }
+ }
+ return isCached;
+}
+
// one byte in the map covers 8 4kib blocks, so 32kib per byte
// "+ (1 << 15) - 1" is required to account for the last bit of
// the image that is smaller than 32kib
diff --git a/src/server/ini.c b/src/server/ini.c
index c796d5c..37c44a3 100644
--- a/src/server/ini.c
+++ b/src/server/ini.c
@@ -52,7 +52,7 @@ static char* find_char_or_comment(const char* s, char c)
/* Version of strncpy that ensures dest (size bytes) is null-terminated. */
static char* strncpy0(char* dest, const char* src, size_t size)
{
- strncpy( dest, src, size );
+ strncpy( dest, src, size - 1 );
dest[size - 1] = '\0';
return dest;
}
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 4006dfc..91e53b8 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -195,9 +195,10 @@ static void* integrity_main(void * data UNUSED)
readFd = directFd;
}
}
- if ( readFd == -1 ) { // Try buffered; flush to disk for that
- image_ensureOpen( image );
- readFd = image->readFd;
+ if ( readFd == -1 ) { // Try buffered as fallback
+ if ( image_ensureOpen( image ) && !image->problem.read ) {
+ readFd = image->readFd;
+ }
}
if ( readFd == -1 ) {
logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path );
@@ -237,16 +238,6 @@ static void* integrity_main(void * data UNUSED)
// Done with this task as nothing left
checkQueue[i].image = NULL;
if ( i + 1 == queueLen ) queueLen--;
- // Mark as working again if applicable
- if ( !foundCorrupted ) {
- dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
- if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper?
- mutex_lock( &image->lock );
- image->working = uplink->current.fd != -1 && image->readFd != -1;
- mutex_unlock( &image->lock );
- ref_put( &uplink->reference );
- }
- }
} else {
// Still more blocks to go...
checkQueue[i].block = blocks[0];
@@ -254,9 +245,6 @@ static void* integrity_main(void * data UNUSED)
}
if ( foundCorrupted && !_shutdown ) {
// Something was fishy, make sure uplink exists
- mutex_lock( &image->lock );
- image->working = false;
- mutex_unlock( &image->lock );
uplink_init( image, -1, NULL, -1 );
}
// Release :-)
diff --git a/src/server/locks.c b/src/server/locks.c
index b39576b..3be73b3 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -7,9 +7,9 @@
#include "locks.h"
#include "helper.h"
-#include "../shared/timing.h"
+#include <dnbd3/shared/timing.h>
-#ifdef _DEBUG
+#ifdef DNBD3_SERVER_DEBUG_LOCKS
#define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
#define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
#define MAXLPT 20
diff --git a/src/server/locks.h b/src/server/locks.h
index e5c9801..3b04caa 100644
--- a/src/server/locks.h
+++ b/src/server/locks.h
@@ -23,10 +23,12 @@
#define LOCK_UPLINK_RTT 200
#define LOCK_UPLINK_SEND 210
#define LOCK_RPC_ACL 220
+#define LOCK_FUSE_INIT 300
+#define LOCK_FUSE_DIR 310
//
-#ifdef _DEBUG
+#ifdef DNBD3_SERVER_DEBUG_LOCKS
#define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio)
#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false)
@@ -55,10 +57,12 @@ void debug_dump_lock_stats();
#endif
-#ifdef DEBUG_THREADS
+#ifdef DNBD3_SERVER_DEBUG_THREADS
+
+#include <dnbd3/shared/log.h>
extern int debugThreadCount;
-#define thread_create(thread,attr,routine,arg) (logadd( LOG_THREAD CREATE, "%d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
+#define thread_create(thread,attr,routine,arg) (logadd( LOG_INFO, "THREAD_CREATE: %d @ %s:%d\n", debugThreadCount, __FILE__, (int)__LINE__), debug_thread_create(thread, attr, routine, arg))
static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void*), void *arg)
{
int i;
@@ -68,26 +72,26 @@ static inline pthread_t debug_thread_create(pthread_t *thread, const pthread_att
return pthread_create( thread, attr, start_routine, arg );
}
-#define thread_detach(thread) (logadd( LOG_THREAD DETACH, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
+#define thread_detach(thread) (logadd( LOG_INFO, "THREAD_DETACH: %d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_detach(thread))
static inline int debug_thread_detach(pthread_t thread)
{
const int ret = pthread_detach(thread);
if (ret == 0) {
--debugThreadCount;
} else {
- logadd( LOG_THREAD DETACH, "Tried to detach invalid thread (error %d)\n", (int)errno);
+ logadd( LOG_INFO, "THREAD_DETACH: Tried to detach invalid thread (error %d)\n", ret); // pthread_detach() returns the error number; it does not set errno
exit(1);
}
return ret;
}
-#define thread_join(thread,value) (logadd( LOG_THREAD JOIN, "%d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
+#define thread_join(thread,value) (logadd( LOG_INFO, "THREAD_JOIN: %d @ %s:%d\n", debugThreadCount, __FILE__, __LINE__), debug_thread_join(thread,value))
static inline int debug_thread_join(pthread_t thread, void **value_ptr)
{
const int ret = pthread_join(thread, value_ptr);
if (ret == 0) {
--debugThreadCount;
} else {
- logadd( LOG_THREAD JOIN, "Tried to join invalid thread (error %d)\n", (int)errno);
+ logadd( LOG_INFO, "THREAD_JOIN: Tried to join invalid thread (error %d)\n", (int)errno);
exit(1);
}
return ret;
@@ -99,6 +103,6 @@ static inline int debug_thread_join(pthread_t thread, void **value_ptr)
#define thread_detach(thread) pthread_detach( thread )
#define thread_join(thread,value) pthread_join( thread, value )
-#endif
+#endif /* DNBD3_SERVER_DEBUG_THREADS */
#endif /* LOCKS_H_ */
diff --git a/src/server/net.c b/src/server/net.c
index aba4e7d..eb51d29 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -26,10 +26,10 @@
#include "altservers.h"
#include "reference.h"
-#include "../shared/sockhelper.h"
-#include "../shared/timing.h"
-#include "../shared/protocol.h"
-#include "../serialize.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/serialize.h>
#include <assert.h>
@@ -58,11 +58,12 @@ static atomic_uint_fast64_t totalBytesSent = 0;
static bool addToList(dnbd3_client_t *client);
static void removeFromList(dnbd3_client_t *client);
static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client);
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start, uint32_t length, const char *buffer);
static inline bool recv_request_header(int sock, dnbd3_request_t *request)
{
ssize_t ret, fails = 0;
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
// Read request header from socket
@@ -89,7 +90,7 @@ static inline bool recv_request_header(int sock, dnbd3_request_t *request)
static inline bool recv_request_payload(int sock, uint32_t size, serialized_buffer_t *payload)
{
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
if ( size == 0 ) {
@@ -113,7 +114,7 @@ static inline bool recv_request_payload(int sock, uint32_t size, serialized_buff
* Send reply with optional payload. payload can be null. The caller has to
* acquire the sendMutex first.
*/
-static inline bool send_reply(int sock, dnbd3_reply_t *reply, void *payload)
+static inline bool send_reply(int sock, dnbd3_reply_t *reply, const void *payload)
{
const uint32_t size = reply->size;
fixup_reply( *reply );
@@ -159,7 +160,7 @@ void* net_handleNewConnection(void *clientPtr)
// Await data from client. Since this is a fresh connection, we expect data right away
sock_setTimeout( client->sock, _clientTimeout );
do {
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
const int ret = (int)recv( 0, &request, sizeof(request), MSG_WAITALL );
#else
const int ret = (int)recv( client->sock, &request, sizeof(request), MSG_WAITALL );
@@ -197,6 +198,7 @@ void* net_handleNewConnection(void *clientPtr)
client->hostName[HOSTNAMELEN-1] = '\0';
mutex_unlock( &client->lock );
client->bytesSent = 0;
+ client->relayedCount = 0;
if ( !addToList( client ) ) {
freeClientStruct( client );
@@ -207,6 +209,7 @@ void* net_handleNewConnection(void *clientPtr)
dnbd3_reply_t reply;
dnbd3_image_t *image = NULL;
+ dnbd3_cache_map_t *cache = NULL;
int image_file = -1;
int num;
@@ -215,7 +218,6 @@ void* net_handleNewConnection(void *clientPtr)
serialized_buffer_t payload;
uint16_t rid, client_version;
- uint64_t start, end;
dnbd3_server_entry_t server_list[NUMBER_SERVERS];
@@ -262,22 +264,24 @@ void* net_handleNewConnection(void *clientPtr)
atomic_thread_fence( memory_order_release );
if ( unlikely( image == NULL ) ) {
//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
- } else if ( unlikely( !image->working ) ) {
+ } else if ( unlikely( image->problem.read || image->problem.changed ) ) {
logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
client->hostName, image_name, (int)rid );
} else {
// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
bOk = true;
if ( image->ref_cacheMap != NULL ) {
- dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
- if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
+ if ( image->problem.queue || image->problem.write ) {
bOk = ( rand() % 4 ) == 1;
}
- if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
- usleep( 100000 ); // server gets a penalty and is less likely to be selected
- }
- if ( uplink != NULL ) {
- ref_put( &uplink->reference );
+ if ( bOk ) {
+ if ( image->problem.write ) { // Wait 100ms if local caching is not working so this
+ usleep( 100000 ); // server gets a penalty and is less likely to be selected
+ }
+ if ( image->problem.uplink ) {
+ // Penalize depending on completeness, if no uplink is available
+ usleep( ( 100 - image->completenessEstimate ) * 100 );
+ }
}
}
if ( bOk ) {
@@ -286,6 +290,7 @@ void* net_handleNewConnection(void *clientPtr)
if ( !client->isServer ) {
// Only update immediately if this is a client. Servers are handled on disconnect.
timing_get( &image->atime );
+ image->accessed = true;
}
mutex_unlock( &image->lock );
serializer_reset_write( &payload );
@@ -313,9 +318,8 @@ void* net_handleNewConnection(void *clientPtr)
// client handling mainloop
while ( recv_request_header( client->sock, &request ) ) {
if ( _shutdown ) break;
- switch ( request.cmd ) {
+ if ( likely ( request.cmd == CMD_GET_BLOCK ) ) {
- case CMD_GET_BLOCK:;
const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
reply.handle = request.handle;
if ( unlikely( offset >= image->virtualFilesize ) ) {
@@ -324,7 +328,7 @@ void* net_handleNewConnection(void *clientPtr)
reply.size = 0;
reply.cmd = CMD_ERROR;
send_reply( client->sock, &reply, NULL );
- break;
+ continue;
}
if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
// Sanity check
@@ -332,63 +336,36 @@ void* net_handleNewConnection(void *clientPtr)
reply.size = 0;
reply.cmd = CMD_ERROR;
send_reply( client->sock, &reply, NULL );
- break;
+ continue;
}
- dnbd3_cache_map_t *cache;
- if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) {
+ if ( cache == NULL ) {
+ cache = ref_get_cachemap( image );
+ }
+
+ if ( request.size != 0 && cache != NULL ) {
// This is a proxyed image, check if we need to relay the request...
- start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- bool isCached = true;
- const uint64_t firstByteInMap = start >> 15;
- const uint64_t lastByteInMap = (end - 1) >> 15;
- uint64_t pos;
- uint8_t b;
- atomic_thread_fence( memory_order_acquire );
- // Middle - quick checking
- if ( isCached ) {
- for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
- if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
- isCached = false;
- break;
- }
- }
- }
- // First byte
- if ( isCached ) {
- b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
- for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- if ( (b & bit_mask) == 0 ) {
- isCached = false;
- break;
+ const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+ if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+ if ( unlikely( client->relayedCount > 250 ) ) {
+ logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" );
+ for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) {
+ usleep( 10000 );
}
- }
- }
- // Last byte - only check if request spans multiple bytes in cache map
- if ( isCached && firstByteInMap != lastByteInMap ) {
- b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
- for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
- assert( lastByteInMap == (pos >> 15) );
- const int map_x = (pos >> 12) & 7; // mod 8
- const uint8_t bit_mask = (uint8_t)( 1 << map_x );
- if ( (b & bit_mask) == 0 ) {
- isCached = false;
- break;
+ if ( client->relayedCount > 250 ) {
+ logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" );
+ goto exit_client_cleanup;
}
}
- }
- ref_put( &cache->reference );
- if ( !isCached ) {
- if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
- logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
+ client->relayedCount++;
+ if ( !uplink_requestClient( client, &uplinkCallback, request.handle, offset, request.size, request.hops ) ) {
+ client->relayedCount--;
+ logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
client->hostName, image->name, image->rid );
- image->working = false;
goto exit_client_cleanup;
}
- break; // DONE, exit request.cmd switch
+ continue; // Reply arrives on uplink some time later, handle next request now
}
}
@@ -419,7 +396,7 @@ void* net_handleNewConnection(void *clientPtr)
// TODO: Should we consider EOPNOTSUPP on BSD for sendfile and fallback to read/write?
// Linux would set EINVAL or ENOSYS instead, which it unfortunately also does for a couple of other failures :/
// read/write would kill performance anyways so a fallback would probably be of little use either way.
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
char buf[1000];
size_t cnt = realBytes - done;
if ( cnt > 1000 ) {
@@ -456,7 +433,7 @@ void* net_handleNewConnection(void *clientPtr)
}
if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) {
logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid );
- image->working = false;
+ image->problem.read = true;
}
}
goto exit_client_cleanup;
@@ -473,7 +450,16 @@ void* net_handleNewConnection(void *clientPtr)
if ( lock ) mutex_unlock( &client->sendMutex );
// Global per-client counter
client->bytesSent += request.size; // Increase counter for statistics.
- break;
+ continue;
+ }
+ // Any other command
+ // Release cache map every now and then, in case the image was replicated
+ // entirely. Will be re-grabbed on next CMD_GET_BLOCK otherwise.
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ cache = NULL;
+ }
+ switch ( request.cmd ) {
case CMD_GET_SERVERS:
// Build list of known working alt servers
@@ -522,9 +508,9 @@ set_name: ;
logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd );
break;
- }
- }
- }
+ } // end switch
+ } // end loop
+ } // end bOk
exit_client_cleanup: ;
// First remove from list, then add to counter to prevent race condition
removeFromList( client );
@@ -533,8 +519,12 @@ exit_client_cleanup: ;
if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
mutex_lock( &image->lock );
timing_get( &image->atime );
+ image->accessed = true;
mutex_unlock( &image->lock );
}
+ if ( cache != NULL ) {
+ ref_put( &cache->reference );
+ }
freeClientStruct( client ); // This will also call image_release on client->image
return NULL ;
fail_preadd: ;
@@ -695,9 +685,21 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
if ( client->image != NULL ) {
dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
if ( uplink != NULL ) {
- uplink_removeClient( uplink, client );
+ if ( client->relayedCount != 0 ) {
+ uplink_removeEntry( uplink, client, &uplinkCallback );
+ }
ref_put( &uplink->reference );
}
+ if ( client->relayedCount != 0 ) {
+ logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount );
+ int i;
+ for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) {
+ usleep( 10000 );
+ }
+ if ( client->relayedCount != 0 ) {
+ logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount );
+ }
+ }
}
mutex_lock( &client->sendMutex );
if ( client->sock != -1 ) {
@@ -739,15 +741,21 @@ static bool addToList(dnbd3_client_t *client)
return true;
}
-void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle)
+static void uplinkCallback(void *data, uint64_t handle, uint64_t start UNUSED, uint32_t length, const char *buffer)
{
- dnbd3_reply_t reply;
- reply.magic = dnbd3_packet_magic;
- reply.cmd = cmd;
- reply.handle = handle;
- reply.size = 0;
+ dnbd3_client_t *client = (dnbd3_client_t*)data;
+ dnbd3_reply_t reply = {
+ .magic = dnbd3_packet_magic,
+ .cmd = buffer == NULL ? CMD_ERROR : CMD_GET_BLOCK,
+ .handle = handle,
+ .size = length,
+ };
mutex_lock( &client->sendMutex );
- send_reply( client->sock, &reply, NULL );
+ send_reply( client->sock, &reply, buffer );
+ if ( buffer == NULL ) {
+ shutdown( client->sock, SHUT_RDWR );
+ }
+ client->relayedCount--;
mutex_unlock( &client->sendMutex );
}
diff --git a/src/server/net.h b/src/server/net.h
index 7719aef..2d6e5e7 100644
--- a/src/server/net.h
+++ b/src/server/net.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -37,6 +37,4 @@ void net_disconnectAll();
void net_waitForAllDisconnected();
-void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle);
-
#endif /* NET_H_ */
diff --git a/src/server/picohttpparser/CMakeLists.txt b/src/server/picohttpparser/CMakeLists.txt
new file mode 100644
index 0000000..cc6ec96
--- /dev/null
+++ b/src/server/picohttpparser/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(picohttpparser
+ LANGUAGES C)
+
+set(PICOHTTPPARSER_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/picohttpparser.c)
+set(PICOHTTPPARSER_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/picohttpparser.h)
+
+add_library(picohttpparser STATIC ${PICOHTTPPARSER_SOURCE_FILES})
+target_include_directories(picohttpparser PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/src/server/reference.h b/src/server/reference.h
index 4eda546..75a681f 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref )
return ref;
}
+static inline void ref_inc( ref *ref )
+{
+ ++ref->count;
+}
+
static inline void ref_put( ref *ref )
{
if ( --ref->count == 0 ) {
diff --git a/src/server/rpc.c b/src/server/rpc.c
index a454d6d..119bbd5 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -5,7 +5,9 @@
#include "locks.h"
#include "image.h"
#include "altservers.h"
-#include "../shared/sockhelper.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
#include "fileutil.h"
#include "picohttpparser/picohttpparser.h"
#include "urldecode.h"
@@ -101,8 +103,8 @@ void rpc_init()
int fd = open( "/dev/urandom", O_RDONLY );
if ( fd != -1 ) {
uint32_t bla = 1;
- read( fd, &bla, 4 );
- randomRunId = (randomRunId << 32) | bla;
+ (void)!read( fd, &bla, 4 );
+ randomRunId = ((randomRunId & 0xffffffff) << 32) | bla;
}
close( fd );
}
@@ -144,7 +146,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
while ( !_shutdown ) {
// Read request from client
struct phr_header headers[100];
- size_t numHeaders, prevLen = 0, consumed;
+ size_t numHeaders, prevLen = 0, consumed = 0;
struct string method, path;
int minorVersion;
while ( !_shutdown ) {
@@ -174,7 +176,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
// Reaching here means partial request or parse error
if ( pret == -2 ) { // Partial, keep reading
prevLen = hoff;
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
ssize_t ret = recv( 0, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
#else
ssize_t ret = recv( sock, headerBuf + hoff, sizeof(headerBuf) - hoff, 0 );
@@ -259,7 +261,7 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
{
bool ok;
bool stats = false, images = false, clients = false, space = false;
- bool logfile = false, config = false, altservers = false;
+ bool logfile = false, config = false, altservers = false, version = false;
#define SETVAR(var) if ( !var && STRCMP(fields[i].value, #var) ) var = true
for (size_t i = 0; i < fields_num; ++i) {
if ( !equals( &fields[i].name, &STR_Q ) ) continue;
@@ -270,9 +272,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
else SETVAR(logfile);
else SETVAR(config);
else SETVAR(altservers);
+ else SETVAR(version);
}
#undef SETVAR
- if ( ( stats || space ) && !(permissions & ACL_STATS) ) {
+ if ( ( stats || space || version ) && !(permissions & ACL_STATS) ) {
return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access statistics", -1, keepAlive );
}
if ( images && !(permissions & ACL_IMAGE_LIST) ) {
@@ -308,6 +311,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
statisticsJson = json_pack( "{sI}",
"runId", randomRunId );
}
+ if ( version ) {
+ json_object_set_new( statisticsJson, "version", json_string( DNBD3_VERSION_LONG ", built " DNBD3_BUILD_DATE ) );
+ json_object_set_new( statisticsJson, "build", json_string( DNBD3_BUILD ) );
+ }
if ( space ) {
uint64_t spaceTotal = 0, spaceAvail = 0;
file_freeDiskSpace( _basePath, &spaceTotal, &spaceAvail );
@@ -405,9 +412,11 @@ static bool sendReply(int sock, const char *status, const char *ctype, const cha
if ( keepAlive == HTTP_CLOSE ) {
// Wait for flush
shutdown( sock, SHUT_WR );
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
sock = 0;
#endif
+ // Don't wait too long in case other side ignores the shutdown
+ sock_setTimeout( sock, 600 );
while ( read( sock, buffer, sizeof buffer ) > 0 );
return false;
}
@@ -451,7 +460,7 @@ static int getacl(dnbd3_host_t *host)
if ( aclRules[i].bitMask != 0 && aclRules[i].host[aclRules[i].bytes] != ( host->addr[aclRules[i].bytes] & aclRules[i].bitMask ) ) continue;
return aclRules[i].permissions;
}
-#ifdef AFL_MODE
+#ifdef DNBD3_SERVER_AFL
return 0x7fffff;
#else
return 0;
@@ -487,7 +496,7 @@ static void addacl(int argc, char **argv, void *data UNUSED)
*slash++ = '\0';
}
if ( !parse_address( argv[0], &host ) ) goto unlock_end;
- long int bits;
+ long int bits = 0;
if ( slash != NULL ) {
char *last;
bits = strtol( slash, &last, 10 );
diff --git a/src/server/serialize.c b/src/server/serialize.c
deleted file mode 100644
index 4934132..0000000
--- a/src/server/serialize.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-
-#include "../serialize.c"
diff --git a/src/server/server.c b/src/server/server.c
index 0dddea7..0f75935 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -29,10 +29,12 @@
#include "integrity.h"
#include "threadpool.h"
#include "rpc.h"
+#include "fuse.h"
-#include "../version.h"
-#include "../shared/sockhelper.h"
-#include "../shared/timing.h"
+#include <dnbd3/version.h>
+#include <dnbd3/build.h>
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/timing.h>
#include <signal.h>
#include <getopt.h>
@@ -104,10 +106,14 @@ static void queueJobInternal(job_t *job);
*/
void dnbd3_printHelp(char *argv_0)
{
- printf( "Version: %s\n\n", VERSION_STRING );
+ printf( "Version: %s\n\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
printf( "Usage: %s [OPTIONS]...\n", argv_0 );
printf( "Start the DNBD3 server\n" );
printf( "-c or --config Configuration directory (default /etc/dnbd3-server/)\n" );
+#ifdef DNBD3_SERVER_FUSE
+ printf( "-m or --mount FUSE mount point\n");
+#endif
printf( "-n or --nodaemon Start server in foreground\n" );
printf( "-b or --bind Local Address to bind to\n" );
printf( "-h or --help Show this help text and quit\n" );
@@ -126,7 +132,8 @@ void dnbd3_printHelp(char *argv_0)
*/
void dnbd3_printVersion()
{
- printf( "Version: %s\n", VERSION_STRING );
+ printf( "dnbd3-server version: %s\n", DNBD3_VERSION_LONG );
+ printf( "Built: %s\n", DNBD3_BUILD_DATE );
exit( 0 );
}
@@ -140,6 +147,8 @@ _Noreturn static void dnbd3_cleanup()
_shutdown = true;
logadd( LOG_INFO, "Cleanup..." );
+ dfuse_shutdown();
+
if ( hasTimerThread ) {
pthread_kill( timerThread, SIGINT );
thread_join( timerThread, NULL );
@@ -190,11 +199,13 @@ int main(int argc, char *argv[])
char *paramCreate = NULL;
char *bindAddress = NULL;
char *errorMsg = NULL;
+ char *mountDir = NULL;
int64_t paramSize = -1;
int paramRevision = -1;
- static const char *optString = "b:c:d:hnv?";
+ static const char *optString = "b:c:m:d:hnv?";
static const struct option longOpts[] = {
{ "config", required_argument, NULL, 'c' },
+ { "mount", required_argument, NULL, 'm' },
{ "nodaemon", no_argument, NULL, 'n' },
{ "reload", no_argument, NULL, 'r' },
{ "help", no_argument, NULL, 'h' },
@@ -209,6 +220,16 @@ int main(int argc, char *argv[])
{ 0, 0, 0, 0 }
};
+ log_init();
+
+ /* set proper output stream for AFL */
+#ifdef DNBD3_SERVER_AFL
+ if ( log_setConsoleOutputStream(stderr) < 0 ) {
+ logadd( LOG_ERROR, "Failed to set output stream for AFL to stderr" );
+ exit( EXIT_FAILURE );
+ }
+#endif
+
mainPid = getpid();
mainThread = pthread_self();
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
@@ -218,6 +239,13 @@ int main(int argc, char *argv[])
case 'c':
_configDir = strdup( optarg );
break;
+ case 'm':
+#ifndef DNBD3_SERVER_FUSE
+ fprintf( stderr, "FUSE support not enabled at build time.\n" );
+ return 8;
+#endif
+ mountDir = strdup( optarg );
+ break;
case 'n':
demonize = 0;
break;
@@ -263,6 +291,7 @@ int main(int argc, char *argv[])
opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
}
+
// Load general config
if ( _configDir == NULL ) _configDir = strdup( "/etc/dnbd3-server" );
@@ -275,9 +304,7 @@ int main(int argc, char *argv[])
timing_setBase();
timing_get( &startupTime );
-#ifdef AFL_MODE
- // ###### AFL
- //
+#ifdef DNBD3_SERVER_AFL
image_serverStartup();
net_init();
uplink_globalsInit();
@@ -301,9 +328,7 @@ int main(int argc, char *argv[])
net_handleNewConnection( dnbd3_client );
exit( 0 );
}
- //
- // ###### AFL END
-#endif
+#endif /* DNBD3_SERVER_AFL */
// One-shots first:
@@ -315,7 +340,10 @@ int main(int argc, char *argv[])
// No one-shot detected, normal server operation or errormsg serving
if ( demonize ) {
logadd( LOG_INFO, "Forking into background, see log file for further information" );
- daemon( 1, 0 );
+ if ( daemon( 0, 0 ) == -1 ) {
+ logadd( LOG_ERROR, "Could not daemon(): errno=%d", errno );
+ exit( 1 );
+ }
}
if ( errorMsg != NULL ) {
setupNetwork( bindAddress );
@@ -339,7 +367,15 @@ int main(int argc, char *argv[])
net_init();
uplink_globalsInit();
rpc_init();
- logadd( LOG_INFO, "DNBD3 server starting.... Machine type: " ENDIAN_MODE );
+ if ( mountDir != NULL && !dfuse_init( "-oallow_other", mountDir ) ) {
+ logadd( LOG_ERROR, "Cannot mount fuse directory to %s", mountDir );
+ dnbd3_cleanup();
+ return EXIT_FAILURE;
+ }
+ logadd( LOG_INFO, "DNBD3 server starting...." );
+ logadd( LOG_INFO, "Machine type: " DNBD3_ENDIAN_MODE );
+ logadd( LOG_INFO, "Build Type: %s", DNBD3_BUILD );
+ logadd( LOG_INFO, "Version: %s, built %s", DNBD3_VERSION_LONG, DNBD3_BUILD_DATE );
if ( altservers_load() < 0 ) {
logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
@@ -379,10 +415,11 @@ int main(int argc, char *argv[])
// Initialize thread pool
if ( !threadpool_init( 8 ) ) {
logadd( LOG_ERROR, "Could not init thread pool!\n" );
+ dnbd3_cleanup();
exit( EXIT_FAILURE );
}
- logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING );
+ logadd( LOG_INFO, "Server is ready." );
if ( thread_create( &timerThread, NULL, &timerMainloop, NULL ) == 0 ) {
hasTimerThread = true;
@@ -398,7 +435,7 @@ int main(int argc, char *argv[])
if ( sigReload ) {
sigReload = false;
logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" );
- threadpool_run( &server_asyncImageListLoad, NULL );
+ threadpool_run( &server_asyncImageListLoad, NULL, "IMAGE_RELOAD" );
}
if ( sigLogCycle ) {
sigLogCycle = false;
@@ -425,7 +462,7 @@ int main(int argc, char *argv[])
continue;
}
- if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) {
+ if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client, "CLIENT" ) ) {
logadd( LOG_ERROR, "Could not start thread for new connection." );
free( dnbd3_client );
continue;
@@ -520,10 +557,11 @@ static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
if ( info->si_pid != 0 && !pthread_equal( pthread_self(), mainThread ) ) {
pthread_kill( mainThread, info->si_signo ); // And relay signal if we're not the main thread
}
- }
- if ( pthread_equal( pthread_self(), mainThread ) ) {
- // Signal received by main thread -- handle
- dnbd3_handleSignal( signum );
+ // Source is not this process -- only then do we honor signals
+ if ( pthread_equal( pthread_self(), mainThread ) ) {
+ // Signal received by main thread -- handle
+ dnbd3_handleSignal( signum );
+ }
}
}
@@ -568,7 +606,7 @@ static int handlePendingJobs(void)
jobHead = *temp; // Make it list head
*temp = NULL; // Split off part before that
while ( todo != NULL ) {
- threadpool_run( todo->startRoutine, todo->arg );
+ threadpool_run( todo->startRoutine, todo->arg, "TIMER_TASK" );
old = todo;
todo = todo->next;
if ( old->intervalSecs == 0 ) {
diff --git a/src/server/server.h b/src/server/server.h
index a026eb6..e93d8f5 100644
--- a/src/server/server.h
+++ b/src/server/server.h
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -22,7 +22,7 @@
#define SERVER_H_
#include "globals.h"
-#include "../types.h"
+#include <dnbd3/types.h>
uint32_t dnbd3_serverUptime();
void server_addJob(void *(*startRoutine)(void *), void *arg, int delaySecs, int intervalSecs);
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 0b46fd6..a21bd0d 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -8,6 +8,7 @@ typedef struct _entry_t {
dnbd3_signal_t* signal;
void *(*startRoutine)(void *);
void * arg;
+ const char *name;
} entry_t;
static void *threadpool_worker(void *entryPtr);
@@ -56,21 +57,22 @@ void threadpool_waitEmpty()
} while ( activeThreads != 0 );
}
-bool threadpool_run(void *(*startRoutine)(void *), void *arg)
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name)
{
if ( unlikely( _shutdown ) ) {
logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
return false;
}
+#ifdef DEBUG
if ( unlikely( startRoutine == NULL ) ) {
logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
return false; // Or bail out!?
}
+#endif
entry_t *entry = NULL;
for ( int i = 0; i < maxIdleThreads; ++i ) {
- entry_t *cur = pool[i];
- if ( cur != NULL && atomic_compare_exchange_weak( &pool[i], &cur, NULL ) ) {
- entry = cur;
+ entry = atomic_exchange( &pool[i], NULL );
+ if ( entry != NULL ) {
break;
}
}
@@ -87,7 +89,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
return false;
}
if ( 0 != thread_create( &(entry->thread), &threadAttrs, threadpool_worker, (void*)entry ) ) {
- logadd( LOG_WARNING, "Could not create new thread for thread pool\n" );
+ logadd( LOG_WARNING, "Could not create new thread for thread pool (%d active)\n", (int)activeThreads );
signal_close( entry->signal );
free( entry );
return false;
@@ -96,6 +98,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
}
entry->startRoutine = startRoutine;
entry->arg = arg;
+ entry->name = name;
atomic_thread_fence( memory_order_release );
signal_call( entry->signal );
return true;
@@ -120,10 +123,15 @@ keep_going:;
logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
continue;
}
+#ifdef DEBUG
if ( entry->startRoutine == NULL ) {
logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
exit( 1 );
}
+ if ( entry->name != NULL ) {
+ setThreadName( entry->name );
+ }
+#endif
// Start assigned work
(*entry->startRoutine)( entry->arg );
// Reset vars for safety
@@ -143,6 +151,7 @@ keep_going:;
// Reaching here means pool is full; just let the thread exit
break;
}
+ setThreadName( "[dead]" );
signal_close( entry->signal );
free( entry );
activeThreads--;
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
index ee0b3aa..c30d44f 100644
--- a/src/server/threadpool.h
+++ b/src/server/threadpool.h
@@ -1,7 +1,7 @@
#ifndef _THREADPOOL_H_
#define _THREADPOOL_H_
-#include "../types.h"
+#include <dnbd3/types.h>
/**
* Initialize the thread pool. This must be called before using
@@ -26,9 +26,10 @@ void threadpool_waitEmpty();
* Run a thread using the thread pool.
* @param startRoutine function to run in new thread
* @param arg argument to pass to thead
+ * @param name STRING CONSTANT (literal) for debugging purposes
* @return true if thread was started
*/
-bool threadpool_run(void *(*startRoutine)(void *), void *arg);
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name);
#endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index f39e633..8a83124 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -4,10 +4,11 @@
#include "image.h"
#include "altservers.h"
#include "net.h"
-#include "../shared/sockhelper.h"
-#include "../shared/protocol.h"
-#include "../shared/timing.h"
-#include "../shared/crc32.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/protocol.h>
+#include <dnbd3/shared/timing.h>
+#include <dnbd3/shared/crc32.h>
+#include "threadpool.h"
#include "reference.h"
#include <assert.h>
@@ -17,49 +18,35 @@
#include <unistd.h>
#include <stdatomic.h>
+static const uint8_t HOP_FLAG_BGR = 0x80;
+static const uint8_t HOP_FLAG_PREFETCH = 0x40;
#define FILE_BYTES_PER_MAP_BYTE ( DNBD3_BLOCK_SIZE * 8 )
#define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE )
#define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) )
-#define REP_NONE ( (uint64_t)0xffffffffffffffff )
-
-// Status of request in queue
-
-// Slot is free, can be used.
-// Must only be set in uplink_handle_receive() or uplink_remove_client()
-#define ULR_FREE 0
-// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
-// Must only be set in uplink_request()
-#define ULR_NEW 1
-// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
-// Must only be set in uplink_mainloop() or uplink_request()
-#define ULR_PENDING 2
-// Slot is being processed, do not consider for hop on.
-// Must only be set in uplink_handle_receive()
-#define ULR_PROCESSING 3
-
-static const char *const NAMES_ULR[4] = {
- [ULR_FREE] = "ULR_FREE",
- [ULR_NEW] = "ULR_NEW",
- [ULR_PENDING] = "ULR_PENDING",
- [ULR_PROCESSING] = "ULR_PROCESSING",
-};
-
static atomic_uint_fast64_t totalBytesReceived = 0;
+typedef struct {
+ uint64_t start, end, handle;
+} req_t;
+
static void cancelAllRequests(dnbd3_uplink_t *uplink);
-static void uplink_free(ref *ref);
+static void freeUplinkStruct(ref *ref);
static void* uplink_mainloop(void *data);
-static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
-static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
-static void uplink_handleReceive(dnbd3_uplink_t *uplink);
-static int uplink_sendKeepalive(const int fd);
-static void uplink_addCrc32(dnbd3_uplink_t *uplink);
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
-static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
-static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
-static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly);
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
+static void handleReceive(dnbd3_uplink_t *uplink);
+static bool sendKeepalive(dnbd3_uplink_t *uplink);
+static void requestCrc32List(dnbd3_uplink_t *uplink);
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink);
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink);
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle);
+static bool uplink_requestInternal(dnbd3_uplink_t *uplink, void *data, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
+
+#define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) )
// ############ uplink connection handling
@@ -81,6 +68,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
{
if ( !_isProxy || _shutdown ) return false;
assert( image != NULL );
+ if ( sock == -1 && !altservers_imageHasAltServers( image->name ) )
+ return false; // Nothing to do
mutex_lock( &image->lock );
dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
if ( uplink != NULL ) {
@@ -97,13 +86,15 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
}
uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
// Start with one reference for the uplink thread. We'll return it when the thread finishes
- ref_init( &uplink->reference, uplink_free, 1 );
+ ref_init( &uplink->reference, freeUplinkStruct, 1 );
mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
uplink->image = image;
uplink->bytesReceived = 0;
- uplink->idleTime = 0;
+ uplink->bytesReceivedLastSave = 0;
+ uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90;
+ uplink->queue = NULL;
uplink->queueLen = 0;
uplink->cacheFd = -1;
uplink->signal = signal_new();
@@ -111,12 +102,14 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." );
goto failure;
}
- uplink->replicationHandle = REP_NONE;
mutex_lock( &uplink->rttLock );
mutex_lock( &uplink->sendMutex );
uplink->current.fd = -1;
mutex_unlock( &uplink->sendMutex );
uplink->cycleDetected = false;
+ image->problem.uplink = true;
+ image->problem.write = true;
+ image->problem.queue = false;
if ( sock != -1 ) {
uplink->better.fd = sock;
int index = altservers_hostToIndex( host );
@@ -139,7 +132,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
return true;
failure: ;
if ( uplink != NULL ) {
- image->users++; // Expected by uplink_free()
+ image->users++; // Expected by freeUplinkStruct()
ref_put( &uplink->reference ); // The ref for the uplink thread that never was
}
mutex_unlock( &image->lock );
@@ -166,13 +159,13 @@ bool uplink_shutdown(dnbd3_image_t *image)
image->users++; // Prevent free while uplink shuts down
signal_call( uplink->signal );
} else {
- logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid );
+ logadd( LOG_ERROR, "This will never happen. '%s:%d'", PIMG(image) );
}
cancelAllRequests( uplink );
ref_setref( &image->uplinkref, NULL );
- ref_put( &uplink->reference );
mutex_unlock( &uplink->queueLock );
bool retval = ( exp && image->users == 0 );
+ ref_put( &uplink->reference );
mutex_unlock( &image->lock );
return retval;
}
@@ -183,19 +176,28 @@ bool uplink_shutdown(dnbd3_image_t *image)
*/
static void cancelAllRequests(dnbd3_uplink_t *uplink)
{
- for ( int i = 0; i < uplink->queueLen; ++i ) {
- if ( uplink->queue[i].status != ULR_FREE ) {
- net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle );
- uplink->queue[i].status = ULR_FREE;
+ dnbd3_queue_entry_t *it = uplink->queue;
+ while ( it != NULL ) {
+ dnbd3_queue_client_t *cit = it->clients;
+ while ( cit != NULL ) {
+ (*cit->callback)( cit->data, cit->handle, 0, 0, NULL );
+ dnbd3_queue_client_t *next = cit->next;
+ free( cit );
+ cit = next;
}
+ dnbd3_queue_entry_t *next = it->next;
+ free( it );
+ it = next;
}
+ uplink->queue = NULL;
uplink->queueLen = 0;
+ uplink->image->problem.queue = false;
}
-static void uplink_free(ref *ref)
+static void freeUplinkStruct(ref *ref)
{
dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
- logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid );
+ logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) );
assert( uplink->queueLen == 0 );
if ( uplink->signal != NULL ) {
signal_close( uplink->signal );
@@ -226,35 +228,36 @@ static void uplink_free(ref *ref)
* Remove given client from uplink request queue
* Locks on: uplink.queueLock
*/
-void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
+void uplink_removeEntry(dnbd3_uplink_t *uplink, void *data, uplink_callback callback)
{
mutex_lock( &uplink->queueLock );
- for (int i = uplink->queueLen - 1; i >= 0; --i) {
- if ( uplink->queue[i].client == client ) {
- // Make sure client doesn't get destroyed while we're sending it data
- mutex_lock( &client->sendMutex );
- mutex_unlock( &client->sendMutex );
- uplink->queue[i].client = NULL;
- uplink->queue[i].status = ULR_FREE;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; ) {
+ if ( (**cit).data == data && (**cit).callback == callback ) {
+ (*(**cit).callback)( (**cit).data, (**cit).handle, 0, 0, NULL );
+ dnbd3_queue_client_t *entry = *cit;
+ *cit = (**cit).next;
+ free( entry );
+ } else {
+ cit = &(**cit).next;
+ }
}
- if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--;
}
mutex_unlock( &uplink->queueLock );
}
/**
- * Request a chunk of data through an uplink server
- * Locks on: image.lock, uplink.queueLock
+ * Called from a client (proxy) connection to request a missing part of the image.
+ * The caller has made sure that the range is actually missing.
*/
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+bool uplink_requestClient(dnbd3_client_t *client, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
{
- if ( client == NULL || client->image == NULL )
- return false;
- if ( length > (uint32_t)_maxPayload ) {
- logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
+ assert( client != NULL && callback != NULL );
+ if ( ( hops & 0x3f ) > 60 ) { // This is just silly
+ logadd( LOG_WARNING, "Refusing to relay a request that has > 60 hops" );
return false;
}
- dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref );
+ dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
if ( unlikely( uplink == NULL ) ) {
uplink_init( client->image, -1, NULL, -1 );
uplink = ref_get_uplink( &client->image->uplinkref );
@@ -263,160 +266,275 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
return false;
}
}
- if ( uplink->shutdown ) {
- logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
- goto fail_ref;
- }
// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
// This might be a false positive if there are multiple instances running on the same host (IP)
- if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
+ bool ret;
+ if ( hops > 1 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
uplink->cycleDetected = true;
signal_call( uplink->signal );
logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
- goto fail_ref;
+ ret = false;
+ } else {
+ ret = uplink_requestInternal( uplink, (void*)client, callback, handle, start, length, hops );
}
+ ref_put( &uplink->reference );
+ return ret;
+}
- int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
- int existingType = -1; // ULR_* type of existing request
- int i;
- int freeSlot = -1;
- int firstUsedSlot = -1;
- bool requestLoop = false;
- const uint64_t end = start + length;
+/**
+ * Called by integrated fuse module
+ */
+bool uplink_request(dnbd3_image_t *image, void *data, uplink_callback callback,
+ uint64_t handle, uint64_t start, uint32_t length)
+{
+ dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+ if ( unlikely( uplink == NULL ) ) {
+ uplink_init( image, -1, NULL, -1 );
+ uplink = ref_get_uplink( &image->uplinkref );
+ if ( uplink == NULL ) {
+ logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+ return false;
+ }
+ }
+ bool ret = uplink_requestInternal( uplink, data, callback, handle, start, length, 0 );
+ ref_put( &uplink->reference );
+ return ret;
+}
+
+static void extendRequest(uint64_t start, uint64_t *end, const dnbd3_image_t *image, uint32_t wanted)
+{
+ uint32_t length = (uint32_t)( *end - start );
+ if ( length >= wanted )
+ return;
+ length = wanted;
+ if ( unlikely( _backgroundReplication == BGR_HASHBLOCK
+ && *end / HASH_BLOCK_SIZE != (start + length) / HASH_BLOCK_SIZE ) ) {
+ // Don't extend across hash-block border in this mode
+ *end = ( start + length ) & ~( HASH_BLOCK_SIZE - 1 );
+ } else {
+ *end = start + length;
+ }
+ if ( unlikely( *end > image->virtualFilesize ) ) {
+ *end = image->virtualFilesize;
+ }
+ *end = ( *end + DNBD3_BLOCK_SIZE - 1 ) & ~( DNBD3_BLOCK_SIZE - 1 );
+ //logadd( LOG_DEBUG2, "Extended %"PRIx64" from %"PRIx64" to %"PRIx64, start, end, req.end );
+}
+
+static bool requestBlock(dnbd3_uplink_t *uplink, req_t *req, uint8_t hops)
+{
+ if ( uplink->current.fd == -1 )
+ return false;
+ return dnbd3_get_block( uplink->current.fd, req->start,
+ (uint32_t)( req->end - req->start ), req->handle,
+ COND_HOPCOUNT( uplink->current.version, hops ) );
+}
+
+/**
+ * Request a chunk of data through an uplink server. uplink must not be NULL.
+ * If callback is NULL, this is assumed to be a background replication request.
+ * Locks on: uplink.queueLock, uplink.sendMutex
+ */
+static bool uplink_requestInternal(dnbd3_uplink_t *uplink, void *data, uplink_callback callback,
+ uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+{
+ assert( uplink != NULL );
+ assert( data == NULL || callback != NULL );
+ if ( ( hops & HOP_FLAG_BGR ) // This is a background replication request
+ && _backgroundReplication != BGR_FULL ) { // Deny if we're not doing BGR
+ // TODO: Allow BGR_HASHBLOCK too, but only if hash block isn't completely empty
+ logadd( LOG_DEBUG2, "Dopping client because of BGR policy" );
+ return false;
+ }
+ if ( uplink->shutdown ) {
+ logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
+ return false;
+ }
+ if ( length > (uint32_t)_maxPayload ) {
+ logadd( LOG_WARNING, "UPLINK: Cannot relay request; length of %" PRIu32 " exceeds maximum payload",
+ length );
+ return false;
+ }
+ hops++;
+ if ( callback == NULL ) {
+ // Set upper-most bit for replication requests that we fire
+ // In client mode, at least set prefetch flag to prevent prefetch cascading
+ hops |= (uint8_t)( _pretendClient ? HOP_FLAG_PREFETCH : HOP_FLAG_BGR );
+ }
+
+ req_t req, preReq;
+ dnbd3_queue_entry_t *request = NULL, *last = NULL, *pre = NULL;
+ bool isNew;
+ const uint64_t end = start + length;
+ req.start = start & ~(DNBD3_BLOCK_SIZE - 1);
+ req.end = end;
+ /* Don't do this -- this breaks matching of prefetch jobs, since they'd
+ * be misaligned, and the next client request wouldn't match anything.
+ * To improve this, we need to be able to attach a queue_client to multiple queue_entries
+ * and then serve it once all the queue_entries are done (atomic_int in queue_client).
+ * But currently we directly send the receive buffer's content to the queue_client after
+ * receiving the payload, as this will also work when the local cache is borked (we just
+ * tunnel through the traffic). One could argue that this mode of operation is nonsense,
+ * and we should just drop all affected clients. Then as a next step, don't serve the
+ * clients from the receive buffer, but just issue a normal sendfile() call after writing
+ * the received data to the local cache.
+ */
+ if ( callback != NULL && _minRequestSize != 0 ) {
+ // Not background replication request, extend request size
+ extendRequest( req.start, &req.end, uplink->image, _minRequestSize );
+ }
+ req.end = (req.end + DNBD3_BLOCK_SIZE - 1) & ~(DNBD3_BLOCK_SIZE - 1);
+ // Critical section - work with the queue
mutex_lock( &uplink->queueLock );
if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
goto fail_lock;
}
- for (i = 0; i < uplink->queueLen; ++i) {
- // find free slot to place this request into
- if ( uplink->queue[i].status == ULR_FREE ) {
- if ( freeSlot == -1 || existingType != ULR_PROCESSING ) {
- freeSlot = i;
- }
- continue;
- }
- if ( firstUsedSlot == -1 ) {
- firstUsedSlot = i;
- }
- // find existing request to attach to
- if ( uplink->queue[i].from > start || uplink->queue[i].to < end )
- continue; // Range not suitable
- // Detect potential proxy cycle. New request hopcount is greater, range is same, old request has already been sent -> suspicious
- if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) {
- requestLoop = true;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->from <= start && it->to >= end ) {
+ // Matching range, attach
+ request = it;
break;
}
- if ( foundExisting == -1 || existingType == ULR_PROCESSING ) {
- foundExisting = i;
- existingType = uplink->queue[i].status;
+ if ( it->next == NULL ) {
+ // Not matching, last in list, remember
+ last = it;
+ break;
}
}
- if ( unlikely( requestLoop ) ) {
- uplink->cycleDetected = true;
- signal_call( uplink->signal );
- logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
- goto fail_lock;
- }
- if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
- freeSlot = -1; // Not attaching to existing request, make it use a higher slot
- }
- if ( freeSlot == -1 ) {
- if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
- logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
+ dnbd3_queue_client_t **c = NULL;
+ if ( request == NULL ) {
+ // No existing request to attach to
+ if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) {
+ logadd( LOG_WARNING,
+ "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." );
+ goto fail_lock;
+ }
+ uplink->queueLen++;
+ if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+ uplink->image->problem.queue = true;
+ }
+ request = malloc( sizeof(*request) );
+ if ( last == NULL ) {
+ uplink->queue = request;
+ } else {
+ last->next = request;
+ }
+ request->next = NULL;
+ request->handle = ++uplink->queueId;
+ request->from = req.start;
+ request->to = req.end;
+#ifdef DEBUG
+ timing_get( &request->entered );
+#endif
+ request->hopCount = hops;
+ request->sent = true; // Optimistic; would be set to false on failure
+ if ( callback == NULL ) {
+ // BGR
+ request->clients = NULL;
+ } else {
+ c = &request->clients;
+ }
+ isNew = true;
+ } else if ( callback == NULL ) {
+ // Replication request that matches existing request. Do nothing
+ isNew = false;
+ } else {
+ // Existing request. Check if potential cycle
+ if ( hops > request->hopCount && request->from == start && request->to == end ) {
+ logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) );
goto fail_lock;
}
- freeSlot = uplink->queueLen++;
+ // Count number of clients, get tail of list
+ int count = 0;
+ c = &request->clients;
+ while ( *c != NULL ) {
+ c = &(**c).next;
+ if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) {
+ logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count );
+ goto fail_lock;
+ }
+ }
+ isNew = false;
}
- // Do not send request to uplink server if we have a matching pending request AND the request either has the
- // status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
- // explicitly send this request to the uplink server. The second condition mentioned here is to prevent
- // a race condition where the reply for the outstanding request already arrived and the uplink thread
- // is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
- // already have passed the index of the free slot we determined, but not reached the existing request we just found above.
- if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) {
- foundExisting = -1; // -1 means "send request"
+ // Prefetch immediately, without unlocking the list - the old approach of
+ // async prefetching in another thread was sometimes so slow that we'd process
+ // another request from the same client before the prefetch job would execute.
+ if ( callback != NULL && ( isNew || request->clients == NULL || request->clients->data == data )
+ && !( hops & (HOP_FLAG_BGR | HOP_FLAG_PREFETCH) ) // No cascading of prefetches
+ && end == request->to && length <= _maxPrefetch ) {
+ // Only if this is a client request, and the !! end boundary matches exactly !!
+ // (See above for reason why)
+ // - We neither check the local cache, nor other pending requests. Worth it?
+ // Complexity vs. probability
+ preReq.start = end;
+ preReq.end = end;
+ extendRequest( preReq.start, &preReq.end, uplink->image, MIN( length * 3, _maxPrefetch ) );
+ if ( preReq.start < preReq.end ) {
+ //logadd( LOG_DEBUG2, "Prefetching @ %"PRIx64" - %"PRIx64, preReq.start, preReq.end );
+ uplink->queueLen++;
+ pre = malloc( sizeof(*pre) );
+ pre->next = request->next;
+ request->next = pre;
+ pre->handle = preReq.handle = ++uplink->queueId;
+ pre->from = preReq.start;
+ pre->to = preReq.end;
+ pre->hopCount = hops | HOP_FLAG_PREFETCH;
+ pre->sent = true; // Optimistic; would be set to false on failure
+ pre->clients = NULL;
+#ifdef DEBUG
+ timing_get( &pre->entered );
+#endif
+ }
}
-#ifdef _DEBUG
- if ( foundExisting != -1 ) {
- logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot );
- logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
- "New %" PRIu64 "-%" PRIu64 " (%p)\n",
- uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
- start, end, (void*)client );
+ // // // //
+ // Copy data - need this after unlocking
+ req.handle = request->handle;
+ if ( callback != NULL ) {
+ assert( c != NULL );
+ *c = malloc( sizeof( *request->clients ) );
+ (**c).next = NULL;
+ (**c).handle = handle;
+ (**c).from = start;
+ (**c).to = end;
+ (**c).data = data;
+ (**c).callback = callback;
}
-#endif
- // Fill structure
- uplink->queue[freeSlot].from = start;
- uplink->queue[freeSlot].to = end;
- uplink->queue[freeSlot].handle = handle;
- uplink->queue[freeSlot].client = client;
- //int old = uplink->queue[freeSlot].status;
- uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW :
- ( existingType == ULR_NEW ? ULR_PENDING : existingType ) );
- uplink->queue[freeSlot].hopCount = hops;
-#ifdef _DEBUG
- timing_get( &uplink->queue[freeSlot].entered );
- //logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
-#endif
mutex_unlock( &uplink->queueLock );
+ // End queue critical section
+ if ( pre == NULL && !isNew )
+ return true; // Nothing to do
- if ( foundExisting != -1 ) {
- ref_put( &uplink->reference );
- return true; // Attached to pending request, do nothing
+ // Fire away the request(s)
+ mutex_lock( &uplink->sendMutex );
+ bool ret1 = true;
+ bool ret2 = true;
+ if ( isNew ) {
+ ret1 = requestBlock( uplink, &req, hops );
}
-
- // See if we can fire away the request
- if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) {
- logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
- } else {
- if ( unlikely( uplink->current.fd == -1 ) ) {
- mutex_unlock( &uplink->sendMutex );
- logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
- } else {
- const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
- if ( hops < 200 ) ++hops;
- const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
- mutex_unlock( &uplink->sendMutex );
- if ( unlikely( !ret ) ) {
- logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
- } else {
- // Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
- int state;
- mutex_lock( &uplink->queueLock );
- if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
- state = uplink->queue[freeSlot].status;
- if ( uplink->queue[freeSlot].status == ULR_NEW ) {
- uplink->queue[freeSlot].status = ULR_PENDING;
- }
- } else {
- state = -1;
- }
- mutex_unlock( &uplink->queueLock );
- if ( state == -1 ) {
- logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
- } else if ( state == ULR_NEW ) {
- //logadd( LOG_DEBUG2, "Direct uplink request" );
- } else {
- logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
- }
- ref_put( &uplink->reference );
- return true;
- }
- // Fall through to waking up sender thread
- }
+ if ( pre != NULL ) {
+ ret2 = requestBlock( uplink, &preReq, hops | HOP_FLAG_PREFETCH );
+ }
+ if ( !ret1 || !ret2 ) { // Set with send locked
+ uplink->image->problem.uplink = true;
+ }
+ mutex_unlock( &uplink->sendMutex );
+ // markRequestUnsent locks the queue, would violate locking order with send mutex
+ if ( !ret1 ) {
+ markRequestUnsent( uplink, req.handle );
+ logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle );
+ }
+ if ( !ret2 ) {
+ markRequestUnsent( uplink, preReq.handle );
}
- if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
+ if ( ( !ret1 || !ret2 ) && signal_call( uplink->signal ) == SIGNAL_ERROR ) {
logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
}
- ref_put( &uplink->reference );
return true;
+
fail_lock:
mutex_unlock( &uplink->queueLock );
-fail_ref:
- ref_put( &uplink->reference );
return false;
}
@@ -431,11 +549,10 @@ static void* uplink_mainloop(void *data)
#define EV_COUNT (2)
struct pollfd events[EV_COUNT];
dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
- int numSocks, i, waitTime;
+ int numSocks, waitTime;
int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
int rttTestResult;
uint32_t discoverFailCount = 0;
- uint32_t unsavedSeconds = 0;
ticks nextAltCheck, lastKeepalive;
char buffer[200];
memset( events, 0, sizeof(events) );
@@ -447,7 +564,7 @@ static void* uplink_mainloop(void *data)
thread_detach( uplink->thread );
blockNoncriticalSignals();
// Make sure file is open for writing
- if ( !uplink_reopenCacheFd( uplink, false ) ) {
+ if ( !reopenCacheFd( uplink, false ) ) {
// It might have failed - still offer proxy mode, we just can't cache
logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno );
}
@@ -460,14 +577,14 @@ static void* uplink_mainloop(void *data)
}
while ( !_shutdown && !uplink->shutdown ) {
// poll()
- waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
- if ( waitTime == 0 ) {
+ if ( uplink->rttTestResult == RTT_DOCHANGE ) {
// 0 means poll, since we're about to change the server
+ waitTime = 0;
} else {
declare_now;
waitTime = (int)timing_diffMs( &now, &nextAltCheck );
if ( waitTime < 100 ) waitTime = 100;
- if ( waitTime > 10000 ) waitTime = 10000;
+ else if ( waitTime > 10000 ) waitTime = 10000;
}
events[EV_SOCKET].fd = uplink->current.fd;
numSocks = poll( events, EV_COUNT, waitTime );
@@ -494,8 +611,7 @@ static void* uplink_mainloop(void *data)
mutex_unlock( &uplink->rttLock );
discoverFailCount = 0;
if ( fd != -1 ) close( fd );
- uplink->replicationHandle = REP_NONE;
- uplink->image->working = true;
+ uplink->image->problem.uplink = false;
uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
buffer[0] = '@';
if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
@@ -504,12 +620,17 @@ static void* uplink_mainloop(void *data)
}
// If we don't have a crc32 list yet, see if the new server has one
if ( uplink->image->crc32 == NULL ) {
- uplink_addCrc32( uplink );
+ requestCrc32List( uplink );
}
// Re-send all pending requests
- uplink_sendRequests( uplink, false );
- uplink_sendReplicationRequest( uplink );
+ sendQueuedRequests( uplink, false );
+ sendReplicationRequest( uplink );
events[EV_SOCKET].events = POLLIN | POLLRDHUP;
+ if ( uplink->image->problem.uplink ) {
+ // Some of the requests above must have failed again already :-(
+ logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" );
+ connectionFailed( uplink, true );
+ }
timing_gets( &nextAltCheck, altCheckInterval );
// The rtt worker already did the handshake for our image, so there's nothing
// more to do here
@@ -517,6 +638,7 @@ static void* uplink_mainloop(void *data)
// Check events
// Signal
if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+ uplink->image->problem.uplink = true;
logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
goto cleanup;
} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
@@ -526,46 +648,37 @@ static void* uplink_mainloop(void *data)
}
if ( uplink->current.fd != -1 ) {
// Uplink seems fine, relay requests to it...
- uplink_sendRequests( uplink, true );
+ sendQueuedRequests( uplink, true );
} else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long
uplink->idleTime = 0;
}
}
// Uplink socket
if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
- uplink_connectionFailed( uplink, true );
+ connectionFailed( uplink, true );
logadd( LOG_DEBUG1, "Uplink gone away, panic! (revents=%d)\n", (int)events[EV_SOCKET].revents );
setThreadName( "panic-uplink" );
} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
- uplink_handleReceive( uplink );
+ handleReceive( uplink );
if ( _shutdown || uplink->shutdown ) goto cleanup;
}
declare_now;
uint32_t timepassed = timing_diff( &lastKeepalive, &now );
- if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
+ if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL
+ || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
lastKeepalive = now;
uplink->idleTime += timepassed;
- unsavedSeconds += timepassed;
- if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) {
- // fsync/save every 4 minutes, or every 60 seconds if uplink is idle
- unsavedSeconds = 0;
- uplink_saveCacheMap( uplink );
- }
// Keep-alive
- if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
- // Send keep-alive if nothing is happening
- if ( uplink_sendKeepalive( uplink->current.fd ) ) {
- // Re-trigger periodically, in case it requires a minimum user count
- uplink_sendReplicationRequest( uplink );
- } else {
- uplink_connectionFailed( uplink, true );
- logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
- setThreadName( "panic-uplink" );
+ if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) {
+ // Send keep-alive if nothing is happening, and try to trigger background rep.
+ if ( !sendKeepalive( uplink ) || !sendReplicationRequest( uplink ) ) {
+ connectionFailed( uplink, true );
+ logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" );
}
}
// Don't keep uplink established if we're idle for too much
- if ( uplink_connectionShouldShutdown( uplink ) ) {
- logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid );
+ if ( connectionShouldShutdown( uplink ) ) {
+ logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) );
goto cleanup;
}
}
@@ -578,6 +691,7 @@ static void* uplink_mainloop(void *data)
// Quit work if image is complete
logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
setThreadName( "finished-uplink" );
+ uplink->image->problem.uplink = false;
goto cleanup;
} else {
// Not complete - do measurement
@@ -592,46 +706,44 @@ static void* uplink_mainloop(void *data)
} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
discoverFailCount++;
- if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
- logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
- uplink->image->working = false;
- }
if ( uplink->current.fd == -1 ) {
uplink->cycleDetected = false;
}
}
timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED );
}
-#ifdef _DEBUG
+#ifdef DEBUG
if ( uplink->current.fd != -1 && !uplink->shutdown ) {
bool resend = false;
ticks deadline;
timing_set( &deadline, &now, -10 );
mutex_lock( &uplink->queueLock );
- for (i = 0; i < uplink->queueLen; ++i) {
- if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
- snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
- "%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
- uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
- uplink->queue[i].entered = now;
-#ifdef _DEBUG_RESEND_STARVING
- uplink->queue[i].status = ULR_NEW;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( timing_reached( &it->entered, &deadline ) ) {
+ logadd( LOG_WARNING, "Starving request detected:"
+ " (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d",
+ it->from, it->to, (int)it->sent, PIMG(uplink->image) );
+ it->entered = now;
+#ifdef DEBUG_RESEND_STARVING
+ it->sent = false;
resend = true;
#endif
- mutex_unlock( &uplink->queueLock );
- logadd( LOG_WARNING, "%s", buffer );
- mutex_lock( &uplink->queueLock );
}
}
mutex_unlock( &uplink->queueLock );
- if ( resend )
- uplink_sendRequests( uplink, true );
+ if ( resend ) {
+ sendQueuedRequests( uplink, true );
+ }
}
#endif
}
- cleanup: ;
- uplink_saveCacheMap( uplink );
+cleanup: ;
dnbd3_image_t *image = uplink->image;
+ dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+ if ( cache != NULL ) {
+ cache->dirty = true; // Force writeout of cache map
+ ref_put( &cache->reference );
+ }
mutex_lock( &image->lock );
bool exp = false;
if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
@@ -653,37 +765,60 @@ static void* uplink_mainloop(void *data)
return NULL ;
}
-static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
+/**
+ * Only called from uplink thread.
+ */
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly)
{
- // Scan for new requests
- int j;
+ assert_uplink_thread();
+ // Scan for new requests, or optionally, (re)send all
+ // Build a buffer, so if there aren't too many requests, we can send them after
+ // unlocking the queue again. Otherwise we need flushes during iteration, which
+ // is not ideal, but in that case the uplink is probably overwhelmed anyways.
+ // Try 125 as that's exactly 3000 bytes, usually 2*MTU.
+#define MAX_RESEND_BATCH 125
+ dnbd3_request_t reqs[MAX_RESEND_BATCH];
+ int count = 0;
mutex_lock( &uplink->queueLock );
- for (j = 0; j < uplink->queueLen; ++j) {
- if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue;
- uplink->queue[j].status = ULR_PENDING;
- uint8_t hops = uplink->queue[j].hopCount;
- const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
- const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
- /*
- logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
- (void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
- */
- mutex_unlock( &uplink->queueLock );
- if ( hops < 200 ) ++hops;
- mutex_lock( &uplink->sendMutex );
- const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
- mutex_unlock( &uplink->sendMutex );
- if ( !ret ) {
- // Non-critical - if the connection dropped or the server was changed
- // the thread will re-send this request as soon as the connection
- // is reestablished.
- logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
- altservers_serverFailed( uplink->current.index );
- return;
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( newOnly && it->sent )
+ continue;
+ it->sent = true;
+ dnbd3_request_t *hdr = &reqs[count++];
+ hdr->magic = dnbd3_packet_magic;
+ hdr->cmd = CMD_GET_BLOCK;
+ hdr->size = (uint32_t)( it->to - it->from );
+ hdr->offset = it->from; // Offset first, then hops! (union)
+ hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount );
+ hdr->handle = it->handle;
+ fixup_request( *hdr );
+ if ( count == MAX_RESEND_BATCH ) {
+ bool ok = false;
+ logadd( LOG_DEBUG2, "BLOCKING resend of %d", count );
+ count = 0;
+ mutex_lock( &uplink->sendMutex );
+ if ( uplink->current.fd != -1 ) {
+ ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 )
+ == DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH );
+ }
+ mutex_unlock( &uplink->sendMutex );
+ if ( !ok ) {
+ uplink->image->problem.uplink = true;
+ break;
+ }
}
- mutex_lock( &uplink->queueLock );
}
mutex_unlock( &uplink->queueLock );
+ if ( count != 0 ) {
+ mutex_lock( &uplink->sendMutex );
+ if ( uplink->current.fd != -1 ) {
+ uplink->image->problem.uplink =
+ ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 )
+ != DNBD3_REQUEST_SIZE * count );
+ }
+ mutex_unlock( &uplink->sendMutex );
+ }
+#undef MAX_RESEND_BATCH
}
/**
@@ -695,73 +830,97 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
* server. This means we might request data we already have, but it makes
* the code simpler. Worst case would be only one bit is zero, which means
* 4kb are missing, but we will request 32kb.
+ *
+ * Only called from uplink thread, so current.fd is assumed to be valid.
+ *
+ * @return false if sending request failed, true otherwise (i.e. not necessary/disabled)
*/
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink)
{
- if ( uplink == NULL || uplink->current.fd == -1 ) return;
- if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
- if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
- return; // Already a replication request on the wire, or no more blocks to replicate
+ assert_uplink_thread();
+ if ( uplink->current.fd == -1 )
+ return false; // Should never be called in this state, consider send error
+ if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
+ return true; // Don't do background replication
+ if ( uplink->nextReplicationIndex == -1 )
+ return true; // No more blocks to replicate
dnbd3_image_t * const image = uplink->image;
- if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
- if ( image->users < _bgrMinClients ) return; // Not enough active users
+ if ( image->users < _bgrMinClients )
+ return true; // Not enough active users
+ const int numNewRequests = numWantedReplicationRequests( uplink );
+ if ( numNewRequests <= 0 )
+ return true; // Already sufficient amount of requests on the wire
dnbd3_cache_map_t *cache = ref_get_cachemap( image );
- if ( cache == NULL || image->users < _bgrMinClients ) {
+ if ( cache == NULL ) {
// No cache map (=image complete)
- ref_put( &cache->reference );
- return;
+ return true;
}
const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
const int lastBlockIndex = mapBytes - 1;
- int endByte;
- if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
- endByte = uplink->nextReplicationIndex + mapBytes;
- } else { // Hashblock based: Only look for match in current hash block
- endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
- if ( endByte > mapBytes ) {
- endByte = mapBytes;
+ for ( int bc = 0; bc < numNewRequests; ++bc ) {
+ int endByte;
+ if ( UPLINK_MAX_QUEUE - uplink->queueLen < 10 )
+ break; // Don't overload queue
+ if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
+ endByte = uplink->nextReplicationIndex + mapBytes;
+ } else { // Hashblock based: Only look for match in current hash block
+ endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+ if ( endByte > mapBytes ) {
+ endByte = mapBytes;
+ }
}
- }
- atomic_thread_fence( memory_order_acquire );
- int replicationIndex = -1;
- for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
- const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
- if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
- && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
- // Found incomplete one
- replicationIndex = i;
+ atomic_thread_fence( memory_order_acquire );
+ int replicationIndex = -1;
+ for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
+ const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
+ if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+ && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+ // Found incomplete one
+ replicationIndex = i;
+ break;
+ }
+ }
+ if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
+ // Nothing left in current block, find next one
+ replicationIndex = findNextIncompleteHashBlock( uplink, endByte );
+ }
+ if ( replicationIndex == -1 ) {
+ // Replication might be complete, uplink_mainloop should take care....
+ uplink->nextReplicationIndex = -1;
break;
}
+ const uint64_t handle = ++uplink->queueId;
+ const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
+ uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
+ // Extend the default 32k request size if _minRequestSize is > 32k
+ for ( size_t extra = 1; extra < ( _minRequestSize / FILE_BYTES_PER_MAP_BYTE )
+ && offset + size < image->virtualFilesize
+ && _backgroundReplication == BGR_FULL; ++extra ) {
+ if ( atomic_load_explicit( &cache->map[replicationIndex+1], memory_order_relaxed ) == 0xff )
+ break; // Hit complete 32k block, stop here
+ replicationIndex++;
+ size += (uint32_t)MIN( image->virtualFilesize - offset - size, FILE_BYTES_PER_MAP_BYTE );
+ }
+ if ( !uplink_requestInternal( uplink, NULL, NULL, handle, offset, size, 0 ) ) {
+ logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)",
+ PIMG(uplink->image) );
+ ref_put( &cache->reference );
+ return false;
+ }
+ if ( replicationIndex == lastBlockIndex ) {
+ uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+ }
+ uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+ if ( _backgroundReplication == BGR_HASHBLOCK
+ && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+ // Just crossed a hash block boundary, look for new candidate starting at this very index
+ uplink->nextReplicationIndex = findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
+ if ( uplink->nextReplicationIndex == -1 )
+ break;
+ }
}
ref_put( &cache->reference );
- if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
- // Nothing left in current block, find next one
- replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
- }
- if ( replicationIndex == -1 ) {
- // Replication might be complete, uplink_mainloop should take care....
- uplink->nextReplicationIndex = -1;
- return;
- }
- const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
- uplink->replicationHandle = offset;
- const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
- mutex_lock( &uplink->sendMutex );
- bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
- mutex_unlock( &uplink->sendMutex );
- if ( !sendOk ) {
- logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
- return;
- }
- if ( replicationIndex == lastBlockIndex ) {
- uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
- }
- uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
- if ( _backgroundReplication == BGR_HASHBLOCK
- && uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
- // Just crossed a hash block boundary, look for new candidate starting at this very index
- uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
- }
+ return true;
}
/**
@@ -769,7 +928,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
* of a hash block which is neither completely empty nor completely
* replicated yet. Returns -1 if no match.
*/
-static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
{
int retval = -1;
dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image );
@@ -816,29 +975,32 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
/**
* Receive data from uplink server and process/dispatch
* Locks on: uplink.lock, images[].lock
+ * Only called from uplink thread, so current.fd is assumed to be valid.
*/
-static void uplink_handleReceive(dnbd3_uplink_t *uplink)
+static void handleReceive(dnbd3_uplink_t *uplink)
{
- dnbd3_reply_t inReply, outReply;
- int ret, i;
+ dnbd3_reply_t inReply;
+ int ret;
+ assert_uplink_thread();
+ assert( uplink->queueLen >= 0 );
for (;;) {
ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
if ( ret == REPLY_AGAIN ) break;
if ( unlikely( ret == REPLY_CLOSED ) ) {
- logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path );
+ logadd( LOG_INFO, "Uplink: Remote host hung up (%s:%d)", PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
- logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path );
+ logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s:%d)", PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( ret != REPLY_OK ) ) {
- logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path );
+ logadd( LOG_INFO, "Uplink: Connection error %d (%s:%d)", ret, PIMG(uplink->image) );
goto error_cleanup;
}
if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
- logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path );
+ logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s:%d", inReply.size, PIMG(uplink->image) );
goto error_cleanup;
}
@@ -851,21 +1013,41 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
}
}
if ( unlikely( (uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
- logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path );
+ logadd( LOG_INFO, "Lost connection to uplink server of %s:%d (payload)", PIMG(uplink->image) );
goto error_cleanup;
}
// Payload read completely
// Bail out if we're not interested
- if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
+ if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) )
+ continue;
// Is a legit block reply
- struct iovec iov[2];
- const uint64_t start = inReply.handle;
- const uint64_t end = inReply.handle + inReply.size;
totalBytesReceived += inReply.size;
uplink->bytesReceived += inReply.size;
+ // Get entry from queue
+ dnbd3_queue_entry_t *entry;
+ mutex_lock( &uplink->queueLock );
+ for ( entry = uplink->queue; entry != NULL; entry = entry->next ) {
+ if ( entry->handle == inReply.handle )
+ break;
+ }
+ if ( entry == NULL ) {
+ mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+ logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)",
+ inReply.handle, PIMG(uplink->image) );
+ continue;
+ }
+ const uint64_t start = entry->from;
+ const uint64_t end = entry->to;
+ mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+ // We don't remove the entry from the list here yet, to slightly increase the chance of other
+ // clients attaching to this request while we write the data to disk
+ if ( end - start != inReply.size ) {
+ logadd( LOG_WARNING, "Received payload length does not match! (is: %"PRIu32", expect: %u, %s:%d)",
+ inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) );
+ }
// 1) Write to cache file
if ( unlikely( uplink->cacheFd == -1 ) ) {
- uplink_reopenCacheFd( uplink, false );
+ reopenCacheFd( uplink, false );
}
if ( likely( uplink->cacheFd != -1 ) ) {
int err = 0;
@@ -884,16 +1066,19 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
continue; // Success, retry write
}
if ( err == EBADF || err == EINVAL || err == EIO ) {
- if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) )
+ uplink->image->problem.write = true;
+ if ( !tryAgain || !reopenCacheFd( uplink, true ) )
break;
tryAgain = false;
continue; // Write handle to image successfully re-opened, try again
}
- logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err );
+ logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d",
+ PIMG(uplink->image), err );
break;
}
if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
- logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid );
+ logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d",
+ ret, PIMG(uplink->image) );
break;
}
done += (uint32_t)ret;
@@ -903,114 +1088,79 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
}
if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
- uplink->image->name, (int)uplink->image->rid, err );
+ PIMG(uplink->image), err );
}
}
- // 2) Figure out which clients are interested in it
- // Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop
- // below; this prevents uplink_request() from attaching to this request
- // by populating a slot with index greater than the highest matching
- // request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW
- // where it's fine if the index is greater)
+ bool found = false;
+ dnbd3_queue_entry_t **it;
mutex_lock( &uplink->queueLock );
- for (i = 0; i < uplink->queueLen; ++i) {
- dnbd3_queued_request_t * const req = &uplink->queue[i];
- assert( req->status != ULR_PROCESSING );
- if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
- assert( req->client != NULL );
- if ( req->from >= start && req->to <= end ) { // Match :-)
- req->status = ULR_PROCESSING;
+ for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) {
+ if ( *it == entry && entry->handle == inReply.handle ) { // ABA check
+ assert( found == false );
+ *it = (**it).next;
+ found = true;
+ uplink->queueLen--;
+ break;
}
}
- // 3) Send to interested clients - iterate backwards so request collaboration works, and
- // so we can decrease queueLen on the fly while iterating. Should you ever change this to start
- // from 0, you also need to change the "attach to existing request"-logic in uplink_request()
- outReply.magic = dnbd3_packet_magic;
- bool served = false;
- for ( i = uplink->queueLen - 1; i >= 0; --i ) {
- dnbd3_queued_request_t * const req = &uplink->queue[i];
- if ( req->status == ULR_PROCESSING ) {
- size_t bytesSent = 0;
- assert( req->from >= start && req->to <= end );
- dnbd3_client_t * const client = req->client;
- outReply.cmd = CMD_GET_BLOCK;
- outReply.handle = req->handle;
- outReply.size = (uint32_t)( req->to - req->from );
- iov[0].iov_base = &outReply;
- iov[0].iov_len = sizeof outReply;
- iov[1].iov_base = uplink->recvBuffer + (req->from - start);
- iov[1].iov_len = outReply.size;
- fixup_reply( outReply );
- req->status = ULR_FREE;
- req->client = NULL;
- served = true;
- mutex_lock( &client->sendMutex );
- mutex_unlock( &uplink->queueLock );
- if ( client->sock != -1 ) {
- ssize_t sent = writev( client->sock, iov, 2 );
- if ( sent > (ssize_t)sizeof outReply ) {
- bytesSent = (size_t)sent - sizeof outReply;
- }
- }
- if ( bytesSent != 0 ) {
- client->bytesSent += bytesSent;
- }
- mutex_unlock( &client->sendMutex );
- mutex_lock( &uplink->queueLock );
- if ( i > uplink->queueLen ) {
- i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests
- }
- }
- if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
+ if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
+ uplink->image->problem.queue = false;
}
mutex_unlock( &uplink->queueLock );
-#ifdef _DEBUG
- if ( !served && start != uplink->replicationHandle ) {
- logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
+ if ( !found ) {
+ logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)",
+ PIMG(uplink->image) );
+ continue;
}
-#endif
- if ( start == uplink->replicationHandle ) {
- // Was our background replication
- uplink->replicationHandle = REP_NONE;
- // Try to remove from fs cache if no client was interested in this data
- if ( !served && uplink->cacheFd != -1 ) {
- posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
- }
+ dnbd3_queue_client_t *next;
+ for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) {
+ assert( c->from >= start && c->to <= end );
+ (*c->callback)( c->data, c->handle, c->from, (uint32_t)( c->to - c->from ),
+ (const char*)( uplink->recvBuffer + (c->from - start) ) );
+ next = c->next;
+ free( c );
}
- if ( served ) {
+ if ( entry->clients != NULL ) {
// Was some client -- reset idle counter
uplink->idleTime = 0;
// Re-enable replication if disabled
if ( uplink->nextReplicationIndex == -1 ) {
uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
}
+ } else {
+ if ( uplink->cacheFd != -1 ) {
+ // Try to remove from fs cache if no client was interested in this data
+ posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+ }
}
+ free( entry );
+ } // main receive loop
+ // Trigger background replication if applicable
+ if ( !sendReplicationRequest( uplink ) ) {
+ goto error_cleanup;
}
- if ( uplink->replicationHandle == REP_NONE ) {
- mutex_lock( &uplink->queueLock );
- const bool rep = ( uplink->queueLen == 0 );
- mutex_unlock( &uplink->queueLock );
- if ( rep ) uplink_sendReplicationRequest( uplink );
- }
+ // Normal end
return;
// Error handling from failed receive or message parsing
- error_cleanup: ;
- uplink_connectionFailed( uplink, true );
+error_cleanup: ;
+ connectionFailed( uplink, true );
}
/**
* Only call from uplink thread
*/
-static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
{
+ assert_uplink_thread();
if ( uplink->current.fd == -1 )
return;
+ setThreadName( "panic-uplink" );
altservers_serverFailed( uplink->current.index );
mutex_lock( &uplink->sendMutex );
+ uplink->image->problem.uplink = true;
close( uplink->current.fd );
uplink->current.fd = -1;
mutex_unlock( &uplink->sendMutex );
- uplink->replicationHandle = REP_NONE;
if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
uplink->nextReplicationIndex = 0;
}
@@ -1025,15 +1175,26 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
}
/**
- * Send keep alive request to server
+ * Send keep alive request to server.
+ * Called from uplink thread, current.fd must be valid.
*/
-static int uplink_sendKeepalive(const int fd)
+static bool sendKeepalive(dnbd3_uplink_t *uplink)
{
static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
- return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+ assert_uplink_thread();
+ mutex_lock( &uplink->sendMutex );
+ bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+ mutex_unlock( &uplink->sendMutex );
+ return sendOk;
}
-static void uplink_addCrc32(dnbd3_uplink_t *uplink)
+/**
+ * Request crclist from uplink.
+ * Called from uplink thread, current.fd must be valid.
+ * FIXME This is broken as it could happen that another message arrives after sending
+ * the request. Refactor, split and move receive into general receive handler.
+ */
+static void requestCrc32List(dnbd3_uplink_t *uplink)
{
dnbd3_image_t *image = uplink->image;
if ( image == NULL || image->virtualFilesize == 0 ) return;
@@ -1042,6 +1203,9 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
uint32_t *buffer = malloc( bytes );
mutex_lock( &uplink->sendMutex );
bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes );
+ if ( !sendOk ) {
+ uplink->image->problem.uplink = true;
+ }
mutex_unlock( &uplink->sendMutex );
if ( !sendOk || bytes == 0 ) {
free( buffer );
@@ -1051,7 +1215,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes );
lists_crc = net_order_32( lists_crc );
if ( lists_crc != masterCrc ) {
- logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name );
+ logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", PIMG(uplink->image) );
free( buffer );
return;
}
@@ -1061,10 +1225,14 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
char path[len];
snprintf( path, len, "%s.crc", uplink->image->path );
const int fd = open( path, O_WRONLY | O_CREAT, 0644 );
- if ( fd >= 0 ) {
- write( fd, &masterCrc, sizeof(uint32_t) );
- write( fd, buffer, bytes );
+ if ( fd != -1 ) {
+ ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+ ret += write( fd, buffer, bytes );
close( fd );
+ if ( (size_t)ret != sizeof(masterCrc) + bytes ) {
+ unlink( path );
+ logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", PIMG(uplink->image) );
+ }
}
}
@@ -1076,80 +1244,24 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
* it will be closed first. Otherwise, nothing will happen and true will be returned
* immediately.
*/
-static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
{
if ( uplink->cacheFd != -1 ) {
if ( !force ) return true;
close( uplink->cacheFd );
}
uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 );
+ uplink->image->problem.write = uplink->cacheFd == -1;
return uplink->cacheFd != -1;
}
/**
- * Saves the cache map of the given image.
- * Return true on success.
- * Locks on: imageListLock, image.lock
+ * Returns true if the uplink has been idle for some time (apart from
+ * background replication, if it is set to hashblock, or if it has
+ * a minimum number of active clients configured that is not currently
+ * reached)
*/
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
-{
- dnbd3_image_t *image = uplink->image;
- assert( image != NULL );
-
- if ( uplink->cacheFd != -1 ) {
- if ( fsync( uplink->cacheFd ) == -1 ) {
- // A failing fsync means we have no guarantee that any data
- // since the last fsync (or open if none) has been saved. Apart
- // from keeping the cache map from the last successful fsync
- // around and restoring it there isn't much we can do to recover
- // a consistent state. Bail out.
- logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
- logadd( LOG_ERROR, "Bailing out immediately" );
- exit( 1 );
- }
- }
-
- dnbd3_cache_map_t *cache = ref_get_cachemap( image );
- if ( cache == NULL )
- return true;
- logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
- const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
- assert( image->path != NULL );
- char mapfile[strlen( image->path ) + 4 + 1];
- strcpy( mapfile, image->path );
- strcat( mapfile, ".map" );
-
- int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
- if ( fd == -1 ) {
- const int err = errno;
- ref_put( &cache->reference );
- logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
- return false;
- }
-
- size_t done = 0;
- while ( done < size ) {
- const ssize_t ret = write( fd, cache->map + done, size - done );
- if ( ret == -1 ) {
- if ( errno == EINTR ) continue;
- logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
- break;
- }
- if ( ret <= 0 ) {
- logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
- break;
- }
- done += (size_t)ret;
- }
- ref_put( &cache->reference );
- if ( fsync( fd ) == -1 ) {
- logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
- }
- close( fd );
- return true;
-}
-
-static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink)
{
return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
@@ -1165,3 +1277,44 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
return false;
return altservers_toString( current, buffer, len );
}
+
+/**
+ * Get number of replication requests that should be sent right now to
+ * meet the configured bgrWindowSize. Returns 0 if any client requests
+ * are pending.
+ * This applies a sort of "slow start" in case the uplink was recently
+ * dealing with actual client requests, in that the uplink's idle time
+ * (in seconds) is an upper bound for the number returned, so we don't
+ * saturate the uplink with loads of requests right away, in case that
+ * client triggers more requests to the uplink server.
+ */
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink)
+{
+ int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 );
+ if ( uplink->queueLen == 0 )
+ return ret;
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->clients == NULL ) {
+ ret--;
+ } else {
+ ret = 0; // Do not allow BGR if client requests are being handled
+ break;
+ }
+ }
+ mutex_unlock( &uplink->queueLock );
+ return ret;
+}
+
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle)
+{
+ mutex_lock( &uplink->queueLock );
+ for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+ if ( it->handle == handle ) {
+ it->sent = false;
+ break;
+ }
+ }
+ mutex_unlock( &uplink->queueLock );
+}
+
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 49ff0b4..b6037d6 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -2,7 +2,7 @@
#define _UPLINK_H_
#include "globals.h"
-#include "../types.h"
+#include <dnbd3/types.h>
void uplink_globalsInit();
@@ -10,9 +10,11 @@ uint64_t uplink_getTotalBytesReceived();
bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version);
-void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
+void uplink_removeEntry(dnbd3_uplink_t *uplink, void *data, uplink_callback callback);
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
+bool uplink_requestClient(dnbd3_client_t *client, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
+
+bool uplink_request(dnbd3_image_t *image, void *data, uplink_callback callback, uint64_t handle, uint64_t start, uint32_t length);
bool uplink_shutdown(dnbd3_image_t *image);
diff --git a/src/serverconfig.h b/src/serverconfig.h
deleted file mode 100644
index 239f0a2..0000000
--- a/src/serverconfig.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef _SERVERCONFIG_H_
-#define _SERVERCONFIG_H_
-
-#include "config.h"
-
-// +++++ Performance/memory related
-#define SERVER_MAX_CLIENTS 4000
-#define SERVER_MAX_IMAGES 5000
-#define SERVER_MAX_ALTS 50
-// +++++ Uplink handling (proxy mode)
-#define SERVER_GLOBAL_DUP_TIME 6 // How many seconds to wait before changing global fail counter again
-#define SERVER_BAD_UPLINK_MIN 10 // Thresold for fails at which we start ignoring the server occasionally
-#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
-#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
-#define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
-#define SERVER_MAX_UPLINK_QUEUE 1500 // Maximum number of queued requests per uplink
-#define SERVER_UPLINK_QUEUELEN_THRES 900 // Threshold where we start dropping incoming clients
-#define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
-
-#define SERVER_CACHE_MAP_SAVE_INTERVAL 90
-
-// Time in ms to wait for a read/write call to complete on an uplink connection
-#define SOCKET_TIMEOUT_UPLINK 5000
-// Same for client connections. Be a bit more liberal here
-#define SOCKET_TIMEOUT_CLIENT 15000
-// When waiting for the next request header from client, allow the timeout from above
-// to expire this many times. This allows for greater idle times without also increasing
-// the timeout for cases where we wait for additional data or are actively sending a reply
-#define SOCKET_TIMEOUT_CLIENT_RETRIES 3
-
-#define SERVER_UPLINK_KEEPALIVE_INTERVAL 10 // (Seconds) Send keep-alive if nothing else is happening on the uplink
-#define SERVER_UPLINK_IDLE_TIMEOUT 1800 // (Seconds) Timeout after which we tear down an uplink connection if no blocks needed to be fetched
-
-// +++++ Other magic constants
-#define SERVER_RTT_PROBES 5 // How many probes to average over
-#define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes
-#define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes
-#define SERVER_RTT_MAX_UNREACH 10 // If no server was reachable this many times, stop RTT measurements for a while
-#define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times
-
-#define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes
-
-// Which is the minimum protocol version the server expects from the client
-#define MIN_SUPPORTED_CLIENT 2
-// Same for when we're a proxy talking to another server
-#define MIN_SUPPORTED_SERVER 2
-
-// Length of comment fields (for alt server etc.)
-#define COMMENT_LENGTH 120
-
-#define RTT_THRESHOLD_FACTOR(us) (((us) * 2) / 3) // 2/3 = current to best must be 33% worse
-#define RTT_UNREACHABLE 0x7FFFFFFu // Use this value for timeout/unreachable as RTT. Don't set too high or you might get overflows. 0x7FFFFFF = 134 seconds
-
-// How many seconds have to pass after the last client disconnected until the imagefd is closed
-#define UNUSED_FD_TIMEOUT 3600
-
-#endif
-
diff --git a/src/shared/CMakeLists.txt b/src/shared/CMakeLists.txt
new file mode 100644
index 0000000..a1bd49a
--- /dev/null
+++ b/src/shared/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-shared
+ LANGUAGES C)
+
+# find atomic library required by dnbd3-shared
+find_package(Stdatomic REQUIRED)
+find_package(Libatomic REQUIRED)
+
+# add compile option to get POLLRDHUP support for signals
+add_definitions(-D_GNU_SOURCE)
+
+# translation units that are compiled into the library
+set(DNBD3_SHARED_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/crc32.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/log.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sockhelper.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/timing.c)
+# these files are not passed to add_library() below, only to the linter
+# targets (presumably they get #included by fdsignal.c - confirm)
+set(DNBD3_SHARED_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/eventfd.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/pipe64.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/fdsignal.inc/pipe_malloc.c)
+
+# static library; the include directory is PUBLIC, so it is also applied
+# to targets that link against dnbd3-shared
+add_library(dnbd3-shared STATIC ${DNBD3_SHARED_SOURCE_FILES})
+target_include_directories(dnbd3-shared PUBLIC ${PROJECT_INCLUDE_DIR})
+
+# lint and lint-fix targets over both file lists
+# (add_linter/add_linter_fix are not defined in this file)
+add_linter(dnbd3-shared-lint "${DNBD3_SHARED_SOURCE_FILES}" "${DNBD3_SHARED_HEADER_FILES}")
+add_linter_fix(dnbd3-shared-lint-fix "${DNBD3_SHARED_SOURCE_FILES}" "${DNBD3_SHARED_HEADER_FILES}")
diff --git a/src/shared/crc32.c b/src/shared/crc32.c
index db941d3..6cf9a18 100644
--- a/src/shared/crc32.c
+++ b/src/shared/crc32.c
@@ -38,24 +38,23 @@
*/
-#include "../types.h"
+#include <dnbd3/types.h>
#include <stddef.h>
-#define FAR
+#if defined(__x86_64__) || defined(__amd64__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <stdatomic.h>
+#define zalign(n) __attribute__((aligned(n)))
+#endif
+
#define OF(args) args
-#define local static
/* Definitions for doing the crc four data bytes at a time. */
-#if !defined(NOBYFOUR)
-# define BYFOUR
-#endif
-#ifdef BYFOUR
-# define TBLS 8
-#else
-# define TBLS 1
-#endif /* BYFOUR */
+#define TBLS 8
-local const uint32_t crc_table[TBLS][256] =
+static const uint32_t crc_table[TBLS][256] =
{
{
0x00000000U, 0x77073096U, 0xee0e612cU, 0x990951baU, 0x076dc419U,
@@ -110,7 +109,6 @@ local const uint32_t crc_table[TBLS][256] =
0xcdd70693U, 0x54de5729U, 0x23d967bfU, 0xb3667a2eU, 0xc4614ab8U,
0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
0x2d02ef8dU
-#ifdef BYFOUR
},
{
0x00000000U, 0x191b3141U, 0x32366282U, 0x2b2d53c3U, 0x646cc504U,
@@ -489,38 +487,159 @@ local const uint32_t crc_table[TBLS][256] =
0x95e6b8b1U, 0x7b490da3U, 0x1e2eb11bU, 0x483ed243U, 0x2d596efbU,
0xc3f6dbe9U, 0xa6916751U, 0x1fa9b0ccU, 0x7ace0c74U, 0x9461b966U,
0xf10605deU
-#endif
}
};
-#ifdef NO_ENDIAN
-// Currently not in use, always use the BYFOUR method with known endianness
-/* ========================================================================= */
-#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+#define PCLMUL_MIN_LEN 64
+#define PCLMUL_ALIGN 16
+#define PCLMUL_ALIGN_MASK 15
-/* ========================================================================= */
-uint32_t crc32(crc, buf, len)
- uint32_t crc;
- const uint8_t *buf;
- size_t len;
+#if defined(__x86_64__) || defined(__amd64__)
+/* crc32_simd.c
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ *
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+/**
+ * Fold len bytes at buf into the running CRC-32 value crc, using
+ * carry-less multiplication (PCLMULQDQ).
+ *
+ * @param crc running CRC value; it is XORed into the first 16 byte block
+ *        as passed in, no inversion is applied in here
+ * @param buf input data, read with unaligned loads
+ * @param len number of bytes to process. Must be >= 64, as the first 64
+ *        bytes are loaded unconditionally. Only whole blocks of 16 bytes
+ *        are consumed, so a trailing remainder of less than 16 bytes
+ *        would be ignored.
+ * @return updated CRC value, again without any final inversion
+ *
+ * The target attribute enables pclmul/sse4.1 code generation for this
+ * function only, so callers have to make sure the CPU supports both
+ * before calling it.
+ */
+static uint32_t
+__attribute__((target("pclmul,sse4.1")))
+crc32pclmul(uint32_t crc, const uint8_t *buf, size_t len)
{
- if (buf == NULL) return 0;
+ /*
+ * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+ * the CRC32+Barrett polynomials given at the end of the paper.
+ */
+ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+ static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+ static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+ /*
+ * There's at least one block of 64.
+ */
+ x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+ x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+ x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+ x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+ x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+
+ x0 = _mm_load_si128((__m128i *)k1k2);
+
+ buf += 64;
+ len -= 64;
- crc = crc ^ 0xffffffffU;
- while (len >= 8) {
- DO8;
- len -= 8;
+ /*
+ * Parallel fold blocks of 64, if any.
+ */
+ while (len >= 64)
+ {
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+ x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+ x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+ x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+ x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+
+ y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+ y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+ y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+ y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+ x1 = _mm_xor_si128(x1, x5);
+ x2 = _mm_xor_si128(x2, x6);
+ x3 = _mm_xor_si128(x3, x7);
+ x4 = _mm_xor_si128(x4, x8);
+
+ x1 = _mm_xor_si128(x1, y5);
+ x2 = _mm_xor_si128(x2, y6);
+ x3 = _mm_xor_si128(x3, y7);
+ x4 = _mm_xor_si128(x4, y8);
+
+ buf += 64;
+ len -= 64;
}
- if (len) do {
- DO1;
- } while (--len);
- return crc ^ 0xffffffffU;
+
+ /*
+ * Fold into 128-bits.
+ */
+ x0 = _mm_load_si128((__m128i *)k3k4);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x2);
+ x1 = _mm_xor_si128(x1, x5);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x3);
+ x1 = _mm_xor_si128(x1, x5);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x4);
+ x1 = _mm_xor_si128(x1, x5);
+
+ /*
+ * Single fold blocks of 16, if any.
+ */
+ while (len >= 16)
+ {
+ x2 = _mm_loadu_si128((__m128i *)buf);
+
+ x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+ x1 = _mm_xor_si128(x1, x2);
+ x1 = _mm_xor_si128(x1, x5);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Fold 128-bits to 64-bits.
+ */
+ x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+ x3 = _mm_setr_epi32(~0, 0, ~0, 0);
+ x1 = _mm_srli_si128(x1, 8);
+ x1 = _mm_xor_si128(x1, x2);
+
+ x0 = _mm_loadl_epi64((__m128i*)k5k0);
+
+ x2 = _mm_srli_si128(x1, 4);
+ x1 = _mm_and_si128(x1, x3);
+ x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+ x1 = _mm_xor_si128(x1, x2);
+
+ /*
+ * Barrett reduce to 32-bits.
+ */
+ x0 = _mm_load_si128((__m128i*)poly);
+
+ x2 = _mm_and_si128(x1, x3);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+ x2 = _mm_and_si128(x2, x3);
+ x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+ x1 = _mm_xor_si128(x1, x2);
+
+ /*
+ * Return the crc32.
+ */
+ return _mm_extract_epi32(x1, 1);
}
#endif
-#ifdef BYFOUR
-
/*
This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
integer pointer type. This violates the strict aliasing rule, where a
@@ -533,7 +652,7 @@ uint32_t crc32(crc, buf, len)
writes to the buffer that is passed to these routines.
*/
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
/* ========================================================================= */
#define DOLIT4 c ^= *buf4++; \
c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
@@ -547,25 +666,36 @@ uint32_t crc32(crc, buf, len)
size_t len;
{
if (buf == NULL) return 0;
- register uint32_t c;
- register const uint32_t FAR *buf4;
+ uint32_t c;
c = ~crc;
- while (len && ((uintptr_t)buf & 3)) {
+ while (len && ((uintptr_t)buf & PCLMUL_ALIGN_MASK)) {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
len--;
}
-
- buf4 = (const uint32_t FAR *)(const void FAR *)buf;
- while (len >= 32) {
- DOLIT32;
- len -= 32;
- }
- while (len >= 4) {
- DOLIT4;
- len -= 4;
+#if defined(__x86_64__) || defined(__amd64__)
+ static atomic_int pclmul = -1;
+ if (pclmul == -1) {
+ pclmul = __builtin_cpu_supports("pclmul") && __builtin_cpu_supports("sse4.1");
}
- buf = (const uint8_t FAR *)buf4;
+ if (pclmul && len >= PCLMUL_MIN_LEN) {
+ c = crc32pclmul(c, buf, len & ~PCLMUL_ALIGN_MASK);
+ buf += len & ~PCLMUL_ALIGN_MASK;
+ len &= PCLMUL_ALIGN_MASK;
+ } else
+#endif
+ do {
+ const uint32_t *buf4 = (const uint32_t *)(const void *)buf;
+ while (len >= 32) {
+ DOLIT32;
+ len -= 32;
+ }
+ while (len >= 4) {
+ DOLIT4;
+ len -= 4;
+ }
+ buf = (const uint8_t *)buf4;
+ } while (0);
if (len) do {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
@@ -575,7 +705,7 @@ uint32_t crc32(crc, buf, len)
}
#endif
-#ifdef BIG_ENDIAN
+#ifdef DNBD3_BIG_ENDIAN
/* ========================================================================= */
#define DOBIG4 c ^= *buf4++; \
c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
@@ -590,7 +720,7 @@ uint32_t crc32(crc, buf, len)
{
if (buf == NULL) return 0;
register uint32_t c;
- register const uint32_t FAR *buf4;
+ register const uint32_t *buf4;
c = ~net_order_32(crc);
while (len && ((uintptr_t)buf & 3)) {
@@ -598,7 +728,7 @@ uint32_t crc32(crc, buf, len)
len--;
}
- buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+ buf4 = (const uint32_t *)(const void *)buf;
while (len >= 32) {
DOBIG32;
len -= 32;
@@ -607,7 +737,7 @@ uint32_t crc32(crc, buf, len)
DOBIG4;
len -= 4;
}
- buf = (const uint8_t FAR *)buf4;
+ buf = (const uint8_t *)buf4;
if (len) do {
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
@@ -617,5 +747,3 @@ uint32_t crc32(crc, buf, len)
}
#endif
-#endif /* BYFOUR */
-
diff --git a/src/shared/crc32.h b/src/shared/crc32.h
deleted file mode 100644
index 00b8bdd..0000000
--- a/src/shared/crc32.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CRC32_H_
-#define _CRC32_H_
-
-#include <stdint.h>
-
-uint32_t crc32(uint32_t crc, const uint8_t *buf, size_t len);
-
-#endif
-
diff --git a/src/shared/fdsignal.c b/src/shared/fdsignal.c
index 087b6f1..1db59bd 100644
--- a/src/shared/fdsignal.c
+++ b/src/shared/fdsignal.c
@@ -1,4 +1,4 @@
-#include "fdsignal.h"
+#include <dnbd3/shared/fdsignal.h>
#if defined(__linux__)
//#warning "Using eventfd based signalling"
diff --git a/src/shared/fdsignal.h b/src/shared/fdsignal.h
deleted file mode 100644
index 960a2a9..0000000
--- a/src/shared/fdsignal.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef _FD_SIGNAL_H_
-#define _FD_SIGNAL_H_
-
-#define SIGNAL_OK (0)
-#define SIGNAL_TIMEOUT (-2)
-#define SIGNAL_ERROR (-1)
-
-typedef struct _dnbd3_signal dnbd3_signal_t;
-
-/**
- * Create a new signal, nonblocking.
- * @return NULL on error, pointer to dnbd3_signal_t on success.
- */
-dnbd3_signal_t* signal_new();
-
-/**
- * Create a new signal, blocking.
- * @return NULL on error, pointer to dnbd3_signal_t on success.
- */
-dnbd3_signal_t* signal_newBlocking();
-
-/**
- * Trigger the given signal, so a wait or clear call will succeed.
- * @return SIGNAL_OK on success, SIGNAL_ERROR on error
- */
-int signal_call(const dnbd3_signal_t* const signal);
-
-/**
- * Wait for given signal, with an optional timeout.
- * If timeout == 0, just poll once.
- * If timeout < 0, wait forever.
- * @return > 0 telling how many times the signal was called,
- * SIGNAL_TIMEOUT if the timeout was reached,
- * SIGNAL_ERROR if some error occured
- */
-int signal_wait(const dnbd3_signal_t* const signal, int timeoutMs);
-
-/**
- * Clears any pending signals on this signal.
- * @return number of signals that were pending,
- * SIGNAL_ERROR if some error occured
- */
-int signal_clear(const dnbd3_signal_t* const signal);
-
-/**
- * Close the given signal.
- */
-void signal_close(const dnbd3_signal_t* const signal);
-
-/**
- * Get a file descriptor for the given signal that can be
- * waited on using poll or similar.
- * @return -1 if the signal is invalid
- */
-int signal_getWaitFd(const dnbd3_signal_t* const signal);
-
-#endif
diff --git a/src/shared/log.c b/src/shared/log.c
index 055acb4..3a4739d 100644
--- a/src/shared/log.c
+++ b/src/shared/log.c
@@ -3,7 +3,7 @@
*
* Copyright(c) 2011-2012 Simon Rettberg
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,7 +18,7 @@
*
*/
-#include "log.h"
+#include <dnbd3/shared/log.h>
#include <stdarg.h>
#include <pthread.h>
#include <stdlib.h>
@@ -36,6 +36,7 @@ static _Atomic logmask_t maskCon = 15;
static char *logFile = NULL;
static int logFd = -1;
+static FILE *logOutStream;
static bool consoleTimestamps = false;
@@ -43,6 +44,10 @@ static bool consoleTimestamps = false;
static int writeLevel(char *buffer, logmask_t level);
+/**
+ * Initialize the logging unit by directing console output to stdout.
+ * logOutStream has no initializer, and logadd() passes it straight to
+ * fputs() when writing to the console, so this has to be called before
+ * the first console log line is emitted.
+ */
+void log_init(void) {
+ logOutStream = stdout;
+}
+
bool log_hasMask(const logmask_t mask)
{
return ( ( maskFile | maskCon ) & mask ) == mask;
@@ -63,6 +68,15 @@ void log_setConsoleTimestamps(bool on)
consoleTimestamps = on;
}
+/**
+ * Select the stream that console log lines are written to.
+ * @param outputStream either stdout or stderr; anything else is rejected
+ * @return 0 on success, -EINVAL if outputStream is neither stdout nor
+ *         stderr, in which case the current stream is left unchanged
+ */
+int log_setConsoleOutputStream(FILE *outputStream)
+{
+ if ( outputStream != stdout && outputStream != stderr )
+ return -EINVAL;
+
+ logOutStream = outputStream;
+ return 0;
+}
+
bool log_openLogFile(const char *path)
{
pthread_mutex_lock( &logLock );
@@ -93,10 +107,10 @@ void logadd(const logmask_t mask, const char *fmt, ...)
struct tm timeinfo;
char buffer[LINE_LEN];
bool toFile = maskFile & mask;
- bool toStdout = maskCon & mask;
+ bool toOutStream = maskCon & mask;
size_t offset;
- if ( toFile || ( toStdout && consoleTimestamps ) ) {
+ if ( toFile || ( toOutStream && consoleTimestamps ) ) {
time( &rawtime );
localtime_r( &rawtime, &timeinfo );
offset = strftime( buffer, LINE_LEN, "[%d.%m. %H:%M:%S] ", &timeinfo );
@@ -134,15 +148,11 @@ void logadd(const logmask_t mask, const char *fmt, ...)
}
pthread_mutex_unlock( &logLock );
}
- if ( toStdout ) {
- if ( consoleTimestamps ) stdoutLine = buffer;
-#ifdef AFL_MODE
- fputs( stdoutLine, stderr );
- fflush( stderr );
-#else
- fputs( stdoutLine, stdout );
- fflush( stdout );
-#endif
+ if ( toOutStream ) {
+ if ( consoleTimestamps )
+ stdoutLine = buffer;
+ fputs( stdoutLine, logOutStream );
+ fflush( logOutStream );
}
}
diff --git a/src/shared/log.h b/src/shared/log.h
deleted file mode 100644
index 5b1e8f7..0000000
--- a/src/shared/log.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Simon Rettberg
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef LOG_H_
-#define LOG_H_
-
-#include <stdbool.h>
-#include <unistd.h>
-
-typedef unsigned int logmask_t;
-#define LOG_ERROR ((logmask_t)1) // Fatal error, server will terminate
-#define LOG_WARNING ((logmask_t)2) // Major issue, something is broken but keep running
-#define LOG_MINOR ((logmask_t)4) // Minor issue, more of a hickup than serious problem
-#define LOG_INFO ((logmask_t)8) // Informational message
-#define LOG_DEBUG1 ((logmask_t)16) // Debug information, use this for non-spammy stuff
-#define LOG_DEBUG2 ((logmask_t)32) // Use this for debug messages that will show up a lot
-
-
-/**
- * Check if cansoleMask | fileMask has all of mask set.
- */
-bool log_hasMask(const logmask_t mask);
-
-void log_setFileMask(logmask_t mask);
-
-void log_setConsoleMask(logmask_t mask);
-
-void log_setConsoleTimestamps(bool on);
-
-/**
- * Open or reopen the log file. If path is NULL and the
- * function was called with a path before, the same path
- * will be used again.
- */
-bool log_openLogFile(const char *path);
-
-/**
- * Add a line to the log
- */
-void logadd(const logmask_t mask, const char *text, ...)
- __attribute__ ((format (printf, 2, 3)));
-
-/**
- * Return last size bytes of log.
- */
-ssize_t log_fetch(char *buffer, int size);
-
-#endif /* LOG_H_ */
diff --git a/src/shared/protocol.h b/src/shared/protocol.h
deleted file mode 100644
index 2b21c21..0000000
--- a/src/shared/protocol.h
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef _PROTOCOL_H_
-#define _PROTOCOL_H_
-
-#include "sockhelper.h"
-
-#include "../types.h"
-#include "../serialize.h"
-
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/uio.h>
-
-// Client tells server that it is another server
-#define FLAGS8_SERVER (1)
-// Client (which is a proxy) tells server that it has background-replication enabled
-#define FLAGS8_BG_REP (2)
-
-// 2017-10-16: We now support hop-counting, macro to pass hop count conditinally to a function
-#define COND_HOPCOUNT(vers,hopcount) ( (vers) >= 3 ? (hopcount) : 0 )
-
-// 2017-11-02: Macro to set flags in select image message properly if we're a server, as BG_REP depends on global var
-#define SI_SERVER_FLAGS ( (uint8_t)( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) ) )
-
-#define REPLY_OK (0)
-#define REPLY_ERRNO (-1)
-#define REPLY_AGAIN (-2)
-#define REPLY_INTR (-3)
-#define REPLY_CLOSED (-4)
-#define REPLY_INCOMPLETE (-5)
-#define REPLY_WRONGMAGIC (-6)
-
-static inline int dnbd3_read_reply(int sock, dnbd3_reply_t *reply, bool wait)
-{
- ssize_t ret = recv( sock, reply, sizeof(*reply), (wait ? MSG_WAITALL : MSG_DONTWAIT) | MSG_NOSIGNAL );
- if ( ret == 0 ) return REPLY_CLOSED;
- if ( ret < 0 ) {
- if ( errno == EAGAIN || errno == EWOULDBLOCK ) return REPLY_AGAIN;
- if ( errno == EINTR ) return REPLY_INTR;
- return REPLY_ERRNO;
- }
- if ( !wait && ret != sizeof(*reply) ) ret += recv( sock, ((char*)reply) + ret, sizeof(*reply) - ret, MSG_WAITALL | MSG_NOSIGNAL );
- if ( ret != sizeof(*reply) ) return REPLY_INCOMPLETE;
- fixup_reply( *reply );
- if ( reply->magic != dnbd3_packet_magic ) return REPLY_WRONGMAGIC;
- return REPLY_OK;
-}
-
-static inline bool dnbd3_get_reply(int sock, dnbd3_reply_t *reply)
-{
- int ret;
- do {
- ret = dnbd3_read_reply( sock, reply, true );
- } while ( ret == REPLY_INTR );
- return ret == REPLY_OK;
-}
-
-static inline bool dnbd3_select_image(int sock, const char *name, uint16_t rid, uint8_t flags8)
-{
- serialized_buffer_t serialized;
- dnbd3_request_t request;
- struct iovec iov[2];
- serializer_reset_write( &serialized );
- serializer_put_uint16( &serialized, PROTOCOL_VERSION );
- serializer_put_string( &serialized, name );
- serializer_put_uint16( &serialized, rid );
- serializer_put_uint8( &serialized, flags8 );
- const ssize_t len = serializer_get_written_length( &serialized );
- request.magic = dnbd3_packet_magic;
- request.cmd = CMD_SELECT_IMAGE;
- request.size = (uint32_t)len;
-#ifdef _DEBUG
- request.handle = 0;
- request.offset = 0;
-#endif
- fixup_request( request );
- iov[0].iov_base = &request;
- iov[0].iov_len = sizeof(request);
- iov[1].iov_base = &serialized;
- iov[1].iov_len = len;
- ssize_t ret;
- do {
- ret = writev( sock, iov, 2 );
- } while ( ret == -1 && errno == EINTR );
- return ret == len + (ssize_t)sizeof(request);
-}
-
-static inline bool dnbd3_get_block(int sock, uint64_t offset, uint32_t size, uint64_t handle, uint8_t hopCount)
-{
- dnbd3_request_t request;
- request.magic = dnbd3_packet_magic;
- request.handle = handle;
- request.cmd = CMD_GET_BLOCK;
- // When writing before "fixup", we can get away with assigning to offset instead of offset_small if we
- // do it before assigning to .hops. Faster on 64bit machines (so, on everything)
- request.offset = offset;
- request.hops = hopCount;
- request.size = size;
- fixup_request( request );
- return sock_sendAll( sock, &request, sizeof(request), 2 ) == (ssize_t)sizeof(request);
-}
-
-static inline bool dnbd3_get_crc32(int sock, uint32_t *master, void *buffer, size_t *bufferLen)
-{
- dnbd3_request_t request;
- dnbd3_reply_t reply;
- request.magic = dnbd3_packet_magic;
- request.handle = 0;
- request.cmd = CMD_GET_CRC32;
- request.offset = 0;
- request.size = 0;
- fixup_request( request );
- if ( sock_sendAll( sock, &request, sizeof(request), 2 ) != (ssize_t)sizeof(request) ) return false;
- if ( !dnbd3_get_reply( sock, &reply ) ) return false;
- if ( reply.size == 0 ) {
- *bufferLen = 0;
- return true;
- }
- if ( reply.size < 4 ) return false;
- reply.size -= 4;
- if ( reply.cmd != CMD_GET_CRC32 || reply.size > *bufferLen ) return false;
- *bufferLen = reply.size;
- if ( sock_recv( sock, master, sizeof(uint32_t) ) != (ssize_t)sizeof(uint32_t) ) return false;
- return sock_recv( sock, buffer, reply.size ) == (ssize_t)reply.size;
-}
-
-/**
- * Pass a full serialized_buffer_t and a socket fd. Parsed data will be returned in further arguments.
- * Note that all strings will point into the passed buffer, so there's no need to free them.
- * This function will also read the header for you, as this message can only occur during connection,
- * where no unrequested messages could arrive inbetween.
- */
-static inline bool dnbd3_select_image_reply(serialized_buffer_t *buffer, int sock, uint16_t *protocol_version, char **name, uint16_t *rid,
- uint64_t *imageSize)
-{
- errno = 0;
- dnbd3_reply_t reply;
- if ( !dnbd3_get_reply( sock, &reply ) ) {
- return false;
- }
- errno = 0;
- if ( reply.cmd != CMD_SELECT_IMAGE || reply.size < 3 || reply.size > MAX_PAYLOAD ) {
- return false;
- }
- // receive reply payload
- ssize_t ret = sock_recv( sock, buffer, reply.size );
- if ( ret != (ssize_t)reply.size ) {
- return false;
- }
- // handle/check reply payload
- serializer_reset_read( buffer, reply.size );
- *protocol_version = serializer_get_uint16( buffer );
- *name = serializer_get_string( buffer );
- *rid = serializer_get_uint16( buffer );
- *imageSize = serializer_get_uint64( buffer );
- return true;
-}
-
-#endif
diff --git a/src/serialize.c b/src/shared/serialize.c
index 0bc0dcd..1f7cddd 100644
--- a/src/serialize.c
+++ b/src/shared/serialize.c
@@ -1,6 +1,6 @@
-#include "serialize.h"
-#include "types.h"
-
+// SPDX-License-Identifier: GPL-2.0
+#include <dnbd3/shared/serialize.h>
+#include <dnbd3/types.h>
void serializer_reset_read(serialized_buffer_t *buffer, size_t data_len)
{
@@ -16,14 +16,17 @@ void serializer_reset_write(serialized_buffer_t *buffer)
uint8_t serializer_get_uint8(serialized_buffer_t *buffer)
{
- if (buffer->buffer_pointer + 1 > buffer->buffer_end) return 0;
+ if (buffer->buffer_pointer + 1 > buffer->buffer_end)
+ return 0;
return (uint8_t)*buffer->buffer_pointer++;
}
uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
{
uint16_t ret;
- if (buffer->buffer_pointer + 2 > buffer->buffer_end) return 0;
+
+ if (buffer->buffer_pointer + 2 > buffer->buffer_end)
+ return 0;
memcpy(&ret, buffer->buffer_pointer, 2);
buffer->buffer_pointer += 2;
return net_order_16(ret);
@@ -32,7 +35,9 @@ uint16_t serializer_get_uint16(serialized_buffer_t *buffer)
uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
{
uint64_t ret;
- if (buffer->buffer_pointer + 8 > buffer->buffer_end) return 0;
+
+ if (buffer->buffer_pointer + 8 > buffer->buffer_end)
+ return 0;
memcpy(&ret, buffer->buffer_pointer, 8);
buffer->buffer_pointer += 8;
return net_order_64(ret);
@@ -41,22 +46,29 @@ uint64_t serializer_get_uint64(serialized_buffer_t *buffer)
+/**
+ * Read a NUL-terminated string from the current read position.
+ * The returned pointer points into the buffer itself; on success the
+ * read position is moved behind the terminating NUL byte.
+ * @return start of the string, or NULL if the read position is already
+ *         at the end of the data, or if no NUL byte is found before
+ *         buffer_end (possibly corrupted/malicious packet)
+ */
char *serializer_get_string(serialized_buffer_t *buffer)
{
char *ptr = buffer->buffer_pointer, *start = buffer->buffer_pointer;
- if (ptr >= buffer->buffer_end) return NULL;
- while (ptr < buffer->buffer_end && *ptr) ++ptr;
- if (*ptr) return NULL; // String did not terminate within buffer (possibly corrupted/malicious packet)
+
+ if (ptr >= buffer->buffer_end)
+ return NULL;
+ while (ptr < buffer->buffer_end && *ptr)
+ ++ptr;
+ // String did not terminate within buffer (possibly corrupted/malicious packet).
+ // Check the position rather than *ptr: if the loop ran into buffer_end, ptr
+ // points behind the readable data and must not be dereferenced.
+ if (ptr >= buffer->buffer_end)
+ return NULL;
buffer->buffer_pointer = ptr + 1;
return start;
}
void serializer_put_uint8(serialized_buffer_t *buffer, uint8_t value)
{
- if (buffer->buffer_pointer + 1 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 1 > buffer->buffer_end)
+ return;
*buffer->buffer_pointer++ = (char)value;
}
void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
{
- if (buffer->buffer_pointer + 2 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 2 > buffer->buffer_end)
+ return;
value = net_order_16(value);
memcpy(buffer->buffer_pointer, &value, 2);
buffer->buffer_pointer += 2;
@@ -64,7 +76,8 @@ void serializer_put_uint16(serialized_buffer_t *buffer, uint16_t value)
void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
{
- if (buffer->buffer_pointer + 8 > buffer->buffer_end) return;
+ if (buffer->buffer_pointer + 8 > buffer->buffer_end)
+ return;
value = net_order_64(value);
memcpy(buffer->buffer_pointer, &value, 8);
buffer->buffer_pointer += 8;
@@ -73,12 +86,14 @@ void serializer_put_uint64(serialized_buffer_t *buffer, uint64_t value)
void serializer_put_string(serialized_buffer_t *buffer, const char *value)
{
const size_t len = strlen(value) + 1;
- if (buffer->buffer_pointer + len > buffer->buffer_end) return;
+
+ if (buffer->buffer_pointer + len > buffer->buffer_end)
+ return;
memcpy(buffer->buffer_pointer, value, len);
buffer->buffer_pointer += len;
}
uint32_t serializer_get_written_length(serialized_buffer_t *buffer)
{
- return (uint32_t)( buffer->buffer_pointer - buffer->buffer );
+ return (uint32_t)(buffer->buffer_pointer - buffer->buffer);
}
diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c
index ec80659..5096320 100644
--- a/src/shared/sockhelper.c
+++ b/src/shared/sockhelper.c
@@ -1,6 +1,8 @@
-#include "sockhelper.h"
-#include "log.h"
+#include <dnbd3/shared/sockhelper.h>
+#include <dnbd3/shared/log.h>
+#include <dnbd3/types.h>
#include <arpa/inet.h> // inet_ntop
+#include <netinet/tcp.h>
#include <netdb.h>
#include <stdio.h>
#include <unistd.h>
@@ -19,8 +21,7 @@ struct _poll_list {
int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms)
{
// TODO: Move out of here, this unit should contain general socket functions
- // TODO: Abstract away from sockaddr_in* like the rest of the functions here do,
- // so WITH_IPV6 can finally be removed as everything is transparent. b- but how?
+ // TODO: Abstract away from sockaddr_in* like the rest of the functions here
struct sockaddr_storage ss;
int proto, addrlen;
memset( &ss, 0, sizeof ss );
@@ -32,9 +33,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
addr4->sin_port = addr->port;
proto = PF_INET;
addrlen = sizeof *addr4;
- }
-#ifdef WITH_IPV6
- else if ( addr->type == HOST_IP6 ) {
+ } else if ( addr->type == HOST_IP6 ) {
// Set host (IPv6)
struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)&ss;
addr6->sin6_family = AF_INET6;
@@ -42,9 +41,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
addr6->sin6_port = addr->port;
proto = PF_INET6;
addrlen = sizeof *addr6;
- }
-#endif
- else {
+ } else {
logadd( LOG_DEBUG1, "Unsupported address type: %d\n", (int)addr->type );
errno = EAFNOSUPPORT;
return -1;
@@ -57,11 +54,13 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
} else {
sock_setTimeout( client_sock, connect_ms );
}
- int e2;
+ // NODELAY makes sense for the client side, which should be all users in this code base
+ int e2 = 1;
+ setsockopt( client_sock, IPPROTO_TCP, TCP_NODELAY, (void *)&e2, sizeof(e2) );
for ( int i = 0; i < 5; ++i ) {
int ret = connect( client_sock, (struct sockaddr *)&ss, addrlen );
e2 = errno;
- if ( ret != -1 || errno == EINPROGRESS || errno == EISCONN ) break;
+ if ( ret != -1 || ( connect_ms == -1 && errno == EINPROGRESS ) || errno == EISCONN ) break;
if ( errno == EINTR ) {
// http://www.madore.org/~david/computers/connect-intr.html
#ifdef __linux__
@@ -77,7 +76,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
errno = e2;
return -1;
}
- sockaddr_storage junk;
+ struct sockaddr_storage junk;
socklen_t more_junk = sizeof(junk);
if ( getpeername( client_sock, (struct sockaddr*)&junk, &more_junk ) == -1 ) {
e2 = errno;
@@ -165,7 +164,7 @@ bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
memcpy( host->addr, &addr4->sin_addr, 4 );
return true;
}
-#ifdef WITH_IPV6
+
if ( sa->sa_family == AF_INET6 ) {
// Set host (IPv6)
struct sockaddr_in6 *addr6 = (struct sockaddr_in6*)sa;
@@ -174,7 +173,7 @@ bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host)
memcpy( host->addr, &addr6->sin6_addr, 16 );
return true;
}
-#endif
+
return false;
}
@@ -242,7 +241,10 @@ size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLe
outlen = snprintf( output, len, "[%s]:%s", host, port );
}
}
- if ( outlen <= 0 ) return 0;
+ if ( outlen <= 0 ) {
+ output[0] = '\0';
+ return 0;
+ }
return MIN( (size_t)outlen, len-1 );
}
@@ -346,7 +348,7 @@ int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_m
if ( i != list->count ) list->entry[i] = list->entry[list->count];
if ( fd != -1 ) {
sock_set_block( fd );
- if ( rw_ms != -1 && rw_ms != connect_ms ) {
+ if ( rw_ms != -1 ) {
sock_setTimeout( fd, rw_ms );
}
return fd;
diff --git a/src/shared/sockhelper.h b/src/shared/sockhelper.h
deleted file mode 100644
index 8d70789..0000000
--- a/src/shared/sockhelper.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef SOCKHELPER_H_
-#define SOCKHELPER_H_
-
-/*
- * Helper functions for dealing with sockets. These functions should
- * abstract from the IP version by using getaddrinfo() and thelike.
- */
-
-#include "../types.h"
-#include <stdint.h>
-#include <sys/socket.h>
-#include <string.h>
-
-typedef struct _poll_list poll_list_t;
-
-/**
- * Connect to given dnbd3_host_t.
- * @param addr - address of host to connect to
- * @param connect_ms - timeout in milliseconds after which the connection attempt fails
- * @param rw_ms - read/write timeout in milliseconds to apply on successful connect
- * @return socket file descriptor, or -1 on error
- */
-int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const int rw_ms);
-
-/**
- * Resolve/parse given address and put the result(s) into passed dnbd3_host_t array,
- * but only up to count entries.
- * @return Number of items added to array
- */
-int sock_resolveToDnbd3Host(const char * const address, dnbd3_host_t * const dest, const int count);
-
-bool sock_sockaddrToDnbd3(struct sockaddr* sa, dnbd3_host_t *host);
-
-void sock_setTimeout(const int sockfd, const int milliseconds);
-
-size_t sock_printHost(const dnbd3_host_t * const host, char *output, const size_t len);
-
-size_t sock_printable(const struct sockaddr * const addr, const socklen_t addrLen, char *output, const size_t len);
-
-/**
- * Create new poll list.
- */
-poll_list_t* sock_newPollList();
-
-/**
- * Delete a poll list, closing all sockets first if necessary.
- */
-void sock_destroyPollList(poll_list_t *list);
-
-/**
- * Listen on all interfaces/available IP addresses, using the given protocol.
- * IPv4 and IPv6 are supported.
- * @param protocol_family PF_INET or PF_INET6
- * @param port port to listen on
- * @return true if any listen call was successful
- */
-bool sock_listenAny(poll_list_t* list, uint16_t port);
-
-/**
- * Listen on a specific address and port.
- * @param bind_addr human readable address to bind to for listening
- * @param port to listen on
- */
-bool sock_listen(poll_list_t* list, char* bind_addr, uint16_t port);
-
-/**
- * Asynchroneously connect to multiple hosts.
- * This can be called multiple times with varying timeouts. Calling it
- * the first time on an empty list is identical to sock_connect(). On
- * consecutive calls, more nonblocking sockets in connecting state will
- * be added to the list, and on each of these calls, all the pending
- * sockets will be checked for successful connection (or error), respecting
- * the passed timeout.
- * host can be NULL to just wait on the sockets already in the list.
- * If at least one socket completed the connection
- * within the given timeout, it will be removed from the list and
- * returned. On error or timeout, -1 is returned. If there are no more sockets
- * in the list, -2 is returned.
- */
-int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_ms, int rw_ms);
-
-/**
- * This is a multi-socket version of accept. Pass in an array of listening sockets.
- * If any of the sockets has an incoming connection, accept it and return the new socket's fd.
- * On error, return -1, just like accept().
- * @param sockets array of listening socket fds
- * @param socket_count number of sockets in that array
- * @return fd of new client socket, -1 on error
- */
-int sock_accept(poll_list_t *list, struct sockaddr_storage *addr, socklen_t *length_ptr);
-
-void sock_set_nonblock(int sock);
-
-void sock_set_block(int sock);
-
-/**
- * Add given socket to array. Take an existing empty slot ( == -1) if available,
- * append to end otherwise. Updates socket count variable passed by reference.
- *
- * @param poll_list_t list the poll list to add the socket to
- * @param sock socket fd to add
- * @param wantRead whether to set the EPOLLIN flag
- * @param wantWrite whether to set the EPOLLOUT flag
- * @return true on success, false iff the array is already full or socket is < 0
- */
-bool sock_append(poll_list_t *list, const int sock, bool wantRead, bool wantWrite);
-
-/**
- * Send the whole buffer, calling write() multiple times if neccessary.
- * Give up after calling write() maxtries times.
- * Set maxtries < 0 to try infinitely.
- */
-ssize_t sock_sendAll(const int sock, const void *buffer, const size_t len, int maxtries);
-
-/**
- * Send given buffer, repeatedly calling recv on partial send or EINTR.
- */
-ssize_t sock_recv(const int sock, void *buffer, const size_t len);
-
-#endif /* SOCKHELPER_H_ */
diff --git a/src/shared/timing.c b/src/shared/timing.c
index 4ca1002..bdb8388 100644
--- a/src/shared/timing.c
+++ b/src/shared/timing.c
@@ -1,4 +1,4 @@
-#include "timing.h"
+#include <dnbd3/shared/timing.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
diff --git a/src/shared/timing.h b/src/shared/timing.h
deleted file mode 100644
index f23bfeb..0000000
--- a/src/shared/timing.h
+++ /dev/null
@@ -1,162 +0,0 @@
-#ifndef _D_TIMING_H
-#define _D_TIMING_H
-
-#ifndef _POSIX_C_SOURCE
-#define _POSIX_C_SOURCE 199309L
-#endif
-
-#include <time.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef CLOCK_MONOTONIC_RAW
-#define BEST_CLOCK_SOURCE CLOCK_MONOTONIC_RAW
-#else
-#define BEST_CLOCK_SOURCE CLOCK_MONOTONIC
-#endif
-
-typedef struct timespec ticks;
-
-extern struct timespec basetime;
-
-/**
- * Assign src to dst while adding secs seconds.
- */
-#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + (secs); (dst)->tv_nsec = (src)->tv_nsec; } while (0)
-
-/**
- * Define variable now, initialize to timing_get.
- */
-#define declare_now ticks now; timing_get( &now )
-
-/**
- * Call this once to calibrate on startup.
- * Although overflows of CLOCK_MONOTONIC(_RAW) should
- * by definition never happen, we still have a fixed size
- * int that could at some point. By forcing the counter
- * to start at 0 on startup the point of overflow
- * will be very far in the future (decades for 32bit time_t,
- * end of universe for 64bit).
- */
-void timing_setBase();
-
-/**
- * Internal, do not use. Moved to another function
- * to prevent inlining of error handling code, which
- * should be very unlikely to ever trigger.
- */
-_Noreturn void timing_abort();
-
-/**
- * Get current time. Shortcut for clock_gettime with error check.
- */
-static inline void timing_get(ticks* retval)
-{
- if ( clock_gettime( BEST_CLOCK_SOURCE, retval ) == -1 ) timing_abort();
- retval->tv_sec -= basetime.tv_sec;
-}
-
-/**
- * Get a ticks instance somewhere in the future.
- * Useful for timeouts.
- */
-static inline void timing_gets(ticks* retval, int32_t addSeconds)
-{
- timing_get( retval );
- retval->tv_sec += addSeconds;
-}
-
-static inline void timing_addSeconds(ticks* retval, ticks* base, int32_t addSeconds)
-{
- retval->tv_sec = base->tv_sec + addSeconds;
- retval->tv_nsec = base->tv_nsec;
-}
-
-/**
- * Check whether given timeout is reached.
- * Might trigger up to one second early.
- */
-static inline bool timing_reached(const ticks* timeout, const ticks* now)
-{
- return now->tv_sec >= timeout->tv_sec;
-}
-#define timing_1le2(one,two) timing_reached(one,two)
-
-/**
- * Precise check whether given timeout has been reached.
- */
-static inline bool timing_reachedPrecise(const ticks* timeout, const ticks* now)
-{
- return now->tv_sec > timeout->tv_sec
- || (now->tv_sec == timeout->tv_sec && now->tv_nsec > timeout->tv_nsec);
-}
-
-/**
- * Shortcut for above. Useful if not used in loop.
- * Might trigger up to one second early.
- */
-static inline bool timing_isReached(const ticks* timeout)
-{
- ticks now;
- timing_get( &now );
- return timing_reached( timeout, &now );
-}
-/**
- * Shortcut for above. Useful if not used in loop.
- */
-static inline bool timing_isReachedPrecise(const ticks* timeout)
-{
- ticks now;
- timing_get( &now );
- return timing_reachedPrecise( timeout, &now );
-}
-
-
-/**
- * Get difference between two ticks, rounded down to seconds.
- * Make sure you pass the arguments in the proper order. If
- * end is before start, 0 will always be returned.
- */
-static inline uint32_t timing_diff(const ticks *start, const ticks *end)
-{
- if ( end->tv_sec <= start->tv_sec ) return 0;
- return (uint32_t)( ( end->tv_sec - start->tv_sec )
- + ( start->tv_nsec > end->tv_nsec ? -1 : 0 ) );
-}
-
-/**
- * Get difference between two ticks, rounded down to milliseconds.
- * Same as above; passing arguments in reverse will always return 0.
- */
-static inline uint64_t timing_diffMs(const ticks *start, const ticks *end)
-{
- if ( end->tv_sec < start->tv_sec ) return 0;
- uint64_t diff = (uint64_t)( end->tv_sec - start->tv_sec ) * 1000;
- if ( start->tv_nsec >= end->tv_nsec ) {
- if ( diff == 0 ) return 0;
- diff -= (start->tv_nsec - end->tv_nsec) / 1000000;
- } else {
- diff += (end->tv_nsec - start->tv_nsec) / 1000000;
- }
- return diff;
-}
-
-/**
- * Get difference between two ticks, rounded down to microseconds.
- * Same as above; passing arguments in reverse will always return 0.
- */
-static inline uint64_t timing_diffUs(const ticks *start, const ticks *end)
-{
- if ( end->tv_sec < start->tv_sec ) return 0;
- uint64_t diff = (uint64_t)( end->tv_sec - start->tv_sec ) * 1000000;
- if ( start->tv_nsec >= end->tv_nsec ) {
- if ( diff == 0 ) return 0;
- diff -= ( start->tv_nsec - end->tv_nsec ) / 1000;
- } else {
- diff += ( end->tv_nsec - start->tv_nsec ) / 1000;
- }
- return diff;
-}
-
-
-#endif
diff --git a/src/types.h b/src/types.h
deleted file mode 100644
index cb0ccfd..0000000
--- a/src/types.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef TYPES_H_
-#define TYPES_H_
-
-#include "config.h"
-#ifndef KERNEL_MODULE
-#include <stdint.h>
-#include <stdbool.h>
-#endif
-
-#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-#ifndef MAX
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-#endif
-
-#ifdef __GNUC__
-#define UNUSED __attribute__ ((unused))
-#else
-#error "Please add define for your compiler for UNUSED, or define to nothing for your compiler if not supported"
-#endif
-
-#if defined(__GNUC__) && __GNUC__ >= 3
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#else
-#define likely(x) (x)
-#define unlikely(x) (x)
-#endif
-
-#ifdef __linux__
-#define HAVE_THREAD_NAMES
-#endif
-
-#ifdef __FreeBSD__
-#ifndef MSG_MORE
-#define MSG_MORE 0
-#endif
-#ifndef POLLRDHUP
-#define POLLRDHUP 0x2000
-#endif
-#include <netinet/in.h>
-#endif
-
-#ifdef AFL_MODE
-#define send(a,b,c,d) write(a,b,c)
-#define recv(a,b,c,d) read(a,b,c)
-#endif
-
-
-// ioctl
-#define DNBD3_MAGIC 'd'
-#define IOCTL_OPEN _IO(0xab, 1)
-#define IOCTL_CLOSE _IO(0xab, 2)
-#define IOCTL_SWITCH _IO(0xab, 3)
-#define IOCTL_ADD_SRV _IO(0xab, 4)
-#define IOCTL_REM_SRV _IO(0xab, 5)
-
-#if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-#define dnbd3_packet_magic ((uint16_t)( (0x73 << 8) | (0x72) ))
-// Flip bytes around on big endian when putting stuff on the net
-#define net_order_64(a) ((uint64_t)((((a) & 0xFFull) << 56) | (((a) & 0xFF00ull) << 40) | (((a) & 0xFF0000ull) << 24) | (((a) & 0xFF000000ull) << 8) | (((a) & 0xFF00000000ull) >> 8) | (((a) & 0xFF0000000000ull) >> 24) | (((a) & 0xFF000000000000ull) >> 40) | (((a) & 0xFF00000000000000ull) >> 56)))
-#define net_order_32(a) ((uint32_t)((((a) & (uint32_t)0xFF) << 24) | (((a) & (uint32_t)0xFF00) << 8) | (((a) & (uint32_t)0xFF0000) >> 8) | (((a) & (uint32_t)0xFF000000) >> 24)))
-#define net_order_16(a) ((uint16_t)((((a) & (uint16_t)0xFF) << 8) | (((a) & (uint16_t)0xFF00) >> 8)))
-#define fixup_request(a) do { \
- (a).cmd = net_order_16((a).cmd); \
- (a).size = net_order_32((a).size); \
- (a).offset = net_order_64((a).offset); \
-} while (0)
-#define fixup_reply(a) do { \
- (a).cmd = net_order_16((a).cmd); \
- (a).size = net_order_32((a).size); \
-} while (0)
-#define ENDIAN_MODE "Big Endian"
-#ifndef BIG_ENDIAN
-#define BIG_ENDIAN
-#endif
-#elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64)
-#define dnbd3_packet_magic ((uint16_t)( (0x73) | (0x72 << 8) ))
-// Make little endian our network byte order as probably 99.999% of machines this will be used on are LE
-#define net_order_64(a) (a)
-#define net_order_32(a) (a)
-#define net_order_16(a) (a)
-#define fixup_request(a) while(0)
-#define fixup_reply(a) while(0)
-#define ENDIAN_MODE "Little Endian"
-#ifndef LITTLE_ENDIAN
-#define LITTLE_ENDIAN
-#endif
-#else
-#error "Unknown Endianness"
-#endif
-
-typedef uint8_t dnbd3_af;
-
-static const dnbd3_af HOST_NONE = (dnbd3_af)0;
-static const dnbd3_af HOST_IP4 = (dnbd3_af)2;
-static const dnbd3_af HOST_IP6 = (dnbd3_af)10;
-
-typedef struct __attribute__((packed)) dnbd3_host_t
-{
- uint8_t addr[16]; // 16byte (network representation, so it can be directly passed to socket functions)
- uint16_t port; // 2byte (network representation, so it can be directly passed to socket functions)
- dnbd3_af type; // 1byte (ip version. HOST_IP4 or HOST_IP6. 0 means this struct is empty and should be ignored)
-} dnbd3_host_t;
-
-typedef struct __attribute__((packed))
-{
- uint16_t len;
- dnbd3_host_t host;
- uint16_t imgnamelen;
- char *imgname;
- int rid;
- int read_ahead_kb;
- uint8_t use_server_provided_alts;
-} dnbd3_ioctl_t;
-
-// network
-#define CMD_GET_BLOCK 1
-#define CMD_SELECT_IMAGE 2
-#define CMD_GET_SERVERS 3
-#define CMD_ERROR 4
-#define CMD_KEEPALIVE 5
-#define CMD_LATEST_RID 6
-#define CMD_SET_CLIENT_MODE 7
-#define CMD_GET_CRC32 8
-
-#define DNBD3_REQUEST_SIZE 24
-typedef struct __attribute__((packed))
-{
- uint16_t magic; // 2byte
- uint16_t cmd; // 2byte
- uint32_t size; // 4byte
- union {
- struct {
-#ifdef LITTLE_ENDIAN
- uint64_t offset_small:56; // 7byte
- uint8_t hops; // 1byte
-#elif defined(BIG_ENDIAN)
- uint8_t hops; // 1byte
- uint64_t offset_small:56; // 7byte
-#endif
- };
- uint64_t offset; // 8byte
- };
- uint64_t handle; // 8byte
-} dnbd3_request_t;
-_Static_assert( sizeof(dnbd3_request_t) == DNBD3_REQUEST_SIZE, "dnbd3_request_t is messed up" );
-
-#define DNBD3_REPLY_SIZE 16
-typedef struct __attribute__((packed))
-{
- uint16_t magic; // 2byte
- uint16_t cmd; // 2byte
- uint32_t size; // 4byte
- uint64_t handle; // 8byte
-} dnbd3_reply_t;
-_Static_assert( sizeof(dnbd3_reply_t) == DNBD3_REPLY_SIZE, "dnbd3_reply_t is messed up" );
-
-typedef struct __attribute__((packed))
-{
- dnbd3_host_t host;
- uint8_t failures; // 1byte (number of times server has been consecutively unreachable)
-} dnbd3_server_entry_t;
-
-#endif /* TYPES_H_ */
diff --git a/src/version.c.in b/src/version.c.in
deleted file mode 100644
index ab937a2..0000000
--- a/src/version.c.in
+++ /dev/null
@@ -1,3 +0,0 @@
-
-const char * VERSION_STRING = "@VERSION@";
-
diff --git a/src/version.h b/src/version.h
deleted file mode 100644
index 0c4a66b..0000000
--- a/src/version.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef VERSION_H_
-#define VERSION_H_
-
-extern const char *VERSION_STRING;
-
-// This is done in a little weird way but otherwise eclipse complains about
-// unresolvable symbols etc...
-#include "version.c"
-
-#endif /* VERSION_H_ */