5 files changed, 82 insertions, 41 deletions
diff --git a/.github/workflows/test-cow-fuse.yml b/.github/workflows/test-cow-fuse.yml
index 9b42532..46b6933 100644
--- a/.github/workflows/test-cow-fuse.yml
+++ b/.github/workflows/test-cow-fuse.yml
@@ -146,6 +146,6 @@ jobs:
             /mnt/work/logs/randomLog1.out \
             /mnt/work/logs/randomTestLog1.out \
             /mnt/work/logs/randomLog2.out \
-            /mnt/work/tmp/status.txt \
+            /mnt/work/tmp/status \
             ${{ github.workspace }}/build/src/server/dnbd3.log \
             ${{ github.workspace }}/../cow_server/cow_merger_service/publish/log.out
diff --git a/cmake/FindKernelHeaders.cmake b/cmake/FindKernelHeaders.cmake
index c04243e..ce71aa8 100644
--- a/cmake/FindKernelHeaders.cmake
+++ b/cmake/FindKernelHeaders.cmake
@@ -31,10 +31,10 @@ find_path(KernelHeaders_INCLUDE_DIR
 
 # get Linux kernel headers version
 file(READ "${KERNEL_BUILD_DIR}/include/generated/utsrelease.h" tmpvar)
-string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION ${tmpvar})
+string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${tmpvar}")
 if("${KernelHeaders_VERSION}" EQUAL "")
     file(READ "${KERNEL_BUILD_DIR}/include/config/kernel.release" tmpvar)
-    string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION ${tmpvar})
+    string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION "${tmpvar}")
 endif()
 if("${KernelHeaders_VERSION}" EQUAL "")
     string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KernelHeaders_VERSION ${KernelHeaders_INCLUDE_DIR})
diff --git a/src/fuse/cowDoc/readme.md b/src/fuse/cowDoc/readme.md
index 2b752bb..51a0052 100644
--- a/src/fuse/cowDoc/readme.md
+++ b/src/fuse/cowDoc/readme.md
@@ -24,12 +24,12 @@ A typical use case is updating or adding software to an existing image.
 - `- L <path>` Similar to `-c <path>`, but instead of creating a new session, an existing one is loaded from the specified path.
 - `-m` the client requests a merge after the image has been unmounted and all changes have been uploaded.
 
-- `--cowStatFile` creates a status file at the same location as the data and meta file. The file contains information about the current session, for more information see [here](#status).
-- `--cowStatStdout` similar to `--cowStatFile` but the information will be printed in the stdout.
+- `--cow-stats-file` creates a status file at the same location as the data and meta file. The file contains information about the current session, for more information see [here](#status).
+- `--cow-stats-stdout` similar to `--cow-stats-file` but the information will be printed in the stdout.
 
 Example parameters for creating a new cow session:
 ```
-./dnbd3-fuse "/home/user/VMs/mount" -f -h localhost -i imagename -c "/home/user/temp" -C "192.168.178.20:5000" --cowStatStdout -m
+./dnbd3-fuse "/home/user/VMs/mount" -f -h localhost -i imagename -c "/home/user/temp" -C "192.168.178.20:5000" --cow-stats-stdout -m
 
 ```
 
@@ -117,11 +117,11 @@ There are two different limits for the number of parallel uploads in the [config
 
 ## Files
 
-When a new CoW session is started, a new `meta`, `data` and, if so set in the command line arguments, a `status.txt` file is created.
+When a new CoW session is started, a new `meta`, `data` and, if so set in the command line arguments, a `status` file is created.
 
 ### status
 
-The file `status.txt` can be activated with the command line parameter `--cowStatFile`.
+The file `status` can be activated with the command line parameter `--cow-stats-file`.
 
 The file will contain the following:
 
@@ -147,9 +147,9 @@ It is `done` when the image has been unmounted and all clusters have been upload
 - `ulspeed` the current upload speed in kb/s.
 
 Once all clusters have been uploaded, the status is set to `done`.
-If you define `COW_DUMP_BLOCK_UPLOADS`, a list of all clusters, sorted by the number of uploads, is copied to the status.txt file after the cluster upload is completed.
+If you define `COW_DUMP_BLOCK_UPLOADS`, a list of all clusters, sorted by the number of uploads, is copied to the status file after the cluster upload is completed.
 
-With the command line parameter `--cowStatStdout` the same output of the stats file will be printed in stdout.
+With the command line parameter `--cow-stats-stdout` the same output of the stats file will be printed in stdout.
 
 ### meta
 
@@ -231,7 +231,7 @@ The following configuration variables have been added to `config/cow.h`.
 - `COW_MIN_UPLOAD_DELAY` sets the minimum time in seconds that must have elapsed since the last change to a cow cluster before it is uploaded.
 This value can be fine-tuned. A larger value usually reduces redundant uploading of clusters.
 A smaller value reduces the time for the final upload after the image has been unmounted.
-If you set `COW_DUMP_BLOCK_UPLOADS` and set the command line parameter `--cowStatFile`, then a list of all clusters, sorted by the number of uploads, will be written to the status.txt file after the cluster upload is complete.
+If you set `COW_DUMP_BLOCK_UPLOADS` and set the command line parameter `--cow-stats-file`, then a list of all clusters, sorted by the number of uploads, will be written to the status file after the cluster upload is complete.
 This can help in fine-tuning `COW_MIN_UPLOAD_DELAY`.
 - `COW_STATS_UPDATE_TIME` defines the update frequency of the stdout output/statistics file in seconds. Setting it too low could affect performance as a loop runs over all clusters.
 - `COW_MAX_PARALLEL_BACKGROUND_UPLOADS` defines the maximum number of parallel cluster uploads. This number is used when the image is still mounted and the user is still using it.
diff --git a/src/fuse/cowfile.c b/src/fuse/cowfile.c
index b183202..a5d6e4d 100644
--- a/src/fuse/cowfile.c
+++ b/src/fuse/cowfile.c
@@ -261,6 +261,10 @@ static size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb,
 {
 	cow_curl_read_upload_t *uploadBlock = (cow_curl_read_upload_t *)userdata;
 	size_t len = 0;
+
+	if ( size * nmemb < DNBD3_BLOCK_SIZE ) {
+		logadd( LOG_INFO, "Wow, curl read callback with %d bytes left", (int)( size * nmemb ) );
+	}
 	// Check if we're still in the bitfield
 	if ( uploadBlock->position < COW_BITFIELD_SIZE ) {
 		size_t lenCpy = MIN( COW_BITFIELD_SIZE - uploadBlock->position, size * nmemb );
@@ -276,8 +280,9 @@ static size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb,
 		ssize_t spaceLeft = ( size * nmemb ) - len;
 		// Only read blocks that have been written to the cluster. Saves bandwidth. Not optimal since
 		// we do a lot of 4k/32k reads, but it's not that performance critical I guess...
-		while ( spaceLeft >= (ssize_t)DNBD3_BLOCK_SIZE && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) {
+		while ( spaceLeft > 0 && inClusterOffset < (off_t)COW_DATA_CLUSTER_SIZE ) {
 			int bitNumber = (int)( inClusterOffset / DNBD3_BLOCK_SIZE );
+			uint32_t blockOffset = (uint32_t)( inClusterOffset % DNBD3_BLOCK_SIZE );
 			size_t readSize;
 			// Small performance hack: All bits one in a byte, do a 32k instead of 4k read
 			// TODO: preadv with a large iov, reading unchanged blocks into a trash-buffer
@@ -288,8 +293,13 @@ static size_t curlReadCallbackUploadBlock( char *ptr, size_t size, size_t nmemb,
 			} else {
 				readSize = DNBD3_BLOCK_SIZE;
 			}
+			readSize -= blockOffset;
+			if ( (ssize_t)readSize > spaceLeft ) {
+				readSize = spaceLeft;
+			}
 			// If handling single block, check bits in our copy, as global bitfield could change
-			if ( readSize != DNBD3_BLOCK_SIZE || checkBit( uploadBlock->bitfield, bitNumber ) ) {
+			// If uploading 8 blocks at once, check already happened above
+			if ( readSize > DNBD3_BLOCK_SIZE || checkBit( uploadBlock->bitfield, bitNumber ) ) {
 				ssize_t lengthRead = pread( cow.fdData, ( ptr + len ), readSize,
 						uploadBlock->cluster->offset + inClusterOffset );
 				if ( lengthRead == -1 ) {
@@ -424,9 +434,10 @@ static void dumpBlockUploads()
 	}
 	qsort( blockUploads, currentBlock, sizeof( cow_cluster_statistics_t ), cmpfunc );
 
-	dprintf( cow.fdStats, "\n\nclusterNumber: uploads\n==Block Upload Dump===\n" );
+	dprintf( cow.fdStats, "\n\n[BlockStats]\n" );
 	for ( uint64_t i = 0; i < currentBlock; i++ ) {
-		dprintf( cow.fdStats, "%" PRIu64 ": %" PRIu64 " \n", blockUploads[i].clusterNumber, blockUploads[i].uploads );
+		dprintf( cow.fdStats, "%" PRIu64 "=%" PRIu64 " \n",
+				blockUploads[i].clusterNumber, blockUploads[i].uploads );
 	}
 }
 #endif
@@ -454,6 +465,7 @@ static void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t id
 
 	int len = snprintf( buffer, sizeof buffer,
 			"[General]\n"
+			"uuid=%s\n"
 			"state=%s\n"
 			"inQueue=%" PRIu64 "\n"
 			"modifiedClusters=%" PRIu64 "\n"
@@ -461,6 +473,7 @@ static void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t id
 			"totalClustersUploaded=%" PRIu64 "\n"
 			"activeUploads=%i\n"
 			"%s%s\n",
+			metadata->uuid,
 			state, inQueue, modified, idle, totalBlocksUploaded, activeUploads,
 			COW_SHOW_UL_SPEED ? "avgSpeedKb=" : "",
 			speedBuffer );
@@ -478,12 +491,12 @@ static void updateCowStatsFile( uint64_t inQueue, uint64_t modified, uint64_t id
 		// Pad with a bunch of newlines so we don't change the file size all the time
 		ssize_t extra = MIN( 20, (ssize_t)sizeof(buffer) - len - 1 );
 		memset( buffer + len, '\n', extra );
-		lseek( cow.fdStats, 43, SEEK_SET );
-		if ( write( cow.fdStats, buffer, len + extra ) != len + extra ) {
+		if ( pwrite( cow.fdStats, buffer, len + extra, 0 ) != len + extra ) {
 			logadd( LOG_WARNING, "Could not update cow status file" );
 		}
 #ifdef COW_DUMP_BLOCK_UPLOADS
 		if ( !uploadLoop && uploadLoopDone ) {
+			lseek( cow.fdStats, len + extra, SEEK_SET );
 			dumpBlockUploads();
 		}
 #endif
@@ -549,7 +562,7 @@ static size_t curlHeaderCallbackUploadBlock( char *buffer, size_t size, size_t n
 	if ( len < 13 )
 		return len;
 	for ( int i = 0; i < 11; ++i ) {
-		buffer[i] |= 0x60;
+		buffer[i] |= 0x20;
 	}
 	if ( strncmp( buffer, "retry-after:", 12 ) != 0 )
 		return len;
@@ -883,10 +896,10 @@ static bool createCowStatsFile( char *path )
 {
 	char pathStatus[strlen( path ) + 12];
 
-	snprintf( pathStatus, strlen( path ) + 12, "%s%s", path, "/status.txt" );
+	snprintf( pathStatus, strlen( path ) + 12, "%s%s", path, "/status" );
 
 	char buffer[100];
-	int len = snprintf( buffer, 100, "uuid=%s\nstate: active\n", metadata->uuid );
+	int len = snprintf( buffer, 100, "[General]\nuuid=%s\nstate=active\n", metadata->uuid );
 	if ( statStdout ) {
 		logadd( LOG_INFO, "%s", buffer );
 	}
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 5ef4016..fcea31b 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -86,7 +86,7 @@ static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
 		struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_image_info);
 
 static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr,
-		struct socket *sock);
+		struct socket *sock, u64 test_start, u32 test_size);
 
 static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd);
 
@@ -143,6 +143,7 @@ static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic)
 			// Panic freshly turned on
 			dev->panic = true;
 			dev->discover_interval = TIMER_INTERVAL_PROBE_PANIC;
+			dev->discover_count = 0;
 		}
 		spin_unlock_irqrestore(&dev->blk_lock, irqflags);
 		dnbd3_flag_reset(dev->connection_lock);
@@ -192,10 +193,14 @@ static void dnbd3_internal_discover(dnbd3_device_t *dev)
 {
 	struct socket *sock, *best_sock = NULL;
 	dnbd3_alt_server_t *alt;
+	struct request *blk_request;
 	struct sockaddr_storage host_compare, best_server;
 	uint16_t remote_version;
 	ktime_t start, end;
 	unsigned long rtt = 0, best_rtt = 0;
+	u64 test_start = 0;
+	u32 test_size = RTT_BLOCK_SIZE;
+	unsigned long irqflags;
 	int i, j, k, isize, fails, rtt_threshold;
 	int do_change = 0;
 	u8 check_order[NUMBER_SERVERS];
@@ -219,6 +224,10 @@ static void dnbd3_internal_discover(dnbd3_device_t *dev)
 	best_server.ss_family = 0;
 	best_rtt = RTT_UNREACHABLE;
 
+	if (dev->panic) {
+		dnbd3_dev_dbg_host(dev, &host_compare, "Discover in panic mode\n");
+	}
+
 	if (!ready || dev->panic)
 		isize = NUMBER_SERVERS;
 	else
@@ -249,6 +258,24 @@ static void dnbd3_internal_discover(dnbd3_device_t *dev)
 		if (!dnbd3_execute_handshake(dev, sock, &host_compare, &remote_version, false))
 			goto error;
 
+		if (dev->panic) {
+			// In panic mode, use next pending request for testing, this has a higher chance of
+			// filtering out a server which can't actually handle our requests, instead of just
+			// requesting the very first block which should be cached by every server.
+			spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+			if (!list_empty(&dev->send_queue)) {
+				blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+				test_start = blk_rq_pos(blk_request) << 9; /* sectors to bytes */
+				test_size = blk_rq_bytes(blk_request);
+			}
+			spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+		}
+
+		// actual rtt measurement is just the first block request and reply
+		start = ktime_get_real();
+		if (!dnbd3_request_test_block(dev, &host_compare, sock, test_start, test_size))
+			goto error;
+		end = ktime_get_real();
 
 		// panic mode, take first responding server
 		if (dev->panic) {
@@ -259,7 +286,10 @@ static void dnbd3_internal_discover(dnbd3_device_t *dev)
 				// Check global flag, a connect might have been in progress
 				if (best_sock != NULL)
 					sock_release(best_sock);
-				set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000 + 1000);
+				set_socket_timeout(sock, false, MAX(
+							SOCKET_TIMEOUT_RECV * 1000,
+							(int)ktime_ms_delta(end, start)
+						) + 1000);
 				if (dnbd3_set_primary_connection(dev, sock, &host_compare, remote_version) != 0)
 					sock_release(sock);
 				dnbd3_flag_reset(dev->connection_lock);
@@ -267,12 +297,6 @@ static void dnbd3_internal_discover(dnbd3_device_t *dev)
 			}
 		}
 
-		// actual rtt measurement is just the first block requests and reply
-		start = ktime_get_real();
-		if (!dnbd3_request_test_block(dev, &host_compare, sock))
-			goto error;
-		end = ktime_get_real();
-
 		mutex_lock(&dev->alt_servers_lock);
 		if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
 			dev->alt_servers[i].protocol_version = remote_version;
@@ -446,7 +470,9 @@ static void dnbd3_recv_workfn(struct work_struct *work)
 	int remaining;
 	int ret;
 
+	dnbd3_dev_dbg_cur(dev, "starting receive worker...\n");
 	mutex_lock(&dev->recv_mutex);
+	dnbd3_dev_dbg_cur(dev, "receive worker started\n");
 	while (dev->sock) {
 		// receive net reply
 		ret = dnbd3_recv_reply(dev->sock, &reply_hdr);
@@ -594,6 +620,7 @@ static void dnbd3_recv_workfn(struct work_struct *work)
 out_unlock:
 	// This will check if we actually still need a new connection
 	dnbd3_start_discover(dev, true);
+	dnbd3_dev_dbg_cur(dev, "Receive worker exited\n");
 	mutex_unlock(&dev->recv_mutex);
 }
 
@@ -623,7 +650,7 @@ static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_m
 static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket **sock_out)
 {
 	ktime_t start;
-	int ret, connect_time_ms;
+	int ret, connect_time_ms, diff;
 	struct socket *sock;
 	int retries = 4;
 	const int addrlen = addr->ss_family == AF_INET ? sizeof(struct sockaddr_in)
@@ -659,7 +686,7 @@ static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, str
 
 	if (dev->panic && dev->panic_count > 1) {
 		/* in panic mode for some time, start increasing timeouts */
-		connect_time_ms = dev->panic_count * 1000;
+		connect_time_ms = dev->panic_count * 333;
 	} else {
 		/* otherwise, use 2*RTT of current server */
 		connect_time_ms = dev->cur_server.rtt * 2 / 1000;
@@ -667,21 +694,21 @@ static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, str
 	/* but obey a minimal configurable value, and maximum sanity check */
 	if (connect_time_ms < SOCKET_TIMEOUT_SEND * 1000)
 		connect_time_ms = SOCKET_TIMEOUT_SEND * 1000;
-	else if (connect_time_ms > 60000)
-		connect_time_ms = 60000;
+	else if (connect_time_ms > 15000)
+		connect_time_ms = 15000;
 	set_socket_timeout(sock, false, connect_time_ms); // recv
 	set_socket_timeout(sock, true, connect_time_ms); // send
 	start = ktime_get_real();
 	while (--retries > 0) {
 		ret = kernel_connect(sock, (struct sockaddr *)addr, addrlen, 0);
-		connect_time_ms = (int)ktime_ms_delta(ktime_get_real(), start);
-		if (connect_time_ms > 2 * SOCKET_TIMEOUT_SEND * 1000) {
+		diff = (int)ktime_ms_delta(ktime_get_real(), start);
+		if (diff > 2 * connect_time_ms) {
 			/* Either I'm losing my mind or there was a specific build of kernel
 			 * 5.x where SO_RCVTIMEO didn't affect the connect call above, so
 			 * this function would hang for over a minute for unreachable hosts.
-			 * Leave in this debug check for twice the configured timeout
+			 * Leave in this debug check for twice the configured timeout.
 			 */
-			dnbd3_dev_dbg_host(dev, addr, "connect: call took %dms\n",
+			dnbd3_dev_err_host(dev, addr, "connect: call took %dms\n",
 					connect_time_ms);
 		}
 		if (ret != 0) {
@@ -916,23 +943,24 @@ static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int byt
 	return true;
 }
 
-static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket *sock)
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+		struct socket *sock, u64 test_start, u32 test_size)
 {
 	dnbd3_reply_t reply_hdr;
 
 	// Request block
-	if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, 0, RTT_BLOCK_SIZE)) {
+	if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, test_start, test_size)) {
 		dnbd3_err_dbg_host(dev, addr, "requesting test block failed\n");
 		return false;
 	}
 
-	// receive net reply
+	// receive reply header
 	if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
 		dnbd3_err_dbg_host(dev, addr, "receiving test block header packet failed\n");
 		return false;
 	}
 	if (reply_hdr.magic != dnbd3_packet_magic || reply_hdr.cmd != CMD_GET_BLOCK
-			|| reply_hdr.size != RTT_BLOCK_SIZE || reply_hdr.handle != 0) {
+			|| reply_hdr.size != test_size || reply_hdr.handle != 0) {
 		dnbd3_err_dbg_host(dev, addr,
 				"unexpected reply to block request: cmd=%d, size=%d, handle=%llu (discover)\n",
 				(int)reply_hdr.cmd, (int)reply_hdr.size, reply_hdr.handle);
@@ -940,7 +968,7 @@ static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storag
 	}
 
 	// receive data
-	return dnbd3_drain_socket(dev, sock, RTT_BLOCK_SIZE);
+	return dnbd3_drain_socket(dev, sock, test_size);
 }
 #undef dnbd3_err_dbg_host