From b7af3a8c36426811762bf331e3938f9d67b7429e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 2 Aug 2019 16:58:34 +0200
Subject: [SERVER] Make image->users atomic and get rid of some locking

With this change it should be safe to read the users count of an image
without locking first, assuming you already have a reference on the
image or are otherwise sure it cannot be freed, e.g. in an active
uplink. Updating users, or checking whether it's 0 in order to free the
image, should only be done while holding the imageListLock.
---
 src/server/globals.h |  2 +-
 src/server/image.c   | 91 ++++++++++++++++++++++------------------------------
 2 files changed, 40 insertions(+), 53 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index b248800..73eb563 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -117,7 +117,7 @@ struct _dnbd3_image
 	uint32_t masterCrc32;  // CRC-32 of the crc-32 list
 	int readFd;            // used to read the image. Used from multiple threads, so use atomic operations (pread et al)
 	int completenessEstimate; // Completeness estimate in percent
-	int users;             // clients currently using this image
+	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
 	bool working;          // true if image exists and completeness is == 100% or a working upstream proxy is connected
 	uint16_t rid;          // revision of image
diff --git a/src/server/image.c b/src/server/image.c
index bfba6cb..1f12eda 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -267,14 +267,12 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		return NULL ;
 	}
 
-	mutex_lock( &candidate->lock );
-	mutex_unlock( &imageListLock );
 	candidate->users++;
-	mutex_unlock( &candidate->lock );
+	mutex_unlock( &imageListLock );
 
 	// Found, see if it works
-// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
-// TODO: But remember size-changed images forever
+	// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
+	// TODO: But remember size-changed images forever
 	if ( candidate->working || checkIfWorking ) {
 		// Is marked working, but might not have an fd open
 		if ( !image_ensureOpen( candidate ) ) {
@@ -391,17 +389,15 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
  * Every call to image_lock() needs to be followed by a call to image_release() at some point.
  * Locks on: imageListLock, _images[].lock
  */
-dnbd3_image_t* image_lock(dnbd3_image_t *image) // TODO: get rid, fix places that do image->users--
+dnbd3_image_t* image_lock(dnbd3_image_t *image)
 {
 	if ( image == NULL ) return NULL ;
 	int i;
 	mutex_lock( &imageListLock );
 	for (i = 0; i < _num_images; ++i) {
 		if ( _images[i] == image ) {
-			mutex_lock( &image->lock );
-			mutex_unlock( &imageListLock );
 			image->users++;
-			mutex_unlock( &image->lock );
+			mutex_unlock( &imageListLock );
 			return image;
 		}
 	}
@@ -419,12 +415,9 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
 {
 	if ( image == NULL ) return NULL;
 	mutex_lock( &imageListLock );
-	mutex_lock( &image->lock );
 	assert( image->users > 0 );
-	image->users--;
-	bool inUse = image->users != 0;
-	mutex_unlock( &image->lock );
-	if ( inUse ) { // Still in use, do nothing
+	// Decrement and check for 0
+	if ( --image->users != 0 ) { // Still in use, do nothing
 		mutex_unlock( &imageListLock );
 		return NULL;
 	}
@@ -439,7 +432,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
 	}
 	mutex_unlock( &imageListLock );
 	// So it wasn't in the images list anymore either, get rid of it
-	if ( !inUse ) image = image_free( image );
+	image = image_free( image );
 	return NULL;
 }
 
@@ -470,7 +463,6 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image)
 {
 	bool mustFree = false;
 	mutex_lock( &imageListLock );
-	mutex_lock( &image->lock );
 	for ( int i = _num_images - 1; i >= 0; --i ) {
 		if ( _images[i] == image ) {
 			_images[i] = NULL;
@@ -478,7 +470,6 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image)
 		}
 		if ( _images[i] == NULL && i + 1 == _num_images ) _num_images--;
 	}
-	mutex_unlock( &image->lock );
 	mutex_unlock( &imageListLock );
 	if ( mustFree ) image = image_free( image );
 	return image;
@@ -542,18 +533,14 @@ bool image_loadAll(char *path)
 			// Lock again, see if image is still there, free if required
 			mutex_lock( &imageListLock );
 			if ( ret || i >= _num_images || _images[i] == NULL || _images[i]->id != imgId ) continue;
-			// Image needs to be removed
+			// File not readable but still in list -- needs to be removed
 			imgHandle = _images[i];
 			_images[i] = NULL;
 			if ( i + 1 == _num_images ) _num_images--;
-			mutex_lock( &imgHandle->lock );
-			const bool freeImg = ( imgHandle->users == 0 );
-			mutex_unlock( &imgHandle->lock );
-			// We unlocked, but the image has been removed from the list already, so
-			// there's no way the users-counter can increase at this point.
-			if ( freeImg ) {
+			if ( imgHandle->users == 0 ) {
 				// Image is not in use anymore, free the dangling entry immediately
-				mutex_unlock( &imageListLock ); // image_free might do several fs operations; unlock
+				mutex_unlock( &imageListLock ); // image_free locks on this, and
+				// might do several fs operations; unlock
 				image_free( imgHandle );
 				mutex_lock( &imageListLock );
 			}
@@ -581,7 +568,7 @@ bool image_tryFreeAll()
 {
 	mutex_lock( &imageListLock );
 	for (int i = _num_images - 1; i >= 0; --i) {
-		if ( _images[i] != NULL && _images[i]->users == 0 ) { // XXX Data race...
+		if ( _images[i] != NULL && _images[i]->users == 0 ) {
 			dnbd3_image_t *image = _images[i];
 			_images[i] = NULL;
 			mutex_unlock( &imageListLock );
@@ -1506,7 +1493,7 @@ json_t* image_getListAsJson()
 	int i;
 	char uplinkName[100] = { 0 };
 	uint64_t bytesReceived;
-	int users, completeness, idleTime;
+	int completeness, idleTime;
 	declare_now;
 
 	mutex_lock( &imageListLock );
@@ -1514,8 +1501,6 @@ json_t* image_getListAsJson()
 		if ( _images[i] == NULL ) continue;
 		dnbd3_image_t *image = _images[i];
 		mutex_lock( &image->lock );
-		mutex_unlock( &imageListLock );
-		users = image->users;
 		idleTime = (int)timing_diff( &image->atime, &now );
 		completeness = image_getCompletenessEstimate( image );
 		if ( image->uplink == NULL ) {
@@ -1527,14 +1512,13 @@ json_t* image_getListAsJson()
 				uplinkName[0] = '\0';
 			}
 		}
-		image->users++; // Prevent freeing after we unlock
 		mutex_unlock( &image->lock );
 
 		jsonImage = json_pack( "{sisssisisisisI}",
 				"id", image->id, // id, name, rid never change, so access them without locking
 				"name", image->name,
 				"rid", (int) image->rid,
-				"users", users,
+				"users", image->users,
 				"complete",  completeness,
 				"idle", idleTime,
 				"size", (json_int_t)image->virtualFilesize );
@@ -1546,8 +1530,6 @@ json_t* image_getListAsJson()
 		}
 		json_array_append_new( imagesJson, jsonImage );
 
-		image = image_release( image ); // Since we did image->users++;
-		mutex_lock( &imageListLock );
 	}
 	mutex_unlock( &imageListLock );
 	return imagesJson;
@@ -1669,7 +1651,7 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
  * TODO: Store last access time of images. Currently the
  * last access time is reset to the file modification time
  * on server restart. Thus it will
- * currently only delete images if server uptime is > 10 hours.
+ * currently only delete images if server uptime is > 24 hours.
  * This can be overridden by setting force to true, in case
  * free space is desperately needed.
  * Return true iff enough space is available. false in random other cases
@@ -1693,34 +1675,39 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
 				(int)(size / (1024 * 1024)) );
 		// Find least recently used image
 		dnbd3_image_t *oldest = NULL;
-		int i; // XXX improve locking
+		int i;
+		mutex_lock( &imageListLock );
 		for (i = 0; i < _num_images; ++i) {
-			if ( _images[i] == NULL ) continue;
-			dnbd3_image_t *current = image_lock( _images[i] );
+			dnbd3_image_t *current = _images[i];
 			if ( current == NULL ) continue;
-			if ( current->users == 1 ) { // Just from the lock above
+			if ( current->users == 0 ) { // Not in use :-)
 				if ( oldest == NULL || timing_1le2( &current->atime, &oldest->atime ) ) {
 					// Oldest access time so far
 					oldest = current;
 				}
 			}
-			current = image_release( current );
+		}
+		if ( oldest != NULL ) {
+			oldest->users++;
+		}
+		mutex_unlock( &imageListLock );
+		if ( oldest == NULL ) {
+			logadd( LOG_INFO, "All images are currently in use :-(" );
+			return false;
 		}
 		declare_now;
-		if ( oldest == NULL || ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) ) {
-			if ( oldest == NULL ) {
-				logadd( LOG_INFO, "All images are currently in use :-(" );
-			} else {
-				logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" );
-			}
+		if ( !_sparseFiles && timing_diff( &oldest->atime, &now ) < 86400 ) {
+			logadd( LOG_INFO, "Won't free any image, all have been in use in the past 24 hours :-(" );
+			image_release( oldest ); // We did users++ above; image might have to be freed entirely
 			return false;
 		}
-		oldest = image_lock( oldest );
-		if ( oldest == NULL ) continue; // Image freed in the meantime? Try again
 		logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid );
-		char *filename = strdup( oldest->path );
-		oldest = image_remove( oldest );
-		oldest = image_release( oldest );
+		char *filename = strdup( oldest->path ); // Copy name as we remove the image first
+		oldest = image_remove( oldest ); // Remove from list first...
+		oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed
+		// Technically the image might have been grabbed again, but chances for
+		// this should be close to zero anyways since the image went unused for more than 24 hours..
+		// Proper fix would be a "delete" flag in the image struct that will be checked in image_free
 		unlink( filename );
 		size_t len = strlen( filename ) + 10;
 		char buffer[len];
@@ -1747,7 +1734,6 @@ void image_closeUnusedFd()
 		if ( image == NULL )
 			continue;
 		mutex_lock( &image->lock );
-		mutex_unlock( &imageListLock );
 		if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) {
 			snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid );
 			fd = image->readFd;
@@ -1757,10 +1743,11 @@ void image_closeUnusedFd()
 		}
 		mutex_unlock( &image->lock );
 		if ( fd != -1 ) {
+			mutex_unlock( &imageListLock );
 			close( fd );
 			logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr );
+			mutex_lock( &imageListLock );
 		}
-		mutex_lock( &imageListLock );
 	}
 	mutex_unlock( &imageListLock );
 }
-- 
cgit v1.2.3-55-g7522


From 77499f086631d0f6eeb96a3e0391cf72eb40ff5e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 3 Aug 2019 16:35:02 +0200
Subject: [SERVER] Atomicize some global flags

---
 src/server/globals.h   | 2 +-
 src/server/integrity.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 73eb563..7e5ff04 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -65,7 +65,7 @@ struct _dnbd3_connection
 	dnbd3_host_t betterServer;  // The better server
 	uint8_t *recvBuffer;        // Buffer for receiving payload
 	uint32_t recvBufferLen;     // Len of ^^
-	volatile bool shutdown;     // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
+	atomic_bool shutdown;       // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
 	bool replicatedLastBlock;   // bool telling if the last block has been replicated yet
 	bool cycleDetected;         // connection cycle between proxies detected for current remote server
 	int nextReplicationIndex;   // Which index in the cache map we should start looking for incomplete blocks at
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 8f17855..a66a364 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -29,7 +29,7 @@ static queue_entry checkQueue[CHECK_QUEUE_SIZE];
 static pthread_mutex_t integrityQueueLock;
 static pthread_cond_t queueSignal;
 static int queueLen = -1;
-static volatile bool bRunning = false;
+static atomic_bool bRunning = false;
 
 static void* integrity_main(void *data);
 
-- 
cgit v1.2.3-55-g7522


From c5795aa1f76a35a9b02ce07f145d650a92cfeb86 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 5 Aug 2019 12:46:22 +0200
Subject: [SERVER] Switch threadpool back to spinlock, add idle thread counter

---
 src/server/threadpool.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'src')

diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index dac0980..c01ae7a 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -16,13 +16,14 @@ static void *threadpool_worker(void *entryPtr);
 static pthread_attr_t threadAttrs;
 
 static int maxIdleThreads = -1;
+static atomic_int currentIdleThreads = 0;
 static entry_t *pool = NULL;
-static pthread_mutex_t poolLock;
+static pthread_spinlock_t poolLock;
 
 bool threadpool_init(int maxIdle)
 {
 	if ( maxIdle < 0 || maxIdleThreads >= 0 ) return false;
-	mutex_init( &poolLock );
+	pthread_spin_init( &poolLock, PTHREAD_PROCESS_PRIVATE );
 	maxIdleThreads = maxIdle;
 	pthread_attr_init( &threadAttrs );
 	pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
@@ -33,24 +34,29 @@ void threadpool_close()
 {
 	_shutdown = true;
 	if ( maxIdleThreads < 0 ) return;
-	mutex_lock( &poolLock );
+	pthread_spin_lock( &poolLock );
 	maxIdleThreads = -1;
 	entry_t *ptr = pool;
+	pool = NULL;
+	currentIdleThreads = 0;
+	pthread_spin_unlock( &poolLock );
 	while ( ptr != NULL ) {
 		entry_t *current = ptr;
 		ptr = ptr->next;
 		signal_call( current->signal );
 	}
-	mutex_unlock( &poolLock );
-	mutex_destroy( &poolLock );
+	pthread_spin_destroy( &poolLock );
 }
 
 bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 {
-	mutex_lock( &poolLock );
+	pthread_spin_lock( &poolLock );
 	entry_t *entry = pool;
-	if ( entry != NULL ) pool = entry->next;
-	mutex_unlock( &poolLock );
+	if ( entry != NULL ) {
+		pool = entry->next;
+		currentIdleThreads--;
+	}
+	pthread_spin_unlock( &poolLock );
 	if ( entry == NULL ) {
 		entry = (entry_t*)malloc( sizeof(entry_t) );
 		if ( entry == NULL ) {
@@ -90,8 +96,8 @@ static void *threadpool_worker(void *entryPtr)
 		if ( _shutdown ) break;
 		if ( ret > 0 ) {
 			if ( entry->startRoutine == NULL ) {
-				logadd( LOG_DEBUG1, "Worker woke up but has no work to do!" );
-				continue;
+				logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
+				exit( 1 );
 			}
 			// Start assigned work
 			(*entry->startRoutine)( entry->arg );
@@ -100,21 +106,16 @@ static void *threadpool_worker(void *entryPtr)
 			entry->arg = NULL;
 			if ( _shutdown ) break;
 			// Put thread back into pool if there are less than maxIdleThreds threads, just die otherwise
-			int threadCount = 0;
-			mutex_lock( &poolLock );
-			entry_t *ptr = pool;
-			while ( ptr != NULL ) {
-				threadCount++;
-				ptr = ptr->next;
-			}
-			if ( threadCount >= maxIdleThreads ) {
-				mutex_unlock( &poolLock );
+			if ( currentIdleThreads >= maxIdleThreads )
 				break;
-			}
+			// Race condition as we checked before locking, but worst case we have a couple
+			// too many threads idling around. At least the count stays accurate.
+			setThreadName( "[pool]" );
+			pthread_spin_lock( &poolLock );
+			currentIdleThreads++;
 			entry->next = pool;
 			pool = entry;
-			mutex_unlock( &poolLock );
-			setThreadName( "[pool]" );
+			pthread_spin_unlock( &poolLock );
 		} else {
 			logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
 		}
-- 
cgit v1.2.3-55-g7522


From 71c707da4e5405c986399c3f4505fa0a554548ba Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 5 Aug 2019 12:47:15 +0200
Subject: [SERVER] Add sanity check to threadpool_run for NULL routine

---
 src/server/threadpool.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src')

diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index c01ae7a..340a98d 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -50,6 +50,10 @@ void threadpool_close()
 
 bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 {
+	if ( startRoutine == NULL ) {
+		logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
+		return false; // Or bail out!?
+	}
 	pthread_spin_lock( &poolLock );
 	entry_t *entry = pool;
 	if ( entry != NULL ) {
-- 
cgit v1.2.3-55-g7522


From 1a8a31603e56995639eba99492611ab4e7ef64af Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 5 Aug 2019 13:42:19 +0200
Subject: [SERVER] Allow uplink shutdown if bgrMinClients > image->users

---
 src/server/uplink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 682b986..aa5228c 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -1082,6 +1082,7 @@ static bool uplink_saveCacheMap(dnbd3_connection_t *link)
 
 static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link)
 {
-	return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT && _backgroundReplication != BGR_FULL );
+	return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
+			&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > link->image->users ) );
 }
 
-- 
cgit v1.2.3-55-g7522


From 48533240493c0dd970c926bbdb8939bb7d93cd14 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 6 Aug 2019 11:44:27 +0200
Subject: [SERVER] Fix: Client thread could destroy sendMutex while in use

Fix a race condition where the client thread tears down the client
struct including the sendMutex while the uplink thread is currently
holding the lock, trying to send data to the client.
---
 src/server/uplink.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index aa5228c..f58b019 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -153,6 +153,9 @@ void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client)
 	mutex_lock( &uplink->queueLock );
 	for (int i = uplink->queueLen - 1; i >= 0; --i) {
 		if ( uplink->queue[i].client == client ) {
+			// Make sure client doesn't get destroyed while we're sending it data
+			mutex_lock( &client->sendMutex );
+			mutex_unlock( &client->sendMutex );
 			uplink->queue[i].client = NULL;
 			uplink->queue[i].status = ULR_FREE;
 		}
-- 
cgit v1.2.3-55-g7522


From 8e152f715c015cbd1821ae9422d75db02a04573b Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 6 Aug 2019 11:46:05 +0200
Subject: [SERVER] Improve debug output if a locked lock gets destroyed

---
 src/server/locks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src')

diff --git a/src/server/locks.c b/src/server/locks.c
index a5b7c76..2c0cb27 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -256,6 +256,7 @@ int debug_mutex_destroy(const char *name, const char *file, int line, pthread_mu
 		if ( locks[i].lock == lock ) {
 			if ( locks[i].locked ) {
 				logadd( LOG_ERROR, "Tried to destroy lock %p (%s) at %s:%d when it is still locked\n", (void*)lock, name, file, line );
+				logadd( LOG_ERROR, "Currently locked by: %s", locks[i].where );
 				exit( 4 );
 			}
 			locks[i].lock = NULL;
-- 
cgit v1.2.3-55-g7522


From 9b1cfe9bc09fad8ed0a111ea7db6ebf21def19be Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 6 Aug 2019 11:46:56 +0200
Subject: [BENCH] Fix a couple bugs in stress tester

---
 src/bench/connection.c | 135 +++++++++++++++++++++++--------------------------
 src/bench/connection.h |   2 +-
 src/bench/main.c       |  15 +++---
 3 files changed, 70 insertions(+), 82 deletions(-)

(limited to 'src')

diff --git a/src/bench/connection.c b/src/bench/connection.c
index 129ae3c..2e40019 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -18,23 +18,10 @@ static const size_t SHORTBUF = 100;
 #define SOCKET_KEEPALIVE_TIMEOUT (3)
 #define MAX_ALTS (8)
 #define MAX_HOSTS_PER_ADDRESS (2)
-// If a server wasn't reachable this many times, we slowly start skipping it on measurements
-static const int FAIL_BACKOFF_START_COUNT = 8;
 #define RTT_COUNT (4)
 
 /* Module variables */
-
-// Init guard
-static bool connectionInitDone = false;
-static bool keepRunning = true;
-
-static struct {
-	int sockFd;
-	pthread_mutex_t sendMutex;
-	dnbd3_signal_t* panicSignal;
-	dnbd3_host_t currentServer;
-	uint64_t startupTime;
-} connection;
+static char trash[4096];
 
 // Known alt servers
 typedef struct _alt_server {
@@ -54,13 +41,13 @@ bool connection_init_n_times(
 		const char *lowerImage,
 		const uint16_t rid,
 		int ntimes,
-		BenchCounters* counters,
-		bool closeSockets
+		BenchCounters* counters
 		) {
 	for (int run_i = 0; run_i < ntimes; ++run_i) {
 		counters->attempts++;
 
-		printf(".");
+		putchar('.');
+		fflush(stdout);
 		int sock = -1;
 		char host[SHORTBUF];
 		serialized_buffer_t buffer;
@@ -68,66 +55,70 @@ bool connection_init_n_times(
 		char *remoteName;
 		uint64_t remoteSize;
 
-		if ( !connectionInitDone && keepRunning ) {
-			dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
-			const char *current, *end;
-			int altIndex = 0;
-			memset( altservers, 0, sizeof altservers );
-			connection.sockFd = -1;
-			current = hosts;
-			do {
-				// Get next host from string
-				while ( *current == ' ' ) current++;
-				end = strchr( current, ' ' );
-				size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
-				if ( len > SHORTBUF ) len = SHORTBUF;
-				snprintf( host, len, "%s", current );
-				int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
-				for ( int i = 0; i < newHosts; ++i ) {
-					if ( altIndex >= MAX_ALTS )
-						break;
-					altservers[altIndex].host = tempHosts[i];
-					altIndex += 1;
-				}
-				current = end + 1;
-			} while ( end != NULL && altIndex < MAX_ALTS );
-			logadd( LOG_INFO, "Got %d servers from init call", altIndex );
-			// Connect
-			for ( int i = 0; i < altIndex; ++i ) {
-				if ( altservers[i].host.type == 0 )
-					continue;
-				// Try to connect
-				sock = sock_connect( &altservers[i].host, 500, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
-				if ( sock == -1 ) {
-					counters->fails++;
-					logadd( LOG_ERROR, "Could not connect to host" );
-				} else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
-					counters->fails++;
-					logadd( LOG_ERROR, "Could not send select image" );
-				} else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
-					counters->fails++;
-					logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
-				} else if ( rid != 0 && rid != remoteRid ) {
-					counters->fails++;
-					logadd( LOG_ERROR, "rid mismatch" );
-				} else {
-					counters->success++;
+		dnbd3_host_t tempHosts[MAX_HOSTS_PER_ADDRESS];
+		const char *current, *end;
+		int altIndex = 0;
+		memset( altservers, 0, sizeof altservers );
+		current = hosts;
+		do {
+			// Get next host from string
+			while ( *current == ' ' ) current++;
+			end = strchr( current, ' ' );
+			size_t len = (end == NULL ? SHORTBUF : (size_t)( end - current ) + 1);
+			if ( len > SHORTBUF ) len = SHORTBUF;
+			snprintf( host, len, "%s", current );
+			int newHosts = sock_resolveToDnbd3Host( host, tempHosts, MAX_HOSTS_PER_ADDRESS );
+			for ( int i = 0; i < newHosts; ++i ) {
+				if ( altIndex >= MAX_ALTS )
 					break;
-				}
-				// Failed
-				logadd( LOG_DEBUG1, "Server does not offer requested image... " );
-				if ( sock != -1 ) {
-					close( sock );
-					sock = -1;
-				}
+				altservers[altIndex].host = tempHosts[i];
+				altIndex += 1;
 			}
+			current = end + 1;
+		} while ( end != NULL && altIndex < MAX_ALTS );
+		// Connect
+		for ( int i = 0; i < altIndex; ++i ) {
+			if ( altservers[i].host.type == 0 )
+				continue;
+			// Try to connect
+			dnbd3_reply_t reply;
+			sock = sock_connect( &altservers[i].host, 500, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
+			if ( sock == -1 ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "Could not connect to host" );
+			} else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "Could not send select image" );
+			} else if ( !dnbd3_select_image_reply( &buffer, sock, &remoteVersion, &remoteName, &remoteRid, &remoteSize ) ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "Could not read select image reply (%d)", errno );
+			} else if ( rid != 0 && rid != remoteRid ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "rid mismatch" );
+			} else if ( !dnbd3_get_block( sock, run_i * 4096, 4096, 0, 0 ) ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "send: get block failed" );
+			} else if ( !dnbd3_get_reply( sock, &reply ) ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "recv: get block header failed" );
+			} else if ( recv( sock, trash, sizeof(trash), 0 ) != sizeof(trash) ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "recv: get block payload failed" );
+			} else {
+				counters->success++;
+				close( sock );
+				sock = -1;
+				continue;
+			}
+			// Failed
 			if ( sock != -1 ) {
-				// connectionInitDone = true;
-				if (closeSockets) {
-					close( sock );
-				}
+				close( sock );
+				sock = -1;
 			}
 		}
+		if ( sock != -1 ) {
+			close( sock );
+		}
 	}
 	return true;
 }
diff --git a/src/bench/connection.h b/src/bench/connection.h
index 9cb59ef..ff71e15 100644
--- a/src/bench/connection.h
+++ b/src/bench/connection.h
@@ -19,7 +19,7 @@ typedef struct _dnbd3_async {
 } dnbd3_async_t;
 
 
-bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, BenchCounters* counters, bool closeSockets);
+bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, BenchCounters* counters);
 
 bool connection_init(const char *hosts, const char *image, const uint16_t rid);
 
diff --git a/src/bench/main.c b/src/bench/main.c
index 2f32dbf..c86af81 100644
--- a/src/bench/main.c
+++ b/src/bench/main.c
@@ -31,8 +31,6 @@ static void printUsage(char *argv0, int exitCode)
 	printf( "   -n --runs       Number of connection attempts per thread\n" );
 	printf( "   -t --threads    number of threads\n" );
 	printf( "   -l --log        Write log to given location\n" );
-	printf( "   -d --debug      Don't fork and print debug output (fuse > stderr, dnbd3 > stdout)\n" );
-	// // fuse_main( 2, arg, &dnbd3_fuse_no_operations, NULL );
 	exit( exitCode );
 }
 
@@ -41,8 +39,8 @@ static const struct option longOpts[] = {
         { "host", required_argument, NULL, 'h' },
         { "image", required_argument, NULL, 'i' },
         { "nruns", optional_argument, NULL, 'n' },
-        { "threads", optional_argument, NULL, 't' },
-        { "help", optional_argument, NULL, 'H' },
+        { "threads", required_argument, NULL, 't' },
+        { "help", required_argument, NULL, 'H' },
         { "version", no_argument, NULL, 'v' },
         { 0, 0, 0, 0 }
 };
@@ -59,11 +57,10 @@ void* runBenchThread(void* t) {
 	BenchThreadData* data = t;
 	connection_init_n_times(
 			data->server_address,
-			data->server_address,
+			data->image_name,
 			0,
 			data->runs,
-			data->counter,
-			data->closeSockets);
+			data->counter);
 	printf("Thread #%d finished\n", data->threadNumber);
 	return NULL;
 }
@@ -85,10 +82,10 @@ int main(int argc, char *argv[])
 	while ( ( opt = getopt_long( argc, argv, optString, longOpts, &lidx ) ) != -1 ) {
 		switch ( opt ) {
 		case 'h':
-			server_address = optarg;
+			server_address = strdup(optarg);
 			break;
 		case 'i':
-			image_Name = optarg;
+			image_Name = strdup(optarg);
 			break;
 		case 'n':
 			n_runs = atoi(optarg);
-- 
cgit v1.2.3-55-g7522


From 9f5a61cc018831e33161d44ff940f59105b792e3 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 6 Aug 2019 14:04:58 +0200
Subject: [BENCH] Increase timeouts, fix block payload reading

---
 src/bench/connection.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/bench/connection.c b/src/bench/connection.c
index 2e40019..03ad9e5 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -82,10 +82,10 @@ bool connection_init_n_times(
 				continue;
 			// Try to connect
 			dnbd3_reply_t reply;
-			sock = sock_connect( &altservers[i].host, 500, SOCKET_KEEPALIVE_TIMEOUT * 1000 );
+			sock = sock_connect( &altservers[i].host, 3500, 10000 );
 			if ( sock == -1 ) {
 				counters->fails++;
-				logadd( LOG_ERROR, "Could not connect to host" );
+				logadd( LOG_ERROR, "Could not connect to host (errno=%d)", errno );
 			} else if ( !dnbd3_select_image( sock, lowerImage, rid, 0 ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "Could not send select image" );
@@ -101,7 +101,7 @@ bool connection_init_n_times(
 			} else if ( !dnbd3_get_reply( sock, &reply ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "recv: get block header failed" );
-			} else if ( recv( sock, trash, sizeof(trash), 0 ) != sizeof(trash) ) {
+			} else if ( recv( sock, trash, sizeof(trash), MSG_WAITALL|MSG_NOSIGNAL ) != sizeof(trash) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "recv: get block payload failed" );
 			} else {
-- 
cgit v1.2.3-55-g7522


From 5dc776ac73be190daa2b2b8c3eb6042fdab4acda Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 6 Aug 2019 14:06:27 +0200
Subject: [SERVER] uplink: Improve attaching to existing requests

Allow attaching in ULR_PROCESSING state; leave lower slots empty
to increase the chances of attaching to ULR_PROCESSING.
---
 src/server/globals.h | 12 -------
 src/server/uplink.c  | 97 +++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 74 insertions(+), 35 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 7e5ff04..cd5ad7e 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -17,18 +17,6 @@ typedef struct _dnbd3_connection dnbd3_connection_t;
 typedef struct _dnbd3_image dnbd3_image_t;
 typedef struct _dnbd3_client dnbd3_client_t;
 
-// Slot is free, can be used.
-// Must only be set in uplink_handle_receive() or uplink_remove_client()
-#define ULR_FREE 0
-// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
-// Must only be set in uplink_request()
-#define ULR_NEW 1
-// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
-// Must only be set in uplink_mainloop() or uplink_request()
-#define ULR_PENDING 2
-// Slot is being processed, do not consider for hop on.
-// Must only be set in uplink_handle_receive()
-#define ULR_PROCESSING 3
 typedef struct
 {
 	uint64_t handle;  // Client defined handle to pass back in reply
diff --git a/src/server/uplink.c b/src/server/uplink.c
index f58b019..9f99fe4 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -21,6 +21,28 @@
 
 #define REP_NONE ( (uint64_t)0xffffffffffffffff )
 
+// Status of request in queue
+
+// Slot is free, can be used.
+// Must only be set in uplink_handle_receive() or uplink_remove_client()
+#define ULR_FREE 0
+// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
+// Must only be set in uplink_request()
+#define ULR_NEW 1
+// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
+// Must only be set in uplink_mainloop() or uplink_request()
+#define ULR_PENDING 2
+// Slot is being processed, do not consider for hop on.
+// Must only be set in uplink_handle_receive()
+#define ULR_PROCESSING 3
+
+static const char *const NAMES_ULR[4] = {
+	[ULR_FREE] = "ULR_FREE",
+	[ULR_NEW] = "ULR_NEW",
+	[ULR_PENDING] = "ULR_PENDING",
+	[ULR_PROCESSING] = "ULR_PROCESSING",
+};
+
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
 static void* uplink_mainloop(void *data);
@@ -203,30 +225,37 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	int existingType = -1; // ULR_* type of existing request
 	int i;
 	int freeSlot = -1;
+	int firstUsedSlot = -1;
 	bool requestLoop = false;
 	const uint64_t end = start + length;
 
 	mutex_lock( &uplink->queueLock );
 	mutex_unlock( &client->image->lock );
 	for (i = 0; i < uplink->queueLen; ++i) {
-		if ( freeSlot == -1 && uplink->queue[i].status == ULR_FREE ) {
-			freeSlot = i;
+		// find free slot to place this request into
+		if ( uplink->queue[i].status == ULR_FREE ) {
+			if ( freeSlot == -1 || existingType != ULR_PROCESSING ) {
+				freeSlot = i;
+			}
 			continue;
 		}
-		if ( uplink->queue[i].status != ULR_PENDING && uplink->queue[i].status != ULR_NEW ) continue;
-		if ( uplink->queue[i].from <= start && uplink->queue[i].to >= end ) {
-			if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end ) {
-				requestLoop = true;
-				break;
-			}
-			if ( foundExisting == -1 || existingType == ULR_PENDING ) {
-				foundExisting = i;
-				existingType = uplink->queue[i].status;
-				if ( freeSlot != -1 ) break;
-			}
+		if ( firstUsedSlot == -1 ) {
+			firstUsedSlot = i;
+		}
+		// find existing request to attach to
+		if ( uplink->queue[i].from > start || uplink->queue[i].to < end )
+			continue; // Range not suitable
+		// Detect potential proxy cycle. New request hopcount is greater, range is same, old request has already been sent -> suspicious
+		if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) {
+			requestLoop = true;
+			break;
+		}
+		if ( foundExisting == -1 || existingType == ULR_PROCESSING ) {
+			foundExisting = i;
+			existingType = uplink->queue[i].status;
 		}
 	}
-	if ( requestLoop ) {
+	if ( unlikely( requestLoop ) ) {
 		mutex_unlock( &uplink->queueLock );
 		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
 		mutex_lock( &uplink->rttLock );
@@ -235,6 +264,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		signal_call( uplink->signal );
 		return false;
 	}
+	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
+		freeSlot = -1; // Not attaching to existing request, make it use a higher slot
+	}
 	if ( freeSlot == -1 ) {
 		if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
 			mutex_unlock( &uplink->queueLock );
@@ -244,15 +276,17 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		freeSlot = uplink->queueLen++;
 	}
 	// Do not send request to uplink server if we have a matching pending request AND the request either has the
-	// status ULR_NEW OR we found a free slot with LOWER index than the one we attach to. Otherwise
+	// status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
 	// explicitly send this request to the uplink server. The second condition mentioned here is to prevent
 	// a race condition where the reply for the outstanding request already arrived and the uplink thread
 	// is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
 	// already have passed the index of the free slot we determined, but not reached the existing request we just found above.
-	if ( foundExisting != -1 && existingType != ULR_NEW && freeSlot > foundExisting ) foundExisting = -1; // -1 means "send request"
+	if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) {
+		foundExisting = -1; // -1 means "send request"
+	}
 #ifdef _DEBUG
 	if ( foundExisting != -1 ) {
-		logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, existingType == ULR_NEW ? "ULR_NEW" : "ULR_PENDING", foundExisting, freeSlot );
+		logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot );
 		logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
 				"New      %" PRIu64 "-%" PRIu64 " (%p)\n",
 				uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
@@ -265,7 +299,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	uplink->queue[freeSlot].handle = handle;
 	uplink->queue[freeSlot].client = client;
 	//int old = uplink->queue[freeSlot].status;
-	uplink->queue[freeSlot].status = (foundExisting == -1 ? ULR_NEW : ULR_PENDING);
+	uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW :
+			( existingType == ULR_NEW ? ULR_PENDING : existingType ) );
 	uplink->queue[freeSlot].hopCount = hops;
 #ifdef _DEBUG
 	timing_get( &uplink->queue[freeSlot].entered );
@@ -292,14 +327,25 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			if ( !ret ) {
 				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
 			} else {
+				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
+				int state;
 				mutex_lock( &uplink->queueLock );
-				if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client && uplink->queue[freeSlot].status == ULR_NEW ) {
-					uplink->queue[freeSlot].status = ULR_PENDING;
-					logadd( LOG_DEBUG2, "Succesful direct uplink request" );
+				if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
+					state = uplink->queue[freeSlot].status;
+					if ( uplink->queue[freeSlot].status == ULR_NEW ) {
+						uplink->queue[freeSlot].status = ULR_PENDING;
+					}
 				} else {
-					logadd( LOG_DEBUG2, "Weird queue update fail for direct uplink request" );
+					state = -1;
 				}
 				mutex_unlock( &uplink->queueLock );
+				if ( state == -1 ) {
+					logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
+				} else if ( state == ULR_NEW ) {
+					logadd( LOG_DEBUG2, "Succesful direct uplink request" );
+				} else {
+					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
+				}
 				return true;
 			}
 			// Fall through to waking up sender thread
@@ -837,6 +883,11 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 			}
 		}
 		// 2) Figure out which clients are interested in it
+		// Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop
+		// below; this prevents uplink_request() from attaching to this request
+		// by populating a slot with index greater than the highest matching
+		// request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW
+		// where it's fine if the index is greater)
 		mutex_lock( &link->queueLock );
 		for (i = 0; i < link->queueLen; ++i) {
 			dnbd3_queued_request_t * const req = &link->queue[i];
@@ -877,10 +928,10 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 						bytesSent = (size_t)sent - sizeof outReply;
 					}
 				}
-				mutex_unlock( &client->sendMutex );
 				if ( bytesSent != 0 ) {
 					client->bytesSent += bytesSent;
 				}
+				mutex_unlock( &client->sendMutex );
 				mutex_lock( &link->queueLock );
 			}
 			if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--;
-- 
cgit v1.2.3-55-g7522


From 0363303d1b67b47605971b313bc33a049e6a3209 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 14:28:53 +0200
Subject: [SERVER] Fix race condition and invalid lock order

---
 src/server/net.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/net.c b/src/server/net.c
index 9abe221..c1fa6fa 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -609,6 +609,12 @@ void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
 		}
 		bs += client->bytesSent;
 	}
+	// Do this before unlocking the list, otherwise we might
+	// account for a client twice if it would disconnect after
+	// unlocking but before we add the count here.
+	if ( bytesSent != NULL ) {
+		*bytesSent = totalBytesSent + bs;
+	}
 	mutex_unlock( &_clients_lock );
 	if ( clientCount != NULL ) {
 		*clientCount = cc;
@@ -616,9 +622,6 @@ void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
 	if ( serverCount != NULL ) {
 		*serverCount = sc;
 	}
-	if ( bytesSent != NULL ) {
-		*bytesSent = totalBytesSent + bs;
-	}
 }
 
 void net_disconnectAll()
@@ -694,9 +697,9 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
 		mutex_lock( &client->image->lock );
 		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
 		mutex_unlock( &client->image->lock );
-		client->image = image_release( client->image );
 	}
 	mutex_unlock( &client->lock );
+	client->image = image_release( client->image );
 	mutex_destroy( &client->lock );
 	mutex_destroy( &client->sendMutex );
 	free( client );
-- 
cgit v1.2.3-55-g7522


From 4e2e258dba3c9268e8d4fd061cbb9f291017ed2f Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 14:39:44 +0200
Subject: [SERVER] Use more _Atomic

---
 src/server/globals.h | 6 +++---
 src/server/net.c     | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index cd5ad7e..86b8865 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -60,7 +60,7 @@ struct _dnbd3_connection
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
 	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
-	int queueLen;               // length of queue
+	atomic_int queueLen;        // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
 };
@@ -107,7 +107,7 @@ struct _dnbd3_image
 	int completenessEstimate; // Completeness estimate in percent
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
-	bool working;          // true if image exists and completeness is == 100% or a working upstream proxy is connected
+	atomic_bool working;   // true if image exists and completeness is == 100% or a working upstream proxy is connected
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
 };
@@ -116,7 +116,7 @@ struct _dnbd3_client
 {
 #define HOSTNAMELEN (48)
 	atomic_uint_fast64_t bytesSent;   // Byte counter for this client.
-	dnbd3_image_t *image;             // Image in use by this client, or NULL during handshake
+	dnbd3_image_t * _Atomic image;    // Image in use by this client, or NULL during handshake
 	int sock;
 	bool isServer;                    // true if a server in proxy mode, false if real client
 	dnbd3_host_t host;
diff --git a/src/server/net.c b/src/server/net.c
index c1fa6fa..92728c0 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -255,9 +255,8 @@ void* net_handleNewConnection(void *clientPtr)
 				// No BGR mismatch, but don't lookup if image is unknown locally
 				image = image_get( image_name, rid, true );
 			}
-			mutex_lock( &client->lock );
 			client->image = image;
-			mutex_unlock( &client->lock );
+			atomic_thread_fence( memory_order_release );
 			if ( image == NULL ) {
 				//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
 			} else if ( !image->working ) {
-- 
cgit v1.2.3-55-g7522


From bd0a4d66acaf8ebf6388f6304a90b39434e9e36a Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 14:48:55 +0200
Subject: [BENCH] Allow specifying request block size

---
 src/bench/connection.c | 28 ++++++++++++++++++++--------
 src/bench/connection.h |  2 +-
 src/bench/helper.h     |  1 +
 src/bench/main.c       | 20 ++++++++++----------
 4 files changed, 32 insertions(+), 19 deletions(-)

(limited to 'src')

diff --git a/src/bench/connection.c b/src/bench/connection.c
index 03ad9e5..ce9438a 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -41,6 +41,7 @@ bool connection_init_n_times(
 		const char *lowerImage,
 		const uint16_t rid,
 		int ntimes,
+		int blockSize,
 		BenchCounters* counters
 		) {
 	for (int run_i = 0; run_i < ntimes; ++run_i) {
@@ -95,20 +96,31 @@ bool connection_init_n_times(
 			} else if ( rid != 0 && rid != remoteRid ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "rid mismatch" );
-			} else if ( !dnbd3_get_block( sock, run_i * 4096, 4096, 0, 0 ) ) {
+			} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "send: get block failed" );
 			} else if ( !dnbd3_get_reply( sock, &reply ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "recv: get block header failed" );
-			} else if ( recv( sock, trash, sizeof(trash), MSG_WAITALL|MSG_NOSIGNAL ) != sizeof(trash) ) {
-				counters->fails++;
-				logadd( LOG_ERROR, "recv: get block payload failed" );
 			} else {
-				counters->success++;
-				close( sock );
-				sock = -1;
-				continue;
+				int rv, togo = blockSize;
+				do {
+					rv = recv( sock, trash, MIN( sizeof(trash), togo ), MSG_WAITALL|MSG_NOSIGNAL );
+					if ( rv == -1 && errno == EINTR )
+						continue;
+					if ( rv <= 0 )
+						break;
+					togo -= rv;
+				} while ( togo > 0 );
+				if ( togo != 0 ) {
+					counters->fails++;
+					logadd( LOG_ERROR, "recv: get block payload failed (remaining %d)", togo );
+				} else {
+					counters->success++;
+					close( sock );
+					sock = -1;
+					continue;
+				}
 			}
 			// Failed
 			if ( sock != -1 ) {
diff --git a/src/bench/connection.h b/src/bench/connection.h
index ff71e15..69207ff 100644
--- a/src/bench/connection.h
+++ b/src/bench/connection.h
@@ -19,7 +19,7 @@ typedef struct _dnbd3_async {
 } dnbd3_async_t;
 
 
-bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, BenchCounters* counters);
+bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, int blockSize, BenchCounters* counters);
 
 bool connection_init(const char *hosts, const char *image, const uint16_t rid);
 
diff --git a/src/bench/helper.h b/src/bench/helper.h
index 8342a79..e0c0262 100644
--- a/src/bench/helper.h
+++ b/src/bench/helper.h
@@ -29,6 +29,7 @@ typedef struct BenchThreadData {
 	char* server_address;
 	char * image_name;
 	int runs;
+	int bs;
 	int threadNumber;
 	bool closeSockets;
 } BenchThreadData;
diff --git a/src/bench/main.c b/src/bench/main.c
index c86af81..f8c55c3 100644
--- a/src/bench/main.c
+++ b/src/bench/main.c
@@ -17,10 +17,6 @@
 #define debugf(...) do { logadd( LOG_DEBUG1, __VA_ARGS__ ); } while (0)
 
 
-/* Debug/Benchmark variables */
-static bool useDebug = false;
-
-
 static void printUsage(char *argv0, int exitCode)
 {
 	printf( "Usage: %s [--debug] --host <serverAddress(es)> --image <imageName> [--rid revision]\n", argv0 );
@@ -30,17 +26,18 @@ static void printUsage(char *argv0, int exitCode)
 	printf( "   -r --rid        Revision to use (omit or pass 0 for latest)\n" );
 	printf( "   -n --runs       Number of connection attempts per thread\n" );
 	printf( "   -t --threads    number of threads\n" );
-	printf( "   -l --log        Write log to given location\n" );
+	printf( "   -b --blocksize  Size of blocks to request (def. 4096)\n" );
 	exit( exitCode );
 }
 
-static const char *optString = "h:i:n:t:HvVd";
+static const char *optString = "b:h:i:n:t:Hv";
 static const struct option longOpts[] = {
         { "host", required_argument, NULL, 'h' },
         { "image", required_argument, NULL, 'i' },
         { "nruns", optional_argument, NULL, 'n' },
         { "threads", required_argument, NULL, 't' },
-        { "help", required_argument, NULL, 'H' },
+        { "blocksize", required_argument, NULL, 'b' },
+        { "help", no_argument, NULL, 'H' },
         { "version", no_argument, NULL, 'v' },
         { 0, 0, 0, 0 }
 };
@@ -60,6 +57,7 @@ void* runBenchThread(void* t) {
 			data->image_name,
 			0,
 			data->runs,
+			data->bs,
 			data->counter);
 	printf("Thread #%d finished\n", data->threadNumber);
 	return NULL;
@@ -74,6 +72,7 @@ int main(int argc, char *argv[])
 	bool closeSockets = false;
 	int n_runs = 100;
 	int n_threads = 1;
+	int bs = 4096;
 
 	if ( argc <= 1 || strcmp( argv[1], "--help" ) == 0 || strcmp( argv[1], "--usage" ) == 0 ) {
 		printUsage( argv[0], 0 );
@@ -93,15 +92,15 @@ int main(int argc, char *argv[])
 		case 't':
 			n_threads = atoi(optarg);
 			break;
+		case 'b':
+			bs = atoi(optarg);
+			break;
 		case 'c':
 			closeSockets = true;
 			break;
 		case 'H':
 			printUsage( argv[0], 0 );
 			break;
-		case 'd':
-			useDebug = true;
-			break;
 		default:
 			printUsage( argv[0], EXIT_FAILURE );
 		}
@@ -123,6 +122,7 @@ int main(int argc, char *argv[])
 			server_address,
 			image_Name,
 			n_runs,
+			bs,
 			i,
 			closeSockets};
 		threadData[i] = tmp2;
-- 
cgit v1.2.3-55-g7522


From 121dd5eceb64be43d188670bff5bce265d57d199 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 16:31:05 +0200
Subject: [SERVER] Lock-free queue for altservers check thread

---
 src/server/altservers.c | 97 +++++++++++++++++++++++++++----------------------
 src/server/uplink.c     |  8 ++--
 2 files changed, 57 insertions(+), 48 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index bbbc584..a270bf3 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -14,10 +14,8 @@
 #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
 #define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
 
-static dnbd3_connection_t *pending[SERVER_MAX_PENDING_ALT_CHECKS];
-static pthread_mutex_t pendingLockWrite; // Lock for adding something to pending. (NULL -> nonNULL)
-static pthread_mutex_t pendingLockConsume; // Lock for removing something (nonNULL -> NULL)
-static dnbd3_signal_t* runSignal = NULL;
+static dnbd3_connection_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS];
+static dnbd3_signal_t * _Atomic runSignal = NULL;
 
 static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
 static int numAltServers = 0;
@@ -32,8 +30,6 @@ void altservers_init()
 {
 	srand( (unsigned int)time( NULL ) );
 	// Init spinlock
-	mutex_init( &pendingLockWrite );
-	mutex_init( &pendingLockConsume );
 	mutex_init( &altServersLock );
 	// Init signal
 	runSignal = signal_new();
@@ -48,12 +44,9 @@ void altservers_init()
 	}
 	// Init waiting links queue -- this is currently a global static array so
 	// it will already be zero, but in case we refactor later do it explicitly
-	// while also holding the write lock so thread sanitizer is happy
-	mutex_lock( &pendingLockWrite );
 	for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
 		pending[i] = NULL;
 	}
-	mutex_unlock( &pendingLockWrite );
 }
 
 void altservers_shutdown()
@@ -130,52 +123,77 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
  */
 void altservers_findUplink(dnbd3_connection_t *uplink)
 {
+	if ( uplink->shutdown )
+		return;
 	int i;
 	// if betterFd != -1 it means the uplink is supposed to switch to another
 	// server. As this function here is called by the uplink thread, it can
 	// never be that the uplink is supposed to switch, but instead calls
 	// this function.
 	assert( uplink->betterFd == -1 );
-	mutex_lock( &pendingLockWrite );
 	// it is however possible that an RTT measurement is currently in progress,
 	// so check for that case and do nothing if one is in progress
+	// XXX As this function is only ever called by the image's uplink thread,
+	// it cannot happen that the uplink ends up in this list concurrently
+	mutex_lock( &uplink->rttLock );
 	if ( uplink->rttTestResult == RTT_INPROGRESS ) {
 		for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
 			if ( pending[i] != uplink ) continue;
 			// Yep, measuring right now
-			mutex_unlock( &pendingLockWrite );
 			return;
 		}
 	}
 	// Find free slot for measurement
+	uplink->rttTestResult = RTT_INPROGRESS;
 	for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
 		if ( pending[i] != NULL ) continue;
-		pending[i] = uplink;
-		uplink->rttTestResult = RTT_INPROGRESS;
-		mutex_unlock( &pendingLockWrite );
-		signal_call( runSignal ); // Wake altservers thread up
-		return;
+		dnbd3_connection_t *null = NULL;
+		if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) {
+			mutex_unlock( &uplink->rttLock );
+			atomic_thread_fence( memory_order_release );
+			signal_call( runSignal ); // Wake altservers thread up
+			return;
+		}
 	}
 	// End of loop - no free slot
-	mutex_unlock( &pendingLockWrite );
+	uplink->rttTestResult = RTT_NOT_REACHABLE;
+	mutex_unlock( &uplink->rttLock );
 	logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." );
 }
 
 /**
- * The given uplink is about to disappear, so remove it from any queues
+ * The given uplink is about to disappear,
+ * wait until any pending RTT check is done.
  */
 void altservers_removeUplink(dnbd3_connection_t *uplink)
 {
-	mutex_lock( &pendingLockConsume );
-	mutex_lock( &pendingLockWrite );
-	for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-		if ( pending[i] == uplink ) {
+	assert( uplink != NULL );
+	assert( uplink->shutdown );
+	int i;
+	for ( i = 1 ;; ++i ) {
+		atomic_thread_fence( memory_order_acquire );
+		if ( runSignal == NULL ) {
+			// Thread is already done, remove manually
 			uplink->rttTestResult = RTT_NOT_REACHABLE;
-			pending[i] = NULL;
+			break;
+		}
+		// Thread still running, wait until test is done
+		bool found = false;
+		for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
+			if ( pending[i] == uplink ) {
+				found = true;
+				break;
+			}
+		}
+		if ( !found ) // No more test running
+			break;
+		usleep( 10000 ); // 10ms
+		signal_call( runSignal ); // Wake altservers thread up
+		if ( i % 500 == 0 ) {
+			logadd( LOG_INFO, "Still waiting for altserver check for uplink %p...", (void*)uplink );
 		}
 	}
-	mutex_unlock( &pendingLockWrite );
-	mutex_unlock( &pendingLockConsume );
+	logadd( LOG_DEBUG1, "Waited for %d iterations for altservers check when tearing down uplink", i );
 }
 
 /**
@@ -432,28 +450,18 @@ static void *altservers_main(void *data UNUSED)
 			usleep( 100000 );
 		}
 		// Work your way through the queue
+		atomic_thread_fence( memory_order_acquire );
 		for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
-			mutex_lock( &pendingLockWrite );
-			if ( pending[itLink] == NULL ) {
-				mutex_unlock( &pendingLockWrite );
-				continue; // Check once before locking, as a mutex is expensive
-			}
-			mutex_unlock( &pendingLockWrite );
-			mutex_lock( &pendingLockConsume );
-			mutex_lock( &pendingLockWrite );
 			dnbd3_connection_t * const uplink = pending[itLink];
-			mutex_unlock( &pendingLockWrite );
-			if ( uplink == NULL ) { // Check again after locking
-				mutex_unlock( &pendingLockConsume );
+			if ( uplink == NULL )
 				continue;
-			}
 			dnbd3_image_t * const image = image_lock( uplink->image );
 			if ( image == NULL ) { // Check again after locking
+				mutex_lock( &uplink->rttLock );
 				uplink->rttTestResult = RTT_NOT_REACHABLE;
-				mutex_lock( &pendingLockWrite );
+				assert( pending[itLink] == uplink );
 				pending[itLink] = NULL;
-				mutex_unlock( &pendingLockWrite );
-				mutex_unlock( &pendingLockConsume );
+				mutex_unlock( &uplink->rttLock );
 				logadd( LOG_DEBUG1, "Image has gone away that was queued for RTT measurement" );
 				continue;
 			}
@@ -592,10 +600,9 @@ static void *altservers_main(void *data UNUSED)
 			}
 			image_release( image );
 			// end of loop over all pending uplinks
-			mutex_lock( &pendingLockWrite );
+			assert( pending[itLink] == uplink );
 			pending[itLink] = NULL;
-			mutex_unlock( &pendingLockWrite );
-			mutex_unlock( &pendingLockConsume );
+			atomic_thread_fence( memory_order_release );
 		}
 		// Save cache maps of all images if applicable
 		declare_now;
@@ -606,7 +613,9 @@ static void *altservers_main(void *data UNUSED)
 		}
 	}
 	cleanup: ;
-	if ( runSignal != NULL ) signal_close( runSignal );
+	if ( runSignal != NULL ) {
+		signal_close( runSignal );
+	}
 	runSignal = NULL;
 	return NULL ;
 }
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 9f99fe4..bb1ffdc 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -583,6 +583,10 @@ static void* uplink_mainloop(void *data)
 #endif
 	}
 	cleanup: ;
+	if ( !link->shutdown ) {
+		link->shutdown = true;
+		thread_detach( link->thread );
+	}
 	altservers_removeUplink( link );
 	uplink_saveCacheMap( link );
 	mutex_lock( &link->image->lock );
@@ -596,10 +600,6 @@ static void* uplink_mainloop(void *data)
 	link->fd = -1;
 	mutex_unlock( &link->sendMutex );
 	link->signal = NULL;
-	if ( !link->shutdown ) {
-		link->shutdown = true;
-		thread_detach( link->thread );
-	}
 	// Do not access link->image after unlocking, since we set
 	// image->uplink to NULL. Acquire with image_lock first,
 	// like done below when checking whether to re-init uplink
-- 
cgit v1.2.3-55-g7522


From be7d7d95850c30a154aaa56e95d6a7f36793409d Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 17:11:51 +0200
Subject: [SERVER] Better lock debugging: Always check lock order

Lock order is predefined in locks.h. Immediately bail out if a lock with
lower priority is obtained while the same thread already holds one with
higher priority.
---
 LOCKS                   |  13 +-
 src/server/altservers.c |   9 +-
 src/server/globals.c    |   2 +-
 src/server/image.c      |  10 +-
 src/server/integrity.c  |   2 +-
 src/server/locks.c      | 319 ++++++++++++++++++++++--------------------------
 src/server/locks.h      |  36 ++++--
 src/server/net.c        |   6 +-
 src/server/rpc.c        |  14 +--
 src/server/server.c     |   7 --
 src/server/uplink.c     |   6 +-
 11 files changed, 198 insertions(+), 226 deletions(-)

(limited to 'src')

diff --git a/LOCKS b/LOCKS
index 4b5b07c..77e44a8 100644
--- a/LOCKS
+++ b/LOCKS
@@ -16,23 +16,22 @@ requests.lock
 
 ===== SERVER =====
 This is a list of used locks, in the order they
-have to be aquired if you must hold multiple locks:
-remoteCloneLock | reloadLock
+have to be acquired if you must hold multiple locks.
+Note this list might be out of date; take a look at the
+defines in locks.h for the effective order.
+reloadLock
+remoteCloneLock
 _clients_lock
 _clients[].lock
 integrityQueueLock
 _images_lock
 _images[].lock
-pendingLockConsume
-pendingLockProduce
 uplink.queueLock
 altServersLock
 client.sendMutex
-client.statsLock
-statisticsSentLock
-statisticsReceivedLock
 uplink.rttLock
 uplink.sendMutex
+aclLock
 
 If you need to lock multiple clients/images/... at once,
 lock the client with the lowest array index first.
diff --git a/src/server/altservers.c b/src/server/altservers.c
index a270bf3..3d5e71e 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -30,7 +30,7 @@ void altservers_init()
 {
 	srand( (unsigned int)time( NULL ) );
 	// Init spinlock
-	mutex_init( &altServersLock );
+	mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST );
 	// Init signal
 	runSignal = signal_new();
 	if ( runSignal == NULL ) {
@@ -326,13 +326,13 @@ json_t* altservers_toJson()
 }
 
 /**
- * Update rtt history of given server - returns the new average for that server
+ * Update rtt history of given server - returns the new average for that server.
+ * XXX HOLD altServersLock WHEN CALLING THIS!
  */
 static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt)
 {
 	unsigned int avg = rtt;
 	int i;
-	mutex_lock( &altServersLock );
 	for (i = 0; i < numAltServers; ++i) {
 		if ( !isSameAddressPort( host, &altServers[i].host ) ) continue;
 		altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt;
@@ -353,7 +353,6 @@ static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const
 		}
 		break;
 	}
-	mutex_unlock( &altServersLock );
 	return avg;
 }
 
@@ -529,6 +528,7 @@ static void *altservers_main(void *data UNUSED)
 				}
 				clock_gettime( BEST_CLOCK_SOURCE, &end );
 				// Measurement done - everything fine so far
+				mutex_lock( &altServersLock );
 				mutex_lock( &uplink->rttLock );
 				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
 				// Penaltize rtt if this was a cycle; this will treat this server with lower priority
@@ -538,6 +538,7 @@ static void *altservers_main(void *data UNUSED)
 						+ (end.tv_nsec - start.tv_nsec) / 1000
 						+ ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
 				unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
+				mutex_unlock( &altServersLock );
 				// If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
 				if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
 				mutex_unlock( &uplink->rttLock );
diff --git a/src/server/globals.c b/src/server/globals.c
index 69e8a6e..46c1030 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -112,7 +112,7 @@ void globals_loadConfig()
 	asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME );
 	if ( name == NULL ) return;
 	if ( initialLoad ) {
-		mutex_init( &loadLock );
+		mutex_init( &loadLock, LOCK_LOAD_CONFIG );
 	}
 	if ( mutex_trylock( &loadLock ) != 0 ) {
 		logadd( LOG_INFO, "Ignoring config reload request due to already running reload" );
diff --git a/src/server/image.c b/src/server/image.c
index 1f12eda..4a65ed3 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -59,9 +59,9 @@ static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t
 void image_serverStartup()
 {
 	srand( (unsigned int)time( NULL ) );
-	mutex_init( &imageListLock );
-	mutex_init( &remoteCloneLock );
-	mutex_init( &reloadLock );
+	mutex_init( &imageListLock, LOCK_IMAGE_LIST );
+	mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
+	mutex_init( &reloadLock, LOCK_RELOAD );
 }
 
 /**
@@ -347,7 +347,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		img->rid = candidate->rid;
 		img->users = 1;
 		img->working = false;
-		mutex_init( &img->lock );
+		mutex_init( &img->lock, LOCK_IMAGE );
 		if ( candidate->crc32 != NULL ) {
 			const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t);
 			img->crc32 = malloc( mb );
@@ -869,7 +869,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->working = (image->cache_map == NULL );
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
-	mutex_init( &image->lock );
+	mutex_init( &image->lock, LOCK_IMAGE );
 	int32_t offset;
 	if ( stat( path, &st ) == 0 ) {
 		// Negatively offset atime by file modification time
diff --git a/src/server/integrity.c b/src/server/integrity.c
index a66a364..c52d17b 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -39,7 +39,7 @@ static void* integrity_main(void *data);
 void integrity_init()
 {
 	assert( queueLen == -1 );
-	mutex_init( &integrityQueueLock );
+	mutex_init( &integrityQueueLock, LOCK_INTEGRITY_QUEUE );
 	pthread_cond_init( &queueSignal, NULL );
 	mutex_lock( &integrityQueueLock );
 	queueLen = 0;
diff --git a/src/server/locks.c b/src/server/locks.c
index 2c0cb27..b39576b 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -12,47 +12,45 @@
 #ifdef _DEBUG
 #define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
 #define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
+#define MAXLPT 20
 #define LOCKLEN 60
 typedef struct
 {
-	void *lock;
+	void * _Atomic lock;
 	ticks locktime;
-	char locked;
-	pthread_t thread;
+	bool _Atomic locked;
+	pthread_t _Atomic thread;
 	int lockId;
+	int prio;
 	char name[LOCKLEN];
 	char where[LOCKLEN];
 } debug_lock_t;
 
 typedef struct
 {
-	pthread_t tid;
+	pthread_t _Atomic tid;
 	ticks time;
 	char name[LOCKLEN];
 	char where[LOCKLEN];
-
+	debug_lock_t *locks[MAXLPT];
 } debug_thread_t;
 
 int debugThreadCount = 0;
 
 static debug_lock_t locks[MAXLOCKS];
 static debug_thread_t threads[MAXTHREADS];
-static int init_done = 0;
-static pthread_mutex_t initdestory;
+static pthread_mutex_t initdestory = PTHREAD_MUTEX_INITIALIZER;
 static int lockId = 0;
-static pthread_t watchdog = 0;
-static dnbd3_signal_t* watchdogSignal = NULL;
 
-static void *debug_thread_watchdog(void *something);
+#define ULDE(...) do { \
+			pthread_mutex_unlock( &initdestory ); \
+			logadd( LOG_ERROR, __VA_ARGS__ ); \
+			debug_dump_lock_stats(); \
+			exit( 4 ); \
+} while(0)
 
-int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock)
+int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority)
 {
-	if ( !init_done ) {
-		memset( locks, 0, MAXLOCKS * sizeof(debug_lock_t) );
-		memset( threads, 0, MAXTHREADS * sizeof(debug_thread_t) );
-		pthread_mutex_init( &initdestory, NULL );
-		init_done = 1;
-	}
 	int first = -1;
 	pthread_mutex_lock( &initdestory );
 	for (int i = 0; i < MAXLOCKS; ++i) {
@@ -63,20 +61,18 @@ int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex
 		if ( first == -1 && locks[i].lock == NULL ) first = i;
 	}
 	if ( first == -1 ) {
-		logadd( LOG_ERROR, "No more free debug locks (%s:%d)\n", file, line );
-		pthread_mutex_unlock( &initdestory );
-		debug_dump_lock_stats();
-		exit( 4 );
+		ULDE( "No more free debug locks (%s:%d)\n", file, line );
 	}
 	locks[first].lock = (void*)lock;
-	locks[first].locked = 0;
+	locks[first].locked = false;
+	locks[first].prio = priority;
 	snprintf( locks[first].name, LOCKLEN, "%s", name );
 	snprintf( locks[first].where, LOCKLEN, "I %s:%d", file, line );
 	pthread_mutex_unlock( &initdestory );
 	return pthread_mutex_init( lock, NULL );
 }
 
-int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock)
+int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try)
 {
 	debug_lock_t *l = NULL;
 	pthread_mutex_lock( &initdestory );
@@ -86,163 +82,180 @@ int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex
 			break;
 		}
 	}
-	pthread_mutex_unlock( &initdestory );
 	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		debug_dump_lock_stats();
-		exit( 4 );
+		ULDE( "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
 	}
 	debug_thread_t *t = NULL;
-	pthread_mutex_lock( &initdestory );
+	int first = -1;
+	const pthread_t self = pthread_self();
 	for (int i = 0; i < MAXTHREADS; ++i) {
-		if ( threads[i].tid != 0 ) continue;
-		threads[i].tid = pthread_self();
-		timing_get( &threads[i].time );
-		snprintf( threads[i].name, LOCKLEN, "%s", name );
-		snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
-		t = &threads[i];
-		break;
-	}
-	pthread_mutex_unlock( &initdestory );
-	if ( t == NULL ) {
-		logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
-	}
-	const int retval = pthread_mutex_lock( lock );
-	pthread_mutex_lock( &initdestory );
-	t->tid = 0;
-	pthread_mutex_unlock( &initdestory );
-	if ( l->locked ) {
-		logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
-	}
-	l->locked = 1;
-	timing_get( &l->locktime );
-	l->thread = pthread_self();
-	snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
-	pthread_mutex_lock( &initdestory );
-	l->lockId = ++lockId;
-	pthread_mutex_unlock( &initdestory );
-	return retval;
-}
-
-int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock)
-{
-	debug_lock_t *l = NULL;
-	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXLOCKS; ++i) {
-		if ( locks[i].lock == lock ) {
-			l = &locks[i];
+		if ( threads[i].tid == self ) {
+			t = &threads[i];
 			break;
 		}
+		if ( first == -1 && threads[i].tid == 0 ) {
+			first = i;
+		}
 	}
-	pthread_mutex_unlock( &initdestory );
-	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		debug_dump_lock_stats();
-		exit( 4 );
-	}
-	debug_thread_t *t = NULL;
-	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXTHREADS; ++i) {
-		if ( threads[i].tid != 0 ) continue;
-		threads[i].tid = pthread_self();
-		timing_get( &threads[i].time );
-		snprintf( threads[i].name, LOCKLEN, "%s", name );
-		snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
-		t = &threads[i];
-		break;
-	}
-	pthread_mutex_unlock( &initdestory );
+	int idx;
 	if ( t == NULL ) {
-		logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+		if ( first == -1 ) {
+			ULDE( "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+		}
+		t = &threads[first];
+		timing_get( &t->time );
+		t->tid = self;
+		snprintf( t->name, LOCKLEN, "%s", name );
+		snprintf( t->where, LOCKLEN, "%s:%d", file, line );
+		memset( t->locks, 0, sizeof(t->locks) );
+		idx = 0;
+	} else {
+		// Thread already has a slot: look for a free entry in its lock table while checking for lock order violations and recursive locking
+		idx = -1;
+		for (int i = 0; i < MAXLPT; ++i) {
+			if ( t->locks[i] == NULL ) {
+				if ( idx == -1 ) {
+					idx = i;
+				}
+				continue;
+			}
+			if ( t->locks[i]->prio >= l->prio ) {
+				ULDE( "Lock priority violation: %s at %s:%d (%d) when already holding %s at %s (%d)",
+						name, file, line, l->prio,
+						t->locks[i]->name, t->locks[i]->where, t->locks[i]->prio );
+			}
+			if ( t->locks[i] == l ) {
+				ULDE( "Tried to recusively lock %s in the same thread. Tried at %s:%d, when already locked at %s",
+						name, file, line, t->locks[i]->name );
+			}
+		}
+		if ( idx == -1 ) {
+			ULDE( "Thread %d tried to lock more than %d locks.", (int)self, (int)MAXLPT );
+		}
 	}
-	const int retval = pthread_mutex_trylock( lock );
-	pthread_mutex_lock( &initdestory );
-	t->tid = 0;
 	pthread_mutex_unlock( &initdestory );
+	const int retval = try ? pthread_mutex_trylock( lock ) : pthread_mutex_lock( lock );
 	if ( retval == 0 ) {
+		timing_get( &l->locktime );
+		l->thread = self;
+		snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
+		pthread_mutex_lock( &initdestory );
 		if ( l->locked ) {
 			logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
 			exit( 4 );
 		}
-		l->locked = 1;
-		timing_get( &l->locktime );
-		l->thread = pthread_self();
-		snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
-		pthread_mutex_lock( &initdestory );
+		l->locked = true;
+		t->locks[idx] = l;
 		l->lockId = ++lockId;
 		pthread_mutex_unlock( &initdestory );
+	} else if ( !try || retval != EBUSY ) {
+		logadd( LOG_ERROR, "Acquiring lock %s at %s:%d failed with error code %d", name, file, line, retval );
+		debug_dump_lock_stats();
+		exit( 4 );
 	}
 	return retval;
 }
 
 int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock)
 {
-	debug_lock_t *l = NULL;
+	debug_thread_t *t = NULL;
+	pthread_t self = pthread_self();
 	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXLOCKS; ++i) {
-		if ( locks[i].lock == lock ) {
-			l = &locks[i];
+	for (int i = 0; i < MAXTHREADS; ++i) {
+		if ( threads[i].tid == self ) {
+			t = &threads[i];
 			break;
 		}
 	}
-	pthread_mutex_unlock( &initdestory );
-	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to unlock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	if ( t == NULL ) {
+		ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line );
 	}
-	if ( !l->locked ) {
-		logadd( LOG_ERROR, "Unlock sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	int idx = -1;
+	int cnt = 0;
+	for (int i = 0; i < MAXLPT; ++i) {
+		if ( t->locks[i] == NULL )
+			continue;
+		cnt++;
+		if ( t->locks[i]->lock == lock ) {
+			idx = i;
+		}
+	}
+	if ( idx == -1 ) {
+		ULDE( "Unlock: Calling thread doesn't hold lock %s at %s:%d", name, file, line );
 	}
-	l->locked = 0;
+	debug_lock_t *l = t->locks[idx];
+	if ( l->thread != self || !l->locked ) {
+		ULDE( "Unlock sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line );
+	}
+	l->locked = false;
 	l->thread = 0;
+	t->locks[idx] = NULL;
+	if ( cnt == 1 ) {
+		t->tid = 0; // No more locks held, free up slot
+	}
 	snprintf( l->where, LOCKLEN, "U %s:%d", file, line );
-	int retval = pthread_mutex_unlock( lock );
+	pthread_mutex_unlock( &initdestory );
+	const int retval = pthread_mutex_unlock( lock );
+	if ( retval != 0 ) {
+		logadd( LOG_ERROR, "pthread_mutex_unlock returned %d for %s at %s:%d", retval, name, file, line );
+		exit( 4 );
+	}
 	return retval;
 }
 
 int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock)
 {
 	debug_lock_t *l = NULL;
+	debug_thread_t *t = NULL;
+	pthread_t self = pthread_self();
 	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXLOCKS; ++i) {
-		if ( locks[i].lock == lock ) {
-			l = &locks[i];
+	for (int i = 0; i < MAXTHREADS; ++i) {
+		if ( threads[i].tid == self ) {
+			t = &threads[i];
 			break;
 		}
 	}
-	pthread_mutex_unlock( &initdestory );
+	if ( t == NULL ) {
+		ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line );
+	}
+	int mp = 0, mpi = -1;
+	for (int i = 0; i < MAXLPT; ++i) {
+		if ( t->locks[i] == NULL )
+			continue;
+		if ( t->locks[i]->lock == lock ) {
+			l = t->locks[i];
+		} else if ( t->locks[i]->prio > mp ) {
+			mp = t->locks[i]->prio;
+			mpi = i;
+		}
+	}
 	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to cond_wait on uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+		ULDE( "cond_wait: Calling thread doesn't hold lock %s at %s:%d", name, file, line );
 	}
-	if ( !l->locked ) {
-		logadd( LOG_ERROR, "Cond_wait sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	if ( l->thread != self || !l->locked ) {
+		ULDE( "cond_wait: Sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line );
 	}
-	pthread_t self = pthread_self();
-	if ( l->thread != self ) {
-		logadd( LOG_ERROR, "Cond_wait called from non-owning thread for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	if ( mp >= l->prio ) {
+		ULDE( "cond_wait: Yielding a mutex while holding another one with higher prio: %s at %s:%d (%d) while also holding %s at %s (%d)",
+				name, file, line, l->prio,
+				t->locks[mpi]->name, t->locks[mpi]->where, mp );
 	}
-	l->locked = 0;
+	l->locked = false;
 	l->thread = 0;
-	snprintf( l->where, LOCKLEN, "CW %s:%d", file, line );
+	snprintf( l->where, LOCKLEN, "CWU %s:%d", file, line );
+	pthread_mutex_unlock( &initdestory );
 	int retval = pthread_cond_wait( cond, lock );
 	if ( retval != 0 ) {
 		logadd( LOG_ERROR, "pthread_cond_wait returned %d for lock %p (%s) at %s:%d\n", retval, (void*)lock, name, file, line );
 		exit( 4 );
 	}
-	if ( l->locked != 0 || l->thread != 0 ) {
+	if ( l->locked || l->thread != 0 ) {
 		logadd( LOG_ERROR, "Lock is not free after returning from pthread_cond_wait for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
 		exit( 4 );
 	}
-	l->locked = 1;
 	l->thread = self;
 	timing_get( &l->locktime );
+	l->locked = true;
 	pthread_mutex_lock( &initdestory );
 	l->lockId = ++lockId;
 	pthread_mutex_unlock( &initdestory );
@@ -290,63 +303,21 @@ void debug_dump_lock_stats()
 					"* Locked: %d\n", locks[i].name, locks[i].where, (int)locks[i].locked );
 		}
 	}
-	printf( "\n **** WAITING THREADS ****\n\n" );
+	printf( "\n **** ACTIVE THREADS ****\n\n" );
 	for (int i = 0; i < MAXTHREADS; ++i) {
-		if ( threads[i].tid == 0 ) continue;
+		if ( threads[i].tid == 0 )
+			continue;
 		printf( "* *** Thread %d ***\n"
 				"* Lock: %s\n"
 				"* Where: %s\n"
 				"* How long: %d secs\n", (int)threads[i].tid, threads[i].name, threads[i].where, (int)timing_diff( &threads[i].time, &now ) );
-	}
-	pthread_mutex_unlock( &initdestory );
-}
-
-static void *debug_thread_watchdog(void *something UNUSED)
-{
-	setThreadName( "debug-watchdog" );
-	while ( !_shutdown ) {
-		if ( init_done ) {
-			declare_now;
-			pthread_mutex_lock( &initdestory );
-			for (int i = 0; i < MAXTHREADS; ++i) {
-				if ( threads[i].tid == 0 ) continue;
-				const uint32_t diff = timing_diff( &threads[i].time, &now );
-				if ( diff > 6 && diff < 100000 ) {
-					printf( "\n\n +++++++++ DEADLOCK ++++++++++++\n\n" );
-					pthread_mutex_unlock( &initdestory );
-					debug_dump_lock_stats();
-					exit( 99 );
-				}
-			}
-			pthread_mutex_unlock( &initdestory );
+		for (int j = 0; j < MAXLPT; ++j) {
+			if ( threads[i].locks[j] == NULL )
+				continue;
+			printf( "  * Lock %s @ %s\n", threads[i].locks[j]->name, threads[i].locks[j]->where );
 		}
-		if ( watchdogSignal == NULL || signal_wait( watchdogSignal, 5000 ) == SIGNAL_ERROR ) sleep( 5 );
 	}
-	return NULL ;
-}
-
-#endif
-
-void debug_locks_start_watchdog()
-{
-#ifdef _DEBUG
-	watchdogSignal = signal_new();
-	if ( 0 != thread_create( &watchdog, NULL, &debug_thread_watchdog, (void *)NULL ) ) {
-		logadd( LOG_ERROR, "Could not start debug-lock watchdog." );
-		return;
-	}
-#endif
+	pthread_mutex_unlock( &initdestory );
 }
 
-void debug_locks_stop_watchdog()
-{
-#ifdef _DEBUG
-	_shutdown = true;
-	printf( "Killing debug watchdog...\n" );
-	pthread_mutex_lock( &initdestory );
-	signal_call( watchdogSignal );
-	pthread_mutex_unlock( &initdestory );
-	thread_join( watchdog, NULL );
-	signal_close( watchdogSignal );
 #endif
-}
diff --git a/src/server/locks.h b/src/server/locks.h
index 7f72722..e5c9801 100644
--- a/src/server/locks.h
+++ b/src/server/locks.h
@@ -5,19 +5,38 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
+
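+// Lock priorities: a thread may only acquire a lock whose priority is strictly higher than that of every lock it already holds (checked by the lock debugger in _DEBUG builds only)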
+
+#define LOCK_RELOAD 90
+#define LOCK_LOAD_CONFIG 100
+#define LOCK_REMOTE_CLONE 110
+#define LOCK_CLIENT_LIST 120
+#define LOCK_CLIENT 130
+#define LOCK_INTEGRITY_QUEUE 140
+#define LOCK_IMAGE_LIST 150
+#define LOCK_IMAGE 160
+#define LOCK_UPLINK_QUEUE 170
+#define LOCK_ALT_SERVER_LIST 180
+#define LOCK_CLIENT_SEND 190
+#define LOCK_UPLINK_RTT 200
+#define LOCK_UPLINK_SEND 210
+#define LOCK_RPC_ACL 220
+
+//
 
 #ifdef _DEBUG
 
-#define mutex_init( lock ) debug_mutex_init( #lock, __FILE__, __LINE__, lock)
-#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock)
-#define mutex_trylock( lock ) debug_mutex_trylock( #lock, __FILE__, __LINE__, lock)
+#define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio)
+#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false)
+#define mutex_trylock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, true)
 #define mutex_unlock( lock ) debug_mutex_unlock( #lock, __FILE__, __LINE__, lock)
 #define mutex_cond_wait( cond, lock ) debug_mutex_cond_wait( #lock, __FILE__, __LINE__, cond, lock)
 #define mutex_destroy( lock ) debug_mutex_destroy( #lock, __FILE__, __LINE__, lock)
 
-int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock);
-int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock);
-int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock);
+int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority);
+int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try);
 int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock);
 int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock);
 int debug_mutex_destroy(const char *name, const char *file, int line, pthread_mutex_t *lock);
@@ -27,7 +46,7 @@ void debug_dump_lock_stats();
 
 #else
 
-#define mutex_init( lock ) pthread_mutex_init(lock, NULL)
+#define mutex_init( lock, prio ) pthread_mutex_init(lock, NULL)
 #define mutex_lock( lock ) pthread_mutex_lock(lock)
 #define mutex_trylock( lock ) pthread_mutex_trylock(lock)
 #define mutex_unlock( lock ) pthread_mutex_unlock(lock)
@@ -82,7 +101,4 @@ static inline int debug_thread_join(pthread_t thread, void **value_ptr)
 
 #endif
 
-void debug_locks_start_watchdog();
-void debug_locks_stop_watchdog();
-
 #endif /* LOCKS_H_ */
diff --git a/src/server/net.c b/src/server/net.c
index 92728c0..8f97a12 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -145,7 +145,7 @@ static inline bool sendPadding( const int fd, uint32_t bytes )
 
 void net_init()
 {
-	mutex_init( &_clients_lock );
+	mutex_init( &_clients_lock, LOCK_CLIENT_LIST );
 }
 
 void* net_handleNewConnection(void *clientPtr)
@@ -186,8 +186,8 @@ void* net_handleNewConnection(void *clientPtr)
 		}
 	} while (0);
 	// Fully init client struct
-	mutex_init( &client->lock );
-	mutex_init( &client->sendMutex );
+	mutex_init( &client->lock, LOCK_CLIENT );
+	mutex_init( &client->sendMutex, LOCK_CLIENT_SEND );
 
 	mutex_lock( &client->lock );
 	host_to_string( &client->host, client->hostName, HOSTNAMELEN );
diff --git a/src/server/rpc.c b/src/server/rpc.c
index 5dbcafe..261c6c0 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -75,10 +75,9 @@ static json_int_t randomRunId;
 static pthread_mutex_t aclLock;
 #define MAX_CLIENTS 50
 #define CUTOFF_START 40
-static pthread_mutex_t statusLock;
 static struct {
-	int count;
-	bool overloaded;
+	atomic_int count;
+	atomic_bool overloaded;
 } status;
 
 static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
@@ -91,8 +90,7 @@ static void loadAcl();
 
 void rpc_init()
 {
-	mutex_init( &aclLock );
-	mutex_init( &statusLock );
+	mutex_init( &aclLock, LOCK_RPC_ACL );
 	randomRunId = (((json_int_t)getpid()) << 16) | (json_int_t)time(NULL);
 	// </guard>
 	if ( sizeof(randomRunId) > 4 ) {
@@ -123,10 +121,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 		return;
 	}
 	do {
-		mutex_lock( &statusLock );
 		const int curCount = ++status.count;
 		UPDATE_LOADSTATE( curCount );
-		mutex_unlock( &statusLock );
 		if ( curCount > MAX_CLIENTS ) {
 			sendReply( sock, "503 Service Temporarily Unavailable", "text/plain", "Too many HTTP clients", -1, HTTP_CLOSE );
 			goto func_return;
@@ -198,9 +194,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) {
 				keepAlive = HTTP_CLOSE;
 			} else { // And if there aren't too many active HTTP sessions
-				mutex_lock( &statusLock );
 				if ( status.overloaded ) keepAlive = HTTP_CLOSE;
-				mutex_unlock( &statusLock );
 			}
 		}
 		if ( method.s != NULL && path.s != NULL ) {
@@ -234,10 +228,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 	} while (true);
 func_return:;
 	do {
-		mutex_lock( &statusLock );
 		const int curCount = --status.count;
 		UPDATE_LOADSTATE( curCount );
-		mutex_unlock( &statusLock );
 	} while (0);
 }
 
diff --git a/src/server/server.c b/src/server/server.c
index 10ab208..838aec2 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -133,9 +133,6 @@ void dnbd3_cleanup()
 	// Wait for clients to disconnect
 	net_waitForAllDisconnected();
 
-	// Watchdog not needed anymore
-	debug_locks_stop_watchdog();
-
 	// Clean up images
 	retries = 5;
 	while ( !image_tryFreeAll() && --retries > 0 ) {
@@ -303,10 +300,6 @@ int main(int argc, char *argv[])
 		logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
 	}
 
-#ifdef _DEBUG
-	debug_locks_start_watchdog();
-#endif
-
 	// setup signal handler
 	struct sigaction sa;
 	memset( &sa, 0, sizeof(sa) );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index bb1ffdc..9570273 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -89,9 +89,9 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		goto failure;
 	}
 	link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) );
-	mutex_init( &link->queueLock );
-	mutex_init( &link->rttLock );
-	mutex_init( &link->sendMutex );
+	mutex_init( &link->queueLock, LOCK_UPLINK_QUEUE );
+	mutex_init( &link->rttLock, LOCK_UPLINK_RTT );
+	mutex_init( &link->sendMutex, LOCK_UPLINK_SEND );
 	link->image = image;
 	link->bytesReceived = 0;
 	link->idleTime = 0;
-- 
cgit v1.2.3-55-g7522


From 926754534985c5ffbb277e5abf40cf9aa72b9fff Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 15 Aug 2019 14:25:28 +0200
Subject: [SHARED] Better errno handling in connect() helper

---
 src/shared/sockhelper.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src')

diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c
index ab34aa1..ec80659 100644
--- a/src/shared/sockhelper.c
+++ b/src/shared/sockhelper.c
@@ -46,6 +46,7 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
 #endif
 	else {
 		logadd( LOG_DEBUG1, "Unsupported address type: %d\n", (int)addr->type );
+		errno = EAFNOSUPPORT;
 		return -1;
 	}
 	int client_sock = socket( proto, SOCK_STREAM, IPPROTO_TCP );
@@ -56,8 +57,10 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
 	} else {
 		sock_setTimeout( client_sock, connect_ms );
 	}
+	int e2;
 	for ( int i = 0; i < 5; ++i ) {
 		int ret = connect( client_sock, (struct sockaddr *)&ss, addrlen );
+		e2 = errno;
 		if ( ret != -1 || errno == EINPROGRESS || errno == EISCONN ) break;
 		if ( errno == EINTR ) {
 			// http://www.madore.org/~david/computers/connect-intr.html
@@ -67,21 +70,26 @@ int sock_connect(const dnbd3_host_t * const addr, const int connect_ms, const in
 			struct pollfd unix_really_sucks = { .fd = client_sock, .events = POLLOUT | POLLIN };
 			while ( i-- > 0 ) {
 				int pr = poll( &unix_really_sucks, 1, connect_ms == 0 ? -1 : connect_ms );
+				e2 = errno;
 				if ( pr == 1 && ( unix_really_sucks.revents & POLLOUT ) ) break;
 				if ( pr == -1 && errno == EINTR ) continue;
 				close( client_sock );
+				errno = e2;
 				return -1;
 			}
 			sockaddr_storage junk;
 			socklen_t more_junk = sizeof(junk);
 			if ( getpeername( client_sock, (struct sockaddr*)&junk, &more_junk ) == -1 ) {
+				e2 = errno;
 				close( client_sock );
+				errno = e2;
 				return -1;
 			}
 			break;
 #endif
 		} // EINTR
 		close( client_sock );
+		errno = e2;
 		return -1;
 	}
 	if ( connect_ms != -1 && connect_ms != rw_ms ) {
-- 
cgit v1.2.3-55-g7522


From 0aca693bede4fe7e7e8098cbe33a96a88bc0ec85 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 16 Aug 2019 15:02:47 +0200
Subject: [SERVER] Lock free thread pool

---
 src/server/threadpool.c | 110 +++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 53 deletions(-)

(limited to 'src')

diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 340a98d..3947677 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -4,7 +4,6 @@
 #include "locks.h"
 
 typedef struct _entry_t {
-	struct _entry_t *next;
 	pthread_t thread;
 	dnbd3_signal_t* signal;
 	void *(*startRoutine)(void *);
@@ -14,17 +13,20 @@ typedef struct _entry_t {
 static void *threadpool_worker(void *entryPtr);
 
 static pthread_attr_t threadAttrs;
-
-static int maxIdleThreads = -1;
-static atomic_int currentIdleThreads = 0;
-static entry_t *pool = NULL;
-static pthread_spinlock_t poolLock;
+static atomic_int maxIdleThreads = -1;
+static _Atomic(entry_t *) *pool = NULL;
 
 bool threadpool_init(int maxIdle)
 {
-	if ( maxIdle < 0 || maxIdleThreads >= 0 ) return false;
-	pthread_spin_init( &poolLock, PTHREAD_PROCESS_PRIVATE );
-	maxIdleThreads = maxIdle;
+	if ( maxIdle < 0 )
+		return false;
+	int exp = -1;
+	if ( !atomic_compare_exchange_strong( &maxIdleThreads, &exp, maxIdle ) )
+		return false;
+	pool = malloc( maxIdle * sizeof(*pool) );
+	for ( int i = 0; i < maxIdle; ++i ) {
+		atomic_init( &pool[i], NULL );
+	}
 	pthread_attr_init( &threadAttrs );
 	pthread_attr_setdetachstate( &threadAttrs, PTHREAD_CREATE_DETACHED );
 	return true;
@@ -33,19 +35,15 @@ bool threadpool_init(int maxIdle)
 void threadpool_close()
 {
 	_shutdown = true;
-	if ( maxIdleThreads < 0 ) return;
-	pthread_spin_lock( &poolLock );
+	int max = maxIdleThreads;
 	maxIdleThreads = -1;
-	entry_t *ptr = pool;
-	pool = NULL;
-	currentIdleThreads = 0;
-	pthread_spin_unlock( &poolLock );
-	while ( ptr != NULL ) {
-		entry_t *current = ptr;
-		ptr = ptr->next;
-		signal_call( current->signal );
+	if ( max <= 0 ) return;
+	for ( int i = 0; i < max; ++i ) {
+		entry_t *cur = pool[i];
+		if ( cur != NULL && atomic_compare_exchange_strong( &pool[i], &cur, NULL ) ) {
+			signal_call( cur->signal );
+		}
 	}
-	pthread_spin_destroy( &poolLock );
 }
 
 bool threadpool_run(void *(*startRoutine)(void *), void *arg)
@@ -54,15 +52,16 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 		logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
 		return false; // Or bail out!?
 	}
-	pthread_spin_lock( &poolLock );
-	entry_t *entry = pool;
-	if ( entry != NULL ) {
-		pool = entry->next;
-		currentIdleThreads--;
+	entry_t *entry = NULL;
+	for ( int i = 0; i < maxIdleThreads; ++i ) {
+		entry_t *cur = pool[i];
+		if ( cur != NULL && atomic_compare_exchange_weak( &pool[i], &cur, NULL ) ) {
+			entry = cur;
+			break;
+		}
 	}
-	pthread_spin_unlock( &poolLock );
 	if ( entry == NULL ) {
-		entry = (entry_t*)malloc( sizeof(entry_t) );
+		entry = malloc( sizeof(entry_t) );
 		if ( entry == NULL ) {
 			logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" );
 			return false;
@@ -80,9 +79,9 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 			return false;
 		}
 	}
-	entry->next = NULL;
 	entry->startRoutine = startRoutine;
 	entry->arg = arg;
+	atomic_thread_fence( memory_order_release );
 	signal_call( entry->signal );
 	return true;
 }
@@ -94,35 +93,40 @@ static void *threadpool_worker(void *entryPtr)
 {
 	blockNoncriticalSignals();
 	entry_t *entry = (entry_t*)entryPtr;
+	int ret;
 	for ( ;; ) {
+keep_going:;
 		// Wait for signal from outside that we have work to do
-		int ret = signal_clear( entry->signal );
-		if ( _shutdown ) break;
-		if ( ret > 0 ) {
-			if ( entry->startRoutine == NULL ) {
-				logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
-				exit( 1 );
-			}
-			// Start assigned work
-			(*entry->startRoutine)( entry->arg );
-			// Reset vars for safety
-			entry->startRoutine = NULL;
-			entry->arg = NULL;
-			if ( _shutdown ) break;
-			// Put thread back into pool if there are less than maxIdleThreds threads, just die otherwise
-			if ( currentIdleThreads >= maxIdleThreads )
-				break;
-			// Race condition as we checked before locking, but worst case we have a couple
-			// too many threads idling around. At least the count stays accurate.
-			setThreadName( "[pool]" );
-			pthread_spin_lock( &poolLock );
-			currentIdleThreads++;
-			entry->next = pool;
-			pool = entry;
-			pthread_spin_unlock( &poolLock );
-		} else {
+		ret = signal_clear( entry->signal );
+		atomic_thread_fence( memory_order_acquire );
+		if ( _shutdown )
+			break;
+		if ( ret <= 0 ) {
 			logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
+			continue;
+		}
+		if ( entry->startRoutine == NULL ) {
+			logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
+			exit( 1 );
+		}
+		// Start assigned work
+		(*entry->startRoutine)( entry->arg );
+		// Reset vars for safety
+		entry->startRoutine = NULL;
+		entry->arg = NULL;
+		atomic_thread_fence( memory_order_release );
+		if ( _shutdown )
+			break;
+		// Put thread back into pool
+		setThreadName( "[pool]" );
+		for ( int i = 0; i < maxIdleThreads; ++i ) {
+			entry_t *exp = NULL;
+			if ( atomic_compare_exchange_weak( &pool[i], &exp, entry ) ) {
+				goto keep_going;
+			}
 		}
+		// Reaching here means pool is full; just let the thread exit
+		break;
 	}
 	signal_close( entry->signal );
 	free( entry );
-- 
cgit v1.2.3-55-g7522


From 573e620bb1811fe81c64b86aeb5728e0437eea9f Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sun, 18 Aug 2019 18:18:16 +0200
Subject: [SERVER] net.c: Minor reordering

---
 src/server/net.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/server/net.c b/src/server/net.c
index 8f97a12..5de9f14 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -532,16 +532,15 @@ exit_client_cleanup: ;
 	removeFromList( client );
 	totalBytesSent += client->bytesSent;
 	// Access time, but only if client didn't just probe
-	if ( image != NULL ) {
+	if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
 		mutex_lock( &image->lock );
-		if ( client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
-			timing_get( &image->atime );
-		}
+		timing_get( &image->atime );
 		mutex_unlock( &image->lock );
 	}
 	freeClientStruct( client ); // This will also call image_release on client->image
 	return NULL ;
 fail_preadd: ;
+	// This is before we even initialized any mutex
 	close( client->sock );
 	free( client );
 	return NULL;
@@ -688,15 +687,17 @@ static void removeFromList(dnbd3_client_t *client)
 static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
 {
 	mutex_lock( &client->lock );
-	mutex_lock( &client->sendMutex );
-	if ( client->sock != -1 ) close( client->sock );
-	client->sock = -1;
-	mutex_unlock( &client->sendMutex );
 	if ( client->image != NULL ) {
 		mutex_lock( &client->image->lock );
 		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
 		mutex_unlock( &client->image->lock );
 	}
+	mutex_lock( &client->sendMutex );
+	if ( client->sock != -1 ) {
+		close( client->sock );
+	}
+	client->sock = -1;
+	mutex_unlock( &client->sendMutex );
 	mutex_unlock( &client->lock );
 	client->image = image_release( client->image );
 	mutex_destroy( &client->lock );
-- 
cgit v1.2.3-55-g7522


From 61913e7277e7c1c8f7a6573c2e3676a3fb0e70c2 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sun, 18 Aug 2019 20:54:17 +0200
Subject: [SERVER] altservers: Don't run check if <= 1 alt server available

---
 src/server/altservers.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 3d5e71e..60c046c 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -18,7 +18,7 @@ static dnbd3_connection_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS];
 static dnbd3_signal_t * _Atomic runSignal = NULL;
 
 static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
-static int numAltServers = 0;
+static atomic_int numAltServers = 0;
 static pthread_mutex_t altServersLock;
 
 static pthread_t altThread;
@@ -125,6 +125,8 @@ void altservers_findUplink(dnbd3_connection_t *uplink)
 {
 	if ( uplink->shutdown )
 		return;
+	if ( uplink->fd != -1 && numAltServers <= 1 )
+		return;
 	int i;
 	// if betterFd != -1 it means the uplink is supposed to switch to another
 	// server. As this function here is called by the uplink thread, it can
@@ -454,6 +456,13 @@ static void *altservers_main(void *data UNUSED)
 			dnbd3_connection_t * const uplink = pending[itLink];
 			if ( uplink == NULL )
 				continue;
+			// First, get 4 alt servers
+			numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 );
+			// If we're already connected and only got one server anyways, there isn't much to do
+			if ( numAlts <= 1 && uplink->fd != -1 ) {
+				uplink->rttTestResult = RTT_DONTCHANGE;
+				continue;
+			}
 			dnbd3_image_t * const image = image_lock( uplink->image );
 			if ( image == NULL ) { // Check again after locking
 				mutex_lock( &uplink->rttLock );
@@ -461,13 +470,11 @@ static void *altservers_main(void *data UNUSED)
 				assert( pending[itLink] == uplink );
 				pending[itLink] = NULL;
 				mutex_unlock( &uplink->rttLock );
-				logadd( LOG_DEBUG1, "Image has gone away that was queued for RTT measurement" );
+				logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
 				continue;
 			}
 			LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
 			assert( uplink->rttTestResult == RTT_INPROGRESS );
-			// Now get 4 alt servers
-			numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 );
 			if ( uplink->fd != -1 ) {
 				// Add current server if not already in list
 				found = false;
-- 
cgit v1.2.3-55-g7522


From da0950ad342bae3b40a74bf82dba6c1f82e7eb57 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sun, 18 Aug 2019 21:31:56 +0200
Subject: [SERVER] uplink: More consistent type/variable naming

* Change link to uplink everywhere
* dnbd3_connection_t -> dnbd3_uplink_t
---
 src/server/altservers.c |  10 +-
 src/server/altservers.h |   4 +-
 src/server/globals.h    |  12 +-
 src/server/uplink.c     | 554 ++++++++++++++++++++++++------------------------
 src/server/uplink.h     |   2 +-
 5 files changed, 294 insertions(+), 288 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 60c046c..1001981 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -14,7 +14,7 @@
 #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
 #define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
 
-static dnbd3_connection_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS];
+static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS];
 static dnbd3_signal_t * _Atomic runSignal = NULL;
 
 static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
@@ -121,7 +121,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 /**
  * ONLY called from the passed uplink's main thread
  */
-void altservers_findUplink(dnbd3_connection_t *uplink)
+void altservers_findUplink(dnbd3_uplink_t *uplink)
 {
 	if ( uplink->shutdown )
 		return;
@@ -149,7 +149,7 @@ void altservers_findUplink(dnbd3_connection_t *uplink)
 	uplink->rttTestResult = RTT_INPROGRESS;
 	for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
 		if ( pending[i] != NULL ) continue;
-		dnbd3_connection_t *null = NULL;
+		dnbd3_uplink_t *null = NULL;
 		if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) {
 			mutex_unlock( &uplink->rttLock );
 			atomic_thread_fence( memory_order_release );
@@ -167,7 +167,7 @@ void altservers_findUplink(dnbd3_connection_t *uplink)
  * The given uplink is about to disappear,
  * wait until any pending RTT check is done.
  */
-void altservers_removeUplink(dnbd3_connection_t *uplink)
+void altservers_removeUplink(dnbd3_uplink_t *uplink)
 {
 	assert( uplink != NULL );
 	assert( uplink->shutdown );
@@ -453,7 +453,7 @@ static void *altservers_main(void *data UNUSED)
 		// Work your way through the queue
 		atomic_thread_fence( memory_order_acquire );
 		for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
-			dnbd3_connection_t * const uplink = pending[itLink];
+			dnbd3_uplink_t * const uplink = pending[itLink];
 			if ( uplink == NULL )
 				continue;
 			// First, get 4 alt servers
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 7b7b46d..e03b900 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -13,9 +13,9 @@ int altservers_load();
 
 bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
 
-void altservers_findUplink(dnbd3_connection_t *uplink);
+void altservers_findUplink(dnbd3_uplink_t *uplink);
 
-void altservers_removeUplink(dnbd3_connection_t *uplink);
+void altservers_removeUplink(dnbd3_uplink_t *uplink);
 
 int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
 
diff --git a/src/server/globals.h b/src/server/globals.h
index 86b8865..0371e33 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -13,7 +13,7 @@ typedef struct timespec ticks;
 
 // ######### All structs/types used by the server ########
 
-typedef struct _dnbd3_connection dnbd3_connection_t;
+typedef struct _dnbd3_uplink dnbd3_uplink_t;
 typedef struct _dnbd3_image dnbd3_image_t;
 typedef struct _dnbd3_client dnbd3_client_t;
 
@@ -30,12 +30,18 @@ typedef struct
 	uint8_t hopCount;      // How many hops this request has already taken across proxies
 } dnbd3_queued_request_t;
 
+typedef struct {
+	int fd;
+	int version;
+	dnbd3_host_t host;
+} dnbd3_server_connection_t;
+
 #define RTT_IDLE 0 // Not in progress
 #define RTT_INPROGRESS 1 // In progess, not finished
 #define RTT_DONTCHANGE 2 // Finished, but no better alternative found
 #define RTT_DOCHANGE 3 // Finished, better alternative written to .betterServer + .betterFd
 #define RTT_NOT_REACHABLE 4 // No uplink was reachable
-struct _dnbd3_connection
+struct _dnbd3_uplink
 {
 	int fd;                     // socket fd to remote server
 	int version;                // remote server protocol version
@@ -94,7 +100,7 @@ struct _dnbd3_image
 {
 	char *path;            // absolute path of the image
 	char *name;            // public name of the image (usually relative path minus revision ID)
-	dnbd3_connection_t *uplink; // pointer to a server connection
+	dnbd3_uplink_t *uplink; // pointer to a server connection
 	uint8_t *cache_map;    // cache map telling which parts are locally cached, NULL if complete
 	uint64_t virtualFilesize;   // virtual size of image (real size rounded up to multiple of 4k)
 	uint64_t realFilesize;      // actual file size on disk
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 9570273..7d66b21 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -46,16 +46,16 @@ static const char *const NAMES_ULR[4] = {
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
 static void* uplink_mainloop(void *data);
-static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly);
-static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int lastBlockIndex);
-static void uplink_handleReceive(dnbd3_connection_t *link);
+static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
+static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
+static void uplink_handleReceive(dnbd3_uplink_t *uplink);
 static int uplink_sendKeepalive(const int fd);
-static void uplink_addCrc32(dnbd3_connection_t *uplink);
-static void uplink_sendReplicationRequest(dnbd3_connection_t *link);
-static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force);
-static bool uplink_saveCacheMap(dnbd3_connection_t *link);
-static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link);
-static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew);
+static void uplink_addCrc32(dnbd3_uplink_t *uplink);
+static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
+static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
+static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
+static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
 
 // ############ uplink connection handling
 
@@ -76,7 +76,7 @@ uint64_t uplink_getTotalBytesReceived()
 bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version)
 {
 	if ( !_isProxy || _shutdown ) return false;
-	dnbd3_connection_t *link = NULL;
+	dnbd3_uplink_t *uplink = NULL;
 	assert( image != NULL );
 	mutex_lock( &image->lock );
 	if ( image->uplink != NULL && !image->uplink->shutdown ) {
@@ -88,44 +88,44 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
 		goto failure;
 	}
-	link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) );
-	mutex_init( &link->queueLock, LOCK_UPLINK_QUEUE );
-	mutex_init( &link->rttLock, LOCK_UPLINK_RTT );
-	mutex_init( &link->sendMutex, LOCK_UPLINK_SEND );
-	link->image = image;
-	link->bytesReceived = 0;
-	link->idleTime = 0;
-	link->queueLen = 0;
-	mutex_lock( &link->sendMutex );
-	link->fd = -1;
-	mutex_unlock( &link->sendMutex );
-	link->cacheFd = -1;
-	link->signal = NULL;
-	link->replicationHandle = REP_NONE;
-	mutex_lock( &link->rttLock );
-	link->cycleDetected = false;
+	uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
+	mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
+	mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
+	mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
+	uplink->image = image;
+	uplink->bytesReceived = 0;
+	uplink->idleTime = 0;
+	uplink->queueLen = 0;
+	mutex_lock( &uplink->sendMutex );
+	uplink->fd = -1;
+	mutex_unlock( &uplink->sendMutex );
+	uplink->cacheFd = -1;
+	uplink->signal = NULL;
+	uplink->replicationHandle = REP_NONE;
+	mutex_lock( &uplink->rttLock );
+	uplink->cycleDetected = false;
 	if ( sock >= 0 ) {
-		link->betterFd = sock;
-		link->betterServer = *host;
-		link->rttTestResult = RTT_DOCHANGE;
-		link->betterVersion = version;
+		uplink->betterFd = sock;
+		uplink->betterServer = *host;
+		uplink->rttTestResult = RTT_DOCHANGE;
+		uplink->betterVersion = version;
 	} else {
-		link->betterFd = -1;
-		link->rttTestResult = RTT_IDLE;
+		uplink->betterFd = -1;
+		uplink->rttTestResult = RTT_IDLE;
 	}
-	mutex_unlock( &link->rttLock );
-	link->recvBufferLen = 0;
-	link->shutdown = false;
-	if ( 0 != thread_create( &(link->thread), NULL, &uplink_mainloop, (void *)link ) ) {
+	mutex_unlock( &uplink->rttLock );
+	uplink->recvBufferLen = 0;
+	uplink->shutdown = false;
+	if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) {
 		logadd( LOG_ERROR, "Could not start thread for new uplink." );
 		goto failure;
 	}
 	mutex_unlock( &image->lock );
 	return true;
 failure: ;
-	if ( link != NULL ) {
-		free( link );
-		link = image->uplink = NULL;
+	if ( uplink != NULL ) {
+		free( uplink );
+		uplink = image->uplink = NULL;
 	}
 	mutex_unlock( &image->lock );
 	return false;
@@ -146,7 +146,7 @@ void uplink_shutdown(dnbd3_image_t *image)
 		mutex_unlock( &image->lock );
 		return;
 	}
-	dnbd3_connection_t * const uplink = image->uplink;
+	dnbd3_uplink_t * const uplink = image->uplink;
 	mutex_lock( &uplink->queueLock );
 	if ( !uplink->shutdown ) {
 		uplink->shutdown = true;
@@ -170,7 +170,7 @@ void uplink_shutdown(dnbd3_image_t *image)
  * Remove given client from uplink request queue
  * Locks on: uplink.queueLock
  */
-void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client)
+void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
 {
 	mutex_lock( &uplink->queueLock );
 	for (int i = uplink->queueLen - 1; i >= 0; --i) {
@@ -203,7 +203,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
 		return false;
 	}
-	dnbd3_connection_t * const uplink = client->image->uplink;
+	dnbd3_uplink_t * const uplink = client->image->uplink;
 	if ( uplink->shutdown ) {
 		mutex_unlock( &client->image->lock );
 		logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
@@ -370,7 +370,7 @@ static void* uplink_mainloop(void *data)
 #define EV_SOCKET (1)
 #define EV_COUNT  (2)
 	struct pollfd events[EV_COUNT];
-	dnbd3_connection_t * const link = (dnbd3_connection_t*)data;
+	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
 	int numSocks, i, waitTime;
 	int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
 	uint32_t discoverFailCount = 0;
@@ -381,31 +381,31 @@ static void* uplink_mainloop(void *data)
 	timing_get( &nextAltCheck );
 	lastKeepalive = nextAltCheck;
 	//
-	assert( link != NULL );
+	assert( uplink != NULL );
 	setThreadName( "idle-uplink" );
 	blockNoncriticalSignals();
 	// Make sure file is open for writing
-	if ( !uplink_reopenCacheFd( link, false ) ) {
+	if ( !uplink_reopenCacheFd( uplink, false ) ) {
 		// It might have failed - still offer proxy mode, we just can't cache
-		logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", link->image->path, errno );
+		logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno );
 	}
 	//
-	link->signal = signal_new();
-	if ( link->signal == NULL ) {
+	uplink->signal = signal_new();
+	if ( uplink->signal == NULL ) {
 		logadd( LOG_WARNING, "error creating signal. Uplink unavailable." );
 		goto cleanup;
 	}
 	events[EV_SIGNAL].events = POLLIN;
-	events[EV_SIGNAL].fd = signal_getWaitFd( link->signal );
+	events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal );
 	events[EV_SOCKET].fd = -1;
-	while ( !_shutdown && !link->shutdown ) {
+	while ( !_shutdown && !uplink->shutdown ) {
 		// poll()
-		mutex_lock( &link->rttLock );
-		waitTime = link->rttTestResult == RTT_DOCHANGE ? 0 : -1;
-		mutex_unlock( &link->rttLock );
+		mutex_lock( &uplink->rttLock );
+		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
+		mutex_unlock( &uplink->rttLock );
 		if ( waitTime == 0 ) {
 			// Nothing
-		} else if ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) {
+		} else if ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) {
 			waitTime = 1000;
 		} else {
 			declare_now;
@@ -413,9 +413,9 @@ static void* uplink_mainloop(void *data)
 			if ( waitTime < 100 ) waitTime = 100;
 			if ( waitTime > 5000 ) waitTime = 5000;
 		}
-		events[EV_SOCKET].fd = link->fd;
+		events[EV_SOCKET].fd = uplink->fd;
 		numSocks = poll( events, EV_COUNT, waitTime );
-		if ( _shutdown || link->shutdown ) goto cleanup;
+		if ( _shutdown || uplink->shutdown ) goto cleanup;
 		if ( numSocks == -1 ) { // Error?
 			if ( errno == EINTR ) continue;
 			logadd( LOG_DEBUG1, "poll() error %d", (int)errno );
@@ -423,39 +423,39 @@ static void* uplink_mainloop(void *data)
 			continue;
 		}
 		// Check if server switch is in order
-		mutex_lock( &link->rttLock );
-		if ( link->rttTestResult != RTT_DOCHANGE ) {
-			mutex_unlock( &link->rttLock );
+		mutex_lock( &uplink->rttLock );
+		if ( uplink->rttTestResult != RTT_DOCHANGE ) {
+			mutex_unlock( &uplink->rttLock );
 		} else {
-			link->rttTestResult = RTT_IDLE;
+			uplink->rttTestResult = RTT_IDLE;
 			// The rttTest worker thread has finished our request.
 			// And says it's better to switch to another server
-			const int fd = link->fd;
-			mutex_lock( &link->sendMutex );
-			link->fd = link->betterFd;
-			mutex_unlock( &link->sendMutex );
-			link->betterFd = -1;
-			link->currentServer = link->betterServer;
-			link->version = link->betterVersion;
-			link->cycleDetected = false;
-			mutex_unlock( &link->rttLock );
+			const int fd = uplink->fd;
+			mutex_lock( &uplink->sendMutex );
+			uplink->fd = uplink->betterFd;
+			mutex_unlock( &uplink->sendMutex );
+			uplink->betterFd = -1;
+			uplink->currentServer = uplink->betterServer;
+			uplink->version = uplink->betterVersion;
+			uplink->cycleDetected = false;
+			mutex_unlock( &uplink->rttLock );
 			discoverFailCount = 0;
 			if ( fd != -1 ) close( fd );
-			link->replicationHandle = REP_NONE;
-			link->image->working = true;
-			link->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
+			uplink->replicationHandle = REP_NONE;
+			uplink->image->working = true;
+			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
-			if ( host_to_string( &link->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) {
-				logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", link->image->name, buffer + 1 );
+			if ( host_to_string( &uplink->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) {
+				logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 );
 				setThreadName( buffer );
 			}
 			// If we don't have a crc32 list yet, see if the new server has one
-			if ( link->image->crc32 == NULL ) {
-				uplink_addCrc32( link );
+			if ( uplink->image->crc32 == NULL ) {
+				uplink_addCrc32( uplink );
 			}
 			// Re-send all pending requests
-			uplink_sendRequests( link, false );
-			uplink_sendReplicationRequest( link );
+			uplink_sendRequests( uplink, false );
+			uplink_sendReplicationRequest( uplink );
 			events[EV_SOCKET].events = POLLIN | POLLRDHUP;
 			timing_gets( &nextAltCheck, altCheckInterval );
 			// The rtt worker already did the handshake for our image, so there's nothing
@@ -468,161 +468,161 @@ static void* uplink_mainloop(void *data)
 			goto cleanup;
 		} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
 			// signal triggered -> pending requests
-			if ( signal_clear( link->signal ) == SIGNAL_ERROR ) {
-				logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", link->image->name );
+			if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) {
+				logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name );
 			}
-			if ( link->fd != -1 ) {
+			if ( uplink->fd != -1 ) {
 				// Uplink seems fine, relay requests to it...
-				uplink_sendRequests( link, true );
+				uplink_sendRequests( uplink, true );
 			} else { // No uplink; maybe it was shutdown since it was idle for too long
-				link->idleTime = 0;
+				uplink->idleTime = 0;
 			}
 		}
 		// Uplink socket
 		if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
-			uplink_connectionFailed( link, true );
+			uplink_connectionFailed( uplink, true );
 			logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" );
 			setThreadName( "panic-uplink" );
 		} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
-			uplink_handleReceive( link );
-			if ( _shutdown || link->shutdown ) goto cleanup;
+			uplink_handleReceive( uplink );
+			if ( _shutdown || uplink->shutdown ) goto cleanup;
 		}
 		declare_now;
 		uint32_t timepassed = timing_diff( &lastKeepalive, &now );
 		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
 			lastKeepalive = now;
-			link->idleTime += timepassed;
+			uplink->idleTime += timepassed;
 			unsavedSeconds += timepassed;
-			if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && link->idleTime >= 20 && link->idleTime <= 70 ) ) {
-				// fsync/save every 4 minutes, or every 60 seconds if link is idle
+			if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) {
+				// fsync/save every 4 minutes, or every 60 seconds if uplink is idle
 				unsavedSeconds = 0;
-				uplink_saveCacheMap( link );
+				uplink_saveCacheMap( uplink );
 			}
 			// Keep-alive
-			if ( link->fd != -1 && link->replicationHandle == REP_NONE ) {
+			if ( uplink->fd != -1 && uplink->replicationHandle == REP_NONE ) {
 				// Send keep-alive if nothing is happening
-				if ( uplink_sendKeepalive( link->fd ) ) {
+				if ( uplink_sendKeepalive( uplink->fd ) ) {
 					// Re-trigger periodically, in case it requires a minimum user count
-					uplink_sendReplicationRequest( link );
+					uplink_sendReplicationRequest( uplink );
 				} else {
-					uplink_connectionFailed( link, true );
+					uplink_connectionFailed( uplink, true );
 					logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
 					setThreadName( "panic-uplink" );
 				}
 			}
-			// Don't keep link established if we're idle for too much
-			if ( link->fd != -1 && uplink_connectionShouldShutdown( link ) ) {
-				mutex_lock( &link->sendMutex );
-				close( link->fd );
-				link->fd = events[EV_SOCKET].fd = -1;
-				mutex_unlock( &link->sendMutex );
-				link->cycleDetected = false;
-				if ( link->recvBufferLen != 0 ) {
-					link->recvBufferLen = 0;
-					free( link->recvBuffer );
-					link->recvBuffer = NULL;
+			// Don't keep uplink established if we're idle for too much
+			if ( uplink->fd != -1 && uplink_connectionShouldShutdown( uplink ) ) {
+				mutex_lock( &uplink->sendMutex );
+				close( uplink->fd );
+				uplink->fd = events[EV_SOCKET].fd = -1;
+				mutex_unlock( &uplink->sendMutex );
+				uplink->cycleDetected = false;
+				if ( uplink->recvBufferLen != 0 ) {
+					uplink->recvBufferLen = 0;
+					free( uplink->recvBuffer );
+					uplink->recvBuffer = NULL;
 				}
-				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", link->image->name, (int)link->image->rid );
+				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid );
 				setThreadName( "idle-uplink" );
 			}
 		}
 		// See if we should trigger an RTT measurement
-		mutex_lock( &link->rttLock );
-		const int rttTestResult = link->rttTestResult;
-		mutex_unlock( &link->rttLock );
+		mutex_lock( &uplink->rttLock );
+		const int rttTestResult = uplink->rttTestResult;
+		mutex_unlock( &uplink->rttLock );
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
-			if ( timing_reached( &nextAltCheck, &now ) || ( link->fd == -1 && !uplink_connectionShouldShutdown( link ) ) || link->cycleDetected ) {
+			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
 				// It seems it's time for a check
-				if ( image_isComplete( link->image ) ) {
+				if ( image_isComplete( uplink->image ) ) {
 					// Quit work if image is complete
-					logadd( LOG_INFO, "Replication of %s complete.", link->image->name );
+					logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
 					setThreadName( "finished-uplink" );
 					goto cleanup;
-				} else if ( !uplink_connectionShouldShutdown( link ) ) {
+				} else if ( !uplink_connectionShouldShutdown( uplink ) ) {
 					// Not complete - do measurement
-					altservers_findUplink( link ); // This will set RTT_INPROGRESS (synchronous)
-					if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) {
-						link->nextReplicationIndex = 0;
+					altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous)
+					if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
+						uplink->nextReplicationIndex = 0;
 					}
 				}
 				altCheckInterval = MIN(altCheckInterval + 1, SERVER_RTT_INTERVAL_MAX);
 				timing_set( &nextAltCheck, &now, altCheckInterval );
 			}
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
-			mutex_lock( &link->rttLock );
-			link->rttTestResult = RTT_IDLE;
-			mutex_unlock( &link->rttLock );
+			mutex_lock( &uplink->rttLock );
+			uplink->rttTestResult = RTT_IDLE;
+			mutex_unlock( &uplink->rttLock );
 			discoverFailCount++;
 			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
 		}
 #ifdef _DEBUG
-		if ( link->fd != -1 && !link->shutdown ) {
+		if ( uplink->fd != -1 && !uplink->shutdown ) {
 			bool resend = false;
 			ticks deadline;
 			timing_set( &deadline, &now, -10 );
-			mutex_lock( &link->queueLock );
-			for (i = 0; i < link->queueLen; ++i) {
-				if ( link->queue[i].status != ULR_FREE && timing_reached( &link->queue[i].entered, &deadline ) ) {
+			mutex_lock( &uplink->queueLock );
+			for (i = 0; i < uplink->queueLen; ++i) {
+				if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
 					snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
-							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, link->queue[i].client->image->name,
-							link->queue[i].from, link->queue[i].to, link->queue[i].status );
-					link->queue[i].entered = now;
+							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
+							uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
+					uplink->queue[i].entered = now;
 #ifdef _DEBUG_RESEND_STARVING
-					link->queue[i].status = ULR_NEW;
+					uplink->queue[i].status = ULR_NEW;
 					resend = true;
 #endif
-					mutex_unlock( &link->queueLock );
+					mutex_unlock( &uplink->queueLock );
 					logadd( LOG_WARNING, "%s", buffer );
-					mutex_lock( &link->queueLock );
+					mutex_lock( &uplink->queueLock );
 				}
 			}
-			mutex_unlock( &link->queueLock );
+			mutex_unlock( &uplink->queueLock );
 			if ( resend )
-				uplink_sendRequests( link, true );
+				uplink_sendRequests( uplink, true );
 		}
 #endif
 	}
 	cleanup: ;
-	if ( !link->shutdown ) {
-		link->shutdown = true;
-		thread_detach( link->thread );
+	if ( !uplink->shutdown ) {
+		uplink->shutdown = true;
+		thread_detach( uplink->thread );
 	}
-	altservers_removeUplink( link );
-	uplink_saveCacheMap( link );
-	mutex_lock( &link->image->lock );
-	if ( link->image->uplink == link ) {
-		link->image->uplink = NULL;
+	altservers_removeUplink( uplink );
+	uplink_saveCacheMap( uplink );
+	mutex_lock( &uplink->image->lock );
+	if ( uplink->image->uplink == uplink ) {
+		uplink->image->uplink = NULL;
 	}
-	mutex_lock( &link->queueLock );
-	const int fd = link->fd;
-	const dnbd3_signal_t* signal = link->signal;
-	mutex_lock( &link->sendMutex );
-	link->fd = -1;
-	mutex_unlock( &link->sendMutex );
-	link->signal = NULL;
-	// Do not access link->image after unlocking, since we set
+	mutex_lock( &uplink->queueLock );
+	const int fd = uplink->fd;
+	const dnbd3_signal_t* signal = uplink->signal;
+	mutex_lock( &uplink->sendMutex );
+	uplink->fd = -1;
+	mutex_unlock( &uplink->sendMutex );
+	uplink->signal = NULL;
+	// Do not access uplink->image after unlocking, since we set
 	// image->uplink to NULL. Acquire with image_lock first,
 	// like done below when checking whether to re-init uplink
-	mutex_unlock( &link->image->lock );
-	mutex_unlock( &link->queueLock );
+	mutex_unlock( &uplink->image->lock );
+	mutex_unlock( &uplink->queueLock );
 	if ( fd != -1 ) close( fd );
 	if ( signal != NULL ) signal_close( signal );
 	// Wait for the RTT check to finish/fail if it's in progress
-	while ( link->rttTestResult == RTT_INPROGRESS )
+	while ( uplink->rttTestResult == RTT_INPROGRESS )
 		usleep( 10000 );
-	if ( link->betterFd != -1 ) {
-		close( link->betterFd );
+	if ( uplink->betterFd != -1 ) {
+		close( uplink->betterFd );
 	}
-	mutex_destroy( &link->queueLock );
-	mutex_destroy( &link->rttLock );
-	mutex_destroy( &link->sendMutex );
-	free( link->recvBuffer );
-	link->recvBuffer = NULL;
-	if ( link->cacheFd != -1 ) {
-		close( link->cacheFd );
+	mutex_destroy( &uplink->queueLock );
+	mutex_destroy( &uplink->rttLock );
+	mutex_destroy( &uplink->sendMutex );
+	free( uplink->recvBuffer );
+	uplink->recvBuffer = NULL;
+	if ( uplink->cacheFd != -1 ) {
+		close( uplink->cacheFd );
 	}
-	dnbd3_image_t *image = image_lock( link->image );
-	free( link ); // !!!
+	dnbd3_image_t *image = image_lock( uplink->image );
+	free( uplink ); // !!!
 	if ( image != NULL ) {
 		if ( !_shutdown && image->cache_map != NULL ) {
 			// Ingegrity checker must have found something in the meantime
@@ -633,37 +633,37 @@ static void* uplink_mainloop(void *data)
 	return NULL ;
 }
 
-static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly)
+static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
 	// Scan for new requests
 	int j;
-	mutex_lock( &link->queueLock );
-	for (j = 0; j < link->queueLen; ++j) {
-		if ( link->queue[j].status != ULR_NEW && (newOnly || link->queue[j].status != ULR_PENDING) ) continue;
-		link->queue[j].status = ULR_PENDING;
-		uint8_t hops = link->queue[j].hopCount;
-		const uint64_t reqStart = link->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-		const uint32_t reqSize = (uint32_t)(((link->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
+	mutex_lock( &uplink->queueLock );
+	for (j = 0; j < uplink->queueLen; ++j) {
+		if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue;
+		uplink->queue[j].status = ULR_PENDING;
+		uint8_t hops = uplink->queue[j].hopCount;
+		const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+		const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
 		/*
 		logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
-				(void*)link, j, link->queue[j].status, link->queue[j].handle, link->queue[j].from, link->queue[j].to, reqStart, reqStart+reqSize );
+				(void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
 		*/
-		mutex_unlock( &link->queueLock );
+		mutex_unlock( &uplink->queueLock );
 		if ( hops < 200 ) ++hops;
-		mutex_lock( &link->sendMutex );
-		const bool ret = dnbd3_get_block( link->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( link->version, hops ) );
-		mutex_unlock( &link->sendMutex );
+		mutex_lock( &uplink->sendMutex );
+		const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) );
+		mutex_unlock( &uplink->sendMutex );
 		if ( !ret ) {
 			// Non-critical - if the connection dropped or the server was changed
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
 			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			altservers_serverFailed( &link->currentServer );
+			altservers_serverFailed( &uplink->currentServer );
 			return;
 		}
-		mutex_lock( &link->queueLock );
+		mutex_lock( &uplink->queueLock );
 	}
-	mutex_unlock( &link->queueLock );
+	mutex_unlock( &uplink->queueLock );
 }
 
 /**
@@ -676,13 +676,13 @@ static void uplink_sendRequests(dnbd3_connection_t *link, bool newOnly)
  * the code simpler. Worst case would be only one bit is zero, which means
  * 4kb are missing, but we will request 32kb.
  */
-static void uplink_sendReplicationRequest(dnbd3_connection_t *link)
+static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 {
-	if ( link == NULL || link->fd == -1 ) return;
-	if ( _backgroundReplication == BGR_DISABLED || link->cacheFd == -1 ) return; // Don't do background replication
-	if ( link->nextReplicationIndex == -1 || link->replicationHandle != REP_NONE )
+	if ( uplink == NULL || uplink->fd == -1 ) return;
+	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
+	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
 		return;
-	dnbd3_image_t * const image = link->image;
+	dnbd3_image_t * const image = uplink->image;
 	if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
 	mutex_lock( &image->lock );
 	if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) {
@@ -694,17 +694,17 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link)
 	const int lastBlockIndex = mapBytes - 1;
 	int endByte;
 	if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
-		endByte = link->nextReplicationIndex + mapBytes;
+		endByte = uplink->nextReplicationIndex + mapBytes;
 	} else { // Hashblock based: Only look for match in current hash block
-		endByte = ( link->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+		endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
 		if ( endByte > mapBytes ) {
 			endByte = mapBytes;
 		}
 	}
 	int replicationIndex = -1;
-	for ( int j = link->nextReplicationIndex; j < endByte; ++j ) {
+	for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
 		const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
-		if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !link->replicatedLastBlock ) ) {
+		if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
 			// Found incomplete one
 			replicationIndex = i;
 			break;
@@ -713,31 +713,31 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link)
 	mutex_unlock( &image->lock );
 	if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
 		// Nothing left in current block, find next one
-		replicationIndex = uplink_findNextIncompleteHashBlock( link, endByte );
+		replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
 	}
 	if ( replicationIndex == -1 ) {
 		// Replication might be complete, uplink_mainloop should take care....
-		link->nextReplicationIndex = -1;
+		uplink->nextReplicationIndex = -1;
 		return;
 	}
 	const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
-	link->replicationHandle = offset;
+	uplink->replicationHandle = offset;
 	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
-	mutex_lock( &link->sendMutex );
-	bool sendOk = dnbd3_get_block( link->fd, offset, size, link->replicationHandle, COND_HOPCOUNT( link->version, 1 ) );
-	mutex_unlock( &link->sendMutex );
+	mutex_lock( &uplink->sendMutex );
+	bool sendOk = dnbd3_get_block( uplink->fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->version, 1 ) );
+	mutex_unlock( &uplink->sendMutex );
 	if ( !sendOk ) {
 		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
 		return;
 	}
 	if ( replicationIndex == lastBlockIndex ) {
-		link->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+		uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
 	}
-	link->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+	uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
 	if ( _backgroundReplication == BGR_HASHBLOCK
-			&& link->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+			&& uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
 		// Just crossed a hash block boundary, look for new candidate starting at this very index
-		link->nextReplicationIndex = uplink_findNextIncompleteHashBlock( link, link->nextReplicationIndex );
+		uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
 	}
 }
 
@@ -746,18 +746,18 @@ static void uplink_sendReplicationRequest(dnbd3_connection_t *link)
  * of a hash block which is neither completely empty nor completely
  * replicated yet. Returns -1 if no match.
  */
-static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const int startMapIndex)
+static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
 {
 	int retval = -1;
-	mutex_lock( &link->image->lock );
-	const int mapBytes = IMGSIZE_TO_MAPBYTES( link->image->virtualFilesize );
-	const uint8_t *cache_map = link->image->cache_map;
+	mutex_lock( &uplink->image->lock );
+	const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize );
+	const uint8_t *cache_map = uplink->image->cache_map;
 	if ( cache_map != NULL ) {
 		int j;
 		const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK );
 		for (j = 0; j < mapBytes; ++j) {
 			const int i = ( start + j ) % mapBytes;
-			const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && link->replicatedLastBlock );
+			const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock );
 			const bool isEmpty = cache_map[i] == 0;
 			if ( !isEmpty && !isFull ) {
 				// Neither full nor empty, replicate
@@ -785,7 +785,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in
 			retval = -1;
 		}
 	}
-	mutex_unlock( &link->image->lock );
+	mutex_unlock( &uplink->image->lock );
 	return retval;
 }
 
@@ -793,41 +793,41 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_connection_t *link, const in
  * Receive data from uplink server and process/dispatch
  * Locks on: link.lock, images[].lock
  */
-static void uplink_handleReceive(dnbd3_connection_t *link)
+static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
 	dnbd3_reply_t inReply, outReply;
 	int ret, i;
 	for (;;) {
-		ret = dnbd3_read_reply( link->fd, &inReply, false );
-		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !link->shutdown ) ) continue;
+		ret = dnbd3_read_reply( uplink->fd, &inReply, false );
+		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
 		if ( ret == REPLY_AGAIN ) break;
 		if ( unlikely( ret == REPLY_CLOSED ) ) {
-			logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", link->image->path );
+			logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path );
 			goto error_cleanup;
 		}
 		if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
-			logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", link->image->path );
+			logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path );
 			goto error_cleanup;
 		}
 		if ( unlikely( ret != REPLY_OK ) ) {
-			logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, link->image->path );
+			logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path );
 			goto error_cleanup;
 		}
 		if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
-			logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, link->image->path );
+			logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path );
 			goto error_cleanup;
 		}
 
-		if ( unlikely( link->recvBufferLen < inReply.size ) ) {
-			link->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536);
-			link->recvBuffer = realloc( link->recvBuffer, link->recvBufferLen );
-			if ( link->recvBuffer == NULL ) {
+		if ( unlikely( uplink->recvBufferLen < inReply.size ) ) {
+			uplink->recvBufferLen = MIN((uint32_t)_maxPayload, inReply.size + 65536);
+			uplink->recvBuffer = realloc( uplink->recvBuffer, uplink->recvBufferLen );
+			if ( uplink->recvBuffer == NULL ) {
 				logadd( LOG_ERROR, "Out of memory when trying to allocate receive buffer for uplink" );
 				exit( 1 );
 			}
 		}
-		if ( unlikely( (uint32_t)sock_recv( link->fd, link->recvBuffer, inReply.size ) != inReply.size ) ) {
-			logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", link->image->path );
+		if ( unlikely( (uint32_t)sock_recv( uplink->fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
+			logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path );
 			goto error_cleanup;
 		}
 		// Payload read completely
@@ -838,18 +838,18 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 		const uint64_t start = inReply.handle;
 		const uint64_t end = inReply.handle + inReply.size;
 		totalBytesReceived += inReply.size;
-		link->bytesReceived += inReply.size;
+		uplink->bytesReceived += inReply.size;
 		// 1) Write to cache file
-		if ( unlikely( link->cacheFd == -1 ) ) {
-			uplink_reopenCacheFd( link, false );
+		if ( unlikely( uplink->cacheFd == -1 ) ) {
+			uplink_reopenCacheFd( uplink, false );
 		}
-		if ( likely( link->cacheFd != -1 ) ) {
+		if ( likely( uplink->cacheFd != -1 ) ) {
 			int err = 0;
 			bool tryAgain = true; // Allow one retry in case we run out of space or the write fd became invalid
 			uint32_t done = 0;
 			ret = 0;
 			while ( done < inReply.size ) {
-				ret = (int)pwrite( link->cacheFd, link->recvBuffer + done, inReply.size - done, start + done );
+				ret = (int)pwrite( uplink->cacheFd, uplink->recvBuffer + done, inReply.size - done, start + done );
 				if ( unlikely( ret == -1 ) ) {
 					err = errno;
 					if ( err == EINTR ) continue;
@@ -860,26 +860,26 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 						continue; // Success, retry write
 					}
 					if ( err == EBADF || err == EINVAL || err == EIO ) {
-						if ( !tryAgain || !uplink_reopenCacheFd( link, true ) )
+						if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) )
 							break;
 						tryAgain = false;
 						continue; // Write handle to image successfully re-opened, try again
 					}
-					logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", link->image->name, (int)link->image->rid, err );
+					logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err );
 					break;
 				}
 				if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
-					logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, link->image->name, (int)link->image->rid );
+					logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid );
 					break;
 				}
 				done += (uint32_t)ret;
 			}
 			if ( likely( done > 0 ) ) {
-				image_updateCachemap( link->image, start, start + done, true );
+				image_updateCachemap( uplink->image, start, start + done, true );
 			}
 			if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
 				logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
-						link->image->name, (int)link->image->rid, err );
+						uplink->image->name, (int)uplink->image->rid, err );
 			}
 		}
 		// 2) Figure out which clients are interested in it
@@ -888,9 +888,9 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 		// by populating a slot with index greater than the highest matching
 		// request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW
 		// where it's fine if the index is greater)
-		mutex_lock( &link->queueLock );
-		for (i = 0; i < link->queueLen; ++i) {
-			dnbd3_queued_request_t * const req = &link->queue[i];
+		mutex_lock( &uplink->queueLock );
+		for (i = 0; i < uplink->queueLen; ++i) {
+			dnbd3_queued_request_t * const req = &uplink->queue[i];
 			assert( req->status != ULR_PROCESSING );
 			if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
 			assert( req->client != NULL );
@@ -903,8 +903,8 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 		// from 0, you also need to change the "attach to existing request"-logic in uplink_request()
 		outReply.magic = dnbd3_packet_magic;
 		bool served = false;
-		for ( i = link->queueLen - 1; i >= 0; --i ) {
-			dnbd3_queued_request_t * const req = &link->queue[i];
+		for ( i = uplink->queueLen - 1; i >= 0; --i ) {
+			dnbd3_queued_request_t * const req = &uplink->queue[i];
 			if ( req->status == ULR_PROCESSING ) {
 				size_t bytesSent = 0;
 				assert( req->from >= start && req->to <= end );
@@ -914,14 +914,14 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 				outReply.size = (uint32_t)( req->to - req->from );
 				iov[0].iov_base = &outReply;
 				iov[0].iov_len = sizeof outReply;
-				iov[1].iov_base = link->recvBuffer + (req->from - start);
+				iov[1].iov_base = uplink->recvBuffer + (req->from - start);
 				iov[1].iov_len = outReply.size;
 				fixup_reply( outReply );
 				req->status = ULR_FREE;
 				req->client = NULL;
 				served = true;
 				mutex_lock( &client->sendMutex );
-				mutex_unlock( &link->queueLock );
+				mutex_unlock( &uplink->queueLock );
 				if ( client->sock != -1 ) {
 					ssize_t sent = writev( client->sock, iov, 2 );
 					if ( sent > (ssize_t)sizeof outReply ) {
@@ -932,66 +932,66 @@ static void uplink_handleReceive(dnbd3_connection_t *link)
 					client->bytesSent += bytesSent;
 				}
 				mutex_unlock( &client->sendMutex );
-				mutex_lock( &link->queueLock );
+				mutex_lock( &uplink->queueLock );
 			}
-			if ( req->status == ULR_FREE && i == link->queueLen - 1 ) link->queueLen--;
+			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
-		mutex_unlock( &link->queueLock );
+		mutex_unlock( &uplink->queueLock );
 #ifdef _DEBUG
-		if ( !served && start != link->replicationHandle ) {
-			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, link->image->name, start, end );
+		if ( !served && start != uplink->replicationHandle ) {
+			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
 		}
 #endif
-		if ( start == link->replicationHandle ) {
+		if ( start == uplink->replicationHandle ) {
 			// Was our background replication
-			link->replicationHandle = REP_NONE;
+			uplink->replicationHandle = REP_NONE;
 			// Try to remove from fs cache if no client was interested in this data
-			if ( !served && link->cacheFd != -1 ) {
-				posix_fadvise( link->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+			if ( !served && uplink->cacheFd != -1 ) {
+				posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
 			}
 		}
 		if ( served ) {
 			// Was some client -- reset idle counter
-			link->idleTime = 0;
+			uplink->idleTime = 0;
 			// Re-enable replication if disabled
-			if ( link->nextReplicationIndex == -1 ) {
-				link->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
+			if ( uplink->nextReplicationIndex == -1 ) {
+				uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
 			}
 		}
 	}
-	if ( link->replicationHandle == REP_NONE ) {
-		mutex_lock( &link->queueLock );
-		const bool rep = ( link->queueLen == 0 );
-		mutex_unlock( &link->queueLock );
-		if ( rep ) uplink_sendReplicationRequest( link );
+	if ( uplink->replicationHandle == REP_NONE ) {
+		mutex_lock( &uplink->queueLock );
+		const bool rep = ( uplink->queueLen == 0 );
+		mutex_unlock( &uplink->queueLock );
+		if ( rep ) uplink_sendReplicationRequest( uplink );
 	}
 	return;
 	// Error handling from failed receive or message parsing
 	error_cleanup: ;
-	uplink_connectionFailed( link, true );
+	uplink_connectionFailed( uplink, true );
 }
 
-static void uplink_connectionFailed(dnbd3_connection_t *link, bool findNew)
+static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
-	if ( link->fd == -1 )
+	if ( uplink->fd == -1 )
 		return;
-	altservers_serverFailed( &link->currentServer );
-	mutex_lock( &link->sendMutex );
-	close( link->fd );
-	link->fd = -1;
-	mutex_unlock( &link->sendMutex );
-	link->replicationHandle = REP_NONE;
-	if ( _backgroundReplication == BGR_FULL && link->nextReplicationIndex == -1 ) {
-		link->nextReplicationIndex = 0;
+	altservers_serverFailed( &uplink->currentServer );
+	mutex_lock( &uplink->sendMutex );
+	close( uplink->fd );
+	uplink->fd = -1;
+	mutex_unlock( &uplink->sendMutex );
+	uplink->replicationHandle = REP_NONE;
+	if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
+		uplink->nextReplicationIndex = 0;
 	}
 	if ( !findNew )
 		return;
-	mutex_lock( &link->rttLock );
-	bool bail = link->rttTestResult == RTT_INPROGRESS || link->betterFd != -1;
-	mutex_unlock( &link->rttLock );
+	mutex_lock( &uplink->rttLock );
+	bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->betterFd != -1;
+	mutex_unlock( &uplink->rttLock );
 	if ( bail )
 		return;
-	altservers_findUplink( link );
+	altservers_findUplink( uplink );
 }
 
 /**
@@ -1008,7 +1008,7 @@ static int uplink_sendKeepalive(const int fd)
 	return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
 }
 
-static void uplink_addCrc32(dnbd3_connection_t *uplink)
+static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 {
 	dnbd3_image_t *image = uplink->image;
 	if ( image == NULL || image->virtualFilesize == 0 ) return;
@@ -1051,14 +1051,14 @@ static void uplink_addCrc32(dnbd3_connection_t *uplink)
  * it will be closed first. Otherwise, nothing will happen and true will be returned
  * immediately.
  */
-static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force)
+static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
 {
-	if ( link->cacheFd != -1 ) {
+	if ( uplink->cacheFd != -1 ) {
 		if ( !force ) return true;
-		close( link->cacheFd );
+		close( uplink->cacheFd );
 	}
-	link->cacheFd = open( link->image->path, O_WRONLY | O_CREAT, 0644 );
-	return link->cacheFd != -1;
+	uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 );
+	return uplink->cacheFd != -1;
 }
 
 /**
@@ -1066,13 +1066,13 @@ static bool uplink_reopenCacheFd(dnbd3_connection_t *link, const bool force)
  * Return true on success.
  * Locks on: imageListLock, image.lock
  */
-static bool uplink_saveCacheMap(dnbd3_connection_t *link)
+static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 {
-	dnbd3_image_t *image = link->image;
+	dnbd3_image_t *image = uplink->image;
 	assert( image != NULL );
 
-	if ( link->cacheFd != -1 ) {
-		if ( fsync( link->cacheFd ) == -1 ) {
+	if ( uplink->cacheFd != -1 ) {
+		if ( fsync( uplink->cacheFd ) == -1 ) {
 			// A failing fsync means we have no guarantee that any data
 			// since the last fsync (or open if none) has been saved. Apart
 			// from keeping the cache_map from the last successful fsync
@@ -1134,9 +1134,9 @@ static bool uplink_saveCacheMap(dnbd3_connection_t *link)
 	return true;
 }
 
-static bool uplink_connectionShouldShutdown(dnbd3_connection_t *link)
+static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
 {
-	return ( link->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
-			&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > link->image->users ) );
+	return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
+			&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
 }
 
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 2b41dfc..4fd41b0 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -10,7 +10,7 @@ uint64_t uplink_getTotalBytesReceived();
 
 bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version);
 
-void uplink_removeClient(dnbd3_connection_t *uplink, dnbd3_client_t *client);
+void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
 
 bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
 
-- 
cgit v1.2.3-55-g7522


From 1d2295131020688b5a688286ce8c53d6bb7abdb8 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sun, 18 Aug 2019 21:59:26 +0200
Subject: [SERVER] Add struct representing active connection to uplink server

---
 src/server/altservers.c | 30 +++++++++----------
 src/server/globals.h    | 14 ++++-----
 src/server/image.c      |  2 +-
 src/server/integrity.c  |  2 +-
 src/server/uplink.c     | 78 ++++++++++++++++++++++++-------------------------
 5 files changed, 60 insertions(+), 66 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 1001981..fbe10a8 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -125,14 +125,14 @@ void altservers_findUplink(dnbd3_uplink_t *uplink)
 {
 	if ( uplink->shutdown )
 		return;
-	if ( uplink->fd != -1 && numAltServers <= 1 )
+	if ( uplink->current.fd != -1 && numAltServers <= 1 )
 		return;
 	int i;
 	// if betterFd != -1 it means the uplink is supposed to switch to another
 	// server. As this function here is called by the uplink thread, it can
 	// never be that the uplink is supposed to switch, but instead calls
 	// this function.
-	assert( uplink->betterFd == -1 );
+	assert( uplink->better.fd == -1 );
 	// it is however possible that an RTT measurement is currently in progress,
 	// so check for that case and do nothing if one is in progress
 	// XXX As this function is only ever called by the image's uplink thread,
@@ -457,9 +457,9 @@ static void *altservers_main(void *data UNUSED)
 			if ( uplink == NULL )
 				continue;
 			// First, get 4 alt servers
-			numAlts = altservers_getListForUplink( servers, ALTS, uplink->fd == -1 );
+			numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 );
 			// If we're already connected and only got one server anyways, there isn't much to do
-			if ( numAlts <= 1 && uplink->fd != -1 ) {
+			if ( numAlts <= 1 && uplink->current.fd != -1 ) {
 				uplink->rttTestResult = RTT_DONTCHANGE;
 				continue;
 			}
@@ -475,15 +475,15 @@ static void *altservers_main(void *data UNUSED)
 			}
 			LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
 			assert( uplink->rttTestResult == RTT_INPROGRESS );
-			if ( uplink->fd != -1 ) {
+			if ( uplink->current.fd != -1 ) {
 				// Add current server if not already in list
 				found = false;
 				for (itAlt = 0; itAlt < numAlts; ++itAlt) {
-					if ( !isSameAddressPort( &uplink->currentServer, &servers[itAlt] ) ) continue;
+					if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue;
 					found = true;
 					break;
 				}
-				if ( !found ) servers[numAlts++] = uplink->currentServer;
+				if ( !found ) servers[numAlts++] = uplink->current.host;
 			}
 			// Test them all
 			int bestSock = -1;
@@ -537,7 +537,7 @@ static void *altservers_main(void *data UNUSED)
 				// Measurement done - everything fine so far
 				mutex_lock( &altServersLock );
 				mutex_lock( &uplink->rttLock );
-				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
+				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host );
 				// Penaltize rtt if this was a cycle; this will treat this server with lower priority
 				// in the near future too, so we prevent alternating between two servers that are both
 				// part of a cycle and have the lowest latency.
@@ -547,9 +547,9 @@ static void *altservers_main(void *data UNUSED)
 				unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
 				mutex_unlock( &altServersLock );
 				// If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
-				if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
+				if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
 				mutex_unlock( &uplink->rttLock );
-				if ( uplink->fd != -1 && isCurrent ) {
+				if ( uplink->current.fd != -1 && isCurrent ) {
 					// Was measuring current server
 					currentRtt = avg;
 					close( sock );
@@ -574,18 +574,18 @@ static void *altservers_main(void *data UNUSED)
 				close( sock );
 			}
 			// Done testing all servers. See if we should switch
-			if ( bestSock != -1 && (uplink->fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
+			if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
 				// yep
-				if ( currentRtt > 10000000 || uplink->fd == -1 ) {
+				if ( currentRtt > 10000000 || uplink->current.fd == -1 ) {
 					LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
 				} else {
 					LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
 				}
 				sock_setTimeout( bestSock, _uplinkTimeout );
 				mutex_lock( &uplink->rttLock );
-				uplink->betterFd = bestSock;
-				uplink->betterServer = servers[bestIndex];
-				uplink->betterVersion = bestProtocolVersion;
+				uplink->better.fd = bestSock;
+				uplink->better.host = servers[bestIndex];
+				uplink->better.version = bestProtocolVersion;
 				uplink->rttTestResult = RTT_DOCHANGE;
 				mutex_unlock( &uplink->rttLock );
 				signal_call( uplink->signal );
diff --git a/src/server/globals.h b/src/server/globals.h
index 0371e33..659e5a2 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -31,9 +31,9 @@ typedef struct
 } dnbd3_queued_request_t;
 
 typedef struct {
-	int fd;
-	int version;
-	dnbd3_host_t host;
+	int fd;             // Socket fd for this connection
+	int version;        // Protocol version of remote server
+	dnbd3_host_t host;  // IP/Port of remote server
 } dnbd3_server_connection_t;
 
 #define RTT_IDLE 0 // Not in progress
@@ -43,20 +43,16 @@ typedef struct {
 #define RTT_NOT_REACHABLE 4 // No uplink was reachable
 struct _dnbd3_uplink
 {
-	int fd;                     // socket fd to remote server
-	int version;                // remote server protocol version
+	dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected
+	dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none
 	dnbd3_signal_t* signal;     // used to wake up the process
 	pthread_t thread;           // thread holding the connection
 	pthread_mutex_t sendMutex;  // For locking socket while sending
 	pthread_mutex_t queueLock;  // lock for synchronization on request queue etc.
 	dnbd3_image_t *image;       // image that this uplink is used for; do not call get/release for this pointer
-	dnbd3_host_t currentServer; // Current server we're connected to
 	pthread_mutex_t rttLock;    // When accessing rttTestResult, betterFd or betterServer
 	int rttTestResult;          // RTT_*
 	int cacheFd;                // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD!
-	int betterVersion;          // protocol version of better server
-	int betterFd;               // Active connection to better server, ready to use
-	dnbd3_host_t betterServer;  // The better server
 	uint8_t *recvBuffer;        // Buffer for receiving payload
 	uint32_t recvBufferLen;     // Len of ^^
 	atomic_bool shutdown;       // signal this thread to stop, must only be set from uplink_shutdown() or cleanup in uplink_mainloop()
diff --git a/src/server/image.c b/src/server/image.c
index 4a65ed3..d250715 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1508,7 +1508,7 @@ json_t* image_getListAsJson()
 			uplinkName[0] = '\0';
 		} else {
 			bytesReceived = image->uplink->bytesReceived;
-			if ( image->uplink->fd == -1 || !host_to_string( &image->uplink->currentServer, uplinkName, sizeof(uplinkName) ) ) {
+			if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, uplinkName, sizeof(uplinkName) ) ) {
 				uplinkName[0] = '\0';
 			}
 		}
diff --git a/src/server/integrity.c b/src/server/integrity.c
index c52d17b..3d1ac9b 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -240,7 +240,7 @@ static void* integrity_main(void * data UNUSED)
 					if ( !foundCorrupted ) {
 						mutex_lock( &image->lock );
 						if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper?
-							image->working = image->uplink->fd != -1 && image->readFd != -1;
+							image->working = image->uplink->current.fd != -1 && image->readFd != -1;
 						}
 						mutex_unlock( &image->lock );
 					}
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 7d66b21..e21e28c 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -97,7 +97,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->idleTime = 0;
 	uplink->queueLen = 0;
 	mutex_lock( &uplink->sendMutex );
-	uplink->fd = -1;
+	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
 	uplink->cacheFd = -1;
 	uplink->signal = NULL;
@@ -105,12 +105,12 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	mutex_lock( &uplink->rttLock );
 	uplink->cycleDetected = false;
 	if ( sock >= 0 ) {
-		uplink->betterFd = sock;
-		uplink->betterServer = *host;
+		uplink->better.fd = sock;
+		uplink->better.host = *host;
 		uplink->rttTestResult = RTT_DOCHANGE;
-		uplink->betterVersion = version;
+		uplink->better.version = version;
 	} else {
-		uplink->betterFd = -1;
+		uplink->better.fd = -1;
 		uplink->rttTestResult = RTT_IDLE;
 	}
 	mutex_unlock( &uplink->rttLock );
@@ -211,7 +211,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
-	if ( hops != 0 && isSameAddress( &uplink->currentServer, &client->host ) ) {
+	if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) {
 		mutex_unlock( &client->image->lock );
 		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
 		mutex_lock( &uplink->rttLock );
@@ -315,14 +315,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( mutex_trylock( &uplink->sendMutex ) != 0 ) {
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
 	} else {
-		if ( uplink->fd == -1 ) {
+		if ( uplink->current.fd == -1 ) {
 			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 		} else {
 			const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 			const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
 			if ( hops < 200 ) ++hops;
-			const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) );
+			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
 			mutex_unlock( &uplink->sendMutex );
 			if ( !ret ) {
 				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
@@ -405,7 +405,7 @@ static void* uplink_mainloop(void *data)
 		mutex_unlock( &uplink->rttLock );
 		if ( waitTime == 0 ) {
 			// Nothing
-		} else if ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) {
+		} else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) {
 			waitTime = 1000;
 		} else {
 			declare_now;
@@ -413,7 +413,7 @@ static void* uplink_mainloop(void *data)
 			if ( waitTime < 100 ) waitTime = 100;
 			if ( waitTime > 5000 ) waitTime = 5000;
 		}
-		events[EV_SOCKET].fd = uplink->fd;
+		events[EV_SOCKET].fd = uplink->current.fd;
 		numSocks = poll( events, EV_COUNT, waitTime );
 		if ( _shutdown || uplink->shutdown ) goto cleanup;
 		if ( numSocks == -1 ) { // Error?
@@ -430,13 +430,11 @@ static void* uplink_mainloop(void *data)
 			uplink->rttTestResult = RTT_IDLE;
 			// The rttTest worker thread has finished our request.
 			// And says it's better to switch to another server
-			const int fd = uplink->fd;
+			const int fd = uplink->current.fd;
 			mutex_lock( &uplink->sendMutex );
-			uplink->fd = uplink->betterFd;
+			uplink->current = uplink->better;
 			mutex_unlock( &uplink->sendMutex );
-			uplink->betterFd = -1;
-			uplink->currentServer = uplink->betterServer;
-			uplink->version = uplink->betterVersion;
+			uplink->better.fd = -1;
 			uplink->cycleDetected = false;
 			mutex_unlock( &uplink->rttLock );
 			discoverFailCount = 0;
@@ -445,7 +443,7 @@ static void* uplink_mainloop(void *data)
 			uplink->image->working = true;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
-			if ( host_to_string( &uplink->currentServer, buffer + 1, sizeof(buffer) - 1 ) ) {
+			if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) {
 				logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 );
 				setThreadName( buffer );
 			}
@@ -471,7 +469,7 @@ static void* uplink_mainloop(void *data)
 			if ( signal_clear( uplink->signal ) == SIGNAL_ERROR ) {
 				logadd( LOG_WARNING, "Errno on signal on uplink for %s! Things will break!", uplink->image->name );
 			}
-			if ( uplink->fd != -1 ) {
+			if ( uplink->current.fd != -1 ) {
 				// Uplink seems fine, relay requests to it...
 				uplink_sendRequests( uplink, true );
 			} else { // No uplink; maybe it was shutdown since it was idle for too long
@@ -499,9 +497,9 @@ static void* uplink_mainloop(void *data)
 				uplink_saveCacheMap( uplink );
 			}
 			// Keep-alive
-			if ( uplink->fd != -1 && uplink->replicationHandle == REP_NONE ) {
+			if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
 				// Send keep-alive if nothing is happening
-				if ( uplink_sendKeepalive( uplink->fd ) ) {
+				if ( uplink_sendKeepalive( uplink->current.fd ) ) {
 					// Re-trigger periodically, in case it requires a minimum user count
 					uplink_sendReplicationRequest( uplink );
 				} else {
@@ -511,10 +509,10 @@ static void* uplink_mainloop(void *data)
 				}
 			}
 			// Don't keep uplink established if we're idle for too much
-			if ( uplink->fd != -1 && uplink_connectionShouldShutdown( uplink ) ) {
+			if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) {
 				mutex_lock( &uplink->sendMutex );
-				close( uplink->fd );
-				uplink->fd = events[EV_SOCKET].fd = -1;
+				close( uplink->current.fd );
+				uplink->current.fd = events[EV_SOCKET].fd = -1;
 				mutex_unlock( &uplink->sendMutex );
 				uplink->cycleDetected = false;
 				if ( uplink->recvBufferLen != 0 ) {
@@ -531,7 +529,7 @@ static void* uplink_mainloop(void *data)
 		const int rttTestResult = uplink->rttTestResult;
 		mutex_unlock( &uplink->rttLock );
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
-			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
+			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
 				// It seems it's time for a check
 				if ( image_isComplete( uplink->image ) ) {
 					// Quit work if image is complete
@@ -556,7 +554,7 @@ static void* uplink_mainloop(void *data)
 			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
 		}
 #ifdef _DEBUG
-		if ( uplink->fd != -1 && !uplink->shutdown ) {
+		if ( uplink->current.fd != -1 && !uplink->shutdown ) {
 			bool resend = false;
 			ticks deadline;
 			timing_set( &deadline, &now, -10 );
@@ -594,10 +592,10 @@ static void* uplink_mainloop(void *data)
 		uplink->image->uplink = NULL;
 	}
 	mutex_lock( &uplink->queueLock );
-	const int fd = uplink->fd;
+	const int fd = uplink->current.fd;
 	const dnbd3_signal_t* signal = uplink->signal;
 	mutex_lock( &uplink->sendMutex );
-	uplink->fd = -1;
+	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
 	uplink->signal = NULL;
 	// Do not access uplink->image after unlocking, since we set
@@ -610,8 +608,8 @@ static void* uplink_mainloop(void *data)
 	// Wait for the RTT check to finish/fail if it's in progress
 	while ( uplink->rttTestResult == RTT_INPROGRESS )
 		usleep( 10000 );
-	if ( uplink->betterFd != -1 ) {
-		close( uplink->betterFd );
+	if ( uplink->better.fd != -1 ) {
+		close( uplink->better.fd );
 	}
 	mutex_destroy( &uplink->queueLock );
 	mutex_destroy( &uplink->rttLock );
@@ -651,14 +649,14 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		mutex_unlock( &uplink->queueLock );
 		if ( hops < 200 ) ++hops;
 		mutex_lock( &uplink->sendMutex );
-		const bool ret = dnbd3_get_block( uplink->fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->version, hops ) );
+		const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
 		mutex_unlock( &uplink->sendMutex );
 		if ( !ret ) {
 			// Non-critical - if the connection dropped or the server was changed
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
 			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			altservers_serverFailed( &uplink->currentServer );
+			altservers_serverFailed( &uplink->current.host );
 			return;
 		}
 		mutex_lock( &uplink->queueLock );
@@ -678,7 +676,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
  */
 static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 {
-	if ( uplink == NULL || uplink->fd == -1 ) return;
+	if ( uplink == NULL || uplink->current.fd == -1 ) return;
 	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
 	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
 		return;
@@ -724,7 +722,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 	uplink->replicationHandle = offset;
 	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
 	mutex_lock( &uplink->sendMutex );
-	bool sendOk = dnbd3_get_block( uplink->fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->version, 1 ) );
+	bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
 	mutex_unlock( &uplink->sendMutex );
 	if ( !sendOk ) {
 		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
@@ -798,7 +796,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 	dnbd3_reply_t inReply, outReply;
 	int ret, i;
 	for (;;) {
-		ret = dnbd3_read_reply( uplink->fd, &inReply, false );
+		ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
 		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
 		if ( ret == REPLY_AGAIN ) break;
 		if ( unlikely( ret == REPLY_CLOSED ) ) {
@@ -826,7 +824,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 				exit( 1 );
 			}
 		}
-		if ( unlikely( (uint32_t)sock_recv( uplink->fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
+		if ( unlikely( (uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
 			logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path );
 			goto error_cleanup;
 		}
@@ -973,12 +971,12 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 
 static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
-	if ( uplink->fd == -1 )
+	if ( uplink->current.fd == -1 )
 		return;
-	altservers_serverFailed( &uplink->currentServer );
+	altservers_serverFailed( &uplink->current.host );
 	mutex_lock( &uplink->sendMutex );
-	close( uplink->fd );
-	uplink->fd = -1;
+	close( uplink->current.fd );
+	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
 	uplink->replicationHandle = REP_NONE;
 	if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
@@ -987,7 +985,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 	if ( !findNew )
 		return;
 	mutex_lock( &uplink->rttLock );
-	bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->betterFd != -1;
+	bool bail = uplink->rttTestResult == RTT_INPROGRESS || uplink->better.fd != -1;
 	mutex_unlock( &uplink->rttLock );
 	if ( bail )
 		return;
@@ -1016,7 +1014,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 	uint32_t masterCrc;
 	uint32_t *buffer = malloc( bytes );
 	mutex_lock( &uplink->sendMutex );
-	bool sendOk = dnbd3_get_crc32( uplink->fd, &masterCrc, buffer, &bytes );
+	bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes );
 	mutex_unlock( &uplink->sendMutex );
 	if ( !sendOk || bytes == 0 ) {
 		free( buffer );
-- 
cgit v1.2.3-55-g7522


From 9787bccc217ee7369d20e5a4c243d433ae4b70bd Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 22 Aug 2019 10:30:07 +0200
Subject: [SERVER] Put request handle into CMD_ERROR reply

---
 src/server/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/net.c b/src/server/net.c
index 5de9f14..7f3c1ce 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -314,6 +314,7 @@ void* net_handleNewConnection(void *clientPtr)
 
 			case CMD_GET_BLOCK:;
 				const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
+				reply.handle = request.handle;
 				if ( offset >= image->virtualFilesize ) {
 					// Sanity check
 					logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName );
@@ -395,7 +396,6 @@ void* net_handleNewConnection(void *clientPtr)
 
 				reply.cmd = CMD_GET_BLOCK;
 				reply.size = request.size;
-				reply.handle = request.handle;
 
 				fixup_reply( reply );
 				const bool lock = image->uplink != NULL;
-- 
cgit v1.2.3-55-g7522


From 5fb4ef278be86fb6bda487f65ec4855d830bf4e5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 22 Aug 2019 16:14:27 +0200
Subject: [SERVER] Get rid of alt-servers thread, per-uplink rtt history

Alt-Server checks are now run using the threadpool, so we don't need a
queue and dedicated thread anymore. The RTT history is now kept per
uplink, so that many uplinks no longer flood a single shared history
and shrink its effective time window.
Also the fail counter is now split up; a global one for when the server
actually isn't reachable, a local (per-uplink) one for when the server
is reachable but doesn't serve the requested image.
---
 src/server/altservers.c | 738 ++++++++++++++++++++++--------------------------
 src/server/altservers.h |  16 +-
 src/server/globals.h    |  41 ++-
 src/server/image.c      |   6 +-
 src/server/net.c        |  16 +-
 src/server/server.c     |   8 +-
 src/server/uplink.c     | 117 ++++----
 src/server/uplink.h     |   2 +
 src/serverconfig.h      |  10 +-
 9 files changed, 469 insertions(+), 485 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index fbe10a8..493ed9e 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -1,5 +1,6 @@
 #include "altservers.h"
 #include "locks.h"
+#include "threadpool.h"
 #include "helper.h"
 #include "image.h"
 #include "fileutil.h"
@@ -14,46 +15,22 @@
 #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
 #define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
 
-static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS];
-static dnbd3_signal_t * _Atomic runSignal = NULL;
-
 static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
 static atomic_int numAltServers = 0;
 static pthread_mutex_t altServersLock;
+static ticks nextCloseUnusedFd; // TODO: Move away
 
-static pthread_t altThread;
-
-static void *altservers_main(void *data);
-static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt);
+static void *altservers_runCheck(void *data);
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current);
+static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink);
+static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt);
+static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server);
 
 void altservers_init()
 {
 	srand( (unsigned int)time( NULL ) );
-	// Init spinlock
+	// Init lock
 	mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST );
-	// Init signal
-	runSignal = signal_new();
-	if ( runSignal == NULL ) {
-		logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." );
-		exit( EXIT_FAILURE );
-	}
-	memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) );
-	if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) {
-		logadd( LOG_ERROR, "Could not start altservers connector thread" );
-		exit( EXIT_FAILURE );
-	}
-	// Init waiting links queue -- this is currently a global static array so
-	// it will already be zero, but in case we refactor later do it explicitly
-	for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-		pending[i] = NULL;
-	}
-}
-
-void altservers_shutdown()
-{
-	if ( runSignal == NULL ) return;
-	signal_call( runSignal ); // Wake altservers thread up
-	thread_join( altThread, NULL );
 }
 
 static void addalt(int argc, char **argv, void *data)
@@ -121,7 +98,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 /**
  * ONLY called from the passed uplink's main thread
  */
-void altservers_findUplink(dnbd3_uplink_t *uplink)
+void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 {
 	if ( uplink->shutdown )
 		return;
@@ -135,67 +112,11 @@ void altservers_findUplink(dnbd3_uplink_t *uplink)
 	assert( uplink->better.fd == -1 );
 	// it is however possible that an RTT measurement is currently in progress,
 	// so check for that case and do nothing if one is in progress
-	// XXX As this function is only ever called by the image's uplink thread,
-	// it cannot happen that the uplink ends up in this list concurrently
 	mutex_lock( &uplink->rttLock );
-	if ( uplink->rttTestResult == RTT_INPROGRESS ) {
-		for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-			if ( pending[i] != uplink ) continue;
-			// Yep, measuring right now
-			return;
-		}
-	}
-	// Find free slot for measurement
-	uplink->rttTestResult = RTT_INPROGRESS;
-	for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-		if ( pending[i] != NULL ) continue;
-		dnbd3_uplink_t *null = NULL;
-		if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) {
-			mutex_unlock( &uplink->rttLock );
-			atomic_thread_fence( memory_order_release );
-			signal_call( runSignal ); // Wake altservers thread up
-			return;
-		}
+	if ( uplink->rttTestResult != RTT_INPROGRESS ) {
+		threadpool_run( &altservers_runCheck, uplink );
 	}
-	// End of loop - no free slot
-	uplink->rttTestResult = RTT_NOT_REACHABLE;
 	mutex_unlock( &uplink->rttLock );
-	logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." );
-}
-
-/**
- * The given uplink is about to disappear,
- * wait until any pending RTT check is done.
- */
-void altservers_removeUplink(dnbd3_uplink_t *uplink)
-{
-	assert( uplink != NULL );
-	assert( uplink->shutdown );
-	int i;
-	for ( i = 1 ;; ++i ) {
-		atomic_thread_fence( memory_order_acquire );
-		if ( runSignal == NULL ) {
-			// Thread is already done, remove manually
-			uplink->rttTestResult = RTT_NOT_REACHABLE;
-			break;
-		}
-		// Thread still running, wait until test is done
-		bool found = false;
-		for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-			if ( pending[i] == uplink ) {
-				found = true;
-				break;
-			}
-		}
-		if ( !found ) // No more test running
-			break;
-		usleep( 10000 ); // 10ms
-		signal_call( runSignal ); // Wake altservers thread up
-		if ( i % 500 == 0 ) {
-			logadd( LOG_INFO, "Still waiting for altserver check for uplink %p...", (void*)uplink );
-		}
-	}
-	logadd( LOG_DEBUG1, "Waited for %d iterations for altservers check when tearing down uplink", i );
 }
 
 /**
@@ -209,90 +130,124 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output
 	if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0;
 	int i, j;
 	int count = 0;
-	int scores[size];
-	int score;
-	mutex_lock( &altServersLock );
+	uint16_t scores[SERVER_MAX_ALTS] = { 0 };
 	if ( size > numAltServers ) size = numAltServers;
-	for (i = 0; i < numAltServers; ++i) {
-		if ( altServers[i].host.type == 0 ) continue; // Slot is empty
-		if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers
+	mutex_lock( &altServersLock );
+	for ( i = 0; i < numAltServers; ++i ) {
+		if ( altServers[i].host.type == 0 || altServers[i].isPrivate )
+			continue; // Slot is empty or uplink is for replication only
 		if ( host->type == altServers[i].host.type ) {
-			score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails;
+			scores[i] = 10 + altservers_netCloseness( host, &altServers[i].host );
 		} else {
-			score = -( altServers[i].numFails + 128 ); // Wrong address family
+			scores[i] = 1; // Wrong address family
 		}
-		if ( count == 0 ) {
-			// Trivial - this is the first entry
-			output[0].host = altServers[i].host;
-			output[0].failures = 0;
-			scores[0] = score;
-			count++;
-		} else {
-			// Other entries already exist, insert in proper position
-			for (j = 0; j < size; ++j) {
-				if ( j < count && score <= scores[j] ) continue;
-				if ( j > count ) break; // Should never happen but just in case...
-				if ( j < count && j + 1 < size ) {
-					// Check if we're in the middle and need to move other entries...
-					memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) );
-					memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) );
-				}
-				if ( count < size ) {
-					count++;
-				}
-				output[j].host = altServers[i].host;
-				output[j].failures = 0;
-				scores[j] = score;
-				break;
+	}
+	while ( count < size ) {
+		i = -1;
+		for ( j = 0; j < numAltServers; ++j ) {
+			if ( scores[j] == 0 )
+				continue;
+			if ( i == -1 || scores[j] > scores[i] ) {
+				i = j;
 			}
 		}
+		if ( i == -1 )
+			break; // No unpicked candidate with a non-zero score is left
+		output[count].host = altServers[i].host;
+		output[count++].failures = 0;
+		scores[i] = 0; // Consume this entry; otherwise the same top-scoring server would be picked on every pass
 	}
 	mutex_unlock( &altServersLock );
 	return count;
 }
 
+bool altservers_toString(int server, char *buffer, size_t len)
+{
+	return host_to_string( &altServers[server].host, buffer, len );
+}
+
+static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now )
+{
+	dnbd3_alt_local_t *local = ( uplink == NULL ? NULL : &uplink->altData[server] );
+	dnbd3_alt_server_t *global = &altServers[server];
+	if ( global->isClientOnly || ( !global->isPrivate && _proxyPrivateOnly ) )
+		return false;
+	// Blocked locally (image not found on server...)
+	if ( local != NULL && local->blocked ) {
+		if ( --local->fails > 0 )
+			return false;
+		local->blocked = false;
+	}
+	if ( global->blocked ) {
+		if ( timing_diff( &global->lastFail, now ) < SERVER_GLOBAL_DUP_TIME )
+			return false;
+		global->lastFail = *now;
+		if ( --global->fails > 0 )
+			return false;
+		global->blocked = false;
+	}
+	// Not blocked, depend on both fail counters
+	int fails = ( local == NULL ? 0 : local->fails ) + global->fails;
+	return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN;
+}
+
+int altservers_getHostListForReplication(dnbd3_host_t *servers, int size) // Writes up to <size> hosts into servers[], returns the number written
+{
+	int idx[size > 0 ? size : 1]; // Receives indices into altServers; a VLA needs a positive length even if size <= 0
+	int num = altservers_getListForUplink( NULL, idx, size, -1 ); // No uplink context, no current server
+	for ( int i = 0; i < num; ++i ) {
+		servers[i] = altServers[idx[i]].host; // Resolve the selected index, not the loop counter
+	}
+	return num;
+}
+
 /**
  * Get <size> alt servers. If there are more alt servers than
  * requested, random servers will be picked.
  * This function is suited for finding uplink servers as
  * it includes private servers and ignores any "client only" servers
+ * @param current index of server for current connection, or -1 in panic mode
  */
-int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency)
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current)
 {
-	if ( size <= 0 ) return 0;
-	int count = 0, i;
-	ticks now;
-	timing_get( &now );
+	if ( size <= 0 )
+		return 0;
+	int count = 0;
+	declare_now;
 	mutex_lock( &altServersLock );
-	// Flip first server in list with a random one every time this is called
-	if ( numAltServers > 1 ) {
-		const dnbd3_alt_server_t tmp = altServers[0];
-		do {
-			i = rand() % numAltServers;
-		} while ( i == 0 );
-		altServers[0] = altServers[i];
-		altServers[i] = tmp;
-	}
-	// We iterate over the list twice. First run adds servers with 0 failures only,
-	// second one also considers those that failed (not too many times)
-	if ( size > numAltServers ) size = numAltServers;
-	for (i = 0; i < numAltServers * 2; ++i) {
-		dnbd3_alt_server_t *srv = &altServers[i % numAltServers];
-		if ( srv->host.type == 0 ) continue; // Slot is empty
-		if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore!
-		if ( srv->isClientOnly ) continue;
-		bool first = ( i < numAltServers );
-		if ( first ) {
-			if ( srv->numFails > 0 ) continue;
-		} else {
-			if ( srv->numFails == 0 ) continue; // Already added in first iteration
-			if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row
-				&& timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore!
-			if ( !emergency ) srv->numFails--;
+	// If we don't have enough servers to randomize, take a shortcut
+	if ( numAltServers <= size ) {
+		for ( int i = 0; i < numAltServers; ++i ) {
+			if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) {
+				servers[count++] = i;
+			}
+		}
+	} else {
+		// Plenty of alt servers; randomize
+		uint8_t state[SERVER_MAX_ALTS] = { 0 };
+		if ( current != -1 ) { // Make sure we also test the current server
+			servers[count++] = current;
+			state[current] = 2;
+		}
+		for ( int tr = size * 10; tr > 0 && count < size; --tr ) {
+			int idx = rand() % numAltServers;
+			if ( state[idx] != 0 )
+				continue;
+			if ( isUsableForUplink( uplink, idx, &now ) ) {
+				servers[count++] = idx;
+				state[idx] = 2; // Used
+			} else {
+				state[idx] = 1; // Potential
+			}
+		}
+		// If panic mode, consider others too
+		for ( int tr = size * 10; current == -1 && tr > 0 && count < size; --tr ) {
+			int idx = rand() % numAltServers;
+			if ( state[idx] == 2 )
+				continue;
+			servers[count++] = idx;
+			state[idx] = 2; // Used
 		}
-		// server seems ok, include in output and decrease its fail counter
-		output[count++] = srv->host;
-		if ( count >= size ) break;
 	}
 	mutex_unlock( &altServersLock );
 	return count;
@@ -320,7 +275,7 @@ json_t* altservers_toJson()
 			"rtt", rtts,
 			"isPrivate", (int)src[i].isPrivate,
 			"isClientOnly", (int)src[i].isClientOnly,
-			"numFails", src[i].numFails
+			"numFails", src[i].fails
 		);
 		json_array_append_new( list, server );
 	}
@@ -329,32 +284,27 @@ json_t* altservers_toJson()
 
 /**
  * Update rtt history of given server - returns the new average for that server.
- * XXX HOLD altServersLock WHEN CALLING THIS!
  */
-static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt)
+static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt)
 {
-	unsigned int avg = rtt;
-	int i;
-	for (i = 0; i < numAltServers; ++i) {
-		if ( !isSameAddressPort( host, &altServers[i].host ) ) continue;
-		altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt;
-#if SERVER_RTT_PROBES == 5
-		avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2]
-				+ altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES;
-#else
-#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES
-		avg = 0;
-		for (int j = 0; j < SERVER_RTT_PROBES; ++j) {
-			avg += altServers[i].rtt[j];
+	uint32_t avg = 0, j;
+	dnbd3_alt_local_t *local = &uplink->altData[index];
+	mutex_lock( &altServersLock );
+	if ( likely( local->initDone ) ) {
+		local->rtt[++local->rttIndex % SERVER_RTT_PROBES] = rtt;
+		for ( j = 0; j < SERVER_RTT_PROBES; ++j ) {
+			avg += local->rtt[j];
 		}
 		avg /= SERVER_RTT_PROBES;
-#endif
-		// If we got a new rtt value, server must be working
-		if ( altServers[i].numFails > 0 ) {
-			altServers[i].numFails--;
+	} else { // First rtt measurement -- copy to every slot
+		for ( j = 0; j < SERVER_RTT_PROBES; ++j ) {
+			local->rtt[j] = rtt;
 		}
-		break;
+		avg = rtt;
+		local->initDone = true;
 	}
+	altServers[index].rtt[++altServers[index].rttIndex % SERVER_RTT_PROBES] = avg;
+	mutex_unlock( &altServersLock );
 	return avg;
 }
 
@@ -383,40 +333,33 @@ int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2)
  * track of how often servers fail, and consider them disabled for some time if they
  * fail too many times.
  */
-void altservers_serverFailed(const dnbd3_host_t * const host)
+void altservers_serverFailed(int server)
 {
-	int i;
-	int foundIndex = -1, lastOk = -1;
-	ticks now;
-	timing_get( &now );
+	declare_now;
 	mutex_lock( &altServersLock );
-	for (i = 0; i < numAltServers; ++i) {
-		if ( foundIndex == -1 ) {
-			// Looking for the failed server in list
-			if ( isSameAddressPort( host, &altServers[i].host ) ) {
-				foundIndex = i;
-			}
-		} else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) {
-			lastOk = i;
+	if ( timing_diff( &altServers[server].lastFail, &now ) > SERVER_GLOBAL_DUP_TIME ) {
+		altServers[server].lastFail = now;
+		if ( altServers[server].fails++ >= SERVER_BAD_UPLINK_MAX ) {
+			altServers[server].blocked = true;
 		}
 	}
-	// Do only increase counter if last fail was not too recent. This is
-	// to prevent the counter from increasing rapidly if many images use the
-	// same uplink. If there's a network hickup, all uplinks will call this
-	// function and would increase the counter too quickly, disabling the server.
-	if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) {
-		altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE;
-		altServers[foundIndex].lastFail = now;
-		if ( lastOk != -1 ) {
-			// Make sure non-working servers are put at the end of the list, so they're less likely
-			// to get picked when testing servers for uplink connections.
-			const dnbd3_alt_server_t tmp = altServers[foundIndex];
-			altServers[foundIndex] = altServers[lastOk];
-			altServers[lastOk] = tmp;
-		}
+	mutex_unlock( &altServersLock );
+}
+
+/**
+ * Called from RTT checker if connecting to a server succeeded but
+ * subsequently selecting the given image failed. Handle this within
+ * the uplink and don't increase the global fail counter.
+ */
+static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server)
+{
+	mutex_lock( &altServersLock );
+	if ( uplink->altData[server].fails++ >= SERVER_BAD_UPLINK_MAX ) {
+		uplink->altData[server].blocked = true;
 	}
 	mutex_unlock( &altServersLock );
 }
+
 /**
  * Mainloop of this module. It will wait for requests by uplinks to find a
  * suitable uplink server for them. If found, it will tell the uplink about
@@ -425,206 +368,213 @@ void altservers_serverFailed(const dnbd3_host_t * const host)
  * will update quite quickly. Needs to be improved some time, ie. by only
  * updating the rtt if the last update was at least X seconds ago.
  */
-static void *altservers_main(void *data UNUSED)
+static void *altservers_runCheck(void *data)
+{
+	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
+
+	assert( uplink != NULL );
+	setThreadName( "altserver-check" );
+	altservers_findUplinkInternal( uplink );
+	// If _closeUnusedFd is set and the timer has expired, call image_closeUnusedFd()
+	// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
+	declare_now;
+	if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
+		timing_gets( &nextCloseUnusedFd, 900 );
+		image_closeUnusedFd();
+	}
+	return NULL;
+}
+
+void altservers_findUplink(dnbd3_uplink_t *uplink)
+{
+	altservers_findUplinkInternal( uplink );
+	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
+		usleep( 5000 );
+	}
+}
+
+int altservers_hostToIndex(dnbd3_host_t *host)
+{
+	for ( int i = 0; i < numAltServers; ++i ) {
+		if ( isSameAddressPort( host, &altServers[i].host ) )
+			return i;
+	}
+	return -1;
+}
+
+const dnbd3_host_t* altservers_indexToHost(int server)
+{
+	return &altServers[server].host;
+}
+
+// XXX Sync call above must block until async worker has finished XXX
+static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 {
 	const int ALTS = 4;
-	int ret, itLink, itAlt, numAlts;
-	bool found;
-	char buffer[DNBD3_BLOCK_SIZE ];
-	dnbd3_reply_t reply;
-	dnbd3_host_t servers[ALTS + 1];
-	serialized_buffer_t serialized;
+	int ret, itAlt, numAlts, current;
+	bool panic;
+	int servers[ALTS + 1];
 	struct timespec start, end;
-	ticks nextCloseUnusedFd;
 
-	setThreadName( "altserver-check" );
-	blockNoncriticalSignals();
-	timing_gets( &nextCloseUnusedFd, 900 );
-	// LOOP
-	while ( !_shutdown ) {
-		// Wait 5 seconds max.
-		ret = signal_wait( runSignal, 5000 );
-		if ( _shutdown ) goto cleanup;
-		if ( ret == SIGNAL_ERROR ) {
-			if ( errno == EAGAIN || errno == EINTR ) continue;
-			logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno );
-			usleep( 100000 );
+	if ( _shutdown )
+		return;
+	mutex_lock( &uplink->rttLock );
+	// Maybe we already have a result, or check is currently running
+	if ( uplink->better.fd != -1 || uplink->rttTestResult == RTT_INPROGRESS ) {
+		mutex_unlock( &uplink->rttLock );
+		return;
+	}
+	assert( uplink->rttTestResult != RTT_DOCHANGE );
+	uplink->rttTestResult = RTT_INPROGRESS;
+	panic = ( uplink->current.fd == -1 );
+	current = uplink->current.index; // Current server index (or last one in panic mode)
+	mutex_unlock( &uplink->rttLock );
+	// First, get 4 alt servers
+	numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current );
+	// If we're already connected and only got one server anyways, there isn't much to do
+	if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) {
+		uplink->rttTestResult = RTT_DONTCHANGE;
+		return;
+	}
+	dnbd3_image_t * const image = image_lock( uplink->image );
+	if ( image == NULL ) { // Check again after locking
+		uplink->rttTestResult = RTT_NOT_REACHABLE;
+		logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
+		return;
+	}
+	LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid );
+	assert( uplink->rttTestResult == RTT_INPROGRESS );
+	// Test them all
+	dnbd3_server_connection_t best = { .fd = -1 };
+	unsigned long bestRtt = RTT_UNREACHABLE;
+	unsigned long currentRtt = RTT_UNREACHABLE;
+	for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+		int server = servers[itAlt];
+		// Connect
+		clock_gettime( BEST_CLOCK_SOURCE, &start );
+		int sock = sock_connect( &altServers[server].host, 750, 1000 );
+		if ( sock == -1 ) { // Connection failed means global error
+			altservers_serverFailed( server );
+			continue;
 		}
-		// Work your way through the queue
-		atomic_thread_fence( memory_order_acquire );
-		for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
-			dnbd3_uplink_t * const uplink = pending[itLink];
-			if ( uplink == NULL )
-				continue;
-			// First, get 4 alt servers
-			numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 );
-			// If we're already connected and only got one server anyways, there isn't much to do
-			if ( numAlts <= 1 && uplink->current.fd != -1 ) {
-				uplink->rttTestResult = RTT_DONTCHANGE;
-				continue;
-			}
-			dnbd3_image_t * const image = image_lock( uplink->image );
-			if ( image == NULL ) { // Check again after locking
-				mutex_lock( &uplink->rttLock );
-				uplink->rttTestResult = RTT_NOT_REACHABLE;
-				assert( pending[itLink] == uplink );
-				pending[itLink] = NULL;
-				mutex_unlock( &uplink->rttLock );
-				logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
-				continue;
-			}
-			LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
-			assert( uplink->rttTestResult == RTT_INPROGRESS );
-			if ( uplink->current.fd != -1 ) {
-				// Add current server if not already in list
-				found = false;
-				for (itAlt = 0; itAlt < numAlts; ++itAlt) {
-					if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue;
-					found = true;
-					break;
-				}
-				if ( !found ) servers[numAlts++] = uplink->current.host;
-			}
-			// Test them all
-			int bestSock = -1;
-			int bestIndex = -1;
-			int bestProtocolVersion = -1;
-			unsigned long bestRtt = RTT_UNREACHABLE;
-			unsigned long currentRtt = RTT_UNREACHABLE;
-			for (itAlt = 0; itAlt < numAlts; ++itAlt) {
-				usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...)
-				// Connect
-				clock_gettime( BEST_CLOCK_SOURCE, &start );
-				int sock = sock_connect( &servers[itAlt], 750, 1000 );
-				if ( sock < 0 ) continue;
-				// Select image ++++++++++++++++++++++++++++++
-				if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
-					goto server_failed;
-				}
-				// See if selecting the image succeeded ++++++++++++++++++++++++++++++
-				uint16_t protocolVersion, rid;
-				uint64_t imageSize;
-				char *name;
-				if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
-					goto server_image_not_available;
-				}
-				if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed;
-				if ( name == NULL || strcmp( name, image->name ) != 0 ) {
-					ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name );
-				}
-				if ( rid != image->rid ) {
-					ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid );
-				}
-				if ( imageSize != image->virtualFilesize ) {
-					ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
-				}
-				// Request first block (NOT random!) ++++++++++++++++++++++++++++++
-				if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
-					LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink );
-				}
-				// See if requesting the block succeeded ++++++++++++++++++++++
-				if ( !dnbd3_get_reply( sock, &reply ) ) {
-					LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink );
-				}
-				// check reply header
-				if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
-					ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
-				}
-				if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
-					ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink );
-				}
-				clock_gettime( BEST_CLOCK_SOURCE, &end );
-				// Measurement done - everything fine so far
-				mutex_lock( &altServersLock );
-				mutex_lock( &uplink->rttLock );
-				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host );
-				// Penaltize rtt if this was a cycle; this will treat this server with lower priority
-				// in the near future too, so we prevent alternating between two servers that are both
-				// part of a cycle and have the lowest latency.
-				const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000
-						+ (end.tv_nsec - start.tv_nsec) / 1000
-						+ ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
-				unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
-				mutex_unlock( &altServersLock );
-				// If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
-				if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
-				mutex_unlock( &uplink->rttLock );
-				if ( uplink->current.fd != -1 && isCurrent ) {
-					// Was measuring current server
-					currentRtt = avg;
-					close( sock );
-				} else if ( avg < bestRtt ) {
-					// Was another server, update "best"
-					if ( bestSock != -1 ) close( bestSock );
-					bestSock = sock;
-					bestRtt = avg;
-					bestIndex = itAlt;
-					bestProtocolVersion = protocolVersion;
-				} else {
-					// Was too slow, ignore
-					close( sock );
-				}
-				// We're done, call continue
-				continue;
-				// Jump here if anything went wrong
-				// This will cleanup and continue
-				server_failed: ;
-				altservers_serverFailed( &servers[itAlt] );
-				server_image_not_available: ;
-				close( sock );
-			}
-			// Done testing all servers. See if we should switch
-			if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
-				// yep
-				if ( currentRtt > 10000000 || uplink->current.fd == -1 ) {
-					LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
-				} else {
-					LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
-				}
-				sock_setTimeout( bestSock, _uplinkTimeout );
-				mutex_lock( &uplink->rttLock );
-				uplink->better.fd = bestSock;
-				uplink->better.host = servers[bestIndex];
-				uplink->better.version = bestProtocolVersion;
-				uplink->rttTestResult = RTT_DOCHANGE;
-				mutex_unlock( &uplink->rttLock );
-				signal_call( uplink->signal );
-			} else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) {
-				// No server was reachable
-				mutex_lock( &uplink->rttLock );
-				uplink->rttTestResult = RTT_NOT_REACHABLE;
-				mutex_unlock( &uplink->rttLock );
-			} else {
-				// nope
-				if ( bestSock != -1 ) close( bestSock );
-				mutex_lock( &uplink->rttLock );
-				uplink->rttTestResult = RTT_DONTCHANGE;
-				uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
-				mutex_unlock( &uplink->rttLock );
-				if ( !image->working ) {
-					image->working = true;
-					LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink );
-				}
-			}
-			image_release( image );
-			// end of loop over all pending uplinks
-			assert( pending[itLink] == uplink );
-			pending[itLink] = NULL;
-			atomic_thread_fence( memory_order_release );
+		// Select image ++++++++++++++++++++++++++++++
+		if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
+			goto image_failed;
 		}
-		// Save cache maps of all images if applicable
-		declare_now;
-		// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
-		if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
-			timing_gets( &nextCloseUnusedFd, 900 );
-			image_closeUnusedFd();
+		// See if selecting the image succeeded ++++++++++++++++++++++++++++++
+		uint16_t protocolVersion, rid;
+		uint64_t imageSize;
+		char *name;
+		serialized_buffer_t serialized;
+		if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
+			goto image_failed;
 		}
+		if ( protocolVersion < MIN_SUPPORTED_SERVER ) { // Server version unsupported; global fail
+			goto server_failed;
+		}
+		if ( name == NULL || strcmp( name, image->name ) != 0 ) {
+			ERROR_GOTO( image_failed, "[RTT] Server offers image '%s' instead of '%s'", name, image->name );
+		}
+		if ( rid != image->rid ) {
+			ERROR_GOTO( image_failed, "[RTT] Server provides rid %d instead of %d", (int)rid, (int)image->rid );
+		}
+		if ( imageSize != image->virtualFilesize ) {
+			ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
+		}
+		// Request first block (NOT random!) ++++++++++++++++++++++++++++++
+		if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server );
+		}
+		// See if requesting the block succeeded ++++++++++++++++++++++
+		dnbd3_reply_t reply;
+		if ( !dnbd3_get_reply( sock, &reply ) ) {
+			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server );
+		}
+		// check reply header
+		if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+			// Sanity check failed; count this as global error (malicious/broken server)
+			ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
+		}
+		// flush payload to include this into measurement
+		char buffer[DNBD3_BLOCK_SIZE];
+		if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+			ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server );
+		}
+		clock_gettime( BEST_CLOCK_SOURCE, &end );
+		// Measurement done - everything fine so far
+		mutex_lock( &uplink->rttLock );
+		const bool isCurrent = ( uplink->current.index == server );
+		mutex_unlock( &uplink->rttLock );
+		// Raw rtt in µs, spanning connect, image selection and the first block transfer.
+		// No cycle penalty is added to the measurement itself; the one-time penalty for
+		// a detected cycle or a lost connection is applied to the returned average below.
+		uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000
+				+ (end.tv_nsec - start.tv_nsec) / 1000); // µs
+		uint32_t avg = altservers_updateRtt( uplink, server, rtt );
+		// If a cycle was detected, or we lost connection to the current (last) server, penalize it one time
+		if ( ( uplink->cycleDetected || panic ) && isCurrent ) {
+			avg = (avg * 2) + 50000;
+		}
+		if ( !panic && isCurrent ) {
+			// Was measuring current server
+			currentRtt = avg;
+			close( sock );
+		} else if ( avg < bestRtt ) {
+			// Was another server, update "best"
+			if ( best.fd != -1 ) {
+				close( best.fd );
+			}
+			best.fd = sock;
+			bestRtt = avg;
+			best.index = server;
+			best.version = protocolVersion;
+		} else {
+			// Was too slow, ignore
+			close( sock );
+		}
+		// We're done, call continue
+		continue;
+		// Jump here if anything went wrong
+		// This will cleanup and continue
+image_failed:
+		altservers_imageFailed( uplink, server );
+		goto failed;
+server_failed:
+		altservers_serverFailed( server );
+failed:
+		close( sock );
 	}
-	cleanup: ;
-	if ( runSignal != NULL ) {
-		signal_close( runSignal );
+	// Done testing all servers. See if we should switch
+	if ( best.fd != -1 && (panic || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
+		// yep
+		if ( currentRtt > 10000000 || panic ) {
+			LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
+		} else {
+			LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
+		}
+		sock_setTimeout( best.fd, _uplinkTimeout );
+		mutex_lock( &uplink->rttLock );
+		uplink->better = best;
+		uplink->rttTestResult = RTT_DOCHANGE;
+		mutex_unlock( &uplink->rttLock );
+		signal_call( uplink->signal );
+	} else if ( best.fd == -1 && currentRtt == RTT_UNREACHABLE ) {
+		// No server was reachable, including current
+		uplink->rttTestResult = RTT_NOT_REACHABLE;
+	} else {
+		// nope
+		if ( best.fd != -1 ) {
+			close( best.fd );
+		}
+		if ( !image->working || uplink->cycleDetected ) {
+			image->working = true;
+			LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid );
+		}
+		uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
+		mutex_lock( &uplink->rttLock );
+		uplink->rttTestResult = RTT_DONTCHANGE;
+		mutex_unlock( &uplink->rttLock );
 	}
-	runSignal = NULL;
-	return NULL ;
+	image_release( image );
 }
 
diff --git a/src/server/altservers.h b/src/server/altservers.h
index e03b900..8e2b964 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -7,23 +7,27 @@ struct json_t;
 
 void altservers_init();
 
-void altservers_shutdown();
-
 int altservers_load();
 
 bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
 
-void altservers_findUplink(dnbd3_uplink_t *uplink);
+void altservers_findUplinkAsync(dnbd3_uplink_t *uplink);
 
-void altservers_removeUplink(dnbd3_uplink_t *uplink);
+void altservers_findUplink(dnbd3_uplink_t *uplink);
 
 int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
 
-int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency);
+int altservers_getHostListForReplication(dnbd3_host_t *servers, int size);
+
+bool altservers_toString(int server, char *buffer, size_t len);
 
 int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
 
-void altservers_serverFailed(const dnbd3_host_t * const host);
+void altservers_serverFailed(int server);
+
+int altservers_hostToIndex(dnbd3_host_t *host);
+
+const dnbd3_host_t* altservers_indexToHost(int server);
 
 struct json_t* altservers_toJson();
 
diff --git a/src/server/globals.h b/src/server/globals.h
index 659e5a2..4d97c6b 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -30,10 +30,31 @@ typedef struct
 	uint8_t hopCount;      // How many hops this request has already taken across proxies
 } dnbd3_queued_request_t;
 
+typedef struct
+{
+	int fails;                    // Hard fail: Connection failed
+	int rttIndex;
+	uint32_t rtt[SERVER_RTT_PROBES];
+	bool isPrivate, isClientOnly;
+	bool blocked;                 // If true count down fails until 0 to enable again
+	ticks lastFail;               // Last hard fail
+	dnbd3_host_t host;
+	char comment[COMMENT_LENGTH];
+} dnbd3_alt_server_t;
+
+typedef struct
+{
+	int fails;                    // Soft fail: Image not found
+	int rttIndex;
+	uint32_t rtt[SERVER_RTT_PROBES];
+	bool blocked;                 // True if server is to be ignored and fails should be counted down
+	bool initDone;
+} dnbd3_alt_local_t;
+
 typedef struct {
-	int fd;             // Socket fd for this connection
-	int version;        // Protocol version of remote server
-	dnbd3_host_t host;  // IP/Port of remote server
+	int fd;            // Socket fd for this connection
+	int version;       // Protocol version of remote server
+	int index;         // Index of the remote server in the alt server list
 } dnbd3_server_connection_t;
 
 #define RTT_IDLE 0 // Not in progress
@@ -51,7 +72,7 @@ struct _dnbd3_uplink
 	pthread_mutex_t queueLock;  // lock for synchronization on request queue etc.
 	dnbd3_image_t *image;       // image that this uplink is used for; do not call get/release for this pointer
 	pthread_mutex_t rttLock;    // When accessing rttTestResult, betterFd or betterServer
-	int rttTestResult;          // RTT_*
+	atomic_int rttTestResult;   // RTT_*
 	int cacheFd;                // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD!
 	uint8_t *recvBuffer;        // Buffer for receiving payload
 	uint32_t recvBufferLen;     // Len of ^^
@@ -65,19 +86,9 @@ struct _dnbd3_uplink
 	atomic_int queueLen;        // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
 };
 
-typedef struct
-{
-	char comment[COMMENT_LENGTH];
-	dnbd3_host_t host;
-	unsigned int rtt[SERVER_RTT_PROBES];
-	unsigned int rttIndex;
-	bool isPrivate, isClientOnly;
-	ticks lastFail;
-	int numFails;
-} dnbd3_alt_server_t;
-
 typedef struct
 {
 	uint8_t host[16];
diff --git a/src/server/image.c b/src/server/image.c
index d250715..1a6e0f8 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1178,7 +1178,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
 	dnbd3_host_t servers[REP_NUM_SRV];
 	int uplinkSock = -1;
 	dnbd3_host_t uplinkServer;
-	const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false );
+	const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV );
 	uint16_t remoteProtocolVersion;
 	uint16_t remoteRid = revision;
 	uint64_t remoteImageSize;
@@ -1491,7 +1491,7 @@ json_t* image_getListAsJson()
 	json_t *imagesJson = json_array();
 	json_t *jsonImage;
 	int i;
-	char uplinkName[100] = { 0 };
+	char uplinkName[100];
 	uint64_t bytesReceived;
 	int completeness, idleTime;
 	declare_now;
@@ -1508,7 +1508,7 @@ json_t* image_getListAsJson()
 			uplinkName[0] = '\0';
 		} else {
 			bytesReceived = image->uplink->bytesReceived;
-			if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, uplinkName, sizeof(uplinkName) ) ) {
+			if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) {
 				uplinkName[0] = '\0';
 			}
 		}
diff --git a/src/server/net.c b/src/server/net.c
index 7f3c1ce..4976eea 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -669,11 +669,19 @@ static void removeFromList(dnbd3_client_t *client)
 {
 	int i;
 	mutex_lock( &_clients_lock );
-	for ( i = _num_clients - 1; i >= 0; --i ) {
-		if ( _clients[i] == client ) {
-			_clients[i] = NULL;
+	if ( _num_clients != 0 ) {
+		for ( i = _num_clients - 1; i >= 0; --i ) {
+			if ( _clients[i] == client ) {
+				_clients[i] = NULL;
+				break;
+			}
+		}
+		if ( i != 0 && i + 1 == _num_clients ) {
+			do {
+				i--;
+			} while ( _clients[i] == NULL && i > 0 );
+			_num_clients = i + 1;
 		}
-		if ( _clients[i] == NULL && i + 1 == _num_clients ) --_num_clients;
 	}
 	mutex_unlock( &_clients_lock );
 }
diff --git a/src/server/server.c b/src/server/server.c
index 838aec2..640048a 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -121,9 +121,6 @@ void dnbd3_cleanup()
 	// Disable threadpool
 	threadpool_close();
 
-	// Terminate the altserver checking thread
-	altservers_shutdown();
-
 	// Terminate all uplinks
 	image_killUplinks();
 
@@ -198,6 +195,11 @@ int main(int argc, char *argv[])
 		case LONGOPT_CRC4:
 			return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE;
 		case LONGOPT_ASSERT:
+			printf( "Testing use after free:\n" );
+			volatile char * volatile test = malloc( 10 );
+			test[0] = 1;
+			free( test );
+			test[1] = 2;
 			printf( "Testing a failing assertion:\n" );
 			assert( 4 == 5 );
 			printf( "Assertion 4 == 5 seems to hold. ;-)\n" );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index e21e28c..6c85580 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -96,17 +96,18 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->bytesReceived = 0;
 	uplink->idleTime = 0;
 	uplink->queueLen = 0;
-	mutex_lock( &uplink->sendMutex );
-	uplink->current.fd = -1;
-	mutex_unlock( &uplink->sendMutex );
 	uplink->cacheFd = -1;
 	uplink->signal = NULL;
 	uplink->replicationHandle = REP_NONE;
 	mutex_lock( &uplink->rttLock );
+	mutex_lock( &uplink->sendMutex );
+	uplink->current.fd = -1;
+	mutex_unlock( &uplink->sendMutex );
 	uplink->cycleDetected = false;
-	if ( sock >= 0 ) {
+	if ( sock != -1 ) {
 		uplink->better.fd = sock;
-		uplink->better.host = *host;
+		int index = altservers_hostToIndex( host );
+		uplink->better.index = index == -1 ? 0 : index; // Prevent invalid array access
 		uplink->rttTestResult = RTT_DOCHANGE;
 		uplink->better.version = version;
 	} else {
@@ -116,7 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	mutex_unlock( &uplink->rttLock );
 	uplink->recvBufferLen = 0;
 	uplink->shutdown = false;
-	if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)link ) ) {
+	if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) {
 		logadd( LOG_ERROR, "Could not start thread for new uplink." );
 		goto failure;
 	}
@@ -148,8 +149,8 @@ void uplink_shutdown(dnbd3_image_t *image)
 	}
 	dnbd3_uplink_t * const uplink = image->uplink;
 	mutex_lock( &uplink->queueLock );
-	if ( !uplink->shutdown ) {
-		uplink->shutdown = true;
+	bool exp = false;
+	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
 		signal_call( uplink->signal );
 		thread = uplink->thread;
 		join = true;
@@ -211,13 +212,11 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
-	if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) {
-		mutex_unlock( &client->image->lock );
-		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
-		mutex_lock( &uplink->rttLock );
+	if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
-		mutex_unlock( &uplink->rttLock );
 		signal_call( uplink->signal );
+		mutex_unlock( &client->image->lock );
+		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
 		return false;
 	}
 
@@ -256,12 +255,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		}
 	}
 	if ( unlikely( requestLoop ) ) {
-		mutex_unlock( &uplink->queueLock );
-		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
-		mutex_lock( &uplink->rttLock );
 		uplink->cycleDetected = true;
-		mutex_unlock( &uplink->rttLock );
 		signal_call( uplink->signal );
+		mutex_unlock( &uplink->queueLock );
+		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
 		return false;
 	}
 	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
@@ -311,6 +308,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( foundExisting != -1 )
 		return true; // Attached to pending request, do nothing
 
+	usleep( 10000 );
+
 	// See if we can fire away the request
 	if ( mutex_trylock( &uplink->sendMutex ) != 0 ) {
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
@@ -342,7 +341,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 				if ( state == -1 ) {
 					logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
 				} else if ( state == ULR_NEW ) {
-					logadd( LOG_DEBUG2, "Succesful direct uplink request" );
+					//logadd( LOG_DEBUG2, "Direct uplink request" );
 				} else {
 					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
 				}
@@ -352,10 +351,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		}
 	}
 
-	if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed
-		if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
-			logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
-		}
+	if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
+		logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
 	}
 	return true;
 }
@@ -443,7 +440,7 @@ static void* uplink_mainloop(void *data)
 			uplink->image->working = true;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
-			if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) {
+			if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
 				logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 );
 				setThreadName( buffer );
 			}
@@ -525,9 +522,7 @@ static void* uplink_mainloop(void *data)
 			}
 		}
 		// See if we should trigger an RTT measurement
-		mutex_lock( &uplink->rttLock );
-		const int rttTestResult = uplink->rttTestResult;
-		mutex_unlock( &uplink->rttLock );
+		int rttTestResult = uplink->rttTestResult;
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
 			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
 				// It seems it's time for a check
@@ -538,7 +533,7 @@ static void* uplink_mainloop(void *data)
 					goto cleanup;
 				} else if ( !uplink_connectionShouldShutdown( uplink ) ) {
 					// Not complete - do measurement
-					altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous)
+					altservers_findUplinkAsync( uplink ); // NOTE(review): RTT_INPROGRESS is set in altservers_findUplinkInternal; "(synchronous)" looks stale for the async call - confirm
 					if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
 						uplink->nextReplicationIndex = 0;
 					}
@@ -547,11 +542,9 @@ static void* uplink_mainloop(void *data)
 				timing_set( &nextAltCheck, &now, altCheckInterval );
 			}
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
-			mutex_lock( &uplink->rttLock );
-			uplink->rttTestResult = RTT_IDLE;
-			mutex_unlock( &uplink->rttLock );
+			atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE );
 			discoverFailCount++;
-			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
+			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
 		}
 #ifdef _DEBUG
 		if ( uplink->current.fd != -1 && !uplink->shutdown ) {
@@ -581,36 +574,38 @@ static void* uplink_mainloop(void *data)
 #endif
 	}
 	cleanup: ;
-	if ( !uplink->shutdown ) {
-		uplink->shutdown = true;
+	// Detach depends on whether someone is joining this thread...
+	bool exp = false;
+	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
 		thread_detach( uplink->thread );
 	}
-	altservers_removeUplink( uplink );
 	uplink_saveCacheMap( uplink );
-	mutex_lock( &uplink->image->lock );
-	if ( uplink->image->uplink == uplink ) {
-		uplink->image->uplink = NULL;
+	dnbd3_image_t *image = uplink->image;
+	mutex_lock( &image->lock );
+	// NOTE(review): image might not be in the list anymore (original comment truncated); only unset image->uplink if it still refers to this uplink
+	if ( image->uplink == uplink ) {
+		image->uplink = NULL;
 	}
+	mutex_unlock( &image->lock ); // Do NOT use image without locking it
 	mutex_lock( &uplink->queueLock );
-	const int fd = uplink->current.fd;
-	const dnbd3_signal_t* signal = uplink->signal;
-	mutex_lock( &uplink->sendMutex );
-	uplink->current.fd = -1;
-	mutex_unlock( &uplink->sendMutex );
-	uplink->signal = NULL;
-	// Do not access uplink->image after unlocking, since we set
-	// image->uplink to NULL. Acquire with image_lock first,
-	// like done below when checking whether to re-init uplink
-	mutex_unlock( &uplink->image->lock );
-	mutex_unlock( &uplink->queueLock );
-	if ( fd != -1 ) close( fd );
-	if ( signal != NULL ) signal_close( signal );
-	// Wait for the RTT check to finish/fail if it's in progress
-	while ( uplink->rttTestResult == RTT_INPROGRESS )
+	// Wait for active RTT measurement to finish
+	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
 		usleep( 10000 );
+	}
+	signal_close( uplink->signal );
+	mutex_lock( &uplink->rttLock );
+	mutex_lock( &uplink->sendMutex );
+	if ( uplink->current.fd != -1 ) {
+		close( uplink->current.fd );
+		uplink->current.fd = -1;
+	}
 	if ( uplink->better.fd != -1 ) {
 		close( uplink->better.fd );
+		uplink->better.fd = -1;
 	}
+	mutex_unlock( &uplink->sendMutex );
+	mutex_unlock( &uplink->rttLock );
+	mutex_unlock( &uplink->queueLock );
 	mutex_destroy( &uplink->queueLock );
 	mutex_destroy( &uplink->rttLock );
 	mutex_destroy( &uplink->sendMutex );
@@ -619,9 +614,9 @@ static void* uplink_mainloop(void *data)
 	if ( uplink->cacheFd != -1 ) {
 		close( uplink->cacheFd );
 	}
-	dnbd3_image_t *image = image_lock( uplink->image );
 	free( uplink ); // !!!
-	if ( image != NULL ) {
+	if ( image_lock( image ) != NULL ) {
+		// Image is still in list...
 		if ( !_shutdown && image->cache_map != NULL ) {
 			// Ingegrity checker must have found something in the meantime
 			uplink_init( image, -1, NULL, 0 );
@@ -656,7 +651,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
 			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			altservers_serverFailed( &uplink->current.host );
+			altservers_serverFailed( uplink->current.index );
 			return;
 		}
 		mutex_lock( &uplink->queueLock );
@@ -973,7 +968,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
 	if ( uplink->current.fd == -1 )
 		return;
-	altservers_serverFailed( &uplink->current.host );
+	altservers_serverFailed( uplink->current.index );
 	mutex_lock( &uplink->sendMutex );
 	close( uplink->current.fd );
 	uplink->current.fd = -1;
@@ -1138,3 +1133,13 @@ static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
 			&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
 }
 
+bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
+{
+	int current;
+	mutex_lock( &uplink->rttLock );
+	current = uplink->current.fd == -1 ? -1 : uplink->current.index;
+	mutex_unlock( &uplink->rttLock );
+	if ( current == -1 )
+		return false;
+	return altservers_toString( current, buffer, len );
+}
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 4fd41b0..acc8e11 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -16,4 +16,6 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 
 void uplink_shutdown(dnbd3_image_t *image);
 
+bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len);
+
 #endif /* UPLINK_H_ */
diff --git a/src/serverconfig.h b/src/serverconfig.h
index 0cbb320..239f0a2 100644
--- a/src/serverconfig.h
+++ b/src/serverconfig.h
@@ -6,10 +6,12 @@
 // +++++ Performance/memory related
 #define SERVER_MAX_CLIENTS 4000
 #define SERVER_MAX_IMAGES  5000
-#define SERVER_MAX_ALTS    100
+#define SERVER_MAX_ALTS    50
 // +++++ Uplink handling (proxy mode)
-#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value
-#define SERVER_BAD_UPLINK_THRES  40 // Thresold for numFails at which we ignore a server for the time span below
+#define SERVER_GLOBAL_DUP_TIME 6 // How many seconds to wait before changing global fail counter again
+#define SERVER_BAD_UPLINK_MIN 10 // Threshold for fails at which we start ignoring the server occasionally
+#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
+#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
 #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
 #define SERVER_MAX_UPLINK_QUEUE  1500 // Maximum number of queued requests per uplink
 #define SERVER_UPLINK_QUEUELEN_THRES  900 // Threshold where we start dropping incoming clients
@@ -33,7 +35,7 @@
 #define SERVER_RTT_PROBES 5 // How many probes to average over
 #define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes
 #define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes
-#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad
+#define SERVER_RTT_MAX_UNREACH 10 // If no server was reachable this many times, stop RTT measurements for a while
 #define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times
 
 #define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes
-- 
cgit v1.2.3-55-g7522


From 64348f92494484c69e182f41d3d13e419632e30e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 23 Aug 2019 12:31:21 +0200
Subject: [BENCH] Check CMD_GET_BLOCK reply cmd type

---
 src/bench/connection.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src')

diff --git a/src/bench/connection.c b/src/bench/connection.c
index ce9438a..498bc62 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -102,6 +102,9 @@ bool connection_init_n_times(
 			} else if ( !dnbd3_get_reply( sock, &reply ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "recv: get block header failed" );
+			} else if ( reply.cmd != CMD_GET_BLOCK ) {
+				counters->fails++;
+				logadd( LOG_ERROR, "recv: get block reply is not CMD_GET_BLOCK" );
 			} else {
 				int rv, togo = blockSize;
 				do {
-- 
cgit v1.2.3-55-g7522


From e86ee9ba6a0b5299e835a51f62fe5979fc36788c Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 26 Aug 2019 12:00:00 +0200
Subject: [SERVER] Fix warnings, simplify locking

---
 src/server/server.c |  2 +-
 src/server/uplink.c | 23 ++++++++++++-----------
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/server/server.c b/src/server/server.c
index 640048a..922740a 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -198,7 +198,7 @@ int main(int argc, char *argv[])
 			printf( "Testing use after free:\n" );
 			volatile char * volatile test = malloc( 10 );
 			test[0] = 1;
-			free( test );
+			free( (void*)test );
 			test[1] = 2;
 			printf( "Testing a failing assertion:\n" );
 			assert( 4 == 5 );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 6c85580..abfebf0 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -370,6 +370,7 @@ static void* uplink_mainloop(void *data)
 	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
 	int numSocks, i, waitTime;
 	int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
+	int rttTestResult;
 	uint32_t discoverFailCount = 0;
 	uint32_t unsavedSeconds = 0;
 	ticks nextAltCheck, lastKeepalive;
@@ -397,11 +398,9 @@ static void* uplink_mainloop(void *data)
 	events[EV_SOCKET].fd = -1;
 	while ( !_shutdown && !uplink->shutdown ) {
 		// poll()
-		mutex_lock( &uplink->rttLock );
 		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
-		mutex_unlock( &uplink->rttLock );
 		if ( waitTime == 0 ) {
-			// Nothing
+			// 0 means poll, since we're about to change the server
 		} else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) {
 			waitTime = 1000;
 		} else {
@@ -420,10 +419,9 @@ static void* uplink_mainloop(void *data)
 			continue;
 		}
 		// Check if server switch is in order
-		mutex_lock( &uplink->rttLock );
-		if ( uplink->rttTestResult != RTT_DOCHANGE ) {
-			mutex_unlock( &uplink->rttLock );
-		} else {
+		if ( unlikely( uplink->rttTestResult == RTT_DOCHANGE ) ) {
+			mutex_lock( &uplink->rttLock );
+			assert( uplink->rttTestResult == RTT_DOCHANGE );
 			uplink->rttTestResult = RTT_IDLE;
 			// The rttTest worker thread has finished our request.
 			// And says it's better to switch to another server
@@ -476,7 +474,7 @@ static void* uplink_mainloop(void *data)
 		// Uplink socket
 		if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
 			uplink_connectionFailed( uplink, true );
-			logadd( LOG_DEBUG1, "Uplink gone away, panic!\n" );
+			logadd( LOG_DEBUG1, "Uplink gone away, panic! (revents=%d)\n", (int)events[EV_SOCKET].revents );
 			setThreadName( "panic-uplink" );
 		} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
 			uplink_handleReceive( uplink );
@@ -509,7 +507,7 @@ static void* uplink_mainloop(void *data)
 			if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) {
 				mutex_lock( &uplink->sendMutex );
 				close( uplink->current.fd );
-				uplink->current.fd = events[EV_SOCKET].fd = -1;
+				uplink->current.fd = -1;
 				mutex_unlock( &uplink->sendMutex );
 				uplink->cycleDetected = false;
 				if ( uplink->recvBufferLen != 0 ) {
@@ -522,7 +520,7 @@ static void* uplink_mainloop(void *data)
 			}
 		}
 		// See if we should trigger an RTT measurement
-		int rttTestResult = uplink->rttTestResult;
+		rttTestResult = uplink->rttTestResult;
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
 			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
 				// It seems it's time for a check
@@ -964,6 +962,9 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 	uplink_connectionFailed( uplink, true );
 }
 
+/**
+ * Only call from uplink thread
+ */
 static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
 	if ( uplink->current.fd == -1 )
@@ -984,7 +985,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 	mutex_unlock( &uplink->rttLock );
 	if ( bail )
 		return;
-	altservers_findUplink( uplink );
+	altservers_findUplinkAsync( uplink );
 }
 
 /**
-- 
cgit v1.2.3-55-g7522


From 69f5bf408b9587a6e2008fba2224c2d506f1a895 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 27 Aug 2019 16:13:07 +0200
Subject: [SERVER] Use reference counting for uplink

First step towards less locking for proxy mode
---
 src/server/altservers.c |  13 ++-
 src/server/globals.h    |   4 +-
 src/server/image.c      |  39 ++++-----
 src/server/integrity.c  |  17 ++--
 src/server/net.c        |  48 +++++++----
 src/server/net.h        |   2 +
 src/server/reference.c  |  33 ++++++++
 src/server/reference.h  |  54 ++++++++++++
 src/server/reftypes.h   |  25 ++++++
 src/server/uplink.c     | 214 ++++++++++++++++++++++++++++--------------------
 src/server/uplink.h     |   2 +-
 11 files changed, 311 insertions(+), 140 deletions(-)
 create mode 100644 src/server/reference.c
 create mode 100644 src/server/reference.h
 create mode 100644 src/server/reftypes.h

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 493ed9e..7d7fdbe 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -7,6 +7,8 @@
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../serverconfig.h"
+#include "reference.h"
+
 #include <assert.h>
 #include <inttypes.h>
 #include <jansson.h>
@@ -104,7 +106,6 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 		return;
 	if ( uplink->current.fd != -1 && numAltServers <= 1 )
 		return;
-	int i;
 	// if betterFd != -1 it means the uplink is supposed to switch to another
 	// server. As this function here is called by the uplink thread, it can
 	// never be that the uplink is supposed to switch, but instead calls
@@ -112,11 +113,14 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 	assert( uplink->better.fd == -1 );
 	// it is however possible that an RTT measurement is currently in progress,
 	// so check for that case and do nothing if one is in progress
-	mutex_lock( &uplink->rttLock );
 	if ( uplink->rttTestResult != RTT_INPROGRESS ) {
-		threadpool_run( &altservers_runCheck, uplink );
+		dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref );
+		if ( current == uplink ) {
+			threadpool_run( &altservers_runCheck, uplink );
+		} else if ( current != NULL ) {
+			ref_put( &current->reference );
+		}
 	}
-	mutex_unlock( &uplink->rttLock );
 }
 
 /**
@@ -375,6 +379,7 @@ static void *altservers_runCheck(void *data)
 	assert( uplink != NULL );
 	setThreadName( "altserver-check" );
 	altservers_findUplinkInternal( uplink );
+	ref_put( &uplink->reference ); // Acquired in findUplinkAsync
 	// Save cache maps of all images if applicable
 	// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
 	declare_now;
diff --git a/src/server/globals.h b/src/server/globals.h
index 4d97c6b..5dd205a 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -8,6 +8,7 @@
 #include <stdatomic.h>
 #include <time.h>
 #include <pthread.h>
+#include "reftypes.h"
 
 typedef struct timespec ticks;
 
@@ -64,6 +65,7 @@ typedef struct {
 #define RTT_NOT_REACHABLE 4 // No uplink was reachable
 struct _dnbd3_uplink
 {
+	ref reference;
 	dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected
 	dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none
 	dnbd3_signal_t* signal;     // used to wake up the process
@@ -107,7 +109,7 @@ struct _dnbd3_image
 {
 	char *path;            // absolute path of the image
 	char *name;            // public name of the image (usually relative path minus revision ID)
-	dnbd3_uplink_t *uplink; // pointer to a server connection
+	weakref uplinkref;     // pointer to a server connection
 	uint8_t *cache_map;    // cache map telling which parts are locally cached, NULL if complete
 	uint64_t virtualFilesize;   // virtual size of image (real size rounded up to multiple of 4k)
 	uint64_t realFilesize;      // actual file size on disk
diff --git a/src/server/image.c b/src/server/image.c
index 1a6e0f8..5b58347 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -8,6 +8,7 @@
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../shared/crc32.h"
+#include "reference.h"
 
 #include <assert.h>
 #include <fcntl.h>
@@ -375,9 +376,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 
 	// Check if image is incomplete, handle
 	if ( candidate->cache_map != NULL ) {
-		if ( candidate->uplink == NULL ) {
-			uplink_init( candidate, -1, NULL, -1 );
-		}
+		uplink_init( candidate, -1, NULL, -1 );
 	}
 
 	return candidate; // We did all we can, hopefully it's working
@@ -484,17 +483,7 @@ void image_killUplinks()
 	mutex_lock( &imageListLock );
 	for (i = 0; i < _num_images; ++i) {
 		if ( _images[i] == NULL ) continue;
-		mutex_lock( &_images[i]->lock );
-		if ( _images[i]->uplink != NULL ) {
-			mutex_lock( &_images[i]->uplink->queueLock );
-			if ( !_images[i]->uplink->shutdown ) {
-				thread_detach( _images[i]->uplink->thread );
-				_images[i]->uplink->shutdown = true;
-			}
-			mutex_unlock( &_images[i]->uplink->queueLock );
-			signal_call( _images[i]->uplink->signal );
-		}
-		mutex_unlock( &_images[i]->lock );
+		uplink_shutdown( _images[i] );
 	}
 	mutex_unlock( &imageListLock );
 }
@@ -588,11 +577,15 @@ bool image_tryFreeAll()
 static dnbd3_image_t* image_free(dnbd3_image_t *image)
 {
 	assert( image != NULL );
+	assert( image->users == 0 );
 	if ( !_shutdown ) {
 		logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid );
 	}
-	//
-	uplink_shutdown( image );
+	// uplink_shutdown might return false to tell us
+	// that the shutdown is in progress. Bail out since
+	// this will get called again when the uplink is done.
+	if ( !uplink_shutdown( image ) )
+		return NULL;
 	mutex_lock( &image->lock );
 	free( image->cache_map );
 	free( image->crc32 );
@@ -860,7 +853,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->cache_map = cache_map;
 	image->crc32 = crc32list;
 	image->masterCrc32 = masterCrc;
-	image->uplink = NULL;
+	image->uplinkref = NULL;
 	image->realFilesize = realFilesize;
 	image->virtualFilesize = virtualFilesize;
 	image->rid = (uint16_t)revision;
@@ -1503,16 +1496,18 @@ json_t* image_getListAsJson()
 		mutex_lock( &image->lock );
 		idleTime = (int)timing_diff( &image->atime, &now );
 		completeness = image_getCompletenessEstimate( image );
-		if ( image->uplink == NULL ) {
+		mutex_unlock( &image->lock );
+		dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+		if ( uplink == NULL ) {
 			bytesReceived = 0;
 			uplinkName[0] = '\0';
 		} else {
-			bytesReceived = image->uplink->bytesReceived;
-			if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) {
+			bytesReceived = uplink->bytesReceived;
+			if ( !uplink_getHostString( uplink, uplinkName, sizeof(uplinkName) ) ) {
 				uplinkName[0] = '\0';
 			}
+			ref_put( &uplink->reference );
 		}
-		mutex_unlock( &image->lock );
 
 		jsonImage = json_pack( "{sisssisisisisI}",
 				"id", image->id, // id, name, rid never change, so access them without locking
@@ -1734,7 +1729,7 @@ void image_closeUnusedFd()
 		if ( image == NULL )
 			continue;
 		mutex_lock( &image->lock );
-		if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) {
+		if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
 			snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid );
 			fd = image->readFd;
 			image->readFd = -1;
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 3d1ac9b..f358c46 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -4,6 +4,7 @@
 #include "locks.h"
 #include "image.h"
 #include "uplink.h"
+#include "reference.h"
 
 #include <assert.h>
 #include <sys/syscall.h>
@@ -238,11 +239,13 @@ static void* integrity_main(void * data UNUSED)
 					if ( i + 1 == queueLen ) queueLen--;
 					// Mark as working again if applicable
 					if ( !foundCorrupted ) {
-						mutex_lock( &image->lock );
-						if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper?
-							image->working = image->uplink->current.fd != -1 && image->readFd != -1;
+						dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+						if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper?
+							mutex_lock( &image->lock );
+							image->working = uplink->current.fd != -1 && image->readFd != -1;
+							mutex_unlock( &image->lock );
+							ref_put( &uplink->reference );
 						}
-						mutex_unlock( &image->lock );
 					}
 				} else {
 					// Still more blocks to go...
@@ -255,12 +258,8 @@ static void* integrity_main(void * data UNUSED)
 				// Something was fishy, make sure uplink exists
 				mutex_lock( &image->lock );
 				image->working = false;
-				bool restart = image->uplink == NULL || image->uplink->shutdown;
 				mutex_unlock( &image->lock );
-				if ( restart ) {
-					uplink_shutdown( image );
-					uplink_init( image, -1, NULL, -1 );
-				}
+				uplink_init( image, -1, NULL, -1 );
 			}
 			// Release :-)
 			image_release( image );
diff --git a/src/server/net.c b/src/server/net.c
index 4976eea..e0b516e 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -24,6 +24,7 @@
 #include "locks.h"
 #include "rpc.h"
 #include "altservers.h"
+#include "reference.h"
 
 #include "../shared/sockhelper.h"
 #include "../shared/timing.h"
@@ -229,7 +230,7 @@ void* net_handleNewConnection(void *clientPtr)
 		rid = serializer_get_uint16( &payload );
 		const uint8_t flags = serializer_get_uint8( &payload );
 		client->isServer = ( flags & FLAGS8_SERVER );
-		if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) {
+		if ( unlikely( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) ) {
 			if ( client_version < MIN_SUPPORTED_CLIENT ) {
 				logadd( LOG_DEBUG1, "Client %s too old", client->hostName );
 			} else {
@@ -257,22 +258,25 @@ void* net_handleNewConnection(void *clientPtr)
 			}
 			client->image = image;
 			atomic_thread_fence( memory_order_release );
-			if ( image == NULL ) {
+			if ( unlikely( image == NULL ) ) {
 				//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
-			} else if ( !image->working ) {
+			} else if ( unlikely( !image->working ) ) {
 				logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
 						client->hostName, image_name, (int)rid );
 			} else {
-				bool penalty;
 				// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
 				bOk = true;
 				if ( image->cache_map != NULL ) {
-					mutex_lock( &image->lock );
-					if ( image->uplink == NULL || image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+					if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1;
-					mutex_unlock( &image->lock );
+					bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 );
+					if ( uplink == NULL ) {
+						uplink_init( image, -1, NULL, 0 );
+					} else {
+						ref_put( &uplink->reference );
+					}
 					if ( penalty ) { // Wait 100ms if local caching is not working so this
 						usleep( 100000 ); // server gets a penalty and is less likely to be selected
 					}
@@ -300,7 +304,7 @@ void* net_handleNewConnection(void *clientPtr)
 		}
 	}
 
-	if ( bOk ) {
+	if ( likely( bOk ) ) {
 		// add artificial delay if applicable
 		if ( client->isServer && _serverPenalty != 0 ) {
 			usleep( _serverPenalty );
@@ -315,7 +319,7 @@ void* net_handleNewConnection(void *clientPtr)
 			case CMD_GET_BLOCK:;
 				const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
 				reply.handle = request.handle;
-				if ( offset >= image->virtualFilesize ) {
+				if ( unlikely( offset >= image->virtualFilesize ) ) {
 					// Sanity check
 					logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName );
 					reply.size = 0;
@@ -323,7 +327,7 @@ void* net_handleNewConnection(void *clientPtr)
 					send_reply( client->sock, &reply, NULL );
 					break;
 				}
-				if ( offset + request.size > image->virtualFilesize ) {
+				if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
 					// Sanity check
 					logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName );
 					reply.size = 0;
@@ -398,7 +402,7 @@ void* net_handleNewConnection(void *clientPtr)
 				reply.size = request.size;
 
 				fixup_reply( reply );
-				const bool lock = image->uplink != NULL;
+				const bool lock = image->uplinkref != NULL;
 				if ( lock ) mutex_lock( &client->sendMutex );
 				// Send reply header
 				if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) {
@@ -696,9 +700,11 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
 {
 	mutex_lock( &client->lock );
 	if ( client->image != NULL ) {
-		mutex_lock( &client->image->lock );
-		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
-		mutex_unlock( &client->image->lock );
+		dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
+		if ( uplink != NULL ) {
+			uplink_removeClient( uplink, client );
+			ref_put( &uplink->reference );
+		}
 	}
 	mutex_lock( &client->sendMutex );
 	if ( client->sock != -1 ) {
@@ -740,3 +746,15 @@ static bool addToList(dnbd3_client_t *client)
 	return true;
 }
 
+void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle)
+{
+	dnbd3_reply_t reply;
+	reply.magic = dnbd3_packet_magic;
+	reply.cmd = cmd;
+	reply.handle = handle;
+	reply.size = 0;
+	mutex_lock( &client->sendMutex );
+	send_reply( client->sock, &reply, NULL );
+	mutex_unlock( &client->sendMutex );
+}
+
diff --git a/src/server/net.h b/src/server/net.h
index 6813b49..7719aef 100644
--- a/src/server/net.h
+++ b/src/server/net.h
@@ -37,4 +37,6 @@ void net_disconnectAll();
 
 void net_waitForAllDisconnected();
 
+void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle);
+
 #endif /* NET_H_ */
diff --git a/src/server/reference.c b/src/server/reference.c
new file mode 100644
index 0000000..468e00b
--- /dev/null
+++ b/src/server/reference.c
@@ -0,0 +1,33 @@
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+#include "reference.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+void ref_init( ref *reference, void ( *freefun )( ref * ), long count )
+{
+	reference->count = count;
+	reference->free = freefun;
+}
+
+_Noreturn void _ref_error( const char *message )
+{
+	fprintf( stderr, "Reference counter overflow\n" );
+	abort();
+}
+
+void ref_setref( weakref *weakref, ref *ref )
+{
+	union _aligned_ref_ *new_weakref = 0;
+	if ( ref ) {
+		( new_weakref = aligned_ref( ref->_aligned_ref ) )->ref = ref;
+		ref->count += sizeof( union _aligned_ref_ ) + 1;
+	}
+	char *old_weakref = (char *)atomic_exchange( weakref, new_weakref );
+	if ( !old_weakref )
+		return;
+	struct _ref_ *old_ref = aligned_ref( old_weakref )->ref;
+	old_ref->count += old_weakref - (char *)aligned_ref( old_weakref ) - sizeof( union _aligned_ref_ );
+	ref_put( old_ref );
+}
diff --git a/src/server/reference.h b/src/server/reference.h
new file mode 100644
index 0000000..0bc081a
--- /dev/null
+++ b/src/server/reference.h
@@ -0,0 +1,54 @@
+#ifndef _REFERENCE_H_
+#define _REFERENCE_H_
+
+#include "reftypes.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - (char *)&(((type *)NULL)->member)))
+
+void ref_init( ref *reference, void ( *freefun )( ref * ), long count );
+
+void ref_setref( weakref *weakref, ref *ref );
+
+_Noreturn void _ref_error( const char *message );
+
+static inline ref *ref_get( weakref *weakref )
+{
+	char *old_weakref = (char *)*weakref;
+	do {
+		if ( old_weakref == NULL )
+			return NULL;
+		if ( aligned_ref( old_weakref ) != aligned_ref( old_weakref + 1 ) ) {
+			old_weakref = (char *)*weakref;
+			continue;
+		}
+	} while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) );
+	struct _ref_ *ref = aligned_ref( old_weakref )->ref;
+	if ( unlikely( ++ref->count == -1 ) ) {
+		_ref_error( "Reference counter overflow. Aborting.\n" );
+	}
+	char *cur_weakref = ( char * )*weakref;
+	do {
+		if ( aligned_ref( cur_weakref ) != aligned_ref( old_weakref ) ) {
+			ref->count--;
+			break;
+		}
+	} while ( !atomic_compare_exchange_weak( weakref, (void **)&cur_weakref, cur_weakref - 1 ) );
+	return ref;
+}
+
+static inline void ref_put( ref *ref )
+{
+	if ( --ref->count == 0 ) {
+		ref->free( ref );
+	}
+}
+
+#define ref_get_uplink(wr) ({ \
+	ref* ref = ref_get( wr ); \
+	ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \
+})
+
+#endif
diff --git a/src/server/reftypes.h b/src/server/reftypes.h
new file mode 100644
index 0000000..45c0c20
--- /dev/null
+++ b/src/server/reftypes.h
@@ -0,0 +1,25 @@
+#ifndef _REFTYPES_H_
+#define _REFTYPES_H_
+
+#include <stdatomic.h>
+
+_Static_assert( sizeof( void * ) == sizeof( _Atomic( void * ) ), "Atomic pointer bad" );
+
+typedef _Atomic( void * ) weakref;
+
+#define aligned_ref(ptr) \
+	((union _aligned_ref_ *)((ptr) - (uintptr_t)(ptr) % sizeof(union _aligned_ref_)))
+
+union _aligned_ref_ {
+	struct _ref_ *ref;
+	void *_padding[( 32 - 1 ) / sizeof( void * ) + 1];
+};
+
+typedef struct _ref_ {
+	_Atomic long count;
+	void ( *free )( struct _ref_ * );
+	char _padding[sizeof( union _aligned_ref_ )];
+	char _aligned_ref[sizeof( union _aligned_ref_ )];
+} ref;
+
+#endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index abfebf0..7a39887 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -3,10 +3,12 @@
 #include "locks.h"
 #include "image.h"
 #include "altservers.h"
+#include "net.h"
 #include "../shared/sockhelper.h"
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../shared/crc32.h"
+#include "reference.h"
 
 #include <assert.h>
 #include <inttypes.h>
@@ -45,6 +47,8 @@ static const char *const NAMES_ULR[4] = {
 
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
+static void cancelAllRequests(dnbd3_uplink_t *uplink);
+static void uplink_free(ref *ref);
 static void* uplink_mainloop(void *data);
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
 static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
@@ -76,19 +80,24 @@ uint64_t uplink_getTotalBytesReceived()
 bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version)
 {
 	if ( !_isProxy || _shutdown ) return false;
-	dnbd3_uplink_t *uplink = NULL;
 	assert( image != NULL );
 	mutex_lock( &image->lock );
-	if ( image->uplink != NULL && !image->uplink->shutdown ) {
+	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+	if ( uplink != NULL ) {
 		mutex_unlock( &image->lock );
-		if ( sock >= 0 ) close( sock );
+		if ( sock != -1 ) {
+			close( sock );
+		}
+		ref_put( &uplink->reference );
 		return true; // There's already an uplink, so should we consider this success or failure?
 	}
 	if ( image->cache_map == NULL ) {
 		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
 		goto failure;
 	}
-	uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
+	uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
+	// Start with one reference for the uplink thread. We'll return it when the thread finishes
+	ref_init( &uplink->reference, uplink_free, 1 );
 	mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
 	mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
 	mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
@@ -121,12 +130,13 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		logadd( LOG_ERROR, "Could not start thread for new uplink." );
 		goto failure;
 	}
+	ref_setref( &image->uplinkref, &uplink->reference );
 	mutex_unlock( &image->lock );
 	return true;
 failure: ;
 	if ( uplink != NULL ) {
 		free( uplink );
-		uplink = image->uplink = NULL;
+		uplink = NULL;
 	}
 	mutex_unlock( &image->lock );
 	return false;
@@ -137,34 +147,83 @@ failure: ;
  * Calling it multiple times, even concurrently, will
  * not break anything.
  */
-void uplink_shutdown(dnbd3_image_t *image)
+bool uplink_shutdown(dnbd3_image_t *image)
 {
-	bool join = false;
-	pthread_t thread;
 	assert( image != NULL );
 	mutex_lock( &image->lock );
-	if ( image->uplink == NULL ) {
+	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+	if ( uplink == NULL ) {
 		mutex_unlock( &image->lock );
-		return;
+		return true;
 	}
-	dnbd3_uplink_t * const uplink = image->uplink;
 	mutex_lock( &uplink->queueLock );
 	bool exp = false;
 	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
+		image->users++; // Prevent free while uplink shuts down
 		signal_call( uplink->signal );
-		thread = uplink->thread;
-		join = true;
+	} else {
+		logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid );
 	}
+	cancelAllRequests( uplink );
+	ref_setref( &image->uplinkref, NULL );
+	ref_put( &uplink->reference );
 	mutex_unlock( &uplink->queueLock );
-	bool wait = image->uplink != NULL;
+	bool retval = ( exp && image->users == 0 );
 	mutex_unlock( &image->lock );
-	if ( join ) thread_join( thread, NULL );
-	while ( wait ) {
-		usleep( 5000 );
-		mutex_lock( &image->lock );
-		wait = image->uplink != NULL && image->uplink->shutdown;
-		mutex_unlock( &image->lock );
+	return exp;
+}
+
+/**
+ * Cancel all requests of this uplink.
+ * HOLD QUEUE LOCK WHILE CALLING
+ */
+static void cancelAllRequests(dnbd3_uplink_t *uplink)
+{
+	for ( int i = 0; i < uplink->queueLen; ++i ) {
+		if ( uplink->queue[i].status != ULR_FREE ) {
+			net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle );
+			uplink->queue[i].status = ULR_FREE;
+		}
+	}
+	uplink->queueLen = 0;
+}
+
+static void uplink_free(ref *ref)
+{
+	dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
+	logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid );
+	assert( uplink->queueLen == 0 );
+	signal_close( uplink->signal );
+	if ( uplink->current.fd != -1 ) {
+		close( uplink->current.fd );
+		uplink->current.fd = -1;
+	}
+	if ( uplink->better.fd != -1 ) {
+		close( uplink->better.fd );
+		uplink->better.fd = -1;
+	}
+	mutex_destroy( &uplink->queueLock );
+	mutex_destroy( &uplink->rttLock );
+	mutex_destroy( &uplink->sendMutex );
+	free( uplink->recvBuffer );
+	uplink->recvBuffer = NULL;
+	if ( uplink->cacheFd != -1 ) {
+		close( uplink->cacheFd );
 	}
+	// TODO Requeue any requests
+	dnbd3_image_t *image = image_lock( uplink->image );
+	if ( image != NULL ) {
+		// != NULL means image is still in list...
+		if ( !_shutdown && image->cache_map != NULL ) {
+			// Integrity checker must have found something in the meantime
+			uplink_init( image, -1, NULL, 0 );
+		}
+		image_release( image );
+	}
+	// Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code
+	// of the uplink thread, depending on who set the uplink->shutdown flag.
+	image_release( image );
+	free( uplink ); // !!!
 }
 
 /**
@@ -193,31 +252,28 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
  */
 bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
 {
-	if ( client == NULL || client->image == NULL ) return false;
+	if ( client == NULL || client->image == NULL )
+		return false;
 	if ( length > (uint32_t)_maxPayload ) {
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
 	}
-	mutex_lock( &client->image->lock );
-	if ( client->image->uplink == NULL ) {
-		mutex_unlock( &client->image->lock );
+	dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref );
+	if ( uplink == NULL ) {
 		logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
 		return false;
 	}
-	dnbd3_uplink_t * const uplink = client->image->uplink;
 	if ( uplink->shutdown ) {
-		mutex_unlock( &client->image->lock );
 		logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
-		return false;
+		goto fail_ref;
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
 	if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
-		mutex_unlock( &client->image->lock );
 		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
-		return false;
+		goto fail_ref;
 	}
 
 	int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
@@ -229,7 +285,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	const uint64_t end = start + length;
 
 	mutex_lock( &uplink->queueLock );
-	mutex_unlock( &client->image->lock );
+	if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
+		goto fail_lock;
+	}
 	for (i = 0; i < uplink->queueLen; ++i) {
 		// find free slot to place this request into
 		if ( uplink->queue[i].status == ULR_FREE ) {
@@ -257,18 +315,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( unlikely( requestLoop ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
-		mutex_unlock( &uplink->queueLock );
 		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
-		return false;
+		goto fail_lock;
 	}
 	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
 		freeSlot = -1; // Not attaching to existing request, make it use a higher slot
 	}
 	if ( freeSlot == -1 ) {
 		if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
-			mutex_unlock( &uplink->queueLock );
 			logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
-			return false;
+			goto fail_lock;
 		}
 		freeSlot = uplink->queueLen++;
 	}
@@ -305,16 +361,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 #endif
 	mutex_unlock( &uplink->queueLock );
 
-	if ( foundExisting != -1 )
+	if ( foundExisting != -1 ) {
+		ref_put( &uplink->reference );
 		return true; // Attached to pending request, do nothing
-
-	usleep( 10000 );
+	}
 
 	// See if we can fire away the request
-	if ( mutex_trylock( &uplink->sendMutex ) != 0 ) {
+	if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) {
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
 	} else {
-		if ( uplink->current.fd == -1 ) {
+		if ( unlikely( uplink->current.fd == -1 ) ) {
 			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 		} else {
@@ -323,13 +379,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			if ( hops < 200 ) ++hops;
 			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
 			mutex_unlock( &uplink->sendMutex );
-			if ( !ret ) {
+			if ( unlikely( !ret ) ) {
 				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
 			} else {
 				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
 				int state;
 				mutex_lock( &uplink->queueLock );
-				if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
+				if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
 					state = uplink->queue[freeSlot].status;
 					if ( uplink->queue[freeSlot].status == ULR_NEW ) {
 						uplink->queue[freeSlot].status = ULR_PENDING;
@@ -345,6 +401,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 				} else {
 					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
 				}
+				ref_put( &uplink->reference );
 				return true;
 			}
 			// Fall through to waking up sender thread
@@ -354,7 +411,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
 		logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
 	}
+	ref_put( &uplink->reference );
 	return true;
+fail_lock:
+	mutex_unlock( &uplink->queueLock );
+fail_ref:
+	ref_put( &uplink->reference );
+	return false;
 }
 
 /**
@@ -381,6 +444,7 @@ static void* uplink_mainloop(void *data)
 	//
 	assert( uplink != NULL );
 	setThreadName( "idle-uplink" );
+	thread_detach( uplink->thread );
 	blockNoncriticalSignals();
 	// Make sure file is open for writing
 	if ( !uplink_reopenCacheFd( uplink, false ) ) {
@@ -553,7 +617,7 @@ static void* uplink_mainloop(void *data)
 			for (i = 0; i < uplink->queueLen; ++i) {
 				if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
 					snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
-							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, uplink->queue[i].client->image->name,
+							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
 							uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
 					uplink->queue[i].entered = now;
 #ifdef _DEBUG_RESEND_STARVING
@@ -572,55 +636,26 @@ static void* uplink_mainloop(void *data)
 #endif
 	}
 	cleanup: ;
-	// Detach depends on whether someone is joining this thread...
-	bool exp = false;
-	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
-		thread_detach( uplink->thread );
-	}
 	uplink_saveCacheMap( uplink );
 	dnbd3_image_t *image = uplink->image;
 	mutex_lock( &image->lock );
-	// in the list anymore, but we want to prevent it from being freed in either case
-	if ( image->uplink == uplink ) {
-		image->uplink = NULL;
-	}
-	mutex_unlock( &image->lock ); // Do NOT use image without locking it
-	mutex_lock( &uplink->queueLock );
-	// Wait for active RTT measurement to finish
-	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
-		usleep( 10000 );
-	}
-	signal_close( uplink->signal );
-	mutex_lock( &uplink->rttLock );
-	mutex_lock( &uplink->sendMutex );
-	if ( uplink->current.fd != -1 ) {
-		close( uplink->current.fd );
-		uplink->current.fd = -1;
-	}
-	if ( uplink->better.fd != -1 ) {
-		close( uplink->better.fd );
-		uplink->better.fd = -1;
+	bool exp = false;
+	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
+		image->users++; // We set the flag - hold onto image
 	}
-	mutex_unlock( &uplink->sendMutex );
-	mutex_unlock( &uplink->rttLock );
-	mutex_unlock( &uplink->queueLock );
-	mutex_destroy( &uplink->queueLock );
-	mutex_destroy( &uplink->rttLock );
-	mutex_destroy( &uplink->sendMutex );
-	free( uplink->recvBuffer );
-	uplink->recvBuffer = NULL;
-	if ( uplink->cacheFd != -1 ) {
-		close( uplink->cacheFd );
+	dnbd3_uplink_t *current = ref_get_uplink( &image->uplinkref );
+	if ( current == uplink ) { // Set NULL if it's still us...
+		mutex_lock( &uplink->queueLock );
+		cancelAllRequests( uplink );
+		mutex_unlock( &uplink->queueLock );
+		ref_setref( &image->uplinkref, NULL );
 	}
-	free( uplink ); // !!!
-	if ( image_lock( image ) != NULL ) {
-		// Image is still in list...
-		if ( !_shutdown && image->cache_map != NULL ) {
-			// Ingegrity checker must have found something in the meantime
-			uplink_init( image, -1, NULL, 0 );
-		}
-		image_release( image );
+	if ( current != NULL ) { // Decrease ref in any case
+		ref_put( &current->reference );
 	}
+	mutex_unlock( &image->lock );
+	// Finally as the thread is done, decrease our own ref that we initialized with
+	ref_put( &uplink->reference );
 	return NULL ;
 }
 
@@ -637,7 +672,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
 		/*
 		logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
-				(void*)link, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
+				(void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
 		*/
 		mutex_unlock( &uplink->queueLock );
 		if ( hops < 200 ) ++hops;
@@ -782,7 +817,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 
 /**
  * Receive data from uplink server and process/dispatch
- * Locks on: link.lock, images[].lock
+ * Locks on: uplink.lock, images[].lock
  */
 static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
@@ -924,13 +959,16 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 				}
 				mutex_unlock( &client->sendMutex );
 				mutex_lock( &uplink->queueLock );
+				if ( i > uplink->queueLen ) {
+					uplink->queueLen = i; // Might have been set to 0 by cancelAllRequests
+				}
 			}
 			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
 		mutex_unlock( &uplink->queueLock );
 #ifdef _DEBUG
 		if ( !served && start != uplink->replicationHandle ) {
-			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, uplink->image->name, start, end );
+			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
 		}
 #endif
 		if ( start == uplink->replicationHandle ) {
diff --git a/src/server/uplink.h b/src/server/uplink.h
index acc8e11..49ff0b4 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -14,7 +14,7 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
 
 bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
 
-void uplink_shutdown(dnbd3_image_t *image);
+bool uplink_shutdown(dnbd3_image_t *image);
 
 bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len);
 
-- 
cgit v1.2.3-55-g7522


From b848c60317dcb5193e4541a679dfc82a257f83e9 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 27 Aug 2019 20:58:01 +0200
Subject: [SERVER] Fix swapped assignment

---
 src/server/uplink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 7a39887..d77be9c 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -960,7 +960,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 				mutex_unlock( &client->sendMutex );
 				mutex_lock( &uplink->queueLock );
 				if ( i > uplink->queueLen ) {
-					uplink->queueLen = i; // Might have been set to 0 by cancelAllRequests
+					i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests
 				}
 			}
 			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
-- 
cgit v1.2.3-55-g7522


From ff228ee519c27bb80234e6eadbf9cbe1adda4318 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 27 Aug 2019 23:20:20 +0200
Subject: [SERVER] Add timer infrastructure

To be used by "close unused fd" checks, and others...
---
 src/server/server.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++----
 src/server/server.h |   2 +-
 2 files changed, 134 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/server/server.c b/src/server/server.c
index 922740a..1cdd2ab 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -45,6 +45,25 @@
 #define LONGOPT_SIZE       1004
 #define LONGOPT_ERRORMSG   1005
 
+typedef struct _job job_t;
+
+struct _job {
+	job_t *next;
+	void *(*startRoutine)(void *);
+	void *arg;
+	ticks dueDate;
+	int intervalSecs;
+};
+
+static job_t *jobHead;
+static _Atomic(job_t *) newJob;
+static bool hasTimerThread = false;
+static pthread_t timerThread;
+
+static pthread_t mainThread;
+
+#define DEFAULT_TIMER_TIMEOUT (60)
+
 static poll_list_t *listeners = NULL;
 
 /**
@@ -71,6 +90,12 @@ static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data);
 
 static void* server_asyncImageListLoad(void *data);
 
+static void* timerMainloop(void*);
+
+static int handlePendingJobs(void);
+
+static void queueJobInternal(job_t *job);
+
 /**
  * Print help text for usage instructions
  */
@@ -105,14 +130,21 @@ void dnbd3_printVersion()
 /**
  * Clean up structs, connections, write out data, then exit
  */
-void dnbd3_cleanup()
+_Noreturn static void dnbd3_cleanup()
 {
 	int retries;
 
 	_shutdown = true;
 	logadd( LOG_INFO, "Cleanup..." );
 
-	if ( listeners != NULL ) sock_destroyPollList( listeners );
+	if ( hasTimerThread ) {
+		pthread_kill( timerThread, SIGHUP );
+		thread_join( timerThread, NULL );
+	}
+
+	if ( listeners != NULL ) {
+		sock_destroyPollList( listeners );
+	}
 	listeners = NULL;
 
 	// Kill connection to all clients
@@ -172,6 +204,7 @@ int main(int argc, char *argv[])
 			{ 0, 0, 0, 0 }
 	};
 
+	mainThread = pthread_self();
 	opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
 
 	while ( opt != -1 ) {
@@ -195,8 +228,12 @@ int main(int argc, char *argv[])
 		case LONGOPT_CRC4:
 			return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE;
 		case LONGOPT_ASSERT:
+			printf( "Now leaking memory:\n" );
+			char *bla = malloc( 10 );
+			bla[2] = 3;
+			bla = NULL;
 			printf( "Testing use after free:\n" );
-			volatile char * volatile test = malloc( 10 );
+			char *test = malloc( 10 );
 			test[0] = 1;
 			free( (void*)test );
 			test[1] = 2;
@@ -303,11 +340,10 @@ int main(int argc, char *argv[])
 	}
 
 	// setup signal handler
-	struct sigaction sa;
-	memset( &sa, 0, sizeof(sa) );
-	sa.sa_sigaction = dnbd3_handleSignal2;
-	sa.sa_flags = SA_SIGINFO;
-	//sa.sa_mask = ;
+	struct sigaction sa = {
+		.sa_sigaction = dnbd3_handleSignal2,
+		.sa_flags = SA_SIGINFO,
+	};
 	sigaction( SIGTERM, &sa, NULL );
 	sigaction( SIGINT, &sa, NULL );
 	sigaction( SIGUSR1, &sa, NULL );
@@ -342,6 +378,10 @@ int main(int argc, char *argv[])
 
 	logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING );
 
+	if ( thread_create( &timerThread, NULL, &timerMainloop, NULL ) == 0 ) {
+		hasTimerThread = true;
+	}
+
 	// +++++++++++++++++++++++++++++++++++++++++++++++++++ main loop
 	struct sockaddr_storage client;
 	socklen_t len;
@@ -365,7 +405,7 @@ int main(int argc, char *argv[])
 		//
 		len = sizeof(client);
 		fd = sock_accept( listeners, &client, &len );
-		if ( fd < 0 ) {
+		if ( fd == -1 ) {
 			const int err = errno;
 			if ( err == EINTR || err == EAGAIN ) continue;
 			logadd( LOG_ERROR, "Client accept failure (err=%d)", err );
@@ -469,6 +509,8 @@ static void dnbd3_handleSignal(int signum)
 
 static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
 {
+	if ( !pthread_equal( pthread_self(), mainThread ) )
+		return;
 	memcpy( &lastSignal, info, sizeof(siginfo_t) );
 	dnbd3_handleSignal( signum );
 }
@@ -488,3 +530,85 @@ static void* server_asyncImageListLoad(void *data UNUSED)
 	return NULL;
 }
 
+static void* timerMainloop(void* stuff UNUSED)
+{
+	setThreadName( "timer" );
+	while ( !_shutdown ) {
+		// Handle jobs/timer events; returns timeout until next event
+		int to = handlePendingJobs();
+		sleep( MIN( MAX( 1, to ), DEFAULT_TIMER_TIMEOUT ) );
+	}
+	logadd( LOG_DEBUG1, "Timer thread done" );
+	return NULL;
+}
+
+static int handlePendingJobs(void)
+{
+	declare_now;
+	job_t *todo, **temp, *old;
+	int diff;
+	todo = jobHead;
+	for ( temp = &todo; *temp != NULL; temp = &(*temp)->next ) {
+		diff = (int)timing_diff( &now, &(*temp)->dueDate );
+		if ( diff > 0 ) // Found one that's in the future
+			break;
+	}
+	jobHead = *temp; // Make it list head
+	*temp = NULL; // Split off part before that
+	while ( todo != NULL ) {
+		threadpool_run( todo->startRoutine, todo->arg );
+		old = todo;
+		todo = todo->next;
+		if ( old->intervalSecs == 0 ) {
+			free( old ); // oneshot
+		} else {
+			timing_set( &old->dueDate, &now, old->intervalSecs );
+			queueJobInternal( old ); // repeated
+		}
+	}
+	// See if any new jobs have been queued
+	while ( newJob != NULL ) {
+		todo = newJob;
+		// NULL should never happen since we're the only consumer
+		assert( todo != NULL );
+		if ( !atomic_compare_exchange_weak( &newJob, &todo, NULL ) )
+			continue;
+		do {
+			old = todo;
+			todo = todo->next;
+			queueJobInternal( old );
+		} while ( todo != NULL );
+	}
+	// Return new timeout
+	if ( jobHead == NULL )
+		return DEFAULT_TIMER_TIMEOUT;
+	return (int)timing_diff( &now, &jobHead->dueDate );
+}
+
+static void queueJobInternal(job_t *job)
+{
+	assert( job != NULL );
+	job_t **it;
+	for ( it = &jobHead; *it != NULL; it = &(*it)->next ) {
+		if ( timing_1le2( &job->dueDate, &(*it)->dueDate ) )
+			break;
+	}
+	job->next = *it;
+	*it = job;
+}
+
+void server_addJob(void *(*startRoutine)(void *), void *arg, int delaySecs, int intervalSecs)
+{
+	declare_now;
+	job_t *new = malloc( sizeof(*new) );
+	new->startRoutine = startRoutine;
+	new->arg = arg;
+	new->intervalSecs = intervalSecs;
+	timing_set( &new->dueDate, &now, delaySecs );
+	for ( ;; ) {
+		new->next = newJob;
+		if ( atomic_compare_exchange_weak( &newJob, &new->next, new ) )
+			break;
+	}
+}
+
diff --git a/src/server/server.h b/src/server/server.h
index bab8421..a026eb6 100644
--- a/src/server/server.h
+++ b/src/server/server.h
@@ -24,8 +24,8 @@
 #include "globals.h"
 #include "../types.h"
 
-void dnbd3_cleanup();
 uint32_t dnbd3_serverUptime();
+void server_addJob(void *(*startRoutine)(void *), void *arg, int delaySecs, int intervalSecs);
 
 #if !defined(_FILE_OFFSET_BITS) || _FILE_OFFSET_BITS != 64
 #error Please set _FILE_OFFSET_BITS to 64 in your makefile/configuration
-- 
cgit v1.2.3-55-g7522


From f4e11e75fe72e9257f7086966a6a480e5f3684a6 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 28 Aug 2019 10:34:22 +0200
Subject: [SERVER] Handle closeUnusedFd via timer

---
 src/server/altservers.c |  8 --------
 src/server/image.c      | 36 +++++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 25 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 7d7fdbe..e088601 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -20,7 +20,6 @@
 static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
 static atomic_int numAltServers = 0;
 static pthread_mutex_t altServersLock;
-static ticks nextCloseUnusedFd; // TODO: Move away
 
 static void *altservers_runCheck(void *data);
 static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current);
@@ -380,13 +379,6 @@ static void *altservers_runCheck(void *data)
 	setThreadName( "altserver-check" );
 	altservers_findUplinkInternal( uplink );
 	ref_put( &uplink->reference ); // Acquired in findUplinkAsync
-	// Save cache maps of all images if applicable
-	// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
-	declare_now;
-	if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
-		timing_gets( &nextCloseUnusedFd, 900 );
-		image_closeUnusedFd();
-	}
 	return NULL;
 }
 
diff --git a/src/server/image.c b/src/server/image.c
index 5b58347..ace585b 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -54,6 +54,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force);
 static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
 static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map);
+static void* closeUnusedFds(void*);
 
 // ##########################################
 
@@ -63,6 +64,7 @@ void image_serverStartup()
 	mutex_init( &imageListLock, LOCK_IMAGE_LIST );
 	mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
 	mutex_init( &reloadLock, LOCK_RELOAD );
+	server_addJob( &closeUnusedFds, NULL, 10, 900 );
 }
 
 /**
@@ -1717,34 +1719,34 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
 	return false;
 }
 
-void image_closeUnusedFd()
+#define FDCOUNT (400)
+static void* closeUnusedFds(void* nix UNUSED)
 {
-	int fd, i;
+	if ( !_closeUnusedFd )
+		return NULL;
 	ticks deadline;
 	timing_gets( &deadline, -UNUSED_FD_TIMEOUT );
-	char imgstr[300];
+	int fds[FDCOUNT];
+	int fdindex = 0;
 	mutex_lock( &imageListLock );
-	for (i = 0; i < _num_images; ++i) {
+	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
 		if ( image == NULL )
 			continue;
-		mutex_lock( &image->lock );
 		if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
-			snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid );
-			fd = image->readFd;
-			image->readFd = -1;
-		} else {
-			fd = -1;
-		}
-		mutex_unlock( &image->lock );
-		if ( fd != -1 ) {
-			mutex_unlock( &imageListLock );
-			close( fd );
-			logadd( LOG_DEBUG1, "Inactive fd closed for %s", imgstr );
-			mutex_lock( &imageListLock );
+			logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid );
+			fds[fdindex++] = image->readFd;
+			image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock
+			if ( fdindex == FDCOUNT )
+				break;
 		}
 	}
 	mutex_unlock( &imageListLock );
+	// Do this after unlock since close might block
+	for ( int i = 0; i < fdindex; ++i ) {
+		close( fds[i] );
+	}
+	return NULL;
 }
 
 /*
-- 
cgit v1.2.3-55-g7522


From bf8fdccd296bb73154c5355ec6bdfd24fabe87d0 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 28 Aug 2019 10:36:05 +0200
Subject: [SERVER] Remove old comments

---
 src/server/altservers.c |  8 --------
 src/server/image.c      | 30 ------------------------------
 2 files changed, 38 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index e088601..ff3c95b 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -363,14 +363,6 @@ static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server)
 	mutex_unlock( &altServersLock );
 }
 
-/**
- * Mainloop of this module. It will wait for requests by uplinks to find a
- * suitable uplink server for them. If found, it will tell the uplink about
- * the best server found. Currently the RTT history is kept per server and
- * not per uplink, so if many images use the same uplink server, the history
- * will update quite quickly. Needs to be improved some time, ie. by only
- * updating the rtt if the last update was at least X seconds ago.
- */
 static void *altservers_runCheck(void *data)
 {
 	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
diff --git a/src/server/image.c b/src/server/image.c
index ace585b..de93cd4 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1748,33 +1748,3 @@ static void* closeUnusedFds(void* nix UNUSED)
 	}
 	return NULL;
 }
-
-/*
- void image_find_latest()
- {
- // Not in array or most recent rid is requested, try file system
- if (revision != 0) {
- // Easy case - specific RID
- char
- } else {
- // Determine base directory where the image in question has to reside.
- // Eg, the _basePath is "/srv/", requested image is "rz/ubuntu/default-13.04"
- // Then searchPath has to be set to "/srv/rz/ubuntu"
- char searchPath[strlen(_basePath) + len + 1];
- char *lastSlash = strrchr(name, '/');
- char *baseName; // Name of the image. In the example above, it will be "default-13.04"
- if ( lastSlash == NULL ) {
- *searchPath = '\0';
- baseName = name;
- } else {
- char *from = name, *to = searchPath;
- while (from < lastSlash) *to++ = *from++;
- *to = '\0';
- baseName = lastSlash + 1;
- }
- // Now we have the search path in our real file system and the expected image name.
- // The revision naming sceme is <IMAGENAME>.r<RID>, so if we're looking for revision 13,
- // our example image has to be named default-13.04.r13
- }
- }
- */
-- 
cgit v1.2.3-55-g7522


From ac1bf45ebdd630fbc9ad2c1fa3c0ea99f5206799 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 28 Aug 2019 13:07:13 +0200
Subject: [SERVER] Make signal handling more POSIX

According to POSIX, a signal sent to a PID can be delivered to an
arbitrary thread of that process that doesn't have the signal blocked.
This seems to never happen on Linux, but would mess things up since the code
expected the main signal handler to only be executed by the main thread.
This should now be fixed by examining the sender PID of the signal
as well as the ID of the thread currently running the signal handler. If
we notice the signal wasn't sent by our own PID and the handler is not
currently run by the main thread, we re-send the signal to the main
thread. Otherwise, if the signal was sent by our own PID but the handler
is not run in the main thread, do nothing. This way we can use
pthread_kill() to wake up threads that might be stuck in a blocking
syscall when it's time to shut down.
---
 src/server/globals.h    |  1 +
 src/server/image.c      | 10 ++--------
 src/server/integrity.c  | 17 +++++++++++++----
 src/server/net.c        | 11 ++++++-----
 src/server/rpc.c        | 13 ++++++++-----
 src/server/server.c     | 22 +++++++++++++++++-----
 src/server/threadpool.c | 28 ++++++++++++++++++++++------
 src/server/threadpool.h |  5 +++++
 8 files changed, 74 insertions(+), 33 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 5dd205a..f940666 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -138,6 +138,7 @@ struct _dnbd3_client
 	char hostName[HOSTNAMELEN];       // inet_ntop version of host
 	pthread_mutex_t sendMutex;        // Held while writing to sock if image is incomplete (since uplink uses socket too)
 	pthread_mutex_t lock;
+	pthread_t thread;
 };
 
 // #######################################################
diff --git a/src/server/image.c b/src/server/image.c
index de93cd4..248c12c 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -562,9 +562,7 @@ bool image_tryFreeAll()
 		if ( _images[i] != NULL && _images[i]->users == 0 ) {
 			dnbd3_image_t *image = _images[i];
 			_images[i] = NULL;
-			mutex_unlock( &imageListLock );
 			image = image_free( image );
-			mutex_lock( &imageListLock );
 		}
 		if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--;
 	}
@@ -574,15 +572,13 @@ bool image_tryFreeAll()
 
 /**
  * Free image. DOES NOT check if it's in use.
- * Indirectly locks on imageListLock, image.lock, uplink.queueLock
+ * (Indirectly) locks on image.lock, uplink.queueLock
  */
 static dnbd3_image_t* image_free(dnbd3_image_t *image)
 {
 	assert( image != NULL );
 	assert( image->users == 0 );
-	if ( !_shutdown ) {
-		logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid );
-	}
+	logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid );
 	// uplink_shutdown might return false to tell us
 	// that the shutdown is in progress. Bail out since
 	// this will get called again when the uplink is done.
@@ -600,8 +596,6 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	mutex_unlock( &image->lock );
 	if ( image->readFd != -1 ) close( image->readFd );
 	mutex_destroy( &image->lock );
-	//
-	memset( image, 0, sizeof(*image) );
 	free( image );
 	return NULL ;
 }
diff --git a/src/server/integrity.c b/src/server/integrity.c
index f358c46..e7ebeb2 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -184,13 +184,20 @@ static void* integrity_main(void * data UNUSED)
 							mutex_unlock( &image->lock );
 						}
 #if defined(linux) || defined(__linux)
-						if ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) {
+						while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
 #else
-						if ( fsync( fd ) == -1 ) {
+						while ( fsync( fd ) == -1 )
 #endif
-							logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path );
+						{
+							if ( _shutdown )
+								break;
+							if ( errno == EINTR )
+								continue;
+							logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, errno );
 							exit( 1 );
 						}
+						if ( _shutdown )
+							break;
 						// Use direct I/O only if read length is multiple of 4096 to be on the safe side
 						int tfd;
 						if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
@@ -266,7 +273,9 @@ static void* integrity_main(void * data UNUSED)
 		}
 	}
 	mutex_unlock( &integrityQueueLock );
-	if ( buffer != NULL ) free( buffer );
+	if ( buffer != NULL ) {
+		free( buffer );
+	}
 	bRunning = false;
 	return NULL;
 }
diff --git a/src/server/net.c b/src/server/net.c
index e0b516e..9c855e4 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -44,6 +44,7 @@
 #include <jansson.h>
 #include <inttypes.h>
 #include <stdatomic.h>
+#include <signal.h>
 
 static dnbd3_client_t *_clients[SERVER_MAX_CLIENTS];
 static int _num_clients = 0;
@@ -153,6 +154,7 @@ void* net_handleNewConnection(void *clientPtr)
 {
 	dnbd3_client_t * const client = (dnbd3_client_t *)clientPtr;
 	dnbd3_request_t request;
+	client->thread = pthread_self();
 
 	// Await data from client. Since this is a fresh connection, we expect data right away
 	sock_setTimeout( client->sock, _clientTimeout );
@@ -631,11 +633,10 @@ void net_disconnectAll()
 	int i;
 	mutex_lock( &_clients_lock );
 	for (i = 0; i < _num_clients; ++i) {
-		if ( _clients[i] == NULL ) continue;
-		dnbd3_client_t * const client = _clients[i];
-		mutex_lock( &client->lock );
-		if ( client->sock >= 0 ) shutdown( client->sock, SHUT_RDWR );
-		mutex_unlock( &client->lock );
+		if ( _clients[i] == NULL )
+			continue;
+		shutdown( _clients[i]->sock, SHUT_RDWR );
+		pthread_kill( _clients[i]->thread, SIGINT );
 	}
 	mutex_unlock( &_clients_lock );
 }
diff --git a/src/server/rpc.c b/src/server/rpc.c
index 261c6c0..662263e 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -137,13 +137,13 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 	bool hasName = false;
 	bool ok;
 	int keepAlive = HTTP_KEEPALIVE;
-	do {
+	while ( !_shutdown ) {
 		// Read request from client
 		struct phr_header headers[100];
 		size_t numHeaders, prevLen = 0, consumed;
 		struct string method, path;
 		int minorVersion;
-		do {
+		while ( !_shutdown ) {
 			// Parse before calling recv, there might be a complete pipelined request in the buffer already
 			// If the request is incomplete, we allow exactly one additional recv() to complete it.
 			// This should suffice for real world scenarios as I don't know of any
@@ -188,7 +188,9 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 				sendReply( sock, "400 Bad Request", "text/plain", "Server cannot understand what you're trying to say", -1, HTTP_CLOSE );
 				goto func_return;
 			}
-		} while ( true );
+		} // Loop while request header incomplete
+		if ( _shutdown )
+			break;
 		if ( keepAlive == HTTP_KEEPALIVE ) {
 			// Only keep the connection alive (and indicate so) if the client seems to support this
 			if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) {
@@ -213,7 +215,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			} else {
 				ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive );
 			}
-			if ( !ok ) break;
+			if ( !ok )
+				break;
 		}
 		// hoff might be beyond end if the client sent another request (burst)
 		const ssize_t extra = hoff - consumed;
@@ -225,7 +228,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			hasName = true;
 			setThreadName( "HTTP" );
 		}
-	} while (true);
+	} // Loop while more requests
 func_return:;
 	do {
 		const int curCount = --status.count;
diff --git a/src/server/server.c b/src/server/server.c
index 1cdd2ab..0dddea7 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -37,6 +37,8 @@
 #include <signal.h>
 #include <getopt.h>
 #include <assert.h>
+#include <sys/types.h>
+#include <unistd.h>
 
 #define LONGOPT_CRC4       1000
 #define LONGOPT_ASSERT     1001
@@ -60,6 +62,7 @@ static _Atomic(job_t *) newJob;
 static bool hasTimerThread = false;
 static pthread_t timerThread;
 
+static pid_t mainPid;
 static pthread_t mainThread;
 
 #define DEFAULT_TIMER_TIMEOUT (60)
@@ -138,7 +141,7 @@ _Noreturn static void dnbd3_cleanup()
 	logadd( LOG_INFO, "Cleanup..." );
 
 	if ( hasTimerThread ) {
-		pthread_kill( timerThread, SIGHUP );
+		pthread_kill( timerThread, SIGINT );
 		thread_join( timerThread, NULL );
 	}
 
@@ -162,6 +165,8 @@ _Noreturn static void dnbd3_cleanup()
 	// Wait for clients to disconnect
 	net_waitForAllDisconnected();
 
+	threadpool_waitEmpty();
+
 	// Clean up images
 	retries = 5;
 	while ( !image_tryFreeAll() && --retries > 0 ) {
@@ -204,6 +209,7 @@ int main(int argc, char *argv[])
 			{ 0, 0, 0, 0 }
 	};
 
+	mainPid = getpid();
 	mainThread = pthread_self();
 	opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
 
@@ -509,10 +515,16 @@ static void dnbd3_handleSignal(int signum)
 
 static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
 {
-	if ( !pthread_equal( pthread_self(), mainThread ) )
-		return;
-	memcpy( &lastSignal, info, sizeof(siginfo_t) );
-	dnbd3_handleSignal( signum );
+	if ( info->si_pid != mainPid ) { // Source is not this process
+		memcpy( &lastSignal, info, sizeof(siginfo_t) ); // Copy signal info
+		if ( info->si_pid != 0 && !pthread_equal( pthread_self(), mainThread ) ) {
+			pthread_kill( mainThread, info->si_signo ); // And relay signal if we're not the main thread
+		}
+	}
+	if ( pthread_equal( pthread_self(), mainThread ) ) {
+		// Signal received by main thread -- handle
+		dnbd3_handleSignal( signum );
+	}
 }
 
 uint32_t dnbd3_serverUptime()
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 3947677..0b46fd6 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -15,6 +15,7 @@ static void *threadpool_worker(void *entryPtr);
 static pthread_attr_t threadAttrs;
 static atomic_int maxIdleThreads = -1;
 static _Atomic(entry_t *) *pool = NULL;
+static atomic_int activeThreads = 0;
 
 bool threadpool_init(int maxIdle)
 {
@@ -34,10 +35,9 @@ bool threadpool_init(int maxIdle)
 
 void threadpool_close()
 {
-	_shutdown = true;
-	int max = maxIdleThreads;
-	maxIdleThreads = -1;
-	if ( max <= 0 ) return;
+	int max = atomic_exchange( &maxIdleThreads, -1 );
+	if ( max <= 0 )
+		return;
 	for ( int i = 0; i < max; ++i ) {
 		entry_t *cur = pool[i];
 		if ( cur != NULL && atomic_compare_exchange_strong( &pool[i], &cur, NULL ) ) {
@@ -46,9 +46,23 @@ void threadpool_close()
 	}
 }
 
+void threadpool_waitEmpty()
+{
+	if ( activeThreads == 0 )
+		return;
+	do {
+		sleep( 1 );
+		logadd( LOG_INFO, "Threadpool: %d threads still active", (int)activeThreads );
+	} while ( activeThreads != 0 );
+}
+
 bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 {
-	if ( startRoutine == NULL ) {
+	if ( unlikely( _shutdown ) ) {
+		logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
+		return false;
+	}
+	if ( unlikely( startRoutine == NULL ) ) {
 		logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
 		return false; // Or bail out!?
 	}
@@ -60,7 +74,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 			break;
 		}
 	}
-	if ( entry == NULL ) {
+	if ( unlikely( entry == NULL ) ) {
 		entry = malloc( sizeof(entry_t) );
 		if ( entry == NULL ) {
 			logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" );
@@ -78,6 +92,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 			free( entry );
 			return false;
 		}
+		activeThreads++;
 	}
 	entry->startRoutine = startRoutine;
 	entry->arg = arg;
@@ -130,6 +145,7 @@ keep_going:;
 	}
 	signal_close( entry->signal );
 	free( entry );
+	activeThreads--;
 	return NULL;
 }
 
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
index 15dd151..ee0b3aa 100644
--- a/src/server/threadpool.h
+++ b/src/server/threadpool.h
@@ -17,6 +17,11 @@ bool threadpool_init(int maxIdleThreadCount);
  */
 void threadpool_close();
 
+/**
+ * Block until all threads spawned have exited
+ */
+void threadpool_waitEmpty();
+
 /**
  * Run a thread using the thread pool.
  * @param startRoutine function to run in new thread
-- 
cgit v1.2.3-55-g7522


From f8136a0b92a9293e7eb71aea27d9da6b6a163d94 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 28 Aug 2019 21:06:45 +0200
Subject: [SERVER] Reintroduce check whether readFd is actually != -1

---
 src/server/image.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 248c12c..4eab1d2 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1725,8 +1725,10 @@ static void* closeUnusedFds(void* nix UNUSED)
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
-		if ( image == NULL )
+		if ( image == NULL || image->readFd == -1 )
 			continue;
+		// TODO: Also close for idle uplinks (uplink_connectionShouldShutdown)
+		// TODO: And close writeFd for idle uplinks....
 		if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
 			logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid );
 			fds[fdindex++] = image->readFd;
-- 
cgit v1.2.3-55-g7522


From 0fb5ec7152c79d10711139158533f96204755788 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 28 Aug 2019 22:14:34 +0200
Subject: [SERVER] Speed up shutdown of integrity checker

---
 src/server/integrity.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/server/integrity.c b/src/server/integrity.c
index e7ebeb2..1fcb558 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -13,6 +13,8 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
 
 #define CHECK_QUEUE_SIZE 200
 
@@ -56,13 +58,14 @@ void integrity_init()
 void integrity_shutdown()
 {
 	assert( queueLen != -1 );
+	if ( !bRunning )
+		return;
 	logadd( LOG_DEBUG1, "Shutting down integrity checker...\n" );
+	pthread_kill( thread, SIGINT );
 	mutex_lock( &integrityQueueLock );
 	pthread_cond_signal( &queueSignal );
 	mutex_unlock( &integrityQueueLock );
 	thread_join( thread, NULL );
-	while ( bRunning )
-		usleep( 10000 );
 	mutex_destroy( &integrityQueueLock );
 	pthread_cond_destroy( &queueSignal );
 	logadd( LOG_DEBUG1, "Integrity checker exited normally.\n" );
-- 
cgit v1.2.3-55-g7522


From 8d2c5cae4bd68d7015a438ca6f1c23a02f2f8203 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 29 Aug 2019 14:48:12 +0200
Subject: [BENCH] Request random blocks

---
 src/bench/connection.c | 5 +++--
 src/bench/connection.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/bench/connection.c b/src/bench/connection.c
index 498bc62..26be440 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -41,7 +41,7 @@ bool connection_init_n_times(
 		const char *lowerImage,
 		const uint16_t rid,
 		int ntimes,
-		int blockSize,
+		uint64_t blockSize,
 		BenchCounters* counters
 		) {
 	for (int run_i = 0; run_i < ntimes; ++run_i) {
@@ -96,7 +96,8 @@ bool connection_init_n_times(
 			} else if ( rid != 0 && rid != remoteRid ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "rid mismatch" );
-			} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
+			//} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
+			} else if ( !dnbd3_get_block( sock, (((uint64_t)rand()) << 16 + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "send: get block failed" );
 			} else if ( !dnbd3_get_reply( sock, &reply ) ) {
diff --git a/src/bench/connection.h b/src/bench/connection.h
index 69207ff..770bf0d 100644
--- a/src/bench/connection.h
+++ b/src/bench/connection.h
@@ -19,7 +19,7 @@ typedef struct _dnbd3_async {
 } dnbd3_async_t;
 
 
-bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, int blockSize, BenchCounters* counters);
+bool connection_init_n_times(const char *hosts, const char *image, const uint16_t rid, int ntimes, uint64_t blockSize, BenchCounters* counters);
 
 bool connection_init(const char *hosts, const char *image, const uint16_t rid);
 
-- 
cgit v1.2.3-55-g7522


From 291eba00d392e17925576ead20b781d774e68134 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 29 Aug 2019 14:48:58 +0200
Subject: [SERVER] reference: Fix error msg usage

---
 src/server/reference.c | 2 +-
 src/server/reference.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/server/reference.c b/src/server/reference.c
index 468e00b..64109ca 100644
--- a/src/server/reference.c
+++ b/src/server/reference.c
@@ -13,7 +13,7 @@ void ref_init( ref *reference, void ( *freefun )( ref * ), long count )
 
 _Noreturn void _ref_error( const char *message )
 {
-	fprintf( stderr, "Reference counter overflow\n" );
+	fprintf( stderr, "%s\n", message );
 	abort();
 }
 
diff --git a/src/server/reference.h b/src/server/reference.h
index 0bc081a..8883eb1 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -27,7 +27,7 @@ static inline ref *ref_get( weakref *weakref )
 	} while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) );
 	struct _ref_ *ref = aligned_ref( old_weakref )->ref;
 	if ( unlikely( ++ref->count == -1 ) ) {
-		_ref_error( "Reference counter overflow. Aborting.\n" );
+		_ref_error( "Reference counter overflow. Aborting." );
 	}
 	char *cur_weakref = ( char * )*weakref;
 	do {
-- 
cgit v1.2.3-55-g7522


From 88695877f085af475a6ca8a01c2fbb08eb5b15da Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 29 Aug 2019 14:49:18 +0200
Subject: [SERVER] Use weakref for cache maps

Gets rid of a bunch of locking, especially the hot path in net.c where
clients are requesting data. Many clients using the same incomplete
image previously created a bottleneck here.
---
 src/server/globals.h   |  10 ++-
 src/server/image.c     | 208 +++++++++++++++++++++++++++++++------------------
 src/server/image.h     |   2 +-
 src/server/integrity.c |  10 ++-
 src/server/net.c       |  81 +++++++++----------
 src/server/reference.h |   5 ++
 src/server/uplink.c    |  64 +++++++--------
 7 files changed, 220 insertions(+), 160 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index f940666..221af78 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -99,6 +99,12 @@ typedef struct
 	int permissions;
 } dnbd3_access_rule_t;
 
+typedef struct
+{
+	ref reference;
+	atomic_uint_least8_t map[];
+} dnbd3_cache_map_t;
+
 /**
  * Image struct. An image path could be something like
  * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1
@@ -110,7 +116,7 @@ struct _dnbd3_image
 	char *path;            // absolute path of the image
 	char *name;            // public name of the image (usually relative path minus revision ID)
 	weakref uplinkref;     // pointer to a server connection
-	uint8_t *cache_map;    // cache map telling which parts are locally cached, NULL if complete
+	weakref ref_cacheMap;  // cache map telling which parts are locally cached, NULL if complete
 	uint64_t virtualFilesize;   // virtual size of image (real size rounded up to multiple of 4k)
 	uint64_t realFilesize;      // actual file size on disk
 	ticks atime;                // last access time
@@ -119,7 +125,7 @@ struct _dnbd3_image
 	uint32_t *crc32;       // list of crc32 checksums for each 16MiB block in image
 	uint32_t masterCrc32;  // CRC-32 of the crc-32 list
 	int readFd;            // used to read the image. Used from multiple threads, so use atomic operations (pread et al)
-	int completenessEstimate; // Completeness estimate in percent
+	atomic_int completenessEstimate; // Completeness estimate in percent
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
 	atomic_bool working;   // true if image exists and completeness is == 100% or a working upstream proxy is connected
diff --git a/src/server/image.c b/src/server/image.c
index 4eab1d2..1972f48 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -51,10 +51,18 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS
 static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
 static bool image_ensureDiskSpace(uint64_t size, bool force);
 
-static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
+static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map);
+static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map);
 static void* closeUnusedFds(void*);
+static void allocCacheMap(dnbd3_image_t *image, bool complete);
+
+static void cmfree(ref *ref)
+{
+	dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference);
+	logadd( LOG_DEBUG2, "Freeing a cache map" );
+	free( cache );
+}
 
 // ##########################################
 
@@ -70,7 +78,6 @@ void image_serverStartup()
 /**
  * Update cache-map of given image for the given byte range
  * start (inclusive) - end (exclusive)
- * Locks on: images[].lock
  */
 void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set)
 {
@@ -91,33 +98,55 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 	if ( start >= end )
 		return;
 	bool setNewBlocks = false;
-	uint64_t pos = start;
-	mutex_lock( &image->lock );
-	if ( image->cache_map == NULL ) {
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL ) {
 		// Image seems already complete
 		if ( set ) {
 			// This makes no sense
-			mutex_unlock( &image->lock );
-			logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path );
+			logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path );
 			return;
 		}
 		// Recreate a cache map, set it to all 1 initially as we assume the image was complete
-		const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
-		image->cache_map = malloc( byteSize );
-		memset( image->cache_map, 0xff, byteSize );
-	}
-	while ( pos < end ) {
-		const size_t map_y = (int)( pos >> 15 );
-		const int map_x = (int)( (pos >> 12) & 7 ); // mod 8
-		const int bit_mask = 1 << map_x;
-		if ( set ) {
-			if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true;
-			image->cache_map[map_y] |= (uint8_t)bit_mask;
-		} else {
-			image->cache_map[map_y] &= (uint8_t)~bit_mask;
+		allocCacheMap( image, true );
+		cache = ref_get_cachemap( image );
+		if ( cache == NULL ) {
+			logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! %s", image->path );
+			return;
 		}
-		pos += DNBD3_BLOCK_SIZE;
 	}
+	// Set/unset
+	const uint64_t firstByteInMap = start >> 15;
+	const uint64_t lastByteInMap = (end - 1) >> 15;
+	uint64_t pos;
+	// First byte
+	uint8_t fb = 0, lb = 0;
+	for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
+		const int map_x = (pos >> 12) & 7; // mod 8
+		const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+		fb |= bit_mask;
+	}
+	// Last byte
+	for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+		const int map_x = (pos >> 12) & 7; // mod 8
+		const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+		lb |= bit_mask;
+	}
+	if ( set ) {
+		uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
+		uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
+		setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] );
+	} else {
+		atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
+		atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+	}
+	const uint8_t nval = set ? 0xff : 0;
+	// Everything in between
+	for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+		if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+			setNewBlocks = true;
+		}
+	}
+	atomic_thread_fence( memory_order_release );
 	if ( setNewBlocks && image->crc32 != NULL ) {
 		// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
 		// for checking, even though this might lead to checking some hash block again, if it was
@@ -125,19 +154,14 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 		// First set start and end to borders of hash blocks
 		start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1);
 		end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1);
-		pos = start;
-		while ( pos < end ) {
-			if ( image->cache_map == NULL ) break;
+		for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) {
 			const int block = (int)( pos / HASH_BLOCK_SIZE );
-			if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) {
-				mutex_unlock( &image->lock );
+			if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) {
 				integrity_check( image, block );
-				mutex_lock( &image->lock );
 			}
-			pos += HASH_BLOCK_SIZE;
 		}
 	}
-	mutex_unlock( &image->lock );
+	ref_put( &cache->reference );
 }
 
 /**
@@ -149,20 +173,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 bool image_isComplete(dnbd3_image_t *image)
 {
 	assert( image != NULL );
-	mutex_lock( &image->lock );
 	if ( image->virtualFilesize == 0 ) {
-		mutex_unlock( &image->lock );
 		return false;
 	}
-	if ( image->cache_map == NULL ) {
-		mutex_unlock( &image->lock );
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL ) {
 		return true;
 	}
 	bool complete = true;
 	int j;
 	const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	for (j = 0; j < map_len_bytes - 1; ++j) {
-		if ( image->cache_map[j] != 0xFF ) {
+		if ( cache->map[j] != 0xFF ) {
 			complete = false;
 			break;
 		}
@@ -177,18 +199,27 @@ bool image_isComplete(dnbd3_image_t *image)
 			for (j = 0; j < blocks_in_last_byte; ++j)
 				last_byte |= (uint8_t)(1 << j);
 		}
-		complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte);
+		complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte);
 	}
-	if ( !complete ) {
-		mutex_unlock( &image->lock );
+	ref_put( &cache->reference );
+	if ( !complete )
 		return false;
+	mutex_lock( &image->lock );
+	// Lock and make sure current cache map is still the one we saw complete
+	dnbd3_cache_map_t *current = ref_get_cachemap( image );
+	if ( current == cache ) {
+		// Set cache map NULL as it's complete
+		ref_setref( &image->ref_cacheMap, NULL );
+	}
+	if ( current != NULL ) {
+		ref_put( &current->reference );
 	}
-	char mapfile[PATHLEN] = "";
-	free( image->cache_map );
-	image->cache_map = NULL;
-	snprintf( mapfile, PATHLEN, "%s.map", image->path );
 	mutex_unlock( &image->lock );
-	unlink( mapfile );
+	if ( current == cache ) { // Successfully set cache map to NULL above
+		char mapfile[PATHLEN] = "";
+		snprintf( mapfile, PATHLEN, "%s.map", image->path );
+		unlink( mapfile );
+	}
 	return true;
 }
 
@@ -350,19 +381,18 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		img->rid = candidate->rid;
 		img->users = 1;
 		img->working = false;
+		img->ref_cacheMap = NULL;
 		mutex_init( &img->lock, LOCK_IMAGE );
 		if ( candidate->crc32 != NULL ) {
 			const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t);
 			img->crc32 = malloc( mb );
 			memcpy( img->crc32, candidate->crc32, mb );
 		}
-		mutex_lock( &candidate->lock );
-		if ( candidate->cache_map != NULL ) {
-			const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize );
-			img->cache_map = malloc( mb );
-			memcpy( img->cache_map, candidate->cache_map, mb );
+		dnbd3_cache_map_t *cache = ref_get_cachemap( candidate );
+		if ( cache != NULL ) {
+			ref_setref( &img->ref_cacheMap, &cache->reference );
+			ref_put( &cache->reference );
 		}
-		mutex_unlock( &candidate->lock );
 		if ( image_addToList( img ) ) {
 			image_release( candidate );
 			candidate = img;
@@ -377,7 +407,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 	}
 
 	// Check if image is incomplete, handle
-	if ( candidate->cache_map != NULL ) {
+	if ( candidate->ref_cacheMap != NULL ) {
 		uplink_init( candidate, -1, NULL, -1 );
 	}
 
@@ -585,11 +615,10 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	if ( !uplink_shutdown( image ) )
 		return NULL;
 	mutex_lock( &image->lock );
-	free( image->cache_map );
+	ref_setref( &image->ref_cacheMap, NULL );
 	free( image->crc32 );
 	free( image->path );
 	free( image->name );
-	image->cache_map = NULL;
 	image->crc32 = NULL;
 	image->path = NULL;
 	image->name = NULL;
@@ -600,7 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	return NULL ;
 }
 
-bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
+bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
 {
 	if ( cacheMap == NULL ) return true;
 	const uint64_t end = (block + 1) * HASH_BLOCK_SIZE;
@@ -707,7 +736,7 @@ static bool image_load(char *base, char *path, int withUplink)
 {
 	int revision = -1;
 	struct stat st;
-	uint8_t *cache_map = NULL;
+	dnbd3_cache_map_t *cache = NULL;
 	uint32_t *crc32list = NULL;
 	dnbd3_image_t *existing = NULL;
 	int fdImage = -1;
@@ -790,7 +819,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	}
 
 	// 1. Allocate memory for the cache map if the image is incomplete
-	cache_map = image_loadCacheMap( path, virtualFilesize );
+	cache = image_loadCacheMap( path, virtualFilesize );
 
 	// XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented)
 
@@ -802,7 +831,7 @@ static bool image_load(char *base, char *path, int withUplink)
 
 	// Check CRC32
 	if ( crc32list != NULL ) {
-		if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) {
+		if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache != NULL ? cache->map : NULL ) ) {
 			logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path );
 			doFullCheck = true;
 		}
@@ -826,7 +855,7 @@ static bool image_load(char *base, char *path, int withUplink)
 			crc32list = NULL;
 			function_return = true;
 			goto load_error; // Keep existing
-		} else if ( existing->cache_map != NULL && cache_map == NULL ) {
+		} else if ( existing->ref_cacheMap != NULL && cache == NULL ) {
 			// Just ignore that fact, if replication is really complete the cache map will be removed anyways
 			logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
 			function_return = true;
@@ -846,7 +875,8 @@ static bool image_load(char *base, char *path, int withUplink)
 	dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) );
 	image->path = strdup( path );
 	image->name = strdup( imgName );
-	image->cache_map = cache_map;
+	image->ref_cacheMap = NULL;
+	ref_setref( &image->ref_cacheMap, &cache->reference );
 	image->crc32 = crc32list;
 	image->masterCrc32 = masterCrc;
 	image->uplinkref = NULL;
@@ -855,7 +885,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->rid = (uint16_t)revision;
 	image->users = 0;
 	image->readFd = -1;
-	image->working = (image->cache_map == NULL );
+	image->working = ( cache == NULL );
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
 	mutex_init( &image->lock, LOCK_IMAGE );
@@ -870,16 +900,16 @@ static bool image_load(char *base, char *path, int withUplink)
 	timing_gets( &image->atime, offset );
 
 	// Prevent freeing in cleanup
-	cache_map = NULL;
+	cache = NULL;
 	crc32list = NULL;
 
 	// Get rid of cache map if image is complete
-	if ( image->cache_map != NULL ) {
+	if ( image->ref_cacheMap != NULL ) {
 		image_isComplete( image );
 	}
 
 	// Image is definitely incomplete, initialize uplink worker
-	if ( image->cache_map != NULL ) {
+	if ( image->ref_cacheMap != NULL ) {
 		image->working = false;
 		if ( withUplink ) {
 			uplink_init( image, -1, NULL, -1 );
@@ -910,21 +940,22 @@ static bool image_load(char *base, char *path, int withUplink)
 load_error: ;
 	if ( existing != NULL ) existing = image_release( existing );
 	if ( crc32list != NULL ) free( crc32list );
-	if ( cache_map != NULL ) free( cache_map );
+	if ( cache != NULL ) free( cache );
 	if ( fdImage != -1 ) close( fdImage );
 	return function_return;
 }
 
-static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
+static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
 {
-	uint8_t *retval = NULL;
+	dnbd3_cache_map_t *retval = NULL;
 	char mapFile[strlen( imagePath ) + 10 + 1];
 	sprintf( mapFile, "%s.map", imagePath );
 	int fdMap = open( mapFile, O_RDONLY );
-	if ( fdMap >= 0 ) {
+	if ( fdMap != -1 ) {
 		const int map_size = IMGSIZE_TO_MAPBYTES( fileSize );
-		retval = calloc( 1, map_size );
-		const ssize_t rd = read( fdMap, retval, map_size );
+		retval = calloc( 1, sizeof(*retval) + map_size );
+		ref_init( &retval->reference, cmfree, 0 );
+		const ssize_t rd = read( fdMap, retval->map, map_size );
 		if ( map_size != rd ) {
 			logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath );
 			// Could not read complete map, that means the rest of the image file will be considered incomplete
@@ -985,7 +1016,7 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
 	return retval;
 }
 
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map)
+static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map)
 {
 	// This checks the first block and (up to) count - 1 random blocks for corruption
 	// via the known crc32 list. This is very sloppy and is merely supposed to detect
@@ -1529,30 +1560,37 @@ json_t* image_getListAsJson()
 /**
  * Get completeness of an image in percent. Only estimated, not exact.
  * Returns: 0-100
- * DOES NOT LOCK, so make sure to do so before calling
  */
 int image_getCompletenessEstimate(dnbd3_image_t * const image)
 {
 	assert( image != NULL );
-	if ( image->cache_map == NULL ) return image->working ? 100 : 0;
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL )
+		return image->working ? 100 : 0;
+	const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	if ( unlikely( len == 0 ) ) {
+		ref_put( &cache->reference );
+		return 0;
+	}
 	declare_now;
 	if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) {
 		// Since this operation is relatively expensive, we cache the result for a while
+		ref_put( &cache->reference );
 		return image->completenessEstimate;
 	}
 	int i;
 	int percent = 0;
-	const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
-	if ( len == 0 ) return 0;
 	for ( i = 0; i < len; ++i ) {
-		if ( image->cache_map[i] == 0xff ) {
+		const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed );
+		if ( v == 0xff ) {
 			percent += 100;
-		} else if ( image->cache_map[i] != 0 ) {
+		} else if ( v != 0 ) {
 			percent += 50;
 		}
 	}
+	ref_put( &cache->reference );
 	image->completenessEstimate = percent / len;
-	timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 );
+	timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 );
 	return image->completenessEstimate;
 }
 
@@ -1744,3 +1782,21 @@ static void* closeUnusedFds(void* nix UNUSED)
 	}
 	return NULL;
 }
+
+static void allocCacheMap(dnbd3_image_t *image, bool complete)
+{
+	const uint8_t val = complete ? 0xff : 0;
+	const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize );
+	ref_init( &cache->reference, cmfree, 0 );
+	memset( cache->map, val, byteSize );
+	mutex_lock( &image->lock );
+	if ( image->ref_cacheMap != NULL ) {
+		logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid );
+		free( cache );
+	} else {
+		ref_setref( &image->ref_cacheMap, &cache->reference );
+	}
+	mutex_unlock( &image->lock );
+}
+
diff --git a/src/server/image.h b/src/server/image.h
index 4668eff..cd87f03 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -9,7 +9,7 @@ void image_serverStartup();
 
 bool image_isComplete(dnbd3_image_t *image);
 
-bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
+bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
 
 void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set);
 
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 1fcb558..a9fbae6 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -181,10 +181,12 @@ static void* integrity_main(void * data UNUSED)
 						const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
 						bool complete = true;
 						if ( qCount == CHECK_ALL ) {
-							// When checking full image, skip incomplete blocks, otherwise assume block is complete
-							mutex_lock( &image->lock );
-							complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize );
-							mutex_unlock( &image->lock );
+							dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+							if ( cache != NULL ) {
+								// When checking full image, skip incomplete blocks, otherwise assume block is complete
+								complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
+								ref_put( &cache->reference );
+							}
 						}
 #if defined(linux) || defined(__linux)
 						while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
diff --git a/src/server/net.c b/src/server/net.c
index 9c855e4..12bcdad 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -246,7 +246,7 @@ void* net_handleNewConnection(void *clientPtr)
 				// We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does...
 				// Reject, as this would basically force this proxy to do BGR too.
 				image = image_get( image_name, rid, true );
-				if ( image != NULL && image->cache_map != NULL ) {
+				if ( image != NULL && image->ref_cacheMap != NULL ) {
 					// Only exception is if the image is complete locally
 					image = image_release( image );
 				}
@@ -268,7 +268,7 @@ void* net_handleNewConnection(void *clientPtr)
 			} else {
 				// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
 				bOk = true;
-				if ( image->cache_map != NULL ) {
+				if ( image->ref_cacheMap != NULL ) {
 					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
 					if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
 						bOk = ( rand() % 4 ) == 1;
@@ -338,57 +338,52 @@ void* net_handleNewConnection(void *clientPtr)
 					break;
 				}
 
-				if ( request.size != 0 && image->cache_map != NULL ) {
+				dnbd3_cache_map_t *cache;
+				if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) {
 					// This is a proxyed image, check if we need to relay the request...
 					start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					bool isCached = true;
-					mutex_lock( &image->lock );
-					// Check again as we only aquired the lock just now
-					if ( image->cache_map != NULL ) {
-						const uint64_t firstByteInMap = start >> 15;
-						const uint64_t lastByteInMap = (end - 1) >> 15;
-						uint64_t pos;
-						// Middle - quick checking
-						if ( isCached ) {
-							pos = firstByteInMap + 1;
-							while ( pos < lastByteInMap ) {
-								if ( image->cache_map[pos] != 0xff ) {
-									isCached = false;
-									break;
-								}
-								++pos;
+					const uint64_t firstByteInMap = start >> 15;
+					const uint64_t lastByteInMap = (end - 1) >> 15;
+					uint64_t pos;
+					uint8_t b;
+					atomic_thread_fence( memory_order_acquire );
+					// Middle - quick checking
+					if ( isCached ) {
+						for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+							if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+								isCached = false;
+								break;
 							}
 						}
-						// First byte
-						if ( isCached ) {
-							pos = start;
-							do {
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) {
-									isCached = false;
-									break;
-								}
-								pos += DNBD3_BLOCK_SIZE;
-							} while ( firstByteInMap == (pos >> 15) && pos < end );
+					}
+					// First byte
+					if ( isCached ) {
+						b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+						for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
+							const int map_x = (pos >> 12) & 7; // mod 8
+							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+							if ( (b & bit_mask) == 0 ) {
+								isCached = false;
+								break;
+							}
 						}
-						// Last byte - only check if request spans multiple bytes in cache map
-						if ( isCached && firstByteInMap != lastByteInMap ) {
-							pos = lastByteInMap << 15;
-							while ( pos < end ) {
-								assert( lastByteInMap == (pos >> 15) );
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) {
-									isCached = false;
-									break;
-								}
-								pos += DNBD3_BLOCK_SIZE;
+					}
+					// Last byte - only check if request spans multiple bytes in cache map
+					if ( isCached && firstByteInMap != lastByteInMap ) {
+						b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+						for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+							assert( lastByteInMap == (pos >> 15) );
+							const int map_x = (pos >> 12) & 7; // mod 8
+							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+							if ( (b & bit_mask) == 0 ) {
+								isCached = false;
+								break;
 							}
 						}
 					}
-					mutex_unlock( &image->lock );
+					ref_put( &cache->reference );
 					if ( !isCached ) {
 						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
 							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
diff --git a/src/server/reference.h b/src/server/reference.h
index 8883eb1..2a80955 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -51,4 +51,9 @@ static inline void ref_put( ref *ref )
 	ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \
 })
 
+#define ref_get_cachemap(image) ({ \
+	ref* ref = ref_get( &(image)->ref_cacheMap ); \
+	ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \
+})
+
 #endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index d77be9c..0a6bd11 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -91,7 +91,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		ref_put( &uplink->reference );
 		return true; // There's already an uplink, so should we consider this success or failure?
 	}
-	if ( image->cache_map == NULL ) {
+	if ( image->ref_cacheMap == NULL ) {
 		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
 		goto failure;
 	}
@@ -170,7 +170,7 @@ bool uplink_shutdown(dnbd3_image_t *image)
 	mutex_unlock( &uplink->queueLock );
 	bool retval = ( exp && image->users == 0 );
 	mutex_unlock( &image->lock );
-	return exp;
+	return retval;
 }
 
 /**
@@ -214,7 +214,7 @@ static void uplink_free(ref *ref)
 	dnbd3_image_t *image = image_lock( uplink->image );
 	if ( image != NULL ) {
 		// != NULL means image is still in list...
-		if ( !_shutdown && image->cache_map != NULL ) {
+		if ( !_shutdown && image->ref_cacheMap != NULL ) {
 			// Ingegrity checker must have found something in the meantime
 			uplink_init( image, -1, NULL, 0 );
 		}
@@ -707,13 +707,14 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 	if ( uplink == NULL || uplink->current.fd == -1 ) return;
 	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
 	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
-		return;
+		return; // Already a replication request on the wire, or no more blocks to replicate
 	dnbd3_image_t * const image = uplink->image;
 	if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
-	mutex_lock( &image->lock );
-	if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) {
-		// No cache map (=image complete), or replication pending, or not enough users, do nothing
-		mutex_unlock( &image->lock );
+	if ( image->users < _bgrMinClients ) return; // Not enough active users
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL || image->users < _bgrMinClients ) {
+		// No cache map (=image complete)
+		ref_put( &cache->reference );
 		return;
 	}
 	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
@@ -727,16 +728,18 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 			endByte = mapBytes;
 		}
 	}
+	atomic_thread_fence( memory_order_acquire );
 	int replicationIndex = -1;
 	for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
 		const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
-		if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+		if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+				&& ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
 			// Found incomplete one
 			replicationIndex = i;
 			break;
 		}
 	}
-	mutex_unlock( &image->lock );
+	ref_put( &cache->reference );
 	if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
 		// Nothing left in current block, find next one
 		replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
@@ -768,23 +771,24 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 }
 
 /**
- * find next index into cache_map that corresponds to the beginning
+ * find next index into cache map that corresponds to the beginning
  * of a hash block which is neither completely empty nor completely
  * replicated yet. Returns -1 if no match.
  */
 static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
 {
 	int retval = -1;
-	mutex_lock( &uplink->image->lock );
-	const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize );
-	const uint8_t *cache_map = uplink->image->cache_map;
-	if ( cache_map != NULL ) {
-		int j;
+	dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image );
+	if ( cache != NULL ) {
+		const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize );
 		const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK );
+		atomic_thread_fence( memory_order_acquire );
+		int j;
 		for (j = 0; j < mapBytes; ++j) {
 			const int i = ( start + j ) % mapBytes;
-			const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock );
-			const bool isEmpty = cache_map[i] == 0;
+			const uint8_t b = atomic_load_explicit( &cache->map[i], memory_order_relaxed );
+			const bool isFull = b == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock );
+			const bool isEmpty = b == 0;
 			if ( !isEmpty && !isFull ) {
 				// Neither full nor empty, replicate
 				if ( retval == -1 ) {
@@ -811,7 +815,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 			retval = -1;
 		}
 	}
-	mutex_unlock( &uplink->image->lock );
+	ref_put( &cache->reference );
 	return retval;
 }
 
@@ -1107,7 +1111,7 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		if ( fsync( uplink->cacheFd ) == -1 ) {
 			// A failing fsync means we have no guarantee that any data
 			// since the last fsync (or open if none) has been saved. Apart
-			// from keeping the cache_map from the last successful fsync
+			// from keeping the cache map from the last successful fsync
 			// around and restoring it there isn't much we can do to recover
 			// a consistent state. Bail out.
 			logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
@@ -1116,21 +1120,13 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		}
 	}
 
-	if ( image->cache_map == NULL ) return true;
-	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
-	mutex_lock( &image->lock );
-	// Lock and get a copy of the cache map, as it could be freed by another thread that is just about to
-	// figure out that this image's cache copy is complete
-	if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) {
-		mutex_unlock( &image->lock );
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL )
 		return true;
-	}
+	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
 	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
-	uint8_t *map = malloc( size );
-	memcpy( map, image->cache_map, size );
 	// Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
 	// cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O
-	mutex_unlock( &image->lock );
 	assert( image->path != NULL );
 	char mapfile[strlen( image->path ) + 4 + 1];
 	strcpy( mapfile, image->path );
@@ -1139,14 +1135,14 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 	int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
 	if ( fd == -1 ) {
 		const int err = errno;
-		free( map );
+		ref_put( &cache->reference );
 		logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
 		return false;
 	}
 
 	size_t done = 0;
 	while ( done < size ) {
-		const ssize_t ret = write( fd, map, size - done );
+		const ssize_t ret = write( fd, cache->map + done, size - done );
 		if ( ret == -1 ) {
 			if ( errno == EINTR ) continue;
 			logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
@@ -1158,11 +1154,11 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		}
 		done += (size_t)ret;
 	}
+	ref_put( &cache->reference );
 	if ( fsync( fd ) == -1 ) {
 		logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
 	}
 	close( fd );
-	free( map );
 	return true;
 }
 
-- 
cgit v1.2.3-55-g7522


From 9d2d9c6de358b2cf1a602c999d2e0a7a664610f7 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 29 Aug 2019 23:05:26 +0200
Subject: [SERVER] Tear down whole uplink on idle timeout

Keeping the uplink thread around forever even though we
disconnected from the upstream server seems wasteful. Get
rid of this and tear down the uplink entirely.
---
 src/server/net.c    | 13 +++++--------
 src/server/uplink.c | 40 +++++++++++++++++++---------------------
 2 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'src')

diff --git a/src/server/net.c b/src/server/net.c
index 12bcdad..00c9a8d 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -270,18 +270,15 @@ void* net_handleNewConnection(void *clientPtr)
 				bOk = true;
 				if ( image->ref_cacheMap != NULL ) {
 					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-					if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+					if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 );
-					if ( uplink == NULL ) {
-						uplink_init( image, -1, NULL, 0 );
-					} else {
-						ref_put( &uplink->reference );
-					}
-					if ( penalty ) { // Wait 100ms if local caching is not working so this
+					if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
 						usleep( 100000 ); // server gets a penalty and is less likely to be selected
 					}
+					if ( uplink != NULL ) {
+						ref_put( &uplink->reference );
+					}
 				}
 				if ( bOk ) {
 					mutex_lock( &image->lock );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 0a6bd11..58f8ea5 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -258,10 +258,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
 	}
-	dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref );
-	if ( uplink == NULL ) {
-		logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
-		return false;
+	dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref );
+	if ( unlikely( uplink == NULL ) ) {
+		uplink_init( client->image, -1, NULL, -1 );
+		uplink = ref_get_uplink( &client->image->uplinkref );
+		if ( uplink == NULL ) {
+			logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+			return false;
+		}
 	}
 	if ( uplink->shutdown ) {
 		logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
@@ -460,12 +464,15 @@ static void* uplink_mainloop(void *data)
 	events[EV_SIGNAL].events = POLLIN;
 	events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal );
 	events[EV_SOCKET].fd = -1;
+	if ( uplink->rttTestResult != RTT_DOCHANGE ) {
+		altservers_findUplink( uplink ); // In case we didn't kickstart
+	}
 	while ( !_shutdown && !uplink->shutdown ) {
 		// poll()
 		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
 		if ( waitTime == 0 ) {
 			// 0 means poll, since we're about to change the server
-		} else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) {
+		} else if ( uplink->current.fd == -1 ) {
 			waitTime = 1000;
 		} else {
 			declare_now;
@@ -568,32 +575,22 @@ static void* uplink_mainloop(void *data)
 				}
 			}
 			// Don't keep uplink established if we're idle for too much
-			if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) {
-				mutex_lock( &uplink->sendMutex );
-				close( uplink->current.fd );
-				uplink->current.fd = -1;
-				mutex_unlock( &uplink->sendMutex );
-				uplink->cycleDetected = false;
-				if ( uplink->recvBufferLen != 0 ) {
-					uplink->recvBufferLen = 0;
-					free( uplink->recvBuffer );
-					uplink->recvBuffer = NULL;
-				}
+			if ( uplink_connectionShouldShutdown( uplink ) ) {
 				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid );
-				setThreadName( "idle-uplink" );
+				goto cleanup;
 			}
 		}
 		// See if we should trigger an RTT measurement
 		rttTestResult = uplink->rttTestResult;
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
-			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
+			if ( timing_reached( &nextAltCheck, &now ) || uplink->current.fd == -1 || uplink->cycleDetected ) {
 				// It seems it's time for a check
 				if ( image_isComplete( uplink->image ) ) {
 					// Quit work if image is complete
 					logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
 					setThreadName( "finished-uplink" );
 					goto cleanup;
-				} else if ( !uplink_connectionShouldShutdown( uplink ) ) {
+				} else {
 					// Not complete - do measurement
 					altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous)
 					if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
@@ -606,6 +603,9 @@ static void* uplink_mainloop(void *data)
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
 			atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE );
 			discoverFailCount++;
+			if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
+				uplink->image->working = false;
+			}
 			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
 		}
 #ifdef _DEBUG
@@ -1125,8 +1125,6 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		return true;
 	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
 	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
-	// Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
-	// cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O
 	assert( image->path != NULL );
 	char mapfile[strlen( image->path ) + 4 + 1];
 	strcpy( mapfile, image->path );
-- 
cgit v1.2.3-55-g7522


From ebde623c2cdb84eb36e06bbf944efa54aef0e461 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 30 Aug 2019 09:25:28 +0200
Subject: [SERVER] No uplink_init when checking working state; improve logging

---
 src/server/image.c  | 18 ++++++++++--------
 src/server/uplink.c |  1 +
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 1972f48..b349058 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -237,7 +237,9 @@ bool image_ensureOpen(dnbd3_image_t *image)
 {
 	if ( image->readFd != -1 ) return image;
 	int newFd = open( image->path, O_RDONLY );
-	if ( newFd != -1 ) {
+	if ( newFd == -1 ) {
+		logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+	} else {
 		// Check size
 		const off_t flen = lseek( newFd, 0, SEEK_END );
 		if ( flen == -1 ) {
@@ -349,14 +351,14 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
 		reload = true;
 	} else if ( (uint64_t)len != candidate->realFilesize ) {
-		logadd( LOG_DEBUG1, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
+		logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
 				". Try sending SIGHUP to server if you know what you're doing.",
 				candidate->path, candidate->realFilesize, (uint64_t)len );
 	} else {
 		// Seek worked, file size is same, now see if we can read from file
 		char buffer[100];
 		if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
-			logadd( LOG_DEBUG2, "Reading first %d bytes from %s failed (errno=%d)%s.",
+			logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.",
 					(int)sizeof(buffer), candidate->path, errno, removingText );
 			reload = true;
 		} else if ( !candidate->working ) {
@@ -370,6 +372,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		// Could not access the image with exising fd - mark for reload which will re-open the file.
 		// make a copy of the image struct but keep the old one around. If/When it's not being used
 		// anymore, it will be freed automatically.
+		logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path );
 		dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
 		img->path = strdup( candidate->path );
 		img->name = strdup( candidate->name );
@@ -400,17 +403,16 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 			img->users = 0;
 			image_free( img );
 		}
+	// Check if image is incomplete, initialize uplink
+	if ( candidate->ref_cacheMap != NULL ) {
+		uplink_init( candidate, -1, NULL, -1 );
+	}
 		// readFd == -1 and working == FALSE at this point,
 		// this function needs some splitting up for handling as we need to run most
 		// of the above code again. for now we know that the next call for this
 		// name:rid will get ne newly inserted "img" and try to re-open the file.
 	}
 
-	// Check if image is incomplete, handle
-	if ( candidate->ref_cacheMap != NULL ) {
-		uplink_init( candidate, -1, NULL, -1 );
-	}
-
 	return candidate; // We did all we can, hopefully it's working
 }
 
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 58f8ea5..52cf417 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -604,6 +604,7 @@ static void* uplink_mainloop(void *data)
 			atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE );
 			discoverFailCount++;
 			if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
+				logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
 				uplink->image->working = false;
 			}
 			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
-- 
cgit v1.2.3-55-g7522


From 645bb4b91b06c0eb23867aab1511b080ce122d96 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 30 Aug 2019 09:46:53 +0200
Subject: [SERVER] Introduce debug spam

---
 src/server/uplink.c | 16 ++++++++--------
 src/shared/timing.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 52cf417..4cea7e2 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -472,11 +472,10 @@ static void* uplink_mainloop(void *data)
 		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
 		if ( waitTime == 0 ) {
 			// 0 means poll, since we're about to change the server
-		} else if ( uplink->current.fd == -1 ) {
-			waitTime = 1000;
 		} else {
 			declare_now;
 			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
+			logadd( LOG_DEBUG1, "Next  %d  for %s", waitTime / 1000, uplink->image->name );
 			if ( waitTime < 100 ) waitTime = 100;
 			if ( waitTime > 5000 ) waitTime = 5000;
 		}
@@ -601,13 +600,14 @@ static void* uplink_mainloop(void *data)
 				timing_set( &nextAltCheck, &now, altCheckInterval );
 			}
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
-			atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE );
-			discoverFailCount++;
-			if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
-				logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
-				uplink->image->working = false;
+			if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
+				discoverFailCount++;
+				if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
+					logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
+					uplink->image->working = false;
+				}
 			}
-			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
+			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED );
 		}
 #ifdef _DEBUG
 		if ( uplink->current.fd != -1 && !uplink->shutdown ) {
diff --git a/src/shared/timing.h b/src/shared/timing.h
index f3d8802..f23bfeb 100644
--- a/src/shared/timing.h
+++ b/src/shared/timing.h
@@ -22,7 +22,7 @@ extern struct timespec basetime;
 /**
  * Assign src to dst while adding secs seconds.
  */
-#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + secs; (dst)->tv_nsec = (src)->tv_nsec; } while (0)
+#define timing_set(dst,src,secs) do { (dst)->tv_sec = (src)->tv_sec + (secs); (dst)->tv_nsec = (src)->tv_nsec; } while (0)
 
 /**
  * Define variable now, initialize to timing_get.
-- 
cgit v1.2.3-55-g7522


From 5613ed8bf1f05c38af163c1303ab20be6b20090e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 30 Aug 2019 09:55:41 +0200
Subject: [SERVER] Less debug spam, fix RTT interval calculation

---
 src/server/uplink.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 4cea7e2..d1cd2e8 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -475,9 +475,8 @@ static void* uplink_mainloop(void *data)
 		} else {
 			declare_now;
 			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
-			logadd( LOG_DEBUG1, "Next  %d  for %s", waitTime / 1000, uplink->image->name );
 			if ( waitTime < 100 ) waitTime = 100;
-			if ( waitTime > 5000 ) waitTime = 5000;
+			if ( waitTime > 10000 ) waitTime = 10000;
 		}
 		events[EV_SOCKET].fd = uplink->current.fd;
 		numSocks = poll( events, EV_COUNT, waitTime );
@@ -582,7 +581,7 @@ static void* uplink_mainloop(void *data)
 		// See if we should trigger an RTT measurement
 		rttTestResult = uplink->rttTestResult;
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
-			if ( timing_reached( &nextAltCheck, &now ) || uplink->current.fd == -1 || uplink->cycleDetected ) {
+			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && discoverFailCount == 0 ) || uplink->cycleDetected ) {
 				// It seems it's time for a check
 				if ( image_isComplete( uplink->image ) ) {
 					// Quit work if image is complete
@@ -606,6 +605,9 @@ static void* uplink_mainloop(void *data)
 					logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
 					uplink->image->working = false;
 				}
+				if ( uplink->current.fd == -1 ) {
+					uplink->cycleDetected = false;
+				}
 			}
 			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH) ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED );
 		}
-- 
cgit v1.2.3-55-g7522


From 23210df3faf44521942be607e0afc7bf63742297 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 2 Sep 2019 13:19:07 +0200
Subject: [SERVER] uplink: Don't reinit uplink when freeing it

---
 src/server/uplink.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index d1cd2e8..14b9013 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -89,7 +89,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 			close( sock );
 		}
 		ref_put( &uplink->reference );
-		return true; // There's already an uplink, so should we consider this success or failure?
+		return true; // There's already an uplink
 	}
 	if ( image->ref_cacheMap == NULL ) {
 		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
@@ -210,19 +210,9 @@ static void uplink_free(ref *ref)
 	if ( uplink->cacheFd != -1 ) {
 		close( uplink->cacheFd );
 	}
-	// TODO Requeue any requests
-	dnbd3_image_t *image = image_lock( uplink->image );
-	if ( image != NULL ) {
-		// != NULL means image is still in list...
-		if ( !_shutdown && image->ref_cacheMap != NULL ) {
-			// Ingegrity checker must have found something in the meantime
-			uplink_init( image, -1, NULL, 0 );
-		}
-		image_release( image );
-	}
 	// Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code
 	// of the uplink thread, depending on who set the uplink->shutdown flag.
-	image_release( image );
+	image_release( uplink->image );
 	free( uplink ); // !!!
 }
 
@@ -536,7 +526,7 @@ static void* uplink_mainloop(void *data)
 			if ( uplink->current.fd != -1 ) {
 				// Uplink seems fine, relay requests to it...
 				uplink_sendRequests( uplink, true );
-			} else { // No uplink; maybe it was shutdown since it was idle for too long
+			} else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long
 				uplink->idleTime = 0;
 			}
 		}
-- 
cgit v1.2.3-55-g7522


From 0edf0a0888b1e40769e19eee031c2cefdcf37d26 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 2 Sep 2019 13:26:47 +0200
Subject: [SERVER] Fix compiler warnings

---
 src/server/altservers.c | 4 ++--
 src/server/reference.h  | 4 ++--
 src/shared/protocol.h   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index ff3c95b..9e30cd0 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -140,7 +140,7 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output
 		if ( altServers[i].host.type == 0 || altServers[i].isPrivate )
 			continue; // Slot is empty or uplink is for replication only
 		if ( host->type == altServers[i].host.type ) {
-			scores[i] = 10 + altservers_netCloseness( host, &altServers[i].host );
+			scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) );
 		} else {
 			scores[i] = 1; // Wrong address family
 		}
@@ -400,7 +400,7 @@ const dnbd3_host_t* altservers_indexToHost(int server)
 static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 {
 	const int ALTS = 4;
-	int ret, itAlt, numAlts, current;
+	int itAlt, numAlts, current;
 	bool panic;
 	int servers[ALTS + 1];
 	struct timespec start, end;
diff --git a/src/server/reference.h b/src/server/reference.h
index 2a80955..4eda546 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -46,12 +46,12 @@ static inline void ref_put( ref *ref )
 	}
 }
 
-#define ref_get_uplink(wr) ({ \
+#define ref_get_uplink(wr) __extension__({ \
 	ref* ref = ref_get( wr ); \
 	ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \
 })
 
-#define ref_get_cachemap(image) ({ \
+#define ref_get_cachemap(image) __extension__({ \
 	ref* ref = ref_get( &(image)->ref_cacheMap ); \
 	ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \
 })
diff --git a/src/shared/protocol.h b/src/shared/protocol.h
index 92dbe11..2b21c21 100644
--- a/src/shared/protocol.h
+++ b/src/shared/protocol.h
@@ -20,7 +20,7 @@
 #define COND_HOPCOUNT(vers,hopcount) ( (vers) >= 3 ? (hopcount) : 0 )
 
 // 2017-11-02: Macro to set flags in select image message properly if we're a server, as BG_REP depends on global var
-#define SI_SERVER_FLAGS ( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) )
+#define SI_SERVER_FLAGS ( (uint8_t)( (_pretendClient ? 0 : FLAGS8_SERVER) | (_backgroundReplication == BGR_FULL ? FLAGS8_BG_REP : 0) ) )
 
 #define REPLY_OK (0)
 #define REPLY_ERRNO (-1)
-- 
cgit v1.2.3-55-g7522


From be2e1135c7fcf3850535932b70c0d0891d095d12 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 2 Sep 2019 13:37:48 +0200
Subject: [SERVER] uplink: Don't disable already disabled image

---
 src/server/uplink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 14b9013..49e726d 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -591,7 +591,7 @@ static void* uplink_mainloop(void *data)
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
 			if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
 				discoverFailCount++;
-				if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
+				if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
 					logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
 					uplink->image->working = false;
 				}
-- 
cgit v1.2.3-55-g7522


From e83d45b1decd892dfd0a30d4f3db00f5e68c38ae Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 2 Sep 2019 17:30:19 +0200
Subject: [SERVER] Move signal init to uplink_init

Initializing the signal in the thread led to a race
where we would init the uplink and queue a request for it
before the thread actually initialized the signal. This was not
harmful but led to spurious warnings in the server's log.
---
 src/server/uplink.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 49e726d..8a0b06b 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -106,7 +106,11 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->idleTime = 0;
 	uplink->queueLen = 0;
 	uplink->cacheFd = -1;
-	uplink->signal = NULL;
+	uplink->signal = signal_new();
+	if ( uplink->signal == NULL ) {
+		logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." );
+		goto failure;
+	}
 	uplink->replicationHandle = REP_NONE;
 	mutex_lock( &uplink->rttLock );
 	mutex_lock( &uplink->sendMutex );
@@ -135,8 +139,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	return true;
 failure: ;
 	if ( uplink != NULL ) {
-		free( uplink );
-		uplink = NULL;
+		image->users++; // Expected by uplink_free()
+		ref_put( &uplink->reference ); // The ref for the uplink thread that never was
 	}
 	mutex_unlock( &image->lock );
 	return false;
@@ -193,7 +197,9 @@ static void uplink_free(ref *ref)
 	dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
 	logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid );
 	assert( uplink->queueLen == 0 );
-	signal_close( uplink->signal );
+	if ( uplink->signal != NULL ) {
+		signal_close( uplink->signal );
+	}
 	if ( uplink->current.fd != -1 ) {
 		close( uplink->current.fd );
 		uplink->current.fd = -1;
@@ -211,7 +217,7 @@ static void uplink_free(ref *ref)
 		close( uplink->cacheFd );
 	}
 	// Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code
-	// of the uplink thread, depending on who set the uplink->shutdown flag.
+	// of the uplink thread, depending on who set the uplink->shutdown flag. (Or uplink_init if that failed)
 	image_release( uplink->image );
 	free( uplink ); // !!!
 }
@@ -446,11 +452,6 @@ static void* uplink_mainloop(void *data)
 		logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno );
 	}
 	//
-	uplink->signal = signal_new();
-	if ( uplink->signal == NULL ) {
-		logadd( LOG_WARNING, "error creating signal. Uplink unavailable." );
-		goto cleanup;
-	}
 	events[EV_SIGNAL].events = POLLIN;
 	events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal );
 	events[EV_SOCKET].fd = -1;
-- 
cgit v1.2.3-55-g7522


From 4578e890e44e55c71736e77ca92ce5a2a5d3bc9a Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Sep 2019 19:28:47 +0200
Subject: [SERVER] Fix image_updateCachemap()

---
 src/server/image.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index b349058..484a28f 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -126,11 +126,15 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 		fb |= bit_mask;
 	}
 	// Last byte
-	for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
-		const int map_x = (pos >> 12) & 7; // mod 8
-		const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-		lb |= bit_mask;
+	if ( lastByteInMap != firstByteInMap ) {
+		for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+			assert( lastByteInMap == (pos >> 15) );
+			const int map_x = (pos >> 12) & 7; // mod 8
+			const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+			lb |= bit_mask;
+		}
 	}
+	atomic_thread_fence( memory_order_acquire );
 	if ( set ) {
 		uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
 		uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
-- 
cgit v1.2.3-55-g7522


From faf0e758b30ca0598713ee9898aa85360d36a4e4 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Sep 2019 20:16:23 +0200
Subject: [SERVER] Fix indentation

---
 src/server/image.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 484a28f..bdb910d 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -407,10 +407,10 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 			img->users = 0;
 			image_free( img );
 		}
-	// Check if image is incomplete, initialize uplink
-	if ( candidate->ref_cacheMap != NULL ) {
-		uplink_init( candidate, -1, NULL, -1 );
-	}
+		// Check if image is incomplete, initialize uplink
+		if ( candidate->ref_cacheMap != NULL ) {
+			uplink_init( candidate, -1, NULL, -1 );
+		}
 		// readFd == -1 and working == FALSE at this point,
 		// this function needs some splitting up for handling as we need to run most
 		// of the above code again. for now we know that the next call for this
-- 
cgit v1.2.3-55-g7522


From 778fb6d2d15d534869461560d80524d74446bb84 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 4 Sep 2019 19:49:11 +0200
Subject: [SERVER] Fix altservers_getListForClient()

The score wasn't reset when adding a server to the list, resulting in
exactly one server filling up the whole list.
---
 src/server/altservers.c | 58 +++++++++++++++++++++++++++++++++++++++++++++----
 src/server/altservers.h |  2 +-
 src/server/ini.c        | 12 +++++++++-
 3 files changed, 66 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 9e30cd0..952af4f 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -1,3 +1,4 @@
+#include "ini.h"
 #include "altservers.h"
 #include "locks.h"
 #include "threadpool.h"
@@ -34,7 +35,7 @@ void altservers_init()
 	mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST );
 }
 
-static void addalt(int argc, char **argv, void *data)
+static void addAltFromLegacy(int argc, char **argv, void *data)
 {
 	char *shost;
 	dnbd3_host_t host;
@@ -52,29 +53,75 @@ static void addalt(int argc, char **argv, void *data)
 		return;
 	}
 	if ( argc == 1 ) argv[1] = "";
-	if ( altservers_add( &host, argv[1], isPrivate, isClientOnly ) ) {
+	if ( altservers_add( &host, argv[1], isPrivate, isClientOnly, NULL ) ) {
 		(*(int*)data)++;
 	}
 }
 
+static int addAltFromIni(void *countptr, const char* section, const char* key, const char* value)
+{
+	dnbd3_host_t host;
+	char *strhost = strdup( section );
+	if ( !parse_address( strhost, &host ) ) {
+		free( strhost );
+		logadd( LOG_WARNING, "Invalid host section in alt-servers file ignored: '%s'", section );
+		return 1;
+	}
+	free( strhost );
+	int index;
+	if ( altservers_add( &host, "", false, false, &index ) ) {
+		(*(int*)countptr)++;
+	}
+	if ( index == -1 )
+		return 1;
+	if ( strcmp( key, "for" ) == 0 ) {
+		if ( strncmp( value, "client", 6 ) == 0 ) {
+			altServers[index].isClientOnly = true;
+			altServers[index].isPrivate = false;
+		} else if ( strcmp( value, "replication" ) == 0 ) {
+			altServers[index].isClientOnly = false;
+			altServers[index].isPrivate = true;
+		} else {
+			logadd( LOG_WARNING, "Invalid value in alt-servers section %s for key %s: '%s'", section, key, value );
+		}
+	} else if ( strcmp( key, "comment" ) == 0 ) {
+		snprintf( altServers[index].comment, COMMENT_LENGTH, "%s", value );
+	} else {
+		logadd( LOG_DEBUG1, "Unknown key in alt-servers section: '%s'", key );
+	}
+	return 1;
+}
+
 int altservers_load()
 {
 	int count = 0;
 	char *name;
 	if ( asprintf( &name, "%s/%s", _configDir, "alt-servers" ) == -1 ) return -1;
-	file_loadLineBased( name, 1, 2, &addalt, (void*)&count );
+	if ( !file_isReadable( name ) ) {
+		free( name );
+		return 0;
+	}
+	ini_parse( name, &addAltFromIni, &count );
+	if ( numAltServers == 0 ) {
+		logadd( LOG_INFO, "Could not parse %s as .ini file, trying to load as legacy format.", name );
+		file_loadLineBased( name, 1, 2, &addAltFromLegacy, (void*)&count );
+	}
 	free( name );
 	logadd( LOG_DEBUG1, "Added %d alt servers\n", count );
 	return count;
 }
 
-bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly)
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly, int *index)
 {
 	int i, freeSlot = -1;
+	if ( index == NULL ) {
+		index = &freeSlot;
+	}
 	mutex_lock( &altServersLock );
 	for (i = 0; i < numAltServers; ++i) {
 		if ( isSameAddressPort( &altServers[i].host, host ) ) {
 			mutex_unlock( &altServersLock );
+			*index = i;
 			return false;
 		} else if ( freeSlot == -1 && altServers[i].host.type == 0 ) {
 			freeSlot = i;
@@ -84,6 +131,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 		if ( numAltServers >= SERVER_MAX_ALTS ) {
 			logadd( LOG_WARNING, "Cannot add another alt server, maximum of %d already reached.", (int)SERVER_MAX_ALTS );
 			mutex_unlock( &altServersLock );
+			*index = -1;
 			return false;
 		}
 		freeSlot = numAltServers++;
@@ -93,6 +141,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 	altServers[freeSlot].isClientOnly = isClientOnly;
 	if ( comment != NULL ) snprintf( altServers[freeSlot].comment, COMMENT_LENGTH, "%s", comment );
 	mutex_unlock( &altServersLock );
+	*index = freeSlot;
 	return true;
 }
 
@@ -156,6 +205,7 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output
 		}
 		if ( i == -1 )
 			break;
+		scores[i] = 0;
 		output[count].host = altServers[i].host;
 		output[count].failures = 0;
 		count++;
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 8e2b964..1e1f119 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -9,7 +9,7 @@ void altservers_init();
 
 int altservers_load();
 
-bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
+bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly, int *index);
 
 void altservers_findUplinkAsync(dnbd3_uplink_t *uplink);
 
diff --git a/src/server/ini.c b/src/server/ini.c
index 216543b..c796d5c 100644
--- a/src/server/ini.c
+++ b/src/server/ini.c
@@ -110,7 +110,17 @@ int ini_parse_file(FILE* file, int (*handler)(void*, const char*, const char*, c
 #endif
 		else if ( *start == '[' ) {
 			/* A "[section]" line */
-			end = find_char_or_comment( start + 1, ']' );
+			int cnt = 0;
+			char *f = start, *sstart = start;
+			while ( *++f ) {
+				if ( *f == '[' ) cnt++;
+				if ( *f == ']' ) cnt--;
+				if ( cnt < 0 ) {
+					sstart = f - 1;
+					break;
+				}
+			}
+			end = find_char_or_comment( sstart + 1, ']' );
 			if ( *end == ']' ) {
 				*end = '\0';
 				strncpy0( section, start + 1, sizeof(section) );
-- 
cgit v1.2.3-55-g7522


From 543877c7fc17c0a881d6a85c76dfc17f8def7dff Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 4 Sep 2019 20:06:11 +0200
Subject: [SERVER] Support limiting alt-servers to specific namespace

Not really a namespace, but simple string matching on the image path.
The path is matched from the start, with no support for globs or
regexes, so you usually want a trailing '/' to limit matching to
certain directories.
---
 src/server/altservers.c | 51 +++++++++++++++++++++++++++++++++++--------------
 src/server/altservers.h |  4 ++--
 src/server/globals.h    |  8 ++++++++
 src/server/image.c      |  2 +-
 src/server/net.c        |  2 +-
 5 files changed, 49 insertions(+), 18 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 952af4f..943345c 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -23,7 +23,7 @@ static atomic_int numAltServers = 0;
 static pthread_mutex_t altServersLock;
 
 static void *altservers_runCheck(void *data);
-static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current);
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current);
 static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink);
 static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt);
 static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server);
@@ -86,6 +86,13 @@ static int addAltFromIni(void *countptr, const char* section, const char* key, c
 		}
 	} else if ( strcmp( key, "comment" ) == 0 ) {
 		snprintf( altServers[index].comment, COMMENT_LENGTH, "%s", value );
+	} else if ( strcmp( key, "namespace" ) == 0 ) {
+		dnbd3_ns_t *elem = malloc( sizeof(*elem) );
+		elem->name = strdup( value );
+		elem->len = strlen( value );
+		do {
+			elem->next = altServers[index].nameSpaces;
+		} while ( !atomic_compare_exchange_weak( &altServers[index].nameSpaces, &elem->next, elem ) );
 	} else {
 		logadd( LOG_DEBUG1, "Unknown key in alt-servers section: '%s'", key );
 	}
@@ -139,6 +146,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 	altServers[freeSlot].host = *host;
 	altServers[freeSlot].isPrivate = isPrivate;
 	altServers[freeSlot].isClientOnly = isClientOnly;
+	altServers[freeSlot].nameSpaces = NULL;
 	if ( comment != NULL ) snprintf( altServers[freeSlot].comment, COMMENT_LENGTH, "%s", comment );
 	mutex_unlock( &altServersLock );
 	*index = freeSlot;
@@ -171,15 +179,28 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 	}
 }
 
+static bool isImageAllowed(dnbd3_alt_server_t *alt, const char *image)
+{
+	if ( alt->nameSpaces == NULL )
+		return true;
+	for ( dnbd3_ns_t *it = alt->nameSpaces; it != NULL; it = it->next ) {
+		if ( strncmp( it->name, image, it->len ) == 0 )
+			return true;
+	}
+	return false;
+}
+
 /**
  * Get <size> known (working) alt servers, ordered by network closeness
  * (by finding the smallest possible subnet)
  * Private servers are excluded, so this is what you want to call to
  * get a list of servers you can tell a client about
  */
-int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size)
+int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size)
 {
-	if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0;
+	dnbd3_host_t *host = &client->host;
+	if ( host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 )
+		return 0;
 	int i, j;
 	int count = 0;
 	uint16_t scores[SERVER_MAX_ALTS] = { 0 };
@@ -188,11 +209,9 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output
 	for ( i = 0; i < numAltServers; ++i ) {
 		if ( altServers[i].host.type == 0 || altServers[i].isPrivate )
 			continue; // Slot is empty or uplink is for replication only
-		if ( host->type == altServers[i].host.type ) {
-			scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) );
-		} else {
-			scores[i] = 1; // Wrong address family
-		}
+		if ( !isImageAllowed( &altServers[i], client->image->name ) )
+			continue;
+		scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) );
 	}
 	while ( count < size ) {
 		i = -1;
@@ -244,10 +263,10 @@ static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now )
 	return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN;
 }
 
-int altservers_getHostListForReplication(dnbd3_host_t *servers, int size)
+int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size)
 {
 	int idx[size];
-	int num = altservers_getListForUplink( NULL, idx, size, -1 );
+	int num = altservers_getListForUplink( NULL, image, idx, size, -1 );
 	for ( int i = 0; i < num; ++i ) {
 		servers[i] = altServers[i].host;
 	}
@@ -261,7 +280,7 @@ int altservers_getHostListForReplication(dnbd3_host_t *servers, int size)
  * it includes private servers and ignores any "client only" servers
  * @param current index of server for current connection, or -1 in panic mode
  */
-static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current)
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current)
 {
 	if ( size <= 0 )
 		return 0;
@@ -272,7 +291,9 @@ static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int
 	if ( numAltServers <= size ) {
 		for ( int i = 0; i < numAltServers; ++i ) {
 			if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) {
-				servers[count++] = i;
+				if ( isImageAllowed( &altServers[i], image ) ) {
+					servers[count++] = i;
+				}
 			}
 		}
 	} else {
@@ -286,7 +307,9 @@ static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int
 			int idx = rand() % numAltServers;
 			if ( state[idx] != 0 )
 				continue;
-			if ( isUsableForUplink( uplink, idx, &now ) ) {
+			if ( !isImageAllowed( &altServers[idx], image ) ) {
+				state[idx] = 2; // Mark as used without adding, so it will be ignored in panic loop
+			} else if ( isUsableForUplink( uplink, idx, &now ) ) {
 				servers[count++] = idx;
 				state[idx] = 2; // Used
 			} else {
@@ -469,7 +492,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 	current = uplink->current.index; // Current server index (or last one in panic mode)
 	mutex_unlock( &uplink->rttLock );
 	// First, get 4 alt servers
-	numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current );
+	numAlts = altservers_getListForUplink( uplink, uplink->image->name, servers, ALTS, panic ? -1 : current );
 	// If we're already connected and only got one server anyways, there isn't much to do
 	if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) {
 		uplink->rttTestResult = RTT_DONTCHANGE;
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 1e1f119..8e29aaa 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -15,9 +15,9 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink);
 
 void altservers_findUplink(dnbd3_uplink_t *uplink);
 
-int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
+int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size);
 
-int altservers_getHostListForReplication(dnbd3_host_t *servers, int size);
+int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
 
 bool altservers_toString(int server, char *buffer, size_t len);
 
diff --git a/src/server/globals.h b/src/server/globals.h
index 221af78..ebdc1c7 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -31,6 +31,13 @@ typedef struct
 	uint8_t hopCount;      // How many hops this request has already taken across proxies
 } dnbd3_queued_request_t;
 
+typedef struct _ns
+{
+	struct _ns *next;
+	char *name;
+	size_t len;
+} dnbd3_ns_t;
+
 typedef struct
 {
 	int fails;                    // Hard fail: Connection failed
@@ -41,6 +48,7 @@ typedef struct
 	ticks lastFail;               // Last hard fail
 	dnbd3_host_t host;
 	char comment[COMMENT_LENGTH];
+	_Atomic(dnbd3_ns_t *) nameSpaces; // Linked list of name spaces
 } dnbd3_alt_server_t;
 
 typedef struct
diff --git a/src/server/image.c b/src/server/image.c
index bdb910d..86e6b87 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1204,7 +1204,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
 	dnbd3_host_t servers[REP_NUM_SRV];
 	int uplinkSock = -1;
 	dnbd3_host_t uplinkServer;
-	const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV );
+	const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV );
 	uint16_t remoteProtocolVersion;
 	uint16_t remoteRid = revision;
 	uint64_t remoteImageSize;
diff --git a/src/server/net.c b/src/server/net.c
index 00c9a8d..aba4e7d 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -477,7 +477,7 @@ void* net_handleNewConnection(void *clientPtr)
 
 			case CMD_GET_SERVERS:
 				// Build list of known working alt servers
-				num = altservers_getListForClient( &client->host, server_list, NUMBER_SERVERS );
+				num = altservers_getListForClient( client, server_list, NUMBER_SERVERS );
 				reply.cmd = CMD_GET_SERVERS;
 				reply.size = (uint32_t)( num * sizeof(dnbd3_server_entry_t) );
 				mutex_lock( &client->sendMutex );
-- 
cgit v1.2.3-55-g7522


From 5765ce49f5e1e26505fd6b162db73a732603d1a8 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 5 Sep 2019 16:52:31 +0200
Subject: [SERVER] integrity checker: Improve flushing logic

---
 src/server/integrity.c | 199 +++++++++++++++++++++++++++----------------------
 src/server/uplink.c    |   2 +-
 2 files changed, 111 insertions(+), 90 deletions(-)

(limited to 'src')

diff --git a/src/server/integrity.c b/src/server/integrity.c
index a9fbae6..fddb755 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -35,6 +35,7 @@ static int queueLen = -1;
 static atomic_bool bRunning = false;
 
 static void* integrity_main(void *data);
+static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end);
 
 /**
  * Initialize the integrity check thread
@@ -88,14 +89,17 @@ void integrity_check(dnbd3_image_t *image, int block)
 	for (i = 0; i < queueLen; ++i) {
 		if ( freeSlot == -1 && checkQueue[i].image == NULL ) {
 			freeSlot = i;
-		} else if ( checkQueue[i].image == image
-				&& checkQueue[i].block <= block && checkQueue[i].block + checkQueue[i].count >= block ) {
-			// Already queued check dominates this one, or at least lies directly before this block
-			if ( checkQueue[i].block + checkQueue[i].count == block ) {
-				// It's directly before this one; expand range
+		} else if ( checkQueue[i].image == image && checkQueue[i].block <= block ) {
+			if ( checkQueue[i].count == CHECK_ALL ) {
+				logadd( LOG_DEBUG2, "Dominated by full image scan request (%d/%d) (at %d)", i, queueLen, checkQueue[i].block );
+			} else if ( checkQueue[i].block + checkQueue[i].count == block ) {
 				checkQueue[i].count += 1;
+				logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (at %d, %d to go)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
+			} else if ( checkQueue[i].block + checkQueue[i].count > block ) {
+				logadd( LOG_DEBUG2, "Dominated by existing check request (%d/%d) (at %d, %d to go)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
+			} else {
+				continue;
 			}
-			logadd( LOG_DEBUG2, "Attaching to existing check request (%d/%d) (%d +%d)", i, queueLen, checkQueue[i].block, checkQueue[i].count );
 			mutex_unlock( &integrityQueueLock );
 			return;
 		}
@@ -123,8 +127,6 @@ void integrity_check(dnbd3_image_t *image, int block)
 static void* integrity_main(void * data UNUSED)
 {
 	int i;
-	uint8_t *buffer = NULL;
-	size_t bufferSize = 0;
 	setThreadName( "image-check" );
 	blockNoncriticalSignals();
 #if defined(linux) || defined(__linux)
@@ -150,88 +152,70 @@ static void* integrity_main(void * data UNUSED)
 			// We have the image. Call image_release() some time
 			const int qCount = checkQueue[i].count;
 			bool foundCorrupted = false;
-			mutex_lock( &image->lock );
 			if ( image->crc32 != NULL && image->realFilesize != 0 ) {
 				int blocks[2] = { checkQueue[i].block, -1 };
 				mutex_unlock( &integrityQueueLock );
-				// Make copy of crc32 list as it might go away
 				const uint64_t fileSize = image->realFilesize;
 				const int numHashBlocks = IMGSIZE_TO_HASHBLOCKS(fileSize);
-				const size_t required = numHashBlocks * sizeof(uint32_t);
-				if ( buffer == NULL || required > bufferSize ) {
-					bufferSize = required;
-					if ( buffer != NULL ) free( buffer );
-					buffer = malloc( bufferSize );
-				}
-				memcpy( buffer, image->crc32, required );
-				mutex_unlock( &image->lock );
-				// Open for direct I/O if possible; this prevents polluting the fs cache
-				int fd = open( image->path, O_RDONLY | O_DIRECT );
-				bool direct = fd != -1;
-				if ( unlikely( !direct ) ) {
-					// Try unbuffered; flush to disk for that
-					logadd( LOG_DEBUG1, "O_DIRECT failed for %s", image->path );
-					image_ensureOpen( image );
-					fd = image->readFd;
-				}
 				int checkCount = MIN( qCount, 5 );
-				if ( fd != -1 ) {
-					while ( blocks[0] < numHashBlocks && !_shutdown ) {
-						const uint64_t start = blocks[0] * HASH_BLOCK_SIZE;
-						const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
-						bool complete = true;
-						if ( qCount == CHECK_ALL ) {
-							dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-							if ( cache != NULL ) {
-								// When checking full image, skip incomplete blocks, otherwise assume block is complete
-								complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
-								ref_put( &cache->reference );
-							}
-						}
-#if defined(linux) || defined(__linux)
-						while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
-#else
-						while ( fsync( fd ) == -1 )
-#endif
-						{
-							if ( _shutdown )
-								break;
-							if ( errno == EINTR )
-								continue;
-							logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, errno );
-							exit( 1 );
+				int readFd = -1, directFd = -1;
+				while ( blocks[0] < numHashBlocks && !_shutdown ) {
+					const uint64_t start = blocks[0] * HASH_BLOCK_SIZE;
+					const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
+					bool complete = true;
+					if ( qCount == CHECK_ALL ) {
+						dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+						if ( cache != NULL ) {
+							// When checking full image, skip incomplete blocks, otherwise assume block is complete
+							complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
+							ref_put( &cache->reference );
 						}
-						if ( _shutdown )
-							break;
+					}
+					// Flush to disk if there's an uplink, as that means the block might have been written recently
+					if ( image->uplinkref != NULL ) {
+						flushFileRange( image, start, end );
+					}
+					if ( _shutdown )
+						break;
+					// Open for direct I/O if possible; this prevents polluting the fs cache
+					if ( directFd == -1 && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
 						// Use direct I/O only if read length is multiple of 4096 to be on the safe side
-						int tfd;
-						if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
-							// Suitable for direct io
-							tfd = fd;
-						} else if ( !image_ensureOpen( image ) ) {
-							logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
-							break;
+						directFd = open( image->path, O_RDONLY | O_DIRECT );
+						if ( directFd == -1 ) {
+							logadd( LOG_DEBUG2, "O_DIRECT failed for %s (errno=%d)", image->path, errno );
+							directFd = -2;
 						} else {
-							tfd = image->readFd;
-							// Evict from cache so we have to re-read, making sure data was properly stored
-							posix_fadvise( fd, start, end - start, POSIX_FADV_DONTNEED );
+							readFd = directFd;
 						}
-						if ( complete && !image_checkBlocksCrc32( tfd, (uint32_t*)buffer, blocks, fileSize ) ) {
-							logadd( LOG_WARNING, "Hash check for block %d of %s failed!", blocks[0], image->name );
-							image_updateCachemap( image, start, end, false );
-							// If this is not a full check, queue one
-							if ( qCount != CHECK_ALL ) {
-								logadd( LOG_INFO, "Queueing full check for %s", image->name );
-								integrity_check( image, -1 );
-							}
-							foundCorrupted = true;
-						}
-						blocks[0]++; // Increase before break, so it always points to the next block to check after loop
-						if ( complete && --checkCount == 0 ) break;
 					}
-					if ( direct ) {
-						close( fd );
+					if ( readFd == -1 ) { // Try buffered; flush to disk for that
+						image_ensureOpen( image );
+						readFd = image->readFd;
+					}
+					if ( readFd == -1 ) {
+						logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path );
+					} else if ( complete && !image_checkBlocksCrc32( readFd, image->crc32, blocks, fileSize ) ) {
+						bool iscomplete = true;
+						dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+						if ( cache != NULL ) {
+							iscomplete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
+							ref_put( &cache->reference );
+						}
+						logadd( LOG_WARNING, "Hash check for block %d of %s failed (complete: was: %d, is: %d)", blocks[0], image->name, (int)complete, (int)iscomplete );
+						image_updateCachemap( image, start, end, false );
+						// If this is not a full check, queue one
+						if ( qCount != CHECK_ALL ) {
+							logadd( LOG_INFO, "Queueing full check for %s", image->name );
+							integrity_check( image, -1 );
+						}
+						foundCorrupted = true;
 					}
+					blocks[0]++; // Increase before break, so it always points to the next block to check after loop
+					if ( complete && --checkCount == 0 )
+						break;
+				}
+				if ( directFd != -1 && directFd != -2 ) {
+					close( directFd );
 				}
 				mutex_lock( &integrityQueueLock );
 				assert( checkQueue[i].image == image );
@@ -242,11 +226,8 @@ static void* integrity_main(void * data UNUSED)
 						logadd( LOG_WARNING, "BUG! checkQueue counter ran negative" );
 					}
 				}
-				if ( checkCount > 0 || checkQueue[i].count <= 0 || fd == -1 ) {
-					// Done with this task as nothing left, OR we don't have an fd to read from
-					if ( fd == -1 ) {
-						logadd( LOG_WARNING, "Cannot hash check %s: bad fd", image->path );
-					}
+				if ( checkCount > 0 || checkQueue[i].count <= 0 ) {
+					// Done with this task as nothing left
 					checkQueue[i].image = NULL;
 					if ( i + 1 == queueLen ) queueLen--;
 					// Mark as working again if applicable
@@ -263,10 +244,8 @@ static void* integrity_main(void * data UNUSED)
 					// Still more blocks to go...
 					checkQueue[i].block = blocks[0];
 				}
-			} else {
-				mutex_unlock( &image->lock );
 			}
-			if ( foundCorrupted ) {
+			if ( foundCorrupted && !_shutdown ) {
 				// Something was fishy, make sure uplink exists
 				mutex_lock( &image->lock );
 				image->working = false;
@@ -278,10 +257,52 @@ static void* integrity_main(void * data UNUSED)
 		}
 	}
 	mutex_unlock( &integrityQueueLock );
-	if ( buffer != NULL ) {
-		free( buffer );
-	}
 	bRunning = false;
 	return NULL;
 }
 
+static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end)
+{
+	int flushFd;
+	int writableFd = -1;
+	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+	if ( uplink != NULL ) { // Try to steal uplink's writable fd
+		if ( uplink->cacheFd != -1 ) {
+			writableFd = dup( uplink->cacheFd );
+		}
+		ref_put( &uplink->reference );
+	}
+	if ( writableFd == -1 ) { // Open file as writable
+		writableFd = open( image->path, O_WRONLY );
+	}
+	if ( writableFd == -1 ) { // Fallback to readFd (should work on Linux and BSD...)
+		logadd( LOG_WARNING, "flushFileRange: Cannot open %s for writing. Trying readFd.", image->path );
+		image_ensureOpen( image );
+		flushFd = image->readFd;
+	} else {
+		flushFd = writableFd;
+	}
+	if ( flushFd == -1 )
+		return;
+#if defined(linux) || defined(__linux)
+	while ( sync_file_range( flushFd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
+#else
+	while ( fsync( flushFd ) == -1 ) // TODO: fdatasync() should be available since FreeBSD 12.0 ... Might be a tad bit faster
+#endif
+	{
+		if ( _shutdown )
+			break;
+		int e = errno;
+		if ( e == EINTR )
+			continue;
+		logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, e );
+		if ( e == EIO ) {
+			exit( 1 );
+		}
+	}
+	// Evict from cache too so we have to re-read, making sure data was properly stored
+	posix_fadvise( flushFd, start, end - start, POSIX_FADV_DONTNEED );
+	if ( writableFd != -1 ) {
+		close( writableFd );
+	}
+}
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 8a0b06b..dab5c27 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -876,7 +876,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 				ret = (int)pwrite( uplink->cacheFd, uplink->recvBuffer + done, inReply.size - done, start + done );
 				if ( unlikely( ret == -1 ) ) {
 					err = errno;
-					if ( err == EINTR ) continue;
+					if ( err == EINTR && !_shutdown ) continue;
 					if ( err == ENOSPC || err == EDQUOT ) {
 						// try to free 256MiB
 						if ( !tryAgain || !image_ensureDiskSpaceLocked( 256ull * 1024 * 1024, true ) ) break;
-- 
cgit v1.2.3-55-g7522


From bf665f59411840c60b6e3c9ac33f28a818233c0a Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 5 Sep 2019 18:15:52 +0200
Subject: [SERVER] Introduce autoFreeDiskSpaceDelay

This setting allows you to control the formerly hard-coded timeout of 10
hours before a proxy would start deleting old images in order to free up
space for new images. Setting it to -1 entirely disables automatic
deletion, in case you have an external process for freeing up disk
space.
---
 conf/server.conf     | 11 +++++++++++
 src/server/globals.c | 12 ++++++++++++
 src/server/globals.h |  7 +++++++
 src/server/image.c   | 14 ++++++++------
 4 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/conf/server.conf b/conf/server.conf
index a101f34..a15092f 100644
--- a/conf/server.conf
+++ b/conf/server.conf
@@ -31,6 +31,17 @@ vmdkLegacyMode=false
 ; Don't set the server flag when connecting to alt-servers
 ; Intended for if the proxy is used for on-client caching
 pretendClient=false
+; When running in proxy mode and running out of space, automatically delete oldest image(s) to make
+; the newly replicated image fit. In sparse mode, this will make sure at least 2GB of free space are
+; available when replicating a new image. During normal operation, it will free at least 256MB whenever
+; an attempt to write more data to cache fails. In non-sparse mode, whenever a new image is replicated,
+; as much space as is required to store the entire image will be made available.
+; However, after startup the proxy will refuse to delete any images for the time span given below, to be
+; able to gather up-to-date usage information for the images available. If unitless, the value is
+; interpreted in seconds. Valid suffixes are m, h, d.
+; Setting this to -1 disables deletion of images. If the cache partition is full, no more images will
+; be replicated unless you manually free up more disk space.
+autoFreeDiskSpaceDelay=10h
 
 [limits]
 maxClients=2000
diff --git a/src/server/globals.c b/src/server/globals.c
index 46c1030..f8c3f66 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -28,6 +28,7 @@ atomic_bool _closeUnusedFd = false;
 atomic_bool _vmdkLegacyMode = false;
 // Not really needed anymore since we have '+' and '-' in alt-servers
 atomic_bool _proxyPrivateOnly = false;
+atomic_int _autoFreeDiskSpaceDelay = 3600 * 10;
 // [limits]
 atomic_int _maxClients = SERVER_MAX_CLIENTS;
 atomic_int _maxImages = SERVER_MAX_IMAGES;
@@ -83,6 +84,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
 	SAVE_TO_VAR_UINT( limits, maxPayload );
 	SAVE_TO_VAR_UINT64( limits, maxReplicationSize );
 	SAVE_TO_VAR_BOOL( dnbd3, pretendClient );
+	SAVE_TO_VAR_INT( dnbd3, autoFreeDiskSpaceDelay );
 	if ( strcmp( section, "dnbd3" ) == 0 && strcmp( key, "backgroundReplication" ) == 0 ) {
 		if ( strcmp( value, "hashblock" ) == 0 ) {
 			_backgroundReplication = BGR_HASHBLOCK;
@@ -229,6 +231,15 @@ static bool parse64(const char *in, atomic_int_fast64_t *out, const char *optnam
 	while ( *end == ' ' ) end++;
 	if ( *end == '\0' ) {
 		exp = 0;
+	} else if ( *end == 'm' ) {
+		exp = 1;
+		base = 60;
+	} else if ( *end == 'h' ) {
+		exp = 1;
+		base = 3600;
+	} else if ( *end == 'd' ) {
+		exp = 1;
+		base = 24 * 3600;
 	} else {
 		char *pos = strchr( units, *end > 'Z' ? (*end - 32) : *end );
 		if ( pos == NULL ) {
@@ -318,6 +329,7 @@ size_t globals_dumpConfig(char *buffer, size_t size)
 	PBOOL(vmdkLegacyMode);
 	PBOOL(proxyPrivateOnly);
 	PBOOL(pretendClient);
+	PINT(autoFreeDiskSpaceDelay);
 	P_ARG("[limits]\n");
 	PINT(maxClients);
 	PINT(maxImages);
diff --git a/src/server/globals.h b/src/server/globals.h
index ebdc1c7..58b2c9d 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -290,6 +290,13 @@ extern atomic_uint_fast64_t _maxReplicationSize;
  */
 extern atomic_bool _pretendClient;
 
+/**
+ * Minimum uptime in seconds before proxy starts deleting old
+ * images if running out of space. -1 disables automatic deletion.
+ * Only relevant in proxy mode.
+ */
+extern atomic_int _autoFreeDiskSpaceDelay;
+
 /**
  * Load the server configuration.
  */
diff --git a/src/server/image.c b/src/server/image.c
index 86e6b87..9fcb866 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1696,14 +1696,16 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
 	for ( int maxtries = 0; maxtries < 20; ++maxtries ) {
 		uint64_t available;
 		if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
-			const int e = errno;
-			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", e );
+			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", errno );
 			return true;
 		}
-		if ( available > size ) return true;
-		if ( !force && dnbd3_serverUptime() < 10 * 3600 ) {
-			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < 10 hours...", (int)(available / (1024ll * 1024ll)),
-					(int)(size / (1024 * 1024)) );
+		if ( available > size )
+			return true; // Yay
+		if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 )
+			return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
+		if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) {
+			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...", (int)(available / (1024ll * 1024ll)),
+					(int)(size / (1024 * 1024)), _autoFreeDiskSpaceDelay / 60 );
 			return false;
 		}
 		logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
-- 
cgit v1.2.3-55-g7522


From 701e5a967fd6bc97644f39e6fea3714f49a90291 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 6 Sep 2019 17:32:58 +0200
Subject: [SERVER] rpc: Add cachemap feature

---
 src/server/globals.h |  2 +-
 src/server/image.c   | 16 ++++++++++++++++
 src/server/image.h   |  2 ++
 src/server/rpc.c     | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 58b2c9d..df8c595 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -110,7 +110,7 @@ typedef struct
 typedef struct
 {
 	ref reference;
-	atomic_uint_least8_t map[];
+	_Atomic uint8_t map[];
 } dnbd3_cache_map_t;
 
 /**
diff --git a/src/server/image.c b/src/server/image.c
index 9fcb866..5fa06d8 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -274,6 +274,22 @@ bool image_ensureOpen(dnbd3_image_t *image)
 	return image->readFd != -1;
 }
 
+dnbd3_image_t* image_byId(int imgId)
+{
+	int i;
+	mutex_lock( &imageListLock );
+	for (i = 0; i < _num_images; ++i) {
+		dnbd3_image_t * const image = _images[i];
+		if ( image != NULL && image->id == imgId ) {
+			image->users++;
+			mutex_unlock( &imageListLock );
+			return image;
+		}
+	}
+	mutex_unlock( &imageListLock );
+	return NULL;
+}
+
 /**
  * Get an image by name+rid. This function increases a reference counter,
  * so you HAVE TO CALL image_release for every image_get() call at some
diff --git a/src/server/image.h b/src/server/image.h
index cd87f03..449e31f 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -17,6 +17,8 @@ void image_markComplete(dnbd3_image_t *image);
 
 bool image_ensureOpen(dnbd3_image_t *image);
 
+dnbd3_image_t* image_byId(int imgId);
+
 dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking);
 
 bool image_reopenCacheFd(dnbd3_image_t *image, const bool force);
diff --git a/src/server/rpc.c b/src/server/rpc.c
index 662263e..548c80f 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -9,6 +9,7 @@
 #include "fileutil.h"
 #include "picohttpparser/picohttpparser.h"
 #include "urldecode.h"
+#include "reference.h"
 
 #include <jansson.h>
 #include <sys/types.h>
@@ -43,7 +44,9 @@ _Static_assert( sizeof("test") == 5 && sizeof("test2") == 6, "Stringsize messup
 DEFSTR(STR_CONNECTION, "connection")
 DEFSTR(STR_CLOSE, "close")
 DEFSTR(STR_QUERY, "/query")
+DEFSTR(STR_CACHEMAP, "/cachemap")
 DEFSTR(STR_Q, "q")
+DEFSTR(STR_ID, "id")
 
 static inline bool equals(struct string *s1,struct string *s2)
 {
@@ -81,6 +84,7 @@ static struct {
 } status;
 
 static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
+static bool handleCacheMap(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
 static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive);
 static void parsePath(struct string *path, struct string *file, struct field *getv, size_t *getc);
 static bool hasHeaderValue(struct phr_header *headers, size_t numHeaders, struct string *name, struct string *value);
@@ -212,6 +216,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			// Don't care if GET or POST
 			if ( equals( &file, &STR_QUERY ) ) {
 				ok = handleStatus( sock, permissions, getv, getc, keepAlive );
+			} else if ( equals( &file, &STR_CACHEMAP ) ) {
+				ok = handleCacheMap( sock, permissions, getv, getc, keepAlive );
 			} else {
 				ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive );
 			}
@@ -342,6 +348,44 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
 	return ok;
 }
 
+static bool handleCacheMap(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive)
+{
+	if ( !(permissions & ACL_IMAGE_LIST) ) {
+		return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access image list", -1, keepAlive );
+	}
+	int imgId = -1;
+	static const char one = 0xff;
+	for (size_t i = 0; i < fields_num; ++i) {
+		if ( equals( &fields[i].name, &STR_ID ) ) {
+			char *broken;
+			imgId = strtol( fields[i].value.s, &broken, 10 );
+			if ( broken != fields[i].value.s )
+				break;
+			imgId = -1;
+		}
+	}
+	if ( imgId == -1 )
+		return sendReply( sock, "400 Bad Request", "text/plain", "Missing parameter 'id'", -1, keepAlive );
+	dnbd3_image_t *image = image_byId( imgId );
+	if ( image == NULL )
+		return sendReply( sock, "404 Not found", "text/plain", "Image not found", -1, keepAlive );
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	image_release( image );
+	int len;
+	const char *map;
+	if ( cache == NULL ) {
+		map = &one;
+		len = 1;
+	} else {
+		_Static_assert( sizeof(const char) == sizeof(_Atomic uint8_t), "Atomic assumption exploded" );
+		map = (const char*)cache->map;
+		len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	}
+	bool ok = sendReply( sock, "200 OK", "application/octet-stream", map, len, keepAlive );
+	ref_put( &cache->reference );
+	return ok;
+}
+
 static bool sendReply(int sock, const char *status, const char *ctype, const char *payload, ssize_t plen, int keepAlive)
 {
 	if ( plen == -1 ) plen = strlen( payload );
-- 
cgit v1.2.3-55-g7522


From 22bbdabe886aad1a776be1c25989d418d4a19cd0 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 7 Sep 2019 10:05:49 +0200
Subject: [SERVER] picohttpparser: Merge
 81fe3d99fd90a55cafb993e53fd3000dbc4d564c

---
 src/server/picohttpparser/picohttpparser.c | 67 ++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 21 deletions(-)

(limited to 'src')

diff --git a/src/server/picohttpparser/picohttpparser.c b/src/server/picohttpparser/picohttpparser.c
index cfa05ef..f077016 100644
--- a/src/server/picohttpparser/picohttpparser.c
+++ b/src/server/picohttpparser/picohttpparser.c
@@ -36,8 +36,6 @@
 #endif
 #include "picohttpparser.h"
 
-/* $Id$ */
-
 #if __GNUC__ >= 3
 #define likely(x) __builtin_expect(!!(x), 1)
 #define unlikely(x) __builtin_expect(!!(x), 0)
@@ -73,9 +71,9 @@
 #define ADVANCE_TOKEN(tok, toklen)                                                                                                 \
     do {                                                                                                                           \
         const char *tok_start = buf;                                                                                               \
-        static const char ALIGNED(16) ranges2[] = "\000\040\177\177";                                                              \
+        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";                                                            \
         int found2;                                                                                                                \
-        buf = findchar_fast(buf, buf_end, ranges2, sizeof(ranges2) - 1, &found2);                                                  \
+        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2);                                                                    \
         if (!found2) {                                                                                                             \
             CHECK_EOF();                                                                                                           \
         }                                                                                                                          \
@@ -138,15 +136,11 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, struct
     const char *token_start = buf;
 
 #ifdef __SSE4_2__
-    static const char ranges1[] = "\0\010"
-                                  /* allow HT */
-                                  "\012\037"
-                                  /* allow SP and up to but not including DEL */
-                                  "\177\177"
-        /* allow chars w. MSB set */
-        ;
+    static const char ALIGNED(16) ranges1[16] = "\0\010"    /* allow HT */
+                                                "\012\037"  /* allow SP and up to but not including DEL */
+                                                "\177\177"; /* allow chars w. MSB set */
     int found;
-    buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found);
+    buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
 #else
@@ -325,9 +319,21 @@ static const char *parse_headers(const char *buf, const char *buf_end, struct ph
             headers[*num_headers].name.s = NULL;
             headers[*num_headers].name.l = 0;
         }
-        if ((buf = get_token_to_eol(buf, buf_end, &headers[*num_headers].value, ret)) == NULL) {
+        struct string value;
+        /* parse into a temporary so trailing SPs and HTABs can be stripped before storing */
+        if ((buf = get_token_to_eol(buf, buf_end, &value, ret)) == NULL) {
             return NULL;
         }
+        /* remove trailing SPs and HTABs */
+        const char *value_end = value.s + value.l;
+        for (; value_end != value.s; --value_end) {
+            const char c = *(value_end - 1);
+            if (!(c == ' ' || c == '\t')) {
+                break;
+            }
+        }
+        headers[*num_headers].value.s = value.s;
+        headers[*num_headers].value.l = value_end - value.s;
     }
     return buf;
 }
@@ -347,9 +353,17 @@ static const char *parse_request(const char *buf, const char *buf_end, struct st
 
     /* parse request line */
     ADVANCE_TOKEN(method->s, method->l);
-    ++buf;
+    do {
+        ++buf;
+    } while (*buf == ' ');
     ADVANCE_TOKEN(path->s, path->l);
-    ++buf;
+    do {
+        ++buf;
+    } while (*buf == ' ');
+    if (method->l == 0 || path->l == 0) {
+        *ret = -1;
+        return NULL;
+    }
     if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) {
         return NULL;
     }
@@ -402,10 +416,13 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
         return NULL;
     }
     /* skip space */
-    if (*buf++ != ' ') {
+    if (*buf != ' ') {
         *ret = -1;
         return NULL;
     }
+    do {
+        ++buf;
+    } while (*buf == ' ');
     /* parse status code, we want at least [:digit:][:digit:][:digit:]<other char> to try to parse */
     if (buf_end - buf < 4) {
         *ret = -2;
@@ -413,13 +430,21 @@ static const char *parse_response(const char *buf, const char *buf_end, int *min
     }
     PARSE_INT_3(status);
 
-    /* skip space */
-    if (*buf++ != ' ') {
-        *ret = -1;
+    /* get message including preceding space */
+    if ((buf = get_token_to_eol(buf, buf_end, msg, ret)) == NULL) {
         return NULL;
     }
-    /* get message */
-    if ((buf = get_token_to_eol(buf, buf_end, msg, ret)) == NULL) {
+    if (msg->l == 0) {
+        /* ok */
+    } else if (*msg->s == ' ') {
+        /* remove preceding space */
+        do {
+            ++msg->s;
+            --msg->l;
+        } while (*msg->s == ' ');
+    } else {
+        /* garbage found after status code */
+        *ret = -1;
         return NULL;
     }
 
-- 
cgit v1.2.3-55-g7522


From ebe7d990aa6e2c42ddc8475be5ea65ce2a96605a Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 7 Sep 2019 10:09:11 +0200
Subject: [SERVER] Fix warning on clang

---
 src/server/uplink.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index dab5c27..f39e633 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -1029,12 +1029,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
  */
 static int uplink_sendKeepalive(const int fd)
 {
-	static dnbd3_request_t request = { 0 };
-	if ( request.magic == 0 ) {
-		request.magic = dnbd3_packet_magic;
-		request.cmd = CMD_KEEPALIVE;
-		fixup_request( request );
-	}
+	static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
 	return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
 }
 
-- 
cgit v1.2.3-55-g7522


From c2209762259426a8e5a25a6789711f76b4dca569 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 9 Sep 2019 17:43:20 +0200
Subject: Fix compilation on older gcc

---
 CMakeLists.txt | 2 +-
 src/types.h    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 26d4d38..1e75f2a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,7 +42,7 @@ if(CMAKE_C_COMPILER MATCHES "clang")
 	SET(CMAKE_C_FLAGS_RELEASE " -O3 -Wno-unused-result -DNDEBUG")
 elseif (CMAKE_C_COMPILER MATCHES "(cc-)|(cc$)")
 	message( "Using (g)cc flags." )
-	SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstack-clash-protection -mmitigate-rop")
+	SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mmitigate-rop")
 	SET(CMAKE_C_FLAGS_DEBUG " -O0 -g -Wall -Wextra -Wpedantic -Wconversion -Wno-sign-conversion -D_DEBUG")
 	SET(CMAKE_C_FLAGS_RELEASE " -O3 -Wno-unused-result -DNDEBUG")
 else()
diff --git a/src/types.h b/src/types.h
index ec37d9b..695d81d 100644
--- a/src/types.h
+++ b/src/types.h
@@ -77,7 +77,7 @@
 #define IOCTL_REM_SRV	_IO(0xab, 5)
 
 #if defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
-static const uint16_t dnbd3_packet_magic = (0x73 << 8) | (0x72);
+#define dnbd3_packet_magic ((uint16_t)( (0x73 << 8) | (0x72) ))
 // Flip bytes around on big endian when putting stuff on the net
 #define net_order_64(a) ((uint64_t)((((a) & 0xFFull) << 56) | (((a) & 0xFF00ull) << 40) | (((a) & 0xFF0000ull) << 24) | (((a) & 0xFF000000ull) << 8) | (((a) & 0xFF00000000ull) >> 8) | (((a) & 0xFF0000000000ull) >> 24) | (((a) & 0xFF000000000000ull) >> 40) | (((a) & 0xFF00000000000000ull) >> 56)))
 #define net_order_32(a) ((uint32_t)((((a) & (uint32_t)0xFF) << 24) | (((a) & (uint32_t)0xFF00) << 8) | (((a) & (uint32_t)0xFF0000) >> 8) | (((a) & (uint32_t)0xFF000000) >> 24)))
@@ -96,7 +96,7 @@ static const uint16_t dnbd3_packet_magic = (0x73 << 8) | (0x72);
 #define BIG_ENDIAN
 #endif
 #elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64)
-static const uint16_t dnbd3_packet_magic = (0x73) | (0x72 << 8);
+#define dnbd3_packet_magic ((uint16_t)( (0x73) | (0x72 << 8) ))
 // Make little endian our network byte order as probably 99.999% of machines this will be used on are LE
 #define net_order_64(a) (a)
 #define net_order_32(a) (a)
-- 
cgit v1.2.3-55-g7522


From fdcbdcac2e721d72136794bdc45c63d71799dcd5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 10 Sep 2019 17:49:05 +0200
Subject: [SERVER] Make integrity checks on startup async

---
 src/server/image.c     | 49 ++++++++++++++++++++++++-------------------------
 src/server/integrity.c | 17 ++++++++++++-----
 src/server/integrity.h |  2 +-
 3 files changed, 37 insertions(+), 31 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 5fa06d8..822a710 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -53,7 +53,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force);
 
 static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map);
+static void image_checkRandomBlocks(dnbd3_image_t *image, const int count);
 static void* closeUnusedFds(void*);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
 
@@ -161,7 +161,7 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 		for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) {
 			const int block = (int)( pos / HASH_BLOCK_SIZE );
 			if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) {
-				integrity_check( image, block );
+				integrity_check( image, block, false );
 			}
 		}
 	}
@@ -846,19 +846,10 @@ static bool image_load(char *base, char *path, int withUplink)
 	// XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented)
 
 	// 2. Load CRC-32 list of image
-	bool doFullCheck = false;
 	uint32_t masterCrc = 0;
 	const int hashBlockCount = IMGSIZE_TO_HASHBLOCKS( virtualFilesize );
 	crc32list = image_loadCrcList( path, virtualFilesize, &masterCrc );
 
-	// Check CRC32
-	if ( crc32list != NULL ) {
-		if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache != NULL ? cache->map : NULL ) ) {
-			logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path );
-			doFullCheck = true;
-		}
-	}
-
 	// Compare data just loaded to identical image we apparently already loaded
 	if ( existing != NULL ) {
 		if ( existing->realFilesize != realFilesize ) {
@@ -943,6 +934,8 @@ static bool image_load(char *base, char *path, int withUplink)
 	if ( image_addToList( image ) ) {
 		// Keep fd for reading
 		fdImage = -1;
+		// Check CRC32
+		image_checkRandomBlocks( image, 4 );
 	} else {
 		logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
 		image->readFd = -1; // Keep fdImage instead, will be closed below
@@ -950,12 +943,6 @@ static bool image_load(char *base, char *path, int withUplink)
 		goto load_error;
 	}
 	logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid );
-	// CRC errors found...
-	if ( doFullCheck ) {
-		logadd( LOG_INFO, "Queueing full CRC32 check for '%s:%d'\n", image->name, (int)image->rid );
-		integrity_check( image, -1 );
-	}
-
 	function_return = true;
 
 	// Clean exit:
@@ -1038,18 +1025,26 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
 	return retval;
 }
 
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map)
+static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
 {
+	if ( image->crc32 == NULL )
+		return;
 	// This checks the first block and (up to) count - 1 random blocks for corruption
 	// via the known crc32 list. This is very sloppy and is merely supposed to detect
 	// accidental corruption due to broken dnbd3-proxy functionality or file system
-	// corruption.
+	// corruption, or people replacing/updating images which is a very stupid thing.
 	assert( count > 0 );
-	const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( realFilesize );
-	int blocks[count + 1];
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize );
+	int blocks[count];
 	int index = 0, j;
 	int block;
-	if ( image_isHashBlockComplete( cache_map, 0, realFilesize ) ) blocks[index++] = 0;
+	if ( image_isHashBlockComplete( cache->map, 0, image->virtualFilesize ) ) {
+		blocks[index++] = 0;
+	}
+	if ( hashBlocks > 1 && image_isHashBlockComplete( cache->map, hashBlocks - 1, image->virtualFilesize ) ) {
+		blocks[index++] = hashBlocks - 1;
+	}
 	int tries = count * 5; // Try only so many times to find a non-duplicate complete block
 	while ( index + 1 < count && --tries > 0 ) {
 		block = rand() % hashBlocks; // Random block
@@ -1057,11 +1052,15 @@ static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t
 			if ( blocks[j] == block ) goto while_end;
 		}
 		// Block complete? If yes, add to list
-		if ( image_isHashBlockComplete( cache_map, block, realFilesize ) ) blocks[index++] = block;
+		if ( image_isHashBlockComplete( cache->map, block, image->virtualFilesize ) ) {
+			blocks[index++] = block;
+		}
 while_end: ;
 	}
-	blocks[MIN(index, count)] = -1; // End of array has to be marked by a -1
-	return image_checkBlocksCrc32( fdImage, crc32list, blocks, realFilesize ); // Return result of check
+	ref_put( &cache->reference );
+	for ( int i = 0; i < index; ++i ) {
+		integrity_check( image, blocks[i], true );
+	}
 }
 
 /**
diff --git a/src/server/integrity.c b/src/server/integrity.c
index fddb755..2058104 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -78,15 +78,17 @@ void integrity_shutdown()
  * make sure it is before calling, otherwise it will result in falsely
  * detected corruption.
  */
-void integrity_check(dnbd3_image_t *image, int block)
+void integrity_check(dnbd3_image_t *image, int block, bool blocking)
 {
+	int freeSlot;
 	if ( !bRunning ) {
 		logadd( LOG_MINOR, "Ignoring check request; thread not running..." );
 		return;
 	}
-	int i, freeSlot = -1;
+start_over:
+	freeSlot = -1;
 	mutex_lock( &integrityQueueLock );
-	for (i = 0; i < queueLen; ++i) {
+	for (int i = 0; i < queueLen; ++i) {
 		if ( freeSlot == -1 && checkQueue[i].image == NULL ) {
 			freeSlot = i;
 		} else if ( checkQueue[i].image == image && checkQueue[i].block <= block ) {
@@ -105,8 +107,13 @@ void integrity_check(dnbd3_image_t *image, int block)
 		}
 	}
 	if ( freeSlot == -1 ) {
-		if ( queueLen >= CHECK_QUEUE_SIZE ) {
+		if ( unlikely( queueLen >= CHECK_QUEUE_SIZE ) ) {
 			mutex_unlock( &integrityQueueLock );
+			if ( blocking ) {
+				logadd( LOG_INFO, "Check queue full, waiting a couple seconds...\n" );
+				sleep( 3 );
+				goto start_over;
+			}
 			logadd( LOG_INFO, "Check queue full, discarding check request...\n" );
 			return;
 		}
@@ -206,7 +213,7 @@ static void* integrity_main(void * data UNUSED)
 						// If this is not a full check, queue one
 						if ( qCount != CHECK_ALL ) {
 							logadd( LOG_INFO, "Queueing full check for %s", image->name );
-							integrity_check( image, -1 );
+							integrity_check( image, -1, false );
 						}
 						foundCorrupted = true;
 					}
diff --git a/src/server/integrity.h b/src/server/integrity.h
index c3c2b44..09d3785 100644
--- a/src/server/integrity.h
+++ b/src/server/integrity.h
@@ -7,6 +7,6 @@ void integrity_init();
 
 void integrity_shutdown();
 
-void integrity_check(dnbd3_image_t *image, int block);
+void integrity_check(dnbd3_image_t *image, int block, bool blocking);
 
 #endif /* INTEGRITY_H_ */
-- 
cgit v1.2.3-55-g7522


From ea4b63c25882603e3583921c4d18c448293b6125 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 11 Sep 2019 22:00:59 +0200
Subject: [SERVER] Make buffer when reading for crc check larger

---
 src/server/image.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 822a710..6259e38 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1643,7 +1643,7 @@ bool image_checkBlocksCrc32(const int fd, uint32_t *crc32list, const int *blocks
 static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc)
 {
 	// Make buffer 4k aligned in case fd has O_DIRECT set
-#define BSIZE 262144
+#define BSIZE (512*1024)
 	char rawBuffer[BSIZE + DNBD3_BLOCK_SIZE];
 	char * const buffer = (char*)( ( (uintptr_t)rawBuffer + ( DNBD3_BLOCK_SIZE - 1 ) ) & ~( DNBD3_BLOCK_SIZE - 1 ) );
 	// How many bytes to read from the input file
-- 
cgit v1.2.3-55-g7522


From 11411f822e5526f8fa3ce47f4315557dd0915ddf Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 11 Sep 2019 22:01:36 +0200
Subject: [*] Use __attribute__((packed)) instead of #pragma pack

---
 src/types.h | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/types.h b/src/types.h
index 695d81d..cb0ccfd 100644
--- a/src/types.h
+++ b/src/types.h
@@ -117,17 +117,14 @@ static const dnbd3_af HOST_NONE = (dnbd3_af)0;
 static const dnbd3_af HOST_IP4 = (dnbd3_af)2;
 static const dnbd3_af HOST_IP6 = (dnbd3_af)10;
 
-#pragma pack(1)
-typedef struct dnbd3_host_t
+typedef struct __attribute__((packed)) dnbd3_host_t
 {
 	uint8_t addr[16];    // 16byte (network representation, so it can be directly passed to socket functions)
 	uint16_t port;       // 2byte (network representation, so it can be directly passed to socket functions)
 	dnbd3_af type;        // 1byte (ip version. HOST_IP4 or HOST_IP6. 0 means this struct is empty and should be ignored)
 } dnbd3_host_t;
-#pragma pack(0)
 
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
 {
 	uint16_t len;
 	dnbd3_host_t host;
@@ -137,7 +134,6 @@ typedef struct
 	int read_ahead_kb;
 	uint8_t use_server_provided_alts;
 } dnbd3_ioctl_t;
-#pragma pack(0)
 
 // network
 #define CMD_GET_BLOCK           1
@@ -150,8 +146,7 @@ typedef struct
 #define CMD_GET_CRC32           8
 
 #define DNBD3_REQUEST_SIZE     24
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
 {
 	uint16_t magic;           // 2byte
 	uint16_t cmd;             // 2byte
@@ -170,27 +165,22 @@ typedef struct
 	};
 	uint64_t handle;          // 8byte
 } dnbd3_request_t;
-#pragma pack(0)
 _Static_assert( sizeof(dnbd3_request_t) == DNBD3_REQUEST_SIZE, "dnbd3_request_t is messed up" );
 
 #define DNBD3_REPLY_SIZE       16
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
 {
 	uint16_t magic;		// 2byte
 	uint16_t cmd;		// 2byte
 	uint32_t size;		// 4byte
 	uint64_t handle;	// 8byte
 } dnbd3_reply_t;
-#pragma pack(0)
 _Static_assert( sizeof(dnbd3_reply_t) == DNBD3_REPLY_SIZE, "dnbd3_reply_t is messed up" );
 
-#pragma pack(1)
-typedef struct
+typedef struct __attribute__((packed))
 {
 	dnbd3_host_t host;
 	uint8_t  failures;		// 1byte (number of times server has been consecutively unreachable)
 } dnbd3_server_entry_t;
-#pragma pack(0)
 
 #endif /* TYPES_H_ */
-- 
cgit v1.2.3-55-g7522


From f6d5dad8cd50390bd25b22d70871a89b6d7af268 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 11 Sep 2019 22:15:22 +0200
Subject: [SERVER] rpc: Fix warnings

---
 src/server/rpc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/server/rpc.c b/src/server/rpc.c
index 548c80f..5daf20c 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -354,11 +354,11 @@ static bool handleCacheMap(int sock, int permissions, struct field *fields, size
 		return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access image list", -1, keepAlive );
 	}
 	int imgId = -1;
-	static const char one = 0xff;
+	static const char one = (char)0xff;
 	for (size_t i = 0; i < fields_num; ++i) {
 		if ( equals( &fields[i].name, &STR_ID ) ) {
 			char *broken;
-			imgId = strtol( fields[i].value.s, &broken, 10 );
+			imgId = (int)strtol( fields[i].value.s, &broken, 10 );
 			if ( broken != fields[i].value.s )
 				break;
 			imgId = -1;
-- 
cgit v1.2.3-55-g7522


From 53fbcc89f027992e29c96086dd32eb624e181eac Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 17 Sep 2019 14:56:03 +0200
Subject: [*] Fix/simplify checks for linux

---
 src/server/integrity.c | 4 ++--
 src/shared/fdsignal.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/server/integrity.c b/src/server/integrity.c
index 2058104..1fbd9dc 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -136,7 +136,7 @@ static void* integrity_main(void * data UNUSED)
 	int i;
 	setThreadName( "image-check" );
 	blockNoncriticalSignals();
-#if defined(linux) || defined(__linux)
+#if defined(__linux__)
 	// Setting nice of this thread - this is not POSIX conforming, so check if other platforms support this.
 	// POSIX says that setpriority() should set the nice value of all threads belonging to the current process,
 	// but on linux you can do this per thread.
@@ -291,7 +291,7 @@ static void flushFileRange(dnbd3_image_t *image, uint64_t start, uint64_t end)
 	}
 	if ( flushFd == -1 )
 		return;
-#if defined(linux) || defined(__linux)
+#if defined(__linux__)
 	while ( sync_file_range( flushFd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
 #else
 	while ( fsync( flushFd ) == -1 ) // TODO: fdatasync() should be available since FreeBSD 12.0 ... Might be a tad bit faster
diff --git a/src/shared/fdsignal.c b/src/shared/fdsignal.c
index 5e5cf7f..087b6f1 100644
--- a/src/shared/fdsignal.c
+++ b/src/shared/fdsignal.c
@@ -1,6 +1,6 @@
 #include "fdsignal.h"
 
-#if defined(linux) || defined(__linux) || defined(__linux__)
+#if defined(__linux__)
 //#warning "Using eventfd based signalling"
 #include "fdsignal.inc/eventfd.c"
 #elif __SIZEOF_INT__ == 4 && __SIZEOF_POINTER__ == 8
-- 
cgit v1.2.3-55-g7522


From 3d0c89fccf14599d156696d74224a4fbe0787777 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 29 Oct 2019 17:55:20 +0100
Subject: [SERVER] Fix checking images without cache map

---
 src/server/image.c     | 18 +++++++++++-------
 src/server/image.h     |  2 +-
 src/server/integrity.c |  4 ++--
 3 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 6259e38..9581a92 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -160,7 +160,7 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 		end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1);
 		for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) {
 			const int block = (int)( pos / HASH_BLOCK_SIZE );
-			if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) {
+			if ( image_isHashBlockComplete( cache, block, image->realFilesize ) ) {
 				integrity_check( image, block, false );
 			}
 		}
@@ -651,9 +651,11 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	return NULL ;
 }
 
-bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
+bool image_isHashBlockComplete(dnbd3_cache_map_t * const cache, const uint64_t block, const uint64_t realFilesize)
 {
-	if ( cacheMap == NULL ) return true;
+	if ( cache == NULL )
+		return true;
+	const atomic_uint_least8_t *cacheMap = cache->map;
 	const uint64_t end = (block + 1) * HASH_BLOCK_SIZE;
 	if ( end <= realFilesize ) {
 		// Trivial case: block in question is not the last block (well, or image size is multiple of HASH_BLOCK_SIZE)
@@ -1039,10 +1041,10 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
 	int blocks[count];
 	int index = 0, j;
 	int block;
-	if ( image_isHashBlockComplete( cache->map, 0, image->virtualFilesize ) ) {
+	if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) {
 		blocks[index++] = 0;
 	}
-	if ( hashBlocks > 1 && image_isHashBlockComplete( cache->map, hashBlocks - 1, image->virtualFilesize ) ) {
+	if ( hashBlocks > 1 && image_isHashBlockComplete( cache, hashBlocks - 1, image->virtualFilesize ) ) {
 		blocks[index++] = hashBlocks - 1;
 	}
 	int tries = count * 5; // Try only so many times to find a non-duplicate complete block
@@ -1052,12 +1054,14 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
 			if ( blocks[j] == block ) goto while_end;
 		}
 		// Block complete? If yes, add to list
-		if ( image_isHashBlockComplete( cache->map, block, image->virtualFilesize ) ) {
+		if ( image_isHashBlockComplete( cache, block, image->virtualFilesize ) ) {
 			blocks[index++] = block;
 		}
 while_end: ;
 	}
-	ref_put( &cache->reference );
+	if ( cache != NULL ) {
+		ref_put( &cache->reference );
+	}
 	for ( int i = 0; i < index; ++i ) {
 		integrity_check( image, blocks[i], true );
 	}
diff --git a/src/server/image.h b/src/server/image.h
index 449e31f..89791fc 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -9,7 +9,7 @@ void image_serverStartup();
 
 bool image_isComplete(dnbd3_image_t *image);
 
-bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
+bool image_isHashBlockComplete(dnbd3_cache_map_t * const cache, const uint64_t block, const uint64_t fileSize);
 
 void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set);
 
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 1fbd9dc..4006dfc 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -174,7 +174,7 @@ static void* integrity_main(void * data UNUSED)
 						dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 						if ( cache != NULL ) {
 							// When checking full image, skip incomplete blocks, otherwise assume block is complete
-							complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
+							complete = image_isHashBlockComplete( cache, blocks[0], fileSize );
 							ref_put( &cache->reference );
 						}
 					}
@@ -205,7 +205,7 @@ static void* integrity_main(void * data UNUSED)
 						bool iscomplete = true;
 						dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 						if ( cache != NULL ) {
-							iscomplete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
+							iscomplete = image_isHashBlockComplete( cache, blocks[0], fileSize );
 							ref_put( &cache->reference );
 						}
 						logadd( LOG_WARNING, "Hash check for block %d of %s failed (complete: was: %d, is: %d)", blocks[0], image->name, (int)complete, (int)iscomplete );
-- 
cgit v1.2.3-55-g7522


From f700b99a36c8094af0e311b23a2f725120f180ac Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 30 Oct 2019 12:30:48 +0100
Subject: [SERVER] Fix another nullpointer access

---
 src/server/rpc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/rpc.c b/src/server/rpc.c
index 5daf20c..a454d6d 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -382,7 +382,9 @@ static bool handleCacheMap(int sock, int permissions, struct field *fields, size
 		len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	}
 	bool ok = sendReply( sock, "200 OK", "application/octet-stream", map, len, keepAlive );
-	ref_put( &cache->reference );
+	if ( cache != NULL ) {
+		ref_put( &cache->reference );
+	}
 	return ok;
 }
 
-- 
cgit v1.2.3-55-g7522


From 9c482e75728556f969132eee8b0a402a508e46a6 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 28 Jan 2020 16:43:46 +0100
Subject: [SERVER] Lookup image on storage even in proxy mode

In proxy mode, when rid 0 is requested, we now first query
our uplink servers for the latest revision. If this fails,
we fall back to looking up the latest revision on disk, just
like in non-proxy mode.
---
 src/server/image.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 9581a92..16dae45 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1143,8 +1143,7 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
  * revision 0 is requested, it will:
  * a) Try to clone it from an authoritative dnbd3 server, if
  *    the server is running in proxy mode.
- * b) Try to load it from disk by constructing the appropriate file name, if not
- *    running in proxy mode.
+ * b) Try to load it from disk by constructing the appropriate file name.
  *
  *  If the return value is not NULL,
  * image_release needs to be called on the image at some point.
@@ -1152,21 +1151,25 @@ static dnbd3_image_t *loadImageServer(char * const name, const uint16_t requeste
  */
 dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
 {
+	dnbd3_image_t *image;
 	// specific revision - try shortcut
 	if ( revision != 0 ) {
-		dnbd3_image_t *image = image_get( name, revision, true );
-		if ( image != NULL ) return image;
+		image = image_get( name, revision, true );
+		if ( image != NULL )
+			return image;
 	}
 	const size_t len = strlen( name );
 	// Sanity check
 	if ( len == 0 || name[len - 1] == '/' || name[0] == '/'
 			|| name[0] == '.' || strstr( name, "/." ) != NULL ) return NULL;
-	// Call specific function depending on whether this is a proxy or not
+	// If in proxy mode, check with upstream server first
 	if ( _isProxy ) {
-		return loadImageProxy( name, revision, len );
-	} else {
-		return loadImageServer( name, revision );
+		image = loadImageProxy( name, revision, len );
+		if ( image != NULL )
+			return image;
 	}
+	// Lookup on local storage
+	return loadImageServer( name, revision );
 }
 
 /**
-- 
cgit v1.2.3-55-g7522


From dd0880b8ee67f9a69802a2a3ef26cd5df6881129 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 24 Feb 2020 14:13:01 +0100
Subject: [SERVER] Introduce ignoreAllocErrors

If enabled, a failed fallocate will not abort image replication;
instead, image creation is retried using a sparse file.
---
 src/server/globals.c | 3 +++
 src/server/globals.h | 6 ++++++
 src/server/image.c   | 9 +++++++--
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.c b/src/server/globals.c
index f8c3f66..2e87400 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -21,6 +21,7 @@ atomic_int _backgroundReplication = BGR_FULL;
 atomic_int _bgrMinClients = 0;
 atomic_bool _lookupMissingForProxy = true;
 atomic_bool _sparseFiles = false;
+atomic_bool _ignoreAllocErrors = false;
 atomic_bool _removeMissingImages = true;
 atomic_int _uplinkTimeout = SOCKET_TIMEOUT_UPLINK;
 atomic_int _clientTimeout = SOCKET_TIMEOUT_CLIENT;
@@ -75,6 +76,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
 	SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
 	SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
 	SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
+	SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors );
 	SAVE_TO_VAR_BOOL( dnbd3, removeMissingImages );
 	SAVE_TO_VAR_BOOL( dnbd3, closeUnusedFd );
 	SAVE_TO_VAR_UINT( dnbd3, serverPenalty );
@@ -322,6 +324,7 @@ size_t globals_dumpConfig(char *buffer, size_t size)
 	PINT(bgrMinClients);
 	PBOOL(lookupMissingForProxy);
 	PBOOL(sparseFiles);
+	PBOOL(ignoreAllocErrors);
 	PBOOL(removeMissingImages);
 	PINT(uplinkTimeout);
 	PINT(clientTimeout);
diff --git a/src/server/globals.h b/src/server/globals.h
index df8c595..b1336dc 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -254,6 +254,12 @@ extern atomic_bool _lookupMissingForProxy;
  */
 extern atomic_bool _sparseFiles;
 
+/**
+ * If true, don't abort image replication if preallocating
+ * the image fails, but retry with sparse file.
+ */
+extern atomic_bool _ignoreAllocErrors;
+
 /**
  * Port to listen on (default: #define PORT (5003))
  */
diff --git a/src/server/image.c b/src/server/image.c
index 16dae45..6017e59 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1111,14 +1111,19 @@ bool image_create(char *image, int revision, uint64_t size)
 		logadd( LOG_DEBUG1, "Could not allocate %d bytes for %s (errno=%d)", mapsize, cache, err );
 	}
 	// Now write image
+	bool fallback = false;
 	if ( !_sparseFiles && !file_alloc( fdImage, 0, size ) ) {
 		logadd( LOG_ERROR, "Could not allocate %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
 		logadd( LOG_ERROR, "It is highly recommended to use a file system that supports preallocating disk"
 				" space without actually writing all zeroes to the block device." );
 		logadd( LOG_ERROR, "If you cannot fix this, try setting sparseFiles=true, but don't expect"
 				" divine performance during replication." );
-		goto failure_cleanup;
-	} else if ( _sparseFiles && !file_setSize( fdImage, size ) ) {
+		if ( !_ignoreAllocErrors ) {
+			goto failure_cleanup;
+		}
+		fallback = true;
+	}
+	if ( ( _sparseFiles || fallback ) && !file_setSize( fdImage, size ) ) {
 		logadd( LOG_ERROR, "Could not create sparse file of %" PRIu64 " bytes for %s (errno=%d)", size, path, errno );
 		logadd( LOG_ERROR, "Make sure you have enough disk space, check directory permissions, fs errors etc." );
 		goto failure_cleanup;
-- 
cgit v1.2.3-55-g7522


From 45b345e637e643d4dfaa7bdc4691359f4b170218 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 26 Feb 2020 18:03:05 +0100
Subject: [SERVER] altservers: Fix missing index mapping (replication)

---
 src/server/altservers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 943345c..3fdbe0d 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -268,7 +268,7 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server
 	int idx[size];
 	int num = altservers_getListForUplink( NULL, image, idx, size, -1 );
 	for ( int i = 0; i < num; ++i ) {
-		servers[i] = altServers[i].host;
+		servers[i] = altServers[idx[i]].host;
 	}
 	return num;
 }
-- 
cgit v1.2.3-55-g7522


From 26c1ad7af0f5749c5343a5823b9c8cece885ce84 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 12:21:01 +0100
Subject: [SERVER] Remove "working" flag, introduce fine-grained flags

Tracking the "working" state of images using one boolean is insufficient
regarding the different ways in which providing an image can fail.
Introduce separate flags for different conditions, like "file not
readable", "file not writable", "no uplink server available", "file
content has changed".
---
 src/server/altservers.c |   4 -
 src/server/globals.h    |   7 +-
 src/server/image.c      | 193 +++++++++++++++++++++++++-----------------------
 src/server/integrity.c  |  20 +----
 src/server/net.c        |  17 +++--
 src/server/uplink.c     | 114 ++++++++++++++++++----------
 6 files changed, 197 insertions(+), 158 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 3fdbe0d..a6ad235 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -628,10 +628,6 @@ failed:
 		if ( best.fd != -1 ) {
 			close( best.fd );
 		}
-		if ( !image->working || uplink->cycleDetected ) {
-			image->working = true;
-			LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid );
-		}
 		uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
 		mutex_lock( &uplink->rttLock );
 		uplink->rttTestResult = RTT_DONTCHANGE;
diff --git a/src/server/globals.h b/src/server/globals.h
index b1336dc..31fbce5 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -136,7 +136,12 @@ struct _dnbd3_image
 	atomic_int completenessEstimate; // Completeness estimate in percent
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
-	atomic_bool working;   // true if image exists and completeness is == 100% or a working upstream proxy is connected
+	struct {
+		atomic_bool uplink;      // No uplink connected
+		atomic_bool write;       // Error writing to file
+		atomic_bool read;        // Error reading from file
+		atomic_bool changed;     // File disappeared or changed, thorough check required if it seems to be back
+	} problem;
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
 };
diff --git a/src/server/image.c b/src/server/image.c
index 6017e59..1ce1574 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -53,7 +53,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force);
 
 static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count);
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
 static void* closeUnusedFds(void*);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
 
@@ -239,35 +239,76 @@ bool image_isComplete(dnbd3_image_t *image)
  */
 bool image_ensureOpen(dnbd3_image_t *image)
 {
-	if ( image->readFd != -1 ) return image;
-	int newFd = open( image->path, O_RDONLY );
+	bool sizeChanged = false;
+	if ( image->readFd != -1 && !image->problem.changed )
+		return true;
+	int newFd = image->readFd == -1 ? open( image->path, O_RDONLY ) : dup( image->readFd );
 	if ( newFd == -1 ) {
-		logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+		if ( !image->problem.read ) {
+			logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+			image->problem.read = true;
+		}
 	} else {
-		// Check size
+		// Check size + read access
+		char buffer[100];
 		const off_t flen = lseek( newFd, 0, SEEK_END );
 		if ( flen == -1 ) {
-			logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+			if ( !image->problem.read ) {
+				logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+				image->problem.read = true;
+			}
 			close( newFd );
 			newFd = -1;
 		} else if ( (uint64_t)flen != image->realFilesize ) {
-			logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen );
+			if ( !image->problem.changed ) {
+				logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64,
+						image->realFilesize, (uint64_t)flen );
+			}
+			sizeChanged = true;
+		} else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) {
+			if ( !image->problem.read ) {
+				logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)",
+						(int)sizeof(buffer), image->path, errno );
+				image->problem.read = true;
+			}
 			close( newFd );
 			newFd = -1;
 		}
 	}
 	if ( newFd == -1 ) {
-		mutex_lock( &image->lock );
-		image->working = false;
-		mutex_unlock( &image->lock );
+		if ( sizeChanged ) {
+			image->problem.changed = true;
+		}
 		return false;
 	}
+
+	// Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32,
+	// but only if the size we just got above is correct.
+	if ( image->problem.changed && !sizeChanged ) {
+		if ( image->crc32 == NULL ) {
+			// Cannot verify further, hope for the best
+			image->problem.changed = false;
+			logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value",
+					image->name, (int)image->rid );
+		} else if ( image_checkRandomBlocks( image, 1, newFd ) ) {
+			// This should have checked the first block (if complete) -> All is well again
+			image->problem.changed = false;
+			logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value",
+					image->name, (int)image->rid );
+		}
+	} else {
+		image->problem.changed = sizeChanged;
+	}
+
 	mutex_lock( &image->lock );
 	if ( image->readFd == -1 ) {
 		image->readFd = newFd;
+		image->problem.read = false;
 		mutex_unlock( &image->lock );
 	} else {
-		// There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed
+		// There was a race while opening the file (happens cause not locked cause blocking),
+		// we lost the race so close new fd and proceed.
+		// *OR* we dup()'ed above for cheating when the image changed before.
 		mutex_unlock( &image->lock );
 		close( newFd );
 	}
@@ -296,7 +337,7 @@ dnbd3_image_t* image_byId(int imgId)
  * point...
  * Locks on: imageListLock, _images[].lock
  */
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
+dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen)
 {
 	int i;
 	const char *removingText = _removeMissingImages ? ", removing from list" : "";
@@ -326,84 +367,36 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 	candidate->users++;
 	mutex_unlock( &imageListLock );
 
-	// Found, see if it works
-	// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
-	// TODO: But remember size-changed images forever
-	if ( candidate->working || checkIfWorking ) {
-		// Is marked working, but might not have an fd open
-		if ( !image_ensureOpen( candidate ) ) {
-			mutex_lock( &candidate->lock );
-			timing_get( &candidate->lastWorkCheck );
-			mutex_unlock( &candidate->lock );
-			if ( _removeMissingImages ) {
-				candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller
-			}
-			return candidate;
-		}
-	}
-
-	if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state
-
-	// ...not working...
-
-	// Don't re-check too often
-	mutex_lock( &candidate->lock );
-	bool check;
-	declare_now;
-	check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS;
-	if ( check ) {
-		candidate->lastWorkCheck = now;
-	}
-	mutex_unlock( &candidate->lock );
-	if ( !check ) {
+	if ( !ensureFdOpen ) // Don't want to re-check
 		return candidate;
-	}
 
-	// reaching this point means:
-	// 1) We should check if the image is working, it might or might not be in working state right now
-	// 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.)
-	// 3) We made sure not to re-check this image too often
-
-	// Common for ro and rw images: Size check, read check
-	const off_t len = lseek( candidate->readFd, 0, SEEK_END );
-	bool reload = false;
-	if ( len == -1 ) {
-		logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
-		reload = true;
-	} else if ( (uint64_t)len != candidate->realFilesize ) {
-		logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
-				". Try sending SIGHUP to server if you know what you're doing.",
-				candidate->path, candidate->realFilesize, (uint64_t)len );
-	} else {
-		// Seek worked, file size is same, now see if we can read from file
-		char buffer[100];
-		if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
-			logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.",
-					(int)sizeof(buffer), candidate->path, errno, removingText );
-			reload = true;
-		} else if ( !candidate->working ) {
-			// Seems everything is fine again \o/
-			candidate->working = true;
-			logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid );
-		}
-	}
+	if ( image_ensureOpen( candidate ) && !candidate->problem.read )
+		return candidate; // We have a read fd and no read or changed problems
 
-	if ( reload ) {
+	// -- image could not be opened again, or is open but has problem --
+
+	if ( _removeMissingImages && !file_isReadable( candidate->path ) ) {
+		candidate = image_remove( candidate );
+		// No image_release here, the image is still returned and should be released by caller
+	} else if ( candidate->readFd != -1 ) {
+		// We cannot just close the fd as it might be in use. Make a copy and remove old entry.
+		candidate = image_remove( candidate );
 		// Could not access the image with exising fd - mark for reload which will re-open the file.
 		// make a copy of the image struct but keep the old one around. If/When it's not being used
 		// anymore, it will be freed automatically.
-		logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path );
+		logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path );
 		dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
 		img->path = strdup( candidate->path );
 		img->name = strdup( candidate->name );
 		img->virtualFilesize = candidate->virtualFilesize;
 		img->realFilesize = candidate->realFilesize;
-		img->atime = now;
+		timing_get( &img->atime );
 		img->masterCrc32 = candidate->masterCrc32;
 		img->readFd = -1;
 		img->rid = candidate->rid;
 		img->users = 1;
-		img->working = false;
+		img->problem.read = true;
+		img->problem.changed = candidate->problem.changed;
 		img->ref_cacheMap = NULL;
 		mutex_init( &img->lock, LOCK_IMAGE );
 		if ( candidate->crc32 != NULL ) {
@@ -419,18 +412,17 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		if ( image_addToList( img ) ) {
 			image_release( candidate );
 			candidate = img;
+			// Check if image is incomplete, initialize uplink
+			if ( candidate->ref_cacheMap != NULL ) {
+				uplink_init( candidate, -1, NULL, -1 );
+			}
+			// Try again with new instance
+			image_ensureOpen( candidate );
 		} else {
 			img->users = 0;
 			image_free( img );
 		}
-		// Check if image is incomplete, initialize uplink
-		if ( candidate->ref_cacheMap != NULL ) {
-			uplink_init( candidate, -1, NULL, -1 );
-		}
-		// readFd == -1 and working == FALSE at this point,
-		// this function needs some splitting up for handling as we need to run most
-		// of the above code again. for now we know that the next call for this
-		// name:rid will get ne newly inserted "img" and try to re-open the file.
+		// readFd == -1 and problem.read == true
 	}
 
 	return candidate; // We did all we can, hopefully it's working
@@ -900,7 +892,6 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->rid = (uint16_t)revision;
 	image->users = 0;
 	image->readFd = -1;
-	image->working = ( cache == NULL );
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
 	mutex_init( &image->lock, LOCK_IMAGE );
@@ -925,7 +916,7 @@ static bool image_load(char *base, char *path, int withUplink)
 
 	// Image is definitely incomplete, initialize uplink worker
 	if ( image->ref_cacheMap != NULL ) {
-		image->working = false;
+		image->problem.uplink = true;
 		if ( withUplink ) {
 			uplink_init( image, -1, NULL, -1 );
 		}
@@ -937,7 +928,7 @@ static bool image_load(char *base, char *path, int withUplink)
 		// Keep fd for reading
 		fdImage = -1;
 		// Check CRC32
-		image_checkRandomBlocks( image, 4 );
+		image_checkRandomBlocks( image, 4, -1 );
 	} else {
 		logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
 		image->readFd = -1; // Keep fdImage instead, will be closed below
@@ -1027,10 +1018,19 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
 	return retval;
 }
 
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
+/**
+ * Check up to count random blocks from given image. If fromFd is -1, the check will
+ * be run asynchronously using the integrity checker. Otherwise, the check will
+ * happen in the function and return the result of the check.
+ * @param image image to check
+ * @param count number of blocks to check (max)
+ * @param fromFd, check synchronously and use this fd for reading, -1 = async
+ * @return true = OK, false = error. Meaningless if fromFd == -1
+ */
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd)
 {
 	if ( image->crc32 == NULL )
-		return;
+		return true;
 	// This checks the first block and (up to) count - 1 random blocks for corruption
 	// via the known crc32 list. This is very sloppy and is merely supposed to detect
 	// accidental corruption due to broken dnbd3-proxy functionality or file system
@@ -1038,7 +1038,7 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
 	assert( count > 0 );
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 	const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize );
-	int blocks[count];
+	int blocks[count+1]; // +1 for "-1" in sync case
 	int index = 0, j;
 	int block;
 	if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) {
@@ -1062,9 +1062,16 @@ while_end: ;
 	if ( cache != NULL ) {
 		ref_put( &cache->reference );
 	}
-	for ( int i = 0; i < index; ++i ) {
-		integrity_check( image, blocks[i], true );
+	if ( fromFd == -1 ) {
+		// Async
+		for ( int i = 0; i < index; ++i ) {
+			integrity_check( image, blocks[i], true );
+		}
+		return true;
 	}
+	// Sync
+	blocks[index] = -1;
+	return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize );
 }
 
 /**
@@ -1306,7 +1313,7 @@ server_fail: ;
 		} else {
 			// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
 			int i = 0;
-			while ( !image->working && ++i < 100 )
+			while ( image->problem.uplink && ++i < 100 )
 				usleep( 2000 );
 		}
 	} else if ( uplinkSock != -1 ) {
@@ -1599,7 +1606,7 @@ int image_getCompletenessEstimate(dnbd3_image_t * const image)
 	assert( image != NULL );
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 	if ( cache == NULL )
-		return image->working ? 100 : 0;
+		return 100;
 	const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	if ( unlikely( len == 0 ) ) {
 		ref_put( &cache->reference );
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 4006dfc..91e53b8 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -195,9 +195,10 @@ static void* integrity_main(void * data UNUSED)
 							readFd = directFd;
 						}
 					}
-					if ( readFd == -1 ) { // Try buffered; flush to disk for that
-						image_ensureOpen( image );
-						readFd = image->readFd;
+					if ( readFd == -1 ) { // Try buffered as fallback
+						if ( image_ensureOpen( image ) && !image->problem.read ) {
+							readFd = image->readFd;
+						}
 					}
 					if ( readFd == -1 ) {
 						logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path );
@@ -237,16 +238,6 @@ static void* integrity_main(void * data UNUSED)
 					// Done with this task as nothing left
 					checkQueue[i].image = NULL;
 					if ( i + 1 == queueLen ) queueLen--;
-					// Mark as working again if applicable
-					if ( !foundCorrupted ) {
-						dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-						if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper?
-							mutex_lock( &image->lock );
-							image->working = uplink->current.fd != -1 && image->readFd != -1;
-							mutex_unlock( &image->lock );
-							ref_put( &uplink->reference );
-						}
-					}
 				} else {
 					// Still more blocks to go...
 					checkQueue[i].block = blocks[0];
@@ -254,9 +245,6 @@ static void* integrity_main(void * data UNUSED)
 			}
 			if ( foundCorrupted && !_shutdown ) {
 				// Something was fishy, make sure uplink exists
-				mutex_lock( &image->lock );
-				image->working = false;
-				mutex_unlock( &image->lock );
 				uplink_init( image, -1, NULL, -1 );
 			}
 			// Release :-)
diff --git a/src/server/net.c b/src/server/net.c
index aba4e7d..29147be 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -262,7 +262,7 @@ void* net_handleNewConnection(void *clientPtr)
 			atomic_thread_fence( memory_order_release );
 			if ( unlikely( image == NULL ) ) {
 				//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
-			} else if ( unlikely( !image->working ) ) {
+			} else if ( unlikely( image->problem.read || image->problem.changed ) ) {
 				logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
 						client->hostName, image_name, (int)rid );
 			} else {
@@ -273,8 +273,14 @@ void* net_handleNewConnection(void *clientPtr)
 					if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
-						usleep( 100000 ); // server gets a penalty and is less likely to be selected
+					if ( bOk && uplink != NULL ) {
+						if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
+							usleep( 100000 ); // server gets a penalty and is less likely to be selected
+						}
+						if ( image->problem.uplink ) {
+							// Penalize depending on completeness, if no uplink is available
+							usleep( ( 100 - image->completenessEstimate ) * 100 );
+						}
 					}
 					if ( uplink != NULL ) {
 						ref_put( &uplink->reference );
@@ -383,9 +389,8 @@ void* net_handleNewConnection(void *clientPtr)
 					ref_put( &cache->reference );
 					if ( !isCached ) {
 						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
-							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
+							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
 									client->hostName, image->name, image->rid );
-							image->working = false;
 							goto exit_client_cleanup;
 						}
 						break; // DONE, exit request.cmd switch
@@ -456,7 +461,7 @@ void* net_handleNewConnection(void *clientPtr)
 								}
 								if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) {
 									logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid );
-									image->working = false;
+									image->problem.read = true;
 								}
 							}
 							goto exit_client_cleanup;
diff --git a/src/server/uplink.c b/src/server/uplink.c
index f39e633..aba53ba 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -53,9 +53,9 @@ static void* uplink_mainloop(void *data);
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
 static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
 static void uplink_handleReceive(dnbd3_uplink_t *uplink);
-static int uplink_sendKeepalive(const int fd);
+static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink);
 static void uplink_addCrc32(dnbd3_uplink_t *uplink);
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
 static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
 static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
@@ -117,6 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
 	uplink->cycleDetected = false;
+	image->problem.uplink = true;
 	if ( sock != -1 ) {
 		uplink->better.fd = sock;
 		int index = altservers_hostToIndex( host );
@@ -371,6 +372,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
 	} else {
 		if ( unlikely( uplink->current.fd == -1 ) ) {
+			uplink->image->problem.uplink = true;
 			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 		} else {
@@ -378,12 +380,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
 			if ( hops < 200 ) ++hops;
 			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-			mutex_unlock( &uplink->sendMutex );
 			if ( unlikely( !ret ) ) {
+				uplink->image->problem.uplink = true;
+				mutex_unlock( &uplink->sendMutex );
 				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
 			} else {
 				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
 				int state;
+				mutex_unlock( &uplink->sendMutex );
 				mutex_lock( &uplink->queueLock );
 				if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
 					state = uplink->queue[freeSlot].status;
@@ -460,9 +464,9 @@ static void* uplink_mainloop(void *data)
 	}
 	while ( !_shutdown && !uplink->shutdown ) {
 		// poll()
-		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
-		if ( waitTime == 0 ) {
+		if ( uplink->rttTestResult == RTT_DOCHANGE ) {
 			// 0 means poll, since we're about to change the server
+			waitTime = 0;
 		} else {
 			declare_now;
 			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
@@ -495,7 +499,7 @@ static void* uplink_mainloop(void *data)
 			discoverFailCount = 0;
 			if ( fd != -1 ) close( fd );
 			uplink->replicationHandle = REP_NONE;
-			uplink->image->working = true;
+			uplink->image->problem.uplink = false;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
 			if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
@@ -510,6 +514,11 @@ static void* uplink_mainloop(void *data)
 			uplink_sendRequests( uplink, false );
 			uplink_sendReplicationRequest( uplink );
 			events[EV_SOCKET].events = POLLIN | POLLRDHUP;
+			if ( uplink->image->problem.uplink ) {
+				// Some of the requests above must have failed again already :-(
+				logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" );
+				uplink_connectionFailed( uplink, true );
+			}
 			timing_gets( &nextAltCheck, altCheckInterval );
 			// The rtt worker already did the handshake for our image, so there's nothing
 			// more to do here
@@ -517,6 +526,7 @@ static void* uplink_mainloop(void *data)
 		// Check events
 		// Signal
 		if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+			uplink->image->problem.uplink = true;
 			logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
 			goto cleanup;
 		} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
@@ -553,14 +563,10 @@ static void* uplink_mainloop(void *data)
 			}
 			// Keep-alive
 			if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
-				// Send keep-alive if nothing is happening
-				if ( uplink_sendKeepalive( uplink->current.fd ) ) {
-					// Re-trigger periodically, in case it requires a minimum user count
-					uplink_sendReplicationRequest( uplink );
-				} else {
+				// Send keep-alive if nothing is happening, and try to trigger background rep.
+				if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) {
 					uplink_connectionFailed( uplink, true );
-					logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
-					setThreadName( "panic-uplink" );
+					logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" );
 				}
 			}
 			// Don't keep uplink established if we're idle for too much
@@ -578,6 +584,7 @@ static void* uplink_mainloop(void *data)
 					// Quit work if image is complete
 					logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
 					setThreadName( "finished-uplink" );
+					uplink->image->problem.uplink = false;
 					goto cleanup;
 				} else {
 					// Not complete - do measurement
@@ -592,10 +599,6 @@ static void* uplink_mainloop(void *data)
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
 			if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
 				discoverFailCount++;
-				if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
-					logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
-					uplink->image->working = false;
-				}
 				if ( uplink->current.fd == -1 ) {
 					uplink->cycleDetected = false;
 				}
@@ -624,8 +627,9 @@ static void* uplink_mainloop(void *data)
 				}
 			}
 			mutex_unlock( &uplink->queueLock );
-			if ( resend )
+			if ( resend ) {
 				uplink_sendRequests( uplink, true );
+			}
 		}
 #endif
 	}
@@ -653,6 +657,9 @@ static void* uplink_mainloop(void *data)
 	return NULL ;
 }
 
+/**
+ * Only called from uplink thread.
+ */
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
 	// Scan for new requests
@@ -672,13 +679,15 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		if ( hops < 200 ) ++hops;
 		mutex_lock( &uplink->sendMutex );
 		const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-		mutex_unlock( &uplink->sendMutex );
-		if ( !ret ) {
+		if ( likely( ret ) ) {
+			mutex_unlock( &uplink->sendMutex );
+		} else {
 			// Non-critical - if the connection dropped or the server was changed
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
+			uplink->image->problem.uplink = true;
+			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			altservers_serverFailed( uplink->current.index );
 			return;
 		}
 		mutex_lock( &uplink->queueLock );
@@ -695,21 +704,27 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
  * server. This means we might request data we already have, but it makes
  * the code simpler. Worst case would be only one bit is zero, which means
  * 4kb are missing, but we will request 32kb.
+ *
+ * Only called from uplink thread, so current.fd is assumed to be valid.
+ *
+ * @return false if sending request failed, true otherwise (i.e. not necessary/disabled)
  */
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
+static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 {
-	if ( uplink == NULL || uplink->current.fd == -1 ) return;
-	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
+	if ( uplink->current.fd == -1 )
+		return false; // Should never be called in this state, consider send error
+	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
+		return true; // Don't do background replication
 	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
-		return; // Already a replication request on the wire, or no more blocks to replicate
+		return true; // Already a replication request on the wire, or no more blocks to replicate
 	dnbd3_image_t * const image = uplink->image;
-	if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
-	if ( image->users < _bgrMinClients ) return; // Not enough active users
+	if ( image->users < _bgrMinClients )
+		return true; // Not enough active users
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-	if ( cache == NULL || image->users < _bgrMinClients ) {
+	if ( cache == NULL || image->users ) {
 		// No cache map (=image complete)
 		ref_put( &cache->reference );
-		return;
+		return true;
 	}
 	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	const int lastBlockIndex = mapBytes - 1;
@@ -741,17 +756,20 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 	if ( replicationIndex == -1 ) {
 		// Replication might be complete, uplink_mainloop should take care....
 		uplink->nextReplicationIndex = -1;
-		return;
+		return true;
 	}
 	const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
 	uplink->replicationHandle = offset;
 	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
 	mutex_lock( &uplink->sendMutex );
 	bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
-	mutex_unlock( &uplink->sendMutex );
-	if ( !sendOk ) {
+	if ( likely( sendOk ) ) {
+		mutex_unlock( &uplink->sendMutex );
+	} else {
+		uplink->image->problem.uplink = true;
+		mutex_unlock( &uplink->sendMutex );
 		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
-		return;
+		return false;
 	}
 	if ( replicationIndex == lastBlockIndex ) {
 		uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
@@ -762,6 +780,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 		// Just crossed a hash block boundary, look for new candidate starting at this very index
 		uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
 	}
+	return true;
 }
 
 /**
@@ -816,6 +835,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 /**
  * Receive data from uplink server and process/dispatch
  * Locks on: uplink.lock, images[].lock
+ * Only called from uplink thread, so current.fd is assumed to be valid.
  */
 static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
@@ -990,11 +1010,14 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		mutex_lock( &uplink->queueLock );
 		const bool rep = ( uplink->queueLen == 0 );
 		mutex_unlock( &uplink->queueLock );
-		if ( rep ) uplink_sendReplicationRequest( uplink );
+		if ( rep ) {
+			if ( !uplink_sendReplicationRequest( uplink ) )
+				goto error_cleanup;
+		}
 	}
 	return;
 	// Error handling from failed receive or message parsing
-	error_cleanup: ;
+error_cleanup: ;
 	uplink_connectionFailed( uplink, true );
 }
 
@@ -1005,8 +1028,10 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
 	if ( uplink->current.fd == -1 )
 		return;
+	setThreadName( "panic-uplink" );
 	altservers_serverFailed( uplink->current.index );
 	mutex_lock( &uplink->sendMutex );
+	uplink->image->problem.uplink = true;
 	close( uplink->current.fd );
 	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
@@ -1025,14 +1050,24 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 }
 
 /**
- * Send keep alive request to server
+ * Send keep alive request to server.
+ * Called from uplink thread, current.fd must be valid.
  */
-static int uplink_sendKeepalive(const int fd)
+static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink)
 {
 	static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
-	return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+	mutex_lock( &uplink->sendMutex );
+	bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+	mutex_unlock( &uplink->sendMutex );
+	return sendOk;
 }
 
+/**
+ * Request crclist from uplink.
+ * Called from uplink thread, current.fd must be valid.
+ * FIXME This is broken as it could happen that another message arrives after sending
+ * the request. Refactor, split and move receive into general receive handler.
+ */
 static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 {
 	dnbd3_image_t *image = uplink->image;
@@ -1042,6 +1077,9 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 	uint32_t *buffer = malloc( bytes );
 	mutex_lock( &uplink->sendMutex );
 	bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes );
+	if ( !sendOk ) {
+		uplink->image->problem.uplink = true;
+	}
 	mutex_unlock( &uplink->sendMutex );
 	if ( !sendOk || bytes == 0 ) {
 		free( buffer );
-- 
cgit v1.2.3-55-g7522


From 5bc3badd013b88201da64dc970600d19451daaec Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 14:55:01 +0100
Subject: [SERVER] Also add a flag for uplink queue overload

---
 src/server/globals.h |  3 ++-
 src/server/net.c     | 10 +++-------
 src/server/uplink.c  | 11 +++++++++++
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 31fbce5..0bd6e47 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -93,7 +93,7 @@ struct _dnbd3_uplink
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
 	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
-	atomic_int queueLen;        // length of queue
+	int queueLen;               // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
 	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
@@ -141,6 +141,7 @@ struct _dnbd3_image
 		atomic_bool write;       // Error writing to file
 		atomic_bool read;        // Error reading from file
 		atomic_bool changed;     // File disappeared or changed, thorough check required if it seems to be back
+		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
diff --git a/src/server/net.c b/src/server/net.c
index 29147be..a478e0c 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -269,12 +269,11 @@ void* net_handleNewConnection(void *clientPtr)
 				// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
 				bOk = true;
 				if ( image->ref_cacheMap != NULL ) {
-					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-					if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
+					if ( image->problem.queue || image->problem.write ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					if ( bOk && uplink != NULL ) {
-						if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
+					if ( bOk ) {
+						if ( image->problem.write ) { // Wait 100ms if local caching is not working so this
 							usleep( 100000 ); // server gets a penalty and is less likely to be selected
 						}
 						if ( image->problem.uplink ) {
@@ -282,9 +281,6 @@ void* net_handleNewConnection(void *clientPtr)
 							usleep( ( 100 - image->completenessEstimate ) * 100 );
 						}
 					}
-					if ( uplink != NULL ) {
-						ref_put( &uplink->reference );
-					}
 				}
 				if ( bOk ) {
 					mutex_lock( &image->lock );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index aba53ba..97cb2a9 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -118,6 +118,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	mutex_unlock( &uplink->sendMutex );
 	uplink->cycleDetected = false;
 	image->problem.uplink = true;
+	image->problem.write = true;
+	image->problem.queue = false;
 	if ( sock != -1 ) {
 		uplink->better.fd = sock;
 		int index = altservers_hostToIndex( host );
@@ -191,6 +193,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink)
 		}
 	}
 	uplink->queueLen = 0;
+	uplink->image->problem.queue = false;
 }
 
 static void uplink_free(ref *ref)
@@ -328,6 +331,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			goto fail_lock;
 		}
 		freeSlot = uplink->queueLen++;
+		if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) {
+			uplink->image->problem.queue = true;
+		}
 	}
 	// Do not send request to uplink server if we have a matching pending request AND the request either has the
 	// status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
@@ -904,6 +910,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 						continue; // Success, retry write
 					}
 					if ( err == EBADF || err == EINVAL || err == EIO ) {
+						uplink->image->problem.write = true;
 						if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) )
 							break;
 						tryAgain = false;
@@ -983,6 +990,9 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 			}
 			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
+		if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
+			uplink->image->problem.queue = false;
+		}
 		mutex_unlock( &uplink->queueLock );
 #ifdef _DEBUG
 		if ( !served && start != uplink->replicationHandle ) {
@@ -1121,6 +1131,7 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
 		close( uplink->cacheFd );
 	}
 	uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 );
+	uplink->image->problem.write = uplink->cacheFd == -1;
 	return uplink->cacheFd != -1;
 }
 
-- 
cgit v1.2.3-55-g7522


From f9468ef42cb5e2b1779c3309b2bbbe2495418787 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 15:48:26 +0100
Subject: [SERVER] Expose image->problem bools as bitmask in RPC json data

---
 src/server/globals.h |  4 ++--
 src/server/image.c   | 13 +++++++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 0bd6e47..5de4180 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -137,10 +137,10 @@ struct _dnbd3_image
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
 	struct {
-		atomic_bool uplink;      // No uplink connected
-		atomic_bool write;       // Error writing to file
 		atomic_bool read;        // Error reading from file
+		atomic_bool write;       // Error writing to file
 		atomic_bool changed;     // File disappeared or changed, thorough check required if it seems to be back
+		atomic_bool uplink;      // No uplink connected
 		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
diff --git a/src/server/image.c b/src/server/image.c
index 1ce1574..a6aec82 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1576,14 +1576,23 @@ json_t* image_getListAsJson()
 			ref_put( &uplink->reference );
 		}
 
-		jsonImage = json_pack( "{sisssisisisisI}",
+		int problems = 0;
+#define addproblem(name,val) if (image->problem.name) problems |= (1 << val)
+		addproblem(read, 0);
+		addproblem(write, 1);
+		addproblem(changed, 2);
+		addproblem(uplink, 3);
+		addproblem(queue, 4);
+
+		jsonImage = json_pack( "{sisssisisisisIsi}",
 				"id", image->id, // id, name, rid never change, so access them without locking
 				"name", image->name,
 				"rid", (int) image->rid,
 				"users", image->users,
 				"complete",  completeness,
 				"idle", idleTime,
-				"size", (json_int_t)image->virtualFilesize );
+				"size", (json_int_t)image->virtualFilesize,
+				"problems", problems );
 		if ( bytesReceived != 0 ) {
 			json_object_set_new( jsonImage, "bytesReceived", json_integer( (json_int_t) bytesReceived ) );
 		}
-- 
cgit v1.2.3-55-g7522


From 49a9cd2d89dd586db5e08c9d3e96b88a8e8346d7 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 16:46:36 +0100
Subject: [SERVER] Optimize client handler for CMD_GET_BLOCK

Move CMD_GET_BLOCK out of switch block and mark as likely. Don't acquire
and release cache map for every single request, but keep reference
around and only release when a message other than CMD_GET_BLOCK arrives.
On idle links, this should happen through CMD_KEEPALIVE every now and
then.
---
 src/server/net.c | 68 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 25 deletions(-)

(limited to 'src')

diff --git a/src/server/net.c b/src/server/net.c
index a478e0c..0f7e169 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -207,6 +207,7 @@ void* net_handleNewConnection(void *clientPtr)
 	dnbd3_reply_t reply;
 
 	dnbd3_image_t *image = NULL;
+	dnbd3_cache_map_t *cache = NULL;
 	int image_file = -1;
 
 	int num;
@@ -315,9 +316,8 @@ void* net_handleNewConnection(void *clientPtr)
 		// client handling mainloop
 		while ( recv_request_header( client->sock, &request ) ) {
 			if ( _shutdown ) break;
-			switch ( request.cmd ) {
+			if ( likely ( request.cmd == CMD_GET_BLOCK ) ) {
 
-			case CMD_GET_BLOCK:;
 				const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
 				reply.handle = request.handle;
 				if ( unlikely( offset >= image->virtualFilesize ) ) {
@@ -326,7 +326,7 @@ void* net_handleNewConnection(void *clientPtr)
 					reply.size = 0;
 					reply.cmd = CMD_ERROR;
 					send_reply( client->sock, &reply, NULL );
-					break;
+					continue;
 				}
 				if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
 					// Sanity check
@@ -334,11 +334,14 @@ void* net_handleNewConnection(void *clientPtr)
 					reply.size = 0;
 					reply.cmd = CMD_ERROR;
 					send_reply( client->sock, &reply, NULL );
-					break;
+					continue;
+				}
+
+				if ( cache == NULL && image->uplinkref != NULL ) {
+					cache = ref_get_cachemap( image );
 				}
 
-				dnbd3_cache_map_t *cache;
-				if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) {
+				if ( request.size != 0 && cache != NULL ) {
 					// This is a proxyed image, check if we need to relay the request...
 					start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
@@ -360,36 +363,39 @@ void* net_handleNewConnection(void *clientPtr)
 					// First byte
 					if ( isCached ) {
 						b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
-						for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
-							const int map_x = (pos >> 12) & 7; // mod 8
-							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-							if ( (b & bit_mask) == 0 ) {
-								isCached = false;
-								break;
+						if ( b != 0xff ) {
+							for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
+								const int map_x = (pos >> 12) & 7; // mod 8
+								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+								if ( (b & bit_mask) == 0 ) {
+									isCached = false;
+									break;
+								}
 							}
 						}
 					}
 					// Last byte - only check if request spans multiple bytes in cache map
 					if ( isCached && firstByteInMap != lastByteInMap ) {
 						b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
-						for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
-							assert( lastByteInMap == (pos >> 15) );
-							const int map_x = (pos >> 12) & 7; // mod 8
-							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-							if ( (b & bit_mask) == 0 ) {
-								isCached = false;
-								break;
+						if ( b != 0xff ) {
+							for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+								assert( lastByteInMap == (pos >> 15) );
+								const int map_x = (pos >> 12) & 7; // mod 8
+								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+								if ( (b & bit_mask) == 0 ) {
+									isCached = false;
+									break;
+								}
 							}
 						}
 					}
-					ref_put( &cache->reference );
 					if ( !isCached ) {
 						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
 							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
 									client->hostName, image->name, image->rid );
 							goto exit_client_cleanup;
 						}
-						break; // DONE, exit request.cmd switch
+						continue; // Reply arrives on uplink some time later, handle next request now
 					}
 				}
 
@@ -474,7 +480,16 @@ void* net_handleNewConnection(void *clientPtr)
 				if ( lock ) mutex_unlock( &client->sendMutex );
 				// Global per-client counter
 				client->bytesSent += request.size; // Increase counter for statistics.
-				break;
+				continue;
+			}
+			// Any other command
+			// Release cache map every now and then, in case the image was replicated
+			// entirely. Will be re-grabbed on next CMD_GET_BLOCK otherwise.
+			if ( cache != NULL ) {
+				ref_put( &cache->reference );
+				cache = NULL;
+			}
+			switch ( request.cmd ) {
 
 			case CMD_GET_SERVERS:
 				// Build list of known working alt servers
@@ -523,9 +538,9 @@ set_name: ;
 				logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd );
 				break;
 
-			}
-		}
-	}
+			} // end switch
+		} // end loop
+	} // end bOk
 exit_client_cleanup: ;
 	// First remove from list, then add to counter to prevent race condition
 	removeFromList( client );
@@ -536,6 +551,9 @@ exit_client_cleanup: ;
 		timing_get( &image->atime );
 		mutex_unlock( &image->lock );
 	}
+	if ( cache != NULL ) {
+		ref_put( &cache->reference );
+	}
 	freeClientStruct( client ); // This will also call image_release on client->image
 	return NULL ;
 fail_preadd: ;
-- 
cgit v1.2.3-55-g7522


From 70df3aea6257f259ecc0c1921e597081eb0ab7b9 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 17:52:33 +0100
Subject: [SERVER] Get rid of two loops in image_updateCachemap

---
 src/server/image.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index a6aec82..886bf33 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -118,30 +118,26 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 	const uint64_t firstByteInMap = start >> 15;
 	const uint64_t lastByteInMap = (end - 1) >> 15;
 	uint64_t pos;
-	// First byte
-	uint8_t fb = 0, lb = 0;
-	for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
-		const int map_x = (pos >> 12) & 7; // mod 8
-		const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-		fb |= bit_mask;
-	}
-	// Last byte
-	if ( lastByteInMap != firstByteInMap ) {
-		for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
-			assert( lastByteInMap == (pos >> 15) );
-			const int map_x = (pos >> 12) & 7; // mod 8
-			const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-			lb |= bit_mask;
-		}
-	}
+	// First and last byte masks
+	const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+	const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
 	atomic_thread_fence( memory_order_acquire );
-	if ( set ) {
-		uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
-		uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
-		setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] );
+	if ( firstByteInMap != lastByteInMap ) {
+		if ( set ) {
+			uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
+			uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
+			setNewBlocks = ( fo != ( fo | fb ) || lo != ( lo | lb ) );
+		} else {
+			atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
+			atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+		}
 	} else {
-		atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
-		atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+		if ( set ) {
+			uint8_t o = atomic_fetch_or_explicit( &cache->map[firstByteInMap], (uint8_t)(fb & lb), memory_order_relaxed );
+			setNewBlocks = o != ( o | (fb & lb) );
+		} else {
+			atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~(fb & lb), memory_order_relaxed );
+		}
 	}
 	const uint8_t nval = set ? 0xff : 0;
 	// Everything in between
-- 
cgit v1.2.3-55-g7522


From 5c92010d74451a46064e85484a6969a8a2f2cf82 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 4 Mar 2020 12:17:40 +0100
Subject: [SERVER] Likewise, get rid of same loops in client handler

---
 src/server/image.c | 32 +++++++++++++++---------------
 src/server/net.c   | 58 ++++++++++++++++++++++--------------------------------
 2 files changed, 39 insertions(+), 51 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 886bf33..3583f86 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -121,8 +121,15 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 	// First and last byte masks
 	const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
 	const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
-	atomic_thread_fence( memory_order_acquire );
-	if ( firstByteInMap != lastByteInMap ) {
+	if ( firstByteInMap == lastByteInMap ) {
+		if ( set ) {
+			uint8_t o = atomic_fetch_or( &cache->map[firstByteInMap], (uint8_t)(fb & lb) );
+			setNewBlocks = o != ( o | (fb & lb) );
+		} else {
+			atomic_fetch_and( &cache->map[firstByteInMap], (uint8_t)~(fb & lb) );
+		}
+	} else {
+		atomic_thread_fence( memory_order_acquire );
 		if ( set ) {
 			uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
 			uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
@@ -131,22 +138,15 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 			atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
 			atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
 		}
-	} else {
-		if ( set ) {
-			uint8_t o = atomic_fetch_or_explicit( &cache->map[firstByteInMap], (uint8_t)(fb & lb), memory_order_relaxed );
-			setNewBlocks = o != ( o | (fb & lb) );
-		} else {
-			atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~(fb & lb), memory_order_relaxed );
-		}
-	}
-	const uint8_t nval = set ? 0xff : 0;
-	// Everything in between
-	for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
-		if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
-			setNewBlocks = true;
+		// Everything in between
+		const uint8_t nval = set ? 0xff : 0;
+		for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+			if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+				setNewBlocks = true;
+			}
 		}
+		atomic_thread_fence( memory_order_release );
 	}
-	atomic_thread_fence( memory_order_release );
 	if ( setNewBlocks && image->crc32 != NULL ) {
 		// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
 		// for checking, even though this might lead to checking some hash block again, if it was
diff --git a/src/server/net.c b/src/server/net.c
index 0f7e169..01056e0 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -216,7 +216,6 @@ void* net_handleNewConnection(void *clientPtr)
 
 	serialized_buffer_t payload;
 	uint16_t rid, client_version;
-	uint64_t start, end;
 
 	dnbd3_server_entry_t server_list[NUMBER_SERVERS];
 
@@ -343,46 +342,35 @@ void* net_handleNewConnection(void *clientPtr)
 
 				if ( request.size != 0 && cache != NULL ) {
 					// This is a proxyed image, check if we need to relay the request...
-					start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-					end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-					bool isCached = true;
+					const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+					const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					const uint64_t firstByteInMap = start >> 15;
 					const uint64_t lastByteInMap = (end - 1) >> 15;
+					const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+					const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
 					uint64_t pos;
 					uint8_t b;
-					atomic_thread_fence( memory_order_acquire );
-					// Middle - quick checking
-					if ( isCached ) {
-						for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
-							if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
-								isCached = false;
-								break;
-							}
+					bool isCached;
+					if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+						b = cache->map[firstByteInMap];
+						isCached = ( b & ( fb & lb ) ) == ( fb & lb );
+					} else {
+						isCached = true;
+						atomic_thread_fence( memory_order_acquire );
+						// First byte
+						if ( isCached ) {
+							b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+							isCached = ( ( b & fb ) == fb );
 						}
-					}
-					// First byte
-					if ( isCached ) {
-						b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
-						if ( b != 0xff ) {
-							for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (b & bit_mask) == 0 ) {
-									isCached = false;
-									break;
-								}
-							}
+						// Last byte
+						if ( isCached ) {
+							b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+							isCached = ( ( b & lb ) == lb );
 						}
-					}
-					// Last byte - only check if request spans multiple bytes in cache map
-					if ( isCached && firstByteInMap != lastByteInMap ) {
-						b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
-						if ( b != 0xff ) {
-							for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
-								assert( lastByteInMap == (pos >> 15) );
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (b & bit_mask) == 0 ) {
+						// Middle, must be all bits set (0xff)
+						if ( isCached ) {
+							for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+								if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
 									isCached = false;
 									break;
 								}
-- 
cgit v1.2.3-55-g7522


From 930b65f26cb39687a113641f56711a2d58f886ca Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 4 Mar 2020 17:49:50 +0100
Subject: [SERVER] Add timer task for saving cache maps

Cache maps will now be saved periodically, but only if either they have
a "dirty" bit set, which happens if any bits in the map get cleared
again (due to corruption), or if new data has been replicated from an
uplink server. The latter means that, since the last save, either at
least one byte has been received and 5 minutes have passed, or more
than 500MB have been downloaded. The timer currently runs every 20
seconds.
---
 src/server/altservers.c |  20 +++++++
 src/server/altservers.h |   2 +
 src/server/globals.h    |   3 +-
 src/server/image.c      | 136 +++++++++++++++++++++++++++++++++++++++++++++++-
 src/server/image.h      |   2 +
 src/server/uplink.c     |  76 ++-------------------------
 src/serverconfig.h      |   5 +-
 7 files changed, 168 insertions(+), 76 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index a6ad235..380737c 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -273,6 +273,26 @@ int altservers_getHostListForReplication(const char *image, dnbd3_host_t *server
 	return num;
 }
 
+/**
+ * Returns true if there is at least one alt-server the
+ * given image name would be allowed to be cloned from.
+ */
+bool altservers_imageHasAltServers(const char *image)
+{
+	bool ret = false;
+	mutex_lock( &altServersLock );
+	for ( int i = 0; i < numAltServers; ++i ) {
+		if ( altServers[i].isClientOnly || ( !altServers[i].isPrivate && _proxyPrivateOnly ) )
+			continue;
+		if ( !isImageAllowed( &altServers[i], image ) )
+			continue;
+		ret = true;
+		break;
+	}
+	mutex_unlock( &altServersLock );
+	return ret;
+}
+
 /**
  * Get <size> alt servers. If there are more alt servers than
  * requested, random servers will be picked.
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 8e29aaa..78f6fcc 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -19,6 +19,8 @@ int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *ou
 
 int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
 
+bool altservers_imageHasAltServers(const char *image);
+
 bool altservers_toString(int server, char *buffer, size_t len);
 
 int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
diff --git a/src/server/globals.h b/src/server/globals.h
index 5de4180..10d3ee3 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -93,6 +93,7 @@ struct _dnbd3_uplink
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
 	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
+	atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
 	int queueLen;               // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
@@ -128,7 +129,6 @@ struct _dnbd3_image
 	uint64_t virtualFilesize;   // virtual size of image (real size rounded up to multiple of 4k)
 	uint64_t realFilesize;      // actual file size on disk
 	ticks atime;                // last access time
-	ticks lastWorkCheck;   // last time a non-working image has been checked
 	ticks nextCompletenessEstimate; // next time the completeness estimate should be updated
 	uint32_t *crc32;       // list of crc32 checksums for each 16MiB block in image
 	uint32_t masterCrc32;  // CRC-32 of the crc-32 list
@@ -144,6 +144,7 @@ struct _dnbd3_image
 		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
+	atomic_bool mapDirty;  // Cache map has been modified outside uplink (only integrity checker for now)
 	pthread_mutex_t lock;
 };
 
diff --git a/src/server/image.c b/src/server/image.c
index 3583f86..5a9e15b 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -55,6 +55,8 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
 static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
 static void* closeUnusedFds(void*);
+static void* saveAllCacheMaps(void*);
+static bool saveCacheMap(dnbd3_image_t *image);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
 
 static void cmfree(ref *ref)
@@ -73,6 +75,7 @@ void image_serverStartup()
 	mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
 	mutex_init( &reloadLock, LOCK_RELOAD );
 	server_addJob( &closeUnusedFds, NULL, 10, 900 );
+	server_addJob( &saveAllCacheMaps, NULL, 9, 20 );
 }
 
 /**
@@ -160,6 +163,8 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 				integrity_check( image, block, false );
 			}
 		}
+	} else if ( !set ) {
+		image->mapDirty = true;
 	}
 	ref_put( &cache->reference );
 }
@@ -624,6 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	// this will get called again when the uplink is done.
 	if ( !uplink_shutdown( image ) )
 		return NULL;
+	saveCacheMap( image );
 	mutex_lock( &image->lock );
 	ref_setref( &image->ref_cacheMap, NULL );
 	free( image->crc32 );
@@ -1830,6 +1836,135 @@ static void* closeUnusedFds(void* nix UNUSED)
 	return NULL;
 }
 
+#define IMGCOUNT 5
+static void* saveAllCacheMaps(void* nix UNUSED)
+{
+	static ticks nextSave;
+	dnbd3_image_t *list[IMGCOUNT];
+	int count = 0;
+	declare_now;
+	bool full = timing_reached( &nextSave, &now );
+	mutex_lock( &imageListLock );
+	for ( int i = 0; i < _num_images; ++i ) {
+		dnbd3_image_t * const image = _images[i];
+		if ( image->mapDirty ) {
+			// Flag is set if integrity checker found a problem - save out
+			image->users++;
+			list[count++] = image;
+			image->mapDirty = false;
+		} else {
+			// Otherwise, consider longer timeout and byte count limits of uplink
+			dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+			if ( uplink != NULL ) {
+				assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+				uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
+				if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES
+						|| ( full && diff != 0 ) ) {
+					image->users++;
+					list[count++] = image;
+					uplink->bytesReceivedLastSave = uplink->bytesReceived;
+				}
+				ref_put( &uplink->reference );
+			}
+		}
+		if ( count == IMGCOUNT )
+			break;
+	}
+	mutex_unlock( &imageListLock );
+	if ( full && count < IMGCOUNT ) {
+		// Only update nextSave once we handled all images in the list
+		timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY );
+	}
+	for ( int i = 0; i < count; ++i ) {
+		saveCacheMap( list[i] );
+		image_release( list[i] );
+	}
+	return NULL;
+}
+#undef IMGCOUNT
+
+/**
+ * Saves the cache map of the given image.
+ * Return true on success.
+ * @param image the image
+ */
+static bool saveCacheMap(dnbd3_image_t *image)
+{
+	if ( !_isProxy )
+		return true; // Nothing to do
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL )
+		return true; // Nothing to do
+	// Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
+	// for which we have any upstream servers configured. If there's none, don't touch
+	// the cache map on disk.
+	if ( !altservers_imageHasAltServers( image->name ) ) {
+		ref_put( &cache->reference );
+		return true; // Nothing to do
+	}
+
+	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
+	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
+	char mapfile[strlen( image->path ) + 4 + 1];
+	strcpy( mapfile, image->path );
+	strcat( mapfile, ".map" );
+
+	int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
+	if ( fd == -1 ) {
+		const int err = errno;
+		ref_put( &cache->reference );
+		logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
+		return false;
+	}
+
+	// On Linux we could use readFd, but in general it's not guaranteed to work
+	int imgFd = open( image->path, O_WRONLY );
+	if ( imgFd == -1 ) {
+		logadd( LOG_WARNING, "Cannot open %s for fsync(): errno=%d", image->path, errno );
+	} else {
+		if ( fsync( imgFd ) == -1 ) {
+			logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d. Resetting cache map.", image->path, errno );
+			dnbd3_cache_map_t *old = image_loadCacheMap(image->path, image->virtualFilesize);
+			const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+			if ( old == NULL ) {
+				// Could not load old map. FS might be toast.
+				logadd( LOG_ERROR, "Cannot load old cache map. Setting all zero." );
+				memset( cache->map, 0, mapSize );
+			} else {
+				// AND the maps together to be safe
+				for ( int i = 0; i < mapSize; ++i ) {
+					cache->map[i] &= old->map[i];
+				}
+				old->reference.free( &old->reference );
+			}
+		}
+		close( imgFd );
+	}
+
+	// Write current map to file
+	size_t done = 0;
+	while ( done < size ) {
+		const ssize_t ret = write( fd, cache->map + done, size - done );
+		if ( ret == -1 ) {
+			if ( errno == EINTR ) continue;
+			logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
+			break;
+		}
+		if ( ret <= 0 ) {
+			logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
+			break;
+		}
+		done += (size_t)ret;
+	}
+	ref_put( &cache->reference );
+	if ( fsync( fd ) == -1 ) {
+		logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
+	}
+	close( fd );
+	// TODO fsync on parent directory
+	return true;
+}
+
 static void allocCacheMap(dnbd3_image_t *image, bool complete)
 {
 	const uint8_t val = complete ? 0xff : 0;
@@ -1846,4 +1981,3 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
 	}
 	mutex_unlock( &image->lock );
 }
-
diff --git a/src/server/image.h b/src/server/image.h
index 89791fc..4614c74 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -49,6 +49,8 @@ void image_closeUnusedFd();
 
 bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
 
+bool image_saveCacheMap(dnbd3_image_t *image);
+
 // one byte in the map covers 8 4kib blocks, so 32kib per byte
 // "+ (1 << 15) - 1" is required to account for the last bit of
 // the image that is smaller than 32kib
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 97cb2a9..e5ab9c0 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -57,7 +57,6 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink);
 static void uplink_addCrc32(dnbd3_uplink_t *uplink);
 static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
 static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
 static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
 
@@ -103,6 +102,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
 	uplink->image = image;
 	uplink->bytesReceived = 0;
+	uplink->bytesReceivedLastSave = 0;
 	uplink->idleTime = 0;
 	uplink->queueLen = 0;
 	uplink->cacheFd = -1;
@@ -445,7 +445,6 @@ static void* uplink_mainloop(void *data)
 	int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
 	int rttTestResult;
 	uint32_t discoverFailCount = 0;
-	uint32_t unsavedSeconds = 0;
 	ticks nextAltCheck, lastKeepalive;
 	char buffer[200];
 	memset( events, 0, sizeof(events) );
@@ -561,12 +560,6 @@ static void* uplink_mainloop(void *data)
 		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
 			lastKeepalive = now;
 			uplink->idleTime += timepassed;
-			unsavedSeconds += timepassed;
-			if ( unsavedSeconds > 240 || ( unsavedSeconds > 60 && uplink->idleTime >= 20 && uplink->idleTime <= 70 ) ) {
-				// fsync/save every 4 minutes, or every 60 seconds if uplink is idle
-				unsavedSeconds = 0;
-				uplink_saveCacheMap( uplink );
-			}
 			// Keep-alive
 			if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
 				// Send keep-alive if nothing is happening, and try to trigger background rep.
@@ -639,9 +632,9 @@ static void* uplink_mainloop(void *data)
 		}
 #endif
 	}
-	cleanup: ;
-	uplink_saveCacheMap( uplink );
+cleanup: ;
 	dnbd3_image_t *image = uplink->image;
+	image->mapDirty = true; // Force writeout of cache map
 	mutex_lock( &image->lock );
 	bool exp = false;
 	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
@@ -1135,69 +1128,6 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
 	return uplink->cacheFd != -1;
 }
 
-/**
- * Saves the cache map of the given image.
- * Return true on success.
- * Locks on: imageListLock, image.lock
- */
-static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
-{
-	dnbd3_image_t *image = uplink->image;
-	assert( image != NULL );
-
-	if ( uplink->cacheFd != -1 ) {
-		if ( fsync( uplink->cacheFd ) == -1 ) {
-			// A failing fsync means we have no guarantee that any data
-			// since the last fsync (or open if none) has been saved. Apart
-			// from keeping the cache map from the last successful fsync
-			// around and restoring it there isn't much we can do to recover
-			// a consistent state. Bail out.
-			logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
-			logadd( LOG_ERROR, "Bailing out immediately" );
-			exit( 1 );
-		}
-	}
-
-	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-	if ( cache == NULL )
-		return true;
-	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
-	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
-	assert( image->path != NULL );
-	char mapfile[strlen( image->path ) + 4 + 1];
-	strcpy( mapfile, image->path );
-	strcat( mapfile, ".map" );
-
-	int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
-	if ( fd == -1 ) {
-		const int err = errno;
-		ref_put( &cache->reference );
-		logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
-		return false;
-	}
-
-	size_t done = 0;
-	while ( done < size ) {
-		const ssize_t ret = write( fd, cache->map + done, size - done );
-		if ( ret == -1 ) {
-			if ( errno == EINTR ) continue;
-			logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
-			break;
-		}
-		if ( ret <= 0 ) {
-			logadd( LOG_WARNING, "Unexpected return value %d for write() to %s", (int)ret, mapfile );
-			break;
-		}
-		done += (size_t)ret;
-	}
-	ref_put( &cache->reference );
-	if ( fsync( fd ) == -1 ) {
-		logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
-	}
-	close( fd );
-	return true;
-}
-
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
 {
 	return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
diff --git a/src/serverconfig.h b/src/serverconfig.h
index 239f0a2..5c7301d 100644
--- a/src/serverconfig.h
+++ b/src/serverconfig.h
@@ -17,7 +17,10 @@
 #define SERVER_UPLINK_QUEUELEN_THRES  900 // Threshold where we start dropping incoming clients
 #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
 
-#define SERVER_CACHE_MAP_SAVE_INTERVAL 90
+// Wait a maximum of 5 minutes before saving cache map (if data was received at all)
+#define CACHE_MAP_MAX_SAVE_DELAY 300
+// If more than 500MB have been received from uplink without saving cache map, do so
+#define CACHE_MAP_MAX_UNSAVED_BYTES ((uint64_t)500 * 1000 * 1000)
 
 // Time in ms to wait for a read/write call to complete on an uplink connection
 #define SOCKET_TIMEOUT_UPLINK 5000
-- 
cgit v1.2.3-55-g7522


From 080a06ab22c8ac0841c06fe52ab4dbc982beafc1 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 6 Mar 2020 11:34:58 +0100
Subject: [SERVER] Reload cache maps periodically for local images

If an image is incomplete, but has no upstream server that can be used
for replication, reload the cache map from disk periodically, in case
some other server instance is writing to the image.
---
 src/server/globals.h |   3 +-
 src/server/image.c   | 129 +++++++++++++++++++++++++++++++++------------------
 src/server/uplink.c  |  10 +++-
 3 files changed, 93 insertions(+), 49 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 10d3ee3..211fe2d 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -111,6 +111,8 @@ typedef struct
 typedef struct
 {
 	ref reference;
+	atomic_bool dirty;     // Cache map has been modified outside uplink (only integrity checker for now)
+	bool unchanged;        // How many times in a row a reloaded cache map went unchanged
 	_Atomic uint8_t map[];
 } dnbd3_cache_map_t;
 
@@ -144,7 +146,6 @@ struct _dnbd3_image
 		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
-	atomic_bool mapDirty;  // Cache map has been modified outside uplink (only integrity checker for now)
 	pthread_mutex_t lock;
 };
 
diff --git a/src/server/image.c b/src/server/image.c
index 5a9e15b..7ffe041 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -55,8 +55,9 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
 static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
 static void* closeUnusedFds(void*);
-static void* saveAllCacheMaps(void*);
-static bool saveCacheMap(dnbd3_image_t *image);
+static bool imageShouldSaveCacheMap(dnbd3_image_t *image);
+static void* saveLoadAllCacheMaps(void*);
+static void saveCacheMap(dnbd3_image_t *image);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
 
 static void cmfree(ref *ref)
@@ -75,7 +76,7 @@ void image_serverStartup()
 	mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
 	mutex_init( &reloadLock, LOCK_RELOAD );
 	server_addJob( &closeUnusedFds, NULL, 10, 900 );
-	server_addJob( &saveAllCacheMaps, NULL, 9, 20 );
+	server_addJob( &saveLoadAllCacheMaps, NULL, 9, 20 );
 }
 
 /**
@@ -164,7 +165,7 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 			}
 		}
 	} else if ( !set ) {
-		image->mapDirty = true;
+		cache->dirty = true;
 	}
 	ref_put( &cache->reference );
 }
@@ -629,7 +630,9 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	// this will get called again when the uplink is done.
 	if ( !uplink_shutdown( image ) )
 		return NULL;
-	saveCacheMap( image );
+	if ( imageShouldSaveCacheMap( image ) ) {
+		saveCacheMap( image );
+	}
 	mutex_lock( &image->lock );
 	ref_setref( &image->ref_cacheMap, NULL );
 	free( image->crc32 );
@@ -1836,72 +1839,107 @@ static void* closeUnusedFds(void* nix UNUSED)
 	return NULL;
 }
 
-#define IMGCOUNT 5
-static void* saveAllCacheMaps(void* nix UNUSED)
+static bool imageShouldSaveCacheMap(dnbd3_image_t *image)
+{
+	if ( !_isProxy )
+		return false; // Nothing to do
+	if ( image->ref_cacheMap == NULL )
+		return false; // Nothing to do
+	// Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
+	// for which we have any upstream servers configured. If there's none, don't touch
+	// the cache map on disk.
+	if ( !altservers_imageHasAltServers( image->name ) )
+		return false; // Nothing to do
+	return true;
+}
+
+static void* saveLoadAllCacheMaps(void* nix UNUSED)
 {
 	static ticks nextSave;
-	dnbd3_image_t *list[IMGCOUNT];
-	int count = 0;
 	declare_now;
 	bool full = timing_reached( &nextSave, &now );
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
-		if ( image->mapDirty ) {
-			// Flag is set if integrity checker found a problem - save out
-			image->users++;
-			list[count++] = image;
-			image->mapDirty = false;
-		} else {
-			// Otherwise, consider longer timeout and byte count limits of uplink
+		dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+		if ( cache == NULL )
+			continue; // No users++ or mutex_unlock yet -> safe
+		image->users++;
+		mutex_unlock( &imageListLock );
+		if ( imageShouldSaveCacheMap( image ) ) {
+			// Replicated image, we're responsible for updating the map, so save it
+			// Save if dirty bit is set, blocks were invalidated
+			bool save = cache->dirty;
 			dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-			if ( uplink != NULL ) {
-				assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
-				uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
-				if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES
-						|| ( full && diff != 0 ) ) {
-					image->users++;
-					list[count++] = image;
+			if ( !save ) {
+				// Otherwise, consider longer timeout and byte count limits of uplink
+				if ( uplink != NULL ) {
+					assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+					uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
+					if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
+						save = true;
+					}
+				}
+			}
+			if ( save ) {
+				cache->dirty = false;
+				if ( uplink != NULL ) {
 					uplink->bytesReceivedLastSave = uplink->bytesReceived;
 				}
+				saveCacheMap( image );
+			}
+			if ( uplink != NULL ) {
 				ref_put( &uplink->reference );
 			}
+		} else {
+			// We're not replicating this image, if there's a cache map, reload
+			// it periodically, since we might read from a shared storage that
+			// another server instance is writing to.
+			if ( full || !cache->unchanged && !image->problem.read ) {
+				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", image->name, (int)image->rid );
+				dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
+				if ( onDisk == NULL ) {
+					// Should be complete now
+					logadd( LOG_DEBUG1, "External replication of %s:%d complete", image->name, (int)image->rid );
+					ref_setref( &image->ref_cacheMap, NULL );
+				} else {
+					const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+					if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
+						// Unchanged
+						cache->unchanged = true;
+						onDisk->reference.free( &onDisk->reference );
+					} else {
+						// Replace
+						ref_setref( &image->ref_cacheMap, &onDisk->reference );
+						logadd( LOG_DEBUG2, "Map changed" );
+					}
+				}
+			}
 		}
-		if ( count == IMGCOUNT )
-			break;
+		ref_put( &cache->reference );
+		image_release( image ); // Always do this instead of users-- to handle freeing
+		mutex_lock( &imageListLock );
 	}
 	mutex_unlock( &imageListLock );
-	if ( full && count < IMGCOUNT ) {
-		// Only update nextSave once we handled all images in the list
+	if ( full ) {
 		timing_addSeconds( &nextSave, &now, CACHE_MAP_MAX_SAVE_DELAY );
 	}
-	for ( int i = 0; i < count; ++i ) {
-		saveCacheMap( list[i] );
-		image_release( list[i] );
-	}
 	return NULL;
 }
-#undef IMGCOUNT
 
 /**
  * Saves the cache map of the given image.
- * Return true on success.
+ * Does nothing if the image has no cache map. Returns nothing; failing to
+ * open the map file is only logged. Whether the map should be written at
+ * all (proxy mode, cache map present, alt servers configured for the image)
+ * is decided by imageShouldSaveCacheMap(), which callers check beforehand.
  * @param image the image
  */
-static bool saveCacheMap(dnbd3_image_t *image)
+static void saveCacheMap(dnbd3_image_t *image)
 {
-	if ( !_isProxy )
-		return true; // Nothing to do
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 	if ( cache == NULL )
-		return true; // Nothing to do
-	// Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
-	// for which we have any upstream servers configured. If there's none, don't touch
-	// the cache map on disk.
-	if ( !altservers_imageHasAltServers( image->name ) ) {
-		ref_put( &cache->reference );
-		return true; // Nothing to do
-	}
+		return; // Race - wasn't NULL in function call above...
 
 	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
 	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
@@ -1914,7 +1952,7 @@ static bool saveCacheMap(dnbd3_image_t *image)
 		const int err = errno;
 		ref_put( &cache->reference );
 		logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
-		return false;
+		return;
 	}
 
 	// On Linux we could use readFd, but in general it's not guaranteed to work
@@ -1962,7 +2000,6 @@ static bool saveCacheMap(dnbd3_image_t *image)
 	}
 	close( fd );
 	// TODO fsync on parent directory
-	return true;
 }
 
 static void allocCacheMap(dnbd3_image_t *image, bool complete)
diff --git a/src/server/uplink.c b/src/server/uplink.c
index e5ab9c0..e644e56 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -80,6 +80,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 {
 	if ( !_isProxy || _shutdown ) return false;
 	assert( image != NULL );
+	if ( sock == -1 && !altservers_imageHasAltServers( image->name ) )
+		return false; // Nothing to do
 	mutex_lock( &image->lock );
 	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
 	if ( uplink != NULL ) {
@@ -103,7 +105,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->image = image;
 	uplink->bytesReceived = 0;
 	uplink->bytesReceivedLastSave = 0;
-	uplink->idleTime = 0;
+	uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90;
 	uplink->queueLen = 0;
 	uplink->cacheFd = -1;
 	uplink->signal = signal_new();
@@ -634,7 +636,11 @@ static void* uplink_mainloop(void *data)
 	}
 cleanup: ;
 	dnbd3_image_t *image = uplink->image;
-	image->mapDirty = true; // Force writeout of cache map
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache != NULL ) {
+		cache->dirty = true; // Force writeout of cache map
+		ref_put( &cache->reference );
+	}
 	mutex_lock( &image->lock );
 	bool exp = false;
 	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
-- 
cgit v1.2.3-55-g7522


From ff4e770e645c05da48baddb30a77b9dc15ca76fd Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 6 Mar 2020 15:00:46 +0100
Subject: [SERVER] Handle "warn unused result" cases

---
 src/server/fileutil.c |  2 +-
 src/server/globals.c  |  5 ++++-
 src/server/image.c    |  8 ++++++--
 src/server/rpc.c      |  2 +-
 src/server/server.c   |  5 ++++-
 src/server/uplink.c   | 14 ++++++++++----
 6 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/server/fileutil.c b/src/server/fileutil.c
index 336ab68..9a9f066 100644
--- a/src/server/fileutil.c
+++ b/src/server/fileutil.c
@@ -68,7 +68,7 @@ bool file_setSize(int fd, uint64_t size)
 	// Try really hard... image loading logic relies on the file
 	// having the proper apparent size
 	uint8_t byte = 0;
-	pread( fd, &byte, 1, size - 1 );
+	(void)!pread( fd, &byte, 1, size - 1 );
 	if ( pwrite( fd, &byte, 1, size - 1 ) == 1 ) return true;
 	return false;
 }
diff --git a/src/server/globals.c b/src/server/globals.c
index 2e87400..ac079b1 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -113,7 +113,10 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
 void globals_loadConfig()
 {
 	char *name = NULL;
-	asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME );
+	if ( asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME ) == -1 ) {
+		logadd( LOG_ERROR, "Memory allocation error for config filename" );
+		exit( 1 );
+	}
 	if ( name == NULL ) return;
 	if ( initialLoad ) {
 		mutex_init( &loadLock, LOCK_LOAD_CONFIG );
diff --git a/src/server/image.c b/src/server/image.c
index 7ffe041..32c9efe 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1446,9 +1446,13 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS
 				logadd( LOG_WARNING, "OTF-Clone: Corrupted CRC-32 list. ignored. (%s)", name );
 			} else {
 				int fd = open( crcFile, O_WRONLY | O_CREAT, 0644 );
-				write( fd, &masterCrc, sizeof(uint32_t) );
-				write( fd, crc32list, crc32len );
+				ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+				ret += write( fd, crc32list, crc32len );
 				close( fd );
+				if ( (size_t)ret != crc32len + sizeof(masterCrc) ) {
+					logadd( LOG_WARNING, "Could not save freshly received crc32 list for %s:%d", name, (int)revision );
+					unlink( crcFile );
+				}
 			}
 		}
 		free( crc32list );
diff --git a/src/server/rpc.c b/src/server/rpc.c
index a454d6d..b66b8fe 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -101,7 +101,7 @@ void rpc_init()
 		int fd = open( "/dev/urandom", O_RDONLY );
 		if ( fd != -1 ) {
 			uint32_t bla = 1;
-			read( fd, &bla, 4 );
+			(void)!read( fd, &bla, 4 );
 			randomRunId = (randomRunId << 32) | bla;
 		}
 		close( fd );
diff --git a/src/server/server.c b/src/server/server.c
index 0dddea7..c9edc05 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -315,7 +315,10 @@ int main(int argc, char *argv[])
 	// No one-shot detected, normal server operation or errormsg serving
 	if ( demonize ) {
 		logadd( LOG_INFO, "Forking into background, see log file for further information" );
-		daemon( 1, 0 );
+		if ( daemon( 0, 0 ) == -1 ) {
+			logadd( LOG_ERROR, "Could not daemon(): errno=%d", errno );
+			exit( 1 );
+		}
 	}
 	if ( errorMsg != NULL ) {
 		setupNetwork( bindAddress );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index e644e56..71d9f94 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -1098,7 +1098,8 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 	lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes );
 	lists_crc = net_order_32( lists_crc );
 	if ( lists_crc != masterCrc ) {
-		logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s)!", uplink->image->name );
+		logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!",
+				uplink->image->name, (int)uplink->image->rid );
 		free( buffer );
 		return;
 	}
@@ -1108,10 +1109,15 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 	char path[len];
 	snprintf( path, len, "%s.crc", uplink->image->path );
 	const int fd = open( path, O_WRONLY | O_CREAT, 0644 );
-	if ( fd >= 0 ) {
-		write( fd, &masterCrc, sizeof(uint32_t) );
-		write( fd, buffer, bytes );
+	if ( fd != -1 ) {
+		ssize_t ret = write( fd, &masterCrc, sizeof(masterCrc) );
+		ret += write( fd, buffer, bytes );
 		close( fd );
+		if ( (size_t)ret != sizeof(masterCrc) + bytes ) {
+			unlink( path );
+			logadd( LOG_WARNING, "Could not write crc32 file for %s:%d",
+					uplink->image->name, (int)uplink->image->rid );
+		}
 	}
 }
 
-- 
cgit v1.2.3-55-g7522


From 9f11c67b291b50e0f1c98d2e85db22a33d2e2d11 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 6 Mar 2020 16:02:54 +0100
Subject: [SERVER] Add printf macro for image (name:rid as %s:%d)

---
 src/server/altservers.c |  4 ++--
 src/server/globals.h    |  1 +
 src/server/image.c      | 32 ++++++++++++++------------------
 src/server/uplink.c     | 20 ++++++++++----------
 4 files changed, 27 insertions(+), 30 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 380737c..35da3a2 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -14,7 +14,7 @@
 #include <inttypes.h>
 #include <jansson.h>
 
-#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, image->name, (int)image->rid)
+#define LOG(lvl, msg, ...) logadd(lvl, msg " (%s:%d)", __VA_ARGS__, PIMG(image))
 #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
 #define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
 
@@ -524,7 +524,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 		logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
 		return;
 	}
-	LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid );
+	logadd( LOG_DEBUG2, "Running alt check for %s:%d", PIMG(image) );
 	assert( uplink->rttTestResult == RTT_INPROGRESS );
 	// Test them all
 	dnbd3_server_connection_t best = { .fd = -1 };
diff --git a/src/server/globals.h b/src/server/globals.h
index 211fe2d..1bb6857 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -148,6 +148,7 @@ struct _dnbd3_image
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
 };
+#define PIMG(x) (x)->name, (int)(x)->rid
 
 struct _dnbd3_client
 {
diff --git a/src/server/image.c b/src/server/image.c
index 32c9efe..18e91d9 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -290,13 +290,11 @@ bool image_ensureOpen(dnbd3_image_t *image)
 		if ( image->crc32 == NULL ) {
 			// Cannot verify further, hope for the best
 			image->problem.changed = false;
-			logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value",
-					image->name, (int)image->rid );
+			logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value", PIMG(image) );
 		} else if ( image_checkRandomBlocks( image, 1, newFd ) ) {
 			// This should have checked the first block (if complete) -> All is well again
 			image->problem.changed = false;
-			logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value",
-					image->name, (int)image->rid );
+			logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value", PIMG(image) );
 		}
 	} else {
 		image->problem.changed = sizeChanged;
@@ -624,7 +622,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 {
 	assert( image != NULL );
 	assert( image->users == 0 );
-	logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid );
+	logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", PIMG(image) );
 	// uplink_shutdown might return false to tell us
 	// that the shutdown is in progress. Bail out since
 	// this will get called again when the uplink is done.
@@ -852,16 +850,16 @@ static bool image_load(char *base, char *path, int withUplink)
 	// Compare data just loaded to identical image we apparently already loaded
 	if ( existing != NULL ) {
 		if ( existing->realFilesize != realFilesize ) {
-			logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+			logadd( LOG_WARNING, "Size of image '%s:%d' has changed.", PIMG(existing) );
 			// Image will be replaced below
 		} else if ( existing->crc32 != NULL && crc32list != NULL
 				&& memcmp( existing->crc32, crc32list, sizeof(uint32_t) * hashBlockCount ) != 0 ) {
-			logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", existing->name, (int)existing->rid );
+			logadd( LOG_WARNING, "CRC32 list of image '%s:%d' has changed.", PIMG(existing) );
 			logadd( LOG_WARNING, "The image will be reloaded, but you should NOT replace existing images while the server is running." );
 			logadd( LOG_WARNING, "Actually even if it's not running this should never be done. Use a new RID instead!" );
 			// Image will be replaced below
 		} else if ( existing->crc32 == NULL && crc32list != NULL ) {
-			logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", existing->name, (int)existing->rid );
+			logadd( LOG_INFO, "Found CRC-32 list for already loaded image '%s:%d', adding...", PIMG(existing) );
 			existing->crc32 = crc32list;
 			existing->masterCrc32 = masterCrc;
 			crc32list = NULL;
@@ -869,7 +867,7 @@ static bool image_load(char *base, char *path, int withUplink)
 			goto load_error; // Keep existing
 		} else if ( existing->ref_cacheMap != NULL && cache == NULL ) {
 			// Just ignore that fact, if replication is really complete the cache map will be removed anyways
-			logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
+			logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", PIMG(existing) );
 			function_return = true;
 			goto load_error; // Keep existing
 		} else {
@@ -940,7 +938,7 @@ static bool image_load(char *base, char *path, int withUplink)
 		image = image_free( image );
 		goto load_error;
 	}
-	logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", image->name, (int)image->rid );
+	logadd( LOG_DEBUG1, "Loaded image '%s:%d'\n", PIMG(image) );
 	function_return = true;
 
 	// Clean exit:
@@ -1790,7 +1788,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
 			image_release( oldest ); // We did users++ above; image might have to be freed entirely
 			return false;
 		}
-		logadd( LOG_INFO, "'%s:%d' has to go!", oldest->name, (int)oldest->rid );
+		logadd( LOG_INFO, "'%s:%d' has to go!", PIMG(oldest) );
 		char *filename = strdup( oldest->path ); // Copy name as we remove the image first
 		oldest = image_remove( oldest ); // Remove from list first...
 		oldest = image_release( oldest ); // Decrease users counter; if it falls to 0, image will be freed
@@ -1825,10 +1823,8 @@ static void* closeUnusedFds(void* nix UNUSED)
 		dnbd3_image_t * const image = _images[i];
 		if ( image == NULL || image->readFd == -1 )
 			continue;
-		// TODO: Also close for idle uplinks (uplink_connectionShouldShutdown)
-		// TODO: And close writeFd for idle uplinks....
 		if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
-			logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", image->name, (int)image->rid );
+			logadd( LOG_DEBUG1, "Inactive fd closed for %s:%d", PIMG(image) );
 			fds[fdindex++] = image->readFd;
 			image->readFd = -1; // Not a race; image->users is 0 and to increase it you need imageListLock
 			if ( fdindex == FDCOUNT )
@@ -1900,11 +1896,11 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 			// it periodically, since we might read from a shared storage that
 			// another server instance is writing to.
 			if ( full || !cache->unchanged && !image->problem.read ) {
-				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", image->name, (int)image->rid );
+				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
 				dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
 				if ( onDisk == NULL ) {
 					// Should be complete now
-					logadd( LOG_DEBUG1, "External replication of %s:%d complete", image->name, (int)image->rid );
+					logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
 					ref_setref( &image->ref_cacheMap, NULL );
 				} else {
 					const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
@@ -1945,7 +1941,7 @@ static void saveCacheMap(dnbd3_image_t *image)
 	if ( cache == NULL )
 		return; // Race - wasn't NULL in function call above...
 
-	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
+	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", PIMG(image) );
 	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
 	char mapfile[strlen( image->path ) + 4 + 1];
 	strcpy( mapfile, image->path );
@@ -2015,7 +2011,7 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
 	memset( cache->map, val, byteSize );
 	mutex_lock( &image->lock );
 	if ( image->ref_cacheMap != NULL ) {
-		logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid );
+		logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a map for %s:%d", PIMG(image) );
 		free( cache );
 	} else {
 		ref_setref( &image->ref_cacheMap, &cache->reference );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 71d9f94..7c7cd1c 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -171,7 +171,7 @@ bool uplink_shutdown(dnbd3_image_t *image)
 		image->users++; // Prevent free while uplink shuts down
 		signal_call( uplink->signal );
 	} else {
-		logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid );
+		logadd( LOG_ERROR, "This will never happen. '%s:%d'", PIMG(image) );
 	}
 	cancelAllRequests( uplink );
 	ref_setref( &image->uplinkref, NULL );
@@ -201,7 +201,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink)
 static void uplink_free(ref *ref)
 {
 	dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
-	logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid );
+	logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) );
 	assert( uplink->queueLen == 0 );
 	if ( uplink->signal != NULL ) {
 		signal_close( uplink->signal );
@@ -572,7 +572,7 @@ static void* uplink_mainloop(void *data)
 			}
 			// Don't keep uplink established if we're idle for too much
 			if ( uplink_connectionShouldShutdown( uplink ) ) {
-				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid );
+				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) );
 				goto cleanup;
 			}
 		}
@@ -915,11 +915,13 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 						tryAgain = false;
 						continue; // Write handle to image successfully re-opened, try again
 					}
-					logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d", uplink->image->name, (int)uplink->image->rid, err );
+					logadd( LOG_DEBUG1, "Error trying to cache data for %s:%d -- errno=%d",
+							PIMG(uplink->image), err );
 					break;
 				}
 				if ( unlikely( ret <= 0 || (uint32_t)ret > inReply.size - done ) ) {
-					logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d", ret, uplink->image->name, (int)uplink->image->rid );
+					logadd( LOG_WARNING, "Unexpected return value %d from pwrite to %s:%d",
+							ret, PIMG(uplink->image) );
 					break;
 				}
 				done += (uint32_t)ret;
@@ -929,7 +931,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 			}
 			if ( unlikely( ret == -1 && ( err == EBADF || err == EINVAL || err == EIO ) ) ) {
 				logadd( LOG_WARNING, "Error writing received data for %s:%d (errno=%d); disabling caching.",
-						uplink->image->name, (int)uplink->image->rid, err );
+						PIMG(uplink->image), err );
 			}
 		}
 		// 2) Figure out which clients are interested in it
@@ -1098,8 +1100,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 	lists_crc = crc32( lists_crc, (uint8_t*)buffer, bytes );
 	lists_crc = net_order_32( lists_crc );
 	if ( lists_crc != masterCrc ) {
-		logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!",
-				uplink->image->name, (int)uplink->image->rid );
+		logadd( LOG_WARNING, "Received corrupted crc32 list from uplink server (%s:%d)!", PIMG(uplink->image) );
 		free( buffer );
 		return;
 	}
@@ -1115,8 +1116,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 		close( fd );
 		if ( (size_t)ret != sizeof(masterCrc) + bytes ) {
 			unlink( path );
-			logadd( LOG_WARNING, "Could not write crc32 file for %s:%d",
-					uplink->image->name, (int)uplink->image->rid );
+			logadd( LOG_WARNING, "Could not write crc32 file for %s:%d", PIMG(uplink->image) );
 		}
 	}
 }
-- 
cgit v1.2.3-55-g7522


From a91bd049d6e33af29d5f941d556cd1c374b4dd7e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 6 Mar 2020 19:07:44 +0100
Subject: [SERVER] Fix: Image would be assumed complete if no uplink exists

Severe data corruption on client. Nice.
---
 src/server/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/net.c b/src/server/net.c
index 01056e0..954cb8a 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -336,7 +336,7 @@ void* net_handleNewConnection(void *clientPtr)
 					continue;
 				}
 
-				if ( cache == NULL && image->uplinkref != NULL ) {
+				if ( cache == NULL ) {
 					cache = ref_get_cachemap( image );
 				}
 
-- 
cgit v1.2.3-55-g7522


From 0b9706d7654863009ece7c1509c33f97cae07bca Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 9 Mar 2020 11:22:26 +0100
Subject: [SERVER] Fix data type

---
 src/server/image.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 18e91d9..86b6374 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -46,7 +46,7 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image);
 static dnbd3_image_t* image_free(dnbd3_image_t *image);
 static bool image_load_all_internal(char *base, char *path);
 static bool image_addToList(dnbd3_image_t *image);
-static bool image_load(char *base, char *path, int withUplink);
+static bool image_load(char *base, char *path, bool withUplink);
 static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageSize);
 static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
 static bool image_ensureDiskSpace(uint64_t size, bool force);
@@ -751,7 +751,7 @@ static bool image_addToList(dnbd3_image_t *image)
  * Note that this is NOT THREAD SAFE so make sure its always
  * called on one thread only.
  */
-static bool image_load(char *base, char *path, int withUplink)
+static bool image_load(char *base, char *path, bool withUplink)
 {
 	int revision = -1;
 	struct stat st;
-- 
cgit v1.2.3-55-g7522


From 290d3478f245bb7d2112bb781286a9fbae42b983 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 13 Mar 2020 16:03:29 +0100
Subject: [SERVER] Rewrite uplink queue handling

- Now uses linked lists instead of huge array
- Does prefetch data on client requests
- Can have multiple replication requests in-flight
---
 src/server/globals.c   |   6 +
 src/server/globals.h   |  35 ++-
 src/server/image.c     |   3 +-
 src/server/image.h     |  44 +++
 src/server/net.c       |  44 +--
 src/server/reference.h |   5 +
 src/server/uplink.c    | 771 +++++++++++++++++++++++++++----------------------
 src/server/uplink.h    |   2 +-
 src/serverconfig.h     |   3 +-
 9 files changed, 518 insertions(+), 395 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.c b/src/server/globals.c
index ac079b1..98e0ddb 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -19,6 +19,7 @@ atomic_int _clientPenalty = 0;
 atomic_bool _isProxy = false;
 atomic_int _backgroundReplication = BGR_FULL;
 atomic_int _bgrMinClients = 0;
+atomic_int _bgrWindowSize = 1;
 atomic_bool _lookupMissingForProxy = true;
 atomic_bool _sparseFiles = false;
 atomic_bool _ignoreAllocErrors = false;
@@ -74,6 +75,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
 	SAVE_TO_VAR_BOOL( dnbd3, isProxy );
 	SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
 	SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+	SAVE_TO_VAR_INT( dnbd3, bgrWindowSize );
 	SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
 	SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
 	SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors );
@@ -134,6 +136,9 @@ void globals_loadConfig()
 		logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
 		_sparseFiles = false;
 	}
+	if ( _bgrWindowSize < 1 ) {
+		_bgrWindowSize = 1;
+	}
 	// Dump config as interpreted
 	char buffer[2000];
 	globals_dumpConfig( buffer, sizeof(buffer) );
@@ -325,6 +330,7 @@ size_t globals_dumpConfig(char *buffer, size_t size)
 		PBOOL(backgroundReplication);
 	}
 	PINT(bgrMinClients);
+	PINT(bgrWindowSize);
 	PBOOL(lookupMissingForProxy);
 	PBOOL(sparseFiles);
 	PBOOL(ignoreAllocErrors);
diff --git a/src/server/globals.h b/src/server/globals.h
index 1bb6857..5cee92a 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -18,18 +18,27 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t;
 typedef struct _dnbd3_image dnbd3_image_t;
 typedef struct _dnbd3_client dnbd3_client_t;
 
-typedef struct
+typedef struct _dnbd3_queue_client
 {
-	uint64_t handle;  // Client defined handle to pass back in reply
-	uint64_t from;    // First byte offset of requested block (ie. 4096)
-	uint64_t to;      // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+	struct _dnbd3_queue_client *next;
+	uint64_t handle;    // Handle used by client
+	uint64_t from, to;  // Client range
 	dnbd3_client_t * client; // Client to send reply to
-	int status;      // status of this entry: ULR_*
+} dnbd3_queue_client_t;
+
+typedef struct _dnbd3_queue_entry
+{
+	struct _dnbd3_queue_entry *next;
+	uint64_t   handle;   // Our handle for this entry
+	uint64_t   from;     // First byte offset of requested block (ie. 4096)
+	uint64_t   to;       // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+	dnbd3_queue_client_t *clients;
 #ifdef _DEBUG
-	ticks entered;           // When this request entered the queue (for debugging)
+	ticks      entered;  // When this request entered the queue (for debugging)
 #endif
-	uint8_t hopCount;      // How many hops this request has already taken across proxies
-} dnbd3_queued_request_t;
+	uint8_t    hopCount; // How many hops this request has already taken across proxies
+	bool       sent;     // Already sent to uplink?
+} dnbd3_queue_entry_t;
 
 typedef struct _ns
 {
@@ -91,12 +100,12 @@ struct _dnbd3_uplink
 	bool cycleDetected;         // connection cycle between proxies detected for current remote server
 	int nextReplicationIndex;   // Which index in the cache map we should start looking for incomplete blocks at
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
-	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
 	atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
 	int queueLen;               // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
-	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+	dnbd3_queue_entry_t *queue;
+	atomic_uint_fast32_t queueId;
 	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
 };
 
@@ -156,6 +165,7 @@ struct _dnbd3_client
 	atomic_uint_fast64_t bytesSent;   // Byte counter for this client.
 	dnbd3_image_t * _Atomic image;    // Image in use by this client, or NULL during handshake
 	int sock;
+	_Atomic uint8_t relayedCount;     // How many requests are in-flight to the uplink server
 	bool isServer;                    // true if a server in proxy mode, false if real client
 	dnbd3_host_t host;
 	char hostName[HOSTNAMELEN];       // inet_ntop version of host
@@ -242,6 +252,11 @@ extern atomic_int _backgroundReplication;
  */
 extern atomic_int _bgrMinClients;
 
+/**
+ * How many in-flight replication requests we should target (per uplink)
+ */
+extern atomic_int _bgrWindowSize;
+
 /**
  * (In proxy mode): If connecting client is a proxy, and the requested image
  * is not known locally, should we ask our known alt servers for it?
diff --git a/src/server/image.c b/src/server/image.c
index 86b6374..81ec479 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -340,7 +340,6 @@ dnbd3_image_t* image_byId(int imgId)
 dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen)
 {
 	int i;
-	const char *removingText = _removeMissingImages ? ", removing from list" : "";
 	dnbd3_image_t *candidate = NULL;
 	// Simple sanity check
 	const size_t slen = strlen( name );
@@ -1895,7 +1894,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 			// We're not replicating this image, if there's a cache map, reload
 			// it periodically, since we might read from a shared storage that
 			// another server instance is writing to.
-			if ( full || !cache->unchanged && !image->problem.read ) {
+			if ( full || ( !cache->unchanged && !image->problem.read ) ) {
 				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
 				dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
 				if ( onDisk == NULL ) {
diff --git a/src/server/image.h b/src/server/image.h
index 4614c74..b23711b 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -51,6 +51,50 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
 
 bool image_saveCacheMap(dnbd3_image_t *image);
 
+/**
+ * Check if given range is cached. Be careful when using this function because:
+ * 1) you need to hold a reference to the cache map
+ * 2) start and end are assumed to be 4k aligned
+ * 3) start and end are not checked to be in bounds (we don't know the image in this context)
+ */
+static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end)
+{
+	const uint64_t firstByteInMap = start >> 15;
+	const uint64_t lastByteInMap = (end - 1) >> 15;
+	const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+	const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
+	uint64_t pos;
+	uint8_t b;
+	bool isCached;
+	if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+		b = cache->map[firstByteInMap];
+		isCached = ( b & ( fb & lb ) ) == ( fb & lb );
+	} else {
+		isCached = true;
+		atomic_thread_fence( memory_order_acquire );
+		// First byte
+		if ( isCached ) {
+			b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+			isCached = ( ( b & fb ) == fb );
+		}
+		// Last byte
+		if ( isCached ) {
+			b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+			isCached = ( ( b & lb ) == lb );
+		}
+		// Middle, must be all bits set (0xff)
+		if ( isCached ) {
+			for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+				if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+					isCached = false;
+					break;
+				}
+			}
+		}
+	}
+	return isCached;
+}
+
 // one byte in the map covers 8 4kib blocks, so 32kib per byte
 // "+ (1 << 15) - 1" is required to account for the last bit of
 // the image that is smaller than 32kib
diff --git a/src/server/net.c b/src/server/net.c
index 954cb8a..9ba9dbc 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -197,6 +197,7 @@ void* net_handleNewConnection(void *clientPtr)
 	client->hostName[HOSTNAMELEN-1] = '\0';
 	mutex_unlock( &client->lock );
 	client->bytesSent = 0;
+	client->relayedCount = 0;
 
 	if ( !addToList( client ) ) {
 		freeClientStruct( client );
@@ -344,41 +345,18 @@ void* net_handleNewConnection(void *clientPtr)
 					// This is a proxyed image, check if we need to relay the request...
 					const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-					const uint64_t firstByteInMap = start >> 15;
-					const uint64_t lastByteInMap = (end - 1) >> 15;
-					const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
-					const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
-					uint64_t pos;
-					uint8_t b;
-					bool isCached;
-					if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
-						b = cache->map[firstByteInMap];
-						isCached = ( b & ( fb & lb ) ) == ( fb & lb );
-					} else {
-						isCached = true;
-						atomic_thread_fence( memory_order_acquire );
-						// First byte
-						if ( isCached ) {
-							b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
-							isCached = ( ( b & fb ) == fb );
-						}
-						// Last byte
-						if ( isCached ) {
-							b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
-							isCached = ( ( b & lb ) == lb );
-						}
-						// Middle, must be all bits set (0xff)
-						if ( isCached ) {
-							for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
-								if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
-									isCached = false;
-									break;
-								}
+					if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+						if ( unlikely( client->relayedCount > 250 ) ) {
+							logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" );
+							for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) {
+								usleep( 10000 );
+							}
+							if ( client->relayedCount > 250 ) {
+								logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" );
+								goto exit_client_cleanup;
 							}
 						}
-					}
-					if ( !isCached ) {
-						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
+						if ( !uplink_request( NULL, client, request.handle, offset, request.size, request.hops ) ) {
 							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
 									client->hostName, image->name, image->rid );
 							goto exit_client_cleanup;
diff --git a/src/server/reference.h b/src/server/reference.h
index 4eda546..75a681f 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref )
 	return ref;
 }
 
+static inline void ref_inc( ref *ref )
+{
+	++ref->count;
+}
+
 static inline void ref_put( ref *ref )
 {
 	if ( --ref->count == 0 ) {
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 7c7cd1c..188bf06 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -8,6 +8,7 @@
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../shared/crc32.h"
+#include "threadpool.h"
 #include "reference.h"
 
 #include <assert.h>
@@ -21,30 +22,6 @@
 #define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE )
 #define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) )
 
-#define REP_NONE ( (uint64_t)0xffffffffffffffff )
-
-// Status of request in queue
-
-// Slot is free, can be used.
-// Must only be set in uplink_handle_receive() or uplink_remove_client()
-#define ULR_FREE 0
-// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
-// Must only be set in uplink_request()
-#define ULR_NEW 1
-// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
-// Must only be set in uplink_mainloop() or uplink_request()
-#define ULR_PENDING 2
-// Slot is being processed, do not consider for hop on.
-// Must only be set in uplink_handle_receive()
-#define ULR_PROCESSING 3
-
-static const char *const NAMES_ULR[4] = {
-	[ULR_FREE] = "ULR_FREE",
-	[ULR_NEW] = "ULR_NEW",
-	[ULR_PENDING] = "ULR_PENDING",
-	[ULR_PROCESSING] = "ULR_PROCESSING",
-};
-
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
 static void cancelAllRequests(dnbd3_uplink_t *uplink);
@@ -59,6 +36,15 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
 static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
 static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink);
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle);
+static void *prefetchForClient(void *data);
+
+typedef struct {
+	dnbd3_uplink_t *uplink;
+	uint64_t start;
+	uint32_t length;
+} prefetch_request_t;
 
 // ############ uplink connection handling
 
@@ -106,6 +92,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->bytesReceived = 0;
 	uplink->bytesReceivedLastSave = 0;
 	uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90;
+	uplink->queue = NULL;
 	uplink->queueLen = 0;
 	uplink->cacheFd = -1;
 	uplink->signal = signal_new();
@@ -113,7 +100,6 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." );
 		goto failure;
 	}
-	uplink->replicationHandle = REP_NONE;
 	mutex_lock( &uplink->rttLock );
 	mutex_lock( &uplink->sendMutex );
 	uplink->current.fd = -1;
@@ -175,9 +161,9 @@ bool uplink_shutdown(dnbd3_image_t *image)
 	}
 	cancelAllRequests( uplink );
 	ref_setref( &image->uplinkref, NULL );
-	ref_put( &uplink->reference );
 	mutex_unlock( &uplink->queueLock );
 	bool retval = ( exp && image->users == 0 );
+	ref_put( &uplink->reference );
 	mutex_unlock( &image->lock );
 	return retval;
 }
@@ -188,12 +174,21 @@ bool uplink_shutdown(dnbd3_image_t *image)
  */
 static void cancelAllRequests(dnbd3_uplink_t *uplink)
 {
-	for ( int i = 0; i < uplink->queueLen; ++i ) {
-		if ( uplink->queue[i].status != ULR_FREE ) {
-			net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle );
-			uplink->queue[i].status = ULR_FREE;
+	dnbd3_queue_entry_t *it = uplink->queue;
+	while ( it != NULL ) {
+		dnbd3_queue_client_t *cit = it->clients;
+		while ( cit != NULL ) {
+			net_sendReply( cit->client, CMD_ERROR, cit->handle );
+			cit->client->relayedCount--;
+			dnbd3_queue_client_t *next = cit->next;
+			free( cit );
+			cit = next;
 		}
+		dnbd3_queue_entry_t *next = it->next;
+		free( it );
+		it = next;
 	}
+	uplink->queue = NULL;
 	uplink->queueLen = 0;
 	uplink->image->problem.queue = false;
 }
@@ -234,39 +229,54 @@ static void uplink_free(ref *ref)
  */
 void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
 {
+	if ( client->relayedCount == 0 )
+		return;
 	mutex_lock( &uplink->queueLock );
-	for (int i = uplink->queueLen - 1; i >= 0; --i) {
-		if ( uplink->queue[i].client == client ) {
-			// Make sure client doesn't get destroyed while we're sending it data
-			mutex_lock( &client->sendMutex );
-			mutex_unlock( &client->sendMutex );
-			uplink->queue[i].client = NULL;
-			uplink->queue[i].status = ULR_FREE;
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; cit = &(**cit).next ) {
+			if ( (**cit).client == client ) {
+				--client->relayedCount;
+				dnbd3_queue_client_t *entry = *cit;
+				*cit = (**cit).next;
+				free( entry );
+			}
 		}
-		if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--;
 	}
 	mutex_unlock( &uplink->queueLock );
+	if ( unlikely( client->relayedCount != 0 ) ) {
+		logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount );
+		int i;
+		for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) {
+			usleep( 10000 );
+		}
+		if ( client->relayedCount != 0 ) {
+			logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount );
+		}
+	}
 }
 
 /**
- * Request a chunk of data through an uplink server
- * Locks on: image.lock, uplink.queueLock
+ * Request a chunk of data through an uplink server. Either uplink or client has to be non-NULL.
+ * If client is NULL, this is assumed to be a background replication request.
+ * Locks on: uplink.queueLock, uplink.sendMutex
  */
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
 {
-	if ( client == NULL || client->image == NULL )
-		return false;
+	bool getUplink = ( uplink == NULL );
+	assert( client != NULL || uplink != NULL );
 	if ( length > (uint32_t)_maxPayload ) {
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
 	}
-	dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref );
-	if ( unlikely( uplink == NULL ) ) {
-		uplink_init( client->image, -1, NULL, -1 );
+	if ( getUplink ) {
 		uplink = ref_get_uplink( &client->image->uplinkref );
-		if ( uplink == NULL ) {
-			logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
-			return false;
+		if ( unlikely( uplink == NULL ) ) {
+			uplink_init( client->image, -1, NULL, -1 );
+			uplink = ref_get_uplink( &client->image->uplinkref );
+			if ( uplink == NULL ) {
+				logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+				return false;
+			}
 		}
 	}
 	if ( uplink->shutdown ) {
@@ -275,163 +285,179 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
-	if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
+	if ( client != NULL && hops != 0
+			&& isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
 		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
 		goto fail_ref;
 	}
 
-	int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
-	int existingType = -1; // ULR_* type of existing request
-	int i;
-	int freeSlot = -1;
-	int firstUsedSlot = -1;
-	bool requestLoop = false;
-	const uint64_t end = start + length;
-
-	mutex_lock( &uplink->queueLock );
-	if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
-		goto fail_lock;
-	}
-	for (i = 0; i < uplink->queueLen; ++i) {
-		// find free slot to place this request into
-		if ( uplink->queue[i].status == ULR_FREE ) {
-			if ( freeSlot == -1 || existingType != ULR_PROCESSING ) {
-				freeSlot = i;
-			}
-			continue;
-		}
-		if ( firstUsedSlot == -1 ) {
-			firstUsedSlot = i;
-		}
-		// find existing request to attach to
-		if ( uplink->queue[i].from > start || uplink->queue[i].to < end )
-			continue; // Range not suitable
-		// Detect potential proxy cycle. New request hopcount is greater, range is same, old request has already been sent -> suspicious
-		if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) {
-			requestLoop = true;
-			break;
-		}
-		if ( foundExisting == -1 || existingType == ULR_PROCESSING ) {
-			foundExisting = i;
-			existingType = uplink->queue[i].status;
-		}
-	}
-	if ( unlikely( requestLoop ) ) {
-		uplink->cycleDetected = true;
-		signal_call( uplink->signal );
-		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
-		goto fail_lock;
-	}
-	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
-		freeSlot = -1; // Not attaching to existing request, make it use a higher slot
-	}
-	if ( freeSlot == -1 ) {
-		if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
-			logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
+	struct {
+		uint64_t handle, start, end;
+	} req;
+	do {
+		const uint64_t end = start + length;
+		dnbd3_queue_entry_t *request = NULL, *last = NULL;
+		bool isNew;
+		mutex_lock( &uplink->queueLock );
+		if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
 			goto fail_lock;
 		}
-		freeSlot = uplink->queueLen++;
-		if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) {
-			uplink->image->problem.queue = true;
+		for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+			if ( it->from <= start && it->to >= end ) {
+				// Matching range, attach
+				request = it;
+				break;
+			}
+			if ( it->next == NULL ) {
+				// Not matching, last in list, remember
+				last = it;
+				break;
+			}
 		}
-	}
-	// Do not send request to uplink server if we have a matching pending request AND the request either has the
-	// status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
-	// explicitly send this request to the uplink server. The second condition mentioned here is to prevent
-	// a race condition where the reply for the outstanding request already arrived and the uplink thread
-	// is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
-	// already have passed the index of the free slot we determined, but not reached the existing request we just found above.
-	if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) {
-		foundExisting = -1; // -1 means "send request"
-	}
-#ifdef _DEBUG
-	if ( foundExisting != -1 ) {
-		logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot );
-		logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
-				"New      %" PRIu64 "-%" PRIu64 " (%p)\n",
-				uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
-				start, end, (void*)client );
-	}
-#endif
-	// Fill structure
-	uplink->queue[freeSlot].from = start;
-	uplink->queue[freeSlot].to = end;
-	uplink->queue[freeSlot].handle = handle;
-	uplink->queue[freeSlot].client = client;
-	//int old = uplink->queue[freeSlot].status;
-	uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW :
-			( existingType == ULR_NEW ? ULR_PENDING : existingType ) );
-	uplink->queue[freeSlot].hopCount = hops;
+		dnbd3_queue_client_t **c;
+		if ( request == NULL ) {
+			// No existing request to attach to
+			if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) {
+				logadd( LOG_WARNING, "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." );
+				goto fail_lock;
+			}
+			uplink->queueLen++;
+			if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+				uplink->image->problem.queue = true;
+			}
+			request = malloc( sizeof(*request) );
+			if ( last == NULL ) {
+				uplink->queue = request;
+			} else {
+				last->next = request;
+			}
+			request->next = NULL;
+			request->handle = ++uplink->queueId;
+			request->from = start & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+			request->to = (end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 #ifdef _DEBUG
-	timing_get( &uplink->queue[freeSlot].entered );
-	//logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
+			timing_get( &request->entered );
 #endif
-	mutex_unlock( &uplink->queueLock );
+			request->hopCount = hops;
+			request->sent = true; // Optimistic; would be set to false on failure
+			if ( client == NULL ) {
+				// BGR
+				request->clients = NULL;
+			} else {
+				c = &request->clients;
+			}
+			isNew = true;
+		} else if ( client == NULL ) {
+			// Replication request that matches existing request. Do nothing
+			isNew = false;
+		} else {
+			// Existing request. Check if potential cycle
+			if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) {
+				logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) );
+				goto fail_lock;
+			}
+			// Count number of clients, get tail of list
+			int count = 0;
+			c = &request->clients;
+			while ( *c != NULL ) {
+				c = &(**c).next;
+				if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) {
+					logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count );
+					goto fail_lock;
+				}
+			}
+			isNew = false;
+		}
+		req.handle = request->handle;
+		req.start = request->from;
+		req.end = request->to;
+		if ( client != NULL ) {
+			*c = malloc( sizeof( *request->clients ) );
+			(**c).next = NULL;
+			(**c).handle = handle;
+			(**c).from = start;
+			(**c).to = end;
+			(**c).client = client;
+			client->relayedCount++;
+		}
+		mutex_unlock( &uplink->queueLock );
 
-	if ( foundExisting != -1 ) {
-		ref_put( &uplink->reference );
-		return true; // Attached to pending request, do nothing
-	}
+		if ( !isNew ) {
+			goto success_ref; // Attached to pending request, do nothing
+		}
+	} while (0);
 
-	// See if we can fire away the request
-	if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) {
-		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
+	// Fire away the request
+	mutex_lock( &uplink->sendMutex );
+	if ( unlikely( uplink->current.fd == -1 ) ) {
+		uplink->image->problem.uplink = true;
+		markRequestUnsent( uplink, req.handle );
+		mutex_unlock( &uplink->sendMutex );
+		logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 	} else {
-		if ( unlikely( uplink->current.fd == -1 ) ) {
+		if ( hops < 200 ) ++hops;
+		const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start,
+				req.handle, COND_HOPCOUNT( uplink->current.version, hops ) );
+		if ( unlikely( !ret ) ) {
+			markRequestUnsent( uplink, req.handle );
 			uplink->image->problem.uplink = true;
 			mutex_unlock( &uplink->sendMutex );
-			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
+			logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle );
 		} else {
-			const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-			const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
-			if ( hops < 200 ) ++hops;
-			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-			if ( unlikely( !ret ) ) {
-				uplink->image->problem.uplink = true;
-				mutex_unlock( &uplink->sendMutex );
-				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
-			} else {
-				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
-				int state;
-				mutex_unlock( &uplink->sendMutex );
-				mutex_lock( &uplink->queueLock );
-				if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
-					state = uplink->queue[freeSlot].status;
-					if ( uplink->queue[freeSlot].status == ULR_NEW ) {
-						uplink->queue[freeSlot].status = ULR_PENDING;
-					}
-				} else {
-					state = -1;
-				}
-				mutex_unlock( &uplink->queueLock );
-				if ( state == -1 ) {
-					logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
-				} else if ( state == ULR_NEW ) {
-					//logadd( LOG_DEBUG2, "Direct uplink request" );
-				} else {
-					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
-				}
-				ref_put( &uplink->reference );
-				return true;
-			}
-			// Fall through to waking up sender thread
+			// OK
+			mutex_unlock( &uplink->sendMutex );
+			goto success_ref;
 		}
+		// Fall through to waking up sender thread
 	}
 
 	if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
 		logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
 	}
-	ref_put( &uplink->reference );
+
+success_ref:
+	if ( client != NULL ) {
+		// Was from client -- potential prefetch
+		uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start );
+		if ( len > 0 ) {
+			prefetch_request_t *job = malloc( sizeof( *job ) );
+			job->start = req.end;
+			job->length = len;
+			job->uplink = uplink;
+			ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it
+			threadpool_run( &prefetchForClient, (void*)job );
+		}
+	}
+	if ( getUplink ) {
+		ref_put( &uplink->reference );
+	}
 	return true;
 fail_lock:
 	mutex_unlock( &uplink->queueLock );
 fail_ref:
-	ref_put( &uplink->reference );
+	if ( getUplink ) {
+		ref_put( &uplink->reference );
+	}
 	return false;
 }
 
+static void *prefetchForClient(void *data)
+{
+	prefetch_request_t *job = (prefetch_request_t*)data;
+	dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image );
+	if ( cache != NULL ) {
+		if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) {
+			uplink_request( job->uplink, NULL, ++job->uplink->queueId, job->start, job->length, 0 );
+		}
+		ref_put( &cache->reference );
+	}
+	ref_put( &job->uplink->reference );
+	free( job );
+	return NULL;
+}
+
 /**
  * Uplink thread.
  * Locks are irrelevant as this is never called from another function
@@ -443,7 +469,7 @@ static void* uplink_mainloop(void *data)
 #define EV_COUNT  (2)
 	struct pollfd events[EV_COUNT];
 	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
-	int numSocks, i, waitTime;
+	int numSocks, waitTime;
 	int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
 	int rttTestResult;
 	uint32_t discoverFailCount = 0;
@@ -478,7 +504,7 @@ static void* uplink_mainloop(void *data)
 			declare_now;
 			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
 			if ( waitTime < 100 ) waitTime = 100;
-			if ( waitTime > 10000 ) waitTime = 10000;
+			else if ( waitTime > 10000 ) waitTime = 10000;
 		}
 		events[EV_SOCKET].fd = uplink->current.fd;
 		numSocks = poll( events, EV_COUNT, waitTime );
@@ -505,7 +531,6 @@ static void* uplink_mainloop(void *data)
 			mutex_unlock( &uplink->rttLock );
 			discoverFailCount = 0;
 			if ( fd != -1 ) close( fd );
-			uplink->replicationHandle = REP_NONE;
 			uplink->image->problem.uplink = false;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
@@ -559,11 +584,11 @@ static void* uplink_mainloop(void *data)
 		}
 		declare_now;
 		uint32_t timepassed = timing_diff( &lastKeepalive, &now );
-		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
+		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
 			lastKeepalive = now;
 			uplink->idleTime += timepassed;
 			// Keep-alive
-			if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
+			if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) {
 				// Send keep-alive if nothing is happening, and try to trigger background rep.
 				if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) {
 					uplink_connectionFailed( uplink, true );
@@ -612,19 +637,16 @@ static void* uplink_mainloop(void *data)
 			ticks deadline;
 			timing_set( &deadline, &now, -10 );
 			mutex_lock( &uplink->queueLock );
-			for (i = 0; i < uplink->queueLen; ++i) {
-				if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
-					snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
-							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
-							uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
-					uplink->queue[i].entered = now;
+			for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+				if ( timing_reached( &it->entered, &deadline ) ) {
+					logadd( LOG_WARNING, "Starving request detected:"
+							" (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d",
+							it->from, it->to, (int)it->sent, PIMG(uplink->image) );
+					it->entered = now;
 #ifdef _DEBUG_RESEND_STARVING
-					uplink->queue[i].status = ULR_NEW;
+					it->sent = false;
 					resend = true;
 #endif
-					mutex_unlock( &uplink->queueLock );
-					logadd( LOG_WARNING, "%s", buffer );
-					mutex_lock( &uplink->queueLock );
 				}
 			}
 			mutex_unlock( &uplink->queueLock );
@@ -667,37 +689,54 @@ cleanup: ;
  */
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
-	// Scan for new requests
-	int j;
+	// Scan for new requests, or optionally, (re)send all
+	// Build a buffer, so if there aren't too many requests, we can send them after
+	// unlocking the queue again. Otherwise we need flushes during iteration, which
+	// is not ideal, but in that case the uplink is probably overwhelmed anyways.
+	// Try 125 as that's exactly 3000 bytes, usually 2*MTU.
+#define MAX_RESEND_BATCH 125
+	dnbd3_request_t reqs[MAX_RESEND_BATCH];
+	int count = 0;
 	mutex_lock( &uplink->queueLock );
-	for (j = 0; j < uplink->queueLen; ++j) {
-		if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue;
-		uplink->queue[j].status = ULR_PENDING;
-		uint8_t hops = uplink->queue[j].hopCount;
-		const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-		const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
-		/*
-		logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
-				(void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
-		*/
-		mutex_unlock( &uplink->queueLock );
-		if ( hops < 200 ) ++hops;
-		mutex_lock( &uplink->sendMutex );
-		const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-		if ( likely( ret ) ) {
-			mutex_unlock( &uplink->sendMutex );
-		} else {
-			// Non-critical - if the connection dropped or the server was changed
-			// the thread will re-send this request as soon as the connection
-			// is reestablished.
-			uplink->image->problem.uplink = true;
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		if ( newOnly && it->sent )
+			continue;
+		it->sent = true;
+		dnbd3_request_t *hdr = &reqs[count++];
+		hdr->magic = dnbd3_packet_magic;
+		hdr->cmd = CMD_GET_BLOCK;
+		hdr->size = it->to - it->from;
+		hdr->offset_small = it->from;
+		hdr->hops = it->hopCount + 1;
+		hdr->handle = it->handle;
+		fixup_request( *hdr );
+		if ( count == MAX_RESEND_BATCH ) {
+			bool ok = false;
+			logadd( LOG_DEBUG2, "BLOCKING resend of %d", count );
+			count = 0;
+			mutex_lock( &uplink->sendMutex );
+			if ( uplink->current.fd != -1 ) {
+				ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 )
+						== DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH );
+			}
 			mutex_unlock( &uplink->sendMutex );
-			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			return;
+			if ( !ok ) {
+				uplink->image->problem.uplink = true;
+				break;
+			}
 		}
-		mutex_lock( &uplink->queueLock );
 	}
 	mutex_unlock( &uplink->queueLock );
+	if ( count != 0 ) {
+		mutex_lock( &uplink->sendMutex );
+		if ( uplink->current.fd != -1 ) {
+			uplink->image->problem.uplink =
+				( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 )
+					!= DNBD3_REQUEST_SIZE * count );
+		}
+		mutex_unlock( &uplink->sendMutex );
+	}
+#undef MAX_RESEND_BATCH
 }
 
 /**
@@ -720,71 +759,73 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 		return false; // Should never be called in this state, consider send error
 	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
 		return true; // Don't do background replication
-	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
-		return true; // Already a replication request on the wire, or no more blocks to replicate
+	if ( uplink->nextReplicationIndex == -1 )
+		return true; // No more blocks to replicate
 	dnbd3_image_t * const image = uplink->image;
 	if ( image->users < _bgrMinClients )
 		return true; // Not enough active users
+	const int numNewRequests = numWantedReplicationRequests( uplink );
+	if ( numNewRequests <= 0 )
+		return true; // Already sufficient amount of requests on the wire
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-	if ( cache == NULL || image->users ) {
+	if ( cache == NULL ) {
 		// No cache map (=image complete)
-		ref_put( &cache->reference );
 		return true;
 	}
 	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	const int lastBlockIndex = mapBytes - 1;
-	int endByte;
-	if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
-		endByte = uplink->nextReplicationIndex + mapBytes;
-	} else { // Hashblock based: Only look for match in current hash block
-		endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
-		if ( endByte > mapBytes ) {
-			endByte = mapBytes;
+	for ( int bc = 0; bc < numNewRequests; ++bc ) {
+		int endByte;
+		if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
+			endByte = uplink->nextReplicationIndex + mapBytes;
+		} else { // Hashblock based: Only look for match in current hash block
+			endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+			if ( endByte > mapBytes ) {
+				endByte = mapBytes;
+			}
 		}
-	}
-	atomic_thread_fence( memory_order_acquire );
-	int replicationIndex = -1;
-	for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
-		const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
-		if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
-				&& ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
-			// Found incomplete one
-			replicationIndex = i;
+		atomic_thread_fence( memory_order_acquire );
+		int replicationIndex = -1;
+		for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
+			const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
+			if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+					&& ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+				// Found incomplete one
+				replicationIndex = i;
+				break;
+			}
+		}
+		if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
+			// Nothing left in current block, find next one
+			replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
+		}
+		if ( replicationIndex == -1 ) {
+			// Replication might be complete, uplink_mainloop should take care....
+			uplink->nextReplicationIndex = -1;
 			break;
 		}
+		const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
+		const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
+		const uint64_t handle = ++uplink->queueId;
+		if ( !uplink_request( uplink, NULL, handle, offset, size, 0 ) ) {
+			logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)",
+					PIMG(uplink->image) );
+			ref_put( &cache->reference );
+			return false;
+		}
+		if ( replicationIndex == lastBlockIndex ) {
+			uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+		}
+		uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+		if ( _backgroundReplication == BGR_HASHBLOCK
+				&& uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+			// Just crossed a hash block boundary, look for new candidate starting at this very index
+			uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
+			if ( uplink->nextReplicationIndex == -1 )
+				break;
+		}
 	}
 	ref_put( &cache->reference );
-	if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
-		// Nothing left in current block, find next one
-		replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
-	}
-	if ( replicationIndex == -1 ) {
-		// Replication might be complete, uplink_mainloop should take care....
-		uplink->nextReplicationIndex = -1;
-		return true;
-	}
-	const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
-	uplink->replicationHandle = offset;
-	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
-	mutex_lock( &uplink->sendMutex );
-	bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
-	if ( likely( sendOk ) ) {
-		mutex_unlock( &uplink->sendMutex );
-	} else {
-		uplink->image->problem.uplink = true;
-		mutex_unlock( &uplink->sendMutex );
-		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
-		return false;
-	}
-	if ( replicationIndex == lastBlockIndex ) {
-		uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
-	}
-	uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
-	if ( _backgroundReplication == BGR_HASHBLOCK
-			&& uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
-		// Just crossed a hash block boundary, look for new candidate starting at this very index
-		uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
-	}
 	return true;
 }
 
@@ -845,7 +886,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
 	dnbd3_reply_t inReply, outReply;
-	int ret, i;
+	int ret;
 	for (;;) {
 		ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
 		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
@@ -881,13 +922,34 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		}
 		// Payload read completely
 		// Bail out if we're not interested
-		if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
+		if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) )
+			continue;
 		// Is a legit block reply
-		struct iovec iov[2];
-		const uint64_t start = inReply.handle;
-		const uint64_t end = inReply.handle + inReply.size;
 		totalBytesReceived += inReply.size;
 		uplink->bytesReceived += inReply.size;
+		// Get entry from queue
+		dnbd3_queue_entry_t *entry;
+		mutex_lock( &uplink->queueLock );
+		for ( entry = uplink->queue; entry != NULL; entry = entry->next ) {
+			if ( entry->handle == inReply.handle )
+				break;
+		}
+		if ( entry == NULL ) {
+			mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+			logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)",
+					inReply.handle, PIMG(uplink->image) );
+			continue;
+		}
+		const uint64_t start = entry->from;
+		const uint64_t end = entry->to;
+		mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+		// We don't remove the entry from the list here yet, to slightly increase the chance of other
+		// clients attaching to this request while we write the data to disk
+		if ( end - start != inReply.size ) {
+			logadd( LOG_WARNING, "Received payload length does not match! (is: %"PRIu32", expect: %u, %s:%d)",
+					inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) );
+		}
+		struct iovec iov[2];
 		// 1) Write to cache file
 		if ( unlikely( uplink->cacheFd == -1 ) ) {
 			uplink_reopenCacheFd( uplink, false );
@@ -934,98 +996,76 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 						PIMG(uplink->image), err );
 			}
 		}
-		// 2) Figure out which clients are interested in it
-		// Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop
-		// below; this prevents uplink_request() from attaching to this request
-		// by populating a slot with index greater than the highest matching
-		// request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW
-		// where it's fine if the index is greater)
+		bool found = false;
+		dnbd3_queue_entry_t **it;
 		mutex_lock( &uplink->queueLock );
-		for (i = 0; i < uplink->queueLen; ++i) {
-			dnbd3_queued_request_t * const req = &uplink->queue[i];
-			assert( req->status != ULR_PROCESSING );
-			if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
-			assert( req->client != NULL );
-			if ( req->from >= start && req->to <= end ) { // Match :-)
-				req->status = ULR_PROCESSING;
-			}
-		}
-		// 3) Send to interested clients - iterate backwards so request collaboration works, and
-		// so we can decrease queueLen on the fly while iterating. Should you ever change this to start
-		// from 0, you also need to change the "attach to existing request"-logic in uplink_request()
-		outReply.magic = dnbd3_packet_magic;
-		bool served = false;
-		for ( i = uplink->queueLen - 1; i >= 0; --i ) {
-			dnbd3_queued_request_t * const req = &uplink->queue[i];
-			if ( req->status == ULR_PROCESSING ) {
-				size_t bytesSent = 0;
-				assert( req->from >= start && req->to <= end );
-				dnbd3_client_t * const client = req->client;
-				outReply.cmd = CMD_GET_BLOCK;
-				outReply.handle = req->handle;
-				outReply.size = (uint32_t)( req->to - req->from );
-				iov[0].iov_base = &outReply;
-				iov[0].iov_len = sizeof outReply;
-				iov[1].iov_base = uplink->recvBuffer + (req->from - start);
-				iov[1].iov_len = outReply.size;
-				fixup_reply( outReply );
-				req->status = ULR_FREE;
-				req->client = NULL;
-				served = true;
-				mutex_lock( &client->sendMutex );
-				mutex_unlock( &uplink->queueLock );
-				if ( client->sock != -1 ) {
-					ssize_t sent = writev( client->sock, iov, 2 );
-					if ( sent > (ssize_t)sizeof outReply ) {
-						bytesSent = (size_t)sent - sizeof outReply;
-					}
-				}
-				if ( bytesSent != 0 ) {
-					client->bytesSent += bytesSent;
-				}
-				mutex_unlock( &client->sendMutex );
-				mutex_lock( &uplink->queueLock );
-				if ( i > uplink->queueLen ) {
-					i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests
-				}
+		for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) {
+			if ( *it == entry && entry->handle == inReply.handle ) { // ABA check
+				assert( found == false );
+				*it = (**it).next;
+				found = true;
+				uplink->queueLen--;
+				break;
 			}
-			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
 		if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
 			uplink->image->problem.queue = false;
 		}
 		mutex_unlock( &uplink->queueLock );
-#ifdef _DEBUG
-		if ( !served && start != uplink->replicationHandle ) {
-			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
+		if ( !found ) {
+			logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)",
+					PIMG(uplink->image) );
+			continue;
 		}
-#endif
-		if ( start == uplink->replicationHandle ) {
-			// Was our background replication
-			uplink->replicationHandle = REP_NONE;
-			// Try to remove from fs cache if no client was interested in this data
-			if ( !served && uplink->cacheFd != -1 ) {
-				posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+		outReply.magic = dnbd3_packet_magic;
+		dnbd3_queue_client_t *next;
+		for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) {
+			size_t bytesSent = 0;
+			assert( c->from >= start && c->to <= end );
+			dnbd3_client_t * const client = c->client;
+			outReply.cmd = CMD_GET_BLOCK;
+			outReply.handle = c->handle;
+			outReply.size = (uint32_t)( c->to - c->from );
+			iov[0].iov_base = &outReply;
+			iov[0].iov_len = sizeof outReply;
+			iov[1].iov_base = uplink->recvBuffer + (c->from - start);
+			iov[1].iov_len = outReply.size;
+			fixup_reply( outReply );
+			mutex_lock( &client->sendMutex );
+			if ( client->sock != -1 ) {
+				ssize_t sent = writev( client->sock, iov, 2 );
+				if ( sent > (ssize_t)sizeof outReply ) {
+					bytesSent = (size_t)sent - sizeof outReply;
+				}
+				if ( bytesSent != 0 ) {
+					client->bytesSent += bytesSent;
+				}
 			}
+			mutex_unlock( &client->sendMutex );
+			client->relayedCount--;
+			next = c->next;
+			free( c );
 		}
-		if ( served ) {
+		if ( entry->clients != NULL ) {
 			// Was some client -- reset idle counter
 			uplink->idleTime = 0;
 			// Re-enable replication if disabled
 			if ( uplink->nextReplicationIndex == -1 ) {
 				uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
 			}
+		} else {
+			if ( uplink->cacheFd != -1 ) {
+				// Try to remove from fs cache if no client was interested in this data
+				posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+			}
 		}
+		free( entry );
+	} // main receive loop
+	// Trigger background replication if applicable
+	if ( !uplink_sendReplicationRequest( uplink ) ) {
+		goto error_cleanup;
 	}
-	if ( uplink->replicationHandle == REP_NONE ) {
-		mutex_lock( &uplink->queueLock );
-		const bool rep = ( uplink->queueLen == 0 );
-		mutex_unlock( &uplink->queueLock );
-		if ( rep ) {
-			if ( !uplink_sendReplicationRequest( uplink ) )
-				goto error_cleanup;
-		}
-	}
+	// Normal end
 	return;
 	// Error handling from failed receive or message parsing
 error_cleanup: ;
@@ -1046,7 +1086,6 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 	close( uplink->current.fd );
 	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
-	uplink->replicationHandle = REP_NONE;
 	if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
 		uplink->nextReplicationIndex = 0;
 	}
@@ -1156,3 +1195,39 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
 		return false;
 	return altservers_toString( current, buffer, len );
 }
+
+/**
+ * Get number of replication requests that should be sent right now to
+ * meet the configured bgrWindowSize. Returns 0 if any client requests
+ * are pending
+ */
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink)
+{
+	int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 );
+	if ( uplink->queueLen == 0 )
+		return ret;
+	mutex_lock( &uplink->queueLock );
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		if ( it->clients == NULL ) {
+			ret--;
+		} else {
+			ret = 0; // Do not allow BGR if client requests are being handled
+			break;
+		}
+	}
+	mutex_unlock( &uplink->queueLock );
+	return ret;
+}
+
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle)
+{
+	mutex_lock( &uplink->queueLock );
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		if ( it->handle == handle ) {
+			it->sent = false;
+			break;
+		}
+	}
+	mutex_unlock( &uplink->queueLock );
+}
+
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 49ff0b4..8f69b05 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -12,7 +12,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 
 void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
 
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
+bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
 
 bool uplink_shutdown(dnbd3_image_t *image);
 
diff --git a/src/serverconfig.h b/src/serverconfig.h
index 5c7301d..31708de 100644
--- a/src/serverconfig.h
+++ b/src/serverconfig.h
@@ -13,7 +13,8 @@
 #define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
 #define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
 #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
-#define SERVER_MAX_UPLINK_QUEUE  1500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_QUEUE  500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request
 #define SERVER_UPLINK_QUEUELEN_THRES  900 // Threshold where we start dropping incoming clients
 #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
 
-- 
cgit v1.2.3-55-g7522


From 03a2ac45f217793f532af16fd75a163e42e6f18d Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 13 Mar 2020 22:28:11 +0100
Subject: [SERVER] Check and increase hopCount when adding uplink request

---
 src/server/uplink.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index 188bf06..e7bbe70 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -264,6 +264,10 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han
 {
 	bool getUplink = ( uplink == NULL );
 	assert( client != NULL || uplink != NULL );
+	if ( hops++ > 200 ) { // This is just silly
+		logadd( LOG_WARNING, "Refusing to relay a request that has > 200 hops" );
+		return false;
+	}
 	if ( length > (uint32_t)_maxPayload ) {
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
@@ -285,7 +289,7 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
-	if ( client != NULL && hops != 0
+	if ( client != NULL && hops > 1
 			&& isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
@@ -354,7 +358,7 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han
 			isNew = false;
 		} else {
 			// Existing request. Check if potential cycle
-			if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) {
+			if ( hops > request->hopCount && request->from == start && request->to == end ) {
 				logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) );
 				goto fail_lock;
 			}
@@ -397,7 +401,6 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han
 		mutex_unlock( &uplink->sendMutex );
 		logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 	} else {
-		if ( hops < 200 ) ++hops;
 		const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start,
 				req.handle, COND_HOPCOUNT( uplink->current.version, hops ) );
 		if ( unlikely( !ret ) ) {
@@ -707,7 +710,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		hdr->cmd = CMD_GET_BLOCK;
 		hdr->size = it->to - it->from;
 		hdr->offset_small = it->from;
-		hdr->hops = it->hopCount + 1;
+		hdr->hops = it->hopCount;
 		hdr->handle = it->handle;
 		fixup_request( *hdr );
 		if ( count == MAX_RESEND_BATCH ) {
-- 
cgit v1.2.3-55-g7522


From 8e0115f6c9ffbf9d9773f8c625c5e353c4b38583 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 13 Mar 2020 22:40:45 +0100
Subject: [SERVER] Check server version before setting hopCount field

---
 src/server/uplink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index e7bbe70..b01df58 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -710,7 +710,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		hdr->cmd = CMD_GET_BLOCK;
 		hdr->size = it->to - it->from;
 		hdr->offset_small = it->from;
-		hdr->hops = it->hopCount;
+		hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount );
 		hdr->handle = it->handle;
 		fixup_request( *hdr );
 		if ( count == MAX_RESEND_BATCH ) {
-- 
cgit v1.2.3-55-g7522


From eddfdc8482b8d28c263d1b1f85e6d5e4badc49ed Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 14 Mar 2020 15:09:11 +0100
Subject: [SERVER] Use image:rid in log messages

---
 src/server/uplink.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index b01df58..efe7fa0 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -895,19 +895,19 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
 		if ( ret == REPLY_AGAIN ) break;
 		if ( unlikely( ret == REPLY_CLOSED ) ) {
-			logadd( LOG_INFO, "Uplink: Remote host hung up (%s)", uplink->image->path );
+			logadd( LOG_INFO, "Uplink: Remote host hung up (%s:%d)", PIMG(uplink->image) );
 			goto error_cleanup;
 		}
 		if ( unlikely( ret == REPLY_WRONGMAGIC ) ) {
-			logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s)", uplink->image->path );
+			logadd( LOG_WARNING, "Uplink server's packet did not start with dnbd3_packet_magic (%s:%d)", PIMG(uplink->image) );
 			goto error_cleanup;
 		}
 		if ( unlikely( ret != REPLY_OK ) ) {
-			logadd( LOG_INFO, "Uplink: Connection error %d (%s)", ret, uplink->image->path );
+			logadd( LOG_INFO, "Uplink: Connection error %d (%s:%d)", ret, PIMG(uplink->image) );
 			goto error_cleanup;
 		}
 		if ( unlikely( inReply.size > (uint32_t)_maxPayload ) ) {
-			logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s", inReply.size, uplink->image->path );
+			logadd( LOG_WARNING, "Pure evil: Uplink server sent too much payload (%" PRIu32 ") for %s:%d", inReply.size, PIMG(uplink->image) );
 			goto error_cleanup;
 		}
 
@@ -920,7 +920,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 			}
 		}
 		if ( unlikely( (uint32_t)sock_recv( uplink->current.fd, uplink->recvBuffer, inReply.size ) != inReply.size ) ) {
-			logadd( LOG_INFO, "Lost connection to uplink server of %s (payload)", uplink->image->path );
+			logadd( LOG_INFO, "Lost connection to uplink server of %s:%d (payload)", PIMG(uplink->image) );
 			goto error_cleanup;
 		}
 		// Payload read completely
-- 
cgit v1.2.3-55-g7522


From 023145f531c54bdfa9e329a5caf38a3061dc42c5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 14 Mar 2020 15:41:50 +0100
Subject: [SERVER] Add comments, assert for uplink thread

---
 src/server/uplink.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index efe7fa0..df2f082 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -46,6 +46,8 @@ typedef struct {
 	uint32_t length;
 } prefetch_request_t;
 
+#define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) )
+
 // ############ uplink connection handling
 
 void uplink_globalsInit()
@@ -692,6 +694,7 @@ cleanup: ;
  */
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
+	assert_uplink_thread();
 	// Scan for new requests, or optionally, (re)send all
 	// Build a buffer, so if there aren't too many requests, we can send them after
 	// unlocking the queue again. Otherwise we need flushes during iteration, which
@@ -758,6 +761,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
  */
 static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 {
+	assert_uplink_thread();
 	if ( uplink->current.fd == -1 )
 		return false; // Should never be called in this state, consider send error
 	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
@@ -890,6 +894,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
 	dnbd3_reply_t inReply, outReply;
 	int ret;
+	assert_uplink_thread();
 	for (;;) {
 		ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
 		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
@@ -1023,7 +1028,6 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		outReply.magic = dnbd3_packet_magic;
 		dnbd3_queue_client_t *next;
 		for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) {
-			size_t bytesSent = 0;
 			assert( c->from >= start && c->to <= end );
 			dnbd3_client_t * const client = c->client;
 			outReply.cmd = CMD_GET_BLOCK;
@@ -1038,10 +1042,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 			if ( client->sock != -1 ) {
 				ssize_t sent = writev( client->sock, iov, 2 );
 				if ( sent > (ssize_t)sizeof outReply ) {
-					bytesSent = (size_t)sent - sizeof outReply;
-				}
-				if ( bytesSent != 0 ) {
-					client->bytesSent += bytesSent;
+					client->bytesSent += (size_t)sent - sizeof outReply;
 				}
 			}
 			mutex_unlock( &client->sendMutex );
@@ -1080,6 +1081,7 @@ error_cleanup: ;
  */
 static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
+	assert_uplink_thread();
 	if ( uplink->current.fd == -1 )
 		return;
 	setThreadName( "panic-uplink" );
@@ -1109,6 +1111,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink)
 {
 	static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
+	assert_uplink_thread();
 	mutex_lock( &uplink->sendMutex );
 	bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
 	mutex_unlock( &uplink->sendMutex );
@@ -1182,6 +1185,12 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
 	return uplink->cacheFd != -1;
 }
 
+/**
+ * Returns true if the uplink has been idle for some time (apart from
+ * background replication, if it is set to hashblock, or if it has
+ * a minimum number of active clients configured that is not currently
+ * reached)
+ */
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
 {
 	return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
@@ -1202,7 +1211,12 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
 /**
  * Get number of replication requests that should be sent right now to
  * meet the configured bgrWindowSize. Returns 0 if any client requests
- * are pending
+ * are pending.
+ * This applies a sort of "slow start" in case the uplink was recently
+ * dealing with actual client requests, in that the uplink's idle time
+ * (in seconds) is an upper bound for the number returned, so we don't
+ * saturate the uplink with loads of requests right away, in case that
+ * client triggers more requests to the uplink server.
  */
 static int numWantedReplicationRequests(dnbd3_uplink_t *uplink)
 {
-- 
cgit v1.2.3-55-g7522


From 3680e4819cbd7edbe632372e69533d254f1ae2c2 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 14 Mar 2020 15:51:52 +0100
Subject: [SERVER] Remove uplink_ prefix from static (private) functions

---
 src/server/uplink.c | 80 ++++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index df2f082..d6b319b 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -25,17 +25,17 @@
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
 static void cancelAllRequests(dnbd3_uplink_t *uplink);
-static void uplink_free(ref *ref);
+static void freeUplinkStruct(ref *ref);
 static void* uplink_mainloop(void *data);
-static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
-static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
-static void uplink_handleReceive(dnbd3_uplink_t *uplink);
-static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink);
-static void uplink_addCrc32(dnbd3_uplink_t *uplink);
-static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
-static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
-static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
-static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly);
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
+static void handleReceive(dnbd3_uplink_t *uplink);
+static bool sendKeepalive(dnbd3_uplink_t *uplink);
+static void requestCrc32List(dnbd3_uplink_t *uplink);
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink);
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
 static int numWantedReplicationRequests(dnbd3_uplink_t *uplink);
 static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle);
 static void *prefetchForClient(void *data);
@@ -86,7 +86,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	}
 	uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
 	// Start with one reference for the uplink thread. We'll return it when the thread finishes
-	ref_init( &uplink->reference, uplink_free, 1 );
+	ref_init( &uplink->reference, freeUplinkStruct, 1 );
 	mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
 	mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
 	mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
@@ -132,7 +132,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	return true;
 failure: ;
 	if ( uplink != NULL ) {
-		image->users++; // Expected by uplink_free()
+		image->users++; // Expected by freeUplinkStruct()
 		ref_put( &uplink->reference ); // The ref for the uplink thread that never was
 	}
 	mutex_unlock( &image->lock );
@@ -195,7 +195,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink)
 	uplink->image->problem.queue = false;
 }
 
-static void uplink_free(ref *ref)
+static void freeUplinkStruct(ref *ref)
 {
 	dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
 	logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", PIMG(uplink->image) );
@@ -489,7 +489,7 @@ static void* uplink_mainloop(void *data)
 	thread_detach( uplink->thread );
 	blockNoncriticalSignals();
 	// Make sure file is open for writing
-	if ( !uplink_reopenCacheFd( uplink, false ) ) {
+	if ( !reopenCacheFd( uplink, false ) ) {
 		// It might have failed - still offer proxy mode, we just can't cache
 		logadd( LOG_WARNING, "Cannot open cache file %s for writing (errno=%d); will just proxy traffic without caching!", uplink->image->path, errno );
 	}
@@ -545,16 +545,16 @@ static void* uplink_mainloop(void *data)
 			}
 			// If we don't have a crc32 list yet, see if the new server has one
 			if ( uplink->image->crc32 == NULL ) {
-				uplink_addCrc32( uplink );
+				requestCrc32List( uplink );
 			}
 			// Re-send all pending requests
-			uplink_sendRequests( uplink, false );
-			uplink_sendReplicationRequest( uplink );
+			sendQueuedRequests( uplink, false );
+			sendReplicationRequest( uplink );
 			events[EV_SOCKET].events = POLLIN | POLLRDHUP;
 			if ( uplink->image->problem.uplink ) {
 				// Some of the requests above must have failed again already :-(
 				logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" );
-				uplink_connectionFailed( uplink, true );
+				connectionFailed( uplink, true );
 			}
 			timing_gets( &nextAltCheck, altCheckInterval );
 			// The rtt worker already did the handshake for our image, so there's nothing
@@ -573,18 +573,18 @@ static void* uplink_mainloop(void *data)
 			}
 			if ( uplink->current.fd != -1 ) {
 				// Uplink seems fine, relay requests to it...
-				uplink_sendRequests( uplink, true );
+				sendQueuedRequests( uplink, true );
 			} else if ( uplink->queueLen != 0 ) { // No uplink; maybe it was shutdown since it was idle for too long
 				uplink->idleTime = 0;
 			}
 		}
 		// Uplink socket
 		if ( (events[EV_SOCKET].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
-			uplink_connectionFailed( uplink, true );
+			connectionFailed( uplink, true );
 			logadd( LOG_DEBUG1, "Uplink gone away, panic! (revents=%d)\n", (int)events[EV_SOCKET].revents );
 			setThreadName( "panic-uplink" );
 		} else if ( (events[EV_SOCKET].revents & POLLIN) ) {
-			uplink_handleReceive( uplink );
+			handleReceive( uplink );
 			if ( _shutdown || uplink->shutdown ) goto cleanup;
 		}
 		declare_now;
@@ -595,13 +595,13 @@ static void* uplink_mainloop(void *data)
 			// Keep-alive
 			if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) {
 				// Send keep-alive if nothing is happening, and try to trigger background rep.
-				if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) {
-					uplink_connectionFailed( uplink, true );
+				if ( !sendKeepalive( uplink ) || !sendReplicationRequest( uplink ) ) {
+					connectionFailed( uplink, true );
 					logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" );
 				}
 			}
 			// Don't keep uplink established if we're idle for too much
-			if ( uplink_connectionShouldShutdown( uplink ) ) {
+			if ( connectionShouldShutdown( uplink ) ) {
 				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", PIMG(uplink->image) );
 				goto cleanup;
 			}
@@ -656,7 +656,7 @@ static void* uplink_mainloop(void *data)
 			}
 			mutex_unlock( &uplink->queueLock );
 			if ( resend ) {
-				uplink_sendRequests( uplink, true );
+				sendQueuedRequests( uplink, true );
 			}
 		}
 #endif
@@ -692,7 +692,7 @@ cleanup: ;
 /**
  * Only called from uplink thread.
  */
-static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
+static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
 	assert_uplink_thread();
 	// Scan for new requests, or optionally, (re)send all
@@ -759,7 +759,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
  *
  * @return false if sending request failed, true otherwise (i.e. not necessary/disabled)
  */
-static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
+static bool sendReplicationRequest(dnbd3_uplink_t *uplink)
 {
 	assert_uplink_thread();
 	if ( uplink->current.fd == -1 )
@@ -804,7 +804,7 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 		}
 		if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
 			// Nothing left in current block, find next one
-			replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
+			replicationIndex = findNextIncompleteHashBlock( uplink, endByte );
 		}
 		if ( replicationIndex == -1 ) {
 			// Replication might be complete, uplink_mainloop should take care....
@@ -827,7 +827,7 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 		if ( _backgroundReplication == BGR_HASHBLOCK
 				&& uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
 			// Just crossed a hash block boundary, look for new candidate starting at this very index
-			uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
+			uplink->nextReplicationIndex = findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
 			if ( uplink->nextReplicationIndex == -1 )
 				break;
 		}
@@ -841,7 +841,7 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
  * of a hash block which is neither completely empty nor completely
  * replicated yet. Returns -1 if no match.
  */
-static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
+static int findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
 {
 	int retval = -1;
 	dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image );
@@ -890,7 +890,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
  * Locks on: uplink.lock, images[].lock
  * Only called from uplink thread, so current.fd is assumed to be valid.
  */
-static void uplink_handleReceive(dnbd3_uplink_t *uplink)
+static void handleReceive(dnbd3_uplink_t *uplink)
 {
 	dnbd3_reply_t inReply, outReply;
 	int ret;
@@ -960,7 +960,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		struct iovec iov[2];
 		// 1) Write to cache file
 		if ( unlikely( uplink->cacheFd == -1 ) ) {
-			uplink_reopenCacheFd( uplink, false );
+			reopenCacheFd( uplink, false );
 		}
 		if ( likely( uplink->cacheFd != -1 ) ) {
 			int err = 0;
@@ -980,7 +980,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 					}
 					if ( err == EBADF || err == EINVAL || err == EIO ) {
 						uplink->image->problem.write = true;
-						if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) )
+						if ( !tryAgain || !reopenCacheFd( uplink, true ) )
 							break;
 						tryAgain = false;
 						continue; // Write handle to image successfully re-opened, try again
@@ -1066,20 +1066,20 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		free( entry );
 	} // main receive loop
 	// Trigger background replication if applicable
-	if ( !uplink_sendReplicationRequest( uplink ) ) {
+	if ( !sendReplicationRequest( uplink ) ) {
 		goto error_cleanup;
 	}
 	// Normal end
 	return;
 	// Error handling from failed receive or message parsing
 error_cleanup: ;
-	uplink_connectionFailed( uplink, true );
+	connectionFailed( uplink, true );
 }
 
 /**
  * Only call from uplink thread
  */
-static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
+static void connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
 	assert_uplink_thread();
 	if ( uplink->current.fd == -1 )
@@ -1108,7 +1108,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
  * Send keep alive request to server.
  * Called from uplink thread, current.fd must be valid.
  */
-static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink)
+static bool sendKeepalive(dnbd3_uplink_t *uplink)
 {
 	static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
 	assert_uplink_thread();
@@ -1124,7 +1124,7 @@ static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink)
  * FIXME This is broken as it could happen that another message arrives after sending
  * the request. Refactor, split and move receive into general receive handler.
  */
-static void uplink_addCrc32(dnbd3_uplink_t *uplink)
+static void requestCrc32List(dnbd3_uplink_t *uplink)
 {
 	dnbd3_image_t *image = uplink->image;
 	if ( image == NULL || image->virtualFilesize == 0 ) return;
@@ -1174,7 +1174,7 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
  * it will be closed first. Otherwise, nothing will happen and true will be returned
  * immediately.
  */
-static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
+static bool reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
 {
 	if ( uplink->cacheFd != -1 ) {
 		if ( !force ) return true;
@@ -1191,7 +1191,7 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
  * a minimum number of active clients configured that is not currently
  * reached)
  */
-static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
+static bool connectionShouldShutdown(dnbd3_uplink_t *uplink)
 {
 	return ( uplink->idleTime > SERVER_UPLINK_IDLE_TIMEOUT
 			&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
-- 
cgit v1.2.3-55-g7522


From d3df3ba3005977629b8847b507df1fdae40ffbd5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sat, 14 Mar 2020 17:27:13 +0100
Subject: [SERVER] threadpool: Simplify get code, make debug code _DEBUG only

---
 src/server/threadpool.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 0b46fd6..96162a6 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -62,15 +62,16 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 		logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
 		return false;
 	}
+#ifdef _DEBUG
 	if ( unlikely( startRoutine == NULL ) ) {
 		logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
 		return false; // Or bail out!?
 	}
-	entry_t *entry = NULL;
+#endif
+	entry_t *entry;
 	for ( int i = 0; i < maxIdleThreads; ++i ) {
-		entry_t *cur = pool[i];
-		if ( cur != NULL && atomic_compare_exchange_weak( &pool[i], &cur, NULL ) ) {
-			entry = cur;
+		entry = atomic_exchange( &pool[i], NULL );
+		if ( entry != NULL ) {
 			break;
 		}
 	}
@@ -120,10 +121,12 @@ keep_going:;
 			logadd( LOG_DEBUG1, "Unexpected return value %d for signal_wait in threadpool worker!", ret );
 			continue;
 		}
+#ifdef _DEBUG
 		if ( entry->startRoutine == NULL ) {
 			logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
 			exit( 1 );
 		}
+#endif
 		// Start assigned work
 		(*entry->startRoutine)( entry->arg );
 		// Reset vars for safety
-- 
cgit v1.2.3-55-g7522


From a2cbfba828bd8fcd5803d9786a3b3050823b27fc Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 17 Mar 2020 13:00:39 +0100
Subject: [SERVER] Don't prefetch across hash blocks in BGR_HASHBLOCK mode

---
 src/server/uplink.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index d6b319b..9bf48d3 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -44,7 +44,7 @@ typedef struct {
 	dnbd3_uplink_t *uplink;
 	uint64_t start;
 	uint32_t length;
-} prefetch_request_t;
+} prefetch_job_t;
 
 #define assert_uplink_thread() assert( pthread_equal( uplink->thread, pthread_self() ) )
 
@@ -425,9 +425,12 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han
 success_ref:
 	if ( client != NULL ) {
 		// Was from client -- potential prefetch
+		// Same size as this request, but consider end of image...
 		uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start );
-		if ( len > 0 ) {
-			prefetch_request_t *job = malloc( sizeof( *job ) );
+		// Also don't prefetch if we cross a hash block border and BGR mode == hashblock
+		if ( len > 0 && ( _backgroundReplication != BGR_HASHBLOCK
+					|| req.start % HASH_BLOCK_SIZE == (req.end-1) % HASH_BLOCK_SIZE ) ) {
+			prefetch_job_t *job = malloc( sizeof( *job ) );
 			job->start = req.end;
 			job->length = len;
 			job->uplink = uplink;
@@ -450,7 +453,7 @@ fail_ref:
 
 static void *prefetchForClient(void *data)
 {
-	prefetch_request_t *job = (prefetch_request_t*)data;
+	prefetch_job_t *job = (prefetch_job_t*)data;
 	dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image );
 	if ( cache != NULL ) {
 		if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) {
@@ -458,7 +461,7 @@ static void *prefetchForClient(void *data)
 		}
 		ref_put( &cache->reference );
 	}
-	ref_put( &job->uplink->reference );
+	ref_put( &job->uplink->reference ); // Acquired in uplink_request
 	free( job );
 	return NULL;
 }
-- 
cgit v1.2.3-55-g7522


From 2e70a0836173c9502ff5cddd849165d432a883cb Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 17 Mar 2020 13:01:37 +0100
Subject: [SERVER] Include build type and version in RPC

Added new query type q=version, which uses the STATS access rights.
---
 CMakeLists.txt      |  7 +++----
 get-version.sh      |  2 +-
 src/server/rpc.c    | 10 ++++++++--
 src/server/server.c |  7 +++++--
 src/types.h         |  3 +++
 src/version.h       |  4 ----
 6 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'src')

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc8bfb7..b263f77 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
 
 ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
 ADD_DEFINITIONS(-DWITH_IPV6)
+ADD_DEFINITIONS(-DBUILD_TYPE=${CMAKE_BUILD_TYPE})
 
 FIND_PACKAGE(Threads)
 
@@ -133,14 +134,12 @@ ADD_CUSTOM_TARGET(
 						-P ${CMAKE_BINARY_DIR}/version.cmake
 )
 
-INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/generated )
-
 ################################################################################
 # CLIENT                                                                       #
 ################################################################################
 
 if(BUILD_KERNEL_MODULE)
-	FILE(GLOB_RECURSE CLIENT_SRCS src/client/*.c)
+	FILE(GLOB_RECURSE CLIENT_SRCS ${CMAKE_BINARY_DIR}/generated/version.c src/client/*.c)
 	ADD_EXECUTABLE(dnbd3-client ${CLIENT_SRCS})
 	TARGET_LINK_LIBRARIES(dnbd3-client)
 	ADD_DEPENDENCIES(dnbd3-client version)
@@ -157,7 +156,7 @@ if(BUILD_SERVER)
 		message(" ######################## Building server for AFL mode - will be useless otherwise!")
 		ADD_DEFINITIONS(-DAFL_MODE)
 	ENDIF()
-	FILE(GLOB SERVER_SRCS src/server/*.c src/shared/*.c src/server/picohttpparser/*.c)
+	FILE(GLOB SERVER_SRCS ${CMAKE_BINARY_DIR}/generated/version.c src/server/*.c src/shared/*.c src/server/picohttpparser/*.c)
 	ADD_EXECUTABLE(dnbd3-server ${SERVER_SRCS})
 	TARGET_INCLUDE_DIRECTORIES(dnbd3-server PRIVATE ${JANSSON_INCLUDE_DIR})
 	TARGET_LINK_LIBRARIES(dnbd3-server ${CMAKE_THREAD_LIBS_INIT} ${JANSSON_LIBRARIES})
diff --git a/get-version.sh b/get-version.sh
index 1d4a8cb..5e5b3e1 100755
--- a/get-version.sh
+++ b/get-version.sh
@@ -8,7 +8,7 @@ ROOT_DIR="$(dirname "${SELF}")"
 cd "$ROOT_DIR"
 
 if [ -d .git ]; then
-	[ -n "$(git diff)" ] && MODDED='+MOD'
+	[ -n "$(git diff HEAD)" ] && MODDED='+MOD'
 	echo $(git describe)$MODDED, branch $(git rev-parse --abbrev-ref HEAD), built "$(date +%Y-%m-%d)"
 	exit 0
 fi
diff --git a/src/server/rpc.c b/src/server/rpc.c
index b66b8fe..12ad0dd 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -6,6 +6,7 @@
 #include "image.h"
 #include "altservers.h"
 #include "../shared/sockhelper.h"
+#include "../version.h"
 #include "fileutil.h"
 #include "picohttpparser/picohttpparser.h"
 #include "urldecode.h"
@@ -259,7 +260,7 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
 {
 	bool ok;
 	bool stats = false, images = false, clients = false, space = false;
-	bool logfile = false, config = false, altservers = false;
+	bool logfile = false, config = false, altservers = false, version = false;
 #define SETVAR(var) if ( !var && STRCMP(fields[i].value, #var) ) var = true
 	for (size_t i = 0; i < fields_num; ++i) {
 		if ( !equals( &fields[i].name, &STR_Q ) ) continue;
@@ -270,9 +271,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
 		else SETVAR(logfile);
 		else SETVAR(config);
 		else SETVAR(altservers);
+		else SETVAR(version);
 	}
 #undef SETVAR
-	if ( ( stats || space ) && !(permissions & ACL_STATS) ) {
+	if ( ( stats || space || version ) && !(permissions & ACL_STATS) ) {
 		return sendReply( sock, "403 Forbidden", "text/plain", "No permission to access statistics", -1, keepAlive );
 	}
 	if ( images && !(permissions & ACL_IMAGE_LIST) ) {
@@ -308,6 +310,10 @@ static bool handleStatus(int sock, int permissions, struct field *fields, size_t
 		statisticsJson = json_pack( "{sI}",
 				"runId", randomRunId );
 	}
+	if ( version ) {
+		json_object_set_new( statisticsJson, "version", json_string( VERSION_STRING ) );
+		json_object_set_new( statisticsJson, "build", json_string( TOSTRING( BUILD_TYPE ) ) );
+	}
 	if ( space ) {
 		uint64_t spaceTotal = 0, spaceAvail = 0;
 		file_freeDiskSpace( _basePath, &spaceTotal, &spaceAvail );
diff --git a/src/server/server.c b/src/server/server.c
index c9edc05..71a49b9 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -342,7 +342,10 @@ int main(int argc, char *argv[])
 	net_init();
 	uplink_globalsInit();
 	rpc_init();
-	logadd( LOG_INFO, "DNBD3 server starting.... Machine type: " ENDIAN_MODE );
+	logadd( LOG_INFO, "DNBD3 server starting...." );
+	logadd( LOG_INFO, "Machine type: " ENDIAN_MODE );
+	logadd( LOG_INFO, "Build Type: " TOSTRING( BUILD_TYPE ) );
+	logadd( LOG_INFO, "Version: %s", VERSION_STRING );
 
 	if ( altservers_load() < 0 ) {
 		logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
@@ -385,7 +388,7 @@ int main(int argc, char *argv[])
 		exit( EXIT_FAILURE );
 	}
 
-	logadd( LOG_INFO, "Server is ready. (%s)", VERSION_STRING );
+	logadd( LOG_INFO, "Server is ready." );
 
 	if ( thread_create( &timerThread, NULL, &timerMainloop, NULL ) == 0 ) {
 		hasTimerThread = true;
diff --git a/src/types.h b/src/types.h
index cb0ccfd..dc8e501 100644
--- a/src/types.h
+++ b/src/types.h
@@ -34,6 +34,9 @@
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 #endif
 
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
 #ifdef __GNUC__
 #define UNUSED __attribute__ ((unused))
 #else
diff --git a/src/version.h b/src/version.h
index 0c4a66b..1c17442 100644
--- a/src/version.h
+++ b/src/version.h
@@ -23,8 +23,4 @@
 
 extern const char *VERSION_STRING;
 
-// This is done in a little weird way but otherwise eclipse complains about
-// unresolvable symbols etc...
-#include "version.c"
-
 #endif /* VERSION_H_ */
-- 
cgit v1.2.3-55-g7522


From 79d36aa260f49716ede72cd6bea5cf10aa688651 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 17 Mar 2020 13:26:06 +0100
Subject: [SERVER] Make sure bgrWindowSize doesn't overwhelm uplink queue

---
 src/server/globals.c | 4 ++++
 src/server/uplink.c  | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'src')

diff --git a/src/server/globals.c b/src/server/globals.c
index 98e0ddb..9914f89 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -138,6 +138,10 @@ void globals_loadConfig()
 	}
 	if ( _bgrWindowSize < 1 ) {
 		_bgrWindowSize = 1;
+	} else if ( _bgrWindowSize > UPLINK_MAX_QUEUE - 10 ) {
+		_bgrWindowSize = UPLINK_MAX_QUEUE - 10;
+		logadd( LOG_MINOR, "Limiting bgrWindowSize to %d, because of UPLINK_MAX_QUEUE",
+				_bgrWindowSize );
 	}
 	// Dump config as interpreted
 	char buffer[2000];
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 9bf48d3..af854d6 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -786,6 +786,8 @@ static bool sendReplicationRequest(dnbd3_uplink_t *uplink)
 	const int lastBlockIndex = mapBytes - 1;
 	for ( int bc = 0; bc < numNewRequests; ++bc ) {
 		int endByte;
+		if ( UPLINK_MAX_QUEUE - uplink->queueLen < 10 )
+			break; // Don't overload queue
 		if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
 			endByte = uplink->nextReplicationIndex + mapBytes;
 		} else { // Hashblock based: Only look for match in current hash block
-- 
cgit v1.2.3-55-g7522


From c0bd7d324a4f44b5b5cefc0705ee68177186edf2 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 17 Mar 2020 14:28:42 +0100
Subject: [SERVER] Use first request in queue for RTT measurements

This makes sure the server we're potentially switching to
can at least serve the next request in the queue, making
sure some progress will be made.
---
 src/server/altservers.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 35da3a2..1ba75f4 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -470,6 +470,11 @@ static void *altservers_runCheck(void *data)
 void altservers_findUplink(dnbd3_uplink_t *uplink)
 {
 	altservers_findUplinkInternal( uplink );
+	// Above function is sync, which means normally when it
+	// returns, rttTestResult will not be RTT_INPROGRESS.
+	// But we might have an async call running in parallel, which would
+	// mean the above call returns immediately. Wait for that check
+	// to finish too.
 	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
 		usleep( 5000 );
 	}
@@ -530,6 +535,18 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 	dnbd3_server_connection_t best = { .fd = -1 };
 	unsigned long bestRtt = RTT_UNREACHABLE;
 	unsigned long currentRtt = RTT_UNREACHABLE;
+	uint64_t offset = 0;
+	uint32_t length = DNBD3_BLOCK_SIZE;
+	// Try to use the range of the first request in the queue as RTT block.
+	// In case we have a cluster of servers where none of them has a complete
+	// copy, we at least make sure the one we're potentially switching to
+	// has the next block we're about to request.
+	mutex_lock( &uplink->queueLock );
+	if ( uplink->queue != NULL ) {
+		offset = uplink->queue->from;
+		length = (uint32_t)( uplink->queue->to - offset );
+	}
+	mutex_unlock( &uplink->queueLock );
 	for (itAlt = 0; itAlt < numAlts; ++itAlt) {
 		int server = servers[itAlt];
 		// Connect
@@ -563,9 +580,9 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 		if ( imageSize != image->virtualFilesize ) {
 			ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
 		}
-		// Request first block (NOT random!) ++++++++++++++++++++++++++++++
-		if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
-			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server );
+		// Request block (NOT random! First or from queue) ++++++++++++
+		if ( !dnbd3_get_block( sock, offset, length, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request block", server );
 		}
 		// See if requesting the block succeeded ++++++++++++++++++++++
 		dnbd3_reply_t reply;
@@ -587,9 +604,6 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 		mutex_lock( &uplink->rttLock );
 		const bool isCurrent = ( uplink->current.index == server );
 		mutex_unlock( &uplink->rttLock );
-		// Penaltize rtt if this was a cycle; this will treat this server with lower priority
-		// in the near future too, so we prevent alternating between two servers that are both
-		// part of a cycle and have the lowest latency.
 		uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000
 				+ (end.tv_nsec - start.tv_nsec) / 1000); // µs
 		uint32_t avg = altservers_updateRtt( uplink, server, rtt );
-- 
cgit v1.2.3-55-g7522


From ba617b55eb606ab487f154b124750e121518d5e5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 19 Mar 2020 11:26:12 +0100
Subject: [SERVER] Add name param to threadpool_run

---
 src/server/altservers.c | 2 +-
 src/server/image.c      | 2 ++
 src/server/server.c     | 6 +++---
 src/server/threadpool.c | 8 +++++++-
 src/server/threadpool.h | 3 ++-
 src/server/uplink.c     | 2 +-
 6 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 1ba75f4..5076a05 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -172,7 +172,7 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 	if ( uplink->rttTestResult != RTT_INPROGRESS ) {
 		dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref );
 		if ( current == uplink ) {
-			threadpool_run( &altservers_runCheck, uplink );
+			threadpool_run( &altservers_runCheck, uplink, "UPLINK" );
 		} else if ( current != NULL ) {
 			ref_put( &current->reference );
 		}
diff --git a/src/server/image.c b/src/server/image.c
index 81ec479..0ec1d58 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1817,6 +1817,7 @@ static void* closeUnusedFds(void* nix UNUSED)
 	timing_gets( &deadline, -UNUSED_FD_TIMEOUT );
 	int fds[FDCOUNT];
 	int fdindex = 0;
+	setThreadName( "unused-fd-close" );
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
@@ -1857,6 +1858,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 	static ticks nextSave;
 	declare_now;
 	bool full = timing_reached( &nextSave, &now );
+	setThreadName( "cache-mapper" );
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
diff --git a/src/server/server.c b/src/server/server.c
index 71a49b9..fa7bcda 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -404,7 +404,7 @@ int main(int argc, char *argv[])
 		if ( sigReload ) {
 			sigReload = false;
 			logadd( LOG_INFO, "SIGHUP received, re-scanning image directory" );
-			threadpool_run( &server_asyncImageListLoad, NULL );
+			threadpool_run( &server_asyncImageListLoad, NULL, "IMAGE_RELOAD" );
 		}
 		if ( sigLogCycle ) {
 			sigLogCycle = false;
@@ -431,7 +431,7 @@ int main(int argc, char *argv[])
 			continue;
 		}
 
-		if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client ) ) {
+		if ( !threadpool_run( &net_handleNewConnection, (void *)dnbd3_client, "CLIENT" ) ) {
 			logadd( LOG_ERROR, "Could not start thread for new connection." );
 			free( dnbd3_client );
 			continue;
@@ -574,7 +574,7 @@ static int handlePendingJobs(void)
 	jobHead = *temp; // Make it list head
 	*temp = NULL; // Split off part before that
 	while ( todo != NULL ) {
-		threadpool_run( todo->startRoutine, todo->arg );
+		threadpool_run( todo->startRoutine, todo->arg, "TIMER_TASK" );
 		old = todo;
 		todo = todo->next;
 		if ( old->intervalSecs == 0 ) {
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 96162a6..63ae19f 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -8,6 +8,7 @@ typedef struct _entry_t {
 	dnbd3_signal_t* signal;
 	void *(*startRoutine)(void *);
 	void * arg;
+	const char *name;
 } entry_t;
 
 static void *threadpool_worker(void *entryPtr);
@@ -56,7 +57,7 @@ void threadpool_waitEmpty()
 	} while ( activeThreads != 0 );
 }
 
-bool threadpool_run(void *(*startRoutine)(void *), void *arg)
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name)
 {
 	if ( unlikely( _shutdown ) ) {
 		logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
@@ -97,6 +98,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 	}
 	entry->startRoutine = startRoutine;
 	entry->arg = arg;
+	entry->name = name;
 	atomic_thread_fence( memory_order_release );
 	signal_call( entry->signal );
 	return true;
@@ -126,6 +128,9 @@ keep_going:;
 			logadd( LOG_ERROR, "Worker woke up but has no work to do!" );
 			exit( 1 );
 		}
+		if ( entry->name != NULL ) {
+			setThreadName( entry->name );
+		}
 #endif
 		// Start assigned work
 		(*entry->startRoutine)( entry->arg );
@@ -146,6 +151,7 @@ keep_going:;
 		// Reaching here means pool is full; just let the thread exit
 		break;
 	}
+	setThreadName( "[dead]" );
 	signal_close( entry->signal );
 	free( entry );
 	activeThreads--;
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
index ee0b3aa..d8a526e 100644
--- a/src/server/threadpool.h
+++ b/src/server/threadpool.h
@@ -26,9 +26,10 @@ void threadpool_waitEmpty();
  * Run a thread using the thread pool.
  * @param startRoutine function to run in new thread
  * @param arg argument to pass to thead
+ * @param name STRING CONSTANT (literal) for debugging purposes
  * @return true if thread was started
  */
-bool threadpool_run(void *(*startRoutine)(void *), void *arg);
+bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name);
 
 #endif
 
diff --git a/src/server/uplink.c b/src/server/uplink.c
index af854d6..a7f140f 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -435,7 +435,7 @@ success_ref:
 			job->length = len;
 			job->uplink = uplink;
 			ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it
-			threadpool_run( &prefetchForClient, (void*)job );
+			threadpool_run( &prefetchForClient, (void*)job, "PREFETCH" );
 		}
 	}
 	if ( getUplink ) {
-- 
cgit v1.2.3-55-g7522


From 878a2414b4ed93461bb5b2be7dca026cdc56b43b Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 19 Mar 2020 13:36:23 +0100
Subject: [SERVER] Shorter wait when closing socket after reply

---
 src/server/rpc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/server/rpc.c b/src/server/rpc.c
index 12ad0dd..0002661 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -414,6 +414,8 @@ static bool sendReply(int sock, const char *status, const char *ctype, const cha
 #ifdef AFL_MODE
 		sock = 0;
 #endif
+		// Don't wait too long in case other side ignores the shutdown
+		sock_setTimeout( sock, 600 );
 		while ( read( sock, buffer, sizeof buffer ) > 0 );
 		return false;
 	}
-- 
cgit v1.2.3-55-g7522


From 3d2f1f605e07b511c4ebf79c936c7061dd918957 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 19 Mar 2020 20:43:15 +0100
Subject: [SERVER] Use PCLMUL for crc32 on AMD64 if available

This is about 16x as fast as before with the lookup table
for processing 4 bytes at a time and should work on any AMD64
CPU made in the last decade.
We still need an AltiVec implementation for G5 though.
---
 src/shared/crc32.c | 221 +++++++++++++++++++++++++++++++++++++++++------------
 src/types.h        |  12 +--
 2 files changed, 178 insertions(+), 55 deletions(-)

(limited to 'src')

diff --git a/src/shared/crc32.c b/src/shared/crc32.c
index db941d3..50f476a 100644
--- a/src/shared/crc32.c
+++ b/src/shared/crc32.c
@@ -41,21 +41,20 @@
 #include "../types.h"
 #include <stddef.h>
 
-#define FAR
+#if defined(__x86_64__) || defined(__amd64__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+#include <stdatomic.h>
+#define zalign(n) __attribute__((aligned(n)))
+#endif
+
 #define OF(args) args
-#define local static
 
 /* Definitions for doing the crc four data bytes at a time. */
-#if !defined(NOBYFOUR)
-#  define BYFOUR
-#endif
-#ifdef BYFOUR
-#  define TBLS 8
-#else
-#  define TBLS 1
-#endif /* BYFOUR */
+#define TBLS 8
 
-local const uint32_t crc_table[TBLS][256] =
+static const uint32_t crc_table[TBLS][256] =
 {
   {
     0x00000000U, 0x77073096U, 0xee0e612cU, 0x990951baU, 0x076dc419U,
@@ -110,7 +109,6 @@ local const uint32_t crc_table[TBLS][256] =
     0xcdd70693U, 0x54de5729U, 0x23d967bfU, 0xb3667a2eU, 0xc4614ab8U,
     0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
     0x2d02ef8dU
-#ifdef BYFOUR
   },
   {
     0x00000000U, 0x191b3141U, 0x32366282U, 0x2b2d53c3U, 0x646cc504U,
@@ -489,38 +487,159 @@ local const uint32_t crc_table[TBLS][256] =
     0x95e6b8b1U, 0x7b490da3U, 0x1e2eb11bU, 0x483ed243U, 0x2d596efbU,
     0xc3f6dbe9U, 0xa6916751U, 0x1fa9b0ccU, 0x7ace0c74U, 0x9461b966U,
     0xf10605deU
-#endif
   }
 };
 
-#ifdef NO_ENDIAN
-// Currently not in use, always use the BYFOUR method with known endianness
-/* ========================================================================= */
-#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+#define PCLMUL_MIN_LEN 64
+#define PCLMUL_ALIGN 16
+#define PCLMUL_ALIGN_MASK 15
 
-/* ========================================================================= */
-uint32_t crc32(crc, buf, len)
-    uint32_t crc;
-    const uint8_t *buf;
-    size_t len;
+#if defined(__x86_64__) || defined(__amd64__)
+/* crc32_simd.c
+ *
+ * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ *
+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+static uint32_t
+__attribute__((target("pclmul")))
+crc32pclmul(uint32_t crc, const uint8_t *buf, size_t len)
 {
-    if (buf == NULL) return 0;
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+     * the CRC32+Barrett polynomials given at the end of the paper.
+     */
+    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+    /*
+     * There's at least one block of 64.
+     */
+    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+
+    x0 = _mm_load_si128((__m128i *)k1k2);
+
+    buf += 64;
+    len -= 64;
+
+    /*
+     * Parallel fold blocks of 64, if any.
+     */
+    while (len >= 64)
+    {
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+
+        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        x4 = _mm_xor_si128(x4, x8);
+
+        x1 = _mm_xor_si128(x1, y5);
+        x2 = _mm_xor_si128(x2, y6);
+        x3 = _mm_xor_si128(x3, y7);
+        x4 = _mm_xor_si128(x4, y8);
 
-    crc = crc ^ 0xffffffffU;
-    while (len >= 8) {
-        DO8;
-        len -= 8;
+        buf += 64;
+        len -= 64;
     }
-    if (len) do {
-        DO1;
-    } while (--len);
-    return crc ^ 0xffffffffU;
+
+    /*
+     * Fold into 128-bits.
+     */
+    x0 = _mm_load_si128((__m128i *)k3k4);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x2);
+    x1 = _mm_xor_si128(x1, x5);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x3);
+    x1 = _mm_xor_si128(x1, x5);
+
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x4);
+    x1 = _mm_xor_si128(x1, x5);
+
+    /*
+     * Single fold blocks of 16, if any.
+     */
+    while (len >= 16)
+    {
+        x2 = _mm_loadu_si128((__m128i *)buf);
+
+        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+        x1 = _mm_xor_si128(x1, x2);
+        x1 = _mm_xor_si128(x1, x5);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
+    x1 = _mm_srli_si128(x1, 8);
+    x1 = _mm_xor_si128(x1, x2);
+
+    x0 = _mm_loadl_epi64((__m128i*)k5k0);
+
+    x2 = _mm_srli_si128(x1, 4);
+    x1 = _mm_and_si128(x1, x3);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+
+    /*
+     * Barret reduce to 32-bits.
+     */
+    x0 = _mm_load_si128((__m128i*)poly);
+
+    x2 = _mm_and_si128(x1, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+    x2 = _mm_and_si128(x2, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+
+    /*
+     * Return the crc32.
+     */
+    return _mm_extract_epi32(x1, 1);
 }
 #endif
 
-#ifdef BYFOUR
-
 /*
    This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
    integer pointer type. This violates the strict aliasing rule, where a
@@ -533,7 +652,7 @@ uint32_t crc32(crc, buf, len)
    writes to the buffer that is passed to these routines.
  */
 
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
 /* ========================================================================= */
 #define DOLIT4 c ^= *buf4++; \
         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
@@ -547,16 +666,25 @@ uint32_t crc32(crc, buf, len)
     size_t len;
 {
     if (buf == NULL) return 0;
-    register uint32_t c;
-    register const uint32_t FAR *buf4;
+    uint32_t c;
 
     c = ~crc;
-    while (len && ((uintptr_t)buf & 3)) {
+    while (len && ((uintptr_t)buf & PCLMUL_ALIGN_MASK)) {
         c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
         len--;
     }
-
-    buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+#if defined(__x86_64__) || defined(__amd64__)
+    static  atomic_int pclmul = -1;
+    if (pclmul == -1) {
+        pclmul = __builtin_cpu_supports("pclmul");
+    }
+    if (pclmul && len >= PCLMUL_MIN_LEN) {
+        c = crc32pclmul(c, buf, len & ~PCLMUL_ALIGN_MASK);
+        buf += len & ~PCLMUL_ALIGN_MASK;
+        len &= PCLMUL_ALIGN_MASK;
+    }
+#else
+    const uint32_t *buf4 = (const uint32_t *)(const void *)buf;
     while (len >= 32) {
         DOLIT32;
         len -= 32;
@@ -565,7 +693,8 @@ uint32_t crc32(crc, buf, len)
         DOLIT4;
         len -= 4;
     }
-    buf = (const uint8_t FAR *)buf4;
+    buf = (const uint8_t *)buf4;
+#endif
 
     if (len) do {
         c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
@@ -575,7 +704,7 @@ uint32_t crc32(crc, buf, len)
 }
 #endif
 
-#ifdef BIG_ENDIAN
+#ifdef DNBD3_BIG_ENDIAN
 /* ========================================================================= */
 #define DOBIG4 c ^= *buf4++; \
         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
@@ -590,7 +719,7 @@ uint32_t crc32(crc, buf, len)
 {
     if (buf == NULL) return 0;
     register uint32_t c;
-    register const uint32_t FAR *buf4;
+    register const uint32_t *buf4;
 
     c = ~net_order_32(crc);
     while (len && ((uintptr_t)buf & 3)) {
@@ -598,7 +727,7 @@ uint32_t crc32(crc, buf, len)
         len--;
     }
 
-    buf4 = (const uint32_t FAR *)(const void FAR *)buf;
+    buf4 = (const uint32_t *)(const void *)buf;
     while (len >= 32) {
         DOBIG32;
         len -= 32;
@@ -607,7 +736,7 @@ uint32_t crc32(crc, buf, len)
         DOBIG4;
         len -= 4;
     }
-    buf = (const uint8_t FAR *)buf4;
+    buf = (const uint8_t *)buf4;
 
     if (len) do {
         c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
@@ -617,5 +746,3 @@ uint32_t crc32(crc, buf, len)
 }
 #endif
 
-#endif /* BYFOUR */
-
diff --git a/src/types.h b/src/types.h
index dc8e501..83416f4 100644
--- a/src/types.h
+++ b/src/types.h
@@ -95,9 +95,7 @@
 	(a).size = net_order_32((a).size); \
 } while (0)
 #define ENDIAN_MODE "Big Endian"
-#ifndef BIG_ENDIAN
-#define BIG_ENDIAN
-#endif
+#define DNBD3_BIG_ENDIAN
 #elif defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__i386__) || defined(__i386) || defined(__x86_64)
 #define dnbd3_packet_magic ((uint16_t)( (0x73) | (0x72 << 8) ))
 // Make little endian our network byte order as probably 99.999% of machines this will be used on are LE
@@ -107,9 +105,7 @@
 #define fixup_request(a) while(0)
 #define fixup_reply(a)   while(0)
 #define ENDIAN_MODE "Little Endian"
-#ifndef LITTLE_ENDIAN
-#define LITTLE_ENDIAN
-#endif
+#define DNBD3_LITTLE_ENDIAN
 #else
 #error "Unknown Endianness"
 #endif
@@ -156,10 +152,10 @@ typedef struct __attribute__((packed))
 	uint32_t size;            // 4byte
 	union {
 		struct {
-#ifdef LITTLE_ENDIAN
+#ifdef DNBD3_LITTLE_ENDIAN
 			uint64_t offset_small:56;  // 7byte
 			uint8_t  hops;            // 1byte
-#elif defined(BIG_ENDIAN)
+#elif defined(DNBD3_BIG_ENDIAN)
 			uint8_t  hops;            // 1byte
 			uint64_t offset_small:56;  // 7byte
 #endif
-- 
cgit v1.2.3-55-g7522


From 0f47d29912b0e3d0e387db715a16b7b4f273f389 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 19 Mar 2020 21:15:42 +0100
Subject: [SERVER] crc32: Fix compile with optimizations

Should have tested in "Release" mode I guess.
Seems we're at about 24x performance this way, so hooray.
---
 src/shared/crc32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/shared/crc32.c b/src/shared/crc32.c
index 50f476a..098615f 100644
--- a/src/shared/crc32.c
+++ b/src/shared/crc32.c
@@ -508,7 +508,7 @@ static const uint32_t crc_table[TBLS][256] =
  *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
  */
 static uint32_t
-__attribute__((target("pclmul")))
+__attribute__((target("pclmul,sse4.1")))
 crc32pclmul(uint32_t crc, const uint8_t *buf, size_t len)
 {
     /*
@@ -676,7 +676,7 @@ uint32_t crc32(crc, buf, len)
 #if defined(__x86_64__) || defined(__amd64__)
     static  atomic_int pclmul = -1;
     if (pclmul == -1) {
-        pclmul = __builtin_cpu_supports("pclmul");
+        pclmul = __builtin_cpu_supports("pclmul") && __builtin_cpu_supports("sse4.1");
     }
     if (pclmul && len >= PCLMUL_MIN_LEN) {
         c = crc32pclmul(c, buf, len & ~PCLMUL_ALIGN_MASK);
-- 
cgit v1.2.3-55-g7522


From be628c705594a36f6aa649613ddf6c86039192a1 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 19 Mar 2020 21:45:12 +0100
Subject: [SHARED] crc32: Don't skip table lookup if PCLMUL is unavailable

---
 src/shared/crc32.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/shared/crc32.c b/src/shared/crc32.c
index 098615f..c3e566f 100644
--- a/src/shared/crc32.c
+++ b/src/shared/crc32.c
@@ -682,19 +682,20 @@ uint32_t crc32(crc, buf, len)
         c = crc32pclmul(c, buf, len & ~PCLMUL_ALIGN_MASK);
         buf += len & ~PCLMUL_ALIGN_MASK;
         len &= PCLMUL_ALIGN_MASK;
-    }
-#else
-    const uint32_t *buf4 = (const uint32_t *)(const void *)buf;
-    while (len >= 32) {
-        DOLIT32;
-        len -= 32;
-    }
-    while (len >= 4) {
-        DOLIT4;
-        len -= 4;
-    }
-    buf = (const uint8_t *)buf4;
+    } else
 #endif
+    do {
+        const uint32_t *buf4 = (const uint32_t *)(const void *)buf;
+        while (len >= 32) {
+            DOLIT32;
+            len -= 32;
+        }
+        while (len >= 4) {
+            DOLIT4;
+            len -= 4;
+        }
+        buf = (const uint8_t *)buf4;
+    } while (0);
 
     if (len) do {
         c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
-- 
cgit v1.2.3-55-g7522


From a9f5b836d9fddb3e1851c5b0a77c566b0f267ead Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 20 Mar 2020 12:08:10 +0100
Subject: [SERVER] Fix warnings, add assertions

---
 src/server/globals.h |  2 +-
 src/server/image.c   |  7 +++++--
 src/server/uplink.c  | 15 +++++++++------
 3 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 5cee92a..08ec303 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -103,7 +103,7 @@ struct _dnbd3_uplink
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
 	atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
 	int queueLen;               // length of queue
-	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
+	int idleTime;               // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queue_entry_t *queue;
 	atomic_uint_fast32_t queueId;
 	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
diff --git a/src/server/image.c b/src/server/image.c
index 0ec1d58..ef40325 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -440,6 +440,7 @@ dnbd3_image_t* image_lock(dnbd3_image_t *image)
 	mutex_lock( &imageListLock );
 	for (i = 0; i < _num_images; ++i) {
 		if ( _images[i] == image ) {
+			assert( _images[i]->id == image->id );
 			image->users++;
 			mutex_unlock( &imageListLock );
 			return image;
@@ -470,6 +471,7 @@ dnbd3_image_t* image_release(dnbd3_image_t *image)
 	// responsible for freeing it
 	for (int i = 0; i < _num_images; ++i) {
 		if ( _images[i] == image ) { // Found, do nothing
+			assert( _images[i]->id == image->id );
 			mutex_unlock( &imageListLock );
 			return NULL;
 		}
@@ -509,6 +511,7 @@ static dnbd3_image_t* image_remove(dnbd3_image_t *image)
 	mutex_lock( &imageListLock );
 	for ( int i = _num_images - 1; i >= 0; --i ) {
 		if ( _images[i] == image ) {
+			assert( _images[i]->id == image->id );
 			_images[i] = NULL;
 			mustFree = ( image->users == 0 );
 		}
@@ -1088,7 +1091,7 @@ bool image_create(char *image, int revision, uint64_t size)
 		logadd( LOG_ERROR, "revision id invalid: %d", revision );
 		return false;
 	}
-	char path[PATHLEN], cache[PATHLEN];
+	char path[PATHLEN], cache[PATHLEN+4];
 	char *lastSlash = strrchr( image, '/' );
 	if ( lastSlash == NULL ) {
 		snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
@@ -1099,7 +1102,7 @@ bool image_create(char *image, int revision, uint64_t size)
 		*lastSlash = '/';
 		snprintf( path, PATHLEN, "%s/%s.r%d", _basePath, image, revision );
 	}
-	snprintf( cache, PATHLEN, "%s.map", path );
+	snprintf( cache, PATHLEN+4, "%s.map", path );
 	size = (size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 	const int mapsize = IMGSIZE_TO_MAPBYTES(size);
 	// Write files
diff --git a/src/server/uplink.c b/src/server/uplink.c
index a7f140f..f5ac6ac 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -403,8 +403,9 @@ bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t han
 		mutex_unlock( &uplink->sendMutex );
 		logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 	} else {
-		const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start,
-				req.handle, COND_HOPCOUNT( uplink->current.version, hops ) );
+		const bool ret = dnbd3_get_block( uplink->current.fd, req.start,
+				(uint32_t)( req.end - req.start ), req.handle,
+				COND_HOPCOUNT( uplink->current.version, hops ) );
 		if ( unlikely( !ret ) ) {
 			markRequestUnsent( uplink, req.handle );
 			uplink->image->problem.uplink = true;
@@ -426,7 +427,8 @@ success_ref:
 	if ( client != NULL ) {
 		// Was from client -- potential prefetch
 		// Same size as this request, but consider end of image...
-		uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start );
+		uint32_t len = (uint32_t)MIN( uplink->image->virtualFilesize - req.end,
+				req.end - req.start );
 		// Also don't prefetch if we cross a hash block border and BGR mode == hashblock
 		if ( len > 0 && ( _backgroundReplication != BGR_HASHBLOCK
 					|| req.start % HASH_BLOCK_SIZE == (req.end-1) % HASH_BLOCK_SIZE ) ) {
@@ -592,7 +594,8 @@ static void* uplink_mainloop(void *data)
 		}
 		declare_now;
 		uint32_t timepassed = timing_diff( &lastKeepalive, &now );
-		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
+		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL
+				|| ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
 			lastKeepalive = now;
 			uplink->idleTime += timepassed;
 			// Keep-alive
@@ -714,8 +717,8 @@ static void sendQueuedRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		dnbd3_request_t *hdr = &reqs[count++];
 		hdr->magic = dnbd3_packet_magic;
 		hdr->cmd = CMD_GET_BLOCK;
-		hdr->size = it->to - it->from;
-		hdr->offset_small = it->from;
+		hdr->size = (uint32_t)( it->to - it->from );
+		hdr->offset = it->from; // Offset first, then hops! (union)
 		hdr->hops = COND_HOPCOUNT( uplink->current.version, it->hopCount );
 		hdr->handle = it->handle;
 		fixup_request( *hdr );
-- 
cgit v1.2.3-55-g7522


From 269abbd82cf98eaeac85f97d6b5fee0d20751163 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 20 Mar 2020 12:26:11 +0100
Subject: [SERVER] Forbid hidden files when scanning image dir

---
 src/server/image.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index ef40325..67a763c 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -697,7 +697,8 @@ static bool image_load_all_internal(char *base, char *path)
 
 	while ( !_shutdown && (entryPtr = readdir( dir )) != NULL ) {
 		entry = *entryPtr;
-		if ( strcmp( entry.d_name, "." ) == 0 || strcmp( entry.d_name, ".." ) == 0 ) continue;
+		if ( entry.d_name[0] == '.' )
+			continue; // No hidden files, no . or ..
 		if ( strlen( entry.d_name ) > SUBDIR_LEN ) {
 			logadd( LOG_WARNING, "Skipping entry %s: Too long (max %d bytes)", entry.d_name, (int)SUBDIR_LEN );
 			continue;
-- 
cgit v1.2.3-55-g7522


From 894eeb86f872a7f7f5f36bfa8649da3075dd28d6 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 20 Mar 2020 21:22:18 +0100
Subject: [SERVER] Remember atime in .meta file

---
 src/server/globals.h |   1 +
 src/server/image.c   | 198 +++++++++++++++++++++++++++++++++++----------------
 src/server/net.c     |   2 +
 3 files changed, 139 insertions(+), 62 deletions(-)

(limited to 'src')

diff --git a/src/server/globals.h b/src/server/globals.h
index 08ec303..95d8ec2 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -155,6 +155,7 @@ struct _dnbd3_image
 		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
+	bool accessed;         // image was accessed since .meta was written
 	pthread_mutex_t lock;
 };
 #define PIMG(x) (x)->name, (int)(x)->rid
diff --git a/src/server/image.c b/src/server/image.c
index 67a763c..4944bfd 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -55,10 +55,12 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
 static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
 static void* closeUnusedFds(void*);
-static bool imageShouldSaveCacheMap(dnbd3_image_t *image);
+static bool isImageFromUpstream(dnbd3_image_t *image);
 static void* saveLoadAllCacheMaps(void*);
 static void saveCacheMap(dnbd3_image_t *image);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime);
+static void loadImageMeta(dnbd3_image_t *image);
 
 static void cmfree(ref *ref)
 {
@@ -630,8 +632,11 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	// this will get called again when the uplink is done.
 	if ( !uplink_shutdown( image ) )
 		return NULL;
-	if ( imageShouldSaveCacheMap( image ) ) {
-		saveCacheMap( image );
+	if ( isImageFromUpstream( image ) ) {
+		saveMetaData( image, NULL, 0 );
+		if ( image->ref_cacheMap != NULL ) {
+			saveCacheMap( image );
+		}
 	}
 	mutex_lock( &image->lock );
 	ref_setref( &image->ref_cacheMap, NULL );
@@ -757,7 +762,6 @@ static bool image_addToList(dnbd3_image_t *image)
 static bool image_load(char *base, char *path, bool withUplink)
 {
 	int revision = -1;
-	struct stat st;
 	dnbd3_cache_map_t *cache = NULL;
 	uint32_t *crc32list = NULL;
 	dnbd3_image_t *existing = NULL;
@@ -901,15 +905,7 @@ static bool image_load(char *base, char *path, bool withUplink)
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
 	mutex_init( &image->lock, LOCK_IMAGE );
-	int32_t offset;
-	if ( stat( path, &st ) == 0 ) {
-		// Negatively offset atime by file modification time
-		offset = (int32_t)( st.st_mtime - time( NULL ) );
-		if ( offset > 0 ) offset = 0;
-	} else {
-		offset = 0;
-	}
-	timing_gets( &image->atime, offset );
+	loadImageMeta( image );
 
 	// Prevent freeing in cleanup
 	cache = NULL;
@@ -1843,12 +1839,10 @@ static void* closeUnusedFds(void* nix UNUSED)
 	return NULL;
 }
 
-static bool imageShouldSaveCacheMap(dnbd3_image_t *image)
+static bool isImageFromUpstream(dnbd3_image_t *image)
 {
 	if ( !_isProxy )
 		return false; // Nothing to do
-	if ( image->ref_cacheMap == NULL )
-		return false; // Nothing to do
 	// Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
 	// for which we have any upstream servers configured. If there's none, don't touch
 	// the cache map on disk.
@@ -1862,66 +1856,71 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 	static ticks nextSave;
 	declare_now;
 	bool full = timing_reached( &nextSave, &now );
+	time_t walltime = full ? time( NULL ) : 0;
 	setThreadName( "cache-mapper" );
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
-		dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-		if ( cache == NULL )
-			continue; // No users++ or mutex_unlock yet -> safe
 		image->users++;
 		mutex_unlock( &imageListLock );
-		if ( imageShouldSaveCacheMap( image ) ) {
-			// Replicated image, we're responsible for updating the map, so save it
-			// Save if dirty bit is set, blocks were invalidated
-			bool save = cache->dirty;
-			dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-			if ( !save ) {
-				// Otherwise, consider longer timeout and byte count limits of uplink
-				if ( uplink != NULL ) {
-					assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
-					uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
-					if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
-						save = true;
+		const bool fromUpstream = isImageFromUpstream( image );
+		dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+		if ( cache != NULL ) {
+			if ( fromUpstream ) {
+				// Replicated image, we're responsible for updating the map, so save it
+				// Save if dirty bit is set, blocks were invalidated
+				bool save = cache->dirty;
+				dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+				if ( !save ) {
+					// Otherwise, consider longer timeout and byte count limits of uplink
+					if ( uplink != NULL ) {
+						assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+						uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
+						if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
+							save = true;
+						}
 					}
 				}
-			}
-			if ( save ) {
-				cache->dirty = false;
+				if ( save ) {
+					cache->dirty = false;
+					if ( uplink != NULL ) {
+						uplink->bytesReceivedLastSave = uplink->bytesReceived;
+					}
+					saveCacheMap( image );
+				}
 				if ( uplink != NULL ) {
-					uplink->bytesReceivedLastSave = uplink->bytesReceived;
+					ref_put( &uplink->reference );
 				}
-				saveCacheMap( image );
-			}
-			if ( uplink != NULL ) {
-				ref_put( &uplink->reference );
-			}
-		} else {
-			// We're not replicating this image, if there's a cache map, reload
-			// it periodically, since we might read from a shared storage that
-			// another server instance is writing to.
-			if ( full || ( !cache->unchanged && !image->problem.read ) ) {
-				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
-				dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
-				if ( onDisk == NULL ) {
-					// Should be complete now
-					logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
-					ref_setref( &image->ref_cacheMap, NULL );
-				} else {
-					const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
-					if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
-						// Unchanged
-						cache->unchanged = true;
-						onDisk->reference.free( &onDisk->reference );
+			} else {
+				// We're not replicating this image, if there's a cache map, reload
+				// it periodically, since we might read from a shared storage that
+				// another server instance is writing to.
+				if ( full || ( !cache->unchanged && !image->problem.read ) ) {
+					logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
+					dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
+					if ( onDisk == NULL ) {
+						// Should be complete now
+						logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
+						ref_setref( &image->ref_cacheMap, NULL );
 					} else {
-						// Replace
-						ref_setref( &image->ref_cacheMap, &onDisk->reference );
-						logadd( LOG_DEBUG2, "Map changed" );
+						const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+						if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
+							// Unchanged
+							cache->unchanged = true;
+							onDisk->reference.free( &onDisk->reference );
+						} else {
+							// Replace
+							ref_setref( &image->ref_cacheMap, &onDisk->reference );
+							logadd( LOG_DEBUG2, "Map changed" );
+						}
 					}
 				}
-			}
+			} // end reload cache map
+			ref_put( &cache->reference );
+		} // end has cache map
+		if ( full && fromUpstream ) {
+			saveMetaData( image, &now, walltime );
 		}
-		ref_put( &cache->reference );
 		image_release( image ); // Always do this instead of users-- to handle freeing
 		mutex_lock( &imageListLock );
 	}
@@ -2023,3 +2022,78 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
 	}
 	mutex_unlock( &image->lock );
 }
+
+/**
+ * It's assumed you hold a reference to the image
+ */
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime)
+{
+	if ( !image->accessed )
+		return;
+	ticks tmp;
+	uint32_t diff;
+	char *fn;
+	if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+		logadd( LOG_WARNING, "Cannot asprintf meta" );
+		return;
+	}
+	if ( now == NULL ) {
+		timing_get( &tmp );
+		now = &tmp;
+		walltime = time( NULL );
+	}
+	mutex_lock( &image->lock );
+	image->accessed = false;
+	diff = timing_diff( &image->atime, now );
+	mutex_unlock( &image->lock );
+	FILE *f = fopen( fn, "w" );
+	if ( f == NULL ) {
+		logadd( LOG_WARNING, "Cannot open %s for writing", fn );
+	} else {
+		fprintf( f, "[main]\natime=%"PRIu64"\n", (uint64_t)( walltime - diff ) );
+		fclose( f );
+	}
+	free( fn );
+	// TODO: fsync() dir
+}
+
+static void loadImageMeta(dnbd3_image_t *image)
+{
+	int32_t offset = 1;
+	char *fn;
+	if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+		logadd( LOG_WARNING, "asprintf load" );
+	} else {
+		int fh = open( fn, O_RDONLY );
+		free( fn );
+		if ( fh != -1 ) {
+			char buf[200];
+			ssize_t ret = read( fh, buf, sizeof(buf)-1 );
+			close( fh );
+			if ( ret > 0 ) {
+				buf[ret] = '\0';
+				// Do it the cheap way until we actually store more stuff
+				char *pos = strstr( buf, "atime=" );
+				if ( pos != NULL ) {
+					offset = (int32_t)( atol( pos + 6 ) - time( NULL ) );
+				}
+			}
+		}
+	}
+	if ( offset == 1 ) {
+		// Nothing from .meta file, use old guesstimate
+		struct stat st;
+		if ( stat( image->path, &st ) == 0 ) {
+			// Negatively offset atime by file modification time
+			offset = (int32_t)( st.st_mtime - time( NULL ) );
+		} else {
+			offset = 0;
+		}
+		image->accessed = true;
+	}
+	if ( offset > 0 ) {
+		offset = 0;
+	}
+	timing_gets( &image->atime, offset );
+}
+
diff --git a/src/server/net.c b/src/server/net.c
index 9ba9dbc..6b930df 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -289,6 +289,7 @@ void* net_handleNewConnection(void *clientPtr)
 					if ( !client->isServer ) {
 						// Only update immediately if this is a client. Servers are handled on disconnect.
 						timing_get( &image->atime );
+						image->accessed = true;
 					}
 					mutex_unlock( &image->lock );
 					serializer_reset_write( &payload );
@@ -515,6 +516,7 @@ exit_client_cleanup: ;
 	if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
 		mutex_lock( &image->lock );
 		timing_get( &image->atime );
+		image->accessed = true;
 		mutex_unlock( &image->lock );
 	}
 	if ( cache != NULL ) {
-- 
cgit v1.2.3-55-g7522


From 19ec1693667390c064a45b0c6f13f1b3350cbb3f Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 20 Mar 2020 21:39:32 +0100
Subject: [SERVER] image_ensureDiskSpace should only delete proxied images

---
 src/server/image.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 4944bfd..2273dc2 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1732,17 +1732,17 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
 /**
  * Make sure at least size bytes are available in _basePath.
  * Will delete old images to make room for new ones.
- * TODO: Store last access time of images. Currently the
- * last access time is reset to the file modification time
- * on server restart. Thus it will
- * currently only delete images if server uptime is > 24 hours.
+ * It will only delete images if a configurable uptime is
+ * reached.
  * This can be overridden by setting force to true, in case
  * free space is desperately needed.
  * Return true iff enough space is available. false in random other cases
  */
 static bool image_ensureDiskSpace(uint64_t size, bool force)
 {
-	for ( int maxtries = 0; maxtries < 20; ++maxtries ) {
+	if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 )
+		return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
+	for ( int maxtries = 0; maxtries < 50; ++maxtries ) {
 		uint64_t available;
 		if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
 			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", errno );
@@ -1750,28 +1750,29 @@ static bool image_ensureDiskSpace(uint64_t size, bool force)
 		}
 		if ( available > size )
 			return true; // Yay
-		if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 )
-			return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
 		if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) {
-			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...", (int)(available / (1024ll * 1024ll)),
-					(int)(size / (1024 * 1024)), _autoFreeDiskSpaceDelay / 60 );
+			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...",
+					(int)(available / (1024ll * 1024)),
+					(int)(size / (1024ll * 1024)), _autoFreeDiskSpaceDelay / 60 );
 			return false;
 		}
-		logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...", (int)(available / (1024ll * 1024ll)),
-				(int)(size / (1024 * 1024)) );
+		logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, freeing an image...",
+				(int)(available / (1024ll * 1024)),
+				(int)(size / (1024ll * 1024)) );
 		// Find least recently used image
 		dnbd3_image_t *oldest = NULL;
 		int i;
 		mutex_lock( &imageListLock );
 		for (i = 0; i < _num_images; ++i) {
 			dnbd3_image_t *current = _images[i];
-			if ( current == NULL ) continue;
-			if ( current->users == 0 ) { // Not in use :-)
-				if ( oldest == NULL || timing_1le2( &current->atime, &oldest->atime ) ) {
-					// Oldest access time so far
-					oldest = current;
-				}
-			}
+			if ( current == NULL || current->users != 0 )
+				continue; // Empty slot or in use
+			if ( oldest != NULL && timing_1le2( &oldest->atime, &current->atime ) )
+				continue; // Already got a newer one
+			if ( !isImageFromUpstream( current ) )
+				continue; // Not replicated, don't touch
+			// Oldest access time so far
+			oldest = current;
 		}
 		if ( oldest != NULL ) {
 			oldest->users++;
-- 
cgit v1.2.3-55-g7522


From 431ddd8bfb78a20f7d2739c95aefb1402c228091 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 23 Mar 2020 10:53:27 +0100
Subject: [SHARED] connect_ms might change above, don't check

---
 src/shared/sockhelper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/shared/sockhelper.c b/src/shared/sockhelper.c
index ec80659..9e9109c 100644
--- a/src/shared/sockhelper.c
+++ b/src/shared/sockhelper.c
@@ -346,7 +346,7 @@ int sock_multiConnect(poll_list_t* list, const dnbd3_host_t* host, int connect_m
 		if ( i != list->count ) list->entry[i] = list->entry[list->count];
 		if ( fd != -1 ) {
 			sock_set_block( fd );
-			if ( rw_ms != -1 && rw_ms != connect_ms ) {
+			if ( rw_ms != -1 ) {
 				sock_setTimeout( fd, rw_ms );
 			}
 			return fd;
-- 
cgit v1.2.3-55-g7522


From 411051a14781b004705e45e6fb2842e0b635812e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 23 Mar 2020 11:32:29 +0100
Subject: [SERVER] Make lock/thread debugging dedicated cmake option

---
 CMakeLists.txt     | 26 +++++++++++++++++---------
 src/server/locks.c |  2 +-
 src/server/locks.h |  2 +-
 3 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'src')

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b061bb..952ff00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,8 @@ OPTION(BUILD_FUSE_CLIENT "Build dnbd3 fuse client" ON)
 OPTION(BUILD_SERVER "Build dnbd3 server" ON)
 OPTION(BUILD_STRESSTEST "Build dnbd3 stress testing tool" OFF)
 SET(EXTRA_C_FLAGS "" CACHE STRING "Additional options to pass to compiler")
+OPTION(DEBUG_LOCKS "Add lock debugging code to dnbd3-server" OFF)
+OPTION(DEBUG_THREADS "Add thread debugging code to dnbd3-server" OFF)
 
 OPTION(SERVER_FOR_AFL "Build dnbd3-server for usage with afl-fuzz" OFF)
 
@@ -47,16 +49,16 @@ macro (TRY_ADD_FLAG _FLAG)
 endmacro()
 
 # Common for gcc and clang
-SET(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,relro,-z,now,-z,defs -pie")
-SET(CMAKE_C_FLAGS "-fPIE -std=c11 -Wno-multichar -fno-strict-aliasing -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 ${EXTRA_C_FLAGS}")
-SET(CMAKE_C_FLAGS_RELEASE " -O3 -Wno-unused-result -DNDEBUG")
+#SET(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,relro,-z,now,-z,defs -pie")
+SET(CMAKE_C_FLAGS_RELEASE "-DNDEBUG")
+SET(CMAKE_C_FLAGS "-std=c11 -Wno-multichar -fno-strict-aliasing -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 ${EXTRA_C_FLAGS}")
 # Hardening. Try as much as is possible.
-TRY_ADD_FLAG("-mmitigate-rop")
-TRY_ADD_FLAG("-fstack-protector-strong")
-TRY_ADD_FLAG("-fstack-clash-protection")
-TRY_ADD_FLAG("-Wformat")
-TRY_ADD_FLAG("-Wformat-security")
-TRY_ADD_FLAG("-Werror=format-security")
+#TRY_ADD_FLAG("-mmitigate-rop")
+#TRY_ADD_FLAG("-fstack-protector-strong")
+#TRY_ADD_FLAG("-fstack-clash-protection")
+#TRY_ADD_FLAG("-Wformat")
+#TRY_ADD_FLAG("-Wformat-security")
+#TRY_ADD_FLAG("-Werror=format-security")
 if(CMAKE_C_COMPILER MATCHES "clang")
 	message( "Using clang flags." )
 	SET(CMAKE_C_FLAGS_DEBUG " -O1 -fno-omit-frame-pointer -g -Wall -Wextra -Wpedantic -Wno-unused-result -D_DEBUG")
@@ -72,6 +74,12 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
 ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
 ADD_DEFINITIONS(-DWITH_IPV6)
 ADD_DEFINITIONS(-DBUILD_TYPE=${CMAKE_BUILD_TYPE})
+if (DEBUG_LOCKS)
+	ADD_DEFINITIONS(-DDEBUG_LOCKS)
+ENDIF()
+if (DEBUG_THREADS)
+	ADD_DEFINITIONS(-DDEBUG_THREADS)
+ENDIF()
 
 FIND_PACKAGE(Threads)
 
diff --git a/src/server/locks.c b/src/server/locks.c
index b39576b..4d9cde6 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -9,7 +9,7 @@
 #include "helper.h"
 #include "../shared/timing.h"
 
-#ifdef _DEBUG
+#ifdef DEBUG_LOCKS
 #define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
 #define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
 #define MAXLPT 20
diff --git a/src/server/locks.h b/src/server/locks.h
index e5c9801..6111d71 100644
--- a/src/server/locks.h
+++ b/src/server/locks.h
@@ -26,7 +26,7 @@
 
 //
 
-#ifdef _DEBUG
+#ifdef DEBUG_LOCKS
 
 #define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio)
 #define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false)
-- 
cgit v1.2.3-55-g7522


From 94608a6297bd959dd0b2fd03ddaf7484a4bcc5d8 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Mon, 23 Mar 2020 11:33:13 +0100
Subject: [BENCH] fix wrong operator precedence

---
 src/bench/connection.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/bench/connection.c b/src/bench/connection.c
index 26be440..65f1757 100644
--- a/src/bench/connection.c
+++ b/src/bench/connection.c
@@ -97,7 +97,7 @@ bool connection_init_n_times(
 				counters->fails++;
 				logadd( LOG_ERROR, "rid mismatch" );
 			//} else if ( !dnbd3_get_block( sock, run_i * blockSize, blockSize, 0, 0 ) ) {
-			} else if ( !dnbd3_get_block( sock, (((uint64_t)rand()) << 16 + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
+			} else if ( !dnbd3_get_block( sock, (((uint64_t)rand() << 16) + rand()) % (remoteSize - blockSize), blockSize, 0, 0 ) ) {
 				counters->fails++;
 				logadd( LOG_ERROR, "send: get block failed" );
 			} else if ( !dnbd3_get_reply( sock, &reply ) ) {
-- 
cgit v1.2.3-55-g7522


From bd85f127c62a490e94752f82e6d9d9778a03f1f1 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 31 Mar 2020 17:38:24 +0200
Subject: [SERVER] Check local and remote for updates on rid == 0

---
 src/server/image.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 2273dc2..7eeca76 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1176,14 +1176,18 @@ dnbd3_image_t* image_getOrLoad(char * const name, const uint16_t revision)
 	// Sanity check
 	if ( len == 0 || name[len - 1] == '/' || name[0] == '/'
 			|| name[0] == '.' || strstr( name, "/." ) != NULL ) return NULL;
-	// If in proxy mode, check with upstream server first
+	// Re-check latest local revision
+	image = loadImageServer( name, revision );
+	// If in proxy mode, check with upstream servers
 	if ( _isProxy ) {
+		// Forget the locally loaded one
+		image_release( image );
+		// Check with upstream - if unsuccessful, will return the same
+		// as loadImageServer did
 		image = loadImageProxy( name, revision, len );
-		if ( image != NULL )
-			return image;
 	}
 	// Lookup on local storage
-	return loadImageServer( name, revision );
+	return image;
 }
 
 /**
-- 
cgit v1.2.3-55-g7522


From 171a6313c40744ca947553554248bf2cab0bb0f5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 10 Jun 2020 15:08:19 +0200
Subject: [SERVER] Fix check for RTT block reply size

---
 src/server/altservers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 5076a05..d4f41c0 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -590,7 +590,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server );
 		}
 		// check reply header
-		if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+		if ( reply.cmd != CMD_GET_BLOCK || reply.size != length ) {
 			// Sanity check failed; count this as global error (malicious/broken server)
 			ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
 		}
-- 
cgit v1.2.3-55-g7522


From abe55c2bf2b93e9431ee2c22afd7f3f6611d71d8 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 10 Jun 2020 15:28:41 +0200
Subject: [SERVER] fix uninitialized variable

---
 src/server/threadpool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 63ae19f..4ebefcb 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -69,7 +69,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg, const char *name)
 		return false; // Or bail out!?
 	}
 #endif
-	entry_t *entry;
+	entry_t *entry = NULL;
 	for ( int i = 0; i < maxIdleThreads; ++i ) {
 		entry = atomic_exchange( &pool[i], NULL );
 		if ( entry != NULL ) {
-- 
cgit v1.2.3-55-g7522


From c61f65ebd977d0fa4f1f486458655242f3aeb3e5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 10 Jun 2020 16:01:36 +0200
Subject: [SERVER] Fix list walk when removing client from uplink

---
 src/server/uplink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/uplink.c b/src/server/uplink.c
index f5ac6ac..bf6f32e 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -235,12 +235,14 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
 		return;
 	mutex_lock( &uplink->queueLock );
 	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
-		for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; cit = &(**cit).next ) {
+		for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; ) {
 			if ( (**cit).client == client ) {
 				--client->relayedCount;
 				dnbd3_queue_client_t *entry = *cit;
 				*cit = (**cit).next;
 				free( entry );
+			} else {
+				cit = &(**cit).next;
 			}
 		}
 	}
-- 
cgit v1.2.3-55-g7522


From 8e017e62a088d101b25d179ca3ff7592978bf362 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 10 Jun 2020 16:08:00 +0200
Subject: [SERVER] Likewise, flush entire payload on RTT measurement

---
 src/server/altservers.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index d4f41c0..fbbac81 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -596,7 +596,12 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 		}
 		// flush payload to include this into measurement
 		char buffer[DNBD3_BLOCK_SIZE];
-		if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+		uint32_t todo = length;
+		ssize_t ret;
+		while ( ( ret = recv( sock, buffer, MIN( DNBD3_BLOCK_SIZE, todo ), MSG_WAITALL ) ) > 0 ) {
+			todo -= (uint32_t)ret;
+		}
+		if ( todo != 0 ) {
 			ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server );
 		}
 		clock_gettime( BEST_CLOCK_SOURCE, &end );
-- 
cgit v1.2.3-55-g7522


From cbd74e10a3e7fe4ae27e32d91e2b1cd1b95e3729 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 10 Jun 2020 16:14:50 +0200
Subject: [SERVER] Know when to stop

---
 src/server/altservers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index fbbac81..838a475 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -598,7 +598,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 		char buffer[DNBD3_BLOCK_SIZE];
 		uint32_t todo = length;
 		ssize_t ret;
-		while ( ( ret = recv( sock, buffer, MIN( DNBD3_BLOCK_SIZE, todo ), MSG_WAITALL ) ) > 0 ) {
+		while ( todo != 0 && ( ret = recv( sock, buffer, MIN( DNBD3_BLOCK_SIZE, todo ), MSG_WAITALL ) ) > 0 ) {
 			todo -= (uint32_t)ret;
 		}
 		if ( todo != 0 ) {
-- 
cgit v1.2.3-55-g7522


From dcece877215a0d909553ae9301a02d031b37b715 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 30 Jun 2020 11:21:31 +0200
Subject: [SHARED] Fix 16 byte information leakage in select image message

---
 src/shared/protocol.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'src')

diff --git a/src/shared/protocol.h b/src/shared/protocol.h
index 2b21c21..05fd2bf 100644
--- a/src/shared/protocol.h
+++ b/src/shared/protocol.h
@@ -69,10 +69,8 @@ static inline bool dnbd3_select_image(int sock, const char *name, uint16_t rid,
 	request.magic = dnbd3_packet_magic;
 	request.cmd = CMD_SELECT_IMAGE;
 	request.size = (uint32_t)len;
-#ifdef _DEBUG
 	request.handle = 0;
 	request.offset = 0;
-#endif
 	fixup_request( request );
 	iov[0].iov_base = &request;
 	iov[0].iov_len = sizeof(request);
-- 
cgit v1.2.3-55-g7522


From 2db0b8475b224125e3800771df43a5dd68c957ee Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 30 Jun 2020 12:10:39 +0200
Subject: [SERVER] Fix: No replication if autoFreeDiskSpaceDelay is disabled

---
 src/server/image.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 7eeca76..4893b7e 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1291,7 +1291,11 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
 		} else {
 			ok = image_ensureDiskSpace( remoteImageSize + ( 10 * 1024 * 1024 ), false ); // some extra space for cache map etc.
 		}
-		ok = ok && image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+		if ( ok ) {
+			ok = image_clone( sock, name, remoteRid, remoteImageSize ); // This sets up the file+map+crc and loads the img
+		} else {
+			logadd( LOG_INFO, "Not enough space to replicate '%s:%d'", name, (int)revision );
+		}
 		mutex_unlock( &reloadLock );
 		if ( !ok ) goto server_fail;
 
@@ -1744,16 +1748,20 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force)
  */
 static bool image_ensureDiskSpace(uint64_t size, bool force)
 {
-	if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 )
-		return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
 	for ( int maxtries = 0; maxtries < 50; ++maxtries ) {
 		uint64_t available;
 		if ( !file_freeDiskSpace( _basePath, NULL, &available ) ) {
-			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left... ;-)\n", errno );
+			logadd( LOG_WARNING, "Could not get free disk space (errno %d), will assume there is enough space left.", errno );
 			return true;
 		}
 		if ( available > size )
 			return true; // Yay
+		if ( !_isProxy || _autoFreeDiskSpaceDelay == -1 ) {
+			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but auto-freeing of disk space is disabled.",
+					(int)(available / (1024ll * 1024)),
+					(int)(size / (1024ll * 1024)) );
+			return false; // If not in proxy mode at all, or explicitly disabled, never delete anything
+		}
 		if ( !force && dnbd3_serverUptime() < (uint32_t)_autoFreeDiskSpaceDelay ) {
 			logadd( LOG_INFO, "Only %dMiB free, %dMiB requested, but server uptime < %d minutes...",
 					(int)(available / (1024ll * 1024)),
-- 
cgit v1.2.3-55-g7522


From 632528507c4cc1e7ba414bfaccfe6125bb1b0625 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 21 Jul 2020 17:34:14 +0200
Subject: [SERVER] Fix: NULL pointer access in saveLoadAllCacheMaps()

Entries in _images array might be NULL
---
 src/server/image.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/server/image.c b/src/server/image.c
index 4893b7e..efece62 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1874,6 +1874,8 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
+		if ( image == NULL )
+			continue;
 		image->users++;
 		mutex_unlock( &imageListLock );
 		const bool fromUpstream = isImageFromUpstream( image );
-- 
cgit v1.2.3-55-g7522