From 0363303d1b67b47605971b313bc33a049e6a3209 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 14:28:53 +0200
Subject: [SERVER] Fix race condition and invalid lock order

---
 src/server/net.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/net.c b/src/server/net.c
index 9abe221..c1fa6fa 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -609,6 +609,12 @@ void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
 		}
 		bs += client->bytesSent;
 	}
+	// Do this before unlocking the list, otherwise we might
+	// account for a client twice if it would disconnect after
+	// unlocking but before we add the count here.
+	if ( bytesSent != NULL ) {
+		*bytesSent = totalBytesSent + bs;
+	}
 	mutex_unlock( &_clients_lock );
 	if ( clientCount != NULL ) {
 		*clientCount = cc;
@@ -616,9 +622,6 @@ void net_getStats(int *clientCount, int *serverCount, uint64_t *bytesSent)
 	if ( serverCount != NULL ) {
 		*serverCount = sc;
 	}
-	if ( bytesSent != NULL ) {
-		*bytesSent = totalBytesSent + bs;
-	}
 }
 
 void net_disconnectAll()
@@ -694,9 +697,9 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
 		mutex_lock( &client->image->lock );
 		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
 		mutex_unlock( &client->image->lock );
-		client->image = image_release( client->image );
 	}
 	mutex_unlock( &client->lock );
+	client->image = image_release( client->image );
 	mutex_destroy( &client->lock );
 	mutex_destroy( &client->sendMutex );
 	free( client );
-- 
cgit v1.2.3-55-g7522


From 4e2e258dba3c9268e8d4fd061cbb9f291017ed2f Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 14:39:44 +0200
Subject: [SERVER] Use more _Atomic

---
 src/server/globals.h | 6 +++---
 src/server/net.c     | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/globals.h b/src/server/globals.h
index cd5ad7e..86b8865 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -60,7 +60,7 @@ struct _dnbd3_connection
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
 	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
-	int queueLen;               // length of queue
+	atomic_int queueLen;        // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
 };
@@ -107,7 +107,7 @@ struct _dnbd3_image
 	int completenessEstimate; // Completeness estimate in percent
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
-	bool working;          // true if image exists and completeness is == 100% or a working upstream proxy is connected
+	atomic_bool working;   // true if image exists and completeness is == 100% or a working upstream proxy is connected
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
 };
@@ -116,7 +116,7 @@ struct _dnbd3_client
 {
 #define HOSTNAMELEN (48)
 	atomic_uint_fast64_t bytesSent;   // Byte counter for this client.
-	dnbd3_image_t *image;             // Image in use by this client, or NULL during handshake
+	dnbd3_image_t * _Atomic image;    // Image in use by this client, or NULL during handshake
 	int sock;
 	bool isServer;                    // true if a server in proxy mode, false if real client
 	dnbd3_host_t host;
diff --git a/src/server/net.c b/src/server/net.c
index c1fa6fa..92728c0 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -255,9 +255,8 @@ void* net_handleNewConnection(void *clientPtr)
 				// No BGR mismatch, but don't lookup if image is unknown locally
 				image = image_get( image_name, rid, true );
 			}
-			mutex_lock( &client->lock );
 			client->image = image;
-			mutex_unlock( &client->lock );
+			atomic_thread_fence( memory_order_release );
 			if ( image == NULL ) {
 				//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
 			} else if ( !image->working ) {
-- 
cgit v1.2.3-55-g7522


From be7d7d95850c30a154aaa56e95d6a7f36793409d Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 7 Aug 2019 17:11:51 +0200
Subject: [SERVER] Better lock debugging: Always check lock order

Lock order is predefined in locks.h. Immediately bail out if a lock with
lower priority is obtained while the same thread already holds one with
higher priority.
---
 LOCKS                   |  13 +-
 src/server/altservers.c |   9 +-
 src/server/globals.c    |   2 +-
 src/server/image.c      |  10 +-
 src/server/integrity.c  |   2 +-
 src/server/locks.c      | 319 ++++++++++++++++++++++--------------------------
 src/server/locks.h      |  36 ++++--
 src/server/net.c        |   6 +-
 src/server/rpc.c        |  14 +--
 src/server/server.c     |   7 --
 src/server/uplink.c     |   6 +-
 11 files changed, 198 insertions(+), 226 deletions(-)

(limited to 'src/server/net.c')

diff --git a/LOCKS b/LOCKS
index 4b5b07c..77e44a8 100644
--- a/LOCKS
+++ b/LOCKS
@@ -16,23 +16,22 @@ requests.lock
 
 ===== SERVER =====
 This is a list of used locks, in the order they
-have to be aquired if you must hold multiple locks:
-remoteCloneLock | reloadLock
+have to be acquired if you must hold multiple locks.
+Note that this list might be out of date; take a look at the
+defines in src/server/locks.h for the effective order.
+reloadLock
+remoteCloneLock
 _clients_lock
 _clients[].lock
 integrityQueueLock
 _images_lock
 _images[].lock
-pendingLockConsume
-pendingLockProduce
 uplink.queueLock
 altServersLock
 client.sendMutex
-client.statsLock
-statisticsSentLock
-statisticsReceivedLock
 uplink.rttLock
 uplink.sendMutex
+aclLock
 
 If you need to lock multiple clients/images/... at once,
 lock the client with the lowest array index first.
diff --git a/src/server/altservers.c b/src/server/altservers.c
index a270bf3..3d5e71e 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -30,7 +30,7 @@ void altservers_init()
 {
 	srand( (unsigned int)time( NULL ) );
 	// Init spinlock
-	mutex_init( &altServersLock );
+	mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST );
 	// Init signal
 	runSignal = signal_new();
 	if ( runSignal == NULL ) {
@@ -326,13 +326,13 @@ json_t* altservers_toJson()
 }
 
 /**
- * Update rtt history of given server - returns the new average for that server
+ * Update rtt history of given server - returns the new average for that server.
+ * XXX HOLD altServersLock WHEN CALLING THIS!
  */
 static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt)
 {
 	unsigned int avg = rtt;
 	int i;
-	mutex_lock( &altServersLock );
 	for (i = 0; i < numAltServers; ++i) {
 		if ( !isSameAddressPort( host, &altServers[i].host ) ) continue;
 		altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt;
@@ -353,7 +353,6 @@ static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const
 		}
 		break;
 	}
-	mutex_unlock( &altServersLock );
 	return avg;
 }
 
@@ -529,6 +528,7 @@ static void *altservers_main(void *data UNUSED)
 				}
 				clock_gettime( BEST_CLOCK_SOURCE, &end );
 				// Measurement done - everything fine so far
+				mutex_lock( &altServersLock );
 				mutex_lock( &uplink->rttLock );
 				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->currentServer );
 				// Penaltize rtt if this was a cycle; this will treat this server with lower priority
@@ -538,6 +538,7 @@ static void *altservers_main(void *data UNUSED)
 						+ (end.tv_nsec - start.tv_nsec) / 1000
 						+ ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
 				unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
+				mutex_unlock( &altServersLock );
 				// If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
 				if ( ( uplink->cycleDetected || uplink->fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
 				mutex_unlock( &uplink->rttLock );
diff --git a/src/server/globals.c b/src/server/globals.c
index 69e8a6e..46c1030 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -112,7 +112,7 @@ void globals_loadConfig()
 	asprintf( &name, "%s/%s", _configDir, CONFIG_FILENAME );
 	if ( name == NULL ) return;
 	if ( initialLoad ) {
-		mutex_init( &loadLock );
+		mutex_init( &loadLock, LOCK_LOAD_CONFIG );
 	}
 	if ( mutex_trylock( &loadLock ) != 0 ) {
 		logadd( LOG_INFO, "Ignoring config reload request due to already running reload" );
diff --git a/src/server/image.c b/src/server/image.c
index 1f12eda..4a65ed3 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -59,9 +59,9 @@ static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t
 void image_serverStartup()
 {
 	srand( (unsigned int)time( NULL ) );
-	mutex_init( &imageListLock );
-	mutex_init( &remoteCloneLock );
-	mutex_init( &reloadLock );
+	mutex_init( &imageListLock, LOCK_IMAGE_LIST );
+	mutex_init( &remoteCloneLock, LOCK_REMOTE_CLONE );
+	mutex_init( &reloadLock, LOCK_RELOAD );
 }
 
 /**
@@ -347,7 +347,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		img->rid = candidate->rid;
 		img->users = 1;
 		img->working = false;
-		mutex_init( &img->lock );
+		mutex_init( &img->lock, LOCK_IMAGE );
 		if ( candidate->crc32 != NULL ) {
 			const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t);
 			img->crc32 = malloc( mb );
@@ -869,7 +869,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->working = (image->cache_map == NULL );
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
-	mutex_init( &image->lock );
+	mutex_init( &image->lock, LOCK_IMAGE );
 	int32_t offset;
 	if ( stat( path, &st ) == 0 ) {
 		// Negatively offset atime by file modification time
diff --git a/src/server/integrity.c b/src/server/integrity.c
index a66a364..c52d17b 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -39,7 +39,7 @@ static void* integrity_main(void *data);
 void integrity_init()
 {
 	assert( queueLen == -1 );
-	mutex_init( &integrityQueueLock );
+	mutex_init( &integrityQueueLock, LOCK_INTEGRITY_QUEUE );
 	pthread_cond_init( &queueSignal, NULL );
 	mutex_lock( &integrityQueueLock );
 	queueLen = 0;
diff --git a/src/server/locks.c b/src/server/locks.c
index 2c0cb27..b39576b 100644
--- a/src/server/locks.c
+++ b/src/server/locks.c
@@ -12,47 +12,45 @@
 #ifdef _DEBUG
 #define MAXLOCKS (SERVER_MAX_CLIENTS * 2 + SERVER_MAX_ALTS + 200 + SERVER_MAX_IMAGES)
 #define MAXTHREADS (SERVER_MAX_CLIENTS + 100)
+#define MAXLPT 20
 #define LOCKLEN 60
 typedef struct
 {
-	void *lock;
+	void * _Atomic lock;
 	ticks locktime;
-	char locked;
-	pthread_t thread;
+	bool _Atomic locked;
+	pthread_t _Atomic thread;
 	int lockId;
+	int prio;
 	char name[LOCKLEN];
 	char where[LOCKLEN];
 } debug_lock_t;
 
 typedef struct
 {
-	pthread_t tid;
+	pthread_t _Atomic tid;
 	ticks time;
 	char name[LOCKLEN];
 	char where[LOCKLEN];
-
+	debug_lock_t *locks[MAXLPT];
 } debug_thread_t;
 
 int debugThreadCount = 0;
 
 static debug_lock_t locks[MAXLOCKS];
 static debug_thread_t threads[MAXTHREADS];
-static int init_done = 0;
-static pthread_mutex_t initdestory;
+static pthread_mutex_t initdestory = PTHREAD_MUTEX_INITIALIZER;
 static int lockId = 0;
-static pthread_t watchdog = 0;
-static dnbd3_signal_t* watchdogSignal = NULL;
 
-static void *debug_thread_watchdog(void *something);
+#define ULDE(...) do { \
+			pthread_mutex_unlock( &initdestory ); \
+			logadd( LOG_ERROR, __VA_ARGS__ ); \
+			debug_dump_lock_stats(); \
+			exit( 4 ); \
+} while(0)
 
-int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock)
+int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority)
 {
-	if ( !init_done ) {
-		memset( locks, 0, MAXLOCKS * sizeof(debug_lock_t) );
-		memset( threads, 0, MAXTHREADS * sizeof(debug_thread_t) );
-		pthread_mutex_init( &initdestory, NULL );
-		init_done = 1;
-	}
 	int first = -1;
 	pthread_mutex_lock( &initdestory );
 	for (int i = 0; i < MAXLOCKS; ++i) {
@@ -63,20 +61,18 @@ int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex
 		if ( first == -1 && locks[i].lock == NULL ) first = i;
 	}
 	if ( first == -1 ) {
-		logadd( LOG_ERROR, "No more free debug locks (%s:%d)\n", file, line );
-		pthread_mutex_unlock( &initdestory );
-		debug_dump_lock_stats();
-		exit( 4 );
+		ULDE( "No more free debug locks (%s:%d)\n", file, line );
 	}
 	locks[first].lock = (void*)lock;
-	locks[first].locked = 0;
+	locks[first].locked = false;
+	locks[first].prio = priority;
 	snprintf( locks[first].name, LOCKLEN, "%s", name );
 	snprintf( locks[first].where, LOCKLEN, "I %s:%d", file, line );
 	pthread_mutex_unlock( &initdestory );
 	return pthread_mutex_init( lock, NULL );
 }
 
-int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock)
+int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try)
 {
 	debug_lock_t *l = NULL;
 	pthread_mutex_lock( &initdestory );
@@ -86,163 +82,180 @@ int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex
 			break;
 		}
 	}
-	pthread_mutex_unlock( &initdestory );
 	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		debug_dump_lock_stats();
-		exit( 4 );
+		ULDE( "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
 	}
 	debug_thread_t *t = NULL;
-	pthread_mutex_lock( &initdestory );
+	int first = -1;
+	const pthread_t self = pthread_self();
 	for (int i = 0; i < MAXTHREADS; ++i) {
-		if ( threads[i].tid != 0 ) continue;
-		threads[i].tid = pthread_self();
-		timing_get( &threads[i].time );
-		snprintf( threads[i].name, LOCKLEN, "%s", name );
-		snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
-		t = &threads[i];
-		break;
-	}
-	pthread_mutex_unlock( &initdestory );
-	if ( t == NULL ) {
-		logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
-	}
-	const int retval = pthread_mutex_lock( lock );
-	pthread_mutex_lock( &initdestory );
-	t->tid = 0;
-	pthread_mutex_unlock( &initdestory );
-	if ( l->locked ) {
-		logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
-	}
-	l->locked = 1;
-	timing_get( &l->locktime );
-	l->thread = pthread_self();
-	snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
-	pthread_mutex_lock( &initdestory );
-	l->lockId = ++lockId;
-	pthread_mutex_unlock( &initdestory );
-	return retval;
-}
-
-int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock)
-{
-	debug_lock_t *l = NULL;
-	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXLOCKS; ++i) {
-		if ( locks[i].lock == lock ) {
-			l = &locks[i];
+		if ( threads[i].tid == self ) {
+			t = &threads[i];
 			break;
 		}
+		if ( first == -1 && threads[i].tid == 0 ) {
+			first = i;
+		}
 	}
-	pthread_mutex_unlock( &initdestory );
-	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to lock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		debug_dump_lock_stats();
-		exit( 4 );
-	}
-	debug_thread_t *t = NULL;
-	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXTHREADS; ++i) {
-		if ( threads[i].tid != 0 ) continue;
-		threads[i].tid = pthread_self();
-		timing_get( &threads[i].time );
-		snprintf( threads[i].name, LOCKLEN, "%s", name );
-		snprintf( threads[i].where, LOCKLEN, "%s:%d", file, line );
-		t = &threads[i];
-		break;
-	}
-	pthread_mutex_unlock( &initdestory );
+	int idx;
 	if ( t == NULL ) {
-		logadd( LOG_ERROR, "Lock sanity check: Too many waiting threads for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+		if ( first == -1 ) {
+			ULDE( "Lock sanity check: Too many waiting threads for lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
+		}
+		t = &threads[first];
+		timing_get( &t->time );
+		t->tid = self;
+		snprintf( t->name, LOCKLEN, "%s", name );
+		snprintf( t->where, LOCKLEN, "%s:%d", file, line );
+		memset( t->locks, 0, sizeof(t->locks) );
+		idx = 0;
+	} else {
+		// Thread already has locks, check for order violation
+		idx = -1;
+		for (int i = 0; i < MAXLPT; ++i) {
+			if ( t->locks[i] == NULL ) {
+				if ( idx == -1 ) {
+					idx = i;
+				}
+				continue;
+			}
+			if ( t->locks[i]->prio >= l->prio ) {
+				ULDE( "Lock priority violation: %s at %s:%d (%d) when already holding %s at %s (%d)",
+						name, file, line, l->prio,
+						t->locks[i]->name, t->locks[i]->where, t->locks[i]->prio );
+			}
+			if ( t->locks[i] == l ) {
+				ULDE( "Tried to recusively lock %s in the same thread. Tried at %s:%d, when already locked at %s",
+						name, file, line, t->locks[i]->name );
+			}
+		}
+		if ( idx == -1 ) {
+			ULDE( "Thread %d tried to lock more than %d locks.", (int)self, (int)MAXLPT );
+		}
 	}
-	const int retval = pthread_mutex_trylock( lock );
-	pthread_mutex_lock( &initdestory );
-	t->tid = 0;
 	pthread_mutex_unlock( &initdestory );
+	const int retval = try ? pthread_mutex_trylock( lock ) : pthread_mutex_lock( lock );
 	if ( retval == 0 ) {
+		timing_get( &l->locktime );
+		l->thread = self;
+		snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
+		pthread_mutex_lock( &initdestory );
 		if ( l->locked ) {
 			logadd( LOG_ERROR, "Lock sanity check: lock %p (%s) already locked at %s:%d\n", (void*)lock, name, file, line );
 			exit( 4 );
 		}
-		l->locked = 1;
-		timing_get( &l->locktime );
-		l->thread = pthread_self();
-		snprintf( l->where, LOCKLEN, "L %s:%d", file, line );
-		pthread_mutex_lock( &initdestory );
+		l->locked = true;
+		t->locks[idx] = l;
 		l->lockId = ++lockId;
 		pthread_mutex_unlock( &initdestory );
+	} else if ( !try || retval != EBUSY ) {
+		logadd( LOG_ERROR, "Acquiring lock %s at %s:%d failed with error code %d", name, file, line, retval );
+		debug_dump_lock_stats();
+		exit( 4 );
 	}
 	return retval;
 }
 
 int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock)
 {
-	debug_lock_t *l = NULL;
+	debug_thread_t *t = NULL;
+	pthread_t self = pthread_self();
 	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXLOCKS; ++i) {
-		if ( locks[i].lock == lock ) {
-			l = &locks[i];
+	for (int i = 0; i < MAXTHREADS; ++i) {
+		if ( threads[i].tid == self ) {
+			t = &threads[i];
 			break;
 		}
 	}
-	pthread_mutex_unlock( &initdestory );
-	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to unlock uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	if ( t == NULL ) {
+		ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line );
 	}
-	if ( !l->locked ) {
-		logadd( LOG_ERROR, "Unlock sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	int idx = -1;
+	int cnt = 0;
+	for (int i = 0; i < MAXLPT; ++i) {
+		if ( t->locks[i] == NULL )
+			continue;
+		cnt++;
+		if ( t->locks[i]->lock == lock ) {
+			idx = i;
+		}
+	}
+	if ( idx == -1 ) {
+		ULDE( "Unlock: Calling thread doesn't hold lock %s at %s:%d", name, file, line );
 	}
-	l->locked = 0;
+	debug_lock_t *l = t->locks[idx];
+	if ( l->thread != self || !l->locked ) {
+		ULDE( "Unlock sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line );
+	}
+	l->locked = false;
 	l->thread = 0;
+	t->locks[idx] = NULL;
+	if ( cnt == 1 ) {
+		t->tid = 0; // No more locks held, free up slot
+	}
 	snprintf( l->where, LOCKLEN, "U %s:%d", file, line );
-	int retval = pthread_mutex_unlock( lock );
+	pthread_mutex_unlock( &initdestory );
+	const int retval = pthread_mutex_unlock( lock );
+	if ( retval != 0 ) {
+		logadd( LOG_ERROR, "pthread_mutex_unlock returned %d for %s at %s:%d", retval, name, file, line );
+		exit( 4 );
+	}
 	return retval;
 }
 
 int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock)
 {
 	debug_lock_t *l = NULL;
+	debug_thread_t *t = NULL;
+	pthread_t self = pthread_self();
 	pthread_mutex_lock( &initdestory );
-	for (int i = 0; i < MAXLOCKS; ++i) {
-		if ( locks[i].lock == lock ) {
-			l = &locks[i];
+	for (int i = 0; i < MAXTHREADS; ++i) {
+		if ( threads[i].tid == self ) {
+			t = &threads[i];
 			break;
 		}
 	}
-	pthread_mutex_unlock( &initdestory );
+	if ( t == NULL ) {
+		ULDE( "Unlock called from unknown thread for %s at %s:%d", name, file, line );
+	}
+	int mp = 0, mpi = -1;
+	for (int i = 0; i < MAXLPT; ++i) {
+		if ( t->locks[i] == NULL )
+			continue;
+		if ( t->locks[i]->lock == lock ) {
+			l = t->locks[i];
+		} else if ( t->locks[i]->prio > mp ) {
+			mp = t->locks[i]->prio;
+			mpi = i;
+		}
+	}
 	if ( l == NULL ) {
-		logadd( LOG_ERROR, "Tried to cond_wait on uninitialized lock %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+		ULDE( "cond_wait: Calling thread doesn't hold lock %s at %s:%d", name, file, line );
 	}
-	if ( !l->locked ) {
-		logadd( LOG_ERROR, "Cond_wait sanity check: lock %p (%s) not locked at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	if ( l->thread != self || !l->locked ) {
+		ULDE( "cond_wait: Sanity check for lock debugger failed! Lock %s is assigned to calling thread, but lock's meta data doesn't match up at %s:%d", name, file, line );
 	}
-	pthread_t self = pthread_self();
-	if ( l->thread != self ) {
-		logadd( LOG_ERROR, "Cond_wait called from non-owning thread for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
-		exit( 4 );
+	if ( mp >= l->prio ) {
+		ULDE( "cond_wait: Yielding a mutex while holding another one with higher prio: %s at %s:%d (%d) while also holding %s at %s (%d)",
+				name, file, line, l->prio,
+				t->locks[mpi]->name, t->locks[mpi]->where, mp );
 	}
-	l->locked = 0;
+	l->locked = false;
 	l->thread = 0;
-	snprintf( l->where, LOCKLEN, "CW %s:%d", file, line );
+	snprintf( l->where, LOCKLEN, "CWU %s:%d", file, line );
+	pthread_mutex_unlock( &initdestory );
 	int retval = pthread_cond_wait( cond, lock );
 	if ( retval != 0 ) {
 		logadd( LOG_ERROR, "pthread_cond_wait returned %d for lock %p (%s) at %s:%d\n", retval, (void*)lock, name, file, line );
 		exit( 4 );
 	}
-	if ( l->locked != 0 || l->thread != 0 ) {
+	if ( l->locked || l->thread != 0 ) {
 		logadd( LOG_ERROR, "Lock is not free after returning from pthread_cond_wait for %p (%s) at %s:%d\n", (void*)lock, name, file, line );
 		exit( 4 );
 	}
-	l->locked = 1;
 	l->thread = self;
 	timing_get( &l->locktime );
+	l->locked = true;
 	pthread_mutex_lock( &initdestory );
 	l->lockId = ++lockId;
 	pthread_mutex_unlock( &initdestory );
@@ -290,63 +303,21 @@ void debug_dump_lock_stats()
 					"* Locked: %d\n", locks[i].name, locks[i].where, (int)locks[i].locked );
 		}
 	}
-	printf( "\n **** WAITING THREADS ****\n\n" );
+	printf( "\n **** ACTIVE THREADS ****\n\n" );
 	for (int i = 0; i < MAXTHREADS; ++i) {
-		if ( threads[i].tid == 0 ) continue;
+		if ( threads[i].tid == 0 )
+			continue;
 		printf( "* *** Thread %d ***\n"
 				"* Lock: %s\n"
 				"* Where: %s\n"
 				"* How long: %d secs\n", (int)threads[i].tid, threads[i].name, threads[i].where, (int)timing_diff( &threads[i].time, &now ) );
-	}
-	pthread_mutex_unlock( &initdestory );
-}
-
-static void *debug_thread_watchdog(void *something UNUSED)
-{
-	setThreadName( "debug-watchdog" );
-	while ( !_shutdown ) {
-		if ( init_done ) {
-			declare_now;
-			pthread_mutex_lock( &initdestory );
-			for (int i = 0; i < MAXTHREADS; ++i) {
-				if ( threads[i].tid == 0 ) continue;
-				const uint32_t diff = timing_diff( &threads[i].time, &now );
-				if ( diff > 6 && diff < 100000 ) {
-					printf( "\n\n +++++++++ DEADLOCK ++++++++++++\n\n" );
-					pthread_mutex_unlock( &initdestory );
-					debug_dump_lock_stats();
-					exit( 99 );
-				}
-			}
-			pthread_mutex_unlock( &initdestory );
+		for (int j = 0; j < MAXLPT; ++j) {
+			if ( threads[i].locks[j] == NULL )
+				continue;
+			printf( "  * Lock %s @ %s\n", threads[i].locks[j]->name, threads[i].locks[j]->where );
 		}
-		if ( watchdogSignal == NULL || signal_wait( watchdogSignal, 5000 ) == SIGNAL_ERROR ) sleep( 5 );
 	}
-	return NULL ;
-}
-
-#endif
-
-void debug_locks_start_watchdog()
-{
-#ifdef _DEBUG
-	watchdogSignal = signal_new();
-	if ( 0 != thread_create( &watchdog, NULL, &debug_thread_watchdog, (void *)NULL ) ) {
-		logadd( LOG_ERROR, "Could not start debug-lock watchdog." );
-		return;
-	}
-#endif
+	pthread_mutex_unlock( &initdestory );
 }
 
-void debug_locks_stop_watchdog()
-{
-#ifdef _DEBUG
-	_shutdown = true;
-	printf( "Killing debug watchdog...\n" );
-	pthread_mutex_lock( &initdestory );
-	signal_call( watchdogSignal );
-	pthread_mutex_unlock( &initdestory );
-	thread_join( watchdog, NULL );
-	signal_close( watchdogSignal );
 #endif
-}
diff --git a/src/server/locks.h b/src/server/locks.h
index 7f72722..e5c9801 100644
--- a/src/server/locks.h
+++ b/src/server/locks.h
@@ -5,19 +5,38 @@
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
+
+// Lock priority
+
+#define LOCK_RELOAD 90
+#define LOCK_LOAD_CONFIG 100
+#define LOCK_REMOTE_CLONE 110
+#define LOCK_CLIENT_LIST 120
+#define LOCK_CLIENT 130
+#define LOCK_INTEGRITY_QUEUE 140
+#define LOCK_IMAGE_LIST 150
+#define LOCK_IMAGE 160
+#define LOCK_UPLINK_QUEUE 170
+#define LOCK_ALT_SERVER_LIST 180
+#define LOCK_CLIENT_SEND 190
+#define LOCK_UPLINK_RTT 200
+#define LOCK_UPLINK_SEND 210
+#define LOCK_RPC_ACL 220
+
+//
 
 #ifdef _DEBUG
 
-#define mutex_init( lock ) debug_mutex_init( #lock, __FILE__, __LINE__, lock)
-#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock)
-#define mutex_trylock( lock ) debug_mutex_trylock( #lock, __FILE__, __LINE__, lock)
+#define mutex_init( lock, prio ) debug_mutex_init( #lock, __FILE__, __LINE__, lock, prio)
+#define mutex_lock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, false)
+#define mutex_trylock( lock ) debug_mutex_lock( #lock, __FILE__, __LINE__, lock, true)
 #define mutex_unlock( lock ) debug_mutex_unlock( #lock, __FILE__, __LINE__, lock)
 #define mutex_cond_wait( cond, lock ) debug_mutex_cond_wait( #lock, __FILE__, __LINE__, cond, lock)
 #define mutex_destroy( lock ) debug_mutex_destroy( #lock, __FILE__, __LINE__, lock)
 
-int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock);
-int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock);
-int debug_mutex_trylock(const char *name, const char *file, int line, pthread_mutex_t *lock);
+int debug_mutex_init(const char *name, const char *file, int line, pthread_mutex_t *lock, int priority);
+int debug_mutex_lock(const char *name, const char *file, int line, pthread_mutex_t *lock, bool try);
 int debug_mutex_unlock(const char *name, const char *file, int line, pthread_mutex_t *lock);
 int debug_mutex_cond_wait(const char *name, const char *file, int line, pthread_cond_t *restrict cond, pthread_mutex_t *restrict lock);
 int debug_mutex_destroy(const char *name, const char *file, int line, pthread_mutex_t *lock);
@@ -27,7 +46,7 @@ void debug_dump_lock_stats();
 
 #else
 
-#define mutex_init( lock ) pthread_mutex_init(lock, NULL)
+#define mutex_init( lock, prio ) pthread_mutex_init(lock, NULL)
 #define mutex_lock( lock ) pthread_mutex_lock(lock)
 #define mutex_trylock( lock ) pthread_mutex_trylock(lock)
 #define mutex_unlock( lock ) pthread_mutex_unlock(lock)
@@ -82,7 +101,4 @@ static inline int debug_thread_join(pthread_t thread, void **value_ptr)
 
 #endif
 
-void debug_locks_start_watchdog();
-void debug_locks_stop_watchdog();
-
 #endif /* LOCKS_H_ */
diff --git a/src/server/net.c b/src/server/net.c
index 92728c0..8f97a12 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -145,7 +145,7 @@ static inline bool sendPadding( const int fd, uint32_t bytes )
 
 void net_init()
 {
-	mutex_init( &_clients_lock );
+	mutex_init( &_clients_lock, LOCK_CLIENT_LIST );
 }
 
 void* net_handleNewConnection(void *clientPtr)
@@ -186,8 +186,8 @@ void* net_handleNewConnection(void *clientPtr)
 		}
 	} while (0);
 	// Fully init client struct
-	mutex_init( &client->lock );
-	mutex_init( &client->sendMutex );
+	mutex_init( &client->lock, LOCK_CLIENT );
+	mutex_init( &client->sendMutex, LOCK_CLIENT_SEND );
 
 	mutex_lock( &client->lock );
 	host_to_string( &client->host, client->hostName, HOSTNAMELEN );
diff --git a/src/server/rpc.c b/src/server/rpc.c
index 5dbcafe..261c6c0 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -75,10 +75,9 @@ static json_int_t randomRunId;
 static pthread_mutex_t aclLock;
 #define MAX_CLIENTS 50
 #define CUTOFF_START 40
-static pthread_mutex_t statusLock;
 static struct {
-	int count;
-	bool overloaded;
+	atomic_int count;
+	atomic_bool overloaded;
 } status;
 
 static bool handleStatus(int sock, int permissions, struct field *fields, size_t fields_num, int keepAlive);
@@ -91,8 +90,7 @@ static void loadAcl();
 
 void rpc_init()
 {
-	mutex_init( &aclLock );
-	mutex_init( &statusLock );
+	mutex_init( &aclLock, LOCK_RPC_ACL );
 	randomRunId = (((json_int_t)getpid()) << 16) | (json_int_t)time(NULL);
 	// </guard>
 	if ( sizeof(randomRunId) > 4 ) {
@@ -123,10 +121,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 		return;
 	}
 	do {
-		mutex_lock( &statusLock );
 		const int curCount = ++status.count;
 		UPDATE_LOADSTATE( curCount );
-		mutex_unlock( &statusLock );
 		if ( curCount > MAX_CLIENTS ) {
 			sendReply( sock, "503 Service Temporarily Unavailable", "text/plain", "Too many HTTP clients", -1, HTTP_CLOSE );
 			goto func_return;
@@ -198,9 +194,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) {
 				keepAlive = HTTP_CLOSE;
 			} else { // And if there aren't too many active HTTP sessions
-				mutex_lock( &statusLock );
 				if ( status.overloaded ) keepAlive = HTTP_CLOSE;
-				mutex_unlock( &statusLock );
 			}
 		}
 		if ( method.s != NULL && path.s != NULL ) {
@@ -234,10 +228,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 	} while (true);
 func_return:;
 	do {
-		mutex_lock( &statusLock );
 		const int curCount = --status.count;
 		UPDATE_LOADSTATE( curCount );
-		mutex_unlock( &statusLock );
 	} while (0);
 }
 
diff --git a/src/server/server.c b/src/server/server.c
index 10ab208..838aec2 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -133,9 +133,6 @@ void dnbd3_cleanup()
 	// Wait for clients to disconnect
 	net_waitForAllDisconnected();
 
-	// Watchdog not needed anymore
-	debug_locks_stop_watchdog();
-
 	// Clean up images
 	retries = 5;
 	while ( !image_tryFreeAll() && --retries > 0 ) {
@@ -303,10 +300,6 @@ int main(int argc, char *argv[])
 		logadd( LOG_WARNING, "Could not load alt-servers. Does the file exist in %s?", _configDir );
 	}
 
-#ifdef _DEBUG
-	debug_locks_start_watchdog();
-#endif
-
 	// setup signal handler
 	struct sigaction sa;
 	memset( &sa, 0, sizeof(sa) );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index bb1ffdc..9570273 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -89,9 +89,9 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		goto failure;
 	}
 	link = image->uplink = calloc( 1, sizeof(dnbd3_connection_t) );
-	mutex_init( &link->queueLock );
-	mutex_init( &link->rttLock );
-	mutex_init( &link->sendMutex );
+	mutex_init( &link->queueLock, LOCK_UPLINK_QUEUE );
+	mutex_init( &link->rttLock, LOCK_UPLINK_RTT );
+	mutex_init( &link->sendMutex, LOCK_UPLINK_SEND );
 	link->image = image;
 	link->bytesReceived = 0;
 	link->idleTime = 0;
-- 
cgit v1.2.3-55-g7522


From 573e620bb1811fe81c64b86aeb5728e0437eea9f Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Sun, 18 Aug 2019 18:18:16 +0200
Subject: [SERVER] net.c: Minor reordering

---
 src/server/net.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/net.c b/src/server/net.c
index 8f97a12..5de9f14 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -532,16 +532,15 @@ exit_client_cleanup: ;
 	removeFromList( client );
 	totalBytesSent += client->bytesSent;
 	// Access time, but only if client didn't just probe
-	if ( image != NULL ) {
+	if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
 		mutex_lock( &image->lock );
-		if ( client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
-			timing_get( &image->atime );
-		}
+		timing_get( &image->atime );
 		mutex_unlock( &image->lock );
 	}
 	freeClientStruct( client ); // This will also call image_release on client->image
 	return NULL ;
 fail_preadd: ;
+	// This is before we even initialized any mutex
 	close( client->sock );
 	free( client );
 	return NULL;
@@ -688,15 +687,17 @@ static void removeFromList(dnbd3_client_t *client)
 static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
 {
 	mutex_lock( &client->lock );
-	mutex_lock( &client->sendMutex );
-	if ( client->sock != -1 ) close( client->sock );
-	client->sock = -1;
-	mutex_unlock( &client->sendMutex );
 	if ( client->image != NULL ) {
 		mutex_lock( &client->image->lock );
 		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
 		mutex_unlock( &client->image->lock );
 	}
+	mutex_lock( &client->sendMutex );
+	if ( client->sock != -1 ) {
+		close( client->sock );
+	}
+	client->sock = -1;
+	mutex_unlock( &client->sendMutex );
 	mutex_unlock( &client->lock );
 	client->image = image_release( client->image );
 	mutex_destroy( &client->lock );
-- 
cgit v1.2.3-55-g7522


From 9787bccc217ee7369d20e5a4c243d433ae4b70bd Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 22 Aug 2019 10:30:07 +0200
Subject: [SERVER] Put request handle into CMD_ERROR reply

---
 src/server/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/server/net.c')

diff --git a/src/server/net.c b/src/server/net.c
index 5de9f14..7f3c1ce 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -314,6 +314,7 @@ void* net_handleNewConnection(void *clientPtr)
 
 			case CMD_GET_BLOCK:;
 				const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
+				reply.handle = request.handle;
 				if ( offset >= image->virtualFilesize ) {
 					// Sanity check
 					logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName );
@@ -395,7 +396,6 @@ void* net_handleNewConnection(void *clientPtr)
 
 				reply.cmd = CMD_GET_BLOCK;
 				reply.size = request.size;
-				reply.handle = request.handle;
 
 				fixup_reply( reply );
 				const bool lock = image->uplink != NULL;
-- 
cgit v1.2.3-55-g7522


From 5fb4ef278be86fb6bda487f65ec4855d830bf4e5 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 22 Aug 2019 16:14:27 +0200
Subject: [SERVER] Get rid of alt-servers thread, per-uplink rtt history

Alt-server checks are now run using the threadpool, so we don't need a
queue and a dedicated thread anymore. The RTT history is now kept per
uplink, so having many uplinks no longer floods a single shared history
and thereby shrinks its effective time window.
Also, the fail counter is now split up: a global one for when the server
actually isn't reachable, and a local (per-uplink) one for when the
server is reachable but doesn't serve the requested image.
---
 src/server/altservers.c | 738 ++++++++++++++++++++++--------------------------
 src/server/altservers.h |  16 +-
 src/server/globals.h    |  41 ++-
 src/server/image.c      |   6 +-
 src/server/net.c        |  16 +-
 src/server/server.c     |   8 +-
 src/server/uplink.c     | 117 ++++----
 src/server/uplink.h     |   2 +
 src/serverconfig.h      |  10 +-
 9 files changed, 469 insertions(+), 485 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index fbe10a8..493ed9e 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -1,5 +1,6 @@
 #include "altservers.h"
 #include "locks.h"
+#include "threadpool.h"
 #include "helper.h"
 #include "image.h"
 #include "fileutil.h"
@@ -14,46 +15,22 @@
 #define LOG_GOTO(jumplabel, lvl, ...) do { LOG(lvl, __VA_ARGS__); goto jumplabel; } while (0);
 #define ERROR_GOTO(jumplabel, ...) LOG_GOTO(jumplabel, LOG_ERROR, __VA_ARGS__)
 
-static dnbd3_uplink_t * _Atomic pending[SERVER_MAX_PENDING_ALT_CHECKS];
-static dnbd3_signal_t * _Atomic runSignal = NULL;
-
 static dnbd3_alt_server_t altServers[SERVER_MAX_ALTS];
 static atomic_int numAltServers = 0;
 static pthread_mutex_t altServersLock;
+static ticks nextCloseUnusedFd; // TODO: Move away
 
-static pthread_t altThread;
-
-static void *altservers_main(void *data);
-static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt);
+static void *altservers_runCheck(void *data);
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current);
+static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink);
+static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt);
+static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server);
 
 void altservers_init()
 {
 	srand( (unsigned int)time( NULL ) );
-	// Init spinlock
+	// Init lock
 	mutex_init( &altServersLock, LOCK_ALT_SERVER_LIST );
-	// Init signal
-	runSignal = signal_new();
-	if ( runSignal == NULL ) {
-		logadd( LOG_ERROR, "Error creating signal object. Uplink feature unavailable." );
-		exit( EXIT_FAILURE );
-	}
-	memset( altServers, 0, SERVER_MAX_ALTS * sizeof(dnbd3_alt_server_t) );
-	if ( 0 != thread_create( &altThread, NULL, &altservers_main, (void *)NULL ) ) {
-		logadd( LOG_ERROR, "Could not start altservers connector thread" );
-		exit( EXIT_FAILURE );
-	}
-	// Init waiting links queue -- this is currently a global static array so
-	// it will already be zero, but in case we refactor later do it explicitly
-	for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-		pending[i] = NULL;
-	}
-}
-
-void altservers_shutdown()
-{
-	if ( runSignal == NULL ) return;
-	signal_call( runSignal ); // Wake altservers thread up
-	thread_join( altThread, NULL );
 }
 
 static void addalt(int argc, char **argv, void *data)
@@ -121,7 +98,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 /**
  * ONLY called from the passed uplink's main thread
  */
-void altservers_findUplink(dnbd3_uplink_t *uplink)
+void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 {
 	if ( uplink->shutdown )
 		return;
@@ -135,67 +112,11 @@ void altservers_findUplink(dnbd3_uplink_t *uplink)
 	assert( uplink->better.fd == -1 );
 	// it is however possible that an RTT measurement is currently in progress,
 	// so check for that case and do nothing if one is in progress
-	// XXX As this function is only ever called by the image's uplink thread,
-	// it cannot happen that the uplink ends up in this list concurrently
 	mutex_lock( &uplink->rttLock );
-	if ( uplink->rttTestResult == RTT_INPROGRESS ) {
-		for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-			if ( pending[i] != uplink ) continue;
-			// Yep, measuring right now
-			return;
-		}
-	}
-	// Find free slot for measurement
-	uplink->rttTestResult = RTT_INPROGRESS;
-	for (i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-		if ( pending[i] != NULL ) continue;
-		dnbd3_uplink_t *null = NULL;
-		if ( atomic_compare_exchange_strong( &pending[i], &null, uplink ) ) {
-			mutex_unlock( &uplink->rttLock );
-			atomic_thread_fence( memory_order_release );
-			signal_call( runSignal ); // Wake altservers thread up
-			return;
-		}
+	if ( uplink->rttTestResult != RTT_INPROGRESS ) {
+		threadpool_run( &altservers_runCheck, uplink );
 	}
-	// End of loop - no free slot
-	uplink->rttTestResult = RTT_NOT_REACHABLE;
 	mutex_unlock( &uplink->rttLock );
-	logadd( LOG_WARNING, "No more free RTT measurement slots, ignoring a request..." );
-}
-
-/**
- * The given uplink is about to disappear,
- * wait until any pending RTT check is done.
- */
-void altservers_removeUplink(dnbd3_uplink_t *uplink)
-{
-	assert( uplink != NULL );
-	assert( uplink->shutdown );
-	int i;
-	for ( i = 1 ;; ++i ) {
-		atomic_thread_fence( memory_order_acquire );
-		if ( runSignal == NULL ) {
-			// Thread is already done, remove manually
-			uplink->rttTestResult = RTT_NOT_REACHABLE;
-			break;
-		}
-		// Thread still running, wait until test is done
-		bool found = false;
-		for (int i = 0; i < SERVER_MAX_PENDING_ALT_CHECKS; ++i) {
-			if ( pending[i] == uplink ) {
-				found = true;
-				break;
-			}
-		}
-		if ( !found ) // No more test running
-			break;
-		usleep( 10000 ); // 10ms
-		signal_call( runSignal ); // Wake altservers thread up
-		if ( i % 500 == 0 ) {
-			logadd( LOG_INFO, "Still waiting for altserver check for uplink %p...", (void*)uplink );
-		}
-	}
-	logadd( LOG_DEBUG1, "Waited for %d iterations for altservers check when tearing down uplink", i );
 }
 
 /**
@@ -209,90 +130,124 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output
 	if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0;
 	int i, j;
 	int count = 0;
-	int scores[size];
-	int score;
-	mutex_lock( &altServersLock );
+	uint16_t scores[SERVER_MAX_ALTS] = { 0 };
 	if ( size > numAltServers ) size = numAltServers;
-	for (i = 0; i < numAltServers; ++i) {
-		if ( altServers[i].host.type == 0 ) continue; // Slot is empty
-		if ( altServers[i].isPrivate ) continue; // Do not tell clients about private servers
+	mutex_lock( &altServersLock );
+	for ( i = 0; i < numAltServers; ++i ) {
+		if ( altServers[i].host.type == 0 || altServers[i].isPrivate )
+			continue; // Slot is empty or uplink is for replication only
 		if ( host->type == altServers[i].host.type ) {
-			score = altservers_netCloseness( host, &altServers[i].host ) - altServers[i].numFails;
+			scores[i] = 10 + altservers_netCloseness( host, &altServers[i].host );
 		} else {
-			score = -( altServers[i].numFails + 128 ); // Wrong address family
+			scores[i] = 1; // Wrong address family
 		}
-		if ( count == 0 ) {
-			// Trivial - this is the first entry
-			output[0].host = altServers[i].host;
-			output[0].failures = 0;
-			scores[0] = score;
-			count++;
-		} else {
-			// Other entries already exist, insert in proper position
-			for (j = 0; j < size; ++j) {
-				if ( j < count && score <= scores[j] ) continue;
-				if ( j > count ) break; // Should never happen but just in case...
-				if ( j < count && j + 1 < size ) {
-					// Check if we're in the middle and need to move other entries...
-					memmove( &output[j + 1], &output[j], sizeof(dnbd3_server_entry_t) * (size - j - 1) );
-					memmove( &scores[j + 1], &scores[j], sizeof(int) * (size - j - 1) );
-				}
-				if ( count < size ) {
-					count++;
-				}
-				output[j].host = altServers[i].host;
-				output[j].failures = 0;
-				scores[j] = score;
-				break;
+	}
+	while ( count < size ) {
+		i = -1;
+		for ( j = 0; j < numAltServers; ++j ) {
+			if ( scores[j] == 0 )
+				continue;
+			if ( i == -1 || scores[j] > scores[i] ) {
+				i = j;
 			}
 		}
+		if ( i == -1 )
+			break;
+		output[count].host = altServers[i].host;
+		output[count].failures = 0;
+		count++;
 	}
 	mutex_unlock( &altServersLock );
 	return count;
 }
 
+bool altservers_toString(int server, char *buffer, size_t len) // Formats the host of altServers[server] into buffer (at most len bytes are passed on) and returns host_to_string()'s result
+{
+	return host_to_string( &altServers[server].host, buffer, len ); // NOTE(review): 'server' is not range-checked and altServersLock is not taken here
+}
+
+static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now ) // Decides whether altServers[server] should be tried as an uplink candidate; uplink may be NULL (then only global state is considered). Side effect: decrements fail counters of blocked entries
+{
+	dnbd3_alt_local_t *local = ( uplink == NULL ? NULL : &uplink->altData[server] ); // per-uplink state for this server, if an uplink was given
+	dnbd3_alt_server_t *global = &altServers[server]; // shared state; the caller visible in this file (altservers_getListForUplink) holds altServersLock
+	if ( global->isClientOnly || ( !global->isPrivate && _proxyPrivateOnly ) ) // never use client-only servers; skip non-private ones if config demands private only
+		return false;
+	// Blocked locally (image not found on server...)
+	if ( local != NULL && local->blocked ) {
+		if ( --local->fails > 0 ) // every query while blocked counts the fails down; stay blocked until it reaches zero
+			return false;
+		local->blocked = false;
+	}
+	if ( global->blocked ) {
+		if ( timing_diff( &global->lastFail, now ) < SERVER_GLOBAL_DUP_TIME ) // count down at most once per SERVER_GLOBAL_DUP_TIME
+			return false;
+		global->lastFail = *now; // lastFail doubles as the timestamp of the last count-down step
+		if ( --global->fails > 0 )
+			return false;
+		global->blocked = false;
+	}
+	// Not blocked, depend on both fail counters
+	int fails = ( local == NULL ? 0 : local->fails ) + global->fails;
+	return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN; // below the threshold: always usable; above it: usable at random, less likely the more fails there are
+}
+
+int altservers_getHostListForReplication(dnbd3_host_t *servers, int size) // Fills servers[0..n) with hosts of alt servers picked for uplink use and returns n; servers is written for up to 'size' entries
+{
+	int idx[size]; // receives indices into altServers[], not hosts
+	int num = altservers_getListForUplink( NULL, idx, size, -1 ); // no uplink context; -1 means there is no current server to keep in the list
+	for ( int i = 0; i < num; ++i ) {
+		servers[i] = altServers[idx[i]].host; // map the selected index to its host (was altServers[i], which ignored the selection)
+	}
+	return num;
+}
+
 /**
  * Get <size> alt servers. If there are more alt servers than
  * requested, random servers will be picked.
  * This function is suited for finding uplink servers as
  * it includes private servers and ignores any "client only" servers
+ * @param current index of server for current connection, or -1 in panic mode
  */
-int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency)
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current)
 {
-	if ( size <= 0 ) return 0;
-	int count = 0, i;
-	ticks now;
-	timing_get( &now );
+	if ( size <= 0 )
+		return 0;
+	int count = 0;
+	declare_now;
 	mutex_lock( &altServersLock );
-	// Flip first server in list with a random one every time this is called
-	if ( numAltServers > 1 ) {
-		const dnbd3_alt_server_t tmp = altServers[0];
-		do {
-			i = rand() % numAltServers;
-		} while ( i == 0 );
-		altServers[0] = altServers[i];
-		altServers[i] = tmp;
-	}
-	// We iterate over the list twice. First run adds servers with 0 failures only,
-	// second one also considers those that failed (not too many times)
-	if ( size > numAltServers ) size = numAltServers;
-	for (i = 0; i < numAltServers * 2; ++i) {
-		dnbd3_alt_server_t *srv = &altServers[i % numAltServers];
-		if ( srv->host.type == 0 ) continue; // Slot is empty
-		if ( _proxyPrivateOnly && !srv->isPrivate ) continue; // Config says to consider private alt-servers only? ignore!
-		if ( srv->isClientOnly ) continue;
-		bool first = ( i < numAltServers );
-		if ( first ) {
-			if ( srv->numFails > 0 ) continue;
-		} else {
-			if ( srv->numFails == 0 ) continue; // Already added in first iteration
-			if ( !emergency && srv->numFails > SERVER_BAD_UPLINK_THRES // server failed X times in a row
-				&& timing_diff( &srv->lastFail, &now ) < SERVER_BAD_UPLINK_IGNORE ) continue; // and last fail was not too long ago? ignore!
-			if ( !emergency ) srv->numFails--;
+	// If we don't have enough servers to randomize, take a shortcut
+	if ( numAltServers <= size ) {
+		for ( int i = 0; i < numAltServers; ++i ) {
+			if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) {
+				servers[count++] = i;
+			}
+		}
+	} else {
+		// Plenty of alt servers; randomize
+		uint8_t state[SERVER_MAX_ALTS] = { 0 };
+		if ( current != -1 ) { // Make sure we also test the current server
+			servers[count++] = current;
+			state[current] = 2;
+		}
+		for ( int tr = size * 10; tr > 0 && count < size; --tr ) {
+			int idx = rand() % numAltServers;
+			if ( state[idx] != 0 )
+				continue;
+			if ( isUsableForUplink( uplink, idx, &now ) ) {
+				servers[count++] = idx;
+				state[idx] = 2; // Used
+			} else {
+				state[idx] = 1; // Potential
+			}
+		}
+		// If panic mode, consider others too
+		for ( int tr = size * 10; current == -1 && tr > 0 && count < size; --tr ) {
+			int idx = rand() % numAltServers;
+			if ( state[idx] == 2 )
+				continue;
+			servers[count++] = idx;
+			state[idx] = 2; // Used
 		}
-		// server seems ok, include in output and decrease its fail counter
-		output[count++] = srv->host;
-		if ( count >= size ) break;
 	}
 	mutex_unlock( &altServersLock );
 	return count;
@@ -320,7 +275,7 @@ json_t* altservers_toJson()
 			"rtt", rtts,
 			"isPrivate", (int)src[i].isPrivate,
 			"isClientOnly", (int)src[i].isClientOnly,
-			"numFails", src[i].numFails
+			"numFails", src[i].fails
 		);
 		json_array_append_new( list, server );
 	}
@@ -329,32 +284,27 @@ json_t* altservers_toJson()
 
 /**
  * Update rtt history of given server - returns the new average for that server.
- * XXX HOLD altServersLock WHEN CALLING THIS!
  */
-static unsigned int altservers_updateRtt(const dnbd3_host_t * const host, const unsigned int rtt)
+static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt)
 {
-	unsigned int avg = rtt;
-	int i;
-	for (i = 0; i < numAltServers; ++i) {
-		if ( !isSameAddressPort( host, &altServers[i].host ) ) continue;
-		altServers[i].rtt[++altServers[i].rttIndex % SERVER_RTT_PROBES] = rtt;
-#if SERVER_RTT_PROBES == 5
-		avg = (altServers[i].rtt[0] + altServers[i].rtt[1] + altServers[i].rtt[2]
-				+ altServers[i].rtt[3] + altServers[i].rtt[4]) / SERVER_RTT_PROBES;
-#else
-#warning You might want to change the code in altservers_update_rtt if you changed SERVER_RTT_PROBES
-		avg = 0;
-		for (int j = 0; j < SERVER_RTT_PROBES; ++j) {
-			avg += altServers[i].rtt[j];
+	uint32_t avg = 0, j;
+	dnbd3_alt_local_t *local = &uplink->altData[index];
+	mutex_lock( &altServersLock );
+	if ( likely( local->initDone ) ) {
+		local->rtt[++local->rttIndex % SERVER_RTT_PROBES] = rtt;
+		for ( j = 0; j < SERVER_RTT_PROBES; ++j ) {
+			avg += local->rtt[j];
 		}
 		avg /= SERVER_RTT_PROBES;
-#endif
-		// If we got a new rtt value, server must be working
-		if ( altServers[i].numFails > 0 ) {
-			altServers[i].numFails--;
+	} else { // First rtt measurement -- copy to every slot
+		for ( j = 0; j < SERVER_RTT_PROBES; ++j ) {
+			local->rtt[j] = rtt;
 		}
-		break;
+		avg = rtt;
+		local->initDone = true;
 	}
+	altServers[index].rtt[++altServers[index].rttIndex % SERVER_RTT_PROBES] = avg;
+	mutex_unlock( &altServersLock );
 	return avg;
 }
 
@@ -383,40 +333,33 @@ int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2)
  * track of how often servers fail, and consider them disabled for some time if they
  * fail too many times.
  */
-void altservers_serverFailed(const dnbd3_host_t * const host)
+void altservers_serverFailed(int server)
 {
-	int i;
-	int foundIndex = -1, lastOk = -1;
-	ticks now;
-	timing_get( &now );
+	declare_now;
 	mutex_lock( &altServersLock );
-	for (i = 0; i < numAltServers; ++i) {
-		if ( foundIndex == -1 ) {
-			// Looking for the failed server in list
-			if ( isSameAddressPort( host, &altServers[i].host ) ) {
-				foundIndex = i;
-			}
-		} else if ( altServers[i].host.type != 0 && altServers[i].numFails == 0 ) {
-			lastOk = i;
+	if ( timing_diff( &altServers[server].lastFail, &now ) > SERVER_GLOBAL_DUP_TIME ) {
+		altServers[server].lastFail = now;
+		if ( altServers[server].fails++ >= SERVER_BAD_UPLINK_MAX ) {
+			altServers[server].blocked = true;
 		}
 	}
-	// Do only increase counter if last fail was not too recent. This is
-	// to prevent the counter from increasing rapidly if many images use the
-	// same uplink. If there's a network hickup, all uplinks will call this
-	// function and would increase the counter too quickly, disabling the server.
-	if ( foundIndex != -1 && timing_diff( &altServers[foundIndex].lastFail, &now ) > SERVER_RTT_INTERVAL_INIT ) {
-		altServers[foundIndex].numFails += SERVER_UPLINK_FAIL_INCREASE;
-		altServers[foundIndex].lastFail = now;
-		if ( lastOk != -1 ) {
-			// Make sure non-working servers are put at the end of the list, so they're less likely
-			// to get picked when testing servers for uplink connections.
-			const dnbd3_alt_server_t tmp = altServers[foundIndex];
-			altServers[foundIndex] = altServers[lastOk];
-			altServers[lastOk] = tmp;
-		}
+	mutex_unlock( &altServersLock );
+}
+
+/**
+ * Called from RTT checker if connecting to a server succeeded but
+ * subsequently selecting the given image failed. Handle this within
+ * the uplink and don't increase the global fail counter.
+ */
+static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server)
+{
+	mutex_lock( &altServersLock );
+	if ( uplink->altData[server].fails++ >= SERVER_BAD_UPLINK_MAX ) {
+		uplink->altData[server].blocked = true;
 	}
 	mutex_unlock( &altServersLock );
 }
+
 /**
  * Mainloop of this module. It will wait for requests by uplinks to find a
  * suitable uplink server for them. If found, it will tell the uplink about
@@ -425,206 +368,213 @@ void altservers_serverFailed(const dnbd3_host_t * const host)
  * will update quite quickly. Needs to be improved some time, ie. by only
  * updating the rtt if the last update was at least X seconds ago.
  */
-static void *altservers_main(void *data UNUSED)
+static void *altservers_runCheck(void *data) // Worker handed to threadpool_run(); data is the dnbd3_uplink_t* to run an alt-server check for. Always returns NULL
+{
+	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
+
+	assert( uplink != NULL );
+	setThreadName( "altserver-check" );
+	altservers_findUplinkInternal( uplink );
+	// Piggy-backed housekeeping: close unused image fds if _closeUnusedFd is set and the timer has expired
+	// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
+	declare_now;
+	if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) { // NOTE(review): nextCloseUnusedFd is a plain file-scope static touched by every check worker without a lock
+		timing_gets( &nextCloseUnusedFd, 900 ); // re-arm the timer (900, presumably seconds -- confirm against timing_gets)
+		image_closeUnusedFd();
+	}
+	return NULL;
+}
+
+void altservers_findUplink(dnbd3_uplink_t *uplink) // Blocking variant: runs the alt-server check in the calling thread and returns only once no check is in progress for this uplink
+{
+	altservers_findUplinkInternal( uplink ); // returns immediately if a check is already running or a result is pending
+	while ( uplink->rttTestResult == RTT_INPROGRESS ) { // so poll until a check running elsewhere has published its result
+		usleep( 5000 ); // 5 ms between polls
+	}
+}
+
+int altservers_hostToIndex(dnbd3_host_t *host) // Linear search of altServers[] for an entry matching host per isSameAddressPort(); returns its index, or -1 if there is none
+{
+	for ( int i = 0; i < numAltServers; ++i ) { // NOTE(review): altServersLock is not taken while scanning
+		if ( isSameAddressPort( host, &altServers[i].host ) )
+			return i;
+	}
+	return -1;
+}
+
+const dnbd3_host_t* altservers_indexToHost(int server) // Returns a pointer to the host stored in altServers[server]; the pointer refers into the static table, not a copy
+{
+	return &altServers[server].host; // NOTE(review): 'server' is not range-checked (-1 from altservers_hostToIndex would be out of bounds)
+}
+
+// XXX Sync call above must block until async worker has finished XXX
+static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 {
 	const int ALTS = 4;
-	int ret, itLink, itAlt, numAlts;
-	bool found;
-	char buffer[DNBD3_BLOCK_SIZE ];
-	dnbd3_reply_t reply;
-	dnbd3_host_t servers[ALTS + 1];
-	serialized_buffer_t serialized;
+	int ret, itAlt, numAlts, current;
+	bool panic;
+	int servers[ALTS + 1];
 	struct timespec start, end;
-	ticks nextCloseUnusedFd;
 
-	setThreadName( "altserver-check" );
-	blockNoncriticalSignals();
-	timing_gets( &nextCloseUnusedFd, 900 );
-	// LOOP
-	while ( !_shutdown ) {
-		// Wait 5 seconds max.
-		ret = signal_wait( runSignal, 5000 );
-		if ( _shutdown ) goto cleanup;
-		if ( ret == SIGNAL_ERROR ) {
-			if ( errno == EAGAIN || errno == EINTR ) continue;
-			logadd( LOG_WARNING, "Error %d on signal_clear on alservers_main! Things will break!", errno );
-			usleep( 100000 );
+	if ( _shutdown )
+		return;
+	mutex_lock( &uplink->rttLock );
+	// Maybe we already have a result, or check is currently running
+	if ( uplink->better.fd != -1 || uplink->rttTestResult == RTT_INPROGRESS ) {
+		mutex_unlock( &uplink->rttLock );
+		return;
+	}
+	assert( uplink->rttTestResult != RTT_DOCHANGE );
+	uplink->rttTestResult = RTT_INPROGRESS;
+	panic = ( uplink->current.fd == -1 );
+	current = uplink->current.index; // Current server index (or last one in panic mode)
+	mutex_unlock( &uplink->rttLock );
+	// First, get 4 alt servers
+	numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current );
+	// If we're already connected and only got one server anyways, there isn't much to do
+	if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) {
+		uplink->rttTestResult = RTT_DONTCHANGE;
+		return;
+	}
+	dnbd3_image_t * const image = image_lock( uplink->image );
+	if ( image == NULL ) { // Check again after locking
+		uplink->rttTestResult = RTT_NOT_REACHABLE;
+		logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
+		return;
+	}
+	LOG( LOG_DEBUG2, "Running alt check for %s:%d", image->name, (int)image->rid );
+	assert( uplink->rttTestResult == RTT_INPROGRESS );
+	// Test them all
+	dnbd3_server_connection_t best = { .fd = -1 };
+	unsigned long bestRtt = RTT_UNREACHABLE;
+	unsigned long currentRtt = RTT_UNREACHABLE;
+	for (itAlt = 0; itAlt < numAlts; ++itAlt) {
+		int server = servers[itAlt];
+		// Connect
+		clock_gettime( BEST_CLOCK_SOURCE, &start );
+		int sock = sock_connect( &altServers[server].host, 750, 1000 );
+		if ( sock == -1 ) { // Connection failed means global error
+			altservers_serverFailed( server );
+			continue;
 		}
-		// Work your way through the queue
-		atomic_thread_fence( memory_order_acquire );
-		for (itLink = 0; itLink < SERVER_MAX_PENDING_ALT_CHECKS; ++itLink) {
-			dnbd3_uplink_t * const uplink = pending[itLink];
-			if ( uplink == NULL )
-				continue;
-			// First, get 4 alt servers
-			numAlts = altservers_getListForUplink( servers, ALTS, uplink->current.fd == -1 );
-			// If we're already connected and only got one server anyways, there isn't much to do
-			if ( numAlts <= 1 && uplink->current.fd != -1 ) {
-				uplink->rttTestResult = RTT_DONTCHANGE;
-				continue;
-			}
-			dnbd3_image_t * const image = image_lock( uplink->image );
-			if ( image == NULL ) { // Check again after locking
-				mutex_lock( &uplink->rttLock );
-				uplink->rttTestResult = RTT_NOT_REACHABLE;
-				assert( pending[itLink] == uplink );
-				pending[itLink] = NULL;
-				mutex_unlock( &uplink->rttLock );
-				logadd( LOG_WARNING, "Image has gone away that was queued for RTT measurement" );
-				continue;
-			}
-			LOG( LOG_DEBUG2, "[%d] Running alt check", itLink );
-			assert( uplink->rttTestResult == RTT_INPROGRESS );
-			if ( uplink->current.fd != -1 ) {
-				// Add current server if not already in list
-				found = false;
-				for (itAlt = 0; itAlt < numAlts; ++itAlt) {
-					if ( !isSameAddressPort( &uplink->current.host, &servers[itAlt] ) ) continue;
-					found = true;
-					break;
-				}
-				if ( !found ) servers[numAlts++] = uplink->current.host;
-			}
-			// Test them all
-			int bestSock = -1;
-			int bestIndex = -1;
-			int bestProtocolVersion = -1;
-			unsigned long bestRtt = RTT_UNREACHABLE;
-			unsigned long currentRtt = RTT_UNREACHABLE;
-			for (itAlt = 0; itAlt < numAlts; ++itAlt) {
-				usleep( 1000 ); // Wait a very short moment for the network to recover (we might be doing lots of measurements...)
-				// Connect
-				clock_gettime( BEST_CLOCK_SOURCE, &start );
-				int sock = sock_connect( &servers[itAlt], 750, 1000 );
-				if ( sock < 0 ) continue;
-				// Select image ++++++++++++++++++++++++++++++
-				if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
-					goto server_failed;
-				}
-				// See if selecting the image succeeded ++++++++++++++++++++++++++++++
-				uint16_t protocolVersion, rid;
-				uint64_t imageSize;
-				char *name;
-				if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
-					goto server_image_not_available;
-				}
-				if ( protocolVersion < MIN_SUPPORTED_SERVER ) goto server_failed;
-				if ( name == NULL || strcmp( name, image->name ) != 0 ) {
-					ERROR_GOTO( server_failed, "[RTT] Server offers image '%s'", name );
-				}
-				if ( rid != image->rid ) {
-					ERROR_GOTO( server_failed, "[RTT] Server provides rid %d", (int)rid );
-				}
-				if ( imageSize != image->virtualFilesize ) {
-					ERROR_GOTO( server_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
-				}
-				// Request first block (NOT random!) ++++++++++++++++++++++++++++++
-				if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
-					LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", itLink );
-				}
-				// See if requesting the block succeeded ++++++++++++++++++++++
-				if ( !dnbd3_get_reply( sock, &reply ) ) {
-					LOG_GOTO( server_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", itLink );
-				}
-				// check reply header
-				if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
-					ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
-				}
-				if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
-					ERROR_GOTO( server_failed, "[RTT%d] Could not read first block payload", itLink );
-				}
-				clock_gettime( BEST_CLOCK_SOURCE, &end );
-				// Measurement done - everything fine so far
-				mutex_lock( &altServersLock );
-				mutex_lock( &uplink->rttLock );
-				const bool isCurrent = isSameAddressPort( &servers[itAlt], &uplink->current.host );
-				// Penaltize rtt if this was a cycle; this will treat this server with lower priority
-				// in the near future too, so we prevent alternating between two servers that are both
-				// part of a cycle and have the lowest latency.
-				const unsigned int rtt = (unsigned int)((end.tv_sec - start.tv_sec) * 1000000
-						+ (end.tv_nsec - start.tv_nsec) / 1000
-						+ ( (isCurrent && uplink->cycleDetected) ? 1000000 : 0 )); // µs
-				unsigned int avg = altservers_updateRtt( &servers[itAlt], rtt );
-				mutex_unlock( &altServersLock );
-				// If a cycle was detected, or we lost connection to the current (last) server, penaltize it one time
-				if ( ( uplink->cycleDetected || uplink->current.fd == -1 ) && isCurrent ) avg = (avg * 2) + 50000;
-				mutex_unlock( &uplink->rttLock );
-				if ( uplink->current.fd != -1 && isCurrent ) {
-					// Was measuring current server
-					currentRtt = avg;
-					close( sock );
-				} else if ( avg < bestRtt ) {
-					// Was another server, update "best"
-					if ( bestSock != -1 ) close( bestSock );
-					bestSock = sock;
-					bestRtt = avg;
-					bestIndex = itAlt;
-					bestProtocolVersion = protocolVersion;
-				} else {
-					// Was too slow, ignore
-					close( sock );
-				}
-				// We're done, call continue
-				continue;
-				// Jump here if anything went wrong
-				// This will cleanup and continue
-				server_failed: ;
-				altservers_serverFailed( &servers[itAlt] );
-				server_image_not_available: ;
-				close( sock );
-			}
-			// Done testing all servers. See if we should switch
-			if ( bestSock != -1 && (uplink->current.fd == -1 || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
-				// yep
-				if ( currentRtt > 10000000 || uplink->current.fd == -1 ) {
-					LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
-				} else {
-					LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
-				}
-				sock_setTimeout( bestSock, _uplinkTimeout );
-				mutex_lock( &uplink->rttLock );
-				uplink->better.fd = bestSock;
-				uplink->better.host = servers[bestIndex];
-				uplink->better.version = bestProtocolVersion;
-				uplink->rttTestResult = RTT_DOCHANGE;
-				mutex_unlock( &uplink->rttLock );
-				signal_call( uplink->signal );
-			} else if ( bestSock == -1 && currentRtt == RTT_UNREACHABLE ) {
-				// No server was reachable
-				mutex_lock( &uplink->rttLock );
-				uplink->rttTestResult = RTT_NOT_REACHABLE;
-				mutex_unlock( &uplink->rttLock );
-			} else {
-				// nope
-				if ( bestSock != -1 ) close( bestSock );
-				mutex_lock( &uplink->rttLock );
-				uplink->rttTestResult = RTT_DONTCHANGE;
-				uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
-				mutex_unlock( &uplink->rttLock );
-				if ( !image->working ) {
-					image->working = true;
-					LOG( LOG_DEBUG1, "[%d] No better alt server found, enabling again", itLink );
-				}
-			}
-			image_release( image );
-			// end of loop over all pending uplinks
-			assert( pending[itLink] == uplink );
-			pending[itLink] = NULL;
-			atomic_thread_fence( memory_order_release );
+		// Select image ++++++++++++++++++++++++++++++
+		if ( !dnbd3_select_image( sock, image->name, image->rid, SI_SERVER_FLAGS ) ) {
+			goto image_failed;
 		}
-		// Save cache maps of all images if applicable
-		declare_now;
-		// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
-		if ( _closeUnusedFd && timing_reached( &nextCloseUnusedFd, &now ) ) {
-			timing_gets( &nextCloseUnusedFd, 900 );
-			image_closeUnusedFd();
+		// See if selecting the image succeeded ++++++++++++++++++++++++++++++
+		uint16_t protocolVersion, rid;
+		uint64_t imageSize;
+		char *name;
+		serialized_buffer_t serialized;
+		if ( !dnbd3_select_image_reply( &serialized, sock, &protocolVersion, &name, &rid, &imageSize ) ) {
+			goto image_failed;
 		}
+		if ( protocolVersion < MIN_SUPPORTED_SERVER ) { // Server version unsupported; global fail
+			goto server_failed;
+		}
+		if ( name == NULL || strcmp( name, image->name ) != 0 ) {
+			ERROR_GOTO( image_failed, "[RTT] Server offers image '%s' instead of '%s'", name, image->name );
+		}
+		if ( rid != image->rid ) {
+			ERROR_GOTO( image_failed, "[RTT] Server provides rid %d instead of %d", (int)rid, (int)image->rid );
+		}
+		if ( imageSize != image->virtualFilesize ) {
+			ERROR_GOTO( image_failed, "[RTT] Remote size: %" PRIu64 ", expected: %" PRIu64, imageSize, image->virtualFilesize );
+		}
+		// Request first block (NOT random!) ++++++++++++++++++++++++++++++
+		if ( !dnbd3_get_block( sock, 0, DNBD3_BLOCK_SIZE, 0, COND_HOPCOUNT( protocolVersion, 1 ) ) ) {
+			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Could not request first block", server );
+		}
+		// See if requesting the block succeeded ++++++++++++++++++++++
+		dnbd3_reply_t reply;
+		if ( !dnbd3_get_reply( sock, &reply ) ) {
+			LOG_GOTO( image_failed, LOG_DEBUG1, "[RTT%d] Received corrupted reply header after CMD_GET_BLOCK", server );
+		}
+		// check reply header
+		if ( reply.cmd != CMD_GET_BLOCK || reply.size != DNBD3_BLOCK_SIZE ) {
+			// Sanity check failed; count this as global error (malicious/broken server)
+			ERROR_GOTO( server_failed, "[RTT] Reply to first block request is %" PRIu32 " bytes", reply.size );
+		}
+		// flush payload to include this into measurement
+		char buffer[DNBD3_BLOCK_SIZE];
+		if ( recv( sock, buffer, DNBD3_BLOCK_SIZE, MSG_WAITALL ) != DNBD3_BLOCK_SIZE ) {
+			ERROR_GOTO( image_failed, "[RTT%d] Could not read first block payload", server );
+		}
+		clock_gettime( BEST_CLOCK_SOURCE, &end );
+		// Measurement done - everything fine so far
+		mutex_lock( &uplink->rttLock );
+		const bool isCurrent = ( uplink->current.index == server );
+		mutex_unlock( &uplink->rttLock );
+		// Penalize rtt if this was a cycle; this will treat this server with lower priority
+		// in the near future too, so we prevent alternating between two servers that are both
+		// part of a cycle and have the lowest latency.
+		uint32_t rtt = (uint32_t)((end.tv_sec - start.tv_sec) * 1000000
+				+ (end.tv_nsec - start.tv_nsec) / 1000); // µs
+		uint32_t avg = altservers_updateRtt( uplink, server, rtt );
+		// If a cycle was detected, or we lost connection to the current (last) server, penalize it one time
+		if ( ( uplink->cycleDetected || panic ) && isCurrent ) {
+			avg = (avg * 2) + 50000;
+		}
+		if ( !panic && isCurrent ) {
+			// Was measuring current server
+			currentRtt = avg;
+			close( sock );
+		} else if ( avg < bestRtt ) {
+			// Was another server, update "best"
+			if ( best.fd != -1 ) {
+				close( best.fd );
+			}
+			best.fd = sock;
+			bestRtt = avg;
+			best.index = server;
+			best.version = protocolVersion;
+		} else {
+			// Was too slow, ignore
+			close( sock );
+		}
+		// We're done, call continue
+		continue;
+		// Jump here if anything went wrong
+		// This will cleanup and continue
+image_failed:
+		altservers_imageFailed( uplink, server );
+		goto failed;
+server_failed:
+		altservers_serverFailed( server );
+failed:
+		close( sock );
 	}
-	cleanup: ;
-	if ( runSignal != NULL ) {
-		signal_close( runSignal );
+	// Done testing all servers. See if we should switch
+	if ( best.fd != -1 && (panic || (bestRtt < 10000000 && RTT_THRESHOLD_FACTOR(currentRtt) > bestRtt)) ) {
+		// yep
+		if ( currentRtt > 10000000 || panic ) {
+			LOG( LOG_DEBUG1, "Change - best: %luµs, current: -", bestRtt );
+		} else {
+			LOG( LOG_DEBUG1, "Change - best: %luµs, current: %luµs", bestRtt, currentRtt );
+		}
+		sock_setTimeout( best.fd, _uplinkTimeout );
+		mutex_lock( &uplink->rttLock );
+		uplink->better = best;
+		uplink->rttTestResult = RTT_DOCHANGE;
+		mutex_unlock( &uplink->rttLock );
+		signal_call( uplink->signal );
+	} else if ( best.fd == -1 && currentRtt == RTT_UNREACHABLE ) {
+		// No server was reachable, including current
+		uplink->rttTestResult = RTT_NOT_REACHABLE;
+	} else {
+		// nope
+		if ( best.fd != -1 ) {
+			close( best.fd );
+		}
+		if ( !image->working || uplink->cycleDetected ) {
+			image->working = true;
+			LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid );
+		}
+		uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
+		mutex_lock( &uplink->rttLock );
+		uplink->rttTestResult = RTT_DONTCHANGE;
+		mutex_unlock( &uplink->rttLock );
 	}
-	runSignal = NULL;
-	return NULL ;
+	image_release( image );
 }
 
diff --git a/src/server/altservers.h b/src/server/altservers.h
index e03b900..8e2b964 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -7,23 +7,27 @@ struct json_t;
 
 void altservers_init();
 
-void altservers_shutdown();
-
 int altservers_load();
 
 bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate, const int isClientOnly);
 
-void altservers_findUplink(dnbd3_uplink_t *uplink);
+void altservers_findUplinkAsync(dnbd3_uplink_t *uplink);
 
-void altservers_removeUplink(dnbd3_uplink_t *uplink);
+void altservers_findUplink(dnbd3_uplink_t *uplink);
 
 int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
 
-int altservers_getListForUplink(dnbd3_host_t *output, int size, int emergency);
+int altservers_getHostListForReplication(dnbd3_host_t *servers, int size);
+
+bool altservers_toString(int server, char *buffer, size_t len);
 
 int altservers_netCloseness(dnbd3_host_t *host1, dnbd3_host_t *host2);
 
-void altservers_serverFailed(const dnbd3_host_t * const host);
+void altservers_serverFailed(int server);
+
+int altservers_hostToIndex(dnbd3_host_t *host);
+
+const dnbd3_host_t* altservers_indexToHost(int server);
 
 struct json_t* altservers_toJson();
 
diff --git a/src/server/globals.h b/src/server/globals.h
index 659e5a2..4d97c6b 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -30,10 +30,31 @@ typedef struct
 	uint8_t hopCount;      // How many hops this request has already taken across proxies
 } dnbd3_queued_request_t;
 
+typedef struct
+{
+	int fails;                    // Hard fail: Connection failed
+	int rttIndex;
+	uint32_t rtt[SERVER_RTT_PROBES];
+	bool isPrivate, isClientOnly;
+	bool blocked;                 // If true count down fails until 0 to enable again
+	ticks lastFail;               // Last hard fail
+	dnbd3_host_t host;
+	char comment[COMMENT_LENGTH];
+} dnbd3_alt_server_t;
+
+typedef struct
+{
+	int fails;                    // Soft fail: Image not found
+	int rttIndex;
+	uint32_t rtt[SERVER_RTT_PROBES];
+	bool blocked;                 // True if server is to be ignored and fails should be counted down
+	bool initDone;
+} dnbd3_alt_local_t;
+
 typedef struct {
-	int fd;             // Socket fd for this connection
-	int version;        // Protocol version of remote server
-	dnbd3_host_t host;  // IP/Port of remote server
+	int fd;            // Socket fd for this connection
+	int version;       // Protocol version of remote server
+	int index;         // Index of the server in the alt servers list
 } dnbd3_server_connection_t;
 
 #define RTT_IDLE 0 // Not in progress
@@ -51,7 +72,7 @@ struct _dnbd3_uplink
 	pthread_mutex_t queueLock;  // lock for synchronization on request queue etc.
 	dnbd3_image_t *image;       // image that this uplink is used for; do not call get/release for this pointer
 	pthread_mutex_t rttLock;    // When accessing rttTestResult, betterFd or betterServer
-	int rttTestResult;          // RTT_*
+	atomic_int rttTestResult;   // RTT_*
 	int cacheFd;                // used to write to the image, in case it is relayed. ONLY USE FROM UPLINK THREAD!
 	uint8_t *recvBuffer;        // Buffer for receiving payload
 	uint32_t recvBufferLen;     // Len of ^^
@@ -65,19 +86,9 @@ struct _dnbd3_uplink
 	atomic_int queueLen;        // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
 };
 
-typedef struct
-{
-	char comment[COMMENT_LENGTH];
-	dnbd3_host_t host;
-	unsigned int rtt[SERVER_RTT_PROBES];
-	unsigned int rttIndex;
-	bool isPrivate, isClientOnly;
-	ticks lastFail;
-	int numFails;
-} dnbd3_alt_server_t;
-
 typedef struct
 {
 	uint8_t host[16];
diff --git a/src/server/image.c b/src/server/image.c
index d250715..1a6e0f8 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1178,7 +1178,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
 	dnbd3_host_t servers[REP_NUM_SRV];
 	int uplinkSock = -1;
 	dnbd3_host_t uplinkServer;
-	const int count = altservers_getListForUplink( servers, REP_NUM_SRV, false );
+	const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV );
 	uint16_t remoteProtocolVersion;
 	uint16_t remoteRid = revision;
 	uint64_t remoteImageSize;
@@ -1491,7 +1491,7 @@ json_t* image_getListAsJson()
 	json_t *imagesJson = json_array();
 	json_t *jsonImage;
 	int i;
-	char uplinkName[100] = { 0 };
+	char uplinkName[100];
 	uint64_t bytesReceived;
 	int completeness, idleTime;
 	declare_now;
@@ -1508,7 +1508,7 @@ json_t* image_getListAsJson()
 			uplinkName[0] = '\0';
 		} else {
 			bytesReceived = image->uplink->bytesReceived;
-			if ( image->uplink->current.fd == -1 || !host_to_string( &image->uplink->current.host, uplinkName, sizeof(uplinkName) ) ) {
+			if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) {
 				uplinkName[0] = '\0';
 			}
 		}
diff --git a/src/server/net.c b/src/server/net.c
index 7f3c1ce..4976eea 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -669,11 +669,19 @@ static void removeFromList(dnbd3_client_t *client)
 {
 	int i;
 	mutex_lock( &_clients_lock );
-	for ( i = _num_clients - 1; i >= 0; --i ) {
-		if ( _clients[i] == client ) {
-			_clients[i] = NULL;
+	if ( _num_clients != 0 ) {
+		for ( i = _num_clients - 1; i >= 0; --i ) {
+			if ( _clients[i] == client ) {
+				_clients[i] = NULL;
+				break;
+			}
+		}
+		if ( i != 0 && i + 1 == _num_clients ) {
+			do {
+				i--;
+			} while ( _clients[i] == NULL && i > 0 );
+			_num_clients = i + 1;
 		}
-		if ( _clients[i] == NULL && i + 1 == _num_clients ) --_num_clients;
 	}
 	mutex_unlock( &_clients_lock );
 }
diff --git a/src/server/server.c b/src/server/server.c
index 838aec2..640048a 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -121,9 +121,6 @@ void dnbd3_cleanup()
 	// Disable threadpool
 	threadpool_close();
 
-	// Terminate the altserver checking thread
-	altservers_shutdown();
-
 	// Terminate all uplinks
 	image_killUplinks();
 
@@ -198,6 +195,11 @@ int main(int argc, char *argv[])
 		case LONGOPT_CRC4:
 			return image_generateCrcFile( optarg ) ? 0 : EXIT_FAILURE;
 		case LONGOPT_ASSERT:
+			printf( "Testing use after free:\n" );
+			volatile char * volatile test = malloc( 10 );
+			test[0] = 1;
+			free( test );
+			test[1] = 2;
 			printf( "Testing a failing assertion:\n" );
 			assert( 4 == 5 );
 			printf( "Assertion 4 == 5 seems to hold. ;-)\n" );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index e21e28c..6c85580 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -96,17 +96,18 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->bytesReceived = 0;
 	uplink->idleTime = 0;
 	uplink->queueLen = 0;
-	mutex_lock( &uplink->sendMutex );
-	uplink->current.fd = -1;
-	mutex_unlock( &uplink->sendMutex );
 	uplink->cacheFd = -1;
 	uplink->signal = NULL;
 	uplink->replicationHandle = REP_NONE;
 	mutex_lock( &uplink->rttLock );
+	mutex_lock( &uplink->sendMutex );
+	uplink->current.fd = -1;
+	mutex_unlock( &uplink->sendMutex );
 	uplink->cycleDetected = false;
-	if ( sock >= 0 ) {
+	if ( sock != -1 ) {
 		uplink->better.fd = sock;
-		uplink->better.host = *host;
+		int index = altservers_hostToIndex( host );
+		uplink->better.index = index == -1 ? 0 : index; // Prevent invalid array access
 		uplink->rttTestResult = RTT_DOCHANGE;
 		uplink->better.version = version;
 	} else {
@@ -116,7 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	mutex_unlock( &uplink->rttLock );
 	uplink->recvBufferLen = 0;
 	uplink->shutdown = false;
-	if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)link ) ) {
+	if ( 0 != thread_create( &(uplink->thread), NULL, &uplink_mainloop, (void *)uplink ) ) {
 		logadd( LOG_ERROR, "Could not start thread for new uplink." );
 		goto failure;
 	}
@@ -148,8 +149,8 @@ void uplink_shutdown(dnbd3_image_t *image)
 	}
 	dnbd3_uplink_t * const uplink = image->uplink;
 	mutex_lock( &uplink->queueLock );
-	if ( !uplink->shutdown ) {
-		uplink->shutdown = true;
+	bool exp = false;
+	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
 		signal_call( uplink->signal );
 		thread = uplink->thread;
 		join = true;
@@ -211,13 +212,11 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
-	if ( hops != 0 && isSameAddress( &uplink->current.host, &client->host ) ) {
-		mutex_unlock( &client->image->lock );
-		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
-		mutex_lock( &uplink->rttLock );
+	if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
-		mutex_unlock( &uplink->rttLock );
 		signal_call( uplink->signal );
+		mutex_unlock( &client->image->lock );
+		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
 		return false;
 	}
 
@@ -256,12 +255,10 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		}
 	}
 	if ( unlikely( requestLoop ) ) {
-		mutex_unlock( &uplink->queueLock );
-		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
-		mutex_lock( &uplink->rttLock );
 		uplink->cycleDetected = true;
-		mutex_unlock( &uplink->rttLock );
 		signal_call( uplink->signal );
+		mutex_unlock( &uplink->queueLock );
+		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
 		return false;
 	}
 	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
@@ -311,6 +308,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( foundExisting != -1 )
 		return true; // Attached to pending request, do nothing
 
+	usleep( 10000 );
+
 	// See if we can fire away the request
 	if ( mutex_trylock( &uplink->sendMutex ) != 0 ) {
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
@@ -342,7 +341,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 				if ( state == -1 ) {
 					logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
 				} else if ( state == ULR_NEW ) {
-					logadd( LOG_DEBUG2, "Succesful direct uplink request" );
+					//logadd( LOG_DEBUG2, "Direct uplink request" );
 				} else {
 					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
 				}
@@ -352,10 +351,8 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		}
 	}
 
-	if ( foundExisting == -1 ) { // Only wake up uplink thread if the request needs to be relayed
-		if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
-			logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
-		}
+	if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
+		logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
 	}
 	return true;
 }
@@ -443,7 +440,7 @@ static void* uplink_mainloop(void *data)
 			uplink->image->working = true;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
-			if ( host_to_string( &uplink->current.host, buffer + 1, sizeof(buffer) - 1 ) ) {
+			if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
 				logadd( LOG_DEBUG1, "(Uplink %s) Now connected to %s\n", uplink->image->name, buffer + 1 );
 				setThreadName( buffer );
 			}
@@ -525,9 +522,7 @@ static void* uplink_mainloop(void *data)
 			}
 		}
 		// See if we should trigger an RTT measurement
-		mutex_lock( &uplink->rttLock );
-		const int rttTestResult = uplink->rttTestResult;
-		mutex_unlock( &uplink->rttLock );
+		int rttTestResult = uplink->rttTestResult;
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
 			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
 				// It seems it's time for a check
@@ -538,7 +533,7 @@ static void* uplink_mainloop(void *data)
 					goto cleanup;
 				} else if ( !uplink_connectionShouldShutdown( uplink ) ) {
 					// Not complete - do measurement
-					altservers_findUplink( uplink ); // This will set RTT_INPROGRESS (synchronous)
+					altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous)
 					if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
 						uplink->nextReplicationIndex = 0;
 					}
@@ -547,11 +542,9 @@ static void* uplink_mainloop(void *data)
 				timing_set( &nextAltCheck, &now, altCheckInterval );
 			}
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
-			mutex_lock( &uplink->rttLock );
-			uplink->rttTestResult = RTT_IDLE;
-			mutex_unlock( &uplink->rttLock );
+			atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE );
 			discoverFailCount++;
-			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_BACKOFF_COUNT ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
+			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
 		}
 #ifdef _DEBUG
 		if ( uplink->current.fd != -1 && !uplink->shutdown ) {
@@ -581,36 +574,38 @@ static void* uplink_mainloop(void *data)
 #endif
 	}
 	cleanup: ;
-	if ( !uplink->shutdown ) {
-		uplink->shutdown = true;
+	// Detach depends on whether someone is joining this thread...
+	bool exp = false;
+	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
 		thread_detach( uplink->thread );
 	}
-	altservers_removeUplink( uplink );
 	uplink_saveCacheMap( uplink );
-	mutex_lock( &uplink->image->lock );
-	if ( uplink->image->uplink == uplink ) {
-		uplink->image->uplink = NULL;
+	dnbd3_image_t *image = uplink->image;
+	mutex_lock( &image->lock );
+	// Image might not be in the list anymore, but we want to prevent it from being freed in either case
+	if ( image->uplink == uplink ) {
+		image->uplink = NULL;
 	}
+	mutex_unlock( &image->lock ); // Do NOT use image without locking it
 	mutex_lock( &uplink->queueLock );
-	const int fd = uplink->current.fd;
-	const dnbd3_signal_t* signal = uplink->signal;
-	mutex_lock( &uplink->sendMutex );
-	uplink->current.fd = -1;
-	mutex_unlock( &uplink->sendMutex );
-	uplink->signal = NULL;
-	// Do not access uplink->image after unlocking, since we set
-	// image->uplink to NULL. Acquire with image_lock first,
-	// like done below when checking whether to re-init uplink
-	mutex_unlock( &uplink->image->lock );
-	mutex_unlock( &uplink->queueLock );
-	if ( fd != -1 ) close( fd );
-	if ( signal != NULL ) signal_close( signal );
-	// Wait for the RTT check to finish/fail if it's in progress
-	while ( uplink->rttTestResult == RTT_INPROGRESS )
+	// Wait for active RTT measurement to finish
+	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
 		usleep( 10000 );
+	}
+	signal_close( uplink->signal );
+	mutex_lock( &uplink->rttLock );
+	mutex_lock( &uplink->sendMutex );
+	if ( uplink->current.fd != -1 ) {
+		close( uplink->current.fd );
+		uplink->current.fd = -1;
+	}
 	if ( uplink->better.fd != -1 ) {
 		close( uplink->better.fd );
+		uplink->better.fd = -1;
 	}
+	mutex_unlock( &uplink->sendMutex );
+	mutex_unlock( &uplink->rttLock );
+	mutex_unlock( &uplink->queueLock );
 	mutex_destroy( &uplink->queueLock );
 	mutex_destroy( &uplink->rttLock );
 	mutex_destroy( &uplink->sendMutex );
@@ -619,9 +614,9 @@ static void* uplink_mainloop(void *data)
 	if ( uplink->cacheFd != -1 ) {
 		close( uplink->cacheFd );
 	}
-	dnbd3_image_t *image = image_lock( uplink->image );
 	free( uplink ); // !!!
-	if ( image != NULL ) {
+	if ( image_lock( image ) != NULL ) {
+		// Image is still in list...
 		if ( !_shutdown && image->cache_map != NULL ) {
 			// Ingegrity checker must have found something in the meantime
 			uplink_init( image, -1, NULL, 0 );
@@ -656,7 +651,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
 			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			altservers_serverFailed( &uplink->current.host );
+			altservers_serverFailed( uplink->current.index );
 			return;
 		}
 		mutex_lock( &uplink->queueLock );
@@ -973,7 +968,7 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
 	if ( uplink->current.fd == -1 )
 		return;
-	altservers_serverFailed( &uplink->current.host );
+	altservers_serverFailed( uplink->current.index );
 	mutex_lock( &uplink->sendMutex );
 	close( uplink->current.fd );
 	uplink->current.fd = -1;
@@ -1138,3 +1133,13 @@ static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink)
 			&& ( _backgroundReplication != BGR_FULL || _bgrMinClients > uplink->image->users ) );
 }
 
+bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
+{
+	int current;
+	mutex_lock( &uplink->rttLock );
+	current = uplink->current.fd == -1 ? -1 : uplink->current.index;
+	mutex_unlock( &uplink->rttLock );
+	if ( current == -1 )
+		return false;
+	return altservers_toString( current, buffer, len );
+}
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 4fd41b0..acc8e11 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -16,4 +16,6 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 
 void uplink_shutdown(dnbd3_image_t *image);
 
+bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len);
+
 #endif /* UPLINK_H_ */
diff --git a/src/serverconfig.h b/src/serverconfig.h
index 0cbb320..239f0a2 100644
--- a/src/serverconfig.h
+++ b/src/serverconfig.h
@@ -6,10 +6,12 @@
 // +++++ Performance/memory related
 #define SERVER_MAX_CLIENTS 4000
 #define SERVER_MAX_IMAGES  5000
-#define SERVER_MAX_ALTS    100
+#define SERVER_MAX_ALTS    50
 // +++++ Uplink handling (proxy mode)
-#define SERVER_UPLINK_FAIL_INCREASE 5 // On server failure, increase numFails by this value
-#define SERVER_BAD_UPLINK_THRES  40 // Thresold for numFails at which we ignore a server for the time span below
+#define SERVER_GLOBAL_DUP_TIME 6 // How many seconds to wait before changing global fail counter again
+#define SERVER_BAD_UPLINK_MIN 10 // Threshold for fails at which we start ignoring the server occasionally
+#define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
+#define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
 #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
 #define SERVER_MAX_UPLINK_QUEUE  1500 // Maximum number of queued requests per uplink
 #define SERVER_UPLINK_QUEUELEN_THRES  900 // Threshold where we start dropping incoming clients
@@ -33,7 +35,7 @@
 #define SERVER_RTT_PROBES 5 // How many probes to average over
 #define SERVER_RTT_INTERVAL_INIT 5 // Initial interval between probes
 #define SERVER_RTT_INTERVAL_MAX 45 // Maximum interval between probes
-#define SERVER_RTT_BACKOFF_COUNT 5 // If we can't reach any uplink server this many times, consider the uplink bad
+#define SERVER_RTT_MAX_UNREACH 10 // If no server was reachable this many times, stop RTT measurements for a while
 #define SERVER_RTT_INTERVAL_FAILED 180 // Interval to use if no uplink server is reachable for above many times
 
 #define SERVER_REMOTE_IMAGE_CHECK_CACHETIME 120 // 2 minutes
-- 
cgit v1.2.3-55-g7522


From 69f5bf408b9587a6e2008fba2224c2d506f1a895 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 27 Aug 2019 16:13:07 +0200
Subject: [SERVER] Use reference counting for uplink

First step towards less locking for proxy mode
---
 src/server/altservers.c |  13 ++-
 src/server/globals.h    |   4 +-
 src/server/image.c      |  39 ++++-----
 src/server/integrity.c  |  17 ++--
 src/server/net.c        |  48 +++++++----
 src/server/net.h        |   2 +
 src/server/reference.c  |  33 ++++++++
 src/server/reference.h  |  54 ++++++++++++
 src/server/reftypes.h   |  25 ++++++
 src/server/uplink.c     | 214 ++++++++++++++++++++++++++++--------------------
 src/server/uplink.h     |   2 +-
 11 files changed, 311 insertions(+), 140 deletions(-)
 create mode 100644 src/server/reference.c
 create mode 100644 src/server/reference.h
 create mode 100644 src/server/reftypes.h

(limited to 'src/server/net.c')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 493ed9e..7d7fdbe 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -7,6 +7,8 @@
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../serverconfig.h"
+#include "reference.h"
+
 #include <assert.h>
 #include <inttypes.h>
 #include <jansson.h>
@@ -104,7 +106,6 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 		return;
 	if ( uplink->current.fd != -1 && numAltServers <= 1 )
 		return;
-	int i;
 	// if betterFd != -1 it means the uplink is supposed to switch to another
 	// server. As this function here is called by the uplink thread, it can
 	// never be that the uplink is supposed to switch, but instead calls
@@ -112,11 +113,14 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 	assert( uplink->better.fd == -1 );
 	// it is however possible that an RTT measurement is currently in progress,
 	// so check for that case and do nothing if one is in progress
-	mutex_lock( &uplink->rttLock );
 	if ( uplink->rttTestResult != RTT_INPROGRESS ) {
-		threadpool_run( &altservers_runCheck, uplink );
+		dnbd3_uplink_t *current = ref_get_uplink( &uplink->image->uplinkref );
+		if ( current == uplink ) {
+			threadpool_run( &altservers_runCheck, uplink );
+		} else if ( current != NULL ) {
+			ref_put( &current->reference );
+		}
 	}
-	mutex_unlock( &uplink->rttLock );
 }
 
 /**
@@ -375,6 +379,7 @@ static void *altservers_runCheck(void *data)
 	assert( uplink != NULL );
 	setThreadName( "altserver-check" );
 	altservers_findUplinkInternal( uplink );
+	ref_put( &uplink->reference ); // Acquired in findUplinkAsync
 	// Save cache maps of all images if applicable
 	// TODO: Has nothing to do with alt servers really, maybe move somewhere else?
 	declare_now;
diff --git a/src/server/globals.h b/src/server/globals.h
index 4d97c6b..5dd205a 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -8,6 +8,7 @@
 #include <stdatomic.h>
 #include <time.h>
 #include <pthread.h>
+#include "reftypes.h"
 
 typedef struct timespec ticks;
 
@@ -64,6 +65,7 @@ typedef struct {
 #define RTT_NOT_REACHABLE 4 // No uplink was reachable
 struct _dnbd3_uplink
 {
+	ref reference;
 	dnbd3_server_connection_t current; // Currently active connection; fd == -1 means disconnected
 	dnbd3_server_connection_t better; // Better connection as found by altserver worker; fd == -1 means none
 	dnbd3_signal_t* signal;     // used to wake up the process
@@ -107,7 +109,7 @@ struct _dnbd3_image
 {
 	char *path;            // absolute path of the image
 	char *name;            // public name of the image (usually relative path minus revision ID)
-	dnbd3_uplink_t *uplink; // pointer to a server connection
+	weakref uplinkref;     // pointer to a server connection
 	uint8_t *cache_map;    // cache map telling which parts are locally cached, NULL if complete
 	uint64_t virtualFilesize;   // virtual size of image (real size rounded up to multiple of 4k)
 	uint64_t realFilesize;      // actual file size on disk
diff --git a/src/server/image.c b/src/server/image.c
index 1a6e0f8..5b58347 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -8,6 +8,7 @@
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../shared/crc32.h"
+#include "reference.h"
 
 #include <assert.h>
 #include <fcntl.h>
@@ -375,9 +376,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 
 	// Check if image is incomplete, handle
 	if ( candidate->cache_map != NULL ) {
-		if ( candidate->uplink == NULL ) {
-			uplink_init( candidate, -1, NULL, -1 );
-		}
+		uplink_init( candidate, -1, NULL, -1 );
 	}
 
 	return candidate; // We did all we can, hopefully it's working
@@ -484,17 +483,7 @@ void image_killUplinks()
 	mutex_lock( &imageListLock );
 	for (i = 0; i < _num_images; ++i) {
 		if ( _images[i] == NULL ) continue;
-		mutex_lock( &_images[i]->lock );
-		if ( _images[i]->uplink != NULL ) {
-			mutex_lock( &_images[i]->uplink->queueLock );
-			if ( !_images[i]->uplink->shutdown ) {
-				thread_detach( _images[i]->uplink->thread );
-				_images[i]->uplink->shutdown = true;
-			}
-			mutex_unlock( &_images[i]->uplink->queueLock );
-			signal_call( _images[i]->uplink->signal );
-		}
-		mutex_unlock( &_images[i]->lock );
+		uplink_shutdown( _images[i] );
 	}
 	mutex_unlock( &imageListLock );
 }
@@ -588,11 +577,15 @@ bool image_tryFreeAll()
 static dnbd3_image_t* image_free(dnbd3_image_t *image)
 {
 	assert( image != NULL );
+	assert( image->users == 0 );
 	if ( !_shutdown ) {
 		logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid );
 	}
-	//
-	uplink_shutdown( image );
+	// uplink_shutdown might return false to tell us
+	// that the shutdown is in progress. Bail out since
+	// this will get called again when the uplink is done.
+	if ( !uplink_shutdown( image ) )
+		return NULL;
 	mutex_lock( &image->lock );
 	free( image->cache_map );
 	free( image->crc32 );
@@ -860,7 +853,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->cache_map = cache_map;
 	image->crc32 = crc32list;
 	image->masterCrc32 = masterCrc;
-	image->uplink = NULL;
+	image->uplinkref = NULL;
 	image->realFilesize = realFilesize;
 	image->virtualFilesize = virtualFilesize;
 	image->rid = (uint16_t)revision;
@@ -1503,16 +1496,18 @@ json_t* image_getListAsJson()
 		mutex_lock( &image->lock );
 		idleTime = (int)timing_diff( &image->atime, &now );
 		completeness = image_getCompletenessEstimate( image );
-		if ( image->uplink == NULL ) {
+		mutex_unlock( &image->lock );
+		dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+		if ( uplink == NULL ) {
 			bytesReceived = 0;
 			uplinkName[0] = '\0';
 		} else {
-			bytesReceived = image->uplink->bytesReceived;
-			if ( !uplink_getHostString( image->uplink, uplinkName, sizeof(uplinkName) ) ) {
+			bytesReceived = uplink->bytesReceived;
+			if ( !uplink_getHostString( uplink, uplinkName, sizeof(uplinkName) ) ) {
 				uplinkName[0] = '\0';
 			}
+			ref_put( &uplink->reference );
 		}
-		mutex_unlock( &image->lock );
 
 		jsonImage = json_pack( "{sisssisisisisI}",
 				"id", image->id, // id, name, rid never change, so access them without locking
@@ -1734,7 +1729,7 @@ void image_closeUnusedFd()
 		if ( image == NULL )
 			continue;
 		mutex_lock( &image->lock );
-		if ( image->users == 0 && image->uplink == NULL && timing_reached( &image->atime, &deadline ) ) {
+		if ( image->users == 0 && image->uplinkref == NULL && timing_reached( &image->atime, &deadline ) ) {
 			snprintf( imgstr, sizeof(imgstr), "%s:%d", image->name, (int)image->rid );
 			fd = image->readFd;
 			image->readFd = -1;
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 3d1ac9b..f358c46 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -4,6 +4,7 @@
 #include "locks.h"
 #include "image.h"
 #include "uplink.h"
+#include "reference.h"
 
 #include <assert.h>
 #include <sys/syscall.h>
@@ -238,11 +239,13 @@ static void* integrity_main(void * data UNUSED)
 					if ( i + 1 == queueLen ) queueLen--;
 					// Mark as working again if applicable
 					if ( !foundCorrupted ) {
-						mutex_lock( &image->lock );
-						if ( image->uplink != NULL ) { // TODO: image_determineWorkingState() helper?
-							image->working = image->uplink->current.fd != -1 && image->readFd != -1;
+						dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+						if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper?
+							mutex_lock( &image->lock );
+							image->working = uplink->current.fd != -1 && image->readFd != -1;
+							mutex_unlock( &image->lock );
+							ref_put( &uplink->reference );
 						}
-						mutex_unlock( &image->lock );
 					}
 				} else {
 					// Still more blocks to go...
@@ -255,12 +258,8 @@ static void* integrity_main(void * data UNUSED)
 				// Something was fishy, make sure uplink exists
 				mutex_lock( &image->lock );
 				image->working = false;
-				bool restart = image->uplink == NULL || image->uplink->shutdown;
 				mutex_unlock( &image->lock );
-				if ( restart ) {
-					uplink_shutdown( image );
-					uplink_init( image, -1, NULL, -1 );
-				}
+				uplink_init( image, -1, NULL, -1 );
 			}
 			// Release :-)
 			image_release( image );
diff --git a/src/server/net.c b/src/server/net.c
index 4976eea..e0b516e 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -24,6 +24,7 @@
 #include "locks.h"
 #include "rpc.h"
 #include "altservers.h"
+#include "reference.h"
 
 #include "../shared/sockhelper.h"
 #include "../shared/timing.h"
@@ -229,7 +230,7 @@ void* net_handleNewConnection(void *clientPtr)
 		rid = serializer_get_uint16( &payload );
 		const uint8_t flags = serializer_get_uint8( &payload );
 		client->isServer = ( flags & FLAGS8_SERVER );
-		if ( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) {
+		if ( unlikely( request.size < 3 || !image_name || client_version < MIN_SUPPORTED_CLIENT ) ) {
 			if ( client_version < MIN_SUPPORTED_CLIENT ) {
 				logadd( LOG_DEBUG1, "Client %s too old", client->hostName );
 			} else {
@@ -257,22 +258,25 @@ void* net_handleNewConnection(void *clientPtr)
 			}
 			client->image = image;
 			atomic_thread_fence( memory_order_release );
-			if ( image == NULL ) {
+			if ( unlikely( image == NULL ) ) {
 				//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
-			} else if ( !image->working ) {
+			} else if ( unlikely( !image->working ) ) {
 				logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
 						client->hostName, image_name, (int)rid );
 			} else {
-				bool penalty;
 				// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
 				bOk = true;
 				if ( image->cache_map != NULL ) {
-					mutex_lock( &image->lock );
-					if ( image->uplink == NULL || image->uplink->cacheFd == -1 || image->uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+					if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					penalty = bOk && image->uplink != NULL && image->uplink->cacheFd == -1;
-					mutex_unlock( &image->lock );
+					bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 );
+					if ( uplink == NULL ) {
+						uplink_init( image, -1, NULL, 0 );
+					} else {
+						ref_put( &uplink->reference );
+					}
 					if ( penalty ) { // Wait 100ms if local caching is not working so this
 						usleep( 100000 ); // server gets a penalty and is less likely to be selected
 					}
@@ -300,7 +304,7 @@ void* net_handleNewConnection(void *clientPtr)
 		}
 	}
 
-	if ( bOk ) {
+	if ( likely( bOk ) ) {
 		// add artificial delay if applicable
 		if ( client->isServer && _serverPenalty != 0 ) {
 			usleep( _serverPenalty );
@@ -315,7 +319,7 @@ void* net_handleNewConnection(void *clientPtr)
 			case CMD_GET_BLOCK:;
 				const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
 				reply.handle = request.handle;
-				if ( offset >= image->virtualFilesize ) {
+				if ( unlikely( offset >= image->virtualFilesize ) ) {
 					// Sanity check
 					logadd( LOG_WARNING, "Client %s requested non-existent block", client->hostName );
 					reply.size = 0;
@@ -323,7 +327,7 @@ void* net_handleNewConnection(void *clientPtr)
 					send_reply( client->sock, &reply, NULL );
 					break;
 				}
-				if ( offset + request.size > image->virtualFilesize ) {
+				if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
 					// Sanity check
 					logadd( LOG_WARNING, "Client %s requested data block that extends beyond image size", client->hostName );
 					reply.size = 0;
@@ -398,7 +402,7 @@ void* net_handleNewConnection(void *clientPtr)
 				reply.size = request.size;
 
 				fixup_reply( reply );
-				const bool lock = image->uplink != NULL;
+				const bool lock = image->uplinkref != NULL;
 				if ( lock ) mutex_lock( &client->sendMutex );
 				// Send reply header
 				if ( send( client->sock, &reply, sizeof(dnbd3_reply_t), (request.size == 0 ? 0 : MSG_MORE) ) != sizeof(dnbd3_reply_t) ) {
@@ -696,9 +700,11 @@ static dnbd3_client_t* freeClientStruct(dnbd3_client_t *client)
 {
 	mutex_lock( &client->lock );
 	if ( client->image != NULL ) {
-		mutex_lock( &client->image->lock );
-		if ( client->image->uplink != NULL ) uplink_removeClient( client->image->uplink, client );
-		mutex_unlock( &client->image->lock );
+		dnbd3_uplink_t *uplink = ref_get_uplink( &client->image->uplinkref );
+		if ( uplink != NULL ) {
+			uplink_removeClient( uplink, client );
+			ref_put( &uplink->reference );
+		}
 	}
 	mutex_lock( &client->sendMutex );
 	if ( client->sock != -1 ) {
@@ -740,3 +746,15 @@ static bool addToList(dnbd3_client_t *client)
 	return true;
 }
 
+void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle)
+{
+	dnbd3_reply_t reply;
+	reply.magic = dnbd3_packet_magic;
+	reply.cmd = cmd;
+	reply.handle = handle;
+	reply.size = 0;
+	mutex_lock( &client->sendMutex );
+	send_reply( client->sock, &reply, NULL );
+	mutex_unlock( &client->sendMutex );
+}
+
diff --git a/src/server/net.h b/src/server/net.h
index 6813b49..7719aef 100644
--- a/src/server/net.h
+++ b/src/server/net.h
@@ -37,4 +37,6 @@ void net_disconnectAll();
 
 void net_waitForAllDisconnected();
 
+void net_sendReply(dnbd3_client_t *client, uint16_t cmd, uint64_t handle);
+
 #endif /* NET_H_ */
diff --git a/src/server/reference.c b/src/server/reference.c
new file mode 100644
index 0000000..468e00b
--- /dev/null
+++ b/src/server/reference.c
@@ -0,0 +1,33 @@
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+#include "reference.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+void ref_init( ref *reference, void ( *freefun )( ref * ), long count )
+{
+	reference->count = count;
+	reference->free = freefun;
+}
+
+_Noreturn void _ref_error( const char *message )
+{
+	fprintf( stderr, "Reference counter overflow\n" );
+	abort();
+}
+
+void ref_setref( weakref *weakref, ref *ref )
+{
+	union _aligned_ref_ *new_weakref = 0;
+	if ( ref ) {
+		( new_weakref = aligned_ref( ref->_aligned_ref ) )->ref = ref;
+		ref->count += sizeof( union _aligned_ref_ ) + 1;
+	}
+	char *old_weakref = (char *)atomic_exchange( weakref, new_weakref );
+	if ( !old_weakref )
+		return;
+	struct _ref_ *old_ref = aligned_ref( old_weakref )->ref;
+	old_ref->count += old_weakref - (char *)aligned_ref( old_weakref ) - sizeof( union _aligned_ref_ );
+	ref_put( old_ref );
+}
diff --git a/src/server/reference.h b/src/server/reference.h
new file mode 100644
index 0000000..0bc081a
--- /dev/null
+++ b/src/server/reference.h
@@ -0,0 +1,54 @@
+#ifndef _REFERENCE_H_
+#define _REFERENCE_H_
+
+#include "reftypes.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - (char *)&(((type *)NULL)->member)))
+
+void ref_init( ref *reference, void ( *freefun )( ref * ), long count );
+
+void ref_setref( weakref *weakref, ref *ref );
+
+_Noreturn void _ref_error( const char *message );
+
+static inline ref *ref_get( weakref *weakref )
+{
+	char *old_weakref = (char *)*weakref;
+	do {
+		if ( old_weakref == NULL )
+			return NULL;
+		if ( aligned_ref( old_weakref ) != aligned_ref( old_weakref + 1 ) ) {
+			old_weakref = (char *)*weakref;
+			continue;
+		}
+	} while ( !atomic_compare_exchange_weak( weakref, (void **)&old_weakref, old_weakref + 1 ) );
+	struct _ref_ *ref = aligned_ref( old_weakref )->ref;
+	if ( unlikely( ++ref->count == -1 ) ) {
+		_ref_error( "Reference counter overflow. Aborting.\n" );
+	}
+	char *cur_weakref = ( char * )*weakref;
+	do {
+		if ( aligned_ref( cur_weakref ) != aligned_ref( old_weakref ) ) {
+			ref->count--;
+			break;
+		}
+	} while ( !atomic_compare_exchange_weak( weakref, (void **)&cur_weakref, cur_weakref - 1 ) );
+	return ref;
+}
+
+static inline void ref_put( ref *ref )
+{
+	if ( --ref->count == 0 ) {
+		ref->free( ref );
+	}
+}
+
+#define ref_get_uplink(wr) ({ \
+	ref* ref = ref_get( wr ); \
+	ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \
+})
+
+#endif
diff --git a/src/server/reftypes.h b/src/server/reftypes.h
new file mode 100644
index 0000000..45c0c20
--- /dev/null
+++ b/src/server/reftypes.h
@@ -0,0 +1,25 @@
+#ifndef _REFTYPES_H_
+#define _REFTYPES_H_
+
+#include <stdatomic.h>
+
+_Static_assert( sizeof( void * ) == sizeof( _Atomic( void * ) ), "Atomic pointer bad" );
+
+typedef _Atomic( void * ) weakref;
+
+#define aligned_ref(ptr) \
+	((union _aligned_ref_ *)((ptr) - (uintptr_t)(ptr) % sizeof(union _aligned_ref_)))
+
+union _aligned_ref_ {
+	struct _ref_ *ref;
+	void *_padding[( 32 - 1 ) / sizeof( void * ) + 1];
+};
+
+typedef struct _ref_ {
+	_Atomic long count;
+	void ( *free )( struct _ref_ * );
+	char _padding[sizeof( union _aligned_ref_ )];
+	char _aligned_ref[sizeof( union _aligned_ref_ )];
+} ref;
+
+#endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index abfebf0..7a39887 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -3,10 +3,12 @@
 #include "locks.h"
 #include "image.h"
 #include "altservers.h"
+#include "net.h"
 #include "../shared/sockhelper.h"
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../shared/crc32.h"
+#include "reference.h"
 
 #include <assert.h>
 #include <inttypes.h>
@@ -45,6 +47,8 @@ static const char *const NAMES_ULR[4] = {
 
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
+static void cancelAllRequests(dnbd3_uplink_t *uplink);
+static void uplink_free(ref *ref);
 static void* uplink_mainloop(void *data);
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
 static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
@@ -76,19 +80,24 @@ uint64_t uplink_getTotalBytesReceived()
 bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version)
 {
 	if ( !_isProxy || _shutdown ) return false;
-	dnbd3_uplink_t *uplink = NULL;
 	assert( image != NULL );
 	mutex_lock( &image->lock );
-	if ( image->uplink != NULL && !image->uplink->shutdown ) {
+	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+	if ( uplink != NULL ) {
 		mutex_unlock( &image->lock );
-		if ( sock >= 0 ) close( sock );
+		if ( sock != -1 ) {
+			close( sock );
+		}
+		ref_put( &uplink->reference );
 		return true; // There's already an uplink, so should we consider this success or failure?
 	}
 	if ( image->cache_map == NULL ) {
 		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
 		goto failure;
 	}
-	uplink = image->uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
+	uplink = calloc( 1, sizeof(dnbd3_uplink_t) );
+	// Start with one reference for the uplink thread. We'll return it when the thread finishes
+	ref_init( &uplink->reference, uplink_free, 1 );
 	mutex_init( &uplink->queueLock, LOCK_UPLINK_QUEUE );
 	mutex_init( &uplink->rttLock, LOCK_UPLINK_RTT );
 	mutex_init( &uplink->sendMutex, LOCK_UPLINK_SEND );
@@ -121,12 +130,13 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		logadd( LOG_ERROR, "Could not start thread for new uplink." );
 		goto failure;
 	}
+	ref_setref( &image->uplinkref, &uplink->reference );
 	mutex_unlock( &image->lock );
 	return true;
 failure: ;
 	if ( uplink != NULL ) {
 		free( uplink );
-		uplink = image->uplink = NULL;
+		uplink = NULL;
 	}
 	mutex_unlock( &image->lock );
 	return false;
@@ -137,34 +147,83 @@ failure: ;
  * Calling it multiple times, even concurrently, will
  * not break anything.
  */
-void uplink_shutdown(dnbd3_image_t *image)
+bool uplink_shutdown(dnbd3_image_t *image)
 {
-	bool join = false;
-	pthread_t thread;
 	assert( image != NULL );
 	mutex_lock( &image->lock );
-	if ( image->uplink == NULL ) {
+	dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+	if ( uplink == NULL ) {
 		mutex_unlock( &image->lock );
-		return;
+		return true;
 	}
-	dnbd3_uplink_t * const uplink = image->uplink;
 	mutex_lock( &uplink->queueLock );
 	bool exp = false;
 	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
+		image->users++; // Prevent free while uplink shuts down
 		signal_call( uplink->signal );
-		thread = uplink->thread;
-		join = true;
+	} else {
+		logadd( LOG_ERROR, "This will never happen. '%s:%d'", image->name, (int)image->rid );
 	}
+	cancelAllRequests( uplink );
+	ref_setref( &image->uplinkref, NULL );
+	ref_put( &uplink->reference );
 	mutex_unlock( &uplink->queueLock );
-	bool wait = image->uplink != NULL;
+	bool retval = ( exp && image->users == 0 );
 	mutex_unlock( &image->lock );
-	if ( join ) thread_join( thread, NULL );
-	while ( wait ) {
-		usleep( 5000 );
-		mutex_lock( &image->lock );
-		wait = image->uplink != NULL && image->uplink->shutdown;
-		mutex_unlock( &image->lock );
+	return exp;
+}
+
+/**
+ * Cancel all requests of this uplink.
+ * HOLD QUEUE LOCK WHILE CALLING
+ */
+static void cancelAllRequests(dnbd3_uplink_t *uplink)
+{
+	for ( int i = 0; i < uplink->queueLen; ++i ) {
+		if ( uplink->queue[i].status != ULR_FREE ) {
+			net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle );
+			uplink->queue[i].status = ULR_FREE;
+		}
+	}
+	uplink->queueLen = 0;
+}
+
+static void uplink_free(ref *ref)
+{
+	dnbd3_uplink_t *uplink = container_of(ref, dnbd3_uplink_t, reference);
+	logadd( LOG_DEBUG1, "Freeing uplink for '%s:%d'", uplink->image->name, (int)uplink->image->rid );
+	assert( uplink->queueLen == 0 );
+	signal_close( uplink->signal );
+	if ( uplink->current.fd != -1 ) {
+		close( uplink->current.fd );
+		uplink->current.fd = -1;
+	}
+	if ( uplink->better.fd != -1 ) {
+		close( uplink->better.fd );
+		uplink->better.fd = -1;
+	}
+	mutex_destroy( &uplink->queueLock );
+	mutex_destroy( &uplink->rttLock );
+	mutex_destroy( &uplink->sendMutex );
+	free( uplink->recvBuffer );
+	uplink->recvBuffer = NULL;
+	if ( uplink->cacheFd != -1 ) {
+		close( uplink->cacheFd );
 	}
+	// TODO Requeue any requests
+	dnbd3_image_t *image = image_lock( uplink->image );
+	if ( image != NULL ) {
+		// != NULL means image is still in list...
+		if ( !_shutdown && image->cache_map != NULL ) {
+			// Ingegrity checker must have found something in the meantime
+			uplink_init( image, -1, NULL, 0 );
+		}
+		image_release( image );
+	}
+	// Finally let go of image. It was acquired either in uplink_shutdown or in the cleanup code
+	// of the uplink thread, depending on who set the uplink->shutdown flag.
+	image_release( image );
+	free( uplink ); // !!!
 }
 
 /**
@@ -193,31 +252,28 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
  */
 bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
 {
-	if ( client == NULL || client->image == NULL ) return false;
+	if ( client == NULL || client->image == NULL )
+		return false;
 	if ( length > (uint32_t)_maxPayload ) {
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
 	}
-	mutex_lock( &client->image->lock );
-	if ( client->image->uplink == NULL ) {
-		mutex_unlock( &client->image->lock );
+	dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref );
+	if ( uplink == NULL ) {
 		logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
 		return false;
 	}
-	dnbd3_uplink_t * const uplink = client->image->uplink;
 	if ( uplink->shutdown ) {
-		mutex_unlock( &client->image->lock );
 		logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
-		return false;
+		goto fail_ref;
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
 	if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
-		mutex_unlock( &client->image->lock );
 		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
-		return false;
+		goto fail_ref;
 	}
 
 	int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
@@ -229,7 +285,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	const uint64_t end = start + length;
 
 	mutex_lock( &uplink->queueLock );
-	mutex_unlock( &client->image->lock );
+	if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
+		goto fail_lock;
+	}
 	for (i = 0; i < uplink->queueLen; ++i) {
 		// find free slot to place this request into
 		if ( uplink->queue[i].status == ULR_FREE ) {
@@ -257,18 +315,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( unlikely( requestLoop ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
-		mutex_unlock( &uplink->queueLock );
 		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
-		return false;
+		goto fail_lock;
 	}
 	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
 		freeSlot = -1; // Not attaching to existing request, make it use a higher slot
 	}
 	if ( freeSlot == -1 ) {
 		if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
-			mutex_unlock( &uplink->queueLock );
 			logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
-			return false;
+			goto fail_lock;
 		}
 		freeSlot = uplink->queueLen++;
 	}
@@ -305,16 +361,16 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 #endif
 	mutex_unlock( &uplink->queueLock );
 
-	if ( foundExisting != -1 )
+	if ( foundExisting != -1 ) {
+		ref_put( &uplink->reference );
 		return true; // Attached to pending request, do nothing
-
-	usleep( 10000 );
+	}
 
 	// See if we can fire away the request
-	if ( mutex_trylock( &uplink->sendMutex ) != 0 ) {
+	if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) {
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
 	} else {
-		if ( uplink->current.fd == -1 ) {
+		if ( unlikely( uplink->current.fd == -1 ) ) {
 			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 		} else {
@@ -323,13 +379,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			if ( hops < 200 ) ++hops;
 			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
 			mutex_unlock( &uplink->sendMutex );
-			if ( !ret ) {
+			if ( unlikely( !ret ) ) {
 				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
 			} else {
 				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
 				int state;
 				mutex_lock( &uplink->queueLock );
-				if ( uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
+				if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
 					state = uplink->queue[freeSlot].status;
 					if ( uplink->queue[freeSlot].status == ULR_NEW ) {
 						uplink->queue[freeSlot].status = ULR_PENDING;
@@ -345,6 +401,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 				} else {
 					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
 				}
+				ref_put( &uplink->reference );
 				return true;
 			}
 			// Fall through to waking up sender thread
@@ -354,7 +411,13 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
 		logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
 	}
+	ref_put( &uplink->reference );
 	return true;
+fail_lock:
+	mutex_unlock( &uplink->queueLock );
+fail_ref:
+	ref_put( &uplink->reference );
+	return false;
 }
 
 /**
@@ -381,6 +444,7 @@ static void* uplink_mainloop(void *data)
 	//
 	assert( uplink != NULL );
 	setThreadName( "idle-uplink" );
+	thread_detach( uplink->thread );
 	blockNoncriticalSignals();
 	// Make sure file is open for writing
 	if ( !uplink_reopenCacheFd( uplink, false ) ) {
@@ -553,7 +617,7 @@ static void* uplink_mainloop(void *data)
 			for (i = 0; i < uplink->queueLen; ++i) {
 				if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
 					snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
-							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)link, i, uplink->queue[i].client->image->name,
+							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
 							uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
 					uplink->queue[i].entered = now;
 #ifdef _DEBUG_RESEND_STARVING
@@ -572,55 +636,26 @@ static void* uplink_mainloop(void *data)
 #endif
 	}
 	cleanup: ;
-	// Detach depends on whether someone is joining this thread...
-	bool exp = false;
-	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
-		thread_detach( uplink->thread );
-	}
 	uplink_saveCacheMap( uplink );
 	dnbd3_image_t *image = uplink->image;
 	mutex_lock( &image->lock );
-	// in the list anymore, but we want to prevent it from being freed in either case
-	if ( image->uplink == uplink ) {
-		image->uplink = NULL;
-	}
-	mutex_unlock( &image->lock ); // Do NOT use image without locking it
-	mutex_lock( &uplink->queueLock );
-	// Wait for active RTT measurement to finish
-	while ( uplink->rttTestResult == RTT_INPROGRESS ) {
-		usleep( 10000 );
-	}
-	signal_close( uplink->signal );
-	mutex_lock( &uplink->rttLock );
-	mutex_lock( &uplink->sendMutex );
-	if ( uplink->current.fd != -1 ) {
-		close( uplink->current.fd );
-		uplink->current.fd = -1;
-	}
-	if ( uplink->better.fd != -1 ) {
-		close( uplink->better.fd );
-		uplink->better.fd = -1;
+	bool exp = false;
+	if ( atomic_compare_exchange_strong( &uplink->shutdown, &exp, true ) ) {
+		image->users++; // We set the flag - hold onto image
 	}
-	mutex_unlock( &uplink->sendMutex );
-	mutex_unlock( &uplink->rttLock );
-	mutex_unlock( &uplink->queueLock );
-	mutex_destroy( &uplink->queueLock );
-	mutex_destroy( &uplink->rttLock );
-	mutex_destroy( &uplink->sendMutex );
-	free( uplink->recvBuffer );
-	uplink->recvBuffer = NULL;
-	if ( uplink->cacheFd != -1 ) {
-		close( uplink->cacheFd );
+	dnbd3_uplink_t *current = ref_get_uplink( &image->uplinkref );
+	if ( current == uplink ) { // Set NULL if it's still us...
+		mutex_lock( &uplink->queueLock );
+		cancelAllRequests( uplink );
+		mutex_unlock( &uplink->queueLock );
+		ref_setref( &image->uplinkref, NULL );
 	}
-	free( uplink ); // !!!
-	if ( image_lock( image ) != NULL ) {
-		// Image is still in list...
-		if ( !_shutdown && image->cache_map != NULL ) {
-			// Ingegrity checker must have found something in the meantime
-			uplink_init( image, -1, NULL, 0 );
-		}
-		image_release( image );
+	if ( current != NULL ) { // Decrease ref in any case
+		ref_put( &current->reference );
 	}
+	mutex_unlock( &image->lock );
+	// Finally as the thread is done, decrease our own ref that we initialized with
+	ref_put( &uplink->reference );
 	return NULL ;
 }
 
@@ -637,7 +672,7 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
 		/*
 		logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
-				(void*)link, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
+				(void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
 		*/
 		mutex_unlock( &uplink->queueLock );
 		if ( hops < 200 ) ++hops;
@@ -782,7 +817,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 
 /**
  * Receive data from uplink server and process/dispatch
- * Locks on: link.lock, images[].lock
+ * Locks on: uplink.lock, images[].lock
  */
 static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
@@ -924,13 +959,16 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 				}
 				mutex_unlock( &client->sendMutex );
 				mutex_lock( &uplink->queueLock );
+				if ( i > uplink->queueLen ) {
+					uplink->queueLen = i; // Might have been set to 0 by cancelAllRequests
+				}
 			}
 			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
 		mutex_unlock( &uplink->queueLock );
 #ifdef _DEBUG
 		if ( !served && start != uplink->replicationHandle ) {
-			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)link, uplink->image->name, start, end );
+			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
 		}
 #endif
 		if ( start == uplink->replicationHandle ) {
diff --git a/src/server/uplink.h b/src/server/uplink.h
index acc8e11..49ff0b4 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -14,7 +14,7 @@ void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
 
 bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
 
-void uplink_shutdown(dnbd3_image_t *image);
+bool uplink_shutdown(dnbd3_image_t *image);
 
 bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len);
 
-- 
cgit v1.2.3-55-g7522


From ac1bf45ebdd630fbc9ad2c1fa3c0ea99f5206799 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 28 Aug 2019 13:07:13 +0200
Subject: [SERVER] Make signal handling more POSIX

According to POSIX, a signal sent to a PID can be delivered to an
arbitrary thread of that process that has not blocked the signal. This
seems to never happen on Linux, but would mess things up since the code
expected the main signal handler to only be executed by the main thread.
This should now be fixed by examining the sender PID of the signal
as well as the ID of the thread currently running the signal handler. If
we notice the signal wasn't sent by our own PID and the handler is not
currently run by the main thread, we re-send the signal to the main
thread. Otherwise, if the signal was sent by our own PID but the handler
is not run in the main thread, do nothing. This way we can use
pthread_kill() to wake up threads that might be stuck in a blocking
syscall when it's time to shut down.
---
 src/server/globals.h    |  1 +
 src/server/image.c      | 10 ++--------
 src/server/integrity.c  | 17 +++++++++++++----
 src/server/net.c        | 11 ++++++-----
 src/server/rpc.c        | 13 ++++++++-----
 src/server/server.c     | 22 +++++++++++++++++-----
 src/server/threadpool.c | 28 ++++++++++++++++++++++------
 src/server/threadpool.h |  5 +++++
 8 files changed, 74 insertions(+), 33 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/globals.h b/src/server/globals.h
index 5dd205a..f940666 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -138,6 +138,7 @@ struct _dnbd3_client
 	char hostName[HOSTNAMELEN];       // inet_ntop version of host
 	pthread_mutex_t sendMutex;        // Held while writing to sock if image is incomplete (since uplink uses socket too)
 	pthread_mutex_t lock;
+	pthread_t thread;
 };
 
 // #######################################################
diff --git a/src/server/image.c b/src/server/image.c
index de93cd4..248c12c 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -562,9 +562,7 @@ bool image_tryFreeAll()
 		if ( _images[i] != NULL && _images[i]->users == 0 ) {
 			dnbd3_image_t *image = _images[i];
 			_images[i] = NULL;
-			mutex_unlock( &imageListLock );
 			image = image_free( image );
-			mutex_lock( &imageListLock );
 		}
 		if ( i + 1 == _num_images && _images[i] == NULL ) _num_images--;
 	}
@@ -574,15 +572,13 @@ bool image_tryFreeAll()
 
 /**
  * Free image. DOES NOT check if it's in use.
- * Indirectly locks on imageListLock, image.lock, uplink.queueLock
+ * (Indirectly) locks on image.lock, uplink.queueLock
  */
 static dnbd3_image_t* image_free(dnbd3_image_t *image)
 {
 	assert( image != NULL );
 	assert( image->users == 0 );
-	if ( !_shutdown ) {
-		logadd( LOG_INFO, "Freeing image %s:%d", image->name, (int)image->rid );
-	}
+	logadd( ( _shutdown ? LOG_DEBUG1 : LOG_INFO ), "Freeing image %s:%d", image->name, (int)image->rid );
 	// uplink_shutdown might return false to tell us
 	// that the shutdown is in progress. Bail out since
 	// this will get called again when the uplink is done.
@@ -600,8 +596,6 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	mutex_unlock( &image->lock );
 	if ( image->readFd != -1 ) close( image->readFd );
 	mutex_destroy( &image->lock );
-	//
-	memset( image, 0, sizeof(*image) );
 	free( image );
 	return NULL ;
 }
diff --git a/src/server/integrity.c b/src/server/integrity.c
index f358c46..e7ebeb2 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -184,13 +184,20 @@ static void* integrity_main(void * data UNUSED)
 							mutex_unlock( &image->lock );
 						}
 #if defined(linux) || defined(__linux)
-						if ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 ) {
+						while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
 #else
-						if ( fsync( fd ) == -1 ) {
+						while ( fsync( fd ) == -1 )
 #endif
-							logadd( LOG_ERROR, "Cannot flush %s for integrity check", image->path );
+						{
+							if ( _shutdown )
+								break;
+							if ( errno == EINTR )
+								continue;
+							logadd( LOG_ERROR, "Cannot flush %s for integrity check (errno=%d)", image->path, errno );
 							exit( 1 );
 						}
+						if ( _shutdown )
+							break;
 						// Use direct I/O only if read length is multiple of 4096 to be on the safe side
 						int tfd;
 						if ( direct && ( end % DNBD3_BLOCK_SIZE ) == 0 ) {
@@ -266,7 +273,9 @@ static void* integrity_main(void * data UNUSED)
 		}
 	}
 	mutex_unlock( &integrityQueueLock );
-	if ( buffer != NULL ) free( buffer );
+	if ( buffer != NULL ) {
+		free( buffer );
+	}
 	bRunning = false;
 	return NULL;
 }
diff --git a/src/server/net.c b/src/server/net.c
index e0b516e..9c855e4 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -44,6 +44,7 @@
 #include <jansson.h>
 #include <inttypes.h>
 #include <stdatomic.h>
+#include <signal.h>
 
 static dnbd3_client_t *_clients[SERVER_MAX_CLIENTS];
 static int _num_clients = 0;
@@ -153,6 +154,7 @@ void* net_handleNewConnection(void *clientPtr)
 {
 	dnbd3_client_t * const client = (dnbd3_client_t *)clientPtr;
 	dnbd3_request_t request;
+	client->thread = pthread_self();
 
 	// Await data from client. Since this is a fresh connection, we expect data right away
 	sock_setTimeout( client->sock, _clientTimeout );
@@ -631,11 +633,10 @@ void net_disconnectAll()
 	int i;
 	mutex_lock( &_clients_lock );
 	for (i = 0; i < _num_clients; ++i) {
-		if ( _clients[i] == NULL ) continue;
-		dnbd3_client_t * const client = _clients[i];
-		mutex_lock( &client->lock );
-		if ( client->sock >= 0 ) shutdown( client->sock, SHUT_RDWR );
-		mutex_unlock( &client->lock );
+		if ( _clients[i] == NULL )
+			continue;
+		shutdown( _clients[i]->sock, SHUT_RDWR );
+		pthread_kill( _clients[i]->thread, SIGINT );
 	}
 	mutex_unlock( &_clients_lock );
 }
diff --git a/src/server/rpc.c b/src/server/rpc.c
index 261c6c0..662263e 100644
--- a/src/server/rpc.c
+++ b/src/server/rpc.c
@@ -137,13 +137,13 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 	bool hasName = false;
 	bool ok;
 	int keepAlive = HTTP_KEEPALIVE;
-	do {
+	while ( !_shutdown ) {
 		// Read request from client
 		struct phr_header headers[100];
 		size_t numHeaders, prevLen = 0, consumed;
 		struct string method, path;
 		int minorVersion;
-		do {
+		while ( !_shutdown ) {
 			// Parse before calling recv, there might be a complete pipelined request in the buffer already
 			// If the request is incomplete, we allow exactly one additional recv() to complete it.
 			// This should suffice for real world scenarios as I don't know of any
@@ -188,7 +188,9 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 				sendReply( sock, "400 Bad Request", "text/plain", "Server cannot understand what you're trying to say", -1, HTTP_CLOSE );
 				goto func_return;
 			}
-		} while ( true );
+		} // Loop while request header incomplete
+		if ( _shutdown )
+			break;
 		if ( keepAlive == HTTP_KEEPALIVE ) {
 			// Only keep the connection alive (and indicate so) if the client seems to support this
 			if ( minorVersion == 0 || hasHeaderValue( headers, numHeaders, &STR_CONNECTION, &STR_CLOSE ) ) {
@@ -213,7 +215,8 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			} else {
 				ok = sendReply( sock, "404 Not found", "text/plain", "Nothing", -1, keepAlive );
 			}
-			if ( !ok ) break;
+			if ( !ok )
+				break;
 		}
 		// hoff might be beyond end if the client sent another request (burst)
 		const ssize_t extra = hoff - consumed;
@@ -225,7 +228,7 @@ void rpc_sendStatsJson(int sock, dnbd3_host_t* host, const void* data, const int
 			hasName = true;
 			setThreadName( "HTTP" );
 		}
-	} while (true);
+	} // Loop while more requests
 func_return:;
 	do {
 		const int curCount = --status.count;
diff --git a/src/server/server.c b/src/server/server.c
index 1cdd2ab..0dddea7 100644
--- a/src/server/server.c
+++ b/src/server/server.c
@@ -37,6 +37,8 @@
 #include <signal.h>
 #include <getopt.h>
 #include <assert.h>
+#include <sys/types.h>
+#include <unistd.h>
 
 #define LONGOPT_CRC4       1000
 #define LONGOPT_ASSERT     1001
@@ -60,6 +62,7 @@ static _Atomic(job_t *) newJob;
 static bool hasTimerThread = false;
 static pthread_t timerThread;
 
+static pid_t mainPid;
 static pthread_t mainThread;
 
 #define DEFAULT_TIMER_TIMEOUT (60)
@@ -138,7 +141,7 @@ _Noreturn static void dnbd3_cleanup()
 	logadd( LOG_INFO, "Cleanup..." );
 
 	if ( hasTimerThread ) {
-		pthread_kill( timerThread, SIGHUP );
+		pthread_kill( timerThread, SIGINT );
 		thread_join( timerThread, NULL );
 	}
 
@@ -162,6 +165,8 @@ _Noreturn static void dnbd3_cleanup()
 	// Wait for clients to disconnect
 	net_waitForAllDisconnected();
 
+	threadpool_waitEmpty();
+
 	// Clean up images
 	retries = 5;
 	while ( !image_tryFreeAll() && --retries > 0 ) {
@@ -204,6 +209,7 @@ int main(int argc, char *argv[])
 			{ 0, 0, 0, 0 }
 	};
 
+	mainPid = getpid();
 	mainThread = pthread_self();
 	opt = getopt_long( argc, argv, optString, longOpts, &longIndex );
 
@@ -509,10 +515,16 @@ static void dnbd3_handleSignal(int signum)
 
 static void dnbd3_handleSignal2(int signum, siginfo_t *info, void *data UNUSED)
 {
-	if ( !pthread_equal( pthread_self(), mainThread ) )
-		return;
-	memcpy( &lastSignal, info, sizeof(siginfo_t) );
-	dnbd3_handleSignal( signum );
+	if ( info->si_pid != mainPid ) { // Source is not this process
+		memcpy( &lastSignal, info, sizeof(siginfo_t) ); // Copy signal info
+		if ( info->si_pid != 0 && !pthread_equal( pthread_self(), mainThread ) ) {
+			pthread_kill( mainThread, info->si_signo ); // And relay signal if we're not the main thread
+		}
+	}
+	if ( pthread_equal( pthread_self(), mainThread ) ) {
+		// Signal received by main thread -- handle
+		dnbd3_handleSignal( signum );
+	}
 }
 
 uint32_t dnbd3_serverUptime()
diff --git a/src/server/threadpool.c b/src/server/threadpool.c
index 3947677..0b46fd6 100644
--- a/src/server/threadpool.c
+++ b/src/server/threadpool.c
@@ -15,6 +15,7 @@ static void *threadpool_worker(void *entryPtr);
 static pthread_attr_t threadAttrs;
 static atomic_int maxIdleThreads = -1;
 static _Atomic(entry_t *) *pool = NULL;
+static atomic_int activeThreads = 0;
 
 bool threadpool_init(int maxIdle)
 {
@@ -34,10 +35,9 @@ bool threadpool_init(int maxIdle)
 
 void threadpool_close()
 {
-	_shutdown = true;
-	int max = maxIdleThreads;
-	maxIdleThreads = -1;
-	if ( max <= 0 ) return;
+	int max = atomic_exchange( &maxIdleThreads, -1 );
+	if ( max <= 0 )
+		return;
 	for ( int i = 0; i < max; ++i ) {
 		entry_t *cur = pool[i];
 		if ( cur != NULL && atomic_compare_exchange_strong( &pool[i], &cur, NULL ) ) {
@@ -46,9 +46,23 @@ void threadpool_close()
 	}
 }
 
+void threadpool_waitEmpty()
+{
+	if ( activeThreads == 0 )
+		return;
+	do {
+		sleep( 1 );
+		logadd( LOG_INFO, "Threadpool: %d threads still active", (int)activeThreads );
+	} while ( activeThreads != 0 );
+}
+
 bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 {
-	if ( startRoutine == NULL ) {
+	if ( unlikely( _shutdown ) ) {
+		logadd( LOG_MINOR, "Cannot submit work to threadpool while shutting down!" );
+		return false;
+	}
+	if ( unlikely( startRoutine == NULL ) ) {
 		logadd( LOG_ERROR, "Trying to queue work for thread pool with NULL startRoutine" );
 		return false; // Or bail out!?
 	}
@@ -60,7 +74,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 			break;
 		}
 	}
-	if ( entry == NULL ) {
+	if ( unlikely( entry == NULL ) ) {
 		entry = malloc( sizeof(entry_t) );
 		if ( entry == NULL ) {
 			logadd( LOG_WARNING, "Could not alloc entry_t for new thread\n" );
@@ -78,6 +92,7 @@ bool threadpool_run(void *(*startRoutine)(void *), void *arg)
 			free( entry );
 			return false;
 		}
+		activeThreads++;
 	}
 	entry->startRoutine = startRoutine;
 	entry->arg = arg;
@@ -130,6 +145,7 @@ keep_going:;
 	}
 	signal_close( entry->signal );
 	free( entry );
+	activeThreads--;
 	return NULL;
 }
 
diff --git a/src/server/threadpool.h b/src/server/threadpool.h
index 15dd151..ee0b3aa 100644
--- a/src/server/threadpool.h
+++ b/src/server/threadpool.h
@@ -17,6 +17,11 @@ bool threadpool_init(int maxIdleThreadCount);
  */
 void threadpool_close();
 
+/**
+ * Block until all threads spawned have exited
+ */
+void threadpool_waitEmpty();
+
 /**
  * Run a thread using the thread pool.
  * @param startRoutine function to run in new thread
-- 
cgit v1.2.3-55-g7522


From 88695877f085af475a6ca8a01c2fbb08eb5b15da Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 29 Aug 2019 14:49:18 +0200
Subject: [SERVER] Use weakref for cache maps

Gets rid of a bunch of locking, especially the hot path in net.c where
clients are requesting data. Many clients using the same incomplete
image previously created a bottleneck here.
---
 src/server/globals.h   |  10 ++-
 src/server/image.c     | 208 +++++++++++++++++++++++++++++++------------------
 src/server/image.h     |   2 +-
 src/server/integrity.c |  10 ++-
 src/server/net.c       |  81 +++++++++----------
 src/server/reference.h |   5 ++
 src/server/uplink.c    |  64 +++++++--------
 7 files changed, 220 insertions(+), 160 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/globals.h b/src/server/globals.h
index f940666..221af78 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -99,6 +99,12 @@ typedef struct
 	int permissions;
 } dnbd3_access_rule_t;
 
+typedef struct
+{
+	ref reference;
+	atomic_uint_least8_t map[];
+} dnbd3_cache_map_t;
+
 /**
  * Image struct. An image path could be something like
  * /mnt/images/rz/zfs/Windows7 ZfS.vmdk.r1
@@ -110,7 +116,7 @@ struct _dnbd3_image
 	char *path;            // absolute path of the image
 	char *name;            // public name of the image (usually relative path minus revision ID)
 	weakref uplinkref;     // pointer to a server connection
-	uint8_t *cache_map;    // cache map telling which parts are locally cached, NULL if complete
+	weakref ref_cacheMap;  // cache map telling which parts are locally cached, NULL if complete
 	uint64_t virtualFilesize;   // virtual size of image (real size rounded up to multiple of 4k)
 	uint64_t realFilesize;      // actual file size on disk
 	ticks atime;                // last access time
@@ -119,7 +125,7 @@ struct _dnbd3_image
 	uint32_t *crc32;       // list of crc32 checksums for each 16MiB block in image
 	uint32_t masterCrc32;  // CRC-32 of the crc-32 list
 	int readFd;            // used to read the image. Used from multiple threads, so use atomic operations (pread et al)
-	int completenessEstimate; // Completeness estimate in percent
+	atomic_int completenessEstimate; // Completeness estimate in percent
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
 	atomic_bool working;   // true if image exists and completeness is == 100% or a working upstream proxy is connected
diff --git a/src/server/image.c b/src/server/image.c
index 4eab1d2..1972f48 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -51,10 +51,18 @@ static bool image_clone(int sock, char *name, uint16_t revision, uint64_t imageS
 static bool image_calcBlockCrc32(const int fd, const size_t block, const uint64_t realFilesize, uint32_t *crc);
 static bool image_ensureDiskSpace(uint64_t size, bool force);
 
-static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
+static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, uint8_t * const cache_map);
+static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t fileSize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map);
 static void* closeUnusedFds(void*);
+static void allocCacheMap(dnbd3_image_t *image, bool complete);
+
+static void cmfree(ref *ref)
+{
+	dnbd3_cache_map_t *cache = container_of(ref, dnbd3_cache_map_t, reference);
+	logadd( LOG_DEBUG2, "Freeing a cache map" );
+	free( cache );
+}
 
 // ##########################################
 
@@ -70,7 +78,6 @@ void image_serverStartup()
 /**
  * Update cache-map of given image for the given byte range
  * start (inclusive) - end (exclusive)
- * Locks on: images[].lock
  */
 void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set)
 {
@@ -91,33 +98,55 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 	if ( start >= end )
 		return;
 	bool setNewBlocks = false;
-	uint64_t pos = start;
-	mutex_lock( &image->lock );
-	if ( image->cache_map == NULL ) {
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL ) {
 		// Image seems already complete
 		if ( set ) {
 			// This makes no sense
-			mutex_unlock( &image->lock );
-			logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache_map: %s", image->path );
+			logadd( LOG_DEBUG1, "image_updateCachemap(true) with no cache map: %s", image->path );
 			return;
 		}
 		// Recreate a cache map, set it to all 1 initially as we assume the image was complete
-		const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
-		image->cache_map = malloc( byteSize );
-		memset( image->cache_map, 0xff, byteSize );
-	}
-	while ( pos < end ) {
-		const size_t map_y = (int)( pos >> 15 );
-		const int map_x = (int)( (pos >> 12) & 7 ); // mod 8
-		const int bit_mask = 1 << map_x;
-		if ( set ) {
-			if ( (image->cache_map[map_y] & bit_mask) == 0 ) setNewBlocks = true;
-			image->cache_map[map_y] |= (uint8_t)bit_mask;
-		} else {
-			image->cache_map[map_y] &= (uint8_t)~bit_mask;
+		allocCacheMap( image, true );
+		cache = ref_get_cachemap( image );
+		if ( cache == NULL ) {
+			logadd( LOG_WARNING, "WHAT!!!?!?!= No cache map right after alloc?! %s", image->path );
+			return;
 		}
-		pos += DNBD3_BLOCK_SIZE;
 	}
+	// Set/unset
+	const uint64_t firstByteInMap = start >> 15;
+	const uint64_t lastByteInMap = (end - 1) >> 15;
+	uint64_t pos;
+	// First byte
+	uint8_t fb = 0, lb = 0;
+	for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
+		const int map_x = (pos >> 12) & 7; // mod 8
+		const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+		fb |= bit_mask;
+	}
+	// Last byte
+	for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+		const int map_x = (pos >> 12) & 7; // mod 8
+		const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+		lb |= bit_mask;
+	}
+	if ( set ) {
+		uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
+		uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
+		setNewBlocks = ( fo != cache->map[firstByteInMap] || lo != cache->map[lastByteInMap] );
+	} else {
+		atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
+		atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
+	}
+	const uint8_t nval = set ? 0xff : 0;
+	// Everything in between
+	for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+		if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+			setNewBlocks = true;
+		}
+	}
+	atomic_thread_fence( memory_order_release );
 	if ( setNewBlocks && image->crc32 != NULL ) {
 		// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
 		// for checking, even though this might lead to checking some hash block again, if it was
@@ -125,19 +154,14 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 		// First set start and end to borders of hash blocks
 		start &= ~(uint64_t)(HASH_BLOCK_SIZE - 1);
 		end = (end + HASH_BLOCK_SIZE - 1) & ~(uint64_t)(HASH_BLOCK_SIZE - 1);
-		pos = start;
-		while ( pos < end ) {
-			if ( image->cache_map == NULL ) break;
+		for ( pos = start; pos < end; pos += HASH_BLOCK_SIZE ) {
 			const int block = (int)( pos / HASH_BLOCK_SIZE );
-			if ( image_isHashBlockComplete( image->cache_map, block, image->realFilesize ) ) {
-				mutex_unlock( &image->lock );
+			if ( image_isHashBlockComplete( cache->map, block, image->realFilesize ) ) {
 				integrity_check( image, block );
-				mutex_lock( &image->lock );
 			}
-			pos += HASH_BLOCK_SIZE;
 		}
 	}
-	mutex_unlock( &image->lock );
+	ref_put( &cache->reference );
 }
 
 /**
@@ -149,20 +173,18 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 bool image_isComplete(dnbd3_image_t *image)
 {
 	assert( image != NULL );
-	mutex_lock( &image->lock );
 	if ( image->virtualFilesize == 0 ) {
-		mutex_unlock( &image->lock );
 		return false;
 	}
-	if ( image->cache_map == NULL ) {
-		mutex_unlock( &image->lock );
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL ) {
 		return true;
 	}
 	bool complete = true;
 	int j;
 	const int map_len_bytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	for (j = 0; j < map_len_bytes - 1; ++j) {
-		if ( image->cache_map[j] != 0xFF ) {
+		if ( cache->map[j] != 0xFF ) {
 			complete = false;
 			break;
 		}
@@ -177,18 +199,27 @@ bool image_isComplete(dnbd3_image_t *image)
 			for (j = 0; j < blocks_in_last_byte; ++j)
 				last_byte |= (uint8_t)(1 << j);
 		}
-		complete = ((image->cache_map[map_len_bytes - 1] & last_byte) == last_byte);
+		complete = ((cache->map[map_len_bytes - 1] & last_byte) == last_byte);
 	}
-	if ( !complete ) {
-		mutex_unlock( &image->lock );
+	ref_put( &cache->reference );
+	if ( !complete )
 		return false;
+	mutex_lock( &image->lock );
+	// Lock and make sure current cache map is still the one we saw complete
+	dnbd3_cache_map_t *current = ref_get_cachemap( image );
+	if ( current == cache ) {
+		// Set cache map NULL as it's complete
+		ref_setref( &image->ref_cacheMap, NULL );
+	}
+	if ( current != NULL ) {
+		ref_put( &current->reference );
 	}
-	char mapfile[PATHLEN] = "";
-	free( image->cache_map );
-	image->cache_map = NULL;
-	snprintf( mapfile, PATHLEN, "%s.map", image->path );
 	mutex_unlock( &image->lock );
-	unlink( mapfile );
+	if ( current == cache ) { // Successfully set cache map to NULL above
+		char mapfile[PATHLEN] = "";
+		snprintf( mapfile, PATHLEN, "%s.map", image->path );
+		unlink( mapfile );
+	}
 	return true;
 }
 
@@ -350,19 +381,18 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		img->rid = candidate->rid;
 		img->users = 1;
 		img->working = false;
+		img->ref_cacheMap = NULL;
 		mutex_init( &img->lock, LOCK_IMAGE );
 		if ( candidate->crc32 != NULL ) {
 			const size_t mb = IMGSIZE_TO_HASHBLOCKS( candidate->virtualFilesize ) * sizeof(uint32_t);
 			img->crc32 = malloc( mb );
 			memcpy( img->crc32, candidate->crc32, mb );
 		}
-		mutex_lock( &candidate->lock );
-		if ( candidate->cache_map != NULL ) {
-			const size_t mb = IMGSIZE_TO_MAPBYTES( candidate->virtualFilesize );
-			img->cache_map = malloc( mb );
-			memcpy( img->cache_map, candidate->cache_map, mb );
+		dnbd3_cache_map_t *cache = ref_get_cachemap( candidate );
+		if ( cache != NULL ) {
+			ref_setref( &img->ref_cacheMap, &cache->reference );
+			ref_put( &cache->reference );
 		}
-		mutex_unlock( &candidate->lock );
 		if ( image_addToList( img ) ) {
 			image_release( candidate );
 			candidate = img;
@@ -377,7 +407,7 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 	}
 
 	// Check if image is incomplete, handle
-	if ( candidate->cache_map != NULL ) {
+	if ( candidate->ref_cacheMap != NULL ) {
 		uplink_init( candidate, -1, NULL, -1 );
 	}
 
@@ -585,11 +615,10 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	if ( !uplink_shutdown( image ) )
 		return NULL;
 	mutex_lock( &image->lock );
-	free( image->cache_map );
+	ref_setref( &image->ref_cacheMap, NULL );
 	free( image->crc32 );
 	free( image->path );
 	free( image->name );
-	image->cache_map = NULL;
 	image->crc32 = NULL;
 	image->path = NULL;
 	image->name = NULL;
@@ -600,7 +629,7 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	return NULL ;
 }
 
-bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
+bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t realFilesize)
 {
 	if ( cacheMap == NULL ) return true;
 	const uint64_t end = (block + 1) * HASH_BLOCK_SIZE;
@@ -707,7 +736,7 @@ static bool image_load(char *base, char *path, int withUplink)
 {
 	int revision = -1;
 	struct stat st;
-	uint8_t *cache_map = NULL;
+	dnbd3_cache_map_t *cache = NULL;
 	uint32_t *crc32list = NULL;
 	dnbd3_image_t *existing = NULL;
 	int fdImage = -1;
@@ -790,7 +819,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	}
 
 	// 1. Allocate memory for the cache map if the image is incomplete
-	cache_map = image_loadCacheMap( path, virtualFilesize );
+	cache = image_loadCacheMap( path, virtualFilesize );
 
 	// XXX: Maybe try sha-256 or 512 first if you're paranoid (to be implemented)
 
@@ -802,7 +831,7 @@ static bool image_load(char *base, char *path, int withUplink)
 
 	// Check CRC32
 	if ( crc32list != NULL ) {
-		if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache_map ) ) {
+		if ( !image_checkRandomBlocks( 4, fdImage, realFilesize, crc32list, cache != NULL ? cache->map : NULL ) ) {
 			logadd( LOG_ERROR, "quick crc32 check of %s failed. Data corruption?", path );
 			doFullCheck = true;
 		}
@@ -826,7 +855,7 @@ static bool image_load(char *base, char *path, int withUplink)
 			crc32list = NULL;
 			function_return = true;
 			goto load_error; // Keep existing
-		} else if ( existing->cache_map != NULL && cache_map == NULL ) {
+		} else if ( existing->ref_cacheMap != NULL && cache == NULL ) {
 			// Just ignore that fact, if replication is really complete the cache map will be removed anyways
 			logadd( LOG_INFO, "Image '%s:%d' has no cache map on disk!", existing->name, (int)existing->rid );
 			function_return = true;
@@ -846,7 +875,8 @@ static bool image_load(char *base, char *path, int withUplink)
 	dnbd3_image_t *image = calloc( 1, sizeof(dnbd3_image_t) );
 	image->path = strdup( path );
 	image->name = strdup( imgName );
-	image->cache_map = cache_map;
+	image->ref_cacheMap = NULL;
+	ref_setref( &image->ref_cacheMap, &cache->reference );
 	image->crc32 = crc32list;
 	image->masterCrc32 = masterCrc;
 	image->uplinkref = NULL;
@@ -855,7 +885,7 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->rid = (uint16_t)revision;
 	image->users = 0;
 	image->readFd = -1;
-	image->working = (image->cache_map == NULL );
+	image->working = ( cache == NULL );
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
 	mutex_init( &image->lock, LOCK_IMAGE );
@@ -870,16 +900,16 @@ static bool image_load(char *base, char *path, int withUplink)
 	timing_gets( &image->atime, offset );
 
 	// Prevent freeing in cleanup
-	cache_map = NULL;
+	cache = NULL;
 	crc32list = NULL;
 
 	// Get rid of cache map if image is complete
-	if ( image->cache_map != NULL ) {
+	if ( image->ref_cacheMap != NULL ) {
 		image_isComplete( image );
 	}
 
 	// Image is definitely incomplete, initialize uplink worker
-	if ( image->cache_map != NULL ) {
+	if ( image->ref_cacheMap != NULL ) {
 		image->working = false;
 		if ( withUplink ) {
 			uplink_init( image, -1, NULL, -1 );
@@ -910,21 +940,22 @@ static bool image_load(char *base, char *path, int withUplink)
 load_error: ;
 	if ( existing != NULL ) existing = image_release( existing );
 	if ( crc32list != NULL ) free( crc32list );
-	if ( cache_map != NULL ) free( cache_map );
+	if ( cache != NULL ) free( cache );
 	if ( fdImage != -1 ) close( fdImage );
 	return function_return;
 }
 
-static uint8_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
+static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize)
 {
-	uint8_t *retval = NULL;
+	dnbd3_cache_map_t *retval = NULL;
 	char mapFile[strlen( imagePath ) + 10 + 1];
 	sprintf( mapFile, "%s.map", imagePath );
 	int fdMap = open( mapFile, O_RDONLY );
-	if ( fdMap >= 0 ) {
+	if ( fdMap != -1 ) {
 		const int map_size = IMGSIZE_TO_MAPBYTES( fileSize );
-		retval = calloc( 1, map_size );
-		const ssize_t rd = read( fdMap, retval, map_size );
+		retval = calloc( 1, sizeof(*retval) + map_size );
+		ref_init( &retval->reference, cmfree, 0 );
+		const ssize_t rd = read( fdMap, retval->map, map_size );
 		if ( map_size != rd ) {
 			logadd( LOG_WARNING, "Could only read %d of expected %d bytes of cache map of '%s'", (int)rd, (int)map_size, imagePath );
 			// Could not read complete map, that means the rest of the image file will be considered incomplete
@@ -985,7 +1016,7 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
 	return retval;
 }
 
-static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, uint8_t * const cache_map)
+static bool image_checkRandomBlocks(const int count, int fdImage, const int64_t realFilesize, uint32_t * const crc32list, atomic_uint_least8_t * const cache_map)
 {
 	// This checks the first block and (up to) count - 1 random blocks for corruption
 	// via the known crc32 list. This is very sloppy and is merely supposed to detect
@@ -1529,30 +1560,37 @@ json_t* image_getListAsJson()
 /**
  * Get completeness of an image in percent. Only estimated, not exact.
  * Returns: 0-100
- * DOES NOT LOCK, so make sure to do so before calling
  */
 int image_getCompletenessEstimate(dnbd3_image_t * const image)
 {
 	assert( image != NULL );
-	if ( image->cache_map == NULL ) return image->working ? 100 : 0;
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL )
+		return image->working ? 100 : 0;
+	const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	if ( unlikely( len == 0 ) ) {
+		ref_put( &cache->reference );
+		return 0;
+	}
 	declare_now;
 	if ( !timing_reached( &image->nextCompletenessEstimate, &now ) ) {
 		// Since this operation is relatively expensive, we cache the result for a while
+		ref_put( &cache->reference );
 		return image->completenessEstimate;
 	}
 	int i;
 	int percent = 0;
-	const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
-	if ( len == 0 ) return 0;
 	for ( i = 0; i < len; ++i ) {
-		if ( image->cache_map[i] == 0xff ) {
+		const uint8_t v = atomic_load_explicit( &cache->map[i], memory_order_relaxed );
+		if ( v == 0xff ) {
 			percent += 100;
-		} else if ( image->cache_map[i] != 0 ) {
+		} else if ( v != 0 ) {
 			percent += 50;
 		}
 	}
+	ref_put( &cache->reference );
 	image->completenessEstimate = percent / len;
-	timing_set( &image->nextCompletenessEstimate, &now, 8 + rand() % 32 );
+	timing_set( &image->nextCompletenessEstimate, &now, 4 + rand() % 16 );
 	return image->completenessEstimate;
 }
 
@@ -1744,3 +1782,21 @@ static void* closeUnusedFds(void* nix UNUSED)
 	}
 	return NULL;
 }
+
+static void allocCacheMap(dnbd3_image_t *image, bool complete)
+{
+	const uint8_t val = complete ? 0xff : 0;
+	const int byteSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+	dnbd3_cache_map_t *cache = malloc( sizeof(*cache) + byteSize );
+	ref_init( &cache->reference, cmfree, 0 );
+	memset( cache->map, val, byteSize );
+	mutex_lock( &image->lock );
+	if ( image->ref_cacheMap != NULL ) {
+		logadd( LOG_WARNING, "BUG: allocCacheMap called but there already is a cache map for %s:%d", image->name, (int)image->rid );
+		free( cache );
+	} else {
+		ref_setref( &image->ref_cacheMap, &cache->reference );
+	}
+	mutex_unlock( &image->lock );
+}
+
diff --git a/src/server/image.h b/src/server/image.h
index 4668eff..cd87f03 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -9,7 +9,7 @@ void image_serverStartup();
 
 bool image_isComplete(dnbd3_image_t *image);
 
-bool image_isHashBlockComplete(const uint8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
+bool image_isHashBlockComplete(atomic_uint_least8_t * const cacheMap, const uint64_t block, const uint64_t fileSize);
 
 void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, const bool set);
 
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 1fcb558..a9fbae6 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -181,10 +181,12 @@ static void* integrity_main(void * data UNUSED)
 						const uint64_t end = MIN( (uint64_t)(blocks[0] + 1) * HASH_BLOCK_SIZE, image->virtualFilesize );
 						bool complete = true;
 						if ( qCount == CHECK_ALL ) {
-							// When checking full image, skip incomplete blocks, otherwise assume block is complete
-							mutex_lock( &image->lock );
-							complete = image_isHashBlockComplete( image->cache_map, blocks[0], fileSize );
-							mutex_unlock( &image->lock );
+							dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+							if ( cache != NULL ) {
+								// When checking full image, skip incomplete blocks, otherwise assume block is complete
+								complete = image_isHashBlockComplete( cache->map, blocks[0], fileSize );
+								ref_put( &cache->reference );
+							}
 						}
 #if defined(linux) || defined(__linux)
 						while ( sync_file_range( fd, start, end - start, SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER ) == -1 )
diff --git a/src/server/net.c b/src/server/net.c
index 9c855e4..12bcdad 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -246,7 +246,7 @@ void* net_handleNewConnection(void *clientPtr)
 				// We're a proxy, client is another proxy, we don't do BGR, but connecting proxy does...
 				// Reject, as this would basically force this proxy to do BGR too.
 				image = image_get( image_name, rid, true );
-				if ( image != NULL && image->cache_map != NULL ) {
+				if ( image != NULL && image->ref_cacheMap != NULL ) {
 					// Only exception is if the image is complete locally
 					image = image_release( image );
 				}
@@ -268,7 +268,7 @@ void* net_handleNewConnection(void *clientPtr)
 			} else {
 				// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
 				bOk = true;
-				if ( image->cache_map != NULL ) {
+				if ( image->ref_cacheMap != NULL ) {
 					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
 					if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
 						bOk = ( rand() % 4 ) == 1;
@@ -338,57 +338,52 @@ void* net_handleNewConnection(void *clientPtr)
 					break;
 				}
 
-				if ( request.size != 0 && image->cache_map != NULL ) {
+				dnbd3_cache_map_t *cache;
+				if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) {
 					// This is a proxyed image, check if we need to relay the request...
 					start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					bool isCached = true;
-					mutex_lock( &image->lock );
-					// Check again as we only aquired the lock just now
-					if ( image->cache_map != NULL ) {
-						const uint64_t firstByteInMap = start >> 15;
-						const uint64_t lastByteInMap = (end - 1) >> 15;
-						uint64_t pos;
-						// Middle - quick checking
-						if ( isCached ) {
-							pos = firstByteInMap + 1;
-							while ( pos < lastByteInMap ) {
-								if ( image->cache_map[pos] != 0xff ) {
-									isCached = false;
-									break;
-								}
-								++pos;
+					const uint64_t firstByteInMap = start >> 15;
+					const uint64_t lastByteInMap = (end - 1) >> 15;
+					uint64_t pos;
+					uint8_t b;
+					atomic_thread_fence( memory_order_acquire );
+					// Middle - quick checking
+					if ( isCached ) {
+						for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+							if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+								isCached = false;
+								break;
 							}
 						}
-						// First byte
-						if ( isCached ) {
-							pos = start;
-							do {
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (image->cache_map[firstByteInMap] & bit_mask) == 0 ) {
-									isCached = false;
-									break;
-								}
-								pos += DNBD3_BLOCK_SIZE;
-							} while ( firstByteInMap == (pos >> 15) && pos < end );
+					}
+					// First byte
+					if ( isCached ) {
+						b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+						for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
+							const int map_x = (pos >> 12) & 7; // mod 8
+							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+							if ( (b & bit_mask) == 0 ) {
+								isCached = false;
+								break;
+							}
 						}
-						// Last byte - only check if request spans multiple bytes in cache map
-						if ( isCached && firstByteInMap != lastByteInMap ) {
-							pos = lastByteInMap << 15;
-							while ( pos < end ) {
-								assert( lastByteInMap == (pos >> 15) );
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (image->cache_map[lastByteInMap] & bit_mask) == 0 ) {
-									isCached = false;
-									break;
-								}
-								pos += DNBD3_BLOCK_SIZE;
+					}
+					// Last byte - only check if request spans multiple bytes in cache map
+					if ( isCached && firstByteInMap != lastByteInMap ) {
+						b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+						for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+							assert( lastByteInMap == (pos >> 15) );
+							const int map_x = (pos >> 12) & 7; // mod 8
+							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+							if ( (b & bit_mask) == 0 ) {
+								isCached = false;
+								break;
 							}
 						}
 					}
-					mutex_unlock( &image->lock );
+					ref_put( &cache->reference );
 					if ( !isCached ) {
 						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
 							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
diff --git a/src/server/reference.h b/src/server/reference.h
index 8883eb1..2a80955 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -51,4 +51,9 @@ static inline void ref_put( ref *ref )
 	ref == NULL ? NULL : container_of(ref, dnbd3_uplink_t, reference); \
 })
 
+#define ref_get_cachemap(image) ({ \
+	ref* ref = ref_get( &(image)->ref_cacheMap ); \
+	ref == NULL ? NULL : container_of(ref, dnbd3_cache_map_t, reference); \
+})
+
 #endif
diff --git a/src/server/uplink.c b/src/server/uplink.c
index d77be9c..0a6bd11 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -91,7 +91,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		ref_put( &uplink->reference );
 		return true; // There's already an uplink, so should we consider this success or failure?
 	}
-	if ( image->cache_map == NULL ) {
+	if ( image->ref_cacheMap == NULL ) {
 		logadd( LOG_WARNING, "Uplink was requested for image %s, but it is already complete", image->name );
 		goto failure;
 	}
@@ -170,7 +170,7 @@ bool uplink_shutdown(dnbd3_image_t *image)
 	mutex_unlock( &uplink->queueLock );
 	bool retval = ( exp && image->users == 0 );
 	mutex_unlock( &image->lock );
-	return exp;
+	return retval;
 }
 
 /**
@@ -214,7 +214,7 @@ static void uplink_free(ref *ref)
 	dnbd3_image_t *image = image_lock( uplink->image );
 	if ( image != NULL ) {
 		// != NULL means image is still in list...
-		if ( !_shutdown && image->cache_map != NULL ) {
+		if ( !_shutdown && image->ref_cacheMap != NULL ) {
 			// Ingegrity checker must have found something in the meantime
 			uplink_init( image, -1, NULL, 0 );
 		}
@@ -707,13 +707,14 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 	if ( uplink == NULL || uplink->current.fd == -1 ) return;
 	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
 	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
-		return;
+		return; // Already a replication request on the wire, or no more blocks to replicate
 	dnbd3_image_t * const image = uplink->image;
 	if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
-	mutex_lock( &image->lock );
-	if ( image == NULL || image->cache_map == NULL || image->users < _bgrMinClients ) {
-		// No cache map (=image complete), or replication pending, or not enough users, do nothing
-		mutex_unlock( &image->lock );
+	if ( image->users < _bgrMinClients ) return; // Not enough active users
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL || image->users < _bgrMinClients ) {
+		// No cache map (=image complete)
+		ref_put( &cache->reference );
 		return;
 	}
 	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
@@ -727,16 +728,18 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 			endByte = mapBytes;
 		}
 	}
+	atomic_thread_fence( memory_order_acquire );
 	int replicationIndex = -1;
 	for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
 		const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
-		if ( image->cache_map[i] != 0xff && ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+		if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+				&& ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
 			// Found incomplete one
 			replicationIndex = i;
 			break;
 		}
 	}
-	mutex_unlock( &image->lock );
+	ref_put( &cache->reference );
 	if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
 		// Nothing left in current block, find next one
 		replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
@@ -768,23 +771,24 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 }
 
 /**
- * find next index into cache_map that corresponds to the beginning
+ * find next index into cache map that corresponds to the beginning
  * of a hash block which is neither completely empty nor completely
  * replicated yet. Returns -1 if no match.
  */
 static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int startMapIndex)
 {
 	int retval = -1;
-	mutex_lock( &uplink->image->lock );
-	const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize );
-	const uint8_t *cache_map = uplink->image->cache_map;
-	if ( cache_map != NULL ) {
-		int j;
+	dnbd3_cache_map_t *cache = ref_get_cachemap( uplink->image );
+	if ( cache != NULL ) {
+		const int mapBytes = IMGSIZE_TO_MAPBYTES( uplink->image->virtualFilesize );
 		const int start = ( startMapIndex & MAP_INDEX_HASH_START_MASK );
+		atomic_thread_fence( memory_order_acquire );
+		int j;
 		for (j = 0; j < mapBytes; ++j) {
 			const int i = ( start + j ) % mapBytes;
-			const bool isFull = cache_map[i] == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock );
-			const bool isEmpty = cache_map[i] == 0;
+			const uint8_t b = atomic_load_explicit( &cache->map[i], memory_order_relaxed );
+			const bool isFull = b == 0xff || ( i + 1 == mapBytes && uplink->replicatedLastBlock );
+			const bool isEmpty = b == 0;
 			if ( !isEmpty && !isFull ) {
 				// Neither full nor empty, replicate
 				if ( retval == -1 ) {
@@ -811,7 +815,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 			retval = -1;
 		}
 	}
-	mutex_unlock( &uplink->image->lock );
+	ref_put( &cache->reference );
 	return retval;
 }
 
@@ -1107,7 +1111,7 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		if ( fsync( uplink->cacheFd ) == -1 ) {
 			// A failing fsync means we have no guarantee that any data
 			// since the last fsync (or open if none) has been saved. Apart
-			// from keeping the cache_map from the last successful fsync
+			// from keeping the cache map from the last successful fsync
 			// around and restoring it there isn't much we can do to recover
 			// a consistent state. Bail out.
 			logadd( LOG_ERROR, "fsync() on image file %s failed with errno %d", image->path, errno );
@@ -1116,21 +1120,13 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		}
 	}
 
-	if ( image->cache_map == NULL ) return true;
-	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
-	mutex_lock( &image->lock );
-	// Lock and get a copy of the cache map, as it could be freed by another thread that is just about to
-	// figure out that this image's cache copy is complete
-	if ( image->cache_map == NULL || image->virtualFilesize < DNBD3_BLOCK_SIZE ) {
-		mutex_unlock( &image->lock );
+	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+	if ( cache == NULL )
 		return true;
-	}
+	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
 	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
-	uint8_t *map = malloc( size );
-	memcpy( map, image->cache_map, size );
 	// Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
 	// cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O
-	mutex_unlock( &image->lock );
 	assert( image->path != NULL );
 	char mapfile[strlen( image->path ) + 4 + 1];
 	strcpy( mapfile, image->path );
@@ -1139,14 +1135,14 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 	int fd = open( mapfile, O_WRONLY | O_CREAT, 0644 );
 	if ( fd == -1 ) {
 		const int err = errno;
-		free( map );
+		ref_put( &cache->reference );
 		logadd( LOG_WARNING, "Could not open file to write cache map to disk (errno=%d) file %s", err, mapfile );
 		return false;
 	}
 
 	size_t done = 0;
 	while ( done < size ) {
-		const ssize_t ret = write( fd, map, size - done );
+		const ssize_t ret = write( fd, cache->map + done, size - done );
 		if ( ret == -1 ) {
 			if ( errno == EINTR ) continue;
 			logadd( LOG_WARNING, "Could not write cache map (errno=%d) file %s", errno, mapfile );
@@ -1158,11 +1154,11 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		}
 		done += (size_t)ret;
 	}
+	ref_put( &cache->reference );
 	if ( fsync( fd ) == -1 ) {
 		logadd( LOG_WARNING, "fsync() on image map %s failed with errno %d", mapfile, errno );
 	}
 	close( fd );
-	free( map );
 	return true;
 }
 
-- 
cgit v1.2.3-55-g7522


From 9d2d9c6de358b2cf1a602c999d2e0a7a664610f7 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Thu, 29 Aug 2019 23:05:26 +0200
Subject: [SERVER] Tear down whole uplink on idle timeout

Keeping the uplink thread around forever even though we
disconnected from the upstream server seems wasteful. Get
rid of this and tear down the uplink entirely.
---
 src/server/net.c    | 13 +++++--------
 src/server/uplink.c | 40 +++++++++++++++++++---------------------
 2 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/net.c b/src/server/net.c
index 12bcdad..00c9a8d 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -270,18 +270,15 @@ void* net_handleNewConnection(void *clientPtr)
 				bOk = true;
 				if ( image->ref_cacheMap != NULL ) {
 					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-					if ( uplink == NULL || uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+					if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					bool penalty = bOk && ( uplink == NULL || uplink->cacheFd == -1 );
-					if ( uplink == NULL ) {
-						uplink_init( image, -1, NULL, 0 );
-					} else {
-						ref_put( &uplink->reference );
-					}
-					if ( penalty ) { // Wait 100ms if local caching is not working so this
+					if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
 						usleep( 100000 ); // server gets a penalty and is less likely to be selected
 					}
+					if ( uplink != NULL ) {
+						ref_put( &uplink->reference );
+					}
 				}
 				if ( bOk ) {
 					mutex_lock( &image->lock );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 0a6bd11..58f8ea5 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -258,10 +258,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
 	}
-	dnbd3_uplink_t * const uplink = ref_get_uplink( &client->image->uplinkref );
-	if ( uplink == NULL ) {
-		logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
-		return false;
+	dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref );
+	if ( unlikely( uplink == NULL ) ) {
+		uplink_init( client->image, -1, NULL, -1 );
+		uplink = ref_get_uplink( &client->image->uplinkref );
+		if ( uplink == NULL ) {
+			logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+			return false;
+		}
 	}
 	if ( uplink->shutdown ) {
 		logadd( LOG_DEBUG1, "Uplink request for image with uplink shutting down" );
@@ -460,12 +464,15 @@ static void* uplink_mainloop(void *data)
 	events[EV_SIGNAL].events = POLLIN;
 	events[EV_SIGNAL].fd = signal_getWaitFd( uplink->signal );
 	events[EV_SOCKET].fd = -1;
+	if ( uplink->rttTestResult != RTT_DOCHANGE ) {
+		altservers_findUplink( uplink ); // In case we didn't kickstart
+	}
 	while ( !_shutdown && !uplink->shutdown ) {
 		// poll()
 		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
 		if ( waitTime == 0 ) {
 			// 0 means poll, since we're about to change the server
-		} else if ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) {
+		} else if ( uplink->current.fd == -1 ) {
 			waitTime = 1000;
 		} else {
 			declare_now;
@@ -568,32 +575,22 @@ static void* uplink_mainloop(void *data)
 				}
 			}
 			// Don't keep uplink established if we're idle for too much
-			if ( uplink->current.fd != -1 && uplink_connectionShouldShutdown( uplink ) ) {
-				mutex_lock( &uplink->sendMutex );
-				close( uplink->current.fd );
-				uplink->current.fd = -1;
-				mutex_unlock( &uplink->sendMutex );
-				uplink->cycleDetected = false;
-				if ( uplink->recvBufferLen != 0 ) {
-					uplink->recvBufferLen = 0;
-					free( uplink->recvBuffer );
-					uplink->recvBuffer = NULL;
-				}
+			if ( uplink_connectionShouldShutdown( uplink ) ) {
 				logadd( LOG_DEBUG1, "Closing idle uplink for image %s:%d", uplink->image->name, (int)uplink->image->rid );
-				setThreadName( "idle-uplink" );
+				goto cleanup;
 			}
 		}
 		// See if we should trigger an RTT measurement
 		rttTestResult = uplink->rttTestResult;
 		if ( rttTestResult == RTT_IDLE || rttTestResult == RTT_DONTCHANGE ) {
-			if ( timing_reached( &nextAltCheck, &now ) || ( uplink->current.fd == -1 && !uplink_connectionShouldShutdown( uplink ) ) || uplink->cycleDetected ) {
+			if ( timing_reached( &nextAltCheck, &now ) || uplink->current.fd == -1 || uplink->cycleDetected ) {
 				// It seems it's time for a check
 				if ( image_isComplete( uplink->image ) ) {
 					// Quit work if image is complete
 					logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
 					setThreadName( "finished-uplink" );
 					goto cleanup;
-				} else if ( !uplink_connectionShouldShutdown( uplink ) ) {
+				} else {
 					// Not complete - do measurement
 					altservers_findUplinkAsync( uplink ); // This will set RTT_INPROGRESS (synchronous)
 					if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
@@ -606,6 +603,9 @@ static void* uplink_mainloop(void *data)
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
 			atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE );
 			discoverFailCount++;
+			if ( uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
+				uplink->image->working = false;
+			}
 			timing_set( &nextAltCheck, &now, (discoverFailCount < SERVER_RTT_MAX_UNREACH ? altCheckInterval : SERVER_RTT_INTERVAL_FAILED) );
 		}
 #ifdef _DEBUG
@@ -1125,8 +1125,6 @@ static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink)
 		return true;
 	logadd( LOG_DEBUG2, "Saving cache map of %s:%d", image->name, (int)image->rid );
 	const size_t size = IMGSIZE_TO_MAPBYTES(image->virtualFilesize);
-	// Unlock. Use path and cacheFd without locking. path should never change after initialization of the image,
-	// cacheFd is owned by the uplink thread and we don't want to hold a spinlock during I/O
 	assert( image->path != NULL );
 	char mapfile[strlen( image->path ) + 4 + 1];
 	strcpy( mapfile, image->path );
-- 
cgit v1.2.3-55-g7522


From 543877c7fc17c0a881d6a85c76dfc17f8def7dff Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 4 Sep 2019 20:06:11 +0200
Subject: [SERVER] Support limiting alt-servers to specific namespace

Not really namespace but simple string matching for the image path. Path
is matched from start with no support for glob or regex, so usually you
want to have a trailing '/' to limit to certain directories.
---
 src/server/altservers.c | 51 +++++++++++++++++++++++++++++++++++--------------
 src/server/altservers.h |  4 ++--
 src/server/globals.h    |  8 ++++++++
 src/server/image.c      |  2 +-
 src/server/net.c        |  2 +-
 5 files changed, 49 insertions(+), 18 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 952af4f..943345c 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -23,7 +23,7 @@ static atomic_int numAltServers = 0;
 static pthread_mutex_t altServersLock;
 
 static void *altservers_runCheck(void *data);
-static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current);
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current);
 static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink);
 static uint32_t altservers_updateRtt(dnbd3_uplink_t *uplink, int index, uint32_t rtt);
 static void altservers_imageFailed(dnbd3_uplink_t *uplink, int server);
@@ -86,6 +86,13 @@ static int addAltFromIni(void *countptr, const char* section, const char* key, c
 		}
 	} else if ( strcmp( key, "comment" ) == 0 ) {
 		snprintf( altServers[index].comment, COMMENT_LENGTH, "%s", value );
+	} else if ( strcmp( key, "namespace" ) == 0 ) {
+		dnbd3_ns_t *elem = malloc( sizeof(*elem) );
+		elem->name = strdup( value );
+		elem->len = strlen( value );
+		do {
+			elem->next = altServers[index].nameSpaces;
+		} while ( !atomic_compare_exchange_weak( &altServers[index].nameSpaces, &elem->next, elem ) );
 	} else {
 		logadd( LOG_DEBUG1, "Unknown key in alt-servers section: '%s'", key );
 	}
@@ -139,6 +146,7 @@ bool altservers_add(dnbd3_host_t *host, const char *comment, const int isPrivate
 	altServers[freeSlot].host = *host;
 	altServers[freeSlot].isPrivate = isPrivate;
 	altServers[freeSlot].isClientOnly = isClientOnly;
+	altServers[freeSlot].nameSpaces = NULL;
 	if ( comment != NULL ) snprintf( altServers[freeSlot].comment, COMMENT_LENGTH, "%s", comment );
 	mutex_unlock( &altServersLock );
 	*index = freeSlot;
@@ -171,15 +179,28 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink)
 	}
 }
 
+static bool isImageAllowed(dnbd3_alt_server_t *alt, const char *image)
+{
+	if ( alt->nameSpaces == NULL )
+		return true;
+	for ( dnbd3_ns_t *it = alt->nameSpaces; it != NULL; it = it->next ) {
+		if ( strncmp( it->name, image, it->len ) == 0 )
+			return true;
+	}
+	return false;
+}
+
 /**
  * Get <size> known (working) alt servers, ordered by network closeness
  * (by finding the smallest possible subnet)
  * Private servers are excluded, so this is what you want to call to
  * get a list of servers you can tell a client about
  */
-int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size)
+int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size)
 {
-	if ( host == NULL || host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 ) return 0;
+	dnbd3_host_t *host = &client->host;
+	if ( host->type == 0 || numAltServers == 0 || output == NULL || size <= 0 )
+		return 0;
 	int i, j;
 	int count = 0;
 	uint16_t scores[SERVER_MAX_ALTS] = { 0 };
@@ -188,11 +209,9 @@ int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output
 	for ( i = 0; i < numAltServers; ++i ) {
 		if ( altServers[i].host.type == 0 || altServers[i].isPrivate )
 			continue; // Slot is empty or uplink is for replication only
-		if ( host->type == altServers[i].host.type ) {
-			scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) );
-		} else {
-			scores[i] = 1; // Wrong address family
-		}
+		if ( !isImageAllowed( &altServers[i], client->image->name ) )
+			continue;
+		scores[i] = (uint16_t)( 10 + altservers_netCloseness( host, &altServers[i].host ) );
 	}
 	while ( count < size ) {
 		i = -1;
@@ -244,10 +263,10 @@ static bool isUsableForUplink( dnbd3_uplink_t *uplink, int server, ticks *now )
 	return fails < SERVER_BAD_UPLINK_MIN || ( rand() % fails ) < SERVER_BAD_UPLINK_MIN;
 }
 
-int altservers_getHostListForReplication(dnbd3_host_t *servers, int size)
+int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size)
 {
 	int idx[size];
-	int num = altservers_getListForUplink( NULL, idx, size, -1 );
+	int num = altservers_getListForUplink( NULL, image, idx, size, -1 );
 	for ( int i = 0; i < num; ++i ) {
 		servers[i] = altServers[i].host;
 	}
@@ -261,7 +280,7 @@ int altservers_getHostListForReplication(dnbd3_host_t *servers, int size)
  * it includes private servers and ignores any "client only" servers
  * @param current index of server for current connection, or -1 in panic mode
  */
-static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int size, int current)
+static int altservers_getListForUplink(dnbd3_uplink_t *uplink, const char *image, int *servers, int size, int current)
 {
 	if ( size <= 0 )
 		return 0;
@@ -272,7 +291,9 @@ static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int
 	if ( numAltServers <= size ) {
 		for ( int i = 0; i < numAltServers; ++i ) {
 			if ( current == -1 || i == current || isUsableForUplink( uplink, i, &now ) ) {
-				servers[count++] = i;
+				if ( isImageAllowed( &altServers[i], image ) ) {
+					servers[count++] = i;
+				}
 			}
 		}
 	} else {
@@ -286,7 +307,9 @@ static int altservers_getListForUplink(dnbd3_uplink_t *uplink, int *servers, int
 			int idx = rand() % numAltServers;
 			if ( state[idx] != 0 )
 				continue;
-			if ( isUsableForUplink( uplink, idx, &now ) ) {
+			if ( !isImageAllowed( &altServers[idx], image ) ) {
+				state[idx] = 2; // Mark as used without adding, so it will be ignored in panic loop
+			} else if ( isUsableForUplink( uplink, idx, &now ) ) {
 				servers[count++] = idx;
 				state[idx] = 2; // Used
 			} else {
@@ -469,7 +492,7 @@ static void altservers_findUplinkInternal(dnbd3_uplink_t *uplink)
 	current = uplink->current.index; // Current server index (or last one in panic mode)
 	mutex_unlock( &uplink->rttLock );
 	// First, get 4 alt servers
-	numAlts = altservers_getListForUplink( uplink, servers, ALTS, panic ? -1 : current );
+	numAlts = altservers_getListForUplink( uplink, uplink->image->name, servers, ALTS, panic ? -1 : current );
 	// If we're already connected and only got one server anyways, there isn't much to do
 	if ( numAlts == 0 || ( numAlts == 1 && !panic ) ) {
 		uplink->rttTestResult = RTT_DONTCHANGE;
diff --git a/src/server/altservers.h b/src/server/altservers.h
index 1e1f119..8e29aaa 100644
--- a/src/server/altservers.h
+++ b/src/server/altservers.h
@@ -15,9 +15,9 @@ void altservers_findUplinkAsync(dnbd3_uplink_t *uplink);
 
 void altservers_findUplink(dnbd3_uplink_t *uplink);
 
-int altservers_getListForClient(dnbd3_host_t *host, dnbd3_server_entry_t *output, int size);
+int altservers_getListForClient(dnbd3_client_t *client, dnbd3_server_entry_t *output, int size);
 
-int altservers_getHostListForReplication(dnbd3_host_t *servers, int size);
+int altservers_getHostListForReplication(const char *image, dnbd3_host_t *servers, int size);
 
 bool altservers_toString(int server, char *buffer, size_t len);
 
diff --git a/src/server/globals.h b/src/server/globals.h
index 221af78..ebdc1c7 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -31,6 +31,13 @@ typedef struct
 	uint8_t hopCount;      // How many hops this request has already taken across proxies
 } dnbd3_queued_request_t;
 
+typedef struct _ns
+{
+	struct _ns *next;
+	char *name;
+	size_t len;
+} dnbd3_ns_t;
+
 typedef struct
 {
 	int fails;                    // Hard fail: Connection failed
@@ -41,6 +48,7 @@ typedef struct
 	ticks lastFail;               // Last hard fail
 	dnbd3_host_t host;
 	char comment[COMMENT_LENGTH];
+	_Atomic(dnbd3_ns_t *) nameSpaces; // Linked list of name spaces
 } dnbd3_alt_server_t;
 
 typedef struct
diff --git a/src/server/image.c b/src/server/image.c
index bdb910d..86e6b87 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -1204,7 +1204,7 @@ static dnbd3_image_t *loadImageProxy(char * const name, const uint16_t revision,
 	dnbd3_host_t servers[REP_NUM_SRV];
 	int uplinkSock = -1;
 	dnbd3_host_t uplinkServer;
-	const int count = altservers_getHostListForReplication( servers, REP_NUM_SRV );
+	const int count = altservers_getHostListForReplication( name, servers, REP_NUM_SRV );
 	uint16_t remoteProtocolVersion;
 	uint16_t remoteRid = revision;
 	uint64_t remoteImageSize;
diff --git a/src/server/net.c b/src/server/net.c
index 00c9a8d..aba4e7d 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -477,7 +477,7 @@ void* net_handleNewConnection(void *clientPtr)
 
 			case CMD_GET_SERVERS:
 				// Build list of known working alt servers
-				num = altservers_getListForClient( &client->host, server_list, NUMBER_SERVERS );
+				num = altservers_getListForClient( client, server_list, NUMBER_SERVERS );
 				reply.cmd = CMD_GET_SERVERS;
 				reply.size = (uint32_t)( num * sizeof(dnbd3_server_entry_t) );
 				mutex_lock( &client->sendMutex );
-- 
cgit v1.2.3-55-g7522


From 26c1ad7af0f5749c5343a5823b9c8cece885ce84 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 12:21:01 +0100
Subject: [SERVER] Remove "working" flag, introduce fine-grained flags

Tracking the "working" state of images using one boolean is insufficient
regarding the different ways in which providing an image can fail.
Introduce separate flags for different conditions, like "file not
readable", "file not writable", "no uplink server available", "file
content has changed".
---
 src/server/altservers.c |   4 -
 src/server/globals.h    |   7 +-
 src/server/image.c      | 193 +++++++++++++++++++++++++-----------------------
 src/server/integrity.c  |  20 +----
 src/server/net.c        |  17 +++--
 src/server/uplink.c     | 114 ++++++++++++++++++----------
 6 files changed, 197 insertions(+), 158 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/altservers.c b/src/server/altservers.c
index 3fdbe0d..a6ad235 100644
--- a/src/server/altservers.c
+++ b/src/server/altservers.c
@@ -628,10 +628,6 @@ failed:
 		if ( best.fd != -1 ) {
 			close( best.fd );
 		}
-		if ( !image->working || uplink->cycleDetected ) {
-			image->working = true;
-			LOG( LOG_DEBUG1, "[RTT] No better alt server found, enabling '%s:%d' again... :-(", image->name, (int)image->rid );
-		}
 		uplink->cycleDetected = false; // It's a lie, but prevents rtt measurement triggering again right away
 		mutex_lock( &uplink->rttLock );
 		uplink->rttTestResult = RTT_DONTCHANGE;
diff --git a/src/server/globals.h b/src/server/globals.h
index b1336dc..31fbce5 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -136,7 +136,12 @@ struct _dnbd3_image
 	atomic_int completenessEstimate; // Completeness estimate in percent
 	atomic_int users;      // clients currently using this image. XXX Lock on imageListLock when modifying and checking whether the image should be freed. Reading it elsewhere is fine without the lock.
 	int id;                // Unique ID of this image. Only unique in the context of this running instance of DNBD3-Server
-	atomic_bool working;   // true if image exists and completeness is == 100% or a working upstream proxy is connected
+	struct {
+		atomic_bool uplink;      // No uplink connected
+		atomic_bool write;       // Error writing to file
+		atomic_bool read;        // Error reading from file
+		atomic_bool changed;     // File disappeared or changed, thorough check required if it seems to be back
+	} problem;
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
 };
diff --git a/src/server/image.c b/src/server/image.c
index 6017e59..1ce1574 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -53,7 +53,7 @@ static bool image_ensureDiskSpace(uint64_t size, bool force);
 
 static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const int64_t fileSize);
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count);
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
 static void* closeUnusedFds(void*);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
 
@@ -239,35 +239,76 @@ bool image_isComplete(dnbd3_image_t *image)
  */
 bool image_ensureOpen(dnbd3_image_t *image)
 {
-	if ( image->readFd != -1 ) return image;
-	int newFd = open( image->path, O_RDONLY );
+	bool sizeChanged = false;
+	if ( image->readFd != -1 && !image->problem.changed )
+		return true;
+	int newFd = image->readFd == -1 ? open( image->path, O_RDONLY ) : dup( image->readFd );
 	if ( newFd == -1 ) {
-		logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+		if ( !image->problem.read ) {
+			logadd( LOG_WARNING, "Cannot open %s for reading", image->path );
+			image->problem.read = true;
+		}
 	} else {
-		// Check size
+		// Check size + read access
+		char buffer[100];
 		const off_t flen = lseek( newFd, 0, SEEK_END );
 		if ( flen == -1 ) {
-			logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+			if ( !image->problem.read ) {
+				logadd( LOG_WARNING, "Could not seek to end of %s (errno %d)", image->path, errno );
+				image->problem.read = true;
+			}
 			close( newFd );
 			newFd = -1;
 		} else if ( (uint64_t)flen != image->realFilesize ) {
-			logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64, image->realFilesize, (uint64_t)flen );
+			if ( !image->problem.changed ) {
+				logadd( LOG_WARNING, "Size of active image with closed fd changed from %" PRIu64 " to %" PRIu64,
+						image->realFilesize, (uint64_t)flen );
+			}
+			sizeChanged = true;
+		} else if ( pread( newFd, buffer, sizeof(buffer), 0 ) == -1 ) {
+			if ( !image->problem.read ) {
+				logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)",
+						(int)sizeof(buffer), image->path, errno );
+				image->problem.read = true;
+			}
 			close( newFd );
 			newFd = -1;
 		}
 	}
 	if ( newFd == -1 ) {
-		mutex_lock( &image->lock );
-		image->working = false;
-		mutex_unlock( &image->lock );
+		if ( sizeChanged ) {
+			image->problem.changed = true;
+		}
 		return false;
 	}
+
+	// Re-opened. Check if the "size/content changed" flag was set before and if so, check crc32,
+	// but only if the size we just got above is correct.
+	if ( image->problem.changed && !sizeChanged ) {
+		if ( image->crc32 == NULL ) {
+			// Cannot verify further, hope for the best
+			image->problem.changed = false;
+			logadd( LOG_DEBUG1, "Size of image %s:%d changed back to expected value",
+					image->name, (int)image->rid );
+		} else if ( image_checkRandomBlocks( image, 1, newFd ) ) {
+			// This should have checked the first block (if complete) -> All is well again
+			image->problem.changed = false;
+			logadd( LOG_DEBUG1, "Size and CRC of image %s:%d changed back to expected value",
+					image->name, (int)image->rid );
+		}
+	} else {
+		image->problem.changed = sizeChanged;
+	}
+
 	mutex_lock( &image->lock );
 	if ( image->readFd == -1 ) {
 		image->readFd = newFd;
+		image->problem.read = false;
 		mutex_unlock( &image->lock );
 	} else {
-		// There was a race while opening the file (happens cause not locked cause blocking), we lost the race so close new fd and proceed
+		// There was a race while opening the file (happens cause not locked cause blocking),
+		// we lost the race so close new fd and proceed.
+		// *OR* we dup()'ed above for cheating when the image changed before.
 		mutex_unlock( &image->lock );
 		close( newFd );
 	}
@@ -296,7 +337,7 @@ dnbd3_image_t* image_byId(int imgId)
  * point...
  * Locks on: imageListLock, _images[].lock
  */
-dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
+dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen)
 {
 	int i;
 	const char *removingText = _removeMissingImages ? ", removing from list" : "";
@@ -326,84 +367,36 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 	candidate->users++;
 	mutex_unlock( &imageListLock );
 
-	// Found, see if it works
-	// TODO: Also make sure a non-working image still has old fd open but created a new one and removed itself from the list
-	// TODO: But remember size-changed images forever
-	if ( candidate->working || checkIfWorking ) {
-		// Is marked working, but might not have an fd open
-		if ( !image_ensureOpen( candidate ) ) {
-			mutex_lock( &candidate->lock );
-			timing_get( &candidate->lastWorkCheck );
-			mutex_unlock( &candidate->lock );
-			if ( _removeMissingImages ) {
-				candidate = image_remove( candidate ); // No release here, the image is still returned and should be released by caller
-			}
-			return candidate;
-		}
-	}
-
-	if ( !checkIfWorking ) return candidate; // Not interested in re-cechking working state
-
-	// ...not working...
-
-	// Don't re-check too often
-	mutex_lock( &candidate->lock );
-	bool check;
-	declare_now;
-	check = timing_diff( &candidate->lastWorkCheck, &now ) > NONWORKING_RECHECK_INTERVAL_SECONDS;
-	if ( check ) {
-		candidate->lastWorkCheck = now;
-	}
-	mutex_unlock( &candidate->lock );
-	if ( !check ) {
+	if ( !ensureFdOpen ) // Don't want to re-check
 		return candidate;
-	}
 
-	// reaching this point means:
-	// 1) We should check if the image is working, it might or might not be in working state right now
-	// 2) The image is open for reading (or at least was at some point, the fd might be stale if images lie on an NFS share etc.)
-	// 3) We made sure not to re-check this image too often
-
-	// Common for ro and rw images: Size check, read check
-	const off_t len = lseek( candidate->readFd, 0, SEEK_END );
-	bool reload = false;
-	if ( len == -1 ) {
-		logadd( LOG_WARNING, "lseek() on %s failed (errno=%d)%s.", candidate->path, errno, removingText );
-		reload = true;
-	} else if ( (uint64_t)len != candidate->realFilesize ) {
-		logadd( LOG_WARNING, "Size of %s changed at runtime, keeping disabled! Expected: %" PRIu64 ", found: %" PRIu64
-				". Try sending SIGHUP to server if you know what you're doing.",
-				candidate->path, candidate->realFilesize, (uint64_t)len );
-	} else {
-		// Seek worked, file size is same, now see if we can read from file
-		char buffer[100];
-		if ( pread( candidate->readFd, buffer, sizeof(buffer), 0 ) == -1 ) {
-			logadd( LOG_WARNING, "Reading first %d bytes from %s failed (errno=%d)%s.",
-					(int)sizeof(buffer), candidate->path, errno, removingText );
-			reload = true;
-		} else if ( !candidate->working ) {
-			// Seems everything is fine again \o/
-			candidate->working = true;
-			logadd( LOG_INFO, "Changed state of %s:%d to 'working'", candidate->name, candidate->rid );
-		}
-	}
+	if ( image_ensureOpen( candidate ) && !candidate->problem.read )
+		return candidate; // We have a read fd and no read or changed problems
 
-	if ( reload ) {
+	// -- image could not be opened again, or is open but has problem --
+
+	if ( _removeMissingImages && !file_isReadable( candidate->path ) ) {
+		candidate = image_remove( candidate );
+		// No image_release here, the image is still returned and should be released by caller
+	} else if ( candidate->readFd != -1 ) {
+		// We cannot just close the fd as it might be in use. Make a copy and remove old entry.
+		candidate = image_remove( candidate );
 		// Could not access the image with exising fd - mark for reload which will re-open the file.
 		// make a copy of the image struct but keep the old one around. If/When it's not being used
 		// anymore, it will be freed automatically.
-		logadd( LOG_DEBUG1, "Reloading image file %s", candidate->path );
+		logadd( LOG_DEBUG1, "Reloading image file %s because of read problem/changed", candidate->path );
 		dnbd3_image_t *img = calloc( sizeof(dnbd3_image_t), 1 );
 		img->path = strdup( candidate->path );
 		img->name = strdup( candidate->name );
 		img->virtualFilesize = candidate->virtualFilesize;
 		img->realFilesize = candidate->realFilesize;
-		img->atime = now;
+		timing_get( &img->atime );
 		img->masterCrc32 = candidate->masterCrc32;
 		img->readFd = -1;
 		img->rid = candidate->rid;
 		img->users = 1;
-		img->working = false;
+		img->problem.read = true;
+		img->problem.changed = candidate->problem.changed;
 		img->ref_cacheMap = NULL;
 		mutex_init( &img->lock, LOCK_IMAGE );
 		if ( candidate->crc32 != NULL ) {
@@ -419,18 +412,17 @@ dnbd3_image_t* image_get(char *name, uint16_t revision, bool checkIfWorking)
 		if ( image_addToList( img ) ) {
 			image_release( candidate );
 			candidate = img;
+			// Check if image is incomplete, initialize uplink
+			if ( candidate->ref_cacheMap != NULL ) {
+				uplink_init( candidate, -1, NULL, -1 );
+			}
+			// Try again with new instance
+			image_ensureOpen( candidate );
 		} else {
 			img->users = 0;
 			image_free( img );
 		}
-		// Check if image is incomplete, initialize uplink
-		if ( candidate->ref_cacheMap != NULL ) {
-			uplink_init( candidate, -1, NULL, -1 );
-		}
-		// readFd == -1 and working == FALSE at this point,
-		// this function needs some splitting up for handling as we need to run most
-		// of the above code again. for now we know that the next call for this
-		// name:rid will get ne newly inserted "img" and try to re-open the file.
+		// readFd == -1 and problem.read == true
 	}
 
 	return candidate; // We did all we can, hopefully it's working
@@ -900,7 +892,6 @@ static bool image_load(char *base, char *path, int withUplink)
 	image->rid = (uint16_t)revision;
 	image->users = 0;
 	image->readFd = -1;
-	image->working = ( cache == NULL );
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
 	mutex_init( &image->lock, LOCK_IMAGE );
@@ -925,7 +916,7 @@ static bool image_load(char *base, char *path, int withUplink)
 
 	// Image is definitely incomplete, initialize uplink worker
 	if ( image->ref_cacheMap != NULL ) {
-		image->working = false;
+		image->problem.uplink = true;
 		if ( withUplink ) {
 			uplink_init( image, -1, NULL, -1 );
 		}
@@ -937,7 +928,7 @@ static bool image_load(char *base, char *path, int withUplink)
 		// Keep fd for reading
 		fdImage = -1;
 		// Check CRC32
-		image_checkRandomBlocks( image, 4 );
+		image_checkRandomBlocks( image, 4, -1 );
 	} else {
 		logadd( LOG_ERROR, "Image list full: Could not add image %s", path );
 		image->readFd = -1; // Keep fdImage instead, will be closed below
@@ -1027,10 +1018,19 @@ static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t f
 	return retval;
 }
 
-static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
+/**
+ * Check up to count random blocks from given image. If fromFd is -1, the check will
+ * be run asynchronously using the integrity checker. Otherwise, the check will
+ * happen in the function and return the result of the check.
+ * @param image image to check
+ * @param count number of blocks to check (max)
+ * @param fromFd check synchronously and use this fd for reading, -1 = async
+ * @return true = OK, false = error. Meaningless if fromFd == -1
+ */
+static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd)
 {
 	if ( image->crc32 == NULL )
-		return;
+		return true;
 	// This checks the first block and (up to) count - 1 random blocks for corruption
 	// via the known crc32 list. This is very sloppy and is merely supposed to detect
 	// accidental corruption due to broken dnbd3-proxy functionality or file system
@@ -1038,7 +1038,7 @@ static void image_checkRandomBlocks(dnbd3_image_t *image, const int count)
 	assert( count > 0 );
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 	const int hashBlocks = IMGSIZE_TO_HASHBLOCKS( image->virtualFilesize );
-	int blocks[count];
+	int blocks[count+1]; // +1 for "-1" in sync case
 	int index = 0, j;
 	int block;
 	if ( image_isHashBlockComplete( cache, 0, image->virtualFilesize ) ) {
@@ -1062,9 +1062,16 @@ while_end: ;
 	if ( cache != NULL ) {
 		ref_put( &cache->reference );
 	}
-	for ( int i = 0; i < index; ++i ) {
-		integrity_check( image, blocks[i], true );
+	if ( fromFd == -1 ) {
+		// Async
+		for ( int i = 0; i < index; ++i ) {
+			integrity_check( image, blocks[i], true );
+		}
+		return true;
 	}
+	// Sync
+	blocks[index] = -1;
+	return image_checkBlocksCrc32( fromFd, image->crc32, blocks, image->realFilesize );
 }
 
 /**
@@ -1306,7 +1313,7 @@ server_fail: ;
 		} else {
 			// Clumsy busy wait, but this should only take as long as it takes to start a thread, so is it really worth using a signalling mechanism?
 			int i = 0;
-			while ( !image->working && ++i < 100 )
+			while ( image->problem.uplink && ++i < 100 )
 				usleep( 2000 );
 		}
 	} else if ( uplinkSock != -1 ) {
@@ -1599,7 +1606,7 @@ int image_getCompletenessEstimate(dnbd3_image_t * const image)
 	assert( image != NULL );
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
 	if ( cache == NULL )
-		return image->working ? 100 : 0;
+		return 100;
 	const int len = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	if ( unlikely( len == 0 ) ) {
 		ref_put( &cache->reference );
diff --git a/src/server/integrity.c b/src/server/integrity.c
index 4006dfc..91e53b8 100644
--- a/src/server/integrity.c
+++ b/src/server/integrity.c
@@ -195,9 +195,10 @@ static void* integrity_main(void * data UNUSED)
 							readFd = directFd;
 						}
 					}
-					if ( readFd == -1 ) { // Try buffered; flush to disk for that
-						image_ensureOpen( image );
-						readFd = image->readFd;
+					if ( readFd == -1 ) { // Try buffered as fallback
+						if ( image_ensureOpen( image ) && !image->problem.read ) {
+							readFd = image->readFd;
+						}
 					}
 					if ( readFd == -1 ) {
 						logadd( LOG_MINOR, "Couldn't get any valid fd for integrity check of %s... ignoring...", image->path );
@@ -237,16 +238,6 @@ static void* integrity_main(void * data UNUSED)
 					// Done with this task as nothing left
 					checkQueue[i].image = NULL;
 					if ( i + 1 == queueLen ) queueLen--;
-					// Mark as working again if applicable
-					if ( !foundCorrupted ) {
-						dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-						if ( uplink != NULL ) { // TODO: image_determineWorkingState() helper?
-							mutex_lock( &image->lock );
-							image->working = uplink->current.fd != -1 && image->readFd != -1;
-							mutex_unlock( &image->lock );
-							ref_put( &uplink->reference );
-						}
-					}
 				} else {
 					// Still more blocks to go...
 					checkQueue[i].block = blocks[0];
@@ -254,9 +245,6 @@ static void* integrity_main(void * data UNUSED)
 			}
 			if ( foundCorrupted && !_shutdown ) {
 				// Something was fishy, make sure uplink exists
-				mutex_lock( &image->lock );
-				image->working = false;
-				mutex_unlock( &image->lock );
 				uplink_init( image, -1, NULL, -1 );
 			}
 			// Release :-)
diff --git a/src/server/net.c b/src/server/net.c
index aba4e7d..29147be 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -262,7 +262,7 @@ void* net_handleNewConnection(void *clientPtr)
 			atomic_thread_fence( memory_order_release );
 			if ( unlikely( image == NULL ) ) {
 				//logadd( LOG_DEBUG1, "Client requested non-existent image '%s' (rid:%d), rejected\n", image_name, (int)rid );
-			} else if ( unlikely( !image->working ) ) {
+			} else if ( unlikely( image->problem.read || image->problem.changed ) ) {
 				logadd( LOG_DEBUG1, "Client %s requested non-working image '%s' (rid:%d), rejected\n",
 						client->hostName, image_name, (int)rid );
 			} else {
@@ -273,8 +273,14 @@ void* net_handleNewConnection(void *clientPtr)
 					if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					if ( bOk && uplink != NULL && uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
-						usleep( 100000 ); // server gets a penalty and is less likely to be selected
+					if ( bOk && uplink != NULL ) {
+						if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
+							usleep( 100000 ); // server gets a penalty and is less likely to be selected
+						}
+						if ( image->problem.uplink ) {
+							// Penalize depending on completeness, if no uplink is available
+							usleep( ( 100 - image->completenessEstimate ) * 100 );
+						}
 					}
 					if ( uplink != NULL ) {
 						ref_put( &uplink->reference );
@@ -383,9 +389,8 @@ void* net_handleNewConnection(void *clientPtr)
 					ref_put( &cache->reference );
 					if ( !isCached ) {
 						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
-							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy, disabling image %s:%d",
+							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
 									client->hostName, image->name, image->rid );
-							image->working = false;
 							goto exit_client_cleanup;
 						}
 						break; // DONE, exit request.cmd switch
@@ -456,7 +461,7 @@ void* net_handleNewConnection(void *clientPtr)
 								}
 								if ( err == EBADF || err == EFAULT || err == EINVAL || err == EIO ) {
 									logadd( LOG_INFO, "Disabling %s:%d", image->name, image->rid );
-									image->working = false;
+									image->problem.read = true;
 								}
 							}
 							goto exit_client_cleanup;
diff --git a/src/server/uplink.c b/src/server/uplink.c
index f39e633..aba53ba 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -53,9 +53,9 @@ static void* uplink_mainloop(void *data);
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly);
 static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int lastBlockIndex);
 static void uplink_handleReceive(dnbd3_uplink_t *uplink);
-static int uplink_sendKeepalive(const int fd);
+static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink);
 static void uplink_addCrc32(dnbd3_uplink_t *uplink);
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
+static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
 static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
 static bool uplink_saveCacheMap(dnbd3_uplink_t *uplink);
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
@@ -117,6 +117,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
 	uplink->cycleDetected = false;
+	image->problem.uplink = true;
 	if ( sock != -1 ) {
 		uplink->better.fd = sock;
 		int index = altservers_hostToIndex( host );
@@ -371,6 +372,7 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
 	} else {
 		if ( unlikely( uplink->current.fd == -1 ) ) {
+			uplink->image->problem.uplink = true;
 			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 		} else {
@@ -378,12 +380,14 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
 			if ( hops < 200 ) ++hops;
 			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-			mutex_unlock( &uplink->sendMutex );
 			if ( unlikely( !ret ) ) {
+				uplink->image->problem.uplink = true;
+				mutex_unlock( &uplink->sendMutex );
 				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
 			} else {
 				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
 				int state;
+				mutex_unlock( &uplink->sendMutex );
 				mutex_lock( &uplink->queueLock );
 				if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
 					state = uplink->queue[freeSlot].status;
@@ -460,9 +464,9 @@ static void* uplink_mainloop(void *data)
 	}
 	while ( !_shutdown && !uplink->shutdown ) {
 		// poll()
-		waitTime = uplink->rttTestResult == RTT_DOCHANGE ? 0 : -1;
-		if ( waitTime == 0 ) {
+		if ( uplink->rttTestResult == RTT_DOCHANGE ) {
 			// 0 means poll, since we're about to change the server
+			waitTime = 0;
 		} else {
 			declare_now;
 			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
@@ -495,7 +499,7 @@ static void* uplink_mainloop(void *data)
 			discoverFailCount = 0;
 			if ( fd != -1 ) close( fd );
 			uplink->replicationHandle = REP_NONE;
-			uplink->image->working = true;
+			uplink->image->problem.uplink = false;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
 			if ( altservers_toString( uplink->current.index, buffer + 1, sizeof(buffer) - 1 ) ) {
@@ -510,6 +514,11 @@ static void* uplink_mainloop(void *data)
 			uplink_sendRequests( uplink, false );
 			uplink_sendReplicationRequest( uplink );
 			events[EV_SOCKET].events = POLLIN | POLLRDHUP;
+			if ( uplink->image->problem.uplink ) {
+				// Some of the requests above must have failed again already :-(
+				logadd( LOG_DEBUG1, "Newly established uplink connection failed during getCRC or sendRequests" );
+				uplink_connectionFailed( uplink, true );
+			}
 			timing_gets( &nextAltCheck, altCheckInterval );
 			// The rtt worker already did the handshake for our image, so there's nothing
 			// more to do here
@@ -517,6 +526,7 @@ static void* uplink_mainloop(void *data)
 		// Check events
 		// Signal
 		if ( (events[EV_SIGNAL].revents & (POLLERR | POLLHUP | POLLRDHUP | POLLNVAL)) ) {
+			uplink->image->problem.uplink = true;
 			logadd( LOG_WARNING, "poll error on signal in uplink_mainloop!" );
 			goto cleanup;
 		} else if ( (events[EV_SIGNAL].revents & POLLIN) ) {
@@ -553,14 +563,10 @@ static void* uplink_mainloop(void *data)
 			}
 			// Keep-alive
 			if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
-				// Send keep-alive if nothing is happening
-				if ( uplink_sendKeepalive( uplink->current.fd ) ) {
-					// Re-trigger periodically, in case it requires a minimum user count
-					uplink_sendReplicationRequest( uplink );
-				} else {
+				// Send keep-alive if nothing is happening, and try to trigger background rep.
+				if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) {
 					uplink_connectionFailed( uplink, true );
-					logadd( LOG_DEBUG1, "Error sending keep-alive, panic!\n" );
-					setThreadName( "panic-uplink" );
+					logadd( LOG_DEBUG1, "Error sending keep-alive/BGR, panic!\n" );
 				}
 			}
 			// Don't keep uplink established if we're idle for too much
@@ -578,6 +584,7 @@ static void* uplink_mainloop(void *data)
 					// Quit work if image is complete
 					logadd( LOG_INFO, "Replication of %s complete.", uplink->image->name );
 					setThreadName( "finished-uplink" );
+					uplink->image->problem.uplink = false;
 					goto cleanup;
 				} else {
 					// Not complete - do measurement
@@ -592,10 +599,6 @@ static void* uplink_mainloop(void *data)
 		} else if ( rttTestResult == RTT_NOT_REACHABLE ) {
 			if ( atomic_compare_exchange_strong( &uplink->rttTestResult, &rttTestResult, RTT_IDLE ) ) {
 				discoverFailCount++;
-				if ( uplink->image->working && uplink->current.fd == -1 && discoverFailCount > (SERVER_RTT_MAX_UNREACH / 2) ) {
-					logadd( LOG_DEBUG1, "Disabling %s:%d since no uplink is available", uplink->image->name, (int)uplink->image->rid );
-					uplink->image->working = false;
-				}
 				if ( uplink->current.fd == -1 ) {
 					uplink->cycleDetected = false;
 				}
@@ -624,8 +627,9 @@ static void* uplink_mainloop(void *data)
 				}
 			}
 			mutex_unlock( &uplink->queueLock );
-			if ( resend )
+			if ( resend ) {
 				uplink_sendRequests( uplink, true );
+			}
 		}
 #endif
 	}
@@ -653,6 +657,9 @@ static void* uplink_mainloop(void *data)
 	return NULL ;
 }
 
+/**
+ * Only called from uplink thread.
+ */
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
 	// Scan for new requests
@@ -672,13 +679,15 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 		if ( hops < 200 ) ++hops;
 		mutex_lock( &uplink->sendMutex );
 		const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-		mutex_unlock( &uplink->sendMutex );
-		if ( !ret ) {
+		if ( likely( ret ) ) {
+			mutex_unlock( &uplink->sendMutex );
+		} else {
 			// Non-critical - if the connection dropped or the server was changed
 			// the thread will re-send this request as soon as the connection
 			// is reestablished.
+			uplink->image->problem.uplink = true;
+			mutex_unlock( &uplink->sendMutex );
 			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			altservers_serverFailed( uplink->current.index );
 			return;
 		}
 		mutex_lock( &uplink->queueLock );
@@ -695,21 +704,27 @@ static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
  * server. This means we might request data we already have, but it makes
  * the code simpler. Worst case would be only one bit is zero, which means
  * 4kb are missing, but we will request 32kb.
+ *
+ * Only called from uplink thread, so current.fd is assumed to be valid.
+ *
+ * @return false if sending request failed, true otherwise (i.e. not necessary/disabled)
  */
-static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
+static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 {
-	if ( uplink == NULL || uplink->current.fd == -1 ) return;
-	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 ) return; // Don't do background replication
+	if ( uplink->current.fd == -1 )
+		return false; // Should never be called in this state, consider send error
+	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
+		return true; // Don't do background replication
 	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
-		return; // Already a replication request on the wire, or no more blocks to replicate
+		return true; // Already a replication request on the wire, or no more blocks to replicate
 	dnbd3_image_t * const image = uplink->image;
-	if ( image->virtualFilesize < DNBD3_BLOCK_SIZE ) return;
-	if ( image->users < _bgrMinClients ) return; // Not enough active users
+	if ( image->users < _bgrMinClients )
+		return true; // Not enough active users
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-	if ( cache == NULL || image->users < _bgrMinClients ) {
+	if ( cache == NULL || image->users ) {
 		// No cache map (=image complete)
 		ref_put( &cache->reference );
-		return;
+		return true;
 	}
 	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	const int lastBlockIndex = mapBytes - 1;
@@ -741,17 +756,20 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 	if ( replicationIndex == -1 ) {
 		// Replication might be complete, uplink_mainloop should take care....
 		uplink->nextReplicationIndex = -1;
-		return;
+		return true;
 	}
 	const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
 	uplink->replicationHandle = offset;
 	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
 	mutex_lock( &uplink->sendMutex );
 	bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
-	mutex_unlock( &uplink->sendMutex );
-	if ( !sendOk ) {
+	if ( likely( sendOk ) ) {
+		mutex_unlock( &uplink->sendMutex );
+	} else {
+		uplink->image->problem.uplink = true;
+		mutex_unlock( &uplink->sendMutex );
 		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
-		return;
+		return false;
 	}
 	if ( replicationIndex == lastBlockIndex ) {
 		uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
@@ -762,6 +780,7 @@ static void uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 		// Just crossed a hash block boundary, look for new candidate starting at this very index
 		uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
 	}
+	return true;
 }
 
 /**
@@ -816,6 +835,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 /**
  * Receive data from uplink server and process/dispatch
  * Locks on: uplink.lock, images[].lock
+ * Only called from uplink thread, so current.fd is assumed to be valid.
  */
 static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
@@ -990,11 +1010,14 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		mutex_lock( &uplink->queueLock );
 		const bool rep = ( uplink->queueLen == 0 );
 		mutex_unlock( &uplink->queueLock );
-		if ( rep ) uplink_sendReplicationRequest( uplink );
+		if ( rep ) {
+			if ( !uplink_sendReplicationRequest( uplink ) )
+				goto error_cleanup;
+		}
 	}
 	return;
 	// Error handling from failed receive or message parsing
-	error_cleanup: ;
+error_cleanup: ;
 	uplink_connectionFailed( uplink, true );
 }
 
@@ -1005,8 +1028,10 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 {
 	if ( uplink->current.fd == -1 )
 		return;
+	setThreadName( "panic-uplink" );
 	altservers_serverFailed( uplink->current.index );
 	mutex_lock( &uplink->sendMutex );
+	uplink->image->problem.uplink = true;
 	close( uplink->current.fd );
 	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
@@ -1025,14 +1050,24 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 }
 
 /**
- * Send keep alive request to server
+ * Send keep alive request to server.
+ * Called from uplink thread, current.fd must be valid.
  */
-static int uplink_sendKeepalive(const int fd)
+static bool uplink_sendKeepalive(dnbd3_uplink_t *uplink)
 {
 	static const dnbd3_request_t request = { .magic = dnbd3_packet_magic, .cmd = net_order_16( CMD_KEEPALIVE ) };
-	return send( fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+	mutex_lock( &uplink->sendMutex );
+	bool sendOk = send( uplink->current.fd, &request, sizeof(request), MSG_NOSIGNAL ) == sizeof(request);
+	mutex_unlock( &uplink->sendMutex );
+	return sendOk;
 }
 
+/**
+ * Request crclist from uplink.
+ * Called from uplink thread, current.fd must be valid.
+ * FIXME This is broken as it could happen that another message arrives after sending
+ * the request. Refactor, split and move receive into general receive handler.
+ */
 static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 {
 	dnbd3_image_t *image = uplink->image;
@@ -1042,6 +1077,9 @@ static void uplink_addCrc32(dnbd3_uplink_t *uplink)
 	uint32_t *buffer = malloc( bytes );
 	mutex_lock( &uplink->sendMutex );
 	bool sendOk = dnbd3_get_crc32( uplink->current.fd, &masterCrc, buffer, &bytes );
+	if ( !sendOk ) {
+		uplink->image->problem.uplink = true;
+	}
 	mutex_unlock( &uplink->sendMutex );
 	if ( !sendOk || bytes == 0 ) {
 		free( buffer );
-- 
cgit v1.2.3-55-g7522


From 5bc3badd013b88201da64dc970600d19451daaec Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 14:55:01 +0100
Subject: [SERVER] Also add a flag for uplink queue overload

---
 src/server/globals.h |  3 ++-
 src/server/net.c     | 10 +++-------
 src/server/uplink.c  | 11 +++++++++++
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/globals.h b/src/server/globals.h
index 31fbce5..0bd6e47 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -93,7 +93,7 @@ struct _dnbd3_uplink
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
 	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
-	atomic_int queueLen;        // length of queue
+	int queueLen;               // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
 	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
 	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
@@ -141,6 +141,7 @@ struct _dnbd3_image
 		atomic_bool write;       // Error writing to file
 		atomic_bool read;        // Error reading from file
 		atomic_bool changed;     // File disappeared or changed, thorough check required if it seems to be back
+		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
 	pthread_mutex_t lock;
diff --git a/src/server/net.c b/src/server/net.c
index 29147be..a478e0c 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -269,12 +269,11 @@ void* net_handleNewConnection(void *clientPtr)
 				// Image is fine so far, but occasionally drop a client if the uplink for the image is clogged or unavailable
 				bOk = true;
 				if ( image->ref_cacheMap != NULL ) {
-					dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-					if ( uplink != NULL && ( uplink->cacheFd == -1 || uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) ) {
+					if ( image->problem.queue || image->problem.write ) {
 						bOk = ( rand() % 4 ) == 1;
 					}
-					if ( bOk && uplink != NULL ) {
-						if ( uplink->cacheFd == -1 ) { // Wait 100ms if local caching is not working so this
+					if ( bOk ) {
+						if ( image->problem.write ) { // Wait 100ms if local caching is not working so this
 							usleep( 100000 ); // server gets a penalty and is less likely to be selected
 						}
 						if ( image->problem.uplink ) {
@@ -282,9 +281,6 @@ void* net_handleNewConnection(void *clientPtr)
 							usleep( ( 100 - image->completenessEstimate ) * 100 );
 						}
 					}
-					if ( uplink != NULL ) {
-						ref_put( &uplink->reference );
-					}
 				}
 				if ( bOk ) {
 					mutex_lock( &image->lock );
diff --git a/src/server/uplink.c b/src/server/uplink.c
index aba53ba..97cb2a9 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -118,6 +118,8 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	mutex_unlock( &uplink->sendMutex );
 	uplink->cycleDetected = false;
 	image->problem.uplink = true;
+	image->problem.write = true;
+	image->problem.queue = false;
 	if ( sock != -1 ) {
 		uplink->better.fd = sock;
 		int index = altservers_hostToIndex( host );
@@ -191,6 +193,7 @@ static void cancelAllRequests(dnbd3_uplink_t *uplink)
 		}
 	}
 	uplink->queueLen = 0;
+	uplink->image->problem.queue = false;
 }
 
 static void uplink_free(ref *ref)
@@ -328,6 +331,9 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 			goto fail_lock;
 		}
 		freeSlot = uplink->queueLen++;
+		if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) {
+			uplink->image->problem.queue = true;
+		}
 	}
 	// Do not send request to uplink server if we have a matching pending request AND the request either has the
 	// status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
@@ -904,6 +910,7 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 						continue; // Success, retry write
 					}
 					if ( err == EBADF || err == EINVAL || err == EIO ) {
+						uplink->image->problem.write = true;
 						if ( !tryAgain || !uplink_reopenCacheFd( uplink, true ) )
 							break;
 						tryAgain = false;
@@ -983,6 +990,9 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 			}
 			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
+		if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
+			uplink->image->problem.queue = false;
+		}
 		mutex_unlock( &uplink->queueLock );
 #ifdef _DEBUG
 		if ( !served && start != uplink->replicationHandle ) {
@@ -1121,6 +1131,7 @@ static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force)
 		close( uplink->cacheFd );
 	}
 	uplink->cacheFd = open( uplink->image->path, O_WRONLY | O_CREAT, 0644 );
+	uplink->image->problem.write = uplink->cacheFd == -1;
 	return uplink->cacheFd != -1;
 }
 
-- 
cgit v1.2.3-55-g7522


From 49a9cd2d89dd586db5e08c9d3e96b88a8e8346d7 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Tue, 3 Mar 2020 16:46:36 +0100
Subject: [SERVER] Optimize client handler for CMD_GET_BLOCK

Move CMD_GET_BLOCK out of switch block and mark as likely. Don't acquire
and release cache map for every single request, but keep reference
around and only release when a message other than CMD_GET_BLOCK arrives.
On idle links, this should happen through CMD_KEEPALIVE every now and
then.
---
 src/server/net.c | 68 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 25 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/net.c b/src/server/net.c
index a478e0c..0f7e169 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -207,6 +207,7 @@ void* net_handleNewConnection(void *clientPtr)
 	dnbd3_reply_t reply;
 
 	dnbd3_image_t *image = NULL;
+	dnbd3_cache_map_t *cache = NULL;
 	int image_file = -1;
 
 	int num;
@@ -315,9 +316,8 @@ void* net_handleNewConnection(void *clientPtr)
 		// client handling mainloop
 		while ( recv_request_header( client->sock, &request ) ) {
 			if ( _shutdown ) break;
-			switch ( request.cmd ) {
+			if ( likely ( request.cmd == CMD_GET_BLOCK ) ) {
 
-			case CMD_GET_BLOCK:;
 				const uint64_t offset = request.offset_small; // Copy to full uint64 to prevent repeated masking
 				reply.handle = request.handle;
 				if ( unlikely( offset >= image->virtualFilesize ) ) {
@@ -326,7 +326,7 @@ void* net_handleNewConnection(void *clientPtr)
 					reply.size = 0;
 					reply.cmd = CMD_ERROR;
 					send_reply( client->sock, &reply, NULL );
-					break;
+					continue;
 				}
 				if ( unlikely( offset + request.size > image->virtualFilesize ) ) {
 					// Sanity check
@@ -334,11 +334,14 @@ void* net_handleNewConnection(void *clientPtr)
 					reply.size = 0;
 					reply.cmd = CMD_ERROR;
 					send_reply( client->sock, &reply, NULL );
-					break;
+					continue;
+				}
+
+				if ( cache == NULL && image->uplinkref != NULL ) {
+					cache = ref_get_cachemap( image );
 				}
 
-				dnbd3_cache_map_t *cache;
-				if ( request.size != 0 && ( cache = ref_get_cachemap( image ) ) != NULL ) {
+				if ( request.size != 0 && cache != NULL ) {
 					// This is a proxyed image, check if we need to relay the request...
 					start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
@@ -360,36 +363,39 @@ void* net_handleNewConnection(void *clientPtr)
 					// First byte
 					if ( isCached ) {
 						b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
-						for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
-							const int map_x = (pos >> 12) & 7; // mod 8
-							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-							if ( (b & bit_mask) == 0 ) {
-								isCached = false;
-								break;
+						if ( b != 0xff ) {
+							for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
+								const int map_x = (pos >> 12) & 7; // mod 8
+								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+								if ( (b & bit_mask) == 0 ) {
+									isCached = false;
+									break;
+								}
 							}
 						}
 					}
 					// Last byte - only check if request spans multiple bytes in cache map
 					if ( isCached && firstByteInMap != lastByteInMap ) {
 						b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
-						for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
-							assert( lastByteInMap == (pos >> 15) );
-							const int map_x = (pos >> 12) & 7; // mod 8
-							const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-							if ( (b & bit_mask) == 0 ) {
-								isCached = false;
-								break;
+						if ( b != 0xff ) {
+							for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
+								assert( lastByteInMap == (pos >> 15) );
+								const int map_x = (pos >> 12) & 7; // mod 8
+								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
+								if ( (b & bit_mask) == 0 ) {
+									isCached = false;
+									break;
+								}
 							}
 						}
 					}
-					ref_put( &cache->reference );
 					if ( !isCached ) {
 						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
 							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
 									client->hostName, image->name, image->rid );
 							goto exit_client_cleanup;
 						}
-						break; // DONE, exit request.cmd switch
+						continue; // Reply arrives on uplink some time later, handle next request now
 					}
 				}
 
@@ -474,7 +480,16 @@ void* net_handleNewConnection(void *clientPtr)
 				if ( lock ) mutex_unlock( &client->sendMutex );
 				// Global per-client counter
 				client->bytesSent += request.size; // Increase counter for statistics.
-				break;
+				continue;
+			}
+			// Any other command
+			// Release cache map every now and then, in case the image was replicated
+			// entirely. Will be re-grabbed on next CMD_GET_BLOCK otherwise.
+			if ( cache != NULL ) {
+				ref_put( &cache->reference );
+				cache = NULL;
+			}
+			switch ( request.cmd ) {
 
 			case CMD_GET_SERVERS:
 				// Build list of known working alt servers
@@ -523,9 +538,9 @@ set_name: ;
 				logadd( LOG_ERROR, "Unknown command from client %s: %d", client->hostName, (int)request.cmd );
 				break;
 
-			}
-		}
-	}
+			} // end switch
+		} // end loop
+	} // end bOk
 exit_client_cleanup: ;
 	// First remove from list, then add to counter to prevent race condition
 	removeFromList( client );
@@ -536,6 +551,9 @@ exit_client_cleanup: ;
 		timing_get( &image->atime );
 		mutex_unlock( &image->lock );
 	}
+	if ( cache != NULL ) {
+		ref_put( &cache->reference );
+	}
 	freeClientStruct( client ); // This will also call image_release on client->image
 	return NULL ;
 fail_preadd: ;
-- 
cgit v1.2.3-55-g7522


From 5c92010d74451a46064e85484a6969a8a2f2cf82 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Wed, 4 Mar 2020 12:17:40 +0100
Subject: [SERVER] Likewise, get rid of same loops in client handler

---
 src/server/image.c | 32 +++++++++++++++---------------
 src/server/net.c   | 58 ++++++++++++++++++++++--------------------------------
 2 files changed, 39 insertions(+), 51 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/image.c b/src/server/image.c
index 886bf33..3583f86 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -121,8 +121,15 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 	// First and last byte masks
 	const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
 	const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
-	atomic_thread_fence( memory_order_acquire );
-	if ( firstByteInMap != lastByteInMap ) {
+	if ( firstByteInMap == lastByteInMap ) {
+		if ( set ) {
+			uint8_t o = atomic_fetch_or( &cache->map[firstByteInMap], (uint8_t)(fb & lb) );
+			setNewBlocks = o != ( o | (fb & lb) );
+		} else {
+			atomic_fetch_and( &cache->map[firstByteInMap], (uint8_t)~(fb & lb) );
+		}
+	} else {
+		atomic_thread_fence( memory_order_acquire );
 		if ( set ) {
 			uint8_t fo = atomic_fetch_or_explicit( &cache->map[firstByteInMap], fb, memory_order_relaxed );
 			uint8_t lo = atomic_fetch_or_explicit( &cache->map[lastByteInMap], lb, memory_order_relaxed );
@@ -131,22 +138,15 @@ void image_updateCachemap(dnbd3_image_t *image, uint64_t start, uint64_t end, co
 			atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~fb, memory_order_relaxed );
 			atomic_fetch_and_explicit( &cache->map[lastByteInMap], (uint8_t)~lb, memory_order_relaxed );
 		}
-	} else {
-		if ( set ) {
-			uint8_t o = atomic_fetch_or_explicit( &cache->map[firstByteInMap], (uint8_t)(fb & lb), memory_order_relaxed );
-			setNewBlocks = o != ( o | (fb & lb) );
-		} else {
-			atomic_fetch_and_explicit( &cache->map[firstByteInMap], (uint8_t)~(fb & lb), memory_order_relaxed );
-		}
-	}
-	const uint8_t nval = set ? 0xff : 0;
-	// Everything in between
-	for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
-		if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
-			setNewBlocks = true;
+		// Everything in between
+		const uint8_t nval = set ? 0xff : 0;
+		for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+			if ( atomic_exchange_explicit( &cache->map[pos], nval, memory_order_relaxed ) != nval && set ) {
+				setNewBlocks = true;
+			}
 		}
+		atomic_thread_fence( memory_order_release );
 	}
-	atomic_thread_fence( memory_order_release );
 	if ( setNewBlocks && image->crc32 != NULL ) {
 		// If setNewBlocks is set, at least one of the blocks was not cached before, so queue all hash blocks
 		// for checking, even though this might lead to checking some hash block again, if it was
diff --git a/src/server/net.c b/src/server/net.c
index 0f7e169..01056e0 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -216,7 +216,6 @@ void* net_handleNewConnection(void *clientPtr)
 
 	serialized_buffer_t payload;
 	uint16_t rid, client_version;
-	uint64_t start, end;
 
 	dnbd3_server_entry_t server_list[NUMBER_SERVERS];
 
@@ -343,46 +342,35 @@ void* net_handleNewConnection(void *clientPtr)
 
 				if ( request.size != 0 && cache != NULL ) {
 					// This is a proxyed image, check if we need to relay the request...
-					start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-					end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-					bool isCached = true;
+					const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+					const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					const uint64_t firstByteInMap = start >> 15;
 					const uint64_t lastByteInMap = (end - 1) >> 15;
+					const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+					const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
 					uint64_t pos;
 					uint8_t b;
-					atomic_thread_fence( memory_order_acquire );
-					// Middle - quick checking
-					if ( isCached ) {
-						for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
-							if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
-								isCached = false;
-								break;
-							}
+					bool isCached;
+					if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+						b = cache->map[firstByteInMap];
+						isCached = ( b & ( fb & lb ) ) == ( fb & lb );
+					} else {
+						isCached = true;
+						atomic_thread_fence( memory_order_acquire );
+						// First byte
+						if ( isCached ) {
+							b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+							isCached = ( ( b & fb ) == fb );
 						}
-					}
-					// First byte
-					if ( isCached ) {
-						b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
-						if ( b != 0xff ) {
-							for ( pos = start; firstByteInMap == (pos >> 15) && pos < end; pos += DNBD3_BLOCK_SIZE ) {
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (b & bit_mask) == 0 ) {
-									isCached = false;
-									break;
-								}
-							}
+						// Last byte
+						if ( isCached ) {
+							b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+							isCached = ( ( b & lb ) == lb );
 						}
-					}
-					// Last byte - only check if request spans multiple bytes in cache map
-					if ( isCached && firstByteInMap != lastByteInMap ) {
-						b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
-						if ( b != 0xff ) {
-							for ( pos = lastByteInMap << 15; pos < end; pos += DNBD3_BLOCK_SIZE ) {
-								assert( lastByteInMap == (pos >> 15) );
-								const int map_x = (pos >> 12) & 7; // mod 8
-								const uint8_t bit_mask = (uint8_t)( 1 << map_x );
-								if ( (b & bit_mask) == 0 ) {
+						// Middle, must be all bits set (0xff)
+						if ( isCached ) {
+							for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+								if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
 									isCached = false;
 									break;
 								}
-- 
cgit v1.2.3-55-g7522


From a91bd049d6e33af29d5f941d556cd1c374b4dd7e Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 6 Mar 2020 19:07:44 +0100
Subject: [SERVER] Fix: Image would be assumed complete if no uplink exists

Severe data corruption on client. Nice.
---
 src/server/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/server/net.c')

diff --git a/src/server/net.c b/src/server/net.c
index 01056e0..954cb8a 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -336,7 +336,7 @@ void* net_handleNewConnection(void *clientPtr)
 					continue;
 				}
 
-				if ( cache == NULL && image->uplinkref != NULL ) {
+				if ( cache == NULL ) {
 					cache = ref_get_cachemap( image );
 				}
 
-- 
cgit v1.2.3-55-g7522


From 290d3478f245bb7d2112bb781286a9fbae42b983 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 13 Mar 2020 16:03:29 +0100
Subject: [SERVER] Rewrite uplink queue handling

- Now uses linked lists instead of huge array
- Does prefetch data on client requests
- Can have multiple replication requests in-flight
---
 src/server/globals.c   |   6 +
 src/server/globals.h   |  35 ++-
 src/server/image.c     |   3 +-
 src/server/image.h     |  44 +++
 src/server/net.c       |  44 +--
 src/server/reference.h |   5 +
 src/server/uplink.c    | 771 +++++++++++++++++++++++++++----------------------
 src/server/uplink.h    |   2 +-
 src/serverconfig.h     |   3 +-
 9 files changed, 518 insertions(+), 395 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/globals.c b/src/server/globals.c
index ac079b1..98e0ddb 100644
--- a/src/server/globals.c
+++ b/src/server/globals.c
@@ -19,6 +19,7 @@ atomic_int _clientPenalty = 0;
 atomic_bool _isProxy = false;
 atomic_int _backgroundReplication = BGR_FULL;
 atomic_int _bgrMinClients = 0;
+atomic_int _bgrWindowSize = 1;
 atomic_bool _lookupMissingForProxy = true;
 atomic_bool _sparseFiles = false;
 atomic_bool _ignoreAllocErrors = false;
@@ -74,6 +75,7 @@ static int ini_handler(void *custom UNUSED, const char* section, const char* key
 	SAVE_TO_VAR_BOOL( dnbd3, isProxy );
 	SAVE_TO_VAR_BOOL( dnbd3, proxyPrivateOnly );
 	SAVE_TO_VAR_INT( dnbd3, bgrMinClients );
+	SAVE_TO_VAR_INT( dnbd3, bgrWindowSize );
 	SAVE_TO_VAR_BOOL( dnbd3, lookupMissingForProxy );
 	SAVE_TO_VAR_BOOL( dnbd3, sparseFiles );
 	SAVE_TO_VAR_BOOL( dnbd3, ignoreAllocErrors );
@@ -134,6 +136,9 @@ void globals_loadConfig()
 		logadd( LOG_WARNING, "Ignoring 'sparseFiles=true' since backgroundReplication is set to true and bgrMinClients is too low" );
 		_sparseFiles = false;
 	}
+	if ( _bgrWindowSize < 1 ) {
+		_bgrWindowSize = 1;
+	}
 	// Dump config as interpreted
 	char buffer[2000];
 	globals_dumpConfig( buffer, sizeof(buffer) );
@@ -325,6 +330,7 @@ size_t globals_dumpConfig(char *buffer, size_t size)
 		PBOOL(backgroundReplication);
 	}
 	PINT(bgrMinClients);
+	PINT(bgrWindowSize);
 	PBOOL(lookupMissingForProxy);
 	PBOOL(sparseFiles);
 	PBOOL(ignoreAllocErrors);
diff --git a/src/server/globals.h b/src/server/globals.h
index 1bb6857..5cee92a 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -18,18 +18,27 @@ typedef struct _dnbd3_uplink dnbd3_uplink_t;
 typedef struct _dnbd3_image dnbd3_image_t;
 typedef struct _dnbd3_client dnbd3_client_t;
 
-typedef struct
+typedef struct _dnbd3_queue_client
 {
-	uint64_t handle;  // Client defined handle to pass back in reply
-	uint64_t from;    // First byte offset of requested block (ie. 4096)
-	uint64_t to;      // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+	struct _dnbd3_queue_client *next;
+	uint64_t handle;    // Handle used by client
+	uint64_t from, to;  // Client range
 	dnbd3_client_t * client; // Client to send reply to
-	int status;      // status of this entry: ULR_*
+} dnbd3_queue_client_t;
+
+typedef struct _dnbd3_queue_entry
+{
+	struct _dnbd3_queue_entry *next;
+	uint64_t   handle;   // Our handle for this entry
+	uint64_t   from;     // First byte offset of requested block (ie. 4096)
+	uint64_t   to;       // Last byte + 1 of requested block (ie. 8192, if request len is 4096, resulting in bytes 4096-8191)
+	dnbd3_queue_client_t *clients;
 #ifdef _DEBUG
-	ticks entered;           // When this request entered the queue (for debugging)
+	ticks      entered;  // When this request entered the queue (for debugging)
 #endif
-	uint8_t hopCount;      // How many hops this request has already taken across proxies
-} dnbd3_queued_request_t;
+	uint8_t    hopCount; // How many hops this request has already taken across proxies
+	bool       sent;     // Already sent to uplink?
+} dnbd3_queue_entry_t;
 
 typedef struct _ns
 {
@@ -91,12 +100,12 @@ struct _dnbd3_uplink
 	bool cycleDetected;         // connection cycle between proxies detected for current remote server
 	int nextReplicationIndex;   // Which index in the cache map we should start looking for incomplete blocks at
 	                            // If BGR == BGR_HASHBLOCK, -1 means "currently no incomplete block"
-	uint64_t replicationHandle; // Handle of pending replication request
 	atomic_uint_fast64_t bytesReceived; // Number of bytes received by the uplink since startup.
 	atomic_uint_fast64_t bytesReceivedLastSave; // Number of bytes received when we last saved the cache map
 	int queueLen;               // length of queue
 	uint32_t idleTime;          // How many seconds the uplink was idle (apart from keep-alives)
-	dnbd3_queued_request_t queue[SERVER_MAX_UPLINK_QUEUE];
+	dnbd3_queue_entry_t *queue;
+	atomic_uint_fast32_t queueId;
 	dnbd3_alt_local_t altData[SERVER_MAX_ALTS];
 };
 
@@ -156,6 +165,7 @@ struct _dnbd3_client
 	atomic_uint_fast64_t bytesSent;   // Byte counter for this client.
 	dnbd3_image_t * _Atomic image;    // Image in use by this client, or NULL during handshake
 	int sock;
+	_Atomic uint8_t relayedCount;     // How many requests are in-flight to the uplink server
 	bool isServer;                    // true if a server in proxy mode, false if real client
 	dnbd3_host_t host;
 	char hostName[HOSTNAMELEN];       // inet_ntop version of host
@@ -242,6 +252,11 @@ extern atomic_int _backgroundReplication;
  */
 extern atomic_int _bgrMinClients;
 
+/**
+ * How many in-flight replication requests we should target (per uplink)
+ */
+extern atomic_int _bgrWindowSize;
+
 /**
  * (In proxy mode): If connecting client is a proxy, and the requested image
  * is not known locally, should we ask our known alt servers for it?
diff --git a/src/server/image.c b/src/server/image.c
index 86b6374..81ec479 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -340,7 +340,6 @@ dnbd3_image_t* image_byId(int imgId)
 dnbd3_image_t* image_get(char *name, uint16_t revision, bool ensureFdOpen)
 {
 	int i;
-	const char *removingText = _removeMissingImages ? ", removing from list" : "";
 	dnbd3_image_t *candidate = NULL;
 	// Simple sanity check
 	const size_t slen = strlen( name );
@@ -1895,7 +1894,7 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 			// We're not replicating this image, if there's a cache map, reload
 			// it periodically, since we might read from a shared storage that
 			// another server instance is writing to.
-			if ( full || !cache->unchanged && !image->problem.read ) {
+			if ( full || ( !cache->unchanged && !image->problem.read ) ) {
 				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
 				dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
 				if ( onDisk == NULL ) {
diff --git a/src/server/image.h b/src/server/image.h
index 4614c74..b23711b 100644
--- a/src/server/image.h
+++ b/src/server/image.h
@@ -51,6 +51,50 @@ bool image_ensureDiskSpaceLocked(uint64_t size, bool force);
 
 bool image_saveCacheMap(dnbd3_image_t *image);
 
+/**
+ * Check if given range is cached. Be careful when using this function because:
+ * 1) you need to hold a reference to the cache map
+ * 2) start and end are assumed to be 4k aligned
+ * 3) start and end are not checked to be in bounds (we don't know the image in this context)
+ */
+static inline bool image_isRangeCachedUnsafe(dnbd3_cache_map_t *cache, uint64_t start, uint64_t end)
+{
+	const uint64_t firstByteInMap = start >> 15;
+	const uint64_t lastByteInMap = (end - 1) >> 15;
+	const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
+	const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
+	uint64_t pos;
+	uint8_t b;
+	bool isCached;
+	if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
+		b = cache->map[firstByteInMap];
+		isCached = ( b & ( fb & lb ) ) == ( fb & lb );
+	} else {
+		isCached = true;
+		atomic_thread_fence( memory_order_acquire );
+		// First byte
+		if ( isCached ) {
+			b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
+			isCached = ( ( b & fb ) == fb );
+		}
+		// Last byte
+		if ( isCached ) {
+			b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
+			isCached = ( ( b & lb ) == lb );
+		}
+		// Middle, must be all bits set (0xff)
+		if ( isCached ) {
+			for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
+				if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
+					isCached = false;
+					break;
+				}
+			}
+		}
+	}
+	return isCached;
+}
+
 // one byte in the map covers 8 4kib blocks, so 32kib per byte
 // "+ (1 << 15) - 1" is required to account for the last bit of
 // the image that is smaller than 32kib
diff --git a/src/server/net.c b/src/server/net.c
index 954cb8a..9ba9dbc 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -197,6 +197,7 @@ void* net_handleNewConnection(void *clientPtr)
 	client->hostName[HOSTNAMELEN-1] = '\0';
 	mutex_unlock( &client->lock );
 	client->bytesSent = 0;
+	client->relayedCount = 0;
 
 	if ( !addToList( client ) ) {
 		freeClientStruct( client );
@@ -344,41 +345,18 @@ void* net_handleNewConnection(void *clientPtr)
 					// This is a proxyed image, check if we need to relay the request...
 					const uint64_t start = offset & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 					const uint64_t end = (offset + request.size + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-					const uint64_t firstByteInMap = start >> 15;
-					const uint64_t lastByteInMap = (end - 1) >> 15;
-					const uint8_t fb = (uint8_t)(0xff << ((start >> 12) & 7));
-					const uint8_t lb = (uint8_t)(~(0xff << ((((end - 1) >> 12) & 7) + 1)));
-					uint64_t pos;
-					uint8_t b;
-					bool isCached;
-					if ( firstByteInMap == lastByteInMap ) { // Single byte to check, much simpler
-						b = cache->map[firstByteInMap];
-						isCached = ( b & ( fb & lb ) ) == ( fb & lb );
-					} else {
-						isCached = true;
-						atomic_thread_fence( memory_order_acquire );
-						// First byte
-						if ( isCached ) {
-							b = atomic_load_explicit( &cache->map[firstByteInMap], memory_order_relaxed );
-							isCached = ( ( b & fb ) == fb );
-						}
-						// Last byte
-						if ( isCached ) {
-							b = atomic_load_explicit( &cache->map[lastByteInMap], memory_order_relaxed );
-							isCached = ( ( b & lb ) == lb );
-						}
-						// Middle, must be all bits set (0xff)
-						if ( isCached ) {
-							for ( pos = firstByteInMap + 1; pos < lastByteInMap; ++pos ) {
-								if ( atomic_load_explicit( &cache->map[pos], memory_order_relaxed ) != 0xff ) {
-									isCached = false;
-									break;
-								}
+					if ( !image_isRangeCachedUnsafe( cache, start, end ) ) {
+						if ( unlikely( client->relayedCount > 250 ) ) {
+							logadd( LOG_DEBUG1, "Client is overloading uplink; throttling" );
+							for ( int i = 0; i < 100 && client->relayedCount > 200; ++i ) {
+								usleep( 10000 );
+							}
+							if ( client->relayedCount > 250 ) {
+								logadd( LOG_WARNING, "Could not lower client's uplink backlog; dropping client" );
+								goto exit_client_cleanup;
 							}
 						}
-					}
-					if ( !isCached ) {
-						if ( !uplink_request( client, request.handle, offset, request.size, request.hops ) ) {
+						if ( !uplink_request( NULL, client, request.handle, offset, request.size, request.hops ) ) {
 							logadd( LOG_DEBUG1, "Could not relay uncached request from %s to upstream proxy for image %s:%d",
 									client->hostName, image->name, image->rid );
 							goto exit_client_cleanup;
diff --git a/src/server/reference.h b/src/server/reference.h
index 4eda546..75a681f 100644
--- a/src/server/reference.h
+++ b/src/server/reference.h
@@ -39,6 +39,11 @@ static inline ref *ref_get( weakref *weakref )
 	return ref;
 }
 
+static inline void ref_inc( ref *ref )
+{
+	++ref->count;
+}
+
 static inline void ref_put( ref *ref )
 {
 	if ( --ref->count == 0 ) {
diff --git a/src/server/uplink.c b/src/server/uplink.c
index 7c7cd1c..188bf06 100644
--- a/src/server/uplink.c
+++ b/src/server/uplink.c
@@ -8,6 +8,7 @@
 #include "../shared/protocol.h"
 #include "../shared/timing.h"
 #include "../shared/crc32.h"
+#include "threadpool.h"
 #include "reference.h"
 
 #include <assert.h>
@@ -21,30 +22,6 @@
 #define MAP_BYTES_PER_HASH_BLOCK (int)( HASH_BLOCK_SIZE / FILE_BYTES_PER_MAP_BYTE )
 #define MAP_INDEX_HASH_START_MASK ( ~(int)( MAP_BYTES_PER_HASH_BLOCK - 1 ) )
 
-#define REP_NONE ( (uint64_t)0xffffffffffffffff )
-
-// Status of request in queue
-
-// Slot is free, can be used.
-// Must only be set in uplink_handle_receive() or uplink_remove_client()
-#define ULR_FREE 0
-// Slot has been filled with a request that hasn't been sent to the upstream server yet, matching request can safely rely on reuse.
-// Must only be set in uplink_request()
-#define ULR_NEW 1
-// Slot is occupied, reply has not yet been received, matching request can safely rely on reuse.
-// Must only be set in uplink_mainloop() or uplink_request()
-#define ULR_PENDING 2
-// Slot is being processed, do not consider for hop on.
-// Must only be set in uplink_handle_receive()
-#define ULR_PROCESSING 3
-
-static const char *const NAMES_ULR[4] = {
-	[ULR_FREE] = "ULR_FREE",
-	[ULR_NEW] = "ULR_NEW",
-	[ULR_PENDING] = "ULR_PENDING",
-	[ULR_PROCESSING] = "ULR_PROCESSING",
-};
-
 static atomic_uint_fast64_t totalBytesReceived = 0;
 
 static void cancelAllRequests(dnbd3_uplink_t *uplink);
@@ -59,6 +36,15 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink);
 static bool uplink_reopenCacheFd(dnbd3_uplink_t *uplink, const bool force);
 static bool uplink_connectionShouldShutdown(dnbd3_uplink_t *uplink);
 static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew);
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink);
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle);
+static void *prefetchForClient(void *data);
+
+typedef struct {
+	dnbd3_uplink_t *uplink;
+	uint64_t start;
+	uint32_t length;
+} prefetch_request_t;
 
 // ############ uplink connection handling
 
@@ -106,6 +92,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 	uplink->bytesReceived = 0;
 	uplink->bytesReceivedLastSave = 0;
 	uplink->idleTime = SERVER_UPLINK_IDLE_TIMEOUT - 90;
+	uplink->queue = NULL;
 	uplink->queueLen = 0;
 	uplink->cacheFd = -1;
 	uplink->signal = signal_new();
@@ -113,7 +100,6 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 		logadd( LOG_WARNING, "Error creating signal. Uplink unavailable." );
 		goto failure;
 	}
-	uplink->replicationHandle = REP_NONE;
 	mutex_lock( &uplink->rttLock );
 	mutex_lock( &uplink->sendMutex );
 	uplink->current.fd = -1;
@@ -175,9 +161,9 @@ bool uplink_shutdown(dnbd3_image_t *image)
 	}
 	cancelAllRequests( uplink );
 	ref_setref( &image->uplinkref, NULL );
-	ref_put( &uplink->reference );
 	mutex_unlock( &uplink->queueLock );
 	bool retval = ( exp && image->users == 0 );
+	ref_put( &uplink->reference );
 	mutex_unlock( &image->lock );
 	return retval;
 }
@@ -188,12 +174,21 @@ bool uplink_shutdown(dnbd3_image_t *image)
  */
 static void cancelAllRequests(dnbd3_uplink_t *uplink)
 {
-	for ( int i = 0; i < uplink->queueLen; ++i ) {
-		if ( uplink->queue[i].status != ULR_FREE ) {
-			net_sendReply( uplink->queue[i].client, CMD_ERROR, uplink->queue[i].handle );
-			uplink->queue[i].status = ULR_FREE;
+	dnbd3_queue_entry_t *it = uplink->queue;
+	while ( it != NULL ) {
+		dnbd3_queue_client_t *cit = it->clients;
+		while ( cit != NULL ) {
+			net_sendReply( cit->client, CMD_ERROR, cit->handle );
+			cit->client->relayedCount--;
+			dnbd3_queue_client_t *next = cit->next;
+			free( cit );
+			cit = next;
 		}
+		dnbd3_queue_entry_t *next = it->next;
+		free( it );
+		it = next;
 	}
+	uplink->queue = NULL;
 	uplink->queueLen = 0;
 	uplink->image->problem.queue = false;
 }
@@ -234,39 +229,54 @@ static void uplink_free(ref *ref)
  */
 void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client)
 {
+	if ( client->relayedCount == 0 ) // Counter is zero, nothing to remove from the queue
+		return;
 	mutex_lock( &uplink->queueLock );
-	for (int i = uplink->queueLen - 1; i >= 0; --i) {
-		if ( uplink->queue[i].client == client ) {
-			// Make sure client doesn't get destroyed while we're sending it data
-			mutex_lock( &client->sendMutex );
-			mutex_unlock( &client->sendMutex );
-			uplink->queue[i].client = NULL;
-			uplink->queue[i].status = ULR_FREE;
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		for ( dnbd3_queue_client_t **cit = &it->clients; *cit != NULL; ) {
+			if ( (**cit).client == client ) {
+				--client->relayedCount;
+				dnbd3_queue_client_t *entry = *cit;
+				*cit = (**cit).next; // Unlink; *cit now refers to the successor, so do not advance
+				free( entry );
+			} else { cit = &(**cit).next; } // Advance only when nothing was unlinked
 		}
-		if ( uplink->queue[i].client == NULL && uplink->queueLen == i + 1 ) uplink->queueLen--;
 	}
 	mutex_unlock( &uplink->queueLock );
+	if ( unlikely( client->relayedCount != 0 ) ) {
+		logadd( LOG_DEBUG1, "Client has relayedCount == %"PRIu8" on disconnect..", client->relayedCount );
+		int i;
+		for ( i = 0; i < 1000 && client->relayedCount != 0; ++i ) { // Poll up to 1000 times, 10 ms apart
+			usleep( 10000 );
+		}
+		if ( client->relayedCount != 0 ) {
+			logadd( LOG_WARNING, "Client relayedCount still %"PRIu8" after sleeping!", client->relayedCount );
+		}
+	}
 }
 
 /**
- * Request a chunk of data through an uplink server
- * Locks on: image.lock, uplink.queueLock
+ * Request a chunk of data through an uplink server. Either uplink or client has to be non-NULL.
+ * If client is NULL, this is assumed to be a background replication request.
+ * Locks on: uplink.queueLock, uplink.sendMutex
  */
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
+bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops)
 {
-	if ( client == NULL || client->image == NULL )
-		return false;
+	bool getUplink = ( uplink == NULL );
+	assert( client != NULL || uplink != NULL );
 	if ( length > (uint32_t)_maxPayload ) {
 		logadd( LOG_WARNING, "Cannot relay request by client; length of %" PRIu32 " exceeds maximum payload", length );
 		return false;
 	}
-	dnbd3_uplink_t * uplink = ref_get_uplink( &client->image->uplinkref );
-	if ( unlikely( uplink == NULL ) ) {
-		uplink_init( client->image, -1, NULL, -1 );
+	if ( getUplink ) {
 		uplink = ref_get_uplink( &client->image->uplinkref );
-		if ( uplink == NULL ) {
-			logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
-			return false;
+		if ( unlikely( uplink == NULL ) ) {
+			uplink_init( client->image, -1, NULL, -1 );
+			uplink = ref_get_uplink( &client->image->uplinkref );
+			if ( uplink == NULL ) {
+				logadd( LOG_DEBUG1, "Uplink request for image with no uplink" );
+				return false;
+			}
 		}
 	}
 	if ( uplink->shutdown ) {
@@ -275,163 +285,179 @@ bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uin
 	}
 	// Check if the client is the same host as the uplink. If so assume this is a circular proxy chain
 	// This might be a false positive if there are multiple instances running on the same host (IP)
-	if ( hops != 0 && isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
+	if ( client != NULL && hops != 0
+			&& isSameAddress( altservers_indexToHost( uplink->current.index ), &client->host ) ) {
 		uplink->cycleDetected = true;
 		signal_call( uplink->signal );
 		logadd( LOG_WARNING, "Proxy cycle detected (same host)." );
 		goto fail_ref;
 	}
 
-	int foundExisting = -1; // Index of a pending request that is a superset of our range, -1 otherwise
-	int existingType = -1; // ULR_* type of existing request
-	int i;
-	int freeSlot = -1;
-	int firstUsedSlot = -1;
-	bool requestLoop = false;
-	const uint64_t end = start + length;
-
-	mutex_lock( &uplink->queueLock );
-	if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
-		goto fail_lock;
-	}
-	for (i = 0; i < uplink->queueLen; ++i) {
-		// find free slot to place this request into
-		if ( uplink->queue[i].status == ULR_FREE ) {
-			if ( freeSlot == -1 || existingType != ULR_PROCESSING ) {
-				freeSlot = i;
-			}
-			continue;
-		}
-		if ( firstUsedSlot == -1 ) {
-			firstUsedSlot = i;
-		}
-		// find existing request to attach to
-		if ( uplink->queue[i].from > start || uplink->queue[i].to < end )
-			continue; // Range not suitable
-		// Detect potential proxy cycle. New request hopcount is greater, range is same, old request has already been sent -> suspicious
-		if ( hops > uplink->queue[i].hopCount && uplink->queue[i].from == start && uplink->queue[i].to == end && uplink->queue[i].status == ULR_PENDING ) {
-			requestLoop = true;
-			break;
-		}
-		if ( foundExisting == -1 || existingType == ULR_PROCESSING ) {
-			foundExisting = i;
-			existingType = uplink->queue[i].status;
-		}
-	}
-	if ( unlikely( requestLoop ) ) {
-		uplink->cycleDetected = true;
-		signal_call( uplink->signal );
-		logadd( LOG_WARNING, "Rejecting relay of request to upstream proxy because of possible cyclic proxy chain. Incoming hop-count is %" PRIu8 ".", hops );
-		goto fail_lock;
-	}
-	if ( freeSlot < firstUsedSlot && firstUsedSlot < 10 && existingType != ULR_PROCESSING ) {
-		freeSlot = -1; // Not attaching to existing request, make it use a higher slot
-	}
-	if ( freeSlot == -1 ) {
-		if ( uplink->queueLen >= SERVER_MAX_UPLINK_QUEUE ) {
-			logadd( LOG_WARNING, "Uplink queue is full, consider increasing SERVER_MAX_UPLINK_QUEUE. Dropping client..." );
+	struct {
+		uint64_t handle, start, end;
+	} req;
+	do {
+		const uint64_t end = start + length;
+		dnbd3_queue_entry_t *request = NULL, *last = NULL;
+		bool isNew;
+		mutex_lock( &uplink->queueLock );
+		if ( uplink->shutdown ) { // Check again after locking to prevent lost requests
 			goto fail_lock;
 		}
-		freeSlot = uplink->queueLen++;
-		if ( freeSlot > SERVER_UPLINK_QUEUELEN_THRES ) {
-			uplink->image->problem.queue = true;
+		for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+			if ( it->from <= start && it->to >= end ) {
+				// Matching range, attach
+				request = it;
+				break;
+			}
+			if ( it->next == NULL ) {
+				// Not matching, last in list, remember
+				last = it;
+				break;
+			}
 		}
-	}
-	// Do not send request to uplink server if we have a matching pending request AND the request either has the
-	// status ULR_NEW/PENDING OR we found a free slot with LOWER index than the one we attach to. Otherwise
-	// explicitly send this request to the uplink server. The second condition mentioned here is to prevent
-	// a race condition where the reply for the outstanding request already arrived and the uplink thread
-	// is currently traversing the request queue. As it is processing the queue from highest to lowest index, it might
-	// already have passed the index of the free slot we determined, but not reached the existing request we just found above.
-	if ( foundExisting != -1 && existingType == ULR_PROCESSING && freeSlot > foundExisting ) {
-		foundExisting = -1; // -1 means "send request"
-	}
-#ifdef _DEBUG
-	if ( foundExisting != -1 ) {
-		logadd( LOG_DEBUG2, "%p (%s) Found existing request of type %s at slot %d, attaching in slot %d.\n", (void*)uplink, uplink->image->name, NAMES_ULR[existingType], foundExisting, freeSlot );
-		logadd( LOG_DEBUG2, "Original %" PRIu64 "-%" PRIu64 " (%p)\n"
-				"New      %" PRIu64 "-%" PRIu64 " (%p)\n",
-				uplink->queue[foundExisting].from, uplink->queue[foundExisting].to, (void*)uplink->queue[foundExisting].client,
-				start, end, (void*)client );
-	}
-#endif
-	// Fill structure
-	uplink->queue[freeSlot].from = start;
-	uplink->queue[freeSlot].to = end;
-	uplink->queue[freeSlot].handle = handle;
-	uplink->queue[freeSlot].client = client;
-	//int old = uplink->queue[freeSlot].status;
-	uplink->queue[freeSlot].status = ( foundExisting == -1 ? ULR_NEW :
-			( existingType == ULR_NEW ? ULR_PENDING : existingType ) );
-	uplink->queue[freeSlot].hopCount = hops;
+		dnbd3_queue_client_t **c;
+		if ( request == NULL ) {
+			// No existing request to attach to
+			if ( uplink->queueLen >= UPLINK_MAX_QUEUE ) {
+				logadd( LOG_WARNING, "Uplink queue is full, consider increasing UPLINK_MAX_QUEUE. Dropping client..." );
+				goto fail_lock;
+			}
+			uplink->queueLen++;
+			if ( uplink->queueLen > SERVER_UPLINK_QUEUELEN_THRES ) {
+				uplink->image->problem.queue = true;
+			}
+			request = malloc( sizeof(*request) );
+			if ( last == NULL ) {
+				uplink->queue = request;
+			} else {
+				last->next = request;
+			}
+			request->next = NULL;
+			request->handle = ++uplink->queueId;
+			request->from = start & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
+			request->to = (end + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
 #ifdef _DEBUG
-	timing_get( &uplink->queue[freeSlot].entered );
-	//logadd( LOG_DEBUG2 %p] Inserting request at slot %d, was %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 "\n", (void*)uplink, freeSlot, old, uplink->queue[freeSlot].status, uplink->queue[freeSlot, ".handle, start, end );
+			timing_get( &request->entered );
 #endif
-	mutex_unlock( &uplink->queueLock );
+			request->hopCount = hops;
+			request->sent = true; // Optimistic; would be set to false on failure
+			if ( client == NULL ) {
+				// BGR
+				request->clients = NULL;
+			} else {
+				c = &request->clients;
+			}
+			isNew = true;
+		} else if ( client == NULL ) {
+			// Replication request that matches existing request. Do nothing
+			isNew = false;
+		} else {
+			// Existing request. Check if potential cycle
+			if ( hops > request->hopCount + 1 && request->from == start && request->to == end ) {
+				logadd( LOG_DEBUG1, "Request cycle detected on uplink for %s:%d", PIMG(uplink->image) );
+				goto fail_lock;
+			}
+			// Count number of clients, get tail of list
+			int count = 0;
+			c = &request->clients;
+			while ( *c != NULL ) {
+				c = &(**c).next;
+				if ( ++count >= UPLINK_MAX_CLIENTS_PER_REQUEST ) {
+					logadd( LOG_DEBUG2, "Won't accept more than %d clients per request, dropping client", count );
+					goto fail_lock;
+				}
+			}
+			isNew = false;
+		}
+		req.handle = request->handle;
+		req.start = request->from;
+		req.end = request->to;
+		if ( client != NULL ) {
+			*c = malloc( sizeof( *request->clients ) );
+			(**c).next = NULL;
+			(**c).handle = handle;
+			(**c).from = start;
+			(**c).to = end;
+			(**c).client = client;
+			client->relayedCount++;
+		}
+		mutex_unlock( &uplink->queueLock );
 
-	if ( foundExisting != -1 ) {
-		ref_put( &uplink->reference );
-		return true; // Attached to pending request, do nothing
-	}
+		if ( !isNew ) {
+			goto success_ref; // Attached to pending request, do nothing
+		}
+	} while (0);
 
-	// See if we can fire away the request
-	if ( unlikely( mutex_trylock( &uplink->sendMutex ) != 0 ) ) {
-		logadd( LOG_DEBUG2, "Could not trylock send mutex, queueing uplink request" );
+	// Fire away the request
+	mutex_lock( &uplink->sendMutex );
+	if ( unlikely( uplink->current.fd == -1 ) ) {
+		uplink->image->problem.uplink = true;
+		markRequestUnsent( uplink, req.handle );
+		mutex_unlock( &uplink->sendMutex );
+		logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
 	} else {
-		if ( unlikely( uplink->current.fd == -1 ) ) {
+		if ( hops < 200 ) ++hops;
+		const bool ret = dnbd3_get_block( uplink->current.fd, req.start, req.end - req.start,
+				req.handle, COND_HOPCOUNT( uplink->current.version, hops ) );
+		if ( unlikely( !ret ) ) {
+			markRequestUnsent( uplink, req.handle );
 			uplink->image->problem.uplink = true;
 			mutex_unlock( &uplink->sendMutex );
-			logadd( LOG_DEBUG2, "Cannot do direct uplink request: No socket open" );
+			logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing (%"PRIu64")", req.handle );
 		} else {
-			const uint64_t reqStart = uplink->queue[freeSlot].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-			const uint32_t reqSize = (uint32_t)(((uplink->queue[freeSlot].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
-			if ( hops < 200 ) ++hops;
-			const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-			if ( unlikely( !ret ) ) {
-				uplink->image->problem.uplink = true;
-				mutex_unlock( &uplink->sendMutex );
-				logadd( LOG_DEBUG2, "Could not send out direct uplink request, queueing" );
-			} else {
-				// Direct send succeeded, update queue entry from NEW to PENDING, so the request won't be sent again
-				int state;
-				mutex_unlock( &uplink->sendMutex );
-				mutex_lock( &uplink->queueLock );
-				if ( !uplink->shutdown && uplink->queue[freeSlot].handle == handle && uplink->queue[freeSlot].client == client ) {
-					state = uplink->queue[freeSlot].status;
-					if ( uplink->queue[freeSlot].status == ULR_NEW ) {
-						uplink->queue[freeSlot].status = ULR_PENDING;
-					}
-				} else {
-					state = -1;
-				}
-				mutex_unlock( &uplink->queueLock );
-				if ( state == -1 ) {
-					logadd( LOG_DEBUG2, "Direct uplink request queue entry gone after sending and re-locking queue. *shrug*" );
-				} else if ( state == ULR_NEW ) {
-					//logadd( LOG_DEBUG2, "Direct uplink request" );
-				} else {
-					logadd( LOG_DEBUG2, "Direct uplink request queue entry changed to %s afte sending (expected ULR_NEW).", NAMES_ULR[uplink->queue[freeSlot].status] );
-				}
-				ref_put( &uplink->reference );
-				return true;
-			}
-			// Fall through to waking up sender thread
+			// OK
+			mutex_unlock( &uplink->sendMutex );
+			goto success_ref;
 		}
+		// Fall through to waking up sender thread
 	}
 
 	if ( signal_call( uplink->signal ) == SIGNAL_ERROR ) {
 		logadd( LOG_WARNING, "Cannot wake up uplink thread; errno=%d", (int)errno );
 	}
-	ref_put( &uplink->reference );
+
+success_ref:
+	if ( client != NULL ) {
+		// Was from client -- potential prefetch
+		uint32_t len = MIN( uplink->image->virtualFilesize - req.end, req.end - req.start );
+		if ( len > 0 ) {
+			prefetch_request_t *job = malloc( sizeof( *job ) );
+			job->start = req.end;
+			job->length = len;
+			job->uplink = uplink;
+			ref_inc( &uplink->reference ); // Hold one for the thread, thread will return it
+			threadpool_run( &prefetchForClient, (void*)job );
+		}
+	}
+	if ( getUplink ) {
+		ref_put( &uplink->reference );
+	}
 	return true;
 fail_lock:
 	mutex_unlock( &uplink->queueLock );
 fail_ref:
-	ref_put( &uplink->reference );
+	if ( getUplink ) {
+		ref_put( &uplink->reference );
+	}
 	return false;
 }
 
+static void *prefetchForClient(void *data)
+{
+	prefetch_request_t *job = (prefetch_request_t*)data;
+	dnbd3_cache_map_t *cache = ref_get_cachemap( job->uplink->image );
+	if ( cache != NULL ) {
+		if ( !image_isRangeCachedUnsafe( cache, job->start, job->start + job->length ) ) {
+			uplink_request( job->uplink, NULL, ++job->uplink->queueId, job->start, job->length, 0 );
+		}
+		ref_put( &cache->reference );
+	}
+	ref_put( &job->uplink->reference );
+	free( job );
+	return NULL;
+}
+
 /**
  * Uplink thread.
  * Locks are irrelevant as this is never called from another function
@@ -443,7 +469,7 @@ static void* uplink_mainloop(void *data)
 #define EV_COUNT  (2)
 	struct pollfd events[EV_COUNT];
 	dnbd3_uplink_t * const uplink = (dnbd3_uplink_t*)data;
-	int numSocks, i, waitTime;
+	int numSocks, waitTime;
 	int altCheckInterval = SERVER_RTT_INTERVAL_INIT;
 	int rttTestResult;
 	uint32_t discoverFailCount = 0;
@@ -478,7 +504,7 @@ static void* uplink_mainloop(void *data)
 			declare_now;
 			waitTime = (int)timing_diffMs( &now, &nextAltCheck );
 			if ( waitTime < 100 ) waitTime = 100;
-			if ( waitTime > 10000 ) waitTime = 10000;
+			else if ( waitTime > 10000 ) waitTime = 10000;
 		}
 		events[EV_SOCKET].fd = uplink->current.fd;
 		numSocks = poll( events, EV_COUNT, waitTime );
@@ -505,7 +531,6 @@ static void* uplink_mainloop(void *data)
 			mutex_unlock( &uplink->rttLock );
 			discoverFailCount = 0;
 			if ( fd != -1 ) close( fd );
-			uplink->replicationHandle = REP_NONE;
 			uplink->image->problem.uplink = false;
 			uplink->replicatedLastBlock = false; // Reset this to be safe - request could've been sent but reply was never received
 			buffer[0] = '@';
@@ -559,11 +584,11 @@ static void* uplink_mainloop(void *data)
 		}
 		declare_now;
 		uint32_t timepassed = timing_diff( &lastKeepalive, &now );
-		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL ) {
+		if ( timepassed >= SERVER_UPLINK_KEEPALIVE_INTERVAL || ( timepassed >= 2 && uplink->idleTime < _bgrWindowSize ) ) {
 			lastKeepalive = now;
 			uplink->idleTime += timepassed;
 			// Keep-alive
-			if ( uplink->current.fd != -1 && uplink->replicationHandle == REP_NONE ) {
+			if ( uplink->current.fd != -1 && uplink->queueLen < _bgrWindowSize ) {
 				// Send keep-alive if nothing is happening, and try to trigger background rep.
 				if ( !uplink_sendKeepalive( uplink ) || !uplink_sendReplicationRequest( uplink ) ) {
 					uplink_connectionFailed( uplink, true );
@@ -612,19 +637,16 @@ static void* uplink_mainloop(void *data)
 			ticks deadline;
 			timing_set( &deadline, &now, -10 );
 			mutex_lock( &uplink->queueLock );
-			for (i = 0; i < uplink->queueLen; ++i) {
-				if ( uplink->queue[i].status != ULR_FREE && timing_reached( &uplink->queue[i].entered, &deadline ) ) {
-					snprintf( buffer, sizeof(buffer), "[DEBUG %p] Starving request slot %d detected:\n"
-							"%s\n(from %" PRIu64 " to %" PRIu64 ", status: %d)\n", (void*)uplink, i, uplink->queue[i].client->image->name,
-							uplink->queue[i].from, uplink->queue[i].to, uplink->queue[i].status );
-					uplink->queue[i].entered = now;
+			for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+				if ( timing_reached( &it->entered, &deadline ) ) {
+					logadd( LOG_WARNING, "Starving request detected:"
+							" (from %" PRIu64 " to %" PRIu64 ", sent: %d) %s:%d",
+							it->from, it->to, (int)it->sent, PIMG(uplink->image) );
+					it->entered = now;
 #ifdef _DEBUG_RESEND_STARVING
-					uplink->queue[i].status = ULR_NEW;
+					it->sent = false;
 					resend = true;
 #endif
-					mutex_unlock( &uplink->queueLock );
-					logadd( LOG_WARNING, "%s", buffer );
-					mutex_lock( &uplink->queueLock );
 				}
 			}
 			mutex_unlock( &uplink->queueLock );
@@ -667,37 +689,54 @@ cleanup: ;
  */
 static void uplink_sendRequests(dnbd3_uplink_t *uplink, bool newOnly)
 {
-	// Scan for new requests
-	int j;
+	// Scan for new requests, or optionally, (re)send all
+	// Build a buffer, so if there aren't too many requests, we can send them after
+	// unlocking the queue again. Otherwise we need flushes during iteration, which
+	// is not ideal, but in that case the uplink is probably overwhelmed anyways.
+	// Try 125 as that's exactly 3000 bytes, usually 2*MTU.
+#define MAX_RESEND_BATCH 125
+	dnbd3_request_t reqs[MAX_RESEND_BATCH];
+	int count = 0;
 	mutex_lock( &uplink->queueLock );
-	for (j = 0; j < uplink->queueLen; ++j) {
-		if ( uplink->queue[j].status != ULR_NEW && (newOnly || uplink->queue[j].status != ULR_PENDING) ) continue;
-		uplink->queue[j].status = ULR_PENDING;
-		uint8_t hops = uplink->queue[j].hopCount;
-		const uint64_t reqStart = uplink->queue[j].from & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1);
-		const uint32_t reqSize = (uint32_t)(((uplink->queue[j].to + DNBD3_BLOCK_SIZE - 1) & ~(uint64_t)(DNBD3_BLOCK_SIZE - 1)) - reqStart);
-		/*
-		logadd( LOG_DEBUG2, "[%p] Sending slot %d, now %d, handle %" PRIu64 ", Range: %" PRIu64 "-%" PRIu64 " (%" PRIu64 "-%" PRIu64 ")",
-				(void*)uplink, j, uplink->queue[j].status, uplink->queue[j].handle, uplink->queue[j].from, uplink->queue[j].to, reqStart, reqStart+reqSize );
-		*/
-		mutex_unlock( &uplink->queueLock );
-		if ( hops < 200 ) ++hops;
-		mutex_lock( &uplink->sendMutex );
-		const bool ret = dnbd3_get_block( uplink->current.fd, reqStart, reqSize, reqStart, COND_HOPCOUNT( uplink->current.version, hops ) );
-		if ( likely( ret ) ) {
-			mutex_unlock( &uplink->sendMutex );
-		} else {
-			// Non-critical - if the connection dropped or the server was changed
-			// the thread will re-send this request as soon as the connection
-			// is reestablished.
-			uplink->image->problem.uplink = true;
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		if ( newOnly && it->sent )
+			continue;
+		it->sent = true;
+		dnbd3_request_t *hdr = &reqs[count++];
+		hdr->magic = dnbd3_packet_magic;
+		hdr->cmd = CMD_GET_BLOCK;
+		hdr->size = it->to - it->from;
+		hdr->offset_small = it->from;
+		hdr->hops = it->hopCount + 1;
+		hdr->handle = it->handle;
+		fixup_request( *hdr );
+		if ( count == MAX_RESEND_BATCH ) {
+			bool ok = false;
+			logadd( LOG_DEBUG2, "BLOCKING resend of %d", count );
+			count = 0;
+			mutex_lock( &uplink->sendMutex );
+			if ( uplink->current.fd != -1 ) {
+				ok = ( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH, 3 )
+						== DNBD3_REQUEST_SIZE * MAX_RESEND_BATCH );
+			}
 			mutex_unlock( &uplink->sendMutex );
-			logadd( LOG_DEBUG1, "Error forwarding request to uplink server!\n" );
-			return;
+			if ( !ok ) {
+				uplink->image->problem.uplink = true;
+				break;
+			}
 		}
-		mutex_lock( &uplink->queueLock );
 	}
 	mutex_unlock( &uplink->queueLock );
+	if ( count != 0 ) {
+		mutex_lock( &uplink->sendMutex );
+		if ( uplink->current.fd != -1 ) {
+			uplink->image->problem.uplink =
+				( sock_sendAll( uplink->current.fd, reqs, DNBD3_REQUEST_SIZE * count, 3 )
+					!= DNBD3_REQUEST_SIZE * count );
+		}
+		mutex_unlock( &uplink->sendMutex );
+	}
+#undef MAX_RESEND_BATCH
 }
 
 /**
@@ -720,71 +759,73 @@ static bool uplink_sendReplicationRequest(dnbd3_uplink_t *uplink)
 		return false; // Should never be called in this state, consider send error
 	if ( _backgroundReplication == BGR_DISABLED || uplink->cacheFd == -1 )
 		return true; // Don't do background replication
-	if ( uplink->nextReplicationIndex == -1 || uplink->replicationHandle != REP_NONE )
-		return true; // Already a replication request on the wire, or no more blocks to replicate
+	if ( uplink->nextReplicationIndex == -1 )
+		return true; // No more blocks to replicate
 	dnbd3_image_t * const image = uplink->image;
 	if ( image->users < _bgrMinClients )
 		return true; // Not enough active users
+	const int numNewRequests = numWantedReplicationRequests( uplink );
+	if ( numNewRequests <= 0 )
+		return true; // Already sufficient amount of requests on the wire
 	dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-	if ( cache == NULL || image->users ) {
+	if ( cache == NULL ) {
 		// No cache map (=image complete)
-		ref_put( &cache->reference );
 		return true;
 	}
 	const int mapBytes = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
 	const int lastBlockIndex = mapBytes - 1;
-	int endByte;
-	if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
-		endByte = uplink->nextReplicationIndex + mapBytes;
-	} else { // Hashblock based: Only look for match in current hash block
-		endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
-		if ( endByte > mapBytes ) {
-			endByte = mapBytes;
+	for ( int bc = 0; bc < numNewRequests; ++bc ) {
+		int endByte;
+		if ( _backgroundReplication == BGR_FULL ) { // Full mode: consider all blocks
+			endByte = uplink->nextReplicationIndex + mapBytes;
+		} else { // Hashblock based: Only look for match in current hash block
+			endByte = ( uplink->nextReplicationIndex + MAP_BYTES_PER_HASH_BLOCK ) & MAP_INDEX_HASH_START_MASK;
+			if ( endByte > mapBytes ) {
+				endByte = mapBytes;
+			}
 		}
-	}
-	atomic_thread_fence( memory_order_acquire );
-	int replicationIndex = -1;
-	for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
-		const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
-		if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
-				&& ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
-			// Found incomplete one
-			replicationIndex = i;
+		atomic_thread_fence( memory_order_acquire );
+		int replicationIndex = -1;
+		for ( int j = uplink->nextReplicationIndex; j < endByte; ++j ) {
+			const int i = j % ( mapBytes ); // Wrap around for BGR_FULL
+			if ( atomic_load_explicit( &cache->map[i], memory_order_relaxed ) != 0xff
+					&& ( i != lastBlockIndex || !uplink->replicatedLastBlock ) ) {
+				// Found incomplete one
+				replicationIndex = i;
+				break;
+			}
+		}
+		if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
+			// Nothing left in current block, find next one
+			replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
+		}
+		if ( replicationIndex == -1 ) {
+			// Replication might be complete, uplink_mainloop should take care....
+			uplink->nextReplicationIndex = -1;
 			break;
 		}
+		const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
+		const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
+		const uint64_t handle = ++uplink->queueId;
+		if ( !uplink_request( uplink, NULL, handle, offset, size, 0 ) ) {
+			logadd( LOG_DEBUG1, "Error sending background replication request to uplink server (%s:%d)",
+					PIMG(uplink->image) );
+			ref_put( &cache->reference );
+			return false;
+		}
+		if ( replicationIndex == lastBlockIndex ) {
+			uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
+		}
+		uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
+		if ( _backgroundReplication == BGR_HASHBLOCK
+				&& uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
+			// Just crossed a hash block boundary, look for new candidate starting at this very index
+			uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
+			if ( uplink->nextReplicationIndex == -1 )
+				break;
+		}
 	}
 	ref_put( &cache->reference );
-	if ( replicationIndex == -1 && _backgroundReplication == BGR_HASHBLOCK ) {
-		// Nothing left in current block, find next one
-		replicationIndex = uplink_findNextIncompleteHashBlock( uplink, endByte );
-	}
-	if ( replicationIndex == -1 ) {
-		// Replication might be complete, uplink_mainloop should take care....
-		uplink->nextReplicationIndex = -1;
-		return true;
-	}
-	const uint64_t offset = (uint64_t)replicationIndex * FILE_BYTES_PER_MAP_BYTE;
-	uplink->replicationHandle = offset;
-	const uint32_t size = (uint32_t)MIN( image->virtualFilesize - offset, FILE_BYTES_PER_MAP_BYTE );
-	mutex_lock( &uplink->sendMutex );
-	bool sendOk = dnbd3_get_block( uplink->current.fd, offset, size, uplink->replicationHandle, COND_HOPCOUNT( uplink->current.version, 1 ) );
-	if ( likely( sendOk ) ) {
-		mutex_unlock( &uplink->sendMutex );
-	} else {
-		uplink->image->problem.uplink = true;
-		mutex_unlock( &uplink->sendMutex );
-		logadd( LOG_DEBUG1, "Error sending background replication request to uplink server!\n" );
-		return false;
-	}
-	if ( replicationIndex == lastBlockIndex ) {
-		uplink->replicatedLastBlock = true; // Special treatment, last byte in map could represent less than 8 blocks
-	}
-	uplink->nextReplicationIndex = replicationIndex + 1; // Remember last incomplete offset for next time so we don't play Schlemiel the painter
-	if ( _backgroundReplication == BGR_HASHBLOCK
-			&& uplink->nextReplicationIndex % MAP_BYTES_PER_HASH_BLOCK == 0 ) {
-		// Just crossed a hash block boundary, look for new candidate starting at this very index
-		uplink->nextReplicationIndex = uplink_findNextIncompleteHashBlock( uplink, uplink->nextReplicationIndex );
-	}
 	return true;
 }
 
@@ -845,7 +886,7 @@ static int uplink_findNextIncompleteHashBlock(dnbd3_uplink_t *uplink, const int
 static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 {
 	dnbd3_reply_t inReply, outReply;
-	int ret, i;
+	int ret;
 	for (;;) {
 		ret = dnbd3_read_reply( uplink->current.fd, &inReply, false );
 		if ( unlikely( ret == REPLY_INTR ) && likely( !_shutdown && !uplink->shutdown ) ) continue;
@@ -881,13 +922,34 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 		}
 		// Payload read completely
 		// Bail out if we're not interested
-		if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) ) continue;
+		if ( unlikely( inReply.cmd != CMD_GET_BLOCK ) )
+			continue;
 		// Is a legit block reply
-		struct iovec iov[2];
-		const uint64_t start = inReply.handle;
-		const uint64_t end = inReply.handle + inReply.size;
 		totalBytesReceived += inReply.size;
 		uplink->bytesReceived += inReply.size;
+		// Get entry from queue
+		dnbd3_queue_entry_t *entry;
+		mutex_lock( &uplink->queueLock );
+		for ( entry = uplink->queue; entry != NULL; entry = entry->next ) {
+			if ( entry->handle == inReply.handle )
+				break;
+		}
+		if ( entry == NULL ) {
+			mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+			logadd( LOG_DEBUG1, "Received block reply on uplink, but handle %"PRIu64" is unknown (%s:%d)",
+					inReply.handle, PIMG(uplink->image) );
+			continue;
+		}
+		const uint64_t start = entry->from;
+		const uint64_t end = entry->to;
+		mutex_unlock( &uplink->queueLock ); // Do not dereference pointer after unlock!
+		// We don't remove the entry from the list here yet, to slightly increase the chance of other
+		// clients attaching to this request while we write the data to disk
+		if ( end - start != inReply.size ) {
+			logadd( LOG_WARNING, "Received payload length does not match! (is: %"PRIu32", expect: %u, %s:%d)",
+					inReply.size, (unsigned int)( end - start ), PIMG(uplink->image) );
+		}
+		struct iovec iov[2];
 		// 1) Write to cache file
 		if ( unlikely( uplink->cacheFd == -1 ) ) {
 			uplink_reopenCacheFd( uplink, false );
@@ -934,98 +996,76 @@ static void uplink_handleReceive(dnbd3_uplink_t *uplink)
 						PIMG(uplink->image), err );
 			}
 		}
-		// 2) Figure out which clients are interested in it
-		// Mark as ULR_PROCESSING, since we unlock repeatedly in the second loop
-		// below; this prevents uplink_request() from attaching to this request
-		// by populating a slot with index greater than the highest matching
-		// request with ULR_PROCESSING (assuming there is no ULR_PENDING or ULR_NEW
-		// where it's fine if the index is greater)
+		bool found = false;
+		dnbd3_queue_entry_t **it;
 		mutex_lock( &uplink->queueLock );
-		for (i = 0; i < uplink->queueLen; ++i) {
-			dnbd3_queued_request_t * const req = &uplink->queue[i];
-			assert( req->status != ULR_PROCESSING );
-			if ( req->status != ULR_PENDING && req->status != ULR_NEW ) continue;
-			assert( req->client != NULL );
-			if ( req->from >= start && req->to <= end ) { // Match :-)
-				req->status = ULR_PROCESSING;
-			}
-		}
-		// 3) Send to interested clients - iterate backwards so request collaboration works, and
-		// so we can decrease queueLen on the fly while iterating. Should you ever change this to start
-		// from 0, you also need to change the "attach to existing request"-logic in uplink_request()
-		outReply.magic = dnbd3_packet_magic;
-		bool served = false;
-		for ( i = uplink->queueLen - 1; i >= 0; --i ) {
-			dnbd3_queued_request_t * const req = &uplink->queue[i];
-			if ( req->status == ULR_PROCESSING ) {
-				size_t bytesSent = 0;
-				assert( req->from >= start && req->to <= end );
-				dnbd3_client_t * const client = req->client;
-				outReply.cmd = CMD_GET_BLOCK;
-				outReply.handle = req->handle;
-				outReply.size = (uint32_t)( req->to - req->from );
-				iov[0].iov_base = &outReply;
-				iov[0].iov_len = sizeof outReply;
-				iov[1].iov_base = uplink->recvBuffer + (req->from - start);
-				iov[1].iov_len = outReply.size;
-				fixup_reply( outReply );
-				req->status = ULR_FREE;
-				req->client = NULL;
-				served = true;
-				mutex_lock( &client->sendMutex );
-				mutex_unlock( &uplink->queueLock );
-				if ( client->sock != -1 ) {
-					ssize_t sent = writev( client->sock, iov, 2 );
-					if ( sent > (ssize_t)sizeof outReply ) {
-						bytesSent = (size_t)sent - sizeof outReply;
-					}
-				}
-				if ( bytesSent != 0 ) {
-					client->bytesSent += bytesSent;
-				}
-				mutex_unlock( &client->sendMutex );
-				mutex_lock( &uplink->queueLock );
-				if ( i > uplink->queueLen ) {
-					i = uplink->queueLen; // Might have been set to 0 by cancelAllRequests
-				}
+		for ( it = &uplink->queue; *it != NULL; it = &(**it).next ) {
+			if ( *it == entry && entry->handle == inReply.handle ) { // ABA check
+				assert( found == false );
+				*it = (**it).next;
+				found = true;
+				uplink->queueLen--;
+				break;
 			}
-			if ( req->status == ULR_FREE && i == uplink->queueLen - 1 ) uplink->queueLen--;
 		}
 		if ( uplink->queueLen < SERVER_UPLINK_QUEUELEN_THRES ) {
 			uplink->image->problem.queue = false;
 		}
 		mutex_unlock( &uplink->queueLock );
-#ifdef _DEBUG
-		if ( !served && start != uplink->replicationHandle ) {
-			logadd( LOG_DEBUG2, "%p, %s -- Unmatched reply: %" PRIu64 " to %" PRIu64, (void*)uplink, uplink->image->name, start, end );
+		if ( !found ) {
+			logadd( LOG_DEBUG1, "Replication request vanished from queue after writing to disk (%s:%d)",
+					PIMG(uplink->image) );
+			continue;
 		}
-#endif
-		if ( start == uplink->replicationHandle ) {
-			// Was our background replication
-			uplink->replicationHandle = REP_NONE;
-			// Try to remove from fs cache if no client was interested in this data
-			if ( !served && uplink->cacheFd != -1 ) {
-				posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+		outReply.magic = dnbd3_packet_magic;
+		dnbd3_queue_client_t *next;
+		for ( dnbd3_queue_client_t *c = entry->clients; c != NULL; c = next ) {
+			size_t bytesSent = 0;
+			assert( c->from >= start && c->to <= end );
+			dnbd3_client_t * const client = c->client;
+			outReply.cmd = CMD_GET_BLOCK;
+			outReply.handle = c->handle;
+			outReply.size = (uint32_t)( c->to - c->from );
+			iov[0].iov_base = &outReply;
+			iov[0].iov_len = sizeof outReply;
+			iov[1].iov_base = uplink->recvBuffer + (c->from - start);
+			iov[1].iov_len = outReply.size;
+			fixup_reply( outReply );
+			mutex_lock( &client->sendMutex );
+			if ( client->sock != -1 ) {
+				ssize_t sent = writev( client->sock, iov, 2 );
+				if ( sent > (ssize_t)sizeof outReply ) {
+					bytesSent = (size_t)sent - sizeof outReply;
+				}
+				if ( bytesSent != 0 ) {
+					client->bytesSent += bytesSent;
+				}
 			}
+			mutex_unlock( &client->sendMutex );
+			client->relayedCount--;
+			next = c->next;
+			free( c );
 		}
-		if ( served ) {
+		if ( entry->clients != NULL ) {
 			// Was some client -- reset idle counter
 			uplink->idleTime = 0;
 			// Re-enable replication if disabled
 			if ( uplink->nextReplicationIndex == -1 ) {
 				uplink->nextReplicationIndex = (int)( start / FILE_BYTES_PER_MAP_BYTE ) & MAP_INDEX_HASH_START_MASK;
 			}
+		} else {
+			if ( uplink->cacheFd != -1 ) {
+				// Try to remove from fs cache if no client was interested in this data
+				posix_fadvise( uplink->cacheFd, start, inReply.size, POSIX_FADV_DONTNEED );
+			}
 		}
+		free( entry );
+	} // main receive loop
+	// Trigger background replication if applicable
+	if ( !uplink_sendReplicationRequest( uplink ) ) {
+		goto error_cleanup;
 	}
-	if ( uplink->replicationHandle == REP_NONE ) {
-		mutex_lock( &uplink->queueLock );
-		const bool rep = ( uplink->queueLen == 0 );
-		mutex_unlock( &uplink->queueLock );
-		if ( rep ) {
-			if ( !uplink_sendReplicationRequest( uplink ) )
-				goto error_cleanup;
-		}
-	}
+	// Normal end
 	return;
 	// Error handling from failed receive or message parsing
 error_cleanup: ;
@@ -1046,7 +1086,6 @@ static void uplink_connectionFailed(dnbd3_uplink_t *uplink, bool findNew)
 	close( uplink->current.fd );
 	uplink->current.fd = -1;
 	mutex_unlock( &uplink->sendMutex );
-	uplink->replicationHandle = REP_NONE;
 	if ( _backgroundReplication == BGR_FULL && uplink->nextReplicationIndex == -1 ) {
 		uplink->nextReplicationIndex = 0;
 	}
@@ -1156,3 +1195,39 @@ bool uplink_getHostString(dnbd3_uplink_t *uplink, char *buffer, size_t len)
 		return false;
 	return altservers_toString( current, buffer, len );
 }
+
+/**
+ * Get number of replication requests that should be sent right now: MIN( _bgrWindowSize, idleTime + 1 ),
+ * minus one for every queued entry that has no clients attached. Returns 0 as soon as a queued entry
+ * with attached clients is found; the result can be negative if client-less entries exceed that budget.
+ */
+static int numWantedReplicationRequests(dnbd3_uplink_t *uplink)
+{
+	int ret = MIN( _bgrWindowSize, uplink->idleTime + 1 );
+	if ( uplink->queueLen == 0 ) // NOTE(review): read without holding queueLock - confirm queueLen is atomic
+		return ret; // Empty queue: nothing to subtract from the budget
+	mutex_lock( &uplink->queueLock );
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		if ( it->clients == NULL ) {
+			ret--; // Entry without clients uses up one slot (presumably an in-flight replication request - confirm)
+		} else {
+			ret = 0; // Do not allow BGR if client requests are being handled
+			break;
+		}
+	}
+	mutex_unlock( &uplink->queueLock );
+	return ret;
+}
+
+static void markRequestUnsent(dnbd3_uplink_t *uplink, uint64_t handle)
+{ // Clears the 'sent' flag of the first queue entry whose handle equals the given one; no-op if none matches
+	mutex_lock( &uplink->queueLock ); // Hold queueLock while walking the list and updating the entry
+	for ( dnbd3_queue_entry_t *it = uplink->queue; it != NULL; it = it->next ) {
+		if ( it->handle == handle ) {
+			it->sent = false;
+			break; // Stop at the first match
+		}
+	}
+	mutex_unlock( &uplink->queueLock );
+}
+
diff --git a/src/server/uplink.h b/src/server/uplink.h
index 49ff0b4..8f69b05 100644
--- a/src/server/uplink.h
+++ b/src/server/uplink.h
@@ -12,7 +12,7 @@ bool uplink_init(dnbd3_image_t *image, int sock, dnbd3_host_t *host, int version
 
 void uplink_removeClient(dnbd3_uplink_t *uplink, dnbd3_client_t *client);
 
-bool uplink_request(dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hopCount);
+bool uplink_request(dnbd3_uplink_t *uplink, dnbd3_client_t *client, uint64_t handle, uint64_t start, uint32_t length, uint8_t hops);
 
 bool uplink_shutdown(dnbd3_image_t *image);
 
diff --git a/src/serverconfig.h b/src/serverconfig.h
index 5c7301d..31708de 100644
--- a/src/serverconfig.h
+++ b/src/serverconfig.h
@@ -13,7 +13,8 @@
 #define SERVER_BAD_UPLINK_MAX 20 // Hard block server if it failed this many times
 #define SERVER_BAD_UPLINK_LOCAL_BLOCK 10 // If a server didn't supply the requested image this many times, block it for some time
 #define SERVER_BAD_UPLINK_IGNORE 180 // How many seconds is a server ignored
-#define SERVER_MAX_UPLINK_QUEUE  1500 // Maximum number of queued requests per uplink
+#define UPLINK_MAX_QUEUE  500 // Maximum number of queued requests per uplink. NOTE(review): lower than SERVER_UPLINK_QUEUELEN_THRES (900) - confirm that threshold can still be reached
+#define UPLINK_MAX_CLIENTS_PER_REQUEST 32 // Maximum number of clients that can attach to one uplink request
 #define SERVER_UPLINK_QUEUELEN_THRES  900 // Threshold where we start dropping incoming clients
 #define SERVER_MAX_PENDING_ALT_CHECKS 500 // Length of queue for pending alt checks requested by uplinks
 
-- 
cgit v1.2.3-55-g7522


From 894eeb86f872a7f7f5f36bfa8649da3075dd28d6 Mon Sep 17 00:00:00 2001
From: Simon Rettberg
Date: Fri, 20 Mar 2020 21:22:18 +0100
Subject: [SERVER] Remember atime in .meta file

---
 src/server/globals.h |   1 +
 src/server/image.c   | 198 +++++++++++++++++++++++++++++++++++----------------
 src/server/net.c     |   2 +
 3 files changed, 139 insertions(+), 62 deletions(-)

(limited to 'src/server/net.c')

diff --git a/src/server/globals.h b/src/server/globals.h
index 08ec303..95d8ec2 100644
--- a/src/server/globals.h
+++ b/src/server/globals.h
@@ -155,6 +155,7 @@ struct _dnbd3_image
 		atomic_bool queue;       // Too many requests waiting on uplink
 	} problem;
 	uint16_t rid;          // revision of image
+	bool accessed;         // image was accessed since .meta was written
 	pthread_mutex_t lock;
 };
 #define PIMG(x) (x)->name, (int)(x)->rid
diff --git a/src/server/image.c b/src/server/image.c
index 67a763c..4944bfd 100644
--- a/src/server/image.c
+++ b/src/server/image.c
@@ -55,10 +55,12 @@ static dnbd3_cache_map_t* image_loadCacheMap(const char * const imagePath, const
 static uint32_t* image_loadCrcList(const char * const imagePath, const int64_t fileSize, uint32_t *masterCrc);
 static bool image_checkRandomBlocks(dnbd3_image_t *image, const int count, int fromFd);
 static void* closeUnusedFds(void*);
-static bool imageShouldSaveCacheMap(dnbd3_image_t *image);
+static bool isImageFromUpstream(dnbd3_image_t *image);
 static void* saveLoadAllCacheMaps(void*);
 static void saveCacheMap(dnbd3_image_t *image);
 static void allocCacheMap(dnbd3_image_t *image, bool complete);
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime);
+static void loadImageMeta(dnbd3_image_t *image);
 
 static void cmfree(ref *ref)
 {
@@ -630,8 +632,11 @@ static dnbd3_image_t* image_free(dnbd3_image_t *image)
 	// this will get called again when the uplink is done.
 	if ( !uplink_shutdown( image ) )
 		return NULL;
-	if ( imageShouldSaveCacheMap( image ) ) {
-		saveCacheMap( image );
+	if ( isImageFromUpstream( image ) ) {
+		saveMetaData( image, NULL, 0 );
+		if ( image->ref_cacheMap != NULL ) {
+			saveCacheMap( image );
+		}
 	}
 	mutex_lock( &image->lock );
 	ref_setref( &image->ref_cacheMap, NULL );
@@ -757,7 +762,6 @@ static bool image_addToList(dnbd3_image_t *image)
 static bool image_load(char *base, char *path, bool withUplink)
 {
 	int revision = -1;
-	struct stat st;
 	dnbd3_cache_map_t *cache = NULL;
 	uint32_t *crc32list = NULL;
 	dnbd3_image_t *existing = NULL;
@@ -901,15 +905,7 @@ static bool image_load(char *base, char *path, bool withUplink)
 	timing_get( &image->nextCompletenessEstimate );
 	image->completenessEstimate = -1;
 	mutex_init( &image->lock, LOCK_IMAGE );
-	int32_t offset;
-	if ( stat( path, &st ) == 0 ) {
-		// Negatively offset atime by file modification time
-		offset = (int32_t)( st.st_mtime - time( NULL ) );
-		if ( offset > 0 ) offset = 0;
-	} else {
-		offset = 0;
-	}
-	timing_gets( &image->atime, offset );
+	loadImageMeta( image );
 
 	// Prevent freeing in cleanup
 	cache = NULL;
@@ -1843,12 +1839,10 @@ static void* closeUnusedFds(void* nix UNUSED)
 	return NULL;
 }
 
-static bool imageShouldSaveCacheMap(dnbd3_image_t *image)
+static bool isImageFromUpstream(dnbd3_image_t *image)
 {
 	if ( !_isProxy )
 		return false; // Nothing to do
-	if ( image->ref_cacheMap == NULL )
-		return false; // Nothing to do
 	// Check if we're a "hybrid proxy", i.e. there are only some namespaces (directories)
 	// for which we have any upstream servers configured. If there's none, don't touch
 	// the cache map on disk.
@@ -1862,66 +1856,71 @@ static void* saveLoadAllCacheMaps(void* nix UNUSED)
 	static ticks nextSave;
 	declare_now;
 	bool full = timing_reached( &nextSave, &now );
+	time_t walltime = full ? time( NULL ) : 0;
 	setThreadName( "cache-mapper" );
 	mutex_lock( &imageListLock );
 	for ( int i = 0; i < _num_images; ++i ) {
 		dnbd3_image_t * const image = _images[i];
-		dnbd3_cache_map_t *cache = ref_get_cachemap( image );
-		if ( cache == NULL )
-			continue; // No users++ or mutex_unlock yet -> safe
 		image->users++;
 		mutex_unlock( &imageListLock );
-		if ( imageShouldSaveCacheMap( image ) ) {
-			// Replicated image, we're responsible for updating the map, so save it
-			// Save if dirty bit is set, blocks were invalidated
-			bool save = cache->dirty;
-			dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
-			if ( !save ) {
-				// Otherwise, consider longer timeout and byte count limits of uplink
-				if ( uplink != NULL ) {
-					assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
-					uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
-					if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
-						save = true;
+		const bool fromUpstream = isImageFromUpstream( image );
+		dnbd3_cache_map_t *cache = ref_get_cachemap( image );
+		if ( cache != NULL ) {
+			if ( fromUpstream ) {
+				// Replicated image, we're responsible for updating the map, so save it
+				// Save if dirty bit is set, blocks were invalidated
+				bool save = cache->dirty;
+				dnbd3_uplink_t *uplink = ref_get_uplink( &image->uplinkref );
+				if ( !save ) {
+					// Otherwise, consider longer timeout and byte count limits of uplink
+					if ( uplink != NULL ) {
+						assert( uplink->bytesReceivedLastSave <= uplink->bytesReceived );
+						uint64_t diff = uplink->bytesReceived - uplink->bytesReceivedLastSave;
+						if ( diff > CACHE_MAP_MAX_UNSAVED_BYTES || ( full && diff != 0 ) ) {
+							save = true;
+						}
 					}
 				}
-			}
-			if ( save ) {
-				cache->dirty = false;
+				if ( save ) {
+					cache->dirty = false;
+					if ( uplink != NULL ) {
+						uplink->bytesReceivedLastSave = uplink->bytesReceived;
+					}
+					saveCacheMap( image );
+				}
 				if ( uplink != NULL ) {
-					uplink->bytesReceivedLastSave = uplink->bytesReceived;
+					ref_put( &uplink->reference );
 				}
-				saveCacheMap( image );
-			}
-			if ( uplink != NULL ) {
-				ref_put( &uplink->reference );
-			}
-		} else {
-			// We're not replicating this image, if there's a cache map, reload
-			// it periodically, since we might read from a shared storage that
-			// another server instance is writing to.
-			if ( full || ( !cache->unchanged && !image->problem.read ) ) {
-				logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
-				dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
-				if ( onDisk == NULL ) {
-					// Should be complete now
-					logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
-					ref_setref( &image->ref_cacheMap, NULL );
-				} else {
-					const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
-					if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
-						// Unchanged
-						cache->unchanged = true;
-						onDisk->reference.free( &onDisk->reference );
+			} else {
+				// We're not replicating this image, if there's a cache map, reload
+				// it periodically, since we might read from a shared storage that
+				// another server instance is writing to.
+				if ( full || ( !cache->unchanged && !image->problem.read ) ) {
+					logadd( LOG_DEBUG2, "Reloading cache map of %s:%d", PIMG(image) );
+					dnbd3_cache_map_t *onDisk = image_loadCacheMap(image->path, image->virtualFilesize);
+					if ( onDisk == NULL ) {
+						// Should be complete now
+						logadd( LOG_DEBUG1, "External replication of %s:%d complete", PIMG(image) );
+						ref_setref( &image->ref_cacheMap, NULL );
 					} else {
-						// Replace
-						ref_setref( &image->ref_cacheMap, &onDisk->reference );
-						logadd( LOG_DEBUG2, "Map changed" );
+						const int mapSize = IMGSIZE_TO_MAPBYTES( image->virtualFilesize );
+						if ( memcmp( cache->map, onDisk->map, mapSize ) == 0 ) {
+							// Unchanged
+							cache->unchanged = true;
+							onDisk->reference.free( &onDisk->reference );
+						} else {
+							// Replace
+							ref_setref( &image->ref_cacheMap, &onDisk->reference );
+							logadd( LOG_DEBUG2, "Map changed" );
+						}
 					}
 				}
-			}
+			} // end reload cache map
+			ref_put( &cache->reference );
+		} // end has cache map
+		if ( full && fromUpstream ) {
+			saveMetaData( image, &now, walltime );
 		}
-		ref_put( &cache->reference );
 		image_release( image ); // Always do this instead of users-- to handle freeing
 		mutex_lock( &imageListLock );
 	}
@@ -2023,3 +2022,78 @@ static void allocCacheMap(dnbd3_image_t *image, bool complete)
 	}
 	mutex_unlock( &image->lock );
 }
+
+/**
+ * Write the image's atime to "<path>.meta" if it was accessed since the last save. It's assumed you hold a reference to the image
+ */
+static void saveMetaData(dnbd3_image_t *image, ticks *now, time_t walltime)
+{ // now/walltime: tick and wall-clock timestamps taken by the caller; if now is NULL, both are sampled here
+	if ( !image->accessed ) // Not accessed since the last save, nothing to write (flag is read without image->lock)
+		return;
+	ticks tmp;
+	uint32_t diff;
+	char *fn;
+	if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+		logadd( LOG_WARNING, "Cannot asprintf meta" );
+		return;
+	}
+	if ( now == NULL ) {
+		timing_get( &tmp );
+		now = &tmp;
+		walltime = time( NULL );
+	}
+	mutex_lock( &image->lock );
+	image->accessed = false; // NOTE(review): cleared before the write below is known to succeed
+	diff = timing_diff( &image->atime, now ); // presumably the time elapsed between atime and now - confirm unit of timing_diff()
+	mutex_unlock( &image->lock );
+	FILE *f = fopen( fn, "w" );
+	if ( f == NULL ) {
+		logadd( LOG_WARNING, "Cannot open %s for writing", fn );
+	} else {
+		fprintf( f, "[main]\natime=%"PRIu64"\n", (uint64_t)( walltime - diff ) ); // Store atime as wall-clock time: walltime minus the elapsed span
+		fclose( f ); // NOTE(review): results of fprintf() and fclose() are not checked
+	}
+	free( fn );
+	// TODO: fsync() dir
+}
+
+static void loadImageMeta(dnbd3_image_t *image)
+{ // Initializes image->atime from the "atime=" entry of "<path>.meta", falling back to the image file's mtime
+	int32_t offset = 1; // 1 means "nothing loaded yet" (checked below); NOTE(review): a stored atime exactly 1s ahead of now is indistinguishable from it
+	char *fn;
+	if ( asprintf( &fn, "%s.meta", image->path ) == -1 ) {
+		logadd( LOG_WARNING, "asprintf load" );
+	} else {
+		int fh = open( fn, O_RDONLY );
+		free( fn );
+		if ( fh != -1 ) {
+			char buf[200];
+			ssize_t ret = read( fh, buf, sizeof(buf)-1 ); // Leave room for the terminating NUL; only the first read's data is parsed
+			close( fh );
+			if ( ret > 0 ) {
+				buf[ret] = '\0';
+				// Do it the cheap way until we actually store more stuff
+				char *pos = strstr( buf, "atime=" );
+				if ( pos != NULL ) {
+					offset = (int32_t)( atol( pos + 6 ) - time( NULL ) ); // Stored atime relative to now; +6 skips "atime=". NOTE(review): atol() cannot report parse errors
+				}
+			}
+		}
+	}
+	if ( offset == 1 ) {
+		// Nothing from .meta file, use old guesstimate
+		struct stat st;
+		if ( stat( image->path, &st ) == 0 ) {
+			// Negatively offset atime by file modification time
+			offset = (int32_t)( st.st_mtime - time( NULL ) );
+		} else {
+			offset = 0;
+		}
+		image->accessed = true; // Flag the image so saveMetaData() does not skip it
+	}
+	if ( offset > 0 ) { // Clamp so that a positive offset is never passed on
+		offset = 0;
+	}
+	timing_gets( &image->atime, offset ); // presumably sets atime to the current time plus offset - confirm against timing_gets()
+}
+
diff --git a/src/server/net.c b/src/server/net.c
index 9ba9dbc..6b930df 100644
--- a/src/server/net.c
+++ b/src/server/net.c
@@ -289,6 +289,7 @@ void* net_handleNewConnection(void *clientPtr)
 					if ( !client->isServer ) {
 						// Only update immediately if this is a client. Servers are handled on disconnect.
 						timing_get( &image->atime );
+						image->accessed = true;
 					}
 					mutex_unlock( &image->lock );
 					serializer_reset_write( &payload );
@@ -515,6 +516,7 @@ exit_client_cleanup: ;
 	if ( image != NULL && client->bytesSent > DNBD3_BLOCK_SIZE * 10 ) {
 		mutex_lock( &image->lock );
 		timing_get( &image->atime );
+		image->accessed = true;
 		mutex_unlock( &image->lock );
 	}
 	if ( cache != NULL ) {
-- 
cgit v1.2.3-55-g7522